diff --git a/.gitignore b/.gitignore index 8e2dbbd7..e0bb7ec8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,10 @@ # Ignore the sysbox build token and build dir -build +build/ .buildinfo # Ignore virtual-studio-code metadata .vscode +.idea # Ignore paths holding build artifacts and related files. image/deb/debbuild/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index a016477a..00000000 --- a/.gitmodules +++ /dev/null @@ -1,21 +0,0 @@ -[submodule "sysbox-fs"] - path = sysbox-fs - url = ../sysbox-fs.git -[submodule "sysbox-runc"] - path = sysbox-runc - url = ../sysbox-runc.git -[submodule "sysbox-ipc"] - path = sysbox-ipc - url = ../sysbox-ipc.git -[submodule "sysbox-mgr"] - path = sysbox-mgr - url = ../sysbox-mgr.git -[submodule "sysbox-libs"] - path = sysbox-libs - url = ../sysbox-libs.git -[submodule "sysbox-pkgr"] - path = sysbox-pkgr - url = ../sysbox-pkgr.git -[submodule "sysbox-dockerfiles"] - path = sysbox-dockerfiles - url = ../dockerfiles.git diff --git a/Makefile b/Makefile index 3d7b0d79..62c1539a 100644 --- a/Makefile +++ b/Makefile @@ -38,6 +38,8 @@ ifeq ($(UNAME_M),x86_64) SYS_ARCH := amd64 else ifeq ($(UNAME_M),aarch64) SYS_ARCH := arm64 +else ifeq ($(UNAME_M),arm64) + SYS_ARCH := arm64 else ifeq ($(UNAME_M),arm) SYS_ARCH := armhf else ifeq ($(UNAME_M),armel) @@ -80,7 +82,8 @@ else INSTALL_DIR := ${DESTDIR} endif -IMAGE_BASE_DISTRO := $(shell cat /etc/os-release | grep "^ID=" | cut -d "=" -f2 | tr -d '"') +IMAGE_BASE_DISTRO := ubuntu +IMAGE_BASE_RELEASE := jammy # Host kernel info KERNEL_REL := $(shell uname -r) @@ -89,23 +92,23 @@ KERNEL_REL_MIN := $(shell echo $(KERNEL_REL) | cut -d'.' -f2) export KERNEL_REL # Sysbox image-generation globals utilized during the sysbox's building and testing process. -ifeq ($(IMAGE_BASE_DISTRO),$(filter $(IMAGE_BASE_DISTRO),centos fedora redhat almalinux rocky amzn)) - IMAGE_BASE_RELEASE := $(shell cat /etc/os-release | grep "^VERSION_ID" | cut -d "=" -f2 | tr -d '"' | cut -d "." 
-f1) - KERNEL_HEADERS := kernels/$(KERNEL_REL) -else - IMAGE_BASE_RELEASE := $(shell cat /etc/os-release | grep "^VERSION_CODENAME" | cut -d "=" -f2) - ifeq ($(IMAGE_BASE_DISTRO),linuxmint) - IMAGE_BASE_DISTRO := ubuntu - ifeq ($(IMAGE_BASE_RELEASE),$(filter $(IMAGE_BASE_RELEASE),ulyana ulyssa uma)) - IMAGE_BASE_RELEASE := focal - endif - ifeq ($(IMAGE_BASE_RELEASE),$(filter $(IMAGE_BASE_RELEASE),tara tessa tina tricia)) - IMAGE_BASE_RELEASE := bionic - endif - endif - KERNEL_HEADERS := linux-headers-$(KERNEL_REL) - KERNEL_HEADERS_BASE := $(shell find /usr/src/$(KERNEL_HEADERS) -maxdepth 1 -type l -exec readlink {} \; | cut -d"/" -f2 | egrep -v "^\.\." | head -1) -endif +#ifeq ($(IMAGE_BASE_DISTRO),$(filter $(IMAGE_BASE_DISTRO),centos fedora redhat almalinux rocky amzn)) +# IMAGE_BASE_RELEASE := $(shell cat /etc/os-release | grep "^VERSION_ID" | cut -d "=" -f2 | tr -d '"' | cut -d "." -f1) +# KERNEL_HEADERS := kernels/$(KERNEL_REL) +#else +# IMAGE_BASE_RELEASE := $(shell cat /etc/os-release | grep "^VERSION_CODENAME" | cut -d "=" -f2) +# ifeq ($(IMAGE_BASE_DISTRO),linuxmint) +# IMAGE_BASE_DISTRO := ubuntu +# ifeq ($(IMAGE_BASE_RELEASE),$(filter $(IMAGE_BASE_RELEASE),ulyana ulyssa uma)) +# IMAGE_BASE_RELEASE := focal +# endif +# ifeq ($(IMAGE_BASE_RELEASE),$(filter $(IMAGE_BASE_RELEASE),tara tessa tina tricia)) +# IMAGE_BASE_RELEASE := bionic +# endif +# endif +# KERNEL_HEADERS := linux-headers-$(KERNEL_REL) +# KERNEL_HEADERS_BASE := $(shell find /usr/src/$(KERNEL_HEADERS) -maxdepth 1 -type l -exec readlink {} \; | cut -d"/" -f2 | egrep -v "^\.\." 
| head -1) +#endif TEST_DIR := $(CURDIR)/tests TEST_IMAGE := sysbox-test-$(TARGET_ARCH) @@ -226,8 +229,10 @@ sysbox-runc-debug: sysbox-ipc @cd $(SYSRUNC_DIR) && chown -R $(HOST_UID):$(HOST_GID) build sysbox-runc-static: sysbox-ipc + echo "Before Building runc static" @cd $(SYSRUNC_DIR) && make static @cd $(SYSRUNC_DIR) && chown -R $(HOST_UID):$(HOST_GID) build + echo "After Building runc static" sysbox-fs: sysbox-ipc @cd $(SYSFS_DIR) && make @@ -464,6 +469,9 @@ endif test-img: ## Build test container image test-img: @printf "\n** Building the test container **\n\n" + echo "docker build -t $(TEST_IMAGE) \ + --build-arg sys_arch=$(SYS_ARCH) --build-arg target_arch=$(TARGET_ARCH) \ + -f Dockerfile.$(IMAGE_BASE_DISTRO)-$(IMAGE_BASE_RELEASE) ." @cd $(TEST_DIR) && docker build -t $(TEST_IMAGE) \ --build-arg sys_arch=$(SYS_ARCH) --build-arg target_arch=$(TARGET_ARCH) \ -f Dockerfile.$(IMAGE_BASE_DISTRO)-$(IMAGE_BASE_RELEASE) . diff --git a/sysbox-dockerfiles b/sysbox-dockerfiles deleted file mode 160000 index 6f923807..00000000 --- a/sysbox-dockerfiles +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6f9238079b9b1a9364895da5a429f025528ab033 diff --git a/sysbox-fs b/sysbox-fs deleted file mode 160000 index aeba775e..00000000 --- a/sysbox-fs +++ /dev/null @@ -1 +0,0 @@ -Subproject commit aeba775e52cc6385fa4807c594fc7ee164ad624c diff --git a/sysbox-fs/.gitignore b/sysbox-fs/.gitignore new file mode 100644 index 00000000..9f83fd78 --- /dev/null +++ b/sysbox-fs/.gitignore @@ -0,0 +1,13 @@ +# Ignore sysbox-fs compilation artifacts +build + +# Ignore protobuf auto-generated code +sysbox-ipc + +# Ignore virtual-studio-code metadata +.vscode + +# GNU global tags +GPATH +GRTAGS +GTAGS diff --git a/sysbox-fs/CONTRIBUTING.md b/sysbox-fs/CONTRIBUTING.md new file mode 100644 index 00000000..dfc092fb --- /dev/null +++ b/sysbox-fs/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contribute to Sysbox-fs + +Sysbox-fs is a component of the Sysbox container runtime. 
If you want to +contribute, please refer to the Sysbox contribution +[guidelines](https://github.com/nestybox/sysbox/blob/master/CONTRIBUTING.md). diff --git a/sysbox-fs/LICENSE b/sysbox-fs/LICENSE new file mode 100644 index 00000000..c6087d5b --- /dev/null +++ b/sysbox-fs/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2020 Nestybox, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/sysbox-fs/MAINTAINERS b/sysbox-fs/MAINTAINERS new file mode 100644 index 00000000..3af2dbb0 --- /dev/null +++ b/sysbox-fs/MAINTAINERS @@ -0,0 +1,2 @@ +Rodny Molina (@rodnymolina) +Cesar Talledo (@ctalledo) diff --git a/sysbox-fs/Makefile b/sysbox-fs/Makefile new file mode 100644 index 00000000..b2c2de81 --- /dev/null +++ b/sysbox-fs/Makefile @@ -0,0 +1,93 @@ +# +# sysbox-fs Makefile +# +# Note: targets must execute from the $SYSFS_DIR + +.PHONY: clean sysbox-fs-debug sysbox-fs-static lint list-packages + +GO := go + +SYSFS_BUILDROOT := build +SYSFS_BUILDDIR := $(SYSFS_BUILDROOT)/$(TARGET_ARCH) +SYSFS_TARGET := sysbox-fs +SYSFS_DEBUG_TARGET := sysbox-fs-debug +SYSFS_STATIC_TARGET := sysbox-fs-static +SYSFS_DIR := $(CURDIR) +SYSFS_SRC := $(shell find . 
2>&1 | grep -E '.*\.(c|h|go)$$') + +SYSFS_GRPC_DIR := ../sysbox-ipc/sysboxFsGrpc +SYSFS_GRPC_SRC := $(shell find $(SYSFS_GRPC_DIR) 2>&1 | grep -E '.*\.(c|h|go|proto)$$') + +SYSLIB_DIR := ../sysbox-libs +SYSLIB_SRC := $(shell find $(SYSLIB_DIR) 2>&1 | grep -E '.*\.(c|h|go|proto)$$') + +LIBSECCOMP_DIR := ../lib/seccomp- +LIBSECCOMP_SRC := $(shell find $(LIBSECCOMP_DIR) 2>&1 | grep -E '.*\.(go)') + +LIBPIDMON_DIR := ../sysbox-libs/pidmonitor +LIBSPIDMON_SRC := $(shell find $(LIBPIDMON_DIR) 2>&1 | grep -E '.*\.(go)') + +NSENTER_DIR := ../sysbox-runc/libcontainer/nsenter +NSENTER_SRC := $(shell find $(NSENTER_DIR) 2>&1 | grep -E '.*\.(c|h|go)') + +COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true) +COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),$(COMMIT_NO)-dirty,$(COMMIT_NO)) +BUILT_AT := $(shell date) +BUILT_BY := $(shell git config user.name) + +LDFLAGS := -X 'main.edition=${EDITION}' -X main.version=${VERSION} \ + -X main.commitId=$(COMMIT) -X 'main.builtAt=$(BUILT_AT)' \ + -X 'main.builtBy=$(BUILT_BY)' + +# Set cross-compilation flags if applicable. 
+ifneq ($(SYS_ARCH),$(TARGET_ARCH)) + ifeq ($(TARGET_ARCH),armel) + GO_XCOMPILE := CGO_ENABLED=1 GOOS=linux GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc + else ifeq ($(TARGET_ARCH),armhf) + GO_XCOMPILE := CGO_ENABLED=1 GOOS=linux GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc + else ifeq ($(TARGET_ARCH),arm64) + GO_XCOMPILE = CGO_ENABLED=1 GOOS=linux GOARCH=arm64 CC=aarch64-linux-gnu-gcc + else ifeq ($(TARGET_ARCH),amd64) + GO_XCOMPILE = CGO_ENABLED=1 GOOS=linux GOARCH=amd64 CC=x86_64-linux-gnu-gcc + endif +endif + +.DEFAULT: sysbox-fs + +sysbox-fs: $(SYSFS_BUILDDIR)/$(SYSFS_TARGET) + +$(SYSFS_BUILDDIR)/$(SYSFS_TARGET): $(SYSFS_SRC) $(SYSFS_GRPC_SRC) $(SYSLIB_SRC) $(LIBSECCOMP_SRC) $(LIBPIDMON_SRC) $(NSENTER_SRC) + $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath -ldflags "${LDFLAGS}" -o $(SYSFS_BUILDDIR)/sysbox-fs ./cmd/sysbox-fs + +sysbox-fs-debug: $(SYSFS_BUILDDIR)/$(SYSFS_DEBUG_TARGET) + +$(SYSFS_BUILDDIR)/$(SYSFS_DEBUG_TARGET): $(SYSFS_SRC) $(SYSFS_GRPC_SRC) $(SYSLIB_SRC) $(LIBSECCOMP_SRC) $(LIBPIDMON_SRC) $(NSENTER_SRC) + $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath -gcflags="all=-N -l" -ldflags "${LDFLAGS}" \ + -o $(SYSFS_BUILDDIR)/sysbox-fs ./cmd/sysbox-fs + +sysbox-fs-static: $(SYSFS_BUILDDIR)/$(SYSFS_STATIC_TARGET) + +$(SYSFS_BUILDDIR)/$(SYSFS_STATIC_TARGET): $(SYSFS_SRC) $(SYSFS_GRPC_SRC) $(SYSLIB_SRC) $(LIBSECCOMP_SRC) $(LIBPIDMON_SRC) $(NSENTER_SRC) + CGO_ENABLED=1 $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath -tags "netgo osusergo" \ + -installsuffix netgo -ldflags "-extldflags -static ${LDFLAGS}" \ + -o $(SYSFS_BUILDDIR)/sysbox-fs ./cmd/sysbox-fs + +gomod-tidy: + $(GO) mod tidy + +lint: + $(GO) vet $(allpackages) + $(GO) fmt $(allpackages) + +listpackages: + @echo $(allpackages) + +clean: + rm -f $(SYSFS_BUILDROOT)/sysbox-fs + +distclean: clean + rm -rf $(SYSFS_BUILDROOT) + +# memoize allpackages, so that it's executed only once and only if used +_allpackages = $(shell $(GO) list ./... 
| grep -v vendor) +allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages) diff --git a/sysbox-fs/README.md b/sysbox-fs/README.md new file mode 100644 index 00000000..c3ffa76a --- /dev/null +++ b/sysbox-fs/README.md @@ -0,0 +1,50 @@ +# Sysbox-fs + +The Sysbox file-system (Sysbox-fs) is one of the three active components of the +Sysbox runtime, along Sysbox-mgr and Sysbox-runc. + +Sysbox-fs provides file-system emulation capabilities to offer a more complete +and secure "virtual-host" abstraction to the processes running inside Sysbox +containers. + +## Main Features + +As of today, Sysbox-fs supports the (partial) emulation of the following +components: + +* procfs & sysfs emulation: The goal here is to expose and emulate resources +that are not yet namespaced by the Linux kernel, or that are only reachable +within the initial user-namespace. + + Sysbox-fs achieves this by mounting a FUSE file-system over specific + sections of the `/proc` and `/sys` virtual file-systems, so that I/O + requests targeting those resources are handled by Sysbox-fs in user-space. + +* Syscall emulation: Sysbox-fs traps and emulate a small set of syscalls inside +a system container. The main purpose here is to provide processes inside the +system container with a more complete and consistent view of the resources +that are reachable within a system container. We rely on the Linux kernel's +seccomp BPF features to achieve this. + + For example, inside a system container we trap the `mount` system call in + order to ensure that such mounts always result in the Sysbox-fs' emulated + procfs being mounted, rather than the kernel's procfs. + + Another example is the `umount` syscall, which we trap to ensure that + Sysbox-fs' emulated components cannot be unmounted to expose the kernel's + version of the corresponding FS node. + +## Build & Usage + +Sysbox-fs is built through the Makefile targets exposed in the Sysbox +repository. 
Refer to its [README](../README.md) file for details. + +## Testing + +Sysbox-fs' repository incorporates unit-tests to verify the basic operation +of its main packages. You can manually execute these unit-tests through the +usual `go test ./...` instruction. + +For a more thorough verification of Sysbox-fs features, refer to the +integration-testsuites hosted in the Sysbox repository and executed as +part of the testing Makefile targets (e.g. `make test`). diff --git a/sysbox-fs/cmd/sysbox-fs/main.go b/sysbox-fs/cmd/sysbox-fs/main.go new file mode 100644 index 00000000..28734abf --- /dev/null +++ b/sysbox-fs/cmd/sysbox-fs/main.go @@ -0,0 +1,476 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package main + +import ( + "flag" + "fmt" + "log" + "math/rand" + "os" + "os/signal" + "runtime" + "syscall" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/nestybox/sysbox-fs/handler" + "github.com/nestybox/sysbox-fs/ipc" + "github.com/nestybox/sysbox-fs/mount" + "github.com/nestybox/sysbox-fs/nsenter" + "github.com/nestybox/sysbox-fs/process" + "github.com/nestybox/sysbox-fs/seccomp" + "github.com/nestybox/sysbox-fs/state" + "github.com/nestybox/sysbox-fs/sysio" + libutils "github.com/nestybox/sysbox-libs/utils" + + systemd "github.com/coreos/go-systemd/daemon" + + "github.com/pkg/profile" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +const ( + sysboxRunDir string = "/run/sysbox" + sysboxFsPidFile string = sysboxRunDir + "/sysfs.pid" + usage string = `sysbox-fs file-system + +sysbox-fs is a daemon that emulates portions of the system container's +file system (e.g., procfs, sysfs). It's purpose is to make the +system container closely resemble a virtual host while ensuring +proper isolation. +` +) + +// Globals to be populated at build time during Makefile processing. +var ( + edition string // Sysbox Edition: CE or EE. + version string // extracted from VERSION file + commitId string // latest sysbox-fs' git commit-id + builtAt string // build time + builtBy string // build owner +) + +// +// sysbox-fs exit handler goroutine. 
+// +func exitHandler( + signalChan chan os.Signal, + fss domain.FuseServerServiceIface, + profile interface{ Stop() }) { + + var printStack = false + + s := <-signalChan + + logrus.Warnf("sysbox-fs caught signal: %s", s) + + logrus.Info("Stopping (gracefully) ...") + + systemd.SdNotify(false, systemd.SdNotifyStopping) + + switch s { + + case syscall.SIGABRT: + printStack = true + + case syscall.SIGINT: + printStack = true + + case syscall.SIGQUIT: + printStack = true + + case syscall.SIGSEGV: + printStack = true + } + + if printStack { + // Buffer size = 1024 x 32, enough to hold every goroutine stack-trace. + stacktrace := make([]byte, 32768) + length := runtime.Stack(stacktrace, true) + logrus.Warnf("\n\n%s\n", string(stacktrace[:length])) + } + + // Destroy fuse-service and inner fuse-servers. + fss.DestroyFuseService() + + // Stop cpu/mem profiling tasks. + if profile != nil { + profile.Stop() + } + + // Deferring exit() to allow FUSE to dump unnmount() logs + time.Sleep(2) + + // Delete pid file. + if err := libutils.DestroyPidFile(sysboxFsPidFile); err != nil { + logrus.Warnf("failed to destroy sysbox-fs pid file: %v", err) + } + + logrus.Info("Exiting ...") + os.Exit(0) +} + +// Run cpu / memory profiling collection. +func runProfiler(ctx *cli.Context) (interface{ Stop() }, error) { + + var prof interface{ Stop() } + + cpuProfOn := ctx.Bool("cpu-profiling") + memProfOn := ctx.Bool("memory-profiling") + + // Cpu and Memory profiling options seem to be mutually exclused in pprof. + if cpuProfOn && memProfOn { + return nil, fmt.Errorf("Unsupported parameter combination: cpu and memory profiling") + } + + // Typical / non-profiling case. + if !(cpuProfOn || memProfOn) { + return nil, nil + } + + // Notice that 'NoShutdownHook' option is passed to profiler constructor to + // avoid this one reacting to 'sigterm' signal arrival. IOW, we want + // sysbox-fs signal handler to be the one stopping all profiling tasks. 
+ + if cpuProfOn { + prof = profile.Start( + profile.CPUProfile, + profile.ProfilePath("."), + profile.NoShutdownHook, + ) + } + + if memProfOn { + prof = profile.Start( + profile.MemProfile, + profile.ProfilePath("."), + profile.NoShutdownHook, + ) + } + + return prof, nil +} + +func setupRunDir() error { + if err := os.MkdirAll(sysboxRunDir, 0700); err != nil { + return fmt.Errorf("failed to create %s: %s", sysboxRunDir, err) + } + return nil +} + +// +// sysbox-fs main function +// +func main() { + + app := cli.NewApp() + app.Name = "sysbox-fs" + app.Usage = usage + app.Version = version + + app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "mountpoint", + Value: "/var/lib/sysboxfs", + Usage: "mount-point location", + }, + cli.BoolFlag{ + Name: "allow-immutable-remounts", + Usage: "sys container's initial mounts are considered immutable; this option allows them to be remounted from within the container (default: \"false\")", + }, + cli.BoolTFlag{ + Name: "allow-immutable-unmounts", + Usage: "sys container's initial mounts are considered immutable; this option allows them to be unmounted from within the container (default: \"true\")", + }, + cli.StringFlag{ + Name: "seccomp-fd-release", + Value: "proc-exit", + Usage: "Policy to close syscall interception handles; allowed values are \"proc-exit\" and \"cont-exit\" (default = \"proc-exit\")", + }, + cli.StringFlag{ + Name: "log", + Value: "", + Usage: "log file path or empty string for stderr output (default: \"\")", + }, + cli.StringFlag{ + Name: "log-level", + Value: "info", + Usage: "log categories to include (debug, info, warning, error, fatal)", + }, + cli.StringFlag{ + Name: "log-format", + Value: "text", + Usage: "log format; must be json or text", + }, + cli.BoolFlag{ + Name: "ignore-handler-errors", + Usage: "ignore errors during procfs / sysfs node interactions (testing purposes)", + Hidden: true, + }, + cli.BoolFlag{ + Name: "cpu-profiling", + Usage: "enable cpu-profiling data collection", + Hidden: 
true, + }, + cli.BoolFlag{ + Name: "memory-profiling", + Usage: "enable memory-profiling data collection", + Hidden: true, + }, + } + + // show-version specialization. + cli.VersionPrinter = func(c *cli.Context) { + fmt.Printf("sysbox-fs\n"+ + "\tedition: \t%s\n"+ + "\tversion: \t%s\n"+ + "\tcommit: \t%s\n"+ + "\tbuilt at: \t%s\n"+ + "\tbuilt by: \t%s\n", + edition, c.App.Version, commitId, builtAt, builtBy) + } + + // Nsenter command to allow 'rexec' functionality. + app.Commands = []cli.Command{ + { + Name: "nsenter", + Usage: "Execute action within container namespaces", + Action: func(c *cli.Context) error { + // nsenter errors are passed back to sysbox-fs via a pipe + nsenter.Init() + return nil + }, + }, + } + + // Define 'debug' and 'log' settings. + app.Before = func(ctx *cli.Context) error { + + // Random generator seed + rand.Seed(time.Now().UnixNano()) + + // Create/set the log-file destination. + if path := ctx.GlobalString("log"); path != "" { + f, err := os.OpenFile( + path, + os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, + 0666, + ) + if err != nil { + logrus.Fatalf( + "Error opening log file %v: %v. Exiting ...", + path, err, + ) + return err + } + + logrus.SetOutput(f) + log.SetOutput(f) + } else { + logrus.SetOutput(os.Stderr) + log.SetOutput(os.Stderr) + } + + if logFormat := ctx.GlobalString("log-format"); logFormat == "json" { + logrus.SetFormatter(&logrus.JSONFormatter{ + TimestampFormat: "2006-01-02 15:04:05", + }) + } else { + logrus.SetFormatter(&logrus.TextFormatter{ + TimestampFormat: "2006-01-02 15:04:05", + FullTimestamp: true, + }) + } + + // Set desired log-level. + if logLevel := ctx.GlobalString("log-level"); logLevel != "" { + switch logLevel { + case "debug": + // Following instruction is to have Bazil's fuze-lib logs being + // included into sysbox-fs' log stream. 
+ flag.Set("fuse.debug", "true") + logrus.SetLevel(logrus.DebugLevel) + case "info": + logrus.SetLevel(logrus.InfoLevel) + case "warning": + logrus.SetLevel(logrus.WarnLevel) + case "error": + logrus.SetLevel(logrus.ErrorLevel) + case "fatal": + logrus.SetLevel(logrus.FatalLevel) + default: + logrus.Fatalf( + "log-level option '%v' not recognized. Exiting ...", + logLevel, + ) + } + } else { + // Set 'info' as our default log-level. + logrus.SetLevel(logrus.InfoLevel) + } + + return nil + } + + // sysbox-fs main-loop execution. + app.Action = func(ctx *cli.Context) error { + + logrus.Info("Initiating sysbox-fs ...") + + err := libutils.CheckPidFile("sysbox-fs", sysboxFsPidFile) + if err != nil { + return err + } + + // Print key configuration knobs settings. + if ctx.BoolT("allow-immutable-remounts") { + logrus.Info("Initializing with 'allow-immutable-remounts' enabled") + } else { + logrus.Info("Initializing with 'allow-immutable-remounts' knob disabled (default)") + } + if ctx.Bool("allow-immutable-unmounts") { + logrus.Info("Initializing with 'allow-immutable-unmounts' knob enabled (default)") + } else { + logrus.Info("Initializing with 'allow-immutable-unmounts' knob disabled") + } + if ctx.GlobalString("seccomp-fd-release") == "cont-exit" { + logrus.Info("Seccomp-notify fd release policy set to container exit") + } + logrus.Infof("FUSE dir = %s", ctx.GlobalString("mountpoint")) + + // Construct sysbox-fs services. 
+ var nsenterService = nsenter.NewNSenterService() + var ioService = sysio.NewIOService(domain.IOOsFileService) + var processService = process.NewProcessService() + var handlerService = handler.NewHandlerService() + var fuseServerService = fuse.NewFuseServerService() + var containerStateService = state.NewContainerStateService() + var syscallMonitorService = seccomp.NewSyscallMonitorService() + var ipcService = ipc.NewIpcService() + var mountService = mount.NewMountService() + + // Create the sysbox run dir + err = setupRunDir() + if err != nil { + return fmt.Errorf("failed to setup the sysbox run dir: %v", err) + } + + // Setup sysbox-fs services. + processService.Setup(ioService) + + nsenterService.Setup(processService, nil) + + handlerService.Setup( + handler.DefaultHandlers, + ctx.Bool("ignore-handler-errors"), + containerStateService, + nsenterService, + processService, + ioService, + ) + + if err := fuseServerService.Setup( + ctx.GlobalString("mountpoint"), + containerStateService, + ioService, + handlerService, + ); err != nil { + return err + } + + containerStateService.Setup( + fuseServerService, + processService, + ioService, + mountService, + ) + + mountService.Setup( + containerStateService, + handlerService, + processService, + nsenterService, + ) + + syscallMonitorService.Setup( + nsenterService, + containerStateService, + processService, + mountService, + ctx.BoolT("allow-immutable-remounts"), + ctx.Bool("allow-immutable-unmounts"), + ctx.GlobalString("seccomp-fd-release"), + ) + + ipcService.Setup( + containerStateService, + processService, + ioService, + ctx.GlobalString("mountpoint"), + ) + + // If requested, launch cpu/mem profiling collection. + profile, err := runProfiler(ctx) + if err != nil { + logrus.Fatal(err) + } + + // Launch exit handler (performs proper cleanup of sysbox-fs upon + // receiving termination signals). 
+ var exitChan = make(chan os.Signal, 1) + signal.Notify( + exitChan, + syscall.SIGHUP, + syscall.SIGINT, + syscall.SIGTERM, + syscall.SIGSEGV, + syscall.SIGQUIT) + go exitHandler(exitChan, fuseServerService, profile) + + // TODO: Consider adding sync.Workgroups to ensure that all goroutines + // are done with their in-fly tasks before exit()ing. + + systemd.SdNotify(false, systemd.SdNotifyReady) + + // Create sysbox-fs pid file. + err = libutils.CreatePidFile("sysbox-fs", sysboxFsPidFile) + if err != nil { + return fmt.Errorf("failed to create sysfs.pid file: %s", err) + } + + logrus.Info("Ready ...") + + if err := ipcService.Init(); err != nil { + logrus.Errorf("failed to start sysbox-fs: %v", err) + } + + // Exited main event-loop. Delete pid file. + if err := libutils.DestroyPidFile(sysboxFsPidFile); err != nil { + logrus.Warnf("failed to destroy sysbox-fs pid file: %v", err) + } + logrus.Info("Done.") + + return nil + } + + if err := app.Run(os.Args); err != nil { + logrus.Fatal(err) + } +} diff --git a/sysbox-fs/cmd/sysbox-fs/main_test.go b/sysbox-fs/cmd/sysbox-fs/main_test.go new file mode 100644 index 00000000..8b665b82 --- /dev/null +++ b/sysbox-fs/cmd/sysbox-fs/main_test.go @@ -0,0 +1,32 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package main + +import ( + "io/ioutil" + "testing" + + "github.com/sirupsen/logrus" +) + +func TestMain(m *testing.M) { + + // Disable log generation during UT. 
+ logrus.SetOutput(ioutil.Discard) + + m.Run() +} diff --git a/sysbox-fs/domain/container.go b/sysbox-fs/domain/container.go new file mode 100644 index 00000000..75634b51 --- /dev/null +++ b/sysbox-fs/domain/container.go @@ -0,0 +1,101 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +import ( + "time" + + libpidfd "github.com/nestybox/sysbox-libs/pidfd" +) + +// Container interface. +type ContainerIface interface { + // + // Getters + // + ID() string + InitPid() uint32 + InitPidFd() libpidfd.PidFd + Ctime() time.Time + Data(name string, offset int64, data *[]byte) (int, error) + UID() uint32 + GID() uint32 + UidSize() uint32 + GidSize() uint32 + ProcRoPaths() []string + ProcMaskPaths() []string + InitProc() ProcessIface + ExtractInode(path string) (Inode, error) + IsMountInfoInitialized() bool + InitializeMountInfo() error + IsRootMount(info *MountInfo) (bool, error) + IsRootMountID(id int) (bool, error) + IsImmutableMount(info *MountInfo) (bool, error) + IsImmutableRoMount(info *MountInfo) (bool, error) + IsImmutableMountID(id int) bool + IsImmutableRoMountID(id int) bool + IsImmutableBindMount(info *MountInfo) bool + IsImmutableRoBindMount(info *MountInfo) bool + IsImmutableMountpoint(mp string) bool + IsImmutableRoMountpoint(mp string) bool + IsImmutableOverlapMountpoint(mp string) bool + IsRegistrationCompleted() bool + // + // Setters + // + SetData(name string, offset int64, data 
[]byte) error + SetInitProc(pid, uid, gid uint32) error + SetRegistrationCompleted() + // + // Locks for read-modify-write operations on container data via the Data() + // and SetData() methods. + // + Lock() + Unlock() +} + +// ContainerStateService interface defines the APIs that sysbox-fs components +// must utilize to interact with the sysbox-fs state-storage backend. +type ContainerStateServiceIface interface { + Setup( + fss FuseServerServiceIface, + prs ProcessServiceIface, + ios IOServiceIface, + mts MountServiceIface) + + ContainerCreate( + id string, + pid uint32, + ctime time.Time, + uidFirst uint32, + uidSize uint32, + gidFirst uint32, + gidSize uint32, + procRoPaths []string, + procMaskPaths []string, + service ContainerStateServiceIface) ContainerIface + + ContainerPreRegister(id, netns string) error + ContainerRegister(c ContainerIface) error + ContainerUpdate(c ContainerIface) error + ContainerUnregister(c ContainerIface) error + ContainerLookupById(id string) ContainerIface + FuseServerService() FuseServerServiceIface + ProcessService() ProcessServiceIface + MountService() MountServiceIface + ContainerDBSize() int +} diff --git a/sysbox-fs/domain/fileinfo.go b/sysbox-fs/domain/fileinfo.go new file mode 100644 index 00000000..f517f674 --- /dev/null +++ b/sysbox-fs/domain/fileinfo.go @@ -0,0 +1,83 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package domain + +import ( + "os" + "syscall" + "time" +) + +const ( + MaxUid = 0xFFFF + MaxGid = 0xFFFF +) + +// FileInfo is sysbox-fs' implementation of os.FileInfo interface. A concrete +// type is required during serialization operations when exchanging state between +// sysbox-fs' main and its re-exec instances. +type FileInfo struct { + Fname string + Fsize int64 + Fmode os.FileMode + FmodTime time.Time + FisDir bool + Fsys *syscall.Stat_t +} + +func (c FileInfo) Name() string { + return c.Fname +} + +func (c FileInfo) Size() int64 { + return c.Fsize +} + +func (c FileInfo) Mode() os.FileMode { + return c.Fmode +} + +func (c FileInfo) ModTime() time.Time { + return c.FmodTime +} + +func (c FileInfo) IsDir() bool { + return c.FisDir +} + +func (c FileInfo) Sys() interface{} { + return c.Fsys +} + +// Utility function to eliminate duplicates from FileInfo slice. Notice that +// if duplicated elements are present, the first one is left untouched while +// the subsequent ones are eliminated. +func FileInfoSliceUniquify(s []os.FileInfo) []os.FileInfo { + var result = []os.FileInfo{} + + var keys = make(map[string]bool) + + for _, info := range s { + fname := info.Name() + if _, ok := keys[fname]; !ok { + keys[fname] = true + result = append(result, info) + } + } + + return result +} \ No newline at end of file diff --git a/sysbox-fs/domain/fuse.go b/sysbox-fs/domain/fuse.go new file mode 100644 index 00000000..66abc55d --- /dev/null +++ b/sysbox-fs/domain/fuse.go @@ -0,0 +1,41 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +type FuseServerServiceIface interface { + Setup( + mp string, + css ContainerStateServiceIface, + ios IOServiceIface, + hds HandlerServiceIface) error + + CreateFuseServer(serveCntr, stateCntr ContainerIface) error + DestroyFuseServer(mp string) error + DestroyFuseService() + FuseServerCntrRegComplete(cntr ContainerIface) error +} + +type FuseServerIface interface { + Create() error + Run() error + Destroy() error + MountPoint() string + Unmount() + InitWait() + SetCntrRegComplete() + IsCntrRegCompleted() bool +} diff --git a/sysbox-fs/domain/handler.go b/sysbox-fs/domain/handler.go new file mode 100644 index 00000000..5379d53d --- /dev/null +++ b/sysbox-fs/domain/handler.go @@ -0,0 +1,154 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +import ( + "os" + "sync" +) + +// HandlerBase is a type common to all the handlers. 
+// +// HandlerBase type is used to bundle the different file-system operations that +// can be executed over sysbox-fs' emulated resources. As such, handlers are +// typically associated with a directory path inside of which there is at least +// one resource (file or subdir) that needs to be emulated. +// +// Handlers can be paired with a file too though, but usually they are associated +// with directories to leverage the fact that, within a given directory, there +// are commonalities among the resources being emulated. Hence, this approach +// reduces the amount of duplicated code that would otherwise derive from +// handler sprawling. +// +// The handler resources being emulated are stored within a map indexed by the +// resource name. +type HandlerBase struct { + // Camel-case representation of every handler path. + Name string + + // Abs path of the resource emulated by the handler. + Path string + + // Map of additional resources emulated by the handler (e.g., for handlers + // emulating directories, these would list any subdirs that they emulate). + EmuResourceMap map[string]*EmuResource + + Enabled bool + + // Pointer to the parent handler service. + Service HandlerServiceIface +} + +type EmuResourceType int + +const ( + UnknownEmuResource EmuResourceType = iota + DirEmuResource + FileEmuResource +) + +// EmuResource represents the nodes being emulated by sysbox-fs. +// +// The "mutex" variable is utilized to synchronize access among concurrent i/o +// operations made over the same host resource (e.g. if multiple processes within +// the same sys container or across different sys containers are accessing the +// same sysbox-fs emulated resource). By relying on a per-resource "mutex", and +// not a per-handler one, we are maximizing the level of concurrency that can be +// attained. 
+type EmuResource struct { + Kind EmuResourceType + Mode os.FileMode + Size int64 + Enabled bool + Mutex sync.Mutex +} + +// HandlerRequest represents a request to be processed by a handler +type HandlerRequest struct { + ID uint64 + Name string + Path string + Pid uint32 + Uid uint32 + Gid uint32 + SkipIdRemap bool + Offset int64 + NoCache bool + Data []byte + Container ContainerIface +} + +// HandlerIface is the interface that each handler must implement +type HandlerIface interface { + // FS operations. + Open(node IOnodeIface, req *HandlerRequest) (bool, error) + Lookup(n IOnodeIface, req *HandlerRequest) (os.FileInfo, error) + Read(node IOnodeIface, req *HandlerRequest) (int, error) + Write(node IOnodeIface, req *HandlerRequest) (int, error) + ReadDirAll(node IOnodeIface, req *HandlerRequest) ([]os.FileInfo, error) + ReadLink(node IOnodeIface, req *HandlerRequest) (string, error) + + // getters/setters. + GetName() string + GetPath() string + GetEnabled() bool + SetEnabled(b bool) + GetService() HandlerServiceIface + SetService(hs HandlerServiceIface) + GetResourcesList() []string + GetResourceMutex(node IOnodeIface) *sync.Mutex +} + +type PassthroughHandlerIface interface { + HandlerIface + OpenWithNS(node IOnodeIface, req *HandlerRequest, namespaces []NStype) (bool, error) + ReadWithNS(node IOnodeIface, req *HandlerRequest, namespaces []NStype) (int, error) + WriteWithNS(node IOnodeIface, req *HandlerRequest, namespaces []NStype) (int, error) +} + +type HandlerServiceIface interface { + Setup( + hdlrs []HandlerIface, + ignoreErrors bool, + css ContainerStateServiceIface, + nss NSenterServiceIface, + prs ProcessServiceIface, + ios IOServiceIface) + + RegisterHandler(h HandlerIface) error + UnregisterHandler(h HandlerIface) error + LookupHandler(i IOnodeIface) (HandlerIface, bool) + FindHandler(s string) (HandlerIface, bool) + EnableHandler(path string) error + DisableHandler(path string) error + + // getters/setters + HandlersResourcesList() []string + 
GetPassThroughHandler() PassthroughHandlerIface
+	StateService() ContainerStateServiceIface
+	SetStateService(css ContainerStateServiceIface)
+	ProcessService() ProcessServiceIface
+	NSenterService() NSenterServiceIface
+	IOService() IOServiceIface
+	IgnoreErrors() bool
+
+	// Auxiliary methods.
+	HostUserNsInode() Inode
+	FindUserNsInode(pid uint32) (Inode, error)
+	HostUuid() string
+	FindHostUuid() (string, error)
+}
diff --git a/sysbox-fs/domain/ionode.go b/sysbox-fs/domain/ionode.go
new file mode 100644
index 00000000..11c42167
--- /dev/null
+++ b/sysbox-fs/domain/ionode.go
@@ -0,0 +1,81 @@
+//
+// Copyright 2019-2020 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package domain
+
+import "os"
+
+type Inode = uint64 // 0 = invalid inode
+
+//
+// ioNode interface serves as an abstract-class to represent all I/O resources
+// with whom sysbox-fs operates. All I/O transactions will be carried out
+// through the methods exposed by this interface and its derived sub-classes.
+// There are two specializations of this interface at the moment:
+//
+// 1. ioNodeFile: Basically, a wrapper over os.File type to allow interactions
+// with the host FS. To be utilized in production scenarios.
+//
+// 2. iMemFile: Utilized for unit testing.
+// + +type IOServiceType = int + +const ( + Unknown IOServiceType = iota + IOOsFileService // production / regular purposes + IOMemFileService // unit-testing purposes +) + +type IOServiceIface interface { + NewIOnode(n string, p string, attr os.FileMode) IOnodeIface + RemoveAllIOnodes() error + GetServiceType() IOServiceType +} + +type IOnodeIface interface { + Open() error + Read(p []byte) (n int, err error) + Write(p []byte) (n int, err error) + Close() error + Seek(offset int64, whence int) (int64, error) + ReadAt(p []byte, off int64) (n int, err error) + ReadDirAll() ([]os.FileInfo, error) + ReadFile() ([]byte, error) + ReadLine() (string, error) + ReadLink() (string, error) + WriteAt(p []byte, off int64) (n int, err error) + WriteFile(p []byte) error + Mkdir() error + MkdirAll() error + Stat() (os.FileInfo, error) + Lstat() (os.FileInfo, error) + SeekReset() (int64, error) + Remove() error + RemoveAll() error + // + // Required getters/setters. + // + Name() string + Path() string + OpenFlags() int + OpenMode() os.FileMode + GetNsInode() (Inode, error) + SetName(s string) + SetPath(s string) + SetOpenFlags(flags int) + SetOpenMode(mode os.FileMode) +} diff --git a/sysbox-fs/domain/ipc.go b/sysbox-fs/domain/ipc.go new file mode 100644 index 00000000..86c131f6 --- /dev/null +++ b/sysbox-fs/domain/ipc.go @@ -0,0 +1,27 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package domain + +type IpcServiceIface interface { + Setup( + css ContainerStateServiceIface, + prs ProcessServiceIface, + ios IOServiceIface, + fuseMp string) + + Init() error +} diff --git a/sysbox-fs/domain/mount.go b/sysbox-fs/domain/mount.go new file mode 100644 index 00000000..1b91c297 --- /dev/null +++ b/sysbox-fs/domain/mount.go @@ -0,0 +1,145 @@ +// +// Copyright 2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +// Service interface to expose mount-service's components. +type MountServiceIface interface { + Setup( + css ContainerStateServiceIface, + hds HandlerServiceIface, + prs ProcessServiceIface, + nss NSenterServiceIface) + + NewMountInfoParser( + c ContainerIface, + process ProcessIface, + launchParser bool, + fetchOptions bool, + fetchInodes bool) (MountInfoParserIface, error) + + NewMountHelper() MountHelperIface + MountHelper() MountHelperIface +} + +// Interface to define the mountInfoParser api. 
+type MountInfoParserIface interface {
+	GetProcessID() uint32
+	GetInfo(mountpoint string) *MountInfo
+	GetParentMount(info *MountInfo) *MountInfo
+	LookupByMountID(id int) *MountInfo
+	LookupByMountpoint(mp string) *MountInfo
+	IsSysboxfsBaseMount(mountpoint string) bool
+	IsSysboxfsBaseRoMount(mountpoint string) bool
+	IsSysboxfsSubmount(mountpoint string) bool
+	IsSysboxfsRoSubmount(mountpoint string) bool
+	IsSysboxfsMaskedSubmount(mountpoint string) bool
+	GetSysboxfsSubMounts(basemount string) []string
+	HasNonSysboxfsSubmount(basemount string) bool
+	IsRecursiveBindMount(info *MountInfo) bool
+	IsSelfMount(info *MountInfo) bool
+	IsOverlapMount(info *MountInfo) bool
+	IsRoMount(info *MountInfo) bool
+	IsBindMount(info *MountInfo) bool
+	IsRoBindMount(info *MountInfo) bool
+	IsRootMount(info *MountInfo) (bool, error)
+	IsCloneMount(info *MountInfo, readonly bool) (bool, error)
+	ExtractMountInfo() ([]byte, error)
+	ExtractInode(mp string) (Inode, error)
+	ExtractAncestorInodes(info *MountInfo) error
+}
+
+// Interface to define the mountHelper api.
+type MountHelperIface interface {
+	IsNewMount(flags uint64) bool
+	IsRemount(flags uint64) bool
+	IsBind(flags uint64) bool
+	IsMove(flags uint64) bool
+	HasPropagationFlag(flags uint64) bool
+	IsReadOnlyMount(flags uint64) bool
+	StringToFlags(s map[string]string) uint64
+	FilterFsFlags(fsOpts map[string]string) string
+	ProcMounts() []string
+	SysMounts() []string
+}
+
+// mountInfo reveals information about a particular mounted filesystem. This
+// struct is populated from the content in the /proc/<pid>/mountinfo file. The
+// fields described in each entry of /proc/self/mountinfo are described here:
+// http://man7.org/linux/man-pages/man5/proc.5.html
+//
+// Note: Definition borrowed from OCI runc's mount package ...
+// +// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue +// (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) +// +// (1) mount ID: unique identifier of the mount (may be reused after umount) +// (2) parent ID: ID of parent (or of self for the top of the mount tree) +// (3) major:minor: value of st_dev for files on filesystem +// (4) root: root of the mount within the filesystem +// (5) mount point: mount point relative to the process's root +// (6) mount options: per mount options +// (7) optional fields: zero or more fields of the form "tag[:value]" +// (8) separator: marks the end of the optional fields +// (9) filesystem type: name of filesystem of the form "type[.subtype]" +// (10) mount source: filesystem specific information or "none" +// (11) super options: per super block options*/ +type MountInfo struct { + // Mount identifier. + MountID int `json:"mountid"` + + // Parent-mount identifier. + ParentID int `json:"parentid"` + + // 'st_dev' value for files in FS. + MajorMinorVer string `json:"majorminorver"` + + // File-system type. + FsType string `json:"fstype"` + + // File-system specific information or "none". + Source string `json:"source"` + + // Pathname of root of the mount within the FS. + Root string `json:"root"` + + // Pathname of the mount point relative to the root. + MountPoint string `json:"mountpoint"` + + // Mount-specific options. + Options map[string]string `json:"options"` + + // Optional-fields. + OptionalFields map[string]string `json:"optionalfields"` + + // Superblock options. + VfsOptions map[string]string `json:"vfsoptions"` + + // FS inode corresponding to this mountpoint. + MpInode Inode `json:"mpinode"` + + // Backpointer to mountInfoParser. + Mip MountInfoParserIface `json:"-"` +} + +// Mount structure utilized to exchange mount-state across sysbox-fs components. 
+type Mount struct { + Source string `json:"source"` + Target string `json:"target"` + FsType string `json:"fstype"` + Flags uint64 `json:"flags"` + Data string `json:"data"` +} diff --git a/sysbox-fs/domain/nsenter.go b/sysbox-fs/domain/nsenter.go new file mode 100644 index 00000000..eff62813 --- /dev/null +++ b/sysbox-fs/domain/nsenter.go @@ -0,0 +1,287 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +// Aliases to leverage strong-typing. +type NStype = string +type NSenterMsgType = string + +// NStype defines all namespace types +const ( + NStypeCgroup NStype = "cgroup" + NStypeIpc NStype = "ipc" + NStypeNet NStype = "net" + NStypePid NStype = "pid" + NStypeUts NStype = "uts" + NStypeUser NStype = "user" + NStypeMount NStype = "mnt" +) + +// Security note: nsenter processes spawned by sysbox-fs that enter the pid +// namespace must also enter the container's mount ns, as otherwise the nsenter +// process will inherit sysbox-fs host mounts, resulting in a process inside the +// container that exposes info about those mounts. If needed, the nsenter +// process can always unshare the mount ns inside the container so that it can +// perform mounts without affecting the container processes. 
+ +var AllNSs = []NStype{ + string(NStypeUser), + string(NStypePid), + string(NStypeNet), + string(NStypeMount), + string(NStypeIpc), + string(NStypeCgroup), + string(NStypeUts), +} + +var AllNSsButUser = []NStype{ + string(NStypeMount), + string(NStypePid), + string(NStypeNet), + string(NStypeIpc), + string(NStypeCgroup), + string(NStypeUts), +} + +// NSenterEvent types. Define all possible messages that can be handled +// by nsenterEvent class. +const ( + LookupRequest NSenterMsgType = "lookupRequest" + LookupResponse NSenterMsgType = "lookupResponse" + OpenFileRequest NSenterMsgType = "openFileRequest" + OpenFileResponse NSenterMsgType = "openFileResponse" + ReadFileRequest NSenterMsgType = "readFileRequest" + ReadFileResponse NSenterMsgType = "readFileResponse" + WriteFileRequest NSenterMsgType = "writeFileRequest" + WriteFileResponse NSenterMsgType = "writeFileResponse" + ReadDirRequest NSenterMsgType = "readDirRequest" + ReadDirResponse NSenterMsgType = "readDirResponse" + ReadLinkRequest NSenterMsgType = "readLinkRequest" + ReadLinkResponse NSenterMsgType = "readLinkResponse" + MountSyscallRequest NSenterMsgType = "mountSyscallRequest" + MountSyscallResponse NSenterMsgType = "mountSyscallResponse" + UmountSyscallRequest NSenterMsgType = "umountSyscallRequest" + UmountSyscallResponse NSenterMsgType = "umountSyscallResponse" + ChownSyscallRequest NSenterMsgType = "chownSyscallRequest" + ChownSyscallResponse NSenterMsgType = "chownSyscallResponse" + MountInfoRequest NSenterMsgType = "mountInfoRequest" + MountInfoResponse NSenterMsgType = "mountInfoResponse" + MountInodeRequest NSenterMsgType = "mountInodeRequest" + MountInodeResponse NSenterMsgType = "mountInodeResponse" + SleepRequest NSenterMsgType = "sleepRequest" + SleepResponse NSenterMsgType = "sleepResponse" + SetxattrSyscallRequest NSenterMsgType = "setxattrSyscallRequest" + SetxattrSyscallResponse NSenterMsgType = "setxattrSyscallResponse" + GetxattrSyscallRequest NSenterMsgType = 
"getxattrSyscallRequest" + GetxattrSyscallResponse NSenterMsgType = "getxattrSyscallResponse" + RemovexattrSyscallRequest NSenterMsgType = "RemovexattrSyscallRequest" + RemovexattrSyscallResponse NSenterMsgType = "RemovexattrSyscallResponse" + ListxattrSyscallRequest NSenterMsgType = "ListxattrSyscallRequest" + ListxattrSyscallResponse NSenterMsgType = "ListxattrSyscallResponse" + UidInfoRequest NSenterMsgType = "uidInfoRequest" + UidInfoResponse NSenterMsgType = "uidInfoResponse" + GidInfoRequest NSenterMsgType = "gidInfoRequest" + GidInfoResponse NSenterMsgType = "gidInfoResponse" + ErrorResponse NSenterMsgType = "errorResponse" +) + +// NSenterService interface serves as a wrapper construct to provide a +// communication channel between sysbox-fs 'master' and sysbox-fs 'child' +// entities. See more details further below. +type NSenterServiceIface interface { + NewEvent( + pid uint32, + ns *[]NStype, + cloneFlags uint32, + req *NSenterMessage, + res *NSenterMessage, + async bool) NSenterEventIface + + Setup(prs ProcessServiceIface, mts MountServiceIface) + SendRequestEvent(e NSenterEventIface) error + ReceiveResponseEvent(e NSenterEventIface) *NSenterMessage + TerminateRequestEvent(e NSenterEventIface) error + GetEventProcessID(e NSenterEventIface) uint32 +} + +// NSenterEvent struct serves as a transport abstraction (envelope) to carry +// all the potential messages that can be exchanged between sysbox-fs master +// instance and secondary (forked) ones. These sysbox-fs' auxiliary instances +// are utilized to perform actions over namespaced resources, and as such, +// cannot be executed by sysbox-fs' main instance. +// +// Every bidirectional transaction is represented by an event structure +// (nsenterEvent), which holds both 'request' and 'response' messages, as well +// as the context necessary to complete any action demanding inter-namespace +// message exchanges. 
+type NSenterEventIface interface { + SendRequest() error + TerminateRequest() error + ReceiveResponse() *NSenterMessage + SetRequestMsg(m *NSenterMessage) + GetRequestMsg() *NSenterMessage + SetResponseMsg(m *NSenterMessage) + GetResponseMsg() *NSenterMessage + GetProcessID() uint32 +} + +// NSenterMessage struct defines the layout of the messages being exchanged +// between sysbox-fs 'main' and 'forked' ones. +type NSenterMessage struct { + // Message type being exchanged. + Type NSenterMsgType `json:"message"` + + // Message payload. + Payload interface{} `json:"payload"` +} + +type NSenterMsgHeader struct { + Pid uint32 `json:"pid"` + Uid uint32 `json:"uid"` + Gid uint32 `json:"gid"` + Root string `json:"root"` + Cwd string `json:"cwd"` + Capabilities [2]uint32 `json:"capabilities"` +} + +type LookupPayload struct { + Entry string `json:"entry"` + MountSysfs bool `json:mountSysfs` + MountProcfs bool `json:mountProcfs` +} + +type OpenFilePayload struct { + File string `json:"file"` + Flags string `json:"flags"` + Mode string `json:"mode"` + MountSysfs bool `json:mountSysfs` + MountProcfs bool `json:mountProcfs` +} + +type ReadFilePayload struct { + File string `json:"file"` + Offset int64 `json:"offset"` + Len int `json:"len"` + MountSysfs bool `json:mountSysfs` + MountProcfs bool `json:mountProcfs` +} + +type WriteFilePayload struct { + File string `json:"file"` + Offset int64 `json:"offset"` + Data []byte `json:"data"` + MountSysfs bool `json:mountSysfs` + MountProcfs bool `json:mountProcfs` +} + +type ReadDirPayload struct { + Dir string `json:"dir"` + MountSysfs bool `json:mountSysfs` + MountProcfs bool `json:mountProcfs` +} + +type ReadLinkPayload struct { + Link string `json:"link"` + MountSysfs bool `json:mountSysfs` + MountProcfs bool `json:mountProcfs` +} + +type MountSyscallPayload struct { + Header NSenterMsgHeader + Mount +} + +type UmountSyscallPayload struct { + Header NSenterMsgHeader + Mount +} + +type ChownSyscallPayload struct { + Target string 
`json:"target"` + TargetUid int `json:"uid"` + TargetGid int `json:"gid"` +} + +type SetxattrSyscallPayload struct { + Syscall string `json:"syscall"` + Path string `json:"path"` + Name string `json:"name"` + Val []byte `json:"val"` + Flags int `json:"flags"` +} + +type GetxattrSyscallPayload struct { + Header NSenterMsgHeader + Syscall string `json:"syscall"` + Path string `json:"path"` + Name string `json:"name"` + Size uint64 `json:"size"` +} + +type GetxattrRespPayload struct { + Val []byte `json:"val"` + Size int `json:"size"` +} + +type RemovexattrSyscallPayload struct { + Syscall string `json:"syscall"` + Path string `json:"path"` + Name string `json:"name"` +} + +type ListxattrSyscallPayload struct { + Header NSenterMsgHeader + Syscall string `json:"syscall"` + Path string `json:"path"` + Size uint64 `json:"size"` +} + +type ListxattrRespPayload struct { + Val []byte `json:"val"` + Size int `json:"size"` +} + +type MountInfoRespPayload struct { + Data []byte `json:"data"` +} + +type MountInodeReqPayload struct { + Mountpoints []string `json:"mountpoints"` +} +type MountInodeRespPayload struct { + MpInodes []Inode `json:"mpinodes"` +} + +type SleepReqPayload struct { + Ival string `json:"attr"` +} + +type UidInfoReqPayload struct { + User string `json:"user"` +} + +type UidInfoRespPayload struct { + Uid string `json:"uid"` +} + +type GidInfoReqPayload struct { + Group string `json:"group"` +} + +type GidInfoRespPayload struct { + Gid string `json:"gid"` +} diff --git a/sysbox-fs/domain/process.go b/sysbox-fs/domain/process.go new file mode 100644 index 00000000..3eb86c03 --- /dev/null +++ b/sysbox-fs/domain/process.go @@ -0,0 +1,85 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +import ( + "reflect" + + cap "github.com/nestybox/sysbox-libs/capability" + "github.com/nestybox/sysbox-runc/libcontainer/user" +) + +const ( + SymlinkMax = 40 +) + +type AccessMode uint32 + +const ( + R_OK AccessMode = 0x4 // read ok + W_OK AccessMode = 0x2 // write ok + X_OK AccessMode = 0x1 // execute ok +) + +type ProcessIface interface { + Pid() uint32 + Uid() uint32 + Gid() uint32 + Cwd() string + Root() string + RootInode() uint64 + SGid() []uint32 + UidMap() ([]user.IDMap, error) + GidMap() ([]user.IDMap, error) + IsCapabilitySet(cap.CapType, cap.Cap) bool + IsSysAdminCapabilitySet() bool + NsInodes() (map[string]Inode, error) + MountNsInode() (Inode, error) + NetNsInode() (Inode, error) + UserNsInode() (Inode, error) + UserNsInodeParent() (Inode, error) + UsernsRootUidGid() (uint32, uint32, error) + CreateNsInodes(Inode) error + PathAccess(path string, accessFlags AccessMode, followSymlink bool) (string, error) + ResolveProcSelf(string) (string, error) + GetEffCaps() [2]uint32 + SetEffCaps(caps [2]uint32) + GetFd(int32) (string, error) + AdjustPersonality( + uid uint32, + gid uint32, + root string, + cwd string, + caps [2]uint32) error +} + +type ProcessServiceIface interface { + Setup(ios IOServiceIface) + ProcessCreate(pid uint32, uid uint32, gid uint32) ProcessIface +} + +// ProcessNsMatch returns true if the given processes are in the same namespaces. 
+func ProcessNsMatch(p1, p2 ProcessIface) bool { + p1Inodes, p1Err := p1.NsInodes() + p2Inodes, p2Err := p2.NsInodes() + + if p1Err != nil || p2Err != nil { + return false + } + + return reflect.DeepEqual(p1Inodes, p2Inodes) +} diff --git a/sysbox-fs/domain/seccomp.go b/sysbox-fs/domain/seccomp.go new file mode 100644 index 00000000..e8eb2e4d --- /dev/null +++ b/sysbox-fs/domain/seccomp.go @@ -0,0 +1,27 @@ +// +// Copyright 2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +type SyscallMonitorServiceIface interface { + Setup( + nss NSenterServiceIface, + css ContainerStateServiceIface, + prs ProcessServiceIface, + mts MountServiceIface, + allowImmutableRemounts bool, + allowImmutableUnmounts bool) +} diff --git a/sysbox-fs/domain/utils.go b/sysbox-fs/domain/utils.go new file mode 100644 index 00000000..50b8d90d --- /dev/null +++ b/sysbox-fs/domain/utils.go @@ -0,0 +1,48 @@ +// +// Copyright 2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +package domain + +import ( + "os" + "syscall" +) + +// FileExists reports whether the named file or directory exists. +func FileExists(name string) bool { + if _, err := os.Stat(name); err != nil { + if os.IsNotExist(err) { + return false + } + } + return true +} + +// FileInode obtains the inode associated with any given file-system resource. +func FileInode(name string) Inode { + + fi, err := os.Stat(name) + if err != nil { + return 0 + } + + st, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return 0 + } + + return st.Ino +} diff --git a/sysbox-fs/fuse/dir.go b/sysbox-fs/fuse/dir.go new file mode 100644 index 00000000..8a8d971a --- /dev/null +++ b/sysbox-fs/fuse/dir.go @@ -0,0 +1,375 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package fuse + +import ( + "context" + "fmt" + "io" + "os" + "path/filepath" + "time" + + "github.com/nestybox/sysbox-fs/domain" + + "bazil.org/fuse" + "bazil.org/fuse/fs" + "github.com/sirupsen/logrus" +) + +// Default dentry-cache-timeout interval: This is the maximum +// amount of time that VFS will hold on to dentry elements before starting +// to forward lookup() operations to FUSE server. We want to set this to +// infinite ideally; we set it to the max allowed value. 
+var DentryCacheTimeout int64 = 0x7fffffffffffffff + +// Attribute's cache-timeout: This is the maximum amount of time that +// kernel will hold attributes associated to any given file/dir. Refer +// to man fuse(4) for details. +var AttribCacheTimeout int64 = 0x7fffffffffffffff + +// Dir struct serves as a FUSE-friendly abstraction to represent directories +// present in the host FS. +type Dir struct { + // + // Underlying File struct representing each directory. + // + File + + // + // TODO: Think about the need to define virtual-folders structs and its + // associated logic. If there's no such a need, and there's no + // differentiated logic between Dir and File structs, then consider the + // option of consolidating all associated logic within a single + // abstraction. + // +} + +// NewDir method serves as Dir constructor. +func NewDir(req *domain.HandlerRequest, attr *fuse.Attr, srv *fuseServer) *Dir { + + newDir := &Dir{ + File: *NewFile(req, attr, srv), + } + + return newDir +} + +// Lookup FS operation. +func (d *Dir) Lookup( + ctx context.Context, + req *fuse.LookupRequest, + resp *fuse.LookupResponse) (fs.Node, error) { + + logrus.Debugf("Requested Lookup() operation for entry %v (req ID=%#x)", + req.Name, uint64(req.ID)) + + path := filepath.Join(d.path, req.Name) + + // nodeDB caches the attributes associated with each file. This way, we perform the + // lookup of a given procfs/sysfs dir/file only once, improving performance. This works + // because: + // * there's a dedicated nodeDB (fuseServer) per sys-container + // * all attributes of procfs/sysfs dirs/files are static (e.g., permissions never + // change, and uid/gid values match those of the root user in the sys-container's + // user-ns as long as user-ns-nesting continue to be unsupported). + d.server.RLock() + node, ok := d.server.nodeDB[path] + if ok { + d.server.RUnlock() + return *node, nil + } + d.server.RUnlock() + + // Ensure operation is generated from within a registered sys container. 
+ if d.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return nil, fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + // Upon arrival of lookup() request we must construct a temporary ionode + // that reflects the path of the element that needs to be looked up. + ionode := d.server.service.ios.NewIOnode(req.Name, path, 0) + + // Lookup the associated handler within handler-DB. + handler, ok := d.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("No supported handler for %v resource", d.path) + return nil, fmt.Errorf("No supported handler for %v resource", d.path) + } + + handlerReq := &domain.HandlerRequest{ + ID: uint64(req.ID), + Name: req.Name, + Path: path, + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Container: d.server.container, + } + + // Handler execution. + info, err := handler.Lookup(ionode, handlerReq) + if err != nil { + return nil, fuse.ENOENT + } + + // Convert os.FileInfo attributes to fuseAttr format. + fuseAttrs := convertFileInfoToFuse(info) + + // Identify the root uid & gid in the requester's user-ns. + prs := d.server.service.hds.ProcessService() + process := prs.ProcessCreate(req.Pid, req.Uid, req.Gid) + + rootUid, rootGid, err := process.UsernsRootUidGid() + if err != nil { + return nil, err + } + + // Override the uid & gid attributes with the root uid & gid in the requester's + // user-ns if, and only if, these ones have not been explicitly banned from + // being remapped. + if !handlerReq.SkipIdRemap { + fuseAttrs.Uid = rootUid + fuseAttrs.Gid = rootGid + } + + var newNode fs.Node + + // Create a new file/dir entry associated to the received os.FileInfo. 
+ if info.IsDir() { + fuseAttrs.Mode |= os.ModeDir + newNode = NewDir(handlerReq, &fuseAttrs, d.File.server) + } else if info.Mode()&os.ModeSymlink != 0 { + fuseAttrs.Mode |= os.ModeSymlink + newNode = NewFile(handlerReq, &fuseAttrs, d.File.server) + } else { + newNode = NewFile(handlerReq, &fuseAttrs, d.File.server) + } + + // Insert new fs node into nodeDB. + d.server.Lock() + d.server.nodeDB[path] = &newNode + d.server.Unlock() + + // Adjust response to carry the largest dentry-cache-timeout value + // possible to reduce lookups() to the minimum. + resp.EntryValid = time.Duration(DentryCacheTimeout) + + return newNode, nil +} + +// Open FS operation. +func (d *Dir) Open( + ctx context.Context, + req *fuse.OpenRequest, + resp *fuse.OpenResponse) (fs.Handle, error) { + + // Ensure operation is generated from within a registered sys container. + if d.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return nil, fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + _, err := d.File.Open(ctx, req, resp) + if err != nil { + return nil, err + } + + return d, nil +} + +// Create FS operation. +func (d *Dir) Create( + ctx context.Context, + req *fuse.CreateRequest, + resp *fuse.CreateResponse) (fs.Node, fs.Handle, error) { + + logrus.Debugf("Requested Create() operation for entry %v (req ID=%#x)", req.Name, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. + if d.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return nil, nil, fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + path := filepath.Join(d.path, req.Name) + + // New ionode reflecting the path of the element to be created. 
+ ionode := d.server.service.ios.NewIOnode(req.Name, path, 0) + ionode.SetOpenFlags(int(req.Flags)) + ionode.SetOpenMode(req.Mode) + + // Lookup the associated handler within handler-DB. + handler, ok := d.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("No supported handler for %v resource", path) + return nil, nil, fmt.Errorf("No supported handler for %v resource", path) + } + + handlerReq := &domain.HandlerRequest{ + ID: uint64(req.ID), + Name: req.Name, + Path: path, + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Container: d.server.container, + } + + // Handler execution. 'Open' handler will create new element if requesting + // process has the proper credentials / capabilities. + nonSeekable, err := handler.Open(ionode, handlerReq) + if err != nil && err != io.EOF { + logrus.Debugf("Open() error: %v", err) + return nil, nil, err + } + + resp.Flags |= fuse.OpenDirectIO + if nonSeekable { + resp.Flags |= fuse.OpenNonSeekable + } + + // To satisfy Bazil FUSE lib we are expected to return a lookup-response + // and an open-response, let's start with the lookup() one. + info, err := handler.Lookup(ionode, handlerReq) + if err != nil { + return nil, nil, fuse.ENOENT + } + + // Extract received file attributes. + fuseAttrs := convertFileInfoToFuse(info) + + // Adjust response to carry the proper dentry-cache-timeout value. + resp.EntryValid = time.Duration(DentryCacheTimeout) + + var newNode fs.Node + newNode = NewFile(handlerReq, &fuseAttrs, d.File.server) + + // Insert new fs node into nodeDB. + d.server.Lock() + d.server.nodeDB[path] = &newNode + d.server.Unlock() + + return newNode, newNode, nil +} + +// ReadDirAll FS operation. +func (d *Dir) ReadDirAll(ctx context.Context, req *fuse.ReadRequest) ([]fuse.Dirent, error) { + + var children []fuse.Dirent + + logrus.Debugf("Requested ReadDirAll() on directory %v (req ID=%#v)", d.path, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. 
+ if d.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return nil, fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + // New ionode reflecting the path of the element to be created. + ionode := d.server.service.ios.NewIOnode(d.name, d.path, 0) + ionode.SetOpenFlags(int(req.Flags)) + + // Lookup the associated handler within handler-DB. + handler, ok := d.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("No supported handler for %v resource", d.path) + return nil, fmt.Errorf("No supported handler for %v resource", d.path) + } + + handlerReq := &domain.HandlerRequest{ + ID: uint64(req.ID), + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Container: d.server.container, + } + + // Handler execution. + files, err := handler.ReadDirAll(ionode, handlerReq) + if err != nil { + logrus.Debugf("ReadDirAll() error: %v", err) + return nil, fuse.ENOENT + } + + for _, node := range files { + // + // For ReadDirAll on the sysbox-fs root dir ("/"), we only act + // on the subdirs emulated by sysbox-fs (e.g., /proc, /sys). + // + if d.path == "/" { + if node.Name() != "sys" && node.Name() != "proc" && + node.Name() != "testing" { + continue + } + } + + elem := fuse.Dirent{Name: node.Name()} + + if node.IsDir() { + elem.Type = fuse.DT_Dir + } else if node.Mode().IsRegular() { + elem.Type = fuse.DT_File + } else if node.Mode()&os.ModeSymlink != 0 { + elem.Type = fuse.DT_Link + } + + children = append(children, elem) + } + + return children, nil +} + +// Mkdir FS operation. +func (d *Dir) Mkdir(ctx context.Context, req *fuse.MkdirRequest) (fs.Node, error) { + + logrus.Debugf("Requested Mkdir() on directory %v (Req ID=%#v)", req.Name, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. 
+ if d.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return nil, fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + path := filepath.Join(d.path, req.Name) + + handlerReq := &domain.HandlerRequest{ + Name: req.Name, + Path: path, + } + + newDir := NewDir(handlerReq, &fuse.Attr{}, d.File.server) + + return newDir, nil +} + +// Forget FS operation. +func (d *Dir) Forget() { + + d.File.Forget() +} diff --git a/sysbox-fs/fuse/error.go b/sysbox-fs/fuse/error.go new file mode 100644 index 00000000..ad204487 --- /dev/null +++ b/sysbox-fs/fuse/error.go @@ -0,0 +1,91 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package fuse + +import ( + "encoding/json" + "os" + "reflect" + "syscall" + + "bazil.org/fuse" +) + +// +// IOerror's purpose is to encapsulate errors to be delivered to FUSE-Bazil +// library, which imposes certain demands on the error types that can be +// handled (i.e. it must satisfy 'errorNumber' interface). +// +// As part of this 'error' implementation, we are also providing an encoding +// specialization method to the (un)marshalling routines involved in 'nsenter' +// processing events. 
Note that without this specialization, we wouldn't be +// able to encode generic 'error' interface types; which is precisely the +// reason that the 'RcvError' member below is not being exposed to JSON +// marshalling logic. +// +type IOerror struct { + RcvError error `json:"-"` + Type string `json:"type"` + Code syscall.Errno `json:"code"` + Message string `json:"message"` +} + +func (e IOerror) Error() string { + return e.Message +} + +// Method requested by fuse.ErrorNumber interface. By implementing this +// interface, we are allowed to return IOerrors back to our FUSE-lib +// modules without making any modification to Bazil-FUSE code. +func (e IOerror) Errno() fuse.Errno { + return fuse.Errno(e.Code) +} + +// MarshallJSON's interface specialization to allow a customized encoding +// of IOerror struct. +func (e *IOerror) MarshalJSON() ([]byte, error) { + + err := e.RcvError + if err == nil { + return nil, nil + } + + var errcode syscall.Errno + + // Type assertion is needed here to extract the error code corresponding + // to the different error flavors that may be generated during I/O ops. + switch v := err.(type) { + case *os.PathError: + errcode = v.Err.(syscall.Errno) + + case *os.SyscallError: + errcode = v.Err.(syscall.Errno) + + case syscall.Errno: + errcode = v + + default: + errcode = syscall.EIO + } + + // Finally, let's populate the fields of NSenterError struct. + e.Type = reflect.TypeOf(err).String() + e.Code = errcode + e.Message = err.Error() + + return json.Marshal(*e) +} diff --git a/sysbox-fs/fuse/file.go b/sysbox-fs/fuse/file.go new file mode 100644 index 00000000..41d93816 --- /dev/null +++ b/sysbox-fs/fuse/file.go @@ -0,0 +1,440 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package fuse + +import ( + "context" + "fmt" + "io" + "os" + "syscall" + "time" + + "bazil.org/fuse" + "bazil.org/fuse/fs" + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +type File struct { + // File name. + name string + + // File absolute-path + name. + path string + + // File attributes. + attr *fuse.Attr + + // Skip remapping uid/gid values. + skipIdRemap bool + + // Pointer to parent fuseService hosting this file/dir. + server *fuseServer +} + +// NewFile method serves as File constructor. +func NewFile(req *domain.HandlerRequest, attr *fuse.Attr, srv *fuseServer) *File { + + newFile := &File{ + name: req.Name, + path: req.Path, + attr: attr, + skipIdRemap: req.SkipIdRemap, + server: srv, + } + + return newFile +} + +// Attr FS operation. +func (f *File) Attr(ctx context.Context, a *fuse.Attr) error { + + logrus.Debugf("Requested Attr() operation for entry %v", f.path) + + // Simply return the attributes that were previously collected during the + // lookup() execution. + *a = *f.attr + + // Override the uid & gid attributes with the user-ns' root uid & gid of the + // sys container under which the request is received. In the future we should + // return the requester's user-ns root uid & gid instead, which could differ + // from the sys container's one if request is originated from an L2 container. + // Also, this will help us to support "unshare -U -m --mount-proc" inside a + // sys container. 
+ // + // Notice, that in certain cases we may want to skip this uid/gid remapping + // process for certain nodes if its associated handler requests so. + if a.Uid == 0 && !f.skipIdRemap { + a.Uid = f.server.ContainerUID() + } + if a.Gid == 0 && !f.skipIdRemap { + a.Gid = f.server.ContainerGID() + } + + // As per man fuse(4), here we set the attribute's cache-duration to the + // largest possible value to ensure getattr()s are only received once per + // node. Notice that this behavior can be only enforced once the container + // is fully initialized as we don't want interim node attrs (i.e., during + // registration the container's uid/gid attrs are temporarily absent) to + // be permanently recorded in the FUSE nodes DB. By setting this value to + // zero during container initialization, we are slowing this process down + // (around 1/3rd extra file-ops), but that's the price to pay to be able + // to offer a consistent experience: users will always see the proper + // node attrs, regardless of the timing of the incoming file-ops. + if !f.server.IsCntrRegCompleted() { + a.Valid = time.Duration(0) + } else { + a.Valid = time.Duration(AttribCacheTimeout) + } + + logrus.Debugf("Attr() operation for entry %v: %+v", f.path, *a) + + return nil +} + +// Open FS operation. +func (f *File) Open( + ctx context.Context, + req *fuse.OpenRequest, + resp *fuse.OpenResponse) (fs.Handle, error) { + + logrus.Debugf("Requested Open() operation for entry %v (Req ID=%#v)", + f.path, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. + if f.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return nil, fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + ionode := f.server.service.ios.NewIOnode(f.name, f.path, f.attr.Mode) + ionode.SetOpenFlags(int(req.Flags)) + + // Lookup the associated handler within handler-DB. 
+ handler, ok := f.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("No supported handler for %v resource", f.path) + return nil, fmt.Errorf("No supported handler for %v resource", f.path) + } + + handlerReq := &domain.HandlerRequest{ + ID: uint64(req.ID), + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Container: f.server.container, + } + + // Handler execution. + nonSeekable, err := handler.Open(ionode, handlerReq) + if err != nil && err != io.EOF { + logrus.Debugf("Open() error: %v", err) + return nil, err + } + + // + // Due to the nature of procfs and sysfs, files lack explicit sizes (other + // than zero) as regular files have. In consequence, read operations (also + // writes) may not be properly handled by kernel, as these ones extend + // beyond the file sizes reported by Attr() / GetAttr(). + // + // A solution to this problem is to rely on O_DIRECT flag for all the + // interactions with procfs/sysfs files. By making use of this flag, + // sysbox-fs will ensure that it receives all read/write requests + // generated by fuse-clients, regardless of the file-size issue mentioned + // above. For regular files, this approach usually comes with a cost, as + // page-cache is being bypassed for all files I/O; however, this doesn't + // pose a problem for Sysbox as we are dealing with special FSs. + // + resp.Flags |= fuse.OpenDirectIO + + if nonSeekable { + resp.Flags |= fuse.OpenNonSeekable + } + + return f, nil +} + +// Release FS operation. +func (f *File) Release(ctx context.Context, req *fuse.ReleaseRequest) error { + + logrus.Debugf("Requested Release() operation for entry %v (Req ID=%#v)", + f.path, uint64(req.ID)) + + // + // Upon arrival of incoming fuse requests, sysbox-fs open()s and close()s + // the associated file-system node. IOW, upon successful handling of an + // open() fuse request, no file-system state (i.e. opened file-descriptor) + // will be held in sysbox-fs for opened dentries. 
Subsequent fuse requests + // generated by the same fuse-client process, will re-open the associated + // file to carry out the corresponding read/write operation. + // + // Notice that this approach allows us to handle emulated and non-emulated + // fs resources in the same manner. Non-emulated resources are only + // reachable through 'nsexec' mechanisms, which relies on the utilization + // of different processes to perform a determined i/o operation. In this + // scenario, there's no point in open()ing and clos()ing files, as the + // process performing the interim action (let's say, an open request) will + // die upon completion, which will necessarily end up with the process' + // fd-table getting wiped out by kernel upon process' exit(). + // + // That is all to say, that there is no need to do anything with these + // release() requests, as the associated inode is already closed by the + // time these requests arrive. And that covers both non-emulated ('nsexec') + // and emulated nodes. + + return nil +} + +// Read FS operation. +func (f *File) Read( + ctx context.Context, + req *fuse.ReadRequest, + resp *fuse.ReadResponse) error { + + logrus.Debugf("Requested Read() operation for entry %v (Req ID=%#v)", + f.path, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. + if f.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + ionode := f.server.service.ios.NewIOnode(f.name, f.path, f.attr.Mode) + + // Identify the associated handler and execute it accordingly. 
+ handler, ok := f.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("Read() error: No supported handler for %v resource", f.path) + return fmt.Errorf("No supported handler for %v resource", f.path) + } + + handlerReq := &domain.HandlerRequest{ + ID: uint64(req.ID), + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Offset: req.Offset, + Data: make([]byte, req.Size), + Container: f.server.container, + } + + // Handler execution. + n, err := handler.Read(ionode, handlerReq) + if err != nil && err != io.EOF { + logrus.Debugf("Read() error: %v", err) + return err + } + + resp.Data = handlerReq.Data[:n] + return nil +} + +// Write FS operation. +func (f *File) Write( + ctx context.Context, + req *fuse.WriteRequest, + resp *fuse.WriteResponse) error { + + logrus.Debugf("Requested Write() operation for entry %v (Req ID=%#v)", + f.path, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. + if f.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + ionode := f.server.service.ios.NewIOnode(f.name, f.path, f.attr.Mode) + + // Lookup the associated handler within handler-DB. + handler, ok := f.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("Write() error: No supported handler for %v resource", f.path) + return fmt.Errorf("No supported handler for %v resource", f.path) + } + + request := &domain.HandlerRequest{ + ID: uint64(req.ID), + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Data: req.Data, + Container: f.server.container, + } + + // Handler execution. 
+ n, err := handler.Write(ionode, request) + if err != nil && err != io.EOF { + logrus.Debugf("Write() error: %v", err) + return err + } + + resp.Size = n + return nil +} + +func (f *File) Readlink( + ctx context.Context, + req *fuse.ReadlinkRequest) (string, error) { + + logrus.Debugf("Requested Readlink() operation for entry %v (Req ID=%#v)", + f.path, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. + if f.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", req.Pid) + return "", fmt.Errorf("Could not find container originating this request (pid %v)", req.Pid) + } + + ionode := f.server.service.ios.NewIOnode(f.name, f.path, f.attr.Mode) + + // Lookup the associated handler within handler-DB. + handler, ok := f.server.service.hds.LookupHandler(ionode) + if !ok { + logrus.Errorf("Readlink() error: No supported handler for %v resource", f.path) + return "", fmt.Errorf("No supported handler for %v resource", f.path) + } + + request := &domain.HandlerRequest{ + ID: uint64(req.ID), + Pid: req.Pid, + Uid: req.Uid, + Gid: req.Gid, + Container: f.server.container, + } + + // Handler execution. + link, err := handler.ReadLink(ionode, request) + if err != nil && err != io.EOF { + logrus.Debugf("Readlink() error: %v", err) + return "", err + } + + return link, nil +} + +// Setattr FS operation. +func (f *File) Setattr( + ctx context.Context, + req *fuse.SetattrRequest, + resp *fuse.SetattrResponse) error { + + logrus.Debugf("Requested Setattr() operation for entry %v (Req ID=%#v)", + f.path, uint64(req.ID)) + + // Ensure operation is generated from within a registered sys container. 
+ if f.server.container == nil { + logrus.Errorf("Could not find the container originating this request (pid %v)", + req.Pid) + return fmt.Errorf("Could not find container originating this request (pid %v)", + req.Pid) + } + + // No file attr changes are allowed in a procfs, with the exception of + // 'size' modifications which are needed to allow write()/truncate() ops. + // All other 'fuse.SetattrValid' operations will be rejected. + if req.Valid.Size() { + return nil + } + + return fuse.EPERM +} + +// Forget FS operation. +func (f *File) Forget() { + + logrus.Debugf("Requested Forget() operation for entry %v", f.path) + + f.server.Lock() + defer f.server.Unlock() + + if _, ok := f.server.nodeDB[f.path]; !ok { + return + } + + delete(f.server.nodeDB, f.path) +} + +// Size method returns the 'size' of a File element. +func (f *File) Size() uint64 { + return f.attr.Size +} + +// Mode method returns the 'mode' of a File element. +func (f *File) Mode() os.FileMode { + return f.attr.Mode +} + +// ModTime method returns the modification-time of a File element. +func (f *File) ModTime() time.Time { + return f.attr.Mtime +} + +// convertFileInfoToFuse function translates FS node-attributes from a kernel +// friendly DS type, to those expected by Bazil-FUSE-lib to interact with +// FUSE-clients. +// +// Function takes as parameter the os.FileInfo object holding the attributes +// that we want to convert, and then place the converted attributes into a +// new DS which later on will be processed by Bazil FUSE-lib. +// +// For reference, the attributes' format expected by Bazil-FUSE-lib are defined +// here: bazil/fuse.go (fuse.Attr DS). +func convertFileInfoToFuse(info os.FileInfo) fuse.Attr { + var a fuse.Attr + + // If the fileInfo does not have a stat() method (e.g., for files that are + // virtual and not present in the host file system), translate using the + // available file info. 
+	stat, ok := info.Sys().(*syscall.Stat_t)
+	if !ok {
+type fuseServer struct {
+	sync.RWMutex // nodeDB protection
+	conn *fuse.Conn // Associated fuse connection
+	path string // fs path to emulate -- "/" by default
+	mountPoint string // mountpoint -- "/var/lib/sysboxfs" by default
+	container domain.ContainerIface // associated sys container
+	containerUid uint32 // container UID for caching purposes
+	containerGid uint32 // container GID for caching purposes
+	server *fs.Server // bazil-fuse server instance
+	nodeDB map[string]*fs.Node // map to store all fs nodes, e.g. "/proc/uptime" -> File
+	root *Dir // root node of fuse fs -- "/" by default
+	initDone chan bool // sync-up channel to alert about fuse-server's init-completion
+	cntrReg bool // flag to track the container's registration state
+	service *FuseServerService // backpointer to parent service
+}
+
+func NewFuseServer(
+	path string,
+	mountpoint string,
+	container domain.ContainerIface,
+	service *FuseServerService) domain.FuseServerIface {
+
+	srv := &fuseServer{ // conn, server, nodeDB, initDone are populated later by Create()/Run()
+		path: path,
+		mountPoint: mountpoint,
+		container: container,
+		service: service,
+	}
+
+	return srv
+}
+
+func (s *fuseServer) Create() error {
+
+	// Verify the existence of the requested path in the host FS.
+	pathIOnode := s.service.ios.NewIOnode(s.path, s.path, os.ModeDir)
+	pathInfo, err := pathIOnode.Stat()
+	if err != nil {
+		if os.IsNotExist(err) {
+			logrus.Errorf("File-System path not found: %v", s.path)
+			return err
+		} else {
+			logrus.Errorf("File-System path not accessible: %v", s.path)
+			return err
+		}
+	}
+
+	// Verify the existence of the requested mountpoint in the host FS.
+	mountPointIOnode := s.service.ios.NewIOnode(
+		s.mountPoint,
+		s.mountPoint,
+		0600,
+	)
+	_, err = mountPointIOnode.Stat()
+	if err != nil {
+		if os.IsNotExist(err) {
+			logrus.Errorf("File-System mountpoint not found: %v", s.mountPoint)
+			return err
+		} else {
+			logrus.Errorf("File-System mountpoint not accessible: %v", s.mountPoint)
+			return err
+		}
+	}
+
+	// Create a first node corresponding to the root (dir) element in
+	// sysbox-fs.
+	var attr fuse.Attr
+	if s.service.ios.GetServiceType() == domain.IOMemFileService {
+		attr = fuse.Attr{} // mem-backed i/o service: no host file to mirror, start from zero-value attrs
+	} else {
+		attr = convertFileInfoToFuse(pathInfo)
+	}
+	attr.Mode = os.ModeDir | os.FileMode(int(0600)) // root dir perms; kernel enforces them via DefaultPermissions (set in Run)
+
+	// Build sysbox-fs top-most directory (root).
+	request := &domain.HandlerRequest{
+		Name: s.path,
+		Path: s.path,
+	}
+	s.root = NewDir(request, &attr, s)
+
+	// Initialize pending members.
+	s.nodeDB = make(map[string]*fs.Node)
+	s.initDone = make(chan bool) // unbuffered: Run()'s send rendezvouses with InitWait()'s receive
+
+	return nil
+}
+
+func (s *fuseServer) Run() error {
+	//
+	// Creating a FUSE mount at the associated mountpoint.
+	//
+	// The "AllowOther" flag allows unprivileged users to access the resources
+	// exposed on this mountpoint.
+	//
+	// The "DefaultPermissions" flag serves to instruct the kernel to perform
+	// its own permission check, instead of deferring all permission checking
+	// to sysbox-fs filesystem.
+	//
+	c, err := fuse.Mount(
+		s.mountPoint,
+		fuse.FSName("sysboxfs"),
+		fuse.AllowOther(),
+		fuse.DefaultPermissions(),
+	)
+	if err != nil {
+		logrus.Error(err)
+		return err
+	}
+	s.conn = c
+
+	// Deferred routine to enforce a clean exit should an unrecoverable error is
+	// ever returned from fuse-lib.
+	defer func() {
+		s.Unmount()
+		c.Close()
+	}()
+
+	if p := c.Protocol(); !p.HasInvalidate() {
+		logrus.Panic("Kernel FUSE support is too old to have invalidations: version ", p)
+		return err // NOTE(review): unreachable -- logrus.Panic never returns, and err is nil at this point
+	}
+
+	// Creating a FUSE server to drive kernel interactions.
+	s.server = fs.New(c, nil)
+	if s.server == nil {
+		logrus.Panic("FUSE file-system could not be created")
+		return errors.New("FUSE file-system could not be created") // NOTE(review): unreachable after logrus.Panic
+	}
+
+	// At this point we are done with fuse-server initialization, so let's
+	// caller know about it.
+	s.initDone <- true // blocks here until InitWait() receives
+
+	// Launch fuse-server's main-loop to handle incoming requests.
+	if err := s.server.Serve(s); err != nil {
+		logrus.Panic(err)
+		return err
+	}
+
+	// Return if any error is reported by mount logic.
+	<-c.Ready
+	if err := c.MountError; err != nil {
+		logrus.Panic(err)
+		return err
+	}
+
+	return nil
+}
+
+func (s *fuseServer) Destroy() error {
+
+	// Unmount sysboxfs from mountpoint.
+	err := fuse.Unmount(s.mountPoint)
+	if err != nil {
+		logrus.Errorf("FUSE file-system could not be unmounted: %v", err)
+		return err
+	}
+
+	// Unset pointers for GC purposes.
+	s.container = nil
+	s.server = nil
+	s.root = nil
+	s.service = nil
+	s.conn = nil // NOTE(review): conn is nil'ed but not Close()'d here -- confirm Run()'s defer covers it
+
+	return nil
+}
+
+// Root method. This is a Bazil-FUSE-lib requirement. Function returns
+// sysbox-fs' root-node.
+func (s *fuseServer) Root() (fs.Node, error) {
+
+	return s.root, nil
+}
+
+// Ensure that fuse-server initialization is completed before moving on
+// with sys container's pre-registration sequence.
+func (s *fuseServer) InitWait() {
+	<-s.initDone // pairs with the send in Run()
+}
+
+func (s *fuseServer) MountPoint() string {
+
+	return s.mountPoint
+}
+
+func (s *fuseServer) Unmount() {
+
+	fuse.Unmount(s.mountPoint) // error ignored: best-effort cleanup (also invoked from Run()'s defer)
+}
+
+// Helper functions to extract the container UID and GID (below) corresponding to
+// the sys container associated to each fuseServer. Notice that by caching these
+// values we are reducing the level of contention between FUSE operations (e.g.,
+// every Attr() call) and syscall handling ones.
+func (s *fuseServer) ContainerUID() uint32 {
+
+	if s.containerUid == 0 { // 0 doubles as the "not yet cached" sentinel -- NOTE(review): a uid-0 container re-queries on every call
+		s.containerUid = s.container.UID()
+	}
+
+	return s.containerUid // NOTE(review): cached without holding s.RWMutex; assumes container UID is immutable -- confirm
+}
+
+func (s *fuseServer) ContainerGID() uint32 {
+
+	if s.containerGid == 0 { // same sentinel caveat as ContainerUID
+		s.containerGid = s.container.GID()
+	}
+
+	return s.containerGid
+}
+
+func (s *fuseServer) SetCntrRegComplete() {
+	s.cntrReg = true // unsynchronized flag -- NOTE(review): confirm single-writer/monotonic usage
+}
+
+func (s *fuseServer) IsCntrRegCompleted() bool {
+	return s.cntrReg
+}
diff --git a/sysbox-fs/fuse/service.go b/sysbox-fs/fuse/service.go
new file mode 100644
index 00000000..cd2dbba6
--- /dev/null
+++ b/sysbox-fs/fuse/service.go
@@ -0,0 +1,199 @@
+//
+// Copyright 2019-2020 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package fuse
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+
+	_ "bazil.org/fuse/fs/fstestutil"
+
+	"github.com/nestybox/sysbox-fs/domain"
+	"github.com/sirupsen/logrus"
+)
+
+type FuseServerService struct {
+	sync.RWMutex // servers map protection
+	path string // fs path to emulate -- "/" by default
+	mountPoint string // base mountpoint -- "/var/lib/sysboxfs" by default
+	serversMap map[string]*fuseServer // tracks created fuse-servers
+	css domain.ContainerStateServiceIface // containerState service pointer
+	ios domain.IOServiceIface // i/o service pointer
+	hds domain.HandlerServiceIface // handler service pointer
+}
+
+// FuseServerService constructor.
+func NewFuseServerService() *FuseServerService { + + newServerService := &FuseServerService{ + serversMap: make(map[string]*fuseServer), + } + + return newServerService +} + +func (fss *FuseServerService) Setup( + mp string, + css domain.ContainerStateServiceIface, + ios domain.IOServiceIface, + hds domain.HandlerServiceIface) error { + + fss.css = css + fss.ios = ios + fss.hds = hds + fss.mountPoint = mp + + if err := os.MkdirAll(mp, 0600); err != nil { + return err + } + + return nil +} + +// FuseServerService destructor. +func (fss *FuseServerService) DestroyFuseService() { + + for k, _ := range fss.serversMap { + fss.DestroyFuseServer(k) + } + + if err := os.RemoveAll(fss.mountPoint); err != nil { + logrus.Warnf("failed to remove %s: %s", fss.mountPoint, err) + } +} + +// Creates new fuse-server. +// +// serveCntr is the container on which the fuse server will listen. +// stateCntr is the container object tracking the state for the fuse accesses. +// +// Normally serveCntr and stateCntr refer to the same cntr object. However, if +// multiple containers want to share the same fuse state (as sysbox-fs does for +// kubernetes pods), then this function may be called with different serveCntr +// objects but the same stateCntr object. +func (fss *FuseServerService) CreateFuseServer(serveCntr, stateCntr domain.ContainerIface) error { + + cntrId := serveCntr.ID() + + // Ensure a fuse-server does not exist for this serveCntr. + fss.RLock() + if _, ok := fss.serversMap[cntrId]; ok { + fss.RUnlock() + logrus.Errorf("FuseServer to create is already present for container id %s", + cntrId) + return errors.New("FuseServer already present") + } + fss.RUnlock() + + // Create required mountpoint in host file-system. 
+ cntrMountpoint := filepath.Join(fss.mountPoint, cntrId) + mountpointIOnode := fss.ios.NewIOnode("", cntrMountpoint, 0600) + if err := mountpointIOnode.MkdirAll(); err != nil { + return errors.New("FuseServer with invalid mountpoint") + } + + srv := NewFuseServer( + "/", + cntrMountpoint, + stateCntr, + fss, + ) + + // Create new fuse-server. + if err := srv.Create(); err != nil { + return errors.New("FuseServer initialization error") + } + + // Launch fuse-server in a separate goroutine and wait for 'ack' before + // moving on. + go srv.Run() + srv.InitWait() + + // Store newly created fuse-server. + fss.Lock() + fss.serversMap[cntrId] = srv.(*fuseServer) + fss.Unlock() + + logrus.Debugf("Created fuse server for container %s", cntrId) + + if serveCntr != stateCntr { + logrus.Debugf("Fuse server for container %s shares state with container %s", cntrId, stateCntr.ID()) + } + + return nil +} + +// Destroy a fuse-server. +func (fss *FuseServerService) DestroyFuseServer(cntrId string) error { + + // Ensure fuse-server to eliminate is present. + fss.RLock() + srv, ok := fss.serversMap[cntrId] + if !ok { + fss.RUnlock() + logrus.Errorf("FuseServer to destroy is not present for container id %s", + cntrId) + return nil + } + fss.RUnlock() + + // Destroy fuse-server. + if err := srv.Destroy(); err != nil { + logrus.Errorf("FuseServer to destroy could not be eliminated for container id %s", + cntrId) + return nil + } + + // Remove mountpoint dir from host file-system. + cntrMountpoint := filepath.Join(fss.mountPoint, cntrId) + if err := os.Remove(cntrMountpoint); err != nil { + logrus.Errorf("FuseServer mountpoint could not be eliminated for container id %s", + cntrId) + return nil + } + + // Update state. 
+ fss.Lock() + delete(fss.serversMap, cntrId) + fss.Unlock() + + logrus.Debugf("Destroyed fuse server for container %s", cntrId) + + return nil +} + +func (fss *FuseServerService) FuseServerCntrRegComplete(cntr domain.ContainerIface) error { + + cntrId := cntr.ID() + + // Ensure fuse-server to eliminate is present. + fss.RLock() + srv, ok := fss.serversMap[cntrId] + if !ok { + fss.RUnlock() + logrus.Errorf("FuseServer to update is not present for container id %s", + cntrId) + return nil + } + fss.RUnlock() + + srv.SetCntrRegComplete() + + return nil +} diff --git a/sysbox-fs/go.mod b/sysbox-fs/go.mod new file mode 100644 index 00000000..4cafe26f --- /dev/null +++ b/sysbox-fs/go.mod @@ -0,0 +1,91 @@ +module github.com/nestybox/sysbox-fs + +go 1.22 + +toolchain go1.22.6 + +require ( + bazil.org/fuse v0.0.0-20180421153158-65cc252bf669 + github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf + github.com/hashicorp/go-immutable-radix v1.3.0 + github.com/nestybox/sysbox-ipc v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/capability v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/formatter v0.0.0-20210709231355-1ea69f2f6dbb + github.com/nestybox/sysbox-libs/linuxUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/pidfd v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-runc v0.0.0-00010101000000-000000000000 + github.com/pkg/profile v1.5.0 + github.com/seccomp/libseccomp-golang v0.10.0 + github.com/sirupsen/logrus v1.9.3 + github.com/spf13/afero v1.4.1 + github.com/stretchr/testify v1.8.4 + github.com/urfave/cli v1.22.14 + github.com/vishvananda/netlink v1.1.0 + golang.org/x/sys v0.27.0 + google.golang.org/grpc v1.64.0 + gopkg.in/hlandau/service.v1 v1.0.7 +) + +require ( + github.com/Masterminds/semver v1.5.0 // indirect + github.com/checkpoint-restore/go-criu/v4 v4.1.0 // indirect + github.com/cilium/ebpf v0.3.0 
// indirect + github.com/containerd/console v1.0.1 // indirect + github.com/coreos/go-systemd/v22 v22.1.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect + github.com/cyphar/filepath-securejoin v0.2.2 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/deckarep/golang-set v1.8.0 // indirect + github.com/deckarep/golang-set/v2 v2.3.1 // indirect + github.com/docker/docker v26.0.0+incompatible // indirect + github.com/godbus/dbus/v5 v5.0.3 // indirect + github.com/golang/protobuf v1.5.4 // indirect + github.com/hashicorp/go-uuid v1.0.1 // indirect + github.com/hashicorp/golang-lru v0.5.1 // indirect + github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 // indirect + github.com/karrick/godirwalk v1.16.1 // indirect + github.com/kr/pretty v0.1.0 // indirect + github.com/moby/sys/mountinfo v0.4.0 // indirect + github.com/mrunalp/fileutils v0.5.0 // indirect + github.com/nestybox/sysbox-libs/idMap v0.0.0-00010101000000-000000000000 // indirect + github.com/nestybox/sysbox-libs/idShiftUtils v0.0.0-00010101000000-000000000000 // indirect + github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98 // indirect + github.com/nestybox/sysbox-libs/overlayUtils v0.0.0-00010101000000-000000000000 // indirect + github.com/nestybox/sysbox-libs/shiftfs v0.0.0-00010101000000-000000000000 // indirect + github.com/opencontainers/runc v1.1.4 // indirect + github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 // indirect + github.com/opencontainers/selinux v1.8.0 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/stretchr/objx v0.5.0 // indirect + github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect + github.com/willf/bitset v1.1.11 // indirect + golang.org/x/net v0.23.0 // indirect + golang.org/x/text v0.15.0 // indirect + 
google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 // indirect + google.golang.org/protobuf v1.35.1 // indirect + gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) + +replace ( + bazil.org/fuse => github.com/loft-sh/fuse v0.0.0-20241122125726-039cf8af2c85 + github.com/godbus/dbus => github.com/godbus/dbus/v5 v5.0.3 + github.com/nestybox/sysbox-ipc => ../sysbox-ipc + github.com/nestybox/sysbox-libs/capability => ../sysbox-libs/capability + github.com/nestybox/sysbox-libs/dockerUtils => ../sysbox-libs/dockerUtils + github.com/nestybox/sysbox-libs/formatter => ../sysbox-libs/formatter + github.com/nestybox/sysbox-libs/idMap => ../sysbox-libs/idMap + github.com/nestybox/sysbox-libs/idShiftUtils => ../sysbox-libs/idShiftUtils + github.com/nestybox/sysbox-libs/linuxUtils => ../sysbox-libs/linuxUtils + github.com/nestybox/sysbox-libs/mount => ../sysbox-libs/mount + github.com/nestybox/sysbox-libs/overlayUtils => ../sysbox-libs/overlayUtils + github.com/nestybox/sysbox-libs/pidfd => ../sysbox-libs/pidfd + github.com/nestybox/sysbox-libs/pidmonitor => ../sysbox-libs/pidmonitor + github.com/nestybox/sysbox-libs/shiftfs => ../sysbox-libs/shiftfs + github.com/nestybox/sysbox-libs/utils => ../sysbox-libs/utils + github.com/nestybox/sysbox-runc => ../sysbox-runc + github.com/opencontainers/runc => ./../sysbox-runc +) diff --git a/sysbox-fs/go.sum b/sysbox-fs/go.sum new file mode 100644 index 00000000..4b25b32d --- /dev/null +++ b/sysbox-fs/go.sum @@ -0,0 +1,158 @@ +github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/checkpoint-restore/go-criu/v4 v4.1.0 h1:WW2B2uxx9KWF6bGlHqhm8Okiafwwx7Y2kcpn8lCpjgo= +github.com/checkpoint-restore/go-criu/v4 v4.1.0/go.mod 
h1:xUQBLp4RLc5zJtWY++yjOoMoB5lihDt7fai+75m+rGw= +github.com/cilium/ebpf v0.3.0 h1:LI3lsl5GmTh+OFYamrj8sp+R0yam38zHG6NTDhSlNmQ= +github.com/cilium/ebpf v0.3.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= +github.com/containerd/console v1.0.1 h1:u7SFAJyRqWcG6ogaMAx3KjSTy1e3hT9QxqX7Jco7dRc= +github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= +github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU= +github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= +github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= +github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cyphar/filepath-securejoin v0.2.2 h1:jCwT2GTP+PY5nBz3c/YL5PAIbusElVrPujOBSCj8xRg= +github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4= +github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo= +github.com/deckarep/golang-set/v2 v2.3.1 h1:vjmkvJt/IV27WXPyYQpAh4bRyWJc5Y435D17XQ9QU5A= +github.com/deckarep/golang-set/v2 v2.3.1/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/docker/docker v26.0.0+incompatible h1:Ng2qi+gdKADUa/VM+6b6YaY2nlZhk/lVJiKR/2bMudU= +github.com/docker/docker v26.0.0+incompatible/go.mod 
h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME= +github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/hashicorp/go-immutable-radix v1.3.0 h1:8exGP7ego3OmkfksihtSouGMZ+hQrhxx+FVELeXpVPE= +github.com/hashicorp/go-immutable-radix v1.3.0/go.mod h1:0y9vanUI8NX6FsYoO3zeMjhV/C5i9g4Q3DwcSNZ4P60= +github.com/hashicorp/go-uuid v1.0.0/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/go-uuid v1.0.1 h1:fv1ep09latC32wFoVwnqcnKJGnMSdBanPczbHAYm1BE= +github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/bN7x4byOro= +github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/hashicorp/golang-lru v0.5.1 h1:0hERBMJE1eitiLkihrMvRVBYAkpHzc/J3QdDN+dAcgU= +github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 h1:hgVxRoDDPtQE68PT4LFvNlPz2nBKd3OMlGKIQ69OmR4= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531/go.mod h1:fqTUQpVYBvhCNIsMXGl2GE9q6z94DIP6NtFKXCSTVbg= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d h1:J8tJzRyiddAFF65YVgxli+TyWBi0f79Sld6rJP6CBcY= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d/go.mod h1:b+Q3v8Yrg5o15d71PSUraUzYb+jWl6wQMSBXSGS/hv0= +github.com/karrick/godirwalk v1.16.1 
h1:DynhcF+bztK8gooS0+NDJFrdNZjJ3gzVzC545UNA9iw= +github.com/karrick/godirwalk v1.16.1/go.mod h1:j4mkqPuvaLI8mp1DroR3P6ad7cyYd4c1qeJ3RV7ULlk= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= +github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/loft-sh/fuse v0.0.0-20241122125726-039cf8af2c85 h1:iMVnmige/yViyPpxIMiWZy/L7U2fbVsBL96i79SUfNY= +github.com/loft-sh/fuse v0.0.0-20241122125726-039cf8af2c85/go.mod h1:qpHqFkt+TgpEOS1UDyG4wDy7gse84M1FKS6aIB08/lE= +github.com/moby/sys/mountinfo v0.4.0 h1:1KInV3Huv18akCu58V7lzNlt+jFmqlu1EaErnEHE/VM= +github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= +github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4= +github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/selinux v1.8.0 h1:+77ba4ar4jsCbL1GLbFL8fFM57w6suPfSS9PDLDY7KM= +github.com/opencontainers/selinux v1.8.0/go.mod h1:RScLhm78qiWa2gbVCcGkC7tCGdgk3ogry1nUQF8Evvo= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/profile v1.5.0 h1:042Buzk+NhDI+DeSAA62RwJL8VAuZUMQZUjCsRz1Mug= +github.com/pkg/profile 
v1.5.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/seccomp/libseccomp-golang v0.10.0 h1:aA4bp+/Zzi0BnWZ2F1wgNBs5gTpm+na2rWM6M9YjLpY= +github.com/seccomp/libseccomp-golang v0.10.0/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/urfave/cli v1.22.14 
h1:ebbhrRiGK2i4naQJr+1Xj92HXZCrK7MsyTS/ob3HnAk= +github.com/urfave/cli v1.22.14/go.mod h1:X0eDS6pD6Exaclxm99NJ3FiCDRED7vIHpx2mDOHLvkA= +github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= +github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +github.com/willf/bitset v1.1.11 h1:N7Z7E9UvjW+sGsEl7k/SJrvY2reP1A07MrGuCjIOjRE= +github.com/willf/bitset v1.1.11/go.mod h1:83CECat5yLh5zVOf4P1ErAgKA5UDvKtgyUABdr3+MjI= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod 
h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200916030750-2334cc1a136f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 h1:mxSlqyb8ZAHsYDCfiXN1EDdNTdvjUJSLY+OnAUtYNYA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= +gopkg.in/check.v1 
v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/hlandau/service.v1 v1.0.7 h1:16G5AJ1Cp8Vr65QItJXpyAIzf/FWAWCZBsTgsc6eyA8= +gopkg.in/hlandau/service.v1 v1.0.7/go.mod h1:sZw6ksxcoafC04GoZtw32UeqqEuPSABX35lVBaJP/bE= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/sysbox-fs/handler/handlerDB.go b/sysbox-fs/handler/handlerDB.go new file mode 100644 index 00000000..8af70ecc --- /dev/null +++ b/sysbox-fs/handler/handlerDB.go @@ -0,0 +1,347 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package handler + +import ( + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "sync" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/handler/implementations" + + iradix "github.com/hashicorp/go-immutable-radix" +) + +// Slice of sysbox-fs' default handlers and the respective paths where they +// apply. 
Notice that the path associated to the pass-through handler is +// symbolic as this one can be invoked from within any of the other handlers, +// regardless of the FS location where they operate. +var DefaultHandlers = []domain.HandlerIface{ + implementations.PassThrough_Handler, // * + implementations.Root_Handler, // / + implementations.ProcUptime_Handler, // /proc/uptime + implementations.ProcSwaps_Handler, // /proc/swaps + implementations.ProcSys_Handler, // /proc/sys + implementations.ProcSysFs_Handler, // /proc/sys/fs + implementations.ProcSysKernel_Handler, // /proc/sys/kernel + implementations.ProcSysKernelRandom_Handler, // /proc/sys/kernel/random + implementations.ProcSysKernelYama_Handler, // /proc/sys/kernel/yama + implementations.ProcSysNetCore_Handler, // /proc/sys/net/core + implementations.ProcSysNetIpv4_Handler, // /proc/sys/net/ipv4 + implementations.ProcSysNetIpv4Vs_Handler, // /proc/sys/net/ipv4/vs + implementations.ProcSysNetIpv4Neigh_Handler, // /proc/sys/net/ipv4/neigh + implementations.ProcSysNetNetfilter_Handler, // /proc/sys/net/netfilter + implementations.ProcSysNetUnix_Handler, // /proc/sys/net/unix + implementations.ProcSysVm_Handler, // /proc/sys/vm + implementations.SysKernel_Handler, // /sys/kernel + implementations.SysDevicesVirtual_Handler, // /sys/devices/virtual + implementations.SysDevicesVirtualDmi_Handler, // /sys/devices/virtual/dmi + implementations.SysDevicesVirtualDmiId_Handler, // /sys/devices/virtual/dmi/id + implementations.SysModuleNfconntrackParameters_Handler, // /sys/module/nf_conntrack/parameters +} + +type handlerService struct { + sync.RWMutex + + // Radix-tree indexed by node FS path. Tree serves as an ordered DB where to + // keep track of the association between the FS nodes being emulated, and + // their matching handler object. + handlerTree *iradix.Tree + + // Pointer to the service providing container-state storage functionality. 
+ css domain.ContainerStateServiceIface + + // Pointer to the service providing nsenter (rexec) capabilities. + nss domain.NSenterServiceIface + + // Pointer to the service providing process-handling functionality. + prs domain.ProcessServiceIface + + // Pointer to the service providing file-system I/O capabilities. + ios domain.IOServiceIface + + // Represents the user-namespace inode of the host's true-root. + hostUserNsInode domain.Inode + + // Holds value of the host's UUID. + hostUuid string + + // Passthrough handler. + passThroughHandler domain.PassthroughHandlerIface + + // Handler i/o errors should be obviated if this flag is enabled (testing + // purposes). + ignoreErrors bool +} + +// HandlerService constructor. +func NewHandlerService() domain.HandlerServiceIface { + return &handlerService{} +} + +func (hs *handlerService) Setup( + hdlrs []domain.HandlerIface, + ignoreErrors bool, + css domain.ContainerStateServiceIface, + nss domain.NSenterServiceIface, + prs domain.ProcessServiceIface, + ios domain.IOServiceIface) { + + hs.css = css + hs.nss = nss + hs.prs = prs + hs.ios = ios + hs.ignoreErrors = ignoreErrors + + hs.handlerTree = iradix.New() + if hs.handlerTree == nil { + logrus.Fatalf("Unable to allocate handler radix-tree") + } + + // Register all handlers declared and their associated resources. + for _, h := range hdlrs { + hs.RegisterHandler(h) + } + + // Set pointer to passthrough handler. + hs.passThroughHandler = implementations.PassThrough_Handler + + // Obtain user-ns inode corresponding to sysbox-fs. + hostUserNsInode, err := hs.FindUserNsInode(uint32(os.Getpid())) + if err != nil { + logrus.Fatalf("Invalid init user-namespace found") + } + hs.hostUserNsInode = hostUserNsInode + + // Obtain the host's UUID value. 
+ hs.hostUuid, err = hs.FindHostUuid() + if err != nil { + logrus.Fatalf("Unable to determine the host UUID value") + } +} + +func (hs *handlerService) RegisterHandler(h domain.HandlerIface) error { + hs.Lock() + + name := h.GetName() + path := h.GetPath() + + if _, ok := hs.handlerTree.Get([]byte(path)); ok { + hs.Unlock() + logrus.Errorf("Handler %v already registered", name) + return errors.New("Handler already registered") + } + + h.SetService(hs) + + tree, _, ok := hs.handlerTree.Insert([]byte(path), h) + if ok { + hs.Unlock() + logrus.Errorf("Handler %v already registered", name) + return errors.New("Handler already registered") + } + hs.handlerTree = tree + hs.Unlock() + + return nil +} + +func (hs *handlerService) UnregisterHandler(h domain.HandlerIface) error { + hs.Lock() + + name := h.GetName() + path := h.GetPath() + + if _, ok := hs.handlerTree.Get([]byte(path)); !ok { + hs.Unlock() + logrus.Errorf("Handler %v not previously registered", name) + return errors.New("Handler not previously registered") + } + + hs.handlerTree, _, _ = hs.handlerTree.Delete([]byte(path)) + hs.Unlock() + + return nil +} + +func (hs *handlerService) LookupHandler( + i domain.IOnodeIface) (domain.HandlerIface, bool) { + + hs.RLock() + defer hs.RUnlock() + + var h domain.HandlerIface + + // Iterate the handler's radix-tree looking for the handler that better + // match the fs node being searched. + // + // Notice that this approach could potentially lead to overlapping scenarios + // if we were to have handlers such as "/proc/update" and "/proc/update_1", + // but there's no such a case today. If we ever need to address this point, + // we would simply extend this handler-lookup logic by placing it in a "for" + // loop and by comparing the "base" components of the overlapping elements. 
+ _, node, ok := hs.handlerTree.Root().LongestPrefix([]byte(i.Path())) + if !ok { + return nil, false + } + + h = node.(domain.HandlerIface) + + return h, true +} + +func (hs *handlerService) FindHandler(s string) (domain.HandlerIface, bool) { + hs.RLock() + defer hs.RUnlock() + + h, ok := hs.handlerTree.Get([]byte(s)) + if !ok { + return nil, false + } + + return h.(domain.HandlerIface), true +} + +func (hs *handlerService) EnableHandler(path string) error { + hs.Lock() + defer hs.Unlock() + + h, ok := hs.FindHandler(path) + if !ok { + return fmt.Errorf("handler %s not found in handlerDB", path) + } + + h.SetEnabled(true) + + return nil +} + +func (hs *handlerService) DisableHandler(path string) error { + hs.Lock() + defer hs.Unlock() + + h, ok := hs.FindHandler(path) + if !ok { + return fmt.Errorf("handler %s not found in handlerDB", path) + } + + h.SetEnabled(false) + + return nil +} + +func (hs *handlerService) HandlersResourcesList() []string { + + var resourcesList []string + + // Technically not needed as this method is only expected to be called + // during sysbox-fs initialization (handlers are not in service at that + // point), but just in case utilization changes overtime. + hs.RLock() + defer hs.RUnlock() + + // Iterate through the handlerDB to extract the list of resources being + // emulated. + hs.handlerTree.Root().Walk(func(key []byte, val interface{}) bool { + + h := val.(domain.HandlerIface) + if !h.GetEnabled() { + return true + } + + list := h.GetResourcesList() + resourcesList = append(resourcesList, list...) 
+ + return false + }) + + return resourcesList +} + +func (hs *handlerService) GetPassThroughHandler() domain.PassthroughHandlerIface { + return hs.passThroughHandler +} + +func (hs *handlerService) StateService() domain.ContainerStateServiceIface { + return hs.css +} + +func (hs *handlerService) SetStateService(css domain.ContainerStateServiceIface) { + hs.css = css +} + +func (hs *handlerService) NSenterService() domain.NSenterServiceIface { + return hs.nss +} + +func (hs *handlerService) ProcessService() domain.ProcessServiceIface { + return hs.prs +} + +func (hs *handlerService) IOService() domain.IOServiceIface { + return hs.ios +} + +func (hs *handlerService) IgnoreErrors() bool { + return hs.ignoreErrors +} + +// +// Auxiliary methods +// + +func (hs *handlerService) HostUserNsInode() domain.Inode { + return hs.hostUserNsInode +} + +func (hs *handlerService) FindUserNsInode(pid uint32) (domain.Inode, error) { + process := hs.prs.ProcessCreate(pid, 0, 0) + + userNsInode, err := process.UserNsInode() + if err != nil { + return 0, err + } + + return userNsInode, nil +} + +func (hs *handlerService) HostUuid() string { + return hs.hostUuid +} + +func (hs *handlerService) FindHostUuid() (string, error) { + + hostUuid, err := ioutil.ReadFile("/sys/devices/virtual/dmi/id/product_uuid") + + // Careful here: a missing 'product_uuid' is a perfectly valid scenario. + // Refer to 'handler/implementations/sysDevicesVirtualDmiId.go' for details. + if err != nil && err != io.EOF { + if os.IsNotExist(err) { + hostUuid = []byte("00000000-0000-0000-0000-000000000000") + } else { + return "", err + } + } + + return string(hostUuid), nil +} diff --git a/sysbox-fs/handler/implementations/passThrough.go b/sysbox-fs/handler/implementations/passThrough.go new file mode 100644 index 00000000..02c06597 --- /dev/null +++ b/sysbox-fs/handler/implementations/passThrough.go @@ -0,0 +1,652 @@ +// +// Copyright 2019-2023 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "io" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// +// Pass-through handler +// +// Handler for all non-emulated resources under /proc/sys or /sys. It does a +// simple "passthrough" of the access by entering the namespaces of the +// container process that is doing the I/O operation and performs the access on +// behalf of it. It enters the namespaces by dispatching an "nsenter agent" +// process that enters the namespaces, performs the filesystem operation, and +// returns the result. Note that the nsenter agent does NOT enter the mount +// namespace of the container process, to avoid a recursion of sysbox-fs mounts +// /proc/sys and /sys. +// + +type PassThrough struct { + domain.HandlerBase +} + +var PassThrough_Handler = &PassThrough{ + domain.HandlerBase{ + Name: "PassThrough", + Path: "*", + Enabled: true, + }, +} + +func (h *PassThrough) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. 
+ nss := h.Service.NSenterService() + event := nss.NewEvent( + req.Pid, + &domain.AllNSs, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.LookupRequest, + Payload: &domain.LookupPayload{ + Entry: n.Path(), + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return nil, responseMsg.Payload.(error) + } + + info := responseMsg.Payload.(domain.FileInfo) + + // The file size will be 0 when passing through to files under /proc (i.e., + // because /proc is a virtual filesystem). This was not a problem in the + // past, but starting with Linux kernel 6.5, returning a size of 0 causes the + // kernel show the file as empty when read. Thus we need to return a size > + // 0. However what size to we return? The size needs to be >= largest file + // size that could be passed through, otherwise the contents of the file will + // be cutoff. We choose size = 32K since it should be large enough to hold + // the contents of any file under /proc. Note that files under /sys have a + // size (typically 4096), so this override does not apply to them. + if info.Fsize == 0 { + info.Fsize = 32768 + } + + return info, nil +} + +func (h *PassThrough) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return h.OpenWithNS(n, req, domain.AllNSs) +} + +func (h *PassThrough) OpenWithNS( + n domain.IOnodeIface, + req *domain.HandlerRequest, + namespaces []domain.NStype) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. 
+ nss := h.Service.NSenterService() + event := nss.NewEvent( + req.Pid, + &namespaces, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.OpenFileRequest, + Payload: &domain.OpenFilePayload{ + File: n.Path(), + Flags: strconv.Itoa(n.OpenFlags()), + Mode: strconv.Itoa(int(n.OpenMode())), + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return false, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return false, responseMsg.Payload.(error) + } + + return false, nil +} + +// Reads the given node by entering all container namespaces. +// Caches the result after reading, to avoid the performance hit of entering the +// container namespaces in future calls (unless req.noCache is set). +func (h *PassThrough) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return h.ReadWithNS(n, req, domain.AllNSs) +} + +// Same as Read(), but enters the given container namespaces only. +func (h *PassThrough) ReadWithNS( + n domain.IOnodeIface, + req *domain.HandlerRequest, + namespaces []domain.NStype) (int, error) { + + var ( + sz int + err error + ) + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + path := n.Path() + cntr := req.Container + + prs := h.Service.ProcessService() + process := prs.ProcessCreate(req.Pid, req.Uid, req.Gid) + + // The passthrough driver is slow because it must spawn a process that enters + // the container's namespaces (i.e., the nsenter agent) and read the data + // from there. To improve things, we cache the data on the first access to + // avoid dispatching the nsenter agent on subsequent accesses. 
+ // + // A couple of caveats on the caching: + // + // 1) Caching is only done for processes at the sys container level, not in + // inner containers or inner unshared namespaces. To enable caching for + // those, we would need to have a cache per each namespace set (since the + // values under /proc/sys depend on the namespaces that the process belongs + // to). This would be expensive and would also require Sysbox to know when + // the namespace ceases to exist in order to destroy the cache associated + // with it. + // + // 2) As an optimization, we fetch data from the container's filesystem only + // when the req.Offset is 0. For req.Offset > 0, we assume that the data is + // cached already. Without this optimization, we will likely go through + // fetchFile() twice for each read: one with req.Offset 0, and one at + // req.Offset X, where X is the number of bytes of the resource being + // read. That is, the handler's Read() method is normally invoked twice: the + // first read returns X bytes, the second read returns 0 bytes. + + if domain.ProcessNsMatch(process, cntr.InitProc()) { + + cntr.Lock() + + // Check the data cache + sz, err = cntr.Data(path, req.Offset, &req.Data) + if err != nil && err != io.EOF { + cntr.Unlock() + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + if req.Offset == 0 && sz == 0 && err == io.EOF { + + // Resource is not cached, read it from the filesystem. 
+ sz, err = h.fetchFile(process, namespaces, n, req.Offset, &req.Data) + if err != nil { + cntr.Unlock() + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + if sz == 0 { + cntr.Unlock() + return 0, nil + } + + if !req.NoCache { + err = cntr.SetData(path, req.Offset, req.Data) + if err != nil { + cntr.Unlock() + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + } + } + + cntr.Unlock() + + } else { + sz, err = h.fetchFile(process, namespaces, n, req.Offset, &req.Data) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + } + + return sz, nil +} + +// Writes to the given node by entering all the container namespaces. +// Caches the result after writing, to avoid the performance hit of entering the +// container namespaces in future read calls (unless req.noCache is set). +func (h *PassThrough) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return h.WriteWithNS(n, req, domain.AllNSs) +} + +// Same as Write(), but enters the given container namespaces only. +func (h *PassThrough) WriteWithNS( + n domain.IOnodeIface, + req *domain.HandlerRequest, + namespaces []domain.NStype) (int, error) { + + var ( + len int + err error + ) + + resource := n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + path := n.Path() + cntr := req.Container + + prs := h.Service.ProcessService() + process := prs.ProcessCreate(req.Pid, req.Uid, req.Gid) + + if len, err = h.pushFile(process, namespaces, n, req.Offset, req.Data); err != nil { + return 0, err + } + + // If the write comes from a process inside the sys container's namespaces, + // (not in inner containers or unshared namespaces) then cache the data. + // See explanation in Read() method above. 
+ + if domain.ProcessNsMatch(process, cntr.InitProc()) { + if !req.NoCache { + cntr.Lock() + err = cntr.SetData(path, req.Offset, req.Data) + if err != nil { + cntr.Unlock() + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + cntr.Unlock() + } + } + + return len, nil +} + +func (h *PassThrough) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. + nss := h.Service.NSenterService() + event := nss.NewEvent( + req.Pid, + &domain.AllNSs, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.ReadDirRequest, + Payload: &domain.ReadDirPayload{ + Dir: n.Path(), + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return nil, responseMsg.Payload.(error) + } + + var osFileEntries = make([]os.FileInfo, 0) + + // Transform event-response payload into a FileInfo slice. Notice that to + // convert []T1 struct to a []T2 one, we must iterate through each element + // and do the conversion one element at a time. 
+ dirEntries := responseMsg.Payload.([]domain.FileInfo) + for _, v := range dirEntries { + osFileEntries = append(osFileEntries, v) + } + + return osFileEntries, nil +} + +func (h *PassThrough) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. + nss := h.Service.NSenterService() + + event := nss.NewEvent( + req.Pid, + &domain.AllNSs, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.ReadLinkRequest, + Payload: &domain.ReadLinkPayload{ + Link: n.Path(), + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event to obtain file state within container + // namespaces. + err := nss.SendRequestEvent(event) + if err != nil { + return "", err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return "", responseMsg.Payload.(error) + } + + resp := responseMsg.Payload.(string) + + return resp, nil +} + +func (h *PassThrough) Setattr( + n domain.IOnodeIface, + req *domain.HandlerRequest) error { + + logrus.Debugf("Executing Setattr() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. 
+ nss := h.Service.NSenterService() + event := nss.NewEvent( + req.Pid, + &domain.AllNSs, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.OpenFileRequest, + Payload: &domain.OpenFilePayload{ + File: n.Path(), + Flags: strconv.Itoa(n.OpenFlags()), + Mode: strconv.Itoa(int(n.OpenMode())), + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return responseMsg.Payload.(error) + } + + return nil +} + +// Auxiliary method to fetch the content of any given file within a container. +func (h *PassThrough) fetchFile( + process domain.ProcessIface, + namespaces []domain.NStype, + n domain.IOnodeIface, + offset int64, + data *[]byte) (int, error) { + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. + nss := h.Service.NSenterService() + + event := nss.NewEvent( + process.Pid(), + &namespaces, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.ReadFileRequest, + Payload: &domain.ReadFilePayload{ + File: n.Path(), + Offset: offset, + Len: len(*data), + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event to obtain file state within container + // namespaces. + err := nss.SendRequestEvent(event) + if err != nil { + return 0, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return 0, responseMsg.Payload.(error) + } + + *data = responseMsg.Payload.([]byte) + + return len(*data), nil +} + +// Auxiliary method to inject content into any given file within a container. 
+func (h *PassThrough) pushFile( + process domain.ProcessIface, + namespaces []domain.NStype, + n domain.IOnodeIface, + offset int64, + data []byte) (int, error) { + + mountSysfs, mountProcfs, cloneFlags := checkProcAndSysRemount(n) + + // Create nsenterEvent to initiate interaction with container namespaces. + nss := h.Service.NSenterService() + + event := nss.NewEvent( + process.Pid(), + &namespaces, + cloneFlags, + &domain.NSenterMessage{ + Type: domain.WriteFileRequest, + Payload: &domain.WriteFilePayload{ + File: n.Path(), + Offset: offset, + Data: data, + MountSysfs: mountSysfs, + MountProcfs: mountProcfs, + }, + }, + nil, + false, + ) + + // Launch nsenter-event to write file state within container + // namespaces. + err := nss.SendRequestEvent(event) + if err != nil { + return 0, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return 0, responseMsg.Payload.(error) + } + + return len(data), nil +} + +func (h *PassThrough) GetName() string { + return h.Name +} + +func (h *PassThrough) GetPath() string { + return h.Path +} + +func (h *PassThrough) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *PassThrough) GetEnabled() bool { + return h.Enabled +} + +func (h *PassThrough) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *PassThrough) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *PassThrough) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *PassThrough) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +// 
checkProcAndSysRemount checks if the nsenter agent deployed by the passthrough handler +// should remount procfs and sysfs. +func checkProcAndSysRemount(n domain.IOnodeIface) (bool, bool, uint32) { + var ( + mountSysfs bool + mountProcfs bool + cloneFlags uint32 + ) + + // The nsenter agent will enter/join the namespaces of the container, + // including the mount ns. This way, the nsenter process no longer carries + // the sysbox-fs mounts as it enters the container. + // + // However, when accessing files under /proc or /sys, the agent needs to + // remount these as otherwise they won't pick up the container's assigned + // resources (e.g., net devices, etc). + // + // To avoid the container processes seeing the nsenter process mounts of + // procfs and sysfs, we direct the nsenter agent create a new mount namespace + // so as to not mess up mounts in the container (see cloneFlags below). The + // creation of this new mount ns occurs **after** the nsenter process has + // joined the container namespaces (see sysbox-runc/libcontainer/nsexec). Thus it's + // equivalent to a "setns" to join all container namespaces, immediately + // followed by an "unshare" of the mount namespace. + + if strings.HasPrefix(n.Path(), "/sys/") { + mountSysfs = true + } + + if strings.HasPrefix(n.Path(), "/proc/") { + mountProcfs = true + } + + // Tell nsenter agent to unshare the mount-ns (occurs after nsenter has + // already joined container namespaces). + if mountSysfs || mountProcfs { + cloneFlags = unix.CLONE_NEWNS + } + + return mountSysfs, mountProcfs, cloneFlags +} diff --git a/sysbox-fs/handler/implementations/passThrough_test.go b/sysbox-fs/handler/implementations/passThrough_test.go new file mode 100644 index 00000000..be0fd00d --- /dev/null +++ b/sysbox-fs/handler/implementations/passThrough_test.go @@ -0,0 +1,1122 @@ +// +// Copyright 2019-2023 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations_test + +import ( + "io/ioutil" + "os" + "reflect" + "strconv" + "syscall" + "testing" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/handler/implementations" + "github.com/nestybox/sysbox-fs/mocks" + "github.com/nestybox/sysbox-fs/mount" + "github.com/nestybox/sysbox-fs/nsenter" + "github.com/nestybox/sysbox-fs/process" + "github.com/nestybox/sysbox-fs/state" + "github.com/nestybox/sysbox-fs/sysio" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// Sysbox-fs global services for all handler's testing consumption. +var css domain.ContainerStateServiceIface +var mts domain.MountServiceIface +var ios domain.IOServiceIface +var prs domain.ProcessServiceIface +var nss *mocks.NSenterServiceIface +var hds *mocks.HandlerServiceIface + +func TestMain(m *testing.M) { + + // Disable log generation during UT. + logrus.SetOutput(ioutil.Discard) + + // + // Test-cases common settings. + // + ios = sysio.NewIOService(domain.IOMemFileService) + prs = process.NewProcessService() + nss = &mocks.NSenterServiceIface{} + hds = &mocks.HandlerServiceIface{} + css = state.NewContainerStateService() + mts = mount.NewMountService() + + prs.Setup(ios) + css.Setup(nil, prs, ios, mts) + mts.Setup(css, hds, prs, nss) + + // HandlerService's common mocking instructions. 
+ hds.On("NSenterService").Return(nss) + hds.On("ProcessService").Return(prs) + hds.On("DirHandlerEntries", "/proc/sys/net").Return(nil) + + // Run test-suite. + m.Run() +} + +func TestPassThrough_Lookup(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + + var f1 = fields{ + Name: "PassThrough", + Path: "PassThrough", + Service: hds, + } + + type args struct { + n domain.IOnodeIface + req *domain.HandlerRequest + } + + // Valid method arguments. + var a1 = args{ + n: ios.NewIOnode("net", "/proc/sys/net", 0), + req: &domain.HandlerRequest{ + Pid: 1001, + Container: css.ContainerCreate( + "c1", + uint32(1001), + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + nil), + }, + } + + tests := []struct { + name string + fields fields + args args + want os.FileInfo + wantErr bool + wantErrVal error + prepare func() + }{ + { + // + // Test-case 1: Regular Lookup operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + want: domain.FileInfo{Fname: a1.n.Path(), Fsize: 32768}, + wantErr: false, + wantErrVal: nil, + prepare: func() { + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.LookupRequest, + Payload: &domain.LookupPayload{a1.n.Path(), false, true}, + }, + } + + // Expected nsenter response. 
+ nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.LookupResponse, + Payload: domain.FileInfo{ + Fname: a1.n.Path()}, + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + { + // + // Test-case 2: Verify proper behavior during nsenter error conditions + // (EACCESS). + // + name: "2", + fields: f1, + args: a1, + want: nil, + wantErr: true, + wantErrVal: syscall.EACCES, + prepare: func() { + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.LookupRequest, + Payload: &domain.LookupPayload{a1.n.Path(), false, true}, + }, + } + + // Expected nsenter response. + nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: syscall.Errno(syscall.EACCES), + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare() + } + + got, err := h.Lookup(tt.args.n, tt.args.req) + if (err != nil) != tt.wantErr { + t.Errorf("PassThrough.Lookup() error = %v, wantErr %v", + err, tt.wantErr) + return + } + if err != nil && tt.wantErrVal != nil && err.Error() != tt.wantErrVal.Error() { + t.Errorf("PassThrough.Lookup() error = %v, wantErr %v, wantErrVal %v", + err, tt.wantErr, tt.wantErrVal) + return + } + + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("PassThrough.Lookup() = %v, want %v", got, tt.want) + } + + // Ensure that mocks were properly invoked and reset expectedCalls + // object. + nss.AssertExpectations(t) + nss.ExpectedCalls = nil + }) + } +} + +func TestPassThrough_Open(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + + var f1 = fields{ + Name: "PassThrough", + Path: "PassThrough", + Service: hds, + } + + type args struct { + n domain.IOnodeIface + req *domain.HandlerRequest + } + + // Valid method arguments. + var a1 = args{ + n: ios.NewIOnode("net", "/proc/sys/net", 0), + req: &domain.HandlerRequest{ + Pid: 1001, + Container: css.ContainerCreate( + "c1", + uint32(1001), + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + nil), + }, + } + + tests := []struct { + name string + fields fields + args args + wantErr bool + wantErrVal error + prepare func() + }{ + { + // + // Test-case 1: Regular Open operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + wantErr: false, + wantErrVal: nil, + prepare: func() { + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.OpenFileRequest, + Payload: &domain.OpenFilePayload{ + File: a1.n.Path(), + Flags: strconv.Itoa(a1.n.OpenFlags()), + Mode: strconv.Itoa(int(a1.n.OpenMode())), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. 
+ nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.OpenFileResponse, + Payload: nil, + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + { + // + // Test-case 2: Verify proper behavior during nsenter error conditions + // (EACCESS). + // + name: "2", + fields: f1, + args: a1, + wantErr: true, + wantErrVal: syscall.EPERM, + prepare: func() { + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.OpenFileRequest, + Payload: &domain.OpenFilePayload{ + File: a1.n.Path(), + Flags: strconv.Itoa(a1.n.OpenFlags()), + Mode: strconv.Itoa(int(a1.n.OpenMode())), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. + nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: syscall.Errno(syscall.EPERM), + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + + // Prepare the mocks. 
+ if tt.prepare != nil {
+ tt.prepare()
+ }
+
+ _, err := h.Open(tt.args.n, tt.args.req)
+ if (err != nil) != tt.wantErr {
+ t.Errorf("PassThrough.Open() error = %v, wantErr %v", err, tt.wantErr)
+ }
+ if err != nil && tt.wantErrVal != nil && err.Error() != tt.wantErrVal.Error() {
+ t.Errorf("PassThrough.Open() error = %v, wantErr %v, wantErrVal %v",
+ err, tt.wantErr, tt.wantErrVal)
+ }
+
+ // Ensure that mocks were properly invoked and reset expectedCalls
+ // object.
+ nss.AssertExpectations(t)
+ nss.ExpectedCalls = nil
+ })
+ }
+}
+
+func TestPassThrough_Read(t *testing.T) {
+ type fields struct {
+ Name string
+ Path string
+ Service domain.HandlerServiceIface
+ }
+
+ // Caching enabled.
+ var f1 = fields{
+ Name: "PassThrough",
+ Path: "PassThrough",
+ Service: hds,
+ }
+
+ type args struct {
+ n domain.IOnodeIface
+ req *domain.HandlerRequest
+ }
+
+ // Valid method arguments.
+ var a1 = args{
+ n: ios.NewIOnode("node_1", "/proc/sys/net/node_1", 0),
+ req: &domain.HandlerRequest{
+ Pid: 1001,
+ Data: make([]byte, len(string("file content 0123456789"))),
+ Container: css.ContainerCreate(
+ "c1",
+ uint32(1001),
+ time.Time{},
+ 231072,
+ 65535,
+ 231072,
+ 65535,
+ nil,
+ nil,
+ css),
+ },
+ }
+
+ tests := []struct {
+ name string
+ fields fields
+ args args
+ want int
+ wantErr bool
+ wantErrVal error
+ prepare func()
+ }{
+ {
+ //
+ // Test-case 1: Regular Read operation. No errors expected.
+ //
+ name: "1",
+ fields: f1,
+ args: a1,
+ want: len(string("file content 0123456789")),
+ wantErr: false,
+ wantErrVal: nil,
+ prepare: func() {
+
+ // Setup dynamic state associated to tested container.
+ c1 := a1.req.Container
+ _ = c1.SetInitProc(c1.InitPid(), c1.UID(), c1.GID())
+ c1.InitProc().CreateNsInodes(123456)
+
+ // Expected nsenter request. 
+ nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.ReadFileRequest, + Payload: &domain.ReadFilePayload{ + File: a1.n.Path(), + Offset: 0, + Len: len(string("file content 0123456789")), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. + nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.ReadFileResponse, + Payload: []byte("file content 0123456789"), + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + got, err := h.Read(tt.args.n, tt.args.req) + if (err != nil) != tt.wantErr { + t.Errorf("PassThrough.Read() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("PassThrough.Read() = %v, want %v", got, tt.want) + } + + // Ensure that mocks were properly invoked and reset expectedCalls + // object. + nss.AssertExpectations(t) + nss.ExpectedCalls = nil + }) + } +} + +func TestPassThrough_Write(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + + var f1 = fields{ + Name: "PassThrough", + Path: "PassThrough", + Service: hds, + } + + type args struct { + n domain.IOnodeIface + req *domain.HandlerRequest + } + + // Valid method arguments. 
+ var a1 = args{ + n: ios.NewIOnode("node_1", "/proc/sys/net/node_1", 0), + req: &domain.HandlerRequest{ + Pid: 1001, + Data: []byte(string("file content 0123456789")), + Container: css.ContainerCreate( + "c1", + uint32(1001), + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + css), + }, + } + + tests := []struct { + name string + fields fields + args args + want int + wantErr bool + wantErrVal error + prepare func() + }{ + { + // + // Test-case 1: Regular Write operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + want: len(string("file content 0123456789")), + wantErr: false, + wantErrVal: nil, + prepare: func() { + + // Setup dynamic state associated to tested container. + c1 := a1.req.Container + _ = c1.SetInitProc(c1.InitPid(), c1.UID(), c1.GID()) + c1.InitProc().CreateNsInodes(123456) + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.WriteFileRequest, + Payload: &domain.WriteFilePayload{ + File: a1.n.Path(), + Offset: 0, + Data: []byte("file content 0123456789"), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. + nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.WriteFileResponse, + Payload: "file content 0123456789", + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + { + // + // Test-case 2: Verify proper behavior during nsenter error conditions + // (EACCESS). 
+ // + name: "2", + fields: f1, + args: a1, + want: 0, + wantErr: true, + wantErrVal: syscall.EACCES, + prepare: func() { + + // Setup dynamic state associated to tested container. + c1 := a1.req.Container + _ = c1.SetInitProc(c1.InitPid(), c1.UID(), c1.GID()) + c1.InitProc().CreateNsInodes(123456) + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.WriteFileRequest, + Payload: &domain.WriteFilePayload{ + File: a1.n.Path(), + Offset: 0, + Data: []byte("file content 0123456789"), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. + nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: syscall.Errno(syscall.EACCES), + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + got, err := h.Write(tt.args.n, tt.args.req) + if (err != nil) != tt.wantErr { + t.Errorf("PassThrough.Write() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("PassThrough.Write() = %v, want %v", got, tt.want) + } + + // Ensure that mocks were properly invoked and reset expectedCalls + // object. 
+ nss.AssertExpectations(t) + nss.ExpectedCalls = nil + }) + } +} + +func TestPassThrough_ReadDirAll(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + + var f1 = fields{ + Name: "PassThrough", + Path: "PassThrough", + Service: hds, + } + + type args struct { + n domain.IOnodeIface + req *domain.HandlerRequest + } + + // Valid method arguments. + var a1 = args{ + n: ios.NewIOnode("net", "/proc/sys/net", 0), + req: &domain.HandlerRequest{ + Pid: 1001, + Container: css.ContainerCreate( + "c1", + uint32(1001), + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + css), + }, + } + + // Expected responses. + var t1_result = []os.FileInfo{ + domain.FileInfo{ + Fname: "/proc/sys/net/ipv4", + }, + domain.FileInfo{ + Fname: "/proc/sys/net/ipv6", + }, + } + + tests := []struct { + name string + fields fields + args args + want []os.FileInfo + wantErr bool + wantErrVal error + prepare func() + }{ + { + // + // Test-case 1: Regular ReadDirAll operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + want: t1_result, + wantErr: false, + wantErrVal: nil, + prepare: func() { + + // Setup dynamic state associated to tested container. + c1 := a1.req.Container + _ = c1.SetInitProc(c1.InitPid(), c1.UID(), c1.GID()) + c1.InitProc().CreateNsInodes(123456) + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.ReadDirRequest, + Payload: &domain.ReadDirPayload{ + Dir: a1.n.Path(), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. 
+ nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.ReadDirResponse, + Payload: []domain.FileInfo{ + domain.FileInfo{ + Fname: "/proc/sys/net/ipv4", + }, + domain.FileInfo{ + Fname: "/proc/sys/net/ipv6", + }, + }, + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + { + // + // Test-case 2: Verify proper behavior during nsenter error conditions + // (EACCESS). + // + name: "2", + fields: f1, + args: a1, + want: nil, + wantErr: true, + wantErrVal: syscall.EACCES, + prepare: func() { + + // Setup dynamic state associated to tested container. + c1 := a1.req.Container + _ = c1.SetInitProc(c1.InitPid(), c1.UID(), c1.GID()) + c1.InitProc().CreateNsInodes(123456) + + // Expected nsenter request. + nsenterEventReq := &nsenter.NSenterEvent{ + Pid: a1.req.Pid, + Namespace: &domain.AllNSs, + ReqMsg: &domain.NSenterMessage{ + Type: domain.ReadDirRequest, + Payload: &domain.ReadDirPayload{ + Dir: a1.n.Path(), + MountSysfs: false, + MountProcfs: true, + }, + }, + } + + // Expected nsenter response. + nsenterEventResp := &nsenter.NSenterEvent{ + ResMsg: &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: syscall.Errno(syscall.EACCES), + }, + } + + nss.On( + "NewEvent", + a1.req.Pid, + &domain.AllNSs, + uint32(unix.CLONE_NEWNS), + nsenterEventReq.ReqMsg, + (*domain.NSenterMessage)(nil), + false).Return(nsenterEventReq) + + nss.On("SendRequestEvent", nsenterEventReq).Return(nil) + nss.On("ReceiveResponseEvent", nsenterEventReq).Return(nsenterEventResp.ResMsg) + }, + }, + } + + // + // Testcase executions. 
+ // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + got, err := h.ReadDirAll(tt.args.n, tt.args.req) + if (err != nil) != tt.wantErr { + t.Errorf("PassThrough.ReadDirAll() error = %v, wantErr %v", + err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("PassThrough.ReadDirAll() = %v, want %v", + got, tt.want) + } + + // Ensure that mocks were properly invoked and reset expectedCalls + // object. + nss.AssertExpectations(t) + nss.ExpectedCalls = nil + + }) + } +} + +func TestPassThrough_Setattr(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + type args struct { + n domain.IOnodeIface + req *domain.HandlerRequest + } + tests := []struct { + name string + fields fields + args args + wantErr bool + }{ + // TODO: Add test cases. + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + if err := h.Setattr(tt.args.n, tt.args.req); (err != nil) != tt.wantErr { + t.Errorf("PassThrough.Setattr() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestPassThrough_GetName(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + tests := []struct { + name string + fields fields + want string + }{ + // TODO: Add test cases. 
+ } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + if got := h.GetName(); got != tt.want { + t.Errorf("PassThrough.GetName() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestPassThrough_GetPath(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + tests := []struct { + name string + fields fields + want string + }{ + // TODO: Add test cases. + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + if got := h.GetPath(); got != tt.want { + t.Errorf("PassThrough.GetPath() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestPassThrough_GetService(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + tests := []struct { + name string + fields fields + want domain.HandlerServiceIface + }{ + // TODO: Add test cases. + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + if got := h.GetService(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("PassThrough.GetService() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestPassThrough_SetService(t *testing.T) { + type fields struct { + Name string + Path string + Service domain.HandlerServiceIface + } + type args struct { + hs domain.HandlerServiceIface + } + tests := []struct { + name string + fields fields + args args + }{ + // TODO: Add test cases. 
+ } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.PassThrough{ + domain.HandlerBase{ + Name: tt.fields.Name, + Path: tt.fields.Path, + Service: tt.fields.Service, + }, + } + h.SetService(tt.args.hs) + }) + } +} diff --git a/sysbox-fs/handler/implementations/procSwaps.go b/sysbox-fs/handler/implementations/procSwaps.go new file mode 100644 index 00000000..bf1f028f --- /dev/null +++ b/sysbox-fs/handler/implementations/procSwaps.go @@ -0,0 +1,198 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package implementations + +import ( + "io" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/swaps handler +// + +// /proc/swaps static header +var swapsHeader = "Filename Type Size Used Priority" + +type ProcSwaps struct { + domain.HandlerBase +} + +var ProcSwaps_Handler = &ProcSwaps{ + domain.HandlerBase{ + Name: "ProcSwaps", + Path: "/proc/swaps", + Enabled: true, + }, +} + +func (h *ProcSwaps) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + info := &domain.FileInfo{ + Fname: resource, + Fmode: os.FileMode(uint32(0444)), + FmodTime: time.Now(), + Fsize: 4096, + } + + return info, nil +} + +func (h *ProcSwaps) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + flags := n.OpenFlags() + + if flags&syscall.O_WRONLY == syscall.O_WRONLY || + flags&syscall.O_RDWR == syscall.O_RDWR { + return false, fuse.IOerror{Code: syscall.EACCES} + } + + return false, nil +} + +func (h *ProcSwaps) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.readSwaps(n, req) +} + +func (h *ProcSwaps) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return 0, nil +} + +func (h *ProcSwaps) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for 
req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + return nil, nil +} + +func (h *ProcSwaps) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return "", nil +} + +func (h *ProcSwaps) GetName() string { + return h.Name +} + +func (h *ProcSwaps) GetPath() string { + return h.Path +} + +func (h *ProcSwaps) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSwaps) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSwaps) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSwaps) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcSwaps) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSwaps) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +func (h *ProcSwaps) readSwaps( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing %v Read() method", h.Name) + + if req.Offset > 0 { + return 0, io.EOF + } + + // Pretend swapping is off + // + // TODO: fix this once Sysbox intercepts the swapon() and swapoff() syscalls. + + req.Data = []byte(swapsHeader + "\n") + + return len(req.Data), nil +} diff --git a/sysbox-fs/handler/implementations/procSys.go b/sysbox-fs/handler/implementations/procSys.go new file mode 100644 index 00000000..d63b00c2 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSys.go @@ -0,0 +1,189 @@ +// +// Copyright 2019-2023 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "time" + + "github.com/nestybox/sysbox-fs/domain" + + "github.com/sirupsen/logrus" +) + +// +// /proc/sys handler +// +// Handles all accesses to /proc/sys. Currently just a thin wrapper over the +// pass-through handler. +// + +type ProcSys struct { + domain.HandlerBase +} + +var ProcSys_Handler = &ProcSys{ + domain.HandlerBase{ + Name: "ProcSys", + Path: "/proc/sys", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + ".": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0555)), + Enabled: true, + }, + }, + }, +} + +func (h *ProcSys) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var resource = relpath + + if v, ok := h.EmuResourceMap[resource]; ok { + if resource == "." 
{ + resource = "sys" + } + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + return info, nil + } + + return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSys) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().Open(n, req) +} + +func (h *ProcSys) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSys) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSys) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadDirAll(n, req) +} + +func (h *ProcSys) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSys) GetName() string { + return h.Name +} + +func (h *ProcSys) GetPath() string { + return h.Path +} + +func (h *ProcSys) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSys) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSys) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSys) 
GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcSys) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSys) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysFs.go b/sysbox-fs/handler/implementations/procSysFs.go new file mode 100644 index 00000000..4f5739e6 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysFs.go @@ -0,0 +1,274 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/sys/fs handler +// +// Emulated resources: +// +// * /proc/sys/fs/file-max +// * /proc/sys/fs/nr_open +// * /proc/sys/fs/protected_hardlinks +// * /proc/sys/fs/protected_symlinks +// + +const ( + minProtectedSymlinksVal = 0 + maxProtectedSymlinksVal = 1 +) + +const ( + minProtectedHardlinksVal = 0 + maxProtectedHardlinksVal = 1 +) + +type ProcSysFs struct { + domain.HandlerBase +} + +var ProcSysFs_Handler = &ProcSysFs{ + domain.HandlerBase{ + Name: "ProcSysFs", + Path: "/proc/sys/fs", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "file-max": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "nr_open": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "protected_hardlinks": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0600)), + Enabled: true, + Size: 1024, + }, + "protected_symlinks": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0600)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysFs) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // sys container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysFs) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "file-max": + return false, nil + + case "nr_open": + return false, nil + + case "protected_hardlinks": + return false, nil + + case "protected_symlinks": + return false, nil + } + + return h.Service.GetPassThroughHandler().Open(n, req) +} + +func (h *ProcSysFs) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "file-max": + return readCntrData(h, n, req) + + case "nr_open": + return readCntrData(h, n, req) + + case "protected_hardlinks": + return readCntrData(h, n, req) + + case "protected_symlinks": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. 
+ return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysFs) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "file-max": + return writeCntrData(h, n, req, writeMaxIntToFs) + + case "nr_open": + return writeCntrData(h, n, req, writeMaxIntToFs) + + case "protected_hardlinks": + if !checkIntRange(req.Data, minProtectedHardlinksVal, maxProtectedHardlinksVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "protected_symlinks": + if !checkIntRange(req.Data, minProtectedSymlinksVal, maxProtectedSymlinksVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysFs) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Return all entries as seen within container's namespaces. 
+ return h.Service.GetPassThroughHandler().ReadDirAll(n, req) +} + +func (h *ProcSysFs) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysFs) GetName() string { + return h.Name +} + +func (h *ProcSysFs) GetPath() string { + return h.Path +} + +func (h *ProcSysFs) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysFs) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysFs) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysFs) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcSysFs) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysFs) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysKernel.go b/sysbox-fs/handler/implementations/procSysKernel.go new file mode 100644 index 00000000..613c6d30 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysKernel.go @@ -0,0 +1,597 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/sys/kernel handler +// +// Emulated resources: +// +// * /proc/sys/kernel/cap_last_cap +// +// Documentation: The value in this file exposes the numerical value of the +// highest capability supported by the running kernel ('37' as of today's +// latest / 5.X kernels ). +// +// This handler is used for performance reasons (rather than functional reasons), +// as having it avoids using the passthrough (common) handler for accesses to +// /proc/sys/kernel/cap_last_cap which is the most commonly accessed sysctl. +// +// +// * /proc/sys/kernel/sysrq +// +// Documentation: It is a ‘magical’ key combo you can hit which the kernel will +// respond to regardless of whatever else it is doing, unless it is completely +// locked up. +// +// Supported values: +// +// 0 - disable sysrq completely +// +// 1 - enable all functions of sysrq +// +// >1 - bitmask of allowed sysrq functions (see below for detailed function +// description): +// +// 2 = 0x2 - enable control of console logging level +// 4 = 0x4 - enable control of keyboard (SAK, unraw) +// 8 = 0x8 - enable debugging dumps of processes etc. 
+// 16 = 0x10 - enable sync command +// 32 = 0x20 - enable remount read-only +// 64 = 0x40 - enable signalling of processes (term, kill, oom-kill) +// 128 = 0x80 - allow reboot/poweroff +// 256 = 0x100 - allow nicing of all RT tasks +// +// Note: As this is a system-wide attribute, changes will be only made +// superficially (at sys-container level). IOW, the host FS value will be left +// untouched. +// +// +// * /proc/sys/kernel/panic handler +// +// Documentation: The value in this file represents the number of seconds the +// kernel waits before rebooting on a panic. The default setting is 0, which +// doesn't cause a reboot. +// +// Taking into account the semantics of the value held within this file (time +// units), and the obvious conflicts that can arise among containers / hosts +// when defining different values, in this implementation we have opted by +// allowing read/write operations within the container, but we don't push +// these values down to the host FS. IOW, the host value will be the one +// honored at panic time. +// +// +// * /proc/sys/kernel/panic_on_oops handler +// +// Documentation: The value in this file defines the kernel behavior +// when an 'oops' is encountered. The following values are supported: +// +// 0: try to continue operation (default option) +// +// 1: panic immediately. If the 'panic' procfs node is also non-zero then the +// machine will be rebooted. +// +// Taking into account that kernel can either operate in one mode or the other, +// we cannot let the values defined within a sys container to be pushed down to +// the host FS, as that could potentially affect the overall system stability. +// IOW, the host value will be the one honored upon 'oops' arrival. +// +// +// * /proc/sys/kernel/kptr_restrict +// +// Documentation: This toggle indicates whether restrictions are placed on +// exposing kernel addresses via /proc and other interfaces. 
+// +// Supported values: +// +// - "0": (default) the address is hashed before printing. (This is the +// equivalent to %p.). +// +// - "1": kernel pointers printed using the %pK format specifier will be +// replaced with 0's unless the user has CAP_SYSLOG and effective user and +// group ids are equal to the real ids. This is because %pK checks are done +// at read() time rather than open() time, so if permissions are elevated +// between the open() and the read() (e.g via a setuid binary) then %pK will +// not leak kernel pointers to unprivileged users. Note, this is a temporary +// solution only. The correct long-term solution is to do the permission +// checks at open() time. Consider removing world read permissions from files +// that use %pK, and using dmesg_restrict to protect against uses of %pK in +// dmesg(8) if leaking kernel pointer values to unprivileged users is a +// concern. +// +// - "2": kernel pointers printed using %pK will be replaced with 0's +// regardless of privileges. +// +// Note: As this is a system-wide attribute with mutually-exclusive values, +// changes will be only made superficially (at sys-container level). IOW, +// the host FS value will be left untouched. +// +// +// * /proc/sys/kernel/dmesg_restrict +// +// Documentation: This toggle indicates whether unprivileged users are prevented +// from using dmesg(8) to view messages from the kernel’s log buffer. The following +// values are supported: +// +// 0: there are no restrictions +// +// 1: users must have CAP_SYSLOG to use dmesg +// +// Note: As this is a system-wide attribute with mutually-exclusive values, changes +// will be only made superficially (at sys-container level). IOW, the host FS value +// will be left untouched. As result, the value being set in this resource will have +// no impact on the output (if any) generated by 'dmesg'. 
+//
+//
+// * /proc/sys/kernel/ngroups_max handler
+//
+// Documentation: The numerical value stored in this file represents the maximum
+// number of supplementary groups of which a process can be a member of (65k in
+// kernels 2.2+). This is a system-wide number and does not appear to be
+// re-configurable at runtime, so we will proceed to cache its value on a
+// per-container basis.
+//
+// Notice that this resource is perfectly reachable within a regular or system
+// container. That's to say that our main purpose here is not 'functional'; we
+// are creating this handler to enhance sysbox-fs performance: every 'sudo'
+// instruction does two consecutive reads() over this resource -- and that
+// entails the execution of all the other file-operations too (i.e. lookup,
+// getattr, etc).
+//
+//
+// * /proc/sys/kernel/printk handler
+//
+// Documentation: The four values in printk denote: console_loglevel,
+// default_message_loglevel, minimum_console_loglevel and default_console_loglevel
+// respectively. These values influence printk() behavior when printing or logging
+// error messages.
+//
+// Supported values:
+//
+// - console_loglevel: messages with a higher priority than this will be printed
+//   to the console.
+// - default_message_loglevel: messages without an explicit priority will be
+//   printed with this priority.
+// - minimum_console_loglevel: minimum (highest) value to which console_loglevel
+//   can be set.
+// - default_console_loglevel: default value for console_loglevel.
+//
+// Note 1: As this is a system-wide attribute with mutually-exclusive values,
+// changes will be only made superficially (at sys-container level). IOW,
+// the host FS value will be left untouched.
+//
+// Note 2: For this specific node we are not verifying that the values passed by
+// the user in write() operations match the semantics and the format expected by
+// the kernel. This is something that we may need to improve in the future.
+// Example: "4 4 1 7".
+// +// +// * /proc/sys/kernel/pid_max (since Linux 2.5.34) +// +// Documentation: This file specifies the value at which PIDs wrap around (i.e., +// the value in this file is one greater than the maximum PID). PIDs greater +// than this value are not allocated; thus, the value in this file also acts as +// a system-wide limit on the total number of processes and threads. The +// default value for this file, 32768, results in the same range of PIDs as on +// earlier kernels. On 32-bit platforms, 32768 is the maximum value for +// pid_max. On 64-bit systems, pid_max can be set to any value up to 2^22 +// (PID_MAX_LIMIT, approximately 4 million). +// + +const ( + minSysrqVal = 0 + maxSysrqVal = 511 + + minKptrRestrictVal = 0 + maxKptrRestrictVal = 3 + + minDmesgRestrictVal = 0 + maxDmesgRestrictVal = 1 + + minPanicOopsVal = 0 + maxPanicOopsVal = 1 + + minPidMaxVal = 1 + maxPidMaxVal = 4194304 +) + +type ProcSysKernel struct { + domain.HandlerBase +} + +var ProcSysKernel_Handler = &ProcSysKernel{ + domain.HandlerBase{ + Name: "ProcSysKernel", + Path: "/proc/sys/kernel", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "domainname": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 4096, + }, + "hostname": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 4096, + }, + "kptr_restrict": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "dmesg_restrict": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "ngroups_max": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0444)), + Enabled: true, + Size: 1024, + }, + "cap_last_cap": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0444)), + Enabled: true, + Size: 1024, + }, + "panic": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 4096, + }, + 
"panic_on_oops": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "printk": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "sysrq": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "pid_max": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysKernel) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysKernel) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + flags := n.OpenFlags() + + switch resource { + case "cap_last_cap": + if flags&syscall.O_WRONLY == syscall.O_WRONLY || + flags&syscall.O_RDWR == syscall.O_RDWR { + return false, fuse.IOerror{Code: syscall.EACCES} + } + return false, nil + + case "pid_max": + return false, nil + + case "ngroups_max": + if flags&syscall.O_WRONLY == syscall.O_WRONLY || + flags&syscall.O_RDWR == syscall.O_RDWR { + return false, fuse.IOerror{Code: syscall.EACCES} + } + return false, nil + + case "domainname": + return false, nil + + case "hostname": + return false, nil + + case "kptr_restrict": + return false, nil + + case "dmesg_restrict": + return false, nil + + case "panic": + return false, nil + + case "panic_on_oops": + return false, nil + + case "sysrq": + return false, nil + + case "printk": + return false, nil + + case "shmall": + fallthrough + case "shmmax": + fallthrough + case "shmmni": + return h.Service.GetPassThroughHandler().OpenWithNS(n, req, domain.AllNSsButUser) + } + + // Refer to generic handler if no node match is found above. 
+ return h.Service.GetPassThroughHandler().Open(n, req) +} + +func (h *ProcSysKernel) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "cap_last_cap": + return readCntrData(h, n, req) + + case "pid_max": + return readCntrData(h, n, req) + + case "ngroups_max": + return readCntrData(h, n, req) + + case "domainname": + return readCntrData(h, n, req) + + case "hostname": + return readCntrData(h, n, req) + + case "kptr_restrict": + return readCntrData(h, n, req) + + case "dmesg_restrict": + return readCntrData(h, n, req) + + case "panic": + return readCntrData(h, n, req) + + case "panic_on_oops": + return readCntrData(h, n, req) + + case "sysrq": + return readCntrData(h, n, req) + + case "printk": + return readCntrData(h, n, req) + + case "shmall": + fallthrough + case "shmmax": + fallthrough + case "shmmni": + return h.Service.GetPassThroughHandler().ReadWithNS(n, req, domain.AllNSsButUser) + } + + // Refer to generic handler if no node match is found above. 
+ return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysKernel) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "cap_last_cap": + return 0, nil + + case "ngroups_max": + return 0, nil + + case "pid_max": + if !checkIntRange(req.Data, minPidMaxVal, maxPidMaxVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "panic": + return writeCntrData(h, n, req, nil) + + case "printk": + return writeCntrData(h, n, req, nil) + + case "panic_on_oops": + // Even though only values 0 and 1 are defined for panic_on_oops, the + // kernel allows other values to be written; thus no range check is + // performed here. + return writeCntrData(h, n, req, nil) + + case "kptr_restrict": + if !checkIntRange(req.Data, minKptrRestrictVal, maxKptrRestrictVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "dmesg_restrict": + if !checkIntRange(req.Data, minDmesgRestrictVal, maxDmesgRestrictVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "sysrq": + if !checkIntRange(req.Data, minSysrqVal, maxSysrqVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "domainname": + return writeCntrData(h, n, req, nil) + + case "hostname": + return writeCntrData(h, n, req, nil) + + case "shmall": + fallthrough + case "shmmax": + fallthrough + case "shmmni": + // The kernel only allows true root to write to /proc/sys/kernel/shm*. + // Root in the container's user-namespaces is not allowed to modify these + // values, even though they are namespaced via the IPC namespace. 
+		// Therefore ask the passthrough handler to enter all namespaces except
+		// the user-ns, as otherwise we get permission denied.
+		return h.Service.GetPassThroughHandler().WriteWithNS(n, req, domain.AllNSsButUser)
+	}
+
+	// Refer to generic handler if no node match is found above.
+	return h.Service.GetPassThroughHandler().Write(n, req)
+}
+
+func (h *ProcSysKernel) ReadDirAll(
+	n domain.IOnodeIface,
+	req *domain.HandlerRequest) ([]os.FileInfo, error) {
+
+	logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s",
+		req.ID, h.Name, n.Name())
+
+	// Return all entries as seen within container's namespaces.
+	return h.Service.GetPassThroughHandler().ReadDirAll(n, req)
+}
+
+func (h *ProcSysKernel) ReadLink(
+	n domain.IOnodeIface,
+	req *domain.HandlerRequest) (string, error) {
+
+	logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s",
+		req.ID, h.Name, n.Name())
+
+	return h.Service.GetPassThroughHandler().ReadLink(n, req)
+}
+
+func (h *ProcSysKernel) GetName() string {
+	return h.Name
+}
+
+func (h *ProcSysKernel) GetPath() string {
+	return h.Path
+}
+
+func (h *ProcSysKernel) GetService() domain.HandlerServiceIface {
+	return h.Service
+}
+
+func (h *ProcSysKernel) GetEnabled() bool {
+	return h.Enabled
+}
+
+func (h *ProcSysKernel) SetEnabled(b bool) {
+	h.Enabled = b
+}
+
+func (h *ProcSysKernel) GetResourcesList() []string {
+
+	var resources []string
+
+	for resourceKey, resource := range h.EmuResourceMap {
+		resource.Mutex.Lock()
+		if !resource.Enabled {
+			resource.Mutex.Unlock()
+			continue
+		}
+		resource.Mutex.Unlock()
+
+		resources = append(resources, filepath.Join(h.GetPath(), resourceKey))
+	}
+
+	return resources
+}
+
+func (h *ProcSysKernel) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex {
+	resource, ok := h.EmuResourceMap[n.Name()]
+	if !ok {
+		return nil
+	}
+
+	return &resource.Mutex
+}
+
+func (h *ProcSysKernel) SetService(hs domain.HandlerServiceIface) {
+	h.Service = hs
+}
diff --git
a/sysbox-fs/handler/implementations/procSysKernelRandom.go b/sysbox-fs/handler/implementations/procSysKernelRandom.go new file mode 100644 index 00000000..7947afe6 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysKernelRandom.go @@ -0,0 +1,215 @@ +// +// Copyright 2019-2024 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "io" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/sys/kernel/random handler +// +// Emulated resources: +// +// * /proc/sys/kernel/random/uuid +// +// Documentation: a UUID generated every time this is retrieved (this can thus +// be used to generate UUIDs at will). It's emulated here because for some +// unknown reason the kernel returns the same value when this is read from +// inside a Sysbox container. 
+//
+
+type ProcSysKernelRandom struct {
+	domain.HandlerBase
+}
+
+var ProcSysKernelRandom_Handler = &ProcSysKernelRandom{
+	domain.HandlerBase{
+		Name:    "ProcSysKernelRandom",
+		Path:    "/proc/sys/kernel/random",
+		Enabled: true,
+		EmuResourceMap: map[string]*domain.EmuResource{
+			"uuid": {
+				Kind:    domain.FileEmuResource,
+				Mode:    os.FileMode(uint32(0444)),
+				Enabled: true,
+				Size:    1024,
+			},
+		},
+	},
+}
+
+func (h *ProcSysKernelRandom) Lookup(
+	n domain.IOnodeIface,
+	req *domain.HandlerRequest) (os.FileInfo, error) {
+
+	var resource = n.Name()
+
+	logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s",
+		req.ID, h.Name, resource)
+
+	// Return an artificial fileInfo if looked-up element matches any of the
+	// emulated nodes.
+	if v, ok := h.EmuResourceMap[resource]; ok {
+		info := &domain.FileInfo{
+			Fname:    resource,
+			Fmode:    v.Mode,
+			FmodTime: time.Now(),
+			Fsize:    v.Size,
+		}
+
+		return info, nil
+	}
+
+	// If looked-up element hasn't been found by now, let's look into the actual
+	// container rootfs.
+	return h.Service.GetPassThroughHandler().Lookup(n, req)
+}
+
+func (h *ProcSysKernelRandom) Open(
+	n domain.IOnodeIface,
+	req *domain.HandlerRequest) (bool, error) {
+
+	return false, nil
+}
+
+func (h *ProcSysKernelRandom) Read(
+	n domain.IOnodeIface,
+	req *domain.HandlerRequest) (int, error) {
+
+	var resource = n.Name()
+
+	logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s",
+		req.ID, h.Name, resource)
+
+	switch resource {
+	case "uuid":
+		// Read /proc/sys/kernel/random/uuid from the kernel
+		sz, err := readFs(h, n, req.Offset, &req.Data)
+		if err != nil && err != io.EOF {
+			return 0, fuse.IOerror{Code: syscall.EINVAL}
+		}
+		if sz == 0 && err == io.EOF {
+			return 0, nil
+		}
+		return sz, nil
+	}
+
+	// Refer to generic handler if no node match is found above.
+ return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysKernelRandom) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "uuid": + // uuid is read-only + return 0, fuse.IOerror{Code: syscall.EPERM} + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysKernelRandom) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Return all entries as seen within container's namespaces. + return h.Service.GetPassThroughHandler().ReadDirAll(n, req) +} + +func (h *ProcSysKernelRandom) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysKernelRandom) GetName() string { + return h.Name +} + +func (h *ProcSysKernelRandom) GetPath() string { + return h.Path +} + +func (h *ProcSysKernelRandom) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysKernelRandom) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysKernelRandom) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysKernelRandom) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcSysKernelRandom) 
GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysKernelRandom) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysKernelYamaPtrace.go b/sysbox-fs/handler/implementations/procSysKernelYamaPtrace.go new file mode 100644 index 00000000..0cef93dc --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysKernelYamaPtrace.go @@ -0,0 +1,262 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/sys/kernel/yama handler +// +// Emulated resources: +// +// * /proc/sys/kernel/yama/ptrace_scope +// +// Documentation: As Linux grows in popularity, it will become a larger target +// for malware. One particularly troubling weakness of the Linux process +// interfaces is that a single user is able to examine the memory and running +// state of any of their processes. For example, if one application (e.g. +// Pidgin) was compromised, it would be possible for an attacker to attach to +// other running processes (e.g. 
Firefox, SSH sessions, GPG agent, etc) to
+// extract additional credentials and continue to expand the scope of their
+// attack without resorting to user-assisted phishing.
+//
+// For a solution, some applications use prctl(PR_SET_DUMPABLE, ...) to
+// specifically disallow such ptrace attachment (e.g. ssh-agent), but many do
+// not. A more general solution is to only allow ptrace directly from a parent
+// to a child process (i.e. direct "gdb EXE" and "strace EXE" still work), or
+// with CAP_SYS_PTRACE (i.e. "gdb --pid=PID", and "strace -p PID" still work
+// as root).
+//
+// In mode 1, software that has defined application-specific relationships
+// between a debugging process and its inferior (crash handlers, etc),
+// prctl(PR_SET_PTRACER, pid, ...) can be used. An inferior can declare which
+// other process (and its descendants) are allowed to call PTRACE_ATTACH
+// against it. Only one such declared debugging process can exist for
+// each inferior at a time. For example, this is used by KDE, Chromium, and
+// Firefox's crash handlers, and by Wine for allowing only Wine processes
+// to ptrace each other. If a process wishes to entirely disable these ptrace
+// restrictions, it can call prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)
+// so that any otherwise allowed process (even those in external pid namespaces)
+// may attach.
+//
+// The sysctl settings (writable only with CAP_SYS_PTRACE) are:
+//
+// 0 - classic ptrace permissions: a process can PTRACE_ATTACH to any other
+//     process running under the same uid, as long as it is dumpable (i.e.
+//     did not transition uids, start privileged, or have called
+//     prctl(PR_SET_DUMPABLE...) already). Similarly, PTRACE_TRACEME is
+//     unchanged.
+//
+// 1 - restricted ptrace: a process must have a predefined relationship
+//     with the inferior it wants to call PTRACE_ATTACH on. By default,
+//     this relationship is that of only its descendants when the above
+//     classic criteria is also met.
To change the relationship, an +// inferior can call prctl(PR_SET_PTRACER, debugger, ...) to declare +// an allowed debugger PID to call PTRACE_ATTACH on the inferior. +// Using PTRACE_TRACEME is unchanged. +// +// 2 - admin-only attach: only processes with CAP_SYS_PTRACE may use ptrace +// with PTRACE_ATTACH, or through children calling PTRACE_TRACEME. +// +// 3 - no attach: no processes may use ptrace with PTRACE_ATTACH nor via +// PTRACE_TRACEME. Once set, this sysctl value cannot be changed. +// +// Note: As this is a system-wide attribute with mutually-exclusive values, +// changes will be only made superficially (at sys-container level). IOW, +// the host FS value will be left untouched. +// + +const ( + minScopeVal = 0 + maxScopeVal = 3 +) + +type ProcSysKernelYama struct { + domain.HandlerBase +} + +var ProcSysKernelYama_Handler = &ProcSysKernelYama{ + domain.HandlerBase{ + Name: "ProcSysKernelYama", + Path: "/proc/sys/kernel/yama", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "ptrace_scope": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, // value + newline + }, + }, + }, +} + +func (h *ProcSysKernelYama) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysKernelYama) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return false, nil +} + +func (h *ProcSysKernelYama) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "ptrace_scope": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysKernelYama) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "ptrace_scope": + if !checkIntRange(req.Data, minScopeVal, maxScopeVal) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysKernelYama) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Return all entries as seen within container's namespaces. 
+ return h.Service.GetPassThroughHandler().ReadDirAll(n, req) +} + +func (h *ProcSysKernelYama) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysKernelYama) GetName() string { + return h.Name +} + +func (h *ProcSysKernelYama) GetPath() string { + return h.Path +} + +func (h *ProcSysKernelYama) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysKernelYama) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysKernelYama) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysKernelYama) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcSysKernelYama) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysKernelYama) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysNetCore.go b/sysbox-fs/handler/implementations/procSysNetCore.go new file mode 100644 index 00000000..5f97dfdb --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysNetCore.go @@ -0,0 +1,283 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// /proc/sys/net/core handler +// +// Emulated resources: +// +// * /proc/sys/net/core/default_qdisc +// +// Documentation: The default queuing discipline to use for network devices. +// This allows overriding the default of pfifo_fast with an alternative. Since +// the default queuing discipline is created without additional parameters so +// is best suited to queuing disciplines that work well without configuration +// like stochastic fair queue (sfq), CoDel (codel) or fair queue CoDel +// (fq_codel). Don’t use queuing disciplines like Hierarchical Token Bucket or +// Deficit Round Robin which require setting up classes and bandwidths. Note +// that physical multiqueue interfaces still use mq as root qdisc, which in +// turn uses this default for its leaves. Virtual devices (like e.g. lo or +// veth) ignore this setting and instead default to noqueue. Default: +// pfifo_fast. +// +// Supported schedulers (https://github.com/torvalds/linux/blob/master/net/sched/Kconfig#L478): +// +// - "pfifo_fast" +// - "fq" +// - "fq_codel" +// - "sfq" +// - "pfifo_fast" +// +// As this is a system-wide attribute with mutually-exclusive values, changes +// will be only made superficially (at sys-container level). IOW, the host FS +// value will be left untouched. 
+// +// * /proc/sys/net/core/somaxconn +// +// Description: Limit of socket listen() backlog, known in userspace as SOMAXCONN. +// Somaxconn refers to the maximum number of clients that the server can accept +// to process data, that is, to complete the connection limit. Defaults to 128. +type ProcSysNetCore struct { + domain.HandlerBase +} + +var ProcSysNetCore_Handler = &ProcSysNetCore{ + domain.HandlerBase{ + Name: "ProcSysNetCore", + Path: "/proc/sys/net/core", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "default_qdisc": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "somaxconn": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysNetCore) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // sys container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysNetCore) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return false, nil +} + +func (h *ProcSysNetCore) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "default_qdisc": + return readCntrData(h, n, req) + + case "somaxconn": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysNetCore) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "default_qdisc": + return h.writeDefaultQdisc(n, req) + + case "somaxconn": + return writeCntrData(h, n, req, writeMaxIntToFs) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysNetCore) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + var fileEntries []os.FileInfo + + // Iterate through map of emulated components. + for k, _ := range h.EmuResourceMap { + info := &domain.FileInfo{ + Fname: k, + FmodTime: time.Now(), + } + + fileEntries = append(fileEntries, info) + } + + // Obtain the usual entries seen within container's namespaces and add them + // to the emulated ones. 
+ usualEntries, err := h.Service.GetPassThroughHandler().ReadDirAll(n, req) + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + + return fileEntries, nil +} + +func (h *ProcSysNetCore) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysNetCore) GetName() string { + return h.Name +} + +func (h *ProcSysNetCore) GetPath() string { + return h.Path +} + +func (h *ProcSysNetCore) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysNetCore) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysNetCore) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysNetCore) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} +func (h *ProcSysNetCore) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysNetCore) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +func (h *ProcSysNetCore) writeDefaultQdisc( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + newVal := strings.TrimSpace(string(req.Data)) + + // Only supported values must be accepted. 
+ switch newVal { + case "fq": + case "fq_codel": + case "sfq": + case "pfifo_fast": + default: + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + return writeCntrData(h, n, req, writeToFs) +} diff --git a/sysbox-fs/handler/implementations/procSysNetIpv4.go b/sysbox-fs/handler/implementations/procSysNetIpv4.go new file mode 100644 index 00000000..34acc4dc --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysNetIpv4.go @@ -0,0 +1,281 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package implementations + +import ( + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + + "github.com/nestybox/sysbox-runc/libcontainer/user" +) + +// /proc/sys/net/ipv4 handler +// +// Emulated resources: +// +// * /proc/sys/net/ipv4/ping_group_range + +type ProcSysNetIpv4 struct { + domain.HandlerBase +} + +var ProcSysNetIpv4_Handler = &ProcSysNetIpv4{ + domain.HandlerBase{ + Name: "ProcSysNetIpv4", + Path: "/proc/sys/net/ipv4", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "ping_group_range": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysNetIpv4) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. 
+ if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysNetIpv4) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return false, nil +} + +func (h *ProcSysNetIpv4) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysNetIpv4) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "ping_group_range": + return h.writePingGroupRange(n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysNetIpv4) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return all entries as seen within container's namespaces. 
+ return h.Service.GetPassThroughHandler().ReadDirAll(n, req) +} + +func (h *ProcSysNetIpv4) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysNetIpv4) GetName() string { + return h.Name +} + +func (h *ProcSysNetIpv4) GetPath() string { + return h.Path +} + +func (h *ProcSysNetIpv4) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysNetIpv4) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysNetIpv4) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysNetIpv4) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} +func (h *ProcSysNetIpv4) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysNetIpv4) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +func (h *ProcSysNetIpv4) writePingGroupRange( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var path = n.Path() + var origDataLength = len(req.Data) + + fields := strings.Fields(string(req.Data)) + if len(fields) != 2 { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + // Obtain mindGid / maxGid integer values. 
+ + minGid := strings.TrimSpace(fields[0]) + intMinGid, err := strconv.Atoi(minGid) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + maxGid := strings.TrimSpace(fields[1]) + intMaxGid, err := strconv.Atoi(maxGid) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + // Sanity-check input values. + if intMinGid < 0 { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + if intMaxGid > math.MaxInt32 { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + // Parse the container process' gid_map to extract the gid_size within the + // user-ns. + idMap, err := user.ParseIDMapFile(fmt.Sprintf("/proc/%d/gid_map", req.Pid)) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + // Cache the new provided range. Notice that this is done before we + // adjust the input values to account for the gid-size of the container's + // user-namespace. Our goal here is to cache the values provided by the + // user, even though we may end up pushing slightly different values down + // to kernel. + cntr := req.Container + cacheData := []byte(fmt.Sprintf("%s\t%s", minGid, maxGid)) + + cntr.Lock() + err = cntr.SetData(path, 0, cacheData) + if err != nil { + cntr.Unlock() + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + cntr.Unlock() + + // Adjust the received minGid / maxGid values if these ones happen to fall + // beyond the container's user-namespace boundaries. + + if intMinGid < (int(idMap[0].ID)) { + intMinGid = int(idMap[0].ID) + minGid = strconv.Itoa(intMinGid) + } + if intMaxGid > (int(idMap[0].Count) - 1) { + intMaxGid = (int(idMap[0].Count) - 1) + maxGid = strconv.Itoa(intMaxGid) + } + + req.Data = []byte(fmt.Sprintf("%s\t%s", minGid, maxGid)) + + // Tag the nsenter-request operation to prevent its handler from tampering + // with the already-formatted data, and from overwriting the already-cached + // information. 
+ req.NoCache = true + + len, err := h.Service.GetPassThroughHandler().Write(n, req) + if err != nil { + return len, err + } + + return origDataLength, nil +} diff --git a/sysbox-fs/handler/implementations/procSysNetIpv4Neigh.go b/sysbox-fs/handler/implementations/procSysNetIpv4Neigh.go new file mode 100644 index 00000000..8ac63869 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysNetIpv4Neigh.go @@ -0,0 +1,331 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package implementations + +import ( + "io" + "math" + "os" + "path/filepath" + "strings" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// /proc/sys/net/ipv4/neigh handler +// +// Emulated resources: +// +// * /proc/sys/net/ipv4/default/gc_thresh1 +// * /proc/sys/net/ipv4/default/gc_thresh2 +// * /proc/sys/net/ipv4/default/gc_thresh3 + +type ProcSysNetIpv4Neigh struct { + domain.HandlerBase +} + +var ProcSysNetIpv4Neigh_Handler = &ProcSysNetIpv4Neigh{ + domain.HandlerBase{ + Name: "ProcSysNetIpv4Neigh", + Path: "/proc/sys/net/ipv4/neigh", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "default": { + Kind: domain.DirEmuResource, + Mode: os.FileMode(uint32(0555)), + Enabled: true, + }, + "default/gc_thresh1": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "default/gc_thresh2": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "default/gc_thresh3": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysNetIpv4Neigh) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + var resource string + + // Obtain relative path to the element being looked up. + relPath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + // Adjust the looked-up element to match the emulated-nodes naming. + relPathDir := filepath.Dir(relPath) + if relPathDir == "." || + strings.HasPrefix(relPath, "default/gc_thresh") { + resource = relPath + } + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. 
+ if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + FmodTime: time.Now(), + Fsize: v.Size, + } + + if v.Kind == domain.DirEmuResource { + info.Fmode = os.FileMode(uint32(os.ModeDir)) | v.Mode + info.FisDir = true + } else if v.Kind == domain.FileEmuResource { + info.Fmode = v.Mode + } + + return info, nil + } + + // If looked-up element hasn't been found by now, look into the actual + // container rootfs. + return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysNetIpv4Neigh) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return false, nil +} + +func (h *ProcSysNetIpv4Neigh) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // We are dealing with a single boolean element being read, so we can save + // some cycles by returning right away if offset is any higher than zero. + if req.Offset > 0 { + return 0, io.EOF + } + + // Obtain relative path to the element being written. + relPath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return 0, err + } + + // Skip if node is not part of the emulated components. + if _, ok := h.EmuResourceMap[relPath]; !ok { + return 0, nil + } + + // As the "default" dir node isn't exposed within containers, sysbox's + // integration testsuites will fail when executing within the test framework. + // In these cases, we will redirect all "default" queries to a static node + // that is always present in the testing environment. 
+ if h.GetService().IgnoreErrors() && + strings.HasPrefix(relPath, "default/gc_thresh") { + n.SetName("lo/retrans_time") + n.SetPath("/proc/sys/net/ipv4/neigh/lo/retrans_time") + h.EmuResourceMap["lo/retrans_time"] = + &domain.EmuResource{Kind: domain.FileEmuResource, Mode: os.FileMode(uint32(0644))} + } + + return readCntrData(h, n, req) +} + +func (h *ProcSysNetIpv4Neigh) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Obtain relative path to the element being written. + relPath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return 0, err + } + + // Skip if node is not part of the emulated components. + if _, ok := h.EmuResourceMap[relPath]; !ok { + return 0, nil + } + + // As the "default" dir node isn't exposed within containers, sysbox's + // integration testsuites will fail when executing within the test framework. + // In these cases, we will redirect all "default" queries to a static node + // that is always present in the testing environment. + if h.GetService().IgnoreErrors() && + strings.HasPrefix(relPath, "default/gc_thresh") { + n.SetName("lo/retrans_time") + n.SetPath("/proc/sys/net/ipv4/neigh/lo/retrans_time") + h.EmuResourceMap["lo/retrans_time"] = + &domain.EmuResource{Kind: domain.FileEmuResource, Mode: os.FileMode(uint32(0644))} + } + + if !checkIntRange(req.Data, 0, math.MaxInt32) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + return writeCntrData(h, n, req, nil) +} + +func (h *ProcSysNetIpv4Neigh) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + var ( + info *domain.FileInfo + fileEntries []os.FileInfo + ) + + // Obtain relative path to the element being read. 
+ relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + // Iterate through map of virtual components. + for k, _ := range h.EmuResourceMap { + + if relpath == filepath.Dir(k) { + info = &domain.FileInfo{ + Fname: filepath.Base(k), + Fmode: os.ModeDir, + FmodTime: time.Now(), + FisDir: true, + } + + fileEntries = append(fileEntries, info) + + } else if relpath != "." && relpath == filepath.Dir(k) { + info = &domain.FileInfo{ + Fname: filepath.Base(k), + FmodTime: time.Now(), + } + + fileEntries = append(fileEntries, info) + } + } + + // Obtain the usual entries seen within container's namespaces and add them + // to the emulated ones. + usualEntries, err := h.Service.GetPassThroughHandler().ReadDirAll(n, req) + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + + return fileEntries, nil +} + +func (h *ProcSysNetIpv4Neigh) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysNetIpv4Neigh) GetName() string { + return h.Name +} + +func (h *ProcSysNetIpv4Neigh) GetPath() string { + return h.Path +} + +func (h *ProcSysNetIpv4Neigh) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysNetIpv4Neigh) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysNetIpv4Neigh) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysNetIpv4Neigh) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h 
*ProcSysNetIpv4Neigh) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + + // Obtain the relative path to the element being acted on. + relPath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil + } + + // Identify the associated entry matching the passed node and, if found, + // return its mutex. + for k, v := range h.EmuResourceMap { + if match, _ := filepath.Match(k, relPath); match { + return &v.Mutex + } + } + + return nil +} + +func (h *ProcSysNetIpv4Neigh) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysNetIpv4Vs.go b/sysbox-fs/handler/implementations/procSysNetIpv4Vs.go new file mode 100644 index 00000000..86d78c21 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysNetIpv4Vs.go @@ -0,0 +1,283 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "math" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/sys/net/ipv4/vs handler +// +// Note: The procfs nodes managed by this handler will only be visible if the +// path they are part of (/proc/sys/net/ipv4/vs") is exposed within the system, +// which can only happen if the "ip_vs" kernel module is loaded. 
+// +// Note: the resources handled by this handler are already namespaced by the +// Linux kernel's net-ns. However, these resources are hidden inside non-init +// user-namespace. Thus, this handler's only purpose is to expose these +// resources inside a sys container. +// +// Emulated resources: +// +// * /proc/sys/net/ipv4/vs/conn_reuse_mode handler +// * /proc/sys/net/ipv4/vs/expire_nodest_conn handler +// * /proc/sys/net/ipv4/vs/expire_quiescent_template handler +// + +const ( + minConnReuseMode = 0 + maxConnReuseMode = 1 +) + +type ProcSysNetIpv4Vs struct { + domain.HandlerBase +} + +var ProcSysNetIpv4Vs_Handler = &ProcSysNetIpv4Vs{ + domain.HandlerBase{ + Name: "ProcSysNetIpv4Vs", + Path: "/proc/sys/net/ipv4/vs", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "conntrack": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "conn_reuse_mode": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "expire_nodest_conn": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "expire_quiescent_template": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + }, + }, +} + +func (h *ProcSysNetIpv4Vs) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // sys container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysNetIpv4Vs) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return false, nil +} + +func (h *ProcSysNetIpv4Vs) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "conntrack": + return readCntrData(h, n, req) + + case "conn_reuse_mode": + return readCntrData(h, n, req) + + case "expire_nodest_conn": + return readCntrData(h, n, req) + + case "expire_quiescent_template": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysNetIpv4Vs) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "conntrack": + return writeCntrData(h, n, req, writeMaxIntToFs) + + case "conn_reuse_mode": + if !checkIntRange(req.Data, minConnReuseMode, maxConnReuseMode) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "expire_nodest_conn": + if !checkIntRange(req.Data, math.MinInt32, math.MaxInt32) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "expire_quiescent_template": + if !checkIntRange(req.Data, math.MinInt32, math.MaxInt32) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + } + + // Refer to generic handler if no node match is found above. 
+ return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysNetIpv4Vs) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + var fileEntries []os.FileInfo + + // Iterate through map of virtual components. + for k, _ := range h.EmuResourceMap { + info := &domain.FileInfo{ + Fname: k, + FmodTime: time.Now(), + } + + fileEntries = append(fileEntries, info) + } + + // Obtain the usual entries seen within container's namespaces and add them + // to the emulated ones. + usualEntries, err := h.Service.GetPassThroughHandler().ReadDirAll(n, req) + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + + return fileEntries, nil +} + +func (h *ProcSysNetIpv4Vs) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysNetIpv4Vs) GetName() string { + return h.Name +} + +func (h *ProcSysNetIpv4Vs) GetPath() string { + return h.Path +} + +func (h *ProcSysNetIpv4Vs) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysNetIpv4Vs) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysNetIpv4Vs) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysNetIpv4Vs) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} +func (h *ProcSysNetIpv4Vs) GetResourceMutex(n 
domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysNetIpv4Vs) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysNetNetfilter.go b/sysbox-fs/handler/implementations/procSysNetNetfilter.go new file mode 100644 index 00000000..d22a21de --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysNetNetfilter.go @@ -0,0 +1,323 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// +// /proc/sys/net/netfilter handler +// +// Emulated resources: +// +// * /proc/sys/net/netfilter/nf_conntrack_max +// +// * /proc/sys/net/netfilter/nf_conntrack_generic_timeout +// +// * /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_established +// +// * /proc/sys/net/netfilter/nf_conntrack_tcp_timeout_close_wait +// +// * /proc/sys/net/netfilter/nf_conntrack_tcp_be_liberal +// +// Documentation: https://www.kernel.org/doc/Documentation/networking/nf_conntrack-sysctl.txt +// +// nf_conntrack_tcp_be_liberal - BOOLEAN +// 0 - disabled (default) +// not 0 - enabled +// +// Be conservative in what you do, be liberal in what you accept from others. 
+// If it's non-zero, we mark only out of window RST segments as INVALID. +// +// Taking into account that kernel's netfilter can either operate in one mode or +// the other, we opt for letting the liberal mode prevail if set within any sys-container. +// + +const ( + tcpLiberalOff = 0 + tcpLiberalOn = 1 +) + +type ProcSysNetNetfilter struct { + domain.HandlerBase +} + +var ProcSysNetNetfilter_Handler = &ProcSysNetNetfilter{ + domain.HandlerBase{ + Name: "ProcSysNetNetfilter", + Path: "/proc/sys/net/netfilter", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "nf_conntrack_max": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "nf_conntrack_generic_timeout": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "nf_conntrack_tcp_be_liberal": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "nf_conntrack_tcp_timeout_established": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + "nf_conntrack_tcp_timeout_close_wait": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysNetNetfilter) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // virtual-components. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // sys container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysNetNetfilter) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing %v Open() method\n", h.Name) + + return false, nil +} + +func (h *ProcSysNetNetfilter) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "nf_conntrack_max": + return readCntrData(h, n, req) + + case "nf_conntrack_generic_timeout": + return readCntrData(h, n, req) + + case "nf_conntrack_tcp_be_liberal": + return readCntrData(h, n, req) + + case "nf_conntrack_tcp_timeout_established": + return readCntrData(h, n, req) + + case "nf_conntrack_tcp_timeout_close_wait": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysNetNetfilter) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "nf_conntrack_max": + return writeCntrData(h, n, req, writeMaxIntToFs) + + case "nf_conntrack_generic_timeout": + return writeCntrData(h, n, req, writeMaxIntToFs) + + case "nf_conntrack_tcp_be_liberal": + return writeCntrData(h, n, req, writeTcpLiberal) + + case "nf_conntrack_tcp_timeout_established": + return writeCntrData(h, n, req, writeMaxIntToFs) + + case "nf_conntrack_tcp_timeout_close_wait": + return writeCntrData(h, n, req, writeMaxIntToFs) + } + + // Refer to generic handler if no node match is found above. 
+ return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysNetNetfilter) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + var fileEntries []os.FileInfo + + // Obtain relative path to the element being read. + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + // Iterate through map of emulated components. + for k, _ := range h.EmuResourceMap { + + if relpath == filepath.Dir(k) { + info := &domain.FileInfo{ + Fname: k, + Fmode: os.FileMode(uint32(0644)), + FmodTime: time.Now(), + } + + fileEntries = append(fileEntries, info) + } + } + + // Obtain the usual entries seen within container's namespaces and add them + // to the emulated ones. + usualEntries, err := h.Service.GetPassThroughHandler().ReadDirAll(n, req) + if err == nil { + fileEntries = append(fileEntries, usualEntries...) 
+ } + + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + + return fileEntries, nil +} + +func (h *ProcSysNetNetfilter) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysNetNetfilter) GetName() string { + return h.Name +} + +func (h *ProcSysNetNetfilter) GetPath() string { + return h.Path +} + +func (h *ProcSysNetNetfilter) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysNetNetfilter) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysNetNetfilter) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysNetNetfilter) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} +func (h *ProcSysNetNetfilter) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysNetNetfilter) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +func writeTcpLiberal(curr, new []byte) (bool, error) { + + newStr := strings.TrimSpace(string(new)) + newInt, err := strconv.Atoi(newStr) + if err != nil { + return false, err + } + + currStr := strings.TrimSpace(string(curr)) + currInt, err := strconv.Atoi(currStr) + if err != nil { + return false, err + } + + return (newInt != currInt && newInt != tcpLiberalOff), nil +} diff --git a/sysbox-fs/handler/implementations/procSysNetUnix.go b/sysbox-fs/handler/implementations/procSysNetUnix.go new file mode 100644 index 00000000..b73758b8 --- /dev/null +++ 
b/sysbox-fs/handler/implementations/procSysNetUnix.go @@ -0,0 +1,237 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// /proc/sys/net/unix handler +// +// Emulated resources: +// +// * /proc/sys/net/unix/max_dgram_qlen + +type ProcSysNetUnix struct { + domain.HandlerBase +} + +var ProcSysNetUnix_Handler = &ProcSysNetUnix{ + domain.HandlerBase{ + Name: "ProcSysNetUnix", + Path: "/proc/sys/net/unix", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "max_dgram_qlen": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysNetUnix) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // sys container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysNetUnix) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "max_dgram_qlen": + return false, nil + } + + return h.Service.GetPassThroughHandler().Open(n, req) +} + +func (h *ProcSysNetUnix) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "max_dgram_qlen": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysNetUnix) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "max_dgram_qlen": + return writeCntrData(h, n, req, writeMaxIntToFs) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysNetUnix) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + var fileEntries []os.FileInfo + + // Obtain relative path to the element being read. + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + // Iterate through map of emulated components. 
+ for k, _ := range h.EmuResourceMap { + + if relpath == filepath.Dir(k) { + info := &domain.FileInfo{ + Fname: k, + Fmode: os.FileMode(uint32(0644)), + FmodTime: time.Now(), + } + + fileEntries = append(fileEntries, info) + } + } + + // Obtain the usual entries seen within container's namespaces and add them + // to the emulated ones. + usualEntries, err := h.Service.GetPassThroughHandler().ReadDirAll(n, req) + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + + return fileEntries, nil +} + +func (h *ProcSysNetUnix) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysNetUnix) GetName() string { + return h.Name +} + +func (h *ProcSysNetUnix) GetPath() string { + return h.Path +} + +func (h *ProcSysNetUnix) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysNetUnix) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysNetUnix) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysNetUnix) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} +func (h *ProcSysNetUnix) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysNetUnix) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procSysVm.go b/sysbox-fs/handler/implementations/procSysVm.go new file mode 100644 
index 00000000..d4ace6a8 --- /dev/null +++ b/sysbox-fs/handler/implementations/procSysVm.go @@ -0,0 +1,262 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "math" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/sys/vm handler +// +// Emulated resources: +// +// * /proc/sys/vm/mmap_min_addr +// +// Documentation: This file indicates the amount of address space which a user +// process will be restricted from mmapping. Since kernel null dereference bugs +// could accidentally operate based on the information in the first couple of +// pages of memory userspace processes should not be allowed to write to them. +// +// By default this value is set to 0 and no protections will be enforced by the +// security module. Setting this value to something like 64k will allow the vast +// majority of applications to work correctly and provide defense in depth +// against future potential kernel bugs. +// +// Note: As this is a system-wide attribute, changes will be only made +// superficially (at sys-container level). IOW, the host FS value will be left +// untouched. 
+// +// * /proc/sys/vm/overcommit_memory +// + +const ( + minOvercommitMem = 0 + maxOverCommitMem = 2 +) + +type ProcSysVm struct { + domain.HandlerBase +} + +var ProcSysVm_Handler = &ProcSysVm{ + domain.HandlerBase{ + Name: "ProcSysVm", + Path: "/proc/sys/vm", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "overcommit_memory": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 2, + }, + "mmap_min_addr": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0644)), + Enabled: true, + Size: 1024, + }, + }, + }, +} + +func (h *ProcSysVm) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated nodes. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + return info, nil + } + + // If looked-up element hasn't been found by now, let's look into the actual + // sys container rootfs. 
+ return h.Service.GetPassThroughHandler().Lookup(n, req) +} + +func (h *ProcSysVm) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "overcommit_memory": + return false, nil + + case "mmap_min_addr": + return false, nil + } + + return h.Service.GetPassThroughHandler().Open(n, req) +} + +func (h *ProcSysVm) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "overcommit_memory": + return readCntrData(h, n, req) + + case "mmap_min_addr": + return readCntrData(h, n, req) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Read(n, req) +} + +func (h *ProcSysVm) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + switch resource { + case "overcommit_memory": + // Ensure that only proper values are allowed as per this resource semantics: + // + // 0: Kernel is free to overcommit memory (this is the default), a heuristic + // algorithm is applied to figure out if enough memory is available. + // 1: Kernel will always overcommit memory, and never check if enough memory + // is available. This increases the risk of out-of-memory situations, but + // also improves memory-intensive workloads. + // 2: Kernel will not overcommit memory, and only allocate as much memory as + // defined in overcommit_ratio. 
+ if !checkIntRange(req.Data, minOvercommitMem, maxOverCommitMem) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + + case "mmap_min_addr": + if !checkIntRange(req.Data, 0, math.MaxInt64) { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + return writeCntrData(h, n, req, nil) + } + + // Refer to generic handler if no node match is found above. + return h.Service.GetPassThroughHandler().Write(n, req) +} + +func (h *ProcSysVm) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Return all entries as seen within container's namespaces. + return h.Service.GetPassThroughHandler().ReadDirAll(n, req) +} + +func (h *ProcSysVm) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return h.Service.GetPassThroughHandler().ReadLink(n, req) +} + +func (h *ProcSysVm) GetName() string { + return h.Name +} + +func (h *ProcSysVm) GetPath() string { + return h.Path +} + +func (h *ProcSysVm) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcSysVm) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcSysVm) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcSysVm) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcSysVm) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *ProcSysVm) 
SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/procUptime.go b/sysbox-fs/handler/implementations/procUptime.go new file mode 100644 index 00000000..d4cf3d2a --- /dev/null +++ b/sysbox-fs/handler/implementations/procUptime.go @@ -0,0 +1,218 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "fmt" + "io" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" +) + +// +// /proc/uptime handler +// + +type ProcUptime struct { + domain.HandlerBase +} + +var ProcUptime_Handler = &ProcUptime{ + domain.HandlerBase{ + Name: "ProcUptime", + Path: "/proc/uptime", + Enabled: true, + }, +} + +func (h *ProcUptime) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + info := &domain.FileInfo{ + Fname: resource, + Fmode: os.FileMode(uint32(0444)), + FmodTime: time.Now(), + Fsize: 4096, + } + + return info, nil +} + +func (h *ProcUptime) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, 
resource) + + flags := n.OpenFlags() + + if flags&syscall.O_WRONLY == syscall.O_WRONLY || + flags&syscall.O_RDWR == syscall.O_RDWR { + return false, fuse.IOerror{Code: syscall.EACCES} + } + + // /proc/uptime is not seekable + return true, nil +} + +func (h *ProcUptime) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + return h.readUptime(n, req) +} + +func (h *ProcUptime) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return 0, nil +} + +func (h *ProcUptime) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + return nil, nil +} + +func (h *ProcUptime) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return "", nil +} + +func (h *ProcUptime) GetName() string { + return h.Name +} + +func (h *ProcUptime) GetPath() string { + return h.Path +} + +func (h *ProcUptime) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *ProcUptime) GetEnabled() bool { + return h.Enabled +} + +func (h *ProcUptime) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *ProcUptime) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *ProcUptime) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + 
 return nil + } + + return &resource.Mutex +} + +func (h *ProcUptime) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +func (h *ProcUptime) readUptime( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing %v Read() method", h.Name) + + // We are dealing with a single integer element being read, so we can save + // some cycles by returning right away if offset is any higher than zero. + if req.Offset > 0 { + return 0, io.EOF + } + + cntr := req.Container + + // + // We can assume that by the time a user generates a request to read + // /proc/uptime, the embedding container has been fully initialized, + // so Ctime() is already holding a valid value. + // + data := cntr.Ctime() + + // Calculate container's uptime, convert it to float to obtain required + // precision (as per host FS), and finally format it into string for + // storage purposes. + // + // TODO: Notice that we are dumping the same values into the two columns + // expected in /proc/uptime. The value utilized for the first column is + // an accurate one (uptime seconds); however, the second one is just + // an approximation. + // + uptimeDur := time.Now().Sub(data) / time.Nanosecond + var uptime float64 = uptimeDur.Seconds() + uptimeStr := fmt.Sprintf("%.2f %.2f\n", uptime, uptime) + + req.Data = []byte(uptimeStr) + + return len(req.Data), nil +} diff --git a/sysbox-fs/handler/implementations/root.go b/sysbox-fs/handler/implementations/root.go new file mode 100644 index 00000000..3fd3f5f1 --- /dev/null +++ b/sysbox-fs/handler/implementations/root.go @@ -0,0 +1,159 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// +// root dir (/) dummy handler +// +// Since the sysbox-fs root dir is not mounted inside a system container, +// accesses to it are only possible from host level (e.g., via /var/lib/sysboxfs//). +// +// Such accesses typically occur when sysbox-runc is creating the container and +// it bind-mounts sysbox-fs to subdirs under the container's "/proc" or "/sys" +// (e.g., /proc/uptime, /proc/sys, etc); as part of the bind-mount, the kernel +// walks the bind-source path, which results in sysbox-fs receiving lookups into +// this handler. Thus, this handler only serves such lookups; all other handler +// methods are purposefully dummy, as we generally want to ignore accesses to +// sysbox-fs from host level. 
+ +type Root struct { + domain.HandlerBase +} + +var Root_Handler = &Root{ + domain.HandlerBase{ + Name: "root", + Path: "/", + Enabled: true, + }, +} + +func (h *Root) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + return n.Lstat() +} + +func (h *Root) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return false, nil +} + +func (h *Root) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return 0, nil +} + +func (h *Root) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return 0, nil +} + +func (h *Root) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + return nil, nil +} + +func (h *Root) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return "", nil +} + +func (h *Root) GetName() string { + return h.Name +} + +func (h *Root) GetPath() string { + return h.Path +} + +func (h *Root) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *Root) GetEnabled() bool { + return h.Enabled +} + +func (h *Root) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *Root) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *Root) GetResourceMutex(n domain.IOnodeIface) 
*sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *Root) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/sysDevicesVirtual.go b/sysbox-fs/handler/implementations/sysDevicesVirtual.go new file mode 100644 index 00000000..e1120478 --- /dev/null +++ b/sysbox-fs/handler/implementations/sysDevicesVirtual.go @@ -0,0 +1,328 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "strings" + "sync" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// +// /sys/devices/virtual handler +// +// * /sys/devices/virtual/dmi +// +// In hardware platforms with reduced (or lacking) SMBIOS/DMI support (e.g., arm64), +// the "/sys/devices/virtual/dmi" path hierarchy is absent. In consequence, Sysbox +// must explicitly expose the "dmi" directory as this one contains critical system +// nodes utilized by certain applications. 
+// + +type SysDevicesVirtual struct { + domain.HandlerBase + passthruNodes map[string]bool +} + +var SysDevicesVirtual_Handler = &SysDevicesVirtual{ + domain.HandlerBase{ + Name: "SysDevicesVirtual", + Path: "/sys/devices/virtual", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + ".": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + "dmi": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + }, + }, + map[string]bool{ + "net": true, + }, +} + +func (h *SysDevicesVirtual) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Users should not be allowed to alter any of the sysfs nodes being exposed. We + // accomplish this by returning "nobody:nogroup" to the user during lookup() / + // getattr() operations. This behavior is enforced by setting the handler's + // SkipIdRemap value to 'true' to alert callers of the need to leave the returned + // uid/gid as is (uid=0, gid=0). + req.SkipIdRemap = true + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var resource = relpath + + if v, ok := h.EmuResourceMap[resource]; ok { + if resource == "." 
{ + resource = "virtual" + } + + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + Fsize: v.Size, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + return info, nil + } + + // For non emulated resources under /sys/devices/virtual, we should + // ideally request the passthrough handler to always perform the lookup; + // however this slows down the lookup and causes sysbox containers with + // systemd inside to fail in hosts with kernel < 5.19 (i.e., systemd takes + // too long to boot because for some reason it's doing lots of lookups of + // /sys/virtual/devices/block/loopX devices, causing it to timeout). Instead + // we do the lookup at host level, except for resources under /sys/devices/virtual + // for which we know we must enter the container namespaces. + for node, _ := range h.passthruNodes { + if node == resource || strings.HasPrefix(resource, node+"/") { + return h.Service.GetPassThroughHandler().Lookup(n, req) + } + } + + return n.Lstat() +} + +func (h *SysDevicesVirtual) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return false, nil +} + +func (h *SysDevicesVirtual) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return 0, err + } + + var resource = relpath + + for node, _ := range h.passthruNodes { + if node == resource || strings.HasPrefix(resource, node+"/") { + return h.Service.GetPassThroughHandler().Read(n, req) + } + } + + return 0, nil +} + +func (h *SysDevicesVirtual) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, 
n.Name()) + + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return 0, err + } + + var resource = relpath + + for node, _ := range h.passthruNodes { + if node == resource || strings.HasPrefix(resource, node+"/") { + return h.Service.GetPassThroughHandler().Write(n, req) + } + } + + return 0, nil +} + +func (h *SysDevicesVirtual) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var ( + fileEntries []os.FileInfo + emulatedElemsAdded bool + ) + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Obtain relative path to the node being readdir(). + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var resource = relpath + + // Invoke the passthrough handler for the corresponding resources (e.g., /sys/devices/virtual/net). + // We return here since we are looking for the host's view of these resources -- i.e., we don't + // want to include emulated resources here (emuResourceMap). + for node, _ := range h.passthruNodes { + if node == resource || strings.HasPrefix(resource, node+"/") { + return h.Service.GetPassThroughHandler().ReadDirAll(n, req) + } + } + + // Create info entries for emulated components. + for k, v := range h.EmuResourceMap { + if k == "." { + continue + } + + if relpath != filepath.Dir(k) { + continue + } + + info := &domain.FileInfo{ + Fname: k, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + fileEntries = append(fileEntries, info) + + emulatedElemsAdded = true + } + + // Obtain the usual node entries. + usualEntries, err := n.ReadDirAll() + if err != nil { + return nil, err + } + + fileEntries = append(fileEntries, usualEntries...) 
+ + if emulatedElemsAdded { + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + } + + return fileEntries, nil +} + +func (h *SysDevicesVirtual) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Obtain relative path to the node being readlink(). + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return "", err + } + + var resource = relpath + + // Invoke the passthrough handler for the passthrough resources. + for node, _ := range h.passthruNodes { + if node == resource || strings.HasPrefix(resource, node+"/") { + return h.Service.GetPassThroughHandler().ReadLink(n, req) + } + } + + return n.ReadLink() +} + +func (h *SysDevicesVirtual) GetName() string { + return h.Name +} + +func (h *SysDevicesVirtual) GetPath() string { + return h.Path +} + +func (h *SysDevicesVirtual) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *SysDevicesVirtual) GetEnabled() bool { + return h.Enabled +} + +func (h *SysDevicesVirtual) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *SysDevicesVirtual) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *SysDevicesVirtual) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *SysDevicesVirtual) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/sysDevicesVirtualDmi.go b/sysbox-fs/handler/implementations/sysDevicesVirtualDmi.go new file mode 100644 index 00000000..9c6c6487 --- 
/dev/null +++ b/sysbox-fs/handler/implementations/sysDevicesVirtualDmi.go @@ -0,0 +1,278 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// +// /sys/devices/virtual/dmi handler +// +// Emulated resources: +// +// * /sys/devices/virtual/dmi +// +// In hardware platforms with reduced (or lacking) SMBIOS/DMI support (e.g., arm64), +// the "/sys/devices/virtual/dmi" path hierarchy is absent. In consequence, Sysbox +// must explictly expose the "dmi" directoy as this one contains critical system +// nodes utilized by certain applications. +// +// * /sys/devices/virtual/dmi/id +// +// Same as above. The "id" subdirectory must be emulated too as this contains +// SMBIOS data usually queried by DMI tools. 
+// + +type SysDevicesVirtualDmi struct { + domain.HandlerBase +} + +var SysDevicesVirtualDmi_Handler = &SysDevicesVirtualDmi{ + domain.HandlerBase{ + Name: "SysDevicesVirtualDmi", + Path: "/sys/devices/virtual/dmi", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + ".": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + "id": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + }, + }, +} + +func (h *SysDevicesVirtualDmi) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var resource = relpath + + // Users should not be allowed to alter any of the sysfs nodes being exposed. We + // accomplish this by returning "nobody:nogroup" to the user during lookup() / + // getattr() operations. This behavior is enforced by setting the handler's + // SkipIdRemap value to 'true' to alert callers of the need to leave the returned + // uid/gid as is (uid=0, gid=0). + req.SkipIdRemap = true + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. + if v, ok := h.EmuResourceMap[resource]; ok { + + if resource == "." 
{ + resource = "dmi" + } + + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + return info, nil + } + + return n.Lstat() +} + +func (h *SysDevicesVirtualDmi) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return false, nil +} + +func (h *SysDevicesVirtualDmi) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return 0, nil +} + +func (h *SysDevicesVirtualDmi) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return 0, nil +} + +func (h *SysDevicesVirtualDmi) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var ( + fileEntries []os.FileInfo + emulatedElemsAdded bool + ) + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Obtain relative path to the node being readdir(). + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + // Create info entries for emulated components. + for k, v := range h.EmuResourceMap { + if k == "." { + continue + } + + if relpath != filepath.Dir(k) { + continue + } + + info := &domain.FileInfo{ + Fname: k, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + fileEntries = append(fileEntries, info) + + emulatedElemsAdded = true + } + + // Obtain the usual node entries. + usualEntries, err := n.ReadDirAll() + if err == nil { + fileEntries = append(fileEntries, usualEntries...) 
+ } + + // Uniquify entries to return. + if emulatedElemsAdded { + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + } + + return fileEntries, nil +} + +func (h *SysDevicesVirtualDmi) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return n.ReadLink() +} + +func (h *SysDevicesVirtualDmi) GetName() string { + return h.Name +} + +func (h *SysDevicesVirtualDmi) GetPath() string { + return h.Path +} + +func (h *SysDevicesVirtualDmi) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *SysDevicesVirtualDmi) GetEnabled() bool { + return h.Enabled +} + +func (h *SysDevicesVirtualDmi) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *SysDevicesVirtualDmi) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + // Resource name must be adjusted to account for the presence of the "dmi" + // directory (i.e., ".") as one of the emulated resources. + if resourceKey == "." { + resources = append(resources, h.Path) + } else { + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + } + + return resources +} + +func (h *SysDevicesVirtualDmi) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + + // Resource name must be adjusted to account for the possibility of caller asking + // for the "dmi" directory itself (i.e., "." resource). 
+ relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil + } + var node = relpath + + resource, ok := h.EmuResourceMap[node] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *SysDevicesVirtualDmi) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/sysDevicesVirtualDmiId.go b/sysbox-fs/handler/implementations/sysDevicesVirtualDmiId.go new file mode 100644 index 00000000..6905e6ec --- /dev/null +++ b/sysbox-fs/handler/implementations/sysDevicesVirtualDmiId.go @@ -0,0 +1,398 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "io" + "os" + "path/filepath" + "sync" + "syscall" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/nestybox/sysbox-libs/formatter" +) + +// +// /sys/devices/virtual/dmi/id handler +// +// Emulated resources: +// +// * /sys/devices/virtual/dmi/id +// +// In hardware platforms with reduced (or lacking) SMBIOS/DMI support (e.g., arm64), +// the "/sys/devices/virtual/dmi/id" path hierarchy is absent. In consequence, Sysbox +// must explictly expose the "dmi" directoy as this one contains critical system +// nodes utilized by certain applications (see below). 
+// +// * /sys/devices/virtual/dmi/id/product_uuid +// +// The main purpose here is to allow each sys container to have a unique and +// stable UUID, to satisfy applications that rely on this value for their +// operation (e.g., Kubernetes, Weave CNI, Calico, etc). Notice that this is +// not an option in the regular (oci) runc, where all containers within a +// host/vm share the same UUID value. +// +// A host UUID is typically extracted from the 'product_uuid' sysfs node, and +// its value is represented by 36 characters with the following layout: +// +// $ cat /sys/class/dmi/id/product_uuid +// e617c421-0026-4941-9e95-56a1ab1f4cb3 +// +// As we want to provide a unique and stable UUID for each container, we will +// expose an artificial 'product_uuid' file through this handler. The UUID +// value that will be displayed within each container's 'product_uuid' file +// will follow these simple guidelines: +// +// * The first 24 characters will continue to match those seen by the hosts (its +// own UUID). +// * The last 12 characters will be extracted from the container ID field. +// +// In scenarios where no UUID is available for a given host (e.g., vm launched +// without qemu's --uuid parameter), no reference 'product_uuid' file will be +// found at the host level, so in this case we will set the first 24 characters +// of each container's UUID to zero. 
+// +// Example: +// +// e617c421-0026-4941-9e95- +// e617c421-0026-4941-9e95- +// +// 00000000-0000-0000-0000- // no 'product_uuid' found +// + +// UUID constants as per rfc/4122 +const ( + // Time + Version fields length + timeFieldLen = 24 + + // Node field length + nodeFieldLen = 12 +) + +type SysDevicesVirtualDmiId struct { + domain.HandlerBase +} + +var SysDevicesVirtualDmiId_Handler = &SysDevicesVirtualDmiId{ + domain.HandlerBase{ + Name: "SysDevicesVirtualDmiId", + Path: "/sys/devices/virtual/dmi/id", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + ".": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + "product_uuid": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0400)), + Size: 4096, + Enabled: true, + }, + }, + }, +} + +func (h *SysDevicesVirtualDmiId) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var resource = relpath + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. + if v, ok := h.EmuResourceMap[resource]; ok { + + if resource == "." { + resource = "id" + // Skip uid/gid remaps for 'id' folder node. + req.SkipIdRemap = true + } + + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + Fsize: v.Size, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + return info, nil + } + + // Skip uid/gid remaps for all other (non-emulated) resources. 
+ req.SkipIdRemap = true + + return n.Lstat() +} + +func (h *SysDevicesVirtualDmiId) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return false, err + } + + var resource = relpath + + flags := n.OpenFlags() + + switch resource { + + case ".": + return false, nil + + case "product_uuid": + if flags&syscall.O_WRONLY == syscall.O_WRONLY || + flags&syscall.O_RDWR == syscall.O_RDWR { + return false, fuse.IOerror{Code: syscall.EACCES} + } + return false, nil + } + + return false, n.Open() +} + +func (h *SysDevicesVirtualDmiId) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + if req.Offset != 0 { + return 0, nil + } + + switch resource { + + case "product_uuid": + return h.readProductUuid(n, req) + } + + return readHostFs(h, n, req.Offset, &req.Data) +} + +func (h *SysDevicesVirtualDmiId) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return 0, nil +} + +func (h *SysDevicesVirtualDmiId) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var ( + fileEntries []os.FileInfo + emulatedElemsAdded bool + ) + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + // Obtain relative path to the node being readdir(). + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + // Create info entries for emulated components. + for k, v := range h.EmuResourceMap { + if k == "." 
{ + continue + } + + if relpath != filepath.Dir(k) { + continue + } + + info := &domain.FileInfo{ + Fname: k, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + fileEntries = append(fileEntries, info) + + emulatedElemsAdded = true + } + + // Obtain the usual node entries. + usualEntries, err := n.ReadDirAll() + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + // Uniquify entries to return. + if emulatedElemsAdded { + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + } + + return fileEntries, nil +} + +func (h *SysDevicesVirtualDmiId) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return n.ReadLink() +} + +func (h *SysDevicesVirtualDmiId) GetName() string { + return h.Name +} + +func (h *SysDevicesVirtualDmiId) GetPath() string { + return h.Path +} + +func (h *SysDevicesVirtualDmiId) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *SysDevicesVirtualDmiId) GetEnabled() bool { + return h.Enabled +} + +func (h *SysDevicesVirtualDmiId) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *SysDevicesVirtualDmiId) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + // Resource name must be adjusted to account for the presence of the "id" + // directory (i.e., ".") as one of the emulated resources. + if resourceKey == "." 
{ + resources = append(resources, h.Path) + } else { + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + } + + return resources +} + +func (h *SysDevicesVirtualDmiId) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + + // Resource name must be adjusted to account for the possibility of caller asking + // for the "id" directory itself (i.e., "." resource). + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil + } + var node = relpath + + resource, ok := h.EmuResourceMap[node] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *SysDevicesVirtualDmiId) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} + +func (h *SysDevicesVirtualDmiId) readProductUuid( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + path := n.Path() + cntr := req.Container + + cntr.Lock() + defer cntr.Unlock() + + // Check if this product_uuid value has been initialized for this container. + sz, err := cntr.Data(path, req.Offset, &req.Data) + if err != nil && err != io.EOF { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + if req.Offset == 0 && sz == 0 && err == io.EOF { + // Create an artificial (but consistent) container uuid value and store it + // in cache. + cntrUuid := h.CreateCntrUuid(cntr) + + req.Data = []byte(cntrUuid + "\n") + err = cntr.SetData(path, 0, req.Data) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + } + + return len(req.Data), nil +} + +// Method is public exclusively for unit-testing purposes. +func (h *SysDevicesVirtualDmiId) CreateCntrUuid(cntr domain.ContainerIface) string { + + hostUuid := h.Service.HostUuid() + hostUuidPref := hostUuid[:timeFieldLen-1] + + // Pad the containerId with zeroes if it doesn't fill its slot. 
+ cntrIdPref := formatter.ContainerID{cntr.ID()}.String() + if len(cntrIdPref) < nodeFieldLen { + cntrIdPref = padRight(cntrIdPref, "0", nodeFieldLen) + } + + return hostUuidPref + "-" + cntrIdPref +} diff --git a/sysbox-fs/handler/implementations/sysDevicesVirtualDmiId_test.go b/sysbox-fs/handler/implementations/sysDevicesVirtualDmiId_test.go new file mode 100644 index 00000000..53fc2494 --- /dev/null +++ b/sysbox-fs/handler/implementations/sysDevicesVirtualDmiId_test.go @@ -0,0 +1,135 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package implementations_test + +import ( + "testing" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/handler/implementations" +) + +func TestSysDevicesVirtualDmiId_CreateCntrUuid(t *testing.T) { + type fields struct { + HandlerBase domain.HandlerBase + } + var f1 = fields{ + domain.HandlerBase{ + Name: "SysDevicesVirtualDmiId", + Path: "/sys/devices/virtual/dmi/id", + Service: hds, + }, + } + + type args struct { + cntr domain.ContainerIface + } + + var a1 = args{ + cntr: css.ContainerCreate( + "012345678901", + uint32(1001), + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + nil, + ), + } + + var a2 = args{ + cntr: css.ContainerCreate( + "0123", + uint32(1001), + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + nil, + ), + } + + tests := []struct { + name string + fields fields + args args + want string + prepare func() + }{ + { + // Test-case 1: Proper product_uuid and and full cntr-id length. + name: "1", + fields: f1, + args: a1, + want: "abcdefgh-ijkl-mnop-qrst-012345678901", + prepare: func() { + hds.On("HostUuid").Return("abcdefgh-ijkl-mnop-qrst-uvwxyz123456") + }, + }, + { + // Test-case 2: Proper product_uuid and partial cntr-id length. + name: "2", + fields: f1, + args: a2, + want: "abcdefgh-ijkl-mnop-qrst-012300000000", + prepare: func() { + hds.On("HostUuid").Return("abcdefgh-ijkl-mnop-qrst-uvwxyz123456") + }, + }, + { + // Test-case 3: Missing product_uuid and full cntr-id length. + name: "3", + fields: f1, + args: a1, + want: "00000000-0000-0000-0000-012345678901", + prepare: func() { + hds.On("HostUuid").Return("00000000-0000-0000-0000-000000000000") + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + h := &implementations.SysDevicesVirtualDmiId{ + HandlerBase: tt.fields.HandlerBase, + } + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare() + } + + if got := h.CreateCntrUuid(tt.args.cntr); got != tt.want { + t.Errorf("SysDevicesVirtualDmiId_createCntrUuid() = %v, want %v", got, tt.want) + } + + // Ensure that mocks were properly invoked and reset expectedCalls + // object. + hds.ExpectedCalls = nil + }) + } +} diff --git a/sysbox-fs/handler/implementations/sysKernel.go b/sysbox-fs/handler/implementations/sysKernel.go new file mode 100644 index 00000000..95cad984 --- /dev/null +++ b/sysbox-fs/handler/implementations/sysKernel.go @@ -0,0 +1,297 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// +// /sys/kernel handler +// +// The following dirs are emulated within /sys/kernel directory to ensure that +// they are exposed within sys containers regardless of the system's kernel +// configuration in place (i.e., they are absent in systems where configfs, +// debugfs and tracefs kernel modules are disabled). Moreover, even if these +// modules were to be loaded, their associated sysfs nodes would still appear as +// 'nobody:nogroup' as they are being accessed by process hosted within a +// non-init user-ns. By virtue of emulating them, we expose them with proper +// permissions. 
+// +// We are also including "/sys/kernel/security" dir as part of the emulated +// resources to ensure that system-wide security-related details are not exposed +// within sysbox containers. In the future, we could expose specific nodes within +// this hierarchy if we see a need for it. +// +// Emulated resources: +// +// * /sys/kernel/config +// * /sys/kernel/debug +// * /sys/kernel/tracing +// * /sys/kernel/security +// +// Finally, notice that unlike the procSys handler, we don't rely on the +// "passthrough" handler to access the "/sys/kernel" file hierarchy through +// nsenter() into the container's namespaces. Rather, we are accessing the files +// directly through the host's sysfs. This approach is feasible due to the +// global (i.e., system-wide) nature of /sys/kernel. +// + +type SysKernel struct { + domain.HandlerBase +} + +var SysKernel_Handler = &SysKernel{ + domain.HandlerBase{ + Name: "SysKernel", + Path: "/sys/kernel", + Enabled: true, + + // Emulated components under /sys/kernel + EmuResourceMap: map[string]*domain.EmuResource{ + "config": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + "debug": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0700)), + Enabled: true, + }, + "tracing": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0700)), + Enabled: true, + }, + "security": { + Kind: domain.DirEmuResource, + Mode: os.ModeDir | os.FileMode(uint32(0755)), + Enabled: true, + }, + }, + }, +} + +func (h *SysKernel) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. 
+ if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + return info, nil + } + + // Non-emulated files/dirs under /sys/kernel should show up without + // permissions inside the sysbox container. We accomplish this by returning + // "nobody:nogroup" to the user during lookup() / getattr() operations. This + // behavior is enforced by setting the handler's SkipIdRemap value to 'true' + // to alert callers of the need to leave the returned uid/gid as is (uid=0, + // gid=0). + req.SkipIdRemap = true + + return n.Lstat() +} + +func (h *SysKernel) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Open() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // All emulated resources are currently dummy / empty + switch resource { + case "config": + return false, nil + case "debug": + return false, nil + case "tracing": + return false, nil + case "security": + return false, nil + } + + return false, n.Open() +} + +func (h *SysKernel) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // All emulated resources are currently dummy / empty + switch resource { + case "config": + return 0, nil + case "debug": + return 0, nil + case "tracing": + return 0, nil + case "security": + return 0, nil + } + + return readHostFs(h, n, req.Offset, &req.Data) +} + +func (h *SysKernel) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + return 0, nil +} + +func (h *SysKernel) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + var 
fileEntries []os.FileInfo + + // Obtain relative path to the node being readdir(). + relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var emulatedElemsAdded bool + + // Create info entries for emulated components under /sys/kernel. + for k, v := range h.EmuResourceMap { + if relpath != filepath.Dir(k) { + continue + } + + info := &domain.FileInfo{ + Fname: k, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + fileEntries = append(fileEntries, info) + + emulatedElemsAdded = true + } + + // Obtain the usual node entries. + usualEntries, err := n.ReadDirAll() + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + // Uniquify entries to return. + if emulatedElemsAdded { + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + } + + return fileEntries, nil +} + +func (h *SysKernel) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return n.ReadLink() +} + +func (h *SysKernel) GetName() string { + return h.Name +} + +func (h *SysKernel) GetPath() string { + return h.Path +} + +func (h *SysKernel) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *SysKernel) GetEnabled() bool { + return h.Enabled +} + +func (h *SysKernel) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *SysKernel) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *SysKernel) GetResourceMutex(n domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return 
&resource.Mutex +} + +func (h *SysKernel) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/sysModuleNfconntrackParameters.go b/sysbox-fs/handler/implementations/sysModuleNfconntrackParameters.go new file mode 100644 index 00000000..448a0227 --- /dev/null +++ b/sysbox-fs/handler/implementations/sysModuleNfconntrackParameters.go @@ -0,0 +1,252 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package implementations + +import ( + "os" + "path/filepath" + "sync" + "time" + + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +// +// /sys/module/nf_conntrack/parameters handler +// +// Emulated resources: +// +// * /sys/module/nf_conntrack/parameters/hashsize +// + +type SysModuleNfconntrackParameters struct { + domain.HandlerBase +} + +var SysModuleNfconntrackParameters_Handler = &SysModuleNfconntrackParameters{ + domain.HandlerBase{ + Name: "SysModuleNfconntrackParameters", + Path: "/sys/module/nf_conntrack/parameters", + Enabled: true, + EmuResourceMap: map[string]*domain.EmuResource{ + "hashsize": { + Kind: domain.FileEmuResource, + Mode: os.FileMode(uint32(0600)), + Size: 4096, + Enabled: true, + }, + }, + }, +} + +func (h *SysModuleNfconntrackParameters) Lookup( + n domain.IOnodeIface, + req *domain.HandlerRequest) (os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Lookup() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + // Return an artificial fileInfo if looked-up element matches any of the + // emulated components. + if v, ok := h.EmuResourceMap[resource]; ok { + info := &domain.FileInfo{ + Fname: resource, + Fmode: v.Mode, + FmodTime: time.Now(), + Fsize: v.Size, + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + return info, nil + } + + // Users should not be allowed to alter any of the non-emulated sysfs nodes. 
+ req.SkipIdRemap = true + + return n.Lstat() +} + +func (h *SysModuleNfconntrackParameters) Open( + n domain.IOnodeIface, + req *domain.HandlerRequest) (bool, error) { + + return false, nil +} + +func (h *SysModuleNfconntrackParameters) Read( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Read() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + if req.Offset != 0 { + return 0, nil + } + + switch resource { + case "hashsize": + return readCntrData(h, n, req) + } + + return readHostFs(h, n, req.Offset, &req.Data) +} + +func (h *SysModuleNfconntrackParameters) Write( + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + var resource = n.Name() + + logrus.Debugf("Executing Write() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + if req.Offset != 0 { + return 0, nil + } + + switch resource { + case "hashsize": + return writeCntrData(h, n, req, writeToFs) + } + + return writeHostFs(h, n, req.Offset, req.Data) +} + +func (h *SysModuleNfconntrackParameters) ReadDirAll( + n domain.IOnodeIface, + req *domain.HandlerRequest) ([]os.FileInfo, error) { + + var resource = n.Name() + + logrus.Debugf("Executing ReadDirAll() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, resource) + + var fileEntries []os.FileInfo + + // Obtain relative path to the node being readdir(). 
+ relpath, err := filepath.Rel(h.Path, n.Path()) + if err != nil { + return nil, err + } + + var emulatedElemsAdded bool + + // Create info entries for emulated resources under /sys/module/nf_conntrack/parameters + for k, v := range h.EmuResourceMap { + if relpath != filepath.Dir(k) { + continue + } + + info := &domain.FileInfo{ + Fname: k, + Fmode: v.Mode, + FmodTime: time.Now(), + } + + if v.Kind == domain.DirEmuResource { + info.FisDir = true + } + + fileEntries = append(fileEntries, info) + + emulatedElemsAdded = true + } + + // Obtain the usual node entries. + usualEntries, err := n.ReadDirAll() + if err == nil { + fileEntries = append(fileEntries, usualEntries...) + } + + // Uniquify entries to return. + if emulatedElemsAdded { + fileEntries = domain.FileInfoSliceUniquify(fileEntries) + } + + return fileEntries, nil +} + +func (h *SysModuleNfconntrackParameters) ReadLink( + n domain.IOnodeIface, + req *domain.HandlerRequest) (string, error) { + + logrus.Debugf("Executing ReadLink() for req-id: %#x, handler: %s, resource: %s", + req.ID, h.Name, n.Name()) + + return n.ReadLink() +} + +func (h *SysModuleNfconntrackParameters) GetName() string { + return h.Name +} + +func (h *SysModuleNfconntrackParameters) GetPath() string { + return h.Path +} + +func (h *SysModuleNfconntrackParameters) GetService() domain.HandlerServiceIface { + return h.Service +} + +func (h *SysModuleNfconntrackParameters) GetEnabled() bool { + return h.Enabled +} + +func (h *SysModuleNfconntrackParameters) SetEnabled(b bool) { + h.Enabled = b +} + +func (h *SysModuleNfconntrackParameters) GetResourcesList() []string { + + var resources []string + + for resourceKey, resource := range h.EmuResourceMap { + resource.Mutex.Lock() + if !resource.Enabled { + resource.Mutex.Unlock() + continue + } + resource.Mutex.Unlock() + + resources = append(resources, filepath.Join(h.GetPath(), resourceKey)) + } + + return resources +} + +func (h *SysModuleNfconntrackParameters) GetResourceMutex(n 
domain.IOnodeIface) *sync.Mutex { + resource, ok := h.EmuResourceMap[n.Name()] + if !ok { + return nil + } + + return &resource.Mutex +} + +func (h *SysModuleNfconntrackParameters) SetService(hs domain.HandlerServiceIface) { + h.Service = hs +} diff --git a/sysbox-fs/handler/implementations/utils.go b/sysbox-fs/handler/implementations/utils.go new file mode 100644 index 00000000..6b049bb9 --- /dev/null +++ b/sysbox-fs/handler/implementations/utils.go @@ -0,0 +1,369 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package implementations + +import ( + "errors" + "io" + "math/rand" + "os" + "strconv" + "strings" + "syscall" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/sirupsen/logrus" +) + +func readCntrData( + h domain.HandlerIface, + n domain.IOnodeIface, + req *domain.HandlerRequest) (int, error) { + + cntr := req.Container + path := n.Path() + + cntr.Lock() + defer cntr.Unlock() + + // Check if this resource is cached for this container. If it isn't, fetch + // its data from the host FS and cache it within the container struct. 
+ + sz, err := cntr.Data(path, req.Offset, &req.Data) + if err != nil && err != io.EOF { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + if req.Offset == 0 && sz == 0 && err == io.EOF { + + sz, err = readFs(h, n, req.Offset, &req.Data) + if err != nil && err != io.EOF { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + if sz == 0 && err == io.EOF { + return 0, nil + } + + err = cntr.SetData(path, req.Offset, req.Data[0:sz]) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + } + + return sz, nil +} + +func writeCntrData( + h domain.HandlerIface, + n domain.IOnodeIface, + req *domain.HandlerRequest, + pushToFs func(currData, newData []byte) (bool, error)) (int, error) { + + cntr := req.Container + path := n.Path() + ignoreFsErrors := h.GetService().IgnoreErrors() + + cntr.Lock() + defer cntr.Unlock() + + sz, err := writeFs(h, n, req.Offset, req.Data, pushToFs) + + if ignoreFsErrors { + err = nil + sz = len(req.Data) + } + + if err != nil { + logrus.Errorf("Failed to write to %s: %s", path, err) + return 0, err + } + + err = cntr.SetData(path, req.Offset, req.Data) + if err != nil { + return 0, fuse.IOerror{Code: syscall.EINVAL} + } + + return sz, nil +} + +// readFs reads data from the given IO node. +func readFs( + h domain.HandlerIface, + n domain.IOnodeIface, + offset int64, + data *[]byte) (int, error) { + + // We need the per-resource lock since we are about to access the resource + // on the host FS. See writeFs() for a full explanation. + resourceMutex := h.GetResourceMutex(n) + + if resourceMutex == nil { + logrus.Errorf("Unexpected error: no mutex found for emulated resource %s", + n.Path()) + return 0, errors.New("no mutex found for emulated resource") + } + resourceMutex.Lock() + defer resourceMutex.Unlock() + + // Read from the host FS to extract the existing value. 
+ if err := n.Open(); err != nil { + logrus.Debugf("Could not open file %v", n.Path()) + return 0, err + } + defer n.Close() + + // TODO: ReadAt may not read all data; check sz and loop until we read all + // the data + sz, err := n.ReadAt(*data, offset) + if err != nil && err != io.EOF { + logrus.Errorf("Could not read from file %v at offset %d", n.Path(), offset) + return 0, err + } + + return sz, err +} + +// Same as above but without concurrency protection. To be utilized only when +// reading from non-emulated nodes. +func readHostFs( + h domain.HandlerIface, + n domain.IOnodeIface, + offset int64, + data *[]byte) (int, error) { + + // Read from the host FS to extract the existing value. + if err := n.Open(); err != nil { + logrus.Debugf("Could not open file %v", n.Path()) + return 0, err + } + defer n.Close() + + // TODO: ReadAt may not read all data; check sz and loop until we read all + // the data + sz, err := n.ReadAt(*data, offset) + if err != nil && err != io.EOF { + logrus.Errorf("Could not read from file %v at offset %d", n.Path(), offset) + return 0, err + } + + return sz, err +} + +// writeFs writes the given data to the given IO node. argument 'wrCondition' +// is a function that the caller can pass to determine if the write should +// actually happen given the IO node's current and new data. If set to nil +// the write is skipped. +func writeFs( + h domain.HandlerIface, + n domain.IOnodeIface, + offset int64, + data []byte, + wrCondition func(currData, newData []byte) (bool, error)) (int, error) { + + if wrCondition == nil { + return len(data), nil + } + + // We need the per-resource lock since we are about to access the resource + // on the host FS and multiple sys containers could be accessing that same + // resource concurrently. + // + // But that's not sufficient. Some users may deploy sysbox inside a + // privileged container, and thus can have multiple sysbox instances running + // concurrently on the same host. 
If those sysbox instances write conflicting + // values to a kernel resource that uses this handler (e.g., a sysctl under + // /proc/sys), a race condition arises that could cause the value to be + // written to not be the max across all instances. + // + // To reduce the chance of this occurring, in addition to the per-resource + // lock, we use a heuristic in which we read-after-write to verify the value + // of the resource is equal to the one we wrote. If it isn't, it means some + // other agent on the host wrote a value to the resource after we wrote to + // it, so we must retry the write. + // + // When retrying, we wait a small but random amount of time to reduce the + // chance of hitting the race condition again. And we retry a limited amount + // of times. + // + // Note that this solution works well for resolving race conditions among + // sysbox instances, but may not address race conditions with other host + // agents that write to the same sysctl. That's because there is no guarantee + // that the other host agent will read-after-write and retry as sysbox does. 
+ + resourceMutex := h.GetResourceMutex(n) + + if resourceMutex == nil { + logrus.Errorf("Unexpected error: no mutex found for emulated resource %s", + n.Path()) + return 0, errors.New("no mutex found for emulated resource") + } + resourceMutex.Lock() + defer resourceMutex.Unlock() + + n.SetOpenFlags(int(os.O_RDWR)) + if err := n.Open(); err != nil { + return 0, err + } + defer n.Close() + + retries := 5 + retryDelay := 100 // microsecs + currData := make([]byte, 65536, 65536) + + for i := 0; i < retries; i++ { + + // TODO: ReadAt may not read all data; check sz and loop until we read all + // the data + sz, err := n.ReadAt(currData, offset) + if err != nil && err != io.EOF { + return 0, err + } + currData = currData[0:sz] + + if string(currData) == string(data) { + break + } + + write, err := wrCondition(currData, data) + if err != nil { + return 0, err + } + + if !write { + break + } + + // When retrying, wait a random delay to reduce chances of a new collision. + if i > 0 { + d := rand.Intn(retryDelay) + time.Sleep(time.Duration(d) * time.Microsecond) + } + + // TODO: WriteAt may not write all data; check sz and loop until we write + // all the data + _, err = n.WriteAt(data, offset) + if err != nil { + return 0, err + } + } + + return len(data), nil +} + +// Returns true unconditionally; meant to be used as the 'wrCondition' argument in writeFs() +func writeToFs(curr, new []byte) (bool, error) { + return true, nil +} + +// Same as above but without concurrency protection. To be utilized only when +// writing into non-emulated nodes. 
+func writeHostFs( + h domain.HandlerIface, + n domain.IOnodeIface, + offset int64, + data []byte) (int, error) { + + n.SetOpenFlags(int(os.O_RDWR)) + if err := n.Open(); err != nil { + return 0, err + } + defer n.Close() + + _, err := n.WriteAt(data, offset) + if err != nil { + return 0, err + } + + return len(data), nil +} + +// writeMaxIntToFs interprets the given data as 64-bit signed integers and +// returns true if new > curr; meant to be used at the 'wrCondition' argument in +// writeFs() +func writeMaxIntToFs(curr, new []byte) (bool, error) { + + newStr := strings.TrimSpace(string(new)) + newInt, err := strconv.ParseInt(newStr, 10, 64) + if err != nil { + return false, err + } + + currStr := strings.TrimSpace(string(curr)) + currInt, err := strconv.ParseInt(currStr, 10, 64) + if err != nil { + return false, err + } + + return newInt > currInt, nil +} + +// writeMinIntToFs interprets the given data as 64-bit signed integers and +// returns true if new < curr; meant to be used at the 'wrCondition' argument in +// writeFs() +func writeMinIntToFs(curr, new []byte) (bool, error) { + + newStr := strings.TrimSpace(string(new)) + newInt, err := strconv.ParseInt(newStr, 10, 64) + if err != nil { + return false, err + } + + currStr := strings.TrimSpace(string(curr)) + currInt, err := strconv.ParseInt(currStr, 10, 64) + if err != nil { + return false, err + } + + return newInt < currInt, nil +} + +// checkIntRange interprets the given data as an integer and checks if it's +// within the given range (inclusive). 
+func checkIntRange(data []byte, min, max int) bool { + str := strings.TrimSpace(string(data)) + val, err := strconv.Atoi(str) + if err != nil { + return false + } + + if val < min || val > max { + return false + } + + return true +} + +func padRight(str, pad string, length int) string { + for { + str += pad + if len(str) > length { + return str[0:length] + } + } +} + +func padLeft(str, pad string, length int) string { + for { + str = pad + str + if len(str) > length { + return str[0:length] + } + } +} diff --git a/sysbox-fs/ipc/apis.go b/sysbox-fs/ipc/apis.go new file mode 100644 index 00000000..46bd48b6 --- /dev/null +++ b/sysbox-fs/ipc/apis.go @@ -0,0 +1,154 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package ipc + +import ( + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" + grpc "github.com/nestybox/sysbox-ipc/sysboxFsGrpc" + grpcCodes "google.golang.org/grpc/codes" + grpcStatus "google.golang.org/grpc/status" +) + +type ipcService struct { + grpcServer *grpc.Server + css domain.ContainerStateServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface +} + +func NewIpcService() domain.IpcServiceIface { + return &ipcService{} +} + +func (ips *ipcService) Setup( + css domain.ContainerStateServiceIface, + prs domain.ProcessServiceIface, + ios domain.IOServiceIface, + fuseMp string) { + + ips.css = css + ips.prs = prs + ips.ios = ios + + // Instantiate a grpcServer for inter-process communication. + ips.grpcServer = grpc.NewServer( + ips, + &grpc.CallbacksMap{ + grpc.ContainerPreRegisterMessage: ContainerPreRegister, + grpc.ContainerRegisterMessage: ContainerRegister, + grpc.ContainerUnregisterMessage: ContainerUnregister, + grpc.ContainerUpdateMessage: ContainerUpdate, + }, + fuseMp, + ) + + logrus.Infof("Listening on %v", ips.grpcServer.GetAddr()) +} + +func (ips *ipcService) Init() error { + return ips.grpcServer.Init() +} + +func ContainerPreRegister(ctx interface{}, data *grpc.ContainerData) error { + + ipcService := ctx.(*ipcService) + + err := ipcService.css.ContainerPreRegister(data.Id, data.Netns) + if err != nil { + return err + } + + return nil +} + +func ContainerRegister(ctx interface{}, data *grpc.ContainerData) error { + + ipcService := ctx.(*ipcService) + + // Create temporary container struct to be passed as reference to containerDB, + // where the matching (real) container will be identified and then updated. 
+ cntr := ipcService.css.ContainerCreate( + data.Id, + uint32(data.InitPid), + data.Ctime, + uint32(data.UidFirst), + uint32(data.UidSize), + uint32(data.GidFirst), + uint32(data.GidSize), + data.ProcRoPaths, + data.ProcMaskPaths, + ipcService.css, + ) + + err := ipcService.css.ContainerRegister(cntr) + if err != nil { + return err + } + + return nil +} + +func ContainerUnregister(ctx interface{}, data *grpc.ContainerData) error { + + ipcService := ctx.(*ipcService) + + // Identify the container being unregistered. + cntr := ipcService.css.ContainerLookupById(data.Id) + if cntr == nil { + return grpcStatus.Errorf( + grpcCodes.NotFound, + "Container %s not found", + data.Id, + ) + } + + err := ipcService.css.ContainerUnregister(cntr) + if err != nil { + return err + } + + return nil +} + +func ContainerUpdate(ctx interface{}, data *grpc.ContainerData) error { + + ipcService := ctx.(*ipcService) + + // Create temporary container struct to be passed as reference to containerDB, + // where the matching (real) container will be identified and then updated. + cntr := ipcService.css.ContainerCreate( + data.Id, + uint32(data.InitPid), + data.Ctime, + uint32(data.UidFirst), + uint32(data.UidSize), + uint32(data.GidFirst), + uint32(data.GidSize), + nil, + nil, + ipcService.css, + ) + + err := ipcService.css.ContainerUpdate(cntr) + if err != nil { + return err + } + + return nil +} diff --git a/sysbox-fs/ipc/apis_test.go b/sysbox-fs/ipc/apis_test.go new file mode 100644 index 00000000..91363d51 --- /dev/null +++ b/sysbox-fs/ipc/apis_test.go @@ -0,0 +1,497 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package ipc_test + +import ( + "errors" + "io/ioutil" + "reflect" + "testing" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/ipc" + "github.com/nestybox/sysbox-fs/mocks" + "github.com/nestybox/sysbox-fs/state" + grpc "github.com/nestybox/sysbox-ipc/sysboxFsGrpc" + "github.com/sirupsen/logrus" +) + +// Sysbox-fs global services for all ipc pkg unit-tests. +var css *mocks.ContainerStateServiceIface + +func TestMain(m *testing.M) { + + // Disable log generation during UT. + logrus.SetOutput(ioutil.Discard) + + // + // Test-cases common settings. + // + css = &mocks.ContainerStateServiceIface{} + css.On("Setup", nil, nil, nil).Return(nil) + + // Run test-suite. + m.Run() +} + +func TestNewIpcService(t *testing.T) { + tests := []struct { + name string + want domain.IpcServiceIface + }{ + // TODO: Add test cases.
+ } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ipc.NewIpcService(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("NewIpcService() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_ipcService_Setup(t *testing.T) { + type fields struct { + grpcServer *grpc.Server + css domain.ContainerStateServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + fuseMp string + } + + var f1 = fields{ + grpcServer: nil, + css: css, + prs: nil, + ios: nil, + fuseMp: "/var/lib/sysboxfs", + } + + type args struct { + css domain.ContainerStateServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + fuseMp string + } + var a1 = args{ + css: css, + prs: nil, + ios: nil, + fuseMp: "/var/lib/sysboxfs", + } + + tests := []struct { + name string + fields fields + args args + }{ + {"1", f1, a1}, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ips := ipc.NewIpcService() + ips.Setup(tt.args.css, tt.args.prs, tt.args.ios, tt.args.fuseMp) + }) + } +} + +func Test_ipcService_Init(t *testing.T) { + type fields struct { + grpcServer *grpc.Server + css domain.ContainerStateServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + } + tests := []struct { + name string + fields fields + wantErr bool + }{ + // TODO: Add test cases. 
+ } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ips := ipc.NewIpcService() + if err := ips.Init(); (err != nil) != tt.wantErr { + t.Errorf("ipcService.Init() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestContainerPreRegister(t *testing.T) { + type args struct { + ctx interface{} + data *grpc.ContainerData + } + + var ctx = ipc.NewIpcService() + ctx.Setup(css, nil, nil, "/var/lib/sysboxfs") + + var a1 = args{ + ctx: ctx, + data: &grpc.ContainerData{ + Id: "c1", + Netns: "", + }, + } + + tests := []struct { + name string + args args + wantErr bool + prepare func() + }{ + { + // + // Test-case 1: Proper pre-registration request. No errors expected. + // + name: "1", + args: a1, + wantErr: false, + prepare: func() { + css.On("ContainerPreRegister", a1.data.Id, a1.data.Netns).Return(nil) + }, + }, + { + // + // Test-case 2: Verify proper behavior during css' pre-registration + // error. + // + name: "2", + args: a1, + wantErr: true, + prepare: func() { + css.On("ContainerPreRegister", a1.data.Id, a1.data.Netns).Return( + errors.New("Container pre-registration error: container %s already present")) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + // Reset mock expectations from previous iterations. + css.ExpectedCalls = nil + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + if err := ipc.ContainerPreRegister(tt.args.ctx, tt.args.data); (err != nil) != tt.wantErr { + t.Errorf("ContainerPreRegister() error = %v, wantErr %v", err, tt.wantErr) + } + + // Ensure that mocks were properly invoked. 
+ css.AssertExpectations(t) + }) + } +} + +func TestContainerRegister(t *testing.T) { + type args struct { + ctx interface{} + data *grpc.ContainerData + } + + var c1 domain.ContainerIface + + var ctx = ipc.NewIpcService() + ctx.Setup(css, nil, nil, "/var/lib/sysboxfs") + + var a1 = args{ + ctx: ctx, + data: &grpc.ContainerData{ + Id: "c1", + }, + } + + tests := []struct { + name string + args args + wantErr bool + prepare func() + }{ + { + // + // Test-case 1: Proper registration request. No errors expected. + // + name: "1", + args: a1, + wantErr: false, + prepare: func() { + css.On("ContainerCreate", + a1.data.Id, + uint32(a1.data.InitPid), + a1.data.Ctime, + uint32(a1.data.UidFirst), + uint32(a1.data.UidSize), + uint32(a1.data.GidFirst), + uint32(a1.data.GidSize), + a1.data.ProcRoPaths, + a1.data.ProcMaskPaths, + css).Return(c1) + + css.On("ContainerRegister", c1).Return(nil) + }, + }, + { + // + // Test-case 2: Verify proper behavior during css' registration + // error. + // + name: "2", + args: a1, + wantErr: true, + prepare: func() { + + css.On("ContainerCreate", + a1.data.Id, + uint32(a1.data.InitPid), + a1.data.Ctime, + uint32(a1.data.UidFirst), + uint32(a1.data.UidSize), + uint32(a1.data.GidFirst), + uint32(a1.data.GidSize), + a1.data.ProcRoPaths, + a1.data.ProcMaskPaths, + css).Return(c1) + + css.On("ContainerRegister", c1).Return( + errors.New("registration error found")) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + // Reset mock expectations from previous iterations. + css.ExpectedCalls = nil + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + if err := ipc.ContainerRegister(tt.args.ctx, tt.args.data); (err != nil) != tt.wantErr { + t.Errorf("ContainerRegister() error = %v, wantErr %v", err, tt.wantErr) + } + + // Ensure that mocks were properly invoked.
+ css.AssertExpectations(t) + }) + } +} + +func TestContainerUnregister(t *testing.T) { + type args struct { + ctx interface{} + data *grpc.ContainerData + } + + var c1 = state.NewContainerStateService().ContainerCreate( + "c1", + 1001, + time.Time{}, + 231072, + 65535, + 231072, + 65535, + nil, + nil, + nil, + ) + + var ctx = ipc.NewIpcService() + ctx.Setup(css, nil, nil, "/var/lib/sysboxfs") + + var a1 = args{ + ctx: ctx, + data: &grpc.ContainerData{ + Id: "c1", + }, + } + + tests := []struct { + name string + args args + wantErr bool + prepare func() + }{ + { + // + // Test-case 1: Proper unregistration request. No errors expected. + // + name: "1", + args: a1, + wantErr: false, + prepare: func() { + + css.On("ContainerLookupById", a1.data.Id).Return(c1) + css.On("ContainerUnregister", c1).Return(nil) + }, + }, + { + // + // Test-case 2: Verify proper behavior during css' container-lookup + // error. + // + name: "2", + args: a1, + wantErr: true, + prepare: func() { + css.On("ContainerLookupById", a1.data.Id).Return(nil) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + // Reset mock expectations from previous iterations. + css.ExpectedCalls = nil + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + if err := ipc.ContainerUnregister(tt.args.ctx, tt.args.data); (err != nil) != tt.wantErr { + t.Errorf("ContainerUnregister() error = %v, wantErr %v", err, tt.wantErr) + } + + // Ensure that mocks were properly invoked.
+ css.AssertExpectations(t) + }) + } +} + +func TestContainerUpdate(t *testing.T) { + type args struct { + ctx interface{} + data *grpc.ContainerData + } + + var c1 domain.ContainerIface + + var ctx = ipc.NewIpcService() + ctx.Setup(css, nil, nil, "/var/lib/sysboxfs") + + var a1 = args{ + ctx: ctx, + data: &grpc.ContainerData{ + Id: "c1", + }, + } + + tests := []struct { + name string + args args + wantErr bool + prepare func() + }{ + { + // + // Test-case 1: Proper update request. No errors expected. + // + name: "1", + args: a1, + wantErr: false, + prepare: func() { + + css.On("ContainerCreate", + a1.data.Id, + uint32(a1.data.InitPid), + a1.data.Ctime, + uint32(a1.data.UidFirst), + uint32(a1.data.UidSize), + uint32(a1.data.GidFirst), + uint32(a1.data.GidSize), + a1.data.ProcRoPaths, + a1.data.ProcMaskPaths, + css).Return(c1) + + css.On("ContainerUpdate", c1).Return(nil) + }, + }, + { + // + // Test-case 2: Verify proper behavior during css' update + // error. + // + name: "2", + args: a1, + wantErr: true, + prepare: func() { + + css.On("ContainerCreate", + a1.data.Id, + uint32(a1.data.InitPid), + a1.data.Ctime, + uint32(a1.data.UidFirst), + uint32(a1.data.UidSize), + uint32(a1.data.GidFirst), + uint32(a1.data.GidSize), + a1.data.ProcRoPaths, + a1.data.ProcMaskPaths, + css).Return(c1) + + css.On("ContainerUpdate", c1).Return( + errors.New("registration error found")) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + // Reset mock expectations from previous iterations. + css.ExpectedCalls = nil + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + if err := ipc.ContainerUpdate(tt.args.ctx, tt.args.data); (err != nil) != tt.wantErr { + t.Errorf("ContainerUpdate() error = %v, wantErr %v", err, tt.wantErr) + } + + // Ensure that mocks were properly invoked.
+ css.AssertExpectations(t) + }) + } +} diff --git a/sysbox-fs/mocks/ContainerIface.go b/sysbox-fs/mocks/ContainerIface.go new file mode 100644 index 00000000..8f1f0f0b --- /dev/null +++ b/sysbox-fs/mocks/ContainerIface.go @@ -0,0 +1,330 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. + +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" + + time "time" +) + +// ContainerIface is an autogenerated mock type for the ContainerIface type +type ContainerIface struct { + mock.Mock +} + +// Ctime provides a mock function with given fields: +func (_m *ContainerIface) Ctime() time.Time { + ret := _m.Called() + + var r0 time.Time + if rf, ok := ret.Get(0).(func() time.Time); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(time.Time) + } + + return r0 +} + +// Data provides a mock function with given fields: path, name +func (_m *ContainerIface) Data(path string, name string) (string, bool) { + ret := _m.Called(path, name) + + var r0 string + if rf, ok := ret.Get(0).(func(string, string) string); ok { + r0 = rf(path, name) + } else { + r0 = ret.Get(0).(string) + } + + var r1 bool + if rf, ok := ret.Get(1).(func(string, string) bool); ok { + r1 = rf(path, name) + } else { + r1 = ret.Get(1).(bool) + } + + return r0, r1 +} + +// ExtractInode provides a mock function with given fields: path +func (_m *ContainerIface) ExtractInode(path string) (uint64, error) { + ret := _m.Called(path) + + var r0 uint64 + if rf, ok := ret.Get(0).(func(string) uint64); ok { + r0 = rf(path) + } else { + r0 = ret.Get(0).(uint64) + } + + var r1 error + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(path) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// GID provides a mock function with given fields: +func (_m *ContainerIface) GID() uint32 { + ret := _m.Called() + + var r0 uint32 + if rf, ok := ret.Get(0).(func() uint32); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(uint32) + } + + return r0 +} + +// ID 
provides a mock function with given fields: +func (_m *ContainerIface) ID() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// InitPid provides a mock function with given fields: +func (_m *ContainerIface) InitPid() uint32 { + ret := _m.Called() + + var r0 uint32 + if rf, ok := ret.Get(0).(func() uint32); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(uint32) + } + + return r0 +} + +// InitProc provides a mock function with given fields: +func (_m *ContainerIface) InitProc() domain.ProcessIface { + ret := _m.Called() + + var r0 domain.ProcessIface + if rf, ok := ret.Get(0).(func() domain.ProcessIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.ProcessIface) + } + } + + return r0 +} + +// IsImmutableBindMount provides a mock function with given fields: info +func (_m *ContainerIface) IsImmutableBindMount(info *domain.MountInfo) bool { + ret := _m.Called(info) + + var r0 bool + if rf, ok := ret.Get(0).(func(*domain.MountInfo) bool); ok { + r0 = rf(info) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableMount provides a mock function with given fields: info +func (_m *ContainerIface) IsImmutableMount(info *domain.MountInfo) bool { + ret := _m.Called(info) + + var r0 bool + if rf, ok := ret.Get(0).(func(*domain.MountInfo) bool); ok { + r0 = rf(info) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableMountID provides a mock function with given fields: id +func (_m *ContainerIface) IsImmutableMountID(id int) bool { + ret := _m.Called(id) + + var r0 bool + if rf, ok := ret.Get(0).(func(int) bool); ok { + r0 = rf(id) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableMountpoint provides a mock function with given fields: mp +func (_m *ContainerIface) IsImmutableMountpoint(mp string) bool { + ret := _m.Called(mp) + + var r0 bool + if rf, ok := 
ret.Get(0).(func(string) bool); ok { + r0 = rf(mp) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableOverlapMountpoint provides a mock function with given fields: mp +func (_m *ContainerIface) IsImmutableOverlapMountpoint(mp string) bool { + ret := _m.Called(mp) + + var r0 bool + if rf, ok := ret.Get(0).(func(string) bool); ok { + r0 = rf(mp) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableRoBindMount provides a mock function with given fields: info +func (_m *ContainerIface) IsImmutableRoBindMount(info *domain.MountInfo) bool { + ret := _m.Called(info) + + var r0 bool + if rf, ok := ret.Get(0).(func(*domain.MountInfo) bool); ok { + r0 = rf(info) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableRoMount provides a mock function with given fields: info +func (_m *ContainerIface) IsImmutableRoMount(info *domain.MountInfo) bool { + ret := _m.Called(info) + + var r0 bool + if rf, ok := ret.Get(0).(func(*domain.MountInfo) bool); ok { + r0 = rf(info) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableRoMountID provides a mock function with given fields: id +func (_m *ContainerIface) IsImmutableRoMountID(id int) bool { + ret := _m.Called(id) + + var r0 bool + if rf, ok := ret.Get(0).(func(int) bool); ok { + r0 = rf(id) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsImmutableRoMountpoint provides a mock function with given fields: mp +func (_m *ContainerIface) IsImmutableRoMountpoint(mp string) bool { + ret := _m.Called(mp) + + var r0 bool + if rf, ok := ret.Get(0).(func(string) bool); ok { + r0 = rf(mp) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// Lock provides a mock function with given fields: +func (_m *ContainerIface) Lock() { + _m.Called() +} + +// ProcMaskPaths provides a mock function with given fields: +func (_m *ContainerIface) ProcMaskPaths() []string { + ret := _m.Called() + + var r0 []string + if rf, ok := ret.Get(0).(func() 
[]string); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + return r0 +} + +// ProcRoPaths provides a mock function with given fields: +func (_m *ContainerIface) ProcRoPaths() []string { + ret := _m.Called() + + var r0 []string + if rf, ok := ret.Get(0).(func() []string); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + return r0 +} + +// SetData provides a mock function with given fields: path, name, data +func (_m *ContainerIface) SetData(path string, name string, data string) { + _m.Called(path, name, data) +} + +// SetInitProc provides a mock function with given fields: pid, uid, gid +func (_m *ContainerIface) SetInitProc(pid uint32, uid uint32, gid uint32) error { + ret := _m.Called(pid, uid, gid) + + var r0 error + if rf, ok := ret.Get(0).(func(uint32, uint32, uint32) error); ok { + r0 = rf(pid, uid, gid) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// UID provides a mock function with given fields: +func (_m *ContainerIface) UID() uint32 { + ret := _m.Called() + + var r0 uint32 + if rf, ok := ret.Get(0).(func() uint32); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(uint32) + } + + return r0 +} + +// Unlock provides a mock function with given fields: +func (_m *ContainerIface) Unlock() { + _m.Called() +} diff --git a/sysbox-fs/mocks/ContainerStateServiceIface.go b/sysbox-fs/mocks/ContainerStateServiceIface.go new file mode 100644 index 00000000..01210b41 --- /dev/null +++ b/sysbox-fs/mocks/ContainerStateServiceIface.go @@ -0,0 +1,170 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. 
+ +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" + + time "time" +) + +// ContainerStateServiceIface is an autogenerated mock type for the ContainerStateServiceIface type +type ContainerStateServiceIface struct { + mock.Mock +} + +// ContainerCreate provides a mock function with given fields: id, pid, ctime, uidFirst, uidSize, gidFirst, gidSize, procRoPaths, procMaskPaths, service +func (_m *ContainerStateServiceIface) ContainerCreate(id string, pid uint32, ctime time.Time, uidFirst uint32, uidSize uint32, gidFirst uint32, gidSize uint32, procRoPaths []string, procMaskPaths []string, service domain.ContainerStateServiceIface) domain.ContainerIface { + ret := _m.Called(id, pid, ctime, uidFirst, uidSize, gidFirst, gidSize, procRoPaths, procMaskPaths, service) + + var r0 domain.ContainerIface + if rf, ok := ret.Get(0).(func(string, uint32, time.Time, uint32, uint32, uint32, uint32, []string, []string, domain.ContainerStateServiceIface) domain.ContainerIface); ok { + r0 = rf(id, pid, ctime, uidFirst, uidSize, gidFirst, gidSize, procRoPaths, procMaskPaths, service) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.ContainerIface) + } + } + + return r0 +} + +// ContainerDBSize provides a mock function with given fields: +func (_m *ContainerStateServiceIface) ContainerDBSize() int { + ret := _m.Called() + + var r0 int + if rf, ok := ret.Get(0).(func() int); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(int) + } + + return r0 +} + +// ContainerLookupById provides a mock function with given fields: id +func (_m *ContainerStateServiceIface) ContainerLookupById(id string) domain.ContainerIface { + ret := _m.Called(id) + + var r0 domain.ContainerIface + if rf, ok := ret.Get(0).(func(string) domain.ContainerIface); ok { + r0 = rf(id) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.ContainerIface) + } + } + + return r0 +} + +// ContainerPreRegister provides a mock function with given 
fields: id, netns +func (_m *ContainerStateServiceIface) ContainerPreRegister(id string, netns string) error { + ret := _m.Called(id, netns) + + var r0 error + if rf, ok := ret.Get(0).(func(string, string) error); ok { + r0 = rf(id, netns) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ContainerRegister provides a mock function with given fields: c +func (_m *ContainerStateServiceIface) ContainerRegister(c domain.ContainerIface) error { + ret := _m.Called(c) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.ContainerIface) error); ok { + r0 = rf(c) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ContainerUnregister provides a mock function with given fields: c +func (_m *ContainerStateServiceIface) ContainerUnregister(c domain.ContainerIface) error { + ret := _m.Called(c) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.ContainerIface) error); ok { + r0 = rf(c) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// ContainerUpdate provides a mock function with given fields: c +func (_m *ContainerStateServiceIface) ContainerUpdate(c domain.ContainerIface) error { + ret := _m.Called(c) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.ContainerIface) error); ok { + r0 = rf(c) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// FuseServerService provides a mock function with given fields: +func (_m *ContainerStateServiceIface) FuseServerService() domain.FuseServerServiceIface { + ret := _m.Called() + + var r0 domain.FuseServerServiceIface + if rf, ok := ret.Get(0).(func() domain.FuseServerServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.FuseServerServiceIface) + } + } + + return r0 +} + +// MountService provides a mock function with given fields: +func (_m *ContainerStateServiceIface) MountService() domain.MountServiceIface { + ret := _m.Called() + + var r0 domain.MountServiceIface + if rf, ok := ret.Get(0).(func() domain.MountServiceIface); ok { + r0 = rf() + } else { + if 
ret.Get(0) != nil { + r0 = ret.Get(0).(domain.MountServiceIface) + } + } + + return r0 +} + +// ProcessService provides a mock function with given fields: +func (_m *ContainerStateServiceIface) ProcessService() domain.ProcessServiceIface { + ret := _m.Called() + + var r0 domain.ProcessServiceIface + if rf, ok := ret.Get(0).(func() domain.ProcessServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.ProcessServiceIface) + } + } + + return r0 +} + +// Setup provides a mock function with given fields: fss, prs, ios, mts +func (_m *ContainerStateServiceIface) Setup(fss domain.FuseServerServiceIface, prs domain.ProcessServiceIface, ios domain.IOServiceIface, mts domain.MountServiceIface) { + _m.Called(fss, prs, ios, mts) +} diff --git a/sysbox-fs/mocks/FuseServerIface.go b/sysbox-fs/mocks/FuseServerIface.go new file mode 100644 index 00000000..03c7270a --- /dev/null +++ b/sysbox-fs/mocks/FuseServerIface.go @@ -0,0 +1,95 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. 
+ +package mocks + +import mock "github.com/stretchr/testify/mock" + +// FuseServerIface is an autogenerated mock type for the FuseServerIface type +type FuseServerIface struct { + mock.Mock +} + +// Create provides a mock function with given fields: +func (_m *FuseServerIface) Create() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Destroy provides a mock function with given fields: +func (_m *FuseServerIface) Destroy() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// InitWait provides a mock function with given fields: +func (_m *FuseServerIface) InitWait() { + _m.Called() +} + +// IsCntrRegCompleted provides a mock function with given fields: +func (_m *FuseServerIface) IsCntrRegCompleted() bool { + ret := _m.Called() + + var r0 bool + if rf, ok := ret.Get(0).(func() bool); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// MountPoint provides a mock function with given fields: +func (_m *FuseServerIface) MountPoint() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// Run provides a mock function with given fields: +func (_m *FuseServerIface) Run() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SetCntrRegComplete provides a mock function with given fields: +func (_m *FuseServerIface) SetCntrRegComplete() { + _m.Called() +} + +// Unmount provides a mock function with given fields: +func (_m *FuseServerIface) Unmount() { + _m.Called() +} diff --git a/sysbox-fs/mocks/FuseServerServiceIface.go b/sysbox-fs/mocks/FuseServerServiceIface.go new file mode 100644 index 
00000000..d2e1911d --- /dev/null +++ b/sysbox-fs/mocks/FuseServerServiceIface.go @@ -0,0 +1,74 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. + +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" +) + +// FuseServerServiceIface is an autogenerated mock type for the FuseServerServiceIface type +type FuseServerServiceIface struct { + mock.Mock +} + +// CreateFuseServer provides a mock function with given fields: serveCntr, stateCntr +func (_m *FuseServerServiceIface) CreateFuseServer(serveCntr domain.ContainerIface, stateCntr domain.ContainerIface) error { + ret := _m.Called(serveCntr, stateCntr) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.ContainerIface, domain.ContainerIface) error); ok { + r0 = rf(serveCntr, stateCntr) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// DestroyFuseServer provides a mock function with given fields: mp +func (_m *FuseServerServiceIface) DestroyFuseServer(mp string) error { + ret := _m.Called(mp) + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(mp) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// DestroyFuseService provides a mock function with given fields: +func (_m *FuseServerServiceIface) DestroyFuseService() { + _m.Called() +} + +// FuseServerCntrRegComplete provides a mock function with given fields: cntr +func (_m *FuseServerServiceIface) FuseServerCntrRegComplete(cntr domain.ContainerIface) error { + ret := _m.Called(cntr) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.ContainerIface) error); ok { + r0 = rf(cntr) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Setup provides a mock function with given fields: mp, css, ios, hds +func (_m *FuseServerServiceIface) Setup(mp string, css domain.ContainerStateServiceIface, ios domain.IOServiceIface, hds domain.HandlerServiceIface) error { + ret := _m.Called(mp, css, ios, hds) + + var r0 error + if rf, ok := ret.Get(0).(func(string, 
domain.ContainerStateServiceIface, domain.IOServiceIface, domain.HandlerServiceIface) error); ok { + r0 = rf(mp, css, ios, hds) + } else { + r0 = ret.Error(0) + } + + return r0 +} diff --git a/sysbox-fs/mocks/HandlerIface.go b/sysbox-fs/mocks/HandlerIface.go new file mode 100644 index 00000000..1606ac3c --- /dev/null +++ b/sysbox-fs/mocks/HandlerIface.go @@ -0,0 +1,200 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. + +package mocks + +import ( + os "os" + sync "sync" + + domain "github.com/nestybox/sysbox-fs/domain" + + mock "github.com/stretchr/testify/mock" +) + +// HandlerIface is an autogenerated mock type for the HandlerIface type +type HandlerIface struct { + mock.Mock +} + +// GetName provides a mock function with given fields: +func (_m *HandlerIface) GetName() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// GetPath provides a mock function with given fields: +func (_m *HandlerIface) GetPath() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// GetResourceMap provides a mock function with given fields: +func (_m *HandlerIface) GetResourceMap() map[string]domain.EmuResource { + ret := _m.Called() + + var r0 map[string]domain.EmuResource + if rf, ok := ret.Get(0).(func() map[string]domain.EmuResource); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(map[string]domain.EmuResource) + } + } + + return r0 +} + +// GetResourceMutex provides a mock function with given fields: s +func (_m *HandlerIface) GetResourceMutex(s string) *sync.Mutex { + ret := _m.Called(s) + + var r0 *sync.Mutex + if rf, ok := ret.Get(0).(func(string) *sync.Mutex); ok { + r0 = rf(s) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*sync.Mutex) + } + } + + return r0 +} + +// GetService provides a mock function 
with given fields: +func (_m *HandlerIface) GetService() domain.HandlerServiceIface { + ret := _m.Called() + + var r0 domain.HandlerServiceIface + if rf, ok := ret.Get(0).(func() domain.HandlerServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.HandlerServiceIface) + } + } + + return r0 +} + +// Lookup provides a mock function with given fields: n, req +func (_m *HandlerIface) Lookup(n domain.IOnodeIface, req *domain.HandlerRequest) (os.FileInfo, error) { + ret := _m.Called(n, req) + + var r0 os.FileInfo + if rf, ok := ret.Get(0).(func(domain.IOnodeIface, *domain.HandlerRequest) os.FileInfo); ok { + r0 = rf(n, req) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(os.FileInfo) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(domain.IOnodeIface, *domain.HandlerRequest) error); ok { + r1 = rf(n, req) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Open provides a mock function with given fields: node, req +func (_m *HandlerIface) Open(node domain.IOnodeIface, req *domain.HandlerRequest) error { + ret := _m.Called(node, req) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.IOnodeIface, *domain.HandlerRequest) error); ok { + r0 = rf(node, req) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Read provides a mock function with given fields: node, req +func (_m *HandlerIface) Read(node domain.IOnodeIface, req *domain.HandlerRequest) (int, error) { + ret := _m.Called(node, req) + + var r0 int + if rf, ok := ret.Get(0).(func(domain.IOnodeIface, *domain.HandlerRequest) int); ok { + r0 = rf(node, req) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func(domain.IOnodeIface, *domain.HandlerRequest) error); ok { + r1 = rf(node, req) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ReadDirAll provides a mock function with given fields: node, req +func (_m *HandlerIface) ReadDirAll(node domain.IOnodeIface, req *domain.HandlerRequest) ([]os.FileInfo, 
error) { + ret := _m.Called(node, req) + + var r0 []os.FileInfo + if rf, ok := ret.Get(0).(func(domain.IOnodeIface, *domain.HandlerRequest) []os.FileInfo); ok { + r0 = rf(node, req) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]os.FileInfo) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(domain.IOnodeIface, *domain.HandlerRequest) error); ok { + r1 = rf(node, req) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// SetService provides a mock function with given fields: hs +func (_m *HandlerIface) SetService(hs domain.HandlerServiceIface) { + _m.Called(hs) +} + +// Write provides a mock function with given fields: node, req +func (_m *HandlerIface) Write(node domain.IOnodeIface, req *domain.HandlerRequest) (int, error) { + ret := _m.Called(node, req) + + var r0 int + if rf, ok := ret.Get(0).(func(domain.IOnodeIface, *domain.HandlerRequest) int); ok { + r0 = rf(node, req) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func(domain.IOnodeIface, *domain.HandlerRequest) error); ok { + r1 = rf(node, req) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} diff --git a/sysbox-fs/mocks/HandlerServiceIface.go b/sysbox-fs/mocks/HandlerServiceIface.go new file mode 100644 index 00000000..dfed9add --- /dev/null +++ b/sysbox-fs/mocks/HandlerServiceIface.go @@ -0,0 +1,399 @@ +// Code generated by mockery v2.44.1. DO NOT EDIT. 
+ +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" +) + +// HandlerServiceIface is an autogenerated mock type for the HandlerServiceIface type +type HandlerServiceIface struct { + mock.Mock +} + +// DisableHandler provides a mock function with given fields: path +func (_m *HandlerServiceIface) DisableHandler(path string) error { + ret := _m.Called(path) + + if len(ret) == 0 { + panic("no return value specified for DisableHandler") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(path) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// EnableHandler provides a mock function with given fields: path +func (_m *HandlerServiceIface) EnableHandler(path string) error { + ret := _m.Called(path) + + if len(ret) == 0 { + panic("no return value specified for EnableHandler") + } + + var r0 error + if rf, ok := ret.Get(0).(func(string) error); ok { + r0 = rf(path) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// FindHandler provides a mock function with given fields: s +func (_m *HandlerServiceIface) FindHandler(s string) (domain.HandlerIface, bool) { + ret := _m.Called(s) + + if len(ret) == 0 { + panic("no return value specified for FindHandler") + } + + var r0 domain.HandlerIface + var r1 bool + if rf, ok := ret.Get(0).(func(string) (domain.HandlerIface, bool)); ok { + return rf(s) + } + if rf, ok := ret.Get(0).(func(string) domain.HandlerIface); ok { + r0 = rf(s) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.HandlerIface) + } + } + + if rf, ok := ret.Get(1).(func(string) bool); ok { + r1 = rf(s) + } else { + r1 = ret.Get(1).(bool) + } + + return r0, r1 +} + +// FindHostUuid provides a mock function with given fields: +func (_m *HandlerServiceIface) FindHostUuid() (string, error) { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for FindHostUuid") + } + + var r0 string + var r1 error + if rf, ok := 
ret.Get(0).(func() (string, error)); ok { + return rf() + } + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// FindUserNsInode provides a mock function with given fields: pid +func (_m *HandlerServiceIface) FindUserNsInode(pid uint32) (uint64, error) { + ret := _m.Called(pid) + + if len(ret) == 0 { + panic("no return value specified for FindUserNsInode") + } + + var r0 uint64 + var r1 error + if rf, ok := ret.Get(0).(func(uint32) (uint64, error)); ok { + return rf(pid) + } + if rf, ok := ret.Get(0).(func(uint32) uint64); ok { + r0 = rf(pid) + } else { + r0 = ret.Get(0).(uint64) + } + + if rf, ok := ret.Get(1).(func(uint32) error); ok { + r1 = rf(pid) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// GetPassThroughHandler provides a mock function with given fields: +func (_m *HandlerServiceIface) GetPassThroughHandler() domain.PassthroughHandlerIface { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for GetPassThroughHandler") + } + + var r0 domain.PassthroughHandlerIface + if rf, ok := ret.Get(0).(func() domain.PassthroughHandlerIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.PassthroughHandlerIface) + } + } + + return r0 +} + +// HandlersResourcesList provides a mock function with given fields: +func (_m *HandlerServiceIface) HandlersResourcesList() []string { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for HandlersResourcesList") + } + + var r0 []string + if rf, ok := ret.Get(0).(func() []string); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + return r0 +} + +// HostUserNsInode provides a mock function with given fields: +func (_m *HandlerServiceIface) HostUserNsInode() uint64 { + ret := _m.Called() + + if len(ret) == 0 { + 
panic("no return value specified for HostUserNsInode") + } + + var r0 uint64 + if rf, ok := ret.Get(0).(func() uint64); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(uint64) + } + + return r0 +} + +// HostUuid provides a mock function with given fields: +func (_m *HandlerServiceIface) HostUuid() string { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for HostUuid") + } + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// IOService provides a mock function with given fields: +func (_m *HandlerServiceIface) IOService() domain.IOServiceIface { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for IOService") + } + + var r0 domain.IOServiceIface + if rf, ok := ret.Get(0).(func() domain.IOServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.IOServiceIface) + } + } + + return r0 +} + +// IgnoreErrors provides a mock function with given fields: +func (_m *HandlerServiceIface) IgnoreErrors() bool { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for IgnoreErrors") + } + + var r0 bool + if rf, ok := ret.Get(0).(func() bool); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// LookupHandler provides a mock function with given fields: i +func (_m *HandlerServiceIface) LookupHandler(i domain.IOnodeIface) (domain.HandlerIface, bool) { + ret := _m.Called(i) + + if len(ret) == 0 { + panic("no return value specified for LookupHandler") + } + + var r0 domain.HandlerIface + var r1 bool + if rf, ok := ret.Get(0).(func(domain.IOnodeIface) (domain.HandlerIface, bool)); ok { + return rf(i) + } + if rf, ok := ret.Get(0).(func(domain.IOnodeIface) domain.HandlerIface); ok { + r0 = rf(i) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.HandlerIface) + } + } + + if rf, ok := ret.Get(1).(func(domain.IOnodeIface) bool); ok { + r1 = 
rf(i) + } else { + r1 = ret.Get(1).(bool) + } + + return r0, r1 +} + +// NSenterService provides a mock function with given fields: +func (_m *HandlerServiceIface) NSenterService() domain.NSenterServiceIface { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for NSenterService") + } + + var r0 domain.NSenterServiceIface + if rf, ok := ret.Get(0).(func() domain.NSenterServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.NSenterServiceIface) + } + } + + return r0 +} + +// ProcessService provides a mock function with given fields: +func (_m *HandlerServiceIface) ProcessService() domain.ProcessServiceIface { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for ProcessService") + } + + var r0 domain.ProcessServiceIface + if rf, ok := ret.Get(0).(func() domain.ProcessServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.ProcessServiceIface) + } + } + + return r0 +} + +// RegisterHandler provides a mock function with given fields: h +func (_m *HandlerServiceIface) RegisterHandler(h domain.HandlerIface) error { + ret := _m.Called(h) + + if len(ret) == 0 { + panic("no return value specified for RegisterHandler") + } + + var r0 error + if rf, ok := ret.Get(0).(func(domain.HandlerIface) error); ok { + r0 = rf(h) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SetStateService provides a mock function with given fields: css +func (_m *HandlerServiceIface) SetStateService(css domain.ContainerStateServiceIface) { + _m.Called(css) +} + +// Setup provides a mock function with given fields: hdlrs, ignoreErrors, css, nss, prs, ios +func (_m *HandlerServiceIface) Setup(hdlrs []domain.HandlerIface, ignoreErrors bool, css domain.ContainerStateServiceIface, nss domain.NSenterServiceIface, prs domain.ProcessServiceIface, ios domain.IOServiceIface) { + _m.Called(hdlrs, ignoreErrors, css, nss, prs, ios) +} + +// StateService provides a mock 
function with given fields: +func (_m *HandlerServiceIface) StateService() domain.ContainerStateServiceIface { + ret := _m.Called() + + if len(ret) == 0 { + panic("no return value specified for StateService") + } + + var r0 domain.ContainerStateServiceIface + if rf, ok := ret.Get(0).(func() domain.ContainerStateServiceIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.ContainerStateServiceIface) + } + } + + return r0 +} + +// UnregisterHandler provides a mock function with given fields: h +func (_m *HandlerServiceIface) UnregisterHandler(h domain.HandlerIface) error { + ret := _m.Called(h) + + if len(ret) == 0 { + panic("no return value specified for UnregisterHandler") + } + + var r0 error + if rf, ok := ret.Get(0).(func(domain.HandlerIface) error); ok { + r0 = rf(h) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// NewHandlerServiceIface creates a new instance of HandlerServiceIface. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewHandlerServiceIface(t interface { + mock.TestingT + Cleanup(func()) +}) *HandlerServiceIface { + mock := &HandlerServiceIface{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/sysbox-fs/mocks/IOServiceIface.go b/sysbox-fs/mocks/IOServiceIface.go new file mode 100644 index 00000000..105dcbf7 --- /dev/null +++ b/sysbox-fs/mocks/IOServiceIface.go @@ -0,0 +1,60 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. 
+ +package mocks + +import ( + os "os" + + domain "github.com/nestybox/sysbox-fs/domain" + + mock "github.com/stretchr/testify/mock" +) + +// IOServiceIface is an autogenerated mock type for the IOServiceIface type +type IOServiceIface struct { + mock.Mock +} + +// GetServiceType provides a mock function with given fields: +func (_m *IOServiceIface) GetServiceType() int { + ret := _m.Called() + + var r0 int + if rf, ok := ret.Get(0).(func() int); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(int) + } + + return r0 +} + +// NewIOnode provides a mock function with given fields: n, p, attr +func (_m *IOServiceIface) NewIOnode(n string, p string, attr os.FileMode) domain.IOnodeIface { + ret := _m.Called(n, p, attr) + + var r0 domain.IOnodeIface + if rf, ok := ret.Get(0).(func(string, string, os.FileMode) domain.IOnodeIface); ok { + r0 = rf(n, p, attr) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.IOnodeIface) + } + } + + return r0 +} + +// RemoveAllIOnodes provides a mock function with given fields: +func (_m *IOServiceIface) RemoveAllIOnodes() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} diff --git a/sysbox-fs/mocks/IOnodeIface.go b/sysbox-fs/mocks/IOnodeIface.go new file mode 100644 index 00000000..ca5131e4 --- /dev/null +++ b/sysbox-fs/mocks/IOnodeIface.go @@ -0,0 +1,383 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. 
+ +package mocks + +import ( + os "os" + + mock "github.com/stretchr/testify/mock" +) + +// IOnodeIface is an autogenerated mock type for the IOnodeIface type +type IOnodeIface struct { + mock.Mock +} + +// Close provides a mock function with given fields: +func (_m *IOnodeIface) Close() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// GetNsInode provides a mock function with given fields: +func (_m *IOnodeIface) GetNsInode() (uint64, error) { + ret := _m.Called() + + var r0 uint64 + if rf, ok := ret.Get(0).(func() uint64); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(uint64) + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Mkdir provides a mock function with given fields: +func (_m *IOnodeIface) Mkdir() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// MkdirAll provides a mock function with given fields: +func (_m *IOnodeIface) MkdirAll() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// Name provides a mock function with given fields: +func (_m *IOnodeIface) Name() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// Open provides a mock function with given fields: +func (_m *IOnodeIface) Open() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// OpenFlags provides a mock function with given fields: +func (_m *IOnodeIface) OpenFlags() int { + ret := _m.Called() + + var r0 int + if rf, ok := ret.Get(0).(func() 
int); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(int) + } + + return r0 +} + +// OpenMode provides a mock function with given fields: +func (_m *IOnodeIface) OpenMode() os.FileMode { + ret := _m.Called() + + var r0 os.FileMode + if rf, ok := ret.Get(0).(func() os.FileMode); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(os.FileMode) + } + + return r0 +} + +// Path provides a mock function with given fields: +func (_m *IOnodeIface) Path() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// Read provides a mock function with given fields: p +func (_m *IOnodeIface) Read(p []byte) (int, error) { + ret := _m.Called(p) + + var r0 int + if rf, ok := ret.Get(0).(func([]byte) int); ok { + r0 = rf(p) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func([]byte) error); ok { + r1 = rf(p) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ReadAt provides a mock function with given fields: p, off +func (_m *IOnodeIface) ReadAt(p []byte, off int64) (int, error) { + ret := _m.Called(p, off) + + var r0 int + if rf, ok := ret.Get(0).(func([]byte, int64) int); ok { + r0 = rf(p, off) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func([]byte, int64) error); ok { + r1 = rf(p, off) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ReadDirAll provides a mock function with given fields: +func (_m *IOnodeIface) ReadDirAll() ([]os.FileInfo, error) { + ret := _m.Called() + + var r0 []os.FileInfo + if rf, ok := ret.Get(0).(func() []os.FileInfo); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]os.FileInfo) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ReadFile provides a mock function with given fields: +func (_m *IOnodeIface) ReadFile() ([]byte, error) { 
+ ret := _m.Called() + + var r0 []byte + if rf, ok := ret.Get(0).(func() []byte); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]byte) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// ReadLine provides a mock function with given fields: +func (_m *IOnodeIface) ReadLine() (string, error) { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Remove provides a mock function with given fields: +func (_m *IOnodeIface) Remove() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// RemoveAll provides a mock function with given fields: +func (_m *IOnodeIface) RemoveAll() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SeekReset provides a mock function with given fields: +func (_m *IOnodeIface) SeekReset() (int64, error) { + ret := _m.Called() + + var r0 int64 + if rf, ok := ret.Get(0).(func() int64); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(int64) + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// SetName provides a mock function with given fields: s +func (_m *IOnodeIface) SetName(s string) { + _m.Called(s) +} + +// SetOpenFlags provides a mock function with given fields: flags +func (_m *IOnodeIface) SetOpenFlags(flags int) { + _m.Called(flags) +} + +// SetOpenMode provides a mock function with given fields: mode +func (_m *IOnodeIface) SetOpenMode(mode os.FileMode) { + _m.Called(mode) +} + +// SetPath provides 
a mock function with given fields: s +func (_m *IOnodeIface) SetPath(s string) { + _m.Called(s) +} + +// Stat provides a mock function with given fields: +func (_m *IOnodeIface) Stat() (os.FileInfo, error) { + ret := _m.Called() + + var r0 os.FileInfo + if rf, ok := ret.Get(0).(func() os.FileInfo); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(os.FileInfo) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func() error); ok { + r1 = rf() + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Write provides a mock function with given fields: p +func (_m *IOnodeIface) Write(p []byte) (int, error) { + ret := _m.Called(p) + + var r0 int + if rf, ok := ret.Get(0).(func([]byte) int); ok { + r0 = rf(p) + } else { + r0 = ret.Get(0).(int) + } + + var r1 error + if rf, ok := ret.Get(1).(func([]byte) error); ok { + r1 = rf(p) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// WriteFile provides a mock function with given fields: p +func (_m *IOnodeIface) WriteFile(p []byte) error { + ret := _m.Called(p) + + var r0 error + if rf, ok := ret.Get(0).(func([]byte) error); ok { + r0 = rf(p) + } else { + r0 = ret.Error(0) + } + + return r0 +} diff --git a/sysbox-fs/mocks/MountHelperIface.go b/sysbox-fs/mocks/MountHelperIface.go new file mode 100644 index 00000000..d94c0ad0 --- /dev/null +++ b/sysbox-fs/mocks/MountHelperIface.go @@ -0,0 +1,154 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. 
+ +package mocks + +import mock "github.com/stretchr/testify/mock" + +// MountHelperIface is an autogenerated mock type for the MountHelperIface type +type MountHelperIface struct { + mock.Mock +} + +// FilterFsFlags provides a mock function with given fields: fsOpts +func (_m *MountHelperIface) FilterFsFlags(fsOpts map[string]string) string { + ret := _m.Called(fsOpts) + + var r0 string + if rf, ok := ret.Get(0).(func(map[string]string) string); ok { + r0 = rf(fsOpts) + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + +// HasPropagationFlag provides a mock function with given fields: flags +func (_m *MountHelperIface) HasPropagationFlag(flags uint64) bool { + ret := _m.Called(flags) + + var r0 bool + if rf, ok := ret.Get(0).(func(uint64) bool); ok { + r0 = rf(flags) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsBind provides a mock function with given fields: flags +func (_m *MountHelperIface) IsBind(flags uint64) bool { + ret := _m.Called(flags) + + var r0 bool + if rf, ok := ret.Get(0).(func(uint64) bool); ok { + r0 = rf(flags) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsMove provides a mock function with given fields: flags +func (_m *MountHelperIface) IsMove(flags uint64) bool { + ret := _m.Called(flags) + + var r0 bool + if rf, ok := ret.Get(0).(func(uint64) bool); ok { + r0 = rf(flags) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsNewMount provides a mock function with given fields: flags +func (_m *MountHelperIface) IsNewMount(flags uint64) bool { + ret := _m.Called(flags) + + var r0 bool + if rf, ok := ret.Get(0).(func(uint64) bool); ok { + r0 = rf(flags) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// IsReadOnlyMount provides a mock function with given fields: flags +func (_m *MountHelperIface) IsReadOnlyMount(flags uint64) bool { + ret := _m.Called(flags) + + var r0 bool + if rf, ok := ret.Get(0).(func(uint64) bool); ok { + r0 = rf(flags) + } else { + r0 = 
ret.Get(0).(bool) + } + + return r0 +} + +// IsRemount provides a mock function with given fields: flags +func (_m *MountHelperIface) IsRemount(flags uint64) bool { + ret := _m.Called(flags) + + var r0 bool + if rf, ok := ret.Get(0).(func(uint64) bool); ok { + r0 = rf(flags) + } else { + r0 = ret.Get(0).(bool) + } + + return r0 +} + +// ProcMounts provides a mock function with given fields: +func (_m *MountHelperIface) ProcMounts() []string { + ret := _m.Called() + + var r0 []string + if rf, ok := ret.Get(0).(func() []string); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + return r0 +} + +// StringToFlags provides a mock function with given fields: s +func (_m *MountHelperIface) StringToFlags(s map[string]string) uint64 { + ret := _m.Called(s) + + var r0 uint64 + if rf, ok := ret.Get(0).(func(map[string]string) uint64); ok { + r0 = rf(s) + } else { + r0 = ret.Get(0).(uint64) + } + + return r0 +} + +// SysMounts provides a mock function with given fields: +func (_m *MountHelperIface) SysMounts() []string { + ret := _m.Called() + + var r0 []string + if rf, ok := ret.Get(0).(func() []string); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]string) + } + } + + return r0 +} diff --git a/sysbox-fs/mocks/MountServiceIface.go b/sysbox-fs/mocks/MountServiceIface.go new file mode 100644 index 00000000..4d9306d6 --- /dev/null +++ b/sysbox-fs/mocks/MountServiceIface.go @@ -0,0 +1,73 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. 
+ +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" +) + +// MountServiceIface is an autogenerated mock type for the MountServiceIface type +type MountServiceIface struct { + mock.Mock +} + +// MountHelper provides a mock function with given fields: +func (_m *MountServiceIface) MountHelper() domain.MountHelperIface { + ret := _m.Called() + + var r0 domain.MountHelperIface + if rf, ok := ret.Get(0).(func() domain.MountHelperIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.MountHelperIface) + } + } + + return r0 +} + +// NewMountHelper provides a mock function with given fields: +func (_m *MountServiceIface) NewMountHelper() domain.MountHelperIface { + ret := _m.Called() + + var r0 domain.MountHelperIface + if rf, ok := ret.Get(0).(func() domain.MountHelperIface); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.MountHelperIface) + } + } + + return r0 +} + +// NewMountInfoParser provides a mock function with given fields: c, process, launchParser, fetchOptions, fetchInodes +func (_m *MountServiceIface) NewMountInfoParser(c domain.ContainerIface, process domain.ProcessIface, launchParser bool, fetchOptions bool, fetchInodes bool) (domain.MountInfoParserIface, error) { + ret := _m.Called(c, process, launchParser, fetchOptions, fetchInodes) + + var r0 domain.MountInfoParserIface + if rf, ok := ret.Get(0).(func(domain.ContainerIface, domain.ProcessIface, bool, bool, bool) domain.MountInfoParserIface); ok { + r0 = rf(c, process, launchParser, fetchOptions, fetchInodes) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.MountInfoParserIface) + } + } + + var r1 error + if rf, ok := ret.Get(1).(func(domain.ContainerIface, domain.ProcessIface, bool, bool, bool) error); ok { + r1 = rf(c, process, launchParser, fetchOptions, fetchInodes) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Setup provides a mock function with 
given fields: css, hds, prs, nss +func (_m *MountServiceIface) Setup(css domain.ContainerStateServiceIface, hds domain.HandlerServiceIface, prs domain.ProcessServiceIface, nss domain.NSenterServiceIface) { + _m.Called(css, hds, prs, nss) +} diff --git a/sysbox-fs/mocks/NSenterEventIface.go b/sysbox-fs/mocks/NSenterEventIface.go new file mode 100644 index 00000000..b2605146 --- /dev/null +++ b/sysbox-fs/mocks/NSenterEventIface.go @@ -0,0 +1,113 @@ +// Code generated by mockery v1.0.0. DO NOT EDIT. + +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" +) + +// NSenterEventIface is an autogenerated mock type for the NSenterEventIface type +type NSenterEventIface struct { + mock.Mock +} + +// GetProcessID provides a mock function with given fields: +func (_m *NSenterEventIface) GetProcessID() uint32 { + ret := _m.Called() + + var r0 uint32 + if rf, ok := ret.Get(0).(func() uint32); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(uint32) + } + + return r0 +} + +// GetRequestMsg provides a mock function with given fields: +func (_m *NSenterEventIface) GetRequestMsg() *domain.NSenterMessage { + ret := _m.Called() + + var r0 *domain.NSenterMessage + if rf, ok := ret.Get(0).(func() *domain.NSenterMessage); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*domain.NSenterMessage) + } + } + + return r0 +} + +// GetResponseMsg provides a mock function with given fields: +func (_m *NSenterEventIface) GetResponseMsg() *domain.NSenterMessage { + ret := _m.Called() + + var r0 *domain.NSenterMessage + if rf, ok := ret.Get(0).(func() *domain.NSenterMessage); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*domain.NSenterMessage) + } + } + + return r0 +} + +// ReceiveResponse provides a mock function with given fields: +func (_m *NSenterEventIface) ReceiveResponse() *domain.NSenterMessage { + ret := _m.Called() + + var r0 *domain.NSenterMessage + if rf, ok := 
ret.Get(0).(func() *domain.NSenterMessage); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*domain.NSenterMessage) + } + } + + return r0 +} + +// SendRequest provides a mock function with given fields: +func (_m *NSenterEventIface) SendRequest() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// SetRequestMsg provides a mock function with given fields: m +func (_m *NSenterEventIface) SetRequestMsg(m *domain.NSenterMessage) { + _m.Called(m) +} + +// SetResponseMsg provides a mock function with given fields: m +func (_m *NSenterEventIface) SetResponseMsg(m *domain.NSenterMessage) { + _m.Called(m) +} + +// TerminateRequest provides a mock function with given fields: +func (_m *NSenterEventIface) TerminateRequest() error { + ret := _m.Called() + + var r0 error + if rf, ok := ret.Get(0).(func() error); ok { + r0 = rf() + } else { + r0 = ret.Error(0) + } + + return r0 +} diff --git a/sysbox-fs/mocks/NSenterServiceIface.go b/sysbox-fs/mocks/NSenterServiceIface.go new file mode 100644 index 00000000..19e83e56 --- /dev/null +++ b/sysbox-fs/mocks/NSenterServiceIface.go @@ -0,0 +1,106 @@ +// Code generated by mockery v2.36.0. DO NOT EDIT. 
+ +package mocks + +import ( + domain "github.com/nestybox/sysbox-fs/domain" + mock "github.com/stretchr/testify/mock" +) + +// NSenterServiceIface is an autogenerated mock type for the NSenterServiceIface type +type NSenterServiceIface struct { + mock.Mock +} + +// GetEventProcessID provides a mock function with given fields: e +func (_m *NSenterServiceIface) GetEventProcessID(e domain.NSenterEventIface) uint32 { + ret := _m.Called(e) + + var r0 uint32 + if rf, ok := ret.Get(0).(func(domain.NSenterEventIface) uint32); ok { + r0 = rf(e) + } else { + r0 = ret.Get(0).(uint32) + } + + return r0 +} + +// NewEvent provides a mock function with given fields: pid, ns, cloneFlags, req, res, async +func (_m *NSenterServiceIface) NewEvent(pid uint32, ns *[]string, cloneFlags uint32, req *domain.NSenterMessage, res *domain.NSenterMessage, async bool) domain.NSenterEventIface { + ret := _m.Called(pid, ns, cloneFlags, req, res, async) + + var r0 domain.NSenterEventIface + if rf, ok := ret.Get(0).(func(uint32, *[]string, uint32, *domain.NSenterMessage, *domain.NSenterMessage, bool) domain.NSenterEventIface); ok { + r0 = rf(pid, ns, cloneFlags, req, res, async) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(domain.NSenterEventIface) + } + } + + return r0 +} + +// ReceiveResponseEvent provides a mock function with given fields: e +func (_m *NSenterServiceIface) ReceiveResponseEvent(e domain.NSenterEventIface) *domain.NSenterMessage { + ret := _m.Called(e) + + var r0 *domain.NSenterMessage + if rf, ok := ret.Get(0).(func(domain.NSenterEventIface) *domain.NSenterMessage); ok { + r0 = rf(e) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*domain.NSenterMessage) + } + } + + return r0 +} + +// SendRequestEvent provides a mock function with given fields: e +func (_m *NSenterServiceIface) SendRequestEvent(e domain.NSenterEventIface) error { + ret := _m.Called(e) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.NSenterEventIface) error); ok { + r0 = rf(e) + } else { 
+ r0 = ret.Error(0) + } + + return r0 +} + +// Setup provides a mock function with given fields: prs, mts +func (_m *NSenterServiceIface) Setup(prs domain.ProcessServiceIface, mts domain.MountServiceIface) { + _m.Called(prs, mts) +} + +// TerminateRequestEvent provides a mock function with given fields: e +func (_m *NSenterServiceIface) TerminateRequestEvent(e domain.NSenterEventIface) error { + ret := _m.Called(e) + + var r0 error + if rf, ok := ret.Get(0).(func(domain.NSenterEventIface) error); ok { + r0 = rf(e) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +// NewNSenterServiceIface creates a new instance of NSenterServiceIface. It also registers a testing interface on the mock and a cleanup function to assert the mocks expectations. +// The first argument is typically a *testing.T value. +func NewNSenterServiceIface(t interface { + mock.TestingT + Cleanup(func()) +}) *NSenterServiceIface { + mock := &NSenterServiceIface{} + mock.Mock.Test(t) + + t.Cleanup(func() { mock.AssertExpectations(t) }) + + return mock +} diff --git a/sysbox-fs/mocks/README.md b/sysbox-fs/mocks/README.md new file mode 100644 index 00000000..7b3f37d7 --- /dev/null +++ b/sysbox-fs/mocks/README.md @@ -0,0 +1,19 @@ +# Mocks generation + +To generate or update code mocks in this folder simply follow these steps ... + +1) Download required binaries / libs if not already done: + +``` +rmolina@dev-vm1:~/wsp/05-07-2020/sysbox/sysbox-fs$ go get github.com/stretchr/testify +rmolina@dev-vm1:~/wsp/05-07-2020/sysbox/sysbox-fs$ go get github.com/vektra/mockery/.../ +``` + +2) Execute 'mock' binary by pointing to the interface that you want to mock and +the path where this one is located. 
In sysbox-fs' case, all interfaces are defined +within the "domain" folder: + +``` +rmolina@dev-vm1:~/wsp/05-07-2020/sysbox/sysbox-fs$ mockery -name=FuseServerIface -dir=domain +Generating mock for: FuseServerIface in file: mocks/FuseServerIface.go +``` \ No newline at end of file diff --git a/sysbox-fs/mount/helper.go b/sysbox-fs/mount/helper.go new file mode 100644 index 00000000..2a961367 --- /dev/null +++ b/sysbox-fs/mount/helper.go @@ -0,0 +1,159 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package mount + +import ( + "strings" + + libutils "github.com/nestybox/sysbox-libs/utils" + "golang.org/x/sys/unix" +) + +// The mountPropFlags in a mount syscall indicate a change in the propagation type of an +// existing mountpoint. +const mountPropFlags = (unix.MS_SHARED | unix.MS_PRIVATE | unix.MS_SLAVE | unix.MS_UNBINDABLE) + +// The mountModFlags in a mount syscall indicate a change to an existing mountpoint. If +// these flags are not present, the mount syscall creates a new mountpoint. +const mountModFlags = (unix.MS_REMOUNT | unix.MS_BIND | unix.MS_MOVE | mountPropFlags) + +// mountHelper provides methods to aid in obtaining info about container mountpoints +// managed by sysboxfs. 
+type mountHelper struct { + mapMounts map[string]struct{} // map of all sysboxfs bind-mounts (rdonly + mask) + procMounts []string // slice of procfs bind-mounts + sysMounts []string // slice of sysfs bind-mounts + flagsMap map[string]uint64 // helper map to aid in flag conversion + service *MountService // backpointer to parent service object +} + +func newMountHelper(svc *MountService) *mountHelper { + + info := &mountHelper{ + mapMounts: make(map[string]struct{}), + service: svc, + procMounts: ProcfsMounts, + sysMounts: SysfsMounts, + } + + // Sort proc and sys mounts hierarchically in case later mounts depend on + // earlier ones. + libutils.FilepathSort(info.procMounts) + libutils.FilepathSort(info.sysMounts) + + // + // Initialize a flagsMap to help in "/proc/pid/mountHelper" parsing. Note that + // even though these are a subset of the flags supported by Linux kernel, these + // are the ones that are taken into account to generate /proc/pid/mountinfo + // content. Details here: + // https://github.com/torvalds/linux/blob/master/fs/proc_namespace.c#L131 + // https://github.com/torvalds/linux/blob/master/include/linux/mount.h + // + info.flagsMap = map[string]uint64{ + "ro": unix.MS_RDONLY, // Read-only file-system + "nodev": unix.MS_NODEV, // Will not interpret character or block special devices + "noexec": unix.MS_NOEXEC, // Will not allow execution of any binaries + "nosuid": unix.MS_NOSUID, // Will not allow set-user/group-identifier + "noatime": unix.MS_NOATIME, // Will not update the file access-time when reading from a file + "nodiratime": unix.MS_NODIRATIME, // Will not update the directory access time + "relatime": unix.MS_RELATIME, // Updates inode access-times relative to modify time + "strictatime": unix.MS_STRICTATIME, // Always update last access time + "sync": unix.MS_SYNCHRONOUS, // Make writes synchronous + } + + return info +} + +// ProcMounts returns sysbox-fs' procfs submounts. 
+func (m *mountHelper) ProcMounts() []string { + return m.procMounts +} + +// SysMounts returns sysbox-fs' sysfs submounts. +func (m *mountHelper) SysMounts() []string { + return m.sysMounts +} + +// IsNewMount returns true if the mount flags indicate creation of a new mountpoint. +func (m *mountHelper) IsNewMount(flags uint64) bool { + return flags&unix.MS_MGC_MSK == unix.MS_MGC_VAL || flags&mountModFlags == 0 +} + +// IsRemount returns true if the mount flags indicate a remount operation. +func (m *mountHelper) IsRemount(flags uint64) bool { + return flags&unix.MS_REMOUNT == unix.MS_REMOUNT +} + +// IsBind returns true if the mount flags indicate a bind-mount operation. +func (m *mountHelper) IsBind(flags uint64) bool { + return flags&unix.MS_BIND == unix.MS_BIND +} + +// IsMove returns true if the mount flags indicate a mount move operation. +func (m *mountHelper) IsMove(flags uint64) bool { + return flags&unix.MS_MOVE == unix.MS_MOVE +} + +// HasPropagationFlag returns true if the mount flags indicate a mount +// propagation change. +func (m *mountHelper) HasPropagationFlag(flags uint64) bool { + return flags&mountPropFlags != 0 +} + +// IsReadOnlyMount returns 'true' if the mount flags indicate a read-only mount +// operation. Otherwise, 'false' is returned to refer to a read-write instruction. +func (m *mountHelper) IsReadOnlyMount(flags uint64) bool { + return flags&unix.MS_RDONLY == unix.MS_RDONLY +} + +// StringToFlags converts string-based mount flags (as extracted from +// /proc/pid/mountinfo), into their corresponding numerical values. +func (m *mountHelper) StringToFlags(s map[string]string) uint64 { + var flags uint64 + + for k, _ := range s { + // Skip read-write option as it shows up in per-mount and per-vfs options. 
+ if k == "rw" { + continue + } + val, ok := m.flagsMap[k] + if !ok { + continue + } + + flags |= val + } + + return flags +} + +// FilterFsFlags takes filesystem options as extracted from /proc/pid/mountinfo, filters +// out options corresponding to mount flags, and returns options corresponding to +// filesystem-specific mount data. +func (m *mountHelper) FilterFsFlags(fsOpts map[string]string) string { + + opts := []string{} + + for k, _ := range fsOpts { + _, ok := m.flagsMap[k] + if ok && k != "rw" { + opts = append(opts, k) + } + } + + return strings.Join(opts, ",") +} diff --git a/sysbox-fs/mount/infoParser.go b/sysbox-fs/mount/infoParser.go new file mode 100644 index 00000000..eeee3462 --- /dev/null +++ b/sysbox-fs/mount/infoParser.go @@ -0,0 +1,1063 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// +// This file provides info about mounts seen by a given process' and whether some of these +// are managed by sysbox-fs. 
+// +// For example, in the following mount tree for a given process: +// +// |-/proc proc proc ro,nosuid,nodev,noexec,relatime,hidepid=2 +// | |-/proc/bus proc[/bus] proc ro,relatime,hidepid=2 +// | |-/proc/fs proc[/fs] proc ro,relatime,hidepid=2 +// | |-/proc/irq proc[/irq] proc ro,relatime,hidepid=2 +// | |-/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,relatime,hidepid=2 +// | |-/proc/asound tmpfs tmpfs ro,relatime,uid=165536,gid=165536 +// | |-/proc/acpi tmpfs tmpfs ro,relatime,uid=165536,gid=165536 +// | |-/proc/kcore tmpfs[/null] tmpfs rw,nosuid,size=65536k,mode=755 +// | |-/proc/keys tmpfs[/null] tmpfs rw,nosuid,size=65536k,mode=755 +// | |-/proc/timer_list tmpfs[/null] tmpfs rw,nosuid,size=65536k,mode=755 +// | |-/proc/sched_debug tmpfs[/null] tmpfs rw,nosuid,size=65536k,mode=755 +// | |-/proc/scsi tmpfs tmpfs ro,relatime,uid=165536,gid=165536 +// | |-/proc/swaps sysboxfs[/proc/swaps] fuse rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions,allow_other +// | |-/proc/sys sysboxfs[/proc/sys] fuse rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions,allow_other +// | `-/proc/uptime sysboxfs[/proc/uptime] fuse rw,nosuid,nodev,relatime,user_id=0,group_id=0,default_permissions,allow_other +// +// +// "/proc" is a sysbox-fs managed base mount. +// "/proc/*" are sysbox-fs managed submounts used to expose, hide, or emulate portions of procfs. +// +// Same applies to sysfs mounts. + +package mount + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "strconv" + "strings" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// mountInfoParser holds info about a process' mountpoints, and can be queried +// to check if a given mountpoint is a sysbox-fs managed mountpoint (i.e., base +// mount or submount). 
+type mountInfoParser struct { + cntr domain.ContainerIface + process domain.ProcessIface + launchParser bool // if set, it launches mountinfo parser + fetchOptions bool // superficial vs deep parsing mode + fetchInodes bool // if set, parser fetches mountpoints inodes + mpInfo map[string]*domain.MountInfo // mountinfo, indexed by mountpoint path + idInfo map[int]*domain.MountInfo // mountinfo, indexed by mount ID + inInfo map[domain.Inode][]*domain.MountInfo // mountinfo, indexed by mountpoint inode + fsIdInfo map[string][]*domain.MountInfo // mountinfo, indexed by file-sys id (major/minor ver) + service *MountService // backpointer to mount service +} + +// newMountInfoParser returns a new mountInfoParser object. +func newMountInfoParser( + cntr domain.ContainerIface, + process domain.ProcessIface, + launchParser bool, + fetchOptions bool, + fetchInodes bool, + mts *MountService) (*mountInfoParser, error) { + + mip := &mountInfoParser{ + cntr: cntr, + process: process, + launchParser: launchParser, + fetchOptions: fetchOptions, + fetchInodes: fetchInodes, + mpInfo: make(map[string]*domain.MountInfo), + idInfo: make(map[int]*domain.MountInfo), + inInfo: make(map[domain.Inode][]*domain.MountInfo), + fsIdInfo: make(map[string][]*domain.MountInfo), + service: mts, + } + + if launchParser { + err := mip.parse() + if err != nil { + return nil, fmt.Errorf("mountInfoParser error for pid = %d: %s", + process.Pid(), err) + } + } + + return mip, nil +} + +// Simple wrapper over parseData() method. We are keeping this one separated +// to decouple file-handling operations and allow actual parser to take []byte +// input parameter for benchmarking purposes. 
+func (mi *mountInfoParser) parse() error { + + data, err := mi.extractMountInfo() + if err != nil { + return err + } + + if err := mi.parseData(data); err != nil { + return err + } + + if mi.fetchInodes { + err = mi.extractAllInodes() + if err != nil { + return err + } + } + + return nil +} + +// parseData parses the process' mountinfo file and extracts the info for the +// base mount and it's submounts. +func (mi *mountInfoParser) parseData(data []byte) error { + + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + data := scanner.Text() + parsedMounts, err := mi.parseComponents(data) + if err != nil { + return err + } + + mi.mpInfo[parsedMounts.MountPoint] = parsedMounts + mi.idInfo[parsedMounts.MountID] = parsedMounts + + // File-system-id map utilized for remount / unmount processing. + fsIdSlice, ok := mi.fsIdInfo[parsedMounts.MajorMinorVer] + if ok { + mi.fsIdInfo[parsedMounts.MajorMinorVer] = + append(fsIdSlice, parsedMounts) + } else { + mi.fsIdInfo[parsedMounts.MajorMinorVer] = + []*domain.MountInfo{parsedMounts} + } + } + + return scanner.Err() +} + +// parseComponents parses a mountinfo file line. +func (mi *mountInfoParser) parseComponents(data string) (*domain.MountInfo, error) { + + var err error + + componentSplit := strings.Split(data, " ") + componentSplitLength := len(componentSplit) + + if componentSplitLength < 10 { + return nil, fmt.Errorf("Not enough fields in mount string: %s", data) + } + + // Hyphen separator is expected, otherwise line is malformed. 
+ if componentSplit[componentSplitLength-4] != "-" { + return nil, fmt.Errorf("No separator found in field: %s", + componentSplit[componentSplitLength-4]) + } + + mount := &domain.MountInfo{ + MajorMinorVer: componentSplit[2], + Root: componentSplit[3], + MountPoint: componentSplit[4], + FsType: componentSplit[componentSplitLength-3], + Source: componentSplit[componentSplitLength-2], + Mip: mi, + } + + mount.MountID, err = strconv.Atoi(componentSplit[0]) + if err != nil { + return nil, fmt.Errorf("Error parsing mountID field") + } + mount.ParentID, err = strconv.Atoi(componentSplit[1]) + if err != nil { + return nil, fmt.Errorf("Error parsing parentID field") + } + + // Continue parsing process if 'deep' mode has been requested. + if mi.fetchOptions { + mount.Options = + mi.parseOptionsComponent(componentSplit[5]) + mount.VfsOptions = + mi.parseOptionsComponent(componentSplit[componentSplitLength-1]) + + if componentSplit[6] != "" { + mount.OptionalFields = + mi.parseOptFieldsComponent(componentSplit[6 : componentSplitLength-4]) + if err != nil { + return nil, err + } + } + } + + return mount, nil +} + +// parseOptionsComponent parses both regular mount-options and superblock +// mount-options. +func (mi *mountInfoParser) parseOptionsComponent(s string) map[string]string { + + optionsMap := make(map[string]string) + + // Separate all mount options. + options := strings.Split(s, ",") + for _, opt := range options { + + // Discern between binomial and monomial options. + optionSplit := strings.Split(opt, "=") + + if len(optionSplit) >= 2 { + // Example: "... size=4058184k,mode=755" + key, value := optionSplit[0], optionSplit[1] + optionsMap[key] = value + + } else { + // Example: "... rw,net_cls,net_prio" + key := optionSplit[0] + optionsMap[key] = "" + } + } + + return optionsMap +} + +// parseOptFieldsComponent parses the list of optional-fields. 
+func (mi *mountInfoParser) parseOptFieldsComponent(s []string) map[string]string { + + optionalFieldsMap := make(map[string]string) + + for _, field := range s { + var value string + + // Separate all optional-fields. + optionSplit := strings.SplitN(field, ":", 2) + + // Example: "... master:2 ..."" + if len(optionSplit) == 2 { + value = optionSplit[1] + } else { + value = "" + } + + // Ensure that only supported options are handled. + switch optionSplit[0] { + case + "shared", + "master", + "propagate_from", + "unbindable": + optionalFieldsMap[optionSplit[0]] = value + } + } + + return optionalFieldsMap +} + +func (mi *mountInfoParser) extractMountInfo() ([]byte, error) { + + // In regular scenarios (i.e. mount/umount request launched by un-chroot'ed + // processes), we extract the mountInfo state by simply parsing the + // corresponding entry in procfs. + if mi.process.Root() == "/" { + data, err := + ioutil.ReadFile(fmt.Sprintf("/proc/%d/mountinfo", mi.process.Pid())) + if err != nil { + return nil, err + } + return data, nil + } + + // In chroot-jail scenarios, launch an asynchronous nsenter-event to access + // the namespaces of the process that originated the mount/umount request. + // This initial nsenter process will not be chroot'ed, and as such, will not + // be constrained by the narrowed mountInfo view of the original process. + // We will then rely on this initial nsenter process to launch a subsequent + // nsenter-event to collect all the mountInfo state available within this + // process' mount namespace. Having this complete picture will probe usual + // later on when trying to validate the legitimacy of the mount/unmount + // request. + asyncEvent := mi.service.nss.NewEvent( + mi.process.Pid(), + &domain.AllNSs, + 0, + &domain.NSenterMessage{ + Type: domain.SleepRequest, + Payload: &domain.SleepReqPayload{Ival: strconv.Itoa(30)}, + }, + nil, + true, + ) + + // Launch the async nsenter-event. 
+ defer asyncEvent.TerminateRequest() + err := mi.service.nss.SendRequestEvent(asyncEvent) + if err != nil { + return nil, err + } + + // Obtain the pid of the nsenter-event's process. + asyncEventPid := mi.service.nss.GetEventProcessID(asyncEvent) + if asyncEventPid == 0 { + return nil, fmt.Errorf("Invalid nsexec process agent") + } + + // Create a new nsenter-event. Notice that we are passing the async + // event's pid as the one for which the mountInfo data will be collected. + event := mi.service.nss.NewEvent( + asyncEventPid, + &domain.AllNSs, + 0, + &domain.NSenterMessage{Type: domain.MountInfoRequest}, + nil, + false, + ) + + // Launch nsenter-event. + err = mi.service.nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := mi.service.nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + return nil, fmt.Errorf("nsenter error received") + } + + return []byte(responseMsg.Payload.(domain.MountInfoRespPayload).Data), nil +} + +func (mi *mountInfoParser) extractAllInodes() error { + + var reqMounts []string + + for _, info := range mi.idInfo { + // Skip sysbox-fs' emulated resources to avoid the hassle of dealing + // with nested accesses to sysbox-fs' fuse-server from nsenter's + // backend processes. No inode will be required for these mountpoints + // anyways as sysbox-fs handle these file-systems differently. 
+ if _, ok := mi.service.mh.mapMounts[info.MountPoint]; ok { + continue + } + + reqMounts = append(reqMounts, info.MountPoint) + } + + respMounts, err := mi.extractInodes(reqMounts) + if err != nil { + logrus.Warnf("Unable to extract inodes, err: %s", err) + return err + } + + if len(reqMounts) != len(respMounts) { + return fmt.Errorf("Unexpected number of inodes rcvd, expected %d, rcvd %d", + len(reqMounts), len(respMounts)) + } + + for i := 0; i < len(reqMounts); i++ { + info, ok := mi.mpInfo[reqMounts[i]] + if !ok { + return fmt.Errorf("Missing mountInfo entry for mountpoint %s", + reqMounts[i]) + } + + info.MpInode = respMounts[i] + } + + return nil +} + +func (mi *mountInfoParser) extractAncestorInodes(info *domain.MountInfo) error { + + var reqMounts []string + + for { + if info == nil { + break + } + + // Skip sysbox-fs' emulated resources to avoid the hassle of dealing + // with nested accesses to sysbox-fs' fuse-server from nsenter's + // backend processes. No inode will be required for these mountpoints + // anyways as sysbox-fs handle these file-systems differently. + if _, ok := mi.service.mh.mapMounts[info.MountPoint]; !ok { + reqMounts = append(reqMounts, info.MountPoint) + } + + info = mi.GetParentMount(info) + } + + respMounts, err := mi.extractInodes(reqMounts) + if err != nil { + return nil + } + + if len(reqMounts) != len(respMounts) { + return fmt.Errorf("Unexpected number of inodes rcvd, expected %d, rcvd %d", + len(reqMounts), len(respMounts)) + } + + for i := 0; i < len(reqMounts); i++ { + info, ok := mi.mpInfo[reqMounts[i]] + if !ok { + return fmt.Errorf("Missing mountInfo entry for mountpoint %s", + reqMounts[i]) + } + + info.MpInode = respMounts[i] + } + + return nil +} + +func (mi *mountInfoParser) extractInodes(mps []string) ([]domain.Inode, error) { + + // Create nsenter-event. 
+	nss := mi.service.nss
+	event := nss.NewEvent(
+		mi.process.Pid(),
+		&domain.AllNSsButUser,
+		0,
+		&domain.NSenterMessage{
+			Type: domain.MountInodeRequest,
+			Payload: &domain.MountInodeReqPayload{
+				Mountpoints: mps,
+			},
+		},
+		nil,
+		false,
+	)
+
+	// Launch nsenter-event.
+	err := nss.SendRequestEvent(event)
+	if err != nil {
+		return nil, err
+	}
+
+	// Obtain nsenter-event response.
+	responseMsg := nss.ReceiveResponseEvent(event)
+	if responseMsg.Type == domain.ErrorResponse {
+		return nil, fmt.Errorf("nsenter error received")
+	}
+
+	return responseMsg.Payload.(domain.MountInodeRespPayload).MpInodes, nil
+}
+
+// isSysboxfsBaseMount checks if the given mountpoint is a sysbox-fs managed
+// base mount (e.g., a procfs or sysfs mountpoint).
+func (mi *mountInfoParser) isSysboxfsBaseMount(info *domain.MountInfo) bool {
+	return (info.FsType == "proc" || info.FsType == "sysfs") && info.Root == "/"
+}
+
+// isSysboxfsSubMountOf checks if the given mountpoint is a sysbox-fs managed
+// submount of the given sysbox-fs base mount (e.g., /proc/sys is a sysbox-fs
+// managed submount of /proc).
+func (mi *mountInfoParser) isSysboxfsSubMountOf(info, baseInfo *domain.MountInfo) bool {
+	if info.ParentID != baseInfo.MountID {
+		return false
+	}
+
+	// Note: submounts may contain mounts *not* managed by sysbox-fs (e.g., if a
+	// user mounts something under /proc/*). Check if the given submount is
+	// managed by sysbox-fs or not.
+ + relMountpoint := strings.TrimPrefix(info.MountPoint, baseInfo.MountPoint) + + switch baseInfo.FsType { + case "proc": + if isMountpointUnder(relMountpoint, mi.service.mh.procMounts) || + isMountpointUnder(relMountpoint, mi.cntr.ProcRoPaths()) || + isMountpointUnder(relMountpoint, mi.cntr.ProcMaskPaths()) { + return true + } + case "sysfs": + if isMountpointUnder(relMountpoint, mi.service.mh.sysMounts) { + return true + } + } + + return false +} + +// isMountpointUnder returns true if the given mountpoint is under of one the +// mountpoints in the given set. +func isMountpointUnder(mountpoint string, mpSet []string) bool { + for _, mp := range mpSet { + if strings.HasSuffix(mp, mountpoint) { + return true + } + } + return false +} + +// isSysboxfsSubMount returns true if the given mountpoint is a sysboxfs-managed +// submount (e.g., /proc/sys is a sysbox-fs managed submount of /proc). +func (mi *mountInfoParser) isSysboxfsSubMount(info *domain.MountInfo) bool { + + parentInfo := mi.GetParentMount(info) + + // parent may be nil if it's a mount outside the process mount namespace + if parentInfo == nil { + return false + } + + if !mi.isSysboxfsBaseMount(parentInfo) { + return false + } + + return mi.isSysboxfsSubMountOf(info, parentInfo) +} + +// GetInfo returns the mountinfo for a given mountpoint. +func (mi *mountInfoParser) GetInfo(mountpoint string) *domain.MountInfo { + info, found := mi.mpInfo[mountpoint] + if !found { + return nil + } + return info +} + +// GetProcessID returns the pid of the process that triggered the creation of +// a mountInfoParser object. +func (mi *mountInfoParser) GetProcessID() uint32 { + return mi.process.Pid() +} + +// GetParentMount returns the parent of a given mountpoint (or nil if none is +// found). 
+func (mi *mountInfoParser) GetParentMount(info *domain.MountInfo) *domain.MountInfo {
+	return mi.idInfo[info.ParentID]
+}
+
+func (mi *mountInfoParser) ExtractMountInfo() ([]byte, error) {
+	return mi.extractMountInfo()
+}
+
+func (mi *mountInfoParser) ExtractInode(mp string) (domain.Inode, error) {
+	info, ok := mi.mpInfo[mp]
+	if !ok {
+		return 0, fmt.Errorf("No entry found for mountpoint %s", mp)
+	}
+
+	if info.MpInode == 0 {
+		inodes, err := mi.extractInodes([]string{mp})
+		if err != nil {
+			return 0, err
+		}
+		info.MpInode = inodes[0]
+	}
+
+	return info.MpInode, nil
+}
+
+func (mi *mountInfoParser) ExtractAncestorInodes(info *domain.MountInfo) error {
+	return mi.extractAncestorInodes(info)
+}
+
+// IsRootMount returns true if the given mount is the root mount (i.e., "/")
+func (mi *mountInfoParser) IsRootMount(info *domain.MountInfo) (bool, error) {
+	rootMntInfo, found := mi.mpInfo["/"]
+	if !found {
+		return false, nil
+	}
+
+	if info.MountID == rootMntInfo.MountID {
+		return true, nil
+	}
+
+	mh := mi.service.mh
+	isClone, err := isCloneMount(mh, rootMntInfo, info)
+	if err != nil {
+		return false, err
+	}
+
+	return isClone, nil
+}
+
+// IsSysboxfsBaseMount checks if the given mountpoint is a sysbox-fs managed
+// base mount (e.g., a procfs or sysfs mountpoint).
+func (mi *mountInfoParser) IsSysboxfsBaseMount(mountpoint string) bool {
+
+	info, found := mi.mpInfo[mountpoint]
+	if !found {
+		return false
+	}
+
+	return mi.isSysboxfsBaseMount(info)
+}
+
+// IsSysboxfsBaseRoMount checks if the given mountpoint is a sysbox-fs managed
+// base mount (e.g., a procfs or sysfs mountpoint) mounted as read-only.
+func (mi *mountInfoParser) IsSysboxfsBaseRoMount(mountpoint string) bool {
+
+	info, found := mi.mpInfo[mountpoint]
+	if !found {
+		return false
+	}
+
+	if mi.isSysboxfsBaseMount(info) &&
+		mi.IsRoMount(info) {
+		return true
+	}
+
+	return false
+}
+
+// IsSysboxfsSubmount checks if the given mountpoint is a sysbox-fs managed
+// submount (e.g., /proc/sys is a sysbox-fs managed submount of /proc).
+func (mi *mountInfoParser) IsSysboxfsSubmount(mountpoint string) bool {
+
+	info, found := mi.mpInfo[mountpoint]
+	if !found {
+		return false
+	}
+
+	return mi.isSysboxfsSubMount(info)
+}
+
+// IsSysboxfsRoSubmount checks if the given mountpoint is a sysbox-fs managed
+// submount that is mounted as read-only.
+func (mi *mountInfoParser) IsSysboxfsRoSubmount(mountpoint string) bool {
+
+	info, found := mi.mpInfo[mountpoint]
+	if !found {
+		return false
+	}
+
+	if !mi.isSysboxfsSubMount(info) {
+		return false
+	}
+
+	baseInfo := mi.GetParentMount(info)
+
+	// "/some/path/proc/uptime" -> "/uptime"
+	relMp := strings.TrimPrefix(mountpoint, baseInfo.MountPoint)
+
+	if baseInfo.FsType == "proc" {
+		if isMountpointUnder(relMp, mi.cntr.ProcRoPaths()) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// IsSysboxfsMaskedSubmount checks if the given mountpoint is a sysbox-fs managed
+// submount that is masked (i.e., bind mounted from /dev/null).
+func (mi *mountInfoParser) IsSysboxfsMaskedSubmount(mountpoint string) bool { + + info, found := mi.mpInfo[mountpoint] + if !found { + return false + } + + if !mi.isSysboxfsSubMount(info) { + return false + } + + baseInfo := mi.GetParentMount(info) + + // "/some/path/proc/uptime" -> "/uptime" + relMp := strings.TrimPrefix(mountpoint, baseInfo.MountPoint) + + if baseInfo.FsType == "proc" { + if isMountpointUnder(relMp, mi.cntr.ProcMaskPaths()) { + return true + } + } + + return false +} + +// GetSysboxfsSubMounts returns a list of sysbox-fs managed submounts under the +// given base mount (e.g., if basemount is /proc, returns all /proc/* submounts +// managed by sysbox-fs). +func (mi *mountInfoParser) GetSysboxfsSubMounts(basemount string) []string { + + baseInfo := mi.mpInfo[basemount] + + submounts := []string{} + for mp, info := range mi.mpInfo { + if mi.isSysboxfsSubMountOf(info, baseInfo) { + submounts = append(submounts, mp) + } + } + + return submounts +} + +// HasNonSysboxfsSubmount checks if there is at least one non sysbox-fs managed +// submount under the given base mount (e.g., if basemount is /proc, returns +// true if there is a mount under /proc that was not setup by sysbox-fs, such as +// when a user inside the sys container creates a mount under /proc). +func (mi *mountInfoParser) HasNonSysboxfsSubmount(basemount string) bool { + + baseInfo := mi.mpInfo[basemount] + baseID := baseInfo.MountID + + for _, info := range mi.mpInfo { + if info.ParentID == baseID { + if !mi.isSysboxfsSubMountOf(info, baseInfo) { + return true + } + } + } + + return false +} + +// IsRoMount checks if the passed mountpoint is currently present and tagged as +// read-only. 
+func (mi *mountInfoParser) IsRoMount(info *domain.MountInfo) bool { + + if info == nil { + return false + } + + perMountFlags := mi.service.mh.StringToFlags(info.Options) + + return perMountFlags&unix.MS_RDONLY == unix.MS_RDONLY +} + +// IsRecursiveBindMount verifies if the passed mountinfo entry is a recursive +// bind-mount. +// +// Example: mountID-3413 is a recursive mount of mountID-3544 +// +// 3544 3503 0:129 / /usr/src/linux-headers-5.4.0-48 ro,relatime - shiftfs /usr/src/linux-headers-5.4.0-48 rw +// 3413 3544 0:129 / /usr/src/linux-headers-5.4.0-48 ro,relatime - shiftfs /usr/src/linux-headers-5.4.0-48 rw +func (mi *mountInfoParser) IsRecursiveBindMount(info *domain.MountInfo) bool { + + if info == nil { + return false + } + + // Extract all the mountpoints that match the fs-id of the 'info' object. + fsIdSlice := mi.fsIdInfo[info.MajorMinorVer] + + for _, elem := range fsIdSlice { + if elem.MountID == info.MountID { + continue + } + + if elem.MountID == info.ParentID && + elem.Source == info.Source && + elem.Root == info.Root { + return true + } + } + + return false +} + +// IsSelfMount identifies mountInfo entries that have been created by +// self bind-mounting actions (i.e. "mount -o bind /x /x"). +// +// Example 1: mountID-3074 is a 'self' mount of the original mountID-2712 entry. +// +// 2712 2192 0:153 / /usr/src/linux-headers-5.4.0-62 ro,relatime - shiftfs /usr/src/linux-headers-5.4.0-62 rw +// 3074 2712 0:153 / /usr/src/linux-headers-5.4.0-62 ro,relatime - shiftfs /usr/src/linux-headers-5.4.0-62 rw +// +// Example 2: mountID-3074 is a 'self' mount of the original mountID-2706 entry. +// +// 2706 2192 0:155 /resolv.conf /etc/resolv.conf rw,relatime - shiftfs /var/lib/docker/containers... rw +// 3074 2706 0:155 /resolv.conf /etc/resolv.conf rw,relatime - shiftfs /var/lib/docker/containers... 
rw +func (mi *mountInfoParser) IsSelfMount(info *domain.MountInfo) bool { + + if info == nil { + return false + } + + infoParent := mi.GetParentMount(info) + if infoParent == nil { + return false + } + + return info.Root == infoParent.Root && + info.MountPoint == infoParent.MountPoint && + info.Source == infoParent.Source +} + +// IsOverlapMount determines if the mountpoint associated to a mountInfo entry +// overlaps with any other mountpoint in the mountInfo tree. A 'self' mount +// is a special case (subset) of an 'overlap' one. +// +// Example 1: Same as the above one (IsSelfMount method). +// +// 2712 2192 0:153 / /usr/src/linux-headers-5.4.0-62 ro,relatime - shiftfs /usr/src/linux-headers-5.4.0-62 rw +// 3074 2712 0:153 / /usr/src/linux-headers-5.4.0-62 ro,relatime - shiftfs /usr/src/linux-headers-5.4.0-62 rw +// +// Example 2: mountID-3074 is an 'overlap' mount of the original mountID-2706 entry +// +// 2706 2192 0:155 /resolv.conf /etc/resolv.conf rw,relatime - shiftfs /var/lib/docker/containers... rw +// 3074 2706 0:6 /null /etc/resolv.conf rw,nosuid,noexec,relatime master:2 - devtmpfs udev rw,size=4048120k,nr_inodes=1012030,mode=755 +func (mi *mountInfoParser) IsOverlapMount(info *domain.MountInfo) bool { + + if info == nil { + return false + } + + infoParent := mi.GetParentMount(info) + if infoParent == nil { + return false + } + + return info.MountPoint == infoParent.MountPoint +} + +// IsBindMount verifies if the passed mountinfo entry is a 'bind-mount'. Notice +// that the 'overlap' classification is orthogonal to the 'bind-mount' one (i.e. +// an overlap may, or may not, fit the 'bind-mount' requirements). On the other +// hand, a 'self' mount is always also a 'bind-mount'. +// +// This implementation relies on the basic assumption that both the 'source' and +// the 'destination' mountpoints in a bind-mount share the same file-system-id +// (major:minor number pair), which is something being imposed by kernel during +// the bind-mount operation. 
Furthermore, this fs-id-based association is kept
+// consistent across (mount) namespaces, which allow us to identify bind-mount
+// associations between mountpoints seating in containers at different levels
+// of the nesting hierarchy.
+//
+// Now, there is one caveat: virtual file-systems such as the one associated to
+// '/dev/null', make use of a common fs-id to represent all the /dev/null
+// bind-mounts (i.e. mount -o bind /dev/null /tmp/example). Other virtual file
+// systems (e.g. 'tmpfs') allocate a unique fs-id for every tmpfs mountpoint
+// being created. In consequence, this method's logic has some limitations when
+// there's a need to identify bind-mounted resources across different namespaces.
+func (mi *mountInfoParser) IsBindMount(info *domain.MountInfo) bool {
+
+	if info == nil {
+		return false
+	}
+
+	mh := mi.service.mh
+	if mh == nil {
+		return false
+	}
+
+	// Extract all the mountpoints that match the fs-id of the 'info' object.
+	fsIdSlice := mi.fsIdInfo[info.MajorMinorVer]
+
+	// Iterate through this slice of mountpoints looking for one that qualifies
+	// as the 'source' of the 'info' mountpoint.
+	for _, elem := range fsIdSlice {
+		if elem.MountID == info.MountID {
+			continue
+		}
+
+		// To qualify as bind-mount 'source', candidates must meet this minimum
+		// criteria set.
+		if elem.Root == info.Root && elem.Source == info.Source {
+			return true
+		}
+	}
+
+	return false
+}
+
+// IsRoBindMount verifies if the passed mountinfo entry is a read-only bind-mount.
+// Refer to above method for implementation details.
+func (mi *mountInfoParser) IsRoBindMount(info *domain.MountInfo) bool { + + if info == nil { + return false + } + + mh := mi.service.mh + if mh == nil { + return false + } + + fsIdSlice := mi.fsIdInfo[info.MajorMinorVer] + + for _, elem := range fsIdSlice { + if elem.MountID == info.MountID { + continue + } + + if elem.Root == info.Root && elem.Source == info.Source { + return mh.StringToFlags(elem.Options)&unix.MS_RDONLY == unix.MS_RDONLY + } + } + + return false +} + +// IsCloneMount determines if the passed mountInfo entry is a 'clone' of any of +// the entries in the 'mi' object. For this purpose we compare the attributes of +// the given mountpoint with those of the entries in the 'mi' object. If the +// mountpoint attributes are not sufficient (they all match), we also compare +// the attributes of the parent/ancestor mounts. +// +// IsCloneMount exposes the 'readonly' parameter to allow callee to request +// 'clone' elements that are necessarily read-only mountpoints. +func (mi *mountInfoParser) IsCloneMount( + mntInfo *domain.MountInfo, + readonly bool) (bool, error) { + + mh := mi.service.mh + + // Extract the list of mountpoints matching the given mount's filesystem + // maj:min numbers. + candidates := mi.fsIdInfo[mntInfo.MajorMinorVer] + + for _, candidate := range candidates { + + // Skip check if it doesn't fit the readonly criteria. + candidateFlags := mh.StringToFlags(candidate.Options) + if readonly && !mh.IsReadOnlyMount(candidateFlags) { + continue + } + + isClone, err := isCloneMount(mh, candidate, mntInfo) + if err != nil { + return false, err + } + + if isClone { + return true, nil + } + } + + return false, nil +} + +// isCloneMount returns true if the given mounts are clones. +func isCloneMount(mh *mountHelper, mnt1, mnt2 *domain.MountInfo) (bool, error) { + + // A mountpoint with the same ID can't be a clone (by definition). 
+ if mnt1.MountID == mnt2.MountID { + return false, nil + } + + mip1 := mnt1.Mip + mip2 := mnt2.Mip + + mnt1Flags := mh.StringToFlags(mnt1.Options) + mnt2Flags := mh.StringToFlags(mnt2.Options) + + // All clones must meet a minimum set of criteria. + if mnt1.Root != mnt2.Root || + mnt1.Source != mnt2.Source || + mnt1Flags&^unix.MS_RDONLY != mnt2Flags&^unix.MS_RDONLY { + return false, nil + } + + // If not already present, fetch the inodes of the elements being + // compared, and also those within their ancestry line. This last point + // is an optimization that takes into account the relatively-low cost + // of obtaining multiple inodes vs the cost of collecting a single one + // in various (nsenter) iterations. + if mnt1.MpInode == 0 { + err := mip1.ExtractAncestorInodes(mnt1) + if err != nil { + return false, err + } + } + if mnt2.MpInode == 0 { + err := mip2.ExtractAncestorInodes(mnt2) + if err != nil { + return false, err + } + } + + // If the mountpoint inodes match and parent mount (i.e., ancestry) is the + // same, it's a clone. + if mnt1.MpInode == mnt2.MpInode { + if ancestryLineMatch(mh, mnt2, mnt1) { + return true, nil + } + } + + return false, nil +} + +// ancestryLineMatch determines if the passed mountpoints have the same +// ancestry (i.e., same parent mount, same grandparent mount, etc.) +func ancestryLineMatch(mh *mountHelper, m1, m2 *domain.MountInfo) bool { + + for { + m1 = m1.Mip.GetParentMount(m1) + m2 = m2.Mip.GetParentMount(m2) + + // A full match is encountered whenever there are no more elements to + // compare in either ancestry line. + if m1 == nil || m2 == nil { + return true + } + + if m1.MpInode == 0 { + err := m1.Mip.ExtractAncestorInodes(m1) + if err != nil { + return false + } + } + if m2.MpInode == 0 { + err := m2.Mip.ExtractAncestorInodes(m2) + if err != nil { + return false + } + } + + // Return 'false' whenever a mismatch is found in any of the elements + // of the ancestry line. 
+ if m1.MpInode != m2.MpInode || + m1.Root != m2.Root || + m1.Source != m2.Source || + mh.StringToFlags(m1.Options)&^unix.MS_RDONLY != mh.StringToFlags(m2.Options)&^unix.MS_RDONLY { + return false + } + } + + return false +} + +// LookupByMountID does a simple lookup in IdInfo map. +func (mi *mountInfoParser) LookupByMountID(id int) *domain.MountInfo { + + if info, ok := mi.idInfo[id]; ok { + return info + } + + return nil +} + +// LookupByMountpoint does a simple lookup in mpInfo map. +func (mi *mountInfoParser) LookupByMountpoint(mp string) *domain.MountInfo { + + if info, ok := mi.mpInfo[mp]; ok { + return info + } + + return nil +} + +func (mi *mountInfoParser) MountInode(mp string) uint64 { + + if info, ok := mi.mpInfo[mp]; ok { + return info.MpInode + } + + return 0 +} diff --git a/sysbox-fs/mount/infoParser_test.go b/sysbox-fs/mount/infoParser_test.go new file mode 100644 index 00000000..059468c8 --- /dev/null +++ b/sysbox-fs/mount/infoParser_test.go @@ -0,0 +1,104 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package mount + +import ( + "testing" + + "github.com/nestybox/sysbox-fs/domain" +) + +var mountInfoData = []byte(`1526 1218 0:86 / / rw,relatime - shiftfs /var/lib/docker/overlay2/85257da8a9d3ce990cc15656845ff381b195501df3aedce24748282556baec11/merged rw +1531 1526 0:95 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw +1532 1531 0:96 / /sys/fs/cgroup ro,nosuid,nodev,noexec - tmpfs tmpfs ro,mode=755,uid=231072,gid=231072 +1533 1532 0:27 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup systemd rw,xattr,name=systemd +1534 1532 0:30 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct +1535 1532 0:31 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio +1536 1532 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio +1537 1532 0:33 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,hugetlb +1538 1532 0:34 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event +1539 1532 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children +1540 1532 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices +1541 1532 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory +1542 1532 0:38 / /sys/fs/cgroup/rdma rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,rdma +1543 1532 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,pids +1544 1532 0:40 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer +1555 1531 0:97 / /sys/kernel/config rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=1024k,uid=231072,gid=231072 +1583 1531 0:98 / /sys/kernel/debug rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=1024k,uid=231072,gid=231072 +1589 1531 0:77 /sys/module/nf_conntrack/parameters/hashsize 
/sys/module/nf_conntrack/parameters/hashsize rw,nosuid,nodev,relatime - fuse sysboxfs rw,user_id=0,group_id=0,default_permissions,allow_other +1590 1526 0:85 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw +1610 1590 0:77 /proc/swaps /proc/swaps rw,nosuid,nodev,relatime - fuse sysboxfs rw,user_id=0,group_id=0,default_permissions,allow_other +1638 1590 0:77 /proc/sys /proc/sys rw,nosuid,nodev,relatime - fuse sysboxfs rw,user_id=0,group_id=0,default_permissions,allow_other +1644 1590 0:77 /proc/uptime /proc/uptime rw,nosuid,nodev,relatime - fuse sysboxfs rw,user_id=0,group_id=0,default_permissions,allow_other +1645 1526 0:104 / /dev rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,uid=231072,gid=231072 +1711 1645 0:6 /null /dev/kmsg rw,nosuid,relatime - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1712 1645 0:84 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw +1713 1645 0:105 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=231077,mode=620,ptmxmode=666 +1714 1645 0:106 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k,uid=231072,gid=231072 +1715 1526 0:94 /resolv.conf /etc/resolv.conf rw,relatime - shiftfs /var/lib/docker/containers/acbc2a6670e672cbaf39897aaaabce7f245a8c09a27458173e8a9b99c28ac6ae rw +1716 1526 0:94 /hostname /etc/hostname rw,relatime - shiftfs /var/lib/docker/containers/acbc2a6670e672cbaf39897aaaabce7f245a8c09a27458173e8a9b99c28ac6ae rw +1717 1526 0:94 /hosts /etc/hosts rw,relatime - shiftfs /var/lib/docker/containers/acbc2a6670e672cbaf39897aaaabce7f245a8c09a27458173e8a9b99c28ac6ae rw +1718 1526 0:90 / /usr/src/linux-headers-5.0.0-38-generic ro,relatime - shiftfs /usr/src/linux-headers-5.0.0-38-generic rw +1719 1526 0:88 / /usr/src/linux-headers-5.0.0-38 ro,relatime - shiftfs /usr/src/linux-headers-5.0.0-38 rw +1720 1526 0:87 / /usr/lib/modules/5.0.0-38-generic ro,relatime - shiftfs /lib/modules/5.0.0-38-generic rw +1721 1526 8:1 
/var/lib/sysbox/docker/baseVol/acbc2a6670e672cbaf39897aaaabce7f245a8c09a27458173e8a9b99c28ac6ae /var/lib/docker rw,relatime shared:815 - ext4 /dev/sda1 rw,errors=remount-ro +1722 1526 8:1 /var/lib/sysbox/kubelet/acbc2a6670e672cbaf39897aaaabce7f245a8c09a27458173e8a9b99c28ac6ae /var/lib/kubelet rw,relatime - ext4 /dev/sda1 rw,errors=remount-ro +1723 1526 8:1 /var/lib/sysbox/containerd/acbc2a6670e672cbaf39897aaaabce7f245a8c09a27458173e8a9b99c28ac6ae /var/lib/containerd rw,relatime - ext4 /dev/sda1 rw,errors=remount-ro +1724 1526 0:107 / /run rw,nosuid,nodev,relatime - tmpfs tmpfs rw,size=65536k,mode=755,uid=231072,gid=231072 +1725 1724 0:108 / /run/lock rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=4096k,uid=231072,gid=231072 +1726 1526 0:109 / /tmp rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,size=65536k,uid=231072,gid=231072 +1727 1645 0:6 /null /dev/null rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1728 1645 0:6 /random /dev/random rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1729 1645 0:6 /full /dev/full rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1730 1645 0:6 /tty /dev/tty rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1731 1645 0:6 /zero /dev/zero rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1732 1645 0:6 /urandom /dev/urandom rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1219 1645 0:105 /0 /dev/console rw,nosuid,noexec,relatime - devpts devpts rw,gid=231077,mode=620,ptmxmode=666 +1343 1590 0:85 /bus /proc/bus ro,relatime - proc proc rw +1344 1590 0:85 /fs /proc/fs ro,relatime - proc proc rw +1345 1590 0:85 /irq /proc/irq ro,relatime - proc proc rw +1360 1590 0:85 /sysrq-trigger /proc/sysrq-trigger ro,relatime - proc proc rw +1361 1590 0:110 / /proc/asound ro,relatime - tmpfs tmpfs 
ro,uid=231072,gid=231072 +1362 1590 0:111 / /proc/acpi ro,relatime - tmpfs tmpfs ro,uid=231072,gid=231072 +1393 1590 0:6 /null /proc/keys rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1399 1590 0:6 /null /proc/timer_list rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1400 1590 0:6 /null /proc/sched_debug rw,nosuid,relatime master:2 - devtmpfs udev rw,size=4058184k,nr_inodes=1014546,mode=755 +1416 1590 0:112 / /proc/scsi ro,relatime - tmpfs tmpfs ro,uid=231072,gid=231072 +1417 1531 0:113 / /sys/firmware ro,relatime - tmpfs tmpfs ro,uid=231072,gid=231072 +`) + +// Benchmark /proc/pid/mountinfo parsing logic. +func Benchmark_parseData(b *testing.B) { + + mi := &mountInfoParser{ + cntr: nil, + process: nil, //process, + launchParser: true, + fetchOptions: true, + fetchInodes: true, + mpInfo: make(map[string]*domain.MountInfo), + idInfo: make(map[int]*domain.MountInfo), + fsIdInfo: make(map[string][]*domain.MountInfo), + } + + for i := 0; i < b.N; i++ { + err := mi.parseData(mountInfoData) + if err != nil { + b.Errorf("err") + } + } +} diff --git a/sysbox-fs/mount/service.go b/sysbox-fs/mount/service.go new file mode 100644 index 00000000..756115ed --- /dev/null +++ b/sysbox-fs/mount/service.go @@ -0,0 +1,110 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package mount + +import ( + "github.com/nestybox/sysbox-fs/domain" +) + +// List of sysbox-fs mountpoints within a sysbox container's procfs and sysfs. +// +// These mountpoints need to be tracked here to ensure that they are handled +// with special care. That is: +// +// * These mountpoints must be exposed in new procfs / sysfs file-systems +// created within a sys container (e.g. chroot jails, l2 containers, etc). +// +// * During the sys container initialization process, sysbox-fs must avoid +// generating a request to obtain the inodes associated to these mountpoints +// -- see extractAllInodes(). The goal here is to prevent recursive i/o +// operations from being able to arrive to sysbox-fs which could potentially +// stall its FSM. + +var ProcfsMounts = []string{ + "/proc/uptime", + "/proc/swaps", + "/proc/sys", +} + +var SysfsMounts = []string{ + "/sys/kernel", + "/sys/devices/virtual", + "/sys/module/nf_conntrack/parameters", +} + +type MountService struct { + mh *mountHelper // mountHelper instance for mount-clients + css domain.ContainerStateServiceIface // for container-state interactions + hds domain.HandlerServiceIface // for handler package interactions + prs domain.ProcessServiceIface // for process package interactions + nss domain.NSenterServiceIface // for nsexec package interactions +} + +func NewMountService() *MountService { + return &MountService{} +} + +func (mts *MountService) Setup( + css domain.ContainerStateServiceIface, + hds domain.HandlerServiceIface, + prs domain.ProcessServiceIface, + nss domain.NSenterServiceIface) { + + mts.css = css + mts.hds = hds + mts.prs = prs + mts.nss = nss +} + +func (mts *MountService) NewMountInfoParser( + cntr domain.ContainerIface, + process domain.ProcessIface, + launchParser bool, + fetchOptions bool, + fetchInodes bool) (domain.MountInfoParserIface, error) { + + if mts.mh == nil { + mts.NewMountHelper() + } + + return newMountInfoParser( + cntr, + process, + launchParser, + fetchOptions, + 
fetchInodes, + mts, + ) +} + +func (mts *MountService) NewMountHelper() domain.MountHelperIface { + + // Handler-service should be initialized by now, but there's one case + // (nsexec's mts utilization) where a mount-service instance may be + // partially initialized for reduced mts functionality. + if mts.hds == nil { + return nil + } + + mts.mh = newMountHelper(mts) + + return mts.mh +} + +func (mts *MountService) MountHelper() domain.MountHelperIface { + return mts.mh +} diff --git a/sysbox-fs/nsenter/event.go b/sysbox-fs/nsenter/event.go new file mode 100644 index 00000000..ceba20e8 --- /dev/null +++ b/sysbox-fs/nsenter/event.go @@ -0,0 +1,1858 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package nsenter + +import ( + "bytes" + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "os/user" + "path/filepath" + "runtime" + "strconv" + "strings" + "syscall" + "time" + + _ "github.com/nestybox/sysbox-runc/libcontainer/nsenter" + "github.com/nestybox/sysbox-runc/libcontainer/utils" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/nestybox/sysbox-fs/mount" + "github.com/nestybox/sysbox-fs/process" + "github.com/nestybox/sysbox-runc/libcontainer" +) + +func init() { + if len(os.Args) > 1 && os.Args[1] == "nsenter" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + } +} + +// Pid struct. Utilized by sysbox-runc's nsexec code. +type pid struct { + Pid int `json:"pid"` + PidFirstChild int `json:"pid_first"` +} + +// NSenterEvent struct serves as a transport abstraction (envelope) to carry +// all the potential messages that can be exchanged between sysbox-fs master +// instance and secondary (forked) ones. These sysbox-fs' auxiliary instances +// are utilized to perform actions over namespaced resources, and as such, +// cannot be executed by sysbox-fs' main instance. +// +// Every bidirectional transaction is represented by an event structure +// (nsenterEvent), which holds both 'request' and 'response' messages, as well +// as the context necessary to complete any action demanding inter-namespace +// message exchanges. +type NSenterEvent struct { + + // Pid on behalf of which sysbox-fs is creating the nsenter event. + Pid uint32 `json:"pid"` + + // namespace-types to attach to. + Namespace *[]domain.NStype `json:"namespace"` + + // namepsaces to create (i.e., unshare) + CloneFlags uint32 + + // Request message to be sent. + ReqMsg *domain.NSenterMessage `json:"request"` + + // Response message to be received. 
+ ResMsg *domain.NSenterMessage `json:"response"` + + // Sysbox-fs' spawned process carrying out the nsexec instruction. + Process *os.Process `json:"process"` + + // Asynchronous flag to tag events for which no response is expected. + Async bool + + // IPC pipes among sysbox-fs parent / child processes. + parentPipe *os.File + + // Zombie Reaper (for left-over nsenter child processes) + reaper *zombieReaper + + // Backpointer to Nsenter service + service *nsenterService +} + +// +// Generic getter / setter methods. +// + +func (e *NSenterEvent) SetRequestMsg(m *domain.NSenterMessage) { + e.ReqMsg = m +} + +func (e *NSenterEvent) GetRequestMsg() *domain.NSenterMessage { + return e.ReqMsg +} + +func (e *NSenterEvent) SetResponseMsg(m *domain.NSenterMessage) { + e.ResMsg = m +} + +func (e *NSenterEvent) GetResponseMsg() *domain.NSenterMessage { + return e.ResMsg +} + +func (e *NSenterEvent) GetProcessID() uint32 { + return uint32(e.Process.Pid) +} + +/////////////////////////////////////////////////////////////////////////////// +// +// nsenterEvent methods below execute within the context of sysbox-fs' main +// instance, upon invocation of sysbox-fs' handler or seccomp-bpf logic. +// +/////////////////////////////////////////////////////////////////////////////// + +// Called by sysbox-fs handler routines to parse the response generated +// by sysbox-fs' grand-child processes. +func (e *NSenterEvent) processResponse(pipe io.Reader) error { + + // Raw message payload to aid in decoding generic messages (see below + // explanation). + var payload json.RawMessage + nsenterMsg := domain.NSenterMessage{ + Payload: &payload, + } + + // Decode received msg header to help us determine the payload type. + // Received message will be decoded in two phases. The decode instruction + // below help us determine the message-type being received. Based on the + // obtained type, we are able to decode the payload generated by the + // remote-end. 
This second step is executed as part of a subsequent + // unmarshal instruction (see further below). + if err := json.NewDecoder(pipe).Decode(&nsenterMsg); err != nil { + logrus.Warnf("Error decoding received nsenterMsg response: %s", err) + return fmt.Errorf("Error decoding received nsenterMsg response: %s", err) + } + + switch nsenterMsg.Type { + + case domain.LookupResponse: + logrus.Debug("Received nsenterEvent lookupResponse message.") + + var p domain.FileInfo + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.OpenFileResponse: + logrus.Debug("Received nsenterEvent OpenResponse message.") + + var p int + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.ReadFileResponse: + logrus.Debug("Received nsenterEvent readResponse message.") + + var p []byte + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.WriteFileResponse: + logrus.Debug("Received nsenterEvent writeResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.ReadDirResponse: + logrus.Debug("Received nsenterEvent readDirAllResponse message.") + + var p []domain.FileInfo + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.ReadLinkResponse: + logrus.Debug("Received nsenterEvent readLinkResponse message.") + + var p string + + if payload != nil { + err 
:= json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.MountSyscallResponse: + logrus.Debug("Received nsenterEvent mountSyscallResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.UmountSyscallResponse: + logrus.Debug("Received nsenterEvent umountSyscallResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.MountInfoResponse: + logrus.Debug("Received nsenterEvent mountInfoResponse message.") + + var p domain.MountInfoRespPayload + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.MountInodeResponse: + logrus.Debug("Received nsenterEvent mountInodeResponse message.") + + var p domain.MountInodeRespPayload + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.ChownSyscallResponse: + logrus.Debug("Received nsenterEvent chownSyscallResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.SetxattrSyscallResponse: + logrus.Debug("Received nsenterEvent setxattrSyscallResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.GetxattrSyscallResponse: + logrus.Debug("Received nsenterEvent getxattrSyscallResponse message.") + + var p domain.GetxattrRespPayload + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + 
Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.RemovexattrSyscallResponse: + logrus.Debug("Received nsenterEvent removexattrSyscallResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.ListxattrSyscallResponse: + logrus.Debug("Received nsenterEvent listxattrSyscallResponse message.") + + var p domain.ListxattrRespPayload + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.SleepResponse: + logrus.Debug("Received nsenterEvent sleepResponse message.") + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: "", + } + break + + case domain.UidInfoResponse: + logrus.Debug("Received nsenterEvent uidInfoResponse message.") + + var p domain.UidInfoRespPayload + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.GidInfoResponse: + logrus.Debug("Received nsenterEvent gidInfoResponse message.") + + var p domain.GidInfoRespPayload + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + case domain.ErrorResponse: + logrus.Debug("Received nsenterEvent errorResponse message.") + + var p fuse.IOerror + + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + break + + default: + return errors.New("Received unsupported nsenterEvent message.") + } + + return nil +} + +// Auxiliary function to obtain the FS path associated to any given 
namespace. +// These FS paths are utilized by sysbox-runc's nsexec logic to enter the +// desired namespaces. +// +// Expected format example: "mnt:/proc//ns/mnt" +func (e *NSenterEvent) namespacePaths() []string { + + var paths []string + + // Note: e.Namespace is assumed to be ordered such that if userns is present, it's + // always first. + + for _, nstype := range *(e.Namespace) { + path := nstype + ":" + filepath.Join("/proc", strconv.Itoa(int(e.Pid)), "/ns", nstype) + paths = append(paths, path) + } + + return paths +} + +// Sysbox-fs requests are generated through this method. Handlers seeking to +// access namespaced resources will call this method to invoke nsexec, +// which will enter the container namespaces that host these resources. +func (e *NSenterEvent) SendRequest() error { + + logrus.Debug("Executing nsenterEvent's SendRequest() method") + + // Alert the zombie reaper that nsenter is about to start. Notice that we + // skip reaper's services for async requests as, in those cases, the callee + // is expected to sigkill its generated nsenter processes. 
+ if !e.Async { + e.reaper.nsenterStarted() + } + defer func() { + if !e.Async { + e.reaper.nsenterEnded() + } + }() + + // Create a socket pair + parentPipe, childPipe, err := utils.NewSockPair("nsenterPipe") + if err != nil { + return errors.New("Error creating sysbox-fs nsenter pipe") + } + e.parentPipe = parentPipe + defer func() { + if !e.Async { + e.parentPipe.Close() + } + }() + + // Set the SO_PASSCRED on the socket (so we can pass process credentials across it) + socket := int(parentPipe.Fd()) + err = syscall.SetsockoptInt(socket, syscall.SOL_SOCKET, syscall.SO_PASSCRED, 1) + if err != nil { + return fmt.Errorf("Error setting socket options on nsenter pipe: %v", err) + } + + // Create the nsenter instruction packet + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + + // existing namespaces to join (if any) + namespaces := e.namespacePaths() + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + + // new namespaces to create (after joining existing namespaces) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: e.CloneFlags, + }) + + // Prepare exec.cmd in charge of running: "sysbox-fs nsenter". + cmd := &exec.Cmd{ + Path: "/proc/self/exe", + Args: []string{os.Args[0], "nsenter"}, + ExtraFiles: []*os.File{childPipe}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS"))}, + SysProcAttr: &syscall.SysProcAttr{Pdeathsig: syscall.SIGTERM}, + Stdin: nil, + Stdout: nil, + Stderr: nil, + } + + // Launch sysbox-fs' first child process. + err = cmd.Start() + childPipe.Close() + if err != nil { + logrus.Errorf("Error launching sysbox-fs first child process: %s", err) + return errors.New("Error launching sysbox-fs first child process") + } + + // Send the config to child process. 
+ if _, err := io.Copy(e.parentPipe, bytes.NewReader(r.Serialize())); err != nil { + logrus.Warnf("Error copying payload to pipe: %s", err) + if !e.Async { + e.reaper.nsenterReapReq() + } + return errors.New("Error copying payload to pipe") + } + + // Wait for sysbox-fs' first child process to finish. + status, err := cmd.Process.Wait() + if err != nil { + logrus.Warnf("Error waiting for sysbox-fs first child process: %d, status: %s, error: %s", + cmd.Process.Pid, status.String(), err) + if !e.Async { + e.reaper.nsenterReapReq() + } + return err + } + if !status.Success() { + logrus.Warnf("Sysbox-fs first child process error status: %s, pid: %d", + status.String(), cmd.Process.Pid) + if !e.Async { + e.reaper.nsenterReapReq() + } + return errors.New("Error waiting for sysbox-fs first child process") + } + + // Receive sysbox-fs' first-child pid. + var pid pid + decoder := json.NewDecoder(e.parentPipe) + if err := decoder.Decode(&pid); err != nil { + logrus.Warnf("Error receiving first-child pid: %s", err) + return errors.New("Error receiving first-child pid") + } + + firstChildProcess, err := os.FindProcess(pid.PidFirstChild) + if err != nil { + logrus.Warnf("Error finding first-child pid: %s", err) + return err + } + + // Wait for sysbox-fs' second child process to finish. Ignore the error in + // case the child has already been reaped for any reason. + _, _ = firstChildProcess.Wait() + + // Sysbox-fs' third child (grand-child) process remains and will enter the + // go runtime. + process, err := os.FindProcess(pid.Pid) + if err != nil { + logrus.Warnf("Error finding grand-child pid %d: %s", pid.Pid, err) + return err + } + e.Process = process + + // + // Transfer the nsenterEvent details to grand-child for processing. + // + + // Send the pid using SCM rights, so it shows up properly inside the + // nsexec process. 
+ // + // TODO: in the future we should also send the process uid and gid + // credentials, so that the event handler can use this info to set ownership + // of files or mountpoints it creates on behalf of the process. This would + // void the need to send that info in the payload as done in the + // chown handler (i.e., it would void the need for processChownNSenter()). + + reqCred := &syscall.Ucred{ + Pid: int32(e.Pid), + } + + credMsg := syscall.UnixCredentials(reqCred) + if err := syscall.Sendmsg(socket, nil, credMsg, nil, 0); err != nil { + logrus.Warnf("Error while sending process credentials to nsenter (%v).", err) + if !e.Async { + e.reaper.nsenterReapReq() + } + return err + } + + // Transfer the rest of the payload + data, err := json.Marshal(*(e.ReqMsg)) + if err != nil { + logrus.Warnf("Error while encoding nsenter payload (%v).", err) + if !e.Async { + e.reaper.nsenterReapReq() + } + return err + } + _, err = e.parentPipe.Write(data) + if err != nil { + logrus.Warnf("Error while writing nsenter payload into pipeline (%v)", err) + if !e.Async { + e.reaper.nsenterReapReq() + } + return err + } + + // Return if dealing with an asynchronous request. + if e.Async { + return nil + } + + // Wait for sysbox-fs' grand-child response and process it accordingly. + ierr := e.processResponse(e.parentPipe) + + // Destroy the socket pair. + if err := unix.Shutdown(int(parentPipe.Fd()), unix.SHUT_WR); err != nil { + logrus.Warnf("Error shutting down sysbox-fs nsenter pipe: %s", err) + } + + if ierr != nil { + e.reaper.nsenterReapReq() + return ierr + } + + e.Process.Wait() + + return nil +} + +func (e *NSenterEvent) ReceiveResponse() *domain.NSenterMessage { + + return e.ResMsg +} + +// TerminateRequest serves to unwind the nsenter-event FSM after the generation +// of an asynchronous event. This method is not required for regular nsenter +// events, as in those cases the SendRequest() method itself takes care of +// cleaning all the utilized resources. 
+func (e *NSenterEvent) TerminateRequest() error { + + logrus.Debug("Executing nsenterEvent's TerminateRequest() method") + + if e.Process == nil { + return nil + } + + // Destroy the socket pair. + if err := unix.Shutdown(int(e.parentPipe.Fd()), unix.SHUT_WR); err != nil { + logrus.Warnf("Error shutting down sysbox-fs nsenter pipe: %s", err) + } + + // Kill ongoing request. + if err := e.Process.Kill(); err != nil { + return err + } + + e.Process.Wait() + e.Process = nil + + return nil +} + +/////////////////////////////////////////////////////////////////////////////// +// +// nsenterEvent methods below execute within the context of container +// namespaces. In other words, they are invoked as part of "sysbox-fs nsenter" +// execution. +// +/////////////////////////////////////////////////////////////////////////////// + +func (e *NSenterEvent) processLookupRequest() error { + + payload := e.ReqMsg.Payload.(domain.LookupPayload) + + pmi, err := processPayloadMounts(payload.MountSysfs, payload.MountProcfs) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer pmi.cleanup(pmi.sysfsMountpoint, pmi.procfsMountpoint) + + payload.Entry = replaceProcfsAndSysfsPaths(payload.Entry, pmi) + + // Verify if the resource being looked up is reachable and obtain FileInfo + // details. + info, err := os.Lstat(payload.Entry) + if err != nil { + // Send an error-message response. + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + + return nil + } + + // Allocate new FileInfo struct to return to sysbpx-fs' main instance. + fileInfo := domain.FileInfo{ + Fname: info.Name(), + Fsize: info.Size(), + Fmode: info.Mode(), + FmodTime: info.ModTime(), + FisDir: info.IsDir(), + Fsys: info.Sys().(*syscall.Stat_t), + } + + // Create a response message. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.LookupResponse, + Payload: fileInfo, + } + + return nil +} + +// Once a file has been opened with open(), no permission checking is performed +// by subsequent system calls that work with the returned file descriptor (such +// as read(), write(), fstat(), fcntl(), and mmap()). +func (e *NSenterEvent) processOpenFileRequest() error { + + payload := e.ReqMsg.Payload.(domain.OpenFilePayload) + + pmi, err := processPayloadMounts(payload.MountSysfs, payload.MountProcfs) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer pmi.cleanup(pmi.sysfsMountpoint, pmi.procfsMountpoint) + + payload.File = replaceProcfsAndSysfsPaths(payload.File, pmi) + + // Extract openflags from the incoming payload. + openFlags, err := strconv.Atoi(payload.Flags) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + // Extract openMode from the incoming payload. + mode, err := strconv.Atoi(payload.Mode) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + // Open the file in question. Notice that we are hardcoding the 'mode' + // argument (third one) as this one is not relevant in a procfs; that + // is, user cannot create files -- openflags 'O_CREAT' and 'O_TMPFILE' + // are not expected (refer to "man open(2)" for details). + fd, err := os.OpenFile(payload.File, openFlags, os.FileMode(mode)) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + fd.Close() + + // Create a response message. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.OpenFileResponse, + Payload: nil, + } + + return nil +} + +func (e *NSenterEvent) processFileReadRequest() error { + var ( + fd *os.File + err error + sz int + ) + + payload := e.ReqMsg.Payload.(domain.ReadFilePayload) + + pmi, err := processPayloadMounts(payload.MountSysfs, payload.MountProcfs) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer pmi.cleanup(pmi.sysfsMountpoint, pmi.procfsMountpoint) + + payload.File = replaceProcfsAndSysfsPaths(payload.File, pmi) + + fd, err = os.Open(payload.File) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer fd.Close() + + data := make([]byte, payload.Len) + + sz, err = fd.ReadAt(data, payload.Offset) + if err != nil && err != io.EOF { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ReadFileResponse, + Payload: data[:sz], + } + + return nil +} + +func (e *NSenterEvent) processFileWriteRequest() error { + var ( + fd *os.File + err error + ) + + payload := e.ReqMsg.Payload.(domain.WriteFilePayload) + + pmi, err := processPayloadMounts(payload.MountSysfs, payload.MountProcfs) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer pmi.cleanup(pmi.sysfsMountpoint, pmi.procfsMountpoint) + + payload.File = replaceProcfsAndSysfsPaths(payload.File, pmi) + + fd, err = os.OpenFile(payload.File, os.O_WRONLY, 0) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer fd.Close() + + _, err = fd.WriteAt(payload.Data, payload.Offset) + if err != nil { + 
e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.WriteFileResponse, + Payload: nil, + } + + return nil +} + +func (e *NSenterEvent) processDirReadRequest() error { + + payload := e.ReqMsg.Payload.(domain.ReadDirPayload) + + pmi, err := processPayloadMounts(payload.MountSysfs, payload.MountProcfs) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + defer pmi.cleanup(pmi.sysfsMountpoint, pmi.procfsMountpoint) + + payload.Dir = replaceProcfsAndSysfsPaths(payload.Dir, pmi) + + // Perform readDir operation and return error msg should this one fail. + dirContent, err := ioutil.ReadDir(payload.Dir) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + // Create a FileInfo slice to return to sysbox-fs' main instance. + var dirContentList []domain.FileInfo + + for _, entry := range dirContent { + elem := domain.FileInfo{ + Fname: entry.Name(), + Fsize: entry.Size(), + Fmode: entry.Mode(), + FmodTime: entry.ModTime(), + FisDir: entry.IsDir(), + Fsys: entry.Sys().(*syscall.Stat_t), + } + dirContentList = append(dirContentList, elem) + } + + // Create a response message. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.ReadDirResponse, + Payload: dirContentList, + } + + return nil +} + +func (e *NSenterEvent) processReadLinkRequest() error { + + payload := e.ReqMsg.Payload.(domain.ReadLinkPayload) + + pmi, err := processPayloadMounts(payload.MountSysfs, payload.MountProcfs) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: err, + } + return nil + } + defer pmi.cleanup(pmi.sysfsMountpoint, pmi.procfsMountpoint) + + payload.Link = replaceProcfsAndSysfsPaths(payload.Link, pmi) + + // Perform readLink operation and return error msg should this one fail. + link, err := os.Readlink(payload.Link) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: err, + } + return nil + } + + // Create a response message. + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ReadLinkResponse, + Payload: link, + } + + return nil +} + +func (e *NSenterEvent) processMountSyscallRequest() error { + + var ( + i int + err error + ) + + payload := e.ReqMsg.Payload.([]domain.MountSyscallPayload) + + // Extract payload-header from the first element + header := payload[0].Header + + // For overlayfs mounts we adjust 'nsexec' process' personality (i.e. + // uid/gid and capabilities) to match the one of the original process + // performing the syscall. Our goal is mainly to avoid permission issues + // while accessing kernel's created overlayfs components. + if payload[0].FsType == "overlay" { + + // Create a dummy 'process' struct to represent the 'sysbox-fs nsenter' process + // executing this logic. + this := e.service.prs.ProcessCreate(0, 0, 0) + + // Adjust 'nsenter' process personality to match the container's original + // process. + if err := this.AdjustPersonality( + header.Uid, + header.Gid, + header.Root, + header.Cwd, + header.Capabilities); err != nil { + + // Send an error-message response. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + + return nil + } + } + + // Perform mount instructions. + for i = 0; i < len(payload); i++ { + err = unix.Mount( + payload[i].Source, + payload[i].Target, + payload[i].FsType, + uintptr(payload[i].Flags), + payload[i].Data, + ) + if err != nil { + break + } + } + + if err != nil { + // Unmount previously executed mount instructions (unless it's a remount). + // + // TODO: ideally we would revert remounts too, but to do this we need information + // that we don't have at this stage. + for j := i - 1; j >= 0; j-- { + if payload[j].Flags&unix.MS_REMOUNT != unix.MS_REMOUNT { + _ = unix.Unmount(payload[j].Target, 0) + } + } + + // Create error response msg. + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + + return nil + } + + // Create success response message. + e.ResMsg = &domain.NSenterMessage{ + Type: domain.MountSyscallResponse, + Payload: "", + } + + return nil +} + +func (e *NSenterEvent) processUmountSyscallRequest() error { + + var ( + i int + err error + ) + + payload := e.ReqMsg.Payload.([]domain.UmountSyscallPayload) + + // Perform umount instructions. + for i = 0; i < len(payload); i++ { + err = unix.Unmount( + payload[i].Target, + int(payload[i].Flags), + ) + if err != nil { + // Create error response msg. + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + + break + } + } + + // TODO: If an error is found, notice that we will not revert the changes we could have + // made thus far. In order to do that (i.e., mount again), we need information that we + // don't have at this stage (mount-source, mount-flags, etc). + if err != nil { + return nil + } + + // Create success response message. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.UmountSyscallResponse, + Payload: "", + } + + return nil +} + +func (e *NSenterEvent) processChownSyscallRequest() error { + + payload := e.ReqMsg.Payload.([]domain.ChownSyscallPayload) + + for _, p := range payload { + var err error + if err = unix.Chown(p.Target, p.TargetUid, p.TargetGid); err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ChownSyscallResponse, + Payload: "", + } + + return nil +} + +func (e *NSenterEvent) processSetxattrSyscallRequest() error { + var err error + + p := e.ReqMsg.Payload.(domain.SetxattrSyscallPayload) + + if p.Syscall == "lsetxattr" { + err = unix.Lsetxattr(p.Path, p.Name, p.Val, p.Flags) + } else { + err = unix.Setxattr(p.Path, p.Name, p.Val, p.Flags) + } + + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.SetxattrSyscallResponse, + Payload: "", + } + + return nil +} + +func (e *NSenterEvent) processGetxattrSyscallRequest() error { + var ( + err error + size int + ) + + p := e.ReqMsg.Payload.(domain.GetxattrSyscallPayload) + val := make([]byte, p.Size) + + // Create a dummy 'process' struct to represent the 'sysbox-fs nsenter' process + // executing this logic. + this := e.service.prs.ProcessCreate(0, 0, 0) + + // Adjust 'nsenter' process personality to match the container's original + // process. + if err := this.AdjustPersonality( + p.Header.Uid, + p.Header.Gid, + p.Header.Root, + p.Header.Cwd, + p.Header.Capabilities); err != nil { + + // Send an error-message response. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + + return nil + } + + if p.Syscall == "lgetxattr" { + size, err = unix.Lgetxattr(p.Path, p.Name, val) + } else { + size, err = unix.Getxattr(p.Path, p.Name, val) + } + + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.GetxattrSyscallResponse, + Payload: domain.GetxattrRespPayload{ + Val: val, + Size: size, + }, + } + + return nil +} + +func (e *NSenterEvent) processRemovexattrSyscallRequest() error { + var err error + + p := e.ReqMsg.Payload.(domain.RemovexattrSyscallPayload) + + if p.Syscall == "lremovexattr" { + err = unix.Lremovexattr(p.Path, p.Name) + } else { + err = unix.Removexattr(p.Path, p.Name) + } + + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.RemovexattrSyscallResponse, + Payload: "", + } + + return nil +} + +func (e *NSenterEvent) processListxattrSyscallRequest() error { + var ( + err error + size int + ) + + p := e.ReqMsg.Payload.(domain.ListxattrSyscallPayload) + val := make([]byte, p.Size) + + // Create a dummy 'process' struct to represent the 'sysbox-fs nsenter' process + // executing this logic. + this := e.service.prs.ProcessCreate(0, 0, 0) + + // Adjust 'nsenter' process personality to match the container's original + // process. + if err := this.AdjustPersonality( + p.Header.Uid, + p.Header.Gid, + p.Header.Root, + p.Header.Cwd, + p.Header.Capabilities); err != nil { + + // Send an error-message response. 
+ e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + + return nil + } + + if p.Syscall == "llistxattr" { + size, err = unix.Llistxattr(p.Path, val) + } else { + size, err = unix.Listxattr(p.Path, val) + } + + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ListxattrSyscallResponse, + Payload: domain.ListxattrRespPayload{ + Val: val, + Size: size, + }, + } + + return nil +} + +func (e *NSenterEvent) getProcCreds(pipe *os.File) error { + + socket := int(pipe.Fd()) + + err := syscall.SetsockoptInt(socket, syscall.SOL_SOCKET, syscall.SO_PASSCRED, 1) + if err != nil { + return fmt.Errorf("Error setting socket options for credential passing: %v", err) + } + + var cred syscall.Ucred + ucred := syscall.UnixCredentials(&cred) + buf := make([]byte, syscall.CmsgSpace(len(ucred))) + + _, rbytes, _, _, err := syscall.Recvmsg(socket, nil, buf, 0) + if err != nil { + return errors.New("Error decoding received process credentials.") + } + buf = buf[:rbytes] + + msgs, err := syscall.ParseSocketControlMessage(buf) + if err != nil || len(msgs) != 1 { + return errors.New("Error parsing socket control msg.") + } + + procCred, err := syscall.ParseUnixCredentials(&msgs[0]) + if err != nil { + return errors.New("Error parsing unix credentials.") + } + + e.Pid = uint32(procCred.Pid) + + return nil +} + +func (e *NSenterEvent) processMountInfoRequest() error { + + pid := os.Getpid() + + // Create a 'process' struct to represent the 'sysbox-fs nsenter' process + // executing this logic. + process := e.service.prs.ProcessCreate(uint32(pid), 0, 0) + + // Create shallow mountInfo DB. 
+ mip, err := e.service.mts.NewMountInfoParser( + nil, + process, + false, + false, + false) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + // Create a MountInfo slice to return to sysbox-fs' main instance. + mountInfoData, err := mip.ExtractMountInfo() + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + // Create a response message. + e.ResMsg = &domain.NSenterMessage{ + Type: domain.MountInfoResponse, + Payload: &domain.MountInfoRespPayload{mountInfoData}, + } + + return nil +} + +func (e *NSenterEvent) processMountInodeRequest() error { + + payload := e.ReqMsg.Payload.(domain.MountInodeReqPayload) + + var mpInodeList []domain.Inode + + // Iterate through the received mountpoints and extract the corresponding + // inode. + for _, mp := range payload.Mountpoints { + mpInode := domain.FileInode(mp) + mpInodeList = append(mpInodeList, mpInode) + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.MountInodeResponse, + Payload: &domain.MountInodeRespPayload{mpInodeList}, + } + + return nil +} + +func (e *NSenterEvent) processSleepRequest() error { + + payload := e.ReqMsg.Payload.(domain.SleepReqPayload) + + ival, err := strconv.ParseInt(payload.Ival, 10, 64) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + return nil + } + + time.Sleep(time.Duration(ival) * time.Second) + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.SleepResponse, + Payload: "", + } + + return nil +} + +func (e *NSenterEvent) processUidInfoRequest() error { + + payload := e.ReqMsg.Payload.(domain.UidInfoReqPayload) + + uObj, err := user.Lookup(payload.User) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: fmt.Errorf("Unknown user name")}, 
+ } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.SleepResponse, + Payload: &domain.UidInfoRespPayload{Uid: uObj.Uid}, + } + + return nil +} + +func (e *NSenterEvent) processGidInfoRequest() error { + + payload := e.ReqMsg.Payload.(domain.GidInfoReqPayload) + + gObj, err := user.LookupGroup(payload.Group) + if err != nil { + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: fmt.Errorf("Unknown group name")}, + } + return nil + } + + e.ResMsg = &domain.NSenterMessage{ + Type: domain.SleepResponse, + Payload: &domain.GidInfoRespPayload{Gid: gObj.Gid}, + } + + return nil +} + +// Method in charge of processing all requests generated by sysbox-fs' master +// instance. +func (e *NSenterEvent) processRequest(pipe *os.File) error { + + // Get the credentials of the process on whose behalf we are operating + if err := e.getProcCreds(pipe); err != nil { + return err + } + + // Raw message payload to aid in decoding generic messages (see below + // explanation). + var payload json.RawMessage + nsenterMsg := domain.NSenterMessage{ + Payload: &payload, + } + + // Decode received msg header to help us determine the payload type. + // Received message will be decoded in two phases. The decode instruction + // below help us determine the message-type being received. Based on the + // obtained type, we are able to decode the payload generated by the + // remote-end. This second step is executed as part of a subsequent + // unmarshal instruction (see further below). 
+ if err := json.NewDecoder(pipe).Decode(&nsenterMsg); err != nil { + logrus.Warnf("Error decoding received nsenterMsg request (%v).", err) + return errors.New("Error decoding received event request.") + } + + switch nsenterMsg.Type { + + case domain.LookupRequest: + var p domain.LookupPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processLookupRequest() + + case domain.OpenFileRequest: + var p domain.OpenFilePayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processOpenFileRequest() + + case domain.ReadFileRequest: + var p domain.ReadFilePayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processFileReadRequest() + + case domain.WriteFileRequest: + var p domain.WriteFilePayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processFileWriteRequest() + + case domain.ReadDirRequest: + var p domain.ReadDirPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processDirReadRequest() + + case domain.ReadLinkRequest: + var p domain.ReadLinkPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + 
return e.processReadLinkRequest() + + case domain.SetxattrSyscallRequest: + var p domain.SetxattrSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processSetxattrSyscallRequest() + + case domain.GetxattrSyscallRequest: + var p domain.GetxattrSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processGetxattrSyscallRequest() + + case domain.RemovexattrSyscallRequest: + var p domain.RemovexattrSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processRemovexattrSyscallRequest() + + case domain.ListxattrSyscallRequest: + var p domain.ListxattrSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + return e.processListxattrSyscallRequest() + + case domain.MountSyscallRequest: + var p []domain.MountSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + + return e.processMountSyscallRequest() + + case domain.UmountSyscallRequest: + var p []domain.UmountSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + + return e.processUmountSyscallRequest() + + case domain.MountInfoRequest: 
+ e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + } + + return e.processMountInfoRequest() + + case domain.MountInodeRequest: + var p domain.MountInodeReqPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + + return e.processMountInodeRequest() + + case domain.ChownSyscallRequest: + var p []domain.ChownSyscallPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + + return e.processChownSyscallRequest() + + case domain.SleepRequest: + var p domain.SleepReqPayload + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + Payload: p, + } + + return e.processSleepRequest() + + case domain.UidInfoRequest: + var p string + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + } + + return e.processUidInfoRequest() + + case domain.GidInfoRequest: + var p string + if payload != nil { + err := json.Unmarshal(payload, &p) + if err != nil { + logrus.Error(err) + return err + } + } + + e.ReqMsg = &domain.NSenterMessage{ + Type: nsenterMsg.Type, + } + + return e.processGidInfoRequest() + + default: + e.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: "Unsupported request", + } + } + + return nil +} + +// Sysbox-fs' post-nsexec initialization function. To be executed within the +// context of one (or more) container namespaces. +func Init() (err error) { + + var ( + pipefd int + envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE") + ) + + // Get the INITPIPE. 
+ pipefd, err = strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("Unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", + envInitPipe, err) + } + + var pipe = os.NewFile(uintptr(pipefd), "pipe") + defer pipe.Close() + + // Clear the current process's environment to clean any libcontainer + // specific env vars. + os.Clearenv() + + // Setup nsenterService and its dependencies. + var nsenterSvc = NewNSenterService() + var processSvc = process.NewProcessService() + var mountSvc = mount.NewMountService() + nsenterSvc.Setup(processSvc, mountSvc) + mountSvc.Setup(nil, nil, processSvc, nsenterSvc) + + var event = NSenterEvent{service: nsenterSvc.(*nsenterService)} + + // Process incoming request. + err = event.processRequest(pipe) + if err != nil { + event.ResMsg = &domain.NSenterMessage{ + Type: domain.ErrorResponse, + Payload: &fuse.IOerror{RcvError: err}, + } + } + + // Encode / push response back to sysbox-main. + data, err := json.Marshal(*(event.ResMsg)) + if err != nil { + return err + } + _, err = pipe.Write(data) + if err != nil { + return err + } + + return nil +} diff --git a/sysbox-fs/nsenter/eventService.go b/sysbox-fs/nsenter/eventService.go new file mode 100644 index 00000000..24d6dc71 --- /dev/null +++ b/sysbox-fs/nsenter/eventService.go @@ -0,0 +1,107 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package nsenter

import (
	"github.com/nestybox/sysbox-fs/domain"
)

// nsenterService implements domain.NSenterServiceIface. It constructs and
// drives NSenterEvent objects, which execute requests within the namespaces
// of a container process. All of its event methods are thin delegations to
// the event object itself.
type nsenterService struct {
	prs    domain.ProcessServiceIface // for process class interactions (capabilities)
	mts    domain.MountServiceIface   // for mount class interactions (mountInfoParser)
	reaper *zombieReaper              // reaps zombie nsenter child processes (see reaper.go)
}

// NewNSenterService returns a new nsenterService with its zombie-reaper
// goroutine already running. Setup() must be called before creating events.
func NewNSenterService() domain.NSenterServiceIface {
	return &nsenterService{
		reaper: newZombieReaper(),
	}
}

// Setup injects the process- and mount-service dependencies.
func (s *nsenterService) Setup(
	prs domain.ProcessServiceIface,
	mts domain.MountServiceIface) {

	s.prs = prs
	s.mts = mts
}

// NewEvent creates an NSenterEvent that will run request req within the given
// namespaces (ns) of process pid; the response is placed in res. When async
// is true the caller does not wait for the remote end to complete.
func (s *nsenterService) NewEvent(
	pid uint32,
	ns *[]domain.NStype,
	cloneFlags uint32,
	req *domain.NSenterMessage,
	res *domain.NSenterMessage,
	async bool) domain.NSenterEventIface {

	event := &NSenterEvent{
		Pid:        pid,
		Namespace:  ns,
		CloneFlags: cloneFlags,
		ReqMsg:     req,
		ResMsg:     res,
		Async:      async,
		reaper:     s.reaper,
	}

	return event
}

// SendRequestEvent launches the event's request towards the target namespaces.
func (s *nsenterService) SendRequestEvent(
	e domain.NSenterEventIface) error {
	return e.SendRequest()
}

// TerminateRequestEvent aborts an in-flight event.
func (s *nsenterService) TerminateRequestEvent(e domain.NSenterEventIface) error {
	return e.TerminateRequest()
}

// ReceiveResponseEvent collects the response produced by the event.
func (s *nsenterService) ReceiveResponseEvent(
	e domain.NSenterEventIface) *domain.NSenterMessage {

	return e.ReceiveResponse()
}

// SetRequestEventPayload sets the event's request message.
func (s *nsenterService) SetRequestEventPayload(
	e domain.NSenterEventIface,
	m *domain.NSenterMessage) {

	e.SetRequestMsg(m)
}

// GetRequestEventPayload returns the event's request message.
func (s *nsenterService) GetRequestEventPayload(
	e domain.NSenterEventIface) *domain.NSenterMessage {

	return e.GetRequestMsg()
}

// SetResponseEventPayload sets the event's response message.
func (s *nsenterService) SetResponseEventPayload(
	e domain.NSenterEventIface,
	m *domain.NSenterMessage) {

	e.SetResponseMsg(m)
}

// GetResponseEventPayload returns the event's response message.
func (s *nsenterService) GetResponseEventPayload(
	e domain.NSenterEventIface) *domain.NSenterMessage {

	return e.GetResponseMsg()
}

// GetEventProcessID returns the pid associated with the event.
func (s *nsenterService) GetEventProcessID(e domain.NSenterEventIface) uint32 {
	return e.GetProcessID()
}
package nsenter

import (
	"sync"
	"syscall"
	"time"

	"github.com/sirupsen/logrus"
)

// zombieReaper reaps zombie child processes left behind by nsenter agents.
// nsenter agents hold the read-lock while running; the reaper goroutine takes
// the write-lock, so reaping never overlaps with a live agent.
type zombieReaper struct {
	mu     sync.RWMutex
	signal chan bool // wake-up channel for the reaper goroutine (unbuffered)
}

// newZombieReaper creates a reaper and starts its background goroutine.
func newZombieReaper() *zombieReaper {

	zr := &zombieReaper{
		signal: make(chan bool),
	}

	go reaper(zr.signal, &zr.mu)
	return zr
}

// nsenterStarted marks the start of an nsenter agent; reaping is blocked
// while any agent holds this read-lock.
func (zr *zombieReaper) nsenterStarted() {
	zr.mu.RLock()
}

// nsenterEnded marks the end of an nsenter agent.
func (zr *zombieReaper) nsenterEnded() {
	zr.mu.RUnlock()
}

// nsenterReapReq asks the reaper goroutine to run. The send is non-blocking:
// if a wake-up is already pending, this request is dropped.
func (zr *zombieReaper) nsenterReapReq() {
	select {
	case zr.signal <- true:
		logrus.Debugf("nsenter child reaping requested")
	default:
		// no action required (someone else has signaled already)
	}
}

// Go-routine that performs reaping
func reaper(signal chan bool, mu *sync.RWMutex) {
	var wstatus syscall.WaitStatus

	for {
		<-signal

		// Without this delay, sysbox-fs sometimes hangs the FUSE request that generates an
		// nsenter event that requires reaping. It's not clear why, but the tell-tale sign
		// of the hang is that the reaper is signaled but finds nothing to reap. This delay
		// mitigates this condition and the reaper finds something to reap.
		//
		// The delay chosen is one that allows nsenter agents to complete their tasks before
		// reaping occurs. Since the reaper runs in its own goroutine, this delay only
		// affects it (there is no undesired side-effect on nsenters).

		time.Sleep(time.Second)

		// Reap until no zombie children remain; the write-lock excludes any
		// concurrently running nsenter agent during each wait4() call.
		for {
			mu.Lock()

			// WNOHANG: if there is no child to reap, don't block
			wpid, err := syscall.Wait4(-1, &wstatus, syscall.WNOHANG, nil)
			if err != nil || wpid == 0 {
				logrus.Infof("reaper: nothing to reap")
				mu.Unlock()
				break
			}

			logrus.Infof("reaper: reaped pid %d", wpid)
			mu.Unlock()
		}
	}
}

// ---- file boundary (per the original patch): sysbox-fs/nsenter/utils.go ----

package nsenter

import (
	"errors"
	"os"
	"strings"

	"golang.org/x/sys/unix"
)

// payloadMountsInfo records the procfs/sysfs mountpoints set up for an
// nsenter payload, plus the cleanup callback that undoes them.
type payloadMountsInfo struct {
	sysfsMountpoint  string
	procfsMountpoint string
	cleanup          func(string, string)
}

// This function runs when the sysbox-fs nsenter helper process requests to
// mount procfs or sysfs.
//
// This function assumes the nsenter agent is running in an unshared mount
// namespace, such that the procfs and sysfs mounts it creates are NOT visible
// inside the container (otherwise container processes will see the nsenter
// process mounts, which is not what we want).
+// +// Note also that the nsenter process mounts the "real" procfs and sysfs, not +// the sysbox-fs emulated one we use for containers, but does so from within the +// container's namespaces (except it's own mount-ns as described above). This +// way the nsenter agent can access host info for the container that may not be +// available to the container itself. This info can then be used to emulate +// procfs and sysfs resources inside the container. +// +// [@ctalledo]: this function needs cleanup, it can accept either mountSyfs or +// mountProcfs, but not both (because for the case where the container is read-only, +// it mounts on the same dir (/dev), so there's a collision). +func processPayloadMounts(mountSysfs, mountProcfs bool) (*payloadMountsInfo, error) { + var ( + flags uintptr + sysfsMountpoint string + procfsMountpoint string + err error + ) + + flags = unix.MS_NOSUID | unix.MS_NOEXEC | unix.MS_NODEV | unix.MS_RELATIME + + // Note: ideally we want to mount procfs on /proc and sysfs on /sys, inside + // the container; and since the mounts are done by the nsenter process in a + // dedicated mount-ns, the container processes won't see them. While it's + // possible for the nsenter process to mount procfs on top of the container's + // /proc, turns out it's not possible to mount sysfs on top of the + // container's /sys (the kernel returns a "resource busy" error). Thus, we + // mount sysfs on a temporary ephemeral dir inside the container, at + // /.sysbox-sysfs-. Note that while that directory is visible + // inside the container (for a very brief period of time while the nsenter + // agent does its thing), the container can't see the sysfs mount on that dir + // (only the nsenter process can see the mount because it operates in a + // dedicated mount-ns). The container processes will never see the mount, and + // therefore the /.sysbox-sysfs- dir will always look empty to + // container processes. 
+ if mountSysfs { + + sysfsMountpoint, err = os.MkdirTemp("/", ".sysbox-sysfs-") + if errors.Is(err, unix.EROFS) { + // @ctalledo: hack: if the container has a read-only filesystem, then + // we can't create the temporary sysfs mount dir on it. In this case we + // use a directory that we know is present in the container (e.g., + // "/dev") to mount sysfs. Since the mount occurs in the nsenter + // agent's own mount-ns, it's not visible to anyone else (i.e, the + // container can still access the "/dev" dir without noticing anything + // differently on it). + sysfsMountpoint = "/dev" + + } else if err != nil { + return nil, err + } + if err = unix.Mount("sysfs", sysfsMountpoint, "sysfs", flags, ""); err != nil { + os.RemoveAll(sysfsMountpoint) + return nil, err + } + } + cleanupSysfs := func(mountpoint string) { + if mountpoint != "" { + unix.Unmount(mountpoint, unix.MNT_DETACH) + if strings.HasPrefix(mountpoint, ".sysbox-sysfs-") { + os.RemoveAll(mountpoint) + } + } + } + + if mountProcfs { + procfsMountpoint, err = os.MkdirTemp("/", ".sysbox-procfs-") + if errors.Is(err, unix.EROFS) { + // @ctalledo: hack: if the container has a read-only filesystem, then + // we can't create the temporary procfs mount dir on it. In this case we + // use a directory that we know is present in the container (e.g., + // "/dev") to mount procfs. Since the mount occurs in the nsenter + // agent's own mount-ns, it's not visible to anyone else (i.e, the + // container can still access the "/dev" dir without noticing anything + // differently on it). 
+ procfsMountpoint = "/dev" + + } else if err != nil { + return nil, err + } + if err = unix.Mount("proc", procfsMountpoint, "proc", flags, ""); err != nil { + cleanupSysfs(sysfsMountpoint) + return nil, err + } + } + cleanupProcfs := func(mountpoint string) { + if mountpoint != "" { + unix.Unmount(mountpoint, unix.MNT_DETACH) + } + } + + cleanup := func(sysfsMountpoint, procfsMountpoint string) { + cleanupSysfs(sysfsMountpoint) + cleanupProcfs(procfsMountpoint) + } + + pmi := &payloadMountsInfo{ + sysfsMountpoint: sysfsMountpoint, + procfsMountpoint: procfsMountpoint, + cleanup: cleanup, + } + + return pmi, nil +} + +func replaceProcfsAndSysfsPaths(path string, pmi *payloadMountsInfo) string { + + if strings.HasPrefix(path, "/sys/") { + path = strings.Replace(path, "/sys/", pmi.sysfsMountpoint+"/", 1) + } else if strings.HasPrefix(path, "/proc/") { + path = strings.Replace(path, "/proc/", pmi.procfsMountpoint+"/", 1) + } + + return path +} diff --git a/sysbox-fs/process/process.go b/sysbox-fs/process/process.go new file mode 100644 index 00000000..54c439ca --- /dev/null +++ b/sysbox-fs/process/process.go @@ -0,0 +1,998 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package process

import (
	"bufio"
	"fmt"
	"os"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
	"syscall"
	"unsafe"

	"github.com/nestybox/sysbox-fs/domain"
	cap "github.com/nestybox/sysbox-libs/capability"
	"github.com/nestybox/sysbox-runc/libcontainer/user"
	"golang.org/x/sys/unix"
	setxid "gopkg.in/hlandau/service.v1/daemon/setuid"

	"github.com/sirupsen/logrus"
)

// processService implements domain.ProcessServiceIface; it is a factory for
// process objects that model an OS-level process via /proc.
type processService struct {
	ios domain.IOServiceIface
}

// NewProcessService returns a processService; call Setup() before use.
func NewProcessService() domain.ProcessServiceIface {
	return &processService{}
}

// Setup injects the I/O service dependency (used to read ns inodes).
func (ps *processService) Setup(ios domain.IOServiceIface) {
	ps.ios = ios
}

// ProcessCreate returns a process object for the given pid/uid/gid. The rest
// of the process attributes are lazily populated from /proc on first access
// (see init()).
func (ps *processService) ProcessCreate(
	pid uint32,
	uid uint32,
	gid uint32) domain.ProcessIface {

	return &process{
		pid: pid,
		uid: uid,
		gid: gid,
		ps:  ps,
	}
}

// process models an OS process; attributes are read from /proc/<pid>/*.
type process struct {
	pid         uint32                  // process id
	root        string                  // root dir
	procroot    string                  // proc's root string (/proc/<pid>/root)
	cwd         string                  // current working dir
	proccwd     string                  // proc's cwd string (/proc/<pid>/cwd)
	uid         uint32                  // effective uid
	gid         uint32                  // effective gid
	sgid        []uint32                // supplementary groups
	cap         cap.Capabilities        // process capabilities
	status      map[string]string       // process status fields
	nsInodes    map[string]domain.Inode // process namespace inodes
	initialized bool                    // process initialization completed
	ps          *processService         // pointer to parent processService
}

// Pid returns the process id; triggers lazy init on first access.
func (p *process) Pid() uint32 {

	if !p.initialized {
		p.init()
	}

	return p.pid
}

// Uid returns the effective uid; triggers lazy init on first access.
func (p *process) Uid() uint32 {

	if !p.initialized {
		p.init()
	}

	return p.uid
}

// Gid returns the effective gid; triggers lazy init on first access.
func (p *process) Gid() uint32 {

	if !p.initialized {
		p.init()
	}

	return p.gid
}

// UidMap parses and returns the process' /proc/<pid>/uid_map entries.
func (p *process) UidMap() ([]user.IDMap, error) {
	f := fmt.Sprintf("/proc/%d/uid_map", p.pid)
	return user.ParseIDMapFile(f)
}

// GidMap parses and returns the process' /proc/<pid>/gid_map entries.
func (p *process) GidMap() ([]user.IDMap, error) {
	f := fmt.Sprintf("/proc/%d/gid_map", p.pid)
	return user.ParseIDMapFile(f)
}
// Root returns the process' root dir; triggers lazy init on first access.
func (p *process) Root() string {

	if !p.initialized {
		p.init()
	}

	return p.root
}

// RootInode returns the inode of /proc/<pid>/root.
//
// NOTE(review): unlike the other getters this one does not run the full
// init(); it only sets procroot when uninitialized — presumably to avoid the
// /proc status parse when only the inode is needed. Confirm this is
// intentional.
func (p *process) RootInode() uint64 {

	if !p.initialized {
		p.procroot = fmt.Sprintf("/proc/%d/root", p.pid)
	}

	return domain.FileInode(p.procroot)
}

// SGid returns the supplementary groups; triggers lazy init on first access.
func (p *process) SGid() []uint32 {

	if !p.initialized {
		p.init()
	}

	return p.sgid
}

// IsSysAdminCapabilitySet reports whether CAP_SYS_ADMIN is in the effective set.
func (p *process) IsSysAdminCapabilitySet() bool {
	return p.IsCapabilitySet(cap.EFFECTIVE, cap.CAP_SYS_ADMIN)
}

// GetEffCaps returns the process' effective capability bitmaps; returns a
// zeroed pair if the capabilities can't be loaded from the kernel.
func (p *process) GetEffCaps() [2]uint32 {
	if p.cap == nil {
		if err := p.initCapability(); err != nil {
			return [2]uint32{0, 0}
		}
	}

	return p.cap.GetEffCaps()
}

// SetEffCaps sets the in-memory effective capability bitmaps (not applied to
// the kernel here); silently no-ops if capabilities can't be loaded.
func (p *process) SetEffCaps(caps [2]uint32) {
	if p.cap == nil {
		if err := p.initCapability(); err != nil {
			return
		}
	}

	p.cap.SetEffCaps(caps)
}

// Simple wrapper method to set capability values.
func (p *process) setCapability(which cap.CapType, what ...cap.Cap) {

	if p.cap == nil {
		if err := p.initCapability(); err != nil {
			return
		}
	}

	for _, elem := range what {
		p.cap.Set(which, elem)
	}
}

// Simple wrapper method to determine capability settings.
func (p *process) IsCapabilitySet(which cap.CapType, what cap.Cap) bool {

	if p.cap == nil {
		if err := p.initCapability(); err != nil {
			return false
		}
	}

	return p.cap.Get(which, what)
}

// initCapability method retrieves process capabilities from kernel and store
// them within 'capability' data-struct.
func (p *process) initCapability() error {

	c, err := cap.NewPid2(int(p.pid))
	if err != nil {
		return err
	}

	if err = c.Load(); err != nil {
		return err
	}

	p.cap = c

	return nil
}

// GetFd() returns a path to the file associated with a process' file descriptor.
func (p *process) GetFd(fd int32) (string, error) {
	fdlink := fmt.Sprintf("/proc/%d/fd/%d", p.pid, fd)
	return os.Readlink(fdlink)
}
// NsInodes returns the process' namespace inodes, keyed by namespace name;
// parsed from /proc on first invocation and cached thereafter.
func (p *process) NsInodes() (map[string]domain.Inode, error) {

	// First invocation causes the process ns inodes to be parsed
	if p.nsInodes == nil {
		nsInodes, err := p.GetNsInodes()
		if err != nil {
			return nil, err
		}
		p.nsInodes = nsInodes
	}

	return p.nsInodes, nil
}

// MountNsInode returns the inode of the process' mount namespace.
func (p *process) MountNsInode() (domain.Inode, error) {
	nsInodes, err := p.NsInodes()
	if err != nil {
		return 0, err
	}

	mountns, found := nsInodes["mnt"]
	if !found {
		return 0, fmt.Errorf("mountns not found")
	}

	return mountns, nil
}

// NetNsInode returns the inode of the process' network namespace.
func (p *process) NetNsInode() (domain.Inode, error) {
	nsInodes, err := p.NsInodes()
	if err != nil {
		return 0, err
	}

	netns, found := nsInodes["net"]
	if !found {
		return 0, fmt.Errorf("netns not found")
	}

	return netns, nil
}

// UserNsInode returns the inode of the process' user namespace.
func (p *process) UserNsInode() (domain.Inode, error) {
	nsInodes, err := p.NsInodes()
	if err != nil {
		return 0, err
	}

	userns, found := nsInodes["user"]
	if !found {
		return 0, fmt.Errorf("userns not found")
	}

	return userns, nil
}

// UserNsInodeParent returns the inode of the parent of the process' user
// namespace, obtained via the NS_GET_PARENT ioctl on /proc/<pid>/ns/user.
func (p *process) UserNsInodeParent() (domain.Inode, error) {

	// ioctl to retrieve the parent namespace.
	const NS_GET_PARENT = 0xb702

	usernsPath := filepath.Join(
		"/proc",
		strconv.FormatUint(uint64(p.pid), 10),
		"ns",
		"user",
	)

	// Open /proc/<pid>/ns/user to obtain a file-desc to refer to.
	childNsFd, err := os.Open(usernsPath)
	if err != nil {
		return 0, err
	}
	defer childNsFd.Close()

	// Launch ioctl to collect parent namespace fd.
	ret, _, errno := unix.Syscall(
		unix.SYS_IOCTL,
		childNsFd.Fd(),
		uintptr(NS_GET_PARENT),
		0)
	if errno != 0 {
		return 0, errno
	}
	// NOTE(review): ret is already the parent-ns fd as a uintptr; the
	// unsafe.Pointer round-trip below is unnecessary (int(ret) suffices) and
	// is the kind of uintptr<->unsafe.Pointer conversion go vet flags. Kept
	// as-is here since removing it would orphan the file's "unsafe" import;
	// consider simplifying both together.
	parentNsFd := (int)((uintptr)(unsafe.Pointer(ret)))
	defer syscall.Close(parentNsFd)

	// Run stat() over the returned file descriptor to obtain the inode that
	// uniquely identifies the parent namespace.
	var stat syscall.Stat_t
	err = syscall.Fstat(parentNsFd, &stat)
	if err != nil {
		return 0, err
	}

	return stat.Ino, nil
}

// Collects the namespace inodes of the given process
func (p *process) GetNsInodes() (map[string]domain.Inode, error) {

	nsInodes := make(map[string]domain.Inode)
	pidStr := strconv.FormatUint(uint64(p.pid), 10)

	// Iterate through all namespaces to collect the process' namespace inodes.
	for _, ns := range domain.AllNSs {
		nsPath := filepath.Join("/proc", pidStr, "ns", ns)

		fnode := p.ps.ios.NewIOnode("", nsPath, 0)
		nsInode, err := fnode.GetNsInode()
		if err != nil {
			return nil, err
		}

		nsInodes[ns] = nsInode
	}

	return nsInodes, nil
}

// Exclusively utilized when dealing with memory file-systems during unit-testing.
// NsInodes are automatically created by kernel in regular scenarios.
func (p *process) CreateNsInodes(inode domain.Inode) error {

	pidStr := strconv.FormatUint(uint64(p.pid), 10)
	inodeStr := strconv.FormatUint(uint64(inode), 10)

	// Iterate through all namespaces to collect the process' namespace inodes.
	for _, ns := range domain.AllNSs {
		nsPath := filepath.Join("/proc", pidStr, "ns", ns)

		fnode := p.ps.ios.NewIOnode("", nsPath, 0)
		err := fnode.WriteFile([]byte(inodeStr))
		if err != nil {
			return err
		}
	}

	return nil
}

// UsernsRootUidGid returns the uid and gid for the root user in the user-ns associated
// with the process. If the user-ns has no mapping for the root user, the overflow
// uid & gid are returned (e.g., uid = gid = 65534).
+func (p *process) UsernsRootUidGid() (uint32, uint32, error) { + var ( + uid, gid uint32 + found bool + err error + ) + + found = false + uidMap, err := p.UidMap() + if err == nil { + for _, m := range uidMap { + if m.ID == 0 { + uid = uint32(m.ParentID) + found = true + break + } + } + } + + if !found { + uid, err = overflowUid() + if err != nil { + uid = 65534 + } + } + + found = false + gidMap, err := p.GidMap() + if err == nil { + for _, m := range gidMap { + if m.ID == 0 { + gid = uint32(m.ParentID) + found = true + break + } + } + } + + if !found { + gid, err = overflowGid() + if err != nil { + uid = 65534 + } + } + + return uid, gid, nil +} + +// PathAccess emulates the path resolution and permission checking process done by +// the Linux kernel, as described in path_resolution(7). +// +// It checks if the process with the given pid can access the file or directory at the +// given path. +// +// The given path may be absolute or relative. Each component of the path is checked to +// see if it exists and whether the process has permissions to access it, following the +// rules for path resolution in Linux (see path_resolution(7)). The path may contain ".", +// "..", and symlinks. For absolute paths, the check is done starting from the process' +// root directory. For relative paths, the check is done starting from the process' +// current working directory. +// +// The given mode determines what type of access to check for (e.g., read, +// write, execute, or a combination of these). If the mode is 0, this function checks +// if the process has execute/search permissions on all components of the path, but +// does not check access permissions on the the file itself. +// +// Returns the resolved path and a nil error if the process can access the path, or one +// of the following errors otherwise: +// +// syscall.ENOENT: some component of the path does not exist. +// syscall.ENOTDIR: a non-final component of the path is not a directory. 
+// syscall.EACCES: the process does not have permission to access at least one component of the path. +// syscall.ELOOP: the path too many symlinks (e.g. > 40). + +func (p *process) PathAccess(path string, aMode domain.AccessMode, followSymlink bool) (string, error) { + + err := p.init() + if err != nil { + return "", err + } + + path, err = p.ResolveProcSelf(path) + if err != nil { + return "", syscall.EINVAL + } + + return p.pathAccess(path, aMode, followSymlink) +} + +// init() retrieves info about the process to initialize its main attributes. +func (p *process) init() error { + + if p.initialized { + return nil + } + + space := regexp.MustCompile(`\s+`) + + fields := []string{"Uid", "Gid", "Groups"} + if err := p.getStatus(fields); err != nil { + return err + } + + // effective uid + str := space.ReplaceAllString(p.status["Uid"], " ") + str = strings.TrimSpace(str) + uids := strings.Split(str, " ") + if len(uids) != 4 { + return fmt.Errorf("invalid uid status: %+v", uids) + } + euid, err := strconv.Atoi(uids[1]) + if err != nil { + return err + } + + // effective gid + str = space.ReplaceAllString(p.status["Gid"], " ") + str = strings.TrimSpace(str) + gids := strings.Split(str, " ") + if len(gids) != 4 { + return fmt.Errorf("invalid gid status: %+v", gids) + } + egid, err := strconv.Atoi(gids[1]) + if err != nil { + return err + } + + // supplementary groups + sgid := []uint32{} + str = space.ReplaceAllString(p.status["Groups"], " ") + str = strings.TrimSpace(str) + groups := strings.Split(str, " ") + for _, g := range groups { + if g == "" { + continue + } + val, err := strconv.Atoi(g) + if err != nil { + return err + } + sgid = append(sgid, uint32(val)) + } + + // process root & cwd + root := fmt.Sprintf("/proc/%d/root", p.pid) + cwd := fmt.Sprintf("/proc/%d/cwd", p.pid) + + // process capabilities + if p.cap == nil { + if err := p.initCapability(); err != nil { + return err + } + } + + // store all collected attributes + p.root, _ = os.Readlink(root) + 
// replaceProcSelfWithProcPid rewrites "/proc/self/*" paths to "/proc/<pid>/*",
// and "/proc/self/task/<n>/*" paths to "/proc/<pid>/task/<tid>/*". Paths that
// do not start with "/proc/self/" are returned unchanged.
func replaceProcSelfWithProcPid(path string, pid uint32, tid uint32) string {

	taskRE := regexp.MustCompile(`^/proc/self/task/[0-9]+/(.*)`)
	selfRE := regexp.MustCompile(`^/proc/self/(.*)`)

	// Rewrite the task-scoped form first (substituting the tid component); the
	// generic "/proc/self/" rule below then substitutes the pid component.
	out := taskRE.ReplaceAllString(path, fmt.Sprintf("/proc/self/task/%d/${1}", tid))
	out = selfRE.ReplaceAllString(out, fmt.Sprintf("/proc/%d/${1}", pid))

	return out
}
+ +func (p *process) ResolveProcSelf(path string) (string, error) { + + // NOTE: this function assumes procfs is mounted on /proc and path is + // absolute. + + if !filepath.IsAbs(path) { + return path, nil + } + + if !strings.HasPrefix(path, "/proc/self/") { + return path, nil + } + + currPath := path + linkCnt := 0 + + for { + if !strings.HasPrefix(currPath, "/proc/self/") { + break + } + + // Note: for paths such as /proc/self/task//*, it's easy to replace + // /proc/self with /proc/ since we have the container's process pid + // in sysbox's pid-namespace. However, that's not the case for the , + // which is in the container's pid namespace and we have no good/easy way + // to translate it sysbox's pid-ns. For now, assume that = . + // It's not ideal, but it's normally the case when we receive such paths in + // mount syscalls. + + tid := p.pid + currPath = replaceProcSelfWithProcPid(currPath, p.pid, tid) + + fi, err := os.Lstat(currPath) + if err != nil { + return "", err + } + + if fi.Mode()&os.ModeSymlink != os.ModeSymlink { + break + } + + linkCnt += 1 + if linkCnt > domain.SymlinkMax { + logrus.Errorf("number of symlinks while resolving path %s exceeded the max allowed (40).", path) + return "", syscall.ELOOP + } + + // path starts with "/proc/self/" and it's a symlink, resolve it + currPath, err = os.Readlink(currPath) + if err != nil { + return "", err + } + } + + return currPath, nil +} + +func (p *process) pathAccess(path string, mode domain.AccessMode, followSymlink bool) (string, error) { + + if path == "" { + return "", syscall.ENOENT + } + + if len(path)+1 > syscall.PathMax { + return "", syscall.ENAMETOOLONG + } + + // Determine the start point. + var start string + if filepath.IsAbs(path) { + start = p.procroot + } else { + start = p.proccwd + } + + // Break up path into it's components; note that repeated "/" results in + // empty path components. 
+ components := strings.Split(path, "/") + + cur := start + linkCnt := 0 + final := false + resolvedPath := "" + + for i, c := range components { + if i == len(components)-1 { + final = true + } + + if c == "" { + continue + } + + if c == ".." { + parent := filepath.Dir(cur) + if !strings.HasPrefix(parent, p.procroot) { + parent = p.procroot + } + cur = parent + } else if c != "." { + cur = filepath.Join(cur, c) + } + + symlink, isDir, err := isSymlink(cur) + if err != nil { + return "", syscall.ENOENT + } + + if !final && !symlink && !isDir { + return "", syscall.ENOTDIR + } + + // Follow the symlink (unless it's the process root, or if it's the final + // component of the path and followSymlink is false); may recurse if + // symlink points to another symlink and so on; we stop at symlinkMax + // recursions (just as the Linux kernel does). + + if !final || followSymlink { + if symlink && cur != p.procroot { + for { + if linkCnt >= domain.SymlinkMax { + return "", syscall.ELOOP + } + + link, err := os.Readlink(cur) + if err != nil { + return "", syscall.ENOENT + } + + if filepath.IsAbs(link) { + cur = filepath.Join(p.procroot, link) + } else { + cur = filepath.Join(filepath.Dir(cur), link) + } + + // If 'cur' ever matches 'p.procroot' then there's no need to continue + // iterating as we know for sure that 'p.procroot' is a valid / + // non-cyclical path. If we were to continue our iteration, we + // would end up dereferencing 'p.procroot' -- through readlink() -- + // which would erroneously points us to "/" in the host fs. 
+ if cur == p.procroot { + break + } + + symlink, isDir, err = isSymlink(cur) + if err != nil { + return "", syscall.ENOENT + } + + if !symlink { + break + } + linkCnt += 1 + } + + if !final && !isDir { + return "", syscall.ENOTDIR + } + } + } + + perm := false + if !final { + perm, err = p.checkPerm(cur, domain.X_OK, followSymlink) + } else { + perm, err = p.checkPerm(cur, mode, followSymlink) + } + + if err != nil || !perm { + return "", syscall.EACCES + } + + // Obtain the resolved path by simply removing the root/cwd prefix previously appended + // to the current string. + if final { + if strings.HasPrefix(cur, p.procroot) { + resolvedPath = strings.TrimPrefix(cur, p.procroot) + } else { + resolvedPath = strings.TrimPrefix(cur, p.proccwd + "/") + } + } + } + + return resolvedPath, nil +} + +// checkPerm checks if the given process has permission to access the file or +// directory at the given path. The access mode indicates what type of access is +// being checked (i.e., read, write, execute, or a combination of these). The +// given path must not be a symlink. Returns true if the given process has the +// required permission, false otherwise. The returned error indicates if an +// error occurred during the check. +func (p *process) checkPerm(path string, aMode domain.AccessMode, followSymlink bool) (bool, error) { + var ( + fi os.FileInfo + err error + ) + + if followSymlink { + fi, err = os.Stat(path) + } else { + fi, err = os.Lstat(path) + } + + if err != nil { + return false, err + } + + fperm := fi.Mode().Perm() + + st, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return false, fmt.Errorf("failed to convert to syscall.Stat_t") + } + fuid := st.Uid + fgid := st.Gid + + mode := uint32(aMode) + + // no access = permission granted + if mode == 0 { + return true, nil + } + + // Note: the order of the checks below mimics those done by the Linux kernel. 
+ + // owner check + if fuid == p.uid { + perm := uint32((fperm & 0700) >> 6) + if mode&perm == mode { + return true, nil + } + } + + // group check + if fgid == p.gid || uint32SliceContains(p.sgid, fgid) { + perm := uint32((fperm & 0070) >> 3) + if mode&perm == mode { + return true, nil + } + } + + // "other" check + perm := uint32(fperm & 0007) + if mode&perm == mode { + return true, nil + } + + // capability checks + if p.IsCapabilitySet(cap.EFFECTIVE, cap.CAP_DAC_OVERRIDE) { + // Per capabilitis(7): CAP_DAC_OVERRIDE bypasses file read, write, + // and execute permission checks. + // + // Per The Linux Programming Interface, 15.4.3: A process with the + // CAP_DAC_OVERRIDE capability always has read and write permissions + // for any type of file, and also has execute permission if the file + // is a directory or if execute permission is granted to at least one + // of the permission categories for the file. + if fi.IsDir() { + return true, nil + } else { + if aMode&domain.X_OK != domain.X_OK { + return true, nil + } else { + if fperm&0111 != 0 { + return true, nil + } + } + } + } + + if p.IsCapabilitySet(cap.EFFECTIVE, cap.CAP_DAC_READ_SEARCH) { + // Per capabilities(7): CAP_DAC_READ_SEARCH bypasses file read permission + // checks and directory read and execute permission checks + if fi.IsDir() && (aMode&domain.W_OK != domain.W_OK) { + return true, nil + } + + if !fi.IsDir() && (aMode == domain.R_OK) { + return true, nil + } + } + + return false, nil +} + +// +// Miscellaneous auxiliary functions +// + +// isSymlink returns true if the given file is a symlink +func isSymlink(path string) (bool, bool, error) { + fi, err := os.Lstat(path) + if err != nil { + return false, false, err + } + + return fi.Mode()&os.ModeSymlink == os.ModeSymlink, fi.IsDir(), nil +} + +// uint32SliceContains returns true if x is in a +func uint32SliceContains(a []uint32, x uint32) bool { + for _, n := range a { + if x == n { + return true + } + } + return false +} + +func 
readOverflowID(path string) (uint32, error) { + f, err := os.Open(path) + if err != nil { + return 0, err + } + defer f.Close() + + str, err := bufio.NewReader(f).ReadString('\n') + if err != nil { + return 0, err + } + str = strings.Trim(str, "\n") + + val, err := strconv.Atoi(str) + if err != nil { + return 0, err + } + + return uint32(val), nil +} + +func overflowUid() (uint32, error) { + return readOverflowID("/proc/sys/fs/overflowuid") +} + +func overflowGid() (uint32, error) { + return readOverflowID("/proc/sys/fs/overflowgid") +} diff --git a/sysbox-fs/process/process_test.go b/sysbox-fs/process/process_test.go new file mode 100644 index 00000000..97a3eeb2 --- /dev/null +++ b/sysbox-fs/process/process_test.go @@ -0,0 +1,958 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package process + +import ( + "io/ioutil" + "os" + "path/filepath" + "syscall" + "testing" + + "github.com/nestybox/sysbox-fs/domain" + cap "github.com/nestybox/sysbox-libs/capability" +) + +func TestCheckPermOwner(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + filename := filepath.Join(tmpDir, "testFile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + if err := os.Chmod(filename, 0664); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + p := &process{ + root: tmpDir, + cwd: tmpDir, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + mode := domain.R_OK | domain.W_OK + ok, err := p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // check no execute perm + mode = domain.X_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } +} + +func TestCheckPermGroup(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + filename := filepath.Join(tmpDir, "testFile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + if err := os.Chmod(filename, 0664); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + // check group perm + p := &process{ + root: tmpDir, + cwd: tmpDir, + uid: 800, + gid: uint32(os.Getegid()), + } + + mode := domain.R_OK | domain.W_OK + ok, err := p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // check suppl group perm + p = &process{ + root: tmpDir, + cwd: tmpDir, + uid: 800, + gid: 800, + sgid: 
[]uint32{uint32(os.Getegid())}, + } + + mode = domain.R_OK | domain.W_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // check no execute perm + mode = domain.X_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } +} + +func TestCheckPermOther(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + filename := filepath.Join(tmpDir, "testFile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + if err := os.Chmod(filename, 0664); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + // check other perm + p := &process{ + root: tmpDir, + cwd: tmpDir, + uid: 800, + gid: 800, + } + + mode := domain.R_OK + ok, err := p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // check no write or execute perm + mode = domain.W_OK | domain.X_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } +} + +func TestCheckPermCapDacOverride(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + filename := filepath.Join(tmpDir, "testFile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + + // File has execute-by-owner only; CAP_DAC_OVERRIDE will allow rwx on it + if err := os.Chmod(filename, 0100); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + p := &process{ + root: tmpDir, + cwd: tmpDir, + uid: 800, + gid: 800, + } + + p.setCapability(cap.EFFECTIVE, 
cap.CAP_DAC_OVERRIDE) + + mode := domain.R_OK | domain.W_OK | domain.X_OK + ok, err := p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // File has no permissions; CAP_DAC_OVERRIDE will allow rw on it, but not execute. + if err := os.Chmod(filename, 0000); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + mode = domain.R_OK | domain.W_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + mode = domain.X_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } +} + +func TestCheckPermCapDacReadSearch(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + filename := filepath.Join(tmpDir, "testFile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + + // File has no permissions; CAP_DAC_READ_SEARCH allows read on it + if err := os.Chmod(filename, 0000); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + p := &process{ + pid: uint32(os.Getpid()), + root: tmpDir, + cwd: tmpDir, + uid: 800, + gid: 800, + } + + // Init caps explicitly to prevent p.setCapability() below from loading caps for the current process. 
+ p.cap, err = cap.NewPid2(int(p.pid)) + if err != nil { + t.Fatalf("failed to allocate capabilities: %v", err) + } + + p.setCapability(cap.EFFECTIVE, cap.CAP_DAC_READ_SEARCH) + + mode := domain.R_OK + ok, err := p.checkPerm(filename, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // Directory has no perm; CAP_DAC_READ_SEARCH allows read and execute on it + dirname := filepath.Join(tmpDir, "testDir") + err = os.MkdirAll(dirname, 0000) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + mode = domain.R_OK | domain.X_OK + ok, err = p.checkPerm(dirname, mode, true) + if err != nil || !ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + + // CAP_DAC_READ_SEARCH does not allow writes + mode = domain.W_OK + ok, err = p.checkPerm(filename, mode, true) + if err != nil || ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } + ok, err = p.checkPerm(dirname, mode, true) + if err != nil || ok { + t.Fatalf("checkPerm() failed: ok = %v, err = %v", ok, err) + } +} + +func TestProcPathAccess(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/some/path/to/a/dir") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + cwd := filepath.Join(tmpDir, "/some/path/to") + + p := &process{ + procroot: tmpDir, + proccwd: cwd, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + mode := domain.R_OK | domain.W_OK | domain.X_OK + + if _, err := p.pathAccess("a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + // test handling of repeated "/" + if _, err := p.pathAccess("a////dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + // test handling of "." 
+ if _, err := p.pathAccess("./a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("a/dir/.", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("././a/./dir/.", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + // test handling of ".." + if _, err := p.pathAccess("../to/a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("../../path/to/a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("../../../some/path/to/a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("../../../../some/path/to/a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("a/../a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("a/../a/../../to/a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("../../../../../../../some/path/../path/to/a/dir", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("../to/a/dir/..", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + // combine all of the above + if _, err := p.pathAccess("../../../../.././../.././///some/path/../path///to/./a/dir////", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } +} + +func TestProcPathAccessDirAndFilePerm(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/some/path/to/a/dir") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + filename 
:= filepath.Join(path, "somefile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + + cwd := filepath.Join(tmpDir, "/some/path/to") + + p := &process{ + procroot: tmpDir, + proccwd: cwd, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + if _, err := p.pathAccess("/some/path/to/a/dir/somefile", 0, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } + + // Restrict access on the file and verify + if err := os.Chmod(filename, 0700); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + p = &process{ + pid: uint32(os.Getpid()), + procroot: tmpDir, + proccwd: cwd, + uid: 800, + gid: 800, + } + + // Initialize the process caps + p.cap, err = cap.NewPid2(int(p.pid)) + if err != nil { + t.Fatalf("failed to allocate capabilities: %v", err) + } + + if _, err := p.pathAccess("/some/path/to/a/dir/somefile", 0, true); err != nil { + t.Fatalf("procPathAccess() failed: %v", err) + } + + if _, err := p.pathAccess("/some/path/to/a/dir/somefile", domain.R_OK, true); err != syscall.EACCES { + t.Fatalf("pathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + if _, err := p.pathAccess("/some/path/to/a/dir/somefile", domain.W_OK, true); err != syscall.EACCES { + t.Fatalf("pathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + if _, err := p.pathAccess("/some/path/to/a/dir/somefile", domain.X_OK, true); err != syscall.EACCES { + t.Fatalf("pathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + // Restrict access on a dir of the path and verify + if err := os.Chmod(path, 0700); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + if err := os.Chmod(filename, 0777); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + if _, err := p.pathAccess( + "/some/path/to/a/dir/somefile", + domain.R_OK|domain.W_OK|domain.X_OK, true); err != 
syscall.EACCES { + t.Fatalf("pathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + p = &process{ + procroot: tmpDir, + proccwd: cwd, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + if _, err := p.pathAccess( + "/some/path/to/a/dir/somefile", + domain.R_OK|domain.W_OK|domain.X_OK, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } +} + +func TestProcPathAccessEnoent(t *testing.T) { + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/some/path/to/a/dir") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + cwd := filepath.Join(tmpDir, "/some/path/to") + + p := &process{ + root: tmpDir, + cwd: cwd, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + mode := domain.R_OK + + if _, err = p.pathAccess("a/non/existent/dir", mode, true); err != syscall.ENOENT { + goto Fail + } + + if _, err = p.pathAccess("../to/a/non/existent/dir", mode, true); err != syscall.ENOENT { + goto Fail + } + + if _, err = p.pathAccess("a/dir/../bad", mode, true); err != syscall.ENOENT { + goto Fail + } + + if _, err = p.pathAccess("a/dir/../../bad", mode, true); err != syscall.ENOENT { + goto Fail + } + + if _, err = p.pathAccess("a/dir/../../../../../../../bad", mode, true); err != syscall.ENOENT { + goto Fail + } + + if _, err = p.pathAccess("a/./bad/./dir/", mode, true); err != syscall.ENOENT { + goto Fail + } + + if _, err = p.pathAccess("/some/path/to/a/non/existent/dir", mode, true); err != syscall.ENOENT { + goto Fail + } + + return + +Fail: + t.Fatalf("procPathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.ENOENT, err) +} + +func TestProcPathAccessSymlink(t *testing.T) { + + // This test creates the following dir and symlink hierarchy and verifies all + // symlinks get resolved 
correctly. + // + // /tmp/TestPathres/ + // ├── another + // │ └── path + // │ ├── again + // │ │ └── link4 -> ../../path/link3 + // │ └── link3 -> /link2 + // ├── link -> ./this/is/the/real/path + // ├── link2 -> ./link + // └── this + // └── is + // └── the + // └── real + // └── path + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/this/is/the/real/path") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + old := "./this/is/the/real/path" + new := filepath.Join(tmpDir, "/link") + if err := os.Symlink(old, new); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + p := &process{ + procroot: tmpDir, + proccwd: tmpDir, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + mode := domain.R_OK | domain.X_OK + + expectedPath := "/this/is/the/real/path" + if resolvedPath, err := p.pathAccess("/link", mode, true); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + expectedPath = "/this/is/the/real" + if resolvedPath, err := p.pathAccess("/link/..", mode, true); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + // test recursive symlinks + old = filepath.Join("./link") + new = filepath.Join(tmpDir, "/link2") + if err := os.Symlink(old, new); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + expectedPath = "/this/is/the/real/path" + if resolvedPath, err := p.pathAccess("/link2", mode, true); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + path = filepath.Join(tmpDir, "/another/path") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", 
err) + } + + old = "/link2" + new = filepath.Join(tmpDir, "/another/path/link3") + if err := os.Symlink(old, new); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + expectedPath = "/this/is/the/real/path" + if resolvedPath, err := p.pathAccess("/another/path/link3", mode, true); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + path = filepath.Join(tmpDir, "/another/path/again") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + // test relative symlink + testCwd, err := os.Getwd() + if err != nil { + t.Fatalf("failed on os.Getwd(): %v", err) + } + + if err := os.Chdir(filepath.Join(tmpDir, "/another/path/again")); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } + + if err := os.Symlink("../../path/link3", "link4"); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + expectedPath = "/this/is/the/real/path" + if resolvedPath, err := p.pathAccess("/another/path/again/link4", mode, true); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + expectedPath = "/this/is/the/real" + if resolvedPath, err := p.pathAccess("/another/path/again/link4/..", mode, true); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + if err := os.Chdir(testCwd); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } + + // test broken symlink + old = filepath.Join("./nowhere") + new = filepath.Join(tmpDir, "/brokenlink") + if err := os.Symlink(old, new); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + expectedPath = "" + if resolvedPath, err := p.pathAccess("/brokenlink", mode, true); resolvedPath != expectedPath || err == nil { + t.Fatalf("pathAccess() expected to fail but passed on broken link") + } + + // set 
followSymlink = false and verify that since the broken link is not + // followed, pathAccess passes. + expectedPath = "/brokenlink" + if resolvedPath, err := p.pathAccess("/brokenlink", mode, false); resolvedPath != expectedPath || err != nil { + t.Fatalf("pathAccess() failed: resolvedPath %s, error %v", resolvedPath, err) + } + + // + // Reproducing corner case exposed by issue #574, observed during execution + // initialization of an inner container ("mount -o bind,remount ."). + // + // /tmp/TestPathres/ + // |-- cwdLink -> / + // |-- rootLink -> / + // + + cwd := filepath.Join(tmpDir, "/cwdLink") + if err := os.Symlink("/", cwd); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + root := filepath.Join(tmpDir, "/rootLink") + if err := os.Symlink("/", root); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + p = &process{ + procroot: root, + proccwd: cwd, + uid: uint32(os.Geteuid()), + gid: uint32(os.Getegid()), + } + + if _, err := p.pathAccess(".", mode, true); err != nil { + t.Fatalf("pathAccess() failed: %v", err) + } +} + +func TestPathAccess(t *testing.T) { + + p := &process{pid: uint32(os.Getpid())} + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/some/path/to/a/dir") + + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + filename := filepath.Join(path, "somefile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + + // file access + if _, err := p.PathAccess(filename, domain.R_OK|domain.W_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + // dir access + path = tmpDir + "/some/path/to/a/dir" + if _, err := p.PathAccess( + path, + domain.R_OK|domain.X_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + // .. and . 
+ path = tmpDir + "/some/path/../../some/path/to/a/./dir/somefile" + if _, err := p.PathAccess( + path, + domain.R_OK|domain.W_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + path = tmpDir + "/some/path/../../some/path/to/a/./dir/./././" + if _, err := p.PathAccess( + path, + domain.R_OK|domain.X_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + path = tmpDir + "/../../../../" + tmpDir + "/some/path/to/a/../a/dir/." + if _, err := p.PathAccess( + path, + domain.R_OK|domain.X_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + // relative paths + + testCwd, err := os.Getwd() + if err != nil { + t.Fatalf("failed on os.Getwd(): %v", err) + } + + if err := os.Chdir(filepath.Join(tmpDir, "/some/path")); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } + + path = "to/a/dir/somefile" + if _, err := p.PathAccess( + path, + domain.R_OK|domain.W_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + if err := os.Chdir(filepath.Join(tmpDir, "/some/path/to")); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } + + path = "a/dir" + if _, err := p.PathAccess( + path, + domain.R_OK|domain.X_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + if err := os.Chdir(testCwd); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } +} + +func TestPathAccessPerm(t *testing.T) { + var err error + + p := &process{pid: uint32(os.Getpid())} + + // Initialize the process caps + p.cap, err = cap.NewPid2(int(p.pid)) + if err != nil { + t.Fatalf("failed to allocate capabilities: %v", err) + } + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/some/path/to/a/dir") + + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + filename := filepath.Join(path, "somefile") 
+ _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + + // read-only + if err := os.Chmod(filename, 0400); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + if _, err := p.PathAccess(filename, domain.R_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + if _, err := p.PathAccess(filename, domain.W_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + if _, err := p.PathAccess(filename, domain.X_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + // write-only + if err := os.Chmod(filename, 0200); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + if _, err := p.PathAccess(filename, domain.W_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + if _, err := p.PathAccess(filename, domain.R_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + if _, err := p.PathAccess(filename, domain.X_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + // execute-only + if err := os.Chmod(filename, 0100); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + if _, err := p.PathAccess(filename, domain.X_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + if _, err := p.PathAccess(filename, domain.R_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + if _, err := p.PathAccess(filename, domain.W_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } + + // dir read-only + 
if err := os.Chmod(filename, 0777); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + if err := os.Chmod(path, 0400); err != nil { + t.Fatalf("failed to chmod test file: %v", err) + } + + if _, err := p.PathAccess(filename, domain.R_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } +} + +func TestPathAccessSymlink(t *testing.T) { + + p := &process{pid: uint32(os.Getpid())} + + tmpDir, err := ioutil.TempDir("/tmp", "TestPathres") + if err != nil { + t.Fatalf("failed to create test dir: %v", err) + } + defer os.RemoveAll(tmpDir) + + path := filepath.Join(tmpDir, "/this/is/the/real/path") + err = os.MkdirAll(path, 0755) + if err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + filename := filepath.Join(path, "somefile") + _, err = os.Create(filename) + if err != nil { + t.Fatalf("failed to create test file: %v", err) + } + + // test absolute symlink + + link := filepath.Join(tmpDir, "/link") + if err := os.Symlink(filename, link); err != nil { + t.Fatalf("failed to create test path: %v", err) + } + + if _, err := p.PathAccess(link, domain.R_OK|domain.W_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + // test relative symlink + + testCwd, err := os.Getwd() + if err != nil { + t.Fatalf("failed on os.Getwd(): %v", err) + } + + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } + + if _, err := p.PathAccess("link", domain.R_OK|domain.W_OK, true); err != nil { + t.Fatalf("PathAccess() failed: %v", err) + } + + if err := os.Chdir(testCwd); err != nil { + t.Fatalf("failed on os.Chdir(): %v", err) + } + + // negative test on file perm + + if _, err := p.PathAccess(filename, domain.X_OK, true); err != syscall.EACCES { + t.Fatalf("PathAccess() expected to fail with \"%s\" but did not; err = \"%s\"", syscall.EACCES, err) + } +} + +func TestReplaceProcSelfWithProcPid(t *testing.T) { + + 
type testInput struct { + path string + pid uint32 + tid uint32 + } + + test := map[testInput]string{ + {"/proc/self/mem", 10, 100}: "/proc/10/mem", + {"/proc/self/task/123/io", 20, 200}: "/proc/20/task/200/io", + {"/proc/123/task/456/mem", 20, 200}: "/proc/123/task/456/mem", + {"/some/other/path", 20, 200}: "/some/other/path", + } + + for k, v := range test { + res := replaceProcSelfWithProcPid(k.path, k.pid, k.tid) + if res != v { + t.Fatalf("failed: replaceProcSelfWithProcPid(%s, %d, %d): got %s, want %s", + k.path, k.pid, k.tid, res, v) + } + } +} + +// TODO: +// Improve PathAccess tests: +// * test symlink resolution limit +// * test long path diff --git a/sysbox-fs/seccomp/chown.go b/sysbox-fs/seccomp/chown.go new file mode 100644 index 00000000..6d93c174 --- /dev/null +++ b/sysbox-fs/seccomp/chown.go @@ -0,0 +1,232 @@ +// +// Copyright 2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// This file contains Sysbox's chown syscall trapping & handling code. The only +// reason we trap chown (as well as fchown and fchownat) is to prevent chown to +// /sys inside a sys container from failing. The reason chown to /sys inside a +// sys container would fail without this code is that /sys is owned by the +// host's true root, so it shows up as "nobody:nogroup" inside the sys container +// and thus its ownership can't be changed from within the container. Some apps +// running inside the container (e.g,. 
RedHat's RPM package manager) want to +// chown /sys to root:root, causing the apps to get an EPERM and fail. As a +// work-around, Sysbox ignores chown to "/sys" inside the sys container (or in +// any inner containers). All other chown operations are handled normally by the +// kernel. + +package seccomp + +import ( + "path/filepath" + "syscall" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type chownSyscallInfo struct { + syscallCtx // syscall generic info + path string + pathFd int32 + ownerUid int64 + ownerGid int64 + dirFd int32 + dirPath string + flags int +} + +func (ci *chownSyscallInfo) ignoreChown(absPath string) bool { + + // Note: we only ignore chown targeting "/sys" directly. We purposely avoid + // resolving symlinks to "/sys" because such symlinks are unusual and + // resolving them would slow down every chown syscall. This means that if a + // user chowns "/sys" by way of one or more symlinks, the syscall will not be + // ignored and will thus still fail. + + if absPath != "/sys" { + return false + } + + // Check if /sys is a sysfs mount. In the rare case where it's not, we can't + // ignore the chown. 
+ + mts := ci.tracer.service.mts + + mip, err := mts.NewMountInfoParser(ci.cntr, ci.processInfo, true, false, false) + if err != nil { + logrus.Errorf("Failed to get mount info while processing fchown from pid %d: %s", ci.pid, err) + return false + } + + mi := mip.GetInfo(absPath) + if mi == nil || mi.FsType != "sysfs" { + return false + } + + return true +} + +func (ci *chownSyscallInfo) processChown() (*sysResponse, error) { + var err error + + t := ci.tracer + ci.processInfo = t.service.prs.ProcessCreate(ci.pid, 0, 0) + + ci.path, err = ci.processInfo.ResolveProcSelf(ci.path) + if err != nil { + return t.createErrorResponse(ci.reqId, syscall.EACCES), nil + } + + if !filepath.IsAbs(ci.path) { + ci.path = filepath.Join(ci.processInfo.Cwd(), ci.path) + } + + if ci.ignoreChown(ci.path) { + logrus.Debugf("Ignoring chown syscall from pid %d: path = %v, uid = %v, gid = %v", + ci.pid, ci.path, ci.ownerUid, ci.ownerGid) + return t.createSuccessResponse(ci.reqId), nil + } + + return t.createContinueResponse(ci.reqId), nil +} + +func (ci *chownSyscallInfo) processFchown() (*sysResponse, error) { + + t := ci.tracer + ci.processInfo = t.service.prs.ProcessCreate(ci.pid, 0, 0) + + path, err := ci.processInfo.GetFd(ci.pathFd) + if err != nil { + return t.createContinueResponse(ci.reqId), nil + } + + path, err = ci.processInfo.ResolveProcSelf(path) + if err != nil { + return t.createErrorResponse(ci.reqId, syscall.EACCES), nil + } + + if !filepath.IsAbs(path) { + path = filepath.Join(ci.processInfo.Cwd(), path) + } + + if ci.ignoreChown(path) { + logrus.Debugf("Ignoring chown syscall from pid %d: path = %v, uid = %v, gid = %v", + ci.pid, path, ci.ownerUid, ci.ownerGid) + return t.createSuccessResponse(ci.reqId), nil + } + + return t.createContinueResponse(ci.reqId), nil +} + +func (ci *chownSyscallInfo) processFchownat() (*sysResponse, error) { + var err error + + t := ci.tracer + ci.processInfo = t.service.prs.ProcessCreate(ci.pid, 0, 0) + path := ci.path + + // Interpret 
dirFd (if the pathname is not absolute) + if !filepath.IsAbs(path) { + + if (ci.flags&unix.AT_EMPTY_PATH == unix.AT_EMPTY_PATH) && path == "" { + + // Per chown(2): when the AT_EMPTY_PATH flag is set: If pathname is an + // empty string, operate on the file referred to by dirfd. If dirfd is + // AT_FDCWD, the call operates on the current working directory. + + if ci.dirFd == unix.AT_FDCWD { + path = ci.processInfo.Cwd() + } else { + dirPath, err := ci.processInfo.GetFd(ci.dirFd) + if err != nil { + return t.createContinueResponse(ci.reqId), nil + } + path = dirPath + } + + } else { + + // Per chown(2) (when the AT_EMPTY_PATH flag is not set): + // dirFd is AT_FDCWD, path is interpreted relative to the process' current + // working dir. Otherwise it's interpreted relative to dirFd. + + if ci.dirFd == unix.AT_FDCWD { + path = filepath.Join(ci.processInfo.Cwd(), path) + } else { + dirPath, err := ci.processInfo.GetFd(ci.dirFd) + if err != nil { + return t.createContinueResponse(ci.reqId), nil + } + path = filepath.Join(dirPath, path) + } + } + } + + path, err = ci.processInfo.ResolveProcSelf(path) + if err != nil { + return t.createErrorResponse(ci.reqId, syscall.EACCES), nil + } + + if ci.ignoreChown(path) { + logrus.Debugf("Ignoring fchownat syscall from pid %d: path = %v, uid = %v, gid = %v", + ci.pid, path, ci.ownerUid, ci.ownerGid) + return t.createSuccessResponse(ci.reqId), nil + } + + return t.createContinueResponse(ci.reqId), nil +} + +func (ci *chownSyscallInfo) processChownNSenter(nstype []domain.NStype) (*sysResponse, error) { + + chownPayload := []*domain.ChownSyscallPayload{} + + newElem := &domain.ChownSyscallPayload{ + Target: ci.path, + TargetUid: int(ci.ownerUid), + TargetGid: int(ci.ownerGid), + } + + chownPayload = append(chownPayload, newElem) + + nss := ci.tracer.service.nss + event := nss.NewEvent( + ci.pid, + &nstype, + 0, + &domain.NSenterMessage{ + Type: domain.ChownSyscallRequest, + Payload: chownPayload, + }, + nil, + false, + ) + + err := 
nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := ci.tracer.createErrorResponse( + ci.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return ci.tracer.createSuccessResponse(ci.reqId), nil +} diff --git a/sysbox-fs/seccomp/memParser.go b/sysbox-fs/seccomp/memParser.go new file mode 100644 index 00000000..db10e2ff --- /dev/null +++ b/sysbox-fs/seccomp/memParser.go @@ -0,0 +1,32 @@ +// +// Copyright 2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package seccomp + +// memParser interface defines the set of operations required to interact +// with seccomp-tracee processes to extract/inject state from/into their +// address-spaces. 
+type memParser interface {
+	ReadSyscallStringArgs(pid uint32, elems []memParserDataElem) ([]string, error)
+	ReadSyscallBytesArgs(pid uint32, elems []memParserDataElem) ([]string, error)
+	WriteSyscallBytesArgs(pid uint32, elems []memParserDataElem) error
+}
+
+type memParserDataElem struct {
+	addr uint64 // mem address in tracee's address space
+	size int    // size of the data element to read / write
+	data []byte // data to write to tracee's address space
+}
diff --git a/sysbox-fs/seccomp/memParserIOvec.go b/sysbox-fs/seccomp/memParserIOvec.go
new file mode 100644
index 00000000..1bf1af92
--- /dev/null
+++ b/sysbox-fs/seccomp/memParserIOvec.go
@@ -0,0 +1,151 @@
+//
+// Copyright 2022 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package seccomp
+
+import (
+	"fmt"
+	"unsafe"
+
+	"C"
+
+	"golang.org/x/sys/unix"
+)
+
+// File contains memParser specialization logic to allow interaction with seccomp
+// tracee's through a scatter-gather (IOvec) interface. This approach is the default
+// one in kernels built with 'CONFIG_CROSS_MEMORY_ATTACH' flag enabled -- the usual
+// case in most of the linux distros.
+
+type memParserIOvec struct{}
+
+// ReadSyscallStringArgs reads data from the tracee's process address space to extract
+// string arguments utilized by the traced syscall.
+func (mp *memParserIOvec) ReadSyscallStringArgs(pid uint32, elems []memParserDataElem) ([]string, error) { + var result []string + + for _, e := range elems { + if e.size > 0 { + dataBuf := make([]byte, e.size) + + if err := mp.readProcessMem(pid, dataBuf, e.addr, e.size); err != nil { + return nil, err + } + + data := C.GoString((*C.char)(unsafe.Pointer(&dataBuf[0]))) + result = append(result, data) + } + } + + return result, nil +} + +// ReadSyscallBytesArgs reads arbitrary byte data from the tracee's process address +// space to extract arguments utilized by the traced syscall. +func (mp *memParserIOvec) ReadSyscallBytesArgs(pid uint32, elems []memParserDataElem) ([]string, error) { + var result []string + + for _, e := range elems { + if e.size > 0 { + dataBuf := make([]byte, e.size) + + if err := mp.readProcessMem(pid, dataBuf, e.addr, e.size); err != nil { + return nil, err + } + + data := C.GoStringN((*C.char)(unsafe.Pointer(&dataBuf[0])), C.int(e.size)) + result = append(result, data) + } + } + + return result, nil +} + +// WriteSyscallBytesArgs writes collected state (i.e. syscall responses) into the +// the tracee's address space. +func (mp *memParserIOvec) WriteSyscallBytesArgs(pid uint32, elems []memParserDataElem) error { + + for _, e := range elems { + if e.size > 0 { + if err := mp.writeProcessMem(pid, e.addr, e.data, e.size); err != nil { + return err + } + } + } + + return nil +} + +// readsProcessMem reads size bytes at addr from the mem space of process pid, +// and stores the result in the local byte array. Size must be > 0. 
+func (t *memParserIOvec) readProcessMem(pid uint32, local []byte, addr uint64, size int) error { + localIovec := make([]unix.Iovec, 1) + remoteIovec := make([]unix.RemoteIovec, 1) + + localIovec[0].Base = &local[0] + localIovec[0].Len = uint64(size) + + remoteIovec[0].Base = uintptr(addr) + remoteIovec[0].Len = size + + // this denotes the end of the read + if remoteIovec[0].Base == 0 { + return nil + } + + // Read from the traced process' memory + n, err := unix.ProcessVMReadv(int(pid), localIovec, remoteIovec, 0) + + if err != nil { + return fmt.Errorf("failed to read from mem of pid %d: %s", pid, err) + } else if n > size { + return fmt.Errorf("read more bytes (%d) from mem of pid %d than expected (%d)", + n, pid, size) + } + + return nil +} + +// writeProcessMem writes size bytes in array data to the given address in the +// mem space of process pid. +func (mp *memParserIOvec) writeProcessMem(pid uint32, addr uint64, data []byte, size int) error { + data = data[:size] + + localIov := []unix.Iovec{ + { + Base: &data[0], + Len: uint64(size), + }, + } + + remoteIov := []unix.RemoteIovec{ + { + Base: uintptr(addr), + Len: size, + }, + } + + // Write to the traced process' memory + n, err := unix.ProcessVMWritev(int(pid), localIov, remoteIov, 0) + + if err != nil { + return fmt.Errorf("failed to write to mem of pid %d: %s", pid, err) + } else if n != size { + return fmt.Errorf("failed to write %d bytes to mem of pid %d: wrote %d bytes only", size, pid, n) + } + + return nil +} diff --git a/sysbox-fs/seccomp/memParserProcfs.go b/sysbox-fs/seccomp/memParserProcfs.go new file mode 100644 index 00000000..b042c1c2 --- /dev/null +++ b/sysbox-fs/seccomp/memParserProcfs.go @@ -0,0 +1,169 @@ +// +// Copyright 2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package seccomp
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"os"
+	"strings"
+
+	"C"
+)
+
+// File hosts memParser specialization logic to allow interaction with seccomp tracee's
+// through the '/proc/pid/mem' interface. Note that this approach is expected to be
+// less performant than the scatter-gather (IOvec) one, but is needed to support systems
+// where this option is not available.
+
+type memParserProcfs struct{}
+
+// ReadSyscallStringArgs iterates through the tracee's process /proc/pid/mem file to
+// identify string (i.e., null-terminated) arguments utilized by the traced syscall.
+// The assumption here is that the process invoking the syscall is 'stopped' at the
+// time that this routine is executed. That is, tracee runs within a single
+// execution context (single-thread), and therefore its memory can be safely referenced.
+func (mp *memParserProcfs) ReadSyscallStringArgs(pid uint32, elems []memParserDataElem) ([]string, error) {
+
+	if len(elems) == 0 {
+		return nil, nil
+	}
+
+	name := fmt.Sprintf("/proc/%d/mem", pid)
+	f, err := os.Open(name)
+	if err != nil {
+		return nil, fmt.Errorf("failed to open %s: %s", name, err)
+	}
+	defer f.Close()
+
+	result := make([]string, len(elems))
+
+	reader := bufio.NewReader(f)
+	var line string
+
+	// Iterate through the memory locations passed by caller.
+ for i, e := range elems { + if e.addr == 0 { + result[i] = "" + } else { + reader.Reset(f) + _, err := f.Seek(int64(e.addr), 0) + if err != nil { + return nil, fmt.Errorf("seek of %s failed: %s", name, err) + } + line, err = reader.ReadString('\x00') + if err != nil { + return nil, fmt.Errorf("read of %s at offset %d failed: %s", + name, e.addr, err) + } + result[i] = strings.TrimSuffix(line, "\x00") + } + } + + return result, nil +} + +// ReadSyscallBytesArgs iterates through the tracee's process /proc/pid/mem file to +// identify arbitrary byte data arguments utilized by the traced syscall. +func (mp *memParserProcfs) ReadSyscallBytesArgs(pid uint32, elems []memParserDataElem) ([]string, error) { + + if len(elems) == 0 { + return nil, nil + } + + name := fmt.Sprintf("/proc/%d/mem", pid) + f, err := os.Open(name) + if err != nil { + return nil, fmt.Errorf("failed to open %s: %s", name, err) + } + defer f.Close() + + result := make([]string, len(elems)) + reader := bufio.NewReader(f) + + for i, e := range elems { + if e.addr == 0 { + result[i] = string([]byte{}) + } else { + reader.Reset(f) + _, err := f.Seek(int64(e.addr), 0) + if err != nil { + return nil, fmt.Errorf("seek of %s failed: %s", name, err) + } + + // read the number of bytes specified by "size" (exactly) + byteData := make([]byte, e.size) + _, err = io.ReadFull(reader, byteData) + if err != nil { + return nil, fmt.Errorf("read of %s at offset %d with size %d failed: %s", + name, e.addr, e.size, err) + } + + result[i] = string(byteData) + } + } + + return result, nil +} + +// WriteSyscallBytesArgs writes collected state (i.e. syscall responses) into the +// the tracee's address space. This is accomplished by writing into the tracee's +// process /proc/pid/mem file. 
+func (mp *memParserProcfs) WriteSyscallBytesArgs(pid uint32, elems []memParserDataElem) error { + + if len(elems) == 0 { + return nil + } + + name := fmt.Sprintf("/proc/%d/mem", pid) + f, err := os.OpenFile(name, os.O_RDWR, 0600) + if err != nil { + return fmt.Errorf("failed to open %s: %s", name, err) + } + defer f.Close() + + writer := bufio.NewWriter(f) + + for _, e := range elems { + + data := e.data[:e.size] + + if e.addr == 0 { + continue + } else { + writer.Reset(f) + _, err := f.Seek(int64(e.addr), 0) + if err != nil { + return fmt.Errorf("seek of %s failed: %s", name, err) + } + + _, err = writer.Write(data) + if err != nil { + return fmt.Errorf("write of %s at offset %d with size %d failed: %s", + name, e.addr, e.size, err) + } + + if err = writer.Flush(); err != nil { + return fmt.Errorf("write of %s at offset %d with size %d failed: %s", + name, e.addr, e.size, err) + } + } + } + + return nil +} diff --git a/sysbox-fs/seccomp/mount.go b/sysbox-fs/seccomp/mount.go new file mode 100644 index 00000000..4eb5beff --- /dev/null +++ b/sysbox-fs/seccomp/mount.go @@ -0,0 +1,1096 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package seccomp + +import ( + "fmt" + "path/filepath" + "strings" + "syscall" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// MountSyscall information structure. 
+type mountSyscallInfo struct { + syscallCtx // syscall generic info + *domain.MountSyscallPayload // mount-syscall specific details +} + +// Mount syscall processing wrapper instruction. +func (m *mountSyscallInfo) process() (*sysResponse, error) { + + mts := m.tracer.service.mts + if mts == nil { + return nil, fmt.Errorf("unexpected mount-service handler") + } + mh := mts.MountHelper() + if mh == nil { + return nil, fmt.Errorf("unexpected mount-service-helper handler") + } + + // Adjust mount attributes attending to the process' root path. + m.targetAdjust() + + // Ensure that the mountInfoDB corresponding to the sys-container hosting + // this process has been already built. This info is necessary to be able + // to discern between 'initial' and 'regular' mounts, which is required + // for the proper operation of the mount-hardening feature. + if !m.cntr.IsMountInfoInitialized() { + if err := m.cntr.InitializeMountInfo(); err != nil { + return nil, err + } + } + + // Handle requests that create a new mountpoint for filesystems managed by + // sysbox-fs. + if mh.IsNewMount(m.Flags) { + + mip, err := mts.NewMountInfoParser(m.cntr, m.processInfo, true, true, false) + if err != nil { + return nil, err + } + + switch m.FsType { + case "proc": + return m.processProcMount(mip) + case "sysfs": + return m.processSysMount(mip) + case "overlay": + return m.processOverlayMount(mip) + case "nfs": + return m.processNfsMount(mip) + } + } + + // Mount moves are handled by the kernel + if mh.IsMove(m.Flags) { + return m.tracer.createContinueResponse(m.reqId), nil + } + + // Handle propagation type changes on filesystems managed by sysbox-fs (no + // action required; let the kernel handle mount propagation changes). 
+ if mh.HasPropagationFlag(m.Flags) { + return m.tracer.createContinueResponse(m.reqId), nil + } + + // Handle remount requests on filesystems managed by sysbox-fs + if mh.IsRemount(m.Flags) { + + mip, err := mts.NewMountInfoParser(m.cntr, m.processInfo, true, true, false) + if err != nil { + return nil, err + } + + if ok, resp := m.remountAllowed(mip); !ok { + return resp, nil + } + + if mip.IsSysboxfsBaseMount(m.Target) || + mip.IsSysboxfsSubmount(m.Target) { + return m.processRemount(mip) + } + + // No action by sysbox-fs + return m.tracer.createContinueResponse(m.reqId), nil + } + + // Handle bind-mount requests on filesystems managed by sysbox-fs. + if mh.IsBind(m.Flags) { + + mip, err := mts.NewMountInfoParser(m.cntr, m.processInfo, true, true, false) + if err != nil { + return nil, err + } + + // Ignore binds-to-self requests on sysbox-fs managed submounts (these + // are already bind-mounts, so we want to avoid the redundant bind mount + // for cosmetic purposes). + if m.Source == m.Target && mip.IsSysboxfsSubmount(m.Target) { + logrus.Debugf("Ignoring bind-to-self request of sysbox-fs managed submount at %s", + m.Target) + return m.tracer.createSuccessResponse(m.reqId), nil + } + + // Ignore /dev/null bind mounts on sysbox-fs managed submounts which are + // already bind-mounted to /dev/null (i.e., masked). + if m.Source == "/dev/null" && mip.IsSysboxfsMaskedSubmount(m.Target) { + logrus.Debugf("Ignoring /dev/null bind request over sysbox-fs masked submount at %s", + m.Target) + return m.tracer.createSuccessResponse(m.reqId), nil + } + + // Process bind-mounts whose source is a sysbox-fs base mount (as we + // want the submounts to also be bind-mounted at the target). 
+ if m.Source != m.Target && mip.IsSysboxfsBaseMount(m.Source) { + return m.processBindMount(mip) + } + + // No action by sysbox-fs + return m.tracer.createContinueResponse(m.reqId), nil + } + + // No action by sysbox-fs otherwise + return m.tracer.createContinueResponse(m.reqId), nil +} + +// Method handles procfs mount syscall requests. As part of this function, we +// also create submounts under procfs (to expose, hide, or emulate resources). +func (m *mountSyscallInfo) processProcMount( + mip domain.MountInfoParserIface) (*sysResponse, error) { + + logrus.Debugf("Processing new procfs mount: %v", m) + + // Create instructions payload. + payload := m.createProcPayload(mip) + if payload == nil { + return nil, fmt.Errorf("Could not construct procMount payload") + } + + // Create nsenter-event envelope. + nss := m.tracer.service.nss + event := nss.NewEvent( + m.syscallCtx.pid, + &domain.AllNSs, + 0, + &domain.NSenterMessage{ + Type: domain.MountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := m.tracer.createErrorResponse( + m.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + // Chown the proc mount to the requesting process' uid:gid (typically + // root:root) as otherwise it will show up as "nobody:nogroup". + // + // NOTE: for now we skip the chown if the mount is read-only, as otherwise + // the chown will fail. This means that read-only mounts of proc will still + // show up as "nobody:nouser" inside the sys container (e.g., in inner + // containers). Solving this would require that we first mount proc, then + // chown, then remount read-only. 
This would in turn require 3 nsenter + // events, because the namespaces that we must enter for each are not the + // same (in particular for the chown to succeed, we must not enter the + // user-ns of the container). + + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + return m.tracer.createSuccessResponse(m.reqId), nil + } + + ci := &chownSyscallInfo{ + path: m.Target, + ownerUid: int64(m.uid), + ownerGid: int64(m.gid), + } + + ci.syscallCtx.reqId = m.reqId + ci.syscallCtx.pid = m.pid + ci.syscallCtx.tracer = m.tracer + + return ci.processChownNSenter(domain.AllNSsButUser) +} + +// Build instructions payload required to mount "/proc" subtree. +func (m *mountSyscallInfo) createProcPayload( + mip domain.MountInfoParserIface) *[]*domain.MountSyscallPayload { + + var payload []*domain.MountSyscallPayload + + // Payload instruction for original "/proc" mount request. + payload = append(payload, m.MountSyscallPayload) + + // If procfs has a read-only attribute at super-block level, we must also + // apply this to the new mountpoint (otherwise we will get a permission + // denied from the kernel when doing the mount). + procInfo := mip.GetInfo("/proc") + if procInfo != nil { + if _, ok := procInfo.VfsOptions["ro"]; ok { + payload[0].Flags |= unix.MS_RDONLY + } + } + + mh := m.tracer.service.mts.MountHelper() + + // Sysbox-fs "/proc" bind-mounts. + procBindMounts := mh.ProcMounts() + for _, v := range procBindMounts { + relPath := strings.TrimPrefix(v, "/proc") + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: v, + Target: filepath.Join(m.Target, relPath), + FsType: "", + Flags: unix.MS_BIND, + Data: "", + }, + } + payload = append(payload, newelem) + } + + // Container-specific read-only paths. 
+ procRoPaths := m.cntr.ProcRoPaths() + for _, v := range procRoPaths { + if !domain.FileExists(v) { + continue + } + relPath := strings.TrimPrefix(v, "/proc") + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: v, + Target: filepath.Join(m.Target, relPath), + FsType: "", + Flags: unix.MS_BIND, + Data: "", + }, + } + payload = append(payload, newelem) + } + + // Container-specific masked paths. + procMaskPaths := m.cntr.ProcMaskPaths() + for _, v := range procMaskPaths { + if !domain.FileExists(v) { + continue + } + relPath := strings.TrimPrefix(v, "/proc") + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: v, + Target: filepath.Join(m.Target, relPath), + FsType: "", + Flags: unix.MS_BIND, + Data: "", + }, + } + payload = append(payload, newelem) + } + + // If "/proc" is to be mounted as read-only, we want this requirement to + // extend to all of its inner bind-mounts. + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + + for _, v := range procBindMounts { + relPath := strings.TrimPrefix(v, "/proc") + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: "", + Target: filepath.Join(m.Target, relPath), + FsType: "", + // TODO: Avoid hard-coding these flags. + Flags: unix.MS_RDONLY | unix.MS_BIND | unix.MS_REMOUNT | unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC, + Data: "", + }, + } + payload = append(payload, newelem) + } + } + + return &payload +} + +// Method handles sysfs mount syscall requests. As part of this function, we +// also create submounts under sysfs (to expose, hide, or emulate resources). +func (m *mountSyscallInfo) processSysMount( + mip domain.MountInfoParserIface) (*sysResponse, error) { + + logrus.Debugf("Processing new sysfs mount: %v", m) + + // Create instruction's payload. 
+ payload := m.createSysPayload(mip) + if payload == nil { + return nil, fmt.Errorf("Could not construct sysfsMount payload") + } + + // Create nsenter-event envelope. + nss := m.tracer.service.nss + event := nss.NewEvent( + m.syscallCtx.pid, + &domain.AllNSs, + 0, + &domain.NSenterMessage{ + Type: domain.MountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := m.tracer.createErrorResponse( + m.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return m.tracer.createSuccessResponse(m.reqId), nil +} + +// Build instructions payload required to mount "/sys" subtree. +func (m *mountSyscallInfo) createSysPayload( + mip domain.MountInfoParserIface) *[]*domain.MountSyscallPayload { + + var payload []*domain.MountSyscallPayload + + // Payload instruction for original "/sys" mount request. + payload = append(payload, m.MountSyscallPayload) + + // If sysfs has a read-only attribute at super-block level, we must also + // apply this to the new mountpoint (otherwise we will get a permission + // denied from the kernel when doing the mount). + sysInfo := mip.GetInfo("/sys") + if sysInfo != nil { + if _, ok := sysInfo.VfsOptions["ro"]; ok { + payload[0].Flags |= unix.MS_RDONLY + } + } + + mh := m.tracer.service.mts.MountHelper() + + // Sysbox-fs "/sys" bind-mounts. 
+ sysBindMounts := mh.SysMounts() + for _, v := range sysBindMounts { + relPath := strings.TrimPrefix(v, "/sys") + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: v, + Target: filepath.Join(m.Target, relPath), + FsType: "", + Flags: unix.MS_BIND, + Data: "", + }, + } + payload = append(payload, newelem) + } + + // If "/sys" is to be mounted as read-only, we want this requirement to + // extend to all of its inner bind-mounts. + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + + for _, v := range sysBindMounts { + relPath := strings.TrimPrefix(v, "/sys") + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: "", + Target: filepath.Join(m.Target, relPath), + FsType: "", + // TODO: Avoid hard-coding these flags. + Flags: unix.MS_RDONLY | unix.MS_BIND | unix.MS_REMOUNT | unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC, + Data: "", + }, + } + payload = append(payload, newelem) + } + } + + return &payload +} + +// Method handles overlayfs mount syscall requests. +func (m *mountSyscallInfo) processOverlayMount( + mip domain.MountInfoParserIface) (*sysResponse, error) { + + logrus.Debugf("Processing new overlayfs mount: %v", m) + + // Notice that, in chroot scenarios, we are undoing the previous call to + // targetAdjust() to avoid the need to mess around with the paths in the + // 'data' object. Once within the 'nsenter' context, we will adjust all + // path elements by doing a chroot() as part of the personality-adjustment + // logic. + m.targetUnadjust() + + // Create instructions payload. + payload := m.createOverlayMountPayload(mip) + if payload == nil { + return nil, fmt.Errorf("Could not construct overlayMount payload") + } + + // Create nsenter-event envelope. 
+ nss := m.tracer.service.nss + event := nss.NewEvent( + m.syscallCtx.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.MountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := m.tracer.createErrorResponse( + m.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return m.tracer.createSuccessResponse(m.reqId), nil +} + +// Build instructions payload required for overlay-mount operations. +func (m *mountSyscallInfo) createOverlayMountPayload( + mip domain.MountInfoParserIface) *[]*domain.MountSyscallPayload { + + var payload []*domain.MountSyscallPayload + + // Create a process struct to represent the process generating the 'mount' + // instruction, and extract its capabilities to hand them out to 'nsenter' + // logic. + process := m.tracer.service.prs.ProcessCreate(m.pid, 0, 0) + + // Payload instruction for overlayfs mount request. + payload = append(payload, m.MountSyscallPayload) + + // Insert appended fields. + payload[0].Header = domain.NSenterMsgHeader{ + Pid: m.pid, + Uid: m.uid, + Gid: m.gid, + Root: m.root, + Cwd: m.cwd, + Capabilities: process.GetEffCaps(), + } + + return &payload +} + +// Method handles "nfs" mount syscall requests. Sysbox-fs does not manage nfs +// mounts per-se, but only "proxies" the nfs mount syscall. It does this in +// order to enable nfs to be mounted from within a (non init) user-ns. +func (m *mountSyscallInfo) processNfsMount( + mip domain.MountInfoParserIface) (*sysResponse, error) { + + logrus.Debugf("Processing new nfs mount: %v", m) + + // Create instruction's payload. 
+ payload := m.createNfsMountPayload(mip) + if payload == nil { + return nil, fmt.Errorf("Could not construct nfsMount payload") + } + + // Create nsenter-event envelope + nss := m.tracer.service.nss + event := nss.NewEvent( + m.syscallCtx.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.MountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := m.tracer.createErrorResponse( + m.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return m.tracer.createSuccessResponse(m.reqId), nil +} + +// Build instructions payload required for remount operations. +func (m *mountSyscallInfo) createNfsMountPayload( + mip domain.MountInfoParserIface) *[]*domain.MountSyscallPayload { + + var payload []*domain.MountSyscallPayload + + // Payload instruction for re-mount request. + payload = append(payload, m.MountSyscallPayload) + + return &payload +} + +// remountAllowed purpose is to prevent certain remount operations from +// succeeding, such as preventing RO mountpoints to be remounted as RW. +// +// Method will return 'true' when the remount operation is deemed legit, and +// will return 'false' otherwise. +func (m *mountSyscallInfo) remountAllowed( + mip domain.MountInfoParserIface) (bool, *sysResponse) { + + mh := m.tracer.service.mts.MountHelper() + + // Skip verification process if explicitly requested by the user. By default, + // remount operations of RO immutables are not allowed. + if m.tracer.service.allowImmutableRemounts { + return true, nil + } + + // Skip instructions targeting file-systems explicitly handled by sysbox-fs. + if m.FsType == "proc" || m.FsType == "sysfs" { + return true, nil + } + + // Allow operation if it attempts to remount target as read-only. 
+	if mh.IsReadOnlyMount(m.Flags) {
+		return true, nil
+	}
+
+	// There must be mountinfo state present for this target. Otherwise, return
+	// error back to the user.
+	info := mip.GetInfo(m.Target)
+	if info == nil {
+		return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL)
+	}
+
+	// Allow operation if the remount target is a read-write mountpoint.
+	if !mip.IsRoMount(info) {
+		return true, nil
+	}
+
+	//
+	// The following scenarios are relevant within the context of this function
+	// and will be handled separately to ease the logic comprehension and its
+	// maintainability / debuggability.
+	//
+	// The different columns in this table denote the 'context' in which the
+	// remount process is executing, and thereby, dictate the logic chosen
+	// to handle each remount request.
+	//
+	//  +-----------+--------------+--------------+----------+
+	//  | Scenarios | Unshare(mnt) | Pivot-root() | Chroot() |
+	//  +-----------+--------------+--------------+----------+
+	//  |     1     |      no      |      no      |    no    |
+	//  |     2     |      no      |     yes      |    no    |
+	//  |     3     |      no      |      no      |   yes    |
+	//  |     4     |      no      |     yes      |   yes    |
+	//  |     5     |     yes      |      no      |    no    |
+	//  |     6     |     yes      |     yes      |    no    |
+	//  |     7     |     yes      |      no      |   yes    |
+	//  |     8     |     yes      |     yes      |   yes    |
+	//  +-----------+--------------+--------------+----------+
+	//
+
+	// Identify the mount-ns of the process launching the remount to compare it
+	// with the one of the sys container's initpid. In the unlikely case of an
+	// error, let the kernel deal with it.
+	processMountNs, err := m.processInfo.MountNsInode()
+	if err != nil {
+		return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL)
+	}
+	initProcMountNs, err := m.cntr.InitProc().MountNsInode()
+	if err != nil {
+		return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL)
+	}
+
+	// Obtain the sys-container's root-path inode.
+	syscntrRootInode := m.cntr.InitProc().RootInode()
+
+	// If process' mount-ns matches the sys-container's one, then we can simply
+	// rely on the target's mountID to discern an immutable target from a
+	// regular one. Otherwise, we cannot rely on the mountID field, as the values
+	// allocated by kernel for these very mountpoints will differ in other mount
+	// namespaces.
+	if processMountNs == initProcMountNs {
+
+		var (
+			immutable          bool
+			bindmountImmutable bool
+		)
+
+		if ok := m.cntr.IsImmutableRoMountID(info.MountID); ok {
+			logrus.Infof("Rejected remount operation over read-only immutable target: %s",
+				m.Target)
+			immutable = true
+		}
+
+		if !immutable {
+			if ok := m.cntr.IsImmutableRoBindMount(info); ok {
+				logrus.Infof("Rejected remount operation over bind-mount to read-only immutable target: %s",
+					m.Target)
+				bindmountImmutable = true
+			}
+		}
+
+		if !immutable && !bindmountImmutable {
+			return true, nil
+		}
+
+		if logrus.IsLevelEnabled(logrus.DebugLevel) {
+			if m.processInfo.Root() == "/" {
+				processRootInode := m.processInfo.RootInode()
+
+				// Scenario 1): no-unshare(mnt) & no-pivot() & no-chroot()
+				if processRootInode == syscntrRootInode {
+					logrus.Debug("Rejected remount operation -- scenario 1")
+				}
+
+				// Scenario 2): no-unshare(mnt) & pivot() & no-chroot()
+				if processRootInode != syscntrRootInode {
+					logrus.Debug("Rejected remount operation -- scenario 2")
+				}
+			}
+
+			if m.processInfo.Root() != "/" {
+				// We are dealing with a chroot'ed process, so obtain the inode of "/"
+				// as seen within the process' namespaces, and *not* the one associated
+				// to the process' root-path.
+ processRootInode, err := mip.ExtractInode("/") + if err != nil { + return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL) + } + + // Scenario 3): no-unshare(mnt) & no-pivot() & chroot() + if processRootInode == syscntrRootInode { + logrus.Debug("Rejected remount operation -- scenario 3") + } + + // Scenario 4): no-unshare(mnt) & pivot() & chroot() + if processRootInode != syscntrRootInode { + logrus.Debug("Rejected remount operation -- scenario 4") + } + } + } + + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + + } else { + + if m.processInfo.Root() == "/" { + processRootInode := m.processInfo.RootInode() + + // Scenario 5): unshare(mnt) & no-pivot() & no-chroot() + if processRootInode == syscntrRootInode { + + // We need to check if we're dealing with an overlapped mount, as + // this is a case that we usually (see exception below) want to + // allow. + if mip.IsOverlapMount(info) { + // The exception mentioned above refer to the scenario where + // the overlapped mountpoint is an immutable itself, hence the + // checkpoint below. + if m.cntr.IsImmutableOverlapMountpoint(info.MountPoint) { + logrus.Infof("Rejected remount operation over immutable overlapped target: %s (scenario 5)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + return true, nil + } + + // In this scenario we have full access to all the mountpoints + // within the sys-container (different mount-id though), so we + // can safely rely on their mountinfo attributes to determine + // resource's immutability. 
+ if m.cntr.IsImmutableRoMountpoint(info.MountPoint) { + logrus.Infof("Rejected remount operation over read-only immutable target: %s (scenario 5)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + if ok := m.cntr.IsImmutableRoBindMount(info); ok { + logrus.Infof("Rejected remount operation over bind-mount to read-only immutable target: %s (scenario 5)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + return true, nil + } + + // Scenario 6): unshare(mnt) & pivot() & no-chroot() + if processRootInode != syscntrRootInode { + isImmutable, err := m.cntr.IsImmutableRoMount(info) + if err != nil { + return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL) + } + if isImmutable { + logrus.Infof("Rejected remount operation over read-only immutable target: %s (scenario 6)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + if ok := m.cntr.IsImmutableRoBindMount(info); ok { + logrus.Infof("Rejected remount operation over bind-mount to read-only-immutable target: %s (scenario 6)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + return true, nil + } + } + + if m.processInfo.Root() != "/" { + // We are dealing with a chroot'ed process, so obtain the inode of "/" + // as seen within the process' namespaces, and *not* the one associated + // to the process' root-path. + processRootInode, err := mip.ExtractInode("/") + if err != nil { + return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL) + } + + // Scenario 7): unshare(mnt) & no-pivot() & chroot() + if processRootInode == syscntrRootInode { + + // We need to check if we're dealing with an overlapped mount, as + // this is a case that we usually (see exception below) want to + // allow. 
+ if mip.IsOverlapMount(info) { + // The exception mentioned above refer to the scenario where + // the overlapped mountpoint is an immutable itself, hence the + // checkpoint below. + if m.cntr.IsImmutableOverlapMountpoint(info.MountPoint) { + logrus.Infof("Rejected remount operation over immutable overlapped target: %s (scenario 7)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + return true, nil + } + + // In this scenario we have full access to all the mountpoints + // within the sys-container (different mount-id though), so we + // can safely rely on their mountinfo attributes to determine + // resource's immutability. + if m.cntr.IsImmutableRoMountpoint(info.MountPoint) { + logrus.Infof("Rejected remount operation over read-only immutable target: %s (scenario 7)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + if ok := m.cntr.IsImmutableRoBindMount(info); ok { + logrus.Infof("Rejected remount operation over bind-mount to read-only immutable target: %s (scenario 7)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + return true, nil + } + + // Scenario 8): unshare(mnt) & pivot() & chroot() + if processRootInode != syscntrRootInode { + isImmutable, err := m.cntr.IsImmutableRoMount(info) + if err != nil { + return false, m.tracer.createErrorResponse(m.reqId, syscall.EINVAL) + } + if isImmutable { + logrus.Infof("Rejected remount operation over read-only immutable target: %s (scenario 8)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + if ok := m.cntr.IsImmutableRoBindMount(info); ok { + logrus.Infof("Rejected remount operation over bind-mount to read-only immutable target: %s (scenario 8)", + m.Target) + return false, m.tracer.createErrorResponse(m.reqId, syscall.EPERM) + } + + return true, nil + } + } + } + + return true, nil +} + +func (m *mountSyscallInfo) processRemount( + mip 
domain.MountInfoParserIface) (*sysResponse, error) { + + logrus.Debugf("Processing re-mount: %v", m) + + // Create instruction's payload. + payload := m.createRemountPayload(mip) + if payload == nil { + return nil, fmt.Errorf("Could not construct ReMount payload") + } + + // Create nsenter-event envelope. + nss := m.tracer.service.nss + event := nss.NewEvent( + m.syscallCtx.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.MountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := m.tracer.createErrorResponse( + m.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return m.tracer.createSuccessResponse(m.reqId), nil +} + +// Build instructions payload required for remount operations. +func (m *mountSyscallInfo) createRemountPayload( + mip domain.MountInfoParserIface) *[]*domain.MountSyscallPayload { + + var payload []*domain.MountSyscallPayload + + mh := m.tracer.service.mts.MountHelper() + + // A procfs mount inside a sys container is a combination of a base proc + // mount plus sysbox-fs submounts. If the remount is done on the base mount, + // its effect is also applied to the submounts. If the remount is on a + // submount, its effect is limited to that submount. 
+ + submounts := []string{} + + if mip.IsSysboxfsBaseMount(m.Target) { + submounts = mip.GetSysboxfsSubMounts(m.Target) + } else { + submounts = append(submounts, m.Target) + } + + for _, subm := range submounts { + submInfo := mip.GetInfo(subm) + + perMountFlags := mh.StringToFlags(submInfo.Options) + perFsFlags := mh.StringToFlags(submInfo.VfsOptions) + submFlags := perMountFlags | perFsFlags + + // Pass the remount flags to the submounts + submFlags |= unix.MS_REMOUNT + + // The submounts must always be remounted with "MS_BIND" to ensure that + // only the submounts are affected. Otherwise, the remount effect + // applies at the sysbox-fs fuse level, causing weird behavior (e.g., + // remounting /proc as read-only would cause all sysbox-fs managed + // submounts under /sys to become read-only too!). + submFlags |= unix.MS_BIND + + // We only propagate changes to the MS_RDONLY flag to the submounts. In + // the future we could propagate other flags too. + // + // For MS_RDONLY: + // + // When set, we apply the read-only flag on all submounts. When cleared, + // we apply the read-write flag on all submounts which are not mounted + // as read-only in the container's /proc. + + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + submFlags |= unix.MS_RDONLY + } else { + if !mip.IsSysboxfsRoSubmount(subm) { + submFlags = submFlags &^ unix.MS_RDONLY + } + } + + // Leave the filesystem options (aka data) unchanged; note that since + // mountinfo provides them mixed with flags, we must filter the options + // out. 
+ submOpts := mh.FilterFsFlags(submInfo.VfsOptions) + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: "", + Target: subm, + FsType: "", + Flags: submFlags, + Data: submOpts, + }, + } + payload = append(payload, newelem) + } + + if mip.IsSysboxfsBaseMount(m.Target) { + payload = append(payload, m.MountSyscallPayload) + } + + return &payload +} + +// Method handles bind-mount requests whose source is a mountpoint managed by +// sysbox-fs. +func (m *mountSyscallInfo) processBindMount( + mip domain.MountInfoParserIface) (*sysResponse, error) { + + logrus.Debugf("Processing bind mount: %v", m) + + // Create instruction's payload. + payload := m.createBindMountPayload(mip) + if payload == nil { + return nil, fmt.Errorf("Could not construct ReMount payload") + } + + // Create nsenter-event envelope. + nss := m.tracer.service.nss + event := nss.NewEvent( + m.syscallCtx.pid, + &domain.AllNSs, + 0, + &domain.NSenterMessage{ + Type: domain.MountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := m.tracer.createErrorResponse( + m.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return m.tracer.createSuccessResponse(m.reqId), nil +} + +// Build instructions payload required for bind-mount operations. +func (m *mountSyscallInfo) createBindMountPayload( + mip domain.MountInfoParserIface) *[]*domain.MountSyscallPayload { + + var payload []*domain.MountSyscallPayload + + // A procfs mount inside a sys container is a combination of a base proc + // mount plus sysbox-fs submounts. If the bind-mount is done on the base + // mount, its effect is also applied to the submounts. 
+ + payload = append(payload, m.MountSyscallPayload) + + // If the bind-mount is recursive, then the kernel will do the remounting + // of the submounts. No need for us to do anything. + if m.Flags&unix.MS_REC == unix.MS_REC { + return &payload + } + + // If the bind-mount is not recursive, then we do the bind-mount of the + // sysbox-fs managed submounts explicitly. + submounts := mip.GetSysboxfsSubMounts(m.Source) + + for _, subm := range submounts { + relTarget := strings.TrimPrefix(subm, m.Source) + subTarget := filepath.Join(m.Target, relTarget) + + newelem := &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: subm, + Target: subTarget, + FsType: "", + Flags: m.Flags, + Data: "", + }, + } + payload = append(payload, newelem) + } + + return &payload +} + +// Method addresses scenarios where the process generating the mount syscall has +// a 'root' attribute different than default one ("/"). This is typically the +// case in chroot'ed environments. Method's goal is to make the required target +// adjustments so that sysbox-fs can carry out the mount in the expected context. +func (m *mountSyscallInfo) targetAdjust() { + + root := m.syscallCtx.root + + if root == "/" { + return + } + + m.Target = filepath.Join(root, m.Target) +} + +// Undo targetAdjust() +func (m *mountSyscallInfo) targetUnadjust() { + + root := m.syscallCtx.root + + if root == "/" { + return + } + + m.Target = strings.TrimPrefix(m.Target, m.root) +} + +func (m *mountSyscallInfo) String() string { + return fmt.Sprintf("source: %s, target: %s, fstype: %s, flags: %#x, data: %s, root: %s, cwd: %s", + m.Source, m.Target, m.FsType, m.Flags, m.Data, m.root, m.cwd) +} diff --git a/sysbox-fs/seccomp/pidTracker.go b/sysbox-fs/seccomp/pidTracker.go new file mode 100644 index 00000000..36318220 --- /dev/null +++ b/sysbox-fs/seccomp/pidTracker.go @@ -0,0 +1,106 @@ +// +// Copyright 2019-2022 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package seccomp + +import ( + "sync" +) + +// The seccompNotifPidTracker helps serialize the processing of seccomp +// notifications per thread, so that only one seccomp notif is processed per +// thread-id (pid) at any given time. + +type seccompNotifPidTracker struct { + mu sync.RWMutex + pidTable map[uint32]*pidData +} + +type pidData struct { + refcnt int + mu sync.Mutex +} + +func newSeccompNotifPidTracker() *seccompNotifPidTracker { + return &seccompNotifPidTracker{ + pidTable: make(map[uint32]*pidData), + } +} + +// Adds the given pid to the tracker's table of tracked pids. +func (t *seccompNotifPidTracker) track(pid uint32) { + t.mu.Lock() + defer t.mu.Unlock() + + // If pid not present in pidTable, add entry with count = 1; else increase + // the pid's refcount. + pd, ok := t.pidTable[pid] + if !ok { + t.pidTable[pid] = &pidData{refcnt: 1} + } else { + pd.refcnt++ + t.pidTable[pid] = pd + } +} + +// Removes the given pid from the tracker's table of tracked pids. +func (t *seccompNotifPidTracker) untrack(pid uint32) { + t.mu.Lock() + defer t.mu.Unlock() + + pd, ok := t.pidTable[pid] + if !ok { + return + } + + pd.refcnt-- + + if pd.refcnt > 0 { + t.pidTable[pid] = pd + } else { + delete(t.pidTable, pid) + } +} + +// Requests a lock on the given pid. Blocks if another process has the lock. 
+func (t *seccompNotifPidTracker) Lock(pid uint32) { + t.track(pid) + + t.mu.RLock() + pd, ok := t.pidTable[pid] + t.mu.RUnlock() + if !ok { + return + } + + // Grab the per-pid lock + pd.mu.Lock() +} + +// Releases the lock on the given pid. Must be called after Lock(). +func (t *seccompNotifPidTracker) Unlock(pid uint32) { + t.mu.RLock() + pd, ok := t.pidTable[pid] + t.mu.RUnlock() + if !ok { + return + } + + // Release the per-pid lock + pd.mu.Unlock() + + t.untrack(pid) +} diff --git a/sysbox-fs/seccomp/syscall.go b/sysbox-fs/seccomp/syscall.go new file mode 100644 index 00000000..1f7c4e67 --- /dev/null +++ b/sysbox-fs/seccomp/syscall.go @@ -0,0 +1,36 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package seccomp + +import ( + "github.com/nestybox/sysbox-fs/domain" +) + +// Syscall generic information / state. 
+type syscallCtx struct { + syscallNum int32 // Value representing the syscall + syscallName string // Name of the syscall + reqId uint64 // Id associated to the syscall request + pid uint32 // Pid of the process generating the syscall + uid uint32 // Uid of the process generating the syscall + gid uint32 // Gid of the process generating the syscall + cwd string // Cwd of process generating the syscall + root string // Root of process generating the syscall + processInfo domain.ProcessIface // Process details associated to the syscall request + cntr domain.ContainerIface // Container hosting the process generating the syscall + tracer *syscallTracer // Backpointer to the seccomp-tracer owning the syscall +} diff --git a/sysbox-fs/seccomp/tracer.go b/sysbox-fs/seccomp/tracer.go new file mode 100644 index 00000000..4873340b --- /dev/null +++ b/sysbox-fs/seccomp/tracer.go @@ -0,0 +1,1263 @@ +// +// Copyright 2019-2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package seccomp + +import ( + "fmt" + "net" + "path/filepath" + "sync" + "syscall" + + "C" + + "github.com/nestybox/sysbox-fs/domain" + unixIpc "github.com/nestybox/sysbox-ipc/unix" + "github.com/nestybox/sysbox-libs/formatter" + linuxUtils "github.com/nestybox/sysbox-libs/linuxUtils" + libpidfd "github.com/nestybox/sysbox-libs/pidfd" + libseccomp "github.com/seccomp/libseccomp-golang" + "golang.org/x/sys/unix" + + "github.com/sirupsen/logrus" +) + +const seccompTracerSockAddr = "/run/sysbox/sysfs-seccomp.sock" + +// libseccomp req/resp aliases. +type sysRequest = libseccomp.ScmpNotifReq +type sysResponse = libseccomp.ScmpNotifResp + +// Slice of supported syscalls to monitor. +var monitoredSyscalls = []string{ + "mount", + "umount2", + "reboot", + "swapon", + "swapoff", + "chown", + "fchown", + "fchownat", + "setxattr", + "lsetxattr", + "fsetxattr", + "getxattr", + "lgetxattr", + "fgetxattr", + "removexattr", + "lremovexattr", + "fremovexattr", + "listxattr", + "llistxattr", + "flistxattr", +} + +// Seccomp's syscall-monitoring/trapping service struct. External packages +// will solely rely on this struct for their syscall-monitoring demands. 
+type SyscallMonitorService struct { + nss domain.NSenterServiceIface // for nsenter functionality requirements + css domain.ContainerStateServiceIface // for container-state interactions + prs domain.ProcessServiceIface // for process class interactions + mts domain.MountServiceIface // for mount-services purposes + allowImmutableRemounts bool // allow immutable mounts to be remounted + allowImmutableUnmounts bool // allow immutable mounts to be unmounted + closeSeccompOnContExit bool // close seccomp fds on container exit, not on process exit + tracer *syscallTracer // pointer to actual syscall-tracer instance +} + +func NewSyscallMonitorService() *SyscallMonitorService { + return &SyscallMonitorService{} +} + +func (scs *SyscallMonitorService) Setup( + nss domain.NSenterServiceIface, + css domain.ContainerStateServiceIface, + prs domain.ProcessServiceIface, + mts domain.MountServiceIface, + allowImmutableRemounts bool, + allowImmutableUnmounts bool, + seccompFdReleasePolicy string) { + + scs.nss = nss + scs.css = css + scs.prs = prs + scs.mts = mts + scs.allowImmutableRemounts = allowImmutableRemounts + scs.allowImmutableUnmounts = allowImmutableUnmounts + + if seccompFdReleasePolicy == "cont-exit" { + scs.closeSeccompOnContExit = true + } + + // Allocate a new syscall-tracer. + scs.tracer = newSyscallTracer(scs) + + // Initialize and launch the syscall-tracer. + if err := scs.tracer.start(); err != nil { + logrus.Fatalf("syscallMonitorService initialization error (%v). Exiting ...", + err) + } +} + +type seccompArchSyscallPair struct { + archId libseccomp.ScmpArch + syscallId libseccomp.ScmpSyscall +} + +// SeccompSession holds state associated to every seccomp tracee session. 
+type seccompSession struct { + pid uint32 // pid of the tracee process + fd int32 // tracee's seccomp-fd to allow kernel interaction + pidfd int32 // fd associated to tracee's pid to influence poll() cycle + cntrId string // container(id) on which each seccomp session lives +} + +// Seccomp's syscall-monitor/tracer. +type syscallTracer struct { + srv *unixIpc.Server // unix server listening to seccomp-notifs + pollsrv *unixIpc.PollServer // unix pollserver for non-blocking i/o on seccomp-fd + syscalls map[seccompArchSyscallPair]string // hashmap of supported syscalls, indexed by seccomp architecture and syscall id + memParser memParser // memParser to utilize for tracee interactions + seccompSessionCMap map[string][]seccompSession // tracks all seccomp sessions associated with a given container + pidToContMap map[uint32]string // maps pid -> container id + seccompSessionMu sync.RWMutex // seccomp session table lock + seccompUnusedNotif bool // seccomp-fd unused notification feature supported by kernel + seccompNotifPidTrk *seccompNotifPidTracker // Ensures seccomp notifs for the same pid are processed sequentially (not in parallel). + service *SyscallMonitorService // backpointer to syscall-monitor service +} + +func getSupportedCompatibleSyscalls(nativeArchId libseccomp.ScmpArch) map[libseccomp.ScmpArch][]string { + switch nativeArchId { + case libseccomp.ArchAMD64: + return map[libseccomp.ScmpArch][]string{ + libseccomp.ArchAMD64: monitoredSyscalls, + // TODO: Add x86 specific syscalls such as chown32 + libseccomp.ArchX86: monitoredSyscalls, + } + default: + return map[libseccomp.ScmpArch][]string{ + nativeArchId: monitoredSyscalls, + } + } +} + +// syscallTracer constructor. 
+func newSyscallTracer(sms *SyscallMonitorService) *syscallTracer { + + tracer := &syscallTracer{ + service: sms, + syscalls: make(map[seccompArchSyscallPair]string), + } + + if sms.closeSeccompOnContExit { + tracer.seccompSessionCMap = make(map[string][]seccompSession) + tracer.pidToContMap = make(map[uint32]string) + } + + // Populate hashmap of supported syscalls to monitor. + nativeArchId, err := libseccomp.GetNativeArch() + if err != nil { + logrus.Warnf("Seccomp-tracer initialization error: Error obtaining native architecture") + return nil + } + + for archId, syscalls := range getSupportedCompatibleSyscalls(nativeArchId) { + for _, syscall := range syscalls { + syscallId, err := libseccomp.GetSyscallFromNameByArch(syscall, archId) + if err != nil { + logrus.Warnf("Seccomp-tracer initialization error: unknown syscall (%v, %v).", + archId, syscall) + return nil + } + tracer.syscalls[seccompArchSyscallPair{archId, syscallId}] = syscall + } + } + + // Elect the memParser to utilize based on the availability of process_vm_readv() + // syscall. + _, err = unix.ProcessVMReadv(int(1), nil, nil, 0) + if err == syscall.ENOSYS { + tracer.memParser = &memParserProcfs{} + logrus.Info("Procfs memParser elected") + } else { + tracer.memParser = &memParserIOvec{} + logrus.Info("IOvec memParser elected") + } + + // Seccomp-fd's unused notification feature is provided by kernel starting with v5.8. + cmp, err := linuxUtils.KernelCurrentVersionCmp(5, 8) + if err != nil { + logrus.Warnf("Seccomp-tracer initialization error: unable to parse kernel string (%v).", + err) + return nil + } + if cmp >= 0 { + tracer.seccompUnusedNotif = true + } + + tracer.seccompNotifPidTrk = newSeccompNotifPidTracker() + + return tracer +} + +// Start syscall tracer. +func (t *syscallTracer) start() error { + + // Enforce proper support of seccomp-monitoring capabilities by the existing + // kernel; bail otherwise. 
+ api, err := libseccomp.GetAPI() + if err != nil { + logrus.Errorf("Error while obtaining seccomp API level (%v).", err) + return err + } else if api < 5 { + logrus.Errorf("Error: need seccomp API level >= 5; it's currently %d", api) + return fmt.Errorf("Error: unsupported kernel") + } + + // Launch a new server to listen to seccomp-tracer's socket. Incoming messages + // will be handled through a separated / dedicated goroutine. + srv, err := unixIpc.NewServer(seccompTracerSockAddr, t.connHandler) + if err != nil { + logrus.Errorf("Unable to initialize seccomp-tracer server") + return err + } + t.srv = srv + + return nil +} + +func (t *syscallTracer) seccompSessionAdd(s seccompSession) { + + t.seccompSessionMu.Lock() + + if t.service.closeSeccompOnContExit { + + // Return if seccomp session's pid is already registered. + if _, ok := t.pidToContMap[s.pid]; ok { + t.seccompSessionMu.Unlock() + return + } + + // Collect seccomp fds associated with container so we can + // release them together when the container dies. + t.pidToContMap[s.pid] = s.cntrId + sessions, ok := t.seccompSessionCMap[s.cntrId] + if ok { + sessions = append(sessions, s) + t.seccompSessionCMap[s.cntrId] = sessions + } else { + t.seccompSessionCMap[s.cntrId] = []seccompSession{s} + } + } + + t.seccompSessionMu.Unlock() + + logrus.Debugf("Created seccomp-tracee session for fd %d, pid %d, cntr-id %s", + s.fd, s.pid, s.cntrId) +} + +func (t *syscallTracer) seccompSessionDelete(s seccompSession) { + var closeFds []int32 + + t.seccompSessionMu.Lock() + + if t.service.closeSeccompOnContExit { + var cntrInitPid uint32 + + cntr := t.service.css.ContainerLookupById(s.cntrId) + if cntr != nil { + cntrInitPid = cntr.InitPid() + } + + // If the container is no longer there, or the pid being deleted is the + // container's init pid, we close all seccomp sessions for that container. 
+ if cntr == nil || s.pid == cntrInitPid { + sessions := t.seccompSessionCMap[s.cntrId] + for _, s := range sessions { + closeFds = append(closeFds, s.fd) + } + delete(t.seccompSessionCMap, s.cntrId) + } + + delete(t.pidToContMap, s.pid) + + } else { + closeFds = []int32{s.fd} + + // pidfd = 0 implies we are not using pidfd to track the release of the seccomp-fd. + if s.pidfd != 0 { + closeFds = append(closeFds, s.pidfd) + } + } + + t.seccompSessionMu.Unlock() + + if len(closeFds) > 0 { + for _, fd := range closeFds { + // We are finally ready to close the seccomp-fd. + if err := unix.Close(int(fd)); err != nil { + logrus.Errorf("Failed to close seccomp fd %v for pid %d: %v", + s.fd, s.pid, err) + } + } + + logrus.Debugf("Removed session for seccomp-tracee for pid %d, fd(s) %v", + s.pid, closeFds) + } +} + +func (t *syscallTracer) seccompSessionPidfd( + pid int32, + cntrID string, + fd int32) libpidfd.PidFd { + + var ( + pidfd libpidfd.PidFd + err error + ) + + // In scenarios lacking seccomp's unused-filter notifications, we rely on pidfd + // constructs to help us identify the precise time at which we must stop polling + // over seccomp-fds. Within these scenarios we handle the two following cases + // separately attending to the value of the '--seccomp-fd-release' cli knob: + // + // 1) 'Cntr-Exit': In this scenario, all the seccomp sessions make use of the + // same pidfd: the one associated with the container's initPid. By doing this + // we ensure that all seccomp sessions are kept alive until the container's + // initPid dies. + // + // 2) 'Proc-Exit' (default): In this case we want to associate the live-span of + // the seccomp-fd polling session with the one of the user-process that exec() + // into the container's namespaces (e.g., docker exec ). For this purpose + // we obtain a pidfd associated to the user-process pid. 
+ if !t.seccompUnusedNotif { + if t.service.closeSeccompOnContExit { + cntr := t.service.css.ContainerLookupById(cntrID) + if cntr == nil { + logrus.Errorf("Unexpected error during cntr.Lookup(%s) execution on fd %d, pid %d", + cntrID, fd, pid) + return 0 + } + pidfd = cntr.InitPidFd() + + } else { + pidfd, err = libpidfd.Open(int(pid), 0) + if err != nil { + logrus.Errorf("Unexpected error during pidfd.Open() execution (%v) on fd %d, pid %d", + err, fd, pid) + return 0 + } + } + } + + return pidfd +} + +// Tracer's connection-handler method. Executed within a dedicated goroutine (one +// per connection). +func (t *syscallTracer) connHandler(c *net.UnixConn) { + + // Obtain seccomp-notification's file-descriptor and associated context. + pid, cntrID, fd, err := unixIpc.RecvSeccompInitMsg(c) + if err != nil { + return + } + + // Send Ack message back to sysbox-runc. + if err = unixIpc.SendSeccompInitAckMsg(c); err != nil { + return + } + + // If needed, obtain pidfd associated to this seccomp-bfd session. + pidfd := t.seccompSessionPidfd(pid, cntrID, fd) + + // Register the new seccomp-fd session. + session := seccompSession{uint32(pid), fd, int32(pidfd), cntrID} + t.seccompSessionAdd(session) + + for { + var fds []unix.PollFd + + if t.seccompUnusedNotif { + fds = []unix.PollFd{ + {int32(fd), unix.POLLIN, 0}, + } + } else { + fds = []unix.PollFd{ + {int32(fd), unix.POLLIN, 0}, + {int32(pidfd), unix.POLLIN, 0}, + } + } + + // Poll the obtained seccomp-fd for incoming syscalls. + _, err := unix.Poll(fds, -1) + if err != nil { + // As per signal(7), poll() syscall isn't restartable by kernel, so we must + // manually handle its potential interruption. 
+ if err == syscall.EINTR { + logrus.Debugf("EINTR error during Poll() execution (%v) on fd %d, pid %d, cntr %s", + err, fd, pid, formatter.ContainerID{cntrID}) + continue + } + + logrus.Debugf("Error during Poll() execution (%v) on fd %d, pid %d, cntr %s", + err, fd, pid, formatter.ContainerID{cntrID}) + break + } + + // As per pidfd_open(2), a pidfd becomes readable when its associated pid + // terminates. Exit the polling loop when this occurs. + if !t.seccompUnusedNotif && fds[1].Revents == unix.POLLIN { + logrus.Debugf("POLLIN event received on pidfd %d, pid %d, cntr %s", + pidfd, pid, formatter.ContainerID{cntrID}) + break + } + + // Exit the polling loop whenever the received event on the seccomp-fd is not + // the expected one. + if fds[0].Revents != unix.POLLIN { + logrus.Debugf("Non-POLLIN event received on fd %d, pid %d, cntr %s", + fd, pid, formatter.ContainerID{cntrID}) + break + } + + // Retrieves seccomp-notification message. Notice that we will not 'break' + // upon error detection as libseccomp/kernel could return non-fatal errors + // (i.e., ENOENT) to alert of a problem with a specific notification. + req, err := libseccomp.NotifReceive(libseccomp.ScmpFd(fd)) + if err != nil { + logrus.Infof("Unexpected error during NotifReceive() execution (%v) on fd %d, pid %d, cntr %s", + err, fd, pid, formatter.ContainerID{cntrID}) + continue + } + + // Process the incoming syscall and obtain response for seccomp-tracee. + go t.process(req, fd, cntrID) + } + + t.seccompSessionDelete(session) + + c.Close() +} + +func (t *syscallTracer) process( + req *sysRequest, + fd int32, + cntrID string) { + + // This ensures that for a given pid, we only process one syscall at a time. + // Syscalls for different pids are processed in parallel. + t.seccompNotifPidTrk.Lock(req.Pid) + defer t.seccompNotifPidTrk.Unlock(req.Pid) + + // Process the incoming syscall and obtain response for seccomp-tracee. 
+ resp, err := t.processSyscall(req, fd, cntrID) + if err != nil { + return + } + + // Responds to a previously received seccomp-notification. + _ = libseccomp.NotifRespond(libseccomp.ScmpFd(fd), resp) +} + +// Syscall processing entrypoint. Returns the response to be delivered to the +// process (seccomp-tracee) generating the syscall. +func (t *syscallTracer) processSyscall( + req *sysRequest, + fd int32, + cntrID string) (*sysResponse, error) { + + var ( + resp *sysResponse + err error + ) + + // Obtain container associated to the received containerId value. + cntr := t.service.css.ContainerLookupById(cntrID) + if cntr == nil { + logrus.Warnf("Received seccompNotifMsg generated by unknown container: %s", + formatter.ContainerID{cntrID}) + return t.createErrorResponse(req.ID, syscall.Errno(syscall.EPERM)), nil + } + + archId := req.Data.Arch + syscallId := req.Data.Syscall + syscallName := t.syscalls[seccompArchSyscallPair{archId, syscallId}] + + switch syscallName { + case "mount": + resp, err = t.processMount(req, fd, cntr) + + case "umount2": + resp, err = t.processUmount(req, fd, cntr) + + case "reboot": + resp, err = t.processReboot(req, fd, cntr) + + case "swapon": + resp, err = t.processSwapon(req, fd, cntr) + + case "swapoff": + resp, err = t.processSwapoff(req, fd, cntr) + + case "chown": + resp, err = t.processChown(req, fd, cntr) + + case "fchown": + resp, err = t.processFchown(req, fd, cntr) + + case "fchownat": + resp, err = t.processFchownat(req, fd, cntr) + + case "setxattr", "lsetxattr": + resp, err = t.processSetxattr(req, fd, cntr, syscallName) + + case "fsetxattr": + resp, err = t.processFsetxattr(req, fd, cntr) + + case "getxattr", "lgetxattr": + resp, err = t.processGetxattr(req, fd, cntr, syscallName) + + case "fgetxattr": + resp, err = t.processFgetxattr(req, fd, cntr) + + case "removexattr", "lremovexattr": + resp, err = t.processRemovexattr(req, fd, cntr, syscallName) + + case "fremovexattr": + resp, err = t.processFremovexattr(req, fd, 
cntr) + + case "listxattr", "llistxattr": + resp, err = t.processListxattr(req, fd, cntr, syscallName) + + case "flistxattr": + resp, err = t.processFlistxattr(req, fd, cntr) + + default: + logrus.Warnf("Unsupported syscall notification received (%v) on fd %d, pid %d, cntr %s", + syscallId, fd, req.Pid, formatter.ContainerID{cntrID}) + return t.createErrorResponse(req.ID, syscall.EINVAL), nil + } + + // If an 'infrastructure' error is encountered during syscall processing, + // then return a common error back to tracee process. By 'infrastructure' + // errors we are referring to problems beyond the end-user realm: EPERM + // error during Open() doesn't qualify, whereas 'nsenter' operational + // errors or inexistent "/proc/pid/mem" does. + if err != nil { + logrus.Warnf("Error during syscall %v processing on fd %d, pid %d, req Id %d, cntr %s (%v)", + syscallName, fd, req.Pid, req.ID, formatter.ContainerID{cntrID}, err) + return t.createErrorResponse(req.ID, syscall.EINVAL), nil + } + + // TOCTOU check. + if err := libseccomp.NotifIDValid(libseccomp.ScmpFd(fd), req.ID); err != nil { + logrus.Debugf("TOCTOU check failed on fd %d pid %d cntr %s: req.ID %d is no longer valid (%s)", + fd, req.Pid, formatter.ContainerID{cntrID}, req.ID, err) + return t.createErrorResponse(req.ID, err), fmt.Errorf("TOCTOU error") + } + + return resp, nil +} + +func (t *syscallTracer) processMount( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + logrus.Debugf("Received mount syscall from pid %d", req.Pid) + + // Extract the "path", "name", "fstype" and "data" syscall attributes. + // Note: even though "data" is defined as a "void *" in the mount(2), we + // assume it's a string because the mount syscall does not specify its + // length. 
+ parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{ + {req.Data.Args[0], unix.PathMax, nil}, + {req.Data.Args[1], unix.PathMax, nil}, + {req.Data.Args[2], unix.PathMax, nil}, + {req.Data.Args[4], unix.PathMax, nil}, + }, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + source := parsedArgs[0] + target := parsedArgs[1] + fstype := parsedArgs[2] + data := parsedArgs[3] + + mount := &mountSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + MountSyscallPayload: &domain.MountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Source: source, + Target: target, + FsType: fstype, + Data: data, + Flags: req.Data.Args[3], + }, + }, + } + + // cap_sys_admin capability is required for mount operations. + process := t.service.prs.ProcessCreate(req.Pid, 0, 0) + if !process.IsSysAdminCapabilitySet() { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + + mount.Source, err = process.ResolveProcSelf(mount.Source) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EACCES), nil + } + + mount.Target, err = process.ResolveProcSelf(mount.Target) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EACCES), nil + } + + // Verify the process has the proper rights to access the target and + // update it in case it requires path resolution. + mount.Target, err = process.PathAccess(mount.Target, 0, true) + if err != nil { + return t.createErrorResponse(req.ID, err), nil + } + + // Collect process attributes required for mount execution. + mount.uid = process.Uid() + mount.gid = process.Gid() + mount.cwd = process.Cwd() + mount.root = process.Root() + mount.processInfo = process + + logrus.Debug(mount) + + // To simplify mount processing logic, convert to absolute path if dealing + // with a relative path request. 
+ if !filepath.IsAbs(mount.Target) { + mount.Target = filepath.Join(mount.cwd, mount.Target) + } + + // Process mount syscall. + return mount.process() +} + +func (t *syscallTracer) processUmount( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + logrus.Debugf("Received umount syscall from pid %d", req.Pid) + + // Extract "target" syscall attribute. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[0], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + target := parsedArgs[0] + + umount := &umountSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + UmountSyscallPayload: &domain.UmountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Target: target, + Flags: req.Data.Args[1], + }, + }, + } + + // As per man's capabilities(7), cap_sys_admin capability is required for + // umount operations. Otherwise, return here and let kernel handle the mount + // instruction. + process := t.service.prs.ProcessCreate(req.Pid, 0, 0) + if !(process.IsSysAdminCapabilitySet()) { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + + umount.Target, err = process.ResolveProcSelf(umount.Target) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EACCES), nil + } + + // Verify the process has the proper rights to access the target and + // update it in case it requires path resolution. + umount.Target, err = process.PathAccess(umount.Target, 0, true) + if err != nil { + return t.createErrorResponse(req.ID, err), nil + } + + // Collect process attributes required for umount execution. 
+ umount.uid = process.Uid() + umount.gid = process.Gid() + umount.cwd = process.Cwd() + umount.root = process.Root() + umount.processInfo = process + + logrus.Debug(umount) + + // To simplify umount processing logic, convert to absolute path if dealing + // with a relative path request. + if !filepath.IsAbs(umount.Target) { + umount.Target = filepath.Join(umount.cwd, umount.Target) + } + + // Process umount syscall. + return umount.process() +} + +func (t *syscallTracer) processChown( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + // Extract "path" syscall attribute. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[0], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + path := parsedArgs[0] + + uid := int64(req.Data.Args[1]) + gid := int64(req.Data.Args[2]) + + chown := &chownSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + path: path, + ownerUid: uid, + ownerGid: gid, + } + + return chown.processChown() +} + +func (t *syscallTracer) processFchown( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + // We trap fchown() for the same reason we trap chown() (see processChown()). + + pathFd := int32(req.Data.Args[0]) + uid := int64(req.Data.Args[1]) + gid := int64(req.Data.Args[2]) + + chown := &chownSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + pathFd: pathFd, + ownerUid: uid, + ownerGid: gid, + } + + return chown.processFchown() +} + +func (t *syscallTracer) processFchownat( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + // We trap fchownat() for the same reason we trap chown() (see processChown()). + + // Extract "path" syscall attribute. 
+ parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[1], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + path := parsedArgs[0] + + // Get the other args. + dirFd := int32(req.Data.Args[0]) + uid := int64(req.Data.Args[2]) + gid := int64(req.Data.Args[3]) + flags := int(req.Data.Args[4]) + + chown := &chownSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + path: path, + ownerUid: uid, + ownerGid: gid, + dirFd: dirFd, + flags: flags, + } + + return chown.processFchownat() +} + +func (t *syscallTracer) processSetxattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface, + syscallName string) (*sysResponse, error) { + + // Extract the "path" and "name" syscall attributes. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{ + {req.Data.Args[0], unix.PathMax, nil}, + {req.Data.Args[1], unix.PathMax, nil}, + }, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + path := parsedArgs[0] + name := parsedArgs[1] + + // Per setxattr(2): + // Value is a "void *", not necessarily a string (i.e., it may not be null terminated). + // The size of value (in bytes) is defined by the args[3] parameter. 
+ parsedArgs, err = t.memParser.ReadSyscallBytesArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[2], int(req.Data.Args[3]), nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + val := parsedArgs[0] + + flags := int(req.Data.Args[4]) + + si := &setxattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + syscallName: syscallName, + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + path: path, + name: name, + val: []byte(val), + flags: flags, + } + + return si.processSetxattr() +} + +func (t *syscallTracer) processFsetxattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + pathFd := int32(req.Data.Args[0]) + flags := int(req.Data.Args[4]) + + // Extract "name" syscall attribute. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[1], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + name := parsedArgs[0] + + // Per setxattr(2): + // Value is a "void *", not necessarily a string (i.e., it may not be null terminated). + // The size of value (in bytes) is defined by the args[3] parameter. + parsedArgs, err = t.memParser.ReadSyscallBytesArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[2], int(req.Data.Args[3]), nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + val := parsedArgs[0] + + si := &setxattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + pathFd: pathFd, + name: name, + val: []byte(val), + flags: flags, + } + + return si.processSetxattr() +} + +func (t *syscallTracer) processGetxattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface, + syscallName string) (*sysResponse, error) { + + // Extract the "path" and "name" syscall attributes. 
+ parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{ + {req.Data.Args[0], unix.PathMax, nil}, + {req.Data.Args[1], unix.NAME_MAX, nil}, + }, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + path := parsedArgs[0] + name := parsedArgs[1] + + // "addr" is the mem address where the syscall's result is stored; it's an + // address in the virtual memory of the process that performed the syscall. + // We will write the result there. + addr := uint64(req.Data.Args[2]) + + // "size" is the number of bytes in the buffer pointed to by "addr"; we must + // never write more than this amount of bytes into that buffer. If set to 0 + // then getxattr will return the size of the extended attribute (and + // not write into the buffer pointed to by "addr"). + size := uint64(req.Data.Args[3]) + + si := &getxattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + syscallName: syscallName, + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + path: path, + name: name, + addr: addr, + size: size, + } + + return si.processGetxattr() +} + +func (t *syscallTracer) processFgetxattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + pathFd := int32(req.Data.Args[0]) + + // Extract "name" syscall attribute. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[1], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + name := parsedArgs[0] + + // "addr" is the mem address where the syscall's result is stored; it's an + // address in the virtual memory of the process that performed the syscall. + // We will write the result there. + addr := uint64(req.Data.Args[2]) + + // "size" is the number of bytes in the buffer pointed to by "addr"; we must + // never write more than this amount of bytes into that buffer. 
If set to 0 + // then getxattr will return the size of the extended attribute (and + // not write into the buffer pointed to by "addr"). + size := uint64(req.Data.Args[3]) + + si := &getxattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + pathFd: pathFd, + name: name, + addr: addr, + size: size, + } + + return si.processGetxattr() +} + +func (t *syscallTracer) processRemovexattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface, + syscallName string) (*sysResponse, error) { + + // Extract the "path" and "name" syscall attributes. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{ + {req.Data.Args[0], unix.PathMax, nil}, + {req.Data.Args[1], unix.PathMax, nil}, + }, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + path := parsedArgs[0] + name := parsedArgs[1] + + si := &removexattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + syscallName: syscallName, + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + path: path, + name: name, + } + + return si.processRemovexattr() +} + +func (t *syscallTracer) processFremovexattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + pathFd := int32(req.Data.Args[0]) + + // Extract "name" syscall attribute. 
+ parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[1], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + name := parsedArgs[0] + + si := &removexattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + pathFd: pathFd, + name: name, + } + + return si.processRemovexattr() +} + +func (t *syscallTracer) processListxattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface, + syscallName string) (*sysResponse, error) { + + // Extract "path" syscall attribute. + parsedArgs, err := t.memParser.ReadSyscallStringArgs( + req.Pid, + []memParserDataElem{{req.Data.Args[0], unix.PathMax, nil}}, + ) + if err != nil { + return t.createErrorResponse(req.ID, syscall.EPERM), nil + } + path := parsedArgs[0] + + // "addr" is the mem address where the syscall's result is stored; it's an + // address in the virtual memory of the process that performed the syscall. + // We will write the result there. + addr := uint64(req.Data.Args[1]) + + // "size" is the number of bytes in the buffer pointed to by "addr"; we must + // never write more than this amount of bytes into that buffer. If set to 0 + // then listxattr will return the size of the extended attribute (and + // not write into the buffer pointed to by "addr"). 
+ size := uint64(req.Data.Args[2]) + + si := &listxattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + syscallName: syscallName, + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + path: path, + addr: addr, + size: size, + } + + return si.processListxattr() +} + +func (t *syscallTracer) processFlistxattr( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + pathFd := int32(req.Data.Args[0]) + + // "addr" is the mem address where the syscall's result is stored; it's an + // address in the virtual memory of the process that performed the syscall. + // We will write the result there. + addr := uint64(req.Data.Args[1]) + + // "size" is the number of bytes in the buffer pointed to by "addr"; we must + // never write more than this amount of bytes into that buffer. If set to 0 + // then listxattr will return the size of the extended attribute (and + // not write into the buffer pointed to by "addr"). + size := uint64(req.Data.Args[2]) + + si := &listxattrSyscallInfo{ + syscallCtx: syscallCtx{ + syscallNum: int32(req.Data.Syscall), + reqId: req.ID, + pid: req.Pid, + cntr: cntr, + tracer: t, + }, + pathFd: pathFd, + addr: addr, + size: size, + } + + return si.processListxattr() +} + +func (t *syscallTracer) processReboot( + req *sysRequest, + fd int32, + cntrID domain.ContainerIface) (*sysResponse, error) { + + logrus.Warnf("Received reboot syscall") + + return t.createSuccessResponse(req.ID), nil +} + +func (t *syscallTracer) processSwapon( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + logrus.Warnf("Received swapon syscall") + + return t.createSuccessResponse(req.ID), nil +} + +func (t *syscallTracer) processSwapoff( + req *sysRequest, + fd int32, + cntr domain.ContainerIface) (*sysResponse, error) { + + logrus.Warnf("Received swapoff syscall") + + return t.createSuccessResponse(req.ID), nil +} + +func (t *syscallTracer) createSuccessResponse(id 
uint64) *sysResponse { + + resp := &sysResponse{ + ID: id, + Error: 0, + Val: 0, + Flags: 0, + } + + return resp +} + +func (t *syscallTracer) createSuccessResponseWithRetValue(id, val uint64) *sysResponse { + + resp := &sysResponse{ + ID: id, + Error: 0, + Val: val, + Flags: 0, + } + + return resp +} + +func (t *syscallTracer) createContinueResponse(id uint64) *sysResponse { + + resp := &sysResponse{ + ID: id, + Error: 0, + Val: 0, + Flags: libseccomp.NotifRespFlagContinue, + } + + return resp +} + +func (t *syscallTracer) createErrorResponse(id uint64, err error) *sysResponse { + + // Override the passed error if this one doesn't match the supported type. + rcvdError, ok := err.(syscall.Errno) + if !ok { + rcvdError = syscall.EINVAL + } + + resp := &sysResponse{ + ID: id, + Error: int32(rcvdError), + Val: 0, + Flags: 0, + } + + return resp +} diff --git a/sysbox-fs/seccomp/tracer_test.go b/sysbox-fs/seccomp/tracer_test.go new file mode 100644 index 00000000..8fa58a31 --- /dev/null +++ b/sysbox-fs/seccomp/tracer_test.go @@ -0,0 +1,91 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package seccomp + +import ( + "errors" + "fmt" + "reflect" + "syscall" + "testing" + + unixIpc "github.com/nestybox/sysbox-ipc/unix" +) + +func Test_syscallTracer_createErrorResponse(t *testing.T) { + type fields struct { + sms *SyscallMonitorService + srv *unixIpc.Server + pollsrv *unixIpc.PollServer + syscalls map[seccompArchSyscallPair]string + } + + var f1 = &fields{ + sms: nil, + srv: nil, + pollsrv: nil, + syscalls: nil, + } + + // Expected results. + + var r1 = &sysResponse{ + ID: 0, + Error: int32(syscall.EPERM), + Val: 0, + Flags: 0, + } + var r2 = &sysResponse{ + ID: 1, + Error: int32(syscall.EINVAL), + Val: 0, + Flags: 0, + } + + type args struct { + id uint64 + err error + } + tests := []struct { + name string + fields fields + args args + want *sysResponse + }{ + // A received syscall.Errno error must be honored (no modifications allowed). + {"1", *f1, args{0, syscall.EPERM}, r1}, + + // Verify that "errorString" errors are properly type-asserted. + {"2", *f1, args{1, fmt.Errorf("testing errorString error type 1")}, r2}, + + // Same as above but with another error constructor. + {"3", *f1, args{1, errors.New("testing errorString error type 2")}, r2}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tracer := &syscallTracer{ + service: tt.fields.sms, + srv: tt.fields.srv, + pollsrv: tt.fields.pollsrv, + syscalls: tt.fields.syscalls, + } + if got := tracer.createErrorResponse(tt.args.id, tt.args.err); !reflect.DeepEqual(got, tt.want) { + t.Errorf("syscallTracer.createErrorResponse() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/sysbox-fs/seccomp/umount.go b/sysbox-fs/seccomp/umount.go new file mode 100644 index 00000000..5b3a6eab --- /dev/null +++ b/sysbox-fs/seccomp/umount.go @@ -0,0 +1,525 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package seccomp + +import ( + "fmt" + "path/filepath" + "strings" + "syscall" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + "github.com/sirupsen/logrus" +) + +type umountSyscallInfo struct { + syscallCtx // syscall generic info + *domain.UmountSyscallPayload // unmount-syscall specific details +} + +// MountSyscall processing wrapper instruction. +func (u *umountSyscallInfo) process() (*sysResponse, error) { + + // A procfs mount inside the container is a combination of a base procfs + // mount plus sysbox-fs submounts, and we want the procfs mount to act as + // one whole rather than a collection of mounts (i.e., just like a regular + // procfs mount on a host). Thus, unmounts that occur on the base procfs + // mount are accepted (we unmount the submounts first and then the base + // mount). On the other hand, unmounts that target only the sysbox-fs + // submounts are ignored. The reason we ignore them as opposed to returning + // an error message is that we also ignore bind-to-self mounts on submounts + // (see handling of bind-to-self submounts in mount.go); thus, returning an + // error message would cause the sequence "mount --bind submount submount + // && umount submount" to fail on the second command. + // + // Same applies to sysfs mounts. + + mts := u.tracer.service.mts + if mts == nil { + return nil, fmt.Errorf("unexpected mount-service handler") + } + + // Ensure that the mountInfoDB corresponding to the sys-container hosting + // this process has been already built. 
This info is necessary to be able + // to discern between 'initial' and 'regular' mounts, which is required + // for the proper operation of the mount-hardening feature. + if !u.cntr.IsMountInfoInitialized() { + if err := u.cntr.InitializeMountInfo(); err != nil { + return nil, err + } + } + + mip, err := mts.NewMountInfoParser(u.cntr, u.processInfo, true, true, false) + if err != nil { + return nil, err + } + + // Adjust umount target attribute attending to the process' root path. + u.targetAdjust() + + if mip.IsSysboxfsBaseMount(u.Target) { + + // Special case: disallow unmounting of /proc; we must do this because we + // use /proc as the source of other procfs mounts within the container + // (i.e., if a new procfs is mounted at /some/path/inside/container/proc, + // we bind-mount /proc/uptime to /some/path/inside/container/proc/uptime). + // This restriction should be fine because unmounting /proc inside the + // container is not a useful thing to do anyways. Same rationale applies + // to "/sys". + // + // Also, notice that we want to clearly differentiate the root /proc and + // /sys mounts from those that are present within chroot'ed contexts. In + // the later case we want to allow users to mount (and umount) both /proc + // and /sys file-systems. + if (u.Target == "/proc" || u.Target == "/sys") && (u.syscallCtx.root == "/") { + resp := u.tracer.createErrorResponse(u.reqId, syscall.EBUSY) + return resp, nil + } + + // If under the base mount there are any submounts *not* managed by + // sysbox-fs, fail the unmount with EBUSY (such submounts must be + // explicitly unmounted prior to unmounting the base mount). 
+ if mip.HasNonSysboxfsSubmount(u.Target) { + resp := u.tracer.createErrorResponse(u.reqId, syscall.EBUSY) + return resp, nil + } + + // Process the unmount + info := mip.GetInfo(u.Target) + + switch info.FsType { + case "proc": + // Sysbox-fs emulates all new procfs mounts inside the container by + // mounting the kernel's procfs and the mounting sysbox-fs on + // portions of procfs (e.g., proc/sys, proc/uptime, etc.) + logrus.Debugf("Processing procfs unmount: %v", u) + return u.processUmount(mip) + + case "sysfs": + // For sysfs we do something similar to procfs. + logrus.Debugf("Processing sysfs unmount: %v", u) + return u.processUmount(mip) + + case "overlay": + // Handle umounts of overlay fs. + logrus.Debugf("Processing overlayfs unmount: %v", u) + return u.processUmount(mip) + } + + // Not a mount we manage, have the kernel do the unmount. + return u.tracer.createContinueResponse(u.reqId), nil + + } else if mip.IsSysboxfsSubmount(u.Target) { + logrus.Infof("Rejected unmount of sysbox-fs managed submount at %s", + u.Target) + return u.tracer.createErrorResponse(u.reqId, syscall.EINVAL), nil + } + + // Verify if the umount op is addressing an immutable resource and prevent + // it if that's the case. + if ok, resp := u.umountAllowed(mip); ok == false { + return resp, nil + } + + // Not a mount we manage, have the kernel do the unmount. + return u.tracer.createContinueResponse(u.reqId), nil +} + +// umountAllowed purpose is to prevent immutable resources from being +// unmounted. +// +// Method will return 'true' when the unmount operation is deemed legit, and +// will return 'false' otherwise. +func (u *umountSyscallInfo) umountAllowed( + mip domain.MountInfoParserIface) (bool, *sysResponse) { + + if u.tracer.service.allowImmutableUnmounts { + return true, nil + } + + // There must be mountinfo state present for this target. Otherwise, return + // error back to the user. 
+ info := mip.GetInfo(u.Target) + if info == nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + + // + // The following scenarios are relevant within the context of this function + // and will be handled separately to ease the logic comprehension and its + // maintenability / debuggability. + // + // The different columns in this table denote the 'context' in which the + // unmount process is executing, and thereby, dictates the logic chosen + // to handle each unmount request. + // + // +-----------+--------------+--------------+----------+ + // | Scenarios | Unshare(mnt) | Pivot-root() | Chroot() | + // +-----------+--------------+--------------+----------+ + // | 1 | no | no | no | + // | 2 | no | yes | no | + // | 3 | no | no | yes | + // | 4 | no | yes | yes | + // | 5 | yes | no | no | + // | 6 | yes | yes | no | + // | 7 | yes | no | yes | + // | 8 | yes | yes | yes | + // +-----------+--------------+--------------+----------+ + // + + // Identify the mount-ns of the process launching the unmount to compare it + // with the one of the sys container's initpid. In the unlikely case of an + // error, let the kernel deal with it. + processMountNs, err := u.processInfo.MountNsInode() + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + initProcMountNs, err := u.cntr.InitProc().MountNsInode() + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + + // Obtain the sys-container's root-path inode. + syscntrRootInode := u.cntr.InitProc().RootInode() + + // If process' mount-ns matches the sys-container's one, then we can simply + // rely on the target's mountID to discern an immutable target from a + // regular one. Otherwise, we cannot rely on the mountID field, as the values + // allocated by kernel for these very mountpoints will differ in other mount + // namespaces. 
+ if processMountNs == initProcMountNs { + + // Allow unmounts of the container's root mount (i.e., "/"). This is + // required in order for processes to perform a pivot-root operation, + // where the process changes root to a container subdir and then usually + // unmount the original root mount (i.e., we want that unmount to + // succeed). It's safe to allow unmounts of the container's root mount + // because unless there's a pivot-root, the kernel will actually deny the + // unmount (EBUSY) since the process is operating under that mount. Even + // if the kernel were to allow it, the only thing it would expose is the + // empty host dir on top of which the container root dir was mounted + // (e.g., /var/lib/docker/overlay2//merged or similar). + isRootMnt, err := u.cntr.IsRootMountID(info.MountID) + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + if isRootMnt { + return true, nil + } + + if u.cntr.IsImmutableMountID(info.MountID) { + logrus.Infof("Rejected unmount operation on immutable target: %s", + u.Target) + + if logrus.IsLevelEnabled(logrus.DebugLevel) { + if u.processInfo.Root() == "/" { + processRootInode := u.processInfo.RootInode() + + if processRootInode == syscntrRootInode { + // Scenario 1): no-unshare(mnt) & no-pivot() & no-chroot() + logrus.Info("Rejected unmount operation -- scenario 1") + } else { + // Scenario 2): no-unshare(mnt) & pivot() & no-chroot() + logrus.Info("Rejected unmount operation -- scenario 2") + } + + } else { + // We are dealing with a chroot'ed process, so obtain the inode of "/" + // as seen within the process' namespaces, and *not* the one associated + // to the process' root-path. 
+ processRootInode, err := mip.ExtractInode("/") + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + + if processRootInode == syscntrRootInode { + // Scenario 3: no-unshare(mnt) & no-pivot() & chroot() + logrus.Info("Rejected unmount operation -- scenario 3") + } else { + // Scenario 4: no-unshare(mnt) & pivot() & chroot() + logrus.Info("Rejected unmount operation -- scenario 4") + } + } + } + + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + + return true, nil + + } else { + + if u.processInfo.Root() == "/" { + processRootInode := u.processInfo.RootInode() + + // Scenario 5): unshare(mnt) & no-pivot() & no-chroot() + if processRootInode == syscntrRootInode { + + // We need to check if we're dealing with an overlapped mount, as + // this is a case that we usually (see exception below) want to + // allow. + if mip.IsOverlapMount(info) { + + // The exception mentioned above refer to the scenario where + // the overlapped mountpoint is an immutable itself, hence the + // checkpoint below. + if u.cntr.IsImmutableOverlapMountpoint(info.MountPoint) { + logrus.Infof("Rejected unmount operation on immutable overlapped target: %s (scenario 5)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + return true, nil + } + + // In this scenario we have full access to all the mountpoints + // within the sys-container, though with different mount-IDs + // due to a different mount-ns. But we can safely rely on the + // mountinfo attributes (e.g., mountpoint path) to determine + // resource's immutability. 
+ if info.MountPoint == "/" { + return true, nil + } + + if u.cntr.IsImmutableMountpoint(info.MountPoint) { + logrus.Infof("Rejected unmount operation on immutable target: %s (scenario 5)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + + return true, nil + } + + // Scenario 6): unshare(mnt) & pivot() & no-chroot() + if processRootInode != syscntrRootInode { + if mip.IsOverlapMount(info) { + return true, nil + } + + isRootMnt, err := u.cntr.IsRootMount(info) + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + if isRootMnt { + return true, nil + } + + isImmutable, err := u.cntr.IsImmutableMount(info) + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + if isImmutable { + logrus.Infof("Rejected unmount operation on immutable target: %s (scenario 6)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + + return true, nil + } + + return true, nil + } + + if u.processInfo.Root() != "/" { + + // We are dealing with a chroot'ed process, so obtain the inode of "/" + // as seen within the process' namespaces, and *not* the one associated + // to the process' root-path. + processRootInode, err := mip.ExtractInode("/") + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + + // Scenario 7): unshare(mnt) & no-pivot() & chroot() + if processRootInode == syscntrRootInode { + + // We need to check if we're dealing with an overlapped mount, as + // this is a case that we usually (see exception below) want to + // allow. + if mip.IsOverlapMount(info) { + + // The exception mentioned above refer to the scenario where + // the overlapped mountpoint is an immutable itself, hence the + // checkpoint below. 
+ if u.cntr.IsImmutableOverlapMountpoint(info.MountPoint) { + logrus.Infof("Rejected unmount operation on immutable overlapped target: %s (scenario 7)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + return true, nil + } + + if info.MountPoint == "/" { + return true, nil + } + + if u.cntr.IsImmutableMountpoint(info.MountPoint) { + logrus.Infof("Rejected unmount operation on immutable target: %s (scenario 7)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + + return true, nil + } + + // Scenario 8): unshare(mnt) & pivot() & chroot() + if processRootInode != syscntrRootInode { + + if mip.IsOverlapMount(info) { + isImmutable, err := u.cntr.IsImmutableMount(info) + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + if isImmutable { + logrus.Infof("Rejected unmount operation on immutable overlapped target: %s (scenario 8)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + return true, nil + } + + isRoot, err := u.cntr.IsRootMount(info) + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + if isRoot { + return true, nil + } + + isImmutable, err := u.cntr.IsImmutableMount(info) + if err != nil { + return false, u.tracer.createErrorResponse(u.reqId, syscall.EINVAL) + } + if isImmutable { + logrus.Infof("Rejected unmount operation on immutable target: %s (scenario 8)", + u.Target) + return false, u.tracer.createErrorResponse(u.reqId, syscall.EPERM) + } + + return true, nil + } + } + } + + return true, nil +} + +// Method handles umount syscall requests on sysbox-fs managed base mounts. +func (u *umountSyscallInfo) processUmount( + mip domain.MountInfoParserIface) (*sysResponse, error) { + + // Create instructions payload. + payload := u.createUmountPayload(mip) + + // Create nsenter-event envelope. 
+ nss := u.tracer.service.nss + event := nss.NewEvent( + u.syscallCtx.pid, + &domain.AllNSs, + 0, + &domain.NSenterMessage{ + Type: domain.UmountSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + // Launch nsenter-event. + err := nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + // Obtain nsenter-event response. + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := u.tracer.createErrorResponse( + u.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + + return resp, nil + } + + return u.tracer.createSuccessResponse(u.reqId), nil +} + +// Build instructions payload required to unmount a sysbox-fs base mount (and +// any submounts under it) +func (u *umountSyscallInfo) createUmountPayload( + mip domain.MountInfoParserIface) *[]*domain.UmountSyscallPayload { + + var payload []*domain.UmountSyscallPayload + + submounts := []string{} + + if mip.IsSysboxfsBaseMount(u.Target) { + submounts = mip.GetSysboxfsSubMounts(u.Target) + } else { + submounts = append(submounts, u.Target) + } + + for _, subm := range submounts { + info := mip.GetInfo(subm) + newelem := &domain.UmountSyscallPayload{ + domain.NSenterMsgHeader{}, + domain.Mount{ + Target: info.MountPoint, + Flags: u.Flags, + }, + } + payload = append(payload, newelem) + } + + if mip.IsSysboxfsBaseMount(u.Target) { + payload = append(payload, u.UmountSyscallPayload) + } + + return &payload +} + +// Method addresses scenarios where the process generating the umount syscall has +// a 'root' attribute different than default one ("/"). This is typically the +// case in chroot'ed environments. Method's goal is to make the required target +// adjustments so that sysbox-fs can carry out the mount in the expected context. +func (u *umountSyscallInfo) targetAdjust() { + + root := u.syscallCtx.root + + if root == "/" { + return + } + + u.Target = filepath.Join(root, u.Target) +} + +// Undo targetAdjust(). 
+func (u *umountSyscallInfo) targetUnadjust() { + + root := u.syscallCtx.root + + if root == "/" { + return + } + + u.Target = strings.TrimPrefix(u.Target, u.root) +} + +func (u *umountSyscallInfo) String() string { + return fmt.Sprintf("target: %s, flags: %#x, root: %s, cwd: %s", + u.Target, u.Flags, u.root, u.cwd) +} diff --git a/sysbox-fs/seccomp/xattr.go b/sysbox-fs/seccomp/xattr.go new file mode 100644 index 00000000..75d12e2b --- /dev/null +++ b/sysbox-fs/seccomp/xattr.go @@ -0,0 +1,452 @@ +// +// Copyright 2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// This file contains Sysbox's *xattr syscall trapping & handling code. The +// reason we trap these syscalls is to allow processes inside the sys container +// that have sufficient capabilities (e.g., CAP_SYS_ADMIN) to set "trusted." +// extended attributes on files inside the container. The kernel does not +// currently allow this from within a user-namespace other than the initial user +// namespace (since it would allow an unprivileged user to unshare it's user-ns, +// become root in it, and set the trusted extended attribute on arbitrary +// files). But Sysbox allows this for processes inside the sys container because +// we know the container can only do this on files in its file-system jail. 
+ +package seccomp + +import ( + "path/filepath" + "syscall" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/fuse" + cap "github.com/nestybox/sysbox-libs/capability" + utils "github.com/nestybox/sysbox-libs/utils" + + "github.com/sirupsen/logrus" +) + +var allowedXattrList = []string{ + "trusted.overlay.opaque", +} + +type setxattrSyscallInfo struct { + syscallCtx // syscall generic info + pathFd int32 + path string + name string + val []byte + flags int +} + +type getxattrSyscallInfo struct { + syscallCtx // syscall generic info + pathFd int32 + path string + name string + addr uint64 + size uint64 +} + +type removexattrSyscallInfo struct { + syscallCtx // syscall generic info + pathFd int32 + path string + name string +} + +type listxattrSyscallInfo struct { + syscallCtx // syscall generic info + pathFd int32 + path string + addr uint64 + size uint64 +} + +// sanitizePath normalizes the file path associated with the xattr operation and +// ensures the process doing the syscall has access to it. +func sanitizePath(process domain.ProcessIface, path string, followSymlink bool) (string, error) { + var err error + + // It's rare that the xattr be applied on a /proc/self/* path, but it's + // technically possible. + path, err = process.ResolveProcSelf(path) + if err != nil { + return path, syscall.EACCES + } + + // Verify the process has the proper rights to access the file + path, err = process.PathAccess(path, 0, followSymlink) + if err != nil { + return path, err + } + + // Convert to absolute path if dealing with a relative path request. 
+ if !filepath.IsAbs(path) { + path = filepath.Join(process.Cwd(), path) + } + + // The process may be chroot'ed; adjust the path accordingly + path = filepath.Join(process.Root(), path) + + return path, nil +} + +func (si *setxattrSyscallInfo) processSetxattr() (*sysResponse, error) { + var err error + + t := si.tracer + + if !utils.StringSliceContains(allowedXattrList, si.name) { + return t.createContinueResponse(si.reqId), nil + } + + // Ensure the process that performed the syscall has the required caps + process := t.service.prs.ProcessCreate(si.pid, 0, 0) + + // We currently only handle trusted.* xattrs; these require CAP_SYS_ADMIN + if !process.IsCapabilitySet(cap.EFFECTIVE, cap.CAP_SYS_ADMIN) { + return t.createErrorResponse(si.reqId, syscall.EPERM), nil + } + + // If pathFd is defined, we are processing fsetxattr(); convert pathFd to + // path so we can then handle fsetxattr() as setxattr(). + if si.pathFd != 0 { + si.path, err = process.GetFd(si.pathFd) + if err != nil { + return t.createContinueResponse(si.reqId), nil + } + } + + followSymlink := si.syscallName != "lsetxattr" + + si.path, err = sanitizePath(process, si.path, followSymlink) + if err != nil { + return t.createErrorResponse(si.reqId, err), nil + } + + logrus.Debugf("setxattr(): path = %s, name = %s, val = %s, flags = %x", + si.path, si.name, string(si.val), si.flags) + + // Perform the nsenter into the process namespaces (except the user-ns) + payload := domain.SetxattrSyscallPayload{ + Syscall: si.syscallName, + Path: si.path, + Name: si.name, + Val: si.val, + Flags: si.flags, + } + + nss := t.service.nss + event := nss.NewEvent( + si.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.SetxattrSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + err = nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := t.createErrorResponse( + 
si.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return t.createSuccessResponse(si.reqId), nil +} + +func (si *getxattrSyscallInfo) processGetxattr() (*sysResponse, error) { + var err error + + t := si.tracer + + if !utils.StringSliceContains(allowedXattrList, si.name) { + return t.createContinueResponse(si.reqId), nil + } + + // Ensure process has required capabilities + process := t.service.prs.ProcessCreate(si.pid, 0, 0) + + // We currently only handle trusted.* xattr; these require CAP_SYS_ADMIN + if !process.IsCapabilitySet(cap.EFFECTIVE, cap.CAP_SYS_ADMIN) { + return t.createErrorResponse(si.reqId, syscall.EPERM), nil + } + + // If pathFd is defined, we are processing fgetxattr(); convert pathFd to + // path so we can then handle fgetxattr() as getxattr(). + if si.pathFd != 0 { + si.path, err = process.GetFd(si.pathFd) + if err != nil { + return t.createContinueResponse(si.reqId), nil + } + } + + followSymlink := si.syscallName != "lgetxattr" + + si.path, err = sanitizePath(process, si.path, followSymlink) + if err != nil { + return t.createErrorResponse(si.reqId, err), nil + } + + logrus.Debugf("getxattr(): path = %s, name = %s, size = %d", + si.path, si.name, si.size) + + // Perform the nsenter into the process namespaces (except the user-ns) + payload := domain.GetxattrSyscallPayload{ + Header: domain.NSenterMsgHeader{ + Pid: process.Pid(), + Uid: process.Uid(), + Gid: process.Gid(), + Root: process.Root(), + Cwd: process.Cwd(), + Capabilities: process.GetEffCaps(), + }, + Syscall: si.syscallName, + Path: si.path, + Name: si.name, + Size: si.size, + } + + nss := t.service.nss + event := nss.NewEvent( + si.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.GetxattrSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + err = nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + responseMsg := nss.ReceiveResponseEvent(event) + + if responseMsg.Type == domain.ErrorResponse { + 
sysResp := t.createErrorResponse( + si.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return sysResp, nil + } + + resp := responseMsg.Payload.(domain.GetxattrRespPayload) + + // Write the data returned by getxattr() to the memory of the process whose + // syscall we are processing. Refer to the comments written as part of the + // processListxattr() method for more details relevant to this code section. + + if si.size > 0 && resp.Size > 0 { + if err := t.memParser.WriteSyscallBytesArgs( + si.pid, + []memParserDataElem{{si.addr, resp.Size, resp.Val}}, + ); err != nil { + sysResp := t.createErrorResponse(si.reqId, syscall.ENOTSUP) + return sysResp, nil + } + } + + sysResp := t.createSuccessResponseWithRetValue(si.reqId, uint64(resp.Size)) + + return sysResp, nil +} + +func (si *removexattrSyscallInfo) processRemovexattr() (*sysResponse, error) { + var err error + + t := si.tracer + + if !utils.StringSliceContains(allowedXattrList, si.name) { + return t.createContinueResponse(si.reqId), nil + } + + // Ensure process has required capabilities + process := t.service.prs.ProcessCreate(si.pid, 0, 0) + + // We currently only handle trusted.* xattr; these require CAP_SYS_ADMIN + if !process.IsCapabilitySet(cap.EFFECTIVE, cap.CAP_SYS_ADMIN) { + return t.createErrorResponse(si.reqId, syscall.EPERM), nil + } + + // If pathFd is defined, we are processing fremovexattr(); convert pathFd to + // path so we can then handle fremovexattr() as removexattr(). 
+ if si.pathFd != 0 { + si.path, err = process.GetFd(si.pathFd) + if err != nil { + return t.createContinueResponse(si.reqId), nil + } + } + + followSymlink := si.syscallName != "lremovexattr" + + si.path, err = sanitizePath(process, si.path, followSymlink) + if err != nil { + return t.createErrorResponse(si.reqId, err), nil + } + + logrus.Debugf("removexattr(): path = %s, name = %s", si.path, si.name) + + // Perform the nsenter into the process namespaces (except the user-ns) + payload := domain.RemovexattrSyscallPayload{ + Syscall: si.syscallName, + Path: si.path, + Name: si.name, + } + + nss := t.service.nss + event := nss.NewEvent( + si.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.RemovexattrSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + err = nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + responseMsg := nss.ReceiveResponseEvent(event) + if responseMsg.Type == domain.ErrorResponse { + resp := t.createErrorResponse( + si.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return resp, nil + } + + return t.createSuccessResponse(si.reqId), nil +} + +func (si *listxattrSyscallInfo) processListxattr() (*sysResponse, error) { + var err error + + t := si.tracer + + process := t.service.prs.ProcessCreate(si.pid, 0, 0) + + // If pathFd is defined, we are processing flistxattr(); convert pathFd to + // path so we can then handle flistxattr() as listxattr(). 
+ if si.pathFd != 0 { + si.path, err = process.GetFd(si.pathFd) + if err != nil { + return t.createContinueResponse(si.reqId), nil + } + } + + followSymlink := si.syscallName != "llistxattr" + + si.path, err = sanitizePath(process, si.path, followSymlink) + if err != nil { + return t.createErrorResponse(si.reqId, err), nil + } + + logrus.Debugf("listxattr(): path = %s, size = %d", si.path, si.size) + + // Perform the nsenter into the process namespaces (except the user-ns) + payload := domain.ListxattrSyscallPayload{ + Header: domain.NSenterMsgHeader{ + Pid: process.Pid(), + Uid: process.Uid(), + Gid: process.Gid(), + Root: process.Root(), + Cwd: process.Cwd(), + Capabilities: process.GetEffCaps(), + }, + Syscall: si.syscallName, + Path: si.path, + Size: si.size, + } + + nss := t.service.nss + event := nss.NewEvent( + si.pid, + &domain.AllNSsButUser, + 0, + &domain.NSenterMessage{ + Type: domain.ListxattrSyscallRequest, + Payload: payload, + }, + nil, + false, + ) + + err = nss.SendRequestEvent(event) + if err != nil { + return nil, err + } + + responseMsg := nss.ReceiveResponseEvent(event) + + if responseMsg.Type == domain.ErrorResponse { + sysResp := t.createErrorResponse( + si.reqId, + responseMsg.Payload.(fuse.IOerror).Code) + return sysResp, nil + } + + resp := responseMsg.Payload.(domain.ListxattrRespPayload) + + // Write the data returned by listxattr() to the memory of the process whose + // syscall we are processing. + // + // Notice that the nsexec process executing this syscall in the container + // namespaces, is adopting the 'personality' of the original user process. + // Thus, there's no need to adjust the list of received xattrs as these ones + // should match those obtained if Sysbox were not in the picture. 
+	//
+	// Also, bear in mind that the list of returned xattrs during the second
+	// listxattr(), may reflect a subset (or an empty set) of those reported in
+	// the first listxattr() syscall, as it's only during the second one (i.e.,
+	// 'si.size != 0') when the kernel verifies the capabilities of the user
+	// process. That's just to say that the amount of data to write into the
+	// user's address-space must be exclusively based on the response given by
+	// the kernel during the second listxattr() syscall. That is, 'si.size' is
+	// not relevant as the kernel already handles the scenario where this one is
+	// smaller than 'resp.Size' by returning an 'ERANGE' error to the user.
+
+	if si.size > 0 && resp.Size > 0 {
+		if err := t.memParser.WriteSyscallBytesArgs(
+			si.pid,
+			[]memParserDataElem{{si.addr, resp.Size, resp.Val}},
+		); err != nil {
+			sysResp := t.createErrorResponse(si.reqId, syscall.ENOTSUP)
+			return sysResp, nil
+		}
+	}
+
+	sysResp := t.createSuccessResponseWithRetValue(si.reqId, uint64(resp.Size))
+
+	return sysResp, nil
+}
diff --git a/sysbox-fs/state/container.go b/sysbox-fs/state/container.go
new file mode 100644
index 00000000..8e1aa2d7
--- /dev/null
+++ b/sysbox-fs/state/container.go
@@ -0,0 +1,489 @@
+//
+// Copyright 2019-2020 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// + +package state + +import ( + "fmt" + "io" + "sync" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-libs/formatter" + libpidfd "github.com/nestybox/sysbox-libs/pidfd" + "golang.org/x/sys/unix" +) + +// Container type to represent all the container-state relevant to sysbox-fs. +type container struct { + sync.RWMutex + id string // container-id value generated by runC + initPid uint32 // initPid within container + initPidFd libpidfd.PidFd // + rootInode uint64 // initPid's root-path inode + ctime time.Time // container creation time + uidFirst uint32 // first value of Uid range (host side) + uidSize uint32 // Uid range size + gidFirst uint32 // first value of Gid range (host side) + gidSize uint32 // Gid range size + regCompleted bool // registration completion flag + procRoPaths []string // OCI spec read-only proc paths + procMaskPaths []string // OCI spec masked proc paths + mountInfoParser domain.MountInfoParserIface // Per container mountinfo DB & parser + dataStore map[string][]byte // Per container data store for FUSE handlers (procfs, sysfs, etc); maps fuse path to data. 
+ initProc domain.ProcessIface // container's init process + service *containerStateService // backpointer to service + intLock sync.RWMutex // internal lock + extLock sync.Mutex // external lock (exposed via Lock() and Unlock() methods) + usernsInode domain.Inode // inode associated with the container's user namespace + netnsInode domain.Inode // inode associated with the container's network namespace +} + +func newContainer( + id string, + initPid uint32, + ctime time.Time, + uidFirst uint32, + uidSize uint32, + gidFirst uint32, + gidSize uint32, + procRoPaths []string, + procMaskPaths []string, + css *containerStateService, +) domain.ContainerIface { + + cntr := &container{ + id: id, + initPid: initPid, + ctime: ctime, + uidFirst: uidFirst, + uidSize: uidSize, + gidFirst: gidFirst, + gidSize: gidSize, + procRoPaths: procRoPaths, + procMaskPaths: procMaskPaths, + service: css, + } + + return cntr +} + +// +// Getters implementations. +// + +func (c *container) ID() string { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.id +} + +func (c *container) InitPid() uint32 { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.initPid +} + +func (c *container) InitPidFd() libpidfd.PidFd { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.initPidFd +} + +func (c *container) Ctime() time.Time { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.ctime +} + +func (c *container) UID() uint32 { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.uidFirst +} + +func (c *container) GID() uint32 { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.gidFirst +} + +func (c *container) UidSize() uint32 { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.uidSize +} + +func (c *container) GidSize() uint32 { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.gidSize +} + +func (c *container) ProcRoPaths() []string { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.procRoPaths +} + +func (c 
*container) ProcMaskPaths() []string { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.procMaskPaths +} + +func (c *container) InitProc() domain.ProcessIface { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.initProc +} + +func (c *container) IsRootMountID(id int) (bool, error) { + c.intLock.RLock() + defer c.intLock.RUnlock() + + info := c.mountInfoParser.LookupByMountID(id) + if info == nil { + return false, nil + } + + return c.mountInfoParser.IsRootMount(info) +} + +func (c *container) IsRootMount(info *domain.MountInfo) (bool, error) { + c.intLock.RLock() + defer c.intLock.RUnlock() + return c.mountInfoParser.IsRootMount(info) +} + +func (c *container) IsImmutableMountID(id int) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + if info := c.mountInfoParser.LookupByMountID(id); info != nil { + return true + } + + return false +} + +// ExtractInode obtains the inode of any given resource within a sys container's +// file-system. +func (c *container) ExtractInode(path string) (domain.Inode, error) { + return c.mountInfoParser.ExtractInode(path) +} + +func (c *container) IsImmutableRoMountID(id int) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + if info := c.mountInfoParser.LookupByMountID(id); info != nil { + mh := c.service.mts.MountHelper() + return mh.StringToFlags(info.Options)&unix.MS_RDONLY == unix.MS_RDONLY + } + + return false +} + +func (c *container) IsImmutableMountpoint(mp string) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + if info := c.mountInfoParser.LookupByMountpoint(mp); info != nil { + return true + } + + return false +} + +func (c *container) IsImmutableRoMountpoint(mp string) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + if info := c.mountInfoParser.LookupByMountpoint(mp); info != nil { + mh := c.service.mts.MountHelper() + return mh.StringToFlags(info.Options)&unix.MS_RDONLY == unix.MS_RDONLY + } + + return false +} + +func (c *container) 
IsImmutableOverlapMountpoint(mp string) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + info := c.mountInfoParser.LookupByMountpoint(mp) + if info == nil { + return false + } + + return c.mountInfoParser.IsOverlapMount(info) +} + +func (c *container) IsImmutableMount(info *domain.MountInfo) (bool, error) { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.mountInfoParser.IsCloneMount(info, false) +} + +func (c *container) IsImmutableRoMount(info *domain.MountInfo) (bool, error) { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.mountInfoParser.IsCloneMount(info, true) +} + +func (c *container) IsImmutableBindMount(info *domain.MountInfo) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.mountInfoParser.IsBindMount(info) +} + +func (c *container) IsImmutableRoBindMount(info *domain.MountInfo) bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.mountInfoParser.IsRoBindMount(info) +} + +func (c *container) IsRegistrationCompleted() bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + return c.regCompleted +} + +// +// Setters implementations. +// + +func (c *container) update(src *container) error { + c.intLock.Lock() + defer c.intLock.Unlock() + + var err error + + if c.initPid != src.initPid { + // Initialize initProc. 
+ c.initProc = src.service.ProcessService().ProcessCreate( + src.initPid, + src.uidFirst, + src.gidFirst, + ) + c.initPid = src.initPid + c.rootInode = c.initProc.RootInode() + + c.initPidFd, err = libpidfd.Open(int(c.initPid), 0) + if err != nil { + return err + } + } + + if c.ctime != src.ctime { + c.ctime = src.ctime + } + + if c.uidFirst != src.uidFirst { + c.uidFirst = src.uidFirst + } + + if c.uidSize != src.uidSize { + c.uidSize = src.uidSize + } + + if c.gidFirst != src.gidFirst { + c.gidFirst = src.gidFirst + } + + if c.gidSize != src.gidSize { + c.gidSize = src.gidSize + } + + if c.service != src.service { + c.service = src.service + } + + // Unconditional malloc + copy -- think about how to optimize if no changes + // are detected. + c.procRoPaths = make([]string, len(src.procRoPaths)) + copy(c.procRoPaths, src.procRoPaths) + c.procMaskPaths = make([]string, len(src.procMaskPaths)) + copy(c.procMaskPaths, src.procMaskPaths) + + return nil +} + +func (c *container) InitializeMountInfo() error { + c.intLock.Lock() + defer c.intLock.Unlock() + + // A per-container mountInfoParser object will be created here to hold the + // mount-state created by sysbox-runc during container initialization. + if c.mountInfoParser == nil { + mip, err := c.service.mts.NewMountInfoParser(c, c.initProc, true, true, true) + if err != nil { + return err + } + c.mountInfoParser = mip + } + + return nil +} + +func (c *container) IsMountInfoInitialized() bool { + c.intLock.RLock() + defer c.intLock.RUnlock() + + return c.mountInfoParser != nil +} + +// Container's stringer method. Notice that no internal lock is being acquired +// in this method to avoid collisions (and potential deadlocks) with Container's +// public methods. In consequence, callee methods must ensure that container's +// internal (read)lock is acquired prior to invoking this method. 
+func (c *container) string() string { + + return fmt.Sprintf("id = %s, initPid = %d, uid:gid = %v:%v", + formatter.ContainerID{c.id}, int(c.initPid), c.uidFirst, c.gidFirst) +} + +func (c *container) SetCtime(t time.Time) { + c.intLock.Lock() + defer c.intLock.Unlock() + + c.ctime = t +} + +func (c *container) Data(name string, offset int64, data *[]byte) (int, error) { + var err error + + c.intLock.RLock() + defer c.intLock.RUnlock() + + if offset < 0 { + return 0, fmt.Errorf("invalid offset: %d", offset) + } + + if c.dataStore == nil { + c.dataStore = make(map[string][]byte) + } + + currData, ok := c.dataStore[name] + if !ok { + return 0, io.EOF + } + + readLen := int64(len(*data)) + + // Out-of-bounds offset + if offset >= readLen { + return 0, io.EOF + } + + if offset+readLen >= int64(len(currData)) { + // Out-of-bound length (read until end) + *data = currData[offset:] + err = io.EOF + } else { + // In-bound length + *data = currData[offset:(offset + readLen)] + } + + return len(*data), err +} + +func (c *container) SetData(name string, offset int64, data []byte) error { + + c.intLock.Lock() + defer c.intLock.Unlock() + + if offset < 0 { + return fmt.Errorf("invalid offset: %d", offset) + } + + if c.dataStore == nil { + c.dataStore = make(map[string][]byte) + } + + currData, ok := c.dataStore[name] + + // if this is the first write, we expect offset to be 0 (we don't support + // sparse files yet) + if !ok { + if offset != 0 { + return fmt.Errorf("invalid offset: %d", offset) + } + + tmp := make([]byte, len(data)) + copy(tmp, data) + c.dataStore[name] = tmp + + return nil + } + + // if this is not the first write, we expect it to either overwrite the + // existing data (or a subset of it), or extend it contiguously. + if offset > int64(len(currData)) { + return fmt.Errorf("invalid offset: %d", offset) + } + + newData := append(currData[0:offset], data...) 
+ c.dataStore[name] = newData + + return nil +} + +func (c *container) Lock() { + c.extLock.Lock() +} + +func (c *container) Unlock() { + c.extLock.Unlock() +} + +// Exclusively utilized for unit-testing purposes. +func (c *container) SetInitProc(pid, uid, gid uint32) error { + if c.service == nil { + return fmt.Errorf("No css service identified") + } + + if c.service.ProcessService() == nil { + return fmt.Errorf("No pts service identified") + } + + c.initProc = c.service.ProcessService().ProcessCreate(pid, uid, gid) + + return nil +} + +func (c *container) SetRegistrationCompleted() { + c.intLock.Lock() + defer c.intLock.Unlock() + c.regCompleted = true +} diff --git a/sysbox-fs/state/containerDB.go b/sysbox-fs/state/containerDB.go new file mode 100644 index 00000000..44db588b --- /dev/null +++ b/sysbox-fs/state/containerDB.go @@ -0,0 +1,441 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package state + +import ( + "fmt" + "sync" + "time" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + grpcCodes "google.golang.org/grpc/codes" + grpcStatus "google.golang.org/grpc/status" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-libs/formatter" +) + +type containerStateService struct { + sync.RWMutex + + // Map to store the association between container ids (string) and its + // corresponding container data structure. 
+ idTable map[string]*container + + // Map to keep track of containers sharing the same net-ns. + netnsTable map[domain.Inode][]*container + + // Pointer to the fuse-server service engine. + fss domain.FuseServerServiceIface + + // Pointer to the service providing process-handling capabilities. + prs domain.ProcessServiceIface + + // Pointer to the service providing file-system I/O capabilities. + ios domain.IOServiceIface + + // Pointer to the service providing mount helper/parser capabilities. + mts domain.MountServiceIface +} + +func NewContainerStateService() domain.ContainerStateServiceIface { + + newCss := &containerStateService{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + } + + return newCss +} + +func (css *containerStateService) Setup( + fss domain.FuseServerServiceIface, + prs domain.ProcessServiceIface, + ios domain.IOServiceIface, + mts domain.MountServiceIface) { + + css.fss = fss + css.prs = prs + css.ios = ios + css.mts = mts +} + +func (css *containerStateService) ContainerCreate( + id string, + initPid uint32, + ctime time.Time, + uidFirst uint32, + uidSize uint32, + gidFirst uint32, + gidSize uint32, + procRoPaths []string, + procMaskPaths []string, + service domain.ContainerStateServiceIface, +) domain.ContainerIface { + + return newContainer( + id, + initPid, + ctime, + uidFirst, + uidSize, + gidFirst, + gidSize, + procRoPaths, + procMaskPaths, + css, + ) +} + +func (css *containerStateService) ContainerPreRegister(id, netns string) error { + var stateCntr *container + + logrus.Debugf("Container pre-registration started: id = %s", + formatter.ContainerID{id}) + + css.Lock() + + // Ensure that new container's id is not already present. 
+ if _, ok := css.idTable[id]; ok { + css.Unlock() + logrus.Errorf("Container pre-registration error: container %s already present", + formatter.ContainerID{id}) + return grpcStatus.Errorf( + grpcCodes.AlreadyExists, + "Container %s already pre-registered", + id, + ) + } + + cntr := &container{ + id: id, + service: css, + } + + stateCntr = cntr + + // Track sharing of the container's net-ns + cntrSameNetns := []*container{} + + if netns != "" { + var err error + cntrSameNetns, err = css.trackNetns(cntr, netns) + if err != nil { + css.Unlock() + logrus.Errorf("Container pre-registration error: %s has invalid net-ns: %s", + formatter.ContainerID{cntr.id}, err) + return grpcStatus.Errorf(grpcCodes.NotFound, err.Error(), cntr.id) + } + } + + css.idTable[cntr.id] = cntr + + // Create a dedicated fuse-server for each sys container. + // + // Each sys container has a dedicated fuse-server. However, for sys + // containers that share the same net-ns (e.g., for K8s + sysbox pods), the + // fuse-servers for each are passed the same container state object (the + // container struct associated with the first container in the net-ns). + // + // This means that all containers sharing the same net-ns will share the + // state for resources in the container's procfs and sysfs emulated by + // sysbox-fs (e.g., in a K8s + Sysbox pod, all containers see the same + // /proc/uptime). + // + // Note that sharing a net-ns implies sharing a user-ns, because the net-ns + // is "owned" by it's associated user-ns (see user_namespaces (7)). + // + // Design detail: when multiple containers share sysbox-fs emulation state, + // even if the first container is destroyed, a reference to its container + // state object will be held by the fuse servers associated with the other + // containers sharing the fuse state. Therefore those will continue to + // operate properly. Only when all containers sharing the same fuse state are + // destroyed will the container state object be garbage collected. 
+ + if len(cntrSameNetns) > 1 { + stateCntr = cntrSameNetns[0] + logrus.Debugf("Container %s will share sysbox-fs state with %v", + formatter.ContainerID{id}, cntrSameNetns) + } + + err := css.fss.CreateFuseServer(cntr, stateCntr) + if err != nil { + css.Unlock() + logrus.Errorf("Container pre-registration error: unable to initialize fuseServer for container %s: %s", + formatter.ContainerID{id}, err) + return grpcStatus.Errorf( + grpcCodes.Internal, + "Initialization error for container-id %s", + id, + ) + } + + css.Unlock() + + logrus.Infof("Container pre-registration completed: id = %s", + formatter.ContainerID{id}) + + return nil +} + +func (css *containerStateService) ContainerRegister(c domain.ContainerIface) error { + + cntr := c.(*container) + + logrus.Debugf("Container registration started: id = %s", + formatter.ContainerID{cntr.id}) + + css.Lock() + + // Ensure that container's id is already present (preregistration completed). + currCntr, ok := css.idTable[cntr.id] + if !ok { + css.Unlock() + logrus.Errorf("Container registration error: container %s not present", + formatter.ContainerID{cntr.id}) + return grpcStatus.Errorf( + grpcCodes.NotFound, + "Container %s not found", + cntr.id, + ) + } + + // Update existing container with received attributes. + if err := currCntr.update(cntr); err != nil { + css.Unlock() + logrus.Errorf("Container registration error: container %s not updated", + formatter.ContainerID{cntr.id}) + return grpcStatus.Errorf( + grpcCodes.Internal, + "Container %s not updated", + cntr.id, + ) + } + + // In case we don't yet have the netns info for the container's + // init process (e.g., we didn't receive it during pre-registration because + // the container is not in a pod), get it now. 
+ if _, err := css.trackNetns(currCntr, ""); err != nil { + css.Unlock() + logrus.Errorf("Container registration error: %s has invalid net-ns: %s", + formatter.ContainerID{cntr.id}, err) + return grpcStatus.Errorf(grpcCodes.NotFound, err.Error(), cntr.id) + } + + // Let the associated fuse-server know about the sys-container's registration + // being completed. + if err := css.fss.FuseServerCntrRegComplete(cntr); err != nil { + logrus.Errorf("Container registration error: container %s not present", + formatter.ContainerID{cntr.id}) + return grpcStatus.Errorf(grpcCodes.NotFound, err.Error(), cntr.id) + } + + currCntr.SetRegistrationCompleted() + + css.Unlock() + + logrus.Infof("Container registration completed: %v", cntr.string()) + return nil +} + +func (css *containerStateService) ContainerUpdate(c domain.ContainerIface) error { + + cntr := c.(*container) + + logrus.Debugf("Container update started: id = %s", + formatter.ContainerID{cntr.id}) + + css.Lock() + + // Identify the container being updated. + currCntr, ok := css.idTable[cntr.id] + if !ok { + css.Unlock() + logrus.Errorf("Container update failure: container %v not found", + formatter.ContainerID{cntr.id}) + return grpcStatus.Errorf( + grpcCodes.NotFound, + "Container %s not found", + cntr.id, + ) + } + + // Update the existing container-state struct with the one being received. + // Only 'creation-time' attribute is supported for now. 
+ currCntr.SetCtime(cntr.ctime) + css.Unlock() + + logrus.Debugf("Container update completed: id = %s", + formatter.ContainerID{cntr.id}) + + return nil +} + +func (css *containerStateService) ContainerUnregister(c domain.ContainerIface) error { + + cntr := c.(*container) + + logrus.Debugf("Container unregistration started: id = %s", + formatter.ContainerID{cntr.id}) + + css.Lock() + + // Ensure that container's id is already present + _, ok := css.idTable[cntr.id] + if !ok { + css.Unlock() + logrus.Errorf("Container unregistration error: container %s not present", + cntr.id) + return grpcStatus.Errorf( + grpcCodes.NotFound, + "Container %s not found", + cntr.id, + ) + } + + // Close the container's initPidFd. + if cntr.initPidFd != 0 { + unix.Close(int(cntr.InitPidFd())) + } + + // Remove the net-ns tracking info for the unregistered container. + // + // Note: we don't do error checking because this can fail if the netns is not + // yet tracked for the container (e.g., if a container is pre-registered and + // then unregistered because the container failed to start for some reason). 
+ css.untrackNetns(cntr) + + // Destroy the fuse server for the container + err := css.fss.DestroyFuseServer(cntr.id) + if err != nil { + css.Unlock() + logrus.Errorf("Container unregistration error: unable to destroy fuseServer for container %s", + cntr.id) + return grpcStatus.Errorf( + grpcCodes.Internal, + "Container %s unable to destroy associated fuse-server", + cntr.id, + ) + } + + delete(css.idTable, cntr.id) + css.Unlock() + + logrus.Infof("Container unregistration completed: id = %s", + formatter.ContainerID{cntr.id}) + + return nil +} + +func (css *containerStateService) ContainerLookupById(id string) domain.ContainerIface { + css.RLock() + defer css.RUnlock() + + cntr, ok := css.idTable[id] + if !ok { + return nil + } + + return cntr +} + +func (css *containerStateService) FuseServerService() domain.FuseServerServiceIface { + return css.fss +} + +func (css *containerStateService) ProcessService() domain.ProcessServiceIface { + return css.prs +} + +func (css *containerStateService) MountService() domain.MountServiceIface { + return css.mts +} + +func (css *containerStateService) ContainerDBSize() int { + css.RLock() + defer css.RUnlock() + + return len(css.idTable) +} + +// trackNetns keeps track of the container's network namespace. 
+func (css *containerStateService) trackNetns(cntr *container, netns string) ([]*container, error) { + + var ( + cntrSameNetns []*container + netnsInode uint64 + err error + ok bool + ) + + if cntr.netnsInode == 0 { + + if netns != "" { + fnode := css.ios.NewIOnode("", netns, 0) + netnsInode, err = fnode.GetNsInode() + if err != nil { + return nil, fmt.Errorf("Error getting netns inode: %v", err) + } + } else { + netnsInode, err = cntr.InitProc().NetNsInode() + if err != nil { + return nil, fmt.Errorf("Error getting netns inode: %v", err) + } + } + + cntr.netnsInode = netnsInode + + // Update the netnsTable with this container's info + cntrSameNetns, ok = css.netnsTable[netnsInode] + if ok { + cntrSameNetns = append(cntrSameNetns, cntr) + } else { + cntrSameNetns = []*container{cntr} + } + css.netnsTable[netnsInode] = cntrSameNetns + } + + return cntrSameNetns, nil +} + +// untrackNetns removes tracking info for the given container's net-namespace. +func (css *containerStateService) untrackNetns(cntr *container) error { + + // Find all containers sharing the same netns. + cntrSameNetns, ok := css.netnsTable[cntr.netnsInode] + if !ok { + return fmt.Errorf("could not find entry in netnsTable for container %s", cntr.id) + } + + // Remove the unregistered container from the list of containers sharing the netns. + newCntrSameNetns := []*container{} + for _, c := range cntrSameNetns { + if c.id == cntr.id { + continue + } + newCntrSameNetns = append(newCntrSameNetns, c) + } + + if len(newCntrSameNetns) > 0 { + css.netnsTable[cntr.netnsInode] = newCntrSameNetns + } else { + delete(css.netnsTable, cntr.netnsInode) + } + + return nil +} diff --git a/sysbox-fs/state/containerDB_test.go b/sysbox-fs/state/containerDB_test.go new file mode 100644 index 00000000..70c3ba64 --- /dev/null +++ b/sysbox-fs/state/containerDB_test.go @@ -0,0 +1,775 @@ +// +// Copyright 2019-2020 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package state + +import ( + "io/ioutil" + "reflect" + "sync" + "testing" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/mocks" + "github.com/nestybox/sysbox-fs/process" + "github.com/nestybox/sysbox-fs/sysio" + "github.com/sirupsen/logrus" +) + +// Sysbox-fs global services for all state's pkg unit-tests. +var ios domain.IOServiceIface +var prs domain.ProcessServiceIface +var nss *mocks.NSenterServiceIface +var fss *mocks.FuseServerServiceIface +var hds *mocks.HandlerServiceIface +var mts *mocks.MountServiceIface + +func TestMain(m *testing.M) { + + // Disable log generation during UT. + logrus.SetOutput(ioutil.Discard) + + // + // Test-cases common settings. + // + ios = sysio.NewIOService(domain.IOMemFileService) + prs = process.NewProcessService() + nss = &mocks.NSenterServiceIface{} + hds = &mocks.HandlerServiceIface{} + fss = &mocks.FuseServerServiceIface{} + mts = &mocks.MountServiceIface{} + + prs.Setup(ios) + + // Run test-suite. 
+ m.Run() +} + +func Test_containerStateService_Setup(t *testing.T) { + type fields struct { + RWMutex sync.RWMutex + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + mts domain.MountServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + } + + type args struct { + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + mts domain.MountServiceIface + } + + a1 := args{ + fss: fss, + prs: prs, + ios: ios, + mts: mts, + } + + tests := []struct { + name string + fields fields + args args + }{ + {"1", f1, a1}, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + css := &containerStateService{ + RWMutex: tt.fields.RWMutex, + idTable: tt.fields.idTable, + netnsTable: tt.fields.netnsTable, + fss: tt.fields.fss, + prs: tt.fields.prs, + ios: tt.fields.ios, + mts: tt.fields.mts, + } + css.Setup(tt.args.fss, tt.args.prs, tt.args.ios, tt.args.mts) + }) + } +} + +func Test_containerStateService_ContainerCreate(t *testing.T) { + + type fields struct { + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + mts domain.MountServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + mts: mts, + } + + css := &containerStateService{ + idTable: f1.idTable, + netnsTable: f1.netnsTable, + fss: f1.fss, + prs: f1.prs, + ios: f1.ios, + mts: f1.mts, + } + type args struct { + id string + initPid uint32 + ctime time.Time + uidFirst uint32 + uidSize uint32 + gidFirst uint32 + gidSize uint32 + procRoPaths []string + procMaskPaths []string + } + + 
// Manually create a container to compare with. + var c1 = &container{ + id: "1", + initPid: 1001, + ctime: time.Time{}, + uidFirst: 1, + uidSize: 65535, + gidFirst: 1, + gidSize: 65535, + procRoPaths: nil, + procMaskPaths: nil, + dataStore: nil, + initProc: nil, + service: css, + } + + tests := []struct { + name string + fields fields + args args + want domain.ContainerIface + }{ + // + // Testcase 1: Compare previously create container with a new one to be + // built through the container's constructor method. They should fully + // match. + // + {"1", f1, args{ + c1.id, + c1.initPid, + c1.ctime, + c1.uidFirst, + c1.uidSize, + c1.gidFirst, + c1.gidSize, + nil, + nil, + }, c1}, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := css.ContainerCreate( + tt.args.id, + tt.args.initPid, + tt.args.ctime, + tt.args.uidFirst, + tt.args.uidSize, + tt.args.gidFirst, + tt.args.gidSize, + tt.args.procRoPaths, + tt.args.procMaskPaths, + css); !reflect.DeepEqual(got, tt.want) { + t.Errorf("containerStateService.ContainerCreate() = %v, want %v", + got, tt.want) + } + }) + } +} + +func Test_containerStateService_ContainerPreRegister(t *testing.T) { + + type fields struct { + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + mts domain.MountServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + mts: mts, + } + + css := &containerStateService{ + idTable: f1.idTable, + netnsTable: f1.netnsTable, + fss: f1.fss, + prs: f1.prs, + ios: f1.ios, + mts: f1.mts, + } + + var c1 = &container{ + id: "c1", + service: css, + } + + var c2 = &container{ + id: "c2", + service: css, + } + + type args struct { + id string + id2 string + } + tests := []struct { + name string + fields fields + args args + 
wantErr bool + prepare func() + }{ + { + // + // Test-case 1: Pre-register a new container. + // + name: "1", + fields: f1, + args: args{"c1", "c1"}, + wantErr: false, + prepare: func() { + + css.FuseServerService().(*mocks.FuseServerServiceIface).On( + "CreateFuseServer", c1, c1).Return(nil) + }, + }, + { + // + // Test-case 2: Pre-register an already-present container (with + // matching container ID). Error expected. + // + name: "2", + fields: f1, + args: args{"c2", "c2"}, + wantErr: true, + prepare: func() { + + f1.idTable[c2.id] = c2 + css.FuseServerService().(*mocks.FuseServerServiceIface).On( + "CreateFuseServer", c2, c2).Return(nil) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare() + } + + if err := css.ContainerPreRegister(tt.args.id, ""); (err != nil) != tt.wantErr { + t.Errorf("containerStateService.ContainerPreRegister() error = %v, wantErr %v", + err, tt.wantErr) + } + }) + } +} + +func Test_containerStateService_ContainerRegister(t *testing.T) { + + type fields struct { + RWMutex sync.RWMutex + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + mts domain.MountServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + mts: mts, + } + + var c1 = &container{ + id: "c1", + initPid: 1001, + initProc: f1.prs.ProcessCreate(1001, 0, 0), + } + + var c2 = &container{ + id: "c2", + } + + var c3 = &container{ + id: "c3", + initPid: 3003, + initProc: f1.prs.ProcessCreate(3003, 0, 0), + } + + type args struct { + c domain.ContainerIface + } + + tests := []struct { + name string + fields fields + args args + wantErr bool + prepare func(css *containerStateService) + }{ + { + // + // Test-case 1: Register a 
pre-registered container with valid + // user-ns. + // + name: "1", + fields: f1, + args: args{c1}, + wantErr: false, + prepare: func(css *containerStateService) { + + c1.service = css + + c1.InitProc().CreateNsInodes(123456) + + f1.idTable[c1.id] = c1 + + c1.service.MountService().(*mocks.MountServiceIface).On( + "NewMountInfoParser", c1, c1.initProc, true, true, true).Return(nil, nil) + + css.FuseServerService().(*mocks.FuseServerServiceIface).On( + "FuseServerCntrRegComplete", c1).Return(nil) + }, + }, + { + // + // Test-case 2: Register a non-pre-registered container. Error + // expected. + // + name: "2", + fields: f1, + args: args{c2}, + wantErr: true, + prepare: func(css *containerStateService) {}, + }, + { + // + // Test-case 3: Register a pre-registered container with missing + // user-ns inode (i.e. missing /proc/pid/ns/). Error + // expected. + // + name: "3", + fields: f1, + args: args{c3}, + wantErr: true, + prepare: func(css *containerStateService) { + + c3.service = css + f1.idTable[c3.id] = c3 + + css.MountService().(*mocks.MountServiceIface).On( + "NewMountInfoParser", c3, c3.initProc, true, true, true).Return(nil, nil) + + css.FuseServerService().(*mocks.FuseServerServiceIface).On( + "FuseServerCntrRegComplete", c3).Return(nil) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + css := &containerStateService{ + RWMutex: tt.fields.RWMutex, + idTable: tt.fields.idTable, + netnsTable: tt.fields.netnsTable, + fss: tt.fields.fss, + prs: tt.fields.prs, + ios: tt.fields.ios, + mts: tt.fields.mts, + } + + // Initialize memory-based mock FS. + css.ios.RemoveAllIOnodes() + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare(css) + } + + if err := css.ContainerRegister(tt.args.c); (err != nil) != tt.wantErr { + t.Errorf("containerStateService.ContainerRegister() error = %v, wantErr %v", + err, tt.wantErr) + } + }) + } +} + +func Test_containerStateService_ContainerUpdate(t *testing.T) { + type fields struct { + RWMutex sync.RWMutex + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + mts domain.MountServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + mts: mts, + } + + var c1 = &container{ + id: "c1", + initProc: f1.prs.ProcessCreate(1001, 0, 0), + } + f1.idTable[c1.id] = c1 + + var c2 = &container{ + id: "c2", + initProc: f1.prs.ProcessCreate(2002, 0, 0), + } + + type args struct { + c domain.ContainerIface + } + tests := []struct { + name string + fields fields + args args + wantErr bool + prepare func(css *containerStateService) + }{ + { + // + // Test-case 1: Update a properly registered container. + // + name: "1", + fields: f1, + args: args{c1}, + wantErr: false, + prepare: func(css *containerStateService) { + + c1.service = css + + c1.InitProc().CreateNsInodes(123456) + inode, _ := c1.InitProc().NetNsInode() + + f1.idTable[c1.id] = c1 + f1.netnsTable[inode] = []*container{c1} + + c1.service.MountService().(*mocks.MountServiceIface).On( + "NewMountInfoParser", c1, c1.initProc, true, true, true).Return(nil, nil) + }, + }, + { + // + // Test-case 2: Update a container whose container-ID is not present + // in the idTable. Error expected. 
+ // + name: "2", + fields: f1, + args: args{c2}, + wantErr: true, + prepare: func(css *containerStateService) { + + c2.service = css + + c2.InitProc().CreateNsInodes(123456) + inode, _ := c2.InitProc().NetNsInode() + + f1.netnsTable[inode] = []*container{c2} + + c2.service.MountService().(*mocks.MountServiceIface).On( + "NewMountInfoParser", c2, c2.initProc, true, true, true).Return(nil, nil) + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + css := &containerStateService{ + RWMutex: tt.fields.RWMutex, + idTable: tt.fields.idTable, + netnsTable: tt.fields.netnsTable, + fss: tt.fields.fss, + prs: tt.fields.prs, + ios: tt.fields.ios, + mts: tt.fields.mts, + } + + // Initialize memory-based mock FS. + css.ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(css) + } + + if err := css.ContainerUpdate(tt.args.c); (err != nil) != tt.wantErr { + t.Errorf("containerStateService.ContainerUpdate() error = %v, wantErr %v", + err, tt.wantErr) + } + }) + } +} + +func Test_containerStateService_ContainerUnregister(t *testing.T) { + type fields struct { + RWMutex sync.RWMutex + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + } + + var c1 = &container{ + id: "c1", + initProc: f1.prs.ProcessCreate(1001, 0, 0), + } + + var c2 = &container{ + id: "c2", + initProc: f1.prs.ProcessCreate(2002, 0, 0), + } + + type args struct { + c domain.ContainerIface + } + tests := []struct { + name string + fields fields + args args + wantErr bool + prepare func(css *containerStateService) + }{ + { + // + // Test-case 1: Unregister a valid (properly registered) container. 
+ // + name: "1", + fields: f1, + args: args{c1}, + wantErr: false, + prepare: func(css *containerStateService) { + + c1.InitProc().CreateNsInodes(123456) + inode, _ := c1.InitProc().NetNsInode() + + c1.service = css + + f1.idTable[c1.id] = c1 + f1.netnsTable[inode] = []*container{c1} + + css.FuseServerService().(*mocks.FuseServerServiceIface).On( + "DestroyFuseServer", c1.id).Return(nil) + }, + }, + { + // + // Test-case 2: Unregister a container with an id not present in + // idTable. Error expected. + // + name: "2", + fields: f1, + args: args{c2}, + wantErr: true, + prepare: func(css *containerStateService) { + + c2.initProc.CreateNsInodes(123456) + inode, _ := c2.InitProc().NetNsInode() + + c2.service = css + + f1.netnsTable[inode] = []*container{c2} + }, + }, + { + // + // Test-case 3: Unregister a container with valid ID but with missing + // netns. Error not expected (this can happen when a container pre-registers + // and immediately unregisters due to some error in sysbox-runc). + // + name: "3", + fields: f1, + args: args{c1}, + wantErr: false, + prepare: func(css *containerStateService) { + + c1.service = css + f1.idTable[c1.id] = c1 + + // clear the netns map + for entry := range f1.netnsTable { + delete(f1.netnsTable, entry) + } + }, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + css := &containerStateService{ + RWMutex: tt.fields.RWMutex, + idTable: tt.fields.idTable, + netnsTable: tt.fields.netnsTable, + fss: tt.fields.fss, + prs: tt.fields.prs, + ios: tt.fields.ios, + } + + // Initialize memory-based mock FS. + css.ios.RemoveAllIOnodes() + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare(css) + } + + if err := css.ContainerUnregister(tt.args.c); (err != nil) != tt.wantErr { + t.Errorf("containerStateService.ContainerUnregister() error = %v, wantErr %v", + err, tt.wantErr) + } + }) + } +} + +func Test_containerStateService_ContainerLookupById(t *testing.T) { + type fields struct { + RWMutex sync.RWMutex + idTable map[string]*container + netnsTable map[domain.Inode][]*container + fss domain.FuseServerServiceIface + prs domain.ProcessServiceIface + ios domain.IOServiceIface + } + + var f1 = fields{ + idTable: make(map[string]*container), + netnsTable: make(map[domain.Inode][]*container), + fss: fss, + prs: prs, + ios: ios, + } + + var c1 = &container{ + id: "c1", + } + f1.idTable[c1.id] = c1 + + type args struct { + id string + } + + tests := []struct { + name string + fields fields + args args + want domain.ContainerIface + }{ + // Lookup a valid/existing container. + {"1", f1, args{"c1"}, c1}, + + // Lookup a container with no matching entry in the idTable. + // Error expected. + {"2", f1, args{"c2"}, nil}, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + css := &containerStateService{ + RWMutex: tt.fields.RWMutex, + idTable: tt.fields.idTable, + netnsTable: tt.fields.netnsTable, + fss: tt.fields.fss, + prs: tt.fields.prs, + ios: tt.fields.ios, + } + + if got := css.ContainerLookupById(tt.args.id); !reflect.DeepEqual(got, tt.want) { + t.Errorf("containerStateService.ContainerLookupById() = %v, want %v", + got, tt.want) + } + }) + } +} diff --git a/sysbox-fs/state/container_test.go b/sysbox-fs/state/container_test.go new file mode 100644 index 00000000..b2088164 --- /dev/null +++ b/sysbox-fs/state/container_test.go @@ -0,0 +1,287 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package state + +import ( + "io" + "reflect" + "testing" + "time" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/mocks" + "github.com/stretchr/testify/assert" +) + +func Test_container_ID(t *testing.T) { + + var cs1 = &container{ + id: "cs1", + } + + var cs2 = &container{ + id: "", + } + + tests := []struct { + name string + c *container + want string + }{ + // Regular case. + {"1", cs1, "cs1"}, + + // Lame testcase -- of course it works. + {"2", cs2, ""}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.c.ID(); got != tt.want { + t.Errorf("container.ID() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_container_InitPid(t *testing.T) { + + var cs1 = &container{ + initPid: 1111, + } + + var cs2 = &container{ + initPid: 0, + } + + tests := []struct { + name string + c *container + want uint32 + }{ + // Regular case. + {"1", cs1, 1111}, + + // Lame testcase -- of course it works. + {"2", cs2, 0}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.c.InitPid(); got != tt.want { + t.Errorf("container.InitPid() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_container_Ctime(t *testing.T) { + + var cs1 = &container{ + ctime: time.Date(2019, 05, 01, 0, 0, 0, 0, time.UTC), + } + + var cs2 = &container{ + ctime: time.Time{}, + } + + tests := []struct { + name string + c *container + want time.Time + }{ + // Regular case. + {"1", cs1, time.Date(2019, 05, 01, 0, 0, 0, 0, time.UTC)}, + + // Lame testcase -- of course it works. 
+ {"2", cs2, time.Time{}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := tt.c.Ctime(); !reflect.DeepEqual(got, tt.want) { + t.Errorf("container.Ctime() = %v, want %v", got, tt.want) + } + }) + } +} + +func Test_container_SetCtime(t *testing.T) { + + var cs1 = &container{ + ctime: time.Date(2019, 05, 01, 0, 0, 0, 0, time.UTC), + } + + type args struct { + t time.Time + } + tests := []struct { + name string + c *container + args args + }{ + // Regular case. + {"1", cs1, args{time.Date(2019, 05, 01, 0, 0, 0, 0, time.UTC)}}, + + // Lame testcase -- of course it works. + {"2", cs1, args{time.Time{}}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.c.SetCtime(tt.args.t) + }) + + assert.Equal(t, tt.args.t, tt.c.Ctime(), "ctime fields are not matching") + } +} + +func Test_container_SetData(t *testing.T) { + + var cs1 = &container{ + dataStore: make(map[string][]byte), + } + + cs1.dataStore["/proc/cpuinfo"] = []byte("FOO") + + type args struct { + name string + offset int64 + data []byte + } + + tests := []struct { + name string + c *container + args args + }{ + // Insert new data record. + {"1", cs1, args{"/proc/testing", 0, []byte("12345")}}, + + // Update existing data record. 
+ {"2", cs1, args{"/proc/cpuinfo", 0, []byte("FOO \n BAR")}}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + tt.c.SetData(tt.args.name, tt.args.offset, tt.args.data) + + buf := make([]byte, 20) + _, err := tt.c.Data(tt.args.name, 0, &buf) + if err != nil && err != io.EOF { + t.Errorf("Unexpected result during execution of testcase %v", tt.name) + } + + assert.Equal(t, tt.args.data, buf, "data fields are not matching") + }) + } +} + +func Test_container_update(t *testing.T) { + type fields struct { + id string + initPid uint32 + ctime time.Time + uidFirst uint32 + uidSize uint32 + gidFirst uint32 + gidSize uint32 + procRoPaths []string + procMaskPaths []string + specPaths map[string]struct{} + dataStore map[string][]byte + initProc domain.ProcessIface + service *containerStateService + } + f1 := fields{ + id: "1", + initPid: 1011, + initProc: prs.ProcessCreate(1001, 0, 0), + } + + // Create local css as it's required by cntr.update() method. + css := &containerStateService{ + idTable: nil, + netnsTable: nil, + fss: fss, + prs: prs, + ios: ios, + mts: &mocks.MountServiceIface{}, + } + + type args struct { + src *container + } + a1 := args{ + src: &container{ + id: "1", + initPid: 1011, + ctime: time.Time{}, + uidFirst: 1, + uidSize: 65535, + gidFirst: 1, + gidSize: 65535, + procRoPaths: nil, + procMaskPaths: nil, + dataStore: nil, + service: css, + }, + } + + tests := []struct { + name string + fields fields + args args + wantErr bool + }{ + {"1", f1, a1, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := &container{ + id: tt.fields.id, + initPid: tt.fields.initPid, + ctime: tt.fields.ctime, + uidFirst: tt.fields.uidFirst, + uidSize: tt.fields.uidSize, + gidFirst: tt.fields.gidFirst, + gidSize: tt.fields.gidSize, + procRoPaths: tt.fields.procRoPaths, + procMaskPaths: tt.fields.procMaskPaths, + dataStore: tt.fields.dataStore, + initProc: tt.fields.initProc, + service: css, + } + + 
c.service.MountService().(*mocks.MountServiceIface).On( + "NewMountInfoParser", c, c.initProc, true, true, true).Return(nil, nil) + + if err := c.update(tt.args.src); (err != nil) != tt.wantErr { + t.Errorf("container.update() error = %v, wantErr %v", + err, tt.wantErr) + } + + assert.Equal(t, c.initPid, tt.args.src.initPid) + assert.Equal(t, c.ctime, tt.args.src.ctime) + assert.Equal(t, c.uidFirst, tt.args.src.uidFirst) + assert.Equal(t, c.uidSize, tt.args.src.uidSize) + assert.Equal(t, c.gidFirst, tt.args.src.gidFirst) + assert.Equal(t, c.gidSize, tt.args.src.gidSize) + }) + } +} diff --git a/sysbox-fs/sysio/ionode.go b/sysbox-fs/sysio/ionode.go new file mode 100644 index 00000000..65f46e19 --- /dev/null +++ b/sysbox-fs/sysio/ionode.go @@ -0,0 +1,40 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package sysio + +import ( + "github.com/sirupsen/logrus" + + "github.com/nestybox/sysbox-fs/domain" +) + +func NewIOService(t domain.IOServiceType) domain.IOServiceIface { + + switch t { + + case domain.IOOsFileService: + return newIOFileService(domain.IOOsFileService) + + case domain.IOMemFileService: + return newIOFileService(domain.IOMemFileService) + + default: + logrus.Panic("Unsupported ioService required: ", t) + } + + return nil +} diff --git a/sysbox-fs/sysio/ionodeFile.go b/sysbox-fs/sysio/ionodeFile.go new file mode 100644 index 00000000..e3afd67c --- /dev/null +++ b/sysbox-fs/sysio/ionodeFile.go @@ -0,0 +1,333 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package sysio + +import ( + "bufio" + "fmt" + "io" + "io/ioutil" + "os" + "strconv" + "syscall" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/sirupsen/logrus" + "github.com/spf13/afero" +) + +// Ensure IOnodeFile implements IOnode's interfaces. +var _ domain.IOServiceIface = (*ioFileService)(nil) +var _ domain.IOnodeIface = (*IOnodeFile)(nil) + +// I/O Service providing FS interaction capabilities. 
+type ioFileService struct { + fsType domain.IOServiceType + appFs afero.Fs +} + +func newIOFileService(fsType domain.IOServiceType) domain.IOServiceIface { + + var fs = &ioFileService{} + + if fsType == domain.IOMemFileService { + fs.appFs = afero.NewMemMapFs() + fs.fsType = domain.IOMemFileService + } else { + fs.appFs = afero.NewOsFs() + fs.fsType = domain.IOOsFileService + } + + return fs +} + +func (s *ioFileService) NewIOnode( + n string, + p string, + mode os.FileMode) domain.IOnodeIface { + newFile := &IOnodeFile{ + name: n, + path: p, + mode: mode, + fss: s, + } + + return newFile +} + +// Eliminate all nodes from a previously created file-system. Utilized exclusively +// for unit-testing purposes (i.e. afero.MemFs). +func (s *ioFileService) RemoveAllIOnodes() error { + if err := s.appFs.RemoveAll("/"); err != nil { + return err + } + + return nil +} + +func (i *ioFileService) GetServiceType() domain.IOServiceType { + return i.fsType +} + +// IOnode class specialization for FS interaction. 
+type IOnodeFile struct { + name string + path string + flags int + mode os.FileMode + file afero.File + fss *ioFileService +} + +func (i *IOnodeFile) Open() error { + + file, err := i.fss.appFs.OpenFile(i.path, i.flags, i.mode) + if err != nil { + return err + } + + i.file = file + + return nil +} + +func (i *IOnodeFile) Read(p []byte) (int, error) { + + if i.file == nil { + return 0, fmt.Errorf("File not currently opened.") + } + + return i.file.Read(p) + +} + +func (i *IOnodeFile) Write(p []byte) (int, error) { + + if i.file == nil { + return 0, fmt.Errorf("File not currently opened.") + } + + return i.file.Write(p) +} + +func (i *IOnodeFile) Close() error { + + if i.file == nil { + return fmt.Errorf("File not currently opened.") + } + + return i.file.Close() +} + +func (i *IOnodeFile) Seek(offset int64, whence int) (int64, error) { + + if i.file == nil { + return 0, fmt.Errorf("File not currently opened.") + } + + return i.file.Seek(int64(offset), whence) +} + +func (i *IOnodeFile) ReadAt(p []byte, off int64) (n int, err error) { + + if i.file == nil { + return 0, fmt.Errorf("File not currently opened.") + } + + return i.file.ReadAt(p, off) +} + +func (i *IOnodeFile) ReadDirAll() ([]os.FileInfo, error) { + return afero.ReadDir(i.fss.appFs, i.path) +} + +func (i *IOnodeFile) ReadFile() ([]byte, error) { + + var ( + content []byte + err error + ) + + if i.fss.fsType == domain.IOMemFileService { + content, err = afero.ReadFile(i.fss.appFs, i.path) + if err != nil { + return nil, err + } + } else { + content, err = ioutil.ReadFile(i.path) + if err != nil { + return nil, err + } + } + + return content, nil +} + +func (i *IOnodeFile) ReadLine() (string, error) { + + var res string + + // Open file and return empty string if an error is received. + inFile, err := i.fss.appFs.Open(i.path) + if err != nil { + return res, err + } + defer inFile.Close() + + // Rely on bufio scanner to be able to break file in lines. 
+ scanner := bufio.NewScanner(inFile) + scanner.Split(bufio.ScanLines) + scanner.Scan() + res = scanner.Text() + + return res, nil +} + +func (i *IOnodeFile) ReadLink() (string, error) { + return os.Readlink(i.path) +} + +func (i *IOnodeFile) WriteAt(p []byte, off int64) (n int, err error) { + + if i.file == nil { + return 0, fmt.Errorf("File not currently opened.") + } + + return i.file.WriteAt(p, off) +} + +func (i *IOnodeFile) WriteFile(p []byte) error { + + if i.fss.fsType == domain.IOMemFileService { + err := afero.WriteFile(i.fss.appFs, i.path, p, 0644) + if err != nil { + return err + } + + return nil + } + + return ioutil.WriteFile(i.path, p, i.mode) +} + +func (i *IOnodeFile) Mkdir() error { + return i.fss.appFs.Mkdir(i.path, i.mode) +} + +func (i *IOnodeFile) MkdirAll() error { + return i.fss.appFs.MkdirAll(i.path, i.mode) +} + +// Collects the namespace inodes of the passed /proc/pid/ns/ file. +func (i *IOnodeFile) GetNsInode() (domain.Inode, error) { + + // In unit-testing scenarios we will extract the nsInode value from the + // file content itself. This is a direct consequence of afero-fs lacking + // Sys() api support. 
+ if i.fss.fsType == domain.IOMemFileService { + content, err := afero.ReadFile(i.fss.appFs, i.path) + if err != nil { + return 0, err + } + + nsInode, err := strconv.ParseUint(string(content), 10, 64) + if err != nil { + return 0, err + } + + return nsInode, nil + } + + info, err := os.Stat(i.path) + if err != nil { + logrus.Errorf("No namespace file found %s", i.path) + return 0, err + } + + stat := info.Sys().(*syscall.Stat_t) + + return stat.Ino, nil +} + +func (i *IOnodeFile) Stat() (os.FileInfo, error) { + return i.fss.appFs.Stat(i.path) +} + +func (i *IOnodeFile) Lstat() (os.FileInfo, error) { + return os.Lstat(i.path) +} + +func (i *IOnodeFile) SeekReset() (int64, error) { + + if i.file == nil { + return 0, fmt.Errorf("File not currently opened.") + } + + return i.file.Seek(io.SeekStart, 0) +} + +// Eliminate a node from a previously created file-system. Utilized exclusively +// for unit-testing purposes (i.e. afero.MemFs). +func (i *IOnodeFile) Remove() error { + if err := i.fss.appFs.Remove(i.path); err != nil { + return err + } + + return nil +} + +// Eliminate all nodes under the path indicated by the given ionode. Utilized +// exclusively for unit-testing purposes (i.e. afero.MemFs). 
+func (i *IOnodeFile) RemoveAll() error { + if err := i.fss.appFs.RemoveAll(i.path); err != nil { + return err + } + + return nil +} + +func (i *IOnodeFile) Name() string { + return i.name +} + +func (i *IOnodeFile) Path() string { + return i.path +} + +func (i *IOnodeFile) OpenFlags() int { + return i.flags +} + +func (i *IOnodeFile) OpenMode() os.FileMode { + return i.mode +} + +func (i *IOnodeFile) SetName(s string) { + i.name = s +} + +func (i *IOnodeFile) SetPath(s string) { + i.path = s +} + +func (i *IOnodeFile) SetOpenFlags(flags int) { + i.flags = flags +} + +func (i *IOnodeFile) SetOpenMode(mode os.FileMode) { + i.mode = mode +} diff --git a/sysbox-fs/sysio/ionodeFile_test.go b/sysbox-fs/sysio/ionodeFile_test.go new file mode 100644 index 00000000..4a5675bd --- /dev/null +++ b/sysbox-fs/sysio/ionodeFile_test.go @@ -0,0 +1,1112 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package sysio_test + +import ( + "io/ioutil" + "os" + "reflect" + "testing" + + "github.com/nestybox/sysbox-fs/domain" + "github.com/nestybox/sysbox-fs/sysio" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" +) + +var ios domain.IOServiceIface + +func TestMain(m *testing.M) { + + // Disable log generation during UT. 
+ logrus.SetOutput(ioutil.Discard) + + ios = sysio.NewIOService(domain.IOMemFileService) + + m.Run() +} + +func TestIOnodeFile_Open(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + tests := []struct { + name string + fields fields + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular Open operation. No errors expected. + // + name: "1", + fields: f1, + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("content for file 0123456789")) + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. + // + name: "2", + fields: f1, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + if err := i.Open(); (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.Open() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestIOnodeFile_Read(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + type args struct { + p []byte + } + + var a1 = args{ + p: make([]byte, len("content for file 0123456789")), + } + + tests := []struct { + name string + fields fields + args args + wantN int + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular Read operation. No errors expected. 
+ // + name: "1", + fields: f1, + args: a1, + wantN: len(a1.p), + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("content for file 0123456789")) + + // Open file as Read() expects it to be already opened. + i.Open() + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. + // + name: "2", + fields: f1, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + gotN, err := i.Read(tt.args.p) + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.Read() error = %v, wantErr %v", err, tt.wantErr) + return + } + if gotN != tt.wantN { + t.Errorf("IOnodeFile.Read() = %v, want %v", gotN, tt.wantN) + } + }) + } +} + +func TestIOnodeFile_Write(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + type args struct { + p []byte + } + + var a1 = args{ + p: []byte("content for file 0123456789"), + } + + tests := []struct { + name string + fields fields + args args + wantN int + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular Write operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + wantN: len(a1.p), + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("no content")) + + // Open file as Write() expects it to be already opened. + i.SetOpenFlags(int(os.O_WRONLY)) + i.Open() + + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to write + // is not present -- missing file-descriptor. 
+ // + name: "2", + fields: f1, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + gotN, err := i.Write(tt.args.p) + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.Write() error = %v, wantErr %v", err, tt.wantErr) + return + } + if gotN != tt.wantN { + t.Errorf("IOnodeFile.Write() = %v, want %v", gotN, tt.wantN) + } + }) + } +} + +func TestIOnodeFile_Close(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + tests := []struct { + name string + fields fields + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular Close operation. No errors expected. + // + name: "1", + fields: f1, + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("file content 0123456789")) + + // Open file as Close() expects it to be already opened. + i.Open() + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. + // + name: "2", + fields: f1, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare(i) + } + + if err := i.Close(); (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.Close() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestIOnodeFile_ReadAt(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + type args struct { + p []byte + off int64 + } + + var bytesToRead = 5 + var a1 = args{ + p: make([]byte, bytesToRead), + off: int64(len("file content 0123456789") - bytesToRead), + } + + var a2 = args{ + p: make([]byte, bytesToRead), + off: int64(len("file content 0123456789") - bytesToRead + 1), + } + + tests := []struct { + name string + fields fields + args args + wantN int + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular ReadAt operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + wantN: bytesToRead, + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("file content 0123456789")) + + // Open file as Close() expects it to be already opened. + i.Open() + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. + // + name: "2", + fields: f1, + args: a1, + wantN: 0, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + { + // + // Test-case 3: Verify proper behavior when there's no enough data + // to read (offset too large). No errors expected. + // + name: "3", + fields: f1, + args: a2, + wantN: bytesToRead - 1, + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("file content 0123456789")) + + // Open file as Read() expects it to be already opened. + i.Open() + }, + }, + } + + // + // Testcase executions. 
+ // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + gotN, err := i.ReadAt(tt.args.p, tt.args.off) + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.ReadAt() error = %v, wantErr %v", err, tt.wantErr) + return + } + if gotN != tt.wantN { + t.Errorf("IOnodeFile.ReadAt() = %v, want %v", gotN, tt.wantN) + } + }) + } +} + +func TestIOnodeFile_ReadDirAll(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "net", + path: "/proc/sys/net", + mode: 0600, + } + + // In this case we need to Wipe out the memory-based fs built in + // previous test-cases. + ios.RemoveAllIOnodes() + + // Build expected-response slice. + var expectedResult = []os.FileInfo{ + &domain.FileInfo{ + Fname: "ipv4", + FisDir: true, + }, + &domain.FileInfo{ + Fname: "ipv6", + FisDir: true, + }, + } + + // Create memfs entries corresponding to above expectedResult. + base := ios.NewIOnode(f1.name, f1.path, 0) + if err := base.Mkdir(); err != nil { + t.Errorf("Could not create base-dir %s element", base.Path()) + } + for _, v := range expectedResult { + i := ios.NewIOnode(v.Name(), base.Path()+"/"+v.Name(), 0) + if err := i.Mkdir(); err != nil { + t.Errorf("Could not create expectedResult %s element", i.Path()) + } + } + + tests := []struct { + name string + fields fields + want []os.FileInfo + wantErr bool + prepare func(i domain.IOnodeIface) + unwind func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular ReadDirAll operation. No errors expected. + // + name: "1", + fields: f1, + want: expectedResult, + wantErr: false, + prepare: func(i domain.IOnodeIface) {}, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. 
+ // + name: "2", + fields: f1, + want: nil, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + got, err := i.ReadDirAll() + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.ReadDirAll() error = %v, wantErr %v", + err, tt.wantErr) + return + } + + assert.Equal(t, len(tt.want), len(got)) + for i, v := range got { + assert.Equal(t, v.Name(), tt.want[i].Name()) + assert.Equal(t, v.ModTime(), tt.want[i].ModTime()) + assert.Equal(t, v.IsDir(), tt.want[i].IsDir()) + } + + // Wipe out memfs entries. + ios.RemoveAllIOnodes() + }) + } +} + +func TestIOnodeFile_ReadFile(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + tests := []struct { + name string + fields fields + want []byte + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular ReadFile operation. No errors expected. + // + name: "1", + fields: f1, + want: []byte("file content 0123456789"), + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("file content 0123456789")) + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. + // + name: "2", + fields: f1, + want: nil, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare(i) + } + + got, err := i.ReadFile() + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.ReadFile() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("IOnodeFile.ReadFile() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestIOnodeFile_ReadLine(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + tests := []struct { + name string + fields fields + want string + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular ReadLine operation. No errors expected. + // + name: "1", + fields: f1, + want: "line 1", + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("line 1\nline 2\nline 3")) + }, + }, + { + // + // Test-case 2: Verify proper behavior when file where to operate + // is not present. + // + name: "2", + fields: f1, + want: "", + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. 
+ if tt.prepare != nil { + tt.prepare(i) + } + + got, err := i.ReadLine() + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.ReadLine() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("IOnodeFile.ReadLine() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestIOnodeFile_WriteFile(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + type args struct { + p []byte + } + + var a1 = args{ + p: []byte("file content 0123456789"), + } + + tests := []struct { + name string + fields fields + args args + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular WriteFile operation. No errors expected. + // + name: "1", + fields: f1, + args: a1, + wantErr: false, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + if err := i.WriteFile(tt.args.p); (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.WriteFile() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func TestIOnodeFile_Mkdir(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "net", + path: "/proc/sys/net", + mode: 0600, + } + + tests := []struct { + name string + fields fields + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular Mkdir operation. No errors expected. + // + name: "1", + fields: f1, + wantErr: false, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. 
+ // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + if err := i.Mkdir(); (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.Mkdir() error = %v, wantErr %v", err, tt.wantErr) + } + + if _, err := i.Stat(); err != nil { + t.Errorf("Directory %v was not properly created", i.Path()) + } + }) + } +} + +func TestIOnodeFile_MkdirAll(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "ipv4", + path: "/proc/sys/net/ipv4", + mode: 0600, + } + var f2 = fields{ + name: "net", + path: "/proc/sys/net", + mode: 0600, + } + + tests := []struct { + name string + fields1 fields + fields2 fields + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular MkdirAll operation. No errors expected. + // + name: "1", + fields1: f1, + fields2: f2, + wantErr: false, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i1 := ios.NewIOnode( + tt.fields1.name, + tt.fields1.path, + tt.fields1.mode, + ) + i2 := ios.NewIOnode( + tt.fields2.name, + tt.fields2.path, + tt.fields2.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i1) + } + + if err := i1.MkdirAll(); (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.MkdirAll() error = %v, wantErr %v", err, tt.wantErr) + } + + // Verify that both "/proc/sys/net" and /proc/sys/net/ipv4" folders + // are created in Memfs. 
+ if _, err := i1.Stat(); err != nil { + t.Errorf("Directory %v was not properly created", i1.Path()) + } + if _, err := i2.Stat(); err != nil { + t.Errorf("Directory %v was not properly created", i2.Path()) + } + }) + } +} + +// Notice that we are mainly testing the Memfs specific code-path of this +// method, so there's not much value in having this UT. +func TestIOnodeFile_GetNsInode(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "user", + path: "/proc/1001/ns/user", + mode: 0600, + } + + tests := []struct { + name string + fields fields + want domain.Inode + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: GetNsInode operation. No errors expected. + // + name: "1", + fields: f1, + want: 123456, + wantErr: false, + prepare: func(i domain.IOnodeIface) { + + // Create memfs file. + i.WriteFile([]byte("123456")) + }, + }, + { + // + // Test-case 2: Verify proper operation when file is not present. + // + name: "2", + fields: f1, + want: 0, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Initialize memory-based fs. + ios.RemoveAllIOnodes() + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + got, err := i.GetNsInode() + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.GetNsInode() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("IOnodeFile.GetNsInode() = %v, want %v", got, tt.want) + } + }) + } +} + +func TestIOnodeFile_Stat(t *testing.T) { + type fields struct { + name string + path string + mode os.FileMode + } + + var f1 = fields{ + name: "node_1", + path: "/proc/sys/net/node_1", + mode: 0600, + } + + // Create memfs file. 
+ expectedResultIOnode := ios.NewIOnode("", "/proc/sys/net/node_1", 0) + expectedResultIOnode.WriteFile([]byte("file content 0123456789")) + expectedResult, err := expectedResultIOnode.Stat() + if err != nil { + t.Errorf("Could not create expected_result attribute") + } + + tests := []struct { + name string + fields fields + want os.FileInfo + wantErr bool + prepare func(i domain.IOnodeIface) + }{ + { + // + // Test-case 1: Regular Stat operation. No errors expected. + // + name: "1", + fields: f1, + want: expectedResult, + wantErr: false, + prepare: func(i domain.IOnodeIface) {}, + }, + { + // + // Test-case 2: Verify proper operation when file is not present. + // + name: "2", + fields: f1, + want: nil, + wantErr: true, + prepare: func(i domain.IOnodeIface) {}, + }, + } + + // + // Testcase executions. + // + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + i := ios.NewIOnode( + tt.fields.name, + tt.fields.path, + tt.fields.mode, + ) + + // Prepare the mocks. + if tt.prepare != nil { + tt.prepare(i) + } + + got, err := i.Stat() + if (err != nil) != tt.wantErr { + t.Errorf("IOnodeFile.Stat() error = %v, wantErr %v", err, tt.wantErr) + return + } + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("IOnodeFile.Stat() = %v, want %v", got, tt.want) + } + + // Re-initialize memory-based fs. + ios.RemoveAllIOnodes() + }) + } +} diff --git a/sysbox-fs/tools/log-parser/.gitignore b/sysbox-fs/tools/log-parser/.gitignore new file mode 100644 index 00000000..7f983f56 --- /dev/null +++ b/sysbox-fs/tools/log-parser/.gitignore @@ -0,0 +1 @@ +log-parser \ No newline at end of file diff --git a/sysbox-fs/tools/log-parser/README.md b/sysbox-fs/tools/log-parser/README.md new file mode 100644 index 00000000..57dc2dd3 --- /dev/null +++ b/sysbox-fs/tools/log-parser/README.md @@ -0,0 +1,29 @@ +# Sysbox-fs Log Parser + +Simple program to parse FUSE transactions per sys-container in +sysbox-fs logs. 
+ +Requires that sysbox-fs debug logging be enabled and that Sysbox +uid-shifting be enabled (i.e., the script uses the uid(gid) to +differentiate between system containers). + +## Build + +``` +go build +``` + +## Usage + +* Run sysbox with uid-shifting, and sysbox-fs with debug logging enabled. + +* Parse the sysbox-fs log with: + +``` +./log-parser /var/log/sysbox-fs.log +``` + +* This creates several files in the current directory, each showing + FUSE transactions received by sysbox-fs from each sys container. + +* FYI: the parsing can take several seconds. diff --git a/sysbox-fs/tools/log-parser/main.go b/sysbox-fs/tools/log-parser/main.go new file mode 100644 index 00000000..42305f5c --- /dev/null +++ b/sysbox-fs/tools/log-parser/main.go @@ -0,0 +1,195 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// sysboxfs log parser + +package main + +import ( + "bufio" + "fmt" + "io" + "io/ioutil" + "os" + "regexp" + "strconv" + "strings" + "sync" +) + +func parseTrans(infile string, transMap map[int][]int) error { + + file, err := os.Open(infile) + if err != nil { + return err + } + defer file.Close() + + reader := bufio.NewReader(file) + + for { + line, err := reader.ReadSlice('\n') + if err == io.EOF { + break + } else if err != nil { + return fmt.Errorf("failed to read file %s: %v\n", infile, err) + } + + // parse Uid= + re := regexp.MustCompile(`Uid=[0-9]+`) + token := re.Find(line) + + if token == nil { + continue + } + + // convert Uid string to int + uidStr := string(token) + uidStr = strings.Trim(uidStr, "Uid=") + uid64, err := strconv.ParseInt(uidStr, 0, 32) + if err != nil { + return fmt.Errorf("failed to convert %s to int: %v\n", uidStr, err) + } + uid := int(uid64) + + // Add Uid to map (if not present already) + if _, found := transMap[uid]; !found { + transMap[uid] = []int{} + } + + // parse ID= + re = regexp.MustCompile(`ID=0x[0-9a-f]+`) + token = re.Find(line) + if token == nil { + continue + } + + // convert ID string to int + idStr := string(token) + idStr = strings.Trim(idStr, "ID=") + id64, err := strconv.ParseInt(idStr, 0, 32) + if err != nil { + return fmt.Errorf("failed to convert %s to int: %v\n", idStr, err) + } + id := int(id64) + + // Add ID to list of IDs for Uid + ids, found := transMap[uid] + if !found { + return fmt.Errorf("did not find uid %d in transaction map!", uid) + } + ids = append(ids, id) + transMap[uid] = ids + } + + return nil +} + +func uidTransParser(data []byte, uid int, ids []int, wg *sync.WaitGroup, errch chan error) { + + defer wg.Done() + + // create output file + outfile := fmt.Sprintf("uid_%d", uid) + outf, err := os.Create(outfile) + if err != nil { + errch <- err + return + } + defer outf.Close() + + // start parsing + lines := strings.Split(string(data), "\n") + for _, line := range lines { + for _, id := 
range ids { + token := fmt.Sprintf("ID=%#x", id) + match, _ := regexp.MatchString("\\b"+token+"\\b", line) + if match { + _, err := outf.WriteString(line + "\n") + if err != nil { + errch <- fmt.Errorf("failed to write to file %s: %v", outfile, err) + return + } + } + } + } +} + +func dumpTrans(infile string, transMap map[int][]int) error { + var wg sync.WaitGroup + + // read the file into mem; it will then be parsed concurrently + inData, err := ioutil.ReadFile(infile) + if err != nil { + return fmt.Errorf("failed to read file %s: %v", infile, err) + } + + errch := make(chan error, len(transMap)) + + // dump transactions per uid + for uid, ids := range transMap { + wg.Add(1) + go uidTransParser(inData, uid, ids, &wg, errch) + } + + wg.Wait() + + select { + case err := <-errch: + return err + default: + } + + return nil +} + +func usage() { + fmt.Printf("%s \n", os.Args[0]) +} + +func main() { + + if len(os.Args) < 2 { + usage() + os.Exit(1) + } + + filename := os.Args[1] + + // maps container uid -> list of transactions associated with that container + transMap := make(map[int][]int) + + if err := parseTrans(filename, transMap); err != nil { + fmt.Printf("Failed to parse file %s: %v", filename, err) + os.Exit(1) + } + + // XXX: for debug + // for uid, ids := range transMap { + // fmt.Printf("uid %d: ", uid) + // for _, id := range ids { + // fmt.Printf("%#x ", id) + // } + // fmt.Printf("\n") + // } + + if err := dumpTrans(filename, transMap); err != nil { + fmt.Printf("Failed to dump transactions: %v", err) + os.Exit(1) + } + + fmt.Printf("Done.\n") +} diff --git a/sysbox-ipc b/sysbox-ipc deleted file mode 160000 index f05151f4..00000000 --- a/sysbox-ipc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f05151f4b4c1df63d7fd241577ca032905c1bd0e diff --git a/sysbox-ipc/.gitignore b/sysbox-ipc/.gitignore new file mode 100644 index 00000000..33bb16c1 --- /dev/null +++ b/sysbox-ipc/.gitignore @@ -0,0 +1,9 @@ +# Test binary, build with `go test -c` +*.test + +# Ignore 
virtual-studio-code metadata +.vscode + +# Ignore autogenerated pb code +sysboxFsProtobuf.pb.go +sysboxMgrProtobuf.pb.go diff --git a/sysbox-ipc/CONTRIBUTING.md b/sysbox-ipc/CONTRIBUTING.md new file mode 100644 index 00000000..423e7165 --- /dev/null +++ b/sysbox-ipc/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contribute to Sysbox-ipc + +Sysbox-ipc is a component of the Sysbox container runtime. If you want to +contribute, please refer to the Sysbox contribution +[guidelines](https://github.com/nestybox/sysbox/blob/master/CONTRIBUTING.md). diff --git a/sysbox-ipc/LICENSE b/sysbox-ipc/LICENSE new file mode 100644 index 00000000..c6087d5b --- /dev/null +++ b/sysbox-ipc/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+ + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." 
+ + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2020 Nestybox, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/sysbox-ipc/MAINTAINERS b/sysbox-ipc/MAINTAINERS new file mode 100644 index 00000000..3af2dbb0 --- /dev/null +++ b/sysbox-ipc/MAINTAINERS @@ -0,0 +1,2 @@ +Rodny Molina (@rodnymolina) +Cesar Talledo (@ctalledo) diff --git a/sysbox-ipc/Makefile b/sysbox-ipc/Makefile new file mode 100644 index 00000000..0e0a244c --- /dev/null +++ b/sysbox-ipc/Makefile @@ -0,0 +1,39 @@ +# +# sysbox-ipc Makefile +# + +.PHONY: clean sysbox-ipc sysipc-grpc-fs-proto sysipc-grpc-mgr-proto lint list-packages + +GO := go + +SYSIPC_GRPC_FS_DIR := sysboxFsGrpc +SYSIPC_GRPC_MGR_DIR := sysboxMgrGrpc + +sysbox-ipc: sysipc-grpc-fs-proto sysipc-grpc-mgr-proto + +sysipc-grpc-fs-proto: + $(MAKE) -C $(SYSIPC_GRPC_FS_DIR)/sysboxFsProtobuf + +sysipc-grpc-mgr-proto: + $(MAKE) -C $(SYSIPC_GRPC_MGR_DIR)/sysboxMgrProtobuf + +# Note: we must build the protobuf before go mod tidy +gomod-tidy: sysipc-grpc-fs-proto sysipc-grpc-mgr-proto + $(GO) mod tidy + +lint: + $(GO) vet $(allpackages) + $(GO) fmt $(allpackages) + +listpackages: + @echo $(allpackages) + +clean: + $(MAKE) -C $(SYSIPC_GRPC_FS_DIR)/sysboxFsProtobuf clean + $(MAKE) -C $(SYSIPC_GRPC_MGR_DIR)/sysboxMgrProtobuf clean + +distclean: clean + +# memoize allpackages, so that it's executed only once and only if used +_allpackages = $(shell $(GO) list ./... 
| grep -v vendor) +allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages) diff --git a/sysbox-ipc/go.mod b/sysbox-ipc/go.mod new file mode 100644 index 00000000..2c13f49b --- /dev/null +++ b/sysbox-ipc/go.mod @@ -0,0 +1,51 @@ +module github.com/nestybox/sysbox-ipc + +go 1.22 + +toolchain go1.22.0 + +require ( + github.com/golang/protobuf v1.5.4 + github.com/nestybox/sysbox-libs/formatter v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/idShiftUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/shiftfs v0.0.0-00010101000000-000000000000 + github.com/opencontainers/runc v1.1.4 + github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 + github.com/sirupsen/logrus v1.9.3 + golang.org/x/sys v0.27.0 + google.golang.org/grpc v1.64.0 + google.golang.org/protobuf v1.35.1 +) + +require ( + github.com/coreos/go-systemd/v22 v22.1.0 // indirect + github.com/deckarep/golang-set v1.8.0 // indirect + github.com/docker/docker v26.0.0+incompatible // indirect + github.com/godbus/dbus/v5 v5.0.3 // indirect + github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 // indirect + github.com/karrick/godirwalk v1.16.1 // indirect + github.com/nestybox/sysbox-libs/linuxUtils v0.0.0-00010101000000-000000000000 // indirect + github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98 // indirect + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/spf13/afero v1.4.1 // indirect + golang.org/x/net v0.23.0 // indirect + golang.org/x/text v0.15.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 // indirect + gopkg.in/hlandau/service.v1 v1.0.7 // indirect +) + +replace ( + github.com/godbus/dbus => github.com/godbus/dbus/v5 v5.0.3 + github.com/nestybox/sysbox-ipc => ./ + github.com/nestybox/sysbox-libs/capability => ../sysbox-libs/capability + 
github.com/nestybox/sysbox-libs/dockerUtils => ../sysbox-libs/dockerUtils + github.com/nestybox/sysbox-libs/formatter => ../sysbox-libs/formatter + github.com/nestybox/sysbox-libs/idShiftUtils => ../sysbox-libs/idShiftUtils + github.com/nestybox/sysbox-libs/libseccomp-golang => ../sysbox-libs/libseccomp-golang + github.com/nestybox/sysbox-libs/linuxUtils => ../sysbox-libs/linuxUtils + github.com/nestybox/sysbox-libs/mount => ../sysbox-libs/mount + github.com/nestybox/sysbox-libs/shiftfs => ../sysbox-libs/shiftfs + github.com/nestybox/sysbox-libs/utils => ../sysbox-libs/utils + github.com/opencontainers/runc => ../sysbox-runc +) diff --git a/sysbox-ipc/go.sum b/sysbox-ipc/go.sum new file mode 100644 index 00000000..5127b6e3 --- /dev/null +++ b/sysbox-ipc/go.sum @@ -0,0 +1,65 @@ +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= +github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4= +github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo= +github.com/docker/docker v26.0.0+incompatible h1:Ng2qi+gdKADUa/VM+6b6YaY2nlZhk/lVJiKR/2bMudU= +github.com/docker/docker v26.0.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME= +github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.6.0 
h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 h1:hgVxRoDDPtQE68PT4LFvNlPz2nBKd3OMlGKIQ69OmR4= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531/go.mod h1:fqTUQpVYBvhCNIsMXGl2GE9q6z94DIP6NtFKXCSTVbg= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d h1:J8tJzRyiddAFF65YVgxli+TyWBi0f79Sld6rJP6CBcY= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d/go.mod h1:b+Q3v8Yrg5o15d71PSUraUzYb+jWl6wQMSBXSGS/hv0= +github.com/karrick/godirwalk v1.16.1 h1:DynhcF+bztK8gooS0+NDJFrdNZjJ3gzVzC545UNA9iw= +github.com/karrick/godirwalk v1.16.1/go.mod h1:j4mkqPuvaLI8mp1DroR3P6ad7cyYd4c1qeJ3RV7ULlk= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod 
h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 h1:mxSlqyb8ZAHsYDCfiXN1EDdNTdvjUJSLY+OnAUtYNYA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= +google.golang.org/grpc v1.64.0 
h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/hlandau/service.v1 v1.0.7 h1:16G5AJ1Cp8Vr65QItJXpyAIzf/FWAWCZBsTgsc6eyA8= +gopkg.in/hlandau/service.v1 v1.0.7/go.mod h1:sZw6ksxcoafC04GoZtw32UeqqEuPSABX35lVBaJP/bE= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/sysbox-ipc/sysboxFsGrpc/grpcClient.go b/sysbox-ipc/sysboxFsGrpc/grpcClient.go new file mode 100644 index 00000000..73c09355 --- /dev/null +++ b/sysbox-ipc/sysboxFsGrpc/grpcClient.go @@ -0,0 +1,262 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package sysboxFsGrpc + +import ( + "context" + "fmt" + "net" + "time" + + "github.com/golang/protobuf/ptypes" + pb "github.com/nestybox/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf" + "github.com/nestybox/sysbox-libs/formatter" + "google.golang.org/grpc" + + "github.com/sirupsen/logrus" +) + +var grpcTimeout time.Duration = 20 * time.Second + +// Container info passed by the client to the server across the grpc channel +type ContainerData struct { + Id string + Netns string + InitPid int32 + Hostname string + Ctime time.Time + UidFirst int32 + UidSize int32 + GidFirst int32 + GidSize int32 + ProcRoPaths []string + ProcMaskPaths []string +} + +func unixConnect(addr string, t time.Duration) (net.Conn, error) { + unixAddr, err := net.ResolveUnixAddr("unix", sysFsGrpcSockAddr) + conn, err := net.DialUnix("unix", nil, unixAddr) + return conn, err +} + +// +// Establishes grpc connection to sysbox-fs' remote-end. +// +func connect() (*grpc.ClientConn, error) { + // Set up a connection to the server. + // TODO: Secure me through TLS. 
+ conn, err := grpc.Dial( + sysFsGrpcSockAddr, + grpc.WithInsecure(), + grpc.WithDialer(unixConnect), + ) + if err != nil { + return nil, err + } + return conn, nil +} + +func containerDataToPbData(data *ContainerData) (*pb.ContainerData, error) { + pbTime, err := ptypes.TimestampProto(data.Ctime) + if err != nil { + return nil, fmt.Errorf("time conversion error: %v", err) + } + + return &pb.ContainerData{ + Id: data.Id, + Netns: data.Netns, + InitPid: data.InitPid, + Ctime: pbTime, + UidFirst: data.UidFirst, + UidSize: data.UidSize, + GidFirst: data.GidFirst, + GidSize: data.GidSize, + ProcRoPaths: data.ProcRoPaths, + ProcMaskPaths: data.ProcMaskPaths, + }, nil +} + +func pbDatatoContainerData(pbdata *pb.ContainerData) (*ContainerData, error) { + cTime, err := ptypes.Timestamp(pbdata.Ctime) + if err != nil { + return nil, fmt.Errorf("time conversion error: %v", err) + } + + return &ContainerData{ + Id: pbdata.Id, + Netns: pbdata.Netns, + InitPid: pbdata.InitPid, + Ctime: cTime, + UidFirst: pbdata.UidFirst, + UidSize: pbdata.UidSize, + GidFirst: pbdata.GidFirst, + GidSize: pbdata.GidSize, + ProcRoPaths: pbdata.ProcRoPaths, + ProcMaskPaths: pbdata.ProcMaskPaths, + }, nil +} + +// +// Pre-registers container creation in sysbox-fs. Notice that this +// is a blocking call that can potentially have a minor impact +// on container's boot-up speed. +// +func SendContainerPreRegistration(data *ContainerData) (err error) { + // Set up sysbox-fs pipeline. 
+ conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-fs: %v", err) + } + defer conn.Close() + + cntrChanIntf := pb.NewSysboxStateChannelClient(conn) + + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + pbData, err := containerDataToPbData(data) + if err != nil { + return fmt.Errorf("convertion to protobuf data failed: %v", err) + } + + _, err = cntrChanIntf.ContainerPreRegistration(ctx, pbData) + if err != nil { + logrus.Warning("Container %s pre-registration error: %s", + formatter.ContainerID{data.Id}, err) + return fmt.Errorf("failed to pre-register container with sysbox-fs: %v", err) + } + + return nil +} + +// +// Registers container creation in sysbox-fs. +// +func SendContainerRegistration(data *ContainerData) (err error) { + // Set up sysbox-fs pipeline. + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-fs: %v", err) + } + defer conn.Close() + + cntrChanIntf := pb.NewSysboxStateChannelClient(conn) + + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + pbData, err := containerDataToPbData(data) + if err != nil { + return fmt.Errorf("convertion to protobuf data failed: %v", err) + } + + _, err = cntrChanIntf.ContainerRegistration(ctx, pbData) + if err != nil { + logrus.Warning("Container %s registration error: %s", + formatter.ContainerID{data.Id}, err) + return fmt.Errorf("failed to register container with sysbox-fs: %v", err) + } + + return nil +} + +// +// Unregisters container from sysbox-fs. +// +func SendContainerUnregistration(data *ContainerData) (err error) { + // Set up sysbox-fs pipeline. 
+ conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-fs: %v", err) + } + defer conn.Close() + + cntrChanIntf := pb.NewSysboxStateChannelClient(conn) + + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + pbData, err := containerDataToPbData(data) + if err != nil { + return fmt.Errorf("convertion to protobuf data failed: %v", err) + } + + _, err = cntrChanIntf.ContainerUnregistration(ctx, pbData) + if err != nil { + return fmt.Errorf("failed to unregister container with sysbox-fs: %v", err) + } + + return nil +} + +// +// Sends a container-update message to sysbox-fs end. At this point, we are +// only utilizing this message for a particular case, update the container +// creation-time attribute, but this function can serve more general purposes +// in the future. +// +func SendContainerUpdate(data *ContainerData) (err error) { + // Set up sysbox-fs pipeline. + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-fs: %v", err) + } + defer conn.Close() + + cntrChanIntf := pb.NewSysboxStateChannelClient(conn) + + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + pbData, err := containerDataToPbData(data) + if err != nil { + return fmt.Errorf("convertion to protobuf data failed: %v", err) + } + + _, err = cntrChanIntf.ContainerUpdate(ctx, pbData) + if err != nil { + logrus.Warning("Container %s update error: %s", + formatter.ContainerID{data.Id}, err) + return fmt.Errorf("failed to send container-update message to ", + "sysbox-fs: %v", err) + } + + return nil +} + +func GetMountpoint() (string, error) { + var mpResp *pb.MountpointResp + + // Set up sysbox-fs pipeline. 
+ conn, err := connect() + if err != nil { + return "", fmt.Errorf("failed to connect with sysbox-fs: %v", err) + } + defer conn.Close() + + cntrChanIntf := pb.NewSysboxStateChannelClient(conn) + + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + mpResp, err = cntrChanIntf.GetMountpoint(ctx, &pb.MountpointReq{}) + if err != nil { + return "", fmt.Errorf("failed to get sysbox-fs mountpoint: %v", err) + } + + return mpResp.GetMountpoint(), nil +} diff --git a/sysbox-ipc/sysboxFsGrpc/grpcServer.go b/sysbox-ipc/sysboxFsGrpc/grpcServer.go new file mode 100644 index 00000000..ff4fbbc3 --- /dev/null +++ b/sysbox-ipc/sysboxFsGrpc/grpcServer.go @@ -0,0 +1,198 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package sysboxFsGrpc + +import ( + "context" + "fmt" + "log" + "net" + "os" + "path" + + pb "github.com/nestybox/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf" + + "google.golang.org/grpc" + grpcCodes "google.golang.org/grpc/codes" + "google.golang.org/grpc/reflection" + grpcStatus "google.golang.org/grpc/status" +) + +// +// File dealing with all the logic related to sysbox-fs' external-communication +// (ipc) logic. 
+// + +const sysFsGrpcSockAddr = "/run/sysbox/sysfs.sock" + +const ( + Unknown MessageType = iota + ContainerPreRegisterMessage + ContainerRegisterMessage + ContainerUnregisterMessage + ContainerUpdateMessage + GetMountpointMessage + MaxSupportedMessage +) + +var messageTypeStrings = [...]string{ + "Unknown", + "ContainerPreRegister", + "ContainerRegister", + "ContainerUnregister", + "ContainerUpdate", +} + +type MessageType uint16 + +type Callback func(client interface{}, c *ContainerData) error + +type CallbacksMap = map[MessageType]Callback + +type Server struct { + Ctx interface{} + Callbacks CallbacksMap + FuseMp string +} + +func NewServer(ctx interface{}, cb *CallbacksMap, fuseMp string) *Server { + + if cb == nil { + return nil + } + + if err := os.RemoveAll(sysFsGrpcSockAddr); err != nil { + return nil + } + + if err := os.MkdirAll(path.Dir(sysFsGrpcSockAddr), 0700); err != nil { + return nil + } + + newServer := &Server{ + Ctx: ctx, + Callbacks: make(map[MessageType]Callback), + FuseMp: fuseMp, + } + + for ctype, cval := range *cb { + newServer.Callbacks[ctype] = cval + } + + return newServer +} + +func (s *Server) Init() error { + + lis, err := net.Listen("unix", sysFsGrpcSockAddr) + if err != nil { + return fmt.Errorf("failed to listen: %v", err) + } + + if err := os.Chmod(sysFsGrpcSockAddr, 0600); err != nil { + return fmt.Errorf("failed to chmod %s: %v", sysFsGrpcSockAddr, err) + } + + // Initializing grpc server + grpcServer := grpc.NewServer() + pb.RegisterSysboxStateChannelServer(grpcServer, s) + + // Register reflection service on gRPC server. + reflection.Register(grpcServer) + if err := grpcServer.Serve(lis); err != nil { + log.Fatalf("failed to serve: %v", err) + } + + return nil +} + +func (s *Server) GetAddr() string { + return sysFsGrpcSockAddr +} + +// TODO: To be implemented in the future if needed. +func (s *Server) CallbackRegister(c *Callback) { + +} + +// TODO: To be implemented in the future if needed. 
+func (s *Server) CallbackUnregister(c *Callback) { + +} + +func (s *Server) ContainerPreRegistration( + ctx context.Context, data *pb.ContainerData) (*pb.Response, error) { + + return s.executeCallback(ContainerPreRegisterMessage, data) +} + +func (s *Server) ContainerRegistration( + ctx context.Context, data *pb.ContainerData) (*pb.Response, error) { + + return s.executeCallback(ContainerRegisterMessage, data) +} + +func (s *Server) ContainerUnregistration( + ctx context.Context, data *pb.ContainerData) (*pb.Response, error) { + + return s.executeCallback(ContainerUnregisterMessage, data) +} + +func (s *Server) ContainerUpdate( + ctx context.Context, data *pb.ContainerData) (*pb.Response, error) { + + return s.executeCallback(ContainerUpdateMessage, data) +} + +func (s *Server) GetMountpoint( + ctx context.Context, req *pb.MountpointReq) (*pb.MountpointResp, error) { + return &pb.MountpointResp{Mountpoint: s.FuseMp}, nil +} + +func (s *Server) executeCallback(mtype MessageType, + data *pb.ContainerData) (*pb.Response, error) { + + // Sanity-check data field here to avoid doing it in server backend. + if data == nil { + return &pb.Response{Success: false}, + grpcStatus.Error(grpcCodes.InvalidArgument, "Invalid data field") + } + + // Obtain the associated callback matching this incoming request. + cb, ok := s.Callbacks[mtype] + if !ok { + return &pb.Response{Success: false}, + grpcStatus.Errorf( + grpcCodes.Unimplemented, + "Method type %v not implemented", + mtype, + ) + } + + // Transform received payload to a grpc/protobuf-agnostic message. 
+ cont, err := pbDatatoContainerData(data) + if err != nil { + return &pb.Response{Success: false}, err + } + + err = (cb)(s.Ctx, cont) + if err != nil { + return &pb.Response{Success: false}, err + } + + return &pb.Response{Success: true}, nil +} diff --git a/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf/Makefile b/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf/Makefile new file mode 100644 index 00000000..bddd2fb1 --- /dev/null +++ b/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf/Makefile @@ -0,0 +1,39 @@ +# Build dependencies: +# +# 1) Install the protoc compiler that is used to generate gRPC service code. +# +# $ mkdir -p ~/bin/protoc +# $ cd ~/bin/protoc +# $ wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip +# $ unzip protoc-3.6.1-linux-x86_64.zip +# $ sudo cp -r include/* /usr/local/include/ +# $ sudo cp bin/protoc /usr/local/bin/ +# $ sudo chmod 755 /usr/local/bin/protoc [ providing execution rights to all users ] +# $ sudo chmod -R 755 /usr/local/include/google [ providing execution rights to all users ] +# +# temporary folder ~/bin/protoc can be now eliminated if desired: +# $ rm -rf ~/bin/protoc +# +# 2) Install protoc plugin for golang +# +# $ go get -u github.com/golang/protobuf/protoc-gen-go +# +# Make sure that PATH is properly set to cover $GOPATH/bin/: +# +# $ export PATH=$PATH:$GOPATH/bin +# + +.PHONY: clean + +RUNC_FS_SRC := sysboxFsProtobuf.proto +RUNC_FS_PB := sysboxFsProtobuf.pb.go + +.DEFAULT: $(RUNC_FS_PB) + +$(RUNC_FS_PB): $(RUNC_FS_SRC) + GOFLAGS='-buildvcs=false' protoc -I . -I /usr/local/include/ sysboxFsProtobuf.proto --go_out=plugins=grpc:. 
+ +clean: + rm -f $(RUNC_FS_PB) + +distclean: clean diff --git a/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf/sysboxFsProtobuf.proto b/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf/sysboxFsProtobuf.proto new file mode 100644 index 00000000..97033570 --- /dev/null +++ b/sysbox-ipc/sysboxFsGrpc/sysboxFsProtobuf/sysboxFsProtobuf.proto @@ -0,0 +1,60 @@ +// +// Sysbox-fs Protobuffer Definitions. +// + +syntax = "proto3"; + +option go_package = "./;sysboxFsProtobuf"; + +import "google/protobuf/timestamp.proto"; + +package protobuf; + +// ContainerRegistration/Unregistration service definition. +service sysboxStateChannel { + + // Queries sysbox-fs for the FUSE mountpoint + rpc GetMountpoint(MountpointReq) returns (MountpointResp) {} + + // Generates a container-preregistration message + rpc ContainerPreRegistration (ContainerData) returns (Response) {} + + // Generates a container-registration message + rpc ContainerRegistration (ContainerData) returns (Response) {} + + // Generates a container-unregistration message + rpc ContainerUnregistration (ContainerData) returns (Response) {} + + // Generates a container-update message + rpc ContainerUpdate (ContainerData) returns (Response) {} +} + +// +// Request message sent by runC to sysbox-fs process during container +// registration, unregistration and update phases. +// +message ContainerData { + string Id = 1; + int32 InitPid = 2; + string Hostname = 3; + google.protobuf.Timestamp Ctime = 4; + int32 UidFirst = 5; + int32 UidSize = 6; + int32 GidFirst = 7; + int32 GidSize = 8; + repeated string ProcRoPaths = 9; + repeated string ProcMaskPaths = 10; + string Netns = 11; +} + +// Response message sent from sysbox-fs to runC process. 
+message Response {
+    bool Success = 1;
+}
+
+message MountpointReq {
+}
+
+message MountpointResp {
+    string Mountpoint = 1;
+}
\ No newline at end of file
diff --git a/sysbox-ipc/sysboxMgrGrpc/grpcClient.go b/sysbox-ipc/sysboxMgrGrpc/grpcClient.go
new file mode 100644
index 00000000..ec38e828
--- /dev/null
+++ b/sysbox-ipc/sysboxMgrGrpc/grpcClient.go
@@ -0,0 +1,525 @@
+//
+// Copyright 2019-2020 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+// Client-side gRPC interface for the sysbox manager daemon
+
+package sysboxMgrGrpc
+
+import (
+	"context"
+	"fmt"
+	"net"
+	"os"
+	"time"
+
+	pb "github.com/nestybox/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf"
+	ipcLib "github.com/nestybox/sysbox-ipc/sysboxMgrLib"
+	"github.com/nestybox/sysbox-libs/idShiftUtils"
+	"github.com/nestybox/sysbox-libs/shiftfs"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"google.golang.org/grpc"
+)
+
+//var grpcTimeout time.Duration = 40 * time.Second
+var grpcTimeout time.Duration = 10 * time.Minute
+
+func unixConnect(addr string, t time.Duration) (net.Conn, error) {
+	unixAddr, err := net.ResolveUnixAddr("unix", sysMgrGrpcSockAddr)
+	if err != nil { return nil, err }
+	return net.DialUnix("unix", nil, unixAddr)
+}
+
+// connect establishes grpc connection to the sysbox-mgr daemon.
+func connect() (*grpc.ClientConn, error) { + conn, err := grpc.Dial(sysMgrGrpcSockAddr, grpc.WithInsecure(), grpc.WithDialer(unixConnect)) + if err != nil { + return nil, err + } + return conn, nil +} + +// Registers a container with sysbox-mgr +func Register(regInfo *ipcLib.RegistrationInfo) (*ipcLib.ContainerConfig, error) { + conn, err := connect() + if err != nil { + return nil, fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.RegisterReq{ + Id: regInfo.Id, + Rootfs: regInfo.Rootfs, + Userns: regInfo.Userns, + Netns: regInfo.Netns, + UidMappings: linuxIDMapToProtoIDMap(regInfo.UidMappings), + GidMappings: linuxIDMapToProtoIDMap(regInfo.GidMappings), + } + + resp, err := ch.Register(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to invoke Register via grpc: %v", err) + } + + config := &ipcLib.ContainerConfig{ + AliasDns: resp.ContainerConfig.GetAliasDns(), + ShiftfsOk: resp.ContainerConfig.GetShiftfsOk(), + ShiftfsOnOverlayfsOk: resp.ContainerConfig.GetShiftfsOnOverlayfsOk(), + IDMapMountOk: resp.ContainerConfig.GetIDMapMountOk(), + OverlayfsOnIDMapMountOk: resp.ContainerConfig.GetOverlayfsOnIDMapMountOk(), + NoRootfsCloning: resp.ContainerConfig.GetNoRootfsCloning(), + IgnoreSysfsChown: resp.ContainerConfig.GetIgnoreSysfsChown(), + AllowTrustedXattr: resp.ContainerConfig.GetAllowTrustedXattr(), + HonorCaps: resp.ContainerConfig.GetHonorCaps(), + SyscontMode: resp.ContainerConfig.GetSyscontMode(), + Userns: resp.ContainerConfig.GetUserns(), + UidMappings: protoIDMapToLinuxIDMap(resp.ContainerConfig.GetUidMappings()), + GidMappings: protoIDMapToLinuxIDMap(resp.ContainerConfig.GetGidMappings()), + FsuidMapFailOnErr: resp.ContainerConfig.GetFsuidMapFailOnErr(), + RootfsUidShiftType: idShiftUtils.IDShiftType(resp.ContainerConfig.GetRootfsUidShiftType()), + 
NoShiftfsOnFuse: resp.ContainerConfig.GetNoShiftfsOnFuse(), + RelaxedReadOnly: resp.ContainerConfig.GetRelaxedReadOnly(), + } + + return config, nil +} + +// Update a container info with sysbox-mgr +func Update(updateInfo *ipcLib.UpdateInfo) error { + + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.UpdateReq{ + Id: updateInfo.Id, + Userns: updateInfo.Userns, + Netns: updateInfo.Netns, + UidMappings: linuxIDMapToProtoIDMap(updateInfo.UidMappings), + GidMappings: linuxIDMapToProtoIDMap(updateInfo.GidMappings), + RootfsUidShiftType: uint32(updateInfo.RootfsUidShiftType), + } + + _, err = ch.Update(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke Update via grpc: %v", err) + } + + return nil +} + +// Unregisters a container with sysbox-mgr +func Unregister(id string) error { + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.UnregisterReq{ + Id: id, + } + + _, err = ch.Unregister(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke Unregister via grpc: %v", err) + } + + return nil +} + +// SubidAlloc requests sysbox-mgr to allocate a range of 'size' subuids and subgids. 
+func SubidAlloc(id string, size uint64) (uint32, uint32, error) { + conn, err := connect() + if err != nil { + return 0, 0, fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.SubidAllocReq{ + Id: id, + Size: size, + } + + resp, err := ch.SubidAlloc(ctx, req) + if err != nil { + return 0, 0, fmt.Errorf("failed to invoke SubidAlloc via grpc: %v", err) + } + + return resp.Uid, resp.Gid, err +} + +// ReqMounts requests the sysbox-mgr to setup sys container special mounts +func ReqMounts(id string, rootfsUidShiftType idShiftUtils.IDShiftType, reqList []ipcLib.MountReqInfo) ([]specs.Mount, error) { + + conn, err := connect() + if err != nil { + return nil, fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + // We don't use context timeout for this API because the time it takes to + // setup the mounts can be large, in particular for sys containers that come + // preloaded with heavy inner images and in machines where the load is high. 
+ ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Convert []ipcLib.MountReqInfo -> []*pb.MountReqInfo + pbReqList := []*pb.MountReqInfo{} + for _, info := range reqList { + pbInfo := &pb.MountReqInfo{ + Kind: uint32(info.Kind), + Dest: info.Dest, + } + pbReqList = append(pbReqList, pbInfo) + } + + req := &pb.MountReq{ + Id: id, + RootfsUidShiftType: uint32(rootfsUidShiftType), + ReqList: pbReqList, + } + + resp, err := ch.ReqMounts(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to invoke ReqMounts via grpc: %v", err) + } + + // Convert []*pb.Mount -> []specs.Mount + specMounts := []specs.Mount{} + for _, m := range resp.Mounts { + specm := specs.Mount{ + Source: m.GetSource(), + Destination: m.GetDest(), + Type: m.GetType(), + Options: m.GetOpt(), + } + specMounts = append(specMounts, specm) + } + + return specMounts, nil +} + +// PrepMounts requests sysbox-mgr to prepare a mount source for use by a sys container. 
+func PrepMounts(id string, uid, gid uint32, prepList []ipcLib.MountPrepInfo) error { + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + // Convert []ipcLib.MountPrepInfo -> []*pb.MountPrepInfo + pbPrepList := []*pb.MountPrepInfo{} + for _, info := range prepList { + pbInfo := &pb.MountPrepInfo{ + Source: info.Source, + Exclusive: info.Exclusive, + } + pbPrepList = append(pbPrepList, pbInfo) + } + + req := &pb.MountPrepReq{ + Id: id, + Uid: uid, + Gid: gid, + PrepList: pbPrepList, + } + + _, err = ch.PrepMounts(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke PrepMounts via grpc: %v", err) + } + + return nil +} + +// ReqShiftfsMark requests sysbox-mgr to perform shiftfs marking on the +// container's rootfs and the given mount list. Returns a list of paths where +// the shiftfs marks where actually placed (need not be the same as the given +// mount list). Refer to the sysbox-mgr shiftfs manager for more info. 
+func ReqShiftfsMark(id string, mounts []shiftfs.MountPoint) ([]shiftfs.MountPoint, error) { + var resp *pb.ShiftfsMarkResp + + conn, err := connect() + if err != nil { + return nil, fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + // convert shiftfs.MountPoint to grpc ShiftfsMark + markReq := []*pb.ShiftfsMark{} + for _, m := range mounts { + sm := &pb.ShiftfsMark{ + Source: m.Source, + Readonly: m.Readonly, + } + markReq = append(markReq, sm) + } + + req := &pb.ShiftfsMarkReq{ + Id: id, + ShiftfsMarks: markReq, + } + + resp, err = ch.ReqShiftfsMark(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to invoke ReqShiftfsMark via grpc: %v", err) + } + + // convert grpc ShiftfsMark to shiftfs.MountPoint + markpoints := []shiftfs.MountPoint{} + for _, m := range resp.GetShiftfsMarks() { + sm := shiftfs.MountPoint{ + Source: m.Source, + Readonly: m.Readonly, + } + markpoints = append(markpoints, sm) + } + + return markpoints, nil +} + +// ReqFsState inquires sysbox-mgr for state to be written into container's +// rootfs. 
+func ReqFsState(id, rootfs string) ([]configs.FsEntry, error) { + + conn, err := connect() + if err != nil { + return nil, fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.FsStateReq{ + Id: id, + Rootfs: rootfs, + } + + resp, err := ch.ReqFsState(ctx, req) + if err != nil { + return nil, fmt.Errorf("failed to invoke ReqFsState via grpc: %v", err) + } + + fsEntries := []configs.FsEntry{} + + // Convert []*pb.FsEntry -> []configs.FsEntry + for _, e := range resp.FsEntries { + entry := configs.NewFsEntry( + e.GetPath(), + e.GetDst(), + os.FileMode(e.GetMode()), + configs.FsEntryKind(e.GetKind()), + ) + + fsEntries = append(fsEntries, *entry) + } + + return fsEntries, nil +} + +// Pause notifies the sysbox-mgr that the container has been paused. +// 'id' is the containers id +func Pause(id string) error { + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.PauseReq{ + Id: id, + } + + _, err = ch.Pause(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke Pause via grpc: %v", err) + } + + return nil +} + +// Resume notifies the sysbox-mgr that the container has been resumed. 
+// 'id' is the containers id +func Resume(id string) error { + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.ResumeReq{ + Id: id, + } + + _, err = ch.Resume(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke Resume via grpc: %v", err) + } + + return nil +} + +func linuxIDMapToProtoIDMap(idMappings []specs.LinuxIDMapping) []*pb.IDMapping { + + convert := func(m specs.LinuxIDMapping) *pb.IDMapping { + return &pb.IDMapping{ + ContainerID: uint32(m.ContainerID), + HostID: uint32(m.HostID), + Size: uint32(m.Size), + } + } + + protoMappings := []*pb.IDMapping{} + for _, m := range idMappings { + protoMappings = append(protoMappings, convert(m)) + } + + return protoMappings +} + +func protoIDMapToLinuxIDMap(idMappings []*pb.IDMapping) []specs.LinuxIDMapping { + + convert := func(m *pb.IDMapping) specs.LinuxIDMapping { + return specs.LinuxIDMapping{ + ContainerID: uint32(m.ContainerID), + HostID: uint32(m.HostID), + Size: uint32(m.Size), + } + } + + linuxMappings := []specs.LinuxIDMapping{} + for _, m := range idMappings { + linuxMappings = append(linuxMappings, convert(m)) + } + + return linuxMappings +} + +// ReqCloneRootfs requests the sysbox-mgr to clone the container's rootfs. +// It returns the path to the new rootfs. 
+func ReqCloneRootfs(id string) (string, error) { + + conn, err := connect() + if err != nil { + return "", fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.CloneRootfsReq{ + Id: id, + } + + resp, err := ch.ReqCloneRootfs(ctx, req) + if err != nil { + return "", fmt.Errorf("failed to invoke ReqCloneRootfs via grpc: %v", err) + } + + return resp.GetRootfs(), nil +} + +// ChownClonedRootfs requests the sysbox-mgr to chown a cloned rootfs. +func ChownClonedRootfs(id string, uidOffset, gidOffset int32) error { + + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.ChownClonedRootfsReq{ + Id: id, + UidOffset: uidOffset, + GidOffset: gidOffset, + } + + _, err = ch.ChownClonedRootfs(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke ChownClonedRootfs via grpc: %v", err) + } + + return nil +} + +// RevertClonedRootfsChown requests the sysbox-mgr to revert the chown of a cloned rootfs. 
+func RevertClonedRootfsChown(id string) error { + + conn, err := connect() + if err != nil { + return fmt.Errorf("failed to connect with sysbox-mgr: %v", err) + } + defer conn.Close() + + ch := pb.NewSysboxMgrStateChannelClient(conn) + ctx, cancel := context.WithTimeout(context.Background(), grpcTimeout) + defer cancel() + + req := &pb.RevertClonedRootfsChownReq{ + Id: id, + } + + _, err = ch.RevertClonedRootfsChown(ctx, req) + if err != nil { + return fmt.Errorf("failed to invoke RevertClonedRootfsChown via grpc: %v", err) + } + + return nil +} diff --git a/sysbox-ipc/sysboxMgrGrpc/grpcServer.go b/sysbox-ipc/sysboxMgrGrpc/grpcServer.go new file mode 100644 index 00000000..009fb6b7 --- /dev/null +++ b/sysbox-ipc/sysboxMgrGrpc/grpcServer.go @@ -0,0 +1,386 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// Server-side gRPC interface for the sysbox manager daemon + +package sysboxMgrGrpc + +import ( + "context" + "errors" + "fmt" + "net" + "os" + "path" + + pb "github.com/nestybox/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf" + ipcLib "github.com/nestybox/sysbox-ipc/sysboxMgrLib" + "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/nestybox/sysbox-libs/shiftfs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" + + "google.golang.org/grpc" + "google.golang.org/grpc/reflection" +) + +const sysMgrGrpcSockAddr = "/run/sysbox/sysmgr.sock" + +type ServerCallbacks struct { + Register func(regInfo *ipcLib.RegistrationInfo) (*ipcLib.ContainerConfig, error) + Update func(updateInfo *ipcLib.UpdateInfo) error + Unregister func(id string) error + SubidAlloc func(id string, size uint64) (uint32, uint32, error) + ReqMounts func(id string, rootfsUidShiftType idShiftUtils.IDShiftType, reqList []ipcLib.MountReqInfo) ([]specs.Mount, error) + PrepMounts func(id string, uid, gid uint32, prepList []ipcLib.MountPrepInfo) error + ReqShiftfsMark func(id string, mounts []shiftfs.MountPoint) ([]shiftfs.MountPoint, error) + ReqFsState func(id string, rootfs string) ([]configs.FsEntry, error) + Pause func(id string) error + Resume func(id string) error + CloneRootfs func(id string) (string, error) + ChownClonedRootfs func(id string, uidOffset, gidOffset int32) error + RevertClonedRootfsChown func(id string) error +} + +type ServerStub struct { + cb *ServerCallbacks +} + +func NewServerStub(cb *ServerCallbacks) *ServerStub { + if cb == nil { + return nil + } + + if err := os.RemoveAll(sysMgrGrpcSockAddr); err != nil { + return nil + } + + if err := os.MkdirAll(path.Dir(sysMgrGrpcSockAddr), 0700); err != nil { + return nil + } + + return &ServerStub{ + cb: cb, + } +} + +func (s *ServerStub) Init() error { + + lis, err := net.Listen("unix", sysMgrGrpcSockAddr) + if err != nil { + return fmt.Errorf("failed to listen: %v", 
err) + } + + if err := os.Chmod(sysMgrGrpcSockAddr, 0600); err != nil { + return fmt.Errorf("failed to chmod %s: %v", sysMgrGrpcSockAddr, err) + } + + grpcServer := grpc.NewServer() + pb.RegisterSysboxMgrStateChannelServer(grpcServer, s) + reflection.Register(grpcServer) + + if err := grpcServer.Serve(lis); err != nil { + return fmt.Errorf("failed to serve: %v", err) + } + + return nil +} + +func (s *ServerStub) GetAddr() string { + return sysMgrGrpcSockAddr +} + +func (s *ServerStub) Register(ctx context.Context, req *pb.RegisterReq) (*pb.RegisterResp, error) { + if req == nil { + return &pb.RegisterResp{}, errors.New("invalid payload") + } + + regInfo := &ipcLib.RegistrationInfo{ + Id: req.GetId(), + Rootfs: req.GetRootfs(), + Userns: req.GetUserns(), + Netns: req.GetNetns(), + UidMappings: protoIDMapToLinuxIDMap(req.GetUidMappings()), + GidMappings: protoIDMapToLinuxIDMap(req.GetGidMappings()), + } + + config, err := s.cb.Register(regInfo) + if err != nil { + return nil, err + } + + mgrConfig := pb.ContainerConfig{ + AliasDns: config.AliasDns, + ShiftfsOk: config.ShiftfsOk, + ShiftfsOnOverlayfsOk: config.ShiftfsOnOverlayfsOk, + IDMapMountOk: config.IDMapMountOk, + OverlayfsOnIDMapMountOk: config.OverlayfsOnIDMapMountOk, + NoRootfsCloning: config.NoRootfsCloning, + IgnoreSysfsChown: config.IgnoreSysfsChown, + AllowTrustedXattr: config.AllowTrustedXattr, + HonorCaps: config.HonorCaps, + SyscontMode: config.SyscontMode, + Userns: config.Userns, + UidMappings: linuxIDMapToProtoIDMap(config.UidMappings), + GidMappings: linuxIDMapToProtoIDMap(config.GidMappings), + FsuidMapFailOnErr: config.FsuidMapFailOnErr, + RootfsUidShiftType: uint32(config.RootfsUidShiftType), + NoShiftfsOnFuse: config.NoShiftfsOnFuse, + RelaxedReadOnly: config.RelaxedReadOnly, + } + + resp := &pb.RegisterResp{ + ContainerConfig: &mgrConfig, + } + + return resp, nil +} + +func (s *ServerStub) Update(ctx context.Context, req *pb.UpdateReq) (*pb.UpdateResp, error) { + if req == nil { + return 
&pb.UpdateResp{}, errors.New("invalid payload") + } + + updateInfo := &ipcLib.UpdateInfo{ + Id: req.GetId(), + Userns: req.GetUserns(), + Netns: req.GetNetns(), + UidMappings: protoIDMapToLinuxIDMap(req.GetUidMappings()), + GidMappings: protoIDMapToLinuxIDMap(req.GetGidMappings()), + RootfsUidShiftType: idShiftUtils.IDShiftType(req.GetRootfsUidShiftType()), + } + + if err := s.cb.Update(updateInfo); err != nil { + return nil, err + } + + return &pb.UpdateResp{}, nil +} + +func (s *ServerStub) Unregister(ctx context.Context, req *pb.UnregisterReq) (*pb.UnregisterResp, error) { + if req == nil { + return &pb.UnregisterResp{}, errors.New("invalid payload") + } + + if err := s.cb.Unregister(req.GetId()); err != nil { + return nil, err + } + + return &pb.UnregisterResp{}, nil +} + +func (s *ServerStub) SubidAlloc(ctx context.Context, req *pb.SubidAllocReq) (*pb.SubidAllocResp, error) { + if req == nil { + return &pb.SubidAllocResp{}, errors.New("invalid payload") + } + + uid, gid, err := s.cb.SubidAlloc(req.GetId(), req.GetSize()) + if err != nil { + return nil, err + } + + return &pb.SubidAllocResp{ + Uid: uid, + Gid: gid, + }, nil +} + +func (s *ServerStub) ReqMounts(ctx context.Context, req *pb.MountReq) (*pb.MountResp, error) { + if req == nil { + return &pb.MountResp{}, errors.New("invalid payload") + } + + // convert []*pb.MountReqInfo -> []ipcLib.MountReqInfo + reqList := []ipcLib.MountReqInfo{} + for _, pbInfo := range req.ReqList { + info := ipcLib.MountReqInfo{ + Kind: ipcLib.MntKind(pbInfo.GetKind()), + Dest: pbInfo.GetDest(), + } + reqList = append(reqList, info) + } + + mounts, err := s.cb.ReqMounts(req.GetId(), idShiftUtils.IDShiftType(req.GetRootfsUidShiftType()), reqList) + if err != nil { + return nil, err + } + + // convert []*specs.Mount -> []*pb.Mount + pbMounts := []*pb.Mount{} + for _, m := range mounts { + pbm := &pb.Mount{ + Source: m.Source, + Dest: m.Destination, + Type: m.Type, + Opt: m.Options, + } + pbMounts = append(pbMounts, pbm) + } + + 
return &pb.MountResp{ + Mounts: pbMounts, + }, nil +} + +func (s *ServerStub) PrepMounts(ctx context.Context, req *pb.MountPrepReq) (*pb.MountPrepResp, error) { + if req == nil { + return &pb.MountPrepResp{}, errors.New("invalid payload") + } + + // convert []*pb.MountPrepInfo -> []ipcLib.MountPrepInfo + prepList := []ipcLib.MountPrepInfo{} + for _, pbInfo := range req.PrepList { + info := ipcLib.MountPrepInfo{ + Source: pbInfo.GetSource(), + Exclusive: pbInfo.GetExclusive(), + } + prepList = append(prepList, info) + } + + if err := s.cb.PrepMounts(req.GetId(), req.GetUid(), req.GetGid(), prepList); err != nil { + return nil, err + } + + return &pb.MountPrepResp{}, nil +} + +func (s *ServerStub) ReqShiftfsMark(ctx context.Context, req *pb.ShiftfsMarkReq) (*pb.ShiftfsMarkResp, error) { + if req == nil { + return &pb.ShiftfsMarkResp{}, errors.New("invalid payload") + } + + // Convert pb.ShiftfsMark to shiftfs.MountPoint + reqList := []shiftfs.MountPoint{} + for _, m := range req.GetShiftfsMarks() { + sm := shiftfs.MountPoint{ + Source: m.Source, + Readonly: m.Readonly, + } + reqList = append(reqList, sm) + } + + respList, err := s.cb.ReqShiftfsMark(req.GetId(), reqList) + if err != nil { + return nil, err + } + + // Convert shiftfs.MountPoint to pb.ShiftfsMarkResp + markResp := []*pb.ShiftfsMark{} + for _, m := range respList { + sm := &pb.ShiftfsMark{ + Source: m.Source, + Readonly: m.Readonly, + } + markResp = append(markResp, sm) + } + + shiftfsMarkResp := &pb.ShiftfsMarkResp{ + ShiftfsMarks: markResp, + } + + return shiftfsMarkResp, nil +} + +func (s *ServerStub) ReqFsState( + ctx context.Context, + req *pb.FsStateReq) (*pb.FsStateResp, error) { + + if req == nil { + return &pb.FsStateResp{}, errors.New("invalid payload") + } + + fsState, err := s.cb.ReqFsState(req.GetId(), req.GetRootfs()) + if err != nil { + return nil, err + } + + // convert []configs.FsEntry -> []*pb.FsEntry + pbFsEntries := []*pb.FsEntry{} + for _, e := range fsState { + pbe := &pb.FsEntry{ 
+ Kind: uint32(e.GetKind()), + Path: e.GetPath(), + Mode: uint32(e.GetMode()), + Dst: e.GetDest(), + } + pbFsEntries = append(pbFsEntries, pbe) + } + + return &pb.FsStateResp{FsEntries: pbFsEntries}, nil +} + +func (s *ServerStub) Pause(ctx context.Context, req *pb.PauseReq) (*pb.PauseResp, error) { + if req == nil { + return &pb.PauseResp{}, errors.New("invalid payload") + } + + if err := s.cb.Pause(req.GetId()); err != nil { + return nil, err + } + + return &pb.PauseResp{}, nil +} + +func (s *ServerStub) Resume(ctx context.Context, req *pb.ResumeReq) (*pb.ResumeResp, error) { + if req == nil { + return &pb.ResumeResp{}, errors.New("invalid payload") + } + + if err := s.cb.Resume(req.GetId()); err != nil { + return nil, err + } + + return &pb.ResumeResp{}, nil +} + +func (s *ServerStub) ReqCloneRootfs(ctx context.Context, req *pb.CloneRootfsReq) (*pb.CloneRootfsResp, error) { + if req == nil { + return &pb.CloneRootfsResp{}, errors.New("invalid payload") + } + + rootfs, err := s.cb.CloneRootfs(req.GetId()) + if err != nil { + return nil, err + } + + return &pb.CloneRootfsResp{Rootfs: rootfs}, nil +} + +func (s *ServerStub) ChownClonedRootfs(ctx context.Context, req *pb.ChownClonedRootfsReq) (*pb.ChownClonedRootfsResp, error) { + if req == nil { + return &pb.ChownClonedRootfsResp{}, errors.New("invalid payload") + } + + err := s.cb.ChownClonedRootfs(req.GetId(), req.GetUidOffset(), req.GetGidOffset()) + if err != nil { + return nil, err + } + + return &pb.ChownClonedRootfsResp{}, nil +} + +func (s *ServerStub) RevertClonedRootfsChown(ctx context.Context, req *pb.RevertClonedRootfsChownReq) (*pb.RevertClonedRootfsChownResp, error) { + if req == nil { + return &pb.RevertClonedRootfsChownResp{}, errors.New("invalid payload") + } + + err := s.cb.RevertClonedRootfsChown(req.GetId()) + if err != nil { + return nil, err + } + + return &pb.RevertClonedRootfsChownResp{}, nil +} diff --git a/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf/Makefile 
b/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf/Makefile new file mode 100644 index 00000000..e69f39b2 --- /dev/null +++ b/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf/Makefile @@ -0,0 +1,39 @@ +# Build dependencies: +# +# 1) Install the protoc compiler that is used to generate gRPC service code. +# +# $ mkdir -p ~/bin/protoc +# $ cd ~/bin/protoc +# $ wget https://github.com/protocolbuffers/protobuf/releases/download/v3.6.1/protoc-3.6.1-linux-x86_64.zip +# $ unzip protoc-3.6.1-linux-x86_64.zip +# $ sudo cp -r include/* /usr/local/include/ +# $ sudo cp bin/protoc /usr/local/bin/ +# $ sudo chmod 755 /usr/local/bin/protoc [ providing execution rights to all users ] +# $ sudo chmod -R 755 /usr/local/include/google [ providing execution rights to all users ] +# +# temporary folder ~/bin/protoc can be now eliminated if desired: +# $ rm -rf ~/bin/protoc +# +# 2) Install protoc plugin for golang +# +# $ go get -u github.com/golang/protobuf/protoc-gen-go +# +# Make sure that PATH is properly set to cover $GOPATH/bin/: +# +# $ export PATH=$PATH:$GOPATH/bin +# + +.PHONY: clean + +RUNC_FS_SRC := sysboxMgrProtobuf.proto +RUNC_FS_PB := sysboxMgrProtobuf.pb.go + +.DEFAULT: $(RUNC_FS_PB) + +$(RUNC_FS_PB): $(RUNC_FS_SRC) + GOFLAGS='-buildvcs=false' protoc -I . -I /usr/local/include/ sysboxMgrProtobuf.proto --go_out=plugins=grpc:. + +clean: + rm -f $(RUNC_FS_PB) + +distclean: clean diff --git a/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf/sysboxMgrProtobuf.proto b/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf/sysboxMgrProtobuf.proto new file mode 100644 index 00000000..392e4fe4 --- /dev/null +++ b/sysbox-ipc/sysboxMgrGrpc/sysboxMgrProtobuf/sysboxMgrProtobuf.proto @@ -0,0 +1,271 @@ +// +// SysboxMgr Protobuffer Definitions. 
+// + +syntax = "proto3"; + +option go_package = "./;sysboxMgrProtobuf"; + +package protobuf; + +// +// sysboxMgr channel +// +service sysboxMgrStateChannel { + + // Container registration + rpc Register (RegisterReq) returns (RegisterResp) {} + + // Container Update + rpc Update (UpdateReq) returns (UpdateResp) {} + + // Container Unregistration + rpc Unregister (UnregisterReq) returns (UnregisterResp) {} + + // Subuid(gid) allocation request + rpc SubidAlloc (SubidAllocReq) returns (SubidAllocResp) {} + + // Mount source prep request + rpc PrepMounts (MountPrepReq) returns (MountPrepResp) {} + + // Mount request + rpc ReqMounts (MountReq) returns (MountResp) {} + + // Shiftfs mark request + rpc ReqShiftfsMark (ShiftfsMarkReq) returns (ShiftfsMarkResp) {} + + // FsState request + rpc ReqFsState (FsStateReq) returns (FsStateResp) {} + + // Pause request + rpc Pause (PauseReq) returns (PauseResp) {} + + // Resume request + rpc Resume (ResumeReq) returns (ResumeResp) {} + + // Clone rootfs request + rpc ReqCloneRootfs (CloneRootfsReq) returns (CloneRootfsResp) {} + + // Chown cloned rootfs request + rpc ChownClonedRootfs (ChownClonedRootfsReq) returns (ChownClonedRootfsResp) {} + + // Revert cloned rootfs chown + rpc RevertClonedRootfsChown (RevertClonedRootfsChownReq) returns (RevertClonedRootfsChownResp) {} +} + +// +// Registration & Unregistration +// + +message IDMapping { + uint32 containerID = 1; + uint32 hostID = 2; + uint32 size = 3; +} + +message RegisterReq { + string id = 1; + string rootfs = 2; + string userns = 3; + string netns = 4; + repeated IDMapping uidMappings = 5; + repeated IDMapping gidMappings = 6; +} + +message ContainerConfig { + bool aliasDns = 1; + bool ShiftfsOk = 2; + bool ShiftfsOnOverlayfsOk = 3; + bool IDMapMountOk = 4; + bool OverlayfsOnIDMapMountOk = 5; + bool noRootfsCloning = 6; + bool ignoreSysfsChown = 7; + bool allowTrustedXattr = 8; + bool honorCaps = 9; + bool syscontMode = 10; + string userns = 11; + repeated IDMapping 
uidMappings = 12; + repeated IDMapping gidMappings = 13; + bool fsuidMapFailOnErr = 14; + uint32 rootfsUidShiftType = 15; + bool noShiftfsOnFuse = 16; + bool relaxedReadOnly = 17; +} + +message RegisterResp { + ContainerConfig containerConfig = 1; +} + +message UpdateReq { + string id = 1; + string userns = 2; + string netns = 3; + repeated IDMapping uidMappings = 4; + repeated IDMapping gidMappings = 5; + uint32 rootfsUidShiftType = 6; +} + +message UpdateResp { +} + +message UnregisterReq { + string id = 1; +} + +message UnregisterResp { +} + +// +// Subid alloc +// + +message SubidAllocReq { + string id = 1; + uint64 size = 2; +} + +message SubidAllocResp { + uint32 uid = 1; + uint32 gid = 2; +} + +// +// Mount Source Prep Request +// + +message MountPrepInfo { + string source = 1; + bool exclusive = 2; +} + +message MountPrepReq { + string id = 1; + uint32 uid = 2; + uint32 gid = 3; + repeated MountPrepInfo prepList = 4; +} + +message MountPrepResp { +} + +// +// Mount Requests +// + +message MountReqInfo { + uint32 kind = 1; + string dest = 2; +} + +message MountReq { + string id = 1; + uint32 rootfsUidShiftType = 2; + repeated MountReqInfo reqList = 3; +} + +message Mount { + string source = 1; + string dest = 2; + string type = 3; + repeated string opt = 4; +} + +message MountResp { + repeated Mount mounts = 1; +} + +// +// Shiftfs mark request +// + +message ShiftfsMark { + string source = 1; + bool readonly = 2; +} + +message ShiftfsMarkReq { + string id = 1; + repeated ShiftfsMark shiftfsMarks = 2; +} + +message ShiftfsMarkResp { + repeated ShiftfsMark shiftfsMarks = 1; +} + +// +// FsState Requests +// + +message FsStateReq { + string id = 1; + string rootfs = 2; +} + +message FsEntry { + uint32 kind = 1; + string path = 2; + uint32 mode = 3; + string dst = 4; +} + +message FsStateResp { + repeated FsEntry fsEntries = 1; +} + +// +// Pause request +// + +message PauseReq { + string id = 1; +} + +message PauseResp { +} + +// +// Resume request +// + 
+message ResumeReq { + string id = 1; +} + +message ResumeResp { +} + +// +// CloneRootfs request +// + +message CloneRootfsReq { + string id = 1; +} + +message CloneRootfsResp { + string rootfs = 1; +} + +// +// ChownClonedRootfs request +// + +message ChownClonedRootfsReq { + string id = 1; + int32 uidOffset = 2; + int32 gidOffset = 3; +} + +message ChownClonedRootfsResp { +} + +// +// RevertClonedRootfsChown request +// + +message RevertClonedRootfsChownReq { + string id = 1; +} + +message RevertClonedRootfsChownResp { +} diff --git a/sysbox-ipc/sysboxMgrLib/sysboxMgrLib.go b/sysbox-ipc/sysboxMgrLib/sysboxMgrLib.go new file mode 100644 index 00000000..9f8272d7 --- /dev/null +++ b/sysbox-ipc/sysboxMgrLib/sysboxMgrLib.go @@ -0,0 +1,117 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// Common definitions for grpc transfers with sysbox-mgr + +package sysboxMgrLib + +import ( + "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Sysbox-mgr container registration info +type RegistrationInfo struct { + Id string + Rootfs string + Userns string + Netns string + UidMappings []specs.LinuxIDMapping + GidMappings []specs.LinuxIDMapping +} + +// Sysbox-mgr container update info +type UpdateInfo struct { + Id string + Userns string + Netns string + UidMappings []specs.LinuxIDMapping + GidMappings []specs.LinuxIDMapping + RootfsUidShiftType idShiftUtils.IDShiftType +} + +// Sysbox-mgr mandated container configs (passed from sysbox-mgr -> sysbox-runc) +type ContainerConfig struct { + AliasDns bool + ShiftfsOk bool + ShiftfsOnOverlayfsOk bool + NoShiftfsOnFuse bool + IDMapMountOk bool + OverlayfsOnIDMapMountOk bool + NoRootfsCloning bool + IgnoreSysfsChown bool + AllowTrustedXattr bool + HonorCaps bool + SyscontMode bool + Userns string + UidMappings []specs.LinuxIDMapping + GidMappings []specs.LinuxIDMapping + FsuidMapFailOnErr bool + RootfsUidShiftType idShiftUtils.IDShiftType + RelaxedReadOnly bool +} + +// +// Mount requests from sysbox-runc to sysbox-mgr +// + +type MountPrepInfo struct { + Source string + Exclusive bool +} + +type MntKind int + +const ( + MntVarLibDocker MntKind = iota + MntVarLibKubelet + MntVarLibK0s + MntVarLibRancherK3s + MntVarLibRancherRke2 + MntVarLibBuildkit + MntVarLibContainerdOvfs + MntUsrSrcLinuxHdr +) + +func (k MntKind) String() string { + str := "unknown" + + switch k { + case MntVarLibDocker: + str = "var-lib-docker" + case MntVarLibKubelet: + str = "var-lib-kubelet" + case MntVarLibK0s: + str = "var-lib-k0s" + case MntVarLibRancherK3s: + str = "var-lib-rancher-k3s" + case MntVarLibRancherRke2: + str = "var-lib-rancher-rke2" + case MntVarLibBuildkit: + str = "var-lib-buildkit" + case MntVarLibContainerdOvfs: + str = "var-lib-containerd-ovfs" + case 
MntUsrSrcLinuxHdr: + str = "usr-src-linux-header" + } + + return str +} + +type MountReqInfo struct { + Kind MntKind + Dest string +} diff --git a/sysbox-ipc/unix/io.go b/sysbox-ipc/unix/io.go new file mode 100644 index 00000000..a1f327a4 --- /dev/null +++ b/sysbox-ipc/unix/io.go @@ -0,0 +1,265 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package unix + +import ( + "bytes" + "encoding/json" + "fmt" + "net" + "os" + "path" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type Server struct { + listener net.UnixListener + handler func(*net.UnixConn) +} + +// NewServer constructs a unix-server to handle inbound connections made to the +// 'addr' unix-socket. Upon establishment, the connection will be handled by the +// 'func' closure passed as parameter. 
+func NewServer(addr string, handler func(*net.UnixConn)) (*Server, error) { + + if err := os.RemoveAll(addr); err != nil { + logrus.Errorf("Unable to remove address %v (%v).", addr, err) + return nil, err + } + + if err := os.MkdirAll(path.Dir(addr), 0700); err != nil { + logrus.Errorf("Unable to mkdir %v (%v).", path.Dir(addr), err) + return nil, err + } + + unixAddr, err := net.ResolveUnixAddr("unix", addr) + if err != nil { + logrus.Errorf("Unable to resolve address %v (%v).", addr, err) + return nil, err + } + + listener, err := net.ListenUnix("unix", unixAddr) + if err != nil { + logrus.Errorf("Unable to listen through addr %v (%v).", addr, err) + return nil, err + } + + err = os.Chmod(addr, 0600) + if err != nil { + logrus.Errorf("Unable to set %v socket permissions (%v).", addr, err) + return nil, err + } + + srv := &Server{ + listener: *listener, + handler: handler, + } + + go srv.run() + + return srv, nil +} + +func (s *Server) run() { + + // TODO: Handler stop-signals from main() thread. + for { + conn, err := s.listener.AcceptUnix() + if err != nil { + logrus.Errorf("Unable to establish connection (%v).", err) + return + } + + go s.handler(conn) + } +} + +func Connect(addr string) (*net.UnixConn, error) { + + unixAddr, err := net.ResolveUnixAddr("unix", addr) + if err != nil { + logrus.Errorf("Unable to resolve address %v (%v).", addr, err) + return nil, err + } + + conn, err := net.DialUnix("unix", nil, unixAddr) + if err != nil { + logrus.Errorf("Unable to dial to addr %v (%v).", addr, err) + return nil, err + } + + return conn, nil +} + +type seccompInit struct { + Pid int32 `json:"pid"` + CntrId string `json:"cntrId"` +} + +func RecvSeccompInitMsg(c *net.UnixConn) (int32, string, int32, error) { + + // TODO: Define these literals in a proper location. 
+ // {"pid": 27693,"cntrId":"54970eb2fa086ccd0f550b23679d13fc577e041be14121d78d9389ec051b20f8"} + const inbLength = 100 // 4 bytes pid + 64 bytes cntrId + padding + var oobLength = unix.CmsgSpace(4) + + inb := make([]byte, inbLength) + oob := make([]byte, oobLength) + + if err := recvGenericMsg(c, inb, oob); err != nil { + return -1, "", -1, err + } + + // Parse received control-msg to extract one file-descriptor. + fd, err := parseScmRightsFd(c, oob) + if err != nil { + return -1, "", -1, err + } + + // Remove any null character that may have come along. + payload := bytes.TrimRight(inb, "\x00") + + // Decode inband payload msg. + var seccompInit seccompInit + err = json.Unmarshal(payload, &seccompInit) + if err != nil { + return -1, "", -1, err + } + + return seccompInit.Pid, seccompInit.CntrId, fd, nil +} + +func SendSeccompInitMsg( + c *net.UnixConn, pid int32, cntrId string, fd int32) error { + + // Construct scm message. + oob := unix.UnixRights(int(fd)) + + seccompInit := &seccompInit{Pid: pid, CntrId: cntrId} + inb, err := json.Marshal(seccompInit) + if err != nil { + logrus.Errorf("Could not encode seccompInit payload (%v)", err) + return err + } + + // Send payload + scm messages. + err = sendGenericMsg(c, inb, oob) + if err != nil { + return err + } + + return nil +} + +func RecvSeccompInitAckMsg(c *net.UnixConn) error { + + buf := make([]byte, 3) + + // Send payload. + err := recvGenericMsg(c, buf, nil) + if err != nil { + return err + } + + if string(buf) != "ack" { + return fmt.Errorf("invalid ack: %v", buf) + } + + return nil +} + +func SendSeccompInitAckMsg(c *net.UnixConn) error { + + // Send payload. 
+ err := sendGenericMsg(c, []byte("ack"), nil) + if err != nil { + return err + } + + return nil +} + +func recvGenericMsg(c *net.UnixConn, inb []byte, oob []byte) error { + + inbSize := len(inb) + oobSize := len(oob) + + inbn, oobn, _, _, err := c.ReadMsgUnix(inb, oob) + if err != nil { + logrus.Errorf("Unable to read message from endpoint %v (%v).", + c.RemoteAddr(), err) + return err + } + + if inbn > inbSize || oobn > oobSize { + logrus.Errorf("Invalid msg received from endpoint %v", c.RemoteAddr()) + return err + } + + // Truncate inband and outbound buffers to match received sizes. + inb = inb[:inbn] + oob = oob[:oobn] + + return nil +} + +func sendGenericMsg(c *net.UnixConn, inb []byte, oob []byte) error { + + inbSize := len(inb) + oobSize := len(oob) + + inbn, oobn, err := c.WriteMsgUnix(inb, oob, nil) + if err != nil { + logrus.Errorf("Unable to write message to endpoint %v", c.RemoteAddr()) + return err + } + + if inbn < inbSize || oobn < oobSize { + logrus.Errorf("Invalid msg sent to endpoint %v", c.RemoteAddr()) + return err + } + + return nil +} + +func parseScmRightsFd(c *net.UnixConn, oob []byte) (int32, error) { + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + logrus.Errorf("Unexpected error while parsing SocketControlMessage msg") + return 0, err + } + if len(scms) != 1 { + logrus.Errorf("Unexpected number of SocketControlMessages received: expected 1, received %v", + len(scms)) + return -1, err + } + + fds, err := unix.ParseUnixRights(&scms[0]) + if err != nil { + return -1, err + } + if len(fds) != 1 { + return -1, fmt.Errorf("Unexpected number of fd's received: expected 1, received %v", + len(fds)) + } + fd := int32(fds[0]) + + return fd, nil +} diff --git a/sysbox-ipc/unix/pollServer.go b/sysbox-ipc/unix/pollServer.go new file mode 100644 index 00000000..5e70da12 --- /dev/null +++ b/sysbox-ipc/unix/pollServer.go @@ -0,0 +1,440 @@ +// +// Copyright 2019-2020 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package unix + +import ( + "fmt" + "os" + "sync" + "syscall" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// +// The following PollServer implementation offers non-blocking I/O capabilities +// to applications that interact with generic file-descriptors. For network +// specific I/O there are known implementations providing similar functionality. +// +// The following API is offered as part of this PollServer implementation: +// +// StartWaitRead(): Place caller goroutine in 'standby' mode till incoming +// traffic is detected over the respective file-descriptor (POLLIN revent +// received). +// +// StartWaitWrite(): Place caller goroutine in 'standby' mode till outgoing +// traffic can be sent through the respective file-descriptor (POLLOUT revent +// received). +// +// StopWait(): Interrupts pollserver event-loop to update the list of file +// descriptors to poll on. This 'interruption' process relies on the use of a +// unidirectional pipe whose receiving-end is being added to the list of file +// descriptors to monitor. This voids the need to define an explicit 'timeout' +// interval during polling attempts. +// + +type pollActionType uint8 + +const ( + CREATE_POLL_REQUEST pollActionType = iota + DELETE_POLL_REQUEST +) + +// Defines the poll action to execute for a given file-descriptor. 
Each +// poll-server client allocates one of these objects to interact with the +// poll-server. +type pollAction struct { + + // File-descriptor to poll(). + fd int32 + + // Action the poll-server client is interested on: 'addition' or 'deletion' + // of the fd into/from the poll-event loop. + actionType pollActionType + + // Channel on which the poll-server client waits during read-request + // operations. + waitReadCh chan error + + // Channel on which the poll-server client waits during write-request + // operations. + waitWriteCh chan error + + // Channel on which the poll-server client waits for fds to be + // completely eliminated from the pollServer event-loop. + waitCloseCh chan error +} + +func newPollAction(fd int32, action pollActionType) *pollAction { + return &pollAction{ + fd: fd, + actionType: action, + waitReadCh: make(chan error), + waitWriteCh: make(chan error), + waitCloseCh: make(chan error), + } +} + +// General pollserver struct. +type PollServer struct { + sync.RWMutex + + // 'fd' to 'pollAction' map. This map holds one pollAction per poll-server + // client. + pollActionMap map[int32]*pollAction + + // Buffered channel through which pollActions arrive. + pollActionCh chan *pollAction + + // Wake-up pipe -- writing end + wakeupReader *os.File + + // Wake-up pipe -- receiving end + wakeupWriter *os.File +} + +func NewPollServer() (*PollServer, error) { + var err error + + // PollServer struct initialization. Notice that pollActionCh must be buffered + // to allow incoming pollActions to be injected by pollserver clients *before* + // the wakeup signal interrupts the polling cycle (see comments below as part + // of pushPollAction method). + ps := &PollServer{ + pollActionMap: make(map[int32]*pollAction), + pollActionCh: make(chan *pollAction, 1), + } + + // Initialize pollserver's wakeup pipe. 
+ ps.wakeupReader, ps.wakeupWriter, err = os.Pipe() + if err != nil { + logrus.Errorf("Unable to initialize wakeup pipe in pollserver (%v)", err) + return nil, err + } + + // Add wakeup pipe's received-end to the map of fds to monitor. + ps.pollActionMap[int32(ps.wakeupReader.Fd())] = nil + + go ps.run() + + return ps, nil +} + +func (ps *PollServer) StartWaitRead(fd int32) error { + + ps.RLock() + _, ok := ps.pollActionMap[fd] + if ok { + ps.RUnlock() + return fmt.Errorf("Unexpected fd %d found in pollServer DB", fd) + } + ps.RUnlock() + + pa := newPollAction(fd, CREATE_POLL_REQUEST) + + if err := ps.pushPollAction(pa); err != nil { + return err + } + + // Block till a POLLIN event (or any POLL-error) is received for this fd. + err := <-pa.waitReadCh + + return err +} + +func (ps *PollServer) StartWaitWrite(fd int32) error { + + ps.RLock() + _, ok := ps.pollActionMap[fd] + if ok { + ps.RUnlock() + return fmt.Errorf("Unexpected fd %d found in pollServer DB", fd) + } + ps.RUnlock() + + pa := newPollAction(fd, CREATE_POLL_REQUEST) + + if err := ps.pushPollAction(pa); err != nil { + return err + } + + // Block till a POLLOUT event (or any POLL-error) is received for this fd. + err := <-pa.waitWriteCh + + return err +} + +func (ps *PollServer) StopWait(fd int32) error { + + // We are making a write operation below over the shared pollAction object, + // so let's acquire the write lock in this case. + ps.Lock() + pa, ok := ps.pollActionMap[fd] + if !ok { + ps.Unlock() + return nil + } + + // Re-tag the existing pollAction as a "delete" one, so that the pollServer + // client that's currently waiting on the associated fd, can now wakeup from + // its nap. + pa.actionType = DELETE_POLL_REQUEST + ps.Unlock() + + if err := ps.pushPollAction(pa); err != nil { + return err + } + + // Wait for an ack from the pollServer's event-loop to confirm that the + // "delete" has been fully processed. + err := <-pa.waitCloseCh + + return err +} + +// Runs pollserver event-loop. 
+func (ps *PollServer) run() { + + for { + // Build file-descriptor slice out of the map that tracks all the polling + // actions to process. No need to acquire rlock here as this is the very + // goroutine (and the only one) that modifies pollServer internal structs. + i := 0 + ps.RLock() + pfds := make([]unix.PollFd, len(ps.pollActionMap)) + for k := range ps.pollActionMap { + pfds[i].Fd = k + pfds[i].Events = unix.POLLIN + i++ + } + ps.RUnlock() + + // Initiating polling attempt. Notice that no timeout interval is passed. + n, err := unix.Poll(pfds, -1) + if err != nil && err != syscall.EINTR { + logrus.Debugf("pollserver: error during poll() syscall (%v)", err) + break + } + if n <= 0 { + logrus.Debugf("pollserver: unexpected value (n = %d) during poll() syscall", n) + continue + } + + // Iterate through all fds to evaluate i/o activity. + for _, pfd := range pfds { + + if pfd.Revents&(unix.POLLHUP|unix.POLLNVAL|unix.POLLERR) != 0 { + if pfd.Revents&unix.POLLHUP == unix.POLLHUP { + ps.processPollHupEvent(&pfd) + } + if pfd.Revents&unix.POLLNVAL == unix.POLLNVAL { + ps.processPollNvalEvent(&pfd) + } + if pfd.Revents&unix.POLLERR == unix.POLLERR { + ps.processPollErrEvent(&pfd) + } + + } else if pfd.Revents&unix.POLLIN == unix.POLLIN { + if pfd.Fd == int32(ps.wakeupReader.Fd()) { + ps.processPollWakeupEvent(&pfd) + continue + } + ps.processPollInEvent(&pfd) + + } else if pfd.Revents&unix.POLLOUT == unix.POLLOUT { + ps.processPollOutEvent(&pfd) + } + } + } +} + +// Processes 'pollhup' events received through one of the monitored fds. +func (ps *PollServer) processPollHupEvent(pfd *unix.PollFd) error { + + logrus.Debugf("pollserver: POLLHUP event received on fd %d", pfd.Fd) + + ps.Lock() + pa, ok := ps.pollActionMap[pfd.Fd] + if !ok { + ps.Unlock() + return fmt.Errorf("pollserver: POLLHUP event received on unknown fd %v", + pfd.Fd) + } + + // Delete fd from pollAction map. + delete(ps.pollActionMap, pfd.Fd) + ps.Unlock() + + // Wake-up pollserver client. 
+ pa.waitReadCh <- fmt.Errorf("PollHup event received") + + return nil +} + +// Processes 'pollnval' events received through one of the monitored fds. +func (ps *PollServer) processPollNvalEvent(pfd *unix.PollFd) error { + + logrus.Debugf("pollserver: POLLNVAL event received on fd %d", pfd.Fd) + + ps.Lock() + pa, ok := ps.pollActionMap[pfd.Fd] + if !ok { + ps.Unlock() + return fmt.Errorf("pollserver: POLLNVAL event received on unknown fd %v", + pfd.Fd) + } + + // Delete fd from pollAction map. + delete(ps.pollActionMap, pfd.Fd) + ps.Unlock() + + // Wake-up pollserver client. + pa.waitReadCh <- fmt.Errorf("PollNval event received") + + return nil +} + +// Processes 'pollErr' events received through one of the monitored fds. +func (ps *PollServer) processPollErrEvent(pfd *unix.PollFd) error { + + logrus.Debugf("pollserver: POLLERR event received on fd %d", pfd.Fd) + + ps.Lock() + pa, ok := ps.pollActionMap[pfd.Fd] + if !ok { + ps.Unlock() + return fmt.Errorf("pollserver: POLLERR event received on unknown fd %v", + pfd.Fd) + } + + // Delete fd from pollAction map. + delete(ps.pollActionMap, pfd.Fd) + ps.Unlock() + + // Wake-up pollserver client. + pa.waitReadCh <- fmt.Errorf("PollErr event received") + + return nil +} + +// Processes out-of-band 'wakeup' events generated by pollserver clients. +func (ps *PollServer) processPollWakeupEvent(pfd *unix.PollFd) error { + + logrus.Debugf("pollserver: WAKEUP event received on fd %d", pfd.Fd) + + // TODO: Define this literal in a global var and document its rationale. 
+ buf := make([]byte, 100) + _, err := ps.wakeupReader.Read(buf) + if err != nil { + logrus.Errorf("processPollWakeupEvent read error (%v)", err) + return fmt.Errorf("processPollWakeupEvent read error (%v)", err) + } + + // Collect received pollAction and add it to the pollActionMap + pa := <-ps.pollActionCh + + switch pa.actionType { + + case CREATE_POLL_REQUEST: + ps.Lock() + ps.pollActionMap[pa.fd] = pa + ps.Unlock() + + case DELETE_POLL_REQUEST: + ps.Lock() + oldPa, ok := ps.pollActionMap[pa.fd] + if !ok { + ps.Unlock() + return fmt.Errorf("Could not find pollAction to delete for fd %d", pa.fd) + } + delete(ps.pollActionMap, pa.fd) + ps.Unlock() + + // Send an error back to the pollserver client to wake him up from his + // nap. + closedFdError := fmt.Errorf("Interrupted poll operation: closed file-descriptor") + oldPa.waitReadCh <- closedFdError + + oldPa.waitCloseCh <- nil + } + + return nil +} + +// Processes 'pollin' events received through one of the monitored fds. +func (ps *PollServer) processPollInEvent(pfd *unix.PollFd) error { + + logrus.Debugf("pollserver: POLLIN event received on fd %d", pfd.Fd) + + ps.Lock() + pa, ok := ps.pollActionMap[pfd.Fd] + if !ok { + ps.Unlock() + return fmt.Errorf("pollserver: POLLIN event received on unknown fd %v", + pfd.Fd) + } + + // Delete fd from pollAction map. + delete(ps.pollActionMap, pfd.Fd) + ps.Unlock() + + // Wake-up pollserver client. + pa.waitReadCh <- nil + + return nil +} + +// Processes 'pollout' events received through one of the monitored fds. +func (ps *PollServer) processPollOutEvent(pfd *unix.PollFd) error { + + logrus.Debugf("pollserver: POLLOUT event received on fd %d", pfd.Fd) + + ps.Lock() + pa, ok := ps.pollActionMap[pfd.Fd] + if !ok { + ps.Unlock() + return fmt.Errorf("pollserver: POLLOUT event received on unknown fd %v", + pfd.Fd) + } + + // Delete fd from action map. + delete(ps.pollActionMap, pfd.Fd) + ps.Unlock() + + // Wakeup pollserver client. 
+ pa.waitReadCh <- nil + + return nil +} + +// Method pushes incoming pollActions into the pollserver's event-loop. Notice +// that the order of instructions in this method is critical: we must first +// send the pollAction, and then proceed to generate the wakeup signal. If we +// were to invert the order, no incoming state (pollAction) could have been +// received by the time that the pollserver awakes, which would end up with +// the pollserver going back to sleep right before the pollAction arrives. +func (ps *PollServer) pushPollAction(pa *pollAction) error { + + ps.pollActionCh <- pa + + // Write into pollserver's wakeupWriter end to interrupt poll() event loop. + var buf [1]byte + ps.wakeupWriter.Write(buf[0:]) + + return nil +} diff --git a/sysbox-libs b/sysbox-libs deleted file mode 160000 index 2ccacbeb..00000000 --- a/sysbox-libs +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2ccacbeb20a7095b1ea1b6d2916cd0fc5be498bb diff --git a/sysbox-libs/CONTRIBUTING.md b/sysbox-libs/CONTRIBUTING.md new file mode 100644 index 00000000..ced90edf --- /dev/null +++ b/sysbox-libs/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contribute to Sysbox-libs + +Sysbox-libs is a component of the Sysbox container runtime. If you want to +contribute, please refer to the Sysbox contribution +[guidelines](https://github.com/nestybox/sysbox/blob/master/CONTRIBUTING.md). diff --git a/sysbox-libs/LICENSE b/sysbox-libs/LICENSE new file mode 100644 index 00000000..c6087d5b --- /dev/null +++ b/sysbox-libs/LICENSE @@ -0,0 +1,191 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + Copyright 2020 Nestybox, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/sysbox-libs/MAINTAINERS b/sysbox-libs/MAINTAINERS new file mode 100644 index 00000000..3af2dbb0 --- /dev/null +++ b/sysbox-libs/MAINTAINERS @@ -0,0 +1,2 @@ +Rodny Molina (@rodnymolina) +Cesar Talledo (@ctalledo) diff --git a/sysbox-libs/Makefile b/sysbox-libs/Makefile new file mode 100644 index 00000000..f22965e9 --- /dev/null +++ b/sysbox-libs/Makefile @@ -0,0 +1,23 @@ +# +# sysbox-libs Makefile +# + +.PHONY: lint listpackages + +GO := go + +lint: + @for d in $(_allgodirs); do \ + cd $$d; \ + $(GO) vet ./...; \ + $(GO) fmt ./...; \ + cd ..; \ + done + +listpackages: + @echo $(allpackages) + +# memoize allpackages, so that it's executed only once and only if used +_allgodirs = $(shell $(GO) list ./... | xargs basename -s) +_allpackages = $(shell for d in $(_allgodirs); do cd $$d && $(GO) list ./... | grep -v vendor; cd ..; done) +allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages) diff --git a/sysbox-libs/capability/LICENSE b/sysbox-libs/capability/LICENSE new file mode 100644 index 00000000..b9d84ccd --- /dev/null +++ b/sysbox-libs/capability/LICENSE @@ -0,0 +1,24 @@ +Copyright 2013 Suryandaru Triandana +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/sysbox-libs/capability/capability.go b/sysbox-libs/capability/capability.go new file mode 100644 index 00000000..061a0b51 --- /dev/null +++ b/sysbox-libs/capability/capability.go @@ -0,0 +1,669 @@ +// +// Copyright: (C) 2020 Nestybox Inc. All rights reserved. +// + +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package capability provides utilities for manipulating POSIX capabilities. 
+ +package capability + +type CapType uint + +func (c CapType) String() string { + switch c { + case EFFECTIVE: + return "effective" + case PERMITTED: + return "permitted" + case INHERITABLE: + return "inheritable" + case BOUNDING: + return "bounding" + case CAPS: + return "caps" + case AMBIENT: + return "ambient" + } + return "unknown" +} + +const ( + EFFECTIVE CapType = 1 << iota + PERMITTED + INHERITABLE + BOUNDING + AMBIENT + + CAPS = EFFECTIVE | PERMITTED | INHERITABLE + BOUNDS = BOUNDING + AMBS = AMBIENT +) + +type CapFormat uint + +const ( + STRING CapFormat = iota + OCI_STRING +) + +//go:generate go run enumgen/gen.go +type Cap int + +// POSIX-draft defined capabilities and Linux extensions. +// +// Defined in https://github.com/torvalds/linux/blob/master/include/uapi/linux/capability.h +const ( + // In a system with the [_POSIX_CHOWN_RESTRICTED] option defined, this + // overrides the restriction of changing file ownership and group + // ownership. + CAP_CHOWN = Cap(0) + + // Override all DAC access, including ACL execute access if + // [_POSIX_ACL] is defined. Excluding DAC access covered by + // CAP_LINUX_IMMUTABLE. + CAP_DAC_OVERRIDE = Cap(1) + + // Overrides all DAC restrictions regarding read and search on files + // and directories, including ACL restrictions if [_POSIX_ACL] is + // defined. Excluding DAC access covered by CAP_LINUX_IMMUTABLE. + CAP_DAC_READ_SEARCH = Cap(2) + + // Overrides all restrictions about allowed operations on files, where + // file owner ID must be equal to the user ID, except where CAP_FSETID + // is applicable. It doesn't override MAC and DAC restrictions. 
+ CAP_FOWNER = Cap(3) + + // Overrides the following restrictions that the effective user ID + // shall match the file owner ID when setting the S_ISUID and S_ISGID + // bits on that file; that the effective group ID (or one of the + // supplementary group IDs) shall match the file owner ID when setting + // the S_ISGID bit on that file; that the S_ISUID and S_ISGID bits are + // cleared on successful return from chown(2) (not implemented). + CAP_FSETID = Cap(4) + + // Overrides the restriction that the real or effective user ID of a + // process sending a signal must match the real or effective user ID + // of the process receiving the signal. + CAP_KILL = Cap(5) + + // Allows setgid(2) manipulation + // Allows setgroups(2) + // Allows forged gids on socket credentials passing. + CAP_SETGID = Cap(6) + + // Allows set*uid(2) manipulation (including fsuid). + // Allows forged pids on socket credentials passing. + CAP_SETUID = Cap(7) + + // Linux-specific capabilities + + // Without VFS support for capabilities: + // Transfer any capability in your permitted set to any pid, + // remove any capability in your permitted set from any pid + // With VFS support for capabilities (neither of above, but) + // Add any capability from current's capability bounding set + // to the current process' inheritable set + // Allow taking bits out of capability bounding set + // Allow modification of the securebits for a process + CAP_SETPCAP = Cap(8) + + // Allow modification of S_IMMUTABLE and S_APPEND file attributes + CAP_LINUX_IMMUTABLE = Cap(9) + + // Allows binding to TCP/UDP sockets below 1024 + // Allows binding to ATM VCIs below 32 + CAP_NET_BIND_SERVICE = Cap(10) + + // Allow broadcasting, listen to multicast + CAP_NET_BROADCAST = Cap(11) + + // Allow interface configuration + // Allow administration of IP firewall, masquerading and accounting + // Allow setting debug option on sockets + // Allow modification of routing tables + // Allow setting arbitrary process / process 
group ownership on + // sockets + // Allow binding to any address for transparent proxying (also via NET_RAW) + // Allow setting TOS (type of service) + // Allow setting promiscuous mode + // Allow clearing driver statistics + // Allow multicasting + // Allow read/write of device-specific registers + // Allow activation of ATM control sockets + CAP_NET_ADMIN = Cap(12) + + // Allow use of RAW sockets + // Allow use of PACKET sockets + // Allow binding to any address for transparent proxying (also via NET_ADMIN) + CAP_NET_RAW = Cap(13) + + // Allow locking of shared memory segments + // Allow mlock and mlockall (which doesn't really have anything to do + // with IPC) + CAP_IPC_LOCK = Cap(14) + + // Override IPC ownership checks + CAP_IPC_OWNER = Cap(15) + + // Insert and remove kernel modules - modify kernel without limit + CAP_SYS_MODULE = Cap(16) + + // Allow ioperm/iopl access + // Allow sending USB messages to any device via /proc/bus/usb + CAP_SYS_RAWIO = Cap(17) + + // Allow use of chroot() + CAP_SYS_CHROOT = Cap(18) + + // Allow ptrace() of any process + CAP_SYS_PTRACE = Cap(19) + + // Allow configuration of process accounting + CAP_SYS_PACCT = Cap(20) + + // Allow configuration of the secure attention key + // Allow administration of the random device + // Allow examination and configuration of disk quotas + // Allow setting the domainname + // Allow setting the hostname + // Allow calling bdflush() + // Allow mount() and umount(), setting up new smb connection + // Allow some autofs root ioctls + // Allow nfsservctl + // Allow VM86_REQUEST_IRQ + // Allow to read/write pci config on alpha + // Allow irix_prctl on mips (setstacksize) + // Allow flushing all cache on m68k (sys_cacheflush) + // Allow removing semaphores + // Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores + // and shared memory + // Allow locking/unlocking of shared memory segment + // Allow turning swap on/off + // Allow forged pids on socket credentials passing + // Allow 
setting readahead and flushing buffers on block devices + // Allow setting geometry in floppy driver + // Allow turning DMA on/off in xd driver + // Allow administration of md devices (mostly the above, but some + // extra ioctls) + // Allow tuning the ide driver + // Allow access to the nvram device + // Allow administration of apm_bios, serial and bttv (TV) device + // Allow manufacturer commands in isdn CAPI support driver + // Allow reading non-standardized portions of pci configuration space + // Allow DDI debug ioctl on sbpcd driver + // Allow setting up serial ports + // Allow sending raw qic-117 commands + // Allow enabling/disabling tagged queuing on SCSI controllers and sending + // arbitrary SCSI commands + // Allow setting encryption key on loopback filesystem + // Allow setting zone reclaim policy + // Allow everything under CAP_BPF and CAP_PERFMON for backward compatibility + CAP_SYS_ADMIN = Cap(21) + + // Allow use of reboot() + CAP_SYS_BOOT = Cap(22) + + // Allow raising priority and setting priority on other (different + // UID) processes + // Allow use of FIFO and round-robin (realtime) scheduling on own + // processes and setting the scheduling algorithm used by another + // process. + // Allow setting cpu affinity on other processes + CAP_SYS_NICE = Cap(23) + + // Override resource limits. Set resource limits. + // Override quota limits. 
+ // Override reserved space on ext2 filesystem + // Modify data journaling mode on ext3 filesystem (uses journaling + // resources) + // NOTE: ext2 honors fsuid when checking for resource overrides, so + // you can override using fsuid too + // Override size restrictions on IPC message queues + // Allow more than 64hz interrupts from the real-time clock + // Override max number of consoles on console allocation + // Override max number of keymaps + // Control memory reclaim behavior + CAP_SYS_RESOURCE = Cap(24) + + // Allow manipulation of system clock + // Allow irix_stime on mips + // Allow setting the real-time clock + CAP_SYS_TIME = Cap(25) + + // Allow configuration of tty devices + // Allow vhangup() of tty + CAP_SYS_TTY_CONFIG = Cap(26) + + // Allow the privileged aspects of mknod() + CAP_MKNOD = Cap(27) + + // Allow taking of leases on files + CAP_LEASE = Cap(28) + + CAP_AUDIT_WRITE = Cap(29) + CAP_AUDIT_CONTROL = Cap(30) + CAP_SETFCAP = Cap(31) + + // Override MAC access. + // The base kernel enforces no MAC policy. + // An LSM may enforce a MAC policy, and if it does and it chooses + // to implement capability based overrides of that policy, this is + // the capability it should use to do so. + CAP_MAC_OVERRIDE = Cap(32) + + // Allow MAC configuration or state changes. + // The base kernel requires no MAC configuration. + // An LSM may enforce a MAC policy, and if it does and it chooses + // to implement capability based checks on modifications to that + // policy or the data required to maintain it, this is the + // capability it should use to do so. 
+ CAP_MAC_ADMIN = Cap(33) + + // Allow configuring the kernel's syslog (printk behaviour) + CAP_SYSLOG = Cap(34) + + // Allow triggering something that will wake the system + CAP_WAKE_ALARM = Cap(35) + + // Allow preventing system suspends + CAP_BLOCK_SUSPEND = Cap(36) + + // Allow reading the audit log via multicast netlink socket + CAP_AUDIT_READ = Cap(37) + + // Allow system performance and observability privileged operations + // using perf_events, i915_perf and other kernel subsystems + CAP_PERFMON = Cap(38) + + // CAP_BPF allows the following BPF operations: + // - Creating all types of BPF maps + // - Advanced verifier features + // - Indirect variable access + // - Bounded loops + // - BPF to BPF function calls + // - Scalar precision tracking + // - Larger complexity limits + // - Dead code elimination + // - And potentially other features + // - Loading BPF Type Format (BTF) data + // - Retrieve xlated and JITed code of BPF programs + // - Use bpf_spin_lock() helper + // + // CAP_PERFMON relaxes the verifier checks further: + // - BPF progs can use of pointer-to-integer conversions + // - speculation attack hardening measures are bypassed + // - bpf_probe_read to read arbitrary kernel memory is allowed + // - bpf_trace_printk to print kernel memory is allowed + // + // CAP_SYS_ADMIN is required to use bpf_probe_write_user. + // + // CAP_SYS_ADMIN is required to iterate system wide loaded + // programs, maps, links, BTFs and convert their IDs to file descriptors. + // + // CAP_PERFMON and CAP_BPF are required to load tracing programs. + // CAP_NET_ADMIN and CAP_BPF are required to load networking programs. + CAP_BPF = Cap(39) + + // Allow checkpoint/restore related operations. 
+ // Introduced in kernel 5.9 + CAP_CHECKPOINT_RESTORE = Cap(40) +) + +func (c Cap) String() string { + switch c { + case CAP_CHOWN: + return "chown" + case CAP_DAC_OVERRIDE: + return "dac_override" + case CAP_DAC_READ_SEARCH: + return "dac_read_search" + case CAP_FOWNER: + return "fowner" + case CAP_FSETID: + return "fsetid" + case CAP_KILL: + return "kill" + case CAP_SETGID: + return "setgid" + case CAP_SETUID: + return "setuid" + case CAP_SETPCAP: + return "setpcap" + case CAP_LINUX_IMMUTABLE: + return "linux_immutable" + case CAP_NET_BIND_SERVICE: + return "net_bind_service" + case CAP_NET_BROADCAST: + return "net_broadcast" + case CAP_NET_ADMIN: + return "net_admin" + case CAP_NET_RAW: + return "net_raw" + case CAP_IPC_LOCK: + return "ipc_lock" + case CAP_IPC_OWNER: + return "ipc_owner" + case CAP_SYS_MODULE: + return "sys_module" + case CAP_SYS_RAWIO: + return "sys_rawio" + case CAP_SYS_CHROOT: + return "sys_chroot" + case CAP_SYS_PTRACE: + return "sys_ptrace" + case CAP_SYS_PACCT: + return "sys_pacct" + case CAP_SYS_ADMIN: + return "sys_admin" + case CAP_SYS_BOOT: + return "sys_boot" + case CAP_SYS_NICE: + return "sys_nice" + case CAP_SYS_RESOURCE: + return "sys_resource" + case CAP_SYS_TIME: + return "sys_time" + case CAP_SYS_TTY_CONFIG: + return "sys_tty_config" + case CAP_MKNOD: + return "mknod" + case CAP_LEASE: + return "lease" + case CAP_AUDIT_WRITE: + return "audit_write" + case CAP_AUDIT_CONTROL: + return "audit_control" + case CAP_SETFCAP: + return "setfcap" + case CAP_MAC_OVERRIDE: + return "mac_override" + case CAP_MAC_ADMIN: + return "mac_admin" + case CAP_SYSLOG: + return "syslog" + case CAP_WAKE_ALARM: + return "wake_alarm" + case CAP_BLOCK_SUSPEND: + return "block_suspend" + case CAP_AUDIT_READ: + return "audit_read" + case CAP_PERFMON: + return "perfmon" + case CAP_BPF: + return "bpf" + case CAP_CHECKPOINT_RESTORE: + return "checkpoint_restore" + } + return "unknown" +} + +func (c Cap) OCIString() string { + switch c { + case CAP_CHOWN: + 
return "CAP_CHOWN" + case CAP_DAC_OVERRIDE: + return "CAP_DAC_OVERRIDE" + case CAP_DAC_READ_SEARCH: + return "CAP_DAC_READ_SEARCH" + case CAP_FOWNER: + return "CAP_FOWNER" + case CAP_FSETID: + return "CAP_FSETID" + case CAP_KILL: + return "CAP_KILL" + case CAP_SETGID: + return "CAP_SETGID" + case CAP_SETUID: + return "CAP_SETUID" + case CAP_SETPCAP: + return "CAP_SETPCAP" + case CAP_LINUX_IMMUTABLE: + return "CAP_LINUX_IMMUTABLE" + case CAP_NET_BIND_SERVICE: + return "CAP_NET_BIND_SERVICE" + case CAP_NET_BROADCAST: + return "CAP_NET_BROADCAST" + case CAP_NET_ADMIN: + return "CAP_NET_ADMIN" + case CAP_NET_RAW: + return "CAP_NET_RAW" + case CAP_IPC_LOCK: + return "CAP_IPC_LOCK" + case CAP_IPC_OWNER: + return "CAP_IPC_OWNER" + case CAP_SYS_MODULE: + return "CAP_SYS_MODULE" + case CAP_SYS_RAWIO: + return "CAP_SYS_RAWIO" + case CAP_SYS_CHROOT: + return "CAP_SYS_CHROOT" + case CAP_SYS_PTRACE: + return "CAP_SYS_PTRACE" + case CAP_SYS_PACCT: + return "CAP_SYS_PACCT" + case CAP_SYS_ADMIN: + return "CAP_SYS_ADMIN" + case CAP_SYS_BOOT: + return "CAP_SYS_BOOT" + case CAP_SYS_NICE: + return "CAP_SYS_NICE" + case CAP_SYS_RESOURCE: + return "CAP_SYS_RESOURCE" + case CAP_SYS_TIME: + return "CAP_SYS_TIME" + case CAP_SYS_TTY_CONFIG: + return "CAP_SYS_TTY_CONFIG" + case CAP_MKNOD: + return "CAP_MKNOD" + case CAP_LEASE: + return "CAP_LEASE" + case CAP_AUDIT_WRITE: + return "CAP_AUDIT_WRITE" + case CAP_AUDIT_CONTROL: + return "CAP_AUDIT_CONTROL" + case CAP_SETFCAP: + return "CAP_SETFCAP" + case CAP_MAC_OVERRIDE: + return "CAP_MAC_OVERRIDE" + case CAP_MAC_ADMIN: + return "CAP_MAC_ADMIN" + case CAP_SYSLOG: + return "CAP_SYSLOG" + case CAP_WAKE_ALARM: + return "CAP_WAKE_ALARM" + case CAP_BLOCK_SUSPEND: + return "CAP_BLOCK_SUSPEND" + case CAP_AUDIT_READ: + return "CAP_AUDIT_READ" + case CAP_PERFMON: + return "CAP_PERFMON" + case CAP_BPF: + return "CAP_BPF" + case CAP_CHECKPOINT_RESTORE: + return "CAP_CHECKPOINT_RESTORE" + } + return "unknown" +} + +// List returns list of all supported 
capabilities +func List() []Cap { + return []Cap{ + CAP_CHOWN, + CAP_DAC_OVERRIDE, + CAP_DAC_READ_SEARCH, + CAP_FOWNER, + CAP_FSETID, + CAP_KILL, + CAP_SETGID, + CAP_SETUID, + CAP_SETPCAP, + CAP_LINUX_IMMUTABLE, + CAP_NET_BIND_SERVICE, + CAP_NET_BROADCAST, + CAP_NET_ADMIN, + CAP_NET_RAW, + CAP_IPC_LOCK, + CAP_IPC_OWNER, + CAP_SYS_MODULE, + CAP_SYS_RAWIO, + CAP_SYS_CHROOT, + CAP_SYS_PTRACE, + CAP_SYS_PACCT, + CAP_SYS_ADMIN, + CAP_SYS_BOOT, + CAP_SYS_NICE, + CAP_SYS_RESOURCE, + CAP_SYS_TIME, + CAP_SYS_TTY_CONFIG, + CAP_MKNOD, + CAP_LEASE, + CAP_AUDIT_WRITE, + CAP_AUDIT_CONTROL, + CAP_SETFCAP, + CAP_MAC_OVERRIDE, + CAP_MAC_ADMIN, + CAP_SYSLOG, + CAP_WAKE_ALARM, + CAP_BLOCK_SUSPEND, + CAP_AUDIT_READ, + CAP_PERFMON, + CAP_BPF, + CAP_CHECKPOINT_RESTORE, + } +} + +type Capabilities interface { + // Get check whether a capability present in the given + // capabilities set. The 'which' value should be one of EFFECTIVE, + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. + Get(which CapType, what Cap) bool + + // Empty check whether all capability bits of the given capabilities + // set are zero. The 'which' value should be one of EFFECTIVE, + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. + Empty(which CapType) bool + + // Full check whether all capability bits of the given capabilities + // set are one. The 'which' value should be one of EFFECTIVE, + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. + Full(which CapType) bool + + // Set sets capabilities of the given capabilities sets. The + // 'which' value should be one or combination (OR'ed) of EFFECTIVE, + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. + Set(which CapType, caps ...Cap) + + // Unset unsets capabilities of the given capabilities sets. The + // 'which' value should be one or combination (OR'ed) of EFFECTIVE, + // PERMITTED, INHERITABLE, BOUNDING or AMBIENT. + Unset(which CapType, caps ...Cap) + + // Fill sets all bits of the given capabilities kind to one. 
The + // 'kind' value should be one or combination (OR'ed) of CAPS, + // BOUNDS or AMBS. + Fill(kind CapType) + + // Clear sets all bits of the given capabilities kind to zero. The + // 'kind' value should be one or combination (OR'ed) of CAPS, + // BOUNDS or AMBS. + Clear(kind CapType) + + // StringCap returns current capabilities state of the given capabilities + // set as string. The 'which' value should be one of EFFECTIVE, + // PERMITTED, INHERITABLE BOUNDING or AMBIENT + StringCap(which CapType, format CapFormat) string + + // String return current capabilities state as string. + String(format CapFormat) string + + // Load load actual capabilities value. This will overwrite all + // outstanding changes. + Load() error + + // Apply apply the capabilities settings, so all changes will take + // effect. + Apply(kind CapType) error + + // Collect effective capabilities. + GetEffCaps() [2]uint32 + + // Set effective capabilities. + SetEffCaps(caps [2]uint32) +} + +// NewPid initializes a new Capabilities object for given pid when +// it is nonzero, or for the current process if pid is 0. +// +// Deprecated: Replace with NewPid2. For example, replace: +// +// c, err := NewPid(0) +// if err != nil { +// return err +// } +// +// with: +// +// c, err := NewPid2(0) +// if err != nil { +// return err +// } +// err = c.Load() +// if err != nil { +// return err +// } +func NewPid(pid int) (Capabilities, error) { + c, err := newPid(pid) + if err != nil { + return c, err + } + err = c.Load() + return c, err +} + +// NewPid2 initializes a new Capabilities object for given pid when +// it is nonzero, or for the current process if pid is 0. This +// does not load the process's current capabilities; to do that you +// must call Load explicitly. +func NewPid2(pid int) (Capabilities, error) { + return newPid(pid) +} + +// NewFile initializes a new Capabilities object for given file path. +// +// Deprecated: Replace with NewFile2. 
For example, replace: +// +// c, err := NewFile(path) +// if err != nil { +// return err +// } +// +// with: +// +// c, err := NewFile2(path) +// if err != nil { +// return err +// } +// err = c.Load() +// if err != nil { +// return err +// } +func NewFile(path string) (Capabilities, error) { + c, err := newFile(path) + if err != nil { + return c, err + } + err = c.Load() + return c, err +} + +// NewFile2 creates a new initialized Capabilities object for given +// file path. This does not load the process's current capabilities; +// to do that you must call Load explicitly. +func NewFile2(path string) (Capabilities, error) { + return newFile(path) +} diff --git a/sysbox-libs/capability/capability_linux.go b/sysbox-libs/capability/capability_linux.go new file mode 100644 index 00000000..7c423da7 --- /dev/null +++ b/sysbox-libs/capability/capability_linux.go @@ -0,0 +1,638 @@ +// +// Copyright: (C) 2020 Nestybox Inc. All rights reserved. +// + +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package capability + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "strings" + "sync" + "syscall" +) + +var errUnknownVers = errors.New("unknown capability version") + +const ( + linuxCapVer1 = 0x19980330 + linuxCapVer2 = 0x20071026 + linuxCapVer3 = 0x20080522 +) + +var ( + capVers uint32 + + capLastCap Cap + + // Highest valid capability of the running kernel. 
+ CAP_LAST_CAP = Cap(63) + + capUpperMask = ^uint32(0) + + pkgInitialized = false + initMutex sync.RWMutex +) + +func initialize() { + var hdr capHeader + capget(&hdr, nil) + capVers = hdr.version + + if initLastCap() == nil { + CAP_LAST_CAP = capLastCap + if capLastCap > 31 { + capUpperMask = (uint32(1) << (uint(capLastCap) - 31)) - 1 + } else { + capUpperMask = 0 + } + } + + initMutex.Lock() + pkgInitialized = true + initMutex.Unlock() +} + +func initLastCap() error { + if capLastCap != 0 { + return nil + } + + f, err := os.Open("/proc/sys/kernel/cap_last_cap") + if err != nil { + return err + } + defer f.Close() + + var b []byte = make([]byte, 11) + _, err = f.Read(b) + if err != nil { + return err + } + + fmt.Sscanf(string(b), "%d", &capLastCap) + + return nil +} + +func mkStringCap(c Capabilities, which CapType, format CapFormat) (ret string) { + for i, first := Cap(0), true; i <= CAP_LAST_CAP; i++ { + if !c.Get(which, i) { + continue + } + if first { + first = false + } else { + ret += ", " + } + if format == OCI_STRING { + ret += i.OCIString() + } else { + ret += i.String() + } + } + return +} + +func mkString(c Capabilities, max CapType, format CapFormat) (ret string) { + ret = "{" + for i := CapType(1); i <= max; i <<= 1 { + ret += " " + i.String() + "=\"" + if c.Empty(i) { + ret += "empty" + } else if c.Full(i) { + ret += "full" + } else { + ret += c.StringCap(i, format) + } + ret += "\"" + } + ret += " }" + return +} + +func initializationCompleted() bool { + initMutex.RLock() + res := pkgInitialized + initMutex.RUnlock() + + return res +} + +func newPid(pid int) (c Capabilities, err error) { + + if !initializationCompleted() { + initialize() + } + + switch capVers { + case linuxCapVer2, linuxCapVer3: + p := new(capsV3) + p.hdr.version = capVers + p.hdr.pid = int32(pid) + c = p + default: + err = errUnknownVers + return + } + return +} + +type capsV3 struct { + hdr capHeader + data [2]capData + bounds [2]uint32 + ambient [2]uint32 +} + +func (c *capsV3) 
Get(which CapType, what Cap) bool { + var i uint + if what > 31 { + i = uint(what) >> 5 + what %= 32 + } + + switch which { + case EFFECTIVE: + return (1< 31 { + i = uint(what) >> 5 + what %= 32 + } + + if which&EFFECTIVE != 0 { + c.data[i].effective |= 1 << uint(what) + } + if which&PERMITTED != 0 { + c.data[i].permitted |= 1 << uint(what) + } + if which&INHERITABLE != 0 { + c.data[i].inheritable |= 1 << uint(what) + } + if which&BOUNDING != 0 { + c.bounds[i] |= 1 << uint(what) + } + if which&AMBIENT != 0 { + c.ambient[i] |= 1 << uint(what) + } + } +} + +func (c *capsV3) Unset(which CapType, caps ...Cap) { + for _, what := range caps { + var i uint + if what > 31 { + i = uint(what) >> 5 + what %= 32 + } + + if which&EFFECTIVE != 0 { + c.data[i].effective &= ^(1 << uint(what)) + } + if which&PERMITTED != 0 { + c.data[i].permitted &= ^(1 << uint(what)) + } + if which&INHERITABLE != 0 { + c.data[i].inheritable &= ^(1 << uint(what)) + } + if which&BOUNDING != 0 { + c.bounds[i] &= ^(1 << uint(what)) + } + if which&AMBIENT != 0 { + c.ambient[i] &= ^(1 << uint(what)) + } + } +} + +func (c *capsV3) Fill(kind CapType) { + if kind&CAPS == CAPS { + c.data[0].effective = 0xffffffff + c.data[0].permitted = 0xffffffff + c.data[0].inheritable = 0 + c.data[1].effective = 0xffffffff + c.data[1].permitted = 0xffffffff + c.data[1].inheritable = 0 + } + + if kind&BOUNDS == BOUNDS { + c.bounds[0] = 0xffffffff + c.bounds[1] = 0xffffffff + } + if kind&AMBS == AMBS { + c.ambient[0] = 0xffffffff + c.ambient[1] = 0xffffffff + } +} + +func (c *capsV3) ClearOriginal(kind CapType) { + if kind&CAPS == CAPS { + c.data[0].effective = 0 + c.data[0].permitted = 0 + c.data[0].inheritable = 0 + c.data[1].effective = 0 + c.data[1].permitted = 0 + c.data[1].inheritable = 0 + } + + if kind&BOUNDS == BOUNDS { + c.bounds[0] = 0 + c.bounds[1] = 0 + } + if kind&AMBS == AMBS { + c.ambient[0] = 0 + c.ambient[1] = 0 + } +} + +// Sysbox implementation of the original Clear() method (see above). 
In this +// implementation we are handling every CAPS category separately to allow any +// capability-type (kind) to be individually updated. +func (c *capsV3) Clear(kind CapType) { + if kind&EFFECTIVE == EFFECTIVE { + c.data[0].effective = 0 + c.data[1].effective = 0 + } + if kind&PERMITTED == PERMITTED { + c.data[0].permitted = 0 + c.data[1].permitted = 0 + } + if kind&INHERITABLE == INHERITABLE { + c.data[0].inheritable = 0 + c.data[1].inheritable = 0 + } + + if kind&BOUNDS == BOUNDS { + c.bounds[0] = 0 + c.bounds[1] = 0 + } + if kind&AMBS == AMBS { + c.ambient[0] = 0 + c.ambient[1] = 0 + } +} + +func (c *capsV3) StringCap(which CapType, format CapFormat) (ret string) { + return mkStringCap(c, which, format) +} + +func (c *capsV3) String(format CapFormat) (ret string) { + return mkString(c, BOUNDING, format) +} + +func (c *capsV3) LoadOriginal() (err error) { + err = capget(&c.hdr, &c.data[0]) + if err != nil { + return + } + + var status_path string + + if c.hdr.pid == 0 { + status_path = fmt.Sprintf("/proc/self/status") + } else { + status_path = fmt.Sprintf("/proc/%d/status", c.hdr.pid) + } + + f, err := os.Open(status_path) + if err != nil { + return + } + b := bufio.NewReader(f) + for { + line, e := b.ReadString('\n') + if e != nil { + if e != io.EOF { + err = e + } + break + } + if strings.HasPrefix(line, "CapB") { + fmt.Sscanf(line[4:], "nd: %08x%08x", &c.bounds[1], &c.bounds[0]) + continue + } + if strings.HasPrefix(line, "CapA") { + fmt.Sscanf(line[4:], "mb: %08x%08x", &c.ambient[1], &c.ambient[0]) + continue + } + } + f.Close() + + return +} + +// Sysbox implementation of the original Load() method (see above). For +// efficiency purposes, in this case we are not parsing 'status' file to +// extract 'ambient' and 'bounding' capabilities. 
+func (c *capsV3) Load() (err error) { + err = capget(&c.hdr, &c.data[0]) + if err != nil { + return + } + + return +} + +func (c *capsV3) Apply(kind CapType) (err error) { + if kind&BOUNDS == BOUNDS { + var data [2]capData + err = capget(&c.hdr, &data[0]) + if err != nil { + return + } + if (1< 31 { + if c.data.version == 1 { + return false + } + i = uint(what) >> 5 + what %= 32 + } + + switch which { + case EFFECTIVE: + return (1< 31 { + if c.data.version == 1 { + continue + } + i = uint(what) >> 5 + what %= 32 + } + + if which&EFFECTIVE != 0 { + c.data.effective[i] |= 1 << uint(what) + } + if which&PERMITTED != 0 { + c.data.data[i].permitted |= 1 << uint(what) + } + if which&INHERITABLE != 0 { + c.data.data[i].inheritable |= 1 << uint(what) + } + } +} + +func (c *capsFile) Unset(which CapType, caps ...Cap) { + for _, what := range caps { + var i uint + if what > 31 { + if c.data.version == 1 { + continue + } + i = uint(what) >> 5 + what %= 32 + } + + if which&EFFECTIVE != 0 { + c.data.effective[i] &= ^(1 << uint(what)) + } + if which&PERMITTED != 0 { + c.data.data[i].permitted &= ^(1 << uint(what)) + } + if which&INHERITABLE != 0 { + c.data.data[i].inheritable &= ^(1 << uint(what)) + } + } +} + +func (c *capsFile) Fill(kind CapType) { + if kind&CAPS == CAPS { + c.data.effective[0] = 0xffffffff + c.data.data[0].permitted = 0xffffffff + c.data.data[0].inheritable = 0 + if c.data.version == 2 { + c.data.effective[1] = 0xffffffff + c.data.data[1].permitted = 0xffffffff + c.data.data[1].inheritable = 0 + } + } +} + +func (c *capsFile) Clear(kind CapType) { + if kind&CAPS == CAPS { + c.data.effective[0] = 0 + c.data.data[0].permitted = 0 + c.data.data[0].inheritable = 0 + if c.data.version == 2 { + c.data.effective[1] = 0 + c.data.data[1].permitted = 0 + c.data.data[1].inheritable = 0 + } + } +} + +func (c *capsFile) StringCap(which CapType, format CapFormat) (ret string) { + return mkStringCap(c, which, format) +} + +func (c *capsFile) String(format CapFormat) (ret 
string) { + return mkString(c, INHERITABLE, format) +} + +func (c *capsFile) Load() (err error) { + return getVfsCap(c.path, &c.data) +} + +func (c *capsFile) Apply(kind CapType) (err error) { + if kind&CAPS == CAPS { + return setVfsCap(c.path, &c.data) + } + return +} diff --git a/sysbox-libs/capability/capability_noop.go b/sysbox-libs/capability/capability_noop.go new file mode 100644 index 00000000..25d9739f --- /dev/null +++ b/sysbox-libs/capability/capability_noop.go @@ -0,0 +1,23 @@ +// +// Copyright: (C) 2020 Nestybox Inc. All rights reserved. +// + +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// +build !linux + +package capability + +import "errors" + +func newPid(pid int) (Capabilities, error) { + return nil, errors.New("not supported") +} + +func newFile(path string) (Capabilities, error) { + return nil, errors.New("not supported") +} diff --git a/sysbox-libs/capability/capability_test.go b/sysbox-libs/capability/capability_test.go new file mode 100644 index 00000000..998a7fe5 --- /dev/null +++ b/sysbox-libs/capability/capability_test.go @@ -0,0 +1,86 @@ +// +// Copyright: (C) 2020 Nestybox Inc. All rights reserved. +// + +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +package capability + +import "testing" + +func TestState(t *testing.T) { + testEmpty := func(name string, c Capabilities, whats CapType) { + for i := CapType(1); i <= BOUNDING; i <<= 1 { + if (i&whats) != 0 && !c.Empty(i) { + t.Errorf(name+": capabilities set %q wasn't empty", i) + } + } + } + testFull := func(name string, c Capabilities, whats CapType) { + for i := CapType(1); i <= BOUNDING; i <<= 1 { + if (i&whats) != 0 && !c.Full(i) { + t.Errorf(name+": capabilities set %q wasn't full", i) + } + } + } + testPartial := func(name string, c Capabilities, whats CapType) { + for i := CapType(1); i <= BOUNDING; i <<= 1 { + if (i&whats) != 0 && (c.Empty(i) || c.Full(i)) { + t.Errorf(name+": capabilities set %q wasn't partial", i) + } + } + } + testGet := func(name string, c Capabilities, whats CapType, max Cap) { + for i := CapType(1); i <= BOUNDING; i <<= 1 { + if (i & whats) == 0 { + continue + } + for j := Cap(0); j <= max; j++ { + if !c.Get(i, j) { + t.Errorf(name+": capability %q wasn't found on %q", j, i) + } + } + } + } + + capf := new(capsFile) + capf.data.version = 2 + for _, tc := range []struct { + name string + c Capabilities + sets CapType + max Cap + }{ + {"v3", new(capsV3), EFFECTIVE | PERMITTED | BOUNDING, CAP_LAST_CAP}, + {"file_v1", new(capsFile), EFFECTIVE | PERMITTED, CAP_AUDIT_CONTROL}, + {"file_v2", capf, EFFECTIVE | PERMITTED, CAP_LAST_CAP}, + } { + testEmpty(tc.name, tc.c, tc.sets) + tc.c.Fill(CAPS | BOUNDS) + testFull(tc.name, tc.c, tc.sets) + testGet(tc.name, tc.c, tc.sets, tc.max) + tc.c.Clear(CAPS | BOUNDS) + testEmpty(tc.name, tc.c, tc.sets) + for i := CapType(1); i <= BOUNDING; i <<= 1 { + for j := Cap(0); j <= CAP_LAST_CAP; j++ { + tc.c.Set(i, j) + } + } + testFull(tc.name, tc.c, tc.sets) + testGet(tc.name, tc.c, tc.sets, tc.max) + for i := CapType(1); i <= BOUNDING; i <<= 1 { + for j := Cap(0); j <= CAP_LAST_CAP; j++ { + tc.c.Unset(i, j) + } + } + testEmpty(tc.name, tc.c, tc.sets) + tc.c.Set(PERMITTED, CAP_CHOWN) + 
testPartial(tc.name, tc.c, PERMITTED) + tc.c.Clear(CAPS | BOUNDS) + testEmpty(tc.name, tc.c, tc.sets) + } +} diff --git a/sysbox-libs/capability/go.mod b/sysbox-libs/capability/go.mod new file mode 100644 index 00000000..8a00fe7f --- /dev/null +++ b/sysbox-libs/capability/go.mod @@ -0,0 +1,5 @@ +module github.com/nestybox/sysbox-libs/capability + +go 1.21 + +toolchain go1.21.0 diff --git a/sysbox-libs/capability/syscall_linux.go b/sysbox-libs/capability/syscall_linux.go new file mode 100644 index 00000000..16e61493 --- /dev/null +++ b/sysbox-libs/capability/syscall_linux.go @@ -0,0 +1,158 @@ +// +// Copyright: (C) 2020 Nestybox Inc. All rights reserved. +// + +// Copyright (c) 2013, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +package capability + +import ( + "syscall" + "unsafe" +) + +type capHeader struct { + version uint32 + pid int32 +} + +type capData struct { + effective uint32 + permitted uint32 + inheritable uint32 +} + +func capget(hdr *capHeader, data *capData) (err error) { + _, _, e1 := syscall.Syscall(syscall.SYS_CAPGET, uintptr(unsafe.Pointer(hdr)), uintptr(unsafe.Pointer(data)), 0) + if e1 != 0 { + err = e1 + } + return +} + +func capset(hdr *capHeader, data *capData) (err error) { + _, _, e1 := syscall.Syscall(syscall.SYS_CAPSET, uintptr(unsafe.Pointer(hdr)), uintptr(unsafe.Pointer(data)), 0) + if e1 != 0 { + err = e1 + } + return +} + +// not yet in syscall +const ( + pr_CAP_AMBIENT = 47 + pr_CAP_AMBIENT_IS_SET = uintptr(1) + pr_CAP_AMBIENT_RAISE = uintptr(2) + pr_CAP_AMBIENT_LOWER = uintptr(3) + pr_CAP_AMBIENT_CLEAR_ALL = uintptr(4) +) + +func prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) { + _, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0) + if e1 != 0 { + err = e1 + } + return +} + +const ( + vfsXattrName = "security.capability" + + vfsCapVerMask = 0xff000000 + vfsCapVer1 = 
0x01000000 + vfsCapVer2 = 0x02000000 + + vfsCapFlagMask = ^vfsCapVerMask + vfsCapFlageffective = 0x000001 + + vfscapDataSizeV1 = 4 * (1 + 2*1) + vfscapDataSizeV2 = 4 * (1 + 2*2) +) + +type vfscapData struct { + magic uint32 + data [2]struct { + permitted uint32 + inheritable uint32 + } + effective [2]uint32 + version int8 +} + +var ( + _vfsXattrName *byte +) + +func init() { + _vfsXattrName, _ = syscall.BytePtrFromString(vfsXattrName) +} + +func getVfsCap(path string, dest *vfscapData) (err error) { + var _p0 *byte + _p0, err = syscall.BytePtrFromString(path) + if err != nil { + return + } + r0, _, e1 := syscall.Syscall6(syscall.SYS_GETXATTR, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_vfsXattrName)), uintptr(unsafe.Pointer(dest)), vfscapDataSizeV2, 0, 0) + if e1 != 0 { + if e1 == syscall.ENODATA { + dest.version = 2 + return + } + err = e1 + } + switch dest.magic & vfsCapVerMask { + case vfsCapVer1: + dest.version = 1 + if r0 != vfscapDataSizeV1 { + return syscall.EINVAL + } + dest.data[1].permitted = 0 + dest.data[1].inheritable = 0 + case vfsCapVer2: + dest.version = 2 + if r0 != vfscapDataSizeV2 { + return syscall.EINVAL + } + default: + return syscall.EINVAL + } + if dest.magic&vfsCapFlageffective != 0 { + dest.effective[0] = dest.data[0].permitted | dest.data[0].inheritable + dest.effective[1] = dest.data[1].permitted | dest.data[1].inheritable + } else { + dest.effective[0] = 0 + dest.effective[1] = 0 + } + return +} + +func setVfsCap(path string, data *vfscapData) (err error) { + var _p0 *byte + _p0, err = syscall.BytePtrFromString(path) + if err != nil { + return + } + var size uintptr + if data.version == 1 { + data.magic = vfsCapVer1 + size = vfscapDataSizeV1 + } else if data.version == 2 { + data.magic = vfsCapVer2 + if data.effective[0] != 0 || data.effective[1] != 0 { + data.magic |= vfsCapFlageffective + } + size = vfscapDataSizeV2 + } else { + return syscall.EINVAL + } + _, _, e1 := syscall.Syscall6(syscall.SYS_SETXATTR, 
uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_vfsXattrName)), uintptr(unsafe.Pointer(data)), size, 0, 0) + if e1 != 0 { + err = e1 + } + return +} diff --git a/sysbox-libs/containerdUtils/containerdUtils.go b/sysbox-libs/containerdUtils/containerdUtils.go new file mode 100644 index 00000000..82561927 --- /dev/null +++ b/sysbox-libs/containerdUtils/containerdUtils.go @@ -0,0 +1,66 @@ +// +// Copyright: (C) 2024 Nestybox Inc. All rights reserved. +// +package containerdUtils + +import ( + "fmt" + "os" + + "github.com/BurntSushi/toml" +) + +// Location of containerd config file +// (see https://github.com/containerd/containerd/blob/main/docs/man/containerd-config.toml.5.md) +var ( + configPath = []string{ + "/etc/containerd/containerd.toml", + "/etc/containerd/config.toml", + "/usr/local/etc/containerd/config.toml", + } + + defaultDataRoot = "/var/lib/containerd" +) + +type containerdConfig struct { + Root string `toml:"Root"` +} + +// GetDataRoot returns the containerd data root directory, as read from +// the containerd config file. +func GetDataRoot() (string, error) { + for _, path := range configPath { + dataRoot, err := parseDataRoot(path) + if err != nil { + if os.IsNotExist(err) { + continue + } + return "", fmt.Errorf("failed to open file %s: %w", path, err) + } + return dataRoot, nil + } + return defaultDataRoot, nil +} + +func parseDataRoot(path string) (string, error) { + var config containerdConfig + + // open the config file; if it does not exist, move on to the next one. 
+ f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + + // parse the "root" + if _, err := toml.NewDecoder(f).Decode(&config); err != nil { + return "", fmt.Errorf("could not decode %s: %w", path, err) + } + + // if no "root" present, assume it's the default + if config.Root == "" { + return defaultDataRoot, nil + } + + return config.Root, nil +} diff --git a/sysbox-libs/containerdUtils/containerdUtils_test.go b/sysbox-libs/containerdUtils/containerdUtils_test.go new file mode 100644 index 00000000..43439747 --- /dev/null +++ b/sysbox-libs/containerdUtils/containerdUtils_test.go @@ -0,0 +1,106 @@ +package containerdUtils + +import ( + "io/ioutil" + "os" + "testing" +) + +func TestGetDataRoot(t *testing.T) { + tests := []struct { + name string + configPath string + configContent string + expectedRoot string + expectError bool + }{ + { + name: "Config with root entry", + configPath: "/etc/containerd/containerd.toml", + configContent: ` +version = 2 + +root = "/var/lib/desktop-containerd/daemon" +state = "/run/containerd" + +oom_score = 0 +imports = ["/etc/containerd/runtime_*.toml", "./debug.toml"] + +[grpc] + address = "/run/containerd/containerd.sock" + uid = 0 + gid = 0 + +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + sandbox_image = "k8s.gcr.io/pause:3.2" + [plugins."io.containerd.snapshotter.v1.overlayfs"] + root_path = "/var/lib/containerd/snapshotter" +`, + expectedRoot: "/var/lib/desktop-containerd/daemon", + expectError: false, + }, + { + name: "Config without root entry", + configPath: "/etc/containerd/config.toml", + configContent: ` +version = 2 + +state = "/run/containerd" +oom_score = 0 +imports = ["/etc/containerd/runtime_*.toml", "./debug.toml"] + +[plugins] + [plugins."io.containerd.grpc.v1.cri"] + sandbox_image = "k8s.gcr.io/pause:3.2" + [plugins."io.containerd.snapshotter.v1.overlayfs"] + root_path = "/var/lib/containerd/snapshotter" +`, + expectedRoot: "/var/lib/containerd", // Default path + expectError: 
false, + }, + { + name: "Nonexistent config file", + configPath: "/path/to/nowhere", + expectedRoot: "", + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + var configPath string + var err error + + // Create a temporary config file if content is provided + if tt.configContent != "" { + tmpFile, err := ioutil.TempFile("", "config-*.toml") + if err != nil { + t.Fatalf("Failed to create temp file: %v", err) + } + defer os.Remove(tmpFile.Name()) // Clean up after test + + // Write the content + configPath = tmpFile.Name() + if _, err = tmpFile.WriteString(tt.configContent); err != nil { + t.Fatalf("Failed to write to temp file: %v", err) + } + tmpFile.Close() + } else { + configPath = "/nonexistent/config.toml" + } + + root, err := parseDataRoot(configPath) + + // Check if an error was expected or not + if tt.expectError && err == nil { + t.Fatalf("Expected error: %v, got: %v", tt.expectError, err) + } + + // Check the expected root path if no error was expected + if !tt.expectError && root != tt.expectedRoot { + t.Fatalf("Expected root: %s, got: %s", tt.expectedRoot, root) + } + }) + } +} diff --git a/sysbox-libs/containerdUtils/go.mod b/sysbox-libs/containerdUtils/go.mod new file mode 100644 index 00000000..0051d4d3 --- /dev/null +++ b/sysbox-libs/containerdUtils/go.mod @@ -0,0 +1,5 @@ +module github.com/nestybox/sysbox-libs/containerdUtils + +go 1.21.3 + +require github.com/BurntSushi/toml v1.4.0 diff --git a/sysbox-libs/containerdUtils/go.sum b/sysbox-libs/containerdUtils/go.sum new file mode 100644 index 00000000..8bc10f66 --- /dev/null +++ b/sysbox-libs/containerdUtils/go.sum @@ -0,0 +1,2 @@ +github.com/BurntSushi/toml v1.4.0 h1:kuoIxZQy2WRRk1pttg9asf+WVv6tWQuBNVmK8+nqPr0= +github.com/BurntSushi/toml v1.4.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= diff --git a/sysbox-libs/dockerUtils/dockerUtils.go b/sysbox-libs/dockerUtils/dockerUtils.go new file mode 100644 index 00000000..f18fad0b --- /dev/null 
+++ b/sysbox-libs/dockerUtils/dockerUtils.go @@ -0,0 +1,250 @@ +// +// Copyright: (C) 2019 - 2020 Nestybox Inc. All rights reserved. +// + +package dockerUtils + +import ( + "context" + "fmt" + "os" + "path/filepath" + "strings" + "time" + + "github.com/nestybox/sysbox-libs/utils" + + "github.com/docker/docker/api/types/container" + "github.com/docker/docker/api/types/filters" + "github.com/docker/docker/api/types/volume" + "github.com/docker/docker/client" +) + +// Set to true during testing only +var testMode = false + +type ErrCode int + +const ( + DockerConnErr ErrCode = iota + DockerDiscErr + DockerInfoErr + DockerContInfoErr + DockerOtherErr +) + +type DockerErr struct { + Code ErrCode + msg string +} + +func newDockerErr(code ErrCode, msg string) *DockerErr { + return &DockerErr{ + Code: code, + msg: msg, + } +} + +func (e *DockerErr) Error() string { + return e.msg +} + +type ContainerInfo struct { + Rootfs string + AutoRemove bool +} + +type Docker struct { + cli *client.Client + dataRoot string +} + +// DockerConnect establishes a session with the Docker daemon. +func DockerConnect() (*Docker, error) { + + // Profiling shows Docker takes on average ~10ms to respond to a single + // client; with up to 1000 concurrent clients, it takes ~400ms to respond on + // average (see the TestDockerConnectDelay() test in dockerUtils_test.go). + // Thus we set the timeout to 1 sec; if it doesn't respond in this time, it + // likely means Docker is not present. 
+ timeout := time.Duration(1 * time.Second) + + cli, err := client.NewClientWithOpts( + client.FromEnv, + client.WithTimeout(timeout), + client.WithAPIVersionNegotiation(), + ) + + if err != nil { + return nil, newDockerErr(DockerConnErr, fmt.Sprintf("failed to connect to Docker API: %v", err)) + } + + // Get the docker data root dir (usually /var/lib/docker) + info, err := cli.Info(context.Background()) + if err != nil { + err2 := cli.Close() + if err2 != nil { + return nil, newDockerErr(DockerInfoErr, fmt.Sprintf("failed to retrieve Docker info (%v) and disconnect from Docker API (%v)", err, err2)) + } + return nil, newDockerErr(DockerInfoErr, fmt.Sprintf("failed to retrieve Docker info: %v", err)) + } + + return &Docker{ + cli: cli, + dataRoot: info.DockerRootDir, + }, nil +} + +func (d *Docker) Disconnect() error { + err := d.cli.Close() + if err != nil { + return newDockerErr(DockerDiscErr, fmt.Sprintf("failed to disconnect from Docker API: %v", err)) + } + return nil +} + +// GetDataRoot returns the Docker daemon's data-root dir (usually "/var/lib/docker/"). +func (d *Docker) GetDataRoot() string { + return d.dataRoot +} + +// ContainerGetImageID returns the image ID of the given container; may be +// called during container creation. 
+func (d *Docker) ContainerGetImageID(containerID string) (string, error) { + + filter := filters.NewArgs() + filter.Add("id", containerID) + + containers, err := d.cli.ContainerList(context.Background(), container.ListOptions{ + All: true, // required since container may not yet be running + Filters: filter, + }) + + if err != nil { + return "", newDockerErr(DockerContInfoErr, err.Error()) + } + + if len(containers) == 0 { + return "", newDockerErr(DockerContInfoErr, fmt.Sprintf("container %s found", containerID)) + } else if len(containers) > 1 { + return "", newDockerErr(DockerContInfoErr, fmt.Sprintf("more than one container matches ID %s: %v", containerID, containers)) + } + + return containers[0].ImageID, nil +} + +// ContainerGetInfo returns info for the given container. Must be called +// after the container is created. +func (d *Docker) ContainerGetInfo(containerID string) (*ContainerInfo, error) { + + info, err := d.cli.ContainerInspect(context.Background(), containerID) + if err != nil { + return nil, err + } + + rootfs := "" + if info.GraphDriver.Name == "overlay2" { + rootfs = info.GraphDriver.Data["MergedDir"] + } + + return &ContainerInfo{ + Rootfs: rootfs, + AutoRemove: info.HostConfig.AutoRemove, + }, nil +} + +// ListVolumesAt lists Docker volumes with the given host mount point (which implies +// volumes using the "local" driver only). 
+func (d *Docker) ListVolumesAt(mountPoint string) ([]volume.Volume, error) { + + filterArgs := filters.NewArgs() + filterArgs.Add("driver", "local") + + // List volumes using the filter + volumeList, err := d.cli.VolumeList(context.Background(), volume.ListOptions{Filters: filterArgs}) + if err != nil { + return nil, err + } + + // Filter volumes by mount point + var filteredVolumes []volume.Volume + for _, vol := range volumeList.Volumes { + if vol.Mountpoint == mountPoint { + filteredVolumes = append(filteredVolumes, *vol) + break + } + } + + return filteredVolumes, nil +} + +// ContainerIsDocker returns true if the given container ID corresponds to a +// Docker container. It does this by first trying to query Docker for the +// container. If this doesn't work, it uses a heuristic based on the container's +// rootfs. +func ContainerIsDocker(id, rootfs string) (bool, error) { + + docker, err := DockerConnect() + if err == nil { + defer docker.Disconnect() + _, err := docker.ContainerGetImageID(id) + return (err == nil), nil + } + + // The connection to Docker can fail when containers are restarted + // automatically after reboot (i.e., containers originally launched with + // "--restart"); Docker won't respond until those are up. See Sysbox issue + // #184. In this case we determine if the container is a Docker container by + // examining the container's rootfs. + + return isDockerRootfs(rootfs) +} + +// isDockerRootfs determines if the given a container rootfs is for a Docker container. +func isDockerRootfs(rootfs string) (bool, error) { + + // Check if the container rootfs is under Docker's default data root + // (when in test-mode, we skip this so as to do the deeper check below) + if !testMode { + if strings.Contains(rootfs, "/var/lib/docker") { + return true, nil + } + } + + // Check the parent dirs of the rootfs (up to 5 levels) and look for the + // `image, network, swarm, and containers` directories that are part of the + // Docker data root. 
+ + dockerDirs := []string{"image", "network", "containers", "swarm"} + + searchLevels := 5 + maxFilesPerDir := 30 // the docker data root dir has typically 10->20 subdirs in it + path := rootfs + + for i := 0; i < searchLevels; i++ { + path = filepath.Dir(path) + + dir, err := os.Open(path) + if err != nil { + return false, newDockerErr(DockerOtherErr, fmt.Sprintf("failed to open %s: %s\n", path, err)) + } + + filenames, err := dir.Readdirnames(maxFilesPerDir) + if err != nil { + return false, newDockerErr(DockerOtherErr, fmt.Sprintf("failed to read directories under %s: %s\n", path, err)) + } + + isDocker := true + for _, dockerDir := range dockerDirs { + if !utils.StringSliceContains(filenames, dockerDir) { + isDocker = false + } + } + + if isDocker { + return true, nil + } + } + + return false, nil +} diff --git a/sysbox-libs/dockerUtils/dockerUtils_test.go b/sysbox-libs/dockerUtils/dockerUtils_test.go new file mode 100644 index 00000000..ba17152c --- /dev/null +++ b/sysbox-libs/dockerUtils/dockerUtils_test.go @@ -0,0 +1,216 @@ +// +// Copyright: (C) 2019 - 2020 Nestybox Inc. All rights reserved. 
+// + +package dockerUtils + +import ( + "bytes" + "context" + "fmt" + "os/exec" + "path/filepath" + "strings" + "sync" + "testing" + "time" + + "github.com/docker/docker/api/types/volume" + "github.com/stretchr/testify/assert" +) + +func TestGetContainer(t *testing.T) { + + testMode = true + defer func() { testMode = false }() + + docker, err := DockerConnect() + if err != nil { + t.Fatalf("DockerConnect() failed: %v", err) + } + defer docker.Disconnect() + + dataRoot := docker.GetDataRoot() + if dataRoot != "/var/lib/docker" { + t.Errorf("docker.GetDataRoot(): want /var/lib/docker; got %s", dataRoot) + } + + id, err := testStartContainer(false) + if err != nil { + t.Fatalf("Failed to start test container: %v", err) + } + + ci, err := docker.ContainerGetInfo(id) + if err != nil { + t.Errorf("ContainerGetInfo(%s) failed: %v", id, err) + } + + if ci.AutoRemove != false { + t.Errorf("Container autoRemove mismatch: want false, got true") + } + + isDocker, err := ContainerIsDocker(id, ci.Rootfs) + if err != nil { + t.Errorf("ContainerIsDocker(%s, %s) failed: %v", id, ci.Rootfs, err) + } + if !isDocker { + t.Errorf("ContainerIsDocker(%s, %s) returned false; expecting true", id, ci.Rootfs) + } + + isDockerRootfs, err := isDockerRootfs(ci.Rootfs) + if err != nil { + t.Errorf("isDockerRootfs(%s) failed: %v", ci.Rootfs, err) + } + if !isDockerRootfs { + t.Errorf("isDockerRootfs(%s) returned false; expecting true", ci.Rootfs) + } + + if err := testStopContainer(id, true); err != nil { + t.Errorf("Failed to stop test container: %v", err) + } +} + +func TestGetContainerAutoRemove(t *testing.T) { + + docker, err := DockerConnect() + if err != nil { + t.Fatalf("DockerConnect() failed: %v", err) + } + + id, err := testStartContainer(true) + if err != nil { + t.Fatalf("Failed to start test container: %v", err) + } + + ci, err := docker.ContainerGetInfo(id) + if err != nil { + t.Errorf("ContainerGetInfo(%s) failed: %v", id, err) + } + + if ci.AutoRemove != true { + 
t.Errorf("Container autoRemove mismatch: want true, got false") + } + + if err := testStopContainer(id, false); err != nil { + t.Errorf("Failed to stop test container: %v", err) + } +} + +func TestListVolumesAt(t *testing.T) { + + docker, err := DockerConnect() + if err != nil { + t.Fatalf("DockerConnect() failed: %v", err) + } + defer docker.Disconnect() + + // Prepare by creating a volume to test against + volName := "testvolume" + ctx := context.Background() + _, err = docker.cli.VolumeCreate(ctx, volume.CreateOptions{Name: volName, Driver: "local"}) + assert.NoError(t, err, "should be able to create a volume") + + // Clean up after test + defer func() { + err := docker.cli.VolumeRemove(ctx, volName, true) + assert.NoError(t, err, "should be able to remove the volume") + }() + + // Test the function + mountPoint := filepath.Join("/var/lib/docker/volumes/", volName, "_data") + volumes, err := docker.ListVolumesAt(mountPoint) + assert.NoError(t, err, "should not have an error listing volumes") + assert.True(t, len(volumes) > 0, "should find at least one volume") + found := false + for _, vol := range volumes { + if vol.Name == volName && vol.Mountpoint == mountPoint { + found = true + break + } + } + assert.True(t, found, "should find the test volume in the filtered list") +} + +func TestDockerConnectDelay(t *testing.T) { + var wg sync.WaitGroup + + numWorkers := 1000 + maxDelay := 500 * time.Millisecond + delayCh := make(chan time.Duration, numWorkers) + + for i := 0; i < numWorkers; i++ { + wg.Add(1) + go dockerConnectWorker(&wg, delayCh) + } + + wg.Wait() + + sum := 0 * time.Second + for i := 0; i < numWorkers; i++ { + sum += <-delayCh + } + avg := sum / time.Duration(numWorkers) + + if avg > time.Duration(maxDelay) { + t.Fatalf("DockerConnect() delay failed: want <= %v, got %v", maxDelay, avg) + } + + t.Logf("DockerConnect() delay for %d concurrent clients = %v (average)\n", numWorkers, avg) +} + +// test helpers + +func testStartContainer(autoRemove bool) 
(string, error) { + var cmd *exec.Cmd + var stdout, stderr bytes.Buffer + + if autoRemove { + cmd = exec.Command("docker", "run", "-d", "--rm", "alpine", "tail", "-f", "/dev/null") + } else { + cmd = exec.Command("docker", "run", "-d", "alpine", "tail", "-f", "/dev/null") + } + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + return "", fmt.Errorf("failed to start test container: %s %s\n", stdout.String(), stderr.String()) + } + + id := strings.TrimSuffix(stdout.String(), "\n") + return id, nil +} + +func testStopContainer(id string, remove bool) error { + var cmd *exec.Cmd + var stdout, stderr bytes.Buffer + + if remove { + cmd = exec.Command("docker", "rm", "-f", id) + } else { + cmd = exec.Command("docker", "stop", "-t0", id) + } + + cmd.Stdout = &stdout + cmd.Stderr = &stderr + + err := cmd.Run() + if err != nil { + return fmt.Errorf("failed to stop test container: %s %s\n", stdout.String(), stderr.String()) + } + + return nil +} + +func dockerConnectWorker(wg *sync.WaitGroup, delayCh chan time.Duration) { + start := time.Now() + _, err := DockerConnect() + delay := time.Since(start) + + if err != nil { + fmt.Printf("error connecting to docker (delay = %v): %v\n", delay, err) + } + + delayCh <- delay + wg.Done() +} diff --git a/sysbox-libs/dockerUtils/go.mod b/sysbox-libs/dockerUtils/go.mod new file mode 100644 index 00000000..851d2274 --- /dev/null +++ b/sysbox-libs/dockerUtils/go.mod @@ -0,0 +1,46 @@ +module github.com/nestybox/sysbox-libs/dockerUtils + +go 1.21 + +toolchain go1.21.3 + +require ( + github.com/docker/docker v26.0.0+incompatible + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 + github.com/stretchr/testify v1.9.0 +) + +require ( + github.com/Microsoft/go-winio v0.4.16 // indirect + github.com/containerd/log v0.1.0 // indirect + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/go-connections v0.4.0 // indirect + 
github.com/docker/go-units v0.4.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.1 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 // indirect + github.com/morikuni/aec v1.0.0 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.0.2 // indirect + github.com/opencontainers/runtime-spec v1.0.2 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/sirupsen/logrus v1.9.3 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect + go.opentelemetry.io/otel v1.26.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0 // indirect + go.opentelemetry.io/otel/metric v1.26.0 // indirect + go.opentelemetry.io/otel/sdk v1.26.0 // indirect + go.opentelemetry.io/otel/trace v1.26.0 // indirect + golang.org/x/net v0.23.0 // indirect + golang.org/x/sys v0.19.0 // indirect + golang.org/x/time v0.0.0-20201208040808-7e3f01d25324 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect + gotest.tools/v3 v3.0.3 // indirect +) + +replace github.com/nestybox/sysbox-libs/utils => ../utils diff --git a/sysbox-libs/dockerUtils/go.sum b/sysbox-libs/dockerUtils/go.sum new file mode 100644 index 00000000..8b8e91bb --- /dev/null +++ b/sysbox-libs/dockerUtils/go.sum @@ -0,0 +1,138 @@ +github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78 h1:w+iIsaOQNcT7OZ575w+acHgRric5iCyQh+xv+KJ4HB8= +github.com/Azure/go-ansiterm v0.0.0-20170929234023-d6e3b3328b78/go.mod h1:LmzpDX56iTiv29bbRTIsUNlaFfuhWRQBWjQdVyAevI8= +github.com/Microsoft/go-winio v0.4.16 h1:FtSW/jqD+l4ba5iPBj9CODVtgfYAD8w2wS923g/cFDk= +github.com/Microsoft/go-winio v0.4.16/go.mod h1:XB6nPKklQyQ7GC9LdcBEcBl8PF76WugXOPRXwdLnMv0= 
+github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/docker v26.0.0+incompatible h1:Ng2qi+gdKADUa/VM+6b6YaY2nlZhk/lVJiKR/2bMudU= +github.com/docker/docker v26.0.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= +github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= +github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ= +github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod 
h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1 h1:/c3QmbOGMGTOumP2iT/rCwB7b0QDGLKzqOmktBjT+Is= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1/go.mod h1:5SN9VR2LTsRFsrEC6FHgRbTWrTHu6tqPeKxEQv15giM= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk= +github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.0.2 h1:9yCKha/T5XdGtO0q9Q9a6T5NUCsTn/DrBg0D7ufOcFM= +github.com/opencontainers/image-spec v1.0.2/go.mod 
h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/pflag v1.0.3/go.mod h1:DYY7MBk1bdzusC3SYhjObp+wFpr4gzcvqqNjLnInEg4= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 h1:Xs2Ncz0gNihqu9iosIZ5SkBbWo5T8JhhLJFMQL1qmLI= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0/go.mod h1:vy+2G/6NvVMpwGX/NyLqcC41fxepnuKHk16E6IZUcJc= 
+go.opentelemetry.io/otel v1.26.0 h1:LQwgL5s/1W7YiiRwxf03QGnWLb2HW4pLiAhaA5cZXBs= +go.opentelemetry.io/otel v1.26.0/go.mod h1:UmLkJHUAidDval2EICqBMbnAd0/m2vmpf/dAM+fvFs4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.26.0 h1:1u/AyyOqAWzy+SkPxDpahCNZParHV8Vid1RnI2clyDE= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.26.0/go.mod h1:z46paqbJ9l7c9fIPCXTqTGwhQZ5XoTIsfeFYWboizjs= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0 h1:1wp/gyxsuYtuE/JFxsQRtcCDtMrO2qMvlfXALU5wkzI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0/go.mod h1:gbTHmghkGgqxMomVQQMur1Nba4M0MQ8AYThXDUjsJ38= +go.opentelemetry.io/otel/metric v1.26.0 h1:7S39CLuY5Jgg9CrnA9HHiEjGMF/X2VHvoXGgSllRz30= +go.opentelemetry.io/otel/metric v1.26.0/go.mod h1:SY+rHOI4cEawI9a7N1A4nIg/nTQXe1ccCNWYOJUrpX4= +go.opentelemetry.io/otel/sdk v1.26.0 h1:Y7bumHf5tAiDlRYFmGqetNcLaVUZmh4iYfmGxtmz7F8= +go.opentelemetry.io/otel/sdk v1.26.0/go.mod h1:0p8MXpqLeJ0pzcszQQN4F0S5FVjBLgypeGSngLsmirs= +go.opentelemetry.io/otel/trace v1.26.0 h1:1ieeAUb4y0TE26jUFrCIXKpTuVK7uJGN9/Z/2LP5sQA= +go.opentelemetry.io/otel/trace v1.26.0/go.mod h1:4iDxvGDQuUkHve82hJJ8UqrwswHYsZuWCBllGV2U2y0= +go.opentelemetry.io/proto/otlp v1.2.0 h1:pVeZGk7nXDC9O2hncA6nHldxEjm6LByfA2aN8IOkz94= +go.opentelemetry.io/proto/otlp v1.2.0/go.mod h1:gGpR8txAl5M03pDhMC79G6SdqNV26naRm/KDsgaHD8A= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net 
v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200831180312-196b9ba8737a/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.14.0 
h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324 h1:Hir2P/De0WpUhtrKGGjvSb2YxUgyZ7EFOSLIcSSpiwE= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190624222133-a101b041ded4/go.mod h1:/rFqwRUd4F7ZHNgwSSTFct+R/Kf4OFW1sUzUTQQTgfc= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/api v0.0.0-20240227224415-6ceb2ff114de h1:jFNzHPIeuzhdRwVhbZdiym9q0ory/xY3sA+v2wPg8I0= +google.golang.org/genproto/googleapis/api v0.0.0-20240227224415-6ceb2ff114de/go.mod h1:5iCWqnniDlqZHrd3neWVTOwvh/v6s3232omMecelax8= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda h1:LI5DOvAxUPMv/50agcLLoo+AdWc1irS9Rzz4vPuD1V4= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240401170217-c3f982113cda/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY= +google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM= +google.golang.org/grpc v1.63.2/go.mod 
h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA= +google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI= +google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.0.2/go.mod h1:3SzNCllyD9/Y+b5r9JIKQ474KzkZyqLqEfYqMsX94Bk= +gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= +gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= diff --git a/sysbox-libs/fileMonitor/fileMon.go b/sysbox-libs/fileMonitor/fileMon.go new file mode 100644 index 00000000..5d81b7e0 --- /dev/null +++ b/sysbox-libs/fileMonitor/fileMon.go @@ -0,0 +1,96 @@ +// +// Copyright 2023 Nestybox Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// The fileMonitor notifies the caller about file removal events. +// It uses a simple polling algorithm. 
+ +package fileMonitor + +import ( + "fmt" + "sync" + "time" +) + +type Cfg struct { + EventBufSize int + PollInterval time.Duration // polling period (a time.Duration, e.g. 100 * time.Millisecond) +} + +// polling config limits +const ( + PollMin = 1 * time.Millisecond + PollMax = 10000 * time.Millisecond +) + +type Event struct { + Filename string + Err error +} + +type FileMon struct { + mu sync.Mutex + cfg Cfg + eventTable map[string]bool + cmdCh chan cmd + eventCh chan []Event // receives events from monitor thread +} + +func New(cfg *Cfg) (*FileMon, error) { + if err := validateCfg(cfg); err != nil { + return nil, err + } + + fm := &FileMon{ + cfg: *cfg, + eventTable: make(map[string]bool), + cmdCh: make(chan cmd), + eventCh: make(chan []Event, cfg.EventBufSize), + } + + go fileMon(fm) + + return fm, nil +} + +func (fm *FileMon) Add(file string) { + fm.mu.Lock() + fm.eventTable[file] = true + fm.mu.Unlock() +} + +func (fm *FileMon) Remove(file string) { + fm.mu.Lock() + if _, ok := fm.eventTable[file]; ok { + delete(fm.eventTable, file) + } + fm.mu.Unlock() +} + +func (fm *FileMon) Events() <-chan []Event { + return fm.eventCh +} + +func (fm *FileMon) Close() { + fm.cmdCh <- stop +} + +func validateCfg(cfg *Cfg) error { + if cfg.PollInterval < PollMin || cfg.PollInterval > PollMax { + return fmt.Errorf("invalid config: poll interval must be in range [%d, %d]; found %d", PollMin, PollMax, cfg.PollInterval) + } + return nil
} diff --git a/sysbox-libs/fileMonitor/fileMon_test.go b/sysbox-libs/fileMonitor/fileMon_test.go new file mode 100644 index 00000000..67a3dae7 --- /dev/null +++ b/sysbox-libs/fileMonitor/fileMon_test.go @@ -0,0 +1,341 @@ +// +// Copyright 2023 Nestybox Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package fileMonitor + +import ( + "fmt" + "github.com/nestybox/sysbox-libs/utils" + log "github.com/sirupsen/logrus" + "io/ioutil" + "os" + "testing" + "time" +) + +func init() { + //log.SetLevel(log.DebugLevel) +} + +func TestOneRemovalPerInterval(t *testing.T) { + + numFiles := 5 + + // create a bunch of temp files + tmpFiles := []string{} + for i := 0; i < numFiles; i++ { + file, err := ioutil.TempFile("", "fileMonTest") + if err != nil { + t.Fatal(err) + } + defer os.Remove(file.Name()) + tmpFiles = append(tmpFiles, file.Name()) + t.Logf("Created file %s", file.Name()) + } + + // create a new file mon + pollInterval := 100 * time.Millisecond + cfg := Cfg{ + EventBufSize: 10, + PollInterval: pollInterval, + } + fm, err := New(&cfg) + if err != nil { + t.Fatal(err) + } + + // watch files + for _, file := range tmpFiles { + fm.Add(file) + } + fileEvents := fm.Events() + + // remove one file at a time (one per poll interval) + for _, file := range tmpFiles { + if err := os.Remove(file); err != nil { + t.Fatal(err) + } + t.Logf("Removed file %s", file) + time.Sleep(pollInterval) + events := <-fileEvents + if len(events) != 1 { + t.Fatalf("incorrect events list size: want 1, got %d (%+v)", len(events), events) + } + e := events[0] + if e.Filename != file { + t.Fatalf("incorrect event file name: want %s, got %s", file, e.Filename) + } + if e.Err != nil { + t.Fatalf("event has error: %s", e.Err) + } + t.Logf("OK: got event for file %s", e.Filename) + } + + fm.Close() + log.Debugf("Done.") +} + +func TestMultiRemovalPerInterval(t *testing.T) { + 
+ numFiles := 5 + + // create a bunch of temp files + tmpFiles := []string{} + for i := 0; i < numFiles; i++ { + file, err := ioutil.TempFile("", "fileMonTest") + if err != nil { + t.Fatal(err) + } + defer os.Remove(file.Name()) + tmpFiles = append(tmpFiles, file.Name()) + t.Logf("Created file %s", file.Name()) + } + + // create a new file mon + pollInterval := 100 * time.Millisecond + cfg := Cfg{ + EventBufSize: 10, + PollInterval: pollInterval, + } + fm, err := New(&cfg) + if err != nil { + t.Fatal(err) + } + + // watch files + for _, file := range tmpFiles { + fm.Add(file) + } + fileEvents := fm.Events() + + // remove all files in a single poll interval + time.Sleep(pollInterval) + + for _, file := range tmpFiles { + if err := os.Remove(file); err != nil { + t.Fatal(err) + } + t.Logf("Removed file %s", file) + } + + // verify we got all events + time.Sleep(2 * pollInterval) + + events := []Event{} + for { + events = append(events, <-fileEvents...) + numEvents := len(events) + if numEvents == numFiles { + break + } else if numEvents > numFiles { + t.Fatalf("got more file removal events than files (want %d, got %d)", numFiles, numEvents) + } + } + + for _, e := range events { + if !utils.StringSliceContains(tmpFiles, e.Filename) { + t.Fatalf("event %+v does not match a removed file", e) + } + if e.Err != nil { + t.Fatalf("event has error: %s", e.Err) + } + t.Logf("OK: got event for file %s", e.Filename) + } + + fm.Close() +} + +func TestSymlinkedFileRemoval(t *testing.T) { + + numFiles := 5 + tmpFiles := []string{} + symlinks := []string{} + + // create a bunch of temp files with symlinks to them + for i := 0; i < numFiles; i++ { + file, err := ioutil.TempFile("", "fileMonTest") + if err != nil { + t.Fatal(err) + } + defer os.Remove(file.Name()) + + // create symlink to file + link := fmt.Sprintf("symlink%d", i) + if err := os.Symlink(file.Name(), link); err != nil { + t.Fatal(err) + } + defer os.Remove(link) + + tmpFiles = append(tmpFiles, file.Name()) + symlinks 
= append(symlinks, link) + t.Logf("Created file %s and symlink %s", file.Name(), link) + } + + // create a new file mon + pollInterval := 100 * time.Millisecond + cfg := Cfg{ + EventBufSize: 10, + PollInterval: pollInterval, + } + fm, err := New(&cfg) + if err != nil { + t.Fatal(err) + } + + // watch the symlinks + for _, file := range symlinks { + fm.Add(file) + } + fileEvents := fm.Events() + + // remove one file at a time, verify we get the event + for i := 0; i < numFiles; i++ { + file := tmpFiles[i] + link := symlinks[i] + + if err := os.Remove(file); err != nil { + t.Fatal(err) + } + t.Logf("Removed file %s", file) + time.Sleep(pollInterval) + events := <-fileEvents + if len(events) != 1 { + t.Fatalf("incorrect events list size: want 1, got %d (%+v)", len(events), events) + } + e := events[0] + if e.Filename != link { + t.Fatalf("incorrect event file name: want %s, got %s", link, e.Filename) + } + if e.Err != nil { + t.Fatalf("event has error: %s", e.Err) + } + t.Logf("OK: got event for file %s", e.Filename) + } + + fm.Close() + log.Debugf("Done.") +} + +func TestEventRemoval(t *testing.T) { + + numFiles := 5 + + // create a bunch of temp files + tmpFiles := []string{} + for i := 0; i < numFiles; i++ { + file, err := ioutil.TempFile("", "fileMonTest") + if err != nil { + t.Fatal(err) + } + defer os.Remove(file.Name()) + tmpFiles = append(tmpFiles, file.Name()) + t.Logf("Created file %s", file.Name()) + } + + // create a new file mon + pollInterval := 100 * time.Millisecond + cfg := Cfg{ + EventBufSize: 10, + PollInterval: pollInterval, + } + fm, err := New(&cfg) + if err != nil { + t.Fatal(err) + } + + // watch files + for _, file := range tmpFiles { + fm.Add(file) + } + fileEvents := fm.Events() + + // remove event for last file + last := len(tmpFiles) - 1 + lastFile := tmpFiles[last] + fm.Remove(lastFile) + + // Remove all files + for _, file := range tmpFiles { + if err := os.Remove(file); err != nil { + t.Fatal(err) + } + t.Logf("Removed file %s", file) + 
} + + // Verify notification was received for all files, except the last file + time.Sleep(2 * pollInterval) + + events := []Event{} + for { + events = append(events, <-fileEvents...) + numEvents := len(events) + if numEvents == numFiles-1 { + break + } else if numEvents > numFiles-1 { + t.Fatalf("got more file removal events than files (want %d, got %d)", numFiles-1, numEvents) + } + } + + for _, e := range events { + if e.Filename == lastFile { + t.Fatalf("event %+v should not have been received", e) + } + if !utils.StringSliceContains(tmpFiles, e.Filename) { + t.Fatalf("event %+v does not match a removed file", e) + } + if e.Err != nil { + t.Fatalf("event has error: %s", e.Err) + } + t.Logf("OK: got event for file %s", e.Filename) + } +} + +func TestEventOnNonExistentFile(t *testing.T) { + + // create a new file mon + pollInterval := 100 * time.Millisecond + cfg := Cfg{ + EventBufSize: 10, + PollInterval: pollInterval, + } + fm, err := New(&cfg) + if err != nil { + t.Fatal(err) + } + + // Watch a non-existent file + file := "/tmp/__doesnotexist__" + fm.Add(file) + + // Should return event indicating file does not exist + events := <-fm.Events() + + if len(events) != 1 { + t.Fatalf("incorrect number of events; want 1, got %d (%+v)", len(events), events) + } + + e := events[0] + + if e.Err != nil { + t.Fatalf("event has error: %v", e.Err) + } + + if e.Filename != file { + t.Fatalf("incorrect event filename: want %s, got %s", file, e.Filename) + } + + fm.Close() +} diff --git a/sysbox-libs/fileMonitor/go.mod b/sysbox-libs/fileMonitor/go.mod new file mode 100644 index 00000000..8a274df4 --- /dev/null +++ b/sysbox-libs/fileMonitor/go.mod @@ -0,0 +1,17 @@ +module github.com/nestybox/sysbox-libs/fileMonitor + +go 1.21 + +toolchain go1.21.0 + +require ( + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 + github.com/sirupsen/logrus v1.9.1 +) + +require ( + github.com/opencontainers/runtime-spec v1.0.2 // indirect + golang.org/x/sys v0.19.0 // 
indirect +) + +replace github.com/nestybox/sysbox-libs/utils => ../utils diff --git a/sysbox-libs/fileMonitor/go.sum b/sysbox-libs/fileMonitor/go.sum new file mode 100644 index 00000000..7c38b0bb --- /dev/null +++ b/sysbox-libs/fileMonitor/go.sum @@ -0,0 +1,18 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.9.1 h1:Ou41VVR3nMWWmTiEUnj0OlsgOSCUFgsPAOl6jRIcVtQ= +github.com/sirupsen/logrus v1.9.1/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/sysbox-libs/fileMonitor/monitor.go 
b/sysbox-libs/fileMonitor/monitor.go new file mode 100644 index 00000000..51fe6570 --- /dev/null +++ b/sysbox-libs/fileMonitor/monitor.go @@ -0,0 +1,94 @@ +// +// Copyright 2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package fileMonitor + +import ( + "os" + "time" +) + +type cmd int + +const ( + stop cmd = iota +) + +// Monitors files associated with the given FileMon instance +func fileMon(fm *FileMon) { + for { + eventList := []Event{} + rmList := []Event{} + + // handle incoming commands first + select { + case cmd := <-fm.cmdCh: + if cmd == stop { + fm.eventCh <- eventList + return + } + default: + } + + // perform monitoring action + fm.mu.Lock() + for filename, _ := range fm.eventTable { + exists, err := checkFileExists(filename) + if err != nil || !exists { + eventList = append(eventList, Event{ + Filename: filename, + Err: err, + }) + + // file removal implies event won't hit again; remove it. + rmList = append(rmList, Event{filename, nil}) + } + } + + // release the lock so that we don't hold it while sending the event list + // (in case the event channel is blocked); this way new events can + // continue to be added. 
+ fm.mu.Unlock() + + // send event list + if len(eventList) > 0 { + fm.eventCh <- eventList + } + + // remove events that won't hit any more + fm.mu.Lock() + for _, e := range rmList { + if _, ok := fm.eventTable[e.Filename]; ok { + delete(fm.eventTable, e.Filename) + } + } + fm.mu.Unlock() + + // wait for the poll period + time.Sleep(fm.cfg.PollInterval) + } +} + +// Checks if the given file exists +func checkFileExists(path string) (bool, error) { + _, err := os.Stat(path) + if os.IsNotExist(err) { + return false, nil + } else if err != nil { + return false, err + } + return true, nil +} diff --git a/sysbox-libs/formatter/containerID.go b/sysbox-libs/formatter/containerID.go new file mode 100644 index 00000000..07a236fc --- /dev/null +++ b/sysbox-libs/formatter/containerID.go @@ -0,0 +1,19 @@ +package formatter + +import "github.com/docker/docker/pkg/stringid" + +type ContainerID struct { + ID string +} + +func (cid ContainerID) ShortID() string { + return stringid.TruncateID(cid.ID) +} + +func (cid ContainerID) LongID() string { + return cid.ID +} + +func (cid ContainerID) String() string { + return cid.ShortID() +} diff --git a/sysbox-libs/formatter/go.mod b/sysbox-libs/formatter/go.mod new file mode 100644 index 00000000..db2a8be1 --- /dev/null +++ b/sysbox-libs/formatter/go.mod @@ -0,0 +1,7 @@ +module github.com/nestybox/sysbox-libs/formatter + +go 1.21 + +toolchain go1.21.0 + +require github.com/docker/docker v20.10.2+incompatible diff --git a/sysbox-libs/formatter/go.sum b/sysbox-libs/formatter/go.sum new file mode 100644 index 00000000..78cc06f3 --- /dev/null +++ b/sysbox-libs/formatter/go.sum @@ -0,0 +1,2 @@ +github.com/docker/docker v20.10.2+incompatible h1:vFgEHPqWBTp4pTjdLwjAA4bSo3gvIGOYwuJTlEjVBCw= +github.com/docker/docker v20.10.2+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= diff --git a/sysbox-libs/idMap/.gitignore b/sysbox-libs/idMap/.gitignore new file mode 100644 index 00000000..c56069fe --- /dev/null +++ 
b/sysbox-libs/idMap/.gitignore @@ -0,0 +1 @@ +*.test \ No newline at end of file diff --git a/sysbox-libs/idMap/go.mod b/sysbox-libs/idMap/go.mod new file mode 100644 index 00000000..581dbfb3 --- /dev/null +++ b/sysbox-libs/idMap/go.mod @@ -0,0 +1,19 @@ +module github.com/nestybox/sysbox-libs/idMap + +go 1.21 + +toolchain go1.21.0 + +require ( + github.com/nestybox/sysbox-libs/linuxUtils v0.0.0-00010101000000-000000000000 + github.com/opencontainers/runtime-spec v1.0.2 + github.com/pkg/errors v0.8.1 + golang.org/x/sys v0.19.0 +) + +require ( + github.com/spf13/afero v1.4.1 // indirect + golang.org/x/text v0.3.8 // indirect +) + +replace github.com/nestybox/sysbox-libs/linuxUtils => ../linuxUtils diff --git a/sysbox-libs/idMap/go.sum b/sysbox-libs/idMap/go.sum new file mode 100644 index 00000000..0a590dde --- /dev/null +++ b/sysbox-libs/idMap/go.sum @@ -0,0 +1,26 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.8.1 h1:iURUrRGxPUNPdy5/HRSm+Yj6okJ6UtLINN0Q9M4+h3I= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod 
h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/sysbox-libs/idMap/helpers.go b/sysbox-libs/idMap/helpers.go new file mode 100644 index 00000000..35a04b30 --- /dev/null +++ b/sysbox-libs/idMap/helpers.go @@ -0,0 +1,45 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +package idMap + +import ( + "github.com/nestybox/sysbox-libs/linuxUtils" +) + +// checkKernelVersion checks if the kernel vesion is equal or newer than the +// given major.minor number. +func checkKernelVersion(reqMaj, reqMin int) (bool, error) { + var major, minor int + + rel, err := linuxUtils.GetKernelRelease() + if err != nil { + return false, err + } + + major, minor, err = linuxUtils.ParseKernelRelease(rel) + if err != nil { + return false, err + } + + if major < reqMaj { + return false, nil + } else if major == reqMaj && minor < reqMin { + return false, nil + } else { + return true, nil + } +} diff --git a/sysbox-libs/idMap/idMapMount.go b/sysbox-libs/idMap/idMapMount.go new file mode 100644 index 00000000..e645e76e --- /dev/null +++ b/sysbox-libs/idMap/idMapMount.go @@ -0,0 +1,264 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +//go:build linux && idmapped_mnt && cgo +// +build linux,idmapped_mnt,cgo + +package idMap + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "syscall" + "time" + + "github.com/nestybox/sysbox-libs/linuxUtils" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// The following are filesystems and host directories where we never ID-map +// mount as it causes functional problems (i.e., the kernel does not yet support +// ID-mapped mounts over them). +// +// TODO: remove this blacklist and instead run experiments on each fs + +var idMapMountFsBlackList = []int64{ + unix.OVERLAYFS_SUPER_MAGIC, // can't id-map on top of an overlayfs mount + 0x65735546, // unix.FUSE_SUPER_MAGIC + 0x6a656a63, // FAKEOWNER (Docker Desktop's Linux VM only) +} + +var idMapMountDevBlackList = []string{"/dev/null"} + +// ID-maps the given mountpoint, using the given userns ID mappings; both paths must be absolute. +func IDMapMount(usernsPath, mountPath string, unmountFirst bool) error { + + // open the usernsPath + usernsFd, err := os.Open(usernsPath) + if err != nil { + return fmt.Errorf("Failed to open %s: %s", usernsPath, err) + } + defer usernsFd.Close() + + // If mountPath is procfd based, read the magic link + if strings.HasPrefix(mountPath, "/proc/self/fd/") { + mountPath, err = os.Readlink(mountPath) + if err != nil { + return fmt.Errorf("Failed to read link %s: %s", mountPath, err) + } + } else { + mountPath, err = filepath.EvalSymlinks(mountPath) + if err != nil { + return fmt.Errorf("Failed to eval symlink on %s: %s", mountPath, err) + } + } + + // clone the given mount + fdTree, err := unix.OpenTree(-1, mountPath, unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC|unix.AT_EMPTY_PATH|unix.AT_RECURSIVE) + if err != nil { + return fmt.Errorf("Failed to open mount at %s: %s", mountPath, err) + } + + // Set the ID-mapped mount attribute on the clone + // TODO: add propagation type? 
(use the original mountpoints propagation)? + + mountAttr := &unix.MountAttr{ + Attr_set: unix.MOUNT_ATTR_IDMAP, + Userns_fd: uint64(usernsFd.Fd()), + } + + err = unix.MountSetattr(int(fdTree), "", unix.AT_EMPTY_PATH|unix.AT_RECURSIVE, mountAttr) + if err != nil { + return fmt.Errorf("Failed to set mount attr: %s", err) + } + + // Unmount the original mountPath mount to prevent redundant / stacked mounting + if unmountFirst { + err = unix.Unmount(mountPath, unix.MNT_DETACH) + if err != nil { + return fmt.Errorf("Failed to unmount %s: %s", mountPath, err) + } + } + + // Attach the clone to the to mount point + err = unix.MoveMount(fdTree, "", -1, mountPath, unix.MOVE_MOUNT_F_EMPTY_PATH) + if err != nil { + return fmt.Errorf("Failed to move mount: %s", err) + } + + unix.Close(fdTree) + return nil +} + +// IDMapMountSupported checks if ID-mapping is supported on the host. +func IDMapMountSupported(dir string) (bool, error) { + + // ID-Mapped mounts requires Linux kernel >= 5.12 + kernelOK, err := checkKernelVersion(5, 12) + if err != nil { + return false, err + } + + if !kernelOK { + return false, nil + } + + return runIDMapMountCheckOnHost(dir, false) +} + +// OverlayfsOnIDMapMountSupported checks if overlayfs over ID-mapped lower +// layers is supported on the host. +func OverlayfsOnIDMapMountSupported(dir string) (bool, error) { + + // overlayfs on ID-mapped lower layers requires Linux kernel >= 5.19 + kernelOK, err := checkKernelVersion(5, 19) + if err != nil { + return false, err + } + + if !kernelOK { + return false, nil + } + + return runIDMapMountCheckOnHost(dir, true) +} + +// runIDMapMountCheckOnHost runs a quick test on the host to check if ID-mapping is +// supported. dir is the path where the test will run. If checkOnOverlayfs +// is true, the test checks if overlayfs supports ID-mapped lower layers. 
+func runIDMapMountCheckOnHost(dir string, checkOnOverlayfs bool) (bool, error) { + var ( + lowerDir, upperDir, workDir, idMapDir string + ) + + tmpDir, err := os.MkdirTemp(dir, "sysbox-ovfs-check") + if err != nil { + return false, err + } + defer func() { + os.RemoveAll(tmpDir) + }() + + testDir := filepath.Join(tmpDir, "merged") + if err := os.Mkdir(testDir, 0700); err != nil { + return false, err + } + + if checkOnOverlayfs { + lowerDir = filepath.Join(tmpDir, "lower") + upperDir = filepath.Join(tmpDir, "upper") + workDir = filepath.Join(tmpDir, "work") + + dirs := []string{lowerDir, upperDir, workDir} + for _, dir := range dirs { + if err := os.Mkdir(dir, 0700); err != nil { + return false, err + } + } + } + + // Create a userns process that simply pauses until killed + execFunc := func() { + for i := 0; i < 3600; i++ { + time.Sleep(1 * time.Second) + } + } + + idmap := &specs.LinuxIDMapping{ + ContainerID: 0, + HostID: 0, + Size: 1, + } + + pid, childKill, err := linuxUtils.CreateUsernsProcess(idmap, execFunc, testDir, false) + if err != nil { + return false, err + } + + defer func() { + var wstatus syscall.WaitStatus + var rusage syscall.Rusage + childKill() + syscall.Wait4(pid, &wstatus, 0, &rusage) + }() + + // Create the ID mapped mount associated with the child process user-ns + usernsPath := fmt.Sprintf("/proc/%d/ns/user", pid) + + if checkOnOverlayfs { + idMapDir = lowerDir + } else { + idMapDir = testDir + } + + if err := IDMapMount(usernsPath, idMapDir, false); err != nil { + return false, errors.Wrap(err, "create mapped mount") + } + defer unix.Unmount(idMapDir, unix.MNT_DETACH) + + if checkOnOverlayfs { + opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir) + flags := uintptr(0) + if err := unix.Mount("overlay", testDir, "overlay", flags, opts); err != nil { + return false, err + } + unix.Unmount(testDir, unix.MNT_DETACH) + return true, nil + } + + return true, nil +} + +// Checkf if the dir at the given path can 
be ID-mapped based on the underlying filesystem. +func IDMapMountSupportedOnPath(path string) (bool, error) { + var fs unix.Statfs_t + + for _, m := range idMapMountDevBlackList { + if path == m { + return false, nil + } + } + + err := unix.Statfs(path, &fs) + if err != nil { + return false, err + } + + for _, name := range idMapMountFsBlackList { + if fs.Type == name { + return false, nil + } + } + + // ID-mapped mounts on tmpfs supported since kernel 6.3 + // Ref: https://lore.kernel.org/lkml/20230217080552.1628786-1-brauner@kernel.org/ + + if fs.Type == unix.TMPFS_MAGIC { + cmp, err := linuxUtils.KernelCurrentVersionCmp(6, 3) + if err != nil { + return false, fmt.Errorf("failed to compare kernel version: %v", err) + } + if cmp < 0 { + return false, nil + } + } + + return true, nil +} diff --git a/sysbox-libs/idMap/idMapMount_test.go b/sysbox-libs/idMap/idMapMount_test.go new file mode 100644 index 00000000..e1e33145 --- /dev/null +++ b/sysbox-libs/idMap/idMapMount_test.go @@ -0,0 +1,81 @@ +// +// Copyright 2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// NOTE: +// +// Run test with "go test -tags idmapped_mnt" when running on a host with kernel +// >= 5.12. Otherwise the test will use the idMapMount_unsupported.go file. 
+ +package idMap + +import ( + "os" + "testing" +) + +func TestIDMapMountSupported(t *testing.T) { + + kernelOK, err := checkKernelVersion(5, 12) + if err != nil { + t.Fatal(err) + } + + if kernelOK { + dir := "/var/lib/sysbox" + + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatal(err) + } + + supported, err := IDMapMountSupported(dir) + if err != nil { + t.Fatalf("IDMapMountSupported() failed with error: %s", err) + } + + if supported { + t.Logf("ID-mapping supported on this host.") + } else { + t.Logf("ID-mapping not supported on this host.") + } + } +} + +func TestIDMapMountSupportedOnOverlayfs(t *testing.T) { + + kernelOK, err := checkKernelVersion(5, 19) + if err != nil { + t.Fatal(err) + } + + if kernelOK { + dir := "/var/lib/sysbox" + + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatal(err) + } + + supported, err := IDMapMountSupportedOnOverlayfs(dir) + if err != nil { + t.Fatalf("IDMapMountSupportedOnOverlayfs() failed with error: %s", err) + } + + if supported { + t.Logf("ID-mapping-on-overlayfs supported on this host.") + } else { + t.Logf("ID-mapping-on-overlayfs not supported on this host.") + } + } +} diff --git a/sysbox-libs/idMap/idMapMount_unsupported.go b/sysbox-libs/idMap/idMapMount_unsupported.go new file mode 100644 index 00000000..541fce99 --- /dev/null +++ b/sysbox-libs/idMap/idMapMount_unsupported.go @@ -0,0 +1,44 @@ +// +// Copyright 2019-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +//go:build !linux || !idmapped_mnt || !cgo +// +build !linux !idmapped_mnt !cgo + +package idMap + +import ( + "fmt" +) + +func IDMapMount(usernsPath, mountPath string, unmountFirst bool) error { + return fmt.Errorf("idmapped mount unsupported in this Sysbox build.") +} + +func IDMapMountSupported(dir string) (bool, error) { + return false, nil +} + +func IDMapMountSupportedOnOverlayfs(dir string) (bool, error) { + return false, nil +} + +func IDMapMountSupportedOnPath(path string) (bool, error) { + return false, nil +} + +func OverlayfsOnIDMapMountSupported(dir string) (bool, error) { + return false, nil +} diff --git a/sysbox-libs/idShiftUtils/go.mod b/sysbox-libs/idShiftUtils/go.mod new file mode 100644 index 00000000..0c66537d --- /dev/null +++ b/sysbox-libs/idShiftUtils/go.mod @@ -0,0 +1,19 @@ +module github.com/nestybox/sysbox-libs/idShiftUtils + +go 1.21 + +toolchain go1.21.0 + +require ( + github.com/deckarep/golang-set v1.7.1 + github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 + github.com/karrick/godirwalk v1.16.1 + github.com/sirupsen/logrus v1.7.0 + golang.org/x/sys v0.19.0 +) + +require github.com/stretchr/testify v1.4.0 // indirect + +require github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d // indirect + +replace github.com/nestybox/sysbox-libs/utils => ../utils diff --git a/sysbox-libs/idShiftUtils/go.sum b/sysbox-libs/idShiftUtils/go.sum new file mode 100644 index 00000000..25ce254e --- /dev/null +++ b/sysbox-libs/idShiftUtils/go.sum @@ -0,0 +1,25 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set v1.7.1 h1:SCQV0S6gTtp6itiFrTqI+pfmJ4LN85S1YzhDf9rTHJQ= +github.com/deckarep/golang-set v1.7.1/go.mod h1:93vsz/8Wt4joVM7c2AVqh+YRMiUSc14yDtF28KmMOgQ= +github.com/joshlf/go-acl 
v0.0.0-20200411065538-eae00ae38531 h1:hgVxRoDDPtQE68PT4LFvNlPz2nBKd3OMlGKIQ69OmR4= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531/go.mod h1:fqTUQpVYBvhCNIsMXGl2GE9q6z94DIP6NtFKXCSTVbg= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d h1:J8tJzRyiddAFF65YVgxli+TyWBi0f79Sld6rJP6CBcY= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d/go.mod h1:b+Q3v8Yrg5o15d71PSUraUzYb+jWl6wQMSBXSGS/hv0= +github.com/karrick/godirwalk v1.16.1 h1:DynhcF+bztK8gooS0+NDJFrdNZjJ3gzVzC545UNA9iw= +github.com/karrick/godirwalk v1.16.1/go.mod h1:j4mkqPuvaLI8mp1DroR3P6ad7cyYd4c1qeJ3RV7ULlk= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= +github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/sysbox-libs/idShiftUtils/idShiftUtils.go b/sysbox-libs/idShiftUtils/idShiftUtils.go new file mode 100644 index 00000000..89736d9a --- /dev/null +++ 
b/sysbox-libs/idShiftUtils/idShiftUtils.go @@ -0,0 +1,296 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Utilities for shifting user and group IDs on the file system using chown +// (e.g., shifting uids:gids from range [0:65536] to range [165536:231071]). + +package idShiftUtils + +import ( + "fmt" + "os" + "strconv" + "syscall" + + "github.com/joshlf/go-acl" + aclLib "github.com/joshlf/go-acl" + "github.com/karrick/godirwalk" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + mapset "github.com/deckarep/golang-set" +) + +type IDShiftType int + +const ( + NoShift IDShiftType = iota + Shiftfs + IDMappedMount + IDMappedMountOrShiftfs + Chown +) + +type aclType int + +const ( + aclTypeAccess aclType = iota + aclTypeDefault +) + +type IDMapping struct { + ContainerID uint32 + HostID uint32 + Size uint32 +} + +// checkACLSupport attempts to set an extended ACL attribute on a file to check ACL support. 
+func checkACLSupport(path string) bool { + file, err := os.Open(path) + if err != nil { + return false + } + defer file.Close() + + // Try setting an extended attribute specific to ACLs + err = unix.Fsetxattr(int(file.Fd()), "system.posix_acl_access", []byte{}, 0) + + // ENOTSUP means ACL is not supported; any other error indicates something + // else went wrong, so we assume ACLs are supported + return err != unix.ENOTSUP +} + +// shiftAclType shifts the ACL type user and group IDs by the given offset +func shiftAclType(aclT aclType, path string, uidOffset, gidOffset int32) error { + var facl aclLib.ACL + var err error + + // Read the ACL + if aclT == aclTypeDefault { + facl, err = acl.GetDefault(path) + } else { + facl, err = acl.Get(path) + } + + if err != nil { + return fmt.Errorf("failed to get ACL for %s: %s", path, err) + } + + // Shift the user and group ACLs (if any) + newACL := aclLib.ACL{} + aclShifted := false + + for _, e := range facl { + + // ACL_USER id shifting + if e.Tag == aclLib.TagUser { + uid, err := strconv.ParseUint(e.Qualifier, 10, 32) + if err != nil { + logrus.Warnf("failed to convert ACL qualifier for %v: %s", e, err) + continue + } + + targetUid := uint64(int32(uid) + uidOffset) + e.Qualifier = strconv.FormatUint(targetUid, 10) + aclShifted = true + } + + // ACL_GROUP id shifting + if e.Tag == aclLib.TagGroup { + gid, err := strconv.ParseUint(e.Qualifier, 10, 32) + if err != nil { + logrus.Warnf("failed to convert ACL qualifier %v: %s", e, err) + continue + } + + targetGid := uint64(int32(gid) + gidOffset) + e.Qualifier = strconv.FormatUint(targetGid, 10) + aclShifted = true + } + + newACL = append(newACL, e) + } + + // Write back the modified ACL + if aclShifted { + if aclT == aclTypeDefault { + err = acl.SetDefault(path, newACL) + } else { + err = acl.Set(path, newACL) + } + if err != nil { + return fmt.Errorf("failed to set ACL %v for %s: %s", newACL, path, err) + } + } + + return nil +} + +// Shifts the ACL user and group IDs by 
the given offset, both for access and default ACLs +func shiftAclIds(path string, isDir bool, uidOffset, gidOffset int32) error { + + // Access list + err := shiftAclType(aclTypeAccess, path, uidOffset, gidOffset) + if err != nil { + return err + } + + // Default list (for directories only) + if isDir { + err = shiftAclType(aclTypeDefault, path, uidOffset, gidOffset) + if err != nil { + return err + } + } + + return nil +} + +// "Shifts" ownership of user and group IDs on the given directory and files and directories +// below it by the given offset, using chown. +func ShiftIdsWithChown(baseDir string, uidOffset, gidOffset int32) error { + + aclSupported := checkACLSupport(baseDir) + + hardLinks := []uint64{} + err := godirwalk.Walk(baseDir, &godirwalk.Options{ + Callback: func(path string, de *godirwalk.Dirent) error { + + // When doing the chown, we don't follow symlinks as we want to change + // the ownership of the symlinks themselves. We will chown the + // symlink's target during the godirwalk (unless the symlink is + // dangling in which case there is nothing to be done). + + fi, err := os.Lstat(path) + if err != nil { + return err + } + + st, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("failed to convert to syscall.Stat_t") + } + + // If a file has multiple hardlinks, change its ownership once + if st.Nlink >= 2 { + for _, linkInode := range hardLinks { + if linkInode == st.Ino { + return nil + } + } + + hardLinks = append(hardLinks, st.Ino) + } + + targetUid := int32(st.Uid) + uidOffset + targetGid := int32(st.Gid) + gidOffset + + err = unix.Lchown(path, int(targetUid), int(targetGid)) + if err != nil { + return fmt.Errorf("chown %s to %d:%d failed: %s", path, targetUid, targetGid, err) + } + + // chown will turn-off the set-user-ID and set-group-ID bits on files, + // so we need to restore them. 
+ fMode := fi.Mode() + setuid := fMode&os.ModeSetuid == os.ModeSetuid + setgid := fMode&os.ModeSetgid == os.ModeSetgid + + if fMode.IsRegular() && (setuid || setgid) { + if err := os.Chmod(path, fMode); err != nil { + return fmt.Errorf("chmod %s to %s failed: %s", path, fMode, err) + } + } + + // Chowning the file is not sufficient; we also need to shift user and group IDs in + // the Linux access control list (ACL) for the file + if fMode&os.ModeSymlink == 0 && aclSupported { + if err := shiftAclIds(path, fi.IsDir(), uidOffset, gidOffset); err != nil { + return fmt.Errorf("failed to shift ACL for %s: %s", path, err) + } + } + + return nil + }, + + ErrorCallback: func(path string, err error) godirwalk.ErrorAction { + + fi, err := os.Lstat(path) + if err != nil { + return godirwalk.Halt + } + + // Ignore errors due to chown on dangling symlinks (they often occur in container image layers) + if fi.Mode()&os.ModeSymlink == os.ModeSymlink { + return godirwalk.SkipNode + } + + return godirwalk.Halt + }, + + Unsorted: true, // Speeds up the directory tree walk + }) + + return err +} + +// Returns the lists of user and group IDs for all files and directories at or +// below the given path. 
+func GetDirIDs(baseDir string) ([]uint32, []uint32, error) { + + uidSet := mapset.NewSet() + gidSet := mapset.NewSet() + + err := godirwalk.Walk(baseDir, &godirwalk.Options{ + Callback: func(path string, de *godirwalk.Dirent) error { + + fi, err := os.Lstat(path) + if err != nil { + return err + } + + st, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return fmt.Errorf("failed to convert to syscall.Stat_t") + } + + uidSet.Add(st.Uid) + gidSet.Add(st.Gid) + + return nil + }, + + Unsorted: true, // Speeds up the directory tree walk + }) + + if err != nil { + return nil, nil, err + } + + uidList := []uint32{} + for _, id := range uidSet.ToSlice() { + val := id.(uint32) + uidList = append(uidList, val) + } + + gidList := []uint32{} + for _, id := range gidSet.ToSlice() { + val := id.(uint32) + gidList = append(gidList, val) + } + + return uidList, gidList, nil +} diff --git a/sysbox-libs/idShiftUtils/idShiftUtils_test.go b/sysbox-libs/idShiftUtils/idShiftUtils_test.go new file mode 100644 index 00000000..2d2f6ebb --- /dev/null +++ b/sysbox-libs/idShiftUtils/idShiftUtils_test.go @@ -0,0 +1,258 @@ +// +// Copyright 2019-2024 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// Unit tests for idShiftUtils package + +package idShiftUtils + +import ( + "os" + "path/filepath" + "testing" + + aclLib "github.com/joshlf/go-acl" +) + +func TestCheckACLSupport(t *testing.T) { + + // create a tmp dir on ext4, where ACL is known to be supported + tmpdir, err := os.MkdirTemp("/mnt/scratch", "no_acl_test") + if err != nil { + t.Fatalf("Failed to create tmp dir: %v", err) + } + defer func() { + os.RemoveAll(tmpdir) + }() + + // Create a test file in the mounted filesystem + testFile := filepath.Join(tmpdir, "testfile") + if _, err := os.Create(testFile); err != nil { + t.Fatalf("Failed to create test file: %v", err) + } + defer os.Remove(testFile) + + // Run the ACL check on the test file + supportsACL := checkACLSupport(testFile) + if !supportsACL { + t.Error("Expected ACLs to be supported, but checkACLSupport() returned false.") + } +} + +func TestShiftAclIds(t *testing.T) { + + testDir, err := os.MkdirTemp("", "shiftAclTest") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(testDir) + + // Access ACL to be set on testDir + aclUserEntry := aclLib.Entry{ + Tag: aclLib.TagUser, + Qualifier: "1001", + Perms: 7, + } + + aclGroupEntry := aclLib.Entry{ + Tag: aclLib.TagGroup, + Qualifier: "1005", + Perms: 4, + } + + aclMaskEntry := aclLib.Entry{ + Tag: aclLib.TagMask, + Perms: 7, + } + + // Default ACL to be set on testDir + aclDef := aclLib.ACL{ + aclLib.Entry{ + Tag: aclLib.TagUserObj, + Perms: 7, + }, + aclLib.Entry{ + Tag: aclLib.TagGroupObj, + Perms: 0, + }, + aclLib.Entry{ + Tag: aclLib.TagOther, + Perms: 0, + }, + aclLib.Entry{ + Tag: aclLib.TagUser, + Qualifier: "1002", + Perms: 5, + }, + aclLib.Entry{ + Tag: aclLib.TagGroup, + Qualifier: "1005", + Perms: 4, + }, + aclLib.Entry{ + Tag: aclLib.TagMask, + Perms: 7, + }, + } + + acl, err := aclLib.Get(testDir) + if err != nil { + t.Fatalf("failed to get ACL on %s: %s", testDir, err) + } + + acl = append(acl, aclUserEntry, aclGroupEntry, aclMaskEntry) + + if err := 
aclLib.Set(testDir, acl); err != nil { + t.Fatalf("failed to set ACL %v on %s: %s", acl, testDir, err) + } + + if err := aclLib.SetDefault(testDir, aclDef); err != nil { + t.Fatalf("failed to set default ACL %v on %s: %s", aclDef, testDir, err) + } + + // ShiftAcls by subtracting offset + + uidOffset := int32(-1000) + gidOffset := int32(-1000) + + if err := shiftAclIds(testDir, true, uidOffset, gidOffset); err != nil { + t.Fatalf("shiftAclIds() failed: %s", err) + } + + // Verify the ACL for the dir were modified as expected + newAcl := aclLib.ACL{} + newDefAcl := aclLib.ACL{} + + newAcl, err = aclLib.Get(testDir) + if err != nil { + t.Fatalf("failed to get ACL on %s: %s", testDir, err) + } + + newDefAcl, err = aclLib.GetDefault(testDir) + if err != nil { + t.Fatalf("failed to get default ACL on %s: %s", testDir, err) + } + + wantAclUserEntry := aclLib.Entry{ + Tag: aclLib.TagUser, + Qualifier: "1", // 1001 - 1000 + Perms: 7, + } + + wantAclGroupEntry := aclLib.Entry{ + Tag: aclLib.TagGroup, + Qualifier: "5", // 1005 - 1000 + Perms: 4, + } + + wantAclDefUserEntry := aclLib.Entry{ + Tag: aclLib.TagUser, + Qualifier: "2", // 1002 - 1000 + Perms: 5, + } + + wantAclDefGroupEntry := aclLib.Entry{ + Tag: aclLib.TagGroup, + Qualifier: "5", // 1005 - 1000 + Perms: 4, + } + + for _, e := range newAcl { + if e.Tag == aclLib.TagUser { + if e != wantAclUserEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclUserEntry, e) + } + } + if e.Tag == aclLib.TagGroup { + if e != wantAclGroupEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclGroupEntry, e) + } + } + } + + for _, e := range newDefAcl { + if e.Tag == aclLib.TagUser { + if e != wantAclDefUserEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclDefUserEntry, e) + } + } + if e.Tag == aclLib.TagGroup { + if e != wantAclDefGroupEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclDefGroupEntry, e) + } + } + } + + // ShiftAcls by adding offset (revert back to original value) + + uidOffset = int32(1000) + 
gidOffset = int32(1000) + + if err := shiftAclIds(testDir, true, uidOffset, gidOffset); err != nil { + t.Fatalf("shiftAclIds() failed: %s", err) + } + + newAcl, err = aclLib.Get(testDir) + if err != nil { + t.Fatalf("failed to get ACL on %s: %s", testDir, err) + } + + newDefAcl, err = aclLib.GetDefault(testDir) + if err != nil { + t.Fatalf("failed to get default ACL on %s: %s", testDir, err) + } + + wantAclUserEntry = aclUserEntry + wantAclGroupEntry = aclGroupEntry + + wantAclDefUserEntry = aclLib.Entry{ + Tag: aclLib.TagUser, + Qualifier: "1002", + Perms: 5, + } + + wantAclDefGroupEntry = aclLib.Entry{ + Tag: aclLib.TagGroup, + Qualifier: "1005", + Perms: 4, + } + + for _, e := range newAcl { + if e.Tag == aclLib.TagUser { + if e != wantAclUserEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclUserEntry, e) + } + } + if e.Tag == aclLib.TagGroup { + if e != wantAclGroupEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclGroupEntry, e) + } + } + } + + for _, e := range newDefAcl { + if e.Tag == aclLib.TagUser { + if e != wantAclDefUserEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclDefUserEntry, e) + } + } + if e.Tag == aclLib.TagGroup { + if e != wantAclDefGroupEntry { + t.Logf("acl mismatch: want %v, got %v", wantAclDefGroupEntry, e) + } + } + } + +} diff --git a/sysbox-libs/linuxUtils/go.mod b/sysbox-libs/linuxUtils/go.mod new file mode 100644 index 00000000..ecfa7088 --- /dev/null +++ b/sysbox-libs/linuxUtils/go.mod @@ -0,0 +1,11 @@ +module github.com/nestybox/sysbox-libs/linuxUtils + +go 1.21 + +require ( + github.com/opencontainers/runtime-spec v1.0.2 + github.com/spf13/afero v1.4.1 + golang.org/x/sys v0.19.0 +) + +require golang.org/x/text v0.3.8 // indirect diff --git a/sysbox-libs/linuxUtils/go.sum b/sysbox-libs/linuxUtils/go.sum new file mode 100644 index 00000000..2d67f7f0 --- /dev/null +++ b/sysbox-libs/linuxUtils/go.sum @@ -0,0 +1,25 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 
+github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/sysbox-libs/linuxUtils/linux.go b/sysbox-libs/linuxUtils/linux.go new file mode 100644 index 00000000..a37a2de0 --- /dev/null +++ b/sysbox-libs/linuxUtils/linux.go @@ -0,0 +1,373 @@ +// +// Copyright 2020 - 2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package linuxUtils + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/spf13/afero" + "golang.org/x/sys/unix" +) + +// Afero FS for unit-testing purposes. +var appFs = afero.NewOsFs() + +// Obtain system's linux distribution. +func GetDistro() (string, error) { + + distro, err := GetDistroPath("/") + if err != nil { + return "", err + } + + return distro, nil +} + +// Parse os-release lines looking for 'ID' field. Originally borrowed from +// acobaugh/osrelease lib and adjusted to extract only the os-release "ID" +// field. +func parseLineDistroId(line string) string { + + // Skip empty lines. + if len(line) == 0 { + return "" + } + + // Skip comments. + if line[0] == '#' { + return "" + } + + // Try to split string at the first '='. 
+ splitString := strings.SplitN(line, "=", 2) + if len(splitString) != 2 { + return "" + } + + // Trim white space from key. Return here if we are not dealing + // with an "ID" field. + key := splitString[0] + key = strings.Trim(key, " ") + if key != "ID" { + return "" + } + + // Trim white space from value. + value := splitString[1] + value = strings.Trim(value, " ") + + // Handle double quotes. + if strings.ContainsAny(value, `"`) { + first := string(value[0:1]) + last := string(value[len(value)-1:]) + + if first == last && strings.ContainsAny(first, `"'`) { + value = strings.TrimPrefix(value, `'`) + value = strings.TrimPrefix(value, `"`) + value = strings.TrimSuffix(value, `'`) + value = strings.TrimSuffix(value, `"`) + } + } + + // Expand anything else that could be escaped. + value = strings.Replace(value, `\"`, `"`, -1) + value = strings.Replace(value, `\$`, `$`, -1) + value = strings.Replace(value, `\\`, `\`, -1) + value = strings.Replace(value, "\\`", "`", -1) + + return value +} + +// Obtain system's linux distribution in the passed rootfs. +func GetDistroPath(rootfs string) (string, error) { + + var ( + data []byte + err error + ) + + // As per os-release(5) man page both of the following paths should be taken + // into account to find 'os-release' file. + var osRelPaths = []string{ + filepath.Join(rootfs, "/etc/os-release"), + filepath.Join(rootfs, "/usr/lib/os-release"), + } + + for _, file := range osRelPaths { + data, err = afero.ReadFile(appFs, file) + if err != nil { + continue + } + + lines := strings.Split(string(data), "\n") + + // Iterate through os-release lines looking for 'ID' content. 
+ for _, line := range lines { + distro := parseLineDistroId(line) + if distro != "" { + return distro, nil + } + } + } + + return "", err +} + +// GetKernelRelease returns the kernel release (e.g., "4.18") +func GetKernelRelease() (string, error) { + + var utsname unix.Utsname + + if err := unix.Uname(&utsname); err != nil { + return "", fmt.Errorf("uname: %v", err) + } + + n := bytes.IndexByte(utsname.Release[:], 0) + + return string(utsname.Release[:n]), nil +} + +// Compares the given kernel version versus the current kernel version. Returns +// 0 if versions are equal, 1 if the current kernel has higher version than the +// given one, -1 otherwise. +func KernelCurrentVersionCmp(k1Major, k1Minor int) (int, error) { + + rel, err := GetKernelRelease() + if err != nil { + return 0, err + } + + splits := strings.SplitN(rel, ".", -1) + if len(splits) < 2 { + return 0, fmt.Errorf("failed to parse kernel release %v", rel) + } + + k2Major, err := strconv.Atoi(splits[0]) + if err != nil { + return 0, fmt.Errorf("failed to parse kernel release %v", rel) + } + + k2Minor, err := strconv.Atoi(splits[1]) + if err != nil { + return 0, fmt.Errorf("failed to parse kernel release %v", rel) + } + + if k2Major > k1Major { + return 1, nil + } else if k2Major == k1Major { + if k2Minor > k1Minor { + return 1, nil + } else if k2Minor == k1Minor { + return 0, nil + } + } + + return -1, nil +} + +// Parses the kernel release string (obtained from GetKernelRelease()) and returns +// the major and minor numbers. 
+func ParseKernelRelease(rel string) (int, int, error) {
+	var (
+		major, minor int
+		err          error
+	)
+
+	splits := strings.SplitN(rel, ".", -1)
+	if len(splits) < 2 {
+		return -1, -1, fmt.Errorf("failed to parse kernel release %v", rel)
+	}
+
+	major, err = strconv.Atoi(splits[0])
+	if err != nil {
+		return -1, -1, fmt.Errorf("failed to parse kernel release %v", rel)
+	}
+
+	minor, err = strconv.Atoi(splits[1])
+	if err != nil {
+		return -1, -1, fmt.Errorf("failed to parse kernel release %v", rel)
+	}
+
+	return major, minor, nil
+}
+
+// Obtain location of kernel-headers for a given linux distro.
+func GetLinuxHeaderPath(distro string) (string, error) {
+
+	var path string
+
+	kernelRel, err := GetKernelRelease()
+	if err != nil {
+		return "", err
+	}
+
+	if distro == "redhat" || distro == "centos" || distro == "rocky" || distro == "almalinux" || distro == "fedora" || distro == "amzn" {
+		path = filepath.Join("/usr/src/kernels", kernelRel)
+	} else if distro == "arch" || distro == "flatcar" {
+		path = filepath.Join("/lib/modules", kernelRel, "build")
+	} else {
+		// All other distros appear to be following the "/usr/src/linux-headers-rel"
+		// naming convention.
+		kernelHdr := "linux-headers-" + kernelRel
+		path = filepath.Join("/usr/src", kernelHdr)
+	}
+
+	return path, nil
+}
+
+// KernelModSupported loads the given module (best effort) and returns true if it shows up as loaded in the kernel. 
+func KernelModSupported(mod string) (bool, error) {
+
+	// Load the module (best effort; presence is verified below)
+	exec.Command("modprobe", mod).Run()
+
+	// Check if the module is in the kernel
+	filename := "/proc/modules"
+
+	f, err := os.Open(filename)
+	if err != nil {
+		return false, err
+	}
+	defer f.Close()
+
+	s := bufio.NewScanner(f)
+	for s.Scan() {
+		if strings.HasPrefix(s.Text(), mod+" ") {
+			return true, nil
+		}
+	}
+	if err := s.Err(); err != nil {
+		return false, fmt.Errorf("failed to read %s: %s", filename, err)
+	}
+
+	return false, nil
+}
+
+// CreateUsernsProcess forks the current process into a new Linux
+// user-namespace, using the given ID mapping (common to both uid and
+// gid). Returns the pid of the new process and a "kill" function (so that the
+// caller can kill the child when desired). The new process executes the given
+// function.
+//
+// NOTE: adapted from github.com/containers/storage/drivers/overlay
+func CreateUsernsProcess(idMap *specs.LinuxIDMapping, execFunc func(), cwd string, newMountNs bool) (int, func(), error) {
+
+	currCwd, err := os.Getwd()
+	if err != nil {
+		return 0, nil, err
+	}
+
+	if err := os.Chdir(cwd); err != nil {
+		return 0, nil, err
+	}
+	defer os.Chdir(currCwd)
+
+	flags := unix.CLONE_NEWUSER | uintptr(unix.SIGCHLD)
+	if newMountNs {
+		flags = flags | unix.CLONE_NEWNS
+	}
+
+	pid, _, err2 := syscall.Syscall6(uintptr(unix.SYS_CLONE), flags, 0, 0, 0, 0, 0)
+	if err2 != 0 {
+		return -1, nil, err2
+	}
+
+	if pid == 0 {
+		// We are in the child; if our parent dies, ask the kernel to kill us
+		unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
+
+		// Wait for the parent to do the user-ns uid & gid mappings (and timeout in 3 secs)
+		readIDMapFile := func(fname string) (*specs.LinuxIDMapping, error) {
+			data, err := os.ReadFile(fname)
+			if err != nil {
+				return nil, err
+			}
+			fields := strings.Fields(string(data))
+			if len(fields) < 3 {
+				return nil, errors.New("invalid mapping")
+			}
+			containerID, _ := strconv.Atoi(fields[0])
+			
hostID, _ := strconv.Atoi(fields[1])
+			size, _ := strconv.Atoi(fields[2])
+
+			return &specs.LinuxIDMapping{
+				ContainerID: uint32(containerID),
+				HostID:      uint32(hostID),
+				Size:        uint32(size),
+			}, nil
+		}
+
+		mapFiles := []string{"uid_map", "gid_map"}
+
+		for _, f := range mapFiles {
+			foundMapping := false
+			for i := 0; i < 30; i++ {
+				m, err := readIDMapFile(fmt.Sprintf("/proc/self/%s", f))
+				if err != nil {
+					continue
+				}
+				if m.ContainerID == idMap.ContainerID &&
+					m.HostID == idMap.HostID &&
+					m.Size == idMap.Size {
+					foundMapping = true
+					break
+				}
+				time.Sleep(100 * time.Millisecond)
+			}
+			if !foundMapping {
+				os.Exit(1)
+			}
+		}
+
+		// Now execute the function we were given
+		execFunc()
+	}
+
+	childKillFunc := func() {
+		unix.Kill(int(pid), unix.SIGKILL)
+	}
+
+	// Write the user-ns mappings (the child is waiting for them)
+	writeMapping := func(fname string, idmap *specs.LinuxIDMapping) error {
+		mapping := fmt.Sprintf("%d %d %d\n", idmap.ContainerID, idmap.HostID, idmap.Size)
+		return ioutil.WriteFile(fmt.Sprintf("/proc/%d/%s", pid, fname), []byte(mapping), 0600)
+	}
+
+	if err := writeMapping("uid_map", idMap); err != nil {
+		childKillFunc()
+		return -1, nil, err
+	}
+
+	if err := writeMapping("gid_map", idMap); err != nil {
+		childKillFunc()
+		return -1, nil, err
+	}
+
+	return int(pid), childKillFunc, nil
+}
diff --git a/sysbox-libs/linuxUtils/linux_test.go b/sysbox-libs/linuxUtils/linux_test.go
new file mode 100644
index 00000000..74ecf678
--- /dev/null
+++ b/sysbox-libs/linuxUtils/linux_test.go
@@ -0,0 +1,148 @@
+//
+// Copyright 2020-2023 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package linuxUtils + +import ( + "testing" + + "github.com/spf13/afero" +) + +func TestMain(m *testing.M) { + appFs = afero.NewMemMapFs() + m.Run() +} + +func TestGetDistroPath(t *testing.T) { + type args struct { + rootfs string + } + + var s1 = `NAME="Ubuntu" +VERSION="20.04.1 LTS (Focal Fossa)" +ID=ubuntu +ID_LIKE=debian +PRETTY_NAME="Ubuntu 20.04.1 LTS" +VERSION_ID="20.04" +` + + var s2 = `NAME="Ubuntu" +VERSION="20.04.1 LTS (Focal Fossa)" +IDNO=ubuntu +ID_LIKE=debian +` + + var s3 = `NAME="Ubuntu" +IDubuntu +blah +` + + tests := []struct { + name string + args args + want string + wantErr bool + prepare func() + }{ + { + // Test-case 1: Primary os-release file with regular (/) path. + name: "1", + args: args{rootfs: "/"}, + want: "ubuntu", + wantErr: false, + prepare: func() { + + appFs.MkdirAll("/etc", 0755) + afero.WriteFile(appFs, "/etc/os-release", []byte(s1), 0644) + }, + }, + { + // Test-case 2: Primary os-release file with custom path. + name: "2", + args: args{"/var/lib/docker/rootfs"}, + want: "ubuntu", + wantErr: false, + prepare: func() { + + appFs.MkdirAll("/var/lib/docker/rootfs/etc", 0755) + afero.WriteFile(appFs, "/var/lib/docker/rootfs/etc/os-release", []byte(s1), 0644) + }, + }, + { + // Test-case 3: Secondary os-release file with custom path. 
+ name: "3", + args: args{"/var/lib/docker/rootfs"}, + want: "ubuntu", + wantErr: false, + prepare: func() { + + appFs.MkdirAll("/var/lib/docker/rootfs/usr/lib", 0755) + afero.WriteFile(appFs, "/var/lib/docker/rootfs/usr/lib/os-release", []byte(s1), 0644) + }, + }, + { + // Test-case 4: Bogus os-release file. Error expected. + name: "4", + args: args{"/"}, + want: "", + wantErr: true, + prepare: func() { + + appFs.MkdirAll("/etc", 0755) + afero.WriteFile(appFs, "/etc/os-release", []byte(s2), 0644) + }, + }, + { + // Test-case 5: Bogus os-release file. Error expected. + name: "5", + args: args{"/"}, + want: "", + wantErr: true, + prepare: func() { + + appFs.MkdirAll("/etc", 0755) + afero.WriteFile(appFs, "/etc/os-release", []byte(s3), 0644) + }, + }, + } + + // Testcase executions. + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + + // Wipe out memfs. + if err := appFs.RemoveAll("/"); err != nil { + t.Errorf("Couldn't clean memMapFs: %v", err) + return + } + + // Prepare the setup. + if tt.prepare != nil { + tt.prepare() + } + + got, err := GetDistroPath(tt.args.rootfs) + if (err != nil) != tt.wantErr { + t.Errorf("GetDistroPath() error = %v, wantErr %v", err, tt.wantErr) + return + } + if got != tt.want { + t.Errorf("GetDistroPath() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/sysbox-libs/mount/OSS_DISCLOSURE.md b/sysbox-libs/mount/OSS_DISCLOSURE.md new file mode 100644 index 00000000..5991bf7f --- /dev/null +++ b/sysbox-libs/mount/OSS_DISCLOSURE.md @@ -0,0 +1,6 @@ +# Open-Source Disclosures + +Package mount was copied from `github.com/opencontainers/runc/libcontainer/mount`. + +By placing it in `sysbox-lib/mount` we allow more Sysbox components to leverage +it without creating dependency cycles. 
diff --git a/sysbox-libs/mount/go.mod b/sysbox-libs/mount/go.mod new file mode 100644 index 00000000..50d3d0a3 --- /dev/null +++ b/sysbox-libs/mount/go.mod @@ -0,0 +1,5 @@ +module github.com/nestybox/sysbox-libs/mount + +go 1.21 + +require golang.org/x/sys v0.20.0 diff --git a/sysbox-libs/mount/go.sum b/sysbox-libs/mount/go.sum new file mode 100644 index 00000000..5d1e088e --- /dev/null +++ b/sysbox-libs/mount/go.sum @@ -0,0 +1,2 @@ +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/sysbox-libs/mount/mount.go b/sysbox-libs/mount/mount.go new file mode 100644 index 00000000..7fc65e9d --- /dev/null +++ b/sysbox-libs/mount/mount.go @@ -0,0 +1,55 @@ +package mount + +import ( + "fmt" +) + +// GetMounts retrieves a list of mounts for the current running process. +func GetMounts() ([]*Info, error) { + return parseMountTable() +} + +// GetMountsPid retrieves a list of mounts for the 'pid' process. +func GetMountsPid(pid uint32) ([]*Info, error) { + return parseMountTableForPid(pid) +} + +func FindMount(mountpoint string, mounts []*Info) bool { + for _, m := range mounts { + if m.Mountpoint == mountpoint { + return true + } + } + return false +} + +// MountedWithFs looks at /proc/self/mountinfo to determine if the specified +// mountpoint has been mounted with the given filesystem type. +func MountedWithFs(mountpoint string, fs string, mounts []*Info) (bool, error) { + + // Search the table for the mountpoint + for _, m := range mounts { + if m.Mountpoint == mountpoint && m.Fstype == fs { + return true, nil + } + } + return false, nil +} + +// GetMountAt returns information about the given mountpoint. 
+func GetMountAt(mountpoint string, mounts []*Info) (*Info, error) { + + // Search the table for the given mountpoint + for _, m := range mounts { + if m.Mountpoint == mountpoint { + return m, nil + } + } + return nil, fmt.Errorf("%s is not a mountpoint", mountpoint) +} + +// Converts the set of mount options (e.g., "rw", "nodev", etc.) to it's +// corresponding mount flags representation +func OptionsToFlags(opt []string) int { + return optToFlag(opt) +} diff --git a/sysbox-libs/mount/mount_linux.go b/sysbox-libs/mount/mount_linux.go new file mode 100644 index 00000000..d4d2fed9 --- /dev/null +++ b/sysbox-libs/mount/mount_linux.go @@ -0,0 +1,119 @@ +// +build linux + +package mount + +import ( + "bufio" + "fmt" + "io" + "os" + "strings" + + "golang.org/x/sys/unix" +) + +const ( + /* 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue + (1)(2)(3) (4) (5) (6) (7) (8) (9) (10) (11) + + (1) mount ID: unique identifier of the mount (may be reused after umount) + (2) parent ID: ID of parent (or of self for the top of the mount tree) + (3) major:minor: value of st_dev for files on filesystem + (4) root: root of the mount within the filesystem + (5) mount point: mount point relative to the process's root + (6) mount options: per mount options + (7) optional fields: zero or more fields of the form "tag[:value]" + (8) separator: marks the end of the optional fields + (9) filesystem type: name of filesystem of the form "type[.subtype]" + (10) mount source: filesystem specific information or "none" + (11) super options: per super block options*/ + mountinfoFormat = "%d %d %d:%d %s %s %s %s" +) + +var mountFlagsMap = map[string]int{ + "ro": unix.MS_RDONLY, + "nodev": unix.MS_NODEV, + "noexec": unix.MS_NOEXEC, + "nosuid": unix.MS_NOSUID, + "noatime": unix.MS_NOATIME, + "nodiratime": unix.MS_NODIRATIME, + "relatime": unix.MS_RELATIME, + "strictatime": unix.MS_STRICTATIME, + "sync": unix.MS_SYNCHRONOUS, +} + +// Parse /proc/self/mountinfo because 
comparing Dev and ino does not work from +// bind mounts +func parseMountTable() ([]*Info, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer f.Close() + + return parseInfoFile(f) +} + +// Same as above function but for a specific pid this time. +func parseMountTableForPid(pid uint32) ([]*Info, error) { + f, err := os.Open(fmt.Sprintf("/proc/%d/mountinfo", pid)) + if err != nil { + return nil, err + } + defer f.Close() + + return parseInfoFile(f) +} + +func parseInfoFile(r io.Reader) ([]*Info, error) { + var ( + s = bufio.NewScanner(r) + out = []*Info{} + ) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + var ( + p = &Info{} + text = s.Text() + optionalFields string + ) + + if _, err := fmt.Sscanf(text, mountinfoFormat, + &p.ID, &p.Parent, &p.Major, &p.Minor, + &p.Root, &p.Mountpoint, &p.Opts, &optionalFields); err != nil { + return nil, fmt.Errorf("Scanning '%s' failed: %s", text, err) + } + // Safe as mountinfo encodes mountpoints with spaces as \040. 
+ index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + if len(postSeparatorFields) < 3 { + return nil, fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + if optionalFields != "-" { + p.Optional = optionalFields + } + + p.Fstype = postSeparatorFields[0] + p.Source = postSeparatorFields[1] + p.VfsOpts = strings.Join(postSeparatorFields[2:], " ") + out = append(out, p) + } + return out, nil +} + +func optToFlag(opts []string) int { + flags := 0 + for _, opt := range opts { + f, ok := mountFlagsMap[opt] + if !ok { + continue + } + flags |= f + } + return flags +} diff --git a/sysbox-libs/mount/mount_test.go b/sysbox-libs/mount/mount_test.go new file mode 100644 index 00000000..e4ef6d38 --- /dev/null +++ b/sysbox-libs/mount/mount_test.go @@ -0,0 +1,77 @@ +package mount + +import ( + "testing" +) + +func TestGetMounts(t *testing.T) { + allMounts, err := GetMounts() + if err != nil { + t.Fatalf("GetMounts() failed: %v", err) + } + for _, m := range allMounts { + if m.Mountpoint == "/proc" { + if m.Fstype != "proc" { + t.Fatalf("GetMounts() failed: want type = proc, got %s", m.Fstype) + } + } + if m.Mountpoint == "/sys" { + if m.Fstype != "sysfs" { + t.Fatalf("GetMounts() failed: want type = sysfs, got %s", m.Fstype) + } + } + } +} + +func TestMountedWithFs(t *testing.T) { + allMounts, err := GetMounts() + if err != nil { + t.Fatalf("GetMounts() failed: %v", err) + } + + ok, err := MountedWithFs("/proc", "proc", allMounts) + if err != nil || !ok { + t.Fatalf("MountedWithFs() failed: %v, %v", ok, err) + } + ok, err = MountedWithFs("/sys", "sysfs", allMounts) + if err != nil || !ok { + t.Fatalf("MountedWithFs() failed: %v, %v", ok, err) + } + + // negative testing + ok, err = MountedWithFs("/proc", "sysfs", allMounts) + if err != nil || ok { + t.Fatalf("MountedWithFs() failed: %v, %v", ok, err) + } + ok, err = MountedWithFs("/sys", "procfs", allMounts) + if err != nil || ok { + t.Fatalf("MountedWithFs() failed: 
%v, %v", ok, err) + } +} + +func TestGetMountAt(t *testing.T) { + allMounts, err := GetMounts() + if err != nil { + t.Fatalf("GetMounts() failed: %v", err) + } + + m, err := GetMountAt("/proc", allMounts) + if err != nil { + t.Fatalf("GetMountAt() failed: %v", err) + } + if m.Mountpoint == "/proc" { + if m.Fstype != "proc" { + t.Fatalf("GetMountAt() failed: want type = proc, got %s", m.Fstype) + } + } + + m, err = GetMountAt("/sys", allMounts) + if err != nil { + t.Fatalf("GetMountAt() failed: %v", err) + } + if m.Mountpoint == "/sys" { + if m.Fstype != "sysfs" { + t.Fatalf("GetMountAt() failed: want type = sysfs, got %s", m.Fstype) + } + } +} diff --git a/sysbox-libs/mount/mountinfo.go b/sysbox-libs/mount/mountinfo.go new file mode 100644 index 00000000..e3fc3535 --- /dev/null +++ b/sysbox-libs/mount/mountinfo.go @@ -0,0 +1,40 @@ +package mount + +// Info reveals information about a particular mounted filesystem. This +// struct is populated from the content in the /proc//mountinfo file. +type Info struct { + // ID is a unique identifier of the mount (may be reused after umount). + ID int + + // Parent indicates the ID of the mount parent (or of self for the top of the + // mount tree). + Parent int + + // Major indicates one half of the device ID which identifies the device class. + Major int + + // Minor indicates one half of the device ID which identifies a specific + // instance of device. + Minor int + + // Root of the mount within the filesystem. + Root string + + // Mountpoint indicates the mount point relative to the process's root. + Mountpoint string + + // Opts represents mount-specific options. + Opts string + + // Optional represents optional fields. + Optional string + + // Fstype indicates the type of filesystem, such as EXT3. + Fstype string + + // Source indicates filesystem specific information or "none". + Source string + + // VfsOpts represents per super block options. 
+ VfsOpts string +} diff --git a/sysbox-libs/overlayUtils/go.mod b/sysbox-libs/overlayUtils/go.mod new file mode 100644 index 00000000..ff3a8722 --- /dev/null +++ b/sysbox-libs/overlayUtils/go.mod @@ -0,0 +1,11 @@ +module github.com/nestybox/sysbox-libs/overlayUtils + +go 1.22 + +toolchain go1.22.6 + +require ( + github.com/deckarep/golang-set v1.8.0 + github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98 + golang.org/x/sys v0.26.0 +) diff --git a/sysbox-libs/overlayUtils/go.sum b/sysbox-libs/overlayUtils/go.sum new file mode 100644 index 00000000..7f2a3dbb --- /dev/null +++ b/sysbox-libs/overlayUtils/go.sum @@ -0,0 +1,6 @@ +github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4= +github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo= +github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98 h1:Xc+SFGUvahor6vCYrbwMqGmH+6iXK15rzAigyDXWBLU= +github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98/go.mod h1:Sg6f4enTImsnsA0RET+BSVQzelR23WiI4p3rHxOu54w= +golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo= +golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= diff --git a/sysbox-libs/overlayUtils/overlayUtils.go b/sysbox-libs/overlayUtils/overlayUtils.go new file mode 100644 index 00000000..67641718 --- /dev/null +++ b/sysbox-libs/overlayUtils/overlayUtils.go @@ -0,0 +1,119 @@ +// +// Copyright 2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Utilities for dealing with Linux's overlay fs + +package overlayUtils + +import ( + "fmt" + "strings" + + mapset "github.com/deckarep/golang-set" + "github.com/nestybox/sysbox-libs/mount" + "golang.org/x/sys/unix" +) + +type MountOpts struct { + Opts string + Flags int + PropFlags int +} + +// GetMountOpt returns the mount options string, mount flags, and mount +// propagation flags of the overlayfs mount at the given path. +func GetMountOpt(mi *mount.Info) *MountOpts { + + currMntOpts := mapset.NewSet() + for _, opt := range strings.Split(mi.Opts, ",") { + currMntOpts.Add(opt) + } + + currVfsOpts := mapset.NewSet() + for _, opt := range strings.Split(mi.VfsOpts, ",") { + currVfsOpts.Add(opt) + } + + // The vfs opts reported by mountinfo are a combination of per superblock + // mount opts and the overlayfs-specific data; we need to separate these so + // we can do the mount properly. 
+ properMntOpts := mapset.NewSetFromSlice([]interface{}{ + "ro", "rw", "nodev", "noexec", "nosuid", "noatime", "nodiratime", "relatime", "strictatime", "sync", + }) + + newMntOpts := currVfsOpts.Intersect(properMntOpts) + newVfsOpts := currVfsOpts.Difference(properMntOpts) + + // Convert the mount options to the mount flags + newMntOptsString := []string{} + for _, opt := range newMntOpts.ToSlice() { + newMntOptsString = append(newMntOptsString, fmt.Sprintf("%s", opt)) + } + mntFlags := mount.OptionsToFlags(newMntOptsString) + + // Convert the vfs option set to the mount data string + newVfsOptsString := "" + for i, opt := range newVfsOpts.ToSlice() { + if i != 0 { + newVfsOptsString += "," + } + newVfsOptsString += fmt.Sprintf("%s", opt) + } + + // Get the mount propagation flags + propFlags := 0 + + if strings.Contains(mi.Optional, "shared") { + propFlags |= unix.MS_SHARED + } else if strings.Contains(mi.Optional, "master") { + propFlags |= unix.MS_SLAVE + } else if strings.Contains(mi.Optional, "unbindable") { + propFlags |= unix.MS_UNBINDABLE + } else { + propFlags |= unix.MS_PRIVATE + } + + mntOpts := &MountOpts{ + Opts: newVfsOptsString, + Flags: mntFlags, + PropFlags: propFlags, + } + + return mntOpts +} + +func GetLowerLayers(mntOpts *MountOpts) []string { + lowerStr := "" + opts := strings.Split(mntOpts.Opts, ",") + for _, opt := range opts { + if strings.HasPrefix(opt, "lowerdir=") { + lowerStr = strings.TrimPrefix(opt, "lowerdir=") + break + } + } + + return strings.Split(lowerStr, ":") +} + +func GetUpperLayer(mntOpts *MountOpts) string { + opts := strings.Split(mntOpts.Opts, ",") + for _, opt := range opts { + if strings.HasPrefix(opt, "upperdir=") { + return strings.TrimPrefix(opt, "upperdir=") + } + } + return "" +} diff --git a/sysbox-libs/pidfd/go.mod b/sysbox-libs/pidfd/go.mod new file mode 100644 index 00000000..ebe138b9 --- /dev/null +++ b/sysbox-libs/pidfd/go.mod @@ -0,0 +1,5 @@ +module github.com/nestybox/sysbox-libs/pidfd + +go 1.21 + 
+toolchain go1.21.0 diff --git a/sysbox-libs/pidfd/pidfd.go b/sysbox-libs/pidfd/pidfd.go new file mode 100644 index 00000000..6cc84011 --- /dev/null +++ b/sysbox-libs/pidfd/pidfd.go @@ -0,0 +1,74 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Package pidfd provides pidfd_open, pidfd_getfd, pidfd_send_signal support on linux 5.6+. +// +// pidfd_send_signal() --> kernel 5.1+ +// pidfd_open() --> kernel 5.3+ +// pidfd_getfd() --> kernel 5.6+ +// +// Sysbox is currently only using pidfd_open(). + +package pidfd + +import "syscall" + +const ( + sys_pidfd_send_signal = 424 + sys_pidfd_open = 434 + sys_pidfd_getfd = 438 +) + +// PidFd, a file descriptor that refers to a process. +type PidFd int + +// Open obtains a file descriptor that refers to a process. +// +// The flags argument is reserved for future use; currently, this argument must be specified as 0. +func Open(pid int, flags uint) (PidFd, error) { + fd, _, errno := syscall.Syscall(sys_pidfd_open, uintptr(pid), uintptr(flags), 0) + if errno != 0 { + return 0, errno + } + + return PidFd(fd), nil +} + +// GetFd obtain a duplicate of another process's file descriptor. +// +// The flags argument is reserved for future use; currently, this argument must be specified as 0. 
+func (fd PidFd) GetFd(targetfd int, flags uint) (int, error) { + newfd, _, errno := syscall.Syscall(sys_pidfd_getfd, uintptr(fd), uintptr(targetfd), uintptr(flags)) + + if errno != 0 { + return 0, errno + } + + return int(newfd), nil +} + +// SendSignal send a signal to a process specified by a PidFd. +// +// The flags argument is reserved for future use; currently, this argument must be specified as 0. +func (fd PidFd) SendSignal(signal syscall.Signal, flags uint) error { + _, _, errno := syscall.Syscall6(sys_pidfd_send_signal, uintptr(fd), uintptr(signal), 0, uintptr(flags), 0, 0) + + if errno != 0 { + return errno + } + + return nil +} diff --git a/sysbox-libs/pidmonitor/go.mod b/sysbox-libs/pidmonitor/go.mod new file mode 100644 index 00000000..66fcb1e3 --- /dev/null +++ b/sysbox-libs/pidmonitor/go.mod @@ -0,0 +1,12 @@ +module github.com/nestybox/sysbox-libs/pidmonitor + +go 1.21 + +toolchain go1.21.0 + +require github.com/sirupsen/logrus v1.4.2 + +require ( + github.com/konsorten/go-windows-terminal-sequences v1.0.1 // indirect + golang.org/x/sys v0.0.0-20190422165155-953cdadca894 // indirect +) diff --git a/sysbox-libs/pidmonitor/go.sum b/sysbox-libs/pidmonitor/go.sum new file mode 100644 index 00000000..706fe8d5 --- /dev/null +++ b/sysbox-libs/pidmonitor/go.sum @@ -0,0 +1,13 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/konsorten/go-windows-terminal-sequences v1.0.1 h1:mweAR1A6xJ3oS2pRaGiHgQ4OO8tzTaLawm8vnODuwDk= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4= +github.com/sirupsen/logrus 
v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894 h1:Cz4ceDQGXuKRnVBDTS23GTn/pU5OE2C0WrNTOYK1Uuc= +golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/sysbox-libs/pidmonitor/monitor.go b/sysbox-libs/pidmonitor/monitor.go new file mode 100644 index 00000000..8b425148 --- /dev/null +++ b/sysbox-libs/pidmonitor/monitor.go @@ -0,0 +1,105 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package pidmonitor + +import ( + "fmt" + "os" + "time" +) + +type cmd int + +const ( + stop cmd = iota +) + +// Monitors events associated with the given PidMon instance +func pidMonitor(pm *PidMon) { + + for { + eventList := []PidEvent{} + rmList := []PidEvent{} + + // handle incoming commands first + select { + case cmd := <-pm.cmdCh: + if cmd == stop { + pm.EventCh <- eventList + return + } + default: + } + + // perform monitoring action + pm.mu.Lock() + for pid, evect := range pm.eventTable { + if eventIsSet(evect, Exit) { + pidAlive, err := pidExists(pid) + if err != nil || !pidAlive { + + eventList = append(eventList, PidEvent{ + Pid: pid, + Event: Exit, + Err: err, + }) + + // pid exit implies event won't hit again; remove it. + rmList = append(rmList, PidEvent{pid, Exit, nil}) + } + } + } + + // release the lock so that we don't hold it while sending the event list + // (in case the event channel is blocked); this way new events can + // continue to be added. + pm.mu.Unlock() + + // send event list + if len(eventList) > 0 { + pm.EventCh <- eventList + } + + // remove events that won't hit any more + pm.mu.Lock() + for _, e := range rmList { + eventTableRm(pm.eventTable, e) + } + pm.mu.Unlock() + + // wait for the poll period + time.Sleep(pm.cfg.Poll * time.Millisecond) + } +} + +// Checks if a process with the given pid exists. +func pidExists(pid uint32) (bool, error) { + + // Our current checking mechanism is very simple but not the best; in the future, we + // should consider replacing it with the newly added pidfd_* syscalls in Linux. 
+ + path := fmt.Sprintf("/proc/%d", pid) + + _, err := os.Stat(path) + if os.IsNotExist(err) { + return false, nil + } else if err != nil { + return false, err + } + + return true, nil +} diff --git a/sysbox-libs/pidmonitor/pidmon.go b/sysbox-libs/pidmonitor/pidmon.go new file mode 100644 index 00000000..71c8a47a --- /dev/null +++ b/sysbox-libs/pidmonitor/pidmon.go @@ -0,0 +1,120 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// The pidmon package allows a process to get notificaitons on events associated with +// other processes. + +package pidmonitor + +import ( + "fmt" + "sync" + "time" +) + +// pidMon configuration info +type Cfg struct { + Poll time.Duration // polling time, in ms +} + +// polling config limits (in ms) +const ( + PollMin = 1 + PollMax = 1000 +) + +// Pid event types (bit-vector) +const ( + Exit int = 0x1 // Process exited +) + +// Represents an event on the given process +type PidEvent struct { + Pid uint32 + Event int // bit vector of events + Err error // set by WaitEvent() when an error is detected +} + +// Represents a pid monitor instance +type PidMon struct { + mu sync.Mutex + cfg *Cfg + eventTable map[uint32]int // maps each pid to it's event vector + cmdCh chan cmd // sends commands to monitor thread + EventCh chan []PidEvent // receives events from monitor thread +} + +// Creates a instance of the pid monitor; returns the pidMon ID. 
+func New(cfg *Cfg) (*PidMon, error) { + + if err := validateCfg(cfg); err != nil { + return nil, err + } + + pm := &PidMon{ + cfg: cfg, + eventTable: make(map[uint32]int), + cmdCh: make(chan cmd), + EventCh: make(chan []PidEvent, 100), // buffered to prevent monitor thread from blocking when pushing events + } + + go pidMonitor(pm) + + return pm, nil +} + +// Adds one or more events to the list of events monitored by the given pidMon +func (pm *PidMon) AddEvent(events []PidEvent) error { + + for _, e := range events { + if !validateEvent(e.Event) { + return fmt.Errorf("Unknown event %v", e.Event) + } + pm.mu.Lock() + eventTableAdd(pm.eventTable, e) + pm.mu.Unlock() + } + + return nil +} + +// Removes one or more events from the list of events monitored by the given pidMon +func (pm *PidMon) RemoveEvent(events []PidEvent) error { + + for _, e := range events { + if !validateEvent(e.Event) { + return fmt.Errorf("Unknown event %v", e.Event) + } + pm.mu.Lock() + eventTableRm(pm.eventTable, e) + pm.mu.Unlock() + } + + return nil +} + +// Blocks the calling process until the given pidMon detects an event in one or more of +// the processes it's monitoring. Returns the list of events. +func (pm *PidMon) WaitEvent() []PidEvent { + eventList := <-pm.EventCh + return eventList +} + +// Stops the given pidMon. Causes WaitEvent() to return immediately (likely +// with an empty pid list). +func (pm *PidMon) Close() { + pm.cmdCh <- stop +} diff --git a/sysbox-libs/pidmonitor/pidmon_test.go b/sysbox-libs/pidmonitor/pidmon_test.go new file mode 100644 index 00000000..90f8c3a7 --- /dev/null +++ b/sysbox-libs/pidmonitor/pidmon_test.go @@ -0,0 +1,358 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package pidmonitor + +import ( + "fmt" + "math/rand" + "os" + "os/exec" + "sort" + "testing" + "time" + + log "github.com/sirupsen/logrus" +) + +func init() { + //log.SetLevel(log.DebugLevel) +} + +func pidListEqual(a, b []int) bool { + if len(a) != len(b) { + return false + } + + sort.Ints(a) + sort.Ints(b) + + for i, pid := range a { + if b[i] != pid { + return false + } + } + return true +} + +func eventListSort(a []PidEvent) { + sort.SliceStable(a, func(i, j int) bool { + return a[i].Pid < a[j].Pid + }) +} + +func eventListEqual(a, b []PidEvent) bool { + if len(a) != len(b) { + return false + } + + eventListSort(a) + eventListSort(b) + + for i, event := range a { + if b[i] != event { + return false + } + } + return true +} + +func TestAddAndRemoveEvent(t *testing.T) { + + pidMonCfg := &Cfg{ + Poll: 500, + } + + pidMon, err := New(pidMonCfg) + if err != nil { + t.Fatalf("New() failed: %s", err) + } + defer pidMon.Close() + + events := []PidEvent{ + {Pid: 1, Event: Exit}, + {Pid: 2, Event: Exit}, + } + + // verify Add + if err := pidMon.AddEvent(events); err != nil { + t.Errorf("AddEvent() failed: %s\n", err) + } + + for _, e := range events { + evect, found := pidMon.eventTable[e.Pid] + if !found || evect != Exit { + t.Errorf("AddEvent() failed: pid = %d, found = %v, evect = %x\n", e.Pid, found, evect) + } + } + + // verify Remove + if err := pidMon.RemoveEvent(events); err != nil { + t.Errorf("RemoveEvent() failed: %s\n", err) + } + + for _, e := range events { + _, found := pidMon.eventTable[e.Pid] + if found { + 
t.Errorf("RemoveEvent() failed: pid = %d, found = %v\n", e.Pid, found) + } + } + +} + +// spawns the given number of dummy processes; returns their pids. +func spawnDummyProcesses(num int) ([]int, error) { + var err error + + pids := []int{} + for i := 0; i < num; i++ { + cmd := exec.Command("tail", "-f", "/dev/null") + if err = cmd.Start(); err != nil { + break + } + pids = append(pids, cmd.Process.Pid) + } + + if err != nil { + killDummyProcesses(pids) + return nil, err + } + + return pids, nil +} + +// kills the processes with the given pids. +func killDummyProcesses(pids []int) error { + for _, pid := range pids { + proc, err := os.FindProcess(pid) + if err != nil { + return fmt.Errorf("failed to find pid %d\n", pid) + } + // kill + if err = proc.Kill(); err != nil { + return fmt.Errorf("failed to kill pid %d\n", pid) + } + // reap + _, err = proc.Wait() + if err != nil { + return fmt.Errorf("failed to reap pid %d\n", pid) + } + } + return nil +} + +func waitAndCheckEvent(t *testing.T, numProc int, pidMon *PidMon, want []PidEvent, resultCh chan error) { + + eventList := []PidEvent{} + for { + pidEvents := pidMon.WaitEvent() + eventList = append(eventList, pidEvents...) 
+ if len(eventList) >= numProc { + break + } + } + + if !eventListEqual(want, eventList) { + resultCh <- fmt.Errorf("pidMon.Wait() failed: want %+v, got %+v\n", want, eventList) + return + } + + resultCh <- nil +} + +func TestEventExit(t *testing.T) { + + numProc := 10 + + pidMonCfg := &Cfg{ + Poll: 100, + } + + pidMon, err := New(pidMonCfg) + if err != nil { + t.Fatalf("New() failed: %s", err) + } + defer pidMon.Close() + + pidList, err := spawnDummyProcesses(numProc) + if err != nil { + t.Fatalf("spawnDummyProcesses() failed: %s\n", err) + } + + // create the event monitor list + eventList := []PidEvent{} + for _, pid := range pidList { + eventList = append(eventList, PidEvent{uint32(pid), Exit, nil}) + } + + resultCh := make(chan error) + + go waitAndCheckEvent(t, numProc, pidMon, eventList, resultCh) + + if err := pidMon.AddEvent(eventList); err != nil { + t.Fatalf("AddEvent() failed: %s\n", err) + } + + // wait a bit such that the process kill occurs concurrently with the monitor checking + // (otherwise the processes will likely be all killed before the monitor knows that it + // has to check for them) + time.Sleep(500 * time.Millisecond) + + // trigger process exit event + if err := killDummyProcesses(pidList); err != nil { + t.Fatalf("KillDummyProcesss() failed: %s\n", err) + } + + // wait for event checker to be done + if err := <-resultCh; err != nil { + t.Fatalf("Event failed: %s", err) + } +} + +// +// The following functions are used by the TestEventExitConcurrent() test +// + +// Spawns up to numProc processes at random intervals +func spawner(t *testing.T, numProc int, startCh chan bool, spawnedCh chan []int) { + src := rand.NewSource(time.Now().UnixNano()) + random := rand.New(src) + + <-startCh + + log.Debugf("spawner: started ...\n") + + for i := 0; i < numProc; i++ { + pidList, err := spawnDummyProcesses(1) + if err != nil { + t.Fatalf("spawnDummyProcesses() failed: %s\n", err) + } + + spawnedCh <- pidList + + log.Debugf("spawner: spawned %v\n", 
pidList) + + delay := random.Intn(10) + time.Sleep(time.Duration(delay) * time.Millisecond) + } +} + +// Kills spawned processes at random intervals +func killer(t *testing.T, numProc int, pidMon *PidMon, spawnedCh, killedCh chan []int) { + src := rand.NewSource(time.Now().UnixNano()) + random := rand.New(src) + + killedList := []int{} + + for { + // Listen to spawner + spawnedList := <-spawnedCh + + // Tell pidMon to watch for exit event on the spawned processes + eventList := []PidEvent{} + for _, pid := range spawnedList { + eventList = append(eventList, PidEvent{uint32(pid), Exit, nil}) + } + if err := pidMon.AddEvent(eventList); err != nil { + t.Fatalf("AddEvent() failed: %s\n", err) + } + + // Kill the processes + for _, pid := range spawnedList { + if err := killDummyProcesses([]int{pid}); err != nil { + t.Fatalf("KillDummyProcesss() failed: %s\n", err) + } + delay := random.Intn(10) + time.Sleep(time.Duration(delay) * time.Millisecond) + } + + log.Debugf("killer: killed %v\n", spawnedList) + + killedList = append(killedList, spawnedList...) 
+ + if len(killedList) >= numProc { + break + } + } + + killedCh <- killedList +} + +// Waits for the pid monitor events +func waiter(t *testing.T, numProc int, pidMon *PidMon, eventCh chan []int) { + src := rand.NewSource(time.Now().UnixNano()) + random := rand.New(src) + + eventList := []int{} + + for { + pidEvents := pidMon.WaitEvent() + + log.Debugf("waiter: events %v\n", pidEvents) + + for _, e := range pidEvents { + if e.Event != Exit { + t.Fatalf("pidMon reported non-exit event: pid = %d, event = %x\n", e.Pid, e.Event) + } + eventList = append(eventList, int(e.Pid)) + } + + if len(eventList) >= numProc { + break + } + + delay := random.Intn(10) + time.Sleep(time.Duration(delay) * time.Millisecond) + } + + eventCh <- eventList +} + +func TestEventExitConcurrent(t *testing.T) { + + numProc := 100 + + pidMonCfg := &Cfg{ + Poll: 50, + } + + pidMon, err := New(pidMonCfg) + if err != nil { + t.Fatalf("New() failed: %s", err) + } + defer pidMon.Close() + + // create spawner, killer, waiter threads + startCh := make(chan bool) + spawnedCh := make(chan []int, 100) + killedCh := make(chan []int, 100) + eventCh := make(chan []int, 100) + + go spawner(t, numProc, startCh, spawnedCh) + go killer(t, numProc, pidMon, spawnedCh, killedCh) + go waiter(t, numProc, pidMon, eventCh) + + // start spawning + startCh <- true + + // wait for killer and checker to finish + killedList := <-killedCh + eventList := <-eventCh + + if !pidListEqual(eventList, killedList) { + t.Fatalf("event list does not match kill list: events: %+v; killed: %+v\n", eventList, killedList) + } +} diff --git a/sysbox-libs/pidmonitor/test.sh b/sysbox-libs/pidmonitor/test.sh new file mode 100755 index 00000000..090e0268 --- /dev/null +++ b/sysbox-libs/pidmonitor/test.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# +# Runs the pidmonitor unit tests for a given number of iterations. 
+# Usage: test.sh <number-of-iterations>
+// + +package pidmonitor + +import "fmt" + +func validateCfg(cfg *Cfg) error { + if cfg.Poll < PollMin || cfg.Poll > PollMax { + return fmt.Errorf("invalid config: poll time must be in range [%d, %d]; found %d", PollMin, PollMax, cfg.Poll) + } + return nil +} + +func validateEvent(event int) bool { + return event == Exit +} + +func eventSet(evect int, etype int) int { + return evect | etype +} + +func eventClear(evect int, etype int) int { + return evect &^ etype +} + +func eventIsSet(evect int, etype int) bool { + return evect&etype == etype +} + +func eventTableAdd(t map[uint32]int, e PidEvent) { + pid := e.Pid + pidEvent := e.Event + + evect, found := t[pid] + if !found { + t[pid] = pidEvent + } else { + t[pid] = eventSet(evect, pidEvent) + } +} + +func eventTableRm(t map[uint32]int, e PidEvent) { + pid := e.Pid + pidEvent := e.Event + + evect, found := t[pid] + if found { + evect = eventClear(evect, pidEvent) + if evect == 0 { + delete(t, pid) + } else { + t[pid] = evect + } + } +} diff --git a/sysbox-libs/shiftfs/.gitignore b/sysbox-libs/shiftfs/.gitignore new file mode 100644 index 00000000..c56069fe --- /dev/null +++ b/sysbox-libs/shiftfs/.gitignore @@ -0,0 +1 @@ +*.test \ No newline at end of file diff --git a/sysbox-libs/shiftfs/go.mod b/sysbox-libs/shiftfs/go.mod new file mode 100644 index 00000000..94e79a96 --- /dev/null +++ b/sysbox-libs/shiftfs/go.mod @@ -0,0 +1,24 @@ +module github.com/nestybox/sysbox-libs/shiftfs + +go 1.21 + +require ( + github.com/nestybox/sysbox-libs/linuxUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/mount v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 + github.com/opencontainers/runtime-spec v1.0.2 + github.com/sirupsen/logrus v1.9.0 + golang.org/x/sys v0.20.0 + gopkg.in/hlandau/service.v1 v1.0.7 +) + +require ( + github.com/spf13/afero v1.4.1 // indirect + golang.org/x/text v0.3.8 // indirect +) + +replace ( + 
github.com/nestybox/sysbox-libs/linuxUtils => ../linuxUtils + github.com/nestybox/sysbox-libs/mount => ../mount + github.com/nestybox/sysbox-libs/utils => ../utils +) diff --git a/sysbox-libs/shiftfs/go.sum b/sysbox-libs/shiftfs/go.sum new file mode 100644 index 00000000..565b4574 --- /dev/null +++ b/sysbox-libs/shiftfs/go.sum @@ -0,0 +1,37 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod 
h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.20.0 h1:Od9JTbYCk261bKm4M/mw7AklTlFYIa0bIp9BgSm1S8Y= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.8 h1:nAL+RVCQ9uMn3vJZbV+MRnydTJFPf8qqY42YiA6MrqY= +golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/hlandau/service.v1 v1.0.7 h1:16G5AJ1Cp8Vr65QItJXpyAIzf/FWAWCZBsTgsc6eyA8= +gopkg.in/hlandau/service.v1 v1.0.7/go.mod h1:sZw6ksxcoafC04GoZtw32UeqqEuPSABX35lVBaJP/bE= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/sysbox-libs/shiftfs/shiftfs.go b/sysbox-libs/shiftfs/shiftfs.go new file mode 100644 index 00000000..874d22b1 --- /dev/null +++ b/sysbox-libs/shiftfs/shiftfs.go @@ -0,0 +1,265 @@ +// +// Copyright 2023 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package shiftfs + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "syscall" + + "github.com/nestybox/sysbox-libs/linuxUtils" + "github.com/nestybox/sysbox-libs/mount" + "github.com/nestybox/sysbox-libs/utils" + specs "github.com/opencontainers/runtime-spec/specs-go" + setxid "gopkg.in/hlandau/service.v1/daemon/setuid" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const SHIFTFS_MAGIC int64 = 0x6a656a62 + +// Describes a shiftfs mount point +type MountPoint struct { + Source string + Readonly bool +} + +// Mark performs a shiftfs mark-mount for path on the given markPath +// (e.g., Mark("/a/b", "/c/d") causes "b" to be mounted on "d" and +// "d" to have a shiftfs mark). +func Mark(path, markPath string) error { + if err := unix.Mount(path, markPath, "shiftfs", 0, "mark"); err != nil { + return fmt.Errorf("failed to mark shiftfs on %s at %s: %v", path, markPath, err) + } + return nil +} + +// Mount performs a shiftfs mount on the given path; the path must have a +// shiftfs mark on it already (e.g., Mount("/c/d", "/x/y") requires that +// "d" have a shiftfs mark on it and causes "d" to be mounted on "y" and +// "y" to have a shiftfs mount). 
+func Mount(path, mntPath string) error { + if err := unix.Mount(path, mntPath, "shiftfs", 0, ""); err != nil { + return fmt.Errorf("failed to mount shiftfs on %s at %s: %v", path, mntPath, err) + } + return nil +} + +// Unmount perform a shiftfs unmount on the given path. The path must have +// a shiftfs mark or mount on it. +func Unmount(path string) error { + if err := unix.Unmount(path, unix.MNT_DETACH); err != nil { + return fmt.Errorf("failed to unmount %s: %v", path, err) + } + return nil +} + +// Returns a boolean indicating if the given path has a shiftfs mount +// on it (mark or actual mount). +func Mounted(path string, mounts []*mount.Info) (bool, error) { + realPath, err := filepath.EvalSymlinks(path) + if err != nil { + return false, err + } + + return mount.MountedWithFs(realPath, "shiftfs", mounts) +} + +// ShiftfsSupported checks if shiftfs is supported on the host. +func ShiftfsSupported(dir string) (bool, error) { + logrus.Debugf("Running shiftfs check on host.") + return runShiftfsCheckOnHost(dir, false) +} + +// ShiftfsSupported checks if shiftfs-on-overlayfs is supported on the host. +func ShiftfsSupportedOnOverlayfs(dir string) (bool, error) { + logrus.Debugf("Running shiftfs-on-overlayfs check on host.") + return runShiftfsCheckOnHost(dir, true) +} + +// runShiftfsCheckOnHost runs a quick test on the host to check if shiftfs is +// supported. dir is the path where the test will run, and checkOnOverlayfs +// indicates if the test should check shiftfs-on-overlayfs. 
+func runShiftfsCheckOnHost(dir string, checkOnOverlayfs bool) (bool, error) { + usernsUid := 165536 + + shiftfsModPresent, err := linuxUtils.KernelModSupported("shiftfs") + if err != nil { + return false, err + } + + if !shiftfsModPresent { + return false, nil + } + + logrus.Debugf("- shiftfs check: found shiftfs module.") + + fsName, err := utils.GetFsName(dir) + if err != nil { + return false, err + } + + if fsName == "overlayfs" || fsName == "tmpfs" { + return false, fmt.Errorf("test dir (%s) must not be on overlayfs or tmpfs", dir) + } + + tmpDir, err := os.MkdirTemp(dir, "sysbox-shiftfs-check") + if err != nil { + return false, err + } + defer func() { + os.RemoveAll(tmpDir) + }() + + if err := os.Chmod(tmpDir, 0755); err != nil { + return false, err + } + + testDir := filepath.Join(tmpDir, "test") + if err := os.Mkdir(testDir, 0755); err != nil { + return false, err + } + + if err := os.Chown(testDir, usernsUid, usernsUid); err != nil { + return false, err + } + + logrus.Debugf("- shiftfs check: test dir = %s (%s)", testDir, fsName) + + if checkOnOverlayfs { + lowerDir := filepath.Join(tmpDir, "lower") + upperDir := filepath.Join(tmpDir, "upper") + workDir := filepath.Join(tmpDir, "work") + + dirs := []string{lowerDir, upperDir, workDir} + for _, dir := range dirs { + if err := os.Mkdir(dir, 0755); err != nil { + return false, err + } + } + + opts := fmt.Sprintf("lowerdir=%s,upperdir=%s,workdir=%s", lowerDir, upperDir, workDir) + flags := uintptr(0) + if err := unix.Mount("overlay", testDir, "overlay", flags, opts); err != nil { + return false, err + } + defer unix.Unmount(testDir, unix.MNT_DETACH) + + logrus.Debugf("- shiftfs check: mounted overlayfs on %s", testDir) + } + + // Create the shiftfs mark on the test dir + if err := Mark(testDir, testDir); err != nil { + return false, err + } + defer Unmount(testDir) + + logrus.Debugf("- shiftfs check: marked shiftfs on %s", testDir) + + // Since shiftfs only makes sense within a user-ns, we will fork a child + 
// process into a new user-ns and have it mount shiftfs and verify it + // works. execFunc is the function the child will execute. + execFunc := func() { + logrus.Debugf("- shiftfs check: execFunc: running") + + logrus.Debugf("- shiftfs check: execFunc: lock OS thread") + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + // Make ourselves root within the user ns + logrus.Debugf("- shiftfs check: execFunc: setresuid") + if err := setxid.Setresuid(0, 0, 0); err != nil { + logrus.Debugf("- shiftfs check: execFunc: failed: %v", err) + os.Exit(1) + } + logrus.Debugf("- shiftfs check: execFunc: setresgid") + if err := setxid.Setresgid(0, 0, 0); err != nil { + logrus.Debugf("- shiftfs check: execFunc: failed: %v", err) + os.Exit(1) + } + + logrus.Debugf("- shiftfs check: execFunc: mounting shiftfs on %s", testDir) + if err := Mount(testDir, testDir); err != nil { + logrus.Debugf("- shiftfs check: execFunc: failed: %v", err) + os.Exit(2) + } + + testfile := filepath.Join(testDir, "testfile") + testfile2 := filepath.Join(testDir, "testfile2") + + logrus.Debugf("- shiftfs check: execFunc: creating file %s", testfile) + _, err := os.Create(testfile) + if err != nil { + logrus.Debugf("- shiftfs check: execFunc: failed: %v", err) + os.Exit(3) + } + + // This operation will fail with EOVERFLOW if shiftfs is buggy in the kernel + logrus.Debugf("- shiftfs check: execFunc: renaming file %s to %s", testfile, testfile2) + if err := os.Rename(testfile, testfile2); err != nil { + logrus.Debugf("- shiftfs check: execFunc: failed: %v", err) + os.Remove(testfile) + os.Exit(4) + } + + logrus.Debugf("- shiftfs check: execFunc: removing file %s", testfile2) + os.Remove(testfile2) + + logrus.Debugf("- shiftfs check: execFunc: success") + os.Exit(0) + } + + // Fork the child process into a new user-ns (and mount-ns too) + idmap := &specs.LinuxIDMapping{ + ContainerID: 0, + HostID: uint32(usernsUid), + Size: 65536, + } + + pid, _, err := linuxUtils.CreateUsernsProcess(idmap, execFunc, 
testDir, true) + if err != nil { + return false, err + } + + logrus.Debugf("- shiftfs check: spawning child process (%d) into user-ns", pid) + + // Wait for the child process to exit + var wstatus syscall.WaitStatus + var rusage syscall.Rusage + + _, err = syscall.Wait4(pid, &wstatus, 0, &rusage) + if err != nil { + return false, err + } + + if !wstatus.Exited() { + logrus.Debugf("- shiftfs check: child process (%d) did not exit normally", pid) + return false, fmt.Errorf("child process did not exit normally") + } + + exitStatus := wstatus.ExitStatus() + + if exitStatus != 0 { + logrus.Debugf("- shiftfs check: child process failed (exit status = %d)", exitStatus) + return false, nil + } + + logrus.Debugf("- shiftfs check: passed") + return true, nil +} diff --git a/sysbox-libs/shiftfs/shiftfs_test.go b/sysbox-libs/shiftfs/shiftfs_test.go new file mode 100644 index 00000000..5f995767 --- /dev/null +++ b/sysbox-libs/shiftfs/shiftfs_test.go @@ -0,0 +1,73 @@ +// +// Copyright 2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package shiftfs + +import ( + "github.com/nestybox/sysbox-libs/linuxUtils" + "os" + "testing" +) + +func TestShiftfsSupported(t *testing.T) { + + kernelSupportsShiftfs, err := linuxUtils.KernelModSupported("shiftfs") + if err != nil { + t.Fatal(err) + } + + if kernelSupportsShiftfs { + dir := "/var/lib/sysbox" + + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("ShiftfsSupported() failed with error: %s", err) + } + + supported, err := ShiftfsSupported(dir) + if err != nil { + t.Fatalf("ShiftfsSupported() failed with error: %s", err) + } + + if !supported { + t.Logf("shiftfs not supported on this host.") + } + } +} + +func TestShiftfsSupportedOnOverlayfs(t *testing.T) { + + kernelSupportsShiftfs, err := linuxUtils.KernelModSupported("shiftfs") + if err != nil { + t.Fatal(err) + } + + if kernelSupportsShiftfs { + dir := "/var/lib/sysbox" + + if err := os.MkdirAll(dir, 0755); err != nil { + t.Fatalf("ShiftfsSupportedOnOverlayfs() failed with error: %s", err) + } + + supported, err := ShiftfsSupportedOnOverlayfs(dir) + if err != nil { + t.Fatalf("ShiftfsSupportedOnOverlayfs() failed with error: %s", err) + } + + if !supported { + t.Logf("shiftfs-on-overlayfs not supported on this host.") + } + } +} diff --git a/sysbox-libs/utils/env.go b/sysbox-libs/utils/env.go new file mode 100644 index 00000000..589a3090 --- /dev/null +++ b/sysbox-libs/utils/env.go @@ -0,0 +1,41 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +package utils + +import ( + "fmt" + "os/exec" + "strings" +) + +// GetEnvVarInfo returns the name and value of the given environment variable +func GetEnvVarInfo(v string) (string, string, error) { + tokens := strings.Split(v, "=") + if len(tokens) != 2 { + return "", "", fmt.Errorf("invalid variable %s", v) + } + return tokens[0], tokens[1], nil +} + +// CmdExists check if the given command is available on the host +func CmdExists(name string) bool { + cmd := exec.Command("/bin/sh", "-c", "command -v "+name) + if err := cmd.Run(); err != nil { + return false + } + return true +} diff --git a/sysbox-libs/utils/env_test.go b/sysbox-libs/utils/env_test.go new file mode 100644 index 00000000..ffbfabcb --- /dev/null +++ b/sysbox-libs/utils/env_test.go @@ -0,0 +1,40 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package utils + +import "testing" + +func TestGetEnvVarInfo(t *testing.T) { + + test := []string{"a=b", "var=1", "other-var=hello", "var2="} + name := []string{"a", "var", "other-var", "var2"} + val := []string{"b", "1", "hello", ""} + + for i, _ := range test { + n, v, err := GetEnvVarInfo(test[i]) + if err != nil { + t.Errorf("GetEnvVarInfo(%s) failed: returned unexpected error %v", test[i], err) + } + if n != name[i] || v != val[i] { + t.Errorf("GetEnvVarInfo(%s) failed: want %s, %s; got %s, %s", test[i], name[i], val[i], n, v) + } + } + + if _, _, err := GetEnvVarInfo("a=b=c"); err == nil { + t.Errorf("GetEnvVarInfo(%s) failed: expected error, got no error.", "a=b=c") + } +} diff --git a/sysbox-libs/utils/filepath.go b/sysbox-libs/utils/filepath.go new file mode 100644 index 00000000..a8dc50b8 --- /dev/null +++ b/sysbox-libs/utils/filepath.go @@ -0,0 +1,49 @@ +// +// Copyright 2019-2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package utils + +import ( + "path/filepath" + "sort" + "strings" +) + +type FilepathSlice []string + +func (fp FilepathSlice) Len() int { + return len(fp) +} + +// Compares the number of "/" separated elements in paths fp[i] and fp[j]. 
+func (fp FilepathSlice) Less(i, j int) bool {
+ iClean := filepath.Clean(fp[i])
+ jClean := filepath.Clean(fp[j])
+ iElems := strings.Split(iClean, "/")
+ jElems := strings.Split(jClean, "/")
+ return len(iElems) < len(jElems)
+}
+
+func (fp FilepathSlice) Swap(i, j int) {
+ tmp := fp[i]
+ fp[i] = fp[j]
+ fp[j] = tmp
+}
+
+// Sorts the given set of filepaths hierarchically; paths must be absolute.
+func FilepathSort(paths []string) {
+ sort.Sort(FilepathSlice(paths))
+}
diff --git a/sysbox-libs/utils/filepath_test.go b/sysbox-libs/utils/filepath_test.go
new file mode 100644
index 00000000..923ca6b0
--- /dev/null
+++ b/sysbox-libs/utils/filepath_test.go
@@ -0,0 +1,44 @@
+//
+// Copyright 2019-2022 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package utils
+
+import "testing"
+
+func TestFilepathSort(t *testing.T) {
+
+ paths := []string{
+ "/a",
+ "/a/b/c",
+ "/a/b",
+ "/w/x/y/z",
+ "/w",
+ }
+
+ FilepathSort(paths)
+
+ want := []string{
+ "/a",
+ "/w",
+ "/a/b",
+ "/a/b/c",
+ "/w/x/y/z",
+ }
+
+ if !StringSliceEqual(paths, want) {
+ t.Errorf("FilepathSort() failed: want %v, got %v", want, paths)
+ }
+}
diff --git a/sysbox-libs/utils/fs.go b/sysbox-libs/utils/fs.go
new file mode 100644
index 00000000..83bbf8e0
--- /dev/null
+++ b/sysbox-libs/utils/fs.go
@@ -0,0 +1,95 @@
+//
+// Copyright 2020 - 2022 Nestybox, Inc.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package utils + +import ( + "fmt" + + "golang.org/x/sys/unix" +) + +var unixFsNameTable = map[int64]string{ + unix.AAFS_MAGIC: "aafs", + unix.ADFS_SUPER_MAGIC: "adfs", + unix.AFFS_SUPER_MAGIC: "affs", + unix.AFS_FS_MAGIC: "afs", + unix.AFS_SUPER_MAGIC: "afs", + unix.ANON_INODE_FS_MAGIC: "anon", + unix.AUTOFS_SUPER_MAGIC: "autofs", + unix.BDEVFS_MAGIC: "bdevfs", + unix.BINDERFS_SUPER_MAGIC: "binderfs", + unix.BINFMTFS_MAGIC: "binfmtfs", + unix.BPF_FS_MAGIC: "bpf fs", + unix.BTRFS_SUPER_MAGIC: "btrfs", + unix.BTRFS_TEST_MAGIC: "btrfs", + unix.CRAMFS_MAGIC: "cramfs", + unix.DAXFS_MAGIC: "daxfs", + unix.DEBUGFS_MAGIC: "debugfs", + unix.ECRYPTFS_SUPER_MAGIC: "encryptfs", + unix.EFIVARFS_MAGIC: "efivarfs", + unix.EFS_SUPER_MAGIC: "efs", + unix.EROFS_SUPER_MAGIC_V1: "erofs", + unix.EXT4_SUPER_MAGIC: "ext4", + unix.F2FS_SUPER_MAGIC: "f2fs", + unix.FUTEXFS_SUPER_MAGIC: "futexfs", + unix.HOSTFS_SUPER_MAGIC: "hostfs", + unix.HPFS_SUPER_MAGIC: "hpfs", + unix.HUGETLBFS_MAGIC: "hugetlbfs", + unix.ISOFS_SUPER_MAGIC: "isofs", + unix.JFFS2_SUPER_MAGIC: "jffs2", + unix.MTD_INODE_FS_MAGIC: "mtd", + unix.NFS_SUPER_MAGIC: "nfs", + unix.NILFS_SUPER_MAGIC: "nilfs", + unix.NSFS_MAGIC: "nsfs", + unix.OCFS2_SUPER_MAGIC: "ocfs2", + unix.OVERLAYFS_SUPER_MAGIC: "overlayfs", + unix.PIPEFS_MAGIC: "pipefs", + unix.PSTOREFS_MAGIC: "pstorefs", + unix.RAMFS_MAGIC: "ramfs", + unix.REISERFS_SUPER_MAGIC: "reiserfs", + 
unix.SECURITYFS_MAGIC: "securityfs", + unix.SOCKFS_MAGIC: "sockfs", + unix.SQUASHFS_MAGIC: "squashfs", + unix.SYSFS_MAGIC: "sysfs", + unix.TMPFS_MAGIC: "tmpfs", + unix.TRACEFS_MAGIC: "tracefs", + unix.V9FS_MAGIC: "v9fs", + unix.XENFS_SUPER_MAGIC: "xenfs", + unix.XFS_SUPER_MAGIC: "xfs", + unix.ZONEFS_MAGIC: "zonefs", + + // Magic codes not yet defined in Unix package + 0x65735546: "fuse", + 0x6a656a62: "shiftfs", + 0x6a656a63: "fakeowner", +} + +func GetFsName(path string) (string, error) { + var fs unix.Statfs_t + + err := unix.Statfs(path, &fs) + if err != nil { + return "", err + } + + name, ok := unixFsNameTable[fs.Type] + if !ok { + return "unknown fs", fmt.Errorf("unknown fs") + } + + return name, nil +} diff --git a/sysbox-libs/utils/go.mod b/sysbox-libs/utils/go.mod new file mode 100644 index 00000000..28645a15 --- /dev/null +++ b/sysbox-libs/utils/go.mod @@ -0,0 +1,9 @@ +module github.com/nestybox/sysbox-libs/utils + +go 1.21 + +require ( + github.com/opencontainers/runtime-spec v1.0.2 + github.com/sirupsen/logrus v1.9.0 + golang.org/x/sys v0.19.0 +) diff --git a/sysbox-libs/utils/go.sum b/sysbox-libs/utils/go.sum new file mode 100644 index 00000000..72fab8e7 --- /dev/null +++ b/sysbox-libs/utils/go.sum @@ -0,0 +1,18 @@ +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/opencontainers/runtime-spec v1.0.2 h1:UfAcuLBJB9Coz72x1hgl8O5RVzTdNiaglX6v2DM6FI0= +github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/sirupsen/logrus v1.9.0 h1:trlNQbNUG3OdDrDil03MCb1H2o9nJ1x4/5LYw7byDE0= +github.com/sirupsen/logrus 
v1.9.0/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o= +golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/sysbox-libs/utils/pidfile.go b/sysbox-libs/utils/pidfile.go new file mode 100644 index 00000000..667c3e41 --- /dev/null +++ b/sysbox-libs/utils/pidfile.go @@ -0,0 +1,91 @@ +// +// Copyright 2020 - 2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package utils + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/sirupsen/logrus" +) + +func CheckPidFile(program string, pidFile string) error { + + pid, err := readPidFile(pidFile) + if err != nil && !os.IsNotExist(err) { + return err + } + + if err == nil { + if isProgramRunning(program, pid) { + return fmt.Errorf("%s program is running as pid %d", program, pid) + } + } + + return nil +} + +// CreatePidFile writes a sysbox pid to a file. If the file already exists, +// and its pid matches a current sysbox program, then an error is returned. +func CreatePidFile(program string, pidFile string) error { + + if err := CheckPidFile(program, pidFile); err != nil { + return err + } + + pidStr := fmt.Sprintf("%d\n", os.Getpid()) + if err := ioutil.WriteFile(pidFile, []byte(pidStr), 0400); err != nil { + return fmt.Errorf("failed to write %s pid to file %s: %s", program, pidFile, err) + } + + return nil +} + +func DestroyPidFile(pidFile string) error { + return os.RemoveAll(pidFile) +} + +func readPidFile(pidFile string) (int, error) { + + bs, err := ioutil.ReadFile(pidFile) + if err != nil { + return 0, err + } + + return strconv.Atoi(strings.TrimSpace(string(bs))) +} + +func isProgramRunning(program string, pid int) bool { + + target, err := os.Readlink(fmt.Sprintf("/proc/%d/exe", pid)) + if err != nil { + return false + } + + base := filepath.Base(target) + + if program != base { + logrus.Infof("pid %d is not associated to process %s", pid, program) + return false + } + + return true +} diff --git a/sysbox-libs/utils/pidfile_test.go b/sysbox-libs/utils/pidfile_test.go new file mode 100644 index 00000000..457ff234 --- /dev/null +++ b/sysbox-libs/utils/pidfile_test.go @@ -0,0 +1,64 @@ +// +// Copyright 2019-2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+package utils
+
+import (
+ "io/ioutil"
+ "os"
+ "path/filepath"
+ "testing"
+)
+
+func TestSysboxPidFile(t *testing.T) {
+
+ testDir, err := ioutil.TempDir("", "sysbox-mgr-test")
+ if err != nil {
+ t.Error(err)
+ }
+ defer os.RemoveAll(testDir)
+
+ pidFile := filepath.Join(testDir, "sysbox-mgr.pid")
+
+ // create sysbox pid file
+ if err := CreatePidFile("sysbox-mgr", pidFile); err != nil {
+ t.Errorf("CreatePidFile() failed: %s", err)
+ }
+
+ // verify
+ _, err = os.Stat(pidFile)
+ if err != nil {
+ t.Errorf("failed to stat %s: %s", pidFile, err)
+ }
+
+ // create again -- should pass given that there's no actual instance of
+ // sysbox-mgr running in the system.
+ if err := CreatePidFile("sysbox-mgr", pidFile); err != nil {
+ t.Errorf("CreatePidFile() failed: %s", err)
+ }
+
+ // destroy the pid file
+ if err := DestroyPidFile(pidFile); err != nil {
+ t.Errorf("DestroyPidFile() failed: %s", err)
+ }
+
+ // verify
+ _, err = os.Stat(pidFile)
+ if err == nil || !os.IsNotExist(err) {
+ t.Errorf("pid file %s was not removed", pidFile)
+ os.RemoveAll(pidFile)
+ }
+}
diff --git a/sysbox-libs/utils/slices.go b/sysbox-libs/utils/slices.go
new file mode 100644
index 00000000..1efec28f
--- /dev/null
+++ b/sysbox-libs/utils/slices.go
@@ -0,0 +1,158 @@
+//
+// Copyright 2019-2020 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package utils + +import ( + "github.com/opencontainers/runtime-spec/specs-go" +) + +// StringSliceContains returns true if x is in a +func StringSliceContains(a []string, x string) bool { + for _, n := range a { + if x == n { + return true + } + } + return false +} + +// StringSliceEqual compares two slices and returns true if they match +func StringSliceEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i, v := range a { + if v != b[i] { + return false + } + } + return true +} + +// StringSliceRemove removes from slice 's' any elements which occur on slice 'db'. +func StringSliceRemove(s, db []string) []string { + var r []string + for i := 0; i < len(s); i++ { + found := false + for _, e := range db { + if s[i] == e { + found = true + break + } + } + if !found { + r = append(r, s[i]) + } + } + return r +} + +// StringSliceRemoveMatch removes from slice 's' any elements for which the 'match' +// function returns true. 
+func StringSliceRemoveMatch(s []string, match func(string) bool) []string { + var r []string + for i := 0; i < len(s); i++ { + if !match(s[i]) { + r = append(r, s[i]) + } + } + return r +} + +// uniquify a string slice (i.e., remove duplicate elements) +func StringSliceUniquify(s []string) []string { + keys := make(map[string]bool) + result := []string{} + for _, str := range s { + if _, ok := keys[str]; !ok { + keys[str] = true + result = append(result, str) + } + } + return result +} + +// finds the shortest string in the given slice +func StringSliceFindShortest(s []string) string { + if len(s) == 0 { + return "" + } + shortest := s[0] + for _, str := range s { + if len(str) < len(shortest) { + shortest = str + } + } + return shortest +} + +// Compares the given mount slices and returns true if the match +func MountSliceEqual(a, b []specs.Mount) bool { + if len(a) != len(b) { + return false + } + for i, m := range a { + if m.Destination != b[i].Destination || + m.Source != b[i].Source || + m.Type != b[i].Type || + !StringSliceEqual(m.Options, b[i].Options) { + return false + } + } + return true +} + +// MountSliceRemove removes from slice 's' any elements which occur on slice 'db'; the +// given function is used to compare elements. +func MountSliceRemove(s, db []specs.Mount, cmp func(m1, m2 specs.Mount) bool) []specs.Mount { + var r []specs.Mount + for i := 0; i < len(s); i++ { + found := false + for _, e := range db { + if cmp(s[i], e) { + found = true + break + } + } + if !found { + r = append(r, s[i]) + } + } + return r +} + +// MountSliceRemoveMatch removes from slice 's' any elements for which the 'match' +// function returns true. +func MountSliceRemoveMatch(s []specs.Mount, match func(specs.Mount) bool) []specs.Mount { + var r []specs.Mount + for i := 0; i < len(s); i++ { + if !match(s[i]) { + r = append(r, s[i]) + } + } + return r +} + +// MountSliceContains returns true if mount x is in slice s. 
+func MountSliceContains(s []specs.Mount, x specs.Mount, match func(a, b specs.Mount) bool) bool {
+ for _, m := range s {
+ if match(m, x) {
+ return true
+ }
+ }
+ return false
+}
diff --git a/sysbox-mgr b/sysbox-mgr
deleted file mode 160000
index 1159d228..00000000
--- a/sysbox-mgr
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 1159d228eac8402efa63bd2cb18cdf9e404ea130
diff --git a/sysbox-mgr/.gitignore b/sysbox-mgr/.gitignore
new file mode 100644
index 00000000..f452557d
--- /dev/null
+++ b/sysbox-mgr/.gitignore
@@ -0,0 +1,10 @@
+# Ignore build artifacts
+build
+
+# Ignore visual-studio-code metadata
+.vscode
+
+# GNU global tags
+GPATH
+GRTAGS
+GTAGS
diff --git a/sysbox-mgr/CONTRIBUTING.md b/sysbox-mgr/CONTRIBUTING.md
new file mode 100644
index 00000000..03f37983
--- /dev/null
+++ b/sysbox-mgr/CONTRIBUTING.md
@@ -0,0 +1,5 @@
+# Contribute to Sysbox-mgr
+
+Sysbox-mgr is a component of the Sysbox container runtime. If you want to
+contribute, please refer to the Sysbox contribution
+[guidelines](https://github.com/nestybox/sysbox/blob/master/CONTRIBUTING.md).
\ No newline at end of file
diff --git a/sysbox-mgr/LICENSE b/sysbox-mgr/LICENSE
new file mode 100644
index 00000000..c6087d5b
--- /dev/null
+++ b/sysbox-mgr/LICENSE
@@ -0,0 +1,191 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2020 Nestybox, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/sysbox-mgr/MAINTAINERS b/sysbox-mgr/MAINTAINERS new file mode 100644 index 00000000..3af2dbb0 --- /dev/null +++ b/sysbox-mgr/MAINTAINERS @@ -0,0 +1,2 @@ +Rodny Molina (@rodnymolina) +Cesar Talledo (@ctalledo) diff --git a/sysbox-mgr/Makefile b/sysbox-mgr/Makefile new file mode 100644 index 00000000..8de0a671 --- /dev/null +++ b/sysbox-mgr/Makefile @@ -0,0 +1,103 @@ +# +# sysbox-mgr Makefile +# +# Note: targets must execute from the $SYSMGR_DIR + +.PHONY: clean sysbox-mgr-debug sysbox-mgr-static lint list-packages + +GO := go + +SYSMGR_BUILDROOT := build +SYSMGR_BUILDDIR := $(SYSMGR_BUILDROOT)/$(TARGET_ARCH) +SYSMGR_TARGET := sysbox-mgr +SYSMGR_DEBUG_TARGET := sysbox-mgr-debug +SYSMGR_STATIC_TARGET := sysbox-mgr-static +SYSMGR_DIR := $(CURDIR) +SYSMGR_SRC := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$') + +SYSMGR_GRPC_DIR := ../sysbox-ipc/sysboxMgrGrpc +SYSMGR_GRPC_SRC := $(shell find $(SYSMGR_GRPC_DIR) 2>&1 | grep -E '.*\.(c|h|go|proto)$$') + +SYSLIB_DIR := ../sysbox-libs +SYSLIB_SRC := $(shell find $(SYSLIB_DIR) 2>&1 | grep -E '.*\.(c|h|go|proto)$$') + +COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true) +COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),$(COMMIT_NO)-dirty,$(COMMIT_NO)) +BUILT_AT := $(shell date) +BUILT_BY := $(shell git config user.name) + +LDFLAGS := -X 'main.edition=${EDITION}' -X main.version=${VERSION} \ + -X main.commitId=$(COMMIT) -X 'main.builtAt=$(BUILT_AT)' \ + -X 'main.builtBy=$(BUILT_BY)' + +# idmapped mount is supported in kernels >= 5.12 +KERNEL_REL := $(shell uname -r) +KERNEL_REL_MAJ := $(shell echo $(KERNEL_REL) | cut -d'.' -f1) +KERNEL_REL_MIN := $(shell echo $(KERNEL_REL) | cut -d'.' 
-f2) + +ifeq ($(shell test $(KERNEL_REL_MAJ) -gt 5; echo $$?),0) + IDMAPPED_MNT := 1 +endif + +ifeq ($(shell test $(KERNEL_REL_MAJ) -eq 5; echo $$?),0) + ifeq ($(shell test $(KERNEL_REL_MIN) -ge 12; echo $$?),0) + IDMAPPED_MNT := 1 + endif +endif + +ifeq ($(IDMAPPED_MNT),1) + BUILDTAGS ?= idmapped_mnt +endif + +# Set cross-compilation flags if applicable. +ifneq ($(SYS_ARCH),$(TARGET_ARCH)) + ifeq ($(TARGET_ARCH),armel) + GO_XCOMPILE := CGO_ENABLED=1 GOOS=linux GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc + else ifeq ($(TARGET_ARCH),armhf) + GO_XCOMPILE := CGO_ENABLED=1 GOOS=linux GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc + else ifeq ($(TARGET_ARCH),arm64) + GO_XCOMPILE = CGO_ENABLED=1 GOOS=linux GOARCH=arm64 CC=aarch64-linux-gnu-gcc + else ifeq ($(TARGET_ARCH),amd64) + GO_XCOMPILE = CGO_ENABLED=1 GOOS=linux GOARCH=amd64 CC=x86_64-linux-gnu-gcc + endif +endif + +.DEFAULT: sysbox-mgr + +sysbox-mgr: $(SYSMGR_BUILDDIR)/$(SYSMGR_TARGET) + +$(SYSMGR_BUILDDIR)/$(SYSMGR_TARGET): $(SYSMGR_SRC) $(SYSMGR_GRPC_SRC) $(SYSLIB_SRC) + $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath -tags "$(BUILDTAGS)" -ldflags "${LDFLAGS}" -o $(SYSMGR_BUILDDIR)/sysbox-mgr + +sysbox-mgr-debug: $(SYSMGR_BUILDDIR)/$(SYSMGR_DEBUG_TARGET) + +$(SYSMGR_BUILDDIR)/$(SYSMGR_DEBUG_TARGET): $(SYSMGR_SRC) $(SYSMGR_GRPC_SRC) $(SYSLIB_SRC) + $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath -tags "$(BUILDTAGS)" -gcflags="all=-N -l" -ldflags "${LDFLAGS}" \ + -o $(SYSMGR_BUILDDIR)/sysbox-mgr + +sysbox-mgr-static: $(SYSMGR_BUILDDIR)/$(SYSMGR_STATIC_TARGET) + +$(SYSMGR_BUILDDIR)/$(SYSMGR_STATIC_TARGET): $(SYSMGR_SRC) $(SYSMGR_GRPC_SRC) $(SYSLIB_SRC) + CGO_ENABLED=1 $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath -tags "$(BUILDTAGS) netgo osusergo" \ + -installsuffix netgo -ldflags "-w -extldflags -static ${LDFLAGS}" \ + -o $(SYSMGR_BUILDDIR)/sysbox-mgr + +gomod-tidy: + $(GO) mod tidy + +lint: + $(GO) vet $(allpackages) + $(GO) fmt $(allpackages) + +listpackages: + @echo $(allpackages) + +clean: + 
rm -f $(SYSMGR_BUILDDIR)/sysbox-mgr
+
+distclean: clean
+ rm -rf $(SYSMGR_BUILDROOT)
+
+# memoize allpackages, so that it's executed only once and only if used
+_allpackages = $(shell $(GO) list ./... | grep -v vendor)
+allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages)
diff --git a/sysbox-mgr/README.md b/sysbox-mgr/README.md
new file mode 100644
index 00000000..2dac08a2
--- /dev/null
+++ b/sysbox-mgr/README.md
@@ -0,0 +1,50 @@
+# sysbox-mgr
+
+The Sysbox Manager (aka sysbox-mgr) is a daemon that
+provides miscellaneous services to other sysbox components.
+
+Currently it provides these services:
+
+* Subid allocation: allocates a common range of subuid and subgids
+ for all system containers; service is invoked by sysbox-runc.
+
+* Shiftfs marking: creates shiftfs marks on host directories on
+ which shiftfs will be mounted. Handles redundant mounts/unmounts
+ of shiftfs on the same directory.
+
+* Mount ownership changes: changes ownership on host directories
+ that are bind-mounted into the sys container and on top of
+ which shiftfs mounting is not possible.
+
+* Docker-store Volume Management: creates a directory on the host
+ that is mounted into the system container's `/var/lib/docker`.
+ This way, the overlayfs over overlayfs scenario created by running
+ docker-in-docker is avoided.
+
+* Kubelet-store Volume Management: creates a directory on the host
+ that is mounted into the system container's `/var/lib/kubelet`.
+ This is needed to avoid shiftfs mounts over this directory in
+ the sys container, as kubelet does not support it.
+
+* Containerd-store Volume Management: creates a directory on the host
+ that is mounted into the system container's `/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs`.
+ This way, the overlayfs over overlayfs scenario created by running
+ containerd-in-docker is avoided.
+
+In the future it's expected to provide further services to sysbox-runc
+as well as sysbox-fs.
+ +# Build & Usage + +sysbox-mgr is built with the sysbox Makefile. Refer to that sysbox +[README](../README.md) file for details. + +# gRPC + +sysbox-mgr listens on a unix-domain socket for gRPC from other sysbox +components. + +Currently a single gRPC is used (between sysbox-runc and sysbox-mgr). + +In the future other gRPCs may be created (e.g,. for communication +between sysbox-fs and sysbox-mgr). diff --git a/sysbox-mgr/go.mod b/sysbox-mgr/go.mod new file mode 100644 index 00000000..4a6f0955 --- /dev/null +++ b/sysbox-mgr/go.mod @@ -0,0 +1,83 @@ +module github.com/nestybox/sysbox-mgr + +go 1.22 + +toolchain go1.22.6 + +require ( + github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf + github.com/deckarep/golang-set v1.8.0 + github.com/google/uuid v1.6.0 + github.com/nestybox/sysbox-ipc v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/dockerUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/fileMonitor v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/formatter v0.0.0-20211230192847-357e78e444bd + github.com/nestybox/sysbox-libs/idMap v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/idShiftUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/linuxUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98 + github.com/nestybox/sysbox-libs/overlayUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/shiftfs v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-runc v0.0.0-00010101000000-000000000000 + github.com/opencontainers/runc v1.1.4 + github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 + github.com/pkg/profile v1.5.0 + github.com/sirupsen/logrus v1.9.3 + github.com/urfave/cli v1.22.14 + golang.org/x/sys v0.27.0 +) + +require ( + github.com/Azure/go-ansiterm 
v0.0.0-20230124172434-306776ec8161 // indirect + github.com/Microsoft/go-winio v0.4.16 // indirect + github.com/coreos/go-systemd/v22 v22.1.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/docker v26.0.0+incompatible // indirect + github.com/docker/go-connections v0.4.0 // indirect + github.com/docker/go-units v0.4.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/godbus/dbus/v5 v5.0.3 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 // indirect + github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 // indirect + github.com/karrick/godirwalk v1.16.1 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.0.2 // indirect + github.com/pkg/errors v0.9.1 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/spf13/afero v1.4.1 // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect + go.opentelemetry.io/otel v1.32.0 // indirect + go.opentelemetry.io/otel/metric v1.32.0 // indirect + go.opentelemetry.io/otel/trace v1.32.0 // indirect + golang.org/x/net v0.23.0 // indirect + golang.org/x/text v0.15.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 // indirect + google.golang.org/grpc v1.64.0 // indirect + google.golang.org/protobuf v1.35.1 // indirect + gopkg.in/hlandau/service.v1 v1.0.7 // indirect +) + +replace ( + github.com/godbus/dbus => github.com/godbus/dbus/v5 v5.0.3 + github.com/nestybox/sysbox-ipc => ../sysbox-ipc + github.com/nestybox/sysbox-libs/capability => ../sysbox-libs/capability + github.com/nestybox/sysbox-libs/dockerUtils => ../sysbox-libs/dockerUtils + 
github.com/nestybox/sysbox-libs/fileMonitor => ../sysbox-libs/fileMonitor + github.com/nestybox/sysbox-libs/formatter => ../sysbox-libs/formatter + github.com/nestybox/sysbox-libs/idMap => ../sysbox-libs/idMap + github.com/nestybox/sysbox-libs/idShiftUtils => ../sysbox-libs/idShiftUtils + github.com/nestybox/sysbox-libs/libseccomp-golang => ../sysbox-libs/libseccomp-golang + github.com/nestybox/sysbox-libs/linuxUtils => ../sysbox-libs/linuxUtils + github.com/nestybox/sysbox-libs/mount => ../sysbox-libs/mount + github.com/nestybox/sysbox-libs/overlayUtils => ../sysbox-libs/overlayUtils + github.com/nestybox/sysbox-libs/shiftfs => ../sysbox-libs/shiftfs + github.com/nestybox/sysbox-libs/utils => ../sysbox-libs/utils + github.com/nestybox/sysbox-runc => ../sysbox-runc + github.com/opencontainers/runc => ./../sysbox-runc +) diff --git a/sysbox-mgr/go.sum b/sysbox-mgr/go.sum new file mode 100644 index 00000000..e0e89848 --- /dev/null +++ b/sysbox-mgr/go.sum @@ -0,0 +1,170 @@ +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/Microsoft/go-winio v0.4.16 h1:FtSW/jqD+l4ba5iPBj9CODVtgfYAD8w2wS923g/cFDk= +github.com/Microsoft/go-winio v0.4.16/go.mod h1:XB6nPKklQyQ7GC9LdcBEcBl8PF76WugXOPRXwdLnMv0= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/coreos/go-systemd v0.0.0-20191104093116-d3cd4ed1dbcf h1:iW4rZ826su+pqaw19uhpSCzhj44qo35pNgKFGqzDKkU= +github.com/coreos/go-systemd 
v0.0.0-20191104093116-d3cd4ed1dbcf/go.mod h1:F5haX7vjVVG0kc13fIWeqUViNPyEJxv/OmvnBo0Yme4= +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= +github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= +github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4= +github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/docker v26.0.0+incompatible h1:Ng2qi+gdKADUa/VM+6b6YaY2nlZhk/lVJiKR/2bMudU= +github.com/docker/docker v26.0.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= +github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= +github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= 
+github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME= +github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 h1:hgVxRoDDPtQE68PT4LFvNlPz2nBKd3OMlGKIQ69OmR4= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531/go.mod h1:fqTUQpVYBvhCNIsMXGl2GE9q6z94DIP6NtFKXCSTVbg= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d h1:J8tJzRyiddAFF65YVgxli+TyWBi0f79Sld6rJP6CBcY= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d/go.mod h1:b+Q3v8Yrg5o15d71PSUraUzYb+jWl6wQMSBXSGS/hv0= +github.com/karrick/godirwalk v1.16.1 h1:DynhcF+bztK8gooS0+NDJFrdNZjJ3gzVzC545UNA9iw= +github.com/karrick/godirwalk v1.16.1/go.mod h1:j4mkqPuvaLI8mp1DroR3P6ad7cyYd4c1qeJ3RV7ULlk= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/konsorten/go-windows-terminal-sequences 
v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk= +github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.0.2 h1:9yCKha/T5XdGtO0q9Q9a6T5NUCsTn/DrBg0D7ufOcFM= +github.com/opencontainers/image-spec v1.0.2/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/profile v1.5.0 h1:042Buzk+NhDI+DeSAA62RwJL8VAuZUMQZUjCsRz1Mug= +github.com/pkg/profile v1.5.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/urfave/cli v1.22.14 h1:ebbhrRiGK2i4naQJr+1Xj92HXZCrK7MsyTS/ob3HnAk= +github.com/urfave/cli v1.22.14/go.mod h1:X0eDS6pD6Exaclxm99NJ3FiCDRED7vIHpx2mDOHLvkA= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= 
+github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 h1:Xs2Ncz0gNihqu9iosIZ5SkBbWo5T8JhhLJFMQL1qmLI= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0/go.mod h1:vy+2G/6NvVMpwGX/NyLqcC41fxepnuKHk16E6IZUcJc= +go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U= +go.opentelemetry.io/otel v1.32.0/go.mod h1:00DCVSB0RQcnzlwyTfqtxSm+DRr9hpYrHjNGiBHVQIg= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0 h1:IJFEoHiytixx8cMiVAO+GmHR6Frwu+u5Ur8njpFO6Ac= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0/go.mod h1:3rHrKNtLIoS0oZwkY2vxi+oJcwFRWdtUyRII+so45p8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0 h1:1wp/gyxsuYtuE/JFxsQRtcCDtMrO2qMvlfXALU5wkzI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0/go.mod h1:gbTHmghkGgqxMomVQQMur1Nba4M0MQ8AYThXDUjsJ38= +go.opentelemetry.io/otel/metric v1.32.0 h1:xV2umtmNcThh2/a/aCP+h64Xx5wsj8qqnkYZktzNa0M= +go.opentelemetry.io/otel/metric v1.32.0/go.mod h1:jH7CIbbK6SH2V2wE16W05BHCtIDzauciCRLoc/SyMv8= +go.opentelemetry.io/otel/sdk v1.32.0 h1:RNxepc9vK59A8XsgZQouW8ue8Gkb4jpWtJm9ge5lEG4= +go.opentelemetry.io/otel/sdk v1.32.0/go.mod h1:LqgegDBjKMmb2GC6/PrTnteJG39I8/vJCAP9LlJXEjU= +go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM= +go.opentelemetry.io/otel/trace v1.32.0/go.mod h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod 
h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 
+golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324 h1:Hir2P/De0WpUhtrKGGjvSb2YxUgyZ7EFOSLIcSSpiwE= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/api v0.0.0-20240513163218-0867130af1f8 h1:W5Xj/70xIA4x60O/IFyXivR5MGqblAb8R3w26pnD6No= +google.golang.org/genproto/googleapis/api v0.0.0-20240513163218-0867130af1f8/go.mod h1:vPrPUTsDCYxXWjP7clS81mZ6/803D8K4iM9Ma27VKas= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 h1:mxSlqyb8ZAHsYDCfiXN1EDdNTdvjUJSLY+OnAUtYNYA= 
+google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/hlandau/service.v1 v1.0.7 h1:16G5AJ1Cp8Vr65QItJXpyAIzf/FWAWCZBsTgsc6eyA8= +gopkg.in/hlandau/service.v1 v1.0.7/go.mod h1:sZw6ksxcoafC04GoZtw32UeqqEuPSABX35lVBaJP/bE= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= +gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= diff --git a/sysbox-mgr/intf/intf.go b/sysbox-mgr/intf/intf.go new file mode 100644 index 00000000..f2389c48 --- /dev/null +++ b/sysbox-mgr/intf/intf.go @@ -0,0 +1,104 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// +// sysbox-mgr interfaces +// + +package intf + +import ( + "os" + + "github.com/nestybox/sysbox-libs/shiftfs" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// The SubidAlloc interface defines the interface exposed by the entity that +// performs or subuid and subgid allocations +type SubidAlloc interface { + + // Allocates an unused range of 'size' uids and gids for the container with the given 'id'. + // Max supported 'size' is 2^32. + // Possible errors are nil, "exhausted", or "invalid-size". + Alloc(id string, size uint64) (uint32, uint32, error) + + // Free releases a previously allocated uid and gid range for the container with the + // given 'id'. Possible errors are nil and "not-found" (if the container with the + // given 'id' has no allocations). + Free(id string) error +} + +// The VolMgr interface defines the interface exposed by the sysbox-mgr entities that +// manage the creation of volumes on the host that are bind-mounted into the sys +// container. +type VolMgr interface { + + // Creates a volume for the sys container with the given 'id'. This function + // returns an OCI mount spec (which is passed back to sysbox-runc to setup the actual mount). + // 'rootfs' is the absolute path the container's rootfs. + // 'mountpoint' is the volume's mountpoint (relative to the container's rootfs) + // 'uid' and 'gid' are the uid(gid) of the container root process in the host's namespace. 
+ // 'chownOnSync' indicates if the volMgr should chown when copying to/from the container's rootfs + // 'perm' indicates the permissions for the created volume. + CreateVol(id, rootfs, mountpoint string, uid, gid uint32, chownOnSync bool, perm os.FileMode) ([]specs.Mount, error) + + // Destroys a volume for the container with the given 'id'. + DestroyVol(id string) error + + // Sync the contents of the volume back to container's rootfs + SyncOut(id string) error + + // Sync and destroys all volumes (best effort, ignore errors) + SyncOutAndDestroyAll() +} + +// The ShiftfsMgr interface defines the interface exposed by the sysbox-mgr shiftfs manager +type ShiftfsMgr interface { + + // Set shiftfs marks on the given paths; if createMarkpoint is true, create + // new moutpoint directories for each of the given paths. Returns a list of + // the paths where the shiftfs marks are set. + Mark(id string, mounts []shiftfs.MountPoint, createMarkpoint bool) ([]shiftfs.MountPoint, error) + + // Remove shiftfs marks associated with the given container + Unmark(id string, mount []shiftfs.MountPoint) error + + // Remove shiftfs marks associated with all containers (best effort, ignore errors) + UnmarkAll() +} + +// The RootfsCloner interface defines the interface exposed by the sysbox-mgr rootfs cloner +type RootfsCloner interface { + + // Creates a clone of the container's rootfs; returns the path to the cloned rootfs. 
+ CreateClone(id, origRootfs string) (string, error) + + // Removes a rootfs clone for the given container + RemoveClone(id string) error + + // Chowns (recursively) the clone rootfs by the given user and group ID offset + ChownClone(id string, uidOffset, gidOffset int32) error + + // Undoes the actions of ChownClone() + RevertChown(id string) error + + // Notifies rootfsCloner that container has been stopped (but not removed) + ContainerStopped(id string) error + + // Performs cleanup (e.g., removes all clones, unmounts all mounts created by the cloner) + RemoveAll() +} diff --git a/sysbox-mgr/main.go b/sysbox-mgr/main.go new file mode 100644 index 00000000..c09494df --- /dev/null +++ b/sysbox-mgr/main.go @@ -0,0 +1,315 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package main + +import ( + "fmt" + "os" + "os/signal" + "strings" + "syscall" + + "github.com/pkg/profile" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var ( + sysboxRunDir string = "/run/sysbox" + sysboxLibDirDefault string = "/var/lib/sysbox" + sysboxMgrPidFile string = sysboxRunDir + "/sysmgr.pid" + subidRangeSize uint64 = 65536 +) + +const ( + usage = `Sysbox manager daemon + +The Sysbox manager daemon's main job is to provide services to other +Sysbox components (e.g., sysbox-runc).` +) + +// Globals to be populated at build time during Makefile processing. 
+var ( + edition string // Sysbox Edition: CE or EE + version string // extracted from VERSION file + commitId string // latest sysbox-mgr's git commit-id + builtAt string // build time + builtBy string // build owner +) + +func main() { + app := cli.NewApp() + app.Name = "sysbox-mgr" + app.Usage = usage + app.Version = version + + var v []string + if version != "" { + v = append(v, version) + } + app.Version = strings.Join(v, "\n") + + app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "log, l", + Value: "", + Usage: "log file path or empty string for stderr output (default: \"\")", + }, + cli.StringFlag{ + Name: "log-level", + Value: "info", + Usage: "log categories to include (debug, info, warning, error, fatal)", + }, + cli.StringFlag{ + Name: "log-format", + Value: "text", + Usage: "log format; must be json or text (default = text)", + }, + cli.BoolTFlag{ + Name: "alias-dns", + Usage: "aliases the DNS IP inside the system container to ensure it never has a localhost address; required for system containers on user-defined Docker bridge networks (default = true)", + }, + cli.BoolFlag{ + Name: "cpu-profiling", + Usage: "enable cpu-profiling data collection", + Hidden: true, + }, + cli.BoolFlag{ + Name: "memory-profiling", + Usage: "enable memory-profiling data collection", + Hidden: true, + }, + cli.StringFlag{ + Name: "data-root", + Value: "/var/lib/sysbox", + Usage: "root directory for sysbox data store", + }, + cli.BoolFlag{ + Name: "disable-shiftfs", + Usage: "Disables Sysbox's use of the kernel's shiftfs module (present in Ubuntu/Debian); files may show with nobody:nogroup ownership inside the container; meant for testing. (default = false)", + }, + cli.BoolFlag{ + Name: "disable-shiftfs-on-fuse", + Usage: "Disables shiftfs on top of FUSE-based filesystems (which don't always work with shiftfs); FUSE-backed files mounted into the Sysbox container may show with nobody:nogroup ownership inside the container. 
(default = false)", + }, + cli.BoolFlag{ + Name: "disable-shiftfs-precheck", + Usage: "Disables Sysbox's preflight functional check of shiftfs; use this only if you want Sysbox to use shiftfs (e.g., kernel < 5.12) and you know it works properly (default = false).", + }, + cli.BoolFlag{ + Name: "disable-idmapped-mount", + Usage: "Disables Sysbox's use of the kernel's ID-mapped-mount feature; files may show with nobody:nogroup ownership inside the container; meant for testing (default = false)", + }, + cli.BoolFlag{ + Name: "disable-rootfs-cloning", + Usage: "Disables Sysbox's rootfs cloning feature (used for fast chown of the container's rootfs in hosts without shiftfs); this option will significantly slow down container startup time in hosts without shiftfs (default = false)", + }, + cli.BoolFlag{ + Name: "disable-ovfs-on-idmapped-mount", + Usage: "Disables ID-mapping of overlayfs (available in Linux kernel 5.19+); when set to true, forces Sysbox to use either shiftfs (if available on the host) or otherwise chown the container's rootfs, slowing container start/stop time; meant for testing (default = false)", + }, + cli.BoolFlag{ + Name: "disable-inner-image-preload", + Usage: "Disables the Sysbox feature that allows users to preload inner container images into system container images (e.g., via Docker commit or build); this makes container stop faster; running system container images that come preloaded with inner container images continue to work fine; (default = false)", + }, + cli.BoolFlag{ + Name: "ignore-sysfs-chown", + Usage: "Ignore chown of /sys inside all Sysbox containers; may be needed to run a few apps that chown /sys inside the container (e.g,. rpm). 
Causes Sysbox to trap the chown syscall inside the container, slowing it down (default = false).", + }, + cli.BoolFlag{ + Name: "allow-trusted-xattr", + Usage: "Allows the overlayfs trusted.overlay.opaque xattr to be set inside all Sysbox containers; needed when running Docker inside Sysbox on hosts with kernel < 5.11. Causes Sysbox to trap the *xattr syscalls inside the container, slowing it down (default = false).", + }, + cli.BoolFlag{ + Name: "honor-caps", + Usage: "Honor the container's process capabilities passed to Sysbox by the higher level container manager (e.g., Docker/containerd). When set to false, Sysbox always gives the container's root user full capabilities and other users no capabilities to mimic a VM-like environment. Note that the container's capabilities are isolated from the host via the Linux user-namespace. (default = false).", + }, + cli.BoolTFlag{ + Name: "syscont-mode", + Usage: "Causes Sysbox to run in \"system container\" mode. In this mode, it sets up the container to run system workloads (e.g., systemd, Docker, Kubernetes, etc.) seamlessly and securely. When set to false, Sysbox operates in \"regular container\" mode where it sets up the container strictly per its OCI spec (usually for microservices), with the exception of the Linux 'user' and 'cgroup' namespaces which Sysbox always enables for extra container isolation. (default = true)", + }, + cli.BoolFlag{ + Name: "relaxed-read-only", + Usage: "Allows Sysbox to create read-only containers while enabling read-write operations in certain mountpoints within the container. (default = false)", + }, + cli.BoolFlag{ + Name: "fsuid-map-fail-on-error", + Usage: "When set to true, fail to launch a container whenever filesystem uid-mapping (needed for files to show proper ownership inside the container's user-namespace) hits an error; when set to false, launch the container anyway (files may show up owned by nobody:nogroup) (default = false).", + }, + } + + // show-version specialization. 
+ cli.VersionPrinter = func(c *cli.Context) { + fmt.Printf("sysbox-mgr\n"+ + "\tedition: \t%s\n"+ + "\tversion: \t%s\n"+ + "\tcommit: \t%s\n"+ + "\tbuilt at: \t%s\n"+ + "\tbuilt by: \t%s\n", + edition, c.App.Version, commitId, builtAt, builtBy) + } + + app.Before = func(ctx *cli.Context) error { + if path := ctx.GlobalString("log"); path != "" { + f, err := os.OpenFile(path, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666) + if err != nil { + return err + } + logrus.SetOutput(f) + } else { + logrus.SetOutput(os.Stderr) + } + + if logFormat := ctx.GlobalString("log-format"); logFormat == "json" { + logrus.SetFormatter(&logrus.JSONFormatter{ + TimestampFormat: "2006-01-02 15:04:05", + }) + } else { + logrus.SetFormatter(&logrus.TextFormatter{ + TimestampFormat: "2006-01-02 15:04:05", + FullTimestamp: true, + }) + } + + // Set desired log-level. + if logLevel := ctx.GlobalString("log-level"); logLevel != "" { + switch logLevel { + case "debug": + logrus.SetLevel(logrus.DebugLevel) + case "info": + logrus.SetLevel(logrus.InfoLevel) + case "warning": + logrus.SetLevel(logrus.WarnLevel) + case "error": + logrus.SetLevel(logrus.ErrorLevel) + case "fatal": + logrus.SetLevel(logrus.FatalLevel) + default: + logrus.Fatalf("'%v' log-level option not recognized", logLevel) + } + } else { + // Set 'info' as our default log-level. + logrus.SetLevel(logrus.InfoLevel) + } + + return nil + } + + app.Action = func(ctx *cli.Context) error { + + logrus.Info("Starting ...") + + // If requested, launch cpu/mem profiling data collection. 
+ profile, err := runProfiler(ctx) + if err != nil { + return err + } + + mgr, err := newSysboxMgr(ctx) + if err != nil { + return fmt.Errorf("failed to create sysbox-mgr: %v", err) + } + + var signalChan = make(chan os.Signal, 1) + signal.Notify( + signalChan, + syscall.SIGHUP, + syscall.SIGINT, + syscall.SIGTERM, + syscall.SIGQUIT) + go signalHandler(signalChan, mgr, profile) + + logrus.Infof("Listening on %v", mgr.grpcServer.GetAddr()) + if err := mgr.Start(); err != nil { + return fmt.Errorf("failed to start sysbox-mgr: %v", err) + } + + mgr.Stop() + logrus.Info("Done.") + return nil + } + + if err := app.Run(os.Args); err != nil { + logrus.Fatal(err) + } +} + +// Run cpu / memory profiling collection. +func runProfiler(ctx *cli.Context) (interface{ Stop() }, error) { + + var prof interface{ Stop() } + + cpuProfOn := ctx.Bool("cpu-profiling") + memProfOn := ctx.Bool("memory-profiling") + + // Cpu and Memory profiling options seem to be mutually exclusive in pprof. + if cpuProfOn && memProfOn { + return nil, fmt.Errorf("Unsupported parameter combination: cpu and memory profiling") + } + + // Typical / non-profiling case. + if !(cpuProfOn || memProfOn) { + return nil, nil + } + + // Notice that 'NoShutdownHook' option is passed to profiler constructor to + // avoid this one reacting to 'sigterm' signal arrival. IOW, we want + // sysbox-mgr signal handler to be the one stopping all profiling tasks. + + if cpuProfOn { + prof = profile.Start( + profile.CPUProfile, + profile.ProfilePath("."), + profile.NoShutdownHook, + ) + logrus.Info("Initiated cpu-profiling data collection.") + } + + if memProfOn { + prof = profile.Start( + profile.MemProfile, + profile.ProfilePath("."), + profile.NoShutdownHook, + ) + logrus.Info("Initiated memory-profiling data collection.") + } + + return prof, nil +} + +// sysbox-mgr signal handler goroutine. 
+func signalHandler( + signalChan chan os.Signal, + mgr *SysboxMgr, + profile interface{ Stop() }) { + + s := <-signalChan + + logrus.Infof("Caught OS signal: %s", s) + + if err := mgr.Stop(); err != nil { + logrus.Warnf("Failed to terminate sysbox-mgr gracefully: %s", err) + } + + // Stop cpu/mem profiling tasks. + if profile != nil { + profile.Stop() + } + + logrus.Info("Exiting.") + os.Exit(0) +} diff --git a/sysbox-mgr/mgr.go b/sysbox-mgr/mgr.go new file mode 100644 index 00000000..772cdefd --- /dev/null +++ b/sysbox-mgr/mgr.go @@ -0,0 +1,1546 @@ +// +// Copyright 2019-2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package main + +import ( + "fmt" + "path" + "sync" + "time" + + systemd "github.com/coreos/go-systemd/daemon" + grpc "github.com/nestybox/sysbox-ipc/sysboxMgrGrpc" + ipcLib "github.com/nestybox/sysbox-ipc/sysboxMgrLib" + "github.com/nestybox/sysbox-libs/dockerUtils" + "github.com/nestybox/sysbox-libs/fileMonitor" + "github.com/nestybox/sysbox-libs/formatter" + "github.com/nestybox/sysbox-libs/idMap" + "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/nestybox/sysbox-libs/linuxUtils" + "github.com/nestybox/sysbox-libs/shiftfs" + libutils "github.com/nestybox/sysbox-libs/utils" + intf "github.com/nestybox/sysbox-mgr/intf" + "github.com/nestybox/sysbox-mgr/rootfsCloner" + "github.com/nestybox/sysbox-mgr/shiftfsMgr" + "github.com/opencontainers/runc/libcontainer/configs" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var sysboxLibDir string + +type containerState int + +const ( + started containerState = iota + stopped + restarted +) + +type mntPrepRevInfo struct { + path string + uidShifted bool + origUid uint32 + origGid uint32 + targetUid uint32 + targetGid uint32 +} + +type mountInfo struct { + kind ipcLib.MntKind + mounts []specs.Mount +} + +type containerInfo struct { + state containerState + rootfs string + mntPrepRev []mntPrepRevInfo + reqMntInfos []mountInfo + containerMnts []specs.Mount + shiftfsMarks []shiftfs.MountPoint + autoRemove bool + userns string + netns string + netnsInode uint64 + uidMappings []specs.LinuxIDMapping + gidMappings []specs.LinuxIDMapping + subidAllocated bool + rootfsCloned bool + origRootfs string // if rootfs was cloned, this is the original rootfs + rootfsUidShiftType idShiftUtils.IDShiftType + rootfsOnOvfs bool + rootfsOvfsUpper string + rootfsOvfsUpperChowned bool +} + +type mgrConfig struct { + aliasDns bool + shiftfsOk bool + shiftfsOnOverlayfsOk bool + idMapMountOk bool + overlayfsOnIDMapMountOk bool + noRootfsCloning bool + 
ignoreSysfsChown bool + allowTrustedXattr bool + honorCaps bool + syscontMode bool + fsuidMapFailOnErr bool + noInnerImgPreload bool + noShiftfsOnFuse bool + relaxedReadOnly bool +} + +type SysboxMgr struct { + mgrCfg mgrConfig + grpcServer *grpc.ServerStub + subidAlloc intf.SubidAlloc + dockerVolMgr intf.VolMgr + kubeletVolMgr intf.VolMgr + k0sVolMgr intf.VolMgr + k3sVolMgr intf.VolMgr + rke2VolMgr intf.VolMgr + buildkitVolMgr intf.VolMgr + containerdVolMgr intf.VolMgr + shiftfsMgr intf.ShiftfsMgr + rootfsCloner intf.RootfsCloner + hostDistro string + hostKernelHdrPath string + linuxHeaderMounts []specs.Mount + libModMounts []specs.Mount + // Tracks containers known to sysbox (cont id -> cont info) + contTable map[string]containerInfo + ctLock sync.Mutex + // Tracks container rootfs (cont rootfs -> cont id); used by the rootfs monitor + rootfsTable map[string]string + rtLock sync.Mutex + rootfsMonStop chan int + rootfsMon *fileMonitor.FileMon + exclMntTable *exclusiveMntTable + // tracks containers using the same netns (netns inode -> list of container ids) + netnsTable map[uint64][]string + ntLock sync.Mutex +} + +// newSysboxMgr creates an instance of the sysbox manager +func newSysboxMgr(ctx *cli.Context) (*SysboxMgr, error) { + var err error + + err = libutils.CheckPidFile("sysbox-mgr", sysboxMgrPidFile) + if err != nil { + return nil, err + } + + err = preFlightCheck() + if err != nil { + return nil, fmt.Errorf("preflight check failed: %s", err) + } + + sysboxLibDir = ctx.GlobalString("data-root") + if sysboxLibDir == "" { + sysboxLibDir = sysboxLibDirDefault + } + logrus.Infof("Sysbox data root: %s", sysboxLibDir) + + err = setupRunDir() + if err != nil { + return nil, fmt.Errorf("failed to setup the sysbox run dir: %v", err) + } + + err = setupWorkDirs() + if err != nil { + return nil, fmt.Errorf("failed to setup the sysbox work dirs: %v", err) + } + + subidAlloc, err := setupSubidAlloc(ctx) + if err != nil { + return nil, fmt.Errorf("failed to setup subid 
allocator: %v", err) + } + + syncVolToRootfs := !ctx.GlobalBool("disable-inner-image-preload") + + dockerVolMgr, err := setupDockerVolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup docker vol mgr: %v", err) + } + + kubeletVolMgr, err := setupKubeletVolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup kubelet vol mgr: %v", err) + } + + k0sVolMgr, err := setupK0sVolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup k0s vol mgr: %v", err) + } + + k3sVolMgr, err := setupK3sVolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup k3s vol mgr: %v", err) + } + + rke2VolMgr, err := setupRke2VolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup rke2 vol mgr: %v", err) + } + + buildkitVolMgr, err := setupBuildkitVolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup buildkit vol mgr: %v", err) + } + + containerdVolMgr, err := setupContainerdVolMgr(syncVolToRootfs) + if err != nil { + return nil, fmt.Errorf("failed to setup containerd vol mgr: %v", err) + } + + shiftfsMgr, err := shiftfsMgr.New(sysboxLibDir) + if err != nil { + return nil, fmt.Errorf("failed to setup shiftfs mgr: %v", err) + } + + rootfsCloner := rootfsCloner.New(sysboxLibDir) + if err != nil { + return nil, fmt.Errorf("failed to setup rootfs mgr: %v", err) + } + + hostDistro, err := linuxUtils.GetDistro() + if err != nil { + return nil, fmt.Errorf("failed to identify system's linux distribution: %v", err) + } + + hostKernelHdrPath, err := linuxUtils.GetLinuxHeaderPath(hostDistro) + if err != nil { + return nil, fmt.Errorf("failed to identify system's linux-header path: %v", err) + } + + linuxHeaderMounts, err := getLinuxHeaderMounts(hostKernelHdrPath) + if err != nil { + return nil, fmt.Errorf("failed to compute linux header mounts: %v", err) + } + + libModMounts, err := getLibModMounts() + if err != nil { + return nil, 
fmt.Errorf("failed to compute kernel-module mounts: %v", err) + } + + idMapMountOk := false + ovfsOnIDMapMountOk := false + + if !ctx.GlobalBool("disable-idmapped-mount") { + idMapMountOk, ovfsOnIDMapMountOk, err = checkIDMapMountSupport(ctx) + if err != nil { + return nil, fmt.Errorf("ID-mapping check failed: %v", err) + } + } + + if ctx.GlobalBool("disable-ovfs-on-idmapped-mount") { + ovfsOnIDMapMountOk = false + } + + shiftfsModPresent := false + shiftfsOk := false + shiftfsOnOvfsOk := false + + if !ctx.GlobalBool("disable-shiftfs") { + shiftfsModPresent, err = linuxUtils.KernelModSupported("shiftfs") + if err != nil { + return nil, fmt.Errorf("shiftfs kernel module check failed: %v", err) + } + + if shiftfsModPresent { + if ctx.GlobalBool("disable-shiftfs-precheck") { + shiftfsOk = shiftfsModPresent + shiftfsOnOvfsOk = shiftfsModPresent + } else { + shiftfsOk, shiftfsOnOvfsOk, err = checkShiftfsSupport(ctx) + if err != nil { + return nil, fmt.Errorf("shiftfs check failed: %v", err) + } + } + } + } + + mgrCfg := mgrConfig{ + aliasDns: ctx.GlobalBoolT("alias-dns"), + shiftfsOk: shiftfsOk, + shiftfsOnOverlayfsOk: shiftfsOnOvfsOk, + idMapMountOk: idMapMountOk, + overlayfsOnIDMapMountOk: ovfsOnIDMapMountOk, + noRootfsCloning: ctx.GlobalBool("disable-rootfs-cloning"), + ignoreSysfsChown: ctx.GlobalBool("ignore-sysfs-chown"), + allowTrustedXattr: ctx.GlobalBool("allow-trusted-xattr"), + honorCaps: ctx.GlobalBool("honor-caps"), + syscontMode: ctx.GlobalBoolT("syscont-mode"), + relaxedReadOnly: ctx.GlobalBool("relaxed-read-only"), + fsuidMapFailOnErr: ctx.GlobalBool("fsuid-map-fail-on-error"), + noInnerImgPreload: !syncVolToRootfs, + noShiftfsOnFuse: ctx.GlobalBool("disable-shiftfs-on-fuse"), + } + + if !mgrCfg.aliasDns { + logrus.Info("Sys container DNS aliasing disabled.") + } + + if ctx.GlobalBool("disable-shiftfs") { + logrus.Info("Use of shiftfs disabled.") + } else { + logrus.Infof("Shiftfs module found in kernel: %s", ifThenElse(shiftfsModPresent, "yes", "no")) + 
if !ctx.GlobalBool("disable-shiftfs-precheck") { + logrus.Infof("Shiftfs works properly: %s", ifThenElse(mgrCfg.shiftfsOk, "yes", "no")) + logrus.Infof("Shiftfs-on-overlayfs works properly: %s", ifThenElse(mgrCfg.shiftfsOnOverlayfsOk, "yes", "no")) + } + } + + if ctx.GlobalBool("disable-idmapped-mount") { + logrus.Info("Use of ID-mapped mounts disabled.") + } else { + logrus.Infof("ID-mapped mounts supported by kernel: %s", ifThenElse(mgrCfg.idMapMountOk, "yes", "no")) + } + + if ctx.GlobalBool("disable-ovfs-on-idmapped-mount") { + logrus.Info("Use of overlayfs on ID-mapped mounts disabled.") + } else { + logrus.Infof("Overlayfs on ID-mapped mounts supported by kernel: %s", ifThenElse(mgrCfg.overlayfsOnIDMapMountOk, "yes", "no")) + } + + if mgrCfg.noRootfsCloning { + logrus.Info("Rootfs cloning disabled.") + } + + if mgrCfg.ignoreSysfsChown { + logrus.Info("Ignoring chown of /sys inside container.") + } + + if mgrCfg.allowTrustedXattr { + logrus.Info("Allowing trusted.overlay.opaque inside container.") + } + + if mgrCfg.honorCaps { + logrus.Info("Honoring process capabilities in OCI spec (--honor-caps).") + } + + if mgrCfg.syscontMode { + logrus.Info("Operating in system container mode.") + } else { + logrus.Info("Operating in regular container mode.") + } + + if mgrCfg.relaxedReadOnly { + logrus.Info("Relaxed read-only mode enabled.") + } else { + logrus.Info("Relaxed read-only mode disabled.") + } + + if mgrCfg.fsuidMapFailOnErr { + logrus.Info("fsuid-map-fail-on-error = true.") + } + + if mgrCfg.noInnerImgPreload { + logrus.Info("Inner container image preloading disabled.") + } else { + logrus.Info("Inner container image preloading enabled.") + } + + mgr := &SysboxMgr{ + mgrCfg: mgrCfg, + subidAlloc: subidAlloc, + dockerVolMgr: dockerVolMgr, + kubeletVolMgr: kubeletVolMgr, + k0sVolMgr: k0sVolMgr, + k3sVolMgr: k3sVolMgr, + rke2VolMgr: rke2VolMgr, + buildkitVolMgr: buildkitVolMgr, + containerdVolMgr: containerdVolMgr, + shiftfsMgr: shiftfsMgr, + rootfsCloner: 
rootfsCloner, + hostDistro: hostDistro, + hostKernelHdrPath: hostKernelHdrPath, + linuxHeaderMounts: linuxHeaderMounts, + libModMounts: libModMounts, + contTable: make(map[string]containerInfo), + rootfsTable: make(map[string]string), + rootfsMonStop: make(chan int), + netnsTable: make(map[uint64][]string), + exclMntTable: newExclusiveMntTable(), + } + + cb := &grpc.ServerCallbacks{ + Register: mgr.register, + Update: mgr.update, + Unregister: mgr.unregister, + SubidAlloc: mgr.allocSubid, + ReqMounts: mgr.reqMounts, + PrepMounts: mgr.prepMounts, + ReqShiftfsMark: mgr.reqShiftfsMark, + ReqFsState: mgr.reqFsState, + CloneRootfs: mgr.cloneRootfs, + ChownClonedRootfs: mgr.chownClonedRootfs, + RevertClonedRootfsChown: mgr.revertClonedRootfsChown, + Pause: mgr.pause, + Resume: mgr.resume, + } + + mgr.grpcServer = grpc.NewServerStub(cb) + + return mgr, nil +} + +func (mgr *SysboxMgr) Start() error { + + // setup the container rootfs monitor (detects container removal) + cfg := &fileMonitor.Cfg{ + EventBufSize: 10, + PollInterval: 50 * time.Millisecond, + } + mon, err := fileMonitor.New(cfg) + if err != nil { + return fmt.Errorf("failed to setup rootfs monitor: %v", err) + } + mgr.rootfsMon = mon + + // start the rootfs monitor thread (listens for rootfsMon events) + go mgr.rootfsMonitor() + + systemd.SdNotify(false, systemd.SdNotifyReady) + + err = libutils.CreatePidFile("sysbox-mgr", sysboxMgrPidFile) + if err != nil { + return fmt.Errorf("failed to create sysmgr.pid file: %s", err) + } + + logrus.Info("Ready ...") + + // listen for grpc connections + return mgr.grpcServer.Init() +} + +func (mgr *SysboxMgr) Stop() error { + + logrus.Info("Stopping (gracefully) ...") + + systemd.SdNotify(false, systemd.SdNotifyStopping) + + mgr.ctLock.Lock() + if len(mgr.contTable) > 0 { + logrus.Warn("The following containers are active and will stop operating properly:") + for id := range mgr.contTable { + logrus.Warnf("container id: %s", formatter.ContainerID{id}) + } + } + 
mgr.ctLock.Unlock() + + mgr.rootfsMonStop <- 1 + mgr.rootfsMon.Close() + + mgr.dockerVolMgr.SyncOutAndDestroyAll() + mgr.kubeletVolMgr.SyncOutAndDestroyAll() + mgr.k0sVolMgr.SyncOutAndDestroyAll() + mgr.k3sVolMgr.SyncOutAndDestroyAll() + mgr.rke2VolMgr.SyncOutAndDestroyAll() + mgr.buildkitVolMgr.SyncOutAndDestroyAll() + mgr.containerdVolMgr.SyncOutAndDestroyAll() + mgr.shiftfsMgr.UnmarkAll() + + // Note: this will cause the container's cloned rootfs to be removed when + // Sysbox is stopped, thus loosing the container's runtime data. In the + // future we may want to make this persistent across Sysbox stop-restart + // events. + mgr.rootfsCloner.RemoveAll() + + if err := cleanupWorkDirs(); err != nil { + logrus.Warnf("failed to cleanup work dirs: %v", err) + } + + if err := libutils.DestroyPidFile(sysboxMgrPidFile); err != nil { + logrus.Warnf("failed to destroy sysbox-mgr pid file: %v", err) + } + + logrus.Info("Stopped.") + + return nil +} + +// Registers a container with sysbox-mgr +func (mgr *SysboxMgr) register(regInfo *ipcLib.RegistrationInfo) (*ipcLib.ContainerConfig, error) { + + id := regInfo.Id + rootfs := regInfo.Rootfs + userns := regInfo.Userns + netns := regInfo.Netns + uidMappings := regInfo.UidMappings + gidMappings := regInfo.GidMappings + + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + newContainer := !found + + if newContainer { + // new container + info = containerInfo{ + state: started, + mntPrepRev: []mntPrepRevInfo{}, + shiftfsMarks: []shiftfs.MountPoint{}, + } + } else { + // re-started container + if info.state != stopped { + mgr.ctLock.Unlock() + return nil, fmt.Errorf("redundant container registration for container %s", + formatter.ContainerID{id}) + } + info.state = restarted + } + + if !info.rootfsCloned { + info.rootfs = rootfs + info.origRootfs = rootfs + } + + rootfsOnOvfs, err := isRootfsOnOverlayfs(rootfs) + if err != nil { + mgr.ctLock.Unlock() + return nil, err + } + + info.rootfsOnOvfs = rootfsOnOvfs + info.netns = 
netns + info.userns = userns + + if !info.subidAllocated { + info.uidMappings = uidMappings + info.gidMappings = gidMappings + } + + // Track the container's net-ns, so we can later determine if multiple sys + // containers are sharing a net-ns (which implies they share the user-ns too). + var sameNetns []string + + if netns != "" { + netnsInode, err := getInode(netns) + if err != nil { + mgr.ctLock.Unlock() + return nil, fmt.Errorf("unable to get inode for netns %s: %s", netns, err) + } + + sameNetns, err = mgr.trackNetns(id, netnsInode) + if err != nil { + mgr.ctLock.Unlock() + return nil, fmt.Errorf("failed to track netns for container %s: %s", + formatter.ContainerID{id}, err) + } + + info.netnsInode = netnsInode + } + + // If this container's netns is shared with other containers, it's userns + // (and associated ID mappings) must be shared too. + if len(sameNetns) > 1 && userns == "" { + otherContSameNetnsInfo, ok := mgr.contTable[sameNetns[0]] + if !ok { + mgr.ctLock.Unlock() + return nil, + fmt.Errorf("container %s shares net-ns with other containers, but unable to find info for those.", + formatter.ContainerID{id}) + } + info.userns = otherContSameNetnsInfo.userns + info.uidMappings = otherContSameNetnsInfo.uidMappings + info.gidMappings = otherContSameNetnsInfo.gidMappings + } + + mgr.contTable[id] = info + mgr.ctLock.Unlock() + + if info.state == restarted { + if info.origRootfs != "" { + // remove the container's rootfs watch + origRootfs := sanitizeRootfs(id, info.origRootfs) + mgr.rootfsMon.Remove(origRootfs) + mgr.rtLock.Lock() + delete(mgr.rootfsTable, origRootfs) + mgr.rtLock.Unlock() + logrus.Debugf("removed fs watch on %s", origRootfs) + } + logrus.Infof("registered container %s", formatter.ContainerID{id}) + } else { + logrus.Infof("registered new container %s", formatter.ContainerID{id}) + } + + containerCfg := &ipcLib.ContainerConfig{ + AliasDns: mgr.mgrCfg.aliasDns, + ShiftfsOk: mgr.mgrCfg.shiftfsOk, + ShiftfsOnOverlayfsOk: 
mgr.mgrCfg.shiftfsOnOverlayfsOk, + IDMapMountOk: mgr.mgrCfg.idMapMountOk, + OverlayfsOnIDMapMountOk: mgr.mgrCfg.overlayfsOnIDMapMountOk, + NoRootfsCloning: mgr.mgrCfg.noRootfsCloning, + IgnoreSysfsChown: mgr.mgrCfg.ignoreSysfsChown, + AllowTrustedXattr: mgr.mgrCfg.allowTrustedXattr, + HonorCaps: mgr.mgrCfg.honorCaps, + SyscontMode: mgr.mgrCfg.syscontMode, + FsuidMapFailOnErr: mgr.mgrCfg.fsuidMapFailOnErr, + Userns: info.userns, + UidMappings: info.uidMappings, + GidMappings: info.gidMappings, + RootfsUidShiftType: info.rootfsUidShiftType, + NoShiftfsOnFuse: mgr.mgrCfg.noShiftfsOnFuse, + RelaxedReadOnly: mgr.mgrCfg.relaxedReadOnly, + } + + return containerCfg, nil +} + +// Updates info for a given container +func (mgr *SysboxMgr) update(updateInfo *ipcLib.UpdateInfo) error { + + id := updateInfo.Id + userns := updateInfo.Userns + netns := updateInfo.Netns + uidMappings := updateInfo.UidMappings + gidMappings := updateInfo.GidMappings + rootfsUidShiftType := updateInfo.RootfsUidShiftType + + mgr.ctLock.Lock() + defer mgr.ctLock.Unlock() + + info, found := mgr.contTable[id] + if !found { + return fmt.Errorf("can't update container %s; not found in container table", + formatter.ContainerID{id}) + } + + // If the container's rootfs is on overlayfs and it's ID-mapped, then + // sysbox-runc will chown the upper layer as it can't be ID-mapped (overlayfs + // does not support it). Track this fact so we can revert that chown when + // the container is stopped or paused. 
+ if info.rootfsOnOvfs && rootfsUidShiftType == idShiftUtils.IDMappedMount { + rootfsOvfsUpper, err := getRootfsOverlayUpperLayer(info.rootfs) + if err != nil { + return nil + } + info.rootfsOvfsUpper = rootfsOvfsUpper + info.rootfsOvfsUpperChowned = true + } + + if info.netns == "" && netns != "" { + netnsInode, err := getInode(netns) + if err != nil { + return fmt.Errorf("can't update container %s: unable to get inode for netns %s: %s", + formatter.ContainerID{id}, netns, err) + } + + if _, err := mgr.trackNetns(id, netnsInode); err != nil { + return fmt.Errorf("can't update container %s: failed to track netns: %s", + formatter.ContainerID{id}, err) + } + info.netns = netns + info.netnsInode = netnsInode + } + + if info.userns == "" && userns != "" { + info.userns = userns + } + + if len(info.uidMappings) == 0 && len(uidMappings) > 0 { + info.uidMappings = uidMappings + } + + if len(info.gidMappings) == 0 && len(gidMappings) > 0 { + info.gidMappings = gidMappings + } + + info.rootfsUidShiftType = rootfsUidShiftType + + mgr.contTable[id] = info + return nil +} + +// Unregisters a container with sysbox-mgr +func (mgr *SysboxMgr) unregister(id string) error { + var err error + + // update container state + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return fmt.Errorf("can't unregister container %s; not found in container table", + formatter.ContainerID{id}) + } + if info.state == stopped { + return fmt.Errorf("redundant container unregistration for container %s", + formatter.ContainerID{id}) + } + info.state = stopped + + if len(info.shiftfsMarks) != 0 { + if err = mgr.shiftfsMgr.Unmark(id, info.shiftfsMarks); err != nil { + logrus.Warnf("failed to remove shiftfs marks for container %s: %s", + formatter.ContainerID{id}, err) + } + info.shiftfsMarks = []shiftfs.MountPoint{} + } + + // If the rootfs is ID-mapped and on overlayfs, then chown the upper dir from + // [userns-host-ID -> 0] when the container stops (i.e., 
revert uid:gid to + // it's original). This way snapshots of the container rootfs (e.g., docker + // commit or docker build) will capture the correct uid:gid. + // + // TODO: before checking for info.autoRemove, we should ensure that the + // autoRemoveCheck() goroutine has run to completion. Otherwise the + // value of autoRemove is not guaranteed to be correct. + + if info.rootfsOvfsUpperChowned && !info.autoRemove { + uidOffset := -int32(info.uidMappings[0].HostID) + gidOffset := -int32(info.gidMappings[0].HostID) + + logrus.Infof("unregister %s: chown rootfs overlayfs upper layer at %s (%d -> %d)", + formatter.ContainerID{id}, info.rootfsOvfsUpper, info.uidMappings[0].HostID, 0) + + if err := idShiftUtils.ShiftIdsWithChown(info.rootfsOvfsUpper, uidOffset, gidOffset); err != nil { + return err + } + info.rootfsOvfsUpperChowned = false + } + + // revert mount prep actions + for _, revInfo := range info.mntPrepRev { + if revInfo.uidShifted { + + uidOffset := int32(revInfo.origUid) - int32(revInfo.targetUid) + gidOffset := int32(revInfo.origGid) - int32(revInfo.targetGid) + + logrus.Infof("reverting uid-shift on %s for %s (%d -> %d)", revInfo.path, formatter.ContainerID{id}, revInfo.targetUid, revInfo.origUid) + + if err = idShiftUtils.ShiftIdsWithChown(revInfo.path, uidOffset, gidOffset); err != nil { + logrus.Warnf("failed to revert uid-shift of mount source at %s: %s", revInfo.path, err) + } + + logrus.Infof("done reverting uid-shift on %s for %s", revInfo.path, formatter.ContainerID{id}) + } + + mgr.exclMntTable.remove(revInfo.path, id) + } + info.mntPrepRev = []mntPrepRevInfo{} + + // update the netns sharing table + // + // note: we don't do error checking because this can fail if the netns is not + // yet tracked for the container (e.g., if a container is registered and + // then unregistered because the container failed to start for some reason). 
+ mgr.untrackNetns(id, info.netnsInode) + + // ns tracking info is reset for new or restarted containers + info.userns = "" + info.netns = "" + info.netnsInode = 0 + + // uid mappings for the container are also reset, except if they were + // allocated by sysbox-mgr (those are kept across container restarts). + if !info.subidAllocated { + info.uidMappings = nil + info.gidMappings = nil + } + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + + // Request the volume managers to copy their contents to the container's rootfs. + if !info.autoRemove { + if err := mgr.volSyncOut(id, info); err != nil { + logrus.Warnf("sync-out for container %s failed: %v", + formatter.ContainerID{id}, err) + } + } + + // Notify rootfs cloner that container has stopped + if info.rootfsCloned { + if err := mgr.rootfsCloner.ContainerStopped(id); err != nil { + return err + } + } + + // setup a rootfs watch (allows us to get notified when the container's rootfs is removed) + if info.origRootfs != "" { + origRootfs := sanitizeRootfs(id, info.origRootfs) + mgr.rtLock.Lock() + mgr.rootfsTable[origRootfs] = id + mgr.rootfsMon.Add(origRootfs) + mgr.rtLock.Unlock() + logrus.Debugf("added fs watch on %s", origRootfs) + } + + logrus.Infof("unregistered container %s", formatter.ContainerID{id}) + return nil +} + +func (mgr *SysboxMgr) volSyncOut(id string, info containerInfo) error { + var err, err2 error + failedVols := []string{} + + for _, mnt := range info.reqMntInfos { + switch mnt.kind { + case ipcLib.MntVarLibDocker: + err = mgr.dockerVolMgr.SyncOut(id) + case ipcLib.MntVarLibKubelet: + err = mgr.kubeletVolMgr.SyncOut(id) + case ipcLib.MntVarLibK0s: + err = mgr.k0sVolMgr.SyncOut(id) + case ipcLib.MntVarLibRancherK3s: + err = mgr.k3sVolMgr.SyncOut(id) + case ipcLib.MntVarLibRancherRke2: + err = mgr.rke2VolMgr.SyncOut(id) + case ipcLib.MntVarLibBuildkit: + err = mgr.buildkitVolMgr.SyncOut(id) + case ipcLib.MntVarLibContainerdOvfs: + err = mgr.containerdVolMgr.SyncOut(id) + } 
+ + if err != nil { + failedVols = append(failedVols, mnt.kind.String()) + err2 = err + } + } + + if len(failedVols) > 0 { + return fmt.Errorf("sync-out for volume backing %s: %v", failedVols, err2) + } + + return nil +} + +// rootfs monitor thread: checks for rootfs removal event and removes container state. +func (mgr *SysboxMgr) rootfsMonitor() { + logrus.Debugf("rootfsMon starting ...") + for { + select { + case events := <-mgr.rootfsMon.Events(): + for _, e := range events { + rootfs := e.Filename + if e.Err != nil { + logrus.Warnf("rootfsMon: container rootfs watch error on %s", rootfs) + continue + } + mgr.rtLock.Lock() + id, found := mgr.rootfsTable[rootfs] + if !found { + logrus.Warnf("rootfsMon: event on unknown container rootfs %s", rootfs) + mgr.rtLock.Unlock() + continue + } + logrus.Debugf("rootfsMon: detected removal of container rootfs %s", rootfs) + delete(mgr.rootfsTable, rootfs) + mgr.rtLock.Unlock() + mgr.removeCont(id) + } + + case <-mgr.rootfsMonStop: + logrus.Debugf("rootfsMon exiting ...") + return + } + } +} + +// removes all resources associated with a container +func (mgr *SysboxMgr) removeCont(id string) { + + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + if !found { + mgr.ctLock.Unlock() + return + } + delete(mgr.contTable, id) + mgr.ctLock.Unlock() + + for _, mnt := range info.reqMntInfos { + var err error + + switch mnt.kind { + + case ipcLib.MntVarLibDocker: + err = mgr.dockerVolMgr.DestroyVol(id) + + case ipcLib.MntVarLibKubelet: + err = mgr.kubeletVolMgr.DestroyVol(id) + + case ipcLib.MntVarLibK0s: + err = mgr.k0sVolMgr.DestroyVol(id) + + case ipcLib.MntVarLibRancherK3s: + err = mgr.k3sVolMgr.DestroyVol(id) + + case ipcLib.MntVarLibRancherRke2: + err = mgr.rke2VolMgr.DestroyVol(id) + + case ipcLib.MntVarLibBuildkit: + err = mgr.buildkitVolMgr.DestroyVol(id) + + case ipcLib.MntVarLibContainerdOvfs: + err = mgr.containerdVolMgr.DestroyVol(id) + + } + if err != nil { + logrus.Errorf("rootfsMon: failed to destroy volume 
backing %s for container %s: %s", + mnt.kind, formatter.ContainerID{id}, err) + } + } + + if info.subidAllocated { + if err := mgr.subidAlloc.Free(id); err != nil { + logrus.Errorf("rootfsMon: failed to free uid(gid) for container %s: %s", + formatter.ContainerID{id}, err) + } + } + + if info.rootfsCloned { + if err := mgr.rootfsCloner.RemoveClone(id); err != nil { + logrus.Warnf("failed to unbind cloned rootfs for container %s: %s", + formatter.ContainerID{id}, err) + } + } + + logrus.Infof("released resources for container %s", + formatter.ContainerID{id}) +} + +func (mgr *SysboxMgr) reqMounts(id string, rootfsUidShiftType idShiftUtils.IDShiftType, reqList []ipcLib.MountReqInfo) ([]specs.Mount, error) { + + var ( + volChownOnSync bool + volUid, volGid uint32 + ) + + // get container info + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return nil, fmt.Errorf("container %s is not registered", + formatter.ContainerID{id}) + } + + // if this is a stopped container that is being re-started, reuse its prior mounts + if info.state == restarted { + return info.containerMnts, nil + } + + // Setup Sysbox's implicit container mounts. 
The mounts may need chowning + // according to the following rules: + // + // Rootfs Container Rootfs Owner Sysbox Special Sync-in Sync-out + // ID-shift (Stopped) (Running) Mount Owner Chown Chown + // ------------------------------------------------------------------------------- + // ID-mapping root:root root:root root:root None None + // Shiftfs root:root root:root uid:gid root->uid uid->root + // Chown root:root uid:gid uid:gid root->uid uid->root + // No-shift uid:gid uid:gid uid:gid None None + + switch rootfsUidShiftType { + + case idShiftUtils.IDMappedMount: + volChownOnSync = false + volUid = 0 + volGid = 0 + + case idShiftUtils.Shiftfs: + volChownOnSync = true + volUid = info.uidMappings[0].HostID + volGid = info.gidMappings[0].HostID + + case idShiftUtils.Chown: + volChownOnSync = true + volUid = info.uidMappings[0].HostID + volGid = info.gidMappings[0].HostID + + case idShiftUtils.NoShift: + volChownOnSync = false + volUid = info.uidMappings[0].HostID + volGid = info.gidMappings[0].HostID + + default: + return nil, fmt.Errorf("unexpected rootfs ID shift type: %v", rootfsUidShiftType) + } + + containerMnts := []specs.Mount{} + reqMntInfos := []mountInfo{} + rootfs := info.rootfs + + for _, req := range reqList { + var err error + m := []specs.Mount{} + + switch req.Kind { + + case ipcLib.MntVarLibDocker: + m, err = mgr.dockerVolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0700) + + case ipcLib.MntVarLibKubelet: + m, err = mgr.kubeletVolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0755) + + case ipcLib.MntVarLibK0s: + m, err = mgr.k0sVolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0755) + + case ipcLib.MntVarLibRancherK3s: + m, err = mgr.k3sVolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0755) + + case ipcLib.MntVarLibRancherRke2: + m, err = mgr.rke2VolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0755) + + case ipcLib.MntVarLibBuildkit: + m, 
err = mgr.buildkitVolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0755) + + case ipcLib.MntVarLibContainerdOvfs: + m, err = mgr.containerdVolMgr.CreateVol(id, rootfs, req.Dest, volUid, volGid, volChownOnSync, 0700) + + default: + err = fmt.Errorf("invalid mount request type: %s", req.Kind) + } + + if err != nil { + return nil, fmt.Errorf("failed to setup mounts backing %s for container %s: %s", req.Dest, + formatter.ContainerID{id}, err) + } + + reqMntInfos = append(reqMntInfos, mountInfo{kind: req.Kind, mounts: m}) + containerMnts = append(containerMnts, m...) + } + + // Add the linux kernel header mounts to the sys container. This is needed to + // build or run apps that interact with the Linux kernel directly within a + // sys container. Note that there is no need to track mntInfo for these since + // we are not backing these with sysbox-mgr data stores. + containerMnts = append(containerMnts, mgr.linuxHeaderMounts...) + + // Add the linux /lib/modules/ mounts to the sys + // container. This allows system container processes to verify the + // presence of modules via modprobe. System apps such as Docker and + // K8s do this. Note that this does not imply module + // loading/unloading is supported in a system container (it's + // not). It merely lets processes check if a module is loaded. + containerMnts = append(containerMnts, mgr.libModMounts...) + + if len(reqMntInfos) > 0 { + info.reqMntInfos = reqMntInfos + info.containerMnts = containerMnts + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + } + + // Dispatch a thread that checks if the container will be auto-removed after it stops + go mgr.autoRemoveCheck(id) + + return containerMnts, nil +} + +// autoRemoveCheck finds out (best effort) if the container will be automatically +// removed after being stopped. 
This allows us to skip copying back the contents +// of the sysbox-mgr volumes to the container's rootfs when the container is stopped +// (such a copy would not make sense since the containers rootfs will be destroyed +// anyway). +func (mgr *SysboxMgr) autoRemoveCheck(id string) { + + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + if !found { + mgr.ctLock.Unlock() + return + } + mgr.ctLock.Unlock() + + logrus.Debugf("autoRemoveCheck: Docker query start for %s", + formatter.ContainerID{id}) + + docker, err := dockerUtils.DockerConnect() + if err != nil { + logrus.Debugf("autoRemoveCheck: Docker connection failed for %s: %s", + formatter.ContainerID{id}, err) + return + } + defer docker.Disconnect() + + ci, err := docker.ContainerGetInfo(id) + if err != nil { + logrus.Debugf("autoRemoveCheck: Docker query for %s failed: %s", + formatter.ContainerID{id}, err) + return + } + + mgr.ctLock.Lock() + info, found = mgr.contTable[id] + if !found { + mgr.ctLock.Unlock() + return + } + + info.autoRemove = ci.AutoRemove + mgr.contTable[id] = info + mgr.ctLock.Unlock() + + logrus.Debugf("autoRemoveCheck: done for %s (autoRemove = %v)", + formatter.ContainerID{id}, info.autoRemove) +} + +func (mgr *SysboxMgr) prepMounts(id string, uid, gid uint32, prepList []ipcLib.MountPrepInfo) (err error) { + + logrus.Debugf("preparing mounts for %s: %+v", + formatter.ContainerID{id}, prepList) + + // get container info + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return fmt.Errorf("container %s is not registered", + formatter.ContainerID{id}) + } + + for _, prepInfo := range prepList { + src := prepInfo.Source + + // Exclusive mounts are mounts that should be mounted in one sys container at a + // given time; it's OK if it's mounted in multiple containers, as long as only one + // container uses it. If the mount is exclusive and another sys container has the + // same mount source, exclMntTable.add() will generate a warning. 
+ exclMountInUse := false + if prepInfo.Exclusive { + exclMountInUse = mgr.exclMntTable.add(src, id) + defer func() { + if err != nil { + mgr.exclMntTable.remove(src, id) + } + }() + } + + // If the mount can be ID-mapped, nothing else to do + if mgr.mgrCfg.overlayfsOnIDMapMountOk { + useIDMap, err := idMap.IDMapMountSupportedOnPath(src) + if err != nil { + return err + } + if useIDMap { + continue + } + } + + // The mount can't be ID-mapped, we may need to chown it; check if the + // mount source has ownership matching that of the container's root + // user. If not, chown it the mount source accordingly. Skip this if the + // mount is already in use by another container (to avoid messing up the + // ownership of the mount). + needUidShift, origUid, origGid, err := mntSrcUidShiftNeeded(src, uid, gid) + if err != nil { + return fmt.Errorf("failed to check mount source ownership: %s", err) + } + + if needUidShift { + if !exclMountInUse { + // Offset may be positive or negative + uidOffset := int32(uid) - int32(origUid) + gidOffset := int32(gid) - int32(origGid) + + logrus.Infof("shifting uids at %s for %s (%d -> %d)", src, formatter.ContainerID{id}, origUid, uid) + + if err = idShiftUtils.ShiftIdsWithChown(src, uidOffset, gidOffset); err != nil { + return fmt.Errorf("failed to shift uids via chown for mount source at %s: %s", src, err) + } + + logrus.Infof("done shifting uids at %s for %s", src, formatter.ContainerID{id}) + } else { + logrus.Infof("skip shifting uids at %s for %s (mount is in use by another container)", src, formatter.ContainerID{id}) + } + } + + // store the prep info so we can revert it when the container is stopped + revInfo := mntPrepRevInfo{ + path: src, + uidShifted: needUidShift, + origUid: origUid, + origGid: origGid, + targetUid: uid, + targetGid: gid, + } + + info.mntPrepRev = append(info.mntPrepRev, revInfo) + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + } + + logrus.Debugf("done preparing mounts for %s", 
formatter.ContainerID{id}) + + return nil +} + +func (mgr *SysboxMgr) allocSubid(id string, size uint64) (uint32, uint32, error) { + + // get container info + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return 0, 0, fmt.Errorf("container %s is not registered", + formatter.ContainerID{id}) + } + + // If we are being asked to allocate ID mappings for a new container, do it. + // For restarted containers, we keep the mappings we had prior to the + // container being stopped. + if !info.subidAllocated { + + uid, gid, err := mgr.subidAlloc.Alloc(id, size) + if err != nil { + return uid, gid, fmt.Errorf("failed to allocate uid(gid) for %s: %s", + formatter.ContainerID{id}, err) + } + + uidMapping := specs.LinuxIDMapping{ + ContainerID: 0, + HostID: uid, + Size: uint32(size), + } + + gidMapping := specs.LinuxIDMapping{ + ContainerID: 0, + HostID: gid, + Size: uint32(size), + } + + info.uidMappings = append(info.uidMappings, uidMapping) + info.gidMappings = append(info.gidMappings, gidMapping) + info.subidAllocated = true + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + } + + return info.uidMappings[0].HostID, info.gidMappings[0].HostID, nil +} + +func (mgr *SysboxMgr) reqShiftfsMark(id string, mounts []shiftfs.MountPoint) ([]shiftfs.MountPoint, error) { + + // get container info + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return nil, fmt.Errorf("container %s is not registered", formatter.ContainerID{id}) + } + + if len(info.shiftfsMarks) == 0 { + markpoints, err := mgr.shiftfsMgr.Mark(id, mounts, true) + if err != nil { + return nil, err + } + + info.shiftfsMarks = markpoints + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + } + + return info.shiftfsMarks, nil +} + +func (mgr *SysboxMgr) reqFsState(id, rootfs string) ([]configs.FsEntry, error) { + + // get container info + mgr.ctLock.Lock() + _, found := mgr.contTable[id] + 
mgr.ctLock.Unlock() + + if !found { + return nil, fmt.Errorf("container %s is not registered", formatter.ContainerID{id}) + } + + if len(mgr.linuxHeaderMounts) == 0 { + return nil, nil + } + + // In certain scenarios a soft-link will be required to properly resolve the + // dependencies present in "/usr/src" and "/lib/modules/kernel" paths. + fsEntries, err := mgr.getKernelHeaderSoftlink(rootfs) + if err != nil { + return nil, fmt.Errorf("failed to obtain kernel-headers softlink state for container %s: %s", + formatter.ContainerID{id}, err) + } + + return fsEntries, nil +} + +func (mgr *SysboxMgr) getKernelHeaderSoftlink(rootfs string) ([]configs.FsEntry, error) { + + // Obtain linux distro within the passed rootfs path. Notice that we are + // not returning any received error to ensure we complete container's + // registration in all scenarios (i.e. rootfs may not include a full linux + // env -- it may miss os-release file). + cntrDistro, err := linuxUtils.GetDistroPath(rootfs) + if err != nil { + return nil, nil + } + + // No need to proceed if host and container are running the same distro. + if cntrDistro == mgr.hostDistro { + return nil, nil + } + + // Obtain container's kernel-header path. + cntrKernelPath, err := linuxUtils.GetLinuxHeaderPath(cntrDistro) + if err != nil { + return nil, fmt.Errorf("failed to identify kernel-header path of container's rootfs %s: %v", + rootfs, err) + } + + // Return if there's no kernelPath mismatch between host and container. + if cntrKernelPath == mgr.hostKernelHdrPath { + return nil, nil + } + + var fsEntries []configs.FsEntry + + // In certain distros, such as 'alpine', the kernel header path (typically + // "/usr/src") may not exist, so create an associated fsEntry to ensure + // that the kernel softlink addition (below) can be properly carried out. + fsEntryParents := configs.NewFsEntry( + path.Dir(cntrKernelPath), + "", + 0755, + configs.DirFsKind, + ) + + // Create kernel-header softlink. 
+ fsEntry := configs.NewFsEntry( + cntrKernelPath, + mgr.hostKernelHdrPath, + 0644, + configs.SoftlinkFsKind, + ) + + fsEntries = append(fsEntries, *fsEntryParents, *fsEntry) + + return fsEntries, nil +} + +func (mgr *SysboxMgr) pause(id string) error { + + logrus.Debugf("pausing %s", formatter.ContainerID{id}) + + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return fmt.Errorf("can't pause container %s; not found in container table", + formatter.ContainerID{id}) + } + + // If the rootfs is ID-mapped and on overlayfs, then chown the upper dir from + // [userns-host-ID -> 0] when the container pauses (same as we do during + // unregister(); see comment there for more info). + + if info.rootfsOvfsUpperChowned { + uidOffset := -int32(info.uidMappings[0].HostID) + gidOffset := -int32(info.gidMappings[0].HostID) + + logrus.Infof("pause %s: chown rootfs overlayfs upper layer at %s (%d -> %d)", + formatter.ContainerID{id}, info.rootfsOvfsUpper, info.uidMappings[0].HostID, 0) + + if err := idShiftUtils.ShiftIdsWithChown(info.rootfsOvfsUpper, uidOffset, gidOffset); err != nil { + return err + } + + info.rootfsOvfsUpperChowned = false + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + + } else if info.rootfsCloned && info.rootfsUidShiftType == idShiftUtils.Chown { + if err := mgr.rootfsCloner.RevertChown(id); err != nil { + return err + } + } + + // Request all volume managers to sync back contents to the container's rootfs + if err := mgr.volSyncOut(id, info); err != nil { + logrus.Warnf("pause failed: %s", err) + return err + } + + logrus.Debugf("paused %s", formatter.ContainerID{id}) + + return nil +} + +func (mgr *SysboxMgr) resume(id string) error { + + logrus.Debugf("resuming %s", formatter.ContainerID{id}) + + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return fmt.Errorf("can't resume container %s; not found in container table", + 
formatter.ContainerID{id}) + } + + uidOffset := int32(info.uidMappings[0].HostID) + gidOffset := int32(info.gidMappings[0].HostID) + + // If the rootfs is ID-mapped and on overlayfs, then chown the upper dir from + // [0 -> userns-host-ID] when the container resumes (i.e., opposite of what + // we do in pause()). + + if info.rootfsUidShiftType == idShiftUtils.IDMappedMount && + info.rootfsOnOvfs { + + // chown the rootfs upper dir (same as we do during update()). + logrus.Infof("resume %s: chown rootfs overlayfs upper layer at %s (%d -> %d)", + formatter.ContainerID{id}, info.rootfsOvfsUpper, 0, info.uidMappings[0].HostID) + + if err := idShiftUtils.ShiftIdsWithChown(info.rootfsOvfsUpper, uidOffset, gidOffset); err != nil { + return err + } + + info.rootfsOvfsUpperChowned = true + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + + } else if info.rootfsCloned && info.rootfsUidShiftType == idShiftUtils.Chown { + if err := mgr.rootfsCloner.ChownClone(id, uidOffset, gidOffset); err != nil { + return err + } + } + + logrus.Debugf("resumed %s", formatter.ContainerID{id}) + + return nil +} + +// trackNetns tracks the network ns for the given container id +func (mgr *SysboxMgr) trackNetns(id string, netnsInode uint64) ([]string, error) { + + mgr.ntLock.Lock() + defer mgr.ntLock.Unlock() + + sameNetns, ok := mgr.netnsTable[netnsInode] + if ok { + sameNetns = append(sameNetns, id) + } else { + sameNetns = []string{id} + } + + mgr.netnsTable[netnsInode] = sameNetns + + return sameNetns, nil +} + +// untrackNetns removes netns tracking for the given container id +func (mgr *SysboxMgr) untrackNetns(id string, netnsInode uint64) error { + mgr.ntLock.Lock() + defer mgr.ntLock.Unlock() + + sameNetns, ok := mgr.netnsTable[netnsInode] + if !ok { + return fmt.Errorf("did not find inode %d in netnsTable", netnsInode) + } + + sameNetns = libutils.StringSliceRemove(sameNetns, []string{id}) + + if len(sameNetns) > 0 { + mgr.netnsTable[netnsInode] = sameNetns + } 
else { + delete(mgr.netnsTable, netnsInode) + } + + return nil +} + +func (mgr *SysboxMgr) cloneRootfs(id string) (string, error) { + + mgr.ctLock.Lock() + info, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return "", fmt.Errorf("container %s is not registered", + formatter.ContainerID{id}) + } + + rmgr := mgr.rootfsCloner + + if !info.rootfsCloned { + + clonedRootfs, err := rmgr.CreateClone(id, info.rootfs) + if err != nil { + return "", err + } + + info.rootfs = clonedRootfs + info.rootfsCloned = true + + mgr.ctLock.Lock() + mgr.contTable[id] = info + mgr.ctLock.Unlock() + } + + return info.rootfs, nil +} + +func (mgr *SysboxMgr) chownClonedRootfs(id string, uidOffset, gidOffset int32) error { + + mgr.ctLock.Lock() + _, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return fmt.Errorf("container %s is not registered", + formatter.ContainerID{id}) + } + + rmgr := mgr.rootfsCloner + + return rmgr.ChownClone(id, uidOffset, gidOffset) +} + +func (mgr *SysboxMgr) revertClonedRootfsChown(id string) error { + + mgr.ctLock.Lock() + _, found := mgr.contTable[id] + mgr.ctLock.Unlock() + + if !found { + return fmt.Errorf("container %s is not registered", + formatter.ContainerID{id}) + } + + rmgr := mgr.rootfsCloner + + return rmgr.RevertChown(id) +} diff --git a/sysbox-mgr/rootfsCloner/rootfsCloner.go b/sysbox-mgr/rootfsCloner/rootfsCloner.go new file mode 100644 index 00000000..34ad6e82 --- /dev/null +++ b/sysbox-mgr/rootfsCloner/rootfsCloner.go @@ -0,0 +1,257 @@ +// +// Copyright 2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package rootfsCloner + +import ( + "fmt" + "os" + "path/filepath" + "sync" + + "github.com/nestybox/sysbox-libs/formatter" + "github.com/nestybox/sysbox-libs/mount" + "github.com/sirupsen/logrus" +) + +const clonerDir string = "rootfs" + +type cloneInfo struct { + origRootfsMntInfo *mount.Info + newRootfsDir string + ovfsMount ovfsMntInfo + bindMounts []bindMnt + chownUidOffset int32 + chownGidOffset int32 + bindToSelfActive bool +} + +type cloner struct { + hostDir string + clones map[string]*cloneInfo // container-ID -> cloneInfo + mu sync.Mutex +} + +type ovfsMntInfo struct { + mergedDir string + diffDir string + workDir string +} + +type bindMnt struct { + src string + dst string +} + +func New(hostDir string) *cloner { + return &cloner{ + hostDir: hostDir, + clones: make(map[string]*cloneInfo), + } +} + +func (c *cloner) CreateClone(id, origRootfs string) (string, error) { + + logrus.Debugf("Prep rootfs cloning for container %s", formatter.ContainerID{id}) + + // Check if this is a redundant clone + c.mu.Lock() + _, found := c.clones[id] + c.mu.Unlock() + + if found { + return "", fmt.Errorf("redundant rootfs clone for container %s", + formatter.ContainerID{id}) + } + + // Get the mount info for the orig rootfs + allMounts, err := mount.GetMounts() + if err != nil { + return "", err + } + + origRootfsMntInfo, err := mount.GetMountAt(origRootfs, allMounts) + if err != nil { + return "", fmt.Errorf("failed to get mount info for mount at %s: %s", origRootfs, err) + } + + // We only support cloning of rootfs on overlayfs currently + if 
origRootfsMntInfo.Fstype != "overlay" { + return "", fmt.Errorf("rootfs cloning is only supported for overlayfs; rootfs at %s is not on overlayfs", origRootfs) + } + + // Create the dir under which we will create the cloned rootfs + origRootfsDir := filepath.Dir(origRootfs) + newRootfsDir := filepath.Join(c.hostDir, clonerDir, id) + + perm, err := filePerm(origRootfsDir) + if err != nil { + return "", fmt.Errorf("failed to get permissions for %s: %s", origRootfsDir, err) + } + + if err := os.MkdirAll(newRootfsDir, perm); err != nil { + return "", err + } + + ci := &cloneInfo{ + origRootfsMntInfo: origRootfsMntInfo, + newRootfsDir: newRootfsDir, + } + + subdir := filepath.Join(newRootfsDir, "overlay2") + + ovfsMntInfo := ovfsMntInfo{ + mergedDir: filepath.Join(subdir, "merged"), + diffDir: filepath.Join(subdir, "diff"), + workDir: filepath.Join(subdir, "work"), + } + + if err := createNewOvfsDir(ovfsMntInfo); err != nil { + return "", err + } + + ci.ovfsMount = ovfsMntInfo + + if err := mountClone(ci); err != nil { + return "", fmt.Errorf("failed to mount clone for container %s: %s", + formatter.ContainerID{id}, err) + } + + c.mu.Lock() + c.clones[id] = ci + c.mu.Unlock() + + return ci.ovfsMount.mergedDir, nil +} + +func (c *cloner) RemoveClone(id string) error { + + logrus.Debugf("Removing rootfs clone for container %s", formatter.ContainerID{id}) + + c.mu.Lock() + ci, found := c.clones[id] + c.mu.Unlock() + + if !found { + return fmt.Errorf("did not find rootfs clone info for container %s", + formatter.ContainerID{id}) + } + + if err := unmountClone(ci); err != nil { + return fmt.Errorf("failed to unmount clone for container %s: %s", + formatter.ContainerID{id}, err) + } + + if err := os.RemoveAll(ci.newRootfsDir); err != nil { + return fmt.Errorf("failed to remove clone for container %s: %s", + formatter.ContainerID{id}, err) + } + + c.mu.Lock() + delete(c.clones, id) + c.mu.Unlock() + + return nil +} + +func (c *cloner) ChownClone(id string, uidOffset, gidOffset 
int32) error { + + logrus.Debugf("Chown rootfs clone for container %s (%d:%d)", formatter.ContainerID{id}, uidOffset, gidOffset) + + c.mu.Lock() + ci, found := c.clones[id] + c.mu.Unlock() + + if !found { + return fmt.Errorf("did not find rootfs clone info for container %s", + formatter.ContainerID{id}) + } + + if err := doChown(ci, uidOffset, gidOffset); err != nil { + return err + } + + // Remember the chown offsets (so we can revert it) + ci.chownUidOffset = uidOffset + ci.chownGidOffset = gidOffset + + c.mu.Lock() + c.clones[id] = ci + c.mu.Unlock() + + return nil +} + +func (c *cloner) RevertChown(id string) error { + + logrus.Debugf("Revert chown rootfs clone for container %s", formatter.ContainerID{id}) + + c.mu.Lock() + ci, found := c.clones[id] + c.mu.Unlock() + + if !found { + return fmt.Errorf("did not find rootfs clone info for container %s", + formatter.ContainerID{id}) + } + + uidOffset := 0 - int32(ci.chownUidOffset) + gidOffset := 0 - int32(ci.chownGidOffset) + + if err := doChown(ci, uidOffset, gidOffset); err != nil { + return fmt.Errorf("failed to chown cloned rootfs for container %s: %s", + formatter.ContainerID{id}, err) + } + + c.mu.Lock() + c.clones[id] = ci + c.mu.Unlock() + + return nil +} + +func (c *cloner) ContainerStopped(id string) error { + + c.mu.Lock() + defer c.mu.Unlock() + + ci, found := c.clones[id] + if !found { + return fmt.Errorf("did not find rootfs clone info for container %s", + formatter.ContainerID{id}) + } + + // Docker hack: when Docker stops the container, it will remove the ovfs + // mount over the container's rootfs. This will cause it to remove the + // bind-to-self mount we created on top of it (on purpose) rather than the + // underlying ovfs mount. Variable bindToSelfActive tracks this, such that + // the rootfs cloner is aware of this and won't remove that mount (since + // Docker already removed it). 
+ + ci.bindToSelfActive = false + c.clones[id] = ci + + return nil +} + +func (c *cloner) RemoveAll() { + for id, _ := range c.clones { + if err := c.RemoveClone(id); err != nil { + logrus.Warnf("rootfsCloner cleanup error: failed to remove rootfs clone %s: %s", + formatter.ContainerID{id}, err) + } + } +} diff --git a/sysbox-mgr/rootfsCloner/utils.go b/sysbox-mgr/rootfsCloner/utils.go new file mode 100644 index 00000000..45814883 --- /dev/null +++ b/sysbox-mgr/rootfsCloner/utils.go @@ -0,0 +1,327 @@ +// +// Copyright 2022-2023 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package rootfsCloner + +import ( + "fmt" + "os" + "strings" + + mapset "github.com/deckarep/golang-set" + "github.com/nestybox/sysbox-libs/mount" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + sh "github.com/nestybox/sysbox-libs/idShiftUtils" +) + +func createNewOvfsDir(info ovfsMntInfo) error { + subdirs := []string{info.mergedDir, info.diffDir, info.workDir} + + for _, subdir := range subdirs { + if err := os.MkdirAll(subdir, 0755); err != nil { + return err + } + } + + return nil +} + +func filePerm(path string) (os.FileMode, error) { + fi, err := os.Stat(path) + if err != nil { + return os.FileMode(0), err + } + return fi.Mode().Perm(), nil +} + +// Creates the cloned rootfs overlayfs mounts and bind-mounts over the orig rootfs. 
+func mountClone(ci *cloneInfo) error { + + if err := setupBottomMount(ci); err != nil { + return fmt.Errorf("failed to set up bottom ovfs mount: %v", err) + } + + if err := bindOrigRootfs(ci); err != nil { + return fmt.Errorf("failed to bind mount over orig rootfs: %v", err) + } + + if err := bindToSelfOrigRootfs(ci); err != nil { + return fmt.Errorf("failed to bind-to-self over orig rootfs: %v", err) + } + + ci.bindToSelfActive = true + + return nil +} + +func unmountClone(ci *cloneInfo) error { + + if ci.bindToSelfActive { + if err := unbindToSelfOrigRootfs(ci); err != nil { + logrus.Errorf("failed to remove bind-to-self mount over orig rootfs: %s", err) + } + } + + if err := unbindOrigRootfs(ci); err != nil { + logrus.Errorf("failed to remove bind mounts over orig rootfs: %s", err) + } + + if err := removeBottomMount(ci); err != nil { + return fmt.Errorf("failed to remove bottom mount: %s", err) + } + + return nil +} + +// Sets up the overlayfs bottom mount; it uses the same lower layers as the +// original rootfs mount, but adds metacopy=on. 
Ths mount lives inside the +// sysbox data root directory (e.g., /var/lib/sysbox/rootfs//bottom/merged) +func setupBottomMount(ci *cloneInfo) error { + + mergedDir := ci.ovfsMount.mergedDir + diffDir := ci.ovfsMount.diffDir + workDir := ci.ovfsMount.workDir + + // This gets us the orig rootfs ovfs mount options, and adds metacopy=on to them + mntFlags, options, propFlags := getBottomMountOpt(ci.origRootfsMntInfo, []interface{}{"metacopy=on"}) + + // Replace the original upperdir and workdir with the bottom mount ones + tmpOpt := "" + for _, opt := range strings.Split(options, ",") { + if strings.Contains(opt, "upperdir=") { + opt = "upperdir=" + diffDir + } else if strings.Contains(opt, "workdir=") { + opt = "workdir=" + workDir + } + tmpOpt += opt + "," + } + options = strings.TrimSuffix(tmpOpt, ",") + + if err := unix.Mount("overlay", mergedDir, "overlay", uintptr(mntFlags), options); err != nil { + return fmt.Errorf("failed to mount overlayfs on %s: %s", mergedDir, err) + } + + if err := unix.Mount("", mergedDir, "", uintptr(propFlags), ""); err != nil { + return fmt.Errorf("failed to set mount prop flags on %s: %s", mergedDir, err) + } + + return nil +} + +// Removes the overlayfs bottom mount +func removeBottomMount(ci *cloneInfo) error { + return unix.Unmount(ci.ovfsMount.mergedDir, unix.MNT_DETACH) +} + +// Bind-mounts the cloned rootfs over the original rootfs. Adds the new mounts +// to the cloneInfo struct. 
+func bindOrigRootfs(ci *cloneInfo) error { + var origDiffDir, origWorkDir string + var bindMounts []bindMnt + + mi := ci.origRootfsMntInfo + vfsOpts := mi.VfsOpts + origRootfs := mi.Mountpoint + + // Find the upperdir and workdir of the original rootfs ovfs mount + for _, opt := range strings.Split(vfsOpts, ",") { + if strings.Contains(opt, "upperdir=") { + origDiffDir = strings.TrimPrefix(opt, "upperdir=") + } + if strings.Contains(opt, "workdir=") { + origWorkDir = strings.TrimPrefix(opt, "workdir=") + } + } + + if origDiffDir == "" || origWorkDir == "" { + return fmt.Errorf("failed to parse overlayfs mount options for mountpoint %s", origRootfs) + } + + // Bind mount the bottom mount's merged and diff dirs over the orig rootfs; + // these bind mounts are kept when the container is stopped, and only deleted + // when the container is removed. They ensure that higher level operations that operate on + // the container's original rootfs work (e.g., docker commit, docker build, docker cp). + bindMounts = append(bindMounts, bindMnt{src: ci.ovfsMount.mergedDir, dst: origRootfs}) + bindMounts = append(bindMounts, bindMnt{src: ci.ovfsMount.diffDir, dst: origDiffDir}) + bindMounts = append(bindMounts, bindMnt{src: ci.ovfsMount.workDir, dst: origWorkDir}) + + if err := bindMountOverOrigRootfs(bindMounts); err != nil { + return err + } + + ci.bindMounts = bindMounts + return nil +} + +func unbindOrigRootfs(ci *cloneInfo) error { + + for _, m := range ci.bindMounts { + if _, err := os.Stat(m.dst); os.IsNotExist(err) { + continue + } + if err := unix.Unmount(m.dst, unix.MNT_DETACH); err != nil { + return err + } + } + + ci.bindMounts = nil + return nil +} + +func bindToSelfOrigRootfs(ci *cloneInfo) error { + + // Create a redundant bind-to-self mount over the original rootfs. This way, + // if the higher level container manager (e.g., Docker) tries to unmount the + // rootfs ovfs mount when the container stops, it will unmount the redundant + // mount we just created. 
This means the rootfs ovfs mount stays in place + // when the container is stopped, and therefore won't be remounted when the + // container restarts. Such a remounting would fail because of the bind + // mounts that sysbox created over the ovfs diff and work dirs. This is a + // hacky solution, but we could not find another one. All this will go away + // once idmapped mounts are supported on overlayfs, at which point the + // rootfsCloner won't be needed anymore. + + origRootfs := ci.origRootfsMntInfo.Mountpoint + return unix.Mount(origRootfs, origRootfs, "", unix.MS_BIND|unix.MS_REC, "") +} + +func unbindToSelfOrigRootfs(ci *cloneInfo) error { + origRootfs := ci.origRootfsMntInfo.Mountpoint + return unix.Unmount(origRootfs, unix.MNT_DETACH) +} + +// Computes the lower overlayfs mount flags, mount options, and propagation flags. +func getBottomMountOpt(origRootfsMntInfo *mount.Info, wantOpts []interface{}) (int, string, int) { + + // Convert mount opts to a mapset; in the process replace the upperdir and + // workdir options with the new ones. + + currVfsOpts := mapset.NewSet() + for _, opt := range strings.Split(origRootfsMntInfo.VfsOpts, ",") { + currVfsOpts.Add(opt) + } + + currMntOpts := mapset.NewSet() + for _, opt := range strings.Split(origRootfsMntInfo.Opts, ",") { + currMntOpts.Add(opt) + } + + // Add "metacopy=on" to the existing mount options + wantVfsOpts := mapset.NewSetFromSlice(wantOpts) + addVfsOpts := wantVfsOpts.Difference(currVfsOpts) + + // The vfs opts reported by mountinfo are a combination of per superblock + // mount opts and the overlayfs-specific data; we need to separate these so + // we can do the mount properly. 
+ properMntOpts := mapset.NewSetFromSlice([]interface{}{ + "ro", "rw", "nodev", "noexec", "nosuid", "noatime", "nodiratime", "relatime", "strictatime", "sync", + }) + + newMntOpts := currVfsOpts.Intersect(properMntOpts) + + newVfsOpts := currVfsOpts.Difference(properMntOpts) + newVfsOpts = newVfsOpts.Union(addVfsOpts) + + // Convert the mount options to the mount flags + newMntOptsString := []string{} + for _, opt := range newMntOpts.ToSlice() { + newMntOptsString = append(newMntOptsString, fmt.Sprintf("%s", opt)) + } + mntFlags := mount.OptionsToFlags(newMntOptsString) + + // Convert the vfs option set to the mount data string + newVfsOptsString := "" + for i, opt := range newVfsOpts.ToSlice() { + if i != 0 { + newVfsOptsString += "," + } + newVfsOptsString += fmt.Sprintf("%s", opt) + } + + // Set the mount propagation flags as they were in the original mount + // (shared, slave, etc.) + propFlags := 0 + + if strings.Contains(origRootfsMntInfo.Optional, "shared") { + propFlags |= unix.MS_SHARED + } else if strings.Contains(origRootfsMntInfo.Optional, "master") { + propFlags |= unix.MS_SLAVE + } else if strings.Contains(origRootfsMntInfo.Optional, "unbindable") { + propFlags |= unix.MS_UNBINDABLE + } else { + propFlags |= unix.MS_PRIVATE + } + + return mntFlags, newVfsOptsString, propFlags +} + +func bindMountOverOrigRootfs(bindMounts []bindMnt) error { + var ferr error + + failed := false + mounted := []string{} + + for _, m := range bindMounts { + if err := unix.Mount(m.src, m.dst, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + failed = true + ferr = fmt.Errorf("failed to bind mount %s to %s: %s", m.src, m.dst, err) + } + mounted = append(mounted, m.dst) + } + + // Cleanup in case a bind-mount fails + if failed { + for _, m := range mounted { + unix.Unmount(m, unix.MNT_DETACH) + } + return ferr + } + + return nil +} + +func doChown(ci *cloneInfo, uidOffset, gidOffset int32) error { + + if ci.bindToSelfActive { + if err := unbindToSelfOrigRootfs(ci); err != nil 
{ + return err + } + } + + if err := unbindOrigRootfs(ci); err != nil { + return err + } + + // chown the bottom ovfs mount (fast because metacopy=on is set on it) + if err := sh.ShiftIdsWithChown(ci.ovfsMount.mergedDir, uidOffset, gidOffset); err != nil { + return fmt.Errorf("failed to chown cloned rootfs bottom mount at %s by offset %d, %d: %s", + ci.ovfsMount.mergedDir, uidOffset, gidOffset, err) + } + + if err := bindOrigRootfs(ci); err != nil { + return err + } + + if ci.bindToSelfActive { + if err := bindToSelfOrigRootfs(ci); err != nil { + return err + } + } + + return nil +} diff --git a/sysbox-mgr/shiftfsMgr/shiftfsMgr.go b/sysbox-mgr/shiftfsMgr/shiftfsMgr.go new file mode 100644 index 00000000..40849cfc --- /dev/null +++ b/sysbox-mgr/shiftfsMgr/shiftfsMgr.go @@ -0,0 +1,263 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// The shiftfs manager performs shiftfs marks/unmarks on the sys container's rootfs +// and other mountpoins (e.g., bind-mount sources). +// +// When multiple sys containers share bind-mounts, the shiftfs manager ensures that +// shiftfs is only marked once on the bind mount and that the mark is removed when the +// last container associated with it is destroyed. 
+ +package shiftfsMgr + +import ( + "fmt" + "os" + "os/exec" + "path/filepath" + "sync" + + uuid "github.com/google/uuid" + "github.com/nestybox/sysbox-libs/formatter" + "github.com/nestybox/sysbox-libs/mount" + "github.com/nestybox/sysbox-libs/shiftfs" + intf "github.com/nestybox/sysbox-mgr/intf" + "github.com/sirupsen/logrus" +) + +type mgr struct { + workDir string + mreqCntrMap map[string][]string // Maps shiftfs mount request paths to the associated container(s) IDs. + mpMreqMap map[string]string // Maps each shiftfs markpoint path to it's corresponding mount request path. + mu sync.Mutex +} + +// Creates a new instance of the shiftfs manager +func New(sysboxLibDir string) (intf.ShiftfsMgr, error) { + + // Load the shiftfs module (if present in the kernel) + exec.Command("modprobe", "shiftfs").Run() + + workDir := filepath.Join(sysboxLibDir, "shiftfs") + + if err := os.MkdirAll(workDir, 0710); err != nil { + return nil, err + } + + return &mgr{ + workDir: workDir, + mreqCntrMap: make(map[string][]string), + mpMreqMap: make(map[string]string), + }, nil + +} + +// Creates a shiftfs "mark" mount over the given path list, to prepare them for +// uid-shifting. If "createMarkpoint" is true, then this function creates new +// mountpoint directories for each of the given paths, under the shiftfs-mgr +// work dir (e.g., /var/lib/sysbox/shiftfs/), and mounts shiftfs with +// something equivalent to: +// +// mount -t shiftfs -o mark /var/lib/sysbox/shiftfs/ +// +// If createMarkpoint is false, then this function does something equivalent to: +// +// mount -t shiftfs -o mark +// +// Creating a separate markpoint is useful when the caller does not wish to set +// the shiftfs mark directly over the given paths, as doing so makes them +// implicitly "no-exec" and in addition can result in a security risk because it +// would allow unprivileged users to unshare their user-ns and mount shiftfs on +// those same paths, thereby gaining root access to them. 
Both of these issues +// are solve by placing the shiftfs mark over a separate markpoint directory +// under root-only access (such as /var/lib/sysbox). +// +// Returns the list of shiftfs markpoints. + +func (sm *mgr) Mark(id string, mountReqs []shiftfs.MountPoint, createMarkpoint bool) ([]shiftfs.MountPoint, error) { + sm.mu.Lock() + defer sm.mu.Unlock() + + markpoints := []shiftfs.MountPoint{} + + allMounts, err := mount.GetMounts() + if err != nil { + return nil, err + } + + for _, mntReq := range mountReqs { + + mntReqPath := mntReq.Source + + // if mount request path is in the mount-req-to-container map, add the container-id to the entry + ids, found := sm.mreqCntrMap[mntReqPath] + if found { + ids = append(ids, id) + sm.mreqCntrMap[mntReqPath] = ids + + // Get the markpoint for the mount request path and add it to the list + // of markpoints we will return. + for mp, mrp := range sm.mpMreqMap { + if mrp == mntReqPath { + markpoints = append(markpoints, shiftfs.MountPoint{Source: mp}) + break + } + } + + continue + } + + // if shiftfs already marked, no action (some entity other than sysbox did the + // marking; we don't track that) + mounted, err := shiftfs.Mounted(mntReqPath, allMounts) + if err != nil { + return nil, fmt.Errorf("error while checking for existing shiftfs mount on %s: %v", mntReqPath, err) + } + if mounted { + markpoints = append(markpoints, mntReq) + logrus.Debugf("skipped shiftfs mark on %s (already mounted)", mntReqPath) + continue + } + + markpoint := mntReqPath + + if createMarkpoint { + mntUuid := uuid.New().String() + markpoint = filepath.Join(sm.workDir, mntUuid) + if err := os.Mkdir(markpoint, 0700); err != nil { + return nil, err + } + } + + if err := shiftfs.Mark(mntReqPath, markpoint); err != nil { + return nil, err + } + + sm.mpMreqMap[markpoint] = mntReqPath + sm.mreqCntrMap[mntReqPath] = []string{id} + + markpoints = append(markpoints, shiftfs.MountPoint{Source: markpoint}) + + logrus.Debugf("marked shiftfs for %s at %s", 
mntReqPath, markpoint) + } + + return markpoints, nil +} + +func (sm *mgr) Unmark(id string, markpoints []shiftfs.MountPoint) error { + sm.mu.Lock() + defer sm.mu.Unlock() + + for _, mp := range markpoints { + markpoint := mp.Source + + // Lookup the mount request path for the given markpoint + // we may not find it in the markpoint map if we skipped it in Mark() + // (e.g., because it was already mounted by some other entity) + mntReqPath, found := sm.mpMreqMap[markpoint] + if !found { + continue + } + + // Lookup the containers associated with this mount request path + ids, ok := sm.mreqCntrMap[mntReqPath] + if !ok { + logrus.Warnf("shiftfs unmark error: mount request path %s expected to be in container map but it's not.", + mntReqPath) + continue + } + + // Remove matching container-id from mreqCntrMap entry + ids, err := removeID(ids, id) + if err != nil { + return fmt.Errorf("did not find container id %s in mount-point map entry for %s", + formatter.ContainerID{id}, mntReqPath) + } + + // If after removal the mreqCntrMap entry is empty it means there are no more containers + // associated with that mount, so we proceed to remove the shiftfs mark. Otherwise, + // we simply update the mreqCntrMap entry. 
+ + if len(ids) == 0 { + if err := shiftfs.Unmount(markpoint); err != nil { + return err + } + + hasUuidMarkpoint := filepath.HasPrefix(markpoint, sm.workDir) + + if hasUuidMarkpoint { + if err := os.Remove(markpoint); err != nil { + return err + } + } + + delete(sm.mpMreqMap, markpoint) + delete(sm.mreqCntrMap, mntReqPath) + + logrus.Debugf("unmarked shiftfs for %s at %s", mntReqPath, markpoint) + + } else { + sm.mreqCntrMap[mntReqPath] = ids + } + } + + return nil +} + +func (sm *mgr) UnmarkAll() { + sm.mu.Lock() + defer sm.mu.Unlock() + + for mp := range sm.mpMreqMap { + if err := shiftfs.Unmount(mp); err != nil { + logrus.Warnf("failed to unmark shiftfs on %s: %s", mp, err) + } + + hasUuidMarkpoint := filepath.HasPrefix(mp, sm.workDir) + + if hasUuidMarkpoint { + if err := os.Remove(mp); err != nil { + logrus.Warnf("failed to remove %s: %s", mp, err) + } + } + + logrus.Debugf("unmarked shiftfs on %s", mp) + delete(sm.mpMreqMap, mp) + } +} + +// Removes element 'elem' from the given string slice +func removeID(ids []string, elem string) ([]string, error) { + var ( + i int + id string + found bool = false + ) + + for i, id = range ids { + if id == elem { + found = true + break + } + } + + if !found { + return []string{}, fmt.Errorf("not found") + } + + ids[i] = ids[len(ids)-1] + return ids[:len(ids)-1], nil +} diff --git a/sysbox-mgr/shiftfsMgr/shiftfsMgr_test.go b/sysbox-mgr/shiftfsMgr/shiftfsMgr_test.go new file mode 100644 index 00000000..ef5ac41b --- /dev/null +++ b/sysbox-mgr/shiftfsMgr/shiftfsMgr_test.go @@ -0,0 +1,606 @@ +// +// Copyright 2019-2021 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package shiftfsMgr + +import ( + "fmt" + "io" + "io/ioutil" + "os" + "testing" + + "github.com/nestybox/sysbox-libs/mount" + "github.com/nestybox/sysbox-libs/shiftfs" + utils "github.com/nestybox/sysbox-libs/utils" +) + +var sysboxLibDir string = "/var/lib/sysbox" + +type mountTest struct { + id string + mounts []shiftfs.MountPoint +} + +func hostSupportsShiftfs() (bool, error) { + + dir, err := ioutil.TempDir("/mnt/scratch", "shiftfsMgrTest") + if err != nil { + return false, err + } + defer os.RemoveAll(dir) + + shiftfsOk, err := shiftfs.ShiftfsSupported(dir) + if err != nil { + return false, fmt.Errorf("failed to check kernel shiftfs support: %v", err) + } + + return shiftfsOk, nil +} + +func setupTest() (string, error) { + dir, err := ioutil.TempDir("/mnt/scratch", "shiftfsMgrTest") + if err != nil { + return "", err + } + + return dir, nil +} + +func cleanupTest(dir string) { + os.RemoveAll(dir) +} + +func dirIsEmpty(name string) (bool, error) { + f, err := os.Open(name) + if err != nil { + return false, err + } + defer f.Close() + + _, err = f.Readdirnames(1) + if err == io.EOF { + return true, nil + } + + return false, err +} + +func mountTestEqual(a, b []mountTest) bool { + if len(a) != len(b) { + return false + } + + for i, _ := range a { + if a[i].id != b[i].id { + return false + } + + if len(a[i].mounts) != len(b[i].mounts) { + return false + } + + for j, _ := range a[i].mounts { + if a[i].mounts[j] != b[i].mounts[j] { + return false + } + } + } + + return true +} + +func TestShiftfsMgrBasic(t *testing.T) { + + shiftfsOk, err := 
hostSupportsShiftfs() + if err != nil { + t.Fatalf("error: host shiftfs check failed: %s", err) + } + + if !shiftfsOk { + t.Skip("skipping test (shiftfs not supported).") + } + + tdir, err := setupTest() + if err != nil { + t.Errorf("error: setupTest() failed: %s", err) + } + + mgrIf, _ := New(sysboxLibDir) + mgr := mgrIf.(*mgr) + + // Generare some shiftfs mark requests + testIn := []mountTest{ + { + id: "testCont1", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/d/e/f/g", false}, + }, + }, + { + id: "testCont2", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/x/y/z", false}, + }, + }, + { + id: "testCont3", + mounts: []shiftfs.MountPoint{ + {"/i/h/j", false}, + {"/x/y/z", false}, + {"/a/b/c", false}, + }, + }, + } + + for _, mt := range testIn { + for _, m := range mt.mounts { + if err := os.MkdirAll(m.Source, 0755); err != nil { + t.Error(err) + } + } + } + + testOut := []mountTest{} + + for _, mt := range testIn { + + mp, err := mgr.Mark(mt.id, mt.mounts, false) + if err != nil { + t.Errorf("error: failed to mark mounts: %v", err) + } + + entry := mountTest{ + id: mt.id, + mounts: mp, + } + + testOut = append(testOut, entry) + } + + // Verify the shiftfs marks are present + allMounts, err := mount.GetMounts() + if err != nil { + t.Error(err) + } + + for _, mt := range testOut { + for _, m := range mt.mounts { + marked, err := shiftfs.Mounted(m.Source, allMounts) + if err != nil { + t.Error(err) + } + if !marked { + t.Errorf("error: shiftfs mark expected on %s, but none found.", m.Source) + } + } + } + + if !mountTestEqual(testIn, testOut) { + t.Errorf("error: markpoint mismatch: got %v, want %v", testOut, testIn) + } + + // verify the shiftfsMgr mreqCntrMap looks good + uniqueMnts := []string{"/a/b/c", "/d/e/f/g", "/x/y/z", "/i/h/j"} + cntrs := [][]string{ + {"testCont1", "testCont2", "testCont3"}, + {"testCont1"}, + {"testCont2", "testCont3"}, + {"testCont3"}, + } + + for i, k := range uniqueMnts { + ids := mgr.mreqCntrMap[k] + if 
!utils.StringSliceEqual(ids, cntrs[i]) { + t.Errorf("error: mreqCntrMap[%s] = %v; want mreqCntrMap[%s] = %v", k, ids, k, cntrs[i]) + } + } + + // Generate shiftfs unmark requests + for _, mt := range testOut { + if err := mgr.Unmark(mt.id, mt.mounts); err != nil { + t.Errorf("error: failed to unmark mounts: %v", err) + } + } + + // Verify the shiftfs marks were removed + allMounts, err = mount.GetMounts() + if err != nil { + t.Error(err) + } + + for _, mt := range testOut { + for _, m := range mt.mounts { + marked, err := shiftfs.Mounted(m.Source, allMounts) + if err != nil { + t.Error(err) + } + if marked { + t.Errorf("error: shiftfs mark not expected on %s, but found.", m.Source) + } + } + } + + // verify the shiftfMgr mreqCntrMap is clean now + if len(mgr.mreqCntrMap) != 0 { + t.Errorf("error: mreqCntrMap is not empty; it is %v", mgr.mreqCntrMap) + } + + // verify work dir is clean + empty, err := dirIsEmpty(mgr.workDir) + if err != nil { + t.Error(err) + } + if !empty { + t.Errorf("error: dir %s is expected to be empty but it's not.", mgr.workDir) + } + + cleanupTest(tdir) +} + +func TestShiftfsMgrCreateMarkpoint(t *testing.T) { + + shiftfsOk, err := hostSupportsShiftfs() + if err != nil { + t.Fatalf("error: host shiftfs check failed: %s", err) + } + + if !shiftfsOk { + t.Skip("skipping test (shiftfs not supported).") + } + + tdir, err := setupTest() + if err != nil { + t.Errorf("error: setupTest() failed: %s", err) + } + + mgrIf, _ := New(sysboxLibDir) + mgr := mgrIf.(*mgr) + + // Generare some shiftfs mark requests + testIn := []mountTest{ + { + id: "testCont1", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/d/e/f/g", false}, + }, + }, + { + id: "testCont2", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/x/y/z", false}, + }, + }, + { + id: "testCont3", + mounts: []shiftfs.MountPoint{ + {"/i/h/j", false}, + {"/x/y/z", false}, + {"/a/b/c", false}, + }, + }, + } + + for _, mt := range testIn { + for _, m := range mt.mounts { + if err := 
os.MkdirAll(m.Source, 0755); err != nil { + t.Error(err) + } + } + } + + testOut := []mountTest{} + + for _, mt := range testIn { + + // createMarkpoint = true + mp, err := mgr.Mark(mt.id, mt.mounts, true) + if err != nil { + t.Errorf("error: failed to mark mounts: %v", err) + } + + entry := mountTest{ + id: mt.id, + mounts: mp, + } + + testOut = append(testOut, entry) + } + + // Verify the shiftfs marks are present + allMounts, err := mount.GetMounts() + if err != nil { + t.Error(err) + } + + for _, mt := range testOut { + for _, m := range mt.mounts { + marked, err := shiftfs.Mounted(m.Source, allMounts) + if err != nil { + t.Error(err) + } + if !marked { + t.Errorf("error: shiftfs mark expected on %s, but none found.", m.Source) + } + } + } + + // The markpoints are expected to differ from the original mounts + if mountTestEqual(testIn, testOut) { + t.Errorf("error: markpoint mismatch: got %v, want %v", testOut, testIn) + } + + // But there should be as many markpoints returned as passed to Mark() + if len(testOut) != len(testIn) { + t.Errorf("error: markpoint length mismatch: got %d, want %d", len(testOut), len(testIn)) + } + + // Verify the shiftfsMgr mreqCntrMap looks good + uniqueMnts := []string{"/a/b/c", "/d/e/f/g", "/x/y/z", "/i/h/j"} + cntrs := [][]string{ + {"testCont1", "testCont2", "testCont3"}, + {"testCont1"}, + {"testCont2", "testCont3"}, + {"testCont3"}, + } + + for i, k := range uniqueMnts { + ids := mgr.mreqCntrMap[k] + if !utils.StringSliceEqual(ids, cntrs[i]) { + t.Errorf("error: mreqCntrMap[%s] = %v; want mreqCntrMap[%s] = %v", k, ids, k, cntrs[i]) + } + } + + // Verify the created markpoints are as expected (there should be as many as + // the length of slice "uniqueMnts" above"). 
+ markpoints, _ := ioutil.ReadDir(mgr.workDir) + if len(markpoints) != len(uniqueMnts) { + t.Errorf("error: incorrect number of markpoints (expected %d); markpoints = %v", len(uniqueMnts), markpoints) + } + + // Generate shiftfs unmark requests + for _, mt := range testOut { + if err := mgr.Unmark(mt.id, mt.mounts); err != nil { + t.Errorf("error: failed to unmark mounts: %v", err) + } + } + + if len(mgr.mreqCntrMap) != 0 { + t.Errorf("error: mreqCntrMap is not empty; it is %v", mgr.mreqCntrMap) + } + + if len(mgr.mpMreqMap) != 0 { + t.Errorf("error: mpMreqMap is not empty; it is %v", mgr.mpMreqMap) + } + + // verify work dir is clean + empty, err := dirIsEmpty(mgr.workDir) + if err != nil { + t.Error(err) + } + if !empty { + t.Errorf("error: dir %s is expected to be empty but it's not.", mgr.workDir) + } + + cleanupTest(tdir) +} + +func TestShiftfsMgrMarkIgnore(t *testing.T) { + + shiftfsOk, err := hostSupportsShiftfs() + if err != nil { + t.Fatalf("error: host shiftfs check failed: %s", err) + } + + if !shiftfsOk { + t.Skip("skipping test (shiftfs not supported).") + } + + tdir, err := setupTest() + if err != nil { + t.Errorf("error: setupTest() failed: %s", err) + } + + mgrIf, _ := New(sysboxLibDir) + mgr := mgrIf.(*mgr) + + // Generare some shiftfs mark requests + testIn := []mountTest{ + { + id: "testCont1", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/d/e/f/g", false}, + }, + }, + } + + // Create the mark request dirs and pre-mark them with shiftfs; since they + // are premarked, the shiftfsMgr should not try to mark them. 
+ for _, mt := range testIn { + for _, m := range mt.mounts { + if err := os.MkdirAll(m.Source, 0755); err != nil { + t.Error(err) + } + if err := shiftfs.Mark(m.Source, m.Source); err != nil { + t.Error(err) + } + } + } + + testOut := []mountTest{} + + for _, mt := range testIn { + + // createMarkpoint = true + mp, err := mgr.Mark(mt.id, mt.mounts, true) + if err != nil { + t.Errorf("error: failed to mark mounts: %v", err) + } + + entry := mountTest{ + id: mt.id, + mounts: mp, + } + + testOut = append(testOut, entry) + } + + // Verify the shiftfs marks are remain (shiftfsMgr should not have touched them) + allMounts, err := mount.GetMounts() + if err != nil { + t.Error(err) + } + + for _, mt := range testOut { + for _, m := range mt.mounts { + marked, err := shiftfs.Mounted(m.Source, allMounts) + if err != nil { + t.Error(err) + } + if !marked { + t.Errorf("error: shiftfs mark expected on %s, but none found.", m.Source) + } + } + } + + // Verify the returned markpoints are identical to the given mounts + if !mountTestEqual(testIn, testOut) { + t.Errorf("error: markpoint mismatch: got %v, want %v", testOut, testIn) + } + + // Generate shiftfs unmark requests + for _, mt := range testOut { + if err := mgr.Unmark(mt.id, mt.mounts); err != nil { + t.Errorf("error: failed to unmark mounts: %v", err) + } + } + + // Verify the shiftfs marks were not removed (since they were not added by shiftfsMgr) + allMounts, err = mount.GetMounts() + if err != nil { + t.Error(err) + } + + for _, mt := range testOut { + for _, m := range mt.mounts { + marked, err := shiftfs.Mounted(m.Source, allMounts) + if err != nil { + t.Error(err) + } + if !marked { + t.Errorf("error: shiftfs mark expected on %s, but not found.", m.Source) + } + } + } + + // verify work dir is clean + empty, err := dirIsEmpty(mgr.workDir) + if err != nil { + t.Error(err) + } + if !empty { + t.Errorf("error: dir %s is expected to be empty but it's not.", mgr.workDir) + } + + // Remove shiftfs marks + for _, mt := 
range testIn { + for _, m := range mt.mounts { + if err := shiftfs.Unmount(m.Source); err != nil { + t.Error(err) + } + } + } + + cleanupTest(tdir) +} + +func TestShiftfsMgrUnmarkAll(t *testing.T) { + + shiftfsOk, err := hostSupportsShiftfs() + if err != nil { + t.Fatalf("error: host shiftfs check failed: %s", err) + } + + if !shiftfsOk { + t.Skip("skipping test (shiftfs not supported).") + } + + tdir, err := setupTest() + if err != nil { + t.Errorf("error: setupTest() failed: %s", err) + } + + mgrIf, _ := New(sysboxLibDir) + mgr := mgrIf.(*mgr) + + // Generate some shiftfs mark requests + testIn := []mountTest{ + { + id: "testCont1", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/d/e/f/g", false}, + }, + }, + { + id: "testCont2", + mounts: []shiftfs.MountPoint{ + {"/a/b/c", false}, + {"/x/y/z", false}, + }, + }, + { + id: "testCont3", + mounts: []shiftfs.MountPoint{ + {"/i/h/j", false}, + {"/x/y/z", false}, + {"/a/b/c", false}, + }, + }, + } + + for _, mt := range testIn { + for _, m := range mt.mounts { + if err := os.MkdirAll(m.Source, 0755); err != nil { + t.Error(err) + } + } + } + + for _, mt := range testIn { + if _, err := mgr.Mark(mt.id, mt.mounts, true); err != nil { + t.Errorf("error: failed to mark mounts: %v", err) + } + } + + mgr.UnmarkAll() + + // verify work dir is clean (implies shiftfs marks were removed) + empty, err := dirIsEmpty(mgr.workDir) + if err != nil { + t.Error(err) + } + if !empty { + t.Errorf("error: dir %s is expected to be empty but it's not.", mgr.workDir) + } + + cleanupTest(tdir) +} diff --git a/sysbox-mgr/subidAlloc/subidAllocSimple.go b/sysbox-mgr/subidAlloc/subidAllocSimple.go new file mode 100644 index 00000000..c44d28ce --- /dev/null +++ b/sysbox-mgr/subidAlloc/subidAllocSimple.go @@ -0,0 +1,144 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// sysbox-mgr: sub user-id and sub group-id allocator +// +// The subidAlloc class allocates portions of the subuid and subgid ranges associated +// with a given user. It implements the intf.SubidAlloc interface. +// +// An subid object is created with New(), allocations are performed with Alloc(), and +// freeing is performed with Free(). + +package subidAlloc + +import ( + "fmt" + "io" + "sort" + + mapset "github.com/deckarep/golang-set" + "github.com/nestybox/sysbox-libs/formatter" + intf "github.com/nestybox/sysbox-mgr/intf" + "github.com/nestybox/sysbox-runc/libcontainer/user" + "github.com/sirupsen/logrus" +) + +const ( + allocBlkSize uint32 = 65536 // min uid(gid) allocation range +) + +// subidAlloc class (implements the UidAllocator interface) +type subidAlloc struct { + idRange user.SubID +} + +// New creates an subidAlloc object +// +// userName is the Linux user whose subid/gid ranges will be used +// subuidSrc and subgidSrc contain the subid/gid ranges for the system +func New(userName string, subuidSrc, subgidSrc io.Reader) (intf.SubidAlloc, error) { + + filter := func(entry user.SubID) bool { + return entry.Name == userName + } + + // read subuid range(s) for userName + uidRanges, err := user.ParseSubIDFilter(subuidSrc, filter) + if err != nil { + return nil, err + } + + if len(uidRanges) == 0 { + return nil, fmt.Errorf("could not find subuid info for user %s", userName) + } + + // read subgid range(s) for userName + gidRanges, err := user.ParseSubIDFilter(subgidSrc, filter) + if err != nil { + return nil, err + } + + if 
len(gidRanges) == 0 { + return nil, fmt.Errorf("could not find subgid info for user %s", userName) + } + + // we need at least one common subuid and subgid range + commonRanges := getCommonRanges(uidRanges, gidRanges) + if len(commonRanges) == 0 { + return nil, fmt.Errorf("could not find matching subuid and subgids range for user %s", userName) + } + + sub := &subidAlloc{} + + // find a common range that is large enough for the allocation size + foundRange := false + for _, subid := range commonRanges { + if subid.Count >= int64(allocBlkSize) { + foundRange = true + sub.idRange = subid + break + } + } + + if !foundRange { + return nil, fmt.Errorf("did not find a large enough subuid range for user %s (need %v)", userName, allocBlkSize) + } + + return sub, nil +} + +func getCommonRanges(uidRanges, gidRanges []user.SubID) []user.SubID { + + uidRangeSet := mapset.NewSet() + for _, uidRange := range uidRanges { + uidRangeSet.Add(uidRange) + } + + gidRangeSet := mapset.NewSet() + for _, gidRange := range gidRanges { + gidRangeSet.Add(gidRange) + } + + commonSet := uidRangeSet.Intersect(gidRangeSet) + + common := []user.SubID{} + for elem := range commonSet.Iter() { + subid := elem.(user.SubID) + common = append(common, subid) + } + + // this ordering makes multi-range allocations more predictable, which helps in + // testing. 
+ sort.Slice(common, func(i, j int) bool { + return common[i].SubID < common[j].SubID + }) + + return common +} + +// Implements intf.SubidAlloc.Alloc +func (sub *subidAlloc) Alloc(id string, size uint64) (uint32, uint32, error) { + subid := sub.idRange + logrus.Debugf("Alloc(%s, %v) = %v, %v", + formatter.ContainerID{id}, size, subid, subid) + return uint32(subid.SubID), uint32(subid.SubID), nil +} + +// Implements intf.SubidAlloc.Free +func (sub *subidAlloc) Free(id string) error { + logrus.Debugf("Free(%v)", formatter.ContainerID{id}) + return nil +} diff --git a/sysbox-mgr/subidAlloc/subidAllocSimple_test.go b/sysbox-mgr/subidAlloc/subidAllocSimple_test.go new file mode 100644 index 00000000..163977d1 --- /dev/null +++ b/sysbox-mgr/subidAlloc/subidAllocSimple_test.go @@ -0,0 +1,174 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package subidAlloc + +import ( + "strings" + "testing" + + "github.com/nestybox/sysbox-mgr/intf" + "github.com/nestybox/sysbox-runc/libcontainer/user" +) + +type allocTest struct { + id string + size uint64 + wantUid uint32 + wantGid uint32 + wantErr string +} + +func testAlloc(t *testing.T, subidAlloc intf.SubidAlloc, tests []allocTest) { + + for _, test := range tests { + gotUid, gotGid, gotErr := subidAlloc.Alloc(test.id, test.size) + + var errStr string + if gotErr == nil { + errStr = "" + } else { + errStr = gotErr.Error() + } + + if errStr != test.wantErr || gotUid != test.wantUid || gotGid != test.wantGid { + if errStr == "" { + errStr = "(no-error)" + } + if test.wantErr == "" { + test.wantErr = "(no-error)" + } + + t.Errorf("Alloc(%v, %v) failed: got = %v,%v,%v; want = %v,%v,%v", + test.id, test.size, gotUid, gotGid, errStr, test.wantUid, test.wantGid, test.wantErr) + } + } +} + +func TestAllocBasic(t *testing.T) { + + subuidCfg := strings.NewReader(`testUser:0:655360`) + subgidCfg := strings.NewReader(`testUser:0:655360`) + + subidAlloc, err := New("testUser", subuidCfg, subgidCfg) + if err != nil { + t.Errorf("failed to create allocator: %v", err) + return + } + + var tests = []allocTest{ + // id, size, wantUid, wantGid, wantErr + {"1", 65536, 0, 0, ""}, + {"2", 65536, 0, 0, ""}, + {"3", 65536, 0, 0, ""}, + } + + testAlloc(t, subidAlloc, tests) +} + +func TestAllocInvalidUser(t *testing.T) { + + subuidCfg := strings.NewReader(`testUser:0:131072`) + subgidCfg := strings.NewReader(`testUser:0:131072`) + + _, err := New("anotherUser", subuidCfg, subgidCfg) + if err == nil { + t.Errorf("idAlloc.New(): want error, got no error") + return + } +} + +func TestAllocMultiRange(t *testing.T) { + + subuidCfg := strings.NewReader(`testUser:0:65536 + testUser:524288:65536`) + + subgidCfg := strings.NewReader(`testUser:0:65536 + testUser:524288:65536`) + + subidAlloc, err := New("testUser", subuidCfg, subgidCfg) + if err != nil { + t.Errorf("failed to create 
allocator: %v", err) + return + } + + var tests = []allocTest{ + // id, size, wantUid, wantGid, wantErr + {"1", 65536, 0, 0, ""}, + {"2", 65536, 0, 0, ""}, + {"3", 65536, 0, 0, ""}, + } + + testAlloc(t, subidAlloc, tests) +} + +func TestGetCommonRanges(t *testing.T) { + + uidRanges := []user.SubID{{"1", 0, 5}, {"2", 7, 3}, {"3", 10, 6}, {"4", 20, 1}} + gidRanges := []user.SubID{{"1", 1, 5}, {"2", 7, 3}, {"3", 10, 7}, {"4", 20, 1}} + + want := []user.SubID{{"2", 7, 3}, {"4", 20, 1}} + got := getCommonRanges(uidRanges, gidRanges) + + if len(want) != len(got) { + t.Errorf("getCommonRanges(%v, %v) failed; want %v; got %v", uidRanges, gidRanges, want, got) + } + + for _, w := range want { + found := false + for _, g := range got { + if w == g { + found = true + } + } + if !found { + t.Errorf("getCommonRanges(%v, %v) failed; want %v; got %v", uidRanges, gidRanges, want, got) + } + } +} + +func TestAllocCommonRange(t *testing.T) { + + subuidCfg := strings.NewReader(`testUser:0:65536 + testUser:524288:65536`) + + subgidCfg := strings.NewReader(`testUser:65536:65536 + testUser:0:65536`) + + subidAlloc, err := New("testUser", subuidCfg, subgidCfg) + if err != nil { + t.Errorf("failed to create allocator: %v", err) + } + + var tests = []allocTest{ + // id, size, wantUid, wantGid, wantErr + {"1", 65536, 0, 0, ""}, + {"1", 65536, 0, 0, ""}, + } + + testAlloc(t, subidAlloc, tests) + + subuidCfg = strings.NewReader(`testUser:0:65536 + testUser:524288:65536`) + + subgidCfg = strings.NewReader(`testUser:65536:65536 + testUser:231072:65536`) + + subidAlloc, err = New("testUser", subuidCfg, subgidCfg) + if err == nil { + t.Errorf("subidAlloc() passed; expected failure") + } +} diff --git a/sysbox-mgr/utils.go b/sysbox-mgr/utils.go new file mode 100644 index 00000000..c16ad4be --- /dev/null +++ b/sysbox-mgr/utils.go @@ -0,0 +1,1025 @@ +// +// Copyright 2019-2020 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package main + +import ( + "bufio" + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "syscall" + + "github.com/nestybox/sysbox-libs/dockerUtils" + "github.com/nestybox/sysbox-libs/idMap" + "github.com/nestybox/sysbox-libs/linuxUtils" + "github.com/nestybox/sysbox-libs/mount" + "github.com/nestybox/sysbox-libs/overlayUtils" + "github.com/nestybox/sysbox-libs/shiftfs" + libutils "github.com/nestybox/sysbox-libs/utils" + intf "github.com/nestybox/sysbox-mgr/intf" + "github.com/nestybox/sysbox-mgr/subidAlloc" + "github.com/nestybox/sysbox-mgr/volMgr" + "github.com/opencontainers/runc/libcontainer/user" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + + "golang.org/x/sys/unix" +) + +const SHIFTFS_MAGIC int64 = 0x6a656a62 + +var progDeps = []string{"rsync", "modprobe", "iptables"} + +type exclusiveMntTable struct { + mounts map[string][]string // mount source -> list of containers using that mount source + lock sync.Mutex +} + +func newExclusiveMntTable() *exclusiveMntTable { + return &exclusiveMntTable{ + mounts: make(map[string][]string), + } +} + +func (t *exclusiveMntTable) add(mntSrc, containerId string) bool { + t.lock.Lock() + defer t.lock.Unlock() + + cids, found := t.mounts[mntSrc] + if found { + logrus.Warnf("mount source at %s should be mounted in one container only, but is 
already mounted in containers %v", mntSrc, cids) + } + t.mounts[mntSrc] = append(cids, containerId) + + return found +} + +func (t *exclusiveMntTable) remove(mntSrc, containerId string) { + t.lock.Lock() + defer t.lock.Unlock() + + cids, found := t.mounts[mntSrc] + if !found { + return + } + + cids = libutils.StringSliceRemove(cids, []string{containerId}) + + if len(cids) > 0 { + t.mounts[mntSrc] = cids + } else { + delete(t.mounts, mntSrc) + } +} + +func allocSubidRange(subID []user.SubID, size, min, max uint64) ([]user.SubID, error) { + var holeStart, holeEnd uint64 + + if size == 0 { + return subID, fmt.Errorf("invalid allocation size: %d", size) + } + + sortedSubID := subID + + // Sort the subIDs by starting range (simplifies the allocation) + sort.Slice(sortedSubID, func(i, j int) bool { + return sortedSubID[i].SubID < sortedSubID[j].SubID + }) + + holeStart = min + + for _, id := range sortedSubID { + holeEnd = uint64(id.SubID) + + if (holeEnd >= holeStart) && (holeEnd-holeStart >= size) { + sortedSubID = append(sortedSubID, user.SubID{Name: "sysbox", SubID: int64(holeStart), Count: int64(size)}) + return sortedSubID, nil + } + + holeStart = uint64(id.SubID + id.Count) + } + + holeEnd = max + if holeEnd-holeStart < size { + return sortedSubID, fmt.Errorf("failed to allocate %d subids in range %d, %d", size, min, max) + } + + sortedSubID = append(sortedSubID, user.SubID{Name: "sysbox", SubID: int64(holeStart), Count: int64(size)}) + return sortedSubID, nil +} + +func writeSubidFile(path string, subID []user.SubID) error { + var buf bytes.Buffer + for _, id := range subID { + l := fmt.Sprintf("%s:%d:%d\n", id.Name, id.SubID, id.Count) + buf.WriteString(l) + } + + return ioutil.WriteFile(path, []byte(buf.String()), 0644) +} + +func configSubidRange(path string, size, min, max uint64) error { + + subID, err := user.ParseSubIDFile(path) + if err != nil { + if os.IsNotExist(err) { + // We will create an new file with only the "sysbox" entry + subID = []user.SubID{} 
+ } else { + return fmt.Errorf("error parsing file %s: %s", path, err) + } + } + + // Check if there are any subids configured for user "sysbox" + numSysboxEntries := 0 + idx := 0 + for i, id := range subID { + if id.Name == "sysbox" { + numSysboxEntries = numSysboxEntries + 1 + idx = i + } + } + + // If a single valid subID range for user "sysbox" is found, let's use it. + if numSysboxEntries == 1 && uint64(subID[idx].Count) == size { + return nil + } + + // If there are multiple ranges for user sysbox (something we don't support) + // eliminate them and replace them with a single one. + if numSysboxEntries > 0 { + tmpSubID := []user.SubID{} + for _, id := range subID { + if id.Name != "sysbox" { + tmpSubID = append(tmpSubID, id) + } + } + subID = tmpSubID + } + + // Allocate range for user sysbox + subID, err = allocSubidRange(subID, size, min, max) + if err != nil { + return fmt.Errorf("failed to configure subid range for sysbox: %s", err) + } + + // Sort by subID + sort.Slice(subID, func(i, j int) bool { + return subID[i].SubID < subID[j].SubID + }) + + // Write it to the subuid file + if err = writeSubidFile(path, subID); err != nil { + return fmt.Errorf("failed to configure subid range for sysbox: %s", err) + } + + return nil +} + +// getSubidLimits returns the subuid min, subuid max, subgid min, and subgid max limits +// for the host (in that order) +func getSubidLimits(file string) ([]uint64, error) { + + // defaults (see login.defs(5); we set the max limits to 2^32 because uid(gid) + // are 32-bit, even though login.defs(5) indicates it's above this value) + limits := []uint64{100000, 4294967295, 100000, 4294967295} + + // check if these defaults are overridden by login.defs; if login.defs does not exist, move on. 
+ f, err := os.Open(file) + if err != nil { + return limits, nil + } + defer f.Close() + + tokens := map[string]uint{ + "SUB_UID_MIN": 0, + "SUB_UID_MAX": 1, + "SUB_GID_MIN": 2, + "SUB_GID_MAX": 3, + } + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + for token, pos := range tokens { + if strings.Contains(line, token) { + valStr := strings.Fields(line) + if len(valStr) < 2 { + return limits, fmt.Errorf("failed to parse file %s: line %s: expected two fields, found %d field(s)", file, line, len(valStr)) + } + limits[pos], err = strconv.ParseUint(valStr[1], 10, 64) + if err != nil { + return limits, fmt.Errorf("failed to parse line %s: %s", line, err) + } + } + } + } + + if err := scanner.Err(); err != nil { + return limits, fmt.Errorf("failed to scan file %s: %v", file, err) + } + + return limits, nil +} + +func setupSubidAlloc(ctx *cli.Context) (intf.SubidAlloc, error) { + + // get subid min/max limits from login.defs (if any) + limits, err := getSubidLimits("/etc/login.defs") + if err != nil { + return nil, err + } + + subUidMin := limits[0] + subUidMax := limits[1] + subGidMin := limits[2] + subGidMax := limits[3] + + // configure the subuid(gid) range for "sysbox" + if err := configSubidRange("/etc/subuid", subidRangeSize, subUidMin, subUidMax); err != nil { + return nil, err + } + if err := configSubidRange("/etc/subgid", subidRangeSize, subGidMin, subGidMax); err != nil { + return nil, err + } + + subuidSrc, err := os.Open("/etc/subuid") + if err != nil { + return nil, err + } + defer subuidSrc.Close() + + subgidSrc, err := os.Open("/etc/subgid") + if err != nil { + return nil, err + } + defer subgidSrc.Close() + + subidAlloc, err := subidAlloc.New("sysbox", subuidSrc, subgidSrc) + if err != nil { + return nil, err + } + + return subidAlloc, nil +} + +func setupDockerVolMgr(syncToRootfs bool) (intf.VolMgr, error) { + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "docker") + if err := 
os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's /var/lib/docker can't be + // on the following filesystems, as docker inside the sys container uses overlayfs for + // its images and overlayfs can't be mounted on top of these. + + unsupportedFs := map[string]int64{ + "tmpfs": unix.TMPFS_MAGIC, + "overlayfs": unix.OVERLAYFS_SUPER_MAGIC, + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for docker vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("dockerVolMgr", hostDir, syncToRootfs) +} + +func setupKubeletVolMgr(syncToRootfs bool) (intf.VolMgr, error) { + + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "kubelet") + if err := os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's /var/lib/kubelet + // directory can't be on the following filesystems, as kubelet inside the sys + // container does not support them. 
+ unsupportedFs := map[string]int64{ + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for kubelet vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("kubeletVolMgr", hostDir, syncToRootfs) +} + +func setupK0sVolMgr(syncToRootfs bool) (intf.VolMgr, error) { + + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "k0s") + if err := os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's + // /var/lib/k0s directory can't be on the following filesystems, + // as k0s inside the sys container does not support them. + unsupportedFs := map[string]int64{ + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for kubelet vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("k0sVolMgr", hostDir, syncToRootfs) +} + +func setupK3sVolMgr(syncToRootfs bool) (intf.VolMgr, error) { + + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "rancher-k3s") + if err := os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's + // /var/lib/rancher/k3s directory can't be on the following filesystems, + // as k3s inside the sys container does not support them. 
+ unsupportedFs := map[string]int64{ + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for kubelet vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("k3sVolMgr", hostDir, syncToRootfs) +} + +func setupRke2VolMgr(syncToRootfs bool) (intf.VolMgr, error) { + + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "rancher-rke2") + if err := os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's + // /var/lib/rancher/rke2 directory can't be on the following filesystems, + // as rke2 inside the sys container does not support them. + unsupportedFs := map[string]int64{ + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for kubelet vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("rke2VolMgr", hostDir, syncToRootfs) +} + +func setupBuildkitVolMgr(syncToRootfs bool) (intf.VolMgr, error) { + + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "buildkit") + if err := os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's + // /var/lib/buildkit directory can't be on the following filesystems, + // as buildkit inside the sys container does not support them. 
+ unsupportedFs := map[string]int64{ + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for kubelet vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("buildkitVolMgr", hostDir, syncToRootfs) +} + +func setupContainerdVolMgr(syncToRootfs bool) (intf.VolMgr, error) { + + var statfs syscall.Statfs_t + + hostDir := filepath.Join(sysboxLibDir, "containerd") + if err := os.MkdirAll(hostDir, 0700); err != nil { + return nil, fmt.Errorf("failed to create %v: %v", hostDir, err) + } + + // The host dir that is bind-mounted into the sys container's + // /var/lib/containerd/io.containerd.snapshotter.v1.overlayfs + // directory can't be on the following filesystems, as containerd inside the sys + // container does not support them. + unsupportedFs := map[string]int64{ + "shiftfs": SHIFTFS_MAGIC, + } + + if err := syscall.Statfs(hostDir, &statfs); err != nil { + return nil, fmt.Errorf("failed to find filesystem info for %s", hostDir) + } + + for name, magic := range unsupportedFs { + if int64(statfs.Type) == magic { + return nil, fmt.Errorf("host dir for containerd vol manager (%s) can't be on %v", hostDir, name) + } + } + + return volMgr.New("containerdVolMgr", hostDir, syncToRootfs) +} + +func setupRunDir() error { + + if err := os.MkdirAll(sysboxRunDir, 0700); err != nil { + return fmt.Errorf("failed to create %s: %s", sysboxRunDir, err) + } + + return nil +} + +func setupWorkDirs() error { + + // Cleanup work dirs in case they were left unclean from a prior session (e.g., if + // sysbox was running and stopped with SIGKILL) + if err := cleanupWorkDirs(); err != nil { + return err + } + + // SysboxLibDir requires slightly less stringent permissions to ensure + // that sysbox-runc is capable of operating in this path 
during container
+ // initialization. Also, note that even though SysboxLibDir is typically
+ // owned by 'root:root', here we are explicitly enforcing it to address
+ // (testing) scenarios where this may not be the case.
+ if err := os.MkdirAll(sysboxLibDir, 0710); err != nil {
+ return fmt.Errorf("failed to create %s: %s", sysboxLibDir, err)
+ }
+ if err := os.Chown(sysboxLibDir, int(0), int(0)); err != nil {
+ return fmt.Errorf("failed to chown %s: %s", sysboxLibDir, err)
+ }
+
+ return nil
+}
+
+func cleanupWorkDirs() error {
+
+ // Remove any mounts under the sysbox lib dir (we don't expect any because normally
+ // sysbox-mgr removes all mounts it creates, unless it was killed with SIGKILL).
+ mountinfos, err := mount.GetMounts()
+ if err != nil {
+ return fmt.Errorf("failed to obtain mounts: %s", err)
+ }
+
+ for _, mi := range mountinfos {
+ if strings.HasPrefix(mi.Mountpoint, sysboxLibDir+"/") {
+ if err := unix.Unmount(mi.Mountpoint, unix.MNT_DETACH); err != nil {
+ return fmt.Errorf("failed to unmount %s: %s", mi.Mountpoint, err)
+ }
+ }
+ }
+
+ // Remove the sysbox lib dir
+ if err := os.RemoveAll(sysboxLibDir); err != nil {
+ logrus.Warnf("failed to cleanup %s: %v", sysboxLibDir, err)
+ }
+
+ return nil
+}
+
+// Sanitize the given container's rootfs.
+func sanitizeRootfs(id, rootfs string) string {
+
+ // Docker containers on overlayfs have a rootfs under "/var/lib/docker/overlay2/<container-id>/merged".
+ // However, Docker removes the "merged" directory during container stop and re-creates
+ // it during container start. Thus, we can't rely on the presence of "merged" to
+ // determine if a container was stopped or removed. Instead, we use the rootfs path up
+ // to the <container-id> directory. 
+ + isDocker, err := dockerUtils.ContainerIsDocker(id, rootfs) + if err == nil && isDocker { + if strings.Contains(rootfs, "overlay2") && filepath.Base(rootfs) == "merged" { + return filepath.Dir(rootfs) + } + } + + return rootfs +} + +// getLinuxHeaderMounts returns a list of read-only mounts of the host's linux +// kernel headers. +func getLinuxHeaderMounts(kernelHdrPath string) ([]specs.Mount, error) { + + var path = kernelHdrPath + + if _, err := os.Stat(path); os.IsNotExist(err) { + logrus.Warnf("No kernel-headers found in host filesystem at %s. No headers will be mounted inside any of the containers.", path) + return []specs.Mount{}, nil + } + + // Create a mount-spec making use of the kernel-hdr-path in the host. This way, + // sys containers will have kernel-headers exposed in the same path utilized by + // the host. In addition to this, a softlink will be added to container's rootfs, + // if its expected kernel-header-path differs from the one of the host -- refer + // to reqFsState() for details. + // + // Finally, notice that here we enable 'follow' flag as some distros (e.g., Ubuntu) + // heavily symlink the linux-header directory. + mounts, err := createMountSpec( + path, + path, + "bind", + []string{"ro", "rbind", "rprivate"}, + true, + "/usr/src", + ) + if err != nil { + return nil, + fmt.Errorf("failed to create mount spec for linux headers at %s: %v", path, err) + } + + return mounts, nil +} + +// getLibModMount returns a list of read-only mounts for the host's kernel modules dir (/lib/modules/). +func getLibModMounts() ([]specs.Mount, error) { + + kernelRel, err := linuxUtils.GetKernelRelease() + if err != nil { + return nil, err + } + + mounts := []specs.Mount{} + path := filepath.Join("/lib/modules/", kernelRel) + + _, err = os.Stat(path) + if os.IsNotExist(err) { + logrus.Warnf("No lib-modules found in host filesystem at %s. 
lib-modules won't be mounted inside any of the containers.", path) + return mounts, nil + } else if err != nil { + return nil, err + } + + // Do *not* follow symlinks as they normally point to the linux headers which we + // mount also (see getLinuxHeaderMounts()). + mounts, err = createMountSpec( + path, + path, + "bind", + []string{"ro", "rbind", "rprivate"}, + false, + "/usr/src", + ) + + if err != nil { + return nil, fmt.Errorf("failed to create mount spec for linux modules at %s: %v", + path, err) + } + + return mounts, nil + } + + // createMountSpec returns a mount spec with the given source, destination, type, and + // options. 'source' must be an absolute path. 'dest' is absolute with respect to the + // container's rootfs. If followSymlinks is true, this function follows symlinks under the + // source path and returns additional mount specs to ensure the symlinks are valid at the + // destination. If filter is not empty, only symlinks that resolve to paths that + // are prefixed by the filter string are allowed. + func createMountSpec( + source string, + dest string, + mountType string, + mountOpt []string, + followSymlinks bool, + filter string) ([]specs.Mount, error) { + + mounts := []specs.Mount{} + m := specs.Mount{ + Source: source, + Destination: dest, + Type: mountType, + Options: mountOpt, + } + mounts = append(mounts, m) + + if !followSymlinks { + return mounts, nil + } + + // Follow symlinks under source, and create mount specs for the host dirs + // pointed to by the symlinks. 
+ links, err := followSymlinksUnder(source, true) + if err != nil { + return nil, fmt.Errorf("failed to follow symlinks under %s: %v", source, err) + } + + // apply symlink filtering + if filter != "" { + filter = filepath.Clean(filter) + links = libutils.StringSliceRemoveMatch(links, func(s string) bool { + if strings.HasPrefix(s, filter+"/") { + return false + } + return true + }) + } + + if len(links) == 0 { + return mounts, nil + } + + // Find the longest common path for directories prefixed by the given filter. + levelOneSubdirs := findSubPaths(links, filter) + + for _, paths := range levelOneSubdirs { + lcp := longestCommonPath(paths) + lcp = filepath.Clean(lcp) + + // Skip if we are matching the original (above) mount-spec. + // NOTE: this assumes sources = dest in the given mount-spec. + if lcp == source && lcp == dest { + continue + } + + // if the lcp is underneath the source, ignore it + if !strings.HasPrefix(lcp, source+"/") { + m := specs.Mount{ + Source: lcp, + Destination: lcp, + Type: mountType, + Options: mountOpt, + } + mounts = append(mounts, m) + } + } + + return mounts, nil +} + +// Given a list of filepaths and a prefix, returns the top level files/dirs +// under that prefix. The top level dirs are stored as keys in a map; the value +// associated with that key is all the files/dirs under that path. 
+func findSubPaths(paths []string, prefix string) map[string][]string { + levelOnePaths := make(map[string][]string) + + for _, path := range paths { + p := strings.TrimPrefix(path, prefix+"/") + comp := strings.Split(p, "/") + + name := "/" + comp[0] + if prefix != "" { + name = filepath.Join(prefix, comp[0]) + } + + val, ok := levelOnePaths[name] + if !ok { + levelOnePaths[name] = []string{path} + } else { + levelOnePaths[name] = append(val, path) + } + } + + return levelOnePaths +} + +// finds longest-common-path among the given absolute paths +func longestCommonPath(paths []string) string { + + if len(paths) == 0 { + return "" + } else if len(paths) == 1 { + return paths[0] + } + + // find the shortest and longest paths in the set + shortest, longest := paths[0], paths[0] + for _, p := range paths[1:] { + if p < shortest { + shortest = p + } else if p > longest { + longest = p + } + } + + // find the first 'i' common characters between the shortest and longest paths + lcp := shortest + for i := 0; i < len(shortest) && i < len(longest); i++ { + if shortest[i] != longest[i] { + lcp = shortest[:i] + break + } + } + + // if the longest common prefix does not end on a path separator, we may + // have left a path component truncated, and we need to strip it off + // (the longest common path of "/root/aba" and "/root/aca" is "/root/" and not "/root/a") + if !strings.HasSuffix(lcp, "/") { + // in the case we have something like "/root/a" and "/root/a/b", no need to strip "a" off + if (len(lcp) < len(shortest) && shortest[len(lcp)] != '/') || + (len(lcp) < len(longest) && longest[len(lcp)] != '/') { + if idx := strings.LastIndex(lcp, "/"); idx != -1 { + lcp = lcp[:idx] + } + } + } + + return lcp +} + +// returns a list of all symbolic links under the given directory +func followSymlinksUnder(dir string, skipDangling bool) ([]string, error) { + + // walk dir; if file is symlink (use os.Lstat()), readlink() and add to slice + symlinks := []string{} + + err := 
filepath.Walk(dir, func(path string, info os.FileInfo, err error) error { + var ( + fi os.FileInfo + realpath string + linkDest string + ) + + if path == dir { + return nil + } + fi, err = os.Lstat(path) + if err != nil { + return fmt.Errorf("failed to lstat %s: %v", path, err) + } + if fi.Mode()&os.ModeSymlink == 0 { + return nil + } + + linkDest, err = os.Readlink(path) + if err != nil { + return fmt.Errorf("failed to resolve symlink at %s: %v", path, err) + } + + if filepath.IsAbs(linkDest) { + realpath = linkDest + } else { + realpath = filepath.Join(filepath.Dir(path), linkDest) + } + + if skipDangling { + _, err = os.Stat(realpath) + if err != nil { + if os.IsNotExist(err) { + return nil + } else { + return fmt.Errorf("failed to stat %s: %v", realpath, err) + } + } + } + + symlinks = append(symlinks, realpath) + return nil + }) + + if err != nil { + return nil, err + } + + return symlinks, nil +} + +func mntSrcUidShiftNeeded(mntSrc string, uid, gid uint32) (bool, uint32, uint32, error) { + + // The determination on whether to uid-shift the given mount source directory + // is done by checking the ownership of the dir and the first level subdirs + // under it (if any), and comparing their owner:group versus that of the + // container's root user. This heuristic works well for the mounts for which + // we normally do preps (e.g., mounts over the container's /var/lib/docker, + // /var/lib/kubelet, etc.). We want to avoid an exhaustive check as it can be + // quite slow if the directory hierarchy underneath the mount source is + // extensive (e.g., if we are bind-mounting a fully populated docker cache on + // the host to the container's /var/lib/docker). 
+ + var mntSrcUid, mntSrcGid uint32 + + // mnt src ownership check + fi, err := os.Stat(mntSrc) + if err != nil { + return false, 0, 0, err + } + + st, _ := fi.Sys().(*syscall.Stat_t) + + mntSrcUid = st.Uid + mntSrcGid = st.Gid + + // If the host uid assigned to the container's root user differs from the + // uid of the dir being mounted into the container, then we perform uid + // shifting. Same for gid. + if uid != mntSrcUid && gid != mntSrcGid { + return true, mntSrcUid, mntSrcGid, nil + } + + // If the mount dir has same ownership as the container, check the subdirs + // before we make a determination on whether ownership shifting will be + // required. + dirFis := []os.FileInfo{} + + dirFis, err = ioutil.ReadDir(mntSrc) + if err != nil { + return false, 0, 0, err + } + + numNeedChown := 0 + for _, fi := range dirFis { + st, _ := fi.Sys().(*syscall.Stat_t) + if uid != st.Uid || gid != st.Gid { + numNeedChown += 1 + } + } + + needChown := (numNeedChown == len(dirFis)) + + return needChown, mntSrcUid, mntSrcGid, nil +} + +func preFlightCheck() error { + for _, prog := range progDeps { + if !libutils.CmdExists(prog) { + return fmt.Errorf("%s is not installed on host.", prog) + } + } + + return nil +} + +func getInode(file string) (uint64, error) { + var st unix.Stat_t + + if err := unix.Stat(file, &st); err != nil { + return 0, fmt.Errorf("unable to stat %s: %s", file, err) + } + + return st.Ino, nil +} + +func checkIDMapMountSupport(ctx *cli.Context) (bool, bool, error) { + + // The sysbox lib dir may have restrictive permissions; loosen those up + // temporarily while we perform the ID-map check since it runs inside a + // Linux user-ns. 
+ fi, err := os.Stat(sysboxLibDir) + if err != nil { + return false, false, err + } + origPerm := fi.Mode() + + if err := os.Chmod(sysboxLibDir, 0755); err != nil { + return false, false, fmt.Errorf("failed to chmod %s to 0755: %s", sysboxLibDir, err) + } + + defer func() () { + os.Chmod(sysboxLibDir, origPerm) + }() + + IDMapMountOk, err := idMap.IDMapMountSupported(sysboxLibDir) + if err != nil { + return false, false, fmt.Errorf("failed to check kernel ID-mapping support: %v", err) + } + + ovfsOnIDMapMountOk, err := idMap.OverlayfsOnIDMapMountSupported(sysboxLibDir) + if err != nil { + return false, false, fmt.Errorf("failed to check kernel ID-mapping-on-overlayfs support: %v", err) + } + + if err := os.Chmod(sysboxLibDir, origPerm); err != nil { + return false, false, fmt.Errorf("failed to chmod %s back to %o: %s", sysboxLibDir, origPerm, err) + } + + return IDMapMountOk, ovfsOnIDMapMountOk, nil +} + +func checkShiftfsSupport(ctx *cli.Context) (bool, bool, error) { + + // The sysbox lib dir may have restrictive permissions; loosen those up + // temporarily while we perform the shiftfs check since it runs inside a + // Linux user-ns. 
+ fi, err := os.Stat(sysboxLibDir) + if err != nil { + return false, false, err + } + origPerm := fi.Mode() + + if err := os.Chmod(sysboxLibDir, 0755); err != nil { + return false, false, fmt.Errorf("failed to chmod %s to 0755: %s", sysboxLibDir, err) + } + + defer func() () { + os.Chmod(sysboxLibDir, origPerm) + }() + + shiftfsOk, err := shiftfs.ShiftfsSupported(sysboxLibDir) + if err != nil { + return false, false, fmt.Errorf("failed to check kernel shiftfs support: %v", err) + } + + shiftfsOnOvfsOk, err := shiftfs.ShiftfsSupportedOnOverlayfs(sysboxLibDir) + if err != nil { + return false, false, fmt.Errorf("failed to check kernel shiftfs-on-overlayfs support: %v", err) + } + + if err := os.Chmod(sysboxLibDir, origPerm); err != nil { + return false, false, fmt.Errorf("failed to chmod %s back to %o: %s", sysboxLibDir, origPerm, err) + } + + return shiftfsOk, shiftfsOnOvfsOk, nil +} + +func isRootfsOnOverlayfs(rootfs string) (bool, error) { + fsName, err := libutils.GetFsName(rootfs) + if err != nil { + return false, err + } + if fsName != "overlayfs" { + return false, nil + } + return true, nil +} + +func getRootfsOverlayUpperLayer(rootfs string) (string, error) { + mounts, err := mount.GetMountsPid(uint32(os.Getpid())) + if err != nil { + return "", err + } + mi, err := mount.GetMountAt(rootfs, mounts) + if err != nil { + return "", nil + } + ovfsMntOpts := overlayUtils.GetMountOpt(mi) + ovfsUpperLayer := overlayUtils.GetUpperLayer(ovfsMntOpts) + + return ovfsUpperLayer, nil +} + +// ifThenElse is one-liner for "condition? a : b" +func ifThenElse(condition bool, a interface{}, b interface{}) interface{} { + if condition { + return a + } + return b +} diff --git a/sysbox-mgr/utils_test.go b/sysbox-mgr/utils_test.go new file mode 100644 index 00000000..91cf5704 --- /dev/null +++ b/sysbox-mgr/utils_test.go @@ -0,0 +1,444 @@ +// +// Copyright 2019-2020 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package main + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/user" + "golang.org/x/sys/unix" +) + +func compareSubidRanges(t *testing.T, want, got []user.SubID) { + if len(got) != len(want) { + t.Errorf("AllocSubidRange(): want %v, got %v", want, got) + } + for i, _ := range want { + if got[i] != want[i] { + t.Errorf("AllocSubidRange(): want %v, got %v", want, got) + } + } +} + +func TestAllocSubidRange(t *testing.T) { + + var subID, got, want []user.SubID + var min uint64 = 100000 + var max uint64 = 600100000 + var err error + + // at end of range + subID = []user.SubID{ + {"user1", 100000, 65536}, + {"user2", 165536, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 231072, 65536}) + compareSubidRanges(t, want, got) + + // at beginning of range + subID = []user.SubID{ + {"user2", 165536, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 100000, 65536}) + compareSubidRanges(t, want, got) + + // in middle of range + subID = []user.SubID{ + {"user1", 100000, 65536}, + {"user2", 231072, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + 
t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 165536, 65536}) + compareSubidRanges(t, want, got) + + // with overlapping ranges + subID = []user.SubID{ + {"user1", 100000, 65536}, + {"user2", 100000, 65536}, + {"user3", 165536, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 231072, 65536}) + compareSubidRanges(t, want, got) + + // more overlapping ranges + subID = []user.SubID{ + {"user1", 100000, 65536}, + {"user2", 120000, 65536}, + {"user3", 165536, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 231072, 65536}) + compareSubidRanges(t, want, got) + + // empty range + subID = []user.SubID{} + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 100000, 65536}) + compareSubidRanges(t, want, got) + + // not enought ids + max = 165536 + subID = []user.SubID{ + {"user1", 100000, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err == nil { + t.Errorf("AllocSubidRange(): expected alloc error, got no error") + } + + max = 165536 + subID = []user.SubID{ + {"user1", 100000, 4096}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err == nil { + t.Errorf("AllocSubidRange(): expected alloc error, got no error") + } + + // off-by-one tests + max = 165536 + subID = []user.SubID{ + {"user1", 100000, 65536}, + } + got, err = allocSubidRange(subID, 1, min, max) + if err == nil { + t.Errorf("AllocSubidRange(): expected alloc error, got no error") + } + + subID = []user.SubID{ + {"user1", 100000, 65535}, + } + got, err = allocSubidRange(subID, 1, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, 
user.SubID{"sysbox", 165535, 1}) + compareSubidRanges(t, want, got) + + // invalid min/max/size + min = 100000 + max = 100000 + subID = []user.SubID{} + got, err = allocSubidRange(subID, 1, min, max) + if err == nil { + t.Errorf("AllocSubidRange(): expected alloc error, got no error") + } + + subID = []user.SubID{} + got, err = allocSubidRange(subID, 0, min, max) + if err == nil { + t.Errorf("AllocSubidRange(): expected alloc error, got no error") + } + + // un-sorted ranges + subID = []user.SubID{ + {"user1", 100000, 65536}, + {"user2", 231072, 65536}, + {"user3", 165536, 65536}, + {"user4", 362144, 65536}, + } + got, err = allocSubidRange(subID, 65536, min, max) + if err != nil { + t.Errorf("AllocSubidRange(): %v", err) + } + want = append(subID, user.SubID{"sysbox", 296608, 65536}) + compareSubidRanges(t, want, got) +} + +func verifyFileData(path string, data []byte) error { + + fileData, err := ioutil.ReadFile(path) + if err != nil { + return fmt.Errorf("failed to read file %s: %v", path, err) + } + + if bytes.Compare(fileData, data) != 0 { + return fmt.Errorf("file data mismatch: want %s, got %s", string(data), string(fileData)) + } + + return nil +} + +func testConfigSubidRangeHelper(subidFilePre, subidFilePost string, size, min, max uint64) error { + + f, err := ioutil.TempFile("", "testConfigSubidRange*") + if err != nil { + return fmt.Errorf("failed to create temp file: %v", err) + } + defer os.RemoveAll(f.Name()) + + if err := ioutil.WriteFile(f.Name(), []byte(subidFilePre), 0644); err != nil { + return fmt.Errorf("failed to write file %s: %v", f.Name(), err) + } + + if err := configSubidRange(f.Name(), size, min, max); err != nil { + return fmt.Errorf("configSubidRange(): error = %s", err) + } + + verifyFileData(f.Name(), []byte(subidFilePost)) + + return nil +} + +func TestConfigSubidRange(t *testing.T) { + + var subidFilePre, subidFilePost string + + // at end of range + subidFilePre = `user1:100000:65536 +user2:165536:65536` + subidFilePost = 
`user1:100000:65536 +user2:165536:65536 +sysbox:231072:268435456 +` + if err := testConfigSubidRangeHelper(subidFilePre, subidFilePost, 268435456, 100000, 600100000); err != nil { + t.Errorf(err.Error()) + } + + // at beginning of range + subidFilePre = `user2:165536:65536` + subidFilePost = `sysbox:100000:65536 +user2:165536:65536` + + if err := testConfigSubidRangeHelper(subidFilePre, subidFilePost, 65536, 100000, 600100000); err != nil { + t.Errorf(err.Error()) + } + + // in the middle of range + subidFilePre = `user1:100000:65536 +user2:231072:65536` + subidFilePost = `user1:100000:65536 +sysbox:165536:65536 +user2:231072:65536` + + if err := testConfigSubidRangeHelper(subidFilePre, subidFilePost, 65536, 100000, 600100000); err != nil { + t.Errorf(err.Error()) + } + + // not enought ids + subidFilePre = `user1:100000:65536` + if err := testConfigSubidRangeHelper(subidFilePre, subidFilePost, 600034465, 100000, 600100000); err == nil { + t.Errorf("configSubidRange(): expected alloc error, got no error") + } + + // do not disturb existing sysbox entry + subidFilePre = `user1:100000:65536 +sysbox:231072,65536 +user3:296608:65536` + + subidFilePost = subidFilePre + + if err := testConfigSubidRangeHelper(subidFilePre, subidFilePost, 65536, 100000, 600100000); err != nil { + t.Errorf(err.Error()) + } + + // replace redundant sysbox entries with one entry + subidFilePre = `user1:100000:65536 +sysbox:231072,65536 +user3:165536:65536 +sysbox:362144,65536` + + subidFilePost = `user1:100000:65536 +user3:165536:65536 +sysbox:231072,65536` + + if err := testConfigSubidRangeHelper(subidFilePre, subidFilePost, 65536, 100000, 600100000); err != nil { + t.Errorf(err.Error()) + } +} + +func testGetSubidLimitsHelper(fileData string, want []uint64) error { + + f, err := ioutil.TempFile("", "testGetSubidLimits*") + if err != nil { + return fmt.Errorf("failed to create temp file: %v", err) + } + + if err := ioutil.WriteFile(f.Name(), []byte(fileData), 0644); err != nil { + return 
fmt.Errorf("failed to write file %s: %v", f.Name(), err) + } + + limits, err := getSubidLimits(f.Name()) + if err != nil { + return fmt.Errorf("getSubidLimits(): error = %s", err) + } + + if len(limits) != 4 { + return fmt.Errorf("getSubidLimits(): limits length incorrect: want 4, got %d", len(limits)) + } + + for i := 0; i < 4; i++ { + if limits[i] != want[i] { + return fmt.Errorf("getSubidLimits(): failed: got %v, want %v", limits, want) + } + } + + if err := os.Remove(f.Name()); err != nil { + return fmt.Errorf("failed to remove file %s", f.Name()) + } + + return nil +} + +func TestGetSubidLimits(t *testing.T) { + + // fake login.defs data + fileData := `# some comments +some data +SUB_UID_MIN 100000 +some data +SUB_UID_MAX\t 600100000 +some data +SUB_GID_MIN 100000 +some data +SUB_GID_MAX\t\t 2147483648 +# some more comments` + + want := []uint64{100000, 600100000, 100000, 2147483648} + if err := testGetSubidLimitsHelper(fileData, want); err != nil { + t.Errorf(err.Error()) + } + + // login.defs file without uid(gid) limits + fileData = `# some comments +some data +# some more comments` + + want = []uint64{100000, 4294967295, 100000, 4294967295} + if err := testGetSubidLimitsHelper(fileData, want); err != nil { + t.Errorf(err.Error()) + } +} + +func TestGetLibModMounts(t *testing.T) { + + var utsname unix.Utsname + if err := unix.Uname(&utsname); err != nil { + t.Errorf("cfgLibModMount: uname failed: %v", err) + } + + n := bytes.IndexByte(utsname.Release[:], 0) + path := filepath.Join("/lib/modules/", string(utsname.Release[:n])) + if _, err := os.Stat(path); os.IsNotExist(err) { + return // skip test + } + + mounts, err := getLibModMounts() + if err != nil { + t.Errorf("cfgLibModMount: returned error: %v", err) + } + m := mounts[0] + if (m.Destination != path) || (m.Source != path) || (m.Type != "bind") { + t.Errorf("cfgLibModMount: failed basic mount test") + } +} + +func TestFindSubPaths(t *testing.T) { + paths := []string{ + "/my/prefix/a/b", + 
"/my/prefix/a/b/c/d", + "/my/prefix/a/b/c", + "/my/prefix/a/b/c/d/e", + "/my/prefix/x/y", + "/my/prefix/x/y/z", + } + + prefix := "/my/prefix" + subpaths := findSubPaths(paths, prefix) + + for k, v := range subpaths { + if k != "/my/prefix/a" && k != "/my/prefix/x" { + t.Errorf("Unexpected key in subpath map: %s", k) + } + if k == "/my/prefix/a" && len(v) != 4 || + k == "/my/prefix/x" && len(v) != 2 { + t.Errorf("Unexpected len(val) in subpath map for %s: %v", k, v) + } + //t.Logf("OK: %s: %v", k, v) + } + + prefix = "" + subpaths = findSubPaths(paths, prefix) + for k, v := range subpaths { + if k != "/my" { + t.Errorf("Unexpected key in subpath map: %s", k) + } + if len(v) != len(paths) { + t.Errorf("Unexpected len(val) in subpath map: want %d, got %d", len(paths), len(v)) + } + //t.Logf("OK: %s: %v", k, v) + } + +} + +func TestLongestCommonPath(t *testing.T) { + paths := []string{ + "/my/prefix/a/b", + "/my/prefix/a/b/c/d", + "/my/prefix/a/b/c", + "/my/prefix/a/b/c/d/e", + "/my/prefix/d/e/f/g", + "/my/prefix/d/e/f/g/h/i/j/k", + "/my/prefix/d/e/f/g/h/i", + "/my/prefix/d/e/f/g/h/i/j", + } + + prefix := "/my/prefix" + subpaths := findSubPaths(paths, prefix) + + for k, v := range subpaths { + if k != "/my/prefix/a" && k != "/my/prefix/d" { + t.Errorf("Unexpected key in subpath map: %s", k) + } + + lcp := longestCommonPath(v) + + if k == "/my/prefix/a" && lcp != "/my/prefix/a/b" { + t.Errorf("Unexpected lcp; want \"/my/prefix/a/b\", got \"%s\"", lcp) + } + if k == "/my/prefix/d" && lcp != "/my/prefix/d/e/f/g" { + t.Errorf("Unexpected lcp; want \"/my/prefix/d/e/f\", got \"%s\"", lcp) + } + //t.Logf("OK: k = %s, lcp = %s", k, lcp) + } +} diff --git a/sysbox-mgr/volMgr/volMgr.go b/sysbox-mgr/volMgr/volMgr.go new file mode 100644 index 00000000..59c53053 --- /dev/null +++ b/sysbox-mgr/volMgr/volMgr.go @@ -0,0 +1,409 @@ +// +// Copyright 2019-2021 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// The volume manager manages a directory on the host that is bind-mounted into the sys +// container, typically to overcome problems that arise if those directories were to be on +// the sys container's rootfs (which typically uses overlayfs or shiftfs-on-overlayfs +// mounts when uid shifting is enabled). The bind-mount overcomes these problems since the +// source of the mount is a directory that is on the host's filesystem (typically ext4). +// +// The volume manager takes care of ensuring that the backing host directory has correct +// ownership to allow sys container root processes to access it, and also handles copying +// contents from the sys container rootfs to the backing dir when the container is started, +// and vice-versa when container is stopped or paused. 
+ +package volMgr + +import ( + "bytes" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strings" + "sync" + + "github.com/nestybox/sysbox-libs/formatter" + "github.com/nestybox/sysbox-libs/idShiftUtils" + mount "github.com/nestybox/sysbox-libs/mount" + overlayUtils "github.com/nestybox/sysbox-libs/overlayUtils" + utils "github.com/nestybox/sysbox-libs/utils" + "github.com/nestybox/sysbox-mgr/intf" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +type volInfo struct { + volPath string // volume path in host + rootfs string // container rootfs + mountPath string // container path where volume is mounted + syncOutPath string // container path for volume sync-out + uid uint32 // uid owner for container + gid uint32 // gid owner for container + shiftUids bool // chown uid(gid) when copying to/from container rootfs + perm os.FileMode // permissions for the volume +} + +type vmgr struct { + name string + hostDir string + sync bool + volTable map[string]volInfo // cont id -> volume info + mu sync.Mutex +} + +type shiftType int + +const ( + shiftUp shiftType = iota + shiftDown +) + +// Creates a new instance of the volume manager. +// 'name' is the name for this volume manager. +// 'hostDir' is the directory on the host which the manager will use for its operations. +// 'sync' indicates if the volume contents should be sync'd with those of the mountpoint. 
+func New(name, hostDir string, sync bool) (intf.VolMgr, error) { + return &vmgr{ + name: name, + hostDir: hostDir, + sync: sync, + volTable: make(map[string]volInfo), + }, nil +} + +// Implements intf.VolMgr.CreateVol +func (m *vmgr) CreateVol(id, rootfs, mountpoint string, uid, gid uint32, chownOnSync bool, perm os.FileMode) ([]specs.Mount, error) { + var err error + + volPath := filepath.Join(m.hostDir, id) + if _, err = os.Stat(volPath); err == nil { + return nil, fmt.Errorf("volume dir for container %v already exists", id) + } + + mountPath := filepath.Join(rootfs, mountpoint) + + rootfsOnOvfs, rootfsOvfsUpper, err := isRootfsOnOverlayfs(rootfs) + if err != nil { + return nil, err + } + + // When the container stops and we need to copy the volume contents back to + // the container's rootfs. We call this "sync-out", and syncOutPath is the + // path were we want to copy to. + syncOutPath := mountPath + + // If the container rootfs is on overlayfs (common case), the syncOutPath + // can't be the overlayfs merged dir. That's because sysbox-runc may have + // remounted that in the container's mount ns (e.g., when using id-mapping on + // the rootfs), so sysbox-mgr won't have access to it. Instead the + // syncOutPath is the overlayfs "upper" dir. 
+ if rootfsOnOvfs { + syncOutPath = filepath.Join(rootfsOvfsUpper, mountpoint) + } + + // create volume info + m.mu.Lock() + if _, found := m.volTable[id]; found { + m.mu.Unlock() + return nil, fmt.Errorf("volume for container %v already exists", id) + } + vi := volInfo{ + volPath: volPath, + rootfs: rootfs, + mountPath: mountPath, + syncOutPath: syncOutPath, + uid: uid, + gid: gid, + shiftUids: chownOnSync, + perm: perm, + } + m.volTable[id] = vi + m.mu.Unlock() + + defer func() { + if err != nil { + m.mu.Lock() + delete(m.volTable, id) + m.mu.Unlock() + } + }() + + if err = os.Mkdir(volPath, perm); err != nil { + return nil, fmt.Errorf("failed to create volume for container %v: %v", id, err) + } + + if err = os.Chown(volPath, int(uid), int(gid)); err != nil { + os.RemoveAll(volPath) + return nil, fmt.Errorf("failed to set ownership of volume %v: %v", volPath, err) + } + + // Sync the contents of container's mountpoint to the newly created volume ("sync-in") + if _, err := os.Stat(mountPath); err == nil { + if err = m.rsyncVol(mountPath, volPath, uid, gid, chownOnSync, shiftUp); err != nil { + os.RemoveAll(volPath) + return nil, fmt.Errorf("volume sync-in failed: %v", err) + } + } + + mounts := []specs.Mount{ + { + Source: volPath, + Destination: mountpoint, + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + } + + logrus.Debugf("%s: created volume for container %s", + m.name, formatter.ContainerID{id}) + + return mounts, nil +} + +// Implements intf.VolMgr.DestroyVol +func (m *vmgr) DestroyVol(id string) error { + + m.mu.Lock() + vi, found := m.volTable[id] + if !found { + m.mu.Unlock() + return fmt.Errorf("failed to find vol info for container %s", + formatter.ContainerID{id}) + } + volPath := vi.volPath + m.mu.Unlock() + + if _, err := os.Stat(volPath); err != nil { + return fmt.Errorf("failed to stat %v: %v", volPath, err) + } + + if err := os.RemoveAll(volPath); err != nil { + return fmt.Errorf("failed to remove %v: %v", volPath, err) + } + + 
m.mu.Lock() + delete(m.volTable, id) + m.mu.Unlock() + + logrus.Debugf("%s: destroyed volume for container %s", + m.name, formatter.ContainerID{id}) + + return nil +} + +// Implements intf.VolMgr.SyncOut +func (m *vmgr) SyncOut(id string) error { + + if !m.sync { + return nil + } + + m.mu.Lock() + vi, found := m.volTable[id] + if !found { + m.mu.Unlock() + return fmt.Errorf("failed to find vol info for container %s", + formatter.ContainerID{id}) + } + m.mu.Unlock() + + // If the container's rootfs is gone, bail + if _, err := os.Stat(vi.rootfs); os.IsNotExist(err) { + logrus.Debugf("%s: volume sync-out for container %s skipped: target %s does not exist", + m.name, formatter.ContainerID{id}, vi.rootfs) + return nil + } + + // if the sync out target does not exist, create it (but only if we are going + // to be copying anything to it). + if _, err := os.Stat(vi.syncOutPath); os.IsNotExist(err) { + volIsEmpty, err := dirIsEmpty(vi.volPath) + if err != nil { + return fmt.Errorf("error while checking if %s is empty: %s", vi.volPath, err) + } + if !volIsEmpty { + if err := os.MkdirAll(vi.syncOutPath, vi.perm); err != nil { + return fmt.Errorf("failed to create directory %s: %s", vi.syncOutPath, err) + } + } + } + + // If the sync-out target exists, perform the rsync + if _, err := os.Stat(vi.syncOutPath); err == nil { + if err := m.rsyncVol(vi.volPath, vi.syncOutPath, vi.uid, vi.gid, vi.shiftUids, shiftDown); err != nil { + + // For sync-outs, the operation may fail if the target is removed while + // we are doing the copy. In this case we ignore the error since there + // is no data loss (the data being sync'd out would have been removed + // anyways). 
+ + _, err2 := os.Stat(vi.syncOutPath) + if err2 != nil && os.IsNotExist(err2) { + logrus.Debugf("%s: volume sync-out for container %s skipped: target %s does not exist", + m.name, formatter.ContainerID{id}, vi.syncOutPath) + return nil + } + + return fmt.Errorf("volume sync-out failed: %v", err) + } + } + + logrus.Debugf("%s: sync'd-out volume for container %s", + m.name, formatter.ContainerID{id}) + + return nil +} + +// Implements intf.VolMgr.SyncOutAndDestroyAll +func (m *vmgr) SyncOutAndDestroyAll() { + for id, _ := range m.volTable { + if err := m.SyncOut(id); err != nil { + logrus.Warnf("%s: failed to sync-out volumes for container %s: %s", + m.name, formatter.ContainerID{id}, err) + } + if err := m.DestroyVol(id); err != nil { + logrus.Warnf("%s: failed to destroy volumes for container %s: %s", + m.name, formatter.ContainerID{id}, err) + } + } +} + +// rsyncVol performs an rsync from src to dest. If shiftUids is true, it also +// performs filesystem user-ID and group-ID shifting (via chown) using an +// offset specified via uid and gid. +// +// Note that depending no how much data is transferred, this operation can +// result in many file descriptors being opened by rsync, which the kernel may +// account to sysbox-mgr. Thus, the file open limit for sysbox-mgr should be +// very high / unlimited since the number of open files depends on how much data +// there is to copy and how many containers are active at a given time. 
+func (m *vmgr) rsyncVol(src, dest string, uid, gid uint32, shiftUids bool, shiftT shiftType) error { + + var cmd *exec.Cmd + var output bytes.Buffer + var usermap, groupmap string + + if shiftUids { + srcUidList, srcGidList, err := idShiftUtils.GetDirIDs(src) + if err != nil { + return fmt.Errorf("failed to get user and group IDs for %s: %s", src, err) + } + + // Get the usermap and groupmap options to pass to rsync + usermap = rsyncIdMapOpt(srcUidList, uid, shiftT) + groupmap = rsyncIdMapOpt(srcGidList, gid, shiftT) + + if usermap != "" { + usermap = "--usermap=" + usermap + } + + if groupmap != "" { + groupmap = "--groupmap=" + groupmap + } + } + + // Note: rsync uses file modification time and size to determine if a sync is + // needed. This should be fine for sync'ing the sys container's directories, + // assuming the probability of files being different yet having the same size & + // timestamp is low. If this assumption changes we could pass the `--checksum` option + // to rsync, but this will slow the copy operation significantly. 
+ srcDir := src + "/" + + if usermap == "" && groupmap == "" { + cmd = exec.Command("rsync", "-rauqlH", "--no-devices", "--delete", srcDir, dest) + } else if usermap != "" && groupmap == "" { + cmd = exec.Command("rsync", "-rauqlH", "--no-devices", "--delete", usermap, srcDir, dest) + } else if usermap == "" && groupmap != "" { + cmd = exec.Command("rsync", "-rauqlH", "--no-devices", "--delete", groupmap, srcDir, dest) + } else { + cmd = exec.Command("rsync", "-rauqlH", "--no-devices", "--delete", usermap, groupmap, srcDir, dest) + } + + cmd.Stdout = &output + cmd.Stderr = &output + + err := cmd.Run() + if err != nil { + return fmt.Errorf("rsync %s to %s: %v %v", srcDir, dest, string(output.Bytes()), err) + } + + return nil +} + +func rsyncIdMapOpt(idList []uint32, offset uint32, shiftT shiftType) string { + var destId uint32 + + mapOpt := "" + for _, srcId := range idList { + if shiftT == shiftUp { + destId = srcId + offset + } else { + destId = srcId - offset + } + mapOpt += fmt.Sprintf("%d:%d,", srcId, destId) + } + + if mapOpt != "" { + mapOpt = strings.TrimSuffix(mapOpt, ",") + } + + return mapOpt +} + +func dirIsEmpty(name string) (bool, error) { + f, err := os.Open(name) + if err != nil { + return false, err + } + defer f.Close() + + _, err = f.Readdirnames(1) + if err == io.EOF { + return true, nil + } + + return false, err +} + +func isRootfsOnOverlayfs(rootfs string) (bool, string, error) { + + fsName, err := utils.GetFsName(rootfs) + if err != nil { + return false, "", err + } + + if fsName != "overlayfs" { + return false, "", nil + } + + mounts, err := mount.GetMountsPid(uint32(os.Getpid())) + if err != nil { + return false, "", err + } + + // If the rootfs is not a mountpoint, return false. 
+ mi, err := mount.GetMountAt(rootfs, mounts) + if err != nil { + return false, "", nil + } + + ovfsMntOpts := overlayUtils.GetMountOpt(mi) + ovfsUpperLayer := overlayUtils.GetUpperLayer(ovfsMntOpts) + + return true, ovfsUpperLayer, nil +} diff --git a/sysbox-mgr/volMgr/volMgr_test.go b/sysbox-mgr/volMgr/volMgr_test.go new file mode 100644 index 00000000..dad573ea --- /dev/null +++ b/sysbox-mgr/volMgr/volMgr_test.go @@ -0,0 +1,542 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package volMgr + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "syscall" + "testing" + + utils "github.com/nestybox/sysbox-libs/utils" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" +) + +type testFile struct { + name string + uid uint32 + gid uint32 +} + +func init() { + // turn off info & debug logging for unit tests + logrus.SetLevel(logrus.ErrorLevel) +} + +func setupTest() (string, string, error) { + hostDir, err := ioutil.TempDir("", "volMgrTest-host") + if err != nil { + return "", "", err + } + + rootfs, err := ioutil.TempDir("", "volMgrTest-rootfs") + if err != nil { + return "", "", err + } + + return hostDir, rootfs, nil +} + +func cleanupTest(hostDir, rootfs string) { + os.RemoveAll(hostDir) + os.RemoveAll(rootfs) +} + +func populateDir(base string, uid, gid uint32, files []testFile) error { + data := []byte("some data") + + // create the files in the directory + for _, file := range files { + + dir := filepath.Dir(file.name) + path := filepath.Join(base, dir) + if err := os.MkdirAll(path, 0700); err != nil { + return fmt.Errorf("failed to create dir %v: %v", path, err) + } + + path = filepath.Join(base, file.name) + + if err := ioutil.WriteFile(path, data, 0700); err != nil { + return fmt.Errorf("failed to create file %v: %v", path, err) + } + } + + // chown the files + err := filepath.Walk(base, func(path string, fi os.FileInfo, err error) error { + if err == nil { + + // chown all dirs & files to the given uid & gid by default + if err := os.Chown(path, int(uid), int(gid)); err != nil { + return fmt.Errorf("chown on %s failed: %s", path, err) + } + + // for the given files, chown to the file-specific uid & gid + for _, file := range files { + if strings.Contains(path, file.name) { + if err := os.Chown(path, int(file.uid), int(file.gid)); err != nil { + return fmt.Errorf("chown on %s failed: %s", path, err) + } + } + } + } + return err + }) + + if err != nil { + return 
fmt.Errorf("failed to chown files: %s", err) + } + + return nil +} + +func compareDirs(src, dest string) error { + var err error + + srcPaths := []string{} + err = filepath.Walk(src, func(path string, fi os.FileInfo, err error) error { + if err == nil { + path = path[len(src):] + srcPaths = append(srcPaths, path) + } + return err + }) + + if err != nil { + return fmt.Errorf("failed walking path %v: %v", src, err) + } + + destPaths := []string{} + err = filepath.Walk(dest, func(path string, fi os.FileInfo, err error) error { + if err == nil { + path = path[len(dest):] + destPaths = append(destPaths, path) + } + return err + }) + + if err != nil { + return fmt.Errorf("failed walking path %v: %v", dest, err) + } + + if !utils.StringSliceEqual(srcPaths, destPaths) { + return fmt.Errorf("mismatch between %v and %v", srcPaths, destPaths) + } + + return nil +} + +func testCreateVolWork(id, hostDir, rootfs, mountpoint string, uid, gid uint32, shiftUids bool) ([]specs.Mount, error) { + want := []specs.Mount{ + { + Source: filepath.Join(hostDir, id), + Destination: mountpoint, + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + } + + mgr, err := New("testVolMgr", hostDir, true) + if err != nil { + return nil, fmt.Errorf("New(%v) returned %v", hostDir, err) + } + + got, err := mgr.CreateVol(id, rootfs, mountpoint, uid, gid, shiftUids, 0700) + if err != nil { + return nil, fmt.Errorf("CreateVol() returned %v", err) + } + + // check that the volMgr volTable entry got created + vmgr := mgr.(*vmgr) + if _, found := vmgr.volTable[id]; !found { + return nil, fmt.Errorf("CreateVol() did not create entry in volTable") + } + + // check that CreateVol returned the expected mount + if !utils.MountSliceEqual(got, want) { + return nil, fmt.Errorf("CreateVol(%v, %v, %v, %v, %v, 0700) returned %v, want %v", id, rootfs, mountpoint, uid, gid, got, want) + } + + return got, nil +} + +func TestCreateVol(t *testing.T) { + hostDir, rootfs, err := setupTest() + if err != nil { + 
t.Errorf("failed to setup test: %v", err) + } + defer cleanupTest(hostDir, rootfs) + + id := "test-cont" + mountpoint := "/var/lib/kubelet" + uid := uint32(os.Geteuid()) + gid := uint32(os.Getegid()) + + // create the volume and verify all is good + if _, err := testCreateVolWork(id, hostDir, rootfs, mountpoint, uid, gid, false); err != nil { + t.Errorf(err.Error()) + } +} + +func TestDestroyVol(t *testing.T) { + + hostDir, rootfs, err := setupTest() + if err != nil { + t.Errorf("failed to setup test: %v", err) + } + defer cleanupTest(hostDir, rootfs) + + mgr, err := New("testVolMgr", hostDir, true) + if err != nil { + t.Errorf("New(%v) returned %v", hostDir, err) + } + + id := "test-cont" + mountpoint := "/var/lib/kubelet" + uid := uint32(os.Geteuid()) + gid := uint32(os.Getegid()) + + _, err = mgr.CreateVol(id, rootfs, mountpoint, uid, gid, false, 0700) + if err != nil { + t.Errorf("CreateVol() returned %v", err) + } + + // check that the volMgr volTable entry got created + vmgr := mgr.(*vmgr) + if _, found := vmgr.volTable[id]; !found { + t.Errorf("CreateVol() did not create entry in volTable") + } + + if err := mgr.DestroyVol(id); err != nil { + t.Errorf("DestroyVol(%v) returned %v", id, err) + } + + // check that the volMgr volTable entry got removed + if _, found := vmgr.volTable[id]; found { + t.Errorf("CreateVol() did not destroy entry in volTable") + } + + // Verify the volume was indeed destroyed + vol := filepath.Join(hostDir, id) + if _, err := os.Stat(vol); err != nil { + if !os.IsNotExist(err) { + t.Errorf("DestroyVol(%v) failed: %v", id, err) + } + } +} + +func testSyncInWork(t *testing.T, shiftUids bool) { + uid := uint32(os.Geteuid()) + gid := uint32(os.Getegid()) + + if uid != 0 && gid != 0 { + t.Skip("This test only runs as root") + } + + hostDir, rootfs, err := setupTest() + if err != nil { + t.Errorf("failed to setup test: %v", err) + } + defer cleanupTest(hostDir, rootfs) + + // create a fake container rootfs and populate its 
"/var/lib/kubelet" + id := "test-cont" + mountpoint := "/var/lib/kubelet" + uid = 231072 + gid = 231072 + + rootfsUidOffset := uint32(0) + rootfsGidOffset := uint32(0) + + if !shiftUids { + rootfsUidOffset = uid + rootfsGidOffset = gid + } + + files := []testFile{ + { + name: "testdir1/a/b/c/d/file0", + uid: rootfsUidOffset + 0, + gid: rootfsGidOffset + 0, + }, + { + name: "testdir1/a/file1", + uid: rootfsUidOffset + 1000, + gid: rootfsGidOffset + 1000, + }, + { + name: "testdir3/a/b/file2", + uid: rootfsUidOffset + 100, + gid: rootfsGidOffset + 100, + }, + } + + mountPath := filepath.Join(rootfs, mountpoint) + + if err := populateDir(mountPath, rootfsUidOffset, rootfsGidOffset, files); err != nil { + t.Errorf("failed to populate rootfs mountpoint: %v", err) + } + + // create the volume mgr; this triggers the sync-in automatically. + mgr, err := New("testVolMgr", hostDir, true) + if err != nil { + t.Errorf("New(%v) returned %v", hostDir, err) + } + + _, err = mgr.CreateVol(id, rootfs, mountpoint, uid, gid, shiftUids, 0700) + if err != nil { + t.Errorf("CreateVol() returned %v", err) + } + + // verify the sync-in worked + volPath := filepath.Join(hostDir, id) + + if err := compareDirs(volPath, mountPath); err != nil { + t.Errorf("directory comparison between %v and %v failed: %v", volPath, mountPath, err) + } + + // verify the sync-in shifted the file uid and gid correctly + err = filepath.Walk(volPath, func(path string, fi os.FileInfo, err error) error { + wantUid := uint32(uid) + wantGid := uint32(gid) + + for _, f := range files { + if filepath.Base(f.name) == filepath.Base(path) { + if shiftUids { + // sync-in shifts uids by adding the container's root uid to them + wantUid = uid + f.uid + wantGid = gid + f.gid + } else { + wantUid = f.uid + wantGid = f.gid + } + } + } + + if err == nil { + stat := fi.Sys().(*syscall.Stat_t) + if stat.Uid != wantUid || stat.Gid != wantGid { + return fmt.Errorf("uid:gid mismatch on volume path %v: want %v:%v, got %v:%v", + path, 
wantUid, wantGid, stat.Uid, stat.Gid) + } + } + return err + }) + + if err != nil { + t.Errorf("ownership check failed: %s", err) + } +} + +func TestSyncIn(t *testing.T) { + testSyncInWork(t, false) +} + +func TestSyncInUidShift(t *testing.T) { + testSyncInWork(t, true) +} + +func testSyncOutWork(t *testing.T, shiftUids bool) { + uid := uint32(os.Geteuid()) + gid := uint32(os.Getegid()) + + if uid != 0 && gid != 0 { + t.Skip("This test only runs as root") + } + + hostDir, rootfs, err := setupTest() + if err != nil { + t.Errorf("failed to setup test: %v", err) + } + defer cleanupTest(hostDir, rootfs) + + // create the volume mgr + mgr, err := New("testVolMgr", hostDir, true) + if err != nil { + t.Errorf("New(%v) returned %v", hostDir, err) + } + + id := "test-cont" + mountpoint := "/var/lib/kubelet" + uid = 231072 + gid = 231072 + + _, err = mgr.CreateVol(id, rootfs, mountpoint, uid, gid, shiftUids, 0700) + if err != nil { + t.Errorf("CreateVol() returned %v", err) + } + + // Add some files to the volume mgr + volPath := filepath.Join(hostDir, id) + + files := []testFile{ + { + name: "testdir1/a/b/c/d/file0", + uid: uid + 0, + gid: gid + 0, + }, + { + name: "testdir1/a/file1", + uid: uid + 1000, + gid: gid + 1000, + }, + { + name: "testdir3/a/b/file2", + uid: uid + 100, + gid: gid + 100, + }, + } + + if err := populateDir(volPath, uid, gid, files); err != nil { + t.Errorf("failed to populate vol at path %s: %s", volPath, err) + } + + // sync-out the vol to the rootfs; this will create the target dir automatically + if err := mgr.SyncOut(id); err != nil { + t.Errorf("sync-out failed: %s", err) + } + + // verify that the sync-out worked + mountPath := filepath.Join(rootfs, mountpoint) + + if err := compareDirs(volPath, mountPath); err != nil { + t.Errorf("directory comparison between %v and %v failed: %v", volPath, mountPath, err) + } + + err = filepath.Walk(mountPath, func(path string, fi os.FileInfo, err error) error { + var wantUid, wantGid uint32 + + if shiftUids 
{ + wantUid = 0 + wantGid = 0 + } else { + wantUid = uid + wantGid = gid + } + + for _, f := range files { + if filepath.Base(f.name) == filepath.Base(path) { + if shiftUids { + // sync-out shifts uids by subtracting the container's root uid from them + wantUid = f.uid - uid + wantGid = f.gid - gid + } else { + wantUid = f.uid + wantGid = f.gid + } + } + } + + if err == nil { + stat := fi.Sys().(*syscall.Stat_t) + if stat.Uid != wantUid || stat.Gid != wantGid { + return fmt.Errorf("uid:gid mismatch on volume path %v: want %v:%v, got %v:%v", + path, wantUid, wantGid, stat.Uid, stat.Gid) + } + } + return err + }) + + if err != nil { + t.Errorf("ownership check failed: %s", err) + } +} + +func TestSyncOut(t *testing.T) { + testSyncOutWork(t, false) +} + +func TestSyncOutUidShift(t *testing.T) { + testSyncOutWork(t, true) +} + +func TestSyncInSkip(t *testing.T) { + hostDir, rootfs, err := setupTest() + if err != nil { + t.Errorf("failed to setup test: %v", err) + } + defer cleanupTest(hostDir, rootfs) + + // create the volMgr + mgr, err := New("testVolMgr", hostDir, true) + if err != nil { + t.Errorf("New(%v) returned %v", hostDir, err) + } + + id := "test-cont" + mountpoint := "/var/lib/kubelet" + uid := uint32(231072) + gid := uint32(231072) + + _, err = mgr.CreateVol(id, rootfs, mountpoint, uid, gid, false, 0700) + if err != nil { + t.Errorf("CreateVol() returned %v", err) + } + + // since the moutpoint was not populated, verify the sync-in was skipped + volPath := filepath.Join(hostDir, id) + empty, err := dirIsEmpty(volPath) + if err != nil { + t.Errorf("dirIsEmpty(%s) failed: %s", volPath, err) + } + if !empty { + t.Errorf("%s is not empty as expected", volPath) + } + +} + +func TestSyncOutSkip(t *testing.T) { + + hostDir, rootfs, err := setupTest() + if err != nil { + t.Errorf("failed to setup test: %v", err) + } + defer cleanupTest(hostDir, rootfs) + + // create the volMgr + mgr, err := New("testVolMgr", hostDir, true) + if err != nil { + t.Errorf("New(%v) 
returned %v", hostDir, err) + } + + id := "test-cont" + mountpoint := "/var/lib/kubelet" + uid := uint32(231072) + gid := uint32(231072) + + _, err = mgr.CreateVol(id, rootfs, mountpoint, uid, gid, false, 0700) + if err != nil { + t.Errorf("CreateVol() returned %v", err) + } + + // this sync-out should be a "no-op" since the volume is empty + if err := mgr.SyncOut(id); err != nil { + t.Errorf("sync-out failed: %s", err) + } + + // verify sync-out was indeed a no-op + mountPath := filepath.Join(rootfs, mountpoint) + _, err = os.Stat(mountPath) + if err == nil { + t.Errorf("mountPath at %s was created erroneously", mountPath) + } else if !os.IsNotExist(err) { + t.Errorf("stat(%s) failed: %v", mountPath, err) + } +} diff --git a/sysbox-pkgr b/sysbox-pkgr deleted file mode 160000 index 0eeac3e1..00000000 --- a/sysbox-pkgr +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0eeac3e1464b9bb6d86ed6f0a3ad4ba5fed48fd6 diff --git a/sysbox-pkgr/.github/workflows/manual-workflow.yml b/sysbox-pkgr/.github/workflows/manual-workflow.yml new file mode 100644 index 00000000..771f4b3b --- /dev/null +++ b/sysbox-pkgr/.github/workflows/manual-workflow.yml @@ -0,0 +1,52 @@ +# +# GitHub Actions manual workflow. +# +# The main purpose of this workflow is to allow us (developers) to make +# quick changes to CI jobs without having to push them into the repository. +# This workflow file serves as a template that we can modify anytime that we +# need to experiment with new CI changes. The process to follow in those +# cases is fairly simple: +# +# 1. Create a topic-branch where to commit your changes (e.g. dev-branch). +# +# 2. Modify this file so that it looks as close as possible to the desired CI +# job that will be eventually merged to the repo. Let's refer to this workflow +# as a 'testing' workflow to differentiate it from the one in the 'master' +# branch. +# +# 3. Head over to the 'actions' portal within the repo we're working on, and +# click on the 'Manual Workflow". 
Then find the button "Run workflow" and +# click on it. Finally, choose the branch over which our 'testing' workflow +# was submitted. +# +# 4. Github will now run this latest 'testing' version of the workflow, and +# not the one being stored in the master branch. +# +# 5. Repeat above cycle as many times as required, and when done, place the +# workflow changes in a new file; do not make permanent changes to this +# file as this one is just expected to serve as a template to ease our +# live during CI definition & testing. +# + +name: Manual Workflow + +# Runs on-demand +on: workflow_dispatch + +defaults: + run: + shell: bash + +jobs: + sysbox_pkg_test: + runs-on: [self-hosted, Linux, X64, "${{ matrix.distro }}"] + strategy: + fail-fast: false + matrix: + distro: [ubuntu-focal] + steps: + - name: precheckout-cleanup + run: | + sudo rm -rf * + - name: checkout + uses: actions/checkout@v2 diff --git a/sysbox-pkgr/.github/workflows/sysbox-ce-pkg-ci.yml b/sysbox-pkgr/.github/workflows/sysbox-ce-pkg-ci.yml new file mode 100644 index 00000000..62571bce --- /dev/null +++ b/sysbox-pkgr/.github/workflows/sysbox-ce-pkg-ci.yml @@ -0,0 +1,43 @@ +# +# GitHub Actions nightly build and test of the sysbox CE package. 
+# + +name: sysbox-ce-pkg-ci + +# Runs daily at 5am PST (1pm UTC) +on: + schedule: + - cron: "0 13 * * *" + +defaults: + run: + shell: bash + +jobs: + sysbox_ce_pkg_test: + runs-on: [self-hosted, Linux, X64, '${{ matrix.distro }}'] + strategy: + fail-fast: false + max-parallel: 2 + matrix: + distro: [ubuntu-bionic, ubuntu-focal, debian-buster, debian-bullseye] + steps: + - name: precheckout-cleanup + run: | + sudo rm -rf * + - name: checkout + uses: actions/checkout@v2 + - name: prebuild-cleanup + run: | + docker stop -t0 $(docker ps -aq) || true + docker image rm $(docker image ls -aq) || true + - name: test-sysbox-ce-pkg + run: | + docker stop -t0 $(docker ps -aq) || true + make test-sysbox-ce-deb ${{ matrix.distro }} + - name: post-cleanup + working-directory: sources/sysbox + run: | + docker stop -t0 $(docker ps -aq) || true + sudo make test-cleanup + docker image rm $(docker image ls -aq) || true diff --git a/sysbox-pkgr/.github/workflows/sysbox-ee-pkg-ci.yml b/sysbox-pkgr/.github/workflows/sysbox-ee-pkg-ci.yml new file mode 100644 index 00000000..529063f4 --- /dev/null +++ b/sysbox-pkgr/.github/workflows/sysbox-ee-pkg-ci.yml @@ -0,0 +1,43 @@ +# +# GitHub Actions nightly build and test of the sysbox EE package. 
+# + +name: sysbox-ee-pkg-ci + +# Runs daily at 2am PST (10am UTC) +on: + schedule: + - cron: "0 10 * * *" + +defaults: + run: + shell: bash + +jobs: + sysbox_ee_pkg_test: + runs-on: [self-hosted, Linux, X64, '${{ matrix.distro }}'] + strategy: + fail-fast: false + max-parallel: 2 + matrix: + distro: [ubuntu-bionic, ubuntu-focal, debian-buster, debian-bullseye] + steps: + - name: precheckout-cleanup + run: | + sudo rm -rf * + - name: checkout + uses: actions/checkout@v2 + - name: prebuild-cleanup + run: | + docker stop -t0 $(docker ps -aq) || true + docker image rm $(docker image ls -aq) || true + - name: test-sysbox-ee-pkg + run: | + docker stop -t0 $(docker ps -aq) || true + make test-sysbox-ee-deb ${{ matrix.distro }} + - name: post-cleanup + working-directory: sources/sysbox-internal + run: | + docker stop -t0 $(docker ps -aq) || true + sudo make test-cleanup + docker image rm $(docker image ls -aq) || true diff --git a/sysbox-pkgr/.gitignore b/sysbox-pkgr/.gitignore new file mode 100644 index 00000000..d718020a --- /dev/null +++ b/sysbox-pkgr/.gitignore @@ -0,0 +1,4 @@ +deb/build +deb/sources +sources +sysbox \ No newline at end of file diff --git a/sysbox-pkgr/Makefile b/sysbox-pkgr/Makefile new file mode 100644 index 00000000..2df228f4 --- /dev/null +++ b/sysbox-pkgr/Makefile @@ -0,0 +1,129 @@ +# +# Sysbox Packager Makefile +# + +SHELL:=/bin/bash +ARCH=$(shell uname -m) + +.PHONY: help \ + sysbox-all \ + sysbox-ce \ + sysbox-deb \ + sysbox-rpm \ + sysbox-ce-deb \ + sysbox-ce-rpm \ + sysbox-ce-repo \ + test-sysbox-all \ + test-sysbox-ce \ + test-sysbox-deb \ + test-sysbox-deb \ + test-sysbox-rpm \ + test-sysbox-ce-deb \ + test-sysbox-ce-rpm \ + clean \ + clean-ce \ + clean-deb \ + clean-rpm \ + clean-ce-deb \ + clean-ce-rpm + +# CE git repository structures. 
+CE_SOURCES=sources/sysbox + +# Path to deb and rpm packages +DEB_PACKAGE_PATH=deb/debbuild +RPM_PACKAGE_PATH=rpd/rpmbuild + +# List of all the sysbox targets (build + tests) +SYSBOX_TARGETS := $(shell egrep '^.*sysbox.*: \#' Makefile | awk -F: '{print $$1}') + + +.DEFAULT := help + +help: ## Show build targets + @awk 'BEGIN {FS = ":.*##"; printf "\n\033[1mUsage:\n \ + make \033[36m\033[0m\n\n"} \ + /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-25s\033[0m %s\n", $$1, $$2 } /^##@/ \ + { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +##@ Building targets + +sysbox-all: ## Build all sysbox packages +sysbox-all: sysbox-ce + +sysbox-ce: ## Build sysbox-ce DEB and RPM packages +sysbox-ce: sysbox-ce-deb sysbox-ce-rpm + +sysbox-deb: ## Build all sysbox DEB packages +sysbox-deb: sysbox-ce-deb + +sysbox-rpm: ## Build all sysbox RPM packages +sysbox-rpm: sysbox-ce-rpm + +sysbox-ce-deb: ## Build sysbox-ce DEB package +sysbox-ce-deb: $(CE_SOURCES) clean-ce-deb + $(eval export EDITION=ce) + @$(MAKE) -C deb --no-print-directory $(filter-out $(SYSBOX_TARGETS)@,$(MAKECMDGOALS)) + +sysbox-ce-rpm: ## Build sysbox-ce RPM package +sysbox-ce-rpm: $(CE_SOURCES) clean-ce-rpm + $(eval export EDITION=ce) + @$(MAKE) -C rpm --no-print-directory $(filter-out $(SYSBOX_TARGETS)@,$(MAKECMDGOALS)) + +sysbox-ce-repo: ## Set path to the sysbox-ce repo (remote github repo by default) +sysbox-ce-repo: + $(eval REPO_PATH=$(filter-out sysbox-ce-repo $@,$(MAKECMDGOALS))) + @printf "\n*** Setting sysbox-ce repository path to $(REPO_PATH) ***\n\n" + @mkdir -p sources + @ln -sf $(REPO_PATH) $(CE_SOURCES) + +sources/sysbox: + @printf "\n*** Cloning sysbox-ce superproject repository to $(CE_SOURCES) ***\n\n" + @git clone --recursive https://github.com/nestybox/sysbox.git sources/sysbox + +##@ Testing targets + +test-sysbox-all: ## Test all sysbox packages +test-sysbox-all: test-sysbox-ce + +test-sysbox-ce: ## Test sysbox-ce DEB and RPM packages +test-sysbox-ce: tesst-sysbox-ce-deb 
test-sysbox-ce-rpm + +test-sysbox-deb: ## Test all sysbox DEB packages +test-sysbox-deb: test-sysbox-ce-deb + +test-sysbox-rpm: ## Test all sysbox RPM packages +test-sysbox-rpm: test-sysbox-ce-rpm + +test-sysbox-ce-deb: ## Test sysbox-ce DEB package +test-sysbox-ce-deb: sysbox-ce-deb + @cp $(DEB_PACKAGE_PATH)/$(lastword $@,$(MAKECMDGOALS))/sysbox-ce*.deb $(CE_SOURCES) + @$(MAKE) -C $(CE_SOURCES) --no-print-directory test-sysbox-installer PACKAGE_FILE_PATH="." + @$(MAKE) -C $(CE_SOURCES) --no-print-directory test-sysbox-shiftuid-installer PACKAGE_FILE_PATH="." + +test-sysbox-ce-rpm: ## Test sysbox-ce RPM package +test-sysbox-ce-rpm: sysbox-ce-rpm + @cp $(RPM_PACKAGE_PATH)/$(lastword $@,$(MAKECMDGOALS))/sysbox-ce*.deb $(CE_SOURCES) + @$(MAKE) -C $(CE_SOURCES) --no-print-directory test-sysbox-installer PACKAGE_FILE_PATH="." + +##@ Cleaning targets + +clean: ## Remove build artifacts +clean: clean-ce + +clean-ce: ## Remove sysbox-ce DEB and RPM packages +clean-ce: clean-ce-deb clean-ce-rpm + +clean-deb: ## Remove sysbox DEB packages +clean-deb: clean-ce-deb + +clean-rpm: ## Remove sysbox RPM packages +clean-rpm: clean-ce-rpm + +clean-ce-deb: ## Remove sysbox-ce DEB package + $(eval export EDITION=ce) + $(MAKE) -C deb --no-print-directory clean + +clean-ce-rpm: ## Remove sysbox-ce RPM package + $(eval export EDITION=ce) + $(MAKE) -C rpm --no-print-directory clean diff --git a/sysbox-pkgr/deb/Makefile b/sysbox-pkgr/deb/Makefile new file mode 100644 index 00000000..68758706 --- /dev/null +++ b/sysbox-pkgr/deb/Makefile @@ -0,0 +1,200 @@ +# +# Sysbox DEB Packager Makefile +# + +.PHONY: help \ + deb \ + debian \ + debian-buster \ + debian-bullseye \ + ubuntu \ + ubuntu-bionic \ + ubuntu-focal \ + ubuntu-jammy \ + clean + +SHELL:=/bin/bash + +# Obtain the current system architecture. 
+UNAME_M := $(shell uname -m)
+ifeq ($(UNAME_M),x86_64)
+    ARCH := amd64
+else ifeq ($(UNAME_M),aarch64)
+    ARCH := arm64
+else ifeq ($(UNAME_M),arm64)
+    ARCH := arm64
+else ifeq ($(UNAME_M),arm)
+    ARCH := armhf
+else ifeq ($(UNAME_M),armel)
+    ARCH := armel
+endif
+
+# Sysbox's default baseline image to build release binaries. Notice
+# that this is currently a requirement to build official Sysbox
+# packages as only Impish (or later) carries ID-mapped mount support
+# in the kernel and associated clibs.
+RELEASE_BASELINE_IMAGE=ubuntu-jammy
+
+# Go version to utilize in building process.
+GO_BASE_IMAGE=golang
+GO_VERSION:=1.22
+GO_IMAGE=$(GO_BASE_IMAGE):$(GO_VERSION)
+
+# Sysbox source-code locations. Fail early (at parse time, with a clear
+# message) if the caller did not export a supported EDITION value.
+ifeq ($(EDITION),ce)
+    SYSBOX_DIR := $(CURDIR)/../sources/sysbox
+else ifeq ($(EDITION),ee)
+    SYSBOX_DIR := $(CURDIR)/../sources/sysbox-internal
+else
+    $(error Unsupported Sysbox edition: $(EDITION))
+endif
+
+# Sysbox component locations.
+SYSBOX_IMAGE_SYSTEMD := ../systemd
+
+SOURCE_FILES = sysbox.tgz \
+	sysbox.service \
+	sysbox-fs.service \
+	sysbox-mgr.service \
+	99-sysbox-sysctl.conf \
+	50-sysbox-mod.conf
+
+SOURCES=$(addprefix sources/, $(SOURCE_FILES))
+
+CHOWN:=docker run --rm -v $(CURDIR):/v -w /v alpine chown
+
+DOCKER_BUILD=docker build \
+	--build-arg GO_IMAGE=$(GO_IMAGE) \
+	--build-arg arch=$(ARCH) \
+	--build-arg DEB_FILES=sysbox-${EDITION} \
+	-t build-$@/$(ARCH) \
+	-f $(CURDIR)/$@/Dockerfile .
+
+DOCKER_RUN=docker run --privileged --rm -i \
+	-e EDITION \
+	-v $(CURDIR)/build/$(ARCH)/$@:/build \
+	-v $(GOPATH)/pkg/mod:/go/pkg/mod \
+	-v $(HOME)/.gitconfig:/root/.gitconfig:ro \
+	-v /lib/modules/$(KERNEL_REL):/lib/modules/$(KERNEL_REL):ro \
+	-v /usr/src/$(HEADERS):/usr/src/$(HEADERS):ro \
+	-v /usr/src/$(HEADERS_BASE):/usr/src/$(HEADERS_BASE):ro \
+	build-$@/$(ARCH)
+
+DOCKER_BUILD_RELEASE=docker build \
+	--build-arg GO_IMAGE=$(GO_IMAGE) \
+	--build-arg arch=$(ARCH) \
+	--build-arg DEB_FILES=sysbox-${EDITION} \
+	-t build-$(RELEASE_BASELINE_IMAGE)/$(ARCH) \
+	-f $(CURDIR)/$(RELEASE_BASELINE_IMAGE)/Dockerfile .
+
+DOCKER_RUN_RELEASE=docker run --privileged --rm -i \
+	-e EDITION \
+	-v $(CURDIR)/build/$(ARCH)/$(RELEASE_BASELINE_IMAGE):/build \
+	-v $(GOPATH)/pkg/mod:/go/pkg/mod \
+	-v $(HOME)/.gitconfig:/root/.gitconfig:ro \
+	-v /lib/modules/$(KERNEL_REL):/lib/modules/$(KERNEL_REL):ro \
+	-v /usr/src/$(HEADERS):/usr/src/$(HEADERS):ro \
+	-v /usr/src/$(HEADERS_BASE):/usr/src/$(HEADERS_BASE):ro \
+	build-$(RELEASE_BASELINE_IMAGE)/$(ARCH)
+
+.DEFAULT := help
+
+help:
+	@awk 'BEGIN {FS = ":.*##"; printf "\n\033[1mUsage:\n \
+	make \033[36m\033[0m\n"} \
+	/^[a-zA-Z_-]+:.*?##/ { printf "  \033[36m%-25s\033[0m %s\n", $$1, $$2 } /^##@/ \
+	{ printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST)
+
+##@ DEB package targets
+
+all: ## Build all DEB packages
+all: debian ubuntu
+
+debian: ## Build Debian packages
+debian: debian-buster debian-bullseye
+
+debian-buster: ## Build Debian Buster package
+debian-buster: $(SOURCES)
+	@echo "== Building packages for $@ =="
+	$(DOCKER_BUILD)
+	$(DOCKER_RUN)
+	$(CHOWN) -R $(shell id -u):$(shell id -g) build/$(ARCH)/$@
+
+debian-bullseye: ## Build Debian Bullseye package
+debian-bullseye: $(SOURCES)
+	@echo "== Building packages for $@ =="
+	$(DOCKER_BUILD)
+	$(DOCKER_RUN)
+	$(CHOWN) -R $(shell id -u):$(shell id -g) build/$(ARCH)/$@
+
+ubuntu: ## Build Ubuntu packages
+ubuntu: ubuntu-bionic ubuntu-focal ubuntu-jammy
+
+ubuntu-bionic: ## Build Ubuntu Bionic package
+ubuntu-bionic: $(SOURCES)
+	@echo "== Building packages for $@ =="
+	$(DOCKER_BUILD)
+	$(DOCKER_RUN)
+	$(CHOWN) -R $(shell id -u):$(shell id -g) build/$(ARCH)/$@
+
+ubuntu-focal: ## Build Ubuntu Focal package
+ubuntu-focal: $(SOURCES)
+	@echo "== Building packages for $@ =="
+	$(DOCKER_BUILD)
+	$(DOCKER_RUN)
+	$(CHOWN) -R $(shell id -u):$(shell id -g) build/$(ARCH)/$@
+
+ubuntu-jammy: ## Build Ubuntu Jammy package
+ubuntu-jammy: $(SOURCES)
+	@echo "== Building packages for $@ =="
+	$(DOCKER_BUILD)
+	$(DOCKER_RUN)
+	$(CHOWN) -R $(shell id -u):$(shell id -g) build/$(ARCH)/$@
+
+generic: ## Build generic deb package (release purposes)
+generic: $(SOURCES)
+	@echo "== Building generic release package"
+	$(DOCKER_BUILD_RELEASE)
+	$(DOCKER_RUN_RELEASE)
+	$(CHOWN) -R $(shell id -u):$(shell id -g) build/$(ARCH)/$(RELEASE_BASELINE_IMAGE)
+
+clean: ## Remove all DEB packages
+ifeq ($(EDITION),ce)
+	@[ ! -d build ] || $(CHOWN) -R $(shell id -u):$(shell id -g) build
+	@$(RM) -r build/$(ARCH)/*/sysbox-ce*
+else ifeq ($(EDITION),ee)
+	@[ ! -d build ] || $(CHOWN) -R $(shell id -u):$(shell id -g) build
+	@$(RM) -r build/$(ARCH)/*/sysbox-ee*
+else
+	@[ ! -d build ] || $(CHOWN) -R $(shell id -u):$(shell id -g) build
+	@$(RM) -r build/$(ARCH)/*
+endif
+	@[ ! -d sources ] || $(CHOWN) -R $(shell id -u):$(shell id -g) sources
+	@$(RM) -r sources
+
+sources/sysbox.tgz:
+	mkdir -p $(@D)
+	docker run --rm -i -w /v \
+		-v $(SYSBOX_DIR):/sysbox \
+		-v $(CURDIR)/$(@D):/v \
+		alpine \
+		tar -C / -czf /v/sysbox.tgz --exclude='sysbox-pkgr' sysbox
+
+sources/sysbox.service: ../systemd/sysbox.service
+	mkdir -p $(@D)
+	cp $< $@
+
+sources/sysbox-fs.service: ../systemd/sysbox-fs.service
+	mkdir -p $(@D)
+	cp $< $@
+
+sources/sysbox-mgr.service: ../systemd/sysbox-mgr.service
+	mkdir -p $(@D)
+	cp $< $@
+
+sources/99-sysbox-sysctl.conf: ../systemd/99-sysbox-sysctl.conf
+	mkdir -p $(@D)
+	cp $< $@
+
+sources/50-sysbox-mod.conf: ../systemd/50-sysbox-mod.conf
+	mkdir -p $(@D)
+	cp $< $@
diff --git a/sysbox-pkgr/deb/build-deb b/sysbox-pkgr/deb/build-deb
new file mode 100755
index 00000000..6bd10607
--- /dev/null
+++ b/sysbox-pkgr/deb/build-deb
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+#
+# Copyright: (C) 2019-2021 Nestybox Inc. All rights reserved.
+#
+
+set -x
+set -e
+
+# Untar Sysbox sources.
+mkdir -p /root/build-deb/sysbox
+tar -C /root/build-deb -xzf /sources/sysbox.tgz
+chown -R $(id -u):$(id -g) .
+
+# Obtain sysbox version out of the source tree.
+VERSION=$(cat /root/build-deb/sysbox/VERSION)
+
+###############################################################################
+#
+# Create sysbox's debian package. Note that the generated package will
+# include all the traditional debian artifacts (tar, dsc, deb, etc), thereby,
+# we should ensure that only the *.deb file (binaries) is externally shared.
+#
+###############################################################################
+
+# Generate debian's changelog file.
+/root/build-deb/changelog_convert.sh
+if [[ ! $? -eq 0 ]]; then
+	exit 1
+fi
+
+# Initialize dockerd to allow sysbox's containerized compilation as part
+# of dpkg-build.
+dockerd > /var/log/dockerd.log 2>&1 &
+sleep 3
+
+# Build the package and copy artifacts to the expected location.
+dpkg-buildpackage -uc -us -I.git +mkdir -p /build +mv -v /root/sysbox* /build diff --git a/sysbox-pkgr/deb/changelog_convert.sh b/sysbox-pkgr/deb/changelog_convert.sh new file mode 100755 index 00000000..3c0d4fe5 --- /dev/null +++ b/sysbox-pkgr/deb/changelog_convert.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2022 Nestybox Inc. All rights reserved. +# +# Description: Script converts a user-defined changelog file into a +# debian-friendly counterpart. +# +# Required input: +# +# User-defined changelog file must necessarily utilize the following layout: +# +# $ cat CHANGELOG.md +# ... +# ## [0.0.2-dev] - unreleased +# ### Added +# * Generate external documentation: README, user-guide, design-guide, etc. +# * Extend Sysbox support to Ubuntu-Bionic. +# +# ## [0.0.1] - 2019-06-23 +# ### Added +# * Initial public release. +# ... +# +# Expected output: +# +# $ cat sysbox-pkgr/deb/common/sysbox-ce/changelog +# ... +# sysbox-ce (0.0.2-0) unstable; urgency=medium +# +# * Generate external documentation: README, user-guide, design-guide, etc. +# * Extend Sysbox support to Ubuntu-Bionic. +# +# -- Rodny Molina Tue, 20 Aug 2019 16:21:10 -0700 +# +# sysbox-ce (0.0.1-0) unstable; urgency=medium +# +# * Initial public release. +# +# -- Rodny Molina Tue, 23 Jul 2019 17:37:44 -0400 +# ... +# +# Note 1: The CHANGELOG.md file will be parsed attending to the two following +# reg-expresions. Anything that doesn't match this pattern will be ignored. +# +# - "^## " Example: "## [0.0.1] - 2019-06-23 +# - "^ * " Example: " * Extend Sysbox support to Ubuntu-Bionic." +# +# Note 2: As per Debian's policy guidelines, the "unreleased" label must be +# utilized to tag packages that have not been properly released yet. We must keep +# this in mind when generating 'private' images to be provided to third parties. +# In these cases, the "version" tag will be derived from the sysbox/VERSION file; +# for all othe entries (i.e. 
"released" entries), the version field will be +# extracted from the changelog file itself. + +# Input file to be created/edited by whoever creates a new Sysbox release. +changelog_file="sysbox/CHANGELOG.md" + +# Version file to be taking into account to set the latest (top-most) changelog +# entry. +version_file="sysbox/VERSION" + +# Output file to be generated by this script, and to be included in Sysbox's +# debian-package installer. +debian_changelog="debian/changelog" + +# Base container image used to build Sysbox binaries. +BASE_DISTRO=${BASE_IMAGE%:*} +BASE_DISTRO_RELEASE=${BASE_IMAGE#*:} + +# Redirect all generated output. +exec >${debian_changelog} + +function print_tag_header() { + + local tag=$1 + local unreleased=$2 + + if [[ "$SYSBOX_RELEASE" = "true" ]]; then + if [[ $unreleased = true ]]; then + echo -e "sysbox-${EDITION} (${tag}.linux) UNRELEASED; urgency=medium\n" + else + echo -e "sysbox-${EDITION} (${tag}.linux) unstable; urgency=medium\n" + fi + else + if [[ $unreleased = true ]]; then + echo -e "sysbox-${EDITION} (${tag}.${BASE_DISTRO}-${BASE_DISTRO_RELEASE}) UNRELEASED; urgency=medium\n" + else + echo -e "sysbox-${EDITION} (${tag}.${BASE_DISTRO}-${BASE_DISTRO_RELEASE}) unstable; urgency=medium\n" + fi + fi +} + +function print_tag_trailer() { + + local tag=$1 + local unreleased=$2 + + local tag_author="" + local tag_email="" + local tag_date="" + + if [[ "$unreleased" = true ]]; then + tag_author=$(git config user.name) + tag_email=$(git config user.email) + tag_date=$(date --rfc-2822) + else + tag_author=$(git -C sysbox log -1 --format=%aN v$1) + tag_email=$(git -C sysbox log -1 --format=%ae v$1) + tag_date=$(git -C sysbox log -1 --format=%aD v$tag) + fi + + echo -e "\n -- ${tag_author} <${tag_email}> ${tag_date}\n" +} + +function main() { + local currTag="" + local prevTag="" + local unreleased="" + local prevUnreleased="" + + # Ensure that a version file is available. + if [[ ! 
-f ${version_file} ]]; then + echo "Sysbox VERSION file not found. Exiting..." + exit 1 + fi + + # Ensure that a changelog file is available. + if [[ ! -f ${changelog_file} ]]; then + echo "Sysbox CHANGELOG.md file not found. Exiting..." + exit 1 + fi + local versionTag=$(cat ${version_file}) + + # Iterate though CHANGELOG.md file to extract relevant information. + while IFS= read -r line; do + if echo ${line} | egrep -q "^## "; then + + local currTag=$(echo ${line} | cut -d"[" -f2 | cut -d"]" -f1) + + # If an 'unreleased' entry is found (usually the first / top-most + # line in changelog file), then we will honor the tag present in the + # 'version' file. For all other entries we will exclusively rely on + # tags present in the changelog file. + if echo ${line} | egrep -q "unreleased"; then + unreleased=true + currTag=${versionTag} + else + unreleased=false + fi + + if [[ ${currTag} != ${prevTag} ]] && [[ ${prevTag} != "" ]]; then + print_tag_trailer ${prevTag} ${prevUnreleased} + fi + + print_tag_header ${currTag} ${unreleased} + + prevTag=${currTag} + prevUnreleased=${unreleased} + + elif echo "${line}" | egrep -q "^ * "; then + echo -e "${line}" + fi + + done < ${changelog_file} + + print_tag_trailer ${currTag} ${unreleased} +} + +main diff --git a/sysbox-pkgr/deb/debian-bullseye/Dockerfile b/sysbox-pkgr/deb/debian-bullseye/Dockerfile new file mode 100644 index 00000000..85e9a7bc --- /dev/null +++ b/sysbox-pkgr/deb/debian-bullseye/Dockerfile @@ -0,0 +1,93 @@ +ARG GO_IMAGE +ARG BASE_IMAGE=debian:bullseye +ARG DEBIAN_FRONTEND=noninteractive + +FROM ${GO_IMAGE} as golang + +FROM ${BASE_IMAGE} + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + devscripts \ + equivs \ + git \ + wget \ + pkg-config \ + libnet-dev \ + libseccomp2 \ + libseccomp-dev \ + iproute2 \ + kmod \ + curl \ + unzip && \ + \ + # Housekeeping + 
apt-get clean -y && \ + rm -rf \ + /var/cache/debconf/* \ + /var/lib/apt/lists/* \ + /var/log/* \ + /tmp/* \ + /var/tmp/* \ + /usr/share/doc/* \ + /usr/share/man/* \ + /usr/share/local/* + +ARG arch +ENV ARCH=${arch} +ENV GOPATH /go +ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin:/root/.local/bin + +ARG DEB_FILES +COPY ${DEB_FILES} /root/build-deb/debian +RUN mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" -i /root/build-deb/debian/control + +ENV BASE_IMAGE=${BASE_IMAGE} +ENV SYSBOX_RELEASE true + +COPY --from=golang /usr/local/go /usr/local/go + +# Let's explicitly set go-module feature to 'auto' mode (default as per Go 1.13) to avoid +# potential changes to this feature's default mode in the future. Even though we are +# relying on modules for the package's building process, we are enabling 'auto' mode to +# allow 'go get' traditional behavior (fetch entire git repo). Notice that we need git's +# metadata to allow a git-checkout operation further below. +ENV GO111MODULE=auto +RUN go env -w GONOSUMDB=github.com/nestybox + +# Install protoc compiler for gRPC. 
+RUN if [ "${arch}" = "amd64" ]; then arch_str="x86_64"; \ + elif [ "${arch}" = "arm64" ]; then arch_str="aarch_64"; \ + else echo "Unsupported platform: ${arch}"; exit; fi \ + && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-${arch_str}.zip \ + && unzip protoc-3.15.8-linux-${arch_str}.zip -d $HOME/.local \ + && export PATH="$PATH:$HOME/.local/bin" \ + && go install github.com/golang/protobuf/protoc-gen-go@latest \ + && export PATH="$PATH:$(go env GOPATH)/bin" + +# Install Docker +RUN curl -fsSL https://get.docker.com -o get-docker.sh \ + && sh get-docker.sh +ADD https://raw.githubusercontent.com/docker/docker-ce/master/components/cli/contrib/completion/bash/docker /etc/bash_completion.d/docker.sh + +# Use the old definition for SECCOMP_NOTIF_ID_VALID in /usr/include/linux/seccomp.h +# +# This is needed because the definition changed in the mainline kernel +# on 06/2020 (from SECCOMP_IOR -> SECCOMP_IOW), and some distros we +# support have picked it up in their latest releases / kernels +# updates. The kernel change was backward compatible, so by using the +# old definition, we are guaranteed it will work on kernels before and +# after the change. On the other hand, if we were to use the new +# definition, seccomp notify would fail when sysbox runs in old +# kernels. 
+RUN sed -i 's/^#define SECCOMP_IOCTL_NOTIF_ID_VALID[ \t]*SECCOMP_IOW(2, __u64)/#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)/g' /usr/include/linux/seccomp.h + +WORKDIR /root/build-deb +COPY sources/ /sources +COPY build-deb /root/build-deb/build-deb +COPY changelog_convert.sh /root/build-deb/changelog_convert.sh + +ENTRYPOINT ["/root/build-deb/build-deb"] diff --git a/sysbox-pkgr/deb/debian-buster/Dockerfile b/sysbox-pkgr/deb/debian-buster/Dockerfile new file mode 100644 index 00000000..872e7f70 --- /dev/null +++ b/sysbox-pkgr/deb/debian-buster/Dockerfile @@ -0,0 +1,93 @@ +ARG GO_IMAGE +ARG BASE_IMAGE=debian:buster +ARG DEBIAN_FRONTEND=noninteractive + +FROM ${GO_IMAGE} as golang + +FROM ${BASE_IMAGE} + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + devscripts \ + equivs \ + git \ + wget \ + pkg-config \ + libnet-dev \ + libseccomp2 \ + libseccomp-dev \ + iproute2 \ + kmod \ + curl \ + unzip && \ + \ + # Housekeeping + apt-get clean -y && \ + rm -rf \ + /var/cache/debconf/* \ + /var/lib/apt/lists/* \ + /var/log/* \ + /tmp/* \ + /var/tmp/* \ + /usr/share/doc/* \ + /usr/share/man/* \ + /usr/share/local/* + +ARG arch +ENV ARCH=${arch} +ENV GOPATH /go +ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin:/root/.local/bin + +ARG DEB_FILES +COPY ${DEB_FILES} /root/build-deb/debian +RUN mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" -i /root/build-deb/debian/control + +ENV BASE_IMAGE=${BASE_IMAGE} +ENV SYSBOX_RELEASE true + +COPY --from=golang /usr/local/go /usr/local/go + +# Let's explicitly set go-module feature to 'auto' mode (default as per Go 1.13) to avoid +# potential changes to this feature's default mode in the future. 
Even though we are +# relying on modules for the package's building process, we are enabling 'auto' mode to +# allow 'go get' traditional behavior (fetch entire git repo). Notice that we need git's +# metadata to allow a git-checkout operation further below. +ENV GO111MODULE=auto +RUN go env -w GONOSUMDB=github.com/nestybox + +# Install protoc compiler for gRPC. +RUN if [ "${arch}" = "amd64" ]; then arch_str="x86_64"; \ + elif [ "${arch}" = "arm64" ]; then arch_str="aarch_64"; \ + else echo "Unsupported platform: ${arch}"; exit; fi \ + && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-${arch_str}.zip \ + && unzip protoc-3.15.8-linux-${arch_str}.zip -d $HOME/.local \ + && export PATH="$PATH:$HOME/.local/bin" \ + && go install github.com/golang/protobuf/protoc-gen-go@latest \ + && export PATH="$PATH:$(go env GOPATH)/bin" + +# Install Docker +RUN curl -fsSL https://get.docker.com -o get-docker.sh \ + && sh get-docker.sh +ADD https://raw.githubusercontent.com/docker/docker-ce/master/components/cli/contrib/completion/bash/docker /etc/bash_completion.d/docker.sh + +# Use the old definition for SECCOMP_NOTIF_ID_VALID in /usr/include/linux/seccomp.h +# +# This is needed because the definition changed in the mainline kernel +# on 06/2020 (from SECCOMP_IOR -> SECCOMP_IOW), and some distros we +# support have picked it up in their latest releases / kernels +# updates. The kernel change was backward compatible, so by using the +# old definition, we are guaranteed it will work on kernels before and +# after the change. On the other hand, if we were to use the new +# definition, seccomp notify would fail when sysbox runs in old +# kernels. 
+RUN sed -i 's/^#define SECCOMP_IOCTL_NOTIF_ID_VALID[ \t]*SECCOMP_IOW(2, __u64)/#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)/g' /usr/include/linux/seccomp.h + +WORKDIR /root/build-deb +COPY sources/ /sources +COPY build-deb /root/build-deb/build-deb +COPY changelog_convert.sh /root/build-deb/changelog_convert.sh + +ENTRYPOINT ["/root/build-deb/build-deb"] diff --git a/sysbox-pkgr/deb/sysbox-ce/compat b/sysbox-pkgr/deb/sysbox-ce/compat new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/compat @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/sysbox-pkgr/deb/sysbox-ce/control b/sysbox-pkgr/deb/sysbox-ce/control new file mode 100644 index 00000000..8cc57663 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/control @@ -0,0 +1,25 @@ +Source: sysbox-ce +Section: admin +Priority: optional +Maintainer: Nestybox +Build-Depends: debhelper (>= 10~), + bash, +Standards-Version: 4.1.4 +Homepage: nestybox.com +Vcs-Git: git://github.com/nestybox/sysbox.git + +Package: sysbox-ce +Architecture: linux-any +Pre-Depends: jq +Depends: ${shlibs:Depends}, + ${misc:Depends}, + debconf, + jq, + fuse, + lsb-release, + rsync, + iptables +Description: Sysbox Community Edition (CE) is a next-generation container runtime, + developed by Nestybox, that enables deployment of containers that are capable of + running not just micro-services, but also system software such as Docker, Kubernetes, + Systemd, etc., inside the container, easily and securely. \ No newline at end of file diff --git a/sysbox-pkgr/deb/sysbox-ce/copyright b/sysbox-pkgr/deb/sysbox-ce/copyright new file mode 100644 index 00000000..b4d8d5da --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/copyright @@ -0,0 +1,12 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: Nestybox Sysbox +Upstream-Contact: Nestybox Inc + +Disclaimer: + This package is not part of the GNU/Linux Debian distribution. 
+ No modifications of the included binaries, and no other uses or + further distributions of the included binaries, are permitted. + +Files: * +Copyright: 2019-2022 Nestybox Inc. All rights reserved. +License: Apache License, Version 2.0. diff --git a/sysbox-pkgr/deb/sysbox-ce/rules b/sysbox-pkgr/deb/sysbox-ce/rules new file mode 100755 index 00000000..8082d47e --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/rules @@ -0,0 +1,68 @@ +#!/usr/bin/make -f + +# Output every command that modifies files on the build system. Enable me for +# debugging purposes. +export DH_VERBOSE = 1 + +# Override buildeb step to enforce 'xz' as the compressor to rely on. Notice +# that starting with Ubuntu Impish (i.e., 21.10), all packages make use of +# 'zstd' as the default compressor, but this one is not supported in debian +# distro yet. +override_dh_builddeb: + dh_builddeb -- -Zxz + +override_dh_auto_build: + # Build sysbox's components. + cd sysbox && make sysbox-static-local + +override_dh_auto_test: + +# Override dwz to avoid issues in debian-buster with debhelper=12 compat. +override_dh_dwz: + +# ONESHELL attribute to ensure that all instruccions in this target are executed +# within a single shell process. +.ONESHELL: +SHELL=/bin/bash +override_dh_auto_install: + # Sysbox binaries will be installed through the regular (makefile) process. + install -D -m0755 sysbox/sysbox-fs/build/${ARCH}/sysbox-fs \ + debian/sysbox-ce/usr/bin/sysbox-fs + install -D -m0755 sysbox/sysbox-mgr/build/${ARCH}/sysbox-mgr \ + debian/sysbox-ce/usr/bin/sysbox-mgr + install -D -m0755 sysbox/sysbox-runc/build/${ARCH}/sysbox-runc \ + debian/sysbox-ce/usr/bin/sysbox-runc + + # Sysbox services installation. 
+ install -D -m 0644 /sources/sysbox.service \ + debian/sysbox-ce/lib/systemd/system/sysbox.service + install -D -m 0644 /sources/sysbox-fs.service \ + debian/sysbox-ce/lib/systemd/system/sysbox-fs.service + install -D -m 0644 /sources/sysbox-mgr.service \ + debian/sysbox-ce/lib/systemd/system/sysbox-mgr.service + + # Sysbox's sysctl.d config-file to hold the required procfs settings. + # Notice we're picking the largest two-digit integer to name this file + # to avoid collisions with other package's requirements -- we expect + # our requirements to be more generic, so we want ours to prevail. + install -D -m 0644 /sources/99-sysbox-sysctl.conf \ + debian/sysbox-ce/lib/sysctl.d/99-sysbox-sysctl.conf + +override_dh_installinit: + # Use "sysbox" as our service name, not "sysbox-ce". + dh_installinit --name=sysbox + +override_dh_install: + dh_install + +# Override dh_usrlocal to prevent error after placing sysbox binaries in +# /usr/local path. +override_dh_usrlocal: + +override_dh_installsystemd: + dh_installsystemd --name=sysbox + dh_installsystemd --name=sysbox-fs + dh_installsystemd --name=sysbox-mgr + +%: + dh $@ diff --git a/sysbox-pkgr/deb/sysbox-ce/source/format b/sysbox-pkgr/deb/sysbox-ce/source/format new file mode 100644 index 00000000..9f8e9b69 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/source/format @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.config b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.config new file mode 100644 index 00000000..0c0abb20 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.config @@ -0,0 +1,129 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2022 Nestybox Inc. All rights reserved. +# + +set -e + +# Source debconf library. +. /usr/share/debconf/confmodule + +# Dockerd default configuration dir/file. +dockerCfgDir="/etc/docker" +dockerCfgFile="${dockerCfgDir}/daemon.json" + +# UID-shifting module +shiftfs_module="shiftfs" + +# +# Determines if docker network configuration is complete. 
That is, 'bip' +# and 'default-address-pools' attributes are properly configured. +# +function docker_network_valid_config() { + + # Matching pattern: + # + # "bip": "172.20.0.1/16" + # "default-address-pools": [ + # + if [[ -f ${dockerCfgFile} ]] && + egrep -q "^[ ]+\"bip\": \"[0-9.]+.*\"" ${dockerCfgFile} && + egrep -q "^[ ]+\"default-address-pools\"" ${dockerCfgFile}; then + return 0 + fi + + return 1 +} + +# +# Finds out if there are existing docker containers in the system. +# +function docker_existing_containers() { + if docker ps -a | wc -l | egrep -q "1$"; then + return 1 + fi + + return 0 +} + +# +# Check presence/absence of shiftfs module. +# +function shiftfs_mount_supported() { + if modprobe "${shiftfs_module}" &>/dev/null; then + return 0 + fi + + return 1 +} + +# +# Check for idmapped-mount support. +# +function idmapped_mount_supported() { + local kernel_rel=$(uname -r) + local rel_major=$(echo ${kernel_rel} | cut -d'.' -f1) + local rel_minor=$(echo ${kernel_rel} | cut -d'.' -f2) + + if [ ${rel_major} -gt 5 ] || ([ ${rel_major} -eq 5 ] && [ ${rel_minor} -ge 12 ]); then + return 0 + fi + + return 1 +} + +# +# Function returns 'true' to indicate that dockerd needs to be restarted during +# Sysbox installation process. Returns 'false' otherwise. +# +function docker_restart_required() { + if docker_network_valid_config; then + return 1 + fi + + return 0 +} + +# +# Checks if the docker engine is installed on the host. +# +function docker_installed() { + ret=$(command -v dockerd >/dev/null 2>&1) + return $? +} + +# +# Checks if the docker engine is running on the host. +# +function docker_running() { + ret=$(pidof dockerd >/dev/null 2>&1) + return $? +} + +# +# Main +# + +# If a docker-restart is required in this setup, and there are existing docker +# containers, alert user of the need to stop containers and exit installation +# process. 
+if docker_running && docker_restart_required && docker_existing_containers; then + echo -e "\nThe Sysbox installer requires a docker service restart to configure" \ + "network parameters, but it cannot proceed due to existing Docker containers." \ + "Please remove them as indicated below and re-launch the installation process." \ + "Refer to Sysbox installation documentation for details.\n" \ + "\t\"docker rm \$(docker ps -a -q) -f\"\n" + exit 1 +fi + +# If neither 'idmapping' nor 'shiftfs' is supported, we must alert the user of +# its potential side effects. +if ! idmapped_mount_supported && ! shiftfs_mount_supported; then + echo -e "\nYour OS does not support 'idmapped' feature (kernel < 5.12), nor it" \ + " provides 'shiftfs' support. In consequence, applications within Sysbox" \ + " containers may be unable to access volume-mounts, which will show up as" \ + " owned by 'nobody:nogroup' inside the container. Refer to Sysbox" \ + " installation documentation for details.\n" +fi + +#DEBHELPER# diff --git a/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.install b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.install new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.install @@ -0,0 +1 @@ + diff --git a/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.manpages b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.manpages new file mode 100644 index 00000000..e69de29b diff --git a/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.postinst b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.postinst new file mode 100755 index 00000000..439d9bf0 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.postinst @@ -0,0 +1,378 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2021 Nestybox Inc. All rights reserved. +# + +set -e + +# Source debconf library. +. /usr/share/debconf/confmodule + +# Dockerd default configuration dir/file. +dockerCfgDir="/etc/docker" +dockerCfgFile="${dockerCfgDir}/daemon.json" + +# sysbox-fs' default mountpoint path. 
+sysboxfs_mountpoint="/var/lib/sysboxfs" + +# UID-shifting module +shiftfs_module="shiftfs" + +# Kernel's pool-size of inotify resources. +inotify_pool_size=1048576 + +# Default docker network parameters +bip_subnet="172.20.0.1/16" +pool_subnet="172.25.0.0/16" + +# Docker config vars. +docker_network_config_changed="false" +docker_runtime_config_changed="false" + +# Temp file for jq write operations. +tmpfile=$(mktemp /tmp/installer-scr.XXXXXX) +trap 'rm -f "${tmpfile}"' EXIT + +# Kernel's keyring max keys limit. Increase keyring max-keys for scaling +# purposes. +# +# In debian-based distros (e.g., Ubuntu) the kernel keyring max keys limit +# is set to 200 for non-root users. This is too low for Sysbox. For example, +# for a sys container based K8s cluster, the number of keys required is: +# +# keys = 35 + (k8s_workers * 23) + (2 * pods) +# +# That is, a 10-node cluster would need 282 keys. +# +# In a large bare-metal machine, we expect ~100 sys containers. That would +# consume ~11K keys. To be conservative, we set maxkeys to 20K. Note that since +# each key consumes ~70 bytes on average, the total mem consumption assuming all +# 20K keys are used is 20K * 70 = 1.4MB. +kernel_keys_maxkeys=20000 +kernel_keys_maxbytes=1400000 + +# Increase the kernel's max PID limit to it's max value since Sysbox containers +# are often used as VM-like environments and can consume hundreds or thousands +# of PIDs each. For Sysbox deployments on K8s, we limit each pod to 16K pids via +# the CRI-O config file. For Sysbox deployments in Docker hosts, use Docker's +# "--pids-limit" option to fix this limit. +kernel_pid_max=4194304 + + +# Creates sysbox-fs mountpoint. +function create_sysboxfs_mountpoint() { + + if [[ -d ${sysboxfs_mountpoint} ]]; then + return + fi + + mkdir -p ${sysboxfs_mountpoint} + if [[ ! -d ${sysboxfs_mountpoint} ]]; then + exit 1 + fi +} + +# Ensure WSL2 kernel detected. 
+function is_wsl() { + case "$(uname -r)" in + *microsoft* ) true ;; # WSL 2 + *Microsoft* ) true ;; # WSL 1 + * ) false;; + esac +} + +# Enables the utilization of unprivileged user-namespaces. +function enable_unprivileged_userns() { + if [ -f "/proc/sys/kernel/unprivileged_userns_clone" ]; then + local val=$(sysctl kernel.unprivileged_userns_clone) + if [[ "${val##* }" = 0 ]]; then + sysctl -w kernel.unprivileged_userns_clone=1 >/dev/null 2>&1 + fi + fi +} + +# Ensure kernel's 'inotify' resources meet sysbox requirements -- default values +# in most distros are too low for decent-size scenarios. +function define_inotify_resources() { + + local val=$(sysctl fs.inotify.max_queued_events) + if [[ "${val##* }" -lt ${inotify_pool_size} ]]; then + sysctl -w fs.inotify.max_queued_events=${inotify_pool_size} >/dev/null 2>&1 + fi + + local val=$(sysctl fs.inotify.max_user_watches) + if [[ "${val##* }" -lt ${inotify_pool_size} ]]; then + sysctl -w fs.inotify.max_user_watches=${inotify_pool_size} >/dev/null 2>&1 + fi + + local val=$(sysctl fs.inotify.max_user_instances) + if [[ "${val##* }" -lt ${inotify_pool_size} ]]; then + sysctl -w fs.inotify.max_user_instances=${inotify_pool_size} >/dev/null 2>&1 + fi +} + +# Ensure kernel's 'keyring' resources meet sysbox requirements -- +# default values in most distros are too low for decent-size +# scenarios. See definitions of keyring variables above for +# details. +function define_keyring_resources() { + + local val=$(sysctl kernel.keys.maxkeys) + if [[ "${val##* }" -lt ${kernel_keys_maxkeys} ]]; then + sysctl -w kernel.keys.maxkeys=${kernel_keys_maxkeys} >/dev/null 2>&1 + fi + + local val=$(sysctl kernel.keys.maxbytes) + if [[ "${val##* }" -lt ${kernel_keys_maxbytes} ]]; then + sysctl -w kernel.keys.maxbytes=${kernel_keys_maxbytes} >/dev/null 2>&1 + fi +} + +# Apply a decent pid_max size. 
+function define_pidmax_resources() { + + local val=$(sysctl kernel.pid_max) + if [[ "${val##* }" -lt ${kernel_pid_max} ]]; then + sysctl -w kernel.pid_max=${kernel_pid_max} >/dev/null 2>&1 + fi +} + +# Adds user/group in charge of running all sysbox components. +function add_sysbox_user() { + + if ! getent passwd | grep "^sysbox:" >/dev/null 2>&1; then + useradd -s /bin/false sysbox + fi +} + +# +# Add sysbox runtime to docker configuration. +# +function adjust_docker_config_runtime() { + + # If no 'runtimes' key-entry is present, proceed to add one. + if [ $(jq 'has("runtimes")' ${dockerCfgFile}) = "false" ]; then + jq --indent 4 '. + {"runtimes": {"sysbox-runc": {"path": "/usr/bin/sysbox-runc"}}}' \ + ${dockerCfgFile} >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_runtime_config_changed="true" + + # If no 'sysbox-runc' runtime entry is present, proceed to add it. + elif [ $(jq '.runtimes | has("sysbox-runc")' ${dockerCfgFile}) = "false" ]; then + jq --indent 4 '.runtimes |= . + {"sysbox-runc": {"path": "/usr/bin/sysbox-runc"}}' \ + ${dockerCfgFile} >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_runtime_config_changed="true" + + # If the runtime config is complete (i.e. both 'runtimes' and 'sysbox-runc' + # entries exist) but has the old sysbox-runc binary location, update the location. + elif grep -q "/usr/local/sbin/sysbox-runc" ${dockerCfgFile}; then + sed -i "s@/usr/local/sbin/sysbox-runc@/usr/bin/sysbox-runc@g" ${dockerCfgFile} + docker_runtime_config_changed="true" + fi + + # If the state has not been digested by docker yet, ensure that docker + # processes it before this installation concludes. + if [ ${docker_runtime_config_changed} = false ] && + command -v docker >/dev/null 2>&1 && + ! docker info 2>&1 | egrep -q "Runtimes:.*sysbox-runc"; then + + docker_runtime_config_changed="true" + fi +} + +# +# Returns 'true' if passed ipv4 address overlaps with any of the system local +# subnets. Return 'false' otherwise. 
+# +function system_local_subnet() { + + if ip route get ${1} | egrep -q "via $(ip route | awk '/default/ {print $3}')"; then + return 1 + fi + + return 0 +} + +# +# Adjust docker's network configuration to avoid subnet overlapping ranges. +# +function adjust_docker_config_network() { + + local bip_host=$(echo ${bip_subnet} | cut -d'/' -f 1) + local pool_host=$(echo ${pool_subnet} | cut -d'/' -f 1) + + # If no 'bip' key-entry is present, proceed to add one. + if [ $(jq 'has("bip")' ${dockerCfgFile}) = "false" ] || + [ $(jq '."bip"' ${dockerCfgFile}) = "\"\"" ]; then + + # If bip address to add overlaps with an existing local subnet, then + # dump a log message to user and skip bip's configuration. + if system_local_subnet ${bip_host} && + ! ip -4 address show dev docker0 | egrep -q "${bip_subnet}"; then + echo -e "\nDocker bridge-ip network to configure (${bip_subnet}) overlaps" \ + "with existing system subnet. Installation process will skip this docker" \ + "network setting. Please manually configure docker's 'bip' subnet to" \ + "avoid connectivity issues.\n" + else + jq --arg bip ${bip_subnet} --indent 4 '. + {"bip": $bip}' ${dockerCfgFile} \ + >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_network_config_changed="true" + fi + fi + + # If no 'default-address-pool' is found, do add one here. + if [ $(jq 'has("default-address-pools")' ${dockerCfgFile}) = "false" ] || + [ $(jq '."default-address-pools" | length' ${dockerCfgFile}) -eq "0" ]; then + + # If address-pool overlaps with an existing local subnet, then + # dump a log message to user and skip this attribute's configuration. + if system_local_subnet ${pool_host}; then + echo -e "\nDocker default-address-pool to configure (${pool_subnet}) overlaps" \ + "with existing system subnet. Installation process will skip this docker" \ + "network setting. 
Please manually configure docker's 'default-address-pool'" \ + "subnet to avoid connectivity issues.\n" + else + jq --arg subnet ${pool_subnet} --indent 4 \ + '."default-address-pools"[0] |= . + {"base": $subnet, "size": 24}' ${dockerCfgFile} \ + >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_network_config_changed="true" + fi + fi +} + +# +# Create/Modify dockerd configuration to meet Sysbox requirements. +# +function adjust_docker_config() { + + # If no dockerd default config-file exist, or if there's no content on it, + # create one with a bare json layout. + if [[ ! -f ${dockerCfgFile} ]] || [[ ! -s ${dockerCfgFile} ]]; then + mkdir -p ${dockerCfgDir} + touch ${dockerCfgFile} + echo -e "{\n}" >${dockerCfgFile} + fi + + adjust_docker_config_runtime + + adjust_docker_config_network + + if ! docker_running; then + return + fi + + # Restart docker if disruptive changes have been made. + if [[ ${docker_network_config_changed} = "true" ]]; then + + # If existing containers are found then skip docker-restart to avoid any + # service disruption. + if ! docker ps -a | wc -l | egrep -q "1$"; then + echo -e "\nDocker service was not restarted to avoid affecting existing" \ + "containers. Please remove them and restart Docker by doing:\n" \ + "\t\"docker rm \$(docker ps -a -q) -f &&" \ + "sudo systemctl restart docker\"\n" + else + systemctl restart docker + return + fi + fi + + # If non-disruptive changes have been made to docker config, then send it a + # sighup to have its config file getting re-parsed (no need to cold-boot). + if [ ${docker_runtime_config_changed} = true ]; then + kill -SIGHUP $(pidof dockerd) + fi +} + +# +# Checks if the docker engine is installed on the host +# +function docker_installed() { + ret=$(command -v dockerd >/dev/null 2>&1) + return $? +} + +# +# Checks if the docker engine is running on the host +# +function docker_running() { + ret=$(pidof dockerd >/dev/null 2>&1) + return $? 
+} + +# +# Verify if kernel-headers are properly installed and alert user otherwise. +# +function check_kernel_headers() { + + #if ! dpkg-query -W -f='${Status} ${Version}\n' linux-headers-$(uname -r) \ + if ! dpkg -s linux-headers-$(uname -r) 2>&1 | egrep -q "install ok installed"; then + echo -e "\nThe linux kernel headers package was not found. This may be" \ + "expected by user applications running within Sysbox containers." \ + "Please install it with this command:\n" \ + "\t\"sudo apt-get install -y linux-headers-\$(uname -r)\"\n" + fi +} + +function config_sysbox() { + + # Ensure sysbox-fs' default mountpoint is created in the file-system. + create_sysboxfs_mountpoint + + # Allows user-namespaces creation for unprivileged users. This change will + # persist through system reboots by relying on a sysctl.d config-file to be + # generated as part of this package's installation process. + if is_wsl; then + echo "WSL2 detected, enable_unprivileged_userns skipped." + else + enable_unprivileged_userns + fi + + # Ensure kernel's inotify resources can meet Sysbox's scaling requirements. + define_inotify_resources + + # Ensure kernel's keyring resources can meet Sysbox's scaling requirements. + define_keyring_resources + + # Ensure that kernel's pid_max values are large enough to meet Sysbox's scaling + # requirements. + define_pidmax_resources + + # Add 'sysbox' user to host a large pool of subordinate UIDs/GIDs to be + # shared across all system-containers. This user must match the one defined + # in the docker configuration file. + add_sysbox_user + + # Adjust dockerd configuration (if necessary) + if docker_installed; then + adjust_docker_config + fi + + # Check for kernel-headers. + if is_wsl; then + echo "WSL2 detected, check_kernel_headers skipped." + else + check_kernel_headers + fi +} + +case "$1" in +configure) + # Adjust system's configuration to satisfy Sysbox requirements. 
+ config_sysbox + + # Avoid calling exit() here to allow debhelper tools to add their + # auto-generated code further below. + ;; + +abort-*) ;; + +\ + *) ;; + +esac + +#DEBHELPER# diff --git a/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.postrm b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.postrm new file mode 100755 index 00000000..80b5b432 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.postrm @@ -0,0 +1,107 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2021 Nestybox Inc. All rights reserved. +# + +set -e + +# Source debconf library. +. /usr/share/debconf/confmodule + +# Dockerd default configuration dir/file. +dockerCfgDir="/etc/docker" +dockerCfgFile="${dockerCfgDir}/daemon.json" + +# Docker config vars. +docker_runtime_config_changed="false" + +# Temp file for jq write operations. +tmpfile=$(mktemp /tmp/installer-scr.XXXXXX) +trap 'rm -f "${tmpfile}"' EXIT + +# +# Note: As per Debian packaging policies, package elimination should not remove +# logfiles. Therefore we shouldn't delete the 'sysbox' user/group previously +# created by Sysbox's installation process, as it would leave files with +# dangling ownership. +# + +function adjust_docker_config_runtime() { + + # Eliminate sysbox's runtime entry if present. + if [ $(jq 'has("runtimes")' ${dockerCfgFile}) = "true" ] && + [ $(jq '.runtimes | has("sysbox-runc")' ${dockerCfgFile}) = "true" ]; then + + jq 'del(.runtimes."sysbox-runc")' \ + ${dockerCfgFile} >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + # If sysbox is a docker registered runtime, ensure that the uninstallation + # process updates dockerd. + if command -v docker >/dev/null 2>&1 && + docker info 2>&1 | egrep -q "Runtimes:.*sysbox-runc"; then + + docker_runtime_config_changed="true" + fi + fi +} + +# +# Modify dockerd configuration to eliminate Sysbox runtime. +# +function adjust_docker_config() { + + local docker_sighup_required=false + + # There is not much to do here if docker config file is not present. + if [ ! 
-f ${dockerCfgFile} ]; then + return + fi + + adjust_docker_config_runtime + + if ! docker_running; then + return + fi + + # Send docker a sighup to digest the absence of sysbox runtime. + # This should not impact current non-sysbox containers. + if [ ${docker_runtime_config_changed} = "true" ]; then + kill -SIGHUP $(pidof dockerd) + fi +} + +# +# Checks if the docker engine is installed on the host +# +function docker_installed() { + ret=$(command -v dockerd >/dev/null 2>&1) + return $? +} + +# +# Checks if the docker engine is running on the host +# +function docker_running() { + ret=$(pidof dockerd >/dev/null 2>&1) + return $? +} + +case "$1" in +purge) + # Adjust docker config to eliminate entries added by Sysbox's + # installation process. + adjust_docker_config + ;; + +remove | upgrade | failed-upgrade | abort-install | abort-upgrade | disappear) ;; + +\ + *) + echo "postrm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# + +exit 0 diff --git a/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.preinst b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.preinst new file mode 100755 index 00000000..658bf3d2 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ce/sysbox-ce.preinst @@ -0,0 +1,156 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2022 Nestybox Inc. All rights reserved. +# + +set -e + +# The following table enumerates the Linux distributions (and associated +# releases) supported by this installer and the required kernel versions. 
+# +# +=================================================+ +# | Supported Distributions | Supported Kernels | +# +=================================================+ +# | | | +# | Ubuntu Bionic (18.04) | 5.3+ | +# | | | +# | Ubuntu Focal (20.04) | 5.4+ | +# | | | +# | Ubuntu Impish (21.10) | 5.13+ | +# | | | +# | Ubuntu Jammy (22.04) | 5.15+ | +# | | | +# | Debian Buster (10) | 5.5+ | +# | | | +# | Debian Bullseye (11) | 5.5+ | +# |___________________________|_____________________| + +declare -A sysbox_support_matrix +sysbox_support_matrix=( + ["Ubuntu 18.04"]="5.3" + ["Ubuntu 20.04"]="5.4" + ["Ubuntu 21.10"]="5.13" + ["Ubuntu 22.04"]="5.15" + ["Debian 10"]="5.5" + ["Debian 11"]="5.5" +) + +# Minimum (oldest) kernel required for Sysbox installation to proceed when +# dealing with non-officially-supported distros. +sysbox_min_required_kernel_unsupported_distros="5.5.0" + +# Compare two versions in SemVer format. +# +# Examples: (1.0.1, 1.0.1) = 0 +# (1.0.1, 1.0.2) = 2 +# (1.0.1, 1.0.0) = 1 +# (1, 1.0) = 0 +# (3.0.4.10, 3.0.4.2) = 1 +# (5.0.0-22, 5.0.0-22) = 0 +# (5.0.0-22, 5.0.0-21) = 1 +# (5.0.0-21, 5.0.0-22) = 2 +# +function version_compare() { + + if [[ $1 == $2 ]]; then + return 0 + fi + + local IFS='.|-' + local i ver1=($1) ver2=($2) + + # Fill empty fields in ver1 with zeros. + for ((i = ${#ver1[@]}; i < ${#ver2[@]}; i++)); do + ver1[i]=0 + done + + for ((i = 0; i < ${#ver1[@]}; i++)); do + if [[ -z ${ver2[i]} ]]; then + # Fill empty fields in ver2 with zeros. + ver2[i]=0 + fi + if ((10#${ver1[i]} > 10#${ver2[i]})); then + return 1 + fi + if ((10#${ver1[i]} < 10#${ver2[i]})); then + return 2 + fi + done + + return 0 +} + +# Extract required distro details. +function current_distro_details() { + + local distro=$(lsb_release -is) + local release=$(lsb_release -rs) + + echo "${distro}" "${release}" +} + +# +# Enforce sysbox's kernel-requirements matrix. 
+# +function verify_compatibility() { + + local cur_distro=$(current_distro_details) + local cur_kernel=$(uname -r | cut -d'-' -f1) + local found_supported_distro=false + local found_distro_supported_kernel=false + + # Iterate through the support_matrix and verify that per-distros' minimum + # requirements are satisfied. + for distro in "${!sysbox_support_matrix[@]}"; do + + # Verify distro compatibility. + if [[ "${distro}" = "${cur_distro}" ]]; then + found_supported_distro=true + + # Verify kernel compatibility. + version_compare ${cur_kernel} ${sysbox_support_matrix[$distro]} && : + if [[ $? -le 1 ]]; then + found_distro_supported_kernel=true + fi + + break + fi + done + + # If the distro on which Sysbox is being installed is not officially supported, + # let the installation proceed if the minimum kernel requirement is satisfied. + # Alternatively, bail out if an unsupported kernel release is found for a + # supported distro. + if [[ ${found_supported_distro} = false ]]; then + version_compare ${cur_kernel} ${sysbox_min_required_kernel_unsupported_distros} && : + if [[ $? -eq 2 ]]; then + echo -e "\nUnsupported linux kernel release \"${cur_kernel}\" for" \ + "\"${cur_distro}\" distro. Sysbox may not operate as expected.\n" + exit 1 + fi + + elif [[ ${found_distro_supported_kernel} = false ]]; then + echo -e "\nUnsupported linux kernel release \"${cur_kernel}\" for" \ + "\"${cur_distro}\" distro.\n" + exit 1 + fi +} + +case "$1" in +install) + # Verify that sysbox's system requirements are met. 
+ verify_compatibility + + exit 0 + ;; + +upgrade | abort-upgrade) ;; + +\ + *) + echo "preinst called with unknown argument \`$1'" >&2 + exit 0 + ;; +esac + +#DEBHELPER# diff --git a/sysbox-pkgr/deb/sysbox-ee/compat b/sysbox-pkgr/deb/sysbox-ee/compat new file mode 100644 index 00000000..9a037142 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/compat @@ -0,0 +1 @@ +10 \ No newline at end of file diff --git a/sysbox-pkgr/deb/sysbox-ee/control b/sysbox-pkgr/deb/sysbox-ee/control new file mode 100644 index 00000000..3f32e58c --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/control @@ -0,0 +1,25 @@ +Source: sysbox-ee +Section: admin +Priority: optional +Maintainer: Nestybox +Build-Depends: debhelper (>= 10~), + bash, +Standards-Version: 4.1.4 +Homepage: nestybox.com +Vcs-Git: git://github.com/nestybox/sysbox.git + +Package: sysbox-ee +Architecture: linux-any +Pre-Depends: jq +Depends: ${shlibs:Depends}, + ${misc:Depends}, + debconf, + jq, + fuse, + lsb-release, + rsync, + iptables +Description: Sysbox Enterprise Edition (EE) is a next-generation container runtime, + developed by Nestybox, that enables deployment of containers that are capable of + running not just micro-services, but also system software such as Docker, Kubernetes, + Systemd, etc., inside the container, easily and securely. \ No newline at end of file diff --git a/sysbox-pkgr/deb/sysbox-ee/copyright b/sysbox-pkgr/deb/sysbox-ee/copyright new file mode 100644 index 00000000..99abbac1 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/copyright @@ -0,0 +1,12 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: Nestybox Sysbox +Upstream-Contact: Nestybox Inc + +Disclaimer: + This package is not part of the GNU/Linux Debian distribution. + No modifications of the included binaries, and no other uses or + further distributions of the included binaries, are permitted. + +Files: * +Copyright: 2019-2022 Nestybox Inc. All rights reserved. 
+License: Nestybox License diff --git a/sysbox-pkgr/deb/sysbox-ee/rules b/sysbox-pkgr/deb/sysbox-ee/rules new file mode 100755 index 00000000..9b352539 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/rules @@ -0,0 +1,68 @@ +#!/usr/bin/make -f + +# Output every command that modifies files on the build system. Enable me for +# debugging purposes. +export DH_VERBOSE = 1 + +# Override buildeb step to enforce 'xz' as the compressor to rely on. Notice +# that starting with Ubuntu Impish (i.e., 21.10), all packages make use of +# 'zstd' as the default compressor, but this one is not supported in debian +# distro yet. +override_dh_builddeb: + dh_builddeb -- -Zxz + +override_dh_auto_build: + # Build sysbox's components. + cd sysbox && make sysbox-static-local + +override_dh_auto_test: + +# Override dwz to avoid issues in debian-buster with debhelper=12 compat. +override_dh_dwz: + +# ONESHELL attribute to ensure that all instruccions in this target are executed +# within a single shell process. +.ONESHELL: +SHELL=/bin/bash +override_dh_auto_install: + # Sysbox binaries will be installed through the regular (makefile) process. + install -D -m0755 sysbox/sysbox-fs/build/${ARCH}/sysbox-fs \ + debian/sysbox-ee/usr/bin/sysbox-fs + install -D -m0755 sysbox/sysbox-mgr/build/${ARCH}/sysbox-mgr \ + debian/sysbox-ee/usr/bin/sysbox-mgr + install -D -m0755 sysbox/sysbox-runc/build/${ARCH}/sysbox-runc \ + debian/sysbox-ee/usr/bin/sysbox-runc + + # Sysbox services installation. + install -D -m 0644 /sources/sysbox.service \ + debian/sysbox-ee/lib/systemd/system/sysbox.service + install -D -m 0644 /sources/sysbox-fs.service \ + debian/sysbox-ee/lib/systemd/system/sysbox-fs.service + install -D -m 0644 /sources/sysbox-mgr.service \ + debian/sysbox-ee/lib/systemd/system/sysbox-mgr.service + + # Sysbox's sysctl.d config-file to hold the required procfs settings. 
+ # Notice we're picking the largest two-digit integer to name this file + # to avoid collisions with other package's requirements -- we expect + # our requirements to be more generic, so we want ours to prevail. + install -D -m 0644 /sources/99-sysbox-sysctl.conf \ + debian/sysbox-ee/lib/sysctl.d/99-sysbox-sysctl.conf + +override_dh_installinit: + # Use "sysbox" as our service name, not "sysbox-ee". + dh_installinit --name=sysbox + +override_dh_install: + dh_install + +# Override dh_usrlocal to prevent error after placing sysbox binaries in +# /usr/local path. +override_dh_usrlocal: + +override_dh_installsystemd: + dh_installsystemd --name=sysbox + dh_installsystemd --name=sysbox-fs + dh_installsystemd --name=sysbox-mgr + +%: + dh $@ diff --git a/sysbox-pkgr/deb/sysbox-ee/source/format b/sysbox-pkgr/deb/sysbox-ee/source/format new file mode 100644 index 00000000..9f8e9b69 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/source/format @@ -0,0 +1 @@ +1.0 \ No newline at end of file diff --git a/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.config b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.config new file mode 100644 index 00000000..0c0abb20 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.config @@ -0,0 +1,129 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2022 Nestybox Inc. All rights reserved. +# + +set -e + +# Source debconf library. +. /usr/share/debconf/confmodule + +# Dockerd default configuration dir/file. +dockerCfgDir="/etc/docker" +dockerCfgFile="${dockerCfgDir}/daemon.json" + +# UID-shifting module +shiftfs_module="shiftfs" + +# +# Determines if docker network configuration is complete. That is, 'bip' +# and 'default-address-pools' attributes are properly configured. 
+# +function docker_network_valid_config() { + + # Matching pattern: + # + # "bip": "172.20.0.1/16" + # "default-address-pools": [ + # + if [[ -f ${dockerCfgFile} ]] && + egrep -q "^[ ]+\"bip\": \"[0-9.]+.*\"" ${dockerCfgFile} && + egrep -q "^[ ]+\"default-address-pools\"" ${dockerCfgFile}; then + return 0 + fi + + return 1 +} + +# +# Finds out if there are existing docker containers in the system. +# +function docker_existing_containers() { + if docker ps -a | wc -l | egrep -q "1$"; then + return 1 + fi + + return 0 +} + +# +# Check presence/absence of shiftfs module. +# +function shiftfs_mount_supported() { + if modprobe "${shiftfs_module}" &>/dev/null; then + return 0 + fi + + return 1 +} + +# +# Check for idmapped-mount support. +# +function idmapped_mount_supported() { + local kernel_rel=$(uname -r) + local rel_major=$(echo ${kernel_rel} | cut -d'.' -f1) + local rel_minor=$(echo ${kernel_rel} | cut -d'.' -f2) + + if [ ${rel_major} -gt 5 ] || ([ ${rel_major} -eq 5 ] && [ ${rel_minor} -ge 12 ]); then + return 0 + fi + + return 1 +} + +# +# Function returns 'true' to indicate that dockerd needs to be restarted during +# Sysbox installation process. Returns 'false' otherwise. +# +function docker_restart_required() { + if docker_network_valid_config; then + return 1 + fi + + return 0 +} + +# +# Checks if the docker engine is installed on the host. +# +function docker_installed() { + ret=$(command -v dockerd >/dev/null 2>&1) + return $? +} + +# +# Checks if the docker engine is running on the host. +# +function docker_running() { + ret=$(pidof dockerd >/dev/null 2>&1) + return $? +} + +# +# Main +# + +# If a docker-restart is required in this setup, and there are existing docker +# containers, alert user of the need to stop containers and exit installation +# process. 
+if docker_running && docker_restart_required && docker_existing_containers; then + echo -e "\nThe Sysbox installer requires a docker service restart to configure" \ + "network parameters, but it cannot proceed due to existing Docker containers." \ + "Please remove them as indicated below and re-launch the installation process." \ + "Refer to Sysbox installation documentation for details.\n" \ + "\t\"docker rm \$(docker ps -a -q) -f\"\n" + exit 1 +fi + +# If neither 'idmapping' nor 'shiftfs' is supported, we must alert the user of +# its potential side effects. +if ! idmapped_mount_supported && ! shiftfs_mount_supported; then + echo -e "\nYour OS does not support 'idmapped' feature (kernel < 5.12), nor it" \ + " provides 'shiftfs' support. In consequence, applications within Sysbox" \ + " containers may be unable to access volume-mounts, which will show up as" \ + " owned by 'nobody:nogroup' inside the container. Refer to Sysbox" \ + " installation documentation for details.\n" +fi + +#DEBHELPER# diff --git a/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.install b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.install new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.install @@ -0,0 +1 @@ + diff --git a/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.manpages b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.manpages new file mode 100644 index 00000000..e69de29b diff --git a/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.postinst b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.postinst new file mode 100755 index 00000000..cbbe064e --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.postinst @@ -0,0 +1,361 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2021 Nestybox Inc. All rights reserved. +# + +set -e + +# Source debconf library. +. /usr/share/debconf/confmodule + +# Dockerd default configuration dir/file. +dockerCfgDir="/etc/docker" +dockerCfgFile="${dockerCfgDir}/daemon.json" + +# sysbox-fs' default mountpoint path. 
+sysboxfs_mountpoint="/var/lib/sysboxfs" + +# UID-shifting module +shiftfs_module="shiftfs" + +# Kernel's pool-size of inotify resources. +inotify_pool_size=1048576 + +# Default docker network parameters +bip_subnet="172.20.0.1/16" +pool_subnet="172.25.0.0/16" + +# Docker config vars. +docker_network_config_changed="false" +docker_runtime_config_changed="false" + +# Temp file for jq write operations. +tmpfile=$(mktemp /tmp/installer-scr.XXXXXX) +trap 'rm -f "${tmpfile}"' EXIT + +# Kernel's keyring max keys limit. Increase keyring max-keys for scaling +# purposes. +# +# In debian-based distros (e.g., Ubuntu) the kernel keyring max keys limit +# is set to 200 for non-root users. This is too low for Sysbox. For example, +# for a sys container based K8s cluster, the number of keys required is: +# +# keys = 35 + (k8s_workers * 23) + (2 * pods) +# +# That is, a 10-node cluster would need 282 keys. +# +# In a large bare-metal machine, we expect ~100 sys containers. That would +# consume ~11K keys. To be conservative, we set maxkeys to 20K. Note that since +# each key consumes ~70 bytes on average, the total mem consumption assuming all +# 20K keys are used is 20K * 70 = 1.4MB. +kernel_keys_maxkeys=20000 +kernel_keys_maxbytes=1400000 + +# Increase the kernel's max PID limit to it's max value since Sysbox containers +# are often used as VM-like environments and can consume hundreds or thousands +# of PIDs each. For Sysbox deployments on K8s, we limit each pod to 16K pids via +# the CRI-O config file. For Sysbox deployments in Docker hosts, use Docker's +# "--pids-limit" option to fix this limit. +kernel_pid_max=4194304 + + +# Creates sysbox-fs mountpoint. +function create_sysboxfs_mountpoint() { + + if [[ -d ${sysboxfs_mountpoint} ]]; then + return + fi + + mkdir -p ${sysboxfs_mountpoint} + if [[ ! -d ${sysboxfs_mountpoint} ]]; then + exit 1 + fi +} + +# Enables the utilization of unprivileged user-namespaces. 
+function enable_unprivileged_userns() { + if [ -f "/proc/sys/kernel/unprivileged_userns_clone" ]; then + local val=$(sysctl kernel.unprivileged_userns_clone) + if [[ "${val##* }" = 0 ]]; then + sysctl -w kernel.unprivileged_userns_clone=1 >/dev/null 2>&1 + fi + fi +} + +# Ensure kernel's 'inotify' resources meet sysbox requirements -- default values +# in most distros are too low for decent-size scenarios. +function define_inotify_resources() { + + local val=$(sysctl fs.inotify.max_queued_events) + if [[ "${val##* }" -lt ${inotify_pool_size} ]]; then + sysctl -w fs.inotify.max_queued_events=${inotify_pool_size} >/dev/null 2>&1 + fi + + local val=$(sysctl fs.inotify.max_user_watches) + if [[ "${val##* }" -lt ${inotify_pool_size} ]]; then + sysctl -w fs.inotify.max_user_watches=${inotify_pool_size} >/dev/null 2>&1 + fi + + local val=$(sysctl fs.inotify.max_user_instances) + if [[ "${val##* }" -lt ${inotify_pool_size} ]]; then + sysctl -w fs.inotify.max_user_instances=${inotify_pool_size} >/dev/null 2>&1 + fi +} + +# Ensure kernel's 'keyring' resources meet sysbox requirements -- +# default values in most distros are too low for decent-size +# scenarios. See definitions of keyring variables above for +# details. +function define_keyring_resources() { + + local val=$(sysctl kernel.keys.maxkeys) + if [[ "${val##* }" -lt ${kernel_keys_maxkeys} ]]; then + sysctl -w kernel.keys.maxkeys=${kernel_keys_maxkeys} >/dev/null 2>&1 + fi + + local val=$(sysctl kernel.keys.maxbytes) + if [[ "${val##* }" -lt ${kernel_keys_maxbytes} ]]; then + sysctl -w kernel.keys.maxbytes=${kernel_keys_maxbytes} >/dev/null 2>&1 + fi +} + +# Apply a decent pid_max size. +function define_pidmax_resources() { + + local val=$(sysctl kernel.pid_max) + if [[ "${val##* }" -lt ${kernel_pid_max} ]]; then + sysctl -w kernel.pid_max=${kernel_pid_max} >/dev/null 2>&1 + fi +} + +# Adds user/group in charge of running all sysbox components. +function add_sysbox_user() { + + if ! 
getent passwd | grep "^sysbox:" >/dev/null 2>&1; then + useradd -s /bin/false sysbox + fi +} + +# +# Add sysbox runtime to docker configuration. +# +function adjust_docker_config_runtime() { + + # If no 'runtimes' key-entry is present, proceed to add one. + if [ $(jq 'has("runtimes")' ${dockerCfgFile}) = "false" ]; then + jq --indent 4 '. + {"runtimes": {"sysbox-runc": {"path": "/usr/bin/sysbox-runc"}}}' \ + ${dockerCfgFile} >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_runtime_config_changed="true" + + # If no 'sysbox-runc' runtime entry is present, proceed to add it. + elif [ $(jq '.runtimes | has("sysbox-runc")' ${dockerCfgFile}) = "false" ]; then + jq --indent 4 '.runtimes |= . + {"sysbox-runc": {"path": "/usr/bin/sysbox-runc"}}' \ + ${dockerCfgFile} >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_runtime_config_changed="true" + + # If the runtime config is complete (i.e. both 'runtimes' and 'sysbox-runc' + # entries exist) but has the old sysbox-runc binary location, update the location. + elif grep -q "/usr/local/sbin/sysbox-runc" ${dockerCfgFile}; then + sed -i "s@/usr/local/sbin/sysbox-runc@/usr/bin/sysbox-runc@g" ${dockerCfgFile} + docker_runtime_config_changed="true" + fi + + # If the state has not been digested by docker yet, ensure that docker + # processes it before this installation concludes. + if [ ${docker_runtime_config_changed} = false ] && + command -v docker >/dev/null 2>&1 && + ! docker info 2>&1 | egrep -q "Runtimes:.*sysbox-runc"; then + + docker_runtime_config_changed="true" + fi +} + +# +# Returns 'true' if passed ipv4 address overlaps with any of the system local +# subnets. Return 'false' otherwise. +# +function system_local_subnet() { + + if ip route get ${1} | egrep -q "via $(ip route | awk '/default/ {print $3}')"; then + return 1 + fi + + return 0 +} + +# +# Adjust docker's network configuration to avoid subnet overlapping ranges. 
+# +function adjust_docker_config_network() { + + local bip_host=$(echo ${bip_subnet} | cut -d'/' -f 1) + local pool_host=$(echo ${pool_subnet} | cut -d'/' -f 1) + + # If no 'bip' key-entry is present, proceed to add one. + if [ $(jq 'has("bip")' ${dockerCfgFile}) = "false" ] || + [ $(jq '."bip"' ${dockerCfgFile}) = "\"\"" ]; then + + # If bip address to add overlaps with an existing local subnet, then + # dump a log message to user and skip bip's configuration. + if system_local_subnet ${bip_host} && + ! ip -4 address show dev docker0 | egrep -q "${bip_subnet}"; then + echo -e "\nDocker bridge-ip network to configure (${bip_subnet}) overlaps" \ + "with existing system subnet. Installation process will skip this docker" \ + "network setting. Please manually configure docker's 'bip' subnet to" \ + "avoid connectivity issues.\n" + else + jq --arg bip ${bip_subnet} --indent 4 '. + {"bip": $bip}' ${dockerCfgFile} \ + >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_network_config_changed="true" + fi + fi + + # If no 'default-address-pool' is found, do add one here. + if [ $(jq 'has("default-address-pools")' ${dockerCfgFile}) = "false" ] || + [ $(jq '."default-address-pools" | length' ${dockerCfgFile}) -eq "0" ]; then + + # If address-pool overlaps with an existing local subnet, then + # dump a log message to user and skip this attribute's configuration. + if system_local_subnet ${pool_host}; then + echo -e "\nDocker default-address-pool to configure (${pool_subnet}) overlaps" \ + "with existing system subnet. Installation process will skip this docker" \ + "network setting. Please manually configure docker's 'default-address-pool'" \ + "subnet to avoid connectivity issues.\n" + else + jq --arg subnet ${pool_subnet} --indent 4 \ + '."default-address-pools"[0] |= . 
+ {"base": $subnet, "size": 24}' ${dockerCfgFile} \ + >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + docker_network_config_changed="true" + fi + fi +} + +# +# Create/Modify dockerd configuration to meet Sysbox requirements. +# +function adjust_docker_config() { + + # If no dockerd default config-file exist, or if there's no content on it, + # create one with a bare json layout. + if [[ ! -f ${dockerCfgFile} ]] || [[ ! -s ${dockerCfgFile} ]]; then + mkdir -p ${dockerCfgDir} + touch ${dockerCfgFile} + echo -e "{\n}" >${dockerCfgFile} + fi + + adjust_docker_config_runtime + + adjust_docker_config_network + + if ! docker_running; then + return + fi + + # Restart docker if disruptive changes have been made. + if [[ ${docker_network_config_changed} = "true" ]]; then + + # If existing containers are found then skip docker-restart to avoid any + # service disruption. + if ! docker ps -a | wc -l | egrep -q "1$"; then + echo -e "\nDocker service was not restarted to avoid affecting existing" \ + "containers. Please remove them and restart Docker by doing:\n" \ + "\t\"docker rm \$(docker ps -a -q) -f &&" \ + "sudo systemctl restart docker\"\n" + else + systemctl restart docker + return + fi + fi + + # If non-disruptive changes have been made to docker config, then send it a + # sighup to have its config file getting re-parsed (no need to cold-boot). + if [ ${docker_runtime_config_changed} = true ]; then + kill -SIGHUP $(pidof dockerd) + fi +} + +# +# Checks if the docker engine is installed on the host +# +function docker_installed() { + ret=$(command -v dockerd >/dev/null 2>&1) + return $? +} + +# +# Checks if the docker engine is running on the host +# +function docker_running() { + ret=$(pidof dockerd >/dev/null 2>&1) + return $? +} + +# +# Verify if kernel-headers are properly installed and alert user otherwise. +# +function check_kernel_headers() { + + #if ! dpkg-query -W -f='${Status} ${Version}\n' linux-headers-$(uname -r) \ + if ! 
dpkg -s linux-headers-$(uname -r) 2>&1 | egrep -q "install ok installed"; then + echo -e "\nThe linux kernel headers package was not found. This may be" \ + "expected by user applications running within Sysbox containers." \ + "Please install it with this command:\n" \ + "\t\"sudo apt-get install -y linux-headers-\$(uname -r)\"\n" + fi +} + +function config_sysbox() { + + # Ensure sysbox-fs' default mountpoint is created in the file-system. + create_sysboxfs_mountpoint + + # Allows user-namespaces creation for unprivileged users. This change will + # persist through system reboots by relying on a sysctl.d config-file to be + # generated as part of this package's installation process. + enable_unprivileged_userns + + # Ensure kernel's inotify resources can meet Sysbox's scaling requirements. + define_inotify_resources + + # Ensure kernel's keyring resources can meet Sysbox's scaling requirements. + define_keyring_resources + + # Ensure that kernel's pid_max values are large enough to meet Sysbox's scaling + # requirements. + define_pidmax_resources + + # Add 'sysbox' user to host a large pool of subordinate UIDs/GIDs to be + # shared across all system-containers. This user must match the one defined + # in the docker configuration file. + add_sysbox_user + + # Adjust dockerd configuration (if necessary) + if docker_installed; then + adjust_docker_config + fi + + # Check for kernel-headers. + check_kernel_headers +} + +case "$1" in +configure) + # Adjust system's configuration to satisfy Sysbox requirements. + config_sysbox + + # Avoid calling exit() here to allow debhelper tools to add their + # auto-generated code further below. + ;; + +abort-*) ;; + +\ + *) ;; + +esac + +#DEBHELPER# diff --git a/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.postrm b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.postrm new file mode 100755 index 00000000..80b5b432 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.postrm @@ -0,0 +1,107 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2021 Nestybox Inc. 
All rights reserved. +# + +set -e + +# Source debconf library. +. /usr/share/debconf/confmodule + +# Dockerd default configuration dir/file. +dockerCfgDir="/etc/docker" +dockerCfgFile="${dockerCfgDir}/daemon.json" + +# Docker config vars. +docker_runtime_config_changed="false" + +# Temp file for jq write operations. +tmpfile=$(mktemp /tmp/installer-scr.XXXXXX) +trap 'rm -f "${tmpfile}"' EXIT + +# +# Note: As per Debian packaging policies, package elimination should not remove +# logfiles. Therefore we shouldn't delete the 'sysbox' user/group previously +# created by Sysbox's installation process, as it would leave files with +# dangling ownership. +# + +function adjust_docker_config_runtime() { + + # Eliminate sysbox's runtime entry if present. + if [ $(jq 'has("runtimes")' ${dockerCfgFile}) = "true" ] && + [ $(jq '.runtimes | has("sysbox-runc")' ${dockerCfgFile}) = "true" ]; then + + jq 'del(.runtimes."sysbox-runc")' \ + ${dockerCfgFile} >${tmpfile} && cp ${tmpfile} ${dockerCfgFile} + + # If sysbox is a docker registered runtime, ensure that the uninstallation + # process updates dockerd. + if command -v docker >/dev/null 2>&1 && + docker info 2>&1 | egrep -q "Runtimes:.*sysbox-runc"; then + + docker_runtime_config_changed="true" + fi + fi +} + +# +# Modify dockerd configuration to eliminate Sysbox runtime. +# +function adjust_docker_config() { + + local docker_sighup_required=false + + # There is not much to do here if docker config file is not present. + if [ ! -f ${dockerCfgFile} ]; then + return + fi + + adjust_docker_config_runtime + + if ! docker_running; then + return + fi + + # Send docker a sighup to digest the absence of sysbox runtime. + # This should not impact current non-sysbox containers. + if [ ${docker_runtime_config_changed} = "true" ]; then + kill -SIGHUP $(pidof dockerd) + fi +} + +# +# Checks if the docker engine is installed on the host +# +function docker_installed() { + ret=$(command -v dockerd >/dev/null 2>&1) + return $? 
+} + +# +# Checks if the docker engine is running on the host +# +function docker_running() { + ret=$(pidof dockerd >/dev/null 2>&1) + return $? +} + +case "$1" in +purge) + # Adjust docker config to eliminate entries added by Sysbox's + # installation process. + adjust_docker_config + ;; + +remove | upgrade | failed-upgrade | abort-install | abort-upgrade | disappear) ;; + +\ + *) + echo "postrm called with unknown argument \`$1'" >&2 + exit 1 + ;; +esac + +#DEBHELPER# + +exit 0 diff --git a/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.preinst b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.preinst new file mode 100755 index 00000000..658bf3d2 --- /dev/null +++ b/sysbox-pkgr/deb/sysbox-ee/sysbox-ee.preinst @@ -0,0 +1,156 @@ +#!/bin/bash +# +# Copyright: (C) 2019-2022 Nestybox Inc. All rights reserved. +# + +set -e + +# The following table enumerates the Linux distributions (and associated +# releases) supported by this installer and the required kernel versions. +# +# +=================================================+ +# | Supported Distributions | Supported Kernels | +# +=================================================+ +# | | | +# | Ubuntu Bionic (18.04) | 5.3+ | +# | | | +# | Ubuntu Focal (20.04) | 5.4+ | +# | | | +# | Ubuntu Impish (21.10) | 5.13+ | +# | | | +# | Ubuntu Jammy (22.04) | 5.15+ | +# | | | +# | Debian Buster (10) | 5.5+ | +# | | | +# | Debian Bullseye (11) | 5.5+ | +# |___________________________|_____________________| + +declare -A sysbox_support_matrix +sysbox_support_matrix=( + ["Ubuntu 18.04"]="5.3" + ["Ubuntu 20.04"]="5.4" + ["Ubuntu 21.10"]="5.13" + ["Ubuntu 22.04"]="5.15" + ["Debian 10"]="5.5" + ["Debian 11"]="5.5" +) + +# Minimum (oldest) kernel required for Sysbox installation to proceed when +# dealing with non-officially-supported distros. +sysbox_min_required_kernel_unsupported_distros="5.5.0" + +# Compare two versions in SemVer format. 
+#
+# Examples:  (1.0.1, 1.0.1) = 0
+#            (1.0.1, 1.0.2) = 2
+#            (1.0.1, 1.0.0) = 1
+#            (1, 1.0) = 0
+#            (3.0.4.10, 3.0.4.2) = 1
+#            (5.0.0-22, 5.0.0-22) = 0
+#            (5.0.0-22, 5.0.0-21) = 1
+#            (5.0.0-21, 5.0.0-22) = 2
+#
+function version_compare() {
+
+	if [[ $1 == $2 ]]; then
+		return 0
+	fi
+
+	local IFS='.|-'
+	local i ver1=($1) ver2=($2)
+
+	# Fill empty fields in ver1 with zeros.
+	for ((i = ${#ver1[@]}; i < ${#ver2[@]}; i++)); do
+		ver1[i]=0
+	done
+
+	for ((i = 0; i < ${#ver1[@]}; i++)); do
+		if [[ -z ${ver2[i]} ]]; then
+			# Fill empty fields in ver2 with zeros.
+			ver2[i]=0
+		fi
+		if ((10#${ver1[i]} > 10#${ver2[i]})); then
+			return 1
+		fi
+		if ((10#${ver1[i]} < 10#${ver2[i]})); then
+			return 2
+		fi
+	done
+
+	return 0
+}
+
+# Extract required distro details.
+function current_distro_details() {
+
+	local distro=$(lsb_release -is)
+	local release=$(lsb_release -rs)
+
+	echo "${distro}" "${release}"
+}
+
+#
+# Enforce sysbox's kernel-requirements matrix.
+#
+function verify_compatibility() {
+
+	local cur_distro=$(current_distro_details)
+	local cur_kernel=$(uname -r | cut -d'-' -f1)
+	local found_supported_distro=false
+	local found_distro_supported_kernel=false
+
+	# Iterate through the support_matrix and verify that per-distros' minimum
+	# requirements are satisfied.
+	for distro in "${!sysbox_support_matrix[@]}"; do
+
+		# Verify distro compatibility.
+		if [[ "${distro}" = "${cur_distro}" ]]; then
+			found_supported_distro=true
+
+			# Verify kernel compatibility.
+			version_compare ${cur_kernel} ${sysbox_support_matrix[$distro]} && :
+			if [[ $? -le 1 ]]; then
+				found_distro_supported_kernel=true
+			fi
+
+			break
+		fi
+	done
+
+	# If the distro on which Sysbox is being installed is not officially supported,
+	# let the installation proceed if the minimum kernel requirement is satisfied.
+	# Alternatively, bail out if an unsupported kernel release is found for a
+	# supported distro. 
+	if [[ ${found_supported_distro} = false ]]; then
+		version_compare ${cur_kernel} ${sysbox_min_required_kernel_unsupported_distros} && :
+		if [[ $? -eq 2 ]]; then
+			echo -e "\nUnsupported linux kernel release \"${cur_kernel}\" for" \
+				"\"${cur_distro}\" distro. Sysbox may not operate as expected.\n"
+			exit 1
+		fi
+
+	elif [[ ${found_distro_supported_kernel:-false} = false ]]; then
+		echo -e "\nUnsupported linux kernel release \"${cur_kernel}\" for" \
+			"\"${cur_distro}\" distro.\n"
+		exit 1
+	fi
+}
+
+case "$1" in
+install)
+	# Verify that sysbox's system requirements are met.
+	verify_compatibility
+
+	exit 0
+	;;
+
+upgrade | abort-upgrade) ;;
+
+\
+	*)
+	echo "preinst called with unknown argument \`$1'" >&2
+	exit 0
+	;;
+esac
+
+#DEBHELPER# 
diff --git a/sysbox-pkgr/deb/ubuntu-bionic/Dockerfile b/sysbox-pkgr/deb/ubuntu-bionic/Dockerfile
new file mode 100644
index 00000000..1e6eab63
--- /dev/null
+++ b/sysbox-pkgr/deb/ubuntu-bionic/Dockerfile
@@ -0,0 +1,92 @@
+ARG GO_IMAGE
+ARG BASE_IMAGE=ubuntu:bionic
+ARG DEBIAN_FRONTEND=noninteractive
+
+FROM ${GO_IMAGE} as golang
+
+FROM ${BASE_IMAGE}
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    devscripts \
+    equivs \
+    git \
+    wget \
+    pkg-config \
+    libnet-dev \
+    libseccomp2 \
+    libseccomp-dev \
+    iproute2 \
+    kmod \
+    curl \
+    unzip && \
+    \
+    # Housekeeping
+    apt-get clean -y && \
+    rm -rf \
+    /var/cache/debconf/* \
+    /var/lib/apt/lists/* \
+    /var/log/* \
+    /tmp/* \
+    /var/tmp/* \
+    /usr/share/doc/* \
+    /usr/share/man/* \
+    /usr/share/local/*
+
+ARG arch
+ENV ARCH=${arch}
+ENV GOPATH /go
+ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin:/root/.local/bin
+
+ARG DEB_FILES
+COPY ${DEB_FILES} /root/build-deb/debian
+RUN mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" -i /root/build-deb/debian/control
+
+ENV BASE_IMAGE=${BASE_IMAGE}
+ENV SYSBOX_RELEASE true
+
+COPY --from=golang /usr/local/go /usr/local/go
+
+# Let's explicitly 
set go-module feature to 'auto' mode (default as per Go 1.13) to avoid +# potential changes to this feature's default mode in the future. Even though we are +# relying on modules for the package's building process, we are enabling 'auto' mode to +# allow 'go get' traditional behavior (fetch entire git repo). Notice that we need git's +# metadata to allow a git-checkout operation further below. +ENV GO111MODULE=auto +RUN go env -w GONOSUMDB=github.com/nestybox + +# Install protoc compiler for gRPC. +RUN if [ "${arch}" = "amd64" ]; then arch_str="x86_64"; \ + elif [ "${arch}" = "arm64" ]; then arch_str="aarch_64"; \ + else echo "Unsupported platform: ${arch}"; exit; fi \ + && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-${arch_str}.zip \ + && unzip protoc-3.15.8-linux-${arch_str}.zip -d $HOME/.local \ + && export PATH="$PATH:$HOME/.local/bin" \ + && go install github.com/golang/protobuf/protoc-gen-go@latest \ + && export PATH="$PATH:$(go env GOPATH)/bin" + +# Install Docker +RUN curl -fsSL https://get.docker.com -o get-docker.sh \ + && sh get-docker.sh +ADD https://raw.githubusercontent.com/docker/docker-ce/master/components/cli/contrib/completion/bash/docker /etc/bash_completion.d/docker.sh + +# Use the old definition for SECCOMP_NOTIF_ID_VALID in /usr/include/linux/seccomp.h +# +# This is needed because the definition changed in the mainline kernel +# on 06/2020 (from SECCOMP_IOR -> SECCOMP_IOW), and some distros we +# support have picked it up in their latest releases / kernels +# updates. The kernel change was backward compatible, so by using the +# old definition, we are guaranteed it will work on kernels before and +# after the change. On the other hand, if we were to use the new +# definition, seccomp notify would fail when sysbox runs in old +# kernels. 
+RUN sed -i 's/^#define SECCOMP_IOCTL_NOTIF_ID_VALID[ \t]*SECCOMP_IOW(2, __u64)/#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)/g' /usr/include/linux/seccomp.h + +WORKDIR /root/build-deb +COPY sources/ /sources +COPY build-deb /root/build-deb/build-deb +COPY changelog_convert.sh /root/build-deb/changelog_convert.sh + +ENTRYPOINT ["/root/build-deb/build-deb"] diff --git a/sysbox-pkgr/deb/ubuntu-focal/Dockerfile b/sysbox-pkgr/deb/ubuntu-focal/Dockerfile new file mode 100644 index 00000000..14bfd4b5 --- /dev/null +++ b/sysbox-pkgr/deb/ubuntu-focal/Dockerfile @@ -0,0 +1,93 @@ +ARG GO_IMAGE +ARG BASE_IMAGE=ubuntu:focal +ARG DEBIAN_FRONTEND=noninteractive + +FROM ${GO_IMAGE} as golang + +FROM ${BASE_IMAGE} + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + devscripts \ + equivs \ + git \ + wget \ + pkg-config \ + libnet-dev \ + libseccomp2 \ + libseccomp-dev \ + iproute2 \ + kmod \ + curl \ + unzip && \ + \ + # Housekeeping + apt-get clean -y && \ + rm -rf \ + /var/cache/debconf/* \ + /var/lib/apt/lists/* \ + /var/log/* \ + /tmp/* \ + /var/tmp/* \ + /usr/share/doc/* \ + /usr/share/man/* \ + /usr/share/local/* + +ARG arch +ENV ARCH=${arch} +ENV GOPATH /go +ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin:/root/.local/bin + +ARG DEB_FILES +COPY ${DEB_FILES} /root/build-deb/debian +RUN mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" -i /root/build-deb/debian/control + +ENV BASE_IMAGE=${BASE_IMAGE} +ENV SYSBOX_RELEASE true + +COPY --from=golang /usr/local/go /usr/local/go + +# Let's explicitly set go-module feature to 'auto' mode (default as per Go 1.13) to avoid +# potential changes to this feature's default mode in the future. 
Even though we are +# relying on modules for the package's building process, we are enabling 'auto' mode to +# allow 'go get' traditional behavior (fetch entire git repo). Notice that we need git's +# metadata to allow a git-checkout operation further below. +ENV GO111MODULE=auto +RUN go env -w GONOSUMDB=github.com/nestybox + +# Install protoc compiler for gRPC. +RUN if [ "${arch}" = "amd64" ]; then arch_str="x86_64"; \ + elif [ "${arch}" = "arm64" ]; then arch_str="aarch_64"; \ + else echo "Unsupported platform: ${arch}"; exit; fi \ + && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-${arch_str}.zip \ + && unzip protoc-3.15.8-linux-${arch_str}.zip -d $HOME/.local \ + && export PATH="$PATH:$HOME/.local/bin" \ + && go install github.com/golang/protobuf/protoc-gen-go@latest \ + && export PATH="$PATH:$(go env GOPATH)/bin" + +# Install Docker +RUN curl -fsSL https://get.docker.com -o get-docker.sh \ + && sh get-docker.sh +ADD https://raw.githubusercontent.com/docker/docker-ce/master/components/cli/contrib/completion/bash/docker /etc/bash_completion.d/docker.sh + +# Use the old definition for SECCOMP_NOTIF_ID_VALID in /usr/include/linux/seccomp.h +# +# This is needed because the definition changed in the mainline kernel +# on 06/2020 (from SECCOMP_IOR -> SECCOMP_IOW), and some distros we +# support have picked it up in their latest releases / kernels +# updates. The kernel change was backward compatible, so by using the +# old definition, we are guaranteed it will work on kernels before and +# after the change. On the other hand, if we were to use the new +# definition, seccomp notify would fail when sysbox runs in old +# kernels. 
+RUN sed -i 's/^#define SECCOMP_IOCTL_NOTIF_ID_VALID[ \t]*SECCOMP_IOW(2, __u64)/#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)/g' /usr/include/linux/seccomp.h + +WORKDIR /root/build-deb +COPY sources/ /sources +COPY build-deb /root/build-deb/build-deb +COPY changelog_convert.sh /root/build-deb/changelog_convert.sh + +ENTRYPOINT ["/root/build-deb/build-deb"] diff --git a/sysbox-pkgr/deb/ubuntu-impish/Dockerfile b/sysbox-pkgr/deb/ubuntu-impish/Dockerfile new file mode 100644 index 00000000..6303d42f --- /dev/null +++ b/sysbox-pkgr/deb/ubuntu-impish/Dockerfile @@ -0,0 +1,93 @@ +ARG GO_IMAGE +ARG BASE_IMAGE=ubuntu:impish +ARG DEBIAN_FRONTEND=noninteractive + +FROM ${GO_IMAGE} as golang + +FROM ${BASE_IMAGE} + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + devscripts \ + equivs \ + git \ + wget \ + pkg-config \ + libnet-dev \ + libseccomp2 \ + libseccomp-dev \ + iproute2 \ + kmod \ + curl \ + unzip && \ + \ + # Housekeeping + apt-get clean -y && \ + rm -rf \ + /var/cache/debconf/* \ + /var/lib/apt/lists/* \ + /var/log/* \ + /tmp/* \ + /var/tmp/* \ + /usr/share/doc/* \ + /usr/share/man/* \ + /usr/share/local/* + +ARG arch +ENV ARCH=${arch} +ENV GOPATH /go +ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin:/root/.local/bin + +ARG DEB_FILES +COPY ${DEB_FILES} /root/build-deb/debian +RUN mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" -i /root/build-deb/debian/control + +ENV BASE_IMAGE=${BASE_IMAGE} +ENV SYSBOX_RELEASE true + +COPY --from=golang /usr/local/go /usr/local/go + +# Let's explicitly set go-module feature to 'auto' mode (default as per Go 1.13) to avoid +# potential changes to this feature's default mode in the future. 
Even though we are +# relying on modules for the package's building process, we are enabling 'auto' mode to +# allow 'go get' traditional behavior (fetch entire git repo). Notice that we need git's +# metadata to allow a git-checkout operation further below. +ENV GO111MODULE=auto +RUN go env -w GONOSUMDB=github.com/nestybox + +# Install protoc compiler for gRPC. +RUN if [ "${arch}" = "amd64" ]; then arch_str="x86_64"; \ + elif [ "${arch}" = "arm64" ]; then arch_str="aarch_64"; \ + else echo "Unsupported platform: ${arch}"; exit; fi \ + && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-${arch_str}.zip \ + && unzip protoc-3.15.8-linux-${arch_str}.zip -d $HOME/.local \ + && export PATH="$PATH:$HOME/.local/bin" \ + && go install github.com/golang/protobuf/protoc-gen-go@latest \ + && export PATH="$PATH:$(go env GOPATH)/bin" + +# Install Docker +RUN curl -fsSL https://get.docker.com -o get-docker.sh \ + && sh get-docker.sh +ADD https://raw.githubusercontent.com/docker/docker-ce/master/components/cli/contrib/completion/bash/docker /etc/bash_completion.d/docker.sh + +# Use the old definition for SECCOMP_NOTIF_ID_VALID in /usr/include/linux/seccomp.h +# +# This is needed because the definition changed in the mainline kernel +# on 06/2020 (from SECCOMP_IOR -> SECCOMP_IOW), and some distros we +# support have picked it up in their latest releases / kernels +# updates. The kernel change was backward compatible, so by using the +# old definition, we are guaranteed it will work on kernels before and +# after the change. On the other hand, if we were to use the new +# definition, seccomp notify would fail when sysbox runs in old +# kernels. 
+RUN sed -i 's/^#define SECCOMP_IOCTL_NOTIF_ID_VALID[ \t]*SECCOMP_IOW(2, __u64)/#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)/g' /usr/include/linux/seccomp.h + +WORKDIR /root/build-deb +COPY sources/ /sources +COPY build-deb /root/build-deb/build-deb +COPY changelog_convert.sh /root/build-deb/changelog_convert.sh + +ENTRYPOINT ["/root/build-deb/build-deb"] diff --git a/sysbox-pkgr/deb/ubuntu-jammy/Dockerfile b/sysbox-pkgr/deb/ubuntu-jammy/Dockerfile new file mode 100644 index 00000000..40f1d5c7 --- /dev/null +++ b/sysbox-pkgr/deb/ubuntu-jammy/Dockerfile @@ -0,0 +1,93 @@ +ARG GO_IMAGE +ARG BASE_IMAGE=ubuntu:jammy +ARG DEBIAN_FRONTEND=noninteractive + +FROM ${GO_IMAGE} as golang + +FROM ${BASE_IMAGE} + +RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections && \ + apt-get update && \ + apt-get install -y --no-install-recommends \ + build-essential \ + ca-certificates \ + devscripts \ + equivs \ + git \ + wget \ + pkg-config \ + libnet-dev \ + libseccomp2 \ + libseccomp-dev \ + iproute2 \ + kmod \ + curl \ + unzip && \ + \ + # Housekeeping + apt-get clean -y && \ + rm -rf \ + /var/cache/debconf/* \ + /var/lib/apt/lists/* \ + /var/log/* \ + /tmp/* \ + /var/tmp/* \ + /usr/share/doc/* \ + /usr/share/man/* \ + /usr/share/local/* + +ARG arch +ENV ARCH=${arch} +ENV GOPATH /go +ENV PATH $PATH:/usr/local/go/bin:$GOPATH/bin:/root/.local/bin + +ARG DEB_FILES +COPY ${DEB_FILES} /root/build-deb/debian +RUN mk-build-deps -t "apt-get -o Debug::pkgProblemResolver=yes --no-install-recommends -y" -i /root/build-deb/debian/control + +ENV BASE_IMAGE=${BASE_IMAGE} +ENV SYSBOX_RELEASE true + +COPY --from=golang /usr/local/go /usr/local/go + +# Let's explicitly set go-module feature to 'auto' mode (default as per Go 1.13) to avoid +# potential changes to this feature's default mode in the future. 
Even though we are +# relying on modules for the package's building process, we are enabling 'auto' mode to +# allow 'go get' traditional behavior (fetch entire git repo). Notice that we need git's +# metadata to allow a git-checkout operation further below. +ENV GO111MODULE=auto +RUN go env -w GONOSUMDB=github.com/nestybox + +# Install protoc compiler for gRPC. +RUN if [ "${arch}" = "amd64" ]; then arch_str="x86_64"; \ + elif [ "${arch}" = "arm64" ]; then arch_str="aarch_64"; \ + else echo "Unsupported platform: ${arch}"; exit; fi \ + && curl -LO https://github.com/protocolbuffers/protobuf/releases/download/v3.15.8/protoc-3.15.8-linux-${arch_str}.zip \ + && unzip protoc-3.15.8-linux-${arch_str}.zip -d $HOME/.local \ + && export PATH="$PATH:$HOME/.local/bin" \ + && go install github.com/golang/protobuf/protoc-gen-go@latest \ + && export PATH="$PATH:$(go env GOPATH)/bin" + +# Install Docker +RUN curl -fsSL https://get.docker.com -o get-docker.sh \ + && sh get-docker.sh +ADD https://raw.githubusercontent.com/docker/docker-ce/master/components/cli/contrib/completion/bash/docker /etc/bash_completion.d/docker.sh + +# Use the old definition for SECCOMP_NOTIF_ID_VALID in /usr/include/linux/seccomp.h +# +# This is needed because the definition changed in the mainline kernel +# on 06/2020 (from SECCOMP_IOR -> SECCOMP_IOW), and some distros we +# support have picked it up in their latest releases / kernels +# updates. The kernel change was backward compatible, so by using the +# old definition, we are guaranteed it will work on kernels before and +# after the change. On the other hand, if we were to use the new +# definition, seccomp notify would fail when sysbox runs in old +# kernels. 
+RUN sed -i 's/^#define SECCOMP_IOCTL_NOTIF_ID_VALID[ \t]*SECCOMP_IOW(2, __u64)/#define SECCOMP_IOCTL_NOTIF_ID_VALID SECCOMP_IOR(2, __u64)/g' /usr/include/linux/seccomp.h + +WORKDIR /root/build-deb +COPY sources/ /sources +COPY build-deb /root/build-deb/build-deb +COPY changelog_convert.sh /root/build-deb/changelog_convert.sh + +ENTRYPOINT ["/root/build-deb/build-deb"] diff --git a/sysbox-pkgr/k8s/.gitignore b/sysbox-pkgr/k8s/.gitignore new file mode 100644 index 00000000..56293261 --- /dev/null +++ b/sysbox-pkgr/k8s/.gitignore @@ -0,0 +1,2 @@ +bin/* +!bin/README.md diff --git a/sysbox-pkgr/k8s/Dockerfile.centos7-systemd b/sysbox-pkgr/k8s/Dockerfile.centos7-systemd new file mode 100644 index 00000000..325cbc29 --- /dev/null +++ b/sysbox-pkgr/k8s/Dockerfile.centos7-systemd @@ -0,0 +1,43 @@ +# +# Copyright 2019-2022 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Docker image for deploying Sysbox on a K8s host. +# +# Build with: +# +# docker build -t ghcr.io/nestybox/centos7/systemd: . +# +# ... 
where 'sys-arch' is one of the supported hardware platforms: amd64 or arm64 +# +# A docker manifest will point to the generic/platform-agnostic image name: 'ghcr.io/nestybox/centos7/systemd' + +FROM centos:7 + +ENV container docker + +RUN (cd /lib/systemd/system/sysinit.target.wants/; for i in *; do [ $i == systemd-tmpfiles-setup.service ] || rm -f $i; done); \ +rm -f /lib/systemd/system/multi-user.target.wants/*;\ +rm -f /etc/systemd/system/*.wants/*;\ +rm -f /lib/systemd/system/local-fs.target.wants/*; \ +rm -f /lib/systemd/system/sockets.target.wants/*udev*; \ +rm -f /lib/systemd/system/sockets.target.wants/*initctl*; \ +rm -f /lib/systemd/system/basic.target.wants/*;\ +rm -f /lib/systemd/system/anaconda.target.wants/*; + +VOLUME [ "/sys/fs/cgroup" ] + +CMD ["/usr/sbin/init"] diff --git a/sysbox-pkgr/k8s/Dockerfile.crio b/sysbox-pkgr/k8s/Dockerfile.crio new file mode 100644 index 00000000..50c934c3 --- /dev/null +++ b/sysbox-pkgr/k8s/Dockerfile.crio @@ -0,0 +1,41 @@ +# +# CRI-O build container Dockerfile +# + +FROM ubuntu:jammy + +ARG sys_arch +ENV SYS_ARCH=${sys_arch} + +ARG CRIO_VERSIONS +ENV CRIO_VERSIONS=${CRIO_VERSIONS} + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y --no-install-recommends \ + git \ + make \ + wget \ + ca-certificates \ + ssh-client \ + gcc \ + libgpgme-dev \ + pkg-config \ + libseccomp-dev + +# Install Golang and explicitly activate modules functionality. 
+RUN wget https://golang.org/dl/go1.22.2.linux-${sys_arch}.tar.gz && \ + tar -C /usr/local -xzf go1.22.2.linux-${sys_arch}.tar.gz && \ + /usr/local/go/bin/go env -w GONOSUMDB=/root/nestybox + +ENV GOPATH /go +ENV PATH $GOPATH/bin:/usr/local/go/bin:$PATH +RUN go env -w GONOSUMDB=/root/nestybox && \ + mkdir -p "$GOPATH/src" "$GOPATH/bin" && \ + chmod -R 777 "$GOPATH" + +# CRI-O build script +COPY scripts/crio-build.sh /usr/bin/crio-build.sh + +WORKDIR /root +CMD crio-build.sh diff --git a/sysbox-pkgr/k8s/Dockerfile.sysbox-ce b/sysbox-pkgr/k8s/Dockerfile.sysbox-ce new file mode 100644 index 00000000..0a75f290 --- /dev/null +++ b/sysbox-pkgr/k8s/Dockerfile.sysbox-ce @@ -0,0 +1,126 @@ +# +# Copyright 2019-2023 Nestybox, Inc. +# + +# +# Docker image for deploying Sysbox-CE on a K8s host. +# +# Build with: +# +# docker build -t nestybox/sysbox-deploy-k8s . +# + +# Note: we use a centos base image because it carries a systemctl that can +# communicate with the host's systemd via dbus. This does not work when using a +# ubuntu + systemd image (systemctl can't not connect to the host's dbus, even +# though the host's dbus socket was mounted into the container). 
+ +FROM ghcr.io/nestybox/centos7/systemd + +ARG sys_arch +ENV SYS_ARCH=${sys_arch} +ARG sysbox_version +ENV SYSBOX_VERSION=${sysbox_version} + +ARG DEST=/opt/sysbox +ARG CRICTL_VERSION="v1.28.0" +ARG CRIO_V1_27_TAR="cri-o.${SYS_ARCH}.v1.27.0.tar.gz" +ARG CRIO_V1_28_TAR="cri-o.${SYS_ARCH}.v1.28.0.tar.gz" +ARG CRIO_V1_29_TAR="cri-o.${SYS_ARCH}.v1.29.0.tar.gz" +ARG CRIO_V1_30_TAR="cri-o.${SYS_ARCH}.v1.30.0.tar.gz" + +RUN sed -i 's/mirrorlist/#mirrorlist/g' /etc/yum.repos.d/CentOS-* \ + && sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-* + +RUN yum install -y curl wget git bc which epel-release \ + && yum install -y jq + +RUN curl -Lso /bin/kubectl https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/${SYS_ARCH}/kubectl && \ + chmod +x /bin/kubectl + +RUN wget https://github.com/TomWright/dasel/releases/download/v1.27.3/dasel_linux_${SYS_ARCH} \ + && mv dasel_linux_${SYS_ARCH} dasel && chmod +x dasel && mv ./dasel /usr/local/bin/dasel + +# crictl will be copied to the host, and used by the crio installation agents +RUN wget https://github.com/kubernetes-sigs/cri-tools/releases/download/${CRICTL_VERSION}/crictl-${CRICTL_VERSION}-linux-${SYS_ARCH}.tar.gz \ + && tar zxvf crictl-${CRICTL_VERSION}-linux-${SYS_ARCH}.tar.gz -C /usr/local/bin \ + && chmod +x /usr/local/bin/crictl \ + && rm -f crictl-${CRICTL_VERSION}-linux-${SYS_ARCH}.tar.gz + +# shiftfs-dkms sources +RUN git clone --branch k5.4 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k5.4 \ + && git clone --branch k5.10 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k5.10 \ + && git clone --branch k5.11 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k5.11 \ + && git clone --branch k5.13 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k5.13 \ + && git clone --branch k5.16 https://github.com/nestybox/shiftfs-dkms.git 
/opt/shiftfs-k5.16 \ + && git clone --branch k5.17 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k5.17 \ + && git clone --branch k5.18 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k5.18 \ + && git clone --branch k6.1 https://github.com/nestybox/shiftfs-dkms.git /opt/shiftfs-k6.1 + +# +# Load Sysbox installation artifacts +# + +COPY bin/sysbox-ce /opt/sysbox/bin + +COPY systemd/50-sysbox-mod.conf /opt/sysbox/systemd/50-sysbox-mod.conf +COPY systemd/99-sysbox-sysctl.conf /opt/sysbox/systemd/99-sysbox-sysctl.conf +COPY systemd/sysbox-installer-helper.service /opt/sysbox/systemd/sysbox-installer-helper.service +COPY systemd/sysbox-removal-helper.service /opt/sysbox/systemd/sysbox-removal-helper.service +COPY systemd/sysbox-fs.service /opt/sysbox/systemd/sysbox-fs.service +COPY systemd/sysbox-mgr.service /opt/sysbox/systemd/sysbox-mgr.service +COPY systemd/sysbox.service /opt/sysbox/systemd/sysbox.service + +COPY scripts/sysbox-deploy-k8s.sh /opt/sysbox/scripts/sysbox-deploy-k8s.sh +COPY scripts/sysbox-installer-helper.sh /opt/sysbox/scripts/sysbox-installer-helper.sh +COPY scripts/sysbox-removal-helper.sh /opt/sysbox/scripts/sysbox-removal-helper.sh + +# +# Load CRI-O installation artifacts +# + +RUN wget https://storage.googleapis.com/cri-o/artifacts/${CRIO_V1_27_TAR} -O cri-o.${SYS_ARCH}.tar.gz \ + && mkdir -p /opt/crio-deploy/bin/v1.27 \ + && mv cri-o.${SYS_ARCH}.tar.gz /opt/crio-deploy/bin/v1.27/. + +RUN wget https://storage.googleapis.com/cri-o/artifacts/${CRIO_V1_28_TAR} -O cri-o.${SYS_ARCH}.tar.gz \ + && mkdir -p /opt/crio-deploy/bin/v1.28 \ + && mv cri-o.${SYS_ARCH}.tar.gz /opt/crio-deploy/bin/v1.28/. + +RUN wget https://storage.googleapis.com/cri-o/artifacts/${CRIO_V1_29_TAR} -O cri-o.${SYS_ARCH}.tar.gz \ + && mkdir -p /opt/crio-deploy/bin/v1.29 \ + && mv cri-o.${SYS_ARCH}.tar.gz /opt/crio-deploy/bin/v1.29/. 
+ +RUN wget https://storage.googleapis.com/cri-o/artifacts/${CRIO_V1_30_TAR} -O cri-o.${SYS_ARCH}.tar.gz \ + && mkdir -p /opt/crio-deploy/bin/v1.30 \ + && mv cri-o.${SYS_ARCH}.tar.gz /opt/crio-deploy/bin/v1.30/. + +COPY systemd/crio-installer.service /opt/crio-deploy/systemd/crio-installer.service +COPY systemd/crio-removal.service /opt/crio-deploy/systemd/crio-removal.service +COPY scripts/crio-extractor.sh /opt/crio-deploy/scripts/crio-extractor.sh + +COPY scripts/crio-installer.sh /opt/crio-deploy/scripts/crio-installer.sh +COPY scripts/crio-removal.sh /opt/crio-deploy/scripts/crio-removal.sh + +COPY systemd/kubelet-config-helper.service /opt/crio-deploy/systemd/kubelet-config-helper.service +COPY scripts/kubelet-config-helper.sh /opt/crio-deploy/scripts/kubelet-config-helper.sh + +COPY systemd/kubelet-unconfig-helper.service /opt/crio-deploy/systemd/kubelet-unconfig-helper.service +COPY scripts/kubelet-unconfig-helper.sh /opt/crio-deploy/scripts/kubelet-unconfig-helper.sh + +COPY config/crio-kubelet-options /opt/crio-deploy/config/crio-kubelet-options +COPY config/etc_cni_net.d_200-loopback.conf /opt/crio-deploy/config/etc_cni_net.d_200-loopback.conf +COPY config/etc_containers_registries.conf.d_000-shortnames.conf /opt/crio-deploy/config/etc_containers_registries.conf.d_000-shortnames.conf +COPY config/etc_containers_storage.conf /opt/crio-deploy/config/etc_containers_storage.conf +COPY config/etc_containers_registries.conf /opt/crio-deploy/config/etc_containers_registries.conf +COPY config/etc_containers_policy.json /opt/crio-deploy/config/etc_containers_policy.json +COPY config/etc_containers_registries.d_default.yaml /opt/crio-deploy/config/etc_containers_registries.d_default.yaml + +# +# Load CRI-O patched binaries (to generate correct user-ns mappings) +# + +COPY bin/crio/v1.27/crio /opt/crio-deploy/bin/v1.27/crio-patched +COPY bin/crio/v1.28/crio /opt/crio-deploy/bin/v1.28/crio-patched +COPY bin/crio/v1.29/crio /opt/crio-deploy/bin/v1.29/crio-patched 
+COPY bin/crio/v1.30/crio /opt/crio-deploy/bin/v1.30/crio-patched diff --git a/sysbox-pkgr/k8s/Makefile b/sysbox-pkgr/k8s/Makefile new file mode 100644 index 00000000..d3a443d2 --- /dev/null +++ b/sysbox-pkgr/k8s/Makefile @@ -0,0 +1,153 @@ +# +# Sysbox Deploy K8s Daemonset Makefile +# + +.PHONY: sysbox-deploy-k8s-image \ + fetch_sysbox_ce_bins \ + fetch_sysbox_ee_bins \ + check-sysbox-artifacts \ + check-crio-artifacts \ + clean-sysbox-ce clean-crio clean + + +SHELL := /bin/bash + +SYSBOX_BINS = sysbox-runc sysbox-mgr sysbox-fs + +# Obtain the version to build from the Sysbox repo itself. If a full version string +# is not found in the corresponding VERSION file, append a dummy revision/patch ("0") +# to match the name of the sysbox's deb image to download. +# Examples: +# 1) if VERSION = 0.6.3, then +# SYSBOX_CE_VER = 0.6.3 +# SYSBOX_CE_VER_SEMVER = 0.6.3 +# SYSBOX_CE_VER_FULL = 0.6.3-0 +# +# 2) if VERSION = 0.6.3-1, then +# SYSBOX_CE_VER = 0.6.3-1 +# SYSBOX_CE_VER_SEMVER = 0.6.3 +# SYSBOX_CE_VER_FULL = 0.6.3-1 +# +SYSBOX_CE_VER = $(shell cat ../sources/sysbox/VERSION) +SYSBOX_CE_VER_SEMVER = $(shell echo $(SYSBOX_CE_VER) | cut -d"-" -f1) +SYSBOX_CE_VER_FULL = $(shell echo $(SYSBOX_CE_VER) | sed '/-[0-9]/!s/.*/&-0/') + +# CRIO versions to build. +CRIO_VERSIONS = v1.27 v1.28 v1.29 v1.30 + +# Patch version is used to track changes to the sysbox-deploy-k8s image not related to +# the Sysbox's version. For example, if we need to rebuild the sysbox-deploy-k8s image +# due to a change in any of the files in this directory, we bump this patch version. +# If there's no need for a 'patch' version, then the patch version is set to nil (i.e., +# empty). This is useful to distinguish between sysbox-deploy-k8s images that have the +# same Sysbox version but differ in other components (e.g., crio versions, supported +# k8s releases, etc.). +SYSBOX_DEPLOY_K8S_IMAGE_PATCH := ".1" + +# Obtain the current system architecture. 
+UNAME_M := $(shell uname -m) +ifeq ($(UNAME_M),x86_64) + SYS_ARCH := amd64 +else ifeq ($(UNAME_M),aarch64) + SYS_ARCH := arm64 +else ifeq ($(UNAME_M),arm64) + SYS_ARCH := arm64 +else ifeq ($(UNAME_M),arm) + SYS_ARCH := armhf +else ifeq ($(UNAME_M),armel) + SYS_ARCH := armel +endif + +# +# Sysbox artifacts for Ubuntu distro +# + +# The fetch_* targets download the Sysbox binaries from the Sysbox repo and +# place them in the "bin" sub-directory. The *_image targets then load those +# binaries into the sysbox-deploy-k8s image. Variable SYSBOX_CE_VER selects +# the version of the Sysbox binaries to download. + +fetch-sysbox-ce-bins: + @echo "Fetching Sysbox CE binaries ..." + $(eval TMPDIR := $(shell mktemp -d)) + @echo "TMPDIR = $(TMPDIR)" + wget https://storage.googleapis.com/sysbox-releases/v$(SYSBOX_CE_VER_SEMVER)/sysbox-ce/sysbox-ce_$(SYSBOX_CE_VER_FULL).linux_$(SYS_ARCH).deb -P $(TMPDIR) + mkdir -p $(TMPDIR)/sysbox-ce-generic + dpkg -x $(TMPDIR)/sysbox-ce_$(SYSBOX_CE_VER_FULL).linux_$(SYS_ARCH).deb $(TMPDIR)/sysbox-ce-generic + mkdir -p bin/sysbox-ce/generic + rm -rf bin/sysbox-ce/generic/* + cp $(TMPDIR)/sysbox-ce-generic/usr/bin/sysbox-* bin/sysbox-ce/generic/. + rm -rf $(TMPDIR) + +# +# CRI-O artifacts (only built if not already present at ./bin/crio) +# +CRIO_BINS_DIR := $(PWD)/bin/crio + +ifeq ($(wildcard $(CRIO_BINS_DIR)),) +build-crio: crio-build-container + @echo "NOTE: building CRI-O binaries at ${PWD}/bin/crio" + docker run --rm -v $(shell pwd)/bin:/mnt/results crio-bld +else +build-crio: + @echo "NOTE: Skipping CRI-O build (found binaries at ${PWD}/bin/crio)" +endif + +crio-build-container: + docker build -t crio-bld -f Dockerfile.crio --build-arg sys_arch=$(SYS_ARCH) \ + --build-arg CRIO_VERSIONS="${CRIO_VERSIONS}" . 
+ +# +# The check-* targets verify that CRI-O, Sysbox binaries and its dependencies are +# all in the "bin" directory: +# +# bin +# ├── crio +# │   └── v1.24 +# │   └── crio +# │   └── v1.25 +# │   └── crio +# │   ├── v1.26 +# │   │   └── crio +# │   ├── v1.27 +# │   │   └── crio +# ├── sysbox-ce +# │   ├── ubuntu-bionic +# │   │   ├── sysbox-fs +# │   │   ├── sysbox-mgr +# │   │   └── sysbox-runc +# │   └── ubuntu-focal +# │   ├── sysbox-fs +# │   ├── sysbox-mgr +# │   └── sysbox-runc +# + +check-sysbox-artifacts: + $(foreach file,$(SYSBOX_BINS),[ -f "bin/sysbox-ce/generic/$(file)" ] || "missing sysbox-ce binary: bin/sysbox-ce/generic/$(file)") + +check-crio-artifacts: + @$(foreach version,$(CRIO_VERSIONS),[ -f "bin/crio/$(version)/crio" ] || "missing CRI-O binary: bin/crio/$(version)/crio";) + +# +# These targets build the sysbox-deploy-k8s images for sysbox-ce +# + +all: sysbox-deploy-k8s-image + +sysbox-deploy-k8s-image: build-crio check-crio-artifacts fetch-sysbox-ce-bins check-sysbox-artifacts + docker build -t ghcr.io/nestybox/sysbox-deploy-k8s:v$(SYSBOX_CE_VER_FULL)$(SYSBOX_DEPLOY_K8S_IMAGE_PATCH) \ + --build-arg sys_arch=$(SYS_ARCH) \ + --build-arg sysbox_version=v$(SYSBOX_CE_VER_FULL) \ + -f Dockerfile.sysbox-ce . + +# +# Cleanup targets +# + +clean-sysbox-ce: + -rm -rf bin/sysbox-ce + +clean-crio: + -rm -rf bin/crio + +clean: clean-sysbox-ce clean-crio diff --git a/sysbox-pkgr/k8s/README.md b/sysbox-pkgr/k8s/README.md new file mode 100644 index 00000000..65a7d730 --- /dev/null +++ b/sysbox-pkgr/k8s/README.md @@ -0,0 +1,81 @@ +Sysbox-deploy-k8s image generation and update procedure +======================================================= + +1) Build the sysbox-deploy images through the usual `make ` method. + + NOTE: The process must be completed in every supported architecture (i.e., + amd64 and arm64). + +2) Identify the image that has been created and re-tag it accordingly to match the +platform architecture being used. 
+ + NOTE: this must be done for each supported platforms. + + NOTE: tag the images with the `_amd64` and `_arm64` suffixes as needed. + +``` +$ docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +ghcr.io/nestybox/sysbox-deploy-k8s v0.5.2 eb28ac89b60f About a minute ago 982MB + +$ docker tag eb28ac89b60f ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_arm64 + +$ docker images +REPOSITORY TAG IMAGE ID CREATED SIZE +ghcr.io/nestybox/sysbox-deploy-k8s v0.5.2 eb28ac89b60f 7 minutes ago 982MB +ghcr.io/nestybox/sysbox-deploy-k8s v0.5.2_arm64 eb28ac89b60f 7 minutes ago 982MB +ghcr.io/nestybox/sysbox-deploy-k8s v0.5.2_amd64 c23934aef102 7 minutes ago 970MB +``` + + +3) Push each image to ghcr.io (for both supported platforms): + +``` +$ docker push ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_arm64 +$ docker push ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_amd64 +``` + +4) Now is time to update the existing manifest to point to the new image components. This +step can be completed in any linux machine, doesn't need to be any of the ones previously +utilized to build the sysbox-deploy images. + + * We start by removing the current manifests (in case they are already present locally). + +``` +$ docker manifest rm ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2 +$ docker manifest rm ghcr.io/nestybox/sysbox-deploy-k8s:latest +``` + + * Now we recreate each manifest by pointing it to the platform-specific images previously + created (which don't need to be present/fetched locally for this operation to succeed). 
+ +``` +$ docker manifest create ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2 --amend ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_amd64 --amend ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_arm64 +$ docker manifest create ghcr.io/nestybox/sysbox-deploy-k8s:latest --amend ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_amd64 --amend ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2_arm64 +``` + + * Finally, we push the newly updated manifests to ghcr.io: + +``` +$ docker manifest push ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2 +$ docker manifest push ghcr.io/nestybox/sysbox-deploy-k8s:latest +``` + +5) Verify in Github web-portal that the image-digests of both the manifest and the images +fully match those in our build-server: + + * Compare the image-digests seen below (the following two commands), with the ones seen at + the Github packages' site: + +``` +$ docker images --digests | egrep "sysbox-deploy" +``` + +``` +$ docker manifest inspect ghcr.io/nestybox/sysbox-deploy-k8s:v0.5.2 +... + +$ docker manifest inspect ghcr.io/nestybox/sysbox-deploy-k8s:latest +... + +``` diff --git a/sysbox-pkgr/k8s/bin/README.md b/sysbox-pkgr/k8s/bin/README.md new file mode 100644 index 00000000..faeea238 --- /dev/null +++ b/sysbox-pkgr/k8s/bin/README.md @@ -0,0 +1,41 @@ +# Sysbox-deploy Binaries Generation Tips + +## CRI-O binaries + +1. Clone CRI-O repository: + +``` +$ git clone git@github.com:nestybox/cri-o.git + +$ cd cri-o +``` + +2. Switch to a local branch based off of the desired CRI-O release (i.e. currently v1.20 + or v1.21): + +``` +$ git checkout -b v1.21-sysbox origin/v1.21-sysbox +``` + +3. Build a 'static' version of the cri-o binary -- note that it takes a while, but it's + just ~4MB larger than the regular binary: + +``` +$ CONTAINER_RUNTIME=docker make build-static +``` + +4. 
Copy the obtained binary to its expected location: + +``` +$ tree k8s/bin/crio +k8s/bin/crio +├── README.md +├── v1.20 +│ └── crio +└── v1.21 + └── crio +``` + +## Flatcar binaries + + diff --git a/sysbox-pkgr/k8s/config/crio-kubelet-options b/sysbox-pkgr/k8s/config/crio-kubelet-options new file mode 100644 index 00000000..c3a46c2c --- /dev/null +++ b/sysbox-pkgr/k8s/config/crio-kubelet-options @@ -0,0 +1,4 @@ +--container-runtime-endpoint=unix:///var/run/crio/crio.sock +--image-service-endpoint=unix:///var/run/crio/crio.sock +--runtime-cgroups=/system.slice/crio.service +--runtime-request-timeout=6m \ No newline at end of file diff --git a/sysbox-pkgr/k8s/config/etc_cni_net.d_200-loopback.conf b/sysbox-pkgr/k8s/config/etc_cni_net.d_200-loopback.conf new file mode 100644 index 00000000..1acb85a0 --- /dev/null +++ b/sysbox-pkgr/k8s/config/etc_cni_net.d_200-loopback.conf @@ -0,0 +1,4 @@ +{ + "cniVersion": "0.3.1", + "type": "loopback" +} diff --git a/sysbox-pkgr/k8s/config/etc_containers_policy.json b/sysbox-pkgr/k8s/config/etc_containers_policy.json new file mode 100644 index 00000000..dffc54a6 --- /dev/null +++ b/sysbox-pkgr/k8s/config/etc_containers_policy.json @@ -0,0 +1,14 @@ +{ + "default": [ + { + "type": "insecureAcceptAnything" + } + ], + "transports": + { + "docker-daemon": + { + "": [{"type":"insecureAcceptAnything"}] + } + } +} diff --git a/sysbox-pkgr/k8s/config/etc_containers_registries.conf b/sysbox-pkgr/k8s/config/etc_containers_registries.conf new file mode 100644 index 00000000..96a2b4d4 --- /dev/null +++ b/sysbox-pkgr/k8s/config/etc_containers_registries.conf @@ -0,0 +1,77 @@ +# For more information on this configuration file, see containers-registries.conf(5). +# +# NOTE: RISK OF USING UNQUALIFIED IMAGE NAMES +# We recommend always using fully qualified image names including the registry +# server (full dns name), namespace, image name, and tag +# (e.g., registry.redhat.io/ubi8/ubi:latest). 
Pulling by digest (i.e., +# quay.io/repository/name@digest) further eliminates the ambiguity of tags. +# When using short names, there is always an inherent risk that the image being +# pulled could be spoofed. For example, a user wants to pull an image named +# `foobar` from a registry and expects it to come from myregistry.com. If +# myregistry.com is not first in the search list, an attacker could place a +# different `foobar` image at a registry earlier in the search list. The user +# would accidentally pull and run the attacker's image and code rather than the +# intended content. We recommend only adding registries which are completely +# trusted (i.e., registries which don't allow unknown or anonymous users to +# create accounts with arbitrary names). This will prevent an image from being +# spoofed, squatted or otherwise made insecure. If it is necessary to use one +# of these registries, it should be added at the end of the list. +# +# # An array of host[:port] registries to try when pulling an unqualified image, in order. +unqualified-search-registries = ["docker.io", "quay.io"] +# +# [[registry]] +# # The "prefix" field is used to choose the relevant [[registry]] TOML table; +# # (only) the TOML table with the longest match for the input image name +# # (taking into account namespace/repo/tag/digest separators) is used. +# # +# # The prefix can also be of the form: *.example.com for wildcard subdomain +# # matching. +# # +# # If the prefix field is missing, it defaults to be the same as the "location" field. +# prefix = "example.com/foo" +# +# # If true, unencrypted HTTP as well as TLS connections with untrusted +# # certificates are allowed. +# insecure = false +# +# # If true, pulling images with matching names is forbidden. +# blocked = false +# +# # The physical location of the "prefix"-rooted namespace. 
+# # +# # By default, this is equal to "prefix" (in which case "prefix" can be omitted +# # and the [[registry]] TOML table can only specify "location"). +# # +# # Example: Given +# # prefix = "example.com/foo" +# # location = "internal-registry-for-example.net/bar" +# # requests for the image example.com/foo/myimage:latest will actually work with the +# # internal-registry-for-example.net/bar/myimage:latest image. +# +# # The location can be empty iff prefix is in a +# # wildcarded format: "*.example.com". In this case, the input reference will +# # be used as-is without any rewrite. +# location = internal-registry-for-example.com/bar" +# +# # (Possibly-partial) mirrors for the "prefix"-rooted namespace. +# # +# # The mirrors are attempted in the specified order; the first one that can be +# # contacted and contains the image will be used (and if none of the mirrors contains the image, +# # the primary location specified by the "registry.location" field, or using the unmodified +# # user-specified reference, is tried last). +# # +# # Each TOML table in the "mirror" array can contain the following fields, with the same semantics +# # as if specified in the [[registry]] TOML table directly: +# # - location +# # - insecure +# [[registry.mirror]] +# location = "example-mirror-0.local/mirror-for-foo" +# [[registry.mirror]] +# location = "example-mirror-1.local/mirrors/foo" +# insecure = true +# # Given the above, a pull of example.com/foo/image:latest will try: +# # 1. example-mirror-0.local/mirror-for-foo/image:latest +# # 2. example-mirror-1.local/mirrors/foo/image:latest +# # 3. internal-registry-for-example.net/bar/image:latest +# # in order, and use the first one that exists. 
diff --git a/sysbox-pkgr/k8s/config/etc_containers_registries.conf.d_000-shortnames.conf b/sysbox-pkgr/k8s/config/etc_containers_registries.conf.d_000-shortnames.conf new file mode 100644 index 00000000..28d22f1a --- /dev/null +++ b/sysbox-pkgr/k8s/config/etc_containers_registries.conf.d_000-shortnames.conf @@ -0,0 +1,65 @@ +[aliases] + # centos + "centos" = "quay.io/centos/centos" + # containers + "skopeo" = "quay.io/skopeo/stable" + "buildah" = "quay.io/buildah/stable" + "podman" = "quay.io/podman/stable" + # docker + "alpine" = "docker.io/library/alpine" + "docker" = "docker.io/library/docker" + "registry" = "docker.io/library/registry" + "hello-world" = "docker.io/library/hello-world" + "swarm" = "docker.io/library/swarm" + # Fedora + "fedora-minimal" = "registry.fedoraproject.org/fedora-minimal" + "fedora" = "registry.fedoraproject.org/fedora" + # openSUSE + "opensuse/tumbleweed" = "registry.opensuse.org/opensuse/tumbleweed" + "opensuse/tumbleweed-dnf" = "registry.opensuse.org/opensuse/tumbleweed-dnf" + "opensuse/tumbleweed-microdnf" = "registry.opensuse.org/opensuse/tumbleweed-microdnf" + "opensuse/leap" = "registry.opensuse.org/opensuse/leap" + "opensuse/busybox" = "registry.opensuse.org/opensuse/busybox" + "tumbleweed" = "registry.opensuse.org/opensuse/tumbleweed" + "tumbleweed-dnf" = "registry.opensuse.org/opensuse/tumbleweed-dnf" + "tumbleweed-microdnf" = "registry.opensuse.org/opensuse/tumbleweed-microdnf" + "leap" = "registry.opensuse.org/opensuse/leap" + "leap-dnf" = "registry.opensuse.org/opensuse/leap-dnf" + "leap-microdnf" = "registry.opensuse.org/opensuse/leap-microdnf" + "tw-busybox" = "registry.opensuse.org/opensuse/busybox" + # SUSE + "suse/sle15" = "registry.suse.com/suse/sle15" + "suse/sles12sp5" = "registry.suse.com/suse/sles12sp5" + "suse/sles12sp4" = "registry.suse.com/suse/sles12sp4" + "suse/sles12sp3" = "registry.suse.com/suse/sles12sp3" + "sle15" = "registry.suse.com/suse/sle15" + "sles12sp5" = "registry.suse.com/suse/sles12sp5" + 
"sles12sp4" = "registry.suse.com/suse/sles12sp4" + "sles12sp3" = "registry.suse.com/suse/sles12sp3" + # Red Hat Enterprise Linux + "rhel" = "registry.access.redhat.com/rhel" + "rhel6" = "registry.access.redhat.com/rhel6" + "rhel7" = "registry.access.redhat.com/rhel7" + "ubi7" = "registry.access.redhat.com/ubi7" + "ubi7-init" = "registry.access.redhat.com/ubi7-init" + "ubi7-minimal" = "registry.access.redhat.com/ubi7-minimal" + "ubi8" = "registry.access.redhat.com/ubi8" + "ubi8-minimal" = "registry.access.redhat.com/ubi8-minimal" + "ubi8-init" = "registry.access.redhat.com/ubi8-init" + "ubi8-micro" = "registry.access.redhat.com/ubi8-micro" + "ubi8/ubi" = "registry.access.redhat.com/ubi8/ubi" + "ubi8/ubi-minimal" = "registry.access.redhat.com/ubi8-minimal" + "ubi8/ubi-init" = "registry.access.redhat.com/ubi8-init" + "ubi8/ubi-micro" = "registry.access.redhat.com/ubi8-micro" + # Debian + "debian" = "docker.io/library/debian" + # Ubuntu + "ubuntu" = "docker.io/library/ubuntu" + # Oracle Linux + "oraclelinux" = "container-registry.oracle.com/os/oraclelinux" + # busybox + "busybox" = "docker.io/library/busybox" + # php + "php" = "docker.io/library/php" + #python + "python" = "docker.io/library/python" diff --git a/sysbox-pkgr/k8s/config/etc_containers_registries.d_default.yaml b/sysbox-pkgr/k8s/config/etc_containers_registries.d_default.yaml new file mode 100644 index 00000000..943ea171 --- /dev/null +++ b/sysbox-pkgr/k8s/config/etc_containers_registries.d_default.yaml @@ -0,0 +1,26 @@ +# This is a default registries.d configuration file. You may +# add to this file or create additional files in registries.d/. +# +# sigstore: indicates a location that is read and write +# sigstore-staging: indicates a location that is only for write +# +# sigstore and sigstore-staging take a value of the following: +# sigstore: {schema}://location +# +# For reading signatures, schema may be http, https, or file. +# For writing signatures, schema may only be file. 
+ +# This is the default signature write location for docker registries. +default-docker: +# sigstore: file:///var/lib/containers/sigstore + sigstore-staging: file:///var/lib/containers/sigstore + +# The 'docker' indicator here is the start of the configuration +# for docker registries. +# +# docker: +# +# privateregistry.com: +# sigstore: http://privateregistry.com/sigstore/ +# sigstore-staging: /mnt/nfs/privateregistry/sigstore + diff --git a/sysbox-pkgr/k8s/config/etc_containers_storage.conf b/sysbox-pkgr/k8s/config/etc_containers_storage.conf new file mode 100644 index 00000000..9cc45a16 --- /dev/null +++ b/sysbox-pkgr/k8s/config/etc_containers_storage.conf @@ -0,0 +1,195 @@ +# This file is the configuration file for all tools +# that use the containers/storage library. +# See man 5 containers-storage.conf for more information +# The "container storage" table contains all of the server options. +[storage] + +# Default Storage Driver, Must be set for proper operation. +driver = "overlay" + +# Temporary storage location +runroot = "/run/containers/storage" + +# Primary Read/Write location of container storage +graphroot = "/var/lib/containers/storage" + +# Storage path for rootless users +# +# rootless_storage_path = "$HOME/.local/share/containers/storage" + +[storage.options] +# Storage options to be passed to underlying storage drivers + +# AdditionalImageStores is used to pass paths to additional Read/Only image stores +# Must be comma separated list. +additionalimagestores = [ +] + +# Remap-UIDs/GIDs is the mapping from UIDs/GIDs as they should appear inside of +# a container, to the UIDs/GIDs as they should appear outside of the container, +# and the length of the range of UIDs/GIDs. Additional mapped sets can be +# listed and will be heeded by libraries, but there are limits to the number of +# mappings which the kernel will allow when you later attempt to run a +# container. 
+# +# remap-uids = 0:1668442479:65536 +# remap-gids = 0:1668442479:65536 + +# Remap-User/Group is a user name which can be used to look up one or more UID/GID +# ranges in the /etc/subuid or /etc/subgid file. Mappings are set up starting +# with an in-container ID of 0 and then a host-level ID taken from the lowest +# range that matches the specified name, and using the length of that range. +# Additional ranges are then assigned, using the ranges which specify the +# lowest host-level IDs first, to the lowest not-yet-mapped in-container ID, +# until all of the entries have been used for maps. +# +# remap-user = "containers" +# remap-group = "containers" + +# Root-auto-userns-user is a user name which can be used to look up one or more UID/GID +# ranges in the /etc/subuid and /etc/subgid file. These ranges will be partitioned +# to containers configured to create automatically a user namespace. Containers +# configured to automatically create a user namespace can still overlap with containers +# having an explicit mapping set. +# This setting is ignored when running as rootless. +# root-auto-userns-user = "storage" +# +# Auto-userns-min-size is the minimum size for a user namespace created automatically. +# auto-userns-min-size=1024 +# +# Auto-userns-max-size is the maximum size for a user namespace created automatically. +# auto-userns-max-size=65536 + +[storage.options.overlay] +# ignore_chown_errors can be set to allow a non privileged user running with +# a single UID within a user namespace to run containers. The user can pull +# and use any image even those with multiple uids. Note multiple UIDs will be +# squashed down to the default uid in the container. These images will have no +# separation between the users in the container. Only supported for the overlay +# and vfs drivers. +#ignore_chown_errors = "false" + +# Inodes is used to set a maximum inodes of the container image. 
+# inodes = "" + +# Path to an helper program to use for mounting the file system instead of mounting it +# directly. +#mount_program = "/usr/bin/fuse-overlayfs" + +# mountopt specifies comma separated list of extra mount options +mountopt = "nodev,metacopy=on" + +# Set to skip a PRIVATE bind mount on the storage home directory. +# skip_mount_home = "false" + +# Size is used to set a maximum size of the container image. +# size = "" + +# ForceMask specifies the permissions mask that is used for new files and +# directories. +# +# The values "shared" and "private" are accepted. +# Octal permission masks are also accepted. +# +# "": No value specified. +# All files/directories, get set with the permissions identified within the +# image. +# "private": it is equivalent to 0700. +# All files/directories get set with 0700 permissions. The owner has rwx +# access to the files. No other users on the system can access the files. +# This setting could be used with networked based homedirs. +# "shared": it is equivalent to 0755. +# The owner has rwx access to the files and everyone else can read, access +# and execute them. This setting is useful for sharing containers storage +# with other users. For instance have a storage owned by root but shared +# to rootless users as an additional store. +# NOTE: All files within the image are made readable and executable by any +# user on the system. Even /etc/shadow within your image is now readable by +# any user. +# +# OCTAL: Users can experiment with other OCTAL Permissions. +# +# Note: The force_mask Flag is an experimental feature, it could change in the +# future. When "force_mask" is set the original permission mask is stored in +# the "user.containers.override_stat" xattr and the "mount_program" option must +# be specified. Mount programs like "/usr/bin/fuse-overlayfs" present the +# extended attribute permissions to processes within containers rather then the +# "force_mask" permissions. 
+# +# force_mask = "" + +[storage.options.thinpool] +# Storage Options for thinpool + +# autoextend_percent determines the amount by which pool needs to be +# grown. This is specified in terms of % of pool size. So a value of 20 means +# that when threshold is hit, pool will be grown by 20% of existing +# pool size. +# autoextend_percent = "20" + +# autoextend_threshold determines the pool extension threshold in terms +# of percentage of pool size. For example, if threshold is 60, that means when +# pool is 60% full, threshold has been hit. +# autoextend_threshold = "80" + +# basesize specifies the size to use when creating the base device, which +# limits the size of images and containers. +# basesize = "10G" + +# blocksize specifies a custom blocksize to use for the thin pool. +# blocksize="64k" + +# directlvm_device specifies a custom block storage device to use for the +# thin pool. Required if you setup devicemapper. +# directlvm_device = "" + +# directlvm_device_force wipes device even if device already has a filesystem. +# directlvm_device_force = "True" + +# fs specifies the filesystem type to use for the base device. +# fs="xfs" + +# log_level sets the log level of devicemapper. +# 0: LogLevelSuppress 0 (Default) +# 2: LogLevelFatal +# 3: LogLevelErr +# 4: LogLevelWarn +# 5: LogLevelNotice +# 6: LogLevelInfo +# 7: LogLevelDebug +# log_level = "7" + +# min_free_space specifies the min free space percent in a thin pool require for +# new device creation to succeed. Valid values are from 0% - 99%. +# Value 0% disables +# min_free_space = "10%" + +# mkfsarg specifies extra mkfs arguments to be used when creating the base +# device. +# mkfsarg = "" + +# metadata_size is used to set the `pvcreate --metadatasize` options when +# creating thin devices. Default is 128k +# metadata_size = "" + +# Size is used to set a maximum size of the container image. +# size = "" + +# use_deferred_removal marks devicemapper block device for deferred removal. 
+# If the thinpool is in use when the driver attempts to remove it, the driver +# tells the kernel to remove it as soon as possible. Note this does not free +# up the disk space, use deferred deletion to fully remove the thinpool. +# use_deferred_removal = "True" + +# use_deferred_deletion marks thinpool device for deferred deletion. +# If the device is busy when the driver attempts to delete it, the driver +# will attempt to delete device every 30 seconds until successful. +# If the program using the driver exits, the driver will continue attempting +# to cleanup the next time the driver is used. Deferred deletion permanently +# deletes the device and all data stored in device will be lost. +# use_deferred_deletion = "True" + +# xfs_nospace_max_retries specifies the maximum number of retries XFS should +# attempt to complete IO when ENOSPC (no space) error is returned by +# underlying storage device. +# xfs_nospace_max_retries = "0" diff --git a/sysbox-pkgr/k8s/manifests/daemonset/crio/README.md b/sysbox-pkgr/k8s/manifests/daemonset/crio/README.md new file mode 100644 index 00000000..a28e3340 --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/crio/README.md @@ -0,0 +1,24 @@ +# DEPRECATION NOTICE + +*** + +**THE KUBERNETES MANIFESTS IN THIS DIRECTORY (CRIO-DEPLOY-K8S AND +CRIO-CLEANUP-K8S) ARE NOW DEPRECATED.** + +**DO NOT USE THESE DAEMONSETS IF YOU WISH TO INSTALL SYSBOX ON A HOST; INSTEAD, +USE ONLY THE SYSBOX-DEPLOY-K8S DAEMONSET WHICH WILL INSTALL BOTH CRI-O AND +SYSBOX ON THE HOST.** + +*** + +The crio-deploy-k8s daemonset in this directory was previously needed to +install CRI-O on a Kubernetes node, in preparation to installing Sysbox on the +node using the separate sysbox-deploy-k8s daemonset. This is no longer the case, +as the latest version of the sysbox-deploy-k8s daemonset installs both CRI-O and +Sysbox (in order to simplify and significantly speed up the installation +process). 
+ +You can still use this daemonset if you wish to only install CRI-O on a +Kubernetes node (i.e., without installing Sysbox). However, note that there are +no plans to update this daemonset for newer versions of CRI-O (it currently +installs CRI-O v1.20). diff --git a/sysbox-pkgr/k8s/manifests/daemonset/crio/crio-cleanup-k8s.yaml b/sysbox-pkgr/k8s/manifests/daemonset/crio/crio-cleanup-k8s.yaml new file mode 100644 index 00000000..f0aca7e2 --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/crio/crio-cleanup-k8s.yaml @@ -0,0 +1,66 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: crio-cleanup-k8s + namespace: kube-system +spec: + selector: + matchLabels: + crio-install: "yes" + template: + metadata: + labels: + crio-install: "yes" + spec: + serviceAccountName: crio-label-node + nodeSelector: + crio-install: "yes" + containers: + - name: crio-cleanup-k8s + image: registry.nestybox.com/nestybox/crio-deploy-k8s + imagePullPolicy: Always + command: [ "bash", "-c", "/opt/crio-deploy/scripts/crio-deploy-k8s.sh cleanup" ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: false + volumeMounts: + - name: host-dbus + mountPath: /var/run/dbus + - name: host-run-systemd + mountPath: /run/systemd + - name: host-lib-systemd + mountPath: /mnt/host/lib/systemd + - name: host-usr-local-bin + mountPath: /mnt/host/usr/local/bin/ + - name: host-etc + mountPath: /mnt/host/etc + - name: host-run + mountPath: /mnt/host/run + volumes: + - name: host-dbus + hostPath: + path: /var/run/dbus + - name: host-run-systemd + hostPath: + path: /run/systemd + - name: host-lib-systemd + hostPath: + path: /lib/systemd + - name: host-usr-local-bin + hostPath: + path: /usr/local/bin/ + - name: host-etc + hostPath: + path: /etc + - name: host-run + hostPath: + path: /run + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate diff --git a/sysbox-pkgr/k8s/manifests/daemonset/crio/crio-deploy-k8s.yaml 
b/sysbox-pkgr/k8s/manifests/daemonset/crio/crio-deploy-k8s.yaml new file mode 100644 index 00000000..2dfdfdac --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/crio/crio-deploy-k8s.yaml @@ -0,0 +1,66 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: crio-deploy-k8s + namespace: kube-system +spec: + selector: + matchLabels: + crio-install: "yes" + template: + metadata: + labels: + crio-install: "yes" + spec: + serviceAccountName: crio-label-node + nodeSelector: + crio-install: "yes" + containers: + - name: crio-deploy-k8s + image: registry.nestybox.com/nestybox/crio-deploy-k8s + imagePullPolicy: Always + command: [ "bash", "-c", "/opt/crio-deploy/scripts/crio-deploy-k8s.sh install" ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: true + volumeMounts: + - name: host-dbus + mountPath: /var/run/dbus + - name: host-run-systemd + mountPath: /run/systemd + - name: host-lib-systemd + mountPath: /mnt/host/lib/systemd/system + - name: host-usr-local-bin + mountPath: /mnt/host/usr/local/bin + - name: host-etc + mountPath: /mnt/host/etc + - name: host-run + mountPath: /mnt/host/run + volumes: + - name: host-dbus + hostPath: + path: /var/run/dbus + - name: host-run-systemd + hostPath: + path: /run/systemd + - name: host-lib-systemd + hostPath: + path: /lib/systemd/system + - name: host-usr-local-bin + hostPath: + path: /usr/local/bin/ + - name: host-etc + hostPath: + path: /etc + - name: host-run + hostPath: + path: /run + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate diff --git a/sysbox-pkgr/k8s/manifests/daemonset/sysbox-cleanup-k8s.yaml b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-cleanup-k8s.yaml new file mode 100644 index 00000000..f4b23bbd --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-cleanup-k8s.yaml @@ -0,0 +1,96 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sysbox-cleanup-k8s + namespace: kube-system +spec: + selector: + 
matchLabels: + sysbox-install: "yes" + template: + metadata: + labels: + sysbox-install: "yes" + spec: + serviceAccountName: sysbox-label-node + nodeSelector: + sysbox-runtime: running + containers: + - name: sysbox-cleanup-k8s + image: registry.nestybox.com/nestybox/sysbox-deploy-k8s + imagePullPolicy: Always + command: [ "bash", "-c", "/opt/sysbox/scripts/sysbox-deploy-k8s.sh ce cleanup" ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: true + volumeMounts: + - name: host-etc + mountPath: /mnt/host/etc + - name: host-osrelease + mountPath: /mnt/host/os-release + - name: host-dbus + mountPath: /var/run/dbus + - name: host-run-systemd + mountPath: /run/systemd + - name: host-lib-systemd + mountPath: /mnt/host/lib/systemd/system + - name: host-etc-systemd + mountPath: /mnt/host/etc/systemd/system + - name: host-usr-bin + mountPath: /mnt/host/usr/bin + - name: host-opt-bin + mountPath: /mnt/host/opt/bin + - name: host-usr-local-bin + mountPath: /mnt/host/usr/local/bin + - name: host-opt-local-bin + mountPath: /mnt/host/opt/local/bin + - name: host-run + mountPath: /mnt/host/run + - name: host-var-lib + mountPath: /mnt/host/var/lib + volumes: + - name: host-etc + hostPath: + path: /etc + - name: host-osrelease + hostPath: + path: /etc/os-release + - name: host-dbus + hostPath: + path: /var/run/dbus + - name: host-run-systemd + hostPath: + path: /run/systemd + - name: host-lib-systemd + hostPath: + path: /lib/systemd/system + - name: host-etc-systemd + hostPath: + path: /etc/systemd/system + - name: host-usr-bin + hostPath: + path: /usr/bin + - name: host-opt-bin + hostPath: + path: /opt/bin + - name: host-usr-local-bin + hostPath: + path: /usr/local/bin + - name: host-opt-local-bin + hostPath: + path: /opt/local/bin/ + - name: host-run + hostPath: + path: /run + - name: host-var-lib + hostPath: + path: /var/lib + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate diff --git 
a/sysbox-pkgr/k8s/manifests/daemonset/sysbox-deploy-k8s.yaml b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-deploy-k8s.yaml new file mode 100644 index 00000000..1afc943f --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-deploy-k8s.yaml @@ -0,0 +1,116 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sysbox-deploy-k8s + namespace: kube-system +spec: + selector: + matchLabels: + sysbox-install: "yes" + template: + metadata: + labels: + sysbox-install: "yes" + spec: + serviceAccountName: sysbox-label-node + nodeSelector: + sysbox-install: "yes" + containers: + - name: sysbox-deploy-k8s + image: registry.nestybox.com/nestybox/sysbox-deploy-k8s + imagePullPolicy: Always + command: [ "bash", "-c", "/opt/sysbox/scripts/sysbox-deploy-k8s.sh ce install" ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: true + volumeMounts: + - name: host-etc + mountPath: /mnt/host/etc + - name: host-osrelease + mountPath: /mnt/host/os-release + - name: host-dbus + mountPath: /var/run/dbus + - name: host-run-systemd + mountPath: /run/systemd + - name: host-lib-systemd + mountPath: /mnt/host/lib/systemd/system + - name: host-etc-systemd + mountPath: /mnt/host/etc/systemd/system + - name: host-lib-sysctl + mountPath: /mnt/host/lib/sysctl.d + - name: host-opt-lib-sysctl + mountPath: /mnt/host/opt/lib/sysctl.d + - name: host-usr-bin + mountPath: /mnt/host/usr/bin + - name: host-opt-bin + mountPath: /mnt/host/opt/bin + - name: host-usr-local-bin + mountPath: /mnt/host/usr/local/bin + - name: host-opt-local-bin + mountPath: /mnt/host/opt/local/bin + - name: host-usr-lib-mod-load + mountPath: /mnt/host/usr/lib/modules-load.d + - name: host-opt-lib-mod-load + mountPath: /mnt/host/opt/lib/modules-load.d + - name: host-run + mountPath: /mnt/host/run + - name: host-var-lib + mountPath: /mnt/host/var/lib + volumes: + - name: host-etc + hostPath: + path: /etc + - name: host-osrelease + hostPath: + path: 
/etc/os-release + - name: host-dbus + hostPath: + path: /var/run/dbus + - name: host-run-systemd + hostPath: + path: /run/systemd + - name: host-lib-systemd + hostPath: + path: /lib/systemd/system + - name: host-etc-systemd + hostPath: + path: /etc/systemd/system + - name: host-lib-sysctl + hostPath: + path: /lib/sysctl.d + - name: host-opt-lib-sysctl + hostPath: + path: /opt/lib/sysctl.d + - name: host-usr-bin + hostPath: + path: /usr/bin/ + - name: host-opt-bin + hostPath: + path: /opt/bin/ + - name: host-usr-local-bin + hostPath: + path: /usr/local/bin/ + - name: host-opt-local-bin + hostPath: + path: /opt/local/bin/ + - name: host-usr-lib-mod-load + hostPath: + path: /usr/lib/modules-load.d + - name: host-opt-lib-mod-load + hostPath: + path: /opt/lib/modules-load.d + - name: host-run + hostPath: + path: /run + - name: host-var-lib + hostPath: + path: /var/lib + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate diff --git a/sysbox-pkgr/k8s/manifests/daemonset/sysbox-ee-cleanup-k8s.yaml b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-ee-cleanup-k8s.yaml new file mode 100644 index 00000000..27c4d8d6 --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-ee-cleanup-k8s.yaml @@ -0,0 +1,96 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sysbox-ee-cleanup-k8s + namespace: kube-system +spec: + selector: + matchLabels: + sysbox-install: "yes" + template: + metadata: + labels: + sysbox-install: "yes" + spec: + serviceAccountName: sysbox-label-node + nodeSelector: + sysbox-runtime: running + containers: + - name: sysbox-ee-cleanup-k8s + image: registry.nestybox.com/nestybox/sysbox-ee-deploy-k8s + imagePullPolicy: Always + command: [ "bash", "-c", "/opt/sysbox/scripts/sysbox-deploy-k8s.sh ee cleanup" ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: true + volumeMounts: + - name: host-etc + mountPath: /mnt/host/etc + - name: host-osrelease + mountPath: 
/mnt/host/os-release + - name: host-dbus + mountPath: /var/run/dbus + - name: host-run-systemd + mountPath: /run/systemd + - name: host-lib-systemd + mountPath: /mnt/host/lib/systemd/system + - name: host-etc-systemd + mountPath: /mnt/host/etc/systemd/system + - name: host-usr-bin + mountPath: /mnt/host/usr/bin + - name: host-opt-bin + mountPath: /mnt/host/opt/bin + - name: host-usr-local-bin + mountPath: /mnt/host/usr/local/bin + - name: host-opt-local-bin + mountPath: /mnt/host/opt/local/bin + - name: host-run + mountPath: /mnt/host/run + - name: host-var-lib + mountPath: /mnt/host/var/lib + volumes: + - name: host-etc + hostPath: + path: /etc + - name: host-osrelease + hostPath: + path: /etc/os-release + - name: host-dbus + hostPath: + path: /var/run/dbus + - name: host-run-systemd + hostPath: + path: /run/systemd + - name: host-lib-systemd + hostPath: + path: /lib/systemd/system + - name: host-etc-systemd + hostPath: + path: /etc/systemd/system + - name: host-usr-bin + hostPath: + path: /usr/bin + - name: host-opt-bin + hostPath: + path: /opt/bin + - name: host-usr-local-bin + hostPath: + path: /usr/local/bin + - name: host-opt-local-bin + hostPath: + path: /opt/local/bin/ + - name: host-run + hostPath: + path: /run + - name: host-var-lib + hostPath: + path: /var/lib + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate diff --git a/sysbox-pkgr/k8s/manifests/daemonset/sysbox-ee-deploy-k8s.yaml b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-ee-deploy-k8s.yaml new file mode 100644 index 00000000..7f2e2b26 --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/daemonset/sysbox-ee-deploy-k8s.yaml @@ -0,0 +1,116 @@ +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: sysbox-ee-deploy-k8s + namespace: kube-system +spec: + selector: + matchLabels: + sysbox-install: "yes" + template: + metadata: + labels: + sysbox-install: "yes" + spec: + serviceAccountName: sysbox-label-node + nodeSelector: + sysbox-install: "yes" + containers: + - name: 
sysbox-ee-deploy-k8s + image: registry.nestybox.com/nestybox/sysbox-ee-deploy-k8s + imagePullPolicy: Always + command: [ "bash", "-c", "/opt/sysbox/scripts/sysbox-deploy-k8s.sh ee install" ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + privileged: true + volumeMounts: + - name: host-etc + mountPath: /mnt/host/etc + - name: host-osrelease + mountPath: /mnt/host/os-release + - name: host-dbus + mountPath: /var/run/dbus + - name: host-run-systemd + mountPath: /run/systemd + - name: host-lib-systemd + mountPath: /mnt/host/lib/systemd/system + - name: host-etc-systemd + mountPath: /mnt/host/etc/systemd/system + - name: host-lib-sysctl + mountPath: /mnt/host/lib/sysctl.d + - name: host-opt-lib-sysctl + mountPath: /mnt/host/opt/lib/sysctl.d + - name: host-usr-bin + mountPath: /mnt/host/usr/bin + - name: host-opt-bin + mountPath: /mnt/host/opt/bin + - name: host-usr-local-bin + mountPath: /mnt/host/usr/local/bin + - name: host-opt-local-bin + mountPath: /mnt/host/opt/local/bin + - name: host-usr-lib-mod-load + mountPath: /mnt/host/usr/lib/modules-load.d + - name: host-opt-lib-mod-load + mountPath: /mnt/host/opt/lib/modules-load.d + - name: host-run + mountPath: /mnt/host/run + - name: host-var-lib + mountPath: /mnt/host/var/lib + volumes: + - name: host-etc + hostPath: + path: /etc + - name: host-osrelease + hostPath: + path: /etc/os-release + - name: host-dbus + hostPath: + path: /var/run/dbus + - name: host-run-systemd + hostPath: + path: /run/systemd + - name: host-lib-systemd + hostPath: + path: /lib/systemd/system + - name: host-etc-systemd + hostPath: + path: /etc/systemd/system + - name: host-lib-sysctl + hostPath: + path: /lib/sysctl.d + - name: host-opt-lib-sysctl + hostPath: + path: /opt/lib/sysctl.d + - name: host-usr-bin + hostPath: + path: /usr/bin/ + - name: host-opt-bin + hostPath: + path: /opt/bin/ + - name: host-usr-local-bin + hostPath: + path: /usr/local/bin/ + - name: host-opt-local-bin + hostPath: 
+ path: /opt/local/bin/ + - name: host-usr-lib-mod-load + hostPath: + path: /usr/lib/modules-load.d + - name: host-opt-lib-mod-load + hostPath: + path: /opt/lib/modules-load.d + - name: host-run + hostPath: + path: /run + - name: host-var-lib + hostPath: + path: /var/lib + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate diff --git a/sysbox-pkgr/k8s/manifests/rbac/crio-deploy-rbac.yaml b/sysbox-pkgr/k8s/manifests/rbac/crio-deploy-rbac.yaml new file mode 100644 index 00000000..b6cbfb55 --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/rbac/crio-deploy-rbac.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: crio-label-node + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: crio-node-labeler +rules: +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "patch"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: crio-label-node-rb +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: crio-node-labeler +subjects: +- kind: ServiceAccount + name: crio-label-node + namespace: kube-system diff --git a/sysbox-pkgr/k8s/manifests/rbac/sysbox-rbac.yaml b/sysbox-pkgr/k8s/manifests/rbac/sysbox-rbac.yaml new file mode 100644 index 00000000..479dbede --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/rbac/sysbox-rbac.yaml @@ -0,0 +1,28 @@ +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: sysbox-label-node + namespace: kube-system +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: sysbox-node-labeler +rules: +- apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "patch"] +--- +kind: ClusterRoleBinding +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: sysbox-label-node-rb +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: sysbox-node-labeler +subjects: +- kind: ServiceAccount + name: sysbox-label-node + namespace: kube-system 
diff --git a/sysbox-pkgr/k8s/manifests/runtime-class/sysbox-runtimeclass.yaml b/sysbox-pkgr/k8s/manifests/runtime-class/sysbox-runtimeclass.yaml new file mode 100644 index 00000000..ebe23c42 --- /dev/null +++ b/sysbox-pkgr/k8s/manifests/runtime-class/sysbox-runtimeclass.yaml @@ -0,0 +1,8 @@ +apiVersion: node.k8s.io/v1beta1 +kind: RuntimeClass +metadata: + name: sysbox-runc +handler: sysbox-runc +scheduling: + nodeSelector: + sysbox-runtime: running diff --git a/sysbox-pkgr/k8s/scripts/crio-build.sh b/sysbox-pkgr/k8s/scripts/crio-build.sh new file mode 100755 index 00000000..838e71fe --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/crio-build.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# +# CRI-O build script (meant to run inside the CRI-O build container) +# +# The script will build the Nestybox-customized CRI-O and place the binaries +# under /mnt/results. It builds several versions of CRI-O. +# +# Usage: docker run -v $(pwd)/bin:/mnt/results crio-bld +# +# Note: refer to 'k8s/Makefile' for the CRIO_VERSIONS settings. +# + +# Split CRIO_VERSIONS comma-separated string into an array. +CRIO_VERSIONS_ARRAY=($(echo ${CRIO_VERSIONS} | tr "," "\n")) + +for ver in "${CRIO_VERSIONS_ARRAY[@]}"; do + if [ -f "/mnt/results/crio/${ver}/crio" ]; then \ + printf "\n*** Skip building CRI-O ${ver} -- binary already present ... ***\n\n" + continue + fi + printf "\n*** Building CRI-O ${ver} ... 
***\n\n" + TMPDIR=$(mktemp -d) + chmod 755 ${TMPDIR} + git clone https://github.com/nestybox/cri-o.git ${TMPDIR}/cri-o + git -C ${TMPDIR}/cri-o checkout -b ${ver}-sysbox origin/${ver}-sysbox + cd ${TMPDIR}/cri-o && make binaries + mkdir -p /mnt/results/crio/${ver} + cp ${TMPDIR}/cri-o/bin/crio-static /mnt/results/crio/${ver}/crio +done diff --git a/sysbox-pkgr/k8s/scripts/crio-extractor.sh b/sysbox-pkgr/k8s/scripts/crio-extractor.sh new file mode 100755 index 00000000..2185b294 --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/crio-extractor.sh @@ -0,0 +1,224 @@ +#!/bin/bash + +# +# Script to install and uninstall CRI-O from a tar archive. Typically needed in +# OSes that don't have a package manager (e.g., Flatcar). +# +# NOTE: adapted from the CRI-O release bundle Makefile +# found here: https://github.com/cri-o/cri-o/releases +# + +ETCDIR=/etc +CONTAINERS_DIR=${ETCDIR}/containers +CNIDIR=${ETCDIR}/cni/net.d +SYSTEMDDIR=${ETCDIR}/systemd/system +SELINUX=$(selinuxenabled 2>/dev/null && echo -Z) +OPT_CNI_BIN_DIR=/opt/cni/bin +VAR_LIB_SYSBOX_DEPLOY_K8S=/var/lib/sysbox-deploy-k8s + +function install_all() { + mkdir -p ${PREFIX} + install_cni + install_conmon + install_crio + install_crictl + install_pinns + install_runc + install_crun +} + +function install_cni() { + install ${SELINUX} -d -m 755 ${CNIDIR} + install ${SELINUX} -D -m 755 -t ${OPT_CNI_BIN_DIR} cni-plugins/* + cp contrib/10-crio-bridge.conf contrib/100-crio-bridge.conf + install ${SELINUX} -D -m 644 -t ${CNIDIR} contrib/100-crio-bridge.conf +} + +function install_conmon() { + # Ensure that both 'conmon' and 'crio-conmon' can be resolved and are properly installed + # regardless of how they are being packaged (i.e., 'conmon' is not packaged in crio v1.30+, + # and 'crio-conmon' is not packaged in v1.30-). + # + # For scenarios with crio v1.30- + if [ -f bin/conmon ]; then + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/conmon + if [ ! 
-f bin/crio-conmon ]; then + ln -s ${BINDIR}/conmon ${BINDIR}/crio-conmon + fi + fi + # For scenarios with crio v1.30 and v1.30+ + if [ -f bin/crio-conmon ]; then + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crio-conmon + if [ ! -f bin/conmon ]; then + ln -s ${BINDIR}/crio-conmon ${BINDIR}/conmon + fi + fi +} + + +function install_crictl() { + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crictl +} + +function install_crio() { + install ${SELINUX} -d -m 755 ${BASHINSTALLDIR} + install ${SELINUX} -d -m 755 ${FISHINSTALLDIR} + install ${SELINUX} -d -m 755 ${ZSHINSTALLDIR} + install ${SELINUX} -d -m 755 ${CONTAINERS_DIR} + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crio-status + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crio + install ${SELINUX} -D -m 644 -t ${ETCDIR} etc/crictl.yaml + install ${SELINUX} -D -m 644 -t ${OCIDIR} etc/crio-umount.conf + install ${SELINUX} -D -m 644 -t ${ETCDIR}/crio etc/crio.conf + install ${SELINUX} -D -m 644 -t ${MANDIR}/man5 man/crio.conf.5 + install ${SELINUX} -D -m 644 -t ${MANDIR}/man5 man/crio.conf.d.5 + install ${SELINUX} -D -m 644 -t ${MANDIR}/man8 man/crio.8 + install ${SELINUX} -D -m 644 -t ${BASHINSTALLDIR} completions/bash/crio + install ${SELINUX} -D -m 644 -t ${FISHINSTALLDIR} completions/fish/crio.fish + install ${SELINUX} -D -m 644 -t ${ZSHINSTALLDIR} completions/zsh/_crio + install ${SELINUX} -D -m 644 -t ${CONTAINERS_DIR} contrib/policy.json + install ${SELINUX} -D -m 644 -t ${SYSTEMDDIR} contrib/crio.service +} + +function install_pinns() { + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/pinns +} + +function install_runc() { + + # If runc exists on the host, use it; otherwise install our own. This is + # needed to avoid breaking pods that rely on the existing runc version. + + curr_runc=$(which runc) + if [ $? 
-eq 0 ]; then + if [[ "$curr_runc" != "${BINDIR}/bin/runc" ]]; then + echo "Using existing runc (soft-linking ${BINDIR}/runc -> $curr_runc)" + ln -s $curr_runc ${BINDIR}/runc + ln -s $curr_runc ${BINDIR}/crio-runc + mkdir -p ${VAR_LIB_SYSBOX_DEPLOY_K8S} && touch ${VAR_LIB_SYSBOX_DEPLOY_K8S}/linked_runc + fi + else + # For scenarios with crio v1.30- + if [ -f bin/runc ]; then + echo "Installing runc at ${BINDIR}/bin/runc" + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/runc + if [ ! -f bin/crio-runc ]; then + ln -s ${BINDIR}/runc ${BINDIR}/crio-runc + fi + fi + # For scenarios with crio v1.30 and v1.30+ + if [ -f bin/crio-runc ]; then + echo "Installing crio-runc at ${BINDIR}/bin/crio-runc" + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crio-runc + # Point traditional runc to crio-runc binary. + ln -s ${BINDIR}/crio-runc ${BINDIR}/runc + fi + mkdir -p ${VAR_LIB_SYSBOX_DEPLOY_K8S} && touch ${VAR_LIB_SYSBOX_DEPLOY_K8S}/installed_runc + fi +} + +function install_crun() { + if [ -f bin/crun ]; then + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crun + fi + if [ -f bin/crio-crun ]; then + install ${SELINUX} -D -m 755 -t ${BINDIR} bin/crio-crun + fi +} + +function uninstall_all() { + uninstall_cni + uninstall_conmon + uninstall_crio + uninstall_crictl + uninstall_pinns + uninstall_runc + uninstall_crun +} + +function uninstall_cni() { + rm ${CNIDIR}/100-crio-bridge.conf +} + +function uninstall_conmon() { + rm ${BINDIR}/conmon + rm ${BINDIR}/crio-conmon +} + +function uninstall_crictl() { + rm ${BINDIR}/crictl +} + +function uninstall_crio() { + rm ${BINDIR}/crio + rm ${BINDIR}/crio-status + rm ${ETCDIR}/crictl.yaml + rm ${OCIDIR}/crio-umount.conf + rm ${ETCDIR}/crio/crio.conf + rm ${MANDIR}/man5/crio.conf.5 + rm ${MANDIR}/man5/crio.conf.d.5 + rm ${MANDIR}/man8/crio.8 + rm ${BASHINSTALLDIR}/crio + rm ${FISHINSTALLDIR}/crio.fish + rm ${ZSHINSTALLDIR}/_crio + rm ${SYSTEMDDIR}/crio.service +} + +function uninstall_pinns() { + rm ${BINDIR}/pinns +} + +function 
uninstall_runc() { + + if [ -f ${VAR_LIB_SYSBOX_DEPLOY_K8S}/installed_runc ]; then + echo "Removing runc at ${BINDIR}/runc" + rm ${BINDIR}/runc + rm ${BINDIR}/crio-runc + rm ${VAR_LIB_SYSBOX_DEPLOY_K8S}/installed_runc + elif [ -f ${VAR_LIB_SYSBOX_DEPLOY_K8S}/linked_runc ]; then + echo "Removing runc softlink at ${BINDIR}/runc" + rm ${BINDIR}/runc + rm ${VAR_LIB_SYSBOX_DEPLOY_K8S}/linked_runc + fi + +} + +function uninstall_crun() { + if [ -f ${BINDIR}/crun ]; then + rm ${BINDIR}/crun + fi + if [ -f ${BINDIR}/crio-crun ]; then + rm ${BINDIR}/crio-crun + fi +} + +function main() { + + # Two parameters are expected: + # * Action: install / uninstall + # * Path: Top location where to install (uninstall) crio to (from). + + if [ "$#" -ne 2 ]; then + printf "\n" + printf "Usage: crio-extractor.sh [install | uninstall] path\n" + printf "\n" + exit 1 + fi + + # Set globals that depend on 'path' parameter. + PREFIX="$2" + BINDIR=${PREFIX}/bin + MANDIR=${PREFIX}/share/man + OCIDIR=${PREFIX}/share/oci-umount/oci-umount.d + BASHINSTALLDIR=${PREFIX}/share/bash-completion/completions + FISHINSTALLDIR=${PREFIX}/share/fish/completions + ZSHINSTALLDIR=${PREFIX}/share/zsh/site-functions + + if [[ "$1" == "install" ]]; then + install_all + elif [[ "$1" == "uninstall" ]]; then + uninstall_all + fi +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/crio-installer.sh b/sysbox-pkgr/k8s/scripts/crio-installer.sh new file mode 100755 index 00000000..925af00d --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/crio-installer.sh @@ -0,0 +1,196 @@ +#!/bin/bash + +# +# Copyright 2019-2021 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Script to install CRI-O on a host +# + +set -o errexit +set -o pipefail +set -o nounset + +function die() { + msg="$*" + echo "ERROR: $msg" >&2 + exit 1 +} + +function backup_crictl_config() { + + if [ -f /etc/crictl.yaml ]; then + mv /etc/crictl.yaml /etc/crictl.orig.yaml + fi +} + +function flatcar_distro() { + grep -q "^ID=flatcar" /etc/os-release +} + +function get_sys_arch() { + local uname_m=$(uname -m) + + if [[ "$uname_m" == "x86_64" ]]; then + sys_arch=amd64 + elif [[ "$uname_m" == "aarch64" ]]; then + sys_arch=arm64 + elif [[ "$uname_m" == "arm" ]]; then + sys_arch=armhf + elif [[ "$uname_m" == "armel" ]]; then + sys_arch=armel + fi + + echo "${sys_arch}" +} + +function get_crio_tar_file_name { + local sys_arch=$(get_sys_arch) + + echo "cri-o.${sys_arch}.tar.gz" +} + +function do_install_crio() { + local path=$1 + local crio_tar_file_name=$(get_crio_tar_file_name) + local crio_tar_file_path="${path}/${crio_tar_file_name}" + + # Extract and install the CRI-O (and related dependencies) binaries + pushd "$path" + tar -xvf "$crio_tar_file_path" + rm -r "$crio_tar_file_path" + pushd cri-o + + chmod +x "${path}"/crio-extractor.sh + local path_dir=$(dirname "$path") + "${path}"/crio-extractor.sh install "$path_dir" + rm -r ${path}/cri-o + + # Replace the stock CRI-O binary with the one that has the uid mapping patch + # required by Sysbox. + mv ${path}/crio-patched ${path}/crio + + # Adjust PATH env-var and crio's binary location if it doesn't match the default + # location. 
+ if [[ "$path" != "/usr/local/bin" ]]; then + sed -i "/Type=notify/a Environment=PATH=${path}:/sbin:/bin:/usr/sbin:/usr/bin" /etc/systemd/system/crio.service + sed -i "s@/usr/local/bin/crio@${path}/crio@" /etc/systemd/system/crio.service + fi + + # Create directories expected by CRI-O + mkdir -p /var/lib/crio +} + +function install_crio() { + + echo "Installing CRI-O ..." + + if flatcar_distro; then + do_install_crio "/opt/local/bin" + else + do_install_crio "/usr/local/bin" + fi + + # Ensure that cri-o service is automatically started at boot-up time. + systemctl enable crio + + echo "CRI-O installation done." +} + +function restart_crio() { + echo "Restarting CRI-O ..." + systemctl restart crio + systemctl is-active --quiet crio + echo "CRI-O restart done." +} + +# The instructions in this function are typically executed as part of the +# containers-common's deb-pkg installation (which is a dependency of the cri-o +# pkg) by creating the default config files required for cri-o operations. +# However, these config files are not part of the cri-o tar file that +# we're relying on in this installation process, so we must explicitly create +# this configuration state as part of the installation process. +function config_containers_common() { + + local config_files="/var/lib/sysbox-deploy-k8s" + local containers_dir="/etc/containers" + mkdir -p "$containers_dir" + + # Create a default system-wide registries.conf file and associated drop-in + # dir if not already present. + local reg_file="${containers_dir}/registries.conf" + if [ ! -f "$reg_file" ]; then + cp "${config_files}/etc_containers_registries.conf" "${reg_file}" + fi + + local reg_dropin_dir="${containers_dir}/registries.conf.d" + mkdir -p "$reg_dropin_dir" + + # Copy registry shortname config + local shortnames_conf_file="${reg_dropin_dir}/000-shortnames.conf" + if [ ! 
-f "$shortnames_conf_file" ]; then + cp "${config_files}/etc_containers_registries.conf.d_000-shortnames.conf" "${shortnames_conf_file}" + fi + + # Create a default registry-configuration file if not already present. + local reg_dir="${containers_dir}/registries.d" + mkdir -p "$reg_dir" + + local reg_def_file="${reg_dir}/default.yaml" + if [ ! -f "$reg_def_file" ]; then + cp "${config_files}/etc_containers_registries.d_default.yaml" "${reg_def_file}" + fi + + # Create a default storage.conf file if not already present. + local storage_conf_file="${containers_dir}/storage.conf" + if [ ! -f "$storage_conf_file" ]; then + cp "${config_files}/etc_containers_storage.conf" "${storage_conf_file}" + fi + + # Create a default policy.json file if not already present. + local policy_file="${containers_dir}/policy.json" + if [ ! -f "$policy_file" ]; then + cp "${config_files}/etc_containers_policy.json" "${policy_file}" + fi + + # Copy the default loopback CNI config file + local cni_dir="/etc/cni/net.d" + mkdir -p "$cni_dir" + + local lb_file="${cni_dir}/200-loopback.conf" + if [ ! -f "$lb_file" ]; then + cp "${config_files}/etc_cni_net.d_200-loopback.conf" "${lb_file}" + fi +} + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root" + fi + + if systemctl is-active crio; then + echo "CRI-O is already running; skipping installation." + exit 0 + fi + + backup_crictl_config + config_containers_common + install_crio +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/crio-removal.sh b/sysbox-pkgr/k8s/scripts/crio-removal.sh new file mode 100755 index 00000000..ed87a569 --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/crio-removal.sh @@ -0,0 +1,99 @@ +#!/bin/bash + +# +# Copyright 2019-2021 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Script to remove CRI-O from a host +# + +set -o errexit +set -o pipefail +set -o nounset + +CRIO_VERSION=1.21 + +function die() { + msg="$*" + echo "ERROR: $msg" >&2 + exit 1 +} + +function restore_crictl_config() { + if [ -f /etc/crictl.orig.yaml ]; then + mv /etc/crictl.orig.yaml /etc/crictl.yaml + fi +} + +function flatcar_distro() { + grep -q "^ID=flatcar" /etc/os-release +} + +function do_uninstall_crio() { + local path=$1 + + chmod +x "${path}"/crio-extractor.sh + local path_dir=$(dirname "$path") + "${path}"/crio-extractor.sh uninstall "$path_dir" +} + +function uninstall_crio() { + + echo "Uninstalling CRI-O ..." + + systemctl stop crio + systemctl disable crio + + if flatcar_distro; then + do_uninstall_crio "/opt/local/bin" + else + do_uninstall_crio "/usr/local/bin" + fi + + sed -i '/containers:/d' /etc/subuid + sed -i '/containers:/d' /etc/subgid + + echo "CRI-O uninstallation done." +} + +function stop_crio() { + echo "Stopping CRI-O ..." + systemctl stop crio + echo "CRI-O stop done." +} + +function is_crio_running() { + command -v crio >/dev/null 2>&1 +} + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root" + fi + + if ! is_crio_running; then + echo "CRI-O is not present; skipping removal." 
+ exit 0 + fi + + stop_crio + uninstall_crio + restore_crictl_config +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/kubelet-config-helper.sh b/sysbox-pkgr/k8s/scripts/kubelet-config-helper.sh new file mode 100755 index 00000000..d23f74dd --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/kubelet-config-helper.sh @@ -0,0 +1,1517 @@ +#!/bin/bash -x + +# +# Copyright 2019-2021 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Script to configure the kubelet with the CRI-O runtime on a host +# + +set -o errexit +set -o pipefail +set -o nounset + +var_lib_sysbox_deploy_k8s="/var/lib/sysbox-deploy-k8s" +crictl_bin="/usr/local/bin/sysbox-deploy-k8s-crictl" +crio_conf_file="/etc/crio/crio.conf" +crio_socket="/var/run/crio/crio.sock" +crio_runtime="unix://${crio_socket}" +kubelet_bin="" +runtime="" + +# Container's default restart-policy mode (i.e. no restart). +kubelet_ctr_restart_mode="no" + +# Kubelet's initialization command and parameters. We are caching +# this value as a global var for performance reasons. +execstart_line_global="" + +function die() { + msg="$*" + echo "ERROR: $msg" >&2 + exit 1 +} + +function start_kubelet() { + echo "Starting Kubelet ..." + systemctl start kubelet +} + +function restart_kubelet() { + echo "Restarting Kubelet ..." + systemctl restart kubelet +} + +function stop_kubelet() { + echo "Stopping Kubelet ..." + systemctl stop kubelet +} + +function start_containerd() { + echo "Starting containerd on the host ..." 
+ systemctl start containerd.service +} + +function stop_containerd() { + echo "Stopping containerd on the host ..." + systemctl stop containerd.service +} + +function get_pods_uids() { + $crictl_bin --runtime-endpoint ${runtime} pods -v | egrep ^UID | cut -d" " -f2 +} + +# Sets the restart-policy mode for any given docker container. +function set_ctr_restart_policy() { + local cntr=$1 + local mode=$2 + + # Docker's supported restart-policy modes. + if [[ $mode != "no" ]] && + [[ $mode != "always" ]] && + [[ $mode != "on-failure" ]] && + [[ $mode != "unless-stopped" ]]; then + echo "Unsupported restart-policy mode: $mode" + return + fi + + if ! docker update --restart=$mode $cntr; then + echo "Unable to modify container $cntr restart mode to $mode." + return + fi + + echo "Successfully modified $cntr container's restart-policy to mode: $mode." +} + +# Sets the restart-policy mode for the kubelet docker container. +function set_kubelet_ctr_restart_policy() { + local mode=$1 + + kubelet_ctr_restart_mode=$(docker inspect --format='{{.HostConfig.RestartPolicy.Name}}' kubelet) + + set_ctr_restart_policy "kubelet" $mode +} + +# Reverts the restart-policy mode previously stored in a global-variable. +function revert_kubelet_ctr_restart_policy() { + set_ctr_restart_policy "kubelet" $kubelet_ctr_restart_mode +} + +function get_kubelet_env_files() { + systemctl show kubelet | grep EnvironmentFile | awk '{ print $1 }' | cut -d"=" -f2 | tr "\n" " " +} + +function get_kubelet_env_var() { + local env_var=$(systemctl show kubelet | grep ExecStart= | cut -d ";" -f2 | sed -e 's@argv\[\]=${kubelet_bin}@@g' | awk '{print $NF}') + + if ! echo ${env_var} | grep -q "^\\$"; then + echo "" + return + fi + + env_var=${env_var#"$"} + echo ${env_var} +} + +# Extract kubelet's execution attribute-value associated to a given attribute from +# the exec-line passed by caller. 
+function parse_kubelet_exec_attr_val() { + local exec_attr=$1 + local exec_line=$2 + + # Attempt to extract attribute assuming "equal" (i.e. "=") based format being + # used (most common case). Example: --config=/home/kubernetes/kubelet-config.yaml. + # A full match between 'exec_attr' string and 'exec_attr_val' one indicates that + # no valid 'exec_attr_val' has been found. + local exec_attr_val=$(echo "$exec_line" | sed 's/ /\n/g' | egrep "^--${exec_attr}" | cut -d"=" -f2 | tr -d \'\") + if [[ ! "$exec_attr_val" == "--${exec_attr}" ]]; then + echo "$exec_attr_val" + return + fi + + # Attempt to extract attribute assuming "space" based format being used. + # Example: --config /home/kubernetes/kubelet-config.yaml. + local exec_attr_val=$(echo "$exec_line" | sed 's/ /\n/g' | egrep -C1 "^--${exec_attr}" | tail -1 | tr -d \'\") + if [[ ! "$exec_attr_val" == "--${exec_attr}" ]]; then + echo "$exec_attr_val" + return + fi + + echo "" +} + +function get_kubelet_env_vars() { + systemctl show kubelet.service | egrep "ExecStart=" | + cut -d ";" -f2 | sed -e 's@argv\[\]=${kubelet_bin}@@g' | + sed 's/ /\n/g' | egrep "^\\$" | sed 's/\$//g' +} + +function get_kubelet_env_vars_content() { + systemctl show kubelet.service -p Environment --no-pager +} + +function get_kubelet_service_file() { + systemctl show kubelet | grep "^FragmentPath" | cut -d "=" -f2 +} + +function get_kubelet_service_dropin_files() { + systemctl show kubelet | grep "^DropInPaths" | cut -d "=" -f2 +} + +function get_kubelet_service_execstart() { + # Note that "no-pager" attribute is necessary to prevent systemctl's output + # from being truncated in cases with a long set of command attributes. 
+ systemctl show kubelet.service -p ExecStart --no-pager | cut -d";" -f2 | sed 's@argv\[\]=@@' | sed 's@^ @@' +} + +function replace_cmd_option { + cmd_opts=$1 + opt=$2 + want_val=$3 + + read -a curr_args <<<${cmd_opts} + declare -a new_args + + found_opt=false + + for arg in "${curr_args[@]}"; do + + new_arg=$arg + + if [[ "$arg" == "$opt="* ]]; then + found_opt=true + val=${arg#"$opt="} + if [[ "$val" != "$want_val" ]]; then + new_arg="$opt=$want_val" + fi + fi + + new_args+=($new_arg) + done + + result=$(printf "%s " "${new_args[@]}") + + if ! $found_opt; then + result="$result $opt=$want_val" + fi + + echo $result +} + +# Adds the kubelet config in the given file; the given $env_file may or may not +# exist. If it does not exist, this function will create it and add or replace +# a definiton for $env_var +function add_kubelet_env_var() { + local env_file=$1 + local env_var=$2 + + # If the extra args file does not have the extra args/opts, add them as needed + if [ ! -f "$env_file" ]; then + mkdir -p $(dirname "$env_file") + touch "$env_file" + echo "Created kubelet env file $env_file" + fi + + if ! grep -q "$env_var" "$env_file"; then + echo "$env_var=\"\"" >>"$env_file" + fi + + replace_kubelet_env_var "$env_file" "$env_var" +} + +# Replaces the kubelet config in the given file; the given $env_file is assumed +# to exist and have a definition for $env_var in it already. +function replace_kubelet_env_var() { + local env_file=$1 + local env_var=$2 + + readarray -t opts <${var_lib_sysbox_deploy_k8s}/crio-kubelet-options + + # add newline at end of $env_file if not present + sed -i '$a\' "$env_file" + + touch tmp.txt + + while read -r line; do + new_line=$line + + # ignore comment lines + if [[ "$line" == "#*" ]]; then + continue + fi + + # replace the options in the line(s) starting with $env_var + if [[ "$line" == "$env_var="* ]]; then + + line_prefix="$env_var" + + # Handle quoted or unquoted variable definitions ($env_var="..." or $env_var=...) 
+ if [[ "$line" == "$line_prefix=\""* ]]; then + line_opts=$(echo $line | cut -d'"' -f2) + else + line_opts=${line#"$line_prefix="} + fi + + set +e + for opt in "${opts[@]}"; do + opt_name=$(echo $opt | cut -d"=" -f1) + opt_val=$(echo $opt | cut -d"=" -f2) + if [[ "$opt_name" != "" ]] && [[ "$opt_val" != "" ]]; then + if [[ $line_opts != "" ]]; then + line_opts=$(replace_cmd_option "$line_opts" "$opt_name" "$opt_val") + else + # If there are no existing kubelet-config attrs to replace, then + # simply take the crio-kubelet options being provided. + line_opts="${opt_name}=${opt_val}" + fi + fi + done + set -e + + new_line="$line_prefix=\"$line_opts\"" + fi + + echo $new_line >>tmp.txt + + done <"$env_file" + mv tmp.txt "$env_file" + + echo "Modified kubelet env var $env_var in $env_file" +} + +function get_flat_file() { + local file=$1 + + local flat_file=$(sed ':x; /\\$/ { N; s/\\\n//; tx }' $file | tr -s ' ') + echo "$flat_file" +} + +function get_kubelet_exec_line_from_systemd_docker() { + + local execstart_pre_line=$(systemctl show kubelet.service -p ExecStartPre --no-pager | egrep "docker run.*--name=kubelet" | cut -d";" -f2 | sed 's@argv\[\]=@@' | sed 's@^ @@' | xargs) + if [ ! -z "$execstart_pre_line" ]; then + echo "$execstart_pre_line" + return + fi + + local execstart_line=$(systemctl show kubelet.service -p ExecStart --no-pager | egrep "docker run.*--name=kubelet" | cut -d";" -f2 | sed 's@argv\[\]=@@' | sed 's@^ @@' | xargs) + if [ ! -z "$execstart_line" ]; then + echo "$execstart_line" + return + fi +} + +function get_kubelet_exec_line_from_systemd_regular() { + + local execstart_pre_line=$(systemctl show kubelet.service -p ExecStartPre --no-pager | egrep "$kubelet_bin" | cut -d";" -f2 | sed 's@argv\[\]=@@' | sed 's@^ @@' | xargs) + if [ ! 
-z "$execstart_pre_line" ]; then + echo "$execstart_pre_line" + return + fi + + local execstart_line=$(systemctl show kubelet.service -p ExecStart --no-pager | egrep "$kubelet_bin" | cut -d";" -f2 | sed 's@argv\[\]=@@' | sed 's@^ @@' | xargs) + if [ ! -z "$execstart_line" ]; then + echo "$execstart_line" + return + fi +} + +function get_kubelet_exec_line_from_systemd() { + if [ ! -z "$execstart_line_global" ]; then + echo "$execstart_line_global" + return + fi + + local execstart_line=$(get_kubelet_exec_line_from_systemd_docker) + if [ ! -z "$execstart_line" ]; then + execstart_line_global="$execstart_line" + echo "$execstart_line" + return + fi + + local execstart_line=$(get_kubelet_exec_line_from_systemd_regular) + if [ ! -z "$execstart_line" ]; then + execstart_line_global="$execstart_line" + echo "$execstart_line" + return + fi +} + +function get_kubelet_exec_line_from_shell() { + if [ ! -z "$execstart_line_global" ]; then + echo "$execstart_line_global" + return + fi + + local execstart_line=$(ps -e -o command | egrep "^kubelet") + execstart_line_global="$execstart_line" + echo "$execstart_line" +} + +function get_kubelet_exec_line() { + + local execstart_line=$(get_kubelet_exec_line_from_systemd) + if [ ! -z "$execstart_line" ]; then + echo "$execstart_line" + return + fi + + local execstart_line=$(get_kubelet_exec_line_from_shell) + if [ ! -z "$execstart_line" ]; then + echo "$execstart_line" + return + fi +} + +# Our purpose in this function is to identify the kubelet systemd file that +# contains the ExecStart line matching the "exec_line" parameter being passed +# by the caller. +# +# Note that for this comparison logic to succeed we must take into account +# that the attributes displayed by "systemctl show" command, which serve to +# extract the "exec_line", are massaged by systemd and shown attending to this: +# +# * Duplicated space characters are eliminated. That is, a single space char is +# displayed between the ExecStart attributes. 
+# * Single and double quote characters are also eliminated. +# +function get_kubelet_systemd_file_per_exec() { + local exec_line=$1 + + # Let's identify all the existing drop-in files associated with the kubelet service + # to look for an exec_line match -- these have preference so we must start here. + local dropinFiles=$(get_kubelet_service_dropin_files) + for f in ${dropinFiles}; do + if [ ! -z "$f" ]; then + local flat_dropin_str=$(sed ':x; /\\$/ { N; s/\\\n//; tx }' $f | tr -s ' ' | tr -d \'\") + if [[ "$flat_dropin_str" =~ "$exec_line" ]]; then + echo "$f" + return + fi + fi + done + + local service_file=$(get_kubelet_service_file) + if [ ! -z "$service_file" ]; then + local flat_service_str=$(sed ':x; /\\$/ { N; s/\\\n//; tx }' $service_file | tr -s ' ' | tr -d \'\") + if [[ "$flat_service_str" =~ "$exec_line" ]]; then + echo "$service_file" + return + fi + fi +} + +# Function adjusts the kubelet exec instruction to satisfy the crio requirements. +# +# The following changes are required: +# +# * In docker-based kubelet setups we must add /var/lib/containers bind-mount as +# kubelet interacts with files in this path. For doing this we rely on the +# presence of /var/lib/docker as a reference to the location where the +# /var/lib/containers mount entry must be appended. +# +# * Also, we must append the passed env-var to the end of the exec instruction. +# This env-var is expected to hold all the crio-specific config parameters. +# +function adjust_kubelet_exec_instruction() { + local systemd_file=$1 + local env_var=$2 + local kubelet_mode=$3 + + local search_mode="on" + local new_line + + # Let's make a backup copy of the original file. + backup_config "$systemd_file" "kubelet_systemd_file" + + touch tmp.txt + + # Set IFS to nil to prevent file lines from being split (by default IFS is + # set to \sp\t\n). 
+ IFS='' + + while read -r line; do + new_line=$line + + if [[ "$search_mode" == "on" ]]; then + if echo "$new_line" | egrep -q "^ExecStart.*=\S*kubelet " || + echo "$new_line" | egrep -q "^ExecStart.*=docker run"; then + search_mode="found" + fi + fi + + # If the search pattern was already found, look for the different sections + # of the exec instruction that we want to edit: + # + # * Multi-line /var/lib/docker: Append /var/lib/containers bind-mount. + # * Single-line /var/lib/docker: Append /var/lib/containers bind-mount. + # * Exec's last line: Append crio's env-var. + if [[ "$search_mode" == "found" ]]; then + + if [[ "$kubelet_mode" == "docker-based" ]]; then + if echo "$new_line" | egrep -q "\-v /var/lib/docker:/var/lib/docker:rw.*\\\\ *$"; then + new_line=$(printf '%s\n -v /var/lib/containers:/var/lib/containers:rw \\\n' "$new_line") + + elif echo "$new_line" | egrep -q "\-v /var/lib/docker:/var/lib/docker:rw.*$"; then + new_line=$(echo $new_line | sed 's@-v /var/lib/docker:/var/lib/docker:rw@& -v /var/lib/containers:/var/lib/containers:rw@') + fi + fi + + if ! echo "$new_line" | egrep -q "\\\\ *$"; then + new_line=$(printf '%s \\\n $%s' $new_line $env_var) + search_mode="off" + fi + fi + + echo $new_line >>tmp.txt + + done <"$systemd_file" + + # Remember to unset IFS to avoid headaches down the road. + unset IFS + + mv tmp.txt "$systemd_file" + + echo "Adjusted exec instruction in kubelet's service file \"$systemd_file\"." +} + +# As its name implies, this function's goal is to carry out all the steps that +# are necessary to configure kubelet to use cri-o in systemd-managed deployments. +# +# The relative complexity of this function and its helper routines is simply a +# consequence of the multiple variables to account for due to the various ways +# in which systemd-managed apps can be configured. 
+# +# This function addresses all the combinations that derive from mixing these +# variables: +# +# * Kubelet can be configured through a systemd service file or through an +# associated drop-in file. +# * Kubelet can be launched through either an 'ExecStart' instruction or through +# any of the multiple instructions within a 'ExecStartPre' clauses. +# * Kubelet can be directly managed through a systemd service, or indirectly +# through a systemd-managed docker container. +# * Kubelet can be instantiated through a single-line instruction or from a +# multi-line one. +# +# Once that the proper file/line where to inject the new config state is +# identified, this function will simply append an env-var holding the cri-o +# config attributes. The content of this variable will be stored in any of the +# pre-existing env-files within the kubelet service, or a new file if none is +# found. +function config_kubelet() { + local kubelet_mode=$1 + + local kubelet_env_var="KUBELET_CRIO_ARGS" + + # Identify the exec-line. + local exec_line=$(get_kubelet_exec_line_from_systemd) + if [ -z "$exec_line" ]; then + die "No Kubelet execution instruction could be identified." + fi + + # Identify the systemd file where the exec-line lives. + local systemd_file=$(get_kubelet_systemd_file_per_exec "$exec_line") + if [ -z "$systemd_file" ]; then + die "No Kubelet systemd file could be identified for exec-line." + fi + + # Adjust the ExecStart instruction to satisfy this setup. + adjust_kubelet_exec_instruction "$systemd_file" "$kubelet_env_var" "$kubelet_mode" + + # If systemd shows no kubelet environment files, let's create one. 
+ local kubelet_env_files=$(get_kubelet_env_files) + local kubelet_env_file + + if [[ "$kubelet_env_files" == "" ]]; then + kubelet_env_file="/etc/default/kubelet" + touch "$kubelet_env_file" + else + kubelet_env_file=$(echo "$kubelet_env_files" | awk '{print $NF}') + fi + + backup_config "$kubelet_env_file" "kubelet_env_file" + + # Append the new env-var content to one of the env-files. + add_kubelet_env_var "$kubelet_env_file" "$kubelet_env_var" + + # Ask systemd to reload it's config. + systemctl daemon-reload +} + +function backup_config() { + local file=$1 + local type=$2 + + local config_file="${var_lib_sysbox_deploy_k8s}/config" + + mkdir -p "$var_lib_sysbox_deploy_k8s" + + if [ ! -f "$file" ]; then + return + fi + + if [[ "$type" == "kubelet_systemd_file" ]]; then + echo "kubelet_systemd_file=${file}" >>"$config_file" + elif [[ "$type" == "kubelet_env_file" ]]; then + echo "kubelet_env_file=${file}" >>"$config_file" + else + return + fi + + cp "$file" "${var_lib_sysbox_deploy_k8s}"/"${type}.orig" +} + +# Searches the passed exec_attr throughout all the 'Environment' clauses defined in +# the kubelet service file and associated drop-in files. 
Function must be able to parse +# relatively complex/long one-liners like this one: +# Environment=KUBELET_CGROUP_FLAGS=--cgroup-driver=systemd \ +# "KUBELET_CONTAINERD_FLAGS=--container-runtime=remote \ +# --runtime-request-timeout=15m --container-runtime-endpoint=unix:///run/containerd/containerd.sock \ +# --runtime-cgroups=/system.slice/containerd.service" "KUBELET_TLS_BOOTSTRAP_FLAGS=--kubeconfig \ +# /var/lib/kubelet/kubeconfig --bootstrap-kubeconfig /var/lib/kubelet/bootstrap-kubeconfig" +# +function get_kubelet_config_attr_from_systemd_environment() { + local exec_attr=$1 + + if [ -z "$exec_attr" ]; then + echo "" + return + fi + + local env_vars_content=$(get_kubelet_env_vars_content) + if [[ "$env_vars_content" =~ "$exec_attr" ]]; then + # Parse equal-signed based attribute: Example: "--cgroup-driver=systemd" + local exec_attr_val=$(echo "$env_vars_content" | awk -F"--${exec_attr}=" '{print $2}' | cut -d' ' -f1 | tr -d \'\") + if [ ! -z "$exec_attr_val" ]; then + echo "$exec_attr_val" + return + fi + + # Parse space-based attribute. Example: "--bootstrap-kubeconfig /var/lib/kubelet/bootstrap-kubeconfig" + local exec_attr_val=$(echo "$env_vars_content" | awk -F"--${exec_attr} " '{print $2}' | cut -d' ' -f1 | tr -d \'\") + if [ ! -z "$exec_attr_val" ]; then + echo "$exec_attr_val" + return + fi + fi +} + +# Searches the passed exec_attr throughout all the 'ExecStart' clauses defined in +# the kubelet service file and associated drop-in files. +function get_kubelet_config_attr_from_systemd_execstart() { + local exec_attr=$1 + + if [ -z "$exec_attr" ]; then + return + fi + + local exec_line=$(get_kubelet_exec_line_from_systemd) + if [ -z "$exec_line" ]; then + return + fi + + local exec_attr_val=$(parse_kubelet_exec_attr_val "$exec_attr" "$exec_line") + echo "$exec_attr_val" +} + +# Function attempts to identify the value assigned to the attribute being passed. 
To +# accomplish its goal the function iterates through all the systemd files associated +# with the kubelet service, which includes the kubelet service file, kubelet's drop-in +# files, and external environment-files. +function get_kubelet_config_attr_from_systemd() { + local exec_attr=$1 + + if [ -z "$exec_attr" ]; then + return + fi + + # Let's identify all the existing 'Environment' attributes defined within the kubelet service file + # as well as its drop-in files. + local exec_attr_val=$(get_kubelet_config_attr_from_systemd_environment "$exec_attr") + if [ ! -z "$exec_attr_val" ]; then + echo "$exec_attr_val" + return + fi + + # Look directly into the list of 'ExecStart' attributes used within the kubelet service file. + local exec_attr_val=$(get_kubelet_config_attr_from_systemd_execstart "$exec_attr") + if [ ! -z "$exec_attr_val" ]; then + echo "$exec_attr_val" + return + fi + + # Let's now look within any existing 'EnvironmentFile' (e.g., '/etc/default/kubelet'). + # To be thorough we must iterate through the matrix formed by all env-files and + # env-vars to look for the exec attribute we are after. If found, return its value. + local env_files=$(get_kubelet_env_files) + local env_vars=$(get_kubelet_env_vars) + + for file in $env_files; do + for var in $env_vars; do + if [ ! -f "$file" ]; then + continue + fi + + var=${var#"$"} + + if grep -q "$var" "$file"; then + local exec_line=$(cat "$file") + local exec_attr_val=$(parse_kubelet_exec_attr_val "$exec_attr" "$exec_line") + echo "$exec_attr_val" + return + fi + done + done +} + +# Function obtains the kubelet config file (as displayed within kubelet's +# systemd file) and then search for the passed config attribute. +function get_kubelet_config_attr_from_config_file() { + local config_attr=$1 + + if [ -z "$config_attr" ]; then + return + fi + + # Let's start by identifying the kubelet config file. + # TODO: What if there's no explicit one defined? Is there a default one? 
+ local kubelet_cfg_file=$(get_kubelet_config_attr_from_systemd "config") + + # Check if there's a matching config_attr in the kubelet config file and return + # its associated value if present. + if [ ! -z "$kubelet_cfg_file" ]; then + local config_attr_val=$(egrep "$config_attr" "$kubelet_cfg_file" | cut -d":" -f2 | tr -d ' ,"') + echo "$config_attr_val" + return + fi +} + +# Function extracts kubelet config attributes from a snap-based kubelet +# setup. +function get_kubelet_config_attr_from_snap() { + local config_attr=$1 + + # Return if not within a kubelet-snap setup. + if [ -z "${kubelet_snap:-}" ]; then + return + fi + + snap get $kubelet_snap $config_attr +} + +# Function obtains the kubelet exec instruction as it appears in linux shell. +function get_kubelet_config_attr_from_shell() { + local exec_attr="$1" + + if [ -z "$exec_attr" ]; then + return + fi + + local execstart_line=$(get_kubelet_exec_line_from_shell) + if [ -z "$execstart_line" ]; then + return + fi + + local exec_attr_val=$(parse_kubelet_exec_attr_val "$exec_attr" "$execstart_line") + echo "$exec_attr_val" +} + +# Function takes care of reconciliating operational attributes that can +# potentially overlap between 'kubelet' and 'crio' components. In this scenario +# we want to translate kubelet's overlapping attribute to the one understood by +# crio's config-parser, so that both components operate in-sync. For lack of a +# better word, we refer to these potentially-overlapped attributes as crio's +# 'config-dependencies'. +# +# The following config-dependencies have been identified so far: +# +# * --pod-infra-container-image: Initially conceived for dockershim consumption +# to allow user to define the "pause" image to utilize. This attribute's +# semantic has been expanded now to offer kubelet a mechanism to prevent this +# special image from being pruned by K8s GC. 
In CRI-O's case, there's an +# equivalent attribute for this purpose, which must reflect the value set by +# kubelet in dockershim scenarios. +# +# * --cni-conf-dir: Path utilized by kubelet to find the CNI configuration +# attributes (defaults to /etc/cni/net.d). +# +# * --cni-bin-dir: The full path of the directory in which to search for CNI +# plugin binaries. +# +# * --cgroup-driver: Driver utilized by kubelet to manipulate cgroups on the +# host. +# +# TODO: Review the list of kubelet attributes to identify other 'overlapping' +# parameters (if any). +function adjust_crio_config_dependencies() { + local crio_sighup=false + local crio_restart=false + + # If kubelet is currently running with an explicit "infra" (pause) image, then + # adjust crio.conf to honor that request. + local pause_image_systemd=$(get_kubelet_config_attr_from_systemd "pod-infra-container-image") + local pause_image_snap=$(get_kubelet_config_attr_from_snap "pod-infra-container-image") + local pause_image_exec=$(get_kubelet_config_attr_from_shell "pod-infra-container-image") + local pause_image + if [ ! -z "$pause_image_systemd" ]; then + pause_image=$pause_image_systemd + # Skipping for now due to issue #550. + # elif [ ! -z "$pause_image_snap" ]; then + # pause_image=$pause_image_snap + elif [ ! -z "$pause_image_exec" ]; then + pause_image=$pause_image_exec + fi + + if [ ! -z "${pause_image:-}" ]; then + # kubelet/crio seems to be unable to pull images that have both a tag and + # a digest attribute, so here we are eliminating the tag one if digests are + # present. + pause_image=$(echo $pause_image | sed 's/:.*@/@/') + + # Adjust crio.conf with kubelet's 'pause-image' attribute. + if egrep -q "pause_image =" $crio_conf_file; then + sed -i "s@pause_image =.*@pause_image = \"${pause_image}\"@" $crio_conf_file + else + sed -i "/\[crio.image\]/a \ pause_image = \"${pause_image}\"" $crio_conf_file + fi + crio_sighup=true + fi + + # + # Adjust crio.conf with kubelet's view of 'cni-conf-dir'. 
+ # + local cni_conf_dir_systemd=$(get_kubelet_config_attr_from_systemd "cni-conf-dir") + local cni_conf_dir_snap=$(get_kubelet_config_attr_from_snap "cni-conf-dir") + local cni_conf_dir_exec=$(get_kubelet_config_attr_from_shell "cni-conf-dir") + local cni_conf_dir + if [ ! -z "$cni_conf_dir_systemd" ]; then + cni_conf_dir=$cni_conf_dir_systemd + elif [ ! -z "$cni_conf_dir_snap" ]; then + cni_conf_dir=$cni_conf_dir_snap + elif [ ! -z "$cni_conf_dir_exec" ]; then + cni_conf_dir=$cni_conf_dir_exec + fi + + if [ ! -z "${cni_conf_dir:-}" ] && [[ $cni_conf_dir != "/etc/cni/net.d" ]]; then + if egrep -q "network_dir =" $crio_conf_file; then + sed -i "s@network_dir =.*@network_dir = \"${cni_conf_dir}\"@" $crio_conf_file + else + sed -i "/\[crio.network\]/a \ network_dir = \"${cni_conf_dir}\"" $crio_conf_file + fi + crio_restart=true + fi + + # + # Adjust crio.conf with kubelet's view of 'cni-bin-dir'. + # + local cni_bin_dir_systemd=$(get_kubelet_config_attr_from_systemd "cni-bin-dir") + local cni_bin_dir_snap=$(get_kubelet_config_attr_from_snap "cni-bin-dir") + local cni_bin_dir_exec=$(get_kubelet_config_attr_from_shell "cni-bin-dir") + local cni_bin_dir + if [ ! -z "$cni_bin_dir_systemd" ]; then + cni_bin_dir=$cni_bin_dir_systemd + elif [ ! -z "$cni_bin_dir_snap" ]; then + cni_bin_dir=$cni_bin_dir_snap + elif [ ! -z "$cni_bin_dir_exec" ]; then + cni_bin_dir=$cni_bin_dir_exec + fi + + if [ ! -z "${cni_bin_dir:-}" ]; then + if egrep -q "plugin_dirs =" $crio_conf_file; then + sed -i "s@plugin_dirs =.*@plugin_dirs = [\"${cni_bin_dir}\"]@" $crio_conf_file + else + sed -i "/\[crio.network\]/a \ plugin_dirs = [\"${cni_bin_dir}\"]" $crio_conf_file + fi + crio_restart=true + fi + + # Adjust crio.conf with the cgroup driver configured by kubelet. As of today (Mar-2023), kubelet + # still defaults to `cgroupfs`. 
+ local cgroup_driver_systemd=$(get_kubelet_config_attr_from_systemd "cgroup-driver") + local cgroup_driver_config=$(get_kubelet_config_attr_from_config_file "cgroupDriver") + local cgroup_driver_snap=$(get_kubelet_config_attr_from_snap "cgroup-driver") + local cgroup_driver_exec=$(get_kubelet_config_attr_from_shell "cgroup-driver") + local cgroup_driver + if [ ! -z "$cgroup_driver_config" ]; then + cgroup_driver=$cgroup_driver_config + elif [ ! -z "$cgroup_driver_systemd" ]; then + cgroup_driver=$cgroup_driver_systemd + elif [ ! -z "$cgroup_driver_snap" ]; then + cgroup_driver=$cgroup_driver_snap + elif [ ! -z "$cgroup_driver_exec" ]; then + cgroup_driver=$cgroup_driver_exec + else + cgroup_driver="cgroupfs" + fi + + # crio-o defaults to "systemd" cgroup driver, so we must only deal with scenarios where + # kubelet is operating in "cgroupfs" mode. + if [ ! -z "${cgroup_driver:-}" ]; then + if egrep -q "cgroup_manager =" $crio_conf_file; then + sed -i "s@cgroup_manager =.*@cgroup_manager = \"${cgroup_driver}\"@" $crio_conf_file + else + sed -i "/\[crio.runtime\]/a \ cgroup_manager = \"${cgroup_driver}\"" $crio_conf_file + fi + + # In 'cgroupfs' mode, the 'conmon-group' attribute must be defined as below. + if [[ "$cgroup_driver" == "cgroupfs" ]]; then + if egrep -q "conmon_cgroup =" $crio_conf_file; then + sed -i "s@conmon_cgroup =.*@conmon_cgroup = \"pod\"@" $crio_conf_file + else + sed -i "/\[crio.runtime\]/a \ conmon_cgroup = \"pod\"" $crio_conf_file + fi + fi + + # Likewise, 'systemd' mode requires 'conmon-group' attribute to be set as below. + if [[ "$cgroup_driver" == "systemd" ]]; then + if egrep -q "conmon_cgroup =" $crio_conf_file; then + sed -i "s@conmon_cgroup =.*@conmon_cgroup = \"system.slice\"@" $crio_conf_file + else + sed -i "/\[crio.runtime\]/a \ conmon_cgroup = \"system.slice\"" $crio_conf_file + fi + fi + + crio_restart=true + fi + + # Process crio changes. 
+ if [[ "$crio_sighup" == "true" ]]; then + pkill -HUP crio + fi + + if [[ "$crio_restart" == "true" ]]; then + echo "Restarting CRI-O due to unmet Kubelet's config dependencies ..." + systemctl restart crio + fi +} + +function clean_runtime_state_containerd() { + local runtime=$1 + local runtime_path=$(echo $runtime | sed 's@unix://@@' | cut -d" " -f1) + + # Collect all the existing podIds as seen by crictl. + podList=$($crictl_bin --runtime-endpoint "$runtime" pods | awk 'NR>1 {print $1}') + + # Turn off errexit in these steps as we don't want to interrupt the process + # if any of the instructions fail for a particular pod / container. + set +e + + # Stop / remove all the existing pods. + for pod in ${podList}; do + ret=$($crictl_bin --runtime-endpoint "$runtime" stopp "$pod") + if [ $? -ne 0 ]; then + echo "Failed to stop pod ${pod}: ${ret}" + fi + + ret=$($crictl_bin --runtime-endpoint "$runtime" rmp --force "$pod") + if [ $? -ne 0 ]; then + echo "Failed to remove pod ${pod}: ${ret}" + fi + done + + # At this point all the pre-existing containers may be stopped and eliminated, + # but there may be inactive containers that we want to eliminate too as these may + # cause issues when flipping back to the original (non-crio) scenario. + cntrList=$($crictl_bin --runtime-endpoint "$runtime" ps -a | awk 'NR>1 {print $1}') + + for cntr in ${cntrList}; do + ret=$($crictl_bin --runtime-endpoint "$runtime" stop --timeout 0 "$cntr") + if [ $? -ne 0 ]; then + echo "Failed to stop container ${cntr}: ${ret}" + fi + + ret=$($crictl_bin --runtime-endpoint "$runtime" rm --force "$cntr") + if [ $? -ne 0 ]; then + echo "Failed to remove container ${cntr}: ${ret}" + fi + done + + set -e + + # Create a soft link from the containerd socket to the crio socket + # (some pods are designed to talk to containerd (e.g., gke-metadata-server)). + echo "Soft-linking containerd socket to CRI-O socket on the host ..." 
+ rm -rf "$runtime_path" + ln -s "$crio_socket" "$runtime_path" +} + +function clean_runtime_state_dockershim() { + local runtime=$1 + shift + local podUids=("$@") + + # Cleanup the pods; turn off errexit in these steps as we don't want to + # interrupt the process if any of the instructions fail for a particular + # pod / container. + set +e + + # If no list of pre-existing pods is provided then proceed to eliminate all + # the present containers. Otherwise, eliminate only the containers associated + # with the provided pods. + if [ -z "${podUids-}" ]; then + docker stop -t0 $(docker ps -a -q) + docker rm $(docker ps -a -q) + else + # Collect all the existing containers as seen by docker. + local cntrList=$(docker ps | awk 'NR>1 {print $1}') + + for podUid in ${podUids}; do + for cntr in ${cntrList}; do + ret=$(docker inspect --format='{{index .Config.Labels "io.kubernetes.pod.uid"}}' $cntr 2>/dev/null | grep -q $podUid) + if [ $? -ne 0 ]; then + continue + fi + + ret=$(docker stop -t0 $cntr) + if [ $? -ne 0 ]; then + echo "Failed to stop cntr $cntr from pod $podUid: $ret" + fi + + ret=$(docker rm $cntr) + if [ $? -ne 0 ]; then + echo "Failed to remove cntr $cntr from pod $podUid: $ret" + fi + done + done + fi + + set -e + + echo "Done eliminating all existing docker containers." + + # Create a soft link from the dockershim socket to the crio socket + # (some pods are designed to talk to dockershim (e.g., aws-node)). + echo "Soft-linking dockershim socket to CRI-O socket on the host ..." + rm -f /var/run/dockershim.sock + ln -s /var/run/crio/crio.sock /var/run/dockershim.sock +} + +# Wipe out all the pods managed by the given container runtime (dockershim, containerd, etc.) 
+function clean_runtime_state() { + local runtime=$1 + shift + local podUids=("$@") + + if [[ "$runtime" =~ "containerd" ]]; then + clean_runtime_state_containerd "$runtime" + elif [[ "$runtime" =~ "dockershim" ]]; then + if [ -n "${podUids-}" ]; then + clean_runtime_state_dockershim "$runtime" "$podUids" + else + clean_runtime_state_dockershim "$runtime" + fi + else + echo "Container runtime not supported: ${runtime}" + return + fi + + # Store info about the prior runtime on the host so the + # kubelet-unconfig-helper service can revert it if/when the crio-cleanup-k8s + # daemonset runs. + mkdir -p "$var_lib_sysbox_deploy_k8s" + echo $runtime >${var_lib_sysbox_deploy_k8s}/prior_runtime +} + +# QoS cgroups are created as transient systemd slices when making use of the systemd +# cgroup driver. In these scenarios, kubelet won't be able to initialize if there are +# pre-existing kubepod cgroup entries corresponding to previous kubelet instantiations. +# This function ensures that these entries are eliminated. +function clean_cgroups_kubepods() { + + # We eliminate all the cgroup kubepod entries by simply stopping their associated + # systemd service. + echo "Stopping/eliminating kubelet QoS cgroup kubepod entries..." 
+ for i in $(systemctl list-unit-files --no-legend --no-pager -l | grep --color=never -o .*.slice | grep kubepod); do + systemctl stop $i + done +} + +############################################################################### +# Scenario 1: Snap setup -- Snap-based kubelet +############################################################################### + +function start_kubelet_snap() { + snap start $kubelet_snap +} + +function stop_kubelet_snap() { + snap stop $kubelet_snap +} + +function get_runtime_kubelet_snap() { + + # If runtime is unknown, assume it's Docker + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi + + local ctr_runtime_type=$(snap get $kubelet_snap container-runtime) + if [[ "$ctr_runtime_type" == "remote" ]]; then + runtime=$(snap get $kubelet_snap container-runtime-endpoint) + fi +} + +function config_kubelet_snap() { + snap set $kubelet_snap container-runtime=remote + snap set $kubelet_snap container-runtime-endpoint=unix:///var/run/crio/crio.sock +} + +function do_config_kubelet_snap() { + echo "Detected snap-based kubelet deployment on host." + + kubelet_snap=$(snap list | grep kubelet | awk '{print $1}') + + get_runtime_kubelet_snap + + if [[ ${runtime} =~ "crio" ]]; then + echo "Kubelet is already using CRI-O; no action will be taken." 
+ return + fi + + if [[ ${runtime} =~ "dockershim" ]]; then + stop_kubelet_snap + clean_runtime_state "$runtime" + clean_cgroups_kubepods + config_kubelet_snap + adjust_crio_config_dependencies + start_kubelet_snap + else + stop_kubelet_snap + clean_runtime_state "$runtime" + clean_cgroups_kubepods + config_kubelet_snap + adjust_crio_config_dependencies + start_kubelet_snap + fi +} + +function kubelet_snap_deployment() { + snap list 2>&1 | grep -q kubelet +} + +############################################################################### +# Scenario 2: RKE setup -- Docker-based kubelet created by rke tool +############################################################################### + +function start_kubelet_container() { + docker start kubelet +} + +function restart_kubelet_container() { + docker restart kubelet +} + +function stop_kubelet_container() { + docker stop kubelet +} + +function get_runtime_kubelet_docker() { + set +e + runtime=$(docker exec kubelet bash -c "ps -e -o command | egrep \^kubelet | egrep -o \"container-runtime-endpoint=\S*\" | cut -d '=' -f2") + set -e + + # If runtime is unknown, assume it's Docker + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi +} + +# Updates the entrypoint script of the kubelet container present in rke setups. +function config_kubelet_rke_update() { + local env_file=$1 + + local kubelet_entrypoint=$(docker inspect --format='{{index .Config.Entrypoint 0}}' kubelet) + + # Backup original entrypoint file -- to be utilized by kubelet-unconfig-helper + # script to revert configuration. + docker exec kubelet bash -c "cp ${kubelet_entrypoint} ${kubelet_entrypoint}.orig" + + # Extract the kubelet attributes to execute with. + local kubelet_attribs=$(cat $env_file | cut -d'"' -f 2) + + # Adjust kubelet's container entrypoint to incorporate the new exec attributes. 
+ docker exec kubelet bash -c "sed -i 's@exec .*@exec kubelet ${kubelet_attribs}@' ${kubelet_entrypoint}" + + echo "Kubelet config updated within container's entrypoint: ${kubelet_entrypoint}" +} + +# Configures the kubelet to use cri-o in rke setups. +function config_kubelet_rke() { + + # Temp variables to hold kubelet's config file and its config attributes. + local kubelet_tmp_file="/etc/default/kubelet-rke" + local kubelet_tmp_var="KUBELET_EXTRA_ARGS" + + # Extract kubelet's current execution attributes and store them in a temp file. + local cur_kubelet_attr=$(docker exec kubelet bash -c "ps -e -o command | egrep \^kubelet | cut -d\" \" -f2-") + echo "${kubelet_tmp_var}=\"${cur_kubelet_attr}\"" >"${kubelet_tmp_file}" + + # Add crio-specific config attributes to the temporary kubelet config file. + replace_kubelet_env_var "$kubelet_tmp_file" "$kubelet_tmp_var" + + # Modify the actual kubelet's config file (container entrypoint) to reflect + # the new attributes obtained above. + config_kubelet_rke_update "$kubelet_tmp_file" + + rm -rf "$kubelet_tmp_file" +} + +function do_config_kubelet_rke() { + echo "Detected RKE's docker-based kubelet deployment on host." + + # Obtain current runtime. + get_runtime_kubelet_docker + if [[ ${runtime} =~ "crio" ]]; then + echo "Kubelet is already using CRI-O; no action will be taken." + return + fi + + # No runtime other than dockershim, and obviously crio, is expected in an + # rke deployment. + if [[ ! ${runtime} =~ "dockershim" ]]; then + echo "Unsupported runtime for RKE scenario: $runtime" + return + fi + + # RKE bind-mounts /sys into its kubelet container to be able to write directly + # into the hosts /sys/fs/cgroup path. With that goal in mind, RKE's kubelet + # container entrypoint does a RW remount of /sys/fs/cgroup mountpoint. 
However, + # this doesn't help host-based processes that require RW access to the cgroups + # path (such as cri-o), that's why here we explicitly remount /sys/fs/cgroup as + # RW within the init mount-ns. + if mount | grep -q "/sys/fs/cgroup .*ro,"; then + mount -o rw,remount /sys/fs/cgroup + fi + + # In RKE's case we must add a few steps to the typical logic utilized in other + # dockershim setups. In this case, as kubelet executes as the 'init' process + # of a docker container, we must do the following: + # + # * Modify kubelet's container restart-policy to prevent this one from being + # re-spawned by docker once that we temporarily shut it down. + # * Configurate the kubelet's container entrypoint to meet cri-o requirements. + # * Obtain the list of pre-existing pods that need to be deleted during the + # 'cleanup' phase -- see that we must provide an explicit list as we want + # to leave the 'kubelet' container untouched. + # * Once the usual kubelet's "stop + clean + start" cycle is completed, we + # must revert the changes made to the kubelet's container restart-policy. + + set_kubelet_ctr_restart_policy "no" + config_kubelet_rke + local podUids=$(get_pods_uids) + stop_kubelet_container + clean_runtime_state "$runtime" "$podUids" + start_kubelet_container + revert_kubelet_ctr_restart_policy +} + +function kubelet_rke_deployment() { + + # Docker presence is a must-have in rke setups. As we are enforcing this + # requirement at the very beginning of the execution path, no other rke + # related routine will check for docker's presence. + if ! 
command -v docker >/dev/null 2>&1; then + return 1 + fi + + docker inspect --format='{{.Config.Labels}}' kubelet | + egrep -q "rke.container.name:kubelet" +} + +############################################################################### +# Scenario 3: RKE2 setup -- Host-based kubelet managed by rke2-agent's systemd +# service +############################################################################### + +function get_rke2_service_name() { + # figure out which rke2 service is running + # one of these must be active because of the check in kubelet_rke2_deployment + systemctl list-units --type service --state active --quiet --plain 'rke2-*' | cut -d ' ' -f1 +} + +function start_rke2() { + echo "Starting RKE2 service $1 ..." + systemctl start $1 +} + +function stop_rke2() { + echo "Stopping RKE2 service $1 ..." + systemctl stop $1 +} + +function get_runtime_kubelet_rke2() { + set +e + runtime=$(ps -e -o command | egrep kubelet | egrep -o "container-runtime-endpoint=\S*" | cut -d '=' -f2) + set -e + + # If runtime is unknown, assume it's Docker + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi +} + +function config_kubelet_rke2() { + echo "Executing Kubelet RKE2 configuration function ..." + + local rancher_config="/etc/rancher/rke2/config.yaml" + + # TODO: Currently we only support RKE2 setups that are configured through + # the default RKE2's config.yaml file; meaning that we are not looking at + # custom config attributes that could be potentially passed by the user + # through the rke2-agent cli. + + if egrep -q "container-runtime-endpoint:.*crio.sock" "$rancher_config"; then + echo "RKE2's kubelet is already using CRI-O; no action will be taken." 
+	return
+	fi
+
+	if egrep -q "container-runtime-endpoint:" "$rancher_config"; then
+		sed -i "s@container-runtime-endpoint:.*@container-runtime-endpoint: /var/run/crio/crio.sock@" "$rancher_config"
+	else
+		echo "container-runtime-endpoint: /var/run/crio/crio.sock" >>"$rancher_config"
+	fi
+
+	local cgroup_driver=$(get_kubelet_config_attr_from_shell "cgroup-driver")
+	if [[ "$cgroup_driver" == "cgroupfs" ]]; then
+		if egrep -q "cgroup-driver=" "$rancher_config"; then
+			# Bug fix: the substitution was missing its closing '@' delimiter
+			# ("s@...@...cgroupfs"), which makes sed abort with
+			# "unterminated `s' command" and leaves the config untouched.
+			sed -i "s@cgroup-driver=.*@cgroup-driver=cgroupfs@" "$rancher_config"
+		else
+			if egrep -q "kubelet-arg:" "$rancher_config"; then
+				sed -i '/kubelet-arg:/a \ - "cgroup-driver=cgroupfs"' "$rancher_config"
+			else
+				sed -i '$akubelet-arg:' "$rancher_config"
+				sed -i '/kubelet-arg:/a \ - "cgroup-driver=cgroupfs"' "$rancher_config"
+			fi
+		fi
+	elif [[ "$cgroup_driver" == "systemd" ]]; then
+		if egrep -q "cgroup-driver=" "$rancher_config"; then
+			# Bug fix: same missing closing '@' delimiter as in the cgroupfs
+			# branch above.
+			sed -i "s@cgroup-driver=.*@cgroup-driver=systemd@" "$rancher_config"
+		else
+			if egrep -q "kubelet-arg:" "$rancher_config"; then
+				sed -i '/kubelet-arg:/a \ - "cgroup-driver=systemd"' "$rancher_config"
+			else
+				sed -i '$akubelet-arg:' "$rancher_config"
+				sed -i '/kubelet-arg:/a \ - "cgroup-driver=systemd"' "$rancher_config"
+			fi
+		fi
+	fi
+}
+
+function do_config_kubelet_rke2() {
+	echo "Detected RKE2's host-based kubelet deployment on host."
+
+	# Obtain current runtime.
+	get_runtime_kubelet_rke2
+	if [[ ${runtime} =~ "crio" ]]; then
+		echo "Kubelet is already using CRI-O; no action will be taken."
+		return
+	fi
+
+	# No runtime other than containerd, and obviously crio, is expected in an
+	# rke2 deployment.
+	if [[ ! ${runtime} =~ "containerd" ]]; then
+		echo "Unsupported runtime for RKE2 scenario: $runtime"
+		return
+	fi
+
+	# Ideally, we should stop containerd first and do the clean-up right after,
+	# but that's not an option in RKE2 setups as it directly manages the
+	# live-cycle of the K8s components through its rke2-agent daemon. 
That's + # why we must first clean all the state, and stop rke2-agent afterwards. + # This could theoretically open up the possibility for race-conditions, but + # that's something that we haven't observed yet given the short interval + # between the 'clean' and the 'stop' events. + + local kubelet_exec_line=$(get_kubelet_exec_line_from_shell) + local rke2_service_name=$(get_rke2_service_name) # rke2-agent or rke2-server + clean_runtime_state "$runtime" + stop_rke2 "$rke2_service_name" + config_kubelet_rke2 + adjust_crio_config_dependencies + start_rke2 "$rke2_service_name" +} + +function kubelet_rke2_deployment() { + + # Worker nodes in RKE2 setups rely on rke2-agent's systemd service, + # or rke2-server for single-node clusters (there is only the controller node) + if systemctl is-active --quiet rke2-agent rke2-server; then + return + fi + + false +} + +############################################################################### +# Scenario 4: Docker-based kubelet managed through a systemd service +############################################################################### + +function get_runtime_kubelet_systemctl { + set +e + runtime=$(ps -e -o command | egrep kubelet | egrep -o "container-runtime-endpoint=\S*" | cut -d '=' -f2) + set -e + + # If runtime is unknown, assume it's Docker + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi +} + +function do_config_kubelet_docker_systemd() { + echo "Detected systemd-managed docker-based kubelet deployment on host." + + # Obtain current runtime. + get_runtime_kubelet_systemctl + if [[ ${runtime} =~ "crio" ]]; then + echo "Kubelet is already using CRI-O; no action will be taken." + return + fi + + # No runtime other than dockershim, and obviously crio, is expected in an + # systemd-managed docker-based deployment. + if [[ ! 
${runtime} =~ "dockershim" ]]; then + echo "Unsupported runtime for docker-based scenario: $runtime" + return + fi + + # See comment above in rke's equivalent function. + if mount | grep -q "/sys/fs/cgroup .*ro,"; then + mount -o rw,remount /sys/fs/cgroup + fi + + config_kubelet "docker-based" + adjust_crio_config_dependencies + stop_kubelet + clean_runtime_state "$runtime" + clean_cgroups_kubepods + start_kubelet +} + +function kubelet_docker_systemd_deployment() { + + # Docker presence is a must-have requirement in these setups (obviously). As + # we are enforcing this requirement at the very beginning of the execution + # path, no other systemd-docker related routine will check for docker's + # presence. + if ! command -v docker >/dev/null 2>&1; then + return 1 + fi + + # Ensure that a container named 'kubelet' exists (typical de-facto standard). + if ! systemctl show kubelet.service | egrep -q "^ExecStart.*=docker run"; then + return 1 + fi + + # Ensure that the entrypoint of this kubelet container is executing 'kubelet' + # itself. + if ! docker inspect --format='{{index .Config.Entrypoint 0}}' kubelet | + awk -F "/" '{print $NF}' | egrep -q "kubelet"; then + return 1 + fi +} + +############################################################################### +# Scenario 5: Host-based kubelet managed through a systemd service +############################################################################### + +function get_kubelet_bin() { + local tmp=$(systemctl show kubelet | grep "ExecStart=" | cut -d ";" -f1) + tmp=${tmp#"ExecStart={ path="} + echo "$tmp" | xargs +} + +function do_config_kubelet() { + echo "Detected systemd-managed host-based kubelet deployment on host." + + # Obtain kubelet path. + kubelet_bin=$(get_kubelet_bin) + if [ -z "$kubelet_bin" ]; then + die "Kubelet binary not identified." + fi + + # Obtain current runtime. 
+ get_runtime_kubelet_systemctl + if [[ ${runtime} =~ "crio" ]]; then + echo "Kubelet is already using CRI-O; no action will be taken." + return + fi + + # The ideal sequence is to stop the kubelet, cleanup all pods with the + # existing runtime, reconfig the kubelet, and restart it. But if the runtime + # is dockershim this logic does not work well by itself, because after stopping + # the kubelet the dockershim also stops. Thus, for dockershim we must complement + # this logic with an extra step: we obtain all the existing pods before + # stopping kubelet (A), and later on, once that kubelet is stopped (B), we + # eliminate these pods through the docker-cli interface (C). Technically, + # there's room for a race-condition scenario in which new pods could be deployed + # right between (A) and (B), but being the time-window so small, we can safely + # ignore this case in most setups; in the worst case scenario we would simply + # end up with a duplicated/stale ("non-ready") pod instantiation, but this + # wouldn't affect the proper operation of the primary ("ready") one. + + if [[ ${runtime} =~ "dockershim" ]]; then + stop_kubelet + clean_runtime_state "$runtime" + clean_cgroups_kubepods + config_kubelet "host-based" + adjust_crio_config_dependencies + restart_kubelet + else + stop_kubelet + clean_runtime_state "$runtime" + clean_cgroups_kubepods + config_kubelet "host-based" + adjust_crio_config_dependencies + restart_kubelet + fi +} + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root." + fi + + # Verify that /sys is mounted as read-write; otherwise remount it. + if mount | grep -q "/sys .*ro,"; then + mount -o rw,remount /sys + fi + + # + # The following kubelet deployment scenarios are currently supported: + # + # * Snap: Snap-based kubelet (as in Ubuntu-based AWS EKS nodes). + # * RKE: Docker-based kubelet created as a static-pod (Rancher's RKE approach). 
+ # * RKE2: Host-based kubelet managed by rke2-agent's systemd service (Rancher's RKE2 approach). + # * Systemd+Docker: Docker-based kubelet managed by a systemd service (Lokomotive's approach). + # * Systemd: Host-based kubelet managed by a systemd service (most common approach). + # + if kubelet_snap_deployment; then + do_config_kubelet_snap + elif kubelet_rke_deployment; then + do_config_kubelet_rke + elif kubelet_rke2_deployment; then + do_config_kubelet_rke2 + elif kubelet_docker_systemd_deployment; then + do_config_kubelet_docker_systemd + else + do_config_kubelet + fi +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/kubelet-unconfig-helper.sh b/sysbox-pkgr/k8s/scripts/kubelet-unconfig-helper.sh new file mode 100755 index 00000000..5cb41a3a --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/kubelet-unconfig-helper.sh @@ -0,0 +1,559 @@ +#!/bin/bash -x + +# +# Copyright 2019-2021 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Script to revert the kubelet config done by the kubelet-config-helper +# + +set -o errexit +set -o pipefail +set -o nounset + +var_lib_sysbox_deploy_k8s="/var/lib/sysbox-deploy-k8s" +crictl_bin="/usr/local/bin/sysbox-deploy-k8s-crictl" +kubelet_bin="" +runtime="" + +# Container's default restart-policy mode (i.e. no restart). 
+kubelet_ctr_restart_mode="no" + +function die() { + msg="$*" + echo "ERROR: $msg" >&2 + exit 1 +} + +function get_kubelet_bin() { + local tmp=$(systemctl show kubelet | grep "ExecStart=" | cut -d ";" -f1) + tmp=${tmp#"ExecStart={ path="} + echo "$tmp" | xargs +} + +function get_kubelet_service_dropin_file() { + systemctl show kubelet | grep "^DropInPaths" | cut -d "=" -f2 +} + +# Wipe out all the pods previously created by the current runtime (i.e., CRI-O) +function clean_runtime_state() { + + # Collect all the existing podIds as seen by crictl. + podList=$($crictl_bin --runtime-endpoint "$runtime" pods | awk 'NR>1 {print $1}') + + # Turn off errexit in these steps as we don't want to interrupt the process + # if any of the instructions fail for a particular pod / container. + set +e + + # Stop / remove all the existing pods. + for pod in ${podList}; do + ret=$($crictl_bin --runtime-endpoint "$runtime" stopp ${pod}) + if [ $? -ne 0 ]; then + echo "Failed to stop pod ${pod}: $ret" + fi + + ret=$($crictl_bin --runtime-endpoint "$runtime" rmp ${pod}) + if [ $? -ne 0 ]; then + echo "Failed to remove pod ${pod}: $ret" + fi + done + + # At this point all the pre-existing containers may be stopped and eliminated, + # but there may be inactive containers that we want to eliminate too. + cntrList=$($crictl_bin --runtime-endpoint "$runtime" ps -a | awk 'NR>1 {print $1}') + + for cntr in ${cntrList}; do + ret=$($crictl_bin --runtime-endpoint "$runtime" stop --timeout 0 "$cntr") + if [ $? -ne 0 ]; then + echo "Failed to stop container ${cntr}: ${ret}" + fi + + ret=$($crictl_bin --runtime-endpoint "$runtime" rm --force "$cntr") + if [ $? -ne 0 ]; then + echo "Failed to remove container ${cntr}: ${ret}" + fi + done + + set -e + + # Revert the runtime socket changes made during installation. 
+ local prior_runtime=$(cat ${var_lib_sysbox_deploy_k8s}/prior_runtime) + local prior_runtime_path=$(echo $prior_runtime | sed 's@unix://@@' | cut -d" " -f1) + + # We don't want to restart containerd in RKE2 scenarios as this one should + # be only managed by the rke2-agent. + # + # TODO: Do some refactoring to merge these two "containerd" scenarios by + # moving out the container-restart instruction to the caller. + if echo "$prior_runtime" | egrep -q "k3s.*containerd"; then + # This is a softlink created by kubelet-config-helper; remove it. + rm -f "$prior_runtime_path" + + elif [[ "$prior_runtime" =~ "containerd" ]]; then + rm -f "$prior_runtime_path" + + echo "Re-starting containerd on the host ..." + systemctl restart containerd + fi + + if [[ "$prior_runtime" =~ "dockershim" ]]; then + rm -f /var/run/dockershim.sock + + echo "Re-starting docker on the host ..." + systemctl restart docker + fi +} + +# QoS cgroups are created as transient systemd slices when making use of the systemd +# cgroup driver. In these scenarios, kubelet won't be able to initialize if there are +# pre-existing kubepod cgroup entries corresponding to previous kubelet instantiations. +# This function ensures that these entries are eliminated. +function clean_cgroups_kubepods() { + + # We eliminate all the cgroup kubepod entries by simply stopping their associated + # systemd service. + echo "Stopping/eliminating kubelet QoS cgroup kubepod entries..." + for i in $(systemctl list-unit-files --no-legend --no-pager -l | grep --color=never -o .*.slice | grep kubepod); do + systemctl stop $i + done +} + +# Sets the restart-policy mode for any given docker container. +function set_ctr_restart_policy() { + local cntr=$1 + local mode=$2 + + # Docker's supported restart-policy modes. + if [[ $mode != "no" ]] && + [[ $mode != "always" ]] && + [[ $mode != "on-failure" ]] && + [[ $mode != "unless-stopped" ]]; then + echo "Unsupported restart-policy mode: $mode" + return + fi + + if ! 
docker update --restart=$mode $cntr; then + echo "Unable to modify container $cntr restart mode to $mode." + return + fi + + echo "Successfully modified $cntr container's restart-policy to mode: $mode." +} + +# Sets the restart-policy mode for the kubelet docker container. +function set_kubelet_ctr_restart_policy() { + local mode=$1 + + kubelet_ctr_restart_mode=$(docker inspect --format='{{.HostConfig.RestartPolicy.Name}}' kubelet) + + set_ctr_restart_policy "kubelet" $mode +} + +# Reverts the restart-policy mode previously stored in a global-variable. +function revert_kubelet_ctr_restart_policy() { + set_ctr_restart_policy "kubelet" $kubelet_ctr_restart_mode +} + +############################################################################### +# Scenario 1: Snap setup -- Snap-based kubelet +############################################################################### + +function start_kubelet_snap() { + snap start $kubelet_snap +} + +function stop_kubelet_snap() { + snap stop $kubelet_snap +} + +function revert_kubelet_config_snap() { + local prior_runtime=$(cat ${var_lib_sysbox_deploy_k8s}/prior_runtime) + + echo "Reverting kubelet snap config" + + # If runtime is unknown, assume it's Docker + if [[ ${prior_runtime} == "" ]] || [[ ${prior_runtime} =~ "docker" ]]; then + echo "Reverting runtime to Docker" + snap unset $kubelet_snap container-runtime-endpoint + snap set $kubelet_snap container-runtime=docker + else + echo "Reverting runtime to $prior_runtime" + snap set $kubelet_snap container-runtime-endpoint=${prior_runtime} + fi +} + +function get_runtime_kubelet_snap() { + + # If runtime is unknown, assume it's Docker. 
+ if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi + + local ctr_runtime_type=$(snap get $kubelet_snap container-runtime) + if [[ "$ctr_runtime_type" == "remote" ]]; then + runtime=$(snap get $kubelet_snap container-runtime-endpoint) + fi +} + +function do_unconfig_kubelet_snap() { + echo "Detected kubelet snap package on host." + + kubelet_snap=$(snap list | grep kubelet | awk '{print $1}') + + get_runtime_kubelet_snap + + if [[ ! ${runtime} =~ "crio" ]]; then + echo "Expected kubelet to be using CRI-O, but it's using $runtime; no action will be taken." + return + fi + + stop_kubelet_snap + clean_runtime_state + clean_cgroups_kubepods + revert_kubelet_config_snap + start_kubelet_snap +} + +function kubelet_snap_deployment() { + snap list 2>&1 | grep -q kubelet +} + +############################################################################### +# Scenario 2: RKE setup -- Docker-based kubelet created by rke tool +############################################################################### + +function restart_kubelet_rke() { + docker restart kubelet +} + +function stop_kubelet_rke() { + docker stop kubelet +} + +function get_runtime_kubelet_docker() { + set +e + runtime=$(docker exec kubelet bash -c "ps -e -o command | egrep \^kubelet | egrep -o \"container-runtime-endpoint=\S*\" | cut -d '=' -f2") + set -e + + # If runtime is unknown, assume it's Docker. + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi +} + +function revert_kubelet_config_rke() { + + # Obtain kubelet's container entrypoint. + local kubelet_entrypoint=$(docker inspect --format='{{index .Config.Entrypoint 0}}' kubelet) + + if [ -z ${kubelet_entrypoint} ] || + ! docker exec kubelet bash -c "test -f ${kubelet_entrypoint}.orig"; then + echo "Failed to revert kubelet config; original entrypoint not found: ${kubelet_entrypoint}.orig" + return + fi + + echo "Reverting kubelet's RKE config" + + # Revert to original entrypoint. 
+ docker exec kubelet bash -c "mv ${kubelet_entrypoint}.orig ${kubelet_entrypoint}" +} + +function do_unconfig_kubelet_rke() { + echo "Detected RKE's docker-based kubelet deployment on host." + + get_runtime_kubelet_docker + if [[ ! ${runtime} =~ "crio" ]]; then + echo "Expected kubelet to be using CRI-O, but it's using $runtime; no action will be taken." + return + fi + + # In RKE's case we must add a few steps to the typical logic utilized in other + # setups. In this case, as kubelet executes as the 'init' process of a docker + # container, we must do the following: + # + # * Modify kubelet's container restart-policy to prevent this one from being + # re-spawned by docker once that we temporarily shut it down. + # * Revert the kubelet's container entrypoint to honor its original + # initialization attributes. + # * Once the usual kubelet's "stop + clean + restart" cycle is completed, we + # must revert the changes made to the kubelet's container restart-policy. + + set_kubelet_ctr_restart_policy "no" + revert_kubelet_config_rke + stop_kubelet_rke + clean_runtime_state + restart_kubelet_rke + revert_kubelet_ctr_restart_policy +} + +function kubelet_rke_deployment() { + # Docker presence is a must-have in rke setups. As we are enforcing this + # requirement at the very beginning of the execution path, no other rke + # related routine will check for docker's presence. + if ! 
command -v docker >/dev/null 2>&1; then + return 1 + fi + + docker inspect --format='{{.Config.Labels}}' kubelet | + egrep -q "rke.container.name:kubelet" +} + +############################################################################### +# Scenario 3: RKE2 setup -- Host-based kubelet managed by rke2-agent's systemd +# service +############################################################################### + +function get_rke2_service_name() { + # figure out which rke2 service is running + # one of these must be active because of the check in kubelet_rke2_deployment + systemctl list-units --type service --state active --quiet --plain 'rke2-*' | cut -d ' ' -f1 +} + +function start_rke2() { + echo "Starting RKE2 service $1 ..." + systemctl start $1 +} + +function stop_rke2() { + echo "Stopping RKE2 service $1 ..." + systemctl stop $1 +} + +function revert_kubelet_config_rke2() { + + echo "Executing RKE2's Kubelet revert configuration function ..." + + local rancher_config="/etc/rancher/rke2/config.yaml" + + if egrep -q "container-runtime-endpoint:.*crio.sock" "$rancher_config"; then + sed -i '/container-runtime-endpoint:/d' "$rancher_config" + fi +} + +function get_runtime_kubelet_rke2() { + set +e + runtime=$(ps -e -o command | egrep kubelet | egrep -o "container-runtime-endpoint=\S*" | cut -d '=' -f2) + set -e + + # If runtime is unknown, assume it's Docker. + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi +} + +function do_unconfig_kubelet_rke2() { + echo "Detected RKE2's host-based kubelet deployment on host." + + get_runtime_kubelet_rke2 + if [[ ! ${runtime} =~ "crio" ]]; then + echo "Expected kubelet to be using CRI-O, but it's using $runtime; no action will be taken." 
+ return + fi + + local rke2_service_name=$(get_rke2_service_name) # rke2-agent or rke2-server + stop_rke2 "$rke2_service_name" + clean_runtime_state + revert_kubelet_config_rke2 + start_rke2 "$rke2_service_name" +} + +function kubelet_rke2_deployment() { + + # Worker nodes in RKE2 setups rely on rke2-agent's systemd service, + # or rke2-server for single-node clusters (there is only the controller node) + if systemctl is-active --quiet rke2-agent rke2-server; then + return + fi + + false +} + +############################################################################### +# Scenario 4: Docker-based kubelet managed through a systemd service +############################################################################### + +function get_runtime_kubelet_systemctl { + set +e + # Notice that in this scenario there may be more than one 'container-runtime' + # entry present in the kubelet's exec instruction, so we must only look at + # the latest (relevant) one. + runtime=$(ps -e -o command | egrep kubelet | egrep -o "container-runtime-endpoint=\S*" | tail -1 | cut -d '=' -f2) + set -e + + # If runtime is unknown, assume it's Docker. + if [[ ${runtime} == "" ]]; then + runtime="unix:///var/run/dockershim.sock" + fi +} + +function do_unconfig_kubelet_docker_systemd() { + echo "Detected systemd-managed docker-based kubelet deployment on host." + + get_runtime_kubelet_systemctl + if [[ ! ${runtime} =~ "crio" ]]; then + echo "Expected kubelet to be using CRI-O, but it's using $runtime; no action will be taken." + return + fi + + stop_kubelet + clean_runtime_state + clean_cgroups_kubepods + revert_kubelet_config + start_kubelet +} + +function kubelet_docker_systemd_deployment() { + + # Docker presence is a must-have requirement in these setups (obviously). As + # we are enforcing this requirement at the very beginning of the execution + # path, no other systemd-docker related routine will check for docker's + # presence. + if ! 
command -v docker >/dev/null 2>&1; then + return 1 + fi + + # Ensure that a container named 'kubelet' exists (typical de-facto standard). + if ! systemctl show kubelet.service | egrep -q "^ExecStart.*=docker run"; then + return 1 + fi + + # Ensure that the entrypoint of this kubelet container is executing 'kubelet' + # itself. + if ! docker inspect --format='{{index .Config.Entrypoint 0}}' kubelet | + awk -F "/" '{print $NF}' | egrep -q "kubelet"; then + return 1 + fi +} + +############################################################################### +# Scenario 5: Host-based kubelet managed through a systemd service +############################################################################### + +function start_kubelet() { + echo "Starting Kubelet ..." + systemctl start kubelet +} + +function restart_kubelet() { + echo "Restarting Kubelet ..." + systemctl restart kubelet +} + +function stop_kubelet() { + echo "Stopping Kubelet ..." + systemctl stop kubelet +} + +function revert_kubelet_config() { + local config_file="${var_lib_sysbox_deploy_k8s}/config" + local kubelet_systemd_dropin="${var_lib_sysbox_deploy_k8s}/kubelet_systemd_dropin" + local kubelet_sysbox_systemd_dropin="/etc/systemd/system/kubelet.service.d/01-kubelet-sysbox-dropin.conf" + + echo "Reverting kubelet config (from $config_file)" + + if [ ! -f "$config_file" ]; then + echo "Failed to revert kubelet config; file $config_file not found." + return + fi + + if ! grep "kubelet_env_file" "$config_file"; then + echo "Failed to revert kubelet config; config not found in $config_file" + return + fi + + # The config file will have these entries: + # + # * kubelet_env_file=/path/to/file + # * kubelet_systemd_file=/path/to/file + # + # Below, we copy these original files back to their original locations. + + local target=$(grep "kubelet_env_file" "$config_file" | cut -d "=" -f2) + if [ ! 
-z "$target" ]; then + cp "${var_lib_sysbox_deploy_k8s}/kubelet_env_file.orig" "$target" + rm "${var_lib_sysbox_deploy_k8s}/kubelet_env_file.orig" + fi + + local target=$(grep "kubelet_systemd_file" "$config_file" | cut -d "=" -f2) + if [ ! -z "$target" ]; then + # If the primary kubelet systemd file was one artificially introduced by + # Sysbox during installation, we simply want to remove it here. In the + # other scenarios we must copy the original file to its former location. + if [[ "$target" == "$kubelet_sysbox_systemd_dropin" ]]; then + rm -r "$target" + else + cp "${var_lib_sysbox_deploy_k8s}/kubelet_systemd_file.orig" "$target" + rm -r "${var_lib_sysbox_deploy_k8s}/kubelet_systemd_file.orig" + fi + + systemctl daemon-reload + fi + + rm "$config_file" +} + +function do_unconfig_kubelet() { + echo "Detected systemd-managed host-based kubelet deployment on host." + + # Obtain kubelet path. + kubelet_bin=$(get_kubelet_bin) + if [ -z "$kubelet_bin" ]; then + die "Kubelet binary not identified." + fi + + get_runtime_kubelet_systemctl + if [[ ! ${runtime} =~ "crio" ]]; then + echo "Expected kubelet to be using CRI-O, but it's using $runtime; no action will be taken." + return + fi + + stop_kubelet + clean_runtime_state + clean_cgroups_kubepods + revert_kubelet_config + restart_kubelet +} + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root" + fi + + # + # The following kubelet deployment scenarios are currently supported: + # + # * Snap: Snap-based kubelet (as in Ubuntu-based AWS EKS nodes). + # * RKE: Docker-based kubelet created as a static-pod (Rancher's RKE approach). + # * RKE2: Host-based kubelet managed by rke2-agent's systemd service (Rancher's RKE2 approach). + # * Systemd+Docker: Docker-based kubelet managed by a systemd service (Lokomotive's approach). + # * Systemd: Host-based kubelet managed by a systemd service (most common approach). 
+ # + if kubelet_snap_deployment; then + do_unconfig_kubelet_snap + elif kubelet_rke_deployment; then + do_unconfig_kubelet_rke + elif kubelet_rke2_deployment; then + do_unconfig_kubelet_rke2 + elif kubelet_docker_systemd_deployment; then + do_unconfig_kubelet_docker_systemd + else + do_unconfig_kubelet + fi +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/sysbox-deploy-k8s.sh b/sysbox-pkgr/k8s/scripts/sysbox-deploy-k8s.sh new file mode 100755 index 00000000..aaca0e2e --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/sysbox-deploy-k8s.sh @@ -0,0 +1,1332 @@ +#!/bin/bash + +# +# Copyright 2019-2023 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Script to install or remove Sysbox (CE) and Sysbox-EE on a Kubernetes node. +# The script assumes it will run inside the sysbox deploy daemonset container, +# and that several host directories are mounted onto the container. The script +# requires full root privileges on the host (e.g., CAP_SYS_ADMIN + write access +# to /proc) in order to install Sysbox on it. +# +# Note: inspired by kata-deploy (github.com/kata-containers/packaging/tree/master/kata-deploy) +# + +set -o errexit +set -o pipefail +set -o nounset + +# The Sysbox edition to install: Sysbox (CE) or Sysbox-EE. +sysbox_edition="" +# The Sysbox version to install; append "-0" if it doesn't have a patch number. 
+sysbox_version=$(echo "$SYSBOX_VERSION" | sed '/-[0-9]/!s/.*/&-0/')
+
+# The daemonset Dockerfile places sysbox artifacts here
+sysbox_artifacts="/opt/sysbox"
+crio_artifacts="/opt/crio-deploy"
+
+# The daemonset spec will set up these mounts.
+host_systemd="/mnt/host/lib/systemd/system"
+host_sysctl="/mnt/host/lib/sysctl.d"
+host_bin="/mnt/host/usr/bin"
+host_lib_mod="/mnt/host/usr/lib/modules-load.d"
+host_local_bin="/mnt/host/usr/local/bin"
+host_etc="/mnt/host/etc"
+host_os_release="/mnt/host/os-release"
+host_crio_conf_file="${host_etc}/crio/crio.conf"
+host_crio_conf_file_backup="${host_crio_conf_file}.orig"
+host_run="/mnt/host/run"
+host_var_lib="/mnt/host/var/lib"
+host_var_lib_sysbox_deploy_k8s="${host_var_lib}/sysbox-deploy-k8s"
+
+#
+# Subid default values.
+#
+# Sysbox supports up to 4K sys containers per K8s node, each with 64K subids.
+#
+# Historical note: prior to Docker's acquisition of Nestybox, Sysbox-CE was
+# limited to 16-pods-per-node via variable subid_alloc_min_range below, whereas
+# Sysbox-EE was limited to 4K-pods-per-node. After Docker's acquisition of
+# Nestybox (05/22) Sysbox-EE is no longer being offered and therefore Docker has
+# decided to lift the Sysbox-CE limit to encourage adoption of Sysbox on K8s
+# clusters (the limit will now be 4K-pods-per-node as it was in Sysbox-EE).
+#
+subid_alloc_min_start=100000
+subid_alloc_min_range=268435456
+subid_alloc_max_end=4294967295
+
+# We use CRI-O's default user "containers" for the sub-id range (rather than
+# user "sysbox").
+subid_user="containers" +subid_def_file="${host_etc}/login.defs" +subuid_file="${host_etc}/subuid" +subgid_file="${host_etc}/subgid" + +# Shiftfs +shiftfs_min_kernel_ver=5.4 +shiftfs_max_kernel_ver=6.2 + +# Current OS distro and kernel release +os_distro_release="" +os_kernel_release="" + +# System platform architecture +sys_arch="" + +# Installation flags +do_sysbox_install="true" +do_sysbox_update="false" +do_crio_install="true" +sysbox_install_in_progress="false" + +# +# CRI-O Installation Functions +# + +function deploy_crio_installer_service() { + echo "Deploying CRI-O installer agent on the host ($k8s_version) ..." + + cp ${crio_artifacts}/bin/${k8s_version}/cri-o.${sys_arch}.tar.gz ${host_local_bin}/cri-o.${sys_arch}.tar.gz + cp ${crio_artifacts}/bin/${k8s_version}/crio-patched ${host_local_bin}/crio-patched + + cp ${crio_artifacts}/scripts/crio-installer.sh ${host_local_bin}/crio-installer.sh + cp ${crio_artifacts}/scripts/crio-extractor.sh ${host_local_bin}/crio-extractor.sh + cp ${crio_artifacts}/systemd/crio-installer.service ${host_systemd}/crio-installer.service + + mkdir -p ${host_var_lib_sysbox_deploy_k8s} + cp ${crio_artifacts}/config/etc_cni_net.d_200-loopback.conf ${host_var_lib_sysbox_deploy_k8s}/etc_cni_net.d_200-loopback.conf + cp ${crio_artifacts}/config/etc_containers_registries.conf.d_000-shortnames.conf ${host_var_lib_sysbox_deploy_k8s}/etc_containers_registries.conf.d_000-shortnames.conf + cp ${crio_artifacts}/config/etc_containers_storage.conf ${host_var_lib_sysbox_deploy_k8s}/etc_containers_storage.conf + cp ${crio_artifacts}/config/etc_containers_registries.conf ${host_var_lib_sysbox_deploy_k8s}/etc_containers_registries.conf + cp ${crio_artifacts}/config/etc_containers_registries.d_default.yaml ${host_var_lib_sysbox_deploy_k8s}/etc_containers_registries.d_default.yaml + cp ${crio_artifacts}/config/etc_containers_policy.json ${host_var_lib_sysbox_deploy_k8s}/etc_containers_policy.json + + systemctl daemon-reload + echo "Running CRI-O 
installer agent on the host (may take several seconds) ..." + systemctl restart crio-installer.service +} + +function remove_crio_installer_service() { + echo "Removing CRI-O installer agent from the host ..." + systemctl stop crio-installer.service + systemctl disable crio-installer.service + rm -f ${host_local_bin}/crio-installer.sh + rm -f ${host_local_bin}/crio-extractor.sh + rm -f ${host_systemd}/crio-installer.service + + systemctl daemon-reload +} + +function deploy_crio_removal_service() { + echo "Deploying CRI-O uninstaller ..." + cp ${crio_artifacts}/scripts/crio-removal.sh ${host_local_bin}/crio-removal.sh + cp ${crio_artifacts}/scripts/crio-extractor.sh ${host_local_bin}/crio-extractor.sh + cp ${crio_artifacts}/systemd/crio-removal.service ${host_systemd}/crio-removal.service + + systemctl daemon-reload + systemctl restart crio-removal.service +} + +function remove_crio_removal_service() { + echo "Removing the CRI-O uninstaller ..." + systemctl stop crio-removal.service + systemctl disable crio-removal.service + rm -f ${host_local_bin}/crio-removal.sh + rm -f ${host_local_bin}/crio-extractor.sh + rm -f ${host_systemd}/crio-removal.service + + systemctl daemon-reload +} + +function deploy_kubelet_config_service() { + echo "Deploying Kubelet config agent on the host ..." + mkdir -p ${host_var_lib_sysbox_deploy_k8s} + cp ${crio_artifacts}/scripts/kubelet-config-helper.sh ${host_local_bin}/kubelet-config-helper.sh + cp ${crio_artifacts}/systemd/kubelet-config-helper.service ${host_systemd}/kubelet-config-helper.service + cp ${crio_artifacts}/config/crio-kubelet-options ${host_var_lib_sysbox_deploy_k8s}/crio-kubelet-options + + cp /usr/local/bin/crictl ${host_local_bin}/sysbox-deploy-k8s-crictl + + echo "Running Kubelet config agent on the host (will restart Kubelet and temporary bring down all pods on this node for ~1 min) ..." 
+ systemctl daemon-reload + systemctl restart kubelet-config-helper.service +} + +function remove_kubelet_config_service() { + echo "Stopping the Kubelet config agent on the host ..." + systemctl stop kubelet-config-helper.service + systemctl disable kubelet-config-helper.service + + echo "Removing Kubelet config agent from the host ..." + rm -f ${host_local_bin}/kubelet-config-helper.sh + rm -f ${host_systemd}/kubelet-config-helper.service + rm -f ${host_local_bin}/sysbox-deploy-k8s-crictl + systemctl daemon-reload +} + +function deploy_kubelet_unconfig_service() { + echo "Deploying Kubelet unconfig agent on the host ..." + + cp ${crio_artifacts}/scripts/kubelet-unconfig-helper.sh ${host_local_bin}/kubelet-unconfig-helper.sh + cp ${crio_artifacts}/systemd/kubelet-unconfig-helper.service ${host_systemd}/kubelet-unconfig-helper.service + cp /usr/local/bin/crictl ${host_local_bin}/sysbox-deploy-k8s-crictl + + echo "Running Kubelet unconfig agent on the host (will restart Kubelet and temporary bring down all pods on this node for ~1 min) ..." + systemctl daemon-reload + systemctl restart kubelet-unconfig-helper.service +} + +function remove_kubelet_unconfig_service() { + echo "Stopping the Kubelet unconfig agent on the host ..." + systemctl stop kubelet-unconfig-helper.service + systemctl disable kubelet-unconfig-helper.service + + echo "Removing Kubelet unconfig agent from the host ..." + rm -f ${host_local_bin}/kubelet-unconfig-helper.sh + rm -f ${host_systemd}/kubelet-unconfig-helper.service + rm -f ${host_local_bin}/sysbox-deploy-k8s-crictl + systemctl daemon-reload +} + +function config_crio() { + echo "Configuring CRI-O ..." + + touch ${host_crio_conf_file} + if [ ! -f ${host_crio_conf_file_backup} ]; then + cp ${host_crio_conf_file} ${host_crio_conf_file_backup} + fi + + # Disable selinux for now. 
+ dasel put bool -f ${host_crio_conf_file} -p toml -m "crio.runtime.selinux" false + + # Add user "containers" to the /etc/subuid and /etc/subgid files + get_subid_limits + config_subid_range "$subuid_file" "$subid_alloc_min_range" "$subuid_min" "$subuid_max" + config_subid_range "$subgid_file" "$subid_alloc_min_range" "$subgid_min" "$subgid_max" + + # Set capabilities to match default caps in containerd/docker + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "CHOWN" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "DAC_OVERRIDE" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "FSETID" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "FOWNER" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "SETUID" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "SETGID" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "SETPCAP" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "SETFCAP" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "NET_BIND_SERVICE" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "KILL" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "AUDIT_WRITE" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "NET_RAW" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "SYS_CHROOT" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.runtime.default_capabilities.[]' "MKNOD" + + # Set the default runtime to crio-runc. 
+ dasel put string -f ${host_crio_conf_file} -p toml -m "crio.runtime.default_runtime" "crio-runc" + + # Add crio-runc runtime and its monitor-path. + dasel put object -f "${host_crio_conf_file}" -p toml -t string -t string -t string "crio.runtime.runtimes.crio-runc" \ + "runtime_path=/usr/local/bin/crio-runc" "runtime_type=oci" "monitor_path=/usr/local/bin/crio-conmon" + + # Create 'crio.image' table (required for 'pause_image' settings). + dasel put document -f ${host_crio_conf_file} -p toml -m '.crio.image' + + # Create 'crio.network' table (required for 'network_dir' settings). + dasel put document -f ${host_crio_conf_file} -p toml -m '.crio.network' + + # CRI-O puts a default limit of 1024 processes per pod; this is too small for + # Sysbox pods, since these run sometimes complex software such as Docker, + # K8s, etc. Thus we increase this to 16K processes per pod. Since the max + # limit for Linux is 4M (see /proc/sys/kernel/pid_max), this allows up to + # ~256 Sysbox containers each consuming 16K processes on a given host. It + # also constraints a malicious container executing a fork bomb to 16K + # processes, well below the kernel's max pid limit. + dasel put int -f ${host_crio_conf_file} -p toml -m "crio.runtime.pids_limit" 16384 +} + +function restart_crio() { + echo "Restarting CRI-O ..." + systemctl restart crio +} + +# +# Sysbox Installation Functions +# + +function get_artifacts_dir() { + + local distro=$os_distro_release + + if [[ "$distro" == "ubuntu-22.04" ]] || + [[ "$distro" == "ubuntu-21.10" ]] || + [[ "$distro" == "ubuntu-20.04" ]] || + [[ "$distro" == "ubuntu-18.04" ]] || + [[ "$distro" =~ "debian" ]]; then + artifacts_dir="${sysbox_artifacts}/bin/generic" + elif [[ "$distro" =~ "flatcar" ]]; then + local release=$(echo $distro | cut -d"-" -f2) + artifacts_dir="${sysbox_artifacts}/bin/flatcar-${release}" + else + die "Sysbox is not supported on this host's distro ($distro)". 
+ fi + + echo $artifacts_dir +} + +function copy_sysbox_to_host() { + + local artifacts_dir=$(get_artifacts_dir) + + cp "${artifacts_dir}/sysbox-mgr" "${host_bin}/sysbox-mgr" + cp "${artifacts_dir}/sysbox-fs" "${host_bin}/sysbox-fs" + cp "${artifacts_dir}/sysbox-runc" "${host_bin}/sysbox-runc" + + # Keep track of the sysbox version installed on the host (upgrade purposes). + echo "${sysbox_version}" >${host_var_lib_sysbox_deploy_k8s}/sysbox_installed_version +} + +function rm_sysbox_from_host() { + rm -f "${host_bin}/sysbox-mgr" + rm -f "${host_bin}/sysbox-fs" + rm -f "${host_bin}/sysbox-runc" + + # Remove sysbox from the /etc/subuid and /etc/subgid files + sed -i '/sysbox:/d' "${host_etc}/subuid" + sed -i '/sysbox:/d' "${host_etc}/subgid" + + rm -f "${host_var_lib_sysbox_deploy_k8s}/sysbox_installed_version" +} + +function copy_sysbox_env_config_to_host() { + cp "${sysbox_artifacts}/systemd/99-sysbox-sysctl.conf" "${host_sysctl}/99-sysbox-sysctl.conf" + cp "${sysbox_artifacts}/systemd/50-sysbox-mod.conf" "${host_lib_mod}/50-sysbox-mod.conf" +} + +function rm_conf_from_host() { + rm -f "${host_sysctl}/99-sysbox-sysctl.conf" + rm -f "${host_lib_mod}/50-sysbox-mod.conf" +} + +function config_sysbox_env() { + # Set the sysbox edition in the sysbox-mgr and sysbox-fs systemd unit files. + sed -i "/^Environment=/ s|SYSBOX_EDITION=.*|SYSBOX_EDITION=${sysbox_edition}|" ${sysbox_artifacts}/systemd/sysbox-mgr.service + sed -i "/^Environment=/ s|SYSBOX_EDITION=.*|SYSBOX_EDITION=${sysbox_edition}|" ${sysbox_artifacts}/systemd/sysbox-fs.service +} + +# Update Sysbox's systemd unit files with the received configMap configuration +# corresponding to the sysbox-mgr and sysbox-fs services. 
+function config_sysbox() { + if [ -n "$SYSBOX_MGR_CONFIG" ]; then + sed -i "/^ExecStart=/ s|/usr/bin/sysbox-mgr|/usr/bin/sysbox-mgr ${SYSBOX_MGR_CONFIG}|" ${sysbox_artifacts}/systemd/sysbox-mgr.service + fi + + if [ -n "$SYSBOX_FS_CONFIG" ]; then + sed -i "/^ExecStart=/ s|/usr/bin/sysbox-fs|/usr/bin/sysbox-fs ${SYSBOX_FS_CONFIG}|" ${sysbox_artifacts}/systemd/sysbox-fs.service + fi +} + +function copy_sysbox_config_to_host() { + cp "${sysbox_artifacts}/systemd/sysbox.service" "${host_systemd}/sysbox.service" + cp "${sysbox_artifacts}/systemd/sysbox-mgr.service" "${host_systemd}/sysbox-mgr.service" + cp "${sysbox_artifacts}/systemd/sysbox-fs.service" "${host_systemd}/sysbox-fs.service" + systemctl daemon-reload + systemctl enable sysbox.service + systemctl enable sysbox-mgr.service + systemctl enable sysbox-fs.service +} + +function rm_systemd_units_from_host() { + rm -f "${host_systemd}/sysbox.service" + rm -f "${host_systemd}/sysbox-mgr.service" + rm -f "${host_systemd}/sysbox-fs.service" + systemctl daemon-reload +} + +function apply_sysbox_env_config() { + # Note: this requires CAP_SYS_ADMIN on the host + echo "Configuring host sysctls ..." + sysctl -p "${host_sysctl}/99-sysbox-sysctl.conf" +} + +function start_sysbox() { + echo "Starting $sysbox_edition ..." + systemctl restart sysbox + systemctl is-active --quiet sysbox +} + +function stop_sysbox() { + if systemctl is-active --quiet sysbox || systemctl is-enabled --quiet sysbox; then + echo "Stopping $sysbox_edition ..." + systemctl stop sysbox + fi +} + +function install_sysbox() { + # Sysbox could potentially be already installed (during upgrades), + # so stop it first to ensure that copy instructions below can + # succeed. + stop_sysbox + + echo "Installing $sysbox_edition on host ..." 
+ config_sysbox_env + copy_sysbox_env_config_to_host + apply_sysbox_env_config + config_sysbox + copy_sysbox_config_to_host + copy_sysbox_to_host + start_sysbox +} + +function remove_sysbox() { + echo "Removing $sysbox_edition from host ..." + stop_sysbox + rm_systemd_units_from_host + rm_conf_from_host + rm_sysbox_from_host +} + +function deploy_sysbox_installer_helper() { + echo "Deploying $sysbox_edition installer helper on the host ..." + cp ${sysbox_artifacts}/scripts/sysbox-installer-helper.sh ${host_local_bin}/sysbox-installer-helper.sh + cp ${sysbox_artifacts}/systemd/sysbox-installer-helper.service ${host_systemd}/sysbox-installer-helper.service + systemctl daemon-reload + echo "Running $sysbox_edition installer helper on the host (may take several seconds) ..." + systemctl restart sysbox-installer-helper.service +} + +function remove_sysbox_installer_helper() { + echo "Stopping the $sysbox_edition installer helper on the host ..." + systemctl stop sysbox-installer-helper.service + systemctl disable sysbox-installer-helper.service + echo "Removing $sysbox_edition installer helper from the host ..." + rm -f ${host_local_bin}/sysbox-installer-helper.sh + rm -f ${host_systemd}/sysbox-installer-helper.service + systemctl daemon-reload +} + +function deploy_sysbox_removal_helper() { + echo "Deploying $sysbox_edition removal helper on the host..." + cp ${sysbox_artifacts}/scripts/sysbox-removal-helper.sh ${host_local_bin}/sysbox-removal-helper.sh + cp ${sysbox_artifacts}/systemd/sysbox-removal-helper.service ${host_systemd}/sysbox-removal-helper.service + systemctl daemon-reload + systemctl restart sysbox-removal-helper.service +} + +function remove_sysbox_removal_helper() { + echo "Removing the $sysbox_edition removal helper ..." 
+	systemctl stop sysbox-removal-helper.service
+	systemctl disable sysbox-removal-helper.service
+	rm -f ${host_local_bin}/sysbox-removal-helper.sh
+	rm -f ${host_systemd}/sysbox-removal-helper.service
+	systemctl daemon-reload
+}
+
+function install_sysbox_deps_flatcar() {
+
+	# Expected vars layout:
+	# * artifacts-dir == "/opt/sysbox/bin/flatcar-<release>"
+	# * distro-release == "flatcar-<release>"
+	local artifacts_dir=$(get_artifacts_dir)
+	local distro_release=$(echo ${artifacts_dir} | cut -d"/" -f5)
+
+	# Let's try to install from local store first.
+	if [ -d "$artifacts_dir" ]; then
+		echo "Copying shiftfs module and sysbox dependencies to host ..."
+		cp ${artifacts_dir}/shiftfs.ko ${host_lib_mod}/shiftfs.ko
+		cp ${artifacts_dir}/fusermount ${host_bin}/fusermount
+		return
+	fi
+
+	# Otherwise fetch the binaries/dependencies from an external location.
+	echo "Fetching / copying shiftfs module and sysbox dependencies to host ..."
+	mkdir -p ${artifacts_dir}
+	pushd ${artifacts_dir}/..
+	# "if !" guard needed: under "set -o errexit" a bare curl failure aborts before any $? check.
+	if ! curl -LJOSs https://github.com/nestybox/sysbox-flatcar-preview/releases/download/Sysbox-${distro_release}/${distro_release}.tar.gz; then
+		die "Unable to fetch Sysbox dependencies for ${distro_release} distribution. Exiting ..."
+	fi
+
+	tar -xf ${distro_release}.tar.gz
+	rm -r ${distro_release}.tar.gz
+
+	cp ${artifacts_dir}/shiftfs.ko ${host_lib_mod}/shiftfs.ko
+	cp ${artifacts_dir}/fusermount ${host_bin}/fusermount
+}
+
+function install_sysbox_deps() {
+
+	# The installation of sysbox dependencies on the host is done via the
+	# sysbox-installer-helper agent, which is a systemd service that we drop on
+	# the host and request systemd to start. This way the agent can install
+	# packages on the host as needed. One of those dependencies is shiftfs, which
+	# unlike the other dependencies, needs to be built from source on the host
+	# machine (with the corresponding kernel headers, etc). The shiftfs sources
+	# are included in the sysbox-deploy-k8s container image, and here we copy
+	# them to the host machine (in dir /run/shiftfs_dkms). The
+	# sysbox-installer-helper agent will build those sources on the host and
+	# install shiftfs on the host kernel via dkms. For the specific case of
+	# Flatcar, we carry a pre-built shiftfs binary as we can't easily build it
+	# on the Flatcar host.
+
+	echo "Installing Sysbox dependencies on host ..."
+
+	local kversion=$(echo $os_kernel_release | cut -d "." -f1-2)
+	# Use $shiftfs_min_kernel_ver (not a hard-coded 5.4) so the check matches the message below.
+	if semver_lt $kversion $shiftfs_min_kernel_ver; then
+		echo "Kernel has version $kversion, which is below the min required for shiftfs ($shiftfs_min_kernel_ver); skipping shiftfs installation."
+		return
+	fi
+
+	if host_flatcar_distro; then
+		install_sysbox_deps_flatcar
+	else
+		echo "Copying shiftfs sources to host ..."
+		if semver_ge $kversion 5.4 && semver_lt $kversion 5.8; then
+			echo "Kernel version $kversion is >= 5.4 and < 5.8"
+			cp -r "/opt/shiftfs-k5.4" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 5.8 && semver_lt $kversion 5.11; then
+			echo "Kernel version $kversion is >= 5.8 and < 5.11"
+			cp -r "/opt/shiftfs-k5.10" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 5.11 && semver_lt $kversion 5.13; then
+			echo "Kernel version $kversion is >= 5.11 and < 5.13"
+			cp -r "/opt/shiftfs-k5.11" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 5.13 && semver_lt $kversion 5.15; then
+			echo "Kernel version $kversion is >= 5.13 and < 5.15"
+			cp -r "/opt/shiftfs-k5.13" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 5.15 && semver_lt $kversion 5.17; then
+			echo "Kernel version $kversion is >= 5.15 and < 5.17"
+			cp -r "/opt/shiftfs-k5.16" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 5.17 && semver_lt $kversion 5.18; then
+			echo "Kernel version $kversion is 5.17"
+			cp -r "/opt/shiftfs-k5.17" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 5.18 && semver_lt $kversion 6.1; then
+			echo "Kernel version $kversion is >= 5.18 and < 6.1"
+			cp -r "/opt/shiftfs-k5.18" "$host_run/shiftfs-dkms"
+		elif semver_ge $kversion 6.1 && semver_lt $kversion 6.3; then
+			echo "Kernel version $kversion is >= 6.1 and < 6.3"
+			cp -r "/opt/shiftfs-k6.1" "$host_run/shiftfs-dkms"
+		else
+			echo "Kernel version $kversion, which is above the max required for shiftfs ($shiftfs_max_kernel_ver); skipping shiftfs installation."
+			# don't copy shiftfs, but still proceed to install other deps
+		fi
+	fi
+
+	deploy_sysbox_installer_helper
+	remove_sysbox_installer_helper
+}
+
+function remove_sysbox_deps() {
+	echo "Removing sysbox dependencies from host ..."
+
+	deploy_sysbox_removal_helper
+	remove_sysbox_removal_helper
+	rm -rf "$host_run/shiftfs-dkms"
+}
+
+function get_subid_limits() {
+
+	# Get subid defaults from /etc/login.defs
+
+	subuid_min=$subid_alloc_min_start
+	subuid_max=$subid_alloc_max_end
+	subgid_min=$subid_alloc_min_start
+	subgid_max=$subid_alloc_max_end
+
+	if [ ! -f $subid_def_file ]; then
+		return
+	fi
+
+	set +e
+	res=$(grep "^SUB_UID_MIN" $subid_def_file 2>/dev/null)
+	if [ $? -eq 0 ]; then
+		subuid_min=$(echo $res | cut -d " " -f2)
+	fi
+
+	res=$(grep "^SUB_UID_MAX" $subid_def_file 2>/dev/null)
+	if [ $? -eq 0 ]; then
+		subuid_max=$(echo $res | cut -d " " -f2)
+	fi
+
+	res=$(grep "^SUB_GID_MIN" $subid_def_file 2>/dev/null)
+	if [ $? -eq 0 ]; then
+		subgid_min=$(echo $res | cut -d " " -f2)
+	fi
+
+	res=$(grep "^SUB_GID_MAX" $subid_def_file 2>/dev/null)
+	if [ $? -eq 0 ]; then
+		subgid_max=$(echo $res | cut -d " " -f2)
+	fi
+	set -e
+}
+
+function config_subid_range() {
+	local subid_file=$1
+	local subid_size=$2
+	local subid_min=$3
+	local subid_max=$4
+
+	if [ ! -f $subid_file ] || [ ! -s $subid_file ]; then
+		echo "$subid_user:$subid_min:$subid_size" >"${subid_file}"
+		return
+	fi
+
+	readarray -t subid_entries <"${subid_file}"
+
+	# if a large enough subid config already exists for user $subid_user, there
+	# is nothing to do.
+ + for entry in "${subid_entries[@]}"; do + user=$(echo $entry | cut -d ":" -f1) + start=$(echo $entry | cut -d ":" -f2) + size=$(echo $entry | cut -d ":" -f3) + + if [[ "$user" == "$subid_user" ]] && [ "$size" -ge "$subid_size" ]; then + return + fi + done + + # Sort subid entries by start range + declare -a sorted_subids + if [ ${#subid_entries[@]} -gt 0 ]; then + readarray -t sorted_subids < <(echo "${subid_entries[@]}" | tr " " "\n" | tr ":" " " | sort -n -k 2) + fi + + # allocate a range of subid_alloc_range size + hole_start=$subid_min + + for entry in "${sorted_subids[@]}"; do + start=$(echo $entry | cut -d " " -f2) + size=$(echo $entry | cut -d " " -f3) + + hole_end=$start + + if [ $hole_end -ge $hole_start ] && [ $((hole_end - hole_start)) -ge $subid_size ]; then + echo "$subid_user:$hole_start:$subid_size" >>$subid_file + return + fi + + hole_start=$((start + size)) + done + + hole_end=$subid_max + if [ $((hole_end - hole_start)) -lt $subid_size ]; then + echo "failed to allocate $subid_size sub ids in range $subid_min:$subid_max" + return + else + echo "$subid_user:$hole_start:$subid_size" >>$subid_file + return + fi +} + +function config_crio_for_sysbox() { + echo "Adding Sysbox to CRI-O config ..." + + if [ ! -f ${host_crio_conf_file_backup} ]; then + cp ${host_crio_conf_file} ${host_crio_conf_file_backup} + fi + + # overlayfs with metacopy=on improves startup time of CRI-O rootless containers significantly + if ! dasel -n get string -f "${host_crio_conf_file}" -p toml -s 'crio.storage_option' | grep -q "metacopy=on"; then + dasel put string -f "${host_crio_conf_file}" -p toml -m 'crio.storage_driver' "overlay" + dasel put string -f "${host_crio_conf_file}" -p toml -m 'crio.storage_option.[]' "overlay.mountopt=metacopy=on" + fi + + # Add sysbox-runc and its monitoring-path. 
+	dasel put object -f "${host_crio_conf_file}" -p toml -t string -t string -t string "crio.runtime.runtimes.sysbox-runc" \
+		"runtime_path=/usr/bin/sysbox-runc" "runtime_type=oci" "monitor_path=/usr/local/bin/crio-conmon"
+
+	# Add sysbox-runc's allowed annotations.
+	dasel put string -f "${host_crio_conf_file}" -p toml "crio.runtime.runtimes.sysbox-runc.allowed_annotations.[0]" \
+		"io.kubernetes.cri-o.userns-mode"
+
+	# In Flatcar's case we must further adjust crio config.
+	if host_flatcar_distro; then
+		sed -i 's@/usr/bin/sysbox-runc@/opt/bin/sysbox-runc@' ${host_crio_conf_file}
+	fi
+}
+
+function unconfig_crio_for_sysbox() {
+	echo "Removing Sysbox from CRI-O config ..."
+
+	# Note: dasel does not yet have a proper delete command, so we need the "sed" below.
+	dasel put document -f "${host_crio_conf_file}" -p toml '.crio.runtime.runtimes.sysbox-runc' ''
+	sed -i "s/\[crio.runtime.runtimes.sysbox-runc\]//g" "${host_crio_conf_file}"
+}
+
+#
+# General Helper Functions
+#
+
+function die() {
+	msg="$*"
+	echo "ERROR: $msg" >&2
+	exit 1
+}
+
+function print_usage() {
+	echo "Usage: $0 [ce|ee] [install|cleanup]"
+}
+
+function get_k8s_version() {
+	# Declare first: "local v=$(...)" would make $? that of "local" (always 0).
+	local version
+	if ! version=$(kubectl get node $NODE_NAME -o jsonpath='{.status.nodeInfo.kubeletVersion}' | awk -F "." '{print $1 "." $2}'); then
+		die "invalid Kubernetes version"
+	fi
+
+	echo "$version"
+}
+
+function get_container_runtime() {
+	# Declare first so the guarded assignment's status (not "local"'s) is checked.
+	local runtime
+	if ! runtime=$(kubectl get node $NODE_NAME -o jsonpath='{.status.nodeInfo.containerRuntimeVersion}'); then
+		die "invalid node name"
+	fi
+	if echo "$runtime" | grep -qE 'containerd.*-k3s'; then
+		if systemctl is-active --quiet k3s-agent; then
+			echo "k3s-agent"
+		else
+			echo "k3s"
+		fi
+	else
+		echo "$runtime" | awk -F '[:]' '{print $1}'
+	fi
+}
+
+function get_host_distro() {
+	local distro_name=$(grep -w "^ID" "$host_os_release" | cut -d "=" -f2)
+	local version_id=$(grep -w "^VERSION_ID" "$host_os_release" | cut -d "=" -f2 | tr -d '"')
+	echo "${distro_name}-${version_id}"
+}
+
+function get_sys_arch() {
+	local uname_m=$(uname -m)
+
+	if [[ "$uname_m" == "x86_64" ]]; then
+		sys_arch=amd64
+	elif [[ "$uname_m" == "aarch64" ]]; then
+		sys_arch=arm64
+	elif [[ "$uname_m" == "arm" ]]; then
+		sys_arch=armhf
+	elif [[ "$uname_m" == "armel" ]]; then
+		sys_arch=armel
+	fi
+
+	echo "${sys_arch}"
+}
+
+function host_flatcar_distro() {
+	local distro=$(get_host_distro)
+	echo $distro | grep -q "flatcar"
+}
+
+function get_host_kernel() {
+	uname -r
+}
+
+function is_supported_distro() {
+
+	local distro=$os_distro_release
+
+	if [[ "$distro" == "ubuntu-22.04" ]] ||
+		[[ "$distro" == "ubuntu-21.10" ]] ||
+		[[ "$distro" == "ubuntu-20.04" ]] ||
+		[[ "$distro" == "ubuntu-18.04" ]] ||
+		[[ "$distro" =~ "debian" ]] ||
+		[[ "$distro" =~ "flatcar" ]]; then
+		return
+	fi
+
+	false
+}
+
+function is_supported_kernel() {
+
+	local kversion=$(echo $os_kernel_release | cut -d "." -f1-2)
+
+	# Ubuntu distro is supported starting with kernel 5.3+.
+	if [[ "$os_distro_release" =~ "ubuntu" ]]; then
+		if semver_lt $kversion 5.3; then
+			echo "Unsupported kernel version $kversion for Ubuntu distribution (< 5.3)."
+			return 1
+		fi
+
+		return 0
+	fi
+
+	# For all other distros, Sysbox requires 5.5+.
+	if semver_lt $kversion 5.5; then
+		echo "Unsupported kernel version $kversion for $os_distro_release distribution (< 5.5)."
+ return 1 + fi + + return 0 +} + +function is_supported_arch() { + if [[ "$sys_arch" == "amd64" ]] || [[ "$sys_arch" == "arm64" ]]; then + return + fi + + false +} + +function is_supported_k8s_version() { + + local ver=$k8s_version + + if [[ "$ver" == "v1.27" ]] || + [[ "$ver" == "v1.28" ]] || + [[ "$ver" == "v1.29" ]] || + [[ "$ver" == "v1.30" ]]; then + return + fi + + if [[ "$ver" == "v1.19" ]] || + [[ "$ver" == "v1.20" ]] || + [[ "$ver" == "v1.21" ]] || + [[ "$ver" == "v1.22" ]] || + [[ "$ver" == "v1.23" ]] || + [[ "$ver" == "v1.24" ]] || + [[ "$ver" == "v1.25" ]] || + [[ "$ver" == "v1.26" ]]; then + echo "Unsupported kubernetes version: $ver (EOL release)." + fi + + false +} + +function is_kernel_upgraded() { + local cur_kernel=$os_kernel_release + + if [ ! -f ${host_var_lib_sysbox_deploy_k8s}/os_kernel_release ]; then + false + return + fi + + local prev_kernel=$(cat ${host_var_lib_sysbox_deploy_k8s}/os_kernel_release) + if [[ ${cur_kernel} == ${prev_kernel} ]]; then + false + return + fi + + true +} + +function is_sysbox_upgraded() { + # If Sysbox hasn't been installed yet, then there's no upgrade needed. + if [ ! -f ${host_var_lib_sysbox_deploy_k8s}/sysbox_installed ]; then + false + return + fi + + # If the sysbox version file does not exist, then we must be dealing with an + # old Sysbox version ( 10#${ver2[i]})); then + return 1 + fi + if ((10#${ver1[i]} < 10#${ver2[i]})); then + return 2 + fi + done + + return 0 +} + +# Compare semantic versions; takes two semantic version numbers of the form +# x.y.z (or x.y), and returns 0 if the first is less than the +# second, and 1 otherwise. +function semver_lt() { + version_compare $1 $2 + if [ "$?" -eq "2" ]; then + return 0 + else + return 1 + fi +} + +# Compare semantic versions; takes two semantic version numbers of the form +# x.y.z (or x.y), and returns 0 if the first is greater than or equal to the +# second, and 1 otherwise. +function semver_ge() { + version_compare $1 $2 + if [ "$?" 
-ne "2" ]; then + return 0 + else + return 1 + fi +} + +function do_edition_adjustments() { + local edition_tag=$1 + + # Set the Sysbox edition name being installed. + if [[ ${edition_tag} == "ce" ]]; then + sysbox_edition="Sysbox" + elif [[ ${edition_tag} == "ee" ]]; then + sysbox_edition="Sysbox-EE" + else + print_usage + die "invalid sysbox edition value: $edition_tag" + fi +} + +# Function holds all the adjustments that need to be carried out to meet +# distro-specific requirements. For example, in Flatcar's case these special +# requirements are a consequence of its particular partition scheme (read-only +# /usr). For readability and maintainability purposes, we opted by placing this +# adjustment logic away from the natural location where each file component is +# utilized, so we must keep this point in mind if the files being edited here +# were to be modified prior to the invocation of this routine. +function do_distro_adjustments() { + + local distro=$(get_host_distro) + if [[ ! ${distro} =~ "flatcar" ]]; then + return + fi + + # Ensure that Flatcar installation proceeds only in Sysbox-EE case. + if [[ ${sysbox_edition} != "Sysbox-EE" ]]; then + die "Flatcar OS distribution is only supported on Sysbox Enterprise-Edition. Exiting ..." + fi + + # Adjust global vars. + host_bin="/mnt/host/opt/bin" + host_local_bin="/mnt/host/opt/local/bin" + host_systemd="/mnt/host/etc/systemd/system" + host_sysctl="/mnt/host/opt/lib/sysctl.d" + host_lib_mod="/mnt/host/opt/lib/modules-load.d" + + # Ensure that required folders are already present. + mkdir -p ${host_bin} ${host_local_bin} ${host_systemd} ${host_sysctl} ${host_lib_mod} + + # Adjust crio helper scripts and services. 
+ sed -i 's@/usr/local/bin/crio@/opt/local/bin/crio@g' ${crio_artifacts}/systemd/crio-installer.service + sed -i '/Type=oneshot/a Environment=PATH=/opt/local/bin:/sbin:/bin:/usr/sbin:/usr/bin' ${crio_artifacts}/systemd/crio-removal.service + sed -i 's@/usr/local/bin/crio@/opt/local/bin/crio@g' ${crio_artifacts}/systemd/crio-removal.service + + # Adjust kubelet helper scripts and services. + sed -i '/^ExecStart=/ s@/usr/local/bin@/opt/local/bin@' ${crio_artifacts}/systemd/kubelet-config-helper.service + sed -i '/^ExecStart=/ s@/usr/local/bin@/opt/local/bin@' ${crio_artifacts}/systemd/kubelet-unconfig-helper.service + sed -i '/^crictl_bin/ s@/usr/local/bin@/opt/local/bin@' ${crio_artifacts}/scripts/kubelet-config-helper.sh + sed -i '/^crictl_bin/ s@/usr/local/bin@/opt/local/bin@' ${crio_artifacts}/scripts/kubelet-unconfig-helper.sh + + # Adjust sysbox helper scripts and services. + sed -i '/Type=notify/a Environment=PATH=/opt/bin:/sbin:/bin:/usr/sbin:/usr/bin' ${sysbox_artifacts}/systemd/sysbox-mgr.service + sed -i '/^ExecStart=/ s@/usr/bin/sysbox-mgr@/opt/bin/sysbox-mgr@' ${sysbox_artifacts}/systemd/sysbox-mgr.service + sed -i '/^ExecStart=/ s@/usr/bin/sysbox-fs@/opt/bin/sysbox-fs@' ${sysbox_artifacts}/systemd/sysbox-fs.service + sed -i '/Type=notify/a Environment=PATH=/opt/bin:/sbin:/bin:/usr/sbin:/usr/bin' ${sysbox_artifacts}/systemd/sysbox-fs.service + sed -i '/^ExecStart=/ s@/usr/bin@/opt/bin@g' ${sysbox_artifacts}/systemd/sysbox.service + sed -i '/^ExecStart=/ s@/usr/local/bin@/opt/local/bin@g' ${sysbox_artifacts}/systemd/sysbox-installer-helper.service + sed -i '/^ExecStart=/ s@/usr/local/bin@/opt/local/bin@g' ${sysbox_artifacts}/systemd/sysbox-removal-helper.service + + # Sysctl adjustments. 
+ sed -i '/^kernel.unprivileged_userns_clone/ s/^#*/# /' ${sysbox_artifacts}/systemd/99-sysbox-sysctl.conf +} + +# determines if running on a GKE cluster by checking metadata endpoint +function check_is_gke() { + # GKE nodes will respond with an HTTP 200 for this URL and Metadata-Flavor header. + # Other clouds, URLs, etc. will throw a 404 error. If curl cannot connect the code will be 000. + is_cluster=$(curl -s -o /dev/null \ + -w "%{http_code}" \ + --connect-timeout 1 \ + -H "Metadata-Flavor: Google" \ + 169.254.169.254/computeMetadata/v1/instance/attributes/cluster-name) + if [ $is_cluster -ne 200 ]; then + false + return + fi + true +} + +# Fixes an issue with crio network bridge on GKE not working. +# Also adds correct path to k8s binaries on GKE nodes in /home/kubernetes/bin +function config_crio_for_gke() { + rm -rf ${host_etc}/cni/net.d/100-crio-bridge.conf + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.network.plugin_dirs.[]' "/opt/cni/bin/" + dasel put string -f ${host_crio_conf_file} -p toml -m 'crio.network.plugin_dirs.[]' "/home/kubernetes/bin" +} + +# Delete all sysbox pods in the node to ensure that the newly installed/updated sysbox runtime +# is used. This is needed because the sysbox pods are not automatically restarted when the sysbox +# runtime is updated, and these may not work correctly otherwise. Similarly, this routine is also +# needed after sysbox removal to ensure that the sysbox pods are not left running. +function delete_sysbox_pods() { + + # Turn off errexit in these steps as we don't want to interrupt the installation/update + # process if any of the pods fail to be deleted for any reason. + set +e + + echo "Deleting all the preexisting sysbox pods in the node..." 
+ kubectl get pod -A -o json | jq -r --arg node "$NODE_NAME" '.items[] | + select(.spec.runtimeClassName=="sysbox-runc" and .spec.nodeName==$node) | "\(.metadata.name) \(.metadata.namespace)"' | \ + while read name namespace; do \ + echo "Deleting sysbox pod ${name} from namespace ${namespace}..."; \ + kubectl delete pod/${name} -n ${namespace} --grace-period=1; \ + done + + set -e +} + +# +# Main Function +# + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root" + fi + + os_distro_release=$(get_host_distro) + if ! is_supported_distro; then + die "Sysbox is not supported on this host's distro ($os_distro_release)". + fi + + os_kernel_release=$(get_host_kernel) + if ! is_supported_kernel; then + die "Sysbox is not supported on this host's kernel release ($os_kernel_release)". + fi + + sys_arch=$(get_sys_arch) + if ! is_supported_arch; then + die "Sysbox is not supported on this platform architecture ($sys_arch)". + fi + + k8s_version=$(get_k8s_version) + if ! is_supported_k8s_version; then + die "Sysbox is not supported on this Kubernetes version ($k8s_version)". + fi + + k8s_runtime=$(get_container_runtime) + if [[ $k8s_runtime == "" ]]; then + die "Failed to detect K8s node runtime." + elif [ "$k8s_runtime" == "cri-o" ]; then + k8s_runtime="crio" + fi + + k8s_taints=${SYSBOX_TAINT:-"sysbox-runtime=not-running:NoSchedule"} + + echo "Detected Kubernetes version $k8s_version" + + local edition_tag=${1:-} + if [ -z "$edition_tag" ]; then + print_usage + die "invalid arguments" + fi + + # Adjust env-vars associated to the Sysbox product edition being (un)installed. + do_edition_adjustments $edition_tag + + local action=${2:-} + if [ -z "$action" ]; then + print_usage + die "invalid arguments" + fi + + # Perform distro-specific adjustments. 
+ do_distro_adjustments + + local crio_restart_pending=false + + case "$action" in + install) + mkdir -p ${host_var_lib_sysbox_deploy_k8s} + install_precheck + + # Prevent new pods being scheduled till sysbox installation/update is completed. + add_taint_to_node "${k8s_taints}" + + # Install CRI-O + if [[ "$do_crio_install" == "true" ]]; then + add_label_to_node "crio-runtime=installing" + deploy_crio_installer_service + remove_crio_installer_service + config_crio + # if running on GKE patch the CRI-O config + if $(check_is_gke); then + echo "Configuring CRI-O for GKE" + config_crio_for_gke + fi + crio_restart_pending=true + echo "yes" >${host_var_lib_sysbox_deploy_k8s}/crio_installed + fi + + # Install or update Sysbox + if [[ "$do_sysbox_install" == "true" ]] || + [[ "$do_sysbox_update" == "true" ]]; then + add_label_to_node "sysbox-runtime=installing" + install_sysbox_deps + install_sysbox + # Adjust cri-o's config to account for Sysbox presence and tag its restart flag + # accordingly. Notice that this is only needed if cri-o is being installed, there's + # no need to do anything in case of sysbox being updated. + if [[ "$do_crio_install" == "true" ]]; then + config_crio_for_sysbox + crio_restart_pending=true + fi + echo "yes" >${host_var_lib_sysbox_deploy_k8s}/sysbox_installed + echo "$os_kernel_release" >${host_var_lib_sysbox_deploy_k8s}/os_kernel_release + fi + + if [[ "$crio_restart_pending" == "true" ]]; then + restart_crio + fi + + # Switch the K8s runtime to CRI-O. + # + # Note: this will configure the Kubelet to use CRI-O and restart it, + # thereby killing all pods on the K8s node (including this daemonset). + # The K8s control plane will then re-create the pods, but this time + # with CRI-O. The operation can take up to 1 minute. 
+ if [[ "$k8s_runtime" != "crio" ]]; then + echo "yes" >${host_var_lib_sysbox_deploy_k8s}/kubelet_reconfigured + deploy_kubelet_config_service + fi + + # Kubelet config service cleanup + if [ -f ${host_var_lib_sysbox_deploy_k8s}/kubelet_reconfigured ]; then + remove_kubelet_config_service + rm -f ${host_var_lib_sysbox_deploy_k8s}/kubelet_reconfigured + echo "Kubelet reconfig completed." + fi + + # Remove all the sysbox pods in the node to ensure that the newly installed/updated + # sysbox binaries are used. No action will be taken if this daemon-set is being updated + # with no changes to the sysbox runtime or any of its dependencies. + if [[ "$do_sysbox_install" == "true" ]] || + [[ "$do_sysbox_update" == "true" ]] || + [[ "$sysbox_install_in_progress" == "true" ]]; then + delete_sysbox_pods + fi + + add_label_to_node "crio-runtime=running" + add_label_to_node "sysbox-runtime=running" + rm_taint_from_node "${k8s_taints}" + + if [[ "$do_sysbox_install" == "true" ]] || [[ "$sysbox_install_in_progress" == "true" ]]; then + echo "The k8s runtime on this node is now CRI-O with Sysbox." + echo "$sysbox_edition installation completed (version $sysbox_version)." + elif [[ "$do_sysbox_update" == "true" ]]; then + echo "$sysbox_edition update completed (version $sysbox_version)." + fi + ;; + + cleanup) + mkdir -p ${host_var_lib_sysbox_deploy_k8s} + + # Prevent new pods being scheduled during sysbox cleanup phase. 
+ add_taint_to_node "${k8s_taints}" + + # Switch the K8s runtime away from CRI-O (but only if this daemonset installed CRI-O previously) + if [ -f ${host_var_lib_sysbox_deploy_k8s}/crio_installed ] && [[ "$k8s_runtime" == "crio" ]]; then + add_label_to_node "crio-runtime=removing" + + # Note: this will restart kubelet with the prior runtime (not + # CRI-O), thereby killing all pods (including this daemonset) + echo "yes" >${host_var_lib_sysbox_deploy_k8s}/kubelet_reconfigured + deploy_kubelet_unconfig_service + fi + + if [ -f ${host_var_lib_sysbox_deploy_k8s}/kubelet_reconfigured ]; then + remove_kubelet_unconfig_service + rm -f ${host_var_lib_sysbox_deploy_k8s}/kubelet_reconfigured + echo "Kubelet reconfig completed." + fi + + # Uninstall Sysbox + if [ -f ${host_var_lib_sysbox_deploy_k8s}/sysbox_installed ]; then + add_label_to_node "sysbox-runtime=removing" + unconfig_crio_for_sysbox + remove_sysbox + remove_sysbox_deps + crio_restart_pending=true + rm -f ${host_var_lib_sysbox_deploy_k8s}/sysbox_installed + rm -f ${host_var_lib_sysbox_deploy_k8s}/os_kernel_release + rm_label_from_node "sysbox-runtime" + echo "$sysbox_edition removal completed." + fi + + # Uninstall CRI-O + if [ -f ${host_var_lib_sysbox_deploy_k8s}/crio_installed ]; then + deploy_crio_removal_service + remove_crio_removal_service + crio_restart_pending=false + rm -f ${host_var_lib_sysbox_deploy_k8s}/crio_installed + rm_label_from_node "crio-runtime" + fi + + rm -rf ${host_var_lib_sysbox_deploy_k8s} + + if [[ "$crio_restart_pending" == "true" ]]; then + restart_crio + fi + + # Remove all the sysbox pods in the node to ensure that no sysbox pods are + # left behind in an inconsistent state. + delete_sysbox_pods + + rm_taint_from_node "${k8s_taints}" + + echo "The k8s runtime on this node is now $k8s_runtime." + ;; + + *) + echo invalid arguments + print_usage + ;; + esac + + # This script will be called as a daemonset. 
Do not return, otherwise the + # daemonset will restart and re-execute the script + echo "Done." + + sleep infinity +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/sysbox-installer-helper.sh b/sysbox-pkgr/k8s/scripts/sysbox-installer-helper.sh new file mode 100755 index 00000000..6c8a3de4 --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/sysbox-installer-helper.sh @@ -0,0 +1,223 @@ +#!/bin/bash + +# +# Copyright 2019-2021 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Helper script to install Sysbox dependencies on host (e.g. shiftfs, rsync, etc.) +# + +set -o errexit +set -o pipefail +set -o nounset + +shiftfs_dkms=/run/shiftfs-dkms + +function die() { + msg="$*" + echo "ERROR: $msg" >&2 + exit 1 +} + +# Compare two versions in SemVer format. +# +# Examples: (1.0.1, 1.0.1) = 0 +# (1.0.1, 1.0.2) = 2 +# (1.0.1, 1.0.0) = 1 +# (1, 1.0) = 0 +# (3.0.4.10, 3.0.4.2) = 1 +# (5.0.0-22, 5.0.0-22) = 0 +# (5.0.0-22, 5.0.0-21) = 1 +# (5.0.0-21, 5.0.0-22) = 2 +# +function version_compare() { + + if [[ $1 == $2 ]]; then + return 0 + fi + + local IFS='.|-' + local i ver1=($1) ver2=($2) + + # Fill empty fields in ver1 with zeros. + for ((i = ${#ver1[@]}; i < ${#ver2[@]}; i++)); do + ver1[i]=0 + done + + for ((i = 0; i < ${#ver1[@]}; i++)); do + if [[ -z ${ver2[i]} ]]; then + # Fill empty fields in ver2 with zeros. 
+ ver2[i]=0 + fi + if ((10#${ver1[i]} > 10#${ver2[i]})); then + return 1 + fi + if ((10#${ver1[i]} < 10#${ver2[i]})); then + return 2 + fi + done + + return 0 +} + +# Compare semantic versions; takes two semantic version numbers of the form +# x.y.z (or x.y), and returns 0 if the first is greater than or equal to the +# second, and 1 otherwise. +function semver_ge() { + version_compare $1 $2 + if [ "$?" -ne "2" ]; then + return 0 + else + return 1 + fi +} + +function install_package_deps() { + + # Need this to work-around "E: dpkg was interrupted, you must manually run 'dpkg --configure -a' to correct the problem." + dpkg --configure -a + + # Certificates package is required prior to running apt-update. + apt-get -y install ca-certificates + apt-get update + apt-get install -y rsync fuse iptables +} + +function install_shiftfs() { + # If shiftfs is not needed, skip + if ! shiftfs_needed; then + echo "Skipping shiftfs installation (kernel has id-mapped mounts support)." + return + fi + + # If shiftfs is already installed, skip + if shiftfs_installed; then + echo "Skipping shiftfs installation (it's already installed)." + return + fi + + echo "Installing Shiftfs ..." + + apt-get install -y make dkms + sh -c "cd $shiftfs_dkms && make -f Makefile.dkms" + + if ! shiftfs_installed; then + echo "Shiftfs installation failed!" + return + fi + + apt-get remove --purge -y make dkms + echo "Shiftfs installation done." +} + +function shiftfs_installed() { + modinfo shiftfs >/dev/null 2>&1 +} + +function shiftfs_needed() { + # shiftfs is not needed for kernels >= 5.19 where idmapped mounts are present + # and stable, but is still recommended if it is available. the max supported + # version for shiftfs is 6.2, so check against that here + local kversion=$(uname -r | cut -d "." -f1-2) + + if semver_ge $kversion 6.2; then + # not needed + return 1 + else + return 0 + fi +} + +function probe_kernel_mods() { + + echo "Probing kernel modules ..." 
+ + if shiftfs_needed; then + # If provided by the caller, load the passed shiftfs module, otherwise assume + # that this one is already present in the system's default modules location. + local shiftfs_module=${1:-} + if [ ! -z "${shiftfs_module}" ]; then + if ! lsmod | grep -q shiftfs; then + insmod ${shiftfs_module} + fi + else + modprobe shiftfs + fi + fi + + # Ensure that configfs is loaded regardless of the running kernel version. Notice that + # we're not enforcing this requirement, and we're simply dumping a log to the user if + # configfs is not present. + if modprobe configfs && ! mount | grep -q configfs; then + echo -e "\nConfigfs kernel module is not loaded. Configfs may be required " \ + "by certain applications running inside a Sysbox container.\n" + fi + + if shiftfs_needed && ! lsmod | grep -q shiftfs; then + echo -e "\nShiftfs kernel module is not loaded. Shiftfs is required " \ + "for host volume mounts into Sysbox containers to have proper ownership " \ + "(user-ID and group-ID).\n" + fi +} + +function flatcar_distro() { + grep -q "^ID=flatcar" /etc/os-release +} + +function check_procfs_mount_userns() { + + # Attempt to mount procfs from a user-namespace. + if unshare -U -p -f --mount-proc -r cat /dev/null; then + return 0 + fi + + # Find out if there's anything we can do to workaround this situation. In certain + # scenarios (e.g. Flatcar >= 3033.2.4), a fake (bind-mounted) '/proc/cmdline' can + # prevent procfs from being mounted within a user-namespace. In these cases we'll + # attempt to unmount this resource and try again. + if mount | egrep -q "cmdline" && umount /proc/cmdline; then + if unshare -U -p -f --mount-proc -r cat /dev/null; then + return 0 + fi + fi + + return 1 +} + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root" + fi + + if ! check_procfs_mount_userns; then + die "Sysbox unmet requirement: node is unable to mount procfs from within unprivileged user-namespaces." 
+ fi + + # In flatcar's case the shiftfs module is explicitly provided by the installer + # itself. + if flatcar_distro; then + probe_kernel_mods "/opt/lib/modules-load.d/shiftfs.ko" + return + fi + + install_package_deps + install_shiftfs + probe_kernel_mods +} + +main "$@" diff --git a/sysbox-pkgr/k8s/scripts/sysbox-removal-helper.sh b/sysbox-pkgr/k8s/scripts/sysbox-removal-helper.sh new file mode 100755 index 00000000..92394bc1 --- /dev/null +++ b/sysbox-pkgr/k8s/scripts/sysbox-removal-helper.sh @@ -0,0 +1,64 @@ +#!/bin/bash + +# +# Copyright 2019-2021 Nestybox, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +# Helper script to remove Sysbox dependencies on host (e.g. shiftfs, rsync, etc.) +# + +set -o errexit +set -o pipefail +set -o nounset + +shiftfs_dkms=/run/shiftfs-dkms + +function die() { + msg="$*" + echo "ERROR: $msg" >&2 + exit 1 +} + +function remove_shiftfs() { + + if ! shiftfs_installed; then + echo "Skipping shiftfs uninstallation (shiftfs is not installed)." + return + fi + + if lsmod | grep shiftfs; then + echo "Removing Shiftfs ..." + rmmod shiftfs + fi + + echo "Shiftfs removal done." 
+} + +function shiftfs_installed() { + modinfo shiftfs > /dev/null 2>&1 +} + +function main() { + + euid=$(id -u) + if [[ $euid -ne 0 ]]; then + die "This script must be run as root" + fi + + remove_shiftfs +} + +main "$@" diff --git a/sysbox-pkgr/k8s/systemd/50-sysbox-mod.conf b/sysbox-pkgr/k8s/systemd/50-sysbox-mod.conf new file mode 100644 index 00000000..a76c19fc --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/50-sysbox-mod.conf @@ -0,0 +1,2 @@ +# Required by Sysbox to allow dummy configfs folders to be exposed inside Sysbox containers. +configfs diff --git a/sysbox-pkgr/k8s/systemd/99-sysbox-sysctl.conf b/sysbox-pkgr/k8s/systemd/99-sysbox-sysctl.conf new file mode 100644 index 00000000..ce903bc2 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/99-sysbox-sysctl.conf @@ -0,0 +1,38 @@ +# Sysbox's sysctl.d requirements + +# Enable user-namespaces in kernel. +kernel.unprivileged_userns_clone = 1 + +# Increase default "inotify" kernel resources for scaling purposes. +fs.inotify.max_queued_events = 1048576 +fs.inotify.max_user_watches = 1048576 +fs.inotify.max_user_instances = 1048576 + +# Increase keyring max-keys for scaling purposes. +# +# In debian-based distros (e.g., Ubuntu) the kernel keyring max keys +# limit is set to 200 for non-root users. This is too low for +# Sysbox. For example, for a sys container based K8s cluster, the +# number of keys required is: +# +# keys = 35 + (k8s_workers * 23) + (2 * pods) +# +# That is, a 10-node cluster would need 282 keys. +# +# In a large bare-metal machine, we expect ~100 sys containers. That would +# consume ~11K keys. To be conservative, we set maxkeys to 20K. Note that since +# each key consumes ~70 bytes on average, the total mem consumption assuming all +# 20K keys are used is 20K * 70 = 1.4MB. 
+kernel.keys.maxkeys = 20000 +kernel.keys.maxbytes = 1400000 + +# Increase the kernel's max PID limit to its max value since Sysbox containers +# are often used as VM-like environments and can consume hundreds or thousands +# of PIDs each. For Sysbox deployments on K8s, we limit each pod to 16K pids via +# the CRI-O config file. For Sysbox deployments in Docker hosts, use Docker's +# "--pids-limit" option to fix this limit. +kernel.pid_max = 4194304 + +# Increases the maximum number of memory map areas a process may have. This is +# a requirement for mmap-demanding apps such as Elasticsearch. +vm.max_map_count=262144 \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/crio-installer.service b/sysbox-pkgr/k8s/systemd/crio-installer.service new file mode 100644 index 00000000..50f4feb9 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/crio-installer.service @@ -0,0 +1,7 @@ +[Unit] +Description=CRI-O installer service + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/usr/local/bin/crio-installer.sh" +RemainAfterExit=true \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/crio-removal.service b/sysbox-pkgr/k8s/systemd/crio-removal.service new file mode 100644 index 00000000..bae7a299 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/crio-removal.service @@ -0,0 +1,7 @@ +[Unit] +Description=CRI-O removal service + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/usr/local/bin/crio-removal.sh" +RemainAfterExit=true \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/kubelet-config-helper.service b/sysbox-pkgr/k8s/systemd/kubelet-config-helper.service new file mode 100644 index 00000000..66161dbf --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/kubelet-config-helper.service @@ -0,0 +1,7 @@ +[Unit] +Description=Kubelet config service + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/usr/local/bin/kubelet-config-helper.sh" +RemainAfterExit=true \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/kubelet-unconfig-helper.service 
b/sysbox-pkgr/k8s/systemd/kubelet-unconfig-helper.service new file mode 100644 index 00000000..f3608121 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/kubelet-unconfig-helper.service @@ -0,0 +1,7 @@ +[Unit] +Description=Kubelet unconfig service + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/usr/local/bin/kubelet-unconfig-helper.sh" +RemainAfterExit=true \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/sysbox-fs.service b/sysbox-pkgr/k8s/systemd/sysbox-fs.service new file mode 100644 index 00000000..c7c01f31 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/sysbox-fs.service @@ -0,0 +1,23 @@ +[Unit] +Description=sysbox-fs (part of the Sysbox container runtime) +PartOf=sysbox.service +After=sysbox-mgr.service + +[Service] +Type=simple +Type=notify +ExecStart=/usr/bin/sysbox-fs +TimeoutStartSec=10 +TimeoutStopSec=10 +StartLimitInterval=0 +NotifyAccess=main +OOMScoreAdjust=-500 + +# The number of files opened by sysbox-fs is a function of the number of +# containers and the workloads within them. Thus we set the limit to +# infinite so to prevent "too many open files" errors. 
+LimitNOFILE=infinity +LimitNPROC=infinity + +[Install] +WantedBy=sysbox.service diff --git a/sysbox-pkgr/k8s/systemd/sysbox-installer-helper.service b/sysbox-pkgr/k8s/systemd/sysbox-installer-helper.service new file mode 100644 index 00000000..7bc98b86 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/sysbox-installer-helper.service @@ -0,0 +1,7 @@ +[Unit] +Description=Sysbox installer helper service + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/usr/local/bin/sysbox-installer-helper.sh" +RemainAfterExit=true \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/sysbox-mgr.service b/sysbox-pkgr/k8s/systemd/sysbox-mgr.service new file mode 100644 index 00000000..2793c64b --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/sysbox-mgr.service @@ -0,0 +1,22 @@ +[Unit] +Description=sysbox-mgr (part of the Sysbox container runtime) +PartOf=sysbox.service + +[Service] +Type=simple +Type=notify +ExecStart=/usr/bin/sysbox-mgr --disable-inner-image-preload +TimeoutStartSec=45 +TimeoutStopSec=90 +StartLimitInterval=0 +NotifyAccess=main +OOMScoreAdjust=-500 + +# The number of files opened by sysbox-mgr is a function of the number of +# containers and the size of the rootfs within them. Thus we set the limit to +# infinite so to prevent "too many open files" errors. 
+LimitNOFILE=infinity +LimitNPROC=infinity + +[Install] +WantedBy=sysbox.service diff --git a/sysbox-pkgr/k8s/systemd/sysbox-removal-helper.service b/sysbox-pkgr/k8s/systemd/sysbox-removal-helper.service new file mode 100644 index 00000000..fce3b0bd --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/sysbox-removal-helper.service @@ -0,0 +1,7 @@ +[Unit] +Description=Sysbox removal helper service + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/usr/local/bin/sysbox-removal-helper.sh" +RemainAfterExit=true \ No newline at end of file diff --git a/sysbox-pkgr/k8s/systemd/sysbox.service b/sysbox-pkgr/k8s/systemd/sysbox.service new file mode 100644 index 00000000..4f040af0 --- /dev/null +++ b/sysbox-pkgr/k8s/systemd/sysbox.service @@ -0,0 +1,20 @@ +# Sysbox unit to act as a wrapper of sysbox's inner components/daemons: +# sysbox-mgr and sysbox-fs. + +[Unit] +Description=Sysbox container runtime +Documentation=https://github.com/nestybox/sysbox +BindsTo=sysbox-mgr.service sysbox-fs.service +After=sysbox-mgr.service sysbox-fs.service + +# Must start before Docker/containerd to ensure "docker --restart" works +# properly with Sysbox. +Before=docker.service containerd.service + +[Service] +Type=exec +ExecStart=/bin/sh -c "/usr/bin/sysbox-runc --version && /usr/bin/sysbox-mgr --version && /usr/bin/sysbox-fs --version && /bin/sleep infinity" + +[Install] +# Components of this application should be started at boot time +WantedBy=multi-user.target diff --git a/sysbox-pkgr/rpm/Makefile b/sysbox-pkgr/rpm/Makefile new file mode 100644 index 00000000..42bb433b --- /dev/null +++ b/sysbox-pkgr/rpm/Makefile @@ -0,0 +1,128 @@ +# +# Sysbox RPM Packager Makefile +# + +.PHONY: help \ + rpm \ + centos \ + centos-8 \ + fedora \ + fedora-31 \ + fedora-32 \ + clean + +SHELL:=/bin/bash +ARCH:=$(shell uname -m) + +# Go version to utilize in slave containers for the building process. +# We are temporarily skipping these settings as we are relying on Go's +# top-of-tree (till 1.13 comes out). 
+GO_BASE_IMAGE=golang +GO_VERSION:=1.13 +GO_IMAGE=$(GO_BASE_IMAGE):$(GO_VERSION) + +# Sysbox source-code locations. +ifeq ($(EDITION),ce) + SYSBOX_DIR := $(CURDIR)/../sources/sysbox +else ifeq ($(EDITION),ee) + SYSBOX_DIR := $(CURDIR)/../sources/sysbox-internal +else + echo "Unsupported Sysbox edition: $(EDITION)" + exit 1 +endif + +# Sysbox component locations. +SYSBOX_IMAGE_SYSTEMD := ../systemd + +SOURCE_FILES = sysbox.tgz \ + sysbox.service \ + sysbox-fs.service \ + sysbox-mgr.service \ + 99-sysbox-sysctl.conf \ + 50-sysbox-mod.conf + +SOURCES=$(addprefix sources/, $(SOURCE_FILES)) + +CHOWN:=docker run --rm -v $(CURDIR):/v -w /v alpine chown + +DOCKER_BUILD=docker build \ + --build-arg GO_IMAGE=$(GO_IMAGE) \ + --build-arg COMMON_FILES=$(SYSBOX_IMAGE_COMMON) \ + -t rpmbuild-$@/$(ARCH) \ + -f $(CURDIR)/$@/Dockerfile . + +DOCKER_RUN=docker run --rm -i \ + -e EDITION \ + -v $(CURDIR)/rpmbuild/$@:/build \ + -v $(GOPATH)/pkg/mod:/go/pkg/mod \ + rpmbuild-$@/$(ARCH) + + +help: + @awk 'BEGIN {FS = ":.*##"; printf "\n\033[1mUsage:\n \ + make \033[36m\033[0m\n"} \ + /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-25s\033[0m %s\n", $$1, $$2 } /^##@/ \ + { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) + +##@ RPM building targets + +all: ## Build all RPM packages +all: centos fedora + +centos: ## Build Centos packages +centos: centos-8 + +centos-8: ## Build Centos-8 package +centos-8: $(SOURCES) + $(DOCKER_BUILD) + $(DOCKER_RUN) + $(CHOWN) -R $(shell id -u):$(shell id -g) rpmbuild/$@ + +fedora: ## Build Fedora packages +fedora: fedora-31 fedora-32 + +fedora-31: ## Build Fedora-31 package +fedora-31: $(SOURCES) + $(DOCKER_BUILD) + $(DOCKER_RUN) + $(CHOWN) -R $(shell id -u):$(shell id -g) rpmbuild/$@ + +fedora-32: ## Build Fedora-32 packages +fedora-32: $(SOURCES) + $(DOCKER_BUILD) + $(DOCKER_RUN) + $(CHOWN) -R $(shell id -u):$(shell id -g) rpmbuild/$@ + +clean: ## Remove build artifacts + @[ ! 
-d rpmbuild ] || $(CHOWN) -R $(shell id -u):$(shell id -g) rpmbuild + @$(RM) -r rpmbuild + @[ ! -d sources ] || $(CHOWN) -R $(shell id -u):$(shell id -g) sources + @$(RM) -r sources + +sources/sysbox.tgz: + mkdir -p $(@D) + docker run --rm -i -w /v \ + -v $(SYSBOX_DIR):/sysbox \ + -v $(CURDIR)/$(@D):/v \ + alpine \ + tar -C / -czf /v/sysbox.tgz --exclude .git --exclude='sysbox-pkgr' sysbox + +sources/sysbox.service: ../systemd/sysbox.service + mkdir -p $(@D) + cp $< $@ + +sources/sysbox-fs.service: ../systemd/sysbox-fs.service + mkdir -p $(@D) + cp $< $@ + +sources/sysbox-mgr.service: ../systemd/sysbox-mgr.service + mkdir -p $(@D) + cp $< $@ + +sources/99-sysbox-sysctl.conf: ../systemd/99-sysbox-sysctl.conf + mkdir -p $(@D) + cp $< $@ + +sources/50-sysbox-mod.conf: ../systemd/50-sysbox-mod.conf + mkdir -p $(@D) + cp $< $@ diff --git a/sysbox-pkgr/systemd/50-sysbox-mod.conf b/sysbox-pkgr/systemd/50-sysbox-mod.conf new file mode 100644 index 00000000..a76c19fc --- /dev/null +++ b/sysbox-pkgr/systemd/50-sysbox-mod.conf @@ -0,0 +1,2 @@ +# Required by Sysbox to allow dummy configfs folders to be exposed inside Sysbox containers. +configfs diff --git a/sysbox-pkgr/systemd/99-sysbox-sysctl.conf b/sysbox-pkgr/systemd/99-sysbox-sysctl.conf new file mode 100644 index 00000000..f18b3588 --- /dev/null +++ b/sysbox-pkgr/systemd/99-sysbox-sysctl.conf @@ -0,0 +1,34 @@ +# Sysbox's sysctl.d requirements + +# Enable user-namespaces in kernel. +kernel.unprivileged_userns_clone = 1 + +# Increase default "inotify" kernel resources for scaling purposes. +fs.inotify.max_queued_events = 1048576 +fs.inotify.max_user_watches = 1048576 +fs.inotify.max_user_instances = 1048576 + +# Increase keyring max-keys for scaling purposes. +# +# In debian-based distros (e.g., Ubuntu) the kernel keyring max keys +# limit is set to 200 for non-root users. This is too low for +# Sysbox. 
For example, for a sys container based K8s cluster, the +# number of keys required is: +# +# keys = 35 + (k8s_workers * 23) + (2 * pods) +# +# That is, a 10-node cluster would need 282 keys. +# +# In a large bare-metal machine, we expect ~100 sys containers. That would +# consume ~11K keys. To be conservative, we set maxkeys to 20K. Note that since +# each key consumes ~70 bytes on average, the total mem consumption assuming all +# 20K keys are used is 20K * 70 = 1.4MB. +kernel.keys.maxkeys = 20000 +kernel.keys.maxbytes = 1400000 + +# Increase the kernel's max PID limit to its max value since Sysbox containers +# are often used as VM-like environments and can consume hundreds or thousands +# of PIDs each. For Sysbox deployments on K8s, we limit each pod to 16K pids via +# the CRI-O config file. For Sysbox deployments in Docker hosts, use Docker's +# "--pids-limit" option to fix this limit. +kernel.pid_max = 4194304 \ No newline at end of file diff --git a/sysbox-pkgr/systemd/sysbox-fs.service b/sysbox-pkgr/systemd/sysbox-fs.service new file mode 100644 index 00000000..c7c01f31 --- /dev/null +++ b/sysbox-pkgr/systemd/sysbox-fs.service @@ -0,0 +1,23 @@ +[Unit] +Description=sysbox-fs (part of the Sysbox container runtime) +PartOf=sysbox.service +After=sysbox-mgr.service + +[Service] +Type=simple +Type=notify +ExecStart=/usr/bin/sysbox-fs +TimeoutStartSec=10 +TimeoutStopSec=10 +StartLimitInterval=0 +NotifyAccess=main +OOMScoreAdjust=-500 + +# The number of files opened by sysbox-fs is a function of the number of +# containers and the workloads within them. Thus we set the limit to +# infinite so to prevent "too many open files" errors. 
+LimitNOFILE=infinity +LimitNPROC=infinity + +[Install] +WantedBy=sysbox.service diff --git a/sysbox-pkgr/systemd/sysbox-mgr.service b/sysbox-pkgr/systemd/sysbox-mgr.service new file mode 100644 index 00000000..b3720163 --- /dev/null +++ b/sysbox-pkgr/systemd/sysbox-mgr.service @@ -0,0 +1,22 @@ +[Unit] +Description=sysbox-mgr (part of the Sysbox container runtime) +PartOf=sysbox.service + +[Service] +Type=simple +Type=notify +ExecStart=/usr/bin/sysbox-mgr +TimeoutStartSec=45 +TimeoutStopSec=90 +StartLimitInterval=0 +NotifyAccess=main +OOMScoreAdjust=-500 + +# The number of files opened by sysbox-mgr is a function of the number of +# containers and the size of the rootfs within them. Thus we set the limit to +# infinite so to prevent "too many open files" errors. +LimitNOFILE=infinity +LimitNPROC=infinity + +[Install] +WantedBy=sysbox.service diff --git a/sysbox-pkgr/systemd/sysbox.service b/sysbox-pkgr/systemd/sysbox.service new file mode 100644 index 00000000..4f040af0 --- /dev/null +++ b/sysbox-pkgr/systemd/sysbox.service @@ -0,0 +1,20 @@ +# Sysbox unit to act as a wrapper of sysbox's inner components/daemons: +# sysbox-mgr and sysbox-fs. + +[Unit] +Description=Sysbox container runtime +Documentation=https://github.com/nestybox/sysbox +BindsTo=sysbox-mgr.service sysbox-fs.service +After=sysbox-mgr.service sysbox-fs.service + +# Must start before Docker/containerd to ensure "docker --restart" works +# properly with Sysbox. 
+Before=docker.service containerd.service + +[Service] +Type=exec +ExecStart=/bin/sh -c "/usr/bin/sysbox-runc --version && /usr/bin/sysbox-mgr --version && /usr/bin/sysbox-fs --version && /bin/sleep infinity" + +[Install] +# Components of this application should be started at boot time +WantedBy=multi-user.target diff --git a/sysbox-runc b/sysbox-runc deleted file mode 160000 index 1b440ff2..00000000 --- a/sysbox-runc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 1b440ff266841f3d2d296e664122a9e29ceb9fd8 diff --git a/sysbox-runc/.gitignore b/sysbox-runc/.gitignore new file mode 100644 index 00000000..58a82e09 --- /dev/null +++ b/sysbox-runc/.gitignore @@ -0,0 +1,12 @@ +build +vendor/pkg +contrib/cmd/recvtty/recvtty +man/man8 +release +Vagrantfile +.vagrant +GPATH +GRTAGS +GTAGS +.ccls-cache +.vscode diff --git a/sysbox-runc/CONTRIBUTING.md b/sysbox-runc/CONTRIBUTING.md new file mode 100644 index 00000000..c26bcf4a --- /dev/null +++ b/sysbox-runc/CONTRIBUTING.md @@ -0,0 +1,5 @@ +# Contribute to Sysbox-runc + +Sysbox-runc is a component of the Sysbox container runtime. If you want to +contribute, please refer to the Sysbox contribution +[guidelines](https://github.com/nestybox/sysbox/blob/master/CONTRIBUTING.md). 
diff --git a/sysbox-runc/Dockerfile b/sysbox-runc/Dockerfile new file mode 100644 index 00000000..3bb4e946 --- /dev/null +++ b/sysbox-runc/Dockerfile @@ -0,0 +1,79 @@ +ARG GO_VERSION=1.16 +ARG BATS_VERSION=v1.2.1 +ARG UMOCI_VERSION=v0.4.6 + +FROM golang:${GO_VERSION}-bullseye +ARG DEBIAN_FRONTEND=noninteractive + +RUN echo 'deb https://download.opensuse.org/repositories/devel:/tools:/criu/Debian_11/ /' > /etc/apt/sources.list.d/criu.list \ + && wget -nv https://download.opensuse.org/repositories/devel:/tools:/criu/Debian_11/Release.key -O- | apt-key add - \ + && echo 'deb http://download.opensuse.org/repositories/devel:/kubic:/libcontainers:/stable/Debian_11/ /' > /etc/apt/sources.list.d/skopeo.list \ + && wget -nv https://download.opensuse.org/repositories/devel:kubic:libcontainers:stable/Debian_11/Release.key -O- | apt-key add - \ + && dpkg --add-architecture armel \ + && dpkg --add-architecture armhf \ + && dpkg --add-architecture arm64 \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + criu \ + crossbuild-essential-arm64 \ + crossbuild-essential-armel \ + crossbuild-essential-armhf \ + curl \ + gawk \ + iptables \ + jq \ + kmod \ + libseccomp-dev \ + libseccomp-dev:arm64 \ + libseccomp-dev:armel \ + libseccomp-dev:armhf \ + libseccomp2 \ + lsb-release \ + pkg-config \ + python2-minimal \ + skopeo \ + sudo \ + uidmap \ + && apt-get clean \ + && rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list + +# Add a dummy user for the rootless integration tests. While runC does +# not require an entry in /etc/passwd to operate, one of the tests uses +# `git clone` -- and `git clone` does not allow you to clone a +# repository if the current uid does not have an entry in /etc/passwd. 
+RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless + +# install bats +ARG BATS_VERSION +RUN cd /tmp \ + && git clone https://github.com/bats-core/bats-core.git \ + && cd bats-core \ + && git reset --hard "${BATS_VERSION}" \ + && ./install.sh /usr/local \ + && rm -rf /tmp/bats-core + +# install umoci +ARG UMOCI_VERSION +RUN curl -o /usr/local/bin/umoci -fsSL https://github.com/opencontainers/umoci/releases/download/${UMOCI_VERSION}/umoci.amd64 \ + && chmod +x /usr/local/bin/umoci + +# install debug tools +RUN go get github.com/go-delve/delve/cmd/dlv \ + && apt-get update \ + && apt-get install -y --no-install-recommends \ + psmisc + +WORKDIR /root/nestybox/sysbox-runc + +# setup a playground for us to spawn containers in +COPY tests/integration/multi-arch.bash tests/integration/ +ENV ROOTFS /busybox +RUN mkdir -p "${ROOTFS}" +RUN . tests/integration/multi-arch.bash \ + && curl -fsSL `get_busybox` | tar xfJC - "${ROOTFS}" + +ENV DEBIAN_ROOTFS /debian +RUN mkdir -p "${DEBIAN_ROOTFS}" +RUN . tests/integration/multi-arch.bash \ + && get_and_extract_debian "$DEBIAN_ROOTFS" diff --git a/sysbox-runc/LICENSE b/sysbox-runc/LICENSE new file mode 100644 index 00000000..385b1fd2 --- /dev/null +++ b/sysbox-runc/LICENSE @@ -0,0 +1,192 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + Copyright 2019-2020 Nestybox, Inc. + Copyright 2014 Docker, Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. diff --git a/sysbox-runc/MAINTAINERS b/sysbox-runc/MAINTAINERS new file mode 100644 index 00000000..3af2dbb0 --- /dev/null +++ b/sysbox-runc/MAINTAINERS @@ -0,0 +1,2 @@ +Rodny Molina (@rodnymolina) +Cesar Talledo (@ctalledo) diff --git a/sysbox-runc/Makefile b/sysbox-runc/Makefile new file mode 100644 index 00000000..1cb90bb9 --- /dev/null +++ b/sysbox-runc/Makefile @@ -0,0 +1,287 @@ +CONTAINER_ENGINE := docker +GO := go + +# Obtain the current system architecture. +ifeq ($(SYS_ARCH),) + UNAME_M := $(shell uname -m) + ifeq ($(UNAME_M),x86_64) + SYS_ARCH := amd64 + else ifeq ($(UNAME_M),aarch64) + SYS_ARCH := arm64 + else ifeq ($(UNAME_M),arm) + SYS_ARCH := armhf + else ifeq ($(UNAME_M),armel) + SYS_ARCH := armel + endif +endif + +# Set target architecture if not explicitly defined by user. +ifeq ($(TARGET_ARCH),) + TARGET_ARCH := $(SYS_ARCH) +endif + +RUNC_BUILDROOT := build +RUNC_BUILDDIR := $(RUNC_BUILDROOT)/$(TARGET_ARCH) +RUNC_TARGET := sysbox-runc +RUNC_DEBUG_TARGET := sysbox-runc-debug +RUNC_STATIC_TARGET := sysbox-runc-static + +SOURCES := $(shell find . 
2>&1 | grep -E '.*\.(c|h|go)$$') +PREFIX ?= /usr/local +BINDIR := $(PREFIX)/sbin +MANDIR := $(PREFIX)/share/man + +GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) +GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") +RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) + +NBOX := /root/nestybox +RUNC := $(NBOX)/sysbox-runc + +COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true) +COMMIT ?= $(if $(shell git status --porcelain --untracked-files=no),$(COMMIT_NO)-dirty,$(COMMIT_NO)) +BUILT_AT := $(shell date) +BUILT_BY := $(shell git config user.name) + +SYSIPC_DIR := ../sysbox-ipc +SYSIPC_SRC := $(shell find $(SYSIPC_DIR) 2>&1 | grep -E '.*\.(c|h|go|proto)$$') + +SYSLIB_DIR := ../sysbox-libs +SYSLIB_SRC := $(shell find $(SYSLIB_DIR) 2>&1 | grep -E '.*\.(c|h|go|proto)$$') + +SHIFTFS_MODULE_PRESENT = $(shell lsmod | grep shiftfs) + +LDFLAGS := -X 'main.edition=${EDITION}' -X main.version=${VERSION} \ + -X main.commitId=$(COMMIT) -X 'main.builtAt=$(BUILT_AT)' \ + -X 'main.builtBy=$(BUILT_BY)' + +KERNEL_REL := $(shell uname -r) +KERNEL_REL_MAJ := $(shell echo $(KERNEL_REL) | cut -d'.' -f1) +KERNEL_REL_MIN := $(shell echo $(KERNEL_REL) | cut -d'.' -f2) + +# idmapped mount is supported in kernels >= 5.12 +ifeq ($(shell test $(KERNEL_REL_MAJ) -gt 5; echo $$?),0) + IDMAPPED_MNT := 1 +endif + +ifeq ($(shell test $(KERNEL_REL_MAJ) -eq 5; echo $$?),0) + ifeq ($(shell test $(KERNEL_REL_MIN) -ge 12; echo $$?),0) + IDMAPPED_MNT := 1 + endif +endif + +ifeq ($(IDMAPPED_MNT),1) + BUILDTAGS ?= seccomp apparmor idmapped_mnt +else + BUILDTAGS ?= seccomp apparmor +endif + +IMAGE_BASE_DISTRO := $(shell cat /etc/os-release | grep "^ID=" | cut -d "=" -f2 | tr -d '"') + +# Identify kernel-headers path if not previously defined. Notice that this logic is already +# present in Sysbox's Makefile; we are duplicating it here to keep sysbox-runc as independent +# as possible. 
If KERNEL_HEADERS is not already defined, we will assume that the same applies +# to all related variables declared below. +ifeq ($(IMAGE_BASE_DISTRO),$(filter $(IMAGE_BASE_DISTRO),centos fedora redhat almalinux rocky amzn)) + IMAGE_BASE_RELEASE := $(shell cat /etc/os-release | grep "^VERSION_ID" | cut -d "=" -f2 | tr -d '"' | cut -d "." -f1) + KERNEL_HEADERS := kernels/$(KERNEL_REL) +else + IMAGE_BASE_RELEASE := $(shell cat /etc/os-release | grep "^VERSION_CODENAME" | cut -d "=" -f2) + ifeq ($(IMAGE_BASE_DISTRO),linuxmint) + IMAGE_BASE_DISTRO := ubuntu + ifeq ($(IMAGE_BASE_RELEASE),$(filter $(IMAGE_BASE_RELEASE),ulyana ulyssa uma)) + IMAGE_BASE_RELEASE := focal + endif + ifeq ($(IMAGE_BASE_RELEASE),$(filter $(IMAGE_BASE_RELEASE),tara tessa tina tricia)) + IMAGE_BASE_RELEASE := bionic + endif + endif + KERNEL_HEADERS := linux-headers-$(KERNEL_REL) + KERNEL_HEADERS_BASE := $(shell find /usr/src/$(KERNEL_HEADERS) -maxdepth 1 -type l -exec readlink {} \; | cut -d"/" -f2 | egrep -v "^\.\." | head -1) +endif + +ifeq ($(KERNEL_HEADERS_BASE), ) + KERNEL_HEADERS_MOUNTS := -v /usr/src/$(KERNEL_HEADERS):/usr/src/$(KERNEL_HEADERS):ro +else + KERNEL_HEADERS_MOUNTS := -v /usr/src/$(KERNEL_HEADERS):/usr/src/$(KERNEL_HEADERS):ro \ + -v /usr/src/$(KERNEL_HEADERS_BASE):/usr/src/$(KERNEL_HEADERS_BASE):ro +endif + +ifeq ($(shell $(GO) env GOOS),linux) + ifeq (,$(filter $(shell $(GO) env GOARCH),mips mipsle mips64 mips64le ppc64)) + GO_BUILDMODE := "-buildmode=pie" + endif +endif + +# Set cross-compilation flags if applicable. 
+ifneq ($(SYS_ARCH),$(TARGET_ARCH)) + ifeq ($(TARGET_ARCH),armel) + GO_XCOMPILE := CGO_ENABLED=1 GOOS=linux GOARCH=arm GOARM=6 CC=arm-linux-gnueabi-gcc + else ifeq ($(TARGET_ARCH),armhf) + GO_XCOMPILE := CGO_ENABLED=1 GOOS=linux GOARCH=arm GOARM=7 CC=arm-linux-gnueabihf-gcc + else ifeq ($(TARGET_ARCH),arm64) + GO_XCOMPILE = CGO_ENABLED=1 GOOS=linux GOARCH=arm64 CC=aarch64-linux-gnu-gcc + else ifeq ($(TARGET_ARCH),amd64) + GO_XCOMPILE = CGO_ENABLED=1 GOOS=linux GOARCH=amd64 CC=x86_64-linux-gnu-gcc + endif +endif + +GO_BUILD := $(GO_XCOMPILE) $(GO) build $(GO_BUILDMODE) -buildvcs=false -trimpath $(EXTRA_FLAGS) \ + -tags "$(BUILDTAGS)" -ldflags "${LDFLAGS}" + +GO_BUILD_STATIC := CGO_ENABLED=1 $(GO_XCOMPILE) $(GO) build -buildvcs=false -trimpath $(EXTRA_FLAGS) \ + -tags "$(BUILDTAGS) netgo osusergo" -ldflags "-extldflags -static ${LDFLAGS}" + +GO_BUILD_DEBUG := $(GO_XCOMPILE) $(GO) build -buildvcs=false --buildmode=exe -trimpath $(EXTRA_FLAGS) \ + -tags "$(BUILDTAGS)" -gcflags="all=-N -l" -ldflags "${LDFLAGS}" + +RUN_TEST_CONT := $(CONTAINER_ENGINE) run ${DOCKER_RUN_PROXY} \ + -t --privileged --rm \ + -e SYS_ARCH=$(SYS_ARCH) \ + -e TARGET_ARCH=$(TARGET_ARCH) \ + -v $(CURDIR):$(RUNC) \ + -v $(CURDIR)/../sysbox-ipc:$(NBOX)/sysbox-ipc \ + -v $(CURDIR)/../sysbox-libs:$(NBOX)/sysbox-libs \ + -v /lib/modules/$(KERNEL_REL):/lib/modules/$(KERNEL_REL):ro \ + -v $(GOPATH)/pkg/mod:/go/pkg/mod \ + $(KERNEL_HEADERS_MOUNTS) \ + $(RUNC_IMAGE) + +.DEFAULT: sysbox-runc + +sysbox-runc: $(RUNC_BUILDDIR)/$(RUNC_TARGET) + +$(RUNC_BUILDDIR)/$(RUNC_TARGET): $(SOURCES) $(SYSIPC_SRC) $(SYSLIB_SRC) + $(GO_BUILD) -o $(RUNC_BUILDDIR)/$(RUNC_TARGET) . + +sysbox-runc-debug: $(RUNC_BUILDDIR)/$(RUNC_DEBUG_TARGET) + +# -buildmode=exe required in order to debug nsenter (cgo) +$(RUNC_BUILDDIR)/$(RUNC_DEBUG_TARGET): + $(GO_BUILD_DEBUG) -o $(RUNC_BUILDDIR)/$(RUNC_TARGET) . 
+ +all: $(RUNC_BUILDDIR)/$(RUNC_TARGET) recvtty + +recvtty: + $(GO_BUILD) -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty + +static: $(SOURCES) $(SYSIPC_SRC) $(SYSLIB_SRC) + $(GO_BUILD_STATIC) -o $(RUNC_BUILDDIR)/$(RUNC_TARGET) . + $(GO_BUILD_STATIC) -o contrib/cmd/recvtty/recvtty ./contrib/cmd/recvtty + +release: + script/release.sh -r release/$(VERSION) -v $(VERSION) + +dbuild: runcimage + $(RUN_TEST_CONT) make clean all + +gomod-tidy: + $(GO) mod tidy + +lint: + $(GO) vet ./... + $(GO) fmt ./... + +man: + man/md2man-all.sh + +runcimage: + $(CONTAINER_ENGINE) build $(CONTAINER_ENGINE_BUILD_FLAGS) -t $(RUNC_IMAGE) . + +# Note: sysbox-runc does not support rootless mode, so rootless integration tests are not invoked as part of test or localtest +test: + make unittest integration integration-shiftuid + +localtest: + make localunittest localintegration localintegration-shiftuid + +unittest: runcimage + $(RUN_TEST_CONT) make localunittest TESTFLAGS=${TESTFLAGS} + +localunittest: all + $(GO) test -timeout 3m -tags "$(BUILDTAGS)" $(TESTFLAGS) -v ./... 
+ +integration: runcimage + $(RUN_TEST_CONT) make localintegration TESTPATH=${TESTPATH} + +integration-shiftuid: runcimage +ifeq ($(SHIFTFS_MODULE_PRESENT),) + @printf "\n** Skipped 'integration-shiftuid' target due to missing 'shiftfs' module **\n\n" +else + $(RUN_TEST_CONT) make localintegration-shiftuid TESTPATH=${TESTPATH} +endif + +localintegration: all + bats -t tests/integration$(TESTPATH) + +localintegration-shiftuid: all +ifeq ($(SHIFTFS_MODULE_PRESENT),) + @printf "\n** Skipped 'localintegration-shiftuid' target due to missing 'shiftfs' module **\n\n" +else + SHIFT_ROOTFS_UIDS=true bats -t tests/integration${TESTPATH} +endif + +shell: runcimage + $(CONTAINER_ENGINE) run ${DOCKER_RUN_PROXY} \ + -it --privileged --rm \ + -e SYS_ARCH=$(SYS_ARCH) \ + -e TARGET_ARCH=$(TARGET_ARCH) \ + -v $(CURDIR):$(RUNC) \ + -v $(CURDIR)/../sysbox-ipc:$(NBOX)/sysbox-ipc \ + -v $(CURDIR)/../sysbox-libs:$(NBOX)/sysbox-libs \ + -v /lib/modules/$(KERNEL_REL):/lib/modules/$(KERNEL_REL):ro \ + -v $(GOPATH)/pkg/mod:/go/pkg/mod \ + $(KERNEL_HEADERS_MOUNTS) \ + $(RUNC_IMAGE) bash + +install: + install -D -m0755 $(RUNC_BUILDDIR)/$(RUNC_TARGET) $(BINDIR)/$(RUNC_TARGET) + +install-bash: + install -D -m0644 contrib/completions/bash/$(RUNC_TARGET) $(PREFIX)/share/bash-completion/completions/$(RUNC_TARGET) + +install-man: man + install -d -m 755 $(MANDIR)/man8 + install -D -m 644 man/man8/*.8 $(MANDIR)/man8 + +uninstall: + rm -f $(BINDIR)/$(RUNC_TARGET) + +uninstall-bash: + rm -f $(PREFIX)/share/bash-completion/completions/$(RUNC_TARGET) + +clean: + rm -rf $(RUNC_BUILDDIR)/$(RUNC_TARGET) + rm -f contrib/cmd/recvtty/recvtty + rm -rf release + rm -rf man/man8 + +distclean: clean + rm -rf $(SYSFS_BUILDROOT) + +validate: + script/validate-gofmt + script/validate-c + $(GO) vet ./... 
+ +shellcheck: + shellcheck tests/integration/*.bats + # TODO: add shellcheck for sh files + +shfmt: + shfmt -ln bats -d -w tests/integration/*.bats + shfmt -ln bash -d -w man/*.sh script/* tests/*.sh tests/integration/*.bash + +ci: validate test release + +# memoize allpackages, so that it's executed only once and only if used +_allpackages = $(shell $(GO) list ./... | grep -v vendor) +allpackages = $(if $(__allpackages),,$(eval __allpackages := $$(_allpackages)))$(__allpackages) + +listpackages: + @echo $(allpackages) + +.PHONY: runc all recvtty static release dbuild lint man runcimage \ + test localtest unittest localunittest integration localintegration \ + rootlessintegration localrootlessintegration shell install install-bash \ + install-man uninstall uninstall-bash clean validate ci shfmt shellcheck diff --git a/sysbox-runc/NOTICE b/sysbox-runc/NOTICE new file mode 100644 index 00000000..5c97abce --- /dev/null +++ b/sysbox-runc/NOTICE @@ -0,0 +1,17 @@ +runc + +Copyright 2012-2015 Docker, Inc. + +This product includes software developed at Docker, Inc. (http://www.docker.com). + +The following is courtesy of our legal counsel: + + +Use and transfer of Docker may be subject to certain restrictions by the +United States and other governments. +It is your responsibility to ensure that your use and/or transfer does not +violate applicable laws. + +For more information, please see http://www.bis.doc.gov + +See also http://www.apache.org/dev/crypto.html and/or seek legal counsel. diff --git a/sysbox-runc/PRINCIPLES.md b/sysbox-runc/PRINCIPLES.md new file mode 100644 index 00000000..fdcc3738 --- /dev/null +++ b/sysbox-runc/PRINCIPLES.md @@ -0,0 +1,19 @@ +# runc principles + +In the design and development of runc and libcontainer we try to follow these principles: + +(Work in progress) + +* Don't try to replace every tool. Instead, be an ingredient to improve them. +* Less code is better. +* Fewer components are better. Do you really need to add one more class? 
+* 50 lines of straightforward, readable code is better than 10 lines of magic that nobody can understand. +* Don't do later what you can do now. "//TODO: refactor" is not acceptable in new code. +* When hesitating between two options, choose the one that is easier to reverse. +* "No" is temporary; "Yes" is forever. If you're not sure about a new feature, say no. You can change your mind later. +* Containers must be portable to the greatest possible number of machines. Be suspicious of any change which makes machines less interchangeable. +* The fewer moving parts in a container, the better. +* Don't merge it unless you document it. +* Don't document it unless you can keep it up-to-date. +* Don't merge it unless you test it! +* Everyone's problem is slightly different. Focus on the part that is the same for everyone, and solve that. diff --git a/sysbox-runc/README.md b/sysbox-runc/README.md new file mode 100644 index 00000000..501e8cc9 --- /dev/null +++ b/sysbox-runc/README.md @@ -0,0 +1,117 @@ +# sysbox-runc + +## Introduction + +sysbox-runc is part of [Sysbox](../README.md). + +sysbox-runc is the program that does the low level kernel setup for execution of +system containers. It's the "front-end" of Sysbox: higher layers (e.g., Docker & +containerd) invoke sysbox-runc to launch system containers. + +sysbox-runc is tightly integrated with sysbox-fs and sysbox-mgr via +gRPC. Refer to the [Sysbox design doc](../docs/user-guide/design.md) for +further info. + +sysbox-runc is a fork of the excellent [OCI runc](https://github.com/opencontainers/runc), +modified for running system containers. It was forked in early 2019 and has undergone +significant changes since then. It's regularly updated to track the latest changes +in the OCI runc. + +sysbox-runc is mostly (but not 100%) compatible with the OCI runtime specification (more on this +[here](../docs/user-guide/design.md#sysbox-oci-compatibility)). 
+ +** A HUGE DEBT OF GRATITUDE TO THE OCI RUNC DEVELOPERS & MAINTAINERS, WITHOUT WHOM SYSBOX-RUNC WOULD NOT BE POSSIBLE. ** + +## Building + +sysbox-runc is built as part of the Sysbox build process. Refer to the Sysbox +[developer's guide](../docs/developers-guide.md) for more on this. + +### Running the test suite + +sysbox-runc is normally tested as part of the [Sysbox test suite](../docs/developers-guide.md#sysbox-testing). That test +suite has Makefile targets to run sysbox-runc unit and integration tests. + +Alternatively, you can run the sysbox-runc tests directly as follows: + +```bash +make test +``` + +There are additional make targets for running the tests outside of a container but this is +not recommended as the tests are written with the expectation that they can write and +remove anywhere. + +You can run a specific test case by setting the `TESTFLAGS` variable. + +```bash +# make test TESTFLAGS="-run=SomeTestFunction" +``` + +You can run a specific integration test by setting the `TESTPATH` variable. + +```bash +# make test TESTPATH="/checkpoint.bats" +``` + +You can run a specific rootless integration test by setting the `ROOTLESS_TESTPATH` variable. + +```bash +# make test ROOTLESS_TESTPATH="/checkpoint.bats" +``` + +You can run a test using your container engine's flags by setting `CONTAINER_ENGINE_BUILD_FLAGS` and `CONTAINER_ENGINE_RUN_FLAGS` variables. + +```bash +# make test CONTAINER_ENGINE_BUILD_FLAGS="--build-arg http_proxy=http://yourproxy/" CONTAINER_ENGINE_RUN_FLAGS="-e http_proxy=http://yourproxy/" +``` + +### Test Shell + +You can get a shell in the test container with: + +```bash +# make shell +``` + +To run a specific integration test: + +```bash +# bats -t tests/integration/sometest.bats +``` + +To run a specific unit test, point to the go package and test. 
+ +```bash +# go test "-mod=vendor" -timeout 3m -tags "seccomp selinux apparmor" -v github.com/opencontainers/runc/libcontainer/integration -run TestEnter +``` + +You can get the list of go packages with: + +```bash +# go list ./... +``` + +The delve debugger is installed in the test container. You can attach it to a sysbox-runc process with: + +```bash +# dlv attach +``` + +where `` is the pid of the sysbox-runc process. + +## Using sysbox-runc + +See the [Sysbox User Guide](../docs/user-guide/deploy.md) for more info on this. + +## Other documentation + +* [cgroup v2](./docs/cgroup-v2.md) +* [Changing systemd unit properties](./docs/systemd-properties.md) +* [Terminals and standard IO](./docs/terminals.md) + +## Libcontainer + +The libcontainer package in sysbox-runc is not meant to be usable as a +standalone library (unlike the libcontainer package in the OCI runc). It has +undergone changes that tie it deeply into sysbox-runc. diff --git a/sysbox-runc/SECURITY.md b/sysbox-runc/SECURITY.md new file mode 100644 index 00000000..61e37bc5 --- /dev/null +++ b/sysbox-runc/SECURITY.md @@ -0,0 +1,3 @@ +# Security + +The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md). 
diff --git a/sysbox-runc/VERSION b/sysbox-runc/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/sysbox-runc/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/sysbox-runc/Vagrantfile.centos7 b/sysbox-runc/Vagrantfile.centos7 new file mode 100644 index 00000000..037ecd76 --- /dev/null +++ b/sysbox-runc/Vagrantfile.centos7 @@ -0,0 +1,62 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +Vagrant.configure("2") do |config| + config.vm.box = "centos/7" + config.vm.provider :virtualbox do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provider :libvirt do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provision "shell", inline: <<-SHELL + set -e -u -o pipefail + + # configuration + GO_VERSION="1.15" + BATS_VERSION="v1.2.1" + UMOCI_VERSION="v0.4.6" + + # install yum packages + yum install -y -q epel-release + (cd /etc/yum.repos.d && curl -O https://copr.fedorainfracloud.org/coprs/adrian/criu-el7/repo/epel-7/adrian-criu-el7-epel-7.repo) + yum install -y -q gcc git iptables jq libseccomp-devel make skopeo criu + yum clean all + + # install Go + curl -fsSL "https://dl.google.com/go/go${GO_VERSION}.linux-amd64.tar.gz" | tar Cxz /usr/local + + # Install umoci + curl -o /usr/local/bin/umoci -fsSL https://github.com/opencontainers/umoci/releases/download/${UMOCI_VERSION}/umoci.amd64 + chmod +x /usr/local/bin/umoci + + # install bats + git clone https://github.com/bats-core/bats-core + cd bats-core + git checkout $BATS_VERSION + ./install.sh /usr/local + cd .. + rm -rf bats-core + + # set PATH (NOTE: sudo without -i ignores this PATH) + cat >> /etc/profile.d/sh.local < /etc/sysctl.d/userns.conf + sysctl --system + + # Add a user for rootless tests + useradd -u2000 -m -d/home/rootless -s/bin/bash rootless + + # Add busybox for libcontainer/integration tests + . 
/vagrant/tests/integration/multi-arch.bash \ + && mkdir /busybox \ + && curl -fsSL $(get_busybox) | tar xfJC - /busybox + SHELL +end diff --git a/sysbox-runc/Vagrantfile.fedora33 b/sysbox-runc/Vagrantfile.fedora33 new file mode 100644 index 00000000..a32bed4a --- /dev/null +++ b/sysbox-runc/Vagrantfile.fedora33 @@ -0,0 +1,60 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +Vagrant.configure("2") do |config| +# Fedora box is used for testing cgroup v2 support + config.vm.box = "fedora/33-cloud-base" + config.vm.provider :virtualbox do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provider :libvirt do |v| + v.memory = 2048 + v.cpus = 2 + end + config.vm.provision "shell", inline: <<-SHELL + set -e -u -o pipefail + # Work around dnf mirror failures by retrying a few times + for i in $(seq 0 2); do + sleep $i + cat << EOF | dnf -y shell && break +config exclude kernel,kernel-core +config install_weak_deps false +update +install iptables gcc make golang-go libseccomp-devel bats jq git-core criu skopeo +ts run +EOF + done + dnf clean all + + # Add a user for rootless tests + useradd -u2000 -m -d/home/rootless -s/bin/bash rootless + + # Allow root to execute `ssh rootless@localhost` in tests/rootless.sh + ssh-keygen -t ecdsa -N "" -f /root/rootless.key + mkdir -m 0700 -p /home/rootless/.ssh + cat /root/rootless.key.pub >> /home/rootless/.ssh/authorized_keys + chown -R rootless.rootless /home/rootless + + # Install umoci + UMOCI_VERSION=v0.4.6 + curl -o /usr/local/bin/umoci -fsSL https://github.com/opencontainers/umoci/releases/download/${UMOCI_VERSION}/umoci.amd64 + chmod +x /usr/local/bin/umoci + + # Add busybox for libcontainer/integration tests + . 
/vagrant/tests/integration/multi-arch.bash \ + && mkdir /busybox /debian \ + && curl -fsSL $(get_busybox) | tar xfJC - /busybox \ + && get_and_extract_debian /debian + + # Delegate cgroup v2 controllers to rootless user via --systemd-cgroup + mkdir -p /etc/systemd/system/user@.service.d + cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF +[Service] +# default: Delegate=pids memory +# NOTE: delegation of cpuset requires systemd >= 244 (Fedora >= 32, Ubuntu >= 20.04). cpuset is ignored on Fedora 31. +Delegate=cpu cpuset io memory pids +EOF + systemctl daemon-reload + SHELL +end diff --git a/sysbox-runc/checkpoint.go b/sysbox-runc/checkpoint.go new file mode 100644 index 00000000..1a8fdf0e --- /dev/null +++ b/sysbox-runc/checkpoint.go @@ -0,0 +1,142 @@ +// +build linux + +package main + +import ( + "errors" + "fmt" + "net" + "os" + "strconv" + + criu "github.com/checkpoint-restore/go-criu/v4/rpc" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + "golang.org/x/sys/unix" +) + +var checkpointCommand = cli.Command{ + Name: "checkpoint", + Usage: "checkpoint a running system container", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +checkpointed.`, + Description: `The checkpoint command saves the state of the container instance.`, + Flags: []cli.Flag{ + cli.StringFlag{Name: "image-path", Value: "", Usage: "path for saving criu image files"}, + cli.StringFlag{Name: "work-path", Value: "", Usage: "path for saving work files and logs"}, + cli.StringFlag{Name: "parent-path", Value: "", Usage: "path for previous criu image files in pre-dump"}, + cli.BoolFlag{Name: "leave-running", Usage: "leave the process running after checkpointing"}, + cli.BoolFlag{Name: "tcp-established", Usage: "allow open tcp connections"}, + cli.BoolFlag{Name: "ext-unix-sk", Usage: "allow external 
unix sockets"}, + cli.BoolFlag{Name: "shell-job", Usage: "allow shell jobs"}, + cli.BoolFlag{Name: "lazy-pages", Usage: "use userfaultfd to lazily restore memory pages"}, + cli.IntFlag{Name: "status-fd", Value: -1, Usage: "criu writes \\0 to this FD once lazy-pages is ready"}, + cli.StringFlag{Name: "page-server", Value: "", Usage: "ADDRESS:PORT of the page server"}, + cli.BoolFlag{Name: "file-locks", Usage: "handle file locks, for safety"}, + cli.BoolFlag{Name: "pre-dump", Usage: "dump container's memory information only, leave the container running after this"}, + cli.StringFlag{Name: "manage-cgroups-mode", Value: "", Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'"}, + cli.StringSliceFlag{Name: "empty-ns", Usage: "create a namespace, but don't restore its properties"}, + cli.BoolFlag{Name: "auto-dedup", Usage: "enable auto deduplication of memory images"}, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + // XXX: Currently this is untested with rootless containers. 
+ if os.Geteuid() != 0 || system.RunningInUserNS() { + logrus.Warn("sysbox-runc checkpoint is untested") + } + + container, err := getContainer(context) + if err != nil { + return err + } + status, err := container.Status() + if err != nil { + return err + } + if status == libcontainer.Created || status == libcontainer.Stopped { + fatal(fmt.Errorf("Container cannot be checkpointed in %s state", status.String())) + } + options := criuOptions(context) + if !(options.LeaveRunning || options.PreDump) { + // destroy container unless we tell CRIU to keep it + defer destroy(container) + } + // these are the mandatory criu options for a container + setPageServer(context, options) + setManageCgroupsMode(context, options) + if err := setEmptyNsMask(context, options); err != nil { + return err + } + return container.Checkpoint(options) + }, +} + +func getCheckpointImagePath(context *cli.Context) string { + imagePath := context.String("image-path") + if imagePath == "" { + imagePath = getDefaultImagePath(context) + } + return imagePath +} + +func setPageServer(context *cli.Context, options *libcontainer.CriuOpts) { + // xxx following criu opts are optional + // The dump image can be sent to a criu page server + if psOpt := context.String("page-server"); psOpt != "" { + address, port, err := net.SplitHostPort(psOpt) + + if err != nil || address == "" || port == "" { + fatal(errors.New("Use --page-server ADDRESS:PORT to specify page server")) + } + portInt, err := strconv.Atoi(port) + if err != nil { + fatal(errors.New("Invalid port number")) + } + options.PageServer = libcontainer.CriuPageServerInfo{ + Address: address, + Port: int32(portInt), + } + } +} + +func setManageCgroupsMode(context *cli.Context, options *libcontainer.CriuOpts) { + if cgOpt := context.String("manage-cgroups-mode"); cgOpt != "" { + switch cgOpt { + case "soft": + options.ManageCgroupsMode = criu.CriuCgMode_SOFT + case "full": + options.ManageCgroupsMode = criu.CriuCgMode_FULL + case "strict": + 
options.ManageCgroupsMode = criu.CriuCgMode_STRICT + default: + fatal(errors.New("Invalid manage cgroups mode")) + } + } +} + +var namespaceMapping = map[specs.LinuxNamespaceType]int{ + specs.NetworkNamespace: unix.CLONE_NEWNET, +} + +func setEmptyNsMask(context *cli.Context, options *libcontainer.CriuOpts) error { + /* sysbox-runc doesn't manage network devices and their configuration */ + nsmask := unix.CLONE_NEWNET + + for _, ns := range context.StringSlice("empty-ns") { + f, exists := namespaceMapping[specs.LinuxNamespaceType(ns)] + if !exists { + return fmt.Errorf("namespace %q is not supported", ns) + } + nsmask |= f + } + + options.EmptyNs = uint32(nsmask) + return nil +} diff --git a/sysbox-runc/contrib/cmd/recvtty/recvtty.go b/sysbox-runc/contrib/cmd/recvtty/recvtty.go new file mode 100644 index 00000000..cf2fddf7 --- /dev/null +++ b/sysbox-runc/contrib/cmd/recvtty/recvtty.go @@ -0,0 +1,250 @@ +/* + * Copyright 2016 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package main + +import ( + "fmt" + "io" + "io/ioutil" + "net" + "os" + "strings" + "sync" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/urfave/cli" +) + +// version will be populated by the Makefile, read from +// VERSION file of the source code. 
+var version = "" + +// gitCommit will be the hash that the binary was built from +// and will be populated by the Makefile +var gitCommit = "" + +const ( + usage = `Open Container Initiative contrib/cmd/recvtty + +recvtty is a reference implementation of a consumer of runC's --console-socket +API. It has two main modes of operation: + + * single: Only permit one terminal to be sent to the socket, which is + then hooked up to the stdio of the recvtty process. This is useful + for rudimentary shell management of a container. + + * null: Permit as many terminals to be sent to the socket, but they + are read to /dev/null. This is used for testing, and imitates the + old runC API's --console=/dev/pts/ptmx hack which would allow for a + similar trick. This is probably not what you want to use, unless + you're doing something like our bats integration tests. + +To use recvtty, just specify a socket path at which you want to receive +terminals: + + $ recvtty [--mode ] socket.sock +` +) + +func bail(err error) { + fmt.Fprintf(os.Stderr, "[recvtty] fatal error: %v\n", err) + os.Exit(1) +} + +func handleSingle(path string, noStdin bool) error { + // Open a socket. + ln, err := net.Listen("unix", path) + if err != nil { + return err + } + defer ln.Close() + + // We only accept a single connection, since we can only really have + // one reader for os.Stdin. Plus this is all a PoC. + conn, err := ln.Accept() + if err != nil { + return err + } + defer conn.Close() + + // Close ln, to allow for other instances to take over. + ln.Close() + + // Get the fd of the connection. + unixconn, ok := conn.(*net.UnixConn) + if !ok { + return fmt.Errorf("failed to cast to unixconn") + } + + socket, err := unixconn.File() + if err != nil { + return err + } + defer socket.Close() + + // Get the master file descriptor from runC. 
+ master, err := utils.RecvFd(socket) + if err != nil { + return err + } + c, err := console.ConsoleFromFile(master) + if err != nil { + return err + } + if err := console.ClearONLCR(c.Fd()); err != nil { + return err + } + + // Copy from our stdio to the master fd. + var ( + wg sync.WaitGroup + inErr, outErr error + ) + wg.Add(1) + go func() { + _, outErr = io.Copy(os.Stdout, c) + wg.Done() + }() + if !noStdin { + wg.Add(1) + go func() { + _, inErr = io.Copy(c, os.Stdin) + wg.Done() + }() + } + + // Only close the master fd once we've stopped copying. + wg.Wait() + c.Close() + + if outErr != nil { + return outErr + } + + return inErr +} + +func handleNull(path string) error { + // Open a socket. + ln, err := net.Listen("unix", path) + if err != nil { + return err + } + defer ln.Close() + + // As opposed to handleSingle we accept as many connections as we get, but + // we don't interact with Stdin at all (and we copy stdout to /dev/null). + for { + conn, err := ln.Accept() + if err != nil { + return err + } + go func(conn net.Conn) { + // Don't leave references lying around. + defer conn.Close() + + // Get the fd of the connection. + unixconn, ok := conn.(*net.UnixConn) + if !ok { + return + } + + socket, err := unixconn.File() + if err != nil { + return + } + defer socket.Close() + + // Get the master file descriptor from runC. + master, err := utils.RecvFd(socket) + if err != nil { + return + } + + _, _ = io.Copy(ioutil.Discard, master) + }(conn) + } +} + +func main() { + app := cli.NewApp() + app.Name = "recvtty" + app.Usage = usage + + // Set version to be the same as runC. + var v []string + if version != "" { + v = append(v, version) + } + if gitCommit != "" { + v = append(v, "commit: "+gitCommit) + } + app.Version = strings.Join(v, "\n") + + // Set the flags. 
+ app.Flags = []cli.Flag{ + cli.StringFlag{ + Name: "mode, m", + Value: "single", + Usage: "Mode of operation (single or null)", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "Path to write daemon process ID to", + }, + cli.BoolFlag{ + Name: "no-stdin", + Usage: "Disable stdin handling (no-op for null mode)", + }, + } + + app.Action = func(ctx *cli.Context) error { + args := ctx.Args() + if len(args) != 1 { + return fmt.Errorf("need to specify a single socket path") + } + path := ctx.Args()[0] + + pidPath := ctx.String("pid-file") + if pidPath != "" { + pid := fmt.Sprintf("%d\n", os.Getpid()) + if err := ioutil.WriteFile(pidPath, []byte(pid), 0644); err != nil { + return err + } + } + + noStdin := ctx.Bool("no-stdin") + switch ctx.String("mode") { + case "single": + if err := handleSingle(path, noStdin); err != nil { + return err + } + case "null": + if err := handleNull(path); err != nil { + return err + } + default: + return fmt.Errorf("need to select a valid mode: %s", ctx.String("mode")) + } + return nil + } + if err := app.Run(os.Args); err != nil { + bail(err) + } +} diff --git a/sysbox-runc/contrib/completions/bash/runc b/sysbox-runc/contrib/completions/bash/runc new file mode 100644 index 00000000..165405e9 --- /dev/null +++ b/sysbox-runc/contrib/completions/bash/runc @@ -0,0 +1,828 @@ +#!/bin/bash +# +# bash completion file for runc command +# +# This script provides completion of: +# - commands and their options +# - filepaths +# +# To enable the completions either: +# - place this file in /usr/share/bash-completion/completions +# or +# - copy this file to e.g. ~/.runc-completion.sh and add the line +# below to your .bashrc after bash completion features are loaded +# . ~/.runc-completion.sh +# +# Configuration: +# + +# Note for developers: +# Please arrange options sorted alphabetically by long name with the short +# options immediately following their corresponding long form. 
+# This order should be applied to lists, alternatives and code blocks. + +__runc_previous_extglob_setting=$(shopt -p extglob) +shopt -s extglob + +__runc_list_all() { + COMPREPLY=($(compgen -W "$(runc list -q)" -- $cur)) +} + +__runc_pos_first_nonflag() { + local argument_flags=$1 + + local counter=$((${subcommand_pos:-${command_pos}} + 1)) + while [ $counter -le $cword ]; do + if [ -n "$argument_flags" ] && eval "case '${words[$counter]}' in $argument_flags) true ;; *) false ;; esac"; then + ((counter++)) + else + case "${words[$counter]}" in + -*) ;; + *) + break + ;; + esac + fi + ((counter++)) + done + + echo $counter +} + +# Transforms a multiline list of strings into a single line string +# with the words separated by "|". +# This is used to prepare arguments to __runc_pos_first_nonflag(). +__runc_to_alternatives() { + local parts=($1) + local IFS='|' + echo "${parts[*]}" +} + +# Transforms a multiline list of options into an extglob pattern +# suitable for use in case statements. +__runc_to_extglob() { + local extglob=$(__runc_to_alternatives "$1") + echo "@($extglob)" +} + +# Subcommand processing. +# Locates the first occurrence of any of the subcommands contained in the +# first argument. In case of a match, calls the corresponding completion +# function and returns 0. +# If no match is found, 1 is returned. The calling function can then +# continue processing its completion. +# +# TODO if the preceding command has options that accept arguments and an +# argument is equal to one of the subcommands, this is falsely detected as +# a match. 
+__runc_subcommands() { + local subcommands="$1" + + local counter=$(($command_pos + 1)) + while [ $counter -lt $cword ]; do + case "${words[$counter]}" in + $(__runc_to_extglob "$subcommands")) + subcommand_pos=$counter + local subcommand=${words[$counter]} + local completions_func=_runc_${command}_${subcommand} + declare -F $completions_func >/dev/null && $completions_func + return 0 + ;; + esac + ((counter++)) + done + return 1 +} + +# List all Signals +__runc_list_signals() { + COMPREPLY=($(compgen -W "$(for i in $(kill -l | xargs); do echo $i; done | grep SIG)")) +} + +# suppress trailing whitespace +__runc_nospace() { + # compopt is not available in ancient bash versions + type compopt &>/dev/null && compopt -o nospace +} + +# The list of capabilities is defined in types.go, ALL was added manually. +__runc_complete_capabilities() { + COMPREPLY=($(compgen -W " + ALL + AUDIT_CONTROL + AUDIT_WRITE + AUDIT_READ + BLOCK_SUSPEND + BPF + CHECKPOINT_RESTORE + CHOWN + DAC_OVERRIDE + DAC_READ_SEARCH + FOWNER + FSETID + IPC_LOCK + IPC_OWNER + KILL + LEASE + LINUX_IMMUTABLE + MAC_ADMIN + MAC_OVERRIDE + MKNOD + NET_ADMIN + NET_BIND_SERVICE + NET_BROADCAST + NET_RAW + PERFMON + SETFCAP + SETGID + SETPCAP + SETUID + SYS_ADMIN + SYS_BOOT + SYS_CHROOT + SYSLOG + SYS_MODULE + SYS_NICE + SYS_PACCT + SYS_PTRACE + SYS_RAWIO + SYS_RESOURCE + SYS_TIME + SYS_TTY_CONFIG + WAKE_ALARM + " -- "$cur")) +} + +_runc_exec() { + local boolean_options=" + --help + --no-new-privs + --tty, -t + --detach, -d + " + + local options_with_args=" + --console-socket + --cwd + --env, -e + --user, -u + --additional-gids, -g + --process, -p + --pid-file + --process-label + --apparmor + --cap, -c + --preserve-fds + " + + local all_options="$options_with_args $boolean_options" + + case "$prev" in + --cap | -c) + __runc_complete_capabilities + return + ;; + + --console-socket | --cwd | --process | --apparmor) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already 
specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + --env | -e) + COMPREPLY=($(compgen -e -- "$cur")) + __runc_nospace + return + ;; + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$all_options" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +# global options that may appear after the runc command +_runc_runc() { + local boolean_options=" + $global_boolean_options + --help + --version -v + --debug + --systemd-cgroup + " + local options_with_args=" + --log + --log-format + --root + --criu + --rootless + " + + case "$prev" in + --log | --root | --criu) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + *) + _filedir + __runc_nospace + ;; + esac + return + ;; + + --log-format) + COMPREPLY=($(compgen -W 'text json' -- "$cur")) + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args")) + if [ $cword -eq $counter ]; then + COMPREPLY=($(compgen -W "${commands[*]} help" -- "$cur")) + fi + ;; + esac +} + +_runc_pause() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_ps() { + local boolean_options=" + --help + -h + " + local options_with_args=" + --format, -f + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_delete() { 
+ local boolean_options=" + --help + -h + --format, -f + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_kill() { + local boolean_options=" + --help + -h + --all + -a + " + + case "$prev" in + "kill") + __runc_list_all + return + ;; + *) + __runc_list_signals + return + ;; + esac + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_events() { + local boolean_options=" + --help + --stats + " + + local options_with_args=" + --interval + " + + case "$prev" in + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_list() { + local boolean_options=" + --help + --quiet + -q + " + + local options_with_args=" + --format + -f + " + + case "$prev" in + --format | -f) + COMPREPLY=($(compgen -W 'text json' -- "$cur")) + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args")) + ;; + esac +} + +_runc_spec() { + local boolean_options=" + --help + --rootless + " + + local options_with_args=" + --bundle + -b + " + + case "$prev" in + --bundle | -b) + case "$cur" in + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + local counter=$(__runc_pos_first_nonflag $(__runc_to_extglob "$options_with_args")) + ;; + esac +} + +_runc_run() { + local 
boolean_options=" + --help + --detatch + -d + --no-subreaper + --no-pivot + --no-new-keyring + " + + local options_with_args=" + --bundle + -b + --console-socket + --pid-file + --preserve-fds + " + + case "$prev" in + --bundle | -b | --console-socket | --pid-file) + case "$cur" in + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_checkpoint() { + local boolean_options=" + --help + -h + --leave-running + --tcp-established + --ext-unix-sk + --shell-job + --lazy-pages + --file-locks + --pre-dump + --auto-dedup + " + + local options_with_args=" + --image-path + --work-path + --parent-path + --status-fd + --page-server + --manage-cgroups-mode + --empty-ns + " + + case "$prev" in + --page-server) ;; + + --manage-cgroups-mode) + COMPREPLY=($(compgen -W "soft full strict" -- "$cur")) + return + ;; + + --image-path | --work-path | --parent-path) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + *) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} +_runc_create() { + local boolean_options=" + --help + --no-pivot + --no-new-keyring + " + + local options_with_args=" + --bundle + -b + --console-socket + --pid-file + --preserve-fds + " + case "$prev" in + --bundle | -b | --console-socket | --pid-file) + case "$cur" in + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + 
_filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac + +} + +_runc_help() { + local counter=$(__runc_pos_first_nonflag) + if [ $cword -eq $counter ]; then + COMPREPLY=($(compgen -W "${commands[*]}" -- "$cur")) + fi +} + +_runc_restore() { + local boolean_options=" + --help + --tcp-established + --ext-unix-sk + --shell-job + --file-locks + --detach + -d + --no-subreaper + --no-pivot + --auto-dedup + --lazy-pages + " + + local options_with_args=" + -b + --bundle + --image-path + --work-path + --manage-cgroups-mode + --pid-file + --empty-ns + " + + local all_options="$options_with_args $boolean_options" + + case "$prev" in + --manage-cgroups-mode) + COMPREPLY=($(compgen -W "soft full strict" -- "$cur")) + return + ;; + + --pid-file | --image-path | --work-path | --bundle | -b) + case "$cur" in + *:*) ;; # TODO somehow do _filedir for stuff inside the image, if it's already specified (which is also somewhat difficult to determine) + '') + COMPREPLY=($(compgen -W '/' -- "$cur")) + __runc_nospace + ;; + /*) + _filedir + __runc_nospace + ;; + esac + return + ;; + + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$all_options" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_resume() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc_state() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} +_runc_start() { + local boolean_options=" + --help + -h + " + + case "$cur" in + -*) + COMPREPLY=($(compgen -W 
"$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} +_runc_update() { + local boolean_options=" + --help + " + + local options_with_args=" + --blkio-weight + --cpu-period + --cpu-quota + --cpu-rt-period + --cpu-rt-runtime + --cpu-share + --cpuset-cpus + --cpuset-mems + --memory + --memory-reservation + --memory-swap + --pids-limit + --l3-cache-schema + --mem-bw-schema + " + + case "$prev" in + $(__runc_to_extglob "$options_with_args")) + return + ;; + esac + + case "$cur" in + -*) + COMPREPLY=($(compgen -W "$boolean_options $options_with_args" -- "$cur")) + ;; + *) + __runc_list_all + ;; + esac +} + +_runc() { + local previous_extglob_setting=$(shopt -p extglob) + shopt -s extglob + + local commands=( + checkpoint + create + delete + events + exec + init + kill + list + pause + ps + restore + resume + run + spec + start + state + update + help + h + ) + + # These options are valid as global options for all client commands + # and valid as command options for `runc daemon` + local global_boolean_options=" + --help -h + --version -v + " + + COMPREPLY=() + local cur prev words cword + _get_comp_words_by_ref -n : cur prev words cword + + local command='runc' command_pos=0 subcommand_pos + local counter=1 + while [ $counter -lt $cword ]; do + case "${words[$counter]}" in + -*) ;; + =) + ((counter++)) + ;; + *) + command="${words[$counter]}" + command_pos=$counter + break + ;; + esac + ((counter++)) + done + + local completions_func=_runc_${command} + declare -F $completions_func >/dev/null && $completions_func + + eval "$previous_extglob_setting" + return 0 +} + +eval "$__runc_previous_extglob_setting" +unset __runc_previous_extglob_setting + +complete -F _runc runc diff --git a/sysbox-runc/create.go b/sysbox-runc/create.go new file mode 100644 index 00000000..7f3b3fc6 --- /dev/null +++ b/sysbox-runc/create.go @@ -0,0 +1,132 @@ +package main + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libsysbox/sysbox" + 
"github.com/opencontainers/runc/libsysbox/syscont" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +var createCommand = cli.Command{ + Name: "create", + Usage: "create a system container", + ArgsUsage: ` + +Where "" is your name for the instance of the system container that you +are starting. The name you provide for the container instance must be unique on +your host.`, + Description: `The create command creates an instance of a system container for a bundle. The bundle +is a directory with a specification file named "` + specConfig + `" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: `path to the root of the bundle directory, defaults to the current directory`, + }, + cli.StringFlag{ + Name: "console-socket", + Value: "", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.BoolFlag{ + Name: "no-pivot", + Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + }, + cli.BoolFlag{ + Name: "no-new-keyring", + Usage: "do not create a new session keyring for the container. 
This will cause the container to inherit the calling processes session key", + }, + cli.IntFlag{ + Name: "preserve-fds", + Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", + }, + }, + Action: func(context *cli.Context) error { + var ( + err error + spec *specs.Spec + status int + ) + + if err = checkArgs(context, 1, exactArgs); err != nil { + return err + } + if err = revisePidFile(context); err != nil { + return err + } + + spec, err = setupSpec(context) + if err != nil { + return err + } + + if err = sysbox.CheckHostConfig(context, spec); err != nil { + return err + } + + id := context.Args().First() + + withMgr := !context.GlobalBool("no-sysbox-mgr") + withFs := !context.GlobalBool("no-sysbox-fs") + + sysbox := sysbox.NewSysbox(id, withMgr, withFs) + + // register with sysMgr + if sysbox.Mgr.Enabled() { + if err = sysbox.Mgr.Register(spec); err != nil { + return err + } + defer func() { + if err != nil { + sysbox.Mgr.Unregister() + } + }() + } + + // Get sysbox-fs related configs + if sysbox.Fs.Enabled() { + if err = sysbox.Fs.GetConfig(); err != nil { + return err + } + } + + if err = syscont.ConvertSpec(context, spec, sysbox); err != nil { + return fmt.Errorf("error in the container spec: %v", err) + } + + // pre-register with sysFs + if sysbox.Fs.Enabled() { + if err = sysbox.Fs.PreRegister(spec.Linux.Namespaces); err != nil { + return err + } + defer func() { + if err != nil { + sysbox.Fs.Unregister() + } + }() + } + + status, err = startContainer(context, spec, CT_ACT_CREATE, nil, sysbox) + if err != nil { + return err + } + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. 
+ os.Exit(status) + return nil + }, +} diff --git a/sysbox-runc/delete.go b/sysbox-runc/delete.go new file mode 100644 index 00000000..78e5ccd5 --- /dev/null +++ b/sysbox-runc/delete.go @@ -0,0 +1,89 @@ +// +build !solaris + +package main + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/urfave/cli" + + "golang.org/x/sys/unix" +) + +func killContainer(container libcontainer.Container) error { + _ = container.Signal(unix.SIGKILL, false) + for i := 0; i < 100; i++ { + time.Sleep(100 * time.Millisecond) + if err := container.Signal(unix.Signal(0), false); err != nil { + destroy(container) + return nil + } + } + return errors.New("container init still running") +} + +var deleteCommand = cli.Command{ + Name: "delete", + Usage: "delete any resources held by the system container; often used with detached container", + ArgsUsage: ` + +Where "" is the name for the instance of the container. + +EXAMPLE: +For example, if the container id is "ubuntu01" and sysbox-runc list currently shows the +status of "ubuntu01" as "stopped" the following will delete resources held for +"ubuntu01" removing "ubuntu01" from the sysbox-runc list of containers: + + # sysbox-runc delete ubuntu01`, + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "force, f", + Usage: "Forcibly deletes the container if it is still running (uses SIGKILL)", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + + id := context.Args().First() + force := context.Bool("force") + container, err := getContainer(context) + if err != nil { + if lerr, ok := err.(libcontainer.Error); ok && lerr.Code() == libcontainer.ContainerNotExists { + // if there was an aborted start or something of the sort then the container's directory could exist but + // libcontainer does not see it because the state.json file inside that directory was never created. 
+ path := filepath.Join(context.GlobalString("root"), id) + if e := os.RemoveAll(path); e != nil { + fmt.Fprintf(os.Stderr, "remove %s: %v\n", path, e) + } + if force { + return nil + } + } + return err + } + s, err := container.Status() + if err != nil { + return err + } + switch s { + case libcontainer.Stopped: + destroy(container) + case libcontainer.Created: + return killContainer(container) + default: + if force { + return killContainer(container) + } + return fmt.Errorf("cannot delete container %s that is not stopped: %s\n", id, s) + } + + return nil + }, +} diff --git a/sysbox-runc/docs/Security-Audit.pdf b/sysbox-runc/docs/Security-Audit.pdf new file mode 100644 index 00000000..c41af427 Binary files /dev/null and b/sysbox-runc/docs/Security-Audit.pdf differ diff --git a/sysbox-runc/docs/cgroup-v2.md b/sysbox-runc/docs/cgroup-v2.md new file mode 100644 index 00000000..3d573d53 --- /dev/null +++ b/sysbox-runc/docs/cgroup-v2.md @@ -0,0 +1,62 @@ +# cgroup v2 + +runc fully supports cgroup v2 (unified mode) since v1.0.0-rc93. + +To use cgroup v2, you might need to change the configuration of the host init system. +Fedora (>= 31) uses cgroup v2 by default and no extra configuration is required. +On other systemd-based distros, cgroup v2 can be enabled by adding `systemd.unified_cgroup_hierarchy=1` to the kernel cmdline. + +## Am I using cgroup v2? + +Yes if `/sys/fs/cgroup/cgroup.controllers` is present. + +## Host Requirements +### Kernel +* Recommended version: 5.2 or later +* Minimum version: 4.15 + +Kernel older than 5.2 is not recommended due to lack of freezer. + +Notably, kernel older than 4.15 MUST NOT be used (unless you are running containers with user namespaces), as it lacks support for controlling permissions of devices. + +### Systemd +On cgroup v2 hosts, it is highly recommended to run runc with the systemd cgroup driver (`runc --systemd-cgroup`), though not mandatory. + +The recommended systemd version is 244 or later. 
Older systemd does not support delegation of `cpuset` controller. + +Make sure you also have the `dbus-user-session` (Debian/Ubuntu) or `dbus-daemon` (CentOS/Fedora) package installed, and that `dbus` is running. On Debian-flavored distros, this can be accomplished like so: + +```console +$ sudo apt install -y dbus-user-session +$ systemctl --user start dbus +``` + +## Rootless +On cgroup v2 hosts, rootless runc can talk to systemd to get cgroup permissions to be delegated. + +```console +$ runc spec --rootless +$ jq '.linux.cgroupsPath="user.slice:runc:foo"' config.json | sponge config.json +$ runc --systemd-cgroup run foo +``` + +The container processes are executed in a cgroup like `/user.slice/user-$(id -u).slice/user@$(id -u).service/user.slice/runc-foo.scope`. + +### Configuring delegation +Typically, only `memory` and `pids` controllers are delegated to non-root users by default. + +```console +$ cat /sys/fs/cgroup/user.slice/user-$(id -u).slice/user@$(id -u).service/cgroup.controllers +memory pids +``` + +To allow delegation of other controllers, you need to change the systemd configuration as follows: + +```console +# mkdir -p /etc/systemd/system/user@.service.d +# cat > /etc/systemd/system/user@.service.d/delegate.conf << EOF +[Service] +Delegate=cpu cpuset io memory pids +EOF +# systemctl daemon-reload +``` diff --git a/sysbox-runc/docs/checkpoint-restore.md b/sysbox-runc/docs/checkpoint-restore.md new file mode 100644 index 00000000..80ec46bd --- /dev/null +++ b/sysbox-runc/docs/checkpoint-restore.md @@ -0,0 +1,50 @@ +# Checkpoint and Restore # + +For a basic description about checkpointing and restoring containers with +`runc` please see [runc-checkpoint(8)](../man/runc-checkpoint.8.md) and +[runc-restore(8)](../man/runc-restore.8.md). 
+ +## Checkpoint/Restore Annotations ## + +In addition to specifying options on the command-line like it is described +in the man-pages (see above), it is also possible to influence CRIU's +behaviour using CRIU configuration files. For details about CRIU's +configuration file support please see [CRIU's wiki](https://criu.org/Configuration_files). + +In addition to CRIU's default configuration files `runc` tells CRIU to +also evaluate the file `/etc/criu/runc.conf`. Using the annotation +`org.criu.config` it is, however, possible to change this additional +CRIU configuration file. + +If the annotation `org.criu.config` is set to an empty string `runc` +will not pass any additional configuration file to CRIU. With an empty +string it is therefore possible to disable the additional CRIU configuration +file. This can be used to make sure that no additional configuration file +changes CRIU's behaviour accidentally. + +If the annotation `org.criu.config` is set to a non-empty string `runc` will +pass that string to CRIU to be evaluated as an additional configuration file. +If CRIU cannot open this additional configuration file, it will ignore this +file and continue. + +### Annotation Example to disable additional CRIU configuration file ### + +``` +{ + "ociVersion": "1.0.0", + "annotations": { + "org.criu.config": "" + }, + "process": { +``` + +### Annotation Example to set a specific CRIU configuration file ### + +``` +{ + "ociVersion": "1.0.0", + "annotations": { + "org.criu.config": "/etc/special-runc-criu-options" + }, + "process": { +``` diff --git a/sysbox-runc/docs/systemd-properties.md b/sysbox-runc/docs/systemd-properties.md new file mode 100644 index 00000000..737e2415 --- /dev/null +++ b/sysbox-runc/docs/systemd-properties.md @@ -0,0 +1,27 @@ +## Changing systemd unit properties + +In case runc uses systemd to set cgroup parameters for a container (i.e. +`--systemd-cgroup` CLI flag is set), systemd creates a scope (a.k.a. 
+transient unit) for the container, usually named like `runc-$ID.scope`. + +The systemd properties of this unit (shown by `systemctl show runc-$ID.scope` +after the container is started) can be modified by adding annotations +to container's runtime spec (`config.json`). For example: + +```json + "annotations": { + "org.systemd.property.TimeoutStopUSec": "uint64 123456789", + "org.systemd.property.CollectMode":"'inactive-or-failed'" + }, +``` + +The above will set the following properties: + +* `TimeoutStopSec` to 2 minutes and 3 seconds; +* `CollectMode` to "inactive-or-failed". + +The values must be in the gvariant format (for details, see +[gvariant documentation](https://developer.gnome.org/glib/stable/gvariant-text.html)). + +To find out which type systemd expects for a particular parameter, please +consult systemd sources. diff --git a/sysbox-runc/docs/terminals.md b/sysbox-runc/docs/terminals.md new file mode 100644 index 00000000..403ca24a --- /dev/null +++ b/sysbox-runc/docs/terminals.md @@ -0,0 +1,327 @@ +# Terminals and Standard IO # + +*Note that the default configuration of `runc` (foreground, new terminal) is +generally the best option for most users. This document exists to help explain +what the purpose of the different modes is, and to try to steer users away from +common mistakes and misunderstandings.* + +In general, most processes on Unix (and Unix-like) operating systems have 3 +standard file descriptors provided at the start, collectively referred to as +"standard IO" (`stdio`): + +* `0`: standard-in (`stdin`), the input stream into the process +* `1`: standard-out (`stdout`), the output stream from the process +* `2`: standard-error (`stderr`), the error stream from the process + +When creating and running a container via `runc`, it is important to take care +to structure the `stdio` the new container's process receives. 
In some ways +containers are just regular processes, while in other ways they're an isolated +sub-partition of your machine (in a similar sense to a VM). This means that the +structure of IO is not as simple as with ordinary programs (which generally +just use the file descriptors you give them). + +## Other File Descriptors ## + +Before we continue, it is important to note that processes can have more file +descriptors than just `stdio`. By default in `runc` no other file descriptors +will be passed to the spawned container process. If you wish to explicitly pass +file descriptors to the container you have to use the `--preserve-fds` option. +These ancillary file descriptors don't have any of the strange semantics +discussed further in this document (those only apply to `stdio`) -- they are +passed untouched by `runc`. + +It should be noted that `--preserve-fds` does not take individual file +descriptors to preserve. Instead, it takes how many file descriptors (not +including `stdio` or `LISTEN_FDS`) should be passed to the container. In the +following example: + +``` +% runc run --preserve-fds 5 +``` + +`runc` will pass the first `5` file descriptors (`3`, `4`, `5`, `6`, and `7` -- +assuming that `LISTEN_FDS` has not been configured) to the container. + +In addition to `--preserve-fds`, `LISTEN_FDS` file descriptors are passed +automatically to allow for `systemd`-style socket activation. To extend the +above example: + +``` +% LISTEN_PID=$pid_of_runc LISTEN_FDS=3 runc run --preserve-fds 5 +``` + +`runc` will now pass the first `8` file descriptors (and it will also pass +`LISTEN_FDS=3` and `LISTEN_PID=1` to the container). The first `3` (`3`, `4`, +and `5`) were passed due to `LISTEN_FDS` and the other `5` (`6`, `7`, `8`, `9`, +and `10`) were passed due to `--preserve-fds`. You should keep this in mind if +you use `runc` directly in something like a `systemd` unit file. To disable +this `LISTEN_FDS`-style passing just unset `LISTEN_FDS`. 
+ +**Be very careful when passing file descriptors to a container process.** Due +to some Linux kernel (mis)features, a container with access to certain types of +file descriptors (such as `O_PATH` descriptors) outside of the container's root +file system can use these to break out of the container's pivoted mount +namespace. [This has resulted in CVEs in the past.][CVE-2016-9962] + +[CVE-2016-9962]: https://nvd.nist.gov/vuln/detail/CVE-2016-9962 + +## Terminal Modes ## + +`runc` supports two distinct methods for passing `stdio` to the container's +primary process: + +* [new terminal](#new-terminal) (`terminal: true`) +* [pass-through](#pass-through) (`terminal: false`) + +When first using `runc` these two modes will look incredibly similar, but this +can be quite deceptive as these different modes have quite different +characteristics. + +By default, `runc spec` will create a configuration that will create a new +terminal (`terminal: true`). However, if the `terminal: ...` line is not +present in `config.json` then pass-through is the default. + +*In general we recommend using new terminal, because it means that tools like +`sudo` will work inside your container. But pass-through can be useful if you +know what you're doing, or if you're using `runc` as part of a non-interactive +pipeline.* + +### New Terminal ### + +In new terminal mode, `runc` will create a brand-new "console" (or more +precisely, a new pseudo-terminal using the container's namespaced +`/dev/pts/ptmx`) for your contained process to use as its `stdio`. + +When you start a process in new terminal mode, `runc` will do the following: + +1. Create a new pseudo-terminal. +2. Pass the slave end to the container's primary process as its `stdio`. +3. Send the master end to a process to interact with the `stdio` for the + container's primary process ([details below](#runc-modes)). 
+ +It should be noted that since a new pseudo-terminal is being used for +communication with the container, some strange properties of pseudo-terminals +might surprise you. For instance, by default, all new pseudo-terminals +translate the byte `'\n'` to the sequence `'\r\n'` on both `stdout` and +`stderr`. In addition there are [a whole range of `ioctls(2)` that can only +interact with pseudo-terminal `stdio`][tty_ioctl(4)]. + +> **NOTE**: In new terminal mode, all three `stdio` file descriptors are the +> same underlying file. The reason for this is to match how a shell's `stdio` +> looks to a process (as well as remove race condition issues with having to +> deal with multiple master pseudo-terminal file descriptors). However this +> means that it is not really possible to uniquely distinguish between `stdout` +> and `stderr` from the caller's perspective. + +[tty_ioctl(4)]: https://linux.die.net/man/4/tty_ioctl + +### Pass-Through ### + +If you have already set up some file handles that you wish your contained +process to use as its `stdio`, then you can ask `runc` to pass them through to +the contained process (this is not necessarily the same as `--preserve-fds`'s +passing of file descriptors -- [details below](#runc-modes)). As an example +(assuming that `terminal: false` is set in `config.json`): + +``` +% echo input | runc run some_container > /tmp/log.out 2> /tmp/log.err +``` + +Here the container's various `stdio` file descriptors will be substituted with +the following: + +* `stdin` will be sourced from the `echo input` pipeline. +* `stdout` will be output into `/tmp/log.out` on the host. +* `stderr` will be output into `/tmp/log.err` on the host. + +It should be noted that the actual file handles seen inside the container may +be different [based on the mode `runc` is being used in](#runc-modes) (for +instance, the file referenced by `1` could be `/tmp/log.out` directly or a pipe +which `runc` is using to buffer output, based on the mode). 
However the net +result will be the same in either case. In principle you could use the [new +terminal mode](#new-terminal) in a pipeline, but the difference will become +more clear when you are introduced to [`runc`'s detached mode](#runc-modes). + +## `runc` Modes ## + +`runc` itself runs in two modes: + +* [foreground](#foreground) +* [detached](#detached) + +You can use either [terminal mode](#terminal-modes) with either `runc` mode. +However, there are considerations that may indicate preference for one mode +over another. It should be noted that while two types of modes (terminal and +`runc`) are conceptually independent from each other, you should be aware of +the intricacies of which combination you are using. + +*In general we recommend using foreground because it's the most +straight-forward to use, with the only downside being that you will have a +long-running `runc` process. Detached mode is difficult to get right and +generally requires having your own `stdio` management.* + +### Foreground ### + +The default (and most straight-forward) mode of `runc`. In this mode, your +`runc` command remains in the foreground with the container process as a child. +All `stdio` is buffered through the foreground `runc` process (irrespective of +which terminal mode you are using). This is conceptually quite similar to +running a normal process interactively in a shell (and if you are using `runc` +in a shell interactively, this is what you should use). + +Because the `stdio` will be buffered in this mode, some very important +peculiarities of this mode should be kept in mind: + +* With [new terminal mode](#new-terminal), the container will see a + pseudo-terminal as its `stdio` (as you might expect). However, the `stdio` of + the foreground `runc` process will remain the `stdio` that the process was + started with -- and `runc` will copy all `stdio` between its `stdio` and the + container's `stdio`. 
This means that while a new pseudo-terminal has been + created, the foreground `runc` process manages it over the lifetime of the + container. + +* With [pass-through mode](#pass-through), the foreground `runc`'s `stdio` is + **not** passed to the container. Instead, the container's `stdio` is a set of + pipes which are used to copy data between `runc`'s `stdio` and the + container's `stdio`. This means that the container never has direct access to + host file descriptors (aside from the pipes created by the container runtime, + but that shouldn't be an issue). + +The main drawback of the foreground mode of operation is that it requires a +long-running foreground `runc` process. If you kill the foreground `runc` +process then you will no longer have access to the `stdio` of the container +(and in most cases this will result in the container dying abnormally due to +`SIGPIPE` or some other error). By extension this means that any bug in the +long-running foreground `runc` process (such as a memory leak) or a stray +OOM-kill sweep could result in your container being killed **through no fault +of the user**. In addition, there is no way in foreground mode of passing a +file descriptor directly to the container process as its `stdio` (like +`--preserve-fds` does). + +These shortcomings are obviously sub-optimal and are the reason that `runc` has +an additional mode called "detached mode". + +### Detached ### + +In contrast to foreground mode, in detached mode there is no long-running +foreground `runc` process once the container has started. In fact, there is no +long-running `runc` process at all. However, this means that it is up to the +caller to handle the `stdio` after `runc` has set it up for you. In a shell +this means that the `runc` command will exit and control will return to the +shell, after the container has been set up. 
+ +You can run `runc` in detached mode in one of the following ways: + +* `runc run -d ...` which operates similar to `runc run` but is detached. +* `runc create` followed by `runc start` which is the standard container + lifecycle defined by the OCI runtime specification (`runc create` sets up the + container completely, waiting for `runc start` to begin execution of user + code). + +The main use-case of detached mode is for higher-level tools that want to be +wrappers around `runc`. By running `runc` in detached mode, those tools have +far more control over the container's `stdio` without `runc` getting in the +way (most wrappers around `runc` like `cri-o` or `containerd` use detached mode +for this reason). + +Unfortunately using detached mode is a bit more complicated and requires more +care than the foreground mode -- mainly because it is now up to the caller to +handle the `stdio` of the container. + +Another complication is that the parent process is responsible for acting as +the subreaper for the container. In short, you need to call +`prctl(PR_SET_CHILD_SUBREAPER, 1, ...)` in the parent process and correctly +handle the implications of being a subreaper. Failing to do so may result in +zombie processes being accumulated on your host. + +These tasks are usually performed by a dedicated (and minimal) monitor process +per-container. For the sake of comparison, other runtimes such as LXC do not +have an equivalent detached mode and instead integrate this monitor process +into the container runtime itself -- this has several tradeoffs, and runc has +opted to support delegating the monitoring responsibility to the parent process +through this detached mode. + +#### Detached Pass-Through #### + +In detached mode, pass-through actually does what it says on the tin -- the +`stdio` file descriptors of the `runc` process are passed through (untouched) +to the container's `stdio`. 
The purpose of this option is to allow a user to +set up `stdio` for a container themselves and then force `runc` to just use +their pre-prepared `stdio` (without any pseudo-terminal funny business). *If +you don't see why this would be useful, don't use this option.* + +**You must be incredibly careful when using detached pass-through (especially +in a shell).** The reason for this is that by using detached pass-through you +are passing host file descriptors to the container. In the case of a shell, +usually your `stdio` is going to be a pseudo-terminal (on your host). A +malicious container could take advantage of TTY-specific `ioctls` like +`TIOCSTI` to fake input into the **host** shell (remember that in detached +mode, control is returned to your shell and so the terminal you've given the +container is being read by a shell prompt). + +There are also several other issues with running non-malicious containers in a +shell with detached pass-through (where you pass your shell's `stdio` to the +container): + +* Output from the container will be interleaved with output from your shell (in + a non-deterministic way), without any real way of distinguishing from where a + particular piece of output came from. + +* Any input to `stdin` will be non-deterministically split and given to either + the container or the shell (because both are blocked on a `read(2)` of the + same FIFO-style file descriptor). + +They are all related to the fact that there is going to be a race when either +your host or the container tries to read from (or write to) `stdio`. This +problem is especially obvious when in a shell, where usually the terminal has +been put into raw mode (where each individual key-press should cause `read(2)` +to return). + +> **NOTE**: There is also currently a [known problem][issue-1721] where using +> detached pass-through will result in the container hanging if the `stdout` or +> `stderr` is a pipe (though this should be a temporary issue). 
+ +[issue-1721]: https://github.com/opencontainers/runc/issues/1721 + +#### Detached New Terminal #### + +When creating a new pseudo-terminal in detached mode, a fairly obvious +problem appears -- how do we use the new terminal that `runc` created? Unlike +in pass-through, `runc` has created a new set of file descriptors that need to +be used by *something* in order for container communication to work. + +The way this problem is resolved is through the use of Unix domain sockets. +There is a feature of Unix sockets called `SCM_RIGHTS` which allows a file +descriptor to be sent through a Unix socket to a completely separate process +(which can then use that file descriptor as though they opened it). When using +`runc` in detached new terminal mode, this is how a user gets access to the +pseudo-terminal's master file descriptor. + +To this end, there is a new option (which is required if you want to use `runc` +in detached new terminal mode): `--console-socket`. This option takes the path +to a Unix domain socket which `runc` will connect to and send the +pseudo-terminal master file descriptor down. The general process for getting +the pseudo-terminal master is as follows: + +1. Create a Unix domain socket at some path, `$socket_path`. +2. Call `runc run` or `runc create` with the argument `--console-socket + $socket_path`. +3. Using `recvmsg(2)` retrieve the file descriptor sent using `SCM_RIGHTS` by + `runc`. +4. Now the manager can interact with the `stdio` of the container, using the + retrieved pseudo-terminal master. + +After `runc` exits, the only process with a copy of the pseudo-terminal master +file descriptor is whoever read the file descriptor from the socket. + +> **NOTE**: Currently `runc` doesn't support abstract socket addresses (due to +> it not being possible to pass an `argv` with a null-byte as the first +> character). In the future this may change, but currently you must use a valid +> path name. 
+ +In order to help users make use of detached new terminal mode, we have provided +a [Go implementation in the `go-runc` bindings][containerd/go-runc.Socket], as +well as [a simple client][recvtty]. + +[containerd/go-runc.Socket]: https://godoc.org/github.com/containerd/go-runc#Socket +[recvtty]: /contrib/cmd/recvtty diff --git a/sysbox-runc/events.go b/sysbox-runc/events.go new file mode 100644 index 00000000..0d151066 --- /dev/null +++ b/sysbox-runc/events.go @@ -0,0 +1,214 @@ +// +build linux + +package main + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "sync" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/types" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var eventsCommand = cli.Command{ + Name: "events", + Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics", + ArgsUsage: ` + +Where "" is the name for the instance of the container.`, + Description: `The events command displays information about the container. 
By default the +information is displayed once every 5 seconds.`, + Flags: []cli.Flag{ + cli.DurationFlag{Name: "interval", Value: 5 * time.Second, Usage: "set the stats collection interval"}, + cli.BoolFlag{Name: "stats", Usage: "display the container's stats then exit"}, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + duration := context.Duration("interval") + if duration <= 0 { + return errors.New("duration interval must be greater than 0") + } + status, err := container.Status() + if err != nil { + return err + } + if status == libcontainer.Stopped { + return fmt.Errorf("container with id %s is not running", container.ID()) + } + var ( + stats = make(chan *libcontainer.Stats, 1) + events = make(chan *types.Event, 1024) + group = &sync.WaitGroup{} + ) + group.Add(1) + go func() { + defer group.Done() + enc := json.NewEncoder(os.Stdout) + for e := range events { + if err := enc.Encode(e); err != nil { + logrus.Error(err) + } + } + }() + if context.Bool("stats") { + s, err := container.Stats() + if err != nil { + return err + } + events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)} + close(events) + group.Wait() + return nil + } + go func() { + for range time.Tick(context.Duration("interval")) { + s, err := container.Stats() + if err != nil { + logrus.Error(err) + continue + } + stats <- s + } + }() + n, err := container.NotifyOOM() + if err != nil { + return err + } + for { + select { + case _, ok := <-n: + if ok { + // this means an oom event was received, if it is !ok then + // the channel was closed because the container stopped and + // the cgroups no longer exist. 
+ events <- &types.Event{Type: "oom", ID: container.ID()} + } else { + n = nil + } + case s := <-stats: + events <- &types.Event{Type: "stats", ID: container.ID(), Data: convertLibcontainerStats(s)} + } + if n == nil { + close(events) + break + } + } + group.Wait() + return nil + }, +} + +func convertLibcontainerStats(ls *libcontainer.Stats) *types.Stats { + cg := ls.CgroupStats + if cg == nil { + return nil + } + var s types.Stats + s.Pids.Current = cg.PidsStats.Current + s.Pids.Limit = cg.PidsStats.Limit + + s.CPU.Usage.Kernel = cg.CpuStats.CpuUsage.UsageInKernelmode + s.CPU.Usage.User = cg.CpuStats.CpuUsage.UsageInUsermode + s.CPU.Usage.Total = cg.CpuStats.CpuUsage.TotalUsage + s.CPU.Usage.Percpu = cg.CpuStats.CpuUsage.PercpuUsage + s.CPU.Usage.PercpuKernel = cg.CpuStats.CpuUsage.PercpuUsageInKernelmode + s.CPU.Usage.PercpuUser = cg.CpuStats.CpuUsage.PercpuUsageInUsermode + s.CPU.Throttling.Periods = cg.CpuStats.ThrottlingData.Periods + s.CPU.Throttling.ThrottledPeriods = cg.CpuStats.ThrottlingData.ThrottledPeriods + s.CPU.Throttling.ThrottledTime = cg.CpuStats.ThrottlingData.ThrottledTime + + s.CPUSet = types.CPUSet(cg.CPUSetStats) + + s.Memory.Cache = cg.MemoryStats.Cache + s.Memory.Kernel = convertMemoryEntry(cg.MemoryStats.KernelUsage) + s.Memory.KernelTCP = convertMemoryEntry(cg.MemoryStats.KernelTCPUsage) + s.Memory.Swap = convertMemoryEntry(cg.MemoryStats.SwapUsage) + s.Memory.Usage = convertMemoryEntry(cg.MemoryStats.Usage) + s.Memory.Raw = cg.MemoryStats.Stats + + s.Blkio.IoServiceBytesRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceBytesRecursive) + s.Blkio.IoServicedRecursive = convertBlkioEntry(cg.BlkioStats.IoServicedRecursive) + s.Blkio.IoQueuedRecursive = convertBlkioEntry(cg.BlkioStats.IoQueuedRecursive) + s.Blkio.IoServiceTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoServiceTimeRecursive) + s.Blkio.IoWaitTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoWaitTimeRecursive) + s.Blkio.IoMergedRecursive = 
convertBlkioEntry(cg.BlkioStats.IoMergedRecursive) + s.Blkio.IoTimeRecursive = convertBlkioEntry(cg.BlkioStats.IoTimeRecursive) + s.Blkio.SectorsRecursive = convertBlkioEntry(cg.BlkioStats.SectorsRecursive) + + s.Hugetlb = make(map[string]types.Hugetlb) + for k, v := range cg.HugetlbStats { + s.Hugetlb[k] = convertHugtlb(v) + } + + if is := ls.IntelRdtStats; is != nil { + if intelrdt.IsCATEnabled() { + s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo) + s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot + s.IntelRdt.L3CacheSchema = is.L3CacheSchema + } + if intelrdt.IsMBAEnabled() { + s.IntelRdt.MemBwInfo = convertMemBwInfo(is.MemBwInfo) + s.IntelRdt.MemBwSchemaRoot = is.MemBwSchemaRoot + s.IntelRdt.MemBwSchema = is.MemBwSchema + } + if intelrdt.IsMBMEnabled() { + s.IntelRdt.MBMStats = is.MBMStats + } + if intelrdt.IsCMTEnabled() { + s.IntelRdt.CMTStats = is.CMTStats + } + } + + s.NetworkInterfaces = ls.Interfaces + return &s +} + +func convertHugtlb(c cgroups.HugetlbStats) types.Hugetlb { + return types.Hugetlb{ + Usage: c.Usage, + Max: c.MaxUsage, + Failcnt: c.Failcnt, + } +} + +func convertMemoryEntry(c cgroups.MemoryData) types.MemoryEntry { + return types.MemoryEntry{ + Limit: c.Limit, + Usage: c.Usage, + Max: c.MaxUsage, + Failcnt: c.Failcnt, + } +} + +func convertBlkioEntry(c []cgroups.BlkioStatEntry) []types.BlkioEntry { + var out []types.BlkioEntry + for _, e := range c { + out = append(out, types.BlkioEntry(e)) + } + return out +} + +func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *types.L3CacheInfo { + ci := types.L3CacheInfo(*i) + return &ci +} + +func convertMemBwInfo(i *intelrdt.MemBwInfo) *types.MemBwInfo { + mi := types.MemBwInfo(*i) + return &mi +} diff --git a/sysbox-runc/exec.go b/sysbox-runc/exec.go new file mode 100644 index 00000000..5ec00b24 --- /dev/null +++ b/sysbox-runc/exec.go @@ -0,0 +1,259 @@ +//go:build linux +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "os" + "strconv" + "strings" + + 
"github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/opencontainers/runc/libsysbox/syscont" + + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +var execCommand = cli.Command{ + Name: "exec", + Usage: "execute new process inside the system container", + ArgsUsage: ` [command options] || -p process.json + +Where "" is the name for the instance of the container and +"" is the command to be executed in the container. +"" can't be empty unless a "-p" flag provided. + +EXAMPLE: +For example, if the container is configured to run the linux ps command the +following will output a list of processes running in the container: + + # sysbox-runc exec ps`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "console-socket", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.StringFlag{ + Name: "cwd", + Usage: "current working directory in the container", + }, + cli.StringSliceFlag{ + Name: "env, e", + Usage: "set environment variables", + }, + cli.BoolFlag{ + Name: "tty, t", + Usage: "allocate a pseudo-TTY", + }, + cli.StringFlag{ + Name: "user, u", + Usage: "UID (format: [:])", + }, + cli.Int64SliceFlag{ + Name: "additional-gids, g", + Usage: "additional gids", + }, + cli.StringFlag{ + Name: "process, p", + Usage: "path to the process.json", + }, + cli.BoolFlag{ + Name: "detach,d", + Usage: "detach from the container's process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.StringFlag{ + Name: "process-label", + Usage: "set the asm process label for the process commonly used with selinux", + }, + cli.StringFlag{ + Name: "apparmor", + Usage: "set the apparmor profile for the process", + }, + cli.BoolFlag{ + Name: "no-new-privs", + Usage: "set the no new privileges value 
for the process", + }, + cli.StringSliceFlag{ + Name: "cap, c", + Value: &cli.StringSlice{}, + Usage: "add a capability to the bounding set for the process", + }, + cli.BoolFlag{ + Name: "no-subreaper", + Usage: "disable the use of the subreaper used to reap reparented processes", + Hidden: true, + }, + cli.IntFlag{ + Name: "preserve-fds", + Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, minArgs); err != nil { + return err + } + if err := revisePidFile(context); err != nil { + return err + } + status, err := execProcess(context) + if err == nil { + os.Exit(status) + } + return fmt.Errorf("exec failed: %v", err) + }, + SkipArgReorder: true, +} + +func execProcess(context *cli.Context) (int, error) { + container, err := getContainer(context) + if err != nil { + return -1, err + } + status, err := container.Status() + if err != nil { + return -1, err + } + if status == libcontainer.Stopped { + return -1, fmt.Errorf("cannot exec a container that has stopped") + } + path := context.String("process") + if path == "" && len(context.Args()) == 1 { + return -1, fmt.Errorf("process args cannot be empty") + } + detach := context.Bool("detach") + state, err := container.State() + if err != nil { + return -1, err + } + + bundle := utils.SearchLabels(state.Config.Labels, "bundle") + p, err := getProcess(context, bundle, &state.Sysbox) + if err != nil { + return -1, err + } + + logLevel := "info" + if context.GlobalBool("debug") { + logLevel = "debug" + } + + r := &runner{ + enableSubreaper: false, + shouldDestroy: false, + container: container, + consoleSocket: context.String("console-socket"), + detach: detach, + pidFile: context.String("pid-file"), + action: CT_ACT_RUN, + init: false, + preserveFDs: context.Int("preserve-fds"), + logLevel: logLevel, + } + return r.run(p) +} + +func getProcess(context *cli.Context, bundle string, sysbox 
*sysbox.Sysbox) (*specs.Process, error) { + if path := context.String("process"); path != "" { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + var p specs.Process + if err := json.NewDecoder(f).Decode(&p); err != nil { + return nil, err + } + if err := validateProcessSpec(&p); err != nil { + return nil, err + } + // sysbox-runc: convert the process spec for system containers; drop SYSBOX_* + // env vars on exec (their effect is set at container start time and can't be + // changed thereafter). + return &p, syscont.ConvertProcessSpec(&p, sysbox, true) + } + // process via cli flags + if err := os.Chdir(bundle); err != nil { + return nil, err + } + spec, err := loadSpec(specConfig) + if err != nil { + return nil, err + } + p := spec.Process + p.Args = context.Args()[1:] + // override the cwd, if passed + if context.String("cwd") != "" { + p.Cwd = context.String("cwd") + } + if ap := context.String("apparmor"); ap != "" { + p.ApparmorProfile = ap + } + if l := context.String("process-label"); l != "" { + p.SelinuxLabel = l + } + if caps := context.StringSlice("cap"); len(caps) > 0 { + for _, c := range caps { + p.Capabilities.Bounding = append(p.Capabilities.Bounding, c) + p.Capabilities.Inheritable = append(p.Capabilities.Inheritable, c) + p.Capabilities.Effective = append(p.Capabilities.Effective, c) + p.Capabilities.Permitted = append(p.Capabilities.Permitted, c) + p.Capabilities.Ambient = append(p.Capabilities.Ambient, c) + } + } + + // append the passed env variables + p.Env = append(p.Env, context.StringSlice("env")...) 
+ + // set the tty + p.Terminal = false + if context.IsSet("tty") { + p.Terminal = context.Bool("tty") + } + if context.IsSet("no-new-privs") { + p.NoNewPrivileges = context.Bool("no-new-privs") + } + // override the user, if passed + if context.String("user") != "" { + u := strings.SplitN(context.String("user"), ":", 2) + if len(u) > 1 { + gid, err := strconv.Atoi(u[1]) + if err != nil { + return nil, fmt.Errorf("parsing %s as int for gid failed: %v", u[1], err) + } + p.User.GID = uint32(gid) + } + uid, err := strconv.Atoi(u[0]) + if err != nil { + return nil, fmt.Errorf("parsing %s as int for uid failed: %v", u[0], err) + } + p.User.UID = uint32(uid) + } + for _, gid := range context.Int64Slice("additional-gids") { + if gid < 0 { + return nil, fmt.Errorf("additional-gids must be a positive number %d", gid) + } + p.User.AdditionalGids = append(p.User.AdditionalGids, uint32(gid)) + } + + if err := validateProcessSpec(p); err != nil { + return nil, err + } + + // sysbox-runc: convert the process spec for system containers; drop SYSBOX_* + // env vars on exec (their effect is set at container start time and can't be + // changed thereafter). 
+ if err := syscont.ConvertProcessSpec(p, sysbox, true); err != nil { + return nil, err + } + return p, nil +} diff --git a/sysbox-runc/go.mod b/sysbox-runc/go.mod new file mode 100644 index 00000000..2f8ab6d3 --- /dev/null +++ b/sysbox-runc/go.mod @@ -0,0 +1,95 @@ +module github.com/nestybox/sysbox-runc + +go 1.22 + +toolchain go1.22.6 + +require ( + github.com/Masterminds/semver v1.5.0 + github.com/checkpoint-restore/go-criu/v4 v4.1.0 + github.com/cilium/ebpf v0.3.0 + github.com/containerd/console v1.0.1 + github.com/coreos/go-systemd/v22 v22.1.0 + github.com/cyphar/filepath-securejoin v0.2.2 + github.com/docker/go-units v0.4.0 + github.com/godbus/dbus/v5 v5.0.3 + github.com/golang/protobuf v1.5.4 + github.com/moby/sys/mountinfo v0.4.0 + github.com/mrunalp/fileutils v0.5.0 + github.com/nestybox/sysbox-ipc v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/capability v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/dockerUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/idMap v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/idShiftUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/linuxUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/mount v0.0.0-20240602025437-33cbdf5a9e98 + github.com/nestybox/sysbox-libs/overlayUtils v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/shiftfs v0.0.0-00010101000000-000000000000 + github.com/nestybox/sysbox-libs/utils v0.0.0-00010101000000-000000000000 + github.com/opencontainers/runc v1.1.4 + github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 + github.com/opencontainers/selinux v1.8.0 + github.com/pkg/errors v0.9.1 + github.com/pkg/profile v1.5.0 + github.com/sirupsen/logrus v1.9.3 + github.com/urfave/cli v1.22.14 + github.com/vishvananda/netlink v1.1.0 + github.com/willf/bitset v1.1.11 + golang.org/x/sys v0.27.0 +) + +require ( + 
github.com/deckarep/golang-set/v2 v2.3.1 + github.com/seccomp/libseccomp-golang v0.10.0 + golang.org/x/net v0.23.0 +) + +require ( + github.com/Microsoft/go-winio v0.4.16 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect + github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect + github.com/deckarep/golang-set v1.8.0 // indirect + github.com/distribution/reference v0.6.0 // indirect + github.com/docker/docker v26.0.0+incompatible // indirect + github.com/docker/go-connections v0.4.0 // indirect + github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect + github.com/gogo/protobuf v1.3.2 // indirect + github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 // indirect + github.com/karrick/godirwalk v1.16.1 // indirect + github.com/moby/docker-image-spec v1.3.1 // indirect + github.com/nestybox/sysbox-libs/formatter v0.0.0-00010101000000-000000000000 // indirect + github.com/opencontainers/go-digest v1.0.0 // indirect + github.com/opencontainers/image-spec v1.0.2 // indirect + github.com/russross/blackfriday/v2 v2.1.0 // indirect + github.com/spf13/afero v1.4.1 // indirect + github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df // indirect + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 // indirect + go.opentelemetry.io/otel v1.32.0 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0 // indirect + go.opentelemetry.io/otel/metric v1.32.0 // indirect + go.opentelemetry.io/otel/sdk v1.32.0 // indirect + go.opentelemetry.io/otel/trace v1.32.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect + golang.org/x/text v0.15.0 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 // indirect + google.golang.org/grpc v1.64.0 // indirect + google.golang.org/protobuf v1.35.1 // indirect + gopkg.in/hlandau/service.v1 v1.0.7 // indirect +) + +replace ( + github.com/nestybox/sysbox-ipc => 
../sysbox-ipc + github.com/nestybox/sysbox-libs/capability => ../sysbox-libs/capability + github.com/nestybox/sysbox-libs/dockerUtils => ../sysbox-libs/dockerUtils + github.com/nestybox/sysbox-libs/formatter => ../sysbox-libs/formatter + github.com/nestybox/sysbox-libs/idMap => ../sysbox-libs/idMap + github.com/nestybox/sysbox-libs/idShiftUtils => ../sysbox-libs/idShiftUtils + github.com/nestybox/sysbox-libs/linuxUtils => ../sysbox-libs/linuxUtils + github.com/nestybox/sysbox-libs/mount => ../sysbox-libs/mount + github.com/nestybox/sysbox-libs/overlayUtils => ../sysbox-libs/overlayUtils + github.com/nestybox/sysbox-libs/shiftfs => ../sysbox-libs/shiftfs + github.com/nestybox/sysbox-libs/utils => ../sysbox-libs/utils + github.com/opencontainers/runc => ./ +) diff --git a/sysbox-runc/go.sum b/sysbox-runc/go.sum new file mode 100644 index 00000000..35e84c61 --- /dev/null +++ b/sysbox-runc/go.sum @@ -0,0 +1,202 @@ +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 h1:L/gRVlceqvL25UVaW/CKtUDjefjrs0SPonmDGUVOYP0= +github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161/go.mod h1:xomTg63KZ2rFqZQzSB4Vz2SUXa1BpHTVz9L5PTmPC4E= +github.com/BurntSushi/toml v1.3.2/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= +github.com/Masterminds/semver v1.5.0 h1:H65muMkzWKEuNDnfl9d70GUjFniHKHRbFPGBuZ3QEww= +github.com/Masterminds/semver v1.5.0/go.mod h1:MB6lktGJrhw8PrUyiEoblNEGEQ+RzHPF078ddwwvV3Y= +github.com/Microsoft/go-winio v0.4.16 h1:FtSW/jqD+l4ba5iPBj9CODVtgfYAD8w2wS923g/cFDk= +github.com/Microsoft/go-winio v0.4.16/go.mod h1:XB6nPKklQyQ7GC9LdcBEcBl8PF76WugXOPRXwdLnMv0= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= +github.com/checkpoint-restore/go-criu/v4 v4.1.0 h1:WW2B2uxx9KWF6bGlHqhm8Okiafwwx7Y2kcpn8lCpjgo= +github.com/checkpoint-restore/go-criu/v4 v4.1.0/go.mod h1:xUQBLp4RLc5zJtWY++yjOoMoB5lihDt7fai+75m+rGw= 
+github.com/cilium/ebpf v0.3.0 h1:LI3lsl5GmTh+OFYamrj8sp+R0yam38zHG6NTDhSlNmQ= +github.com/cilium/ebpf v0.3.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= +github.com/containerd/console v1.0.1 h1:u7SFAJyRqWcG6ogaMAx3KjSTy1e3hT9QxqX7Jco7dRc= +github.com/containerd/console v1.0.1/go.mod h1:XUsP6YE/mKtz6bxc+I8UiKKTP04qjQL4qcS3XoQ5xkw= +github.com/containerd/log v0.1.0 h1:TCJt7ioM2cr/tfR8GPbGf9/VRAX8D2B4PjzCpfX540I= +github.com/containerd/log v0.1.0/go.mod h1:VRRf09a7mHDIRezVKTRCrOq78v577GXq3bSa3EhrzVo= +github.com/coreos/go-systemd/v22 v22.1.0 h1:kq/SbG2BCKLkDKkjQf5OWwKWUKj1lgs3lFI4PxnR5lg= +github.com/coreos/go-systemd/v22 v22.1.0/go.mod h1:xO0FLkIi5MaZafQlIrOotqXZ90ih+1atmu1JpKERPPk= +github.com/cpuguy83/go-md2man/v2 v2.0.2 h1:p1EgwI/C7NhT0JmVkwCD2ZBK8j4aeHQX2pMHHBfMQ6w= +github.com/cpuguy83/go-md2man/v2 v2.0.2/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= +github.com/cyphar/filepath-securejoin v0.2.2 h1:jCwT2GTP+PY5nBz3c/YL5PAIbusElVrPujOBSCj8xRg= +github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/deckarep/golang-set v1.8.0 h1:sk9/l/KqpunDwP7pSjUg0keiOOLEnOBHzykLrsPppp4= +github.com/deckarep/golang-set v1.8.0/go.mod h1:5nI87KwE7wgsBU1F4GKAw2Qod7p5kyS383rP6+o6qqo= +github.com/deckarep/golang-set/v2 v2.3.1 h1:vjmkvJt/IV27WXPyYQpAh4bRyWJc5Y435D17XQ9QU5A= +github.com/deckarep/golang-set/v2 v2.3.1/go.mod h1:VAky9rY/yGXJOLEDv3OMci+7wtDpOF4IN+y82NBOac4= +github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= +github.com/distribution/reference v0.6.0/go.mod h1:BbU0aIcezP1/5jX/8MP0YiH4SdvB5Y4f/wlDRiLyi3E= +github.com/docker/docker v26.0.0+incompatible h1:Ng2qi+gdKADUa/VM+6b6YaY2nlZhk/lVJiKR/2bMudU= 
+github.com/docker/docker v26.0.0+incompatible/go.mod h1:eEKB0N0r5NX/I1kEveEz05bcu8tLC/8azJZsviup8Sk= +github.com/docker/go-connections v0.4.0 h1:El9xVISelRB7BuFusrZozjnkIM5YnzCViNKohAFqRJQ= +github.com/docker/go-connections v0.4.0/go.mod h1:Gbd7IOopHjR8Iph03tsViu4nIes5XhDvyHbTtUxmeec= +github.com/docker/go-units v0.4.0 h1:3uh0PgVws3nIA0Q+MwDC8yjEPf9zjRfZZWXZYDct3Tw= +github.com/docker/go-units v0.4.0/go.mod h1:fgPhTUdO+D/Jk86RDLlptpiXQzgHJF7gydDDbaIK4Dk= +github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= +github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= +github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= +github.com/godbus/dbus/v5 v5.0.3 h1:ZqHaoEF7TBzh4jzPmqVhE/5A1z9of6orkAe5uHoAeME= +github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= +github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q= +github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= +github.com/golang/protobuf v1.3.5/go.mod h1:6O5/vntMXwX2lRkT1hjjk0nAC1IDOTvTlVgjlRvqsdk= +github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek= +github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps= +github.com/google/go-cmp v0.5.2/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= 
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0 h1:bkypFPDjIYGfCYD5mRBvpqxfYX1YCS1PXdKYWi8FsN0= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.20.0/go.mod h1:P+Lt/0by1T8bfcF3z737NnSbmxQAppXMRziHUxPOC8k= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531 h1:hgVxRoDDPtQE68PT4LFvNlPz2nBKd3OMlGKIQ69OmR4= +github.com/joshlf/go-acl v0.0.0-20200411065538-eae00ae38531/go.mod h1:fqTUQpVYBvhCNIsMXGl2GE9q6z94DIP6NtFKXCSTVbg= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d h1:J8tJzRyiddAFF65YVgxli+TyWBi0f79Sld6rJP6CBcY= +github.com/joshlf/testutil v0.0.0-20170608050642-b5d8aa79d93d/go.mod h1:b+Q3v8Yrg5o15d71PSUraUzYb+jWl6wQMSBXSGS/hv0= +github.com/karrick/godirwalk v1.16.1 h1:DynhcF+bztK8gooS0+NDJFrdNZjJ3gzVzC545UNA9iw= +github.com/karrick/godirwalk v1.16.1/go.mod h1:j4mkqPuvaLI8mp1DroR3P6ad7cyYd4c1qeJ3RV7ULlk= +github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= +github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ= +github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= +github.com/moby/docker-image-spec v1.3.1 h1:jMKff3w6PgbfSa69GfNg+zN/XLhfXJGnEx3Nl2EsFP0= +github.com/moby/docker-image-spec v1.3.1/go.mod h1:eKmb5VW8vQEh/BAr2yvVNvuiJuY6UIocYsFu/DxxRpo= +github.com/moby/sys/mountinfo v0.4.0 h1:1KInV3Huv18akCu58V7lzNlt+jFmqlu1EaErnEHE/VM= +github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= +github.com/moby/term v0.0.0-20201216013528-df9cb8a40635 h1:rzf0wL0CHVc8CEsgyygG0Mn9CNCCPZqOPaz8RiiHYQk= +github.com/moby/term v0.0.0-20201216013528-df9cb8a40635/go.mod h1:FBS0z0QWA44HXygs7VXDUOGoN/1TV3RuWkLO04am3wc= +github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= +github.com/morikuni/aec v1.0.0/go.mod 
h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= +github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4= +github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/opencontainers/go-digest v1.0.0 h1:apOUWs51W5PlhuyGyz9FCeeBIOUDA/6nW8Oi/yOhh5U= +github.com/opencontainers/go-digest v1.0.0/go.mod h1:0JzlMkj0TRzQZfJkVvzbP0HBR3IKzErnv2BNG4W4MAM= +github.com/opencontainers/image-spec v1.0.2 h1:9yCKha/T5XdGtO0q9Q9a6T5NUCsTn/DrBg0D7ufOcFM= +github.com/opencontainers/image-spec v1.0.2/go.mod h1:BtxoFyWECRxE4U/7sNtV5W15zMzWCbyJoFRP3s7yZA0= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/selinux v1.8.0 h1:+77ba4ar4jsCbL1GLbFL8fFM57w6suPfSS9PDLDY7KM= +github.com/opencontainers/selinux v1.8.0/go.mod h1:RScLhm78qiWa2gbVCcGkC7tCGdgk3ogry1nUQF8Evvo= +github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= +github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= +github.com/pkg/profile v1.5.0 h1:042Buzk+NhDI+DeSAA62RwJL8VAuZUMQZUjCsRz1Mug= +github.com/pkg/profile v1.5.0/go.mod h1:qBsxPvzyUincmltOk6iyRVxHYg4adc0OFOv72ZdLa18= +github.com/pkg/sftp v1.10.1/go.mod h1:lYOWFsE0bwd1+KfKJaKeuokY15vzFx25BLbzYYoAxZI= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= +github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= +github.com/seccomp/libseccomp-golang v0.10.0 
h1:aA4bp+/Zzi0BnWZ2F1wgNBs5gTpm+na2rWM6M9YjLpY= +github.com/seccomp/libseccomp-golang v0.10.0/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/sirupsen/logrus v1.4.1/go.mod h1:ni0Sbl8bgC9z8RoU9G6nDWqqs/fq4eDPysMBDgk/93Q= +github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= +github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/spf13/afero v1.4.1 h1:asw9sl74539yqavKaglDM5hFpdJVK0Y5Dr/JOgQ89nQ= +github.com/spf13/afero v1.4.1/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= +github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/urfave/cli v1.22.14 h1:ebbhrRiGK2i4naQJr+1Xj92HXZCrK7MsyTS/ob3HnAk= +github.com/urfave/cli v1.22.14/go.mod h1:X0eDS6pD6Exaclxm99NJ3FiCDRED7vIHpx2mDOHLvkA= +github.com/vishvananda/netlink v1.1.0 h1:1iyaYNBLmP6L0220aDnYQpo1QEV4t4hJ+xEEhhJH8j0= +github.com/vishvananda/netlink v1.1.0/go.mod h1:cTgwzPIzzgDAYoQrMm0EdrjRUBkTqKYppBueQtXaqoE= 
+github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df h1:OviZH7qLw/7ZovXvuNyL3XQl8UFofeikI1NW1Gypu7k= +github.com/vishvananda/netns v0.0.0-20191106174202-0a2b9b5464df/go.mod h1:JP3t17pCcGlemwknint6hfoeCVQrEMVwxRLRjXpq+BU= +github.com/willf/bitset v1.1.11 h1:N7Z7E9UvjW+sGsEl7k/SJrvY2reP1A07MrGuCjIOjRE= +github.com/willf/bitset v1.1.11/go.mod h1:83CECat5yLh5zVOf4P1ErAgKA5UDvKtgyUABdr3+MjI= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0 h1:Xs2Ncz0gNihqu9iosIZ5SkBbWo5T8JhhLJFMQL1qmLI= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.51.0/go.mod h1:vy+2G/6NvVMpwGX/NyLqcC41fxepnuKHk16E6IZUcJc= +go.opentelemetry.io/otel v1.32.0 h1:WnBN+Xjcteh0zdk01SVqV55d/m62NJLJdIyb4y/WO5U= +go.opentelemetry.io/otel v1.32.0/go.mod h1:00DCVSB0RQcnzlwyTfqtxSm+DRr9hpYrHjNGiBHVQIg= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0 h1:IJFEoHiytixx8cMiVAO+GmHR6Frwu+u5Ur8njpFO6Ac= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.32.0/go.mod h1:3rHrKNtLIoS0oZwkY2vxi+oJcwFRWdtUyRII+so45p8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0 h1:1wp/gyxsuYtuE/JFxsQRtcCDtMrO2qMvlfXALU5wkzI= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.26.0/go.mod h1:gbTHmghkGgqxMomVQQMur1Nba4M0MQ8AYThXDUjsJ38= +go.opentelemetry.io/otel/metric v1.32.0 h1:xV2umtmNcThh2/a/aCP+h64Xx5wsj8qqnkYZktzNa0M= +go.opentelemetry.io/otel/metric v1.32.0/go.mod h1:jH7CIbbK6SH2V2wE16W05BHCtIDzauciCRLoc/SyMv8= +go.opentelemetry.io/otel/sdk v1.32.0 h1:RNxepc9vK59A8XsgZQouW8ue8Gkb4jpWtJm9ge5lEG4= +go.opentelemetry.io/otel/sdk v1.32.0/go.mod h1:LqgegDBjKMmb2GC6/PrTnteJG39I8/vJCAP9LlJXEjU= +go.opentelemetry.io/otel/trace v1.32.0 h1:WIC9mYrXf8TmY/EXuULKc8hR17vE+Hjv2cssQDe03fM= +go.opentelemetry.io/otel/trace v1.32.0/go.mod 
h1:+i4rkvCraA+tG6AzwloGaCtkx53Fa+L+V8e9a7YvhT8= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20190820162420-60c769a6c586/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys 
v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190606203320-7fc4e5ec1444/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20191115151921-52ab43148777/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200909081042-eff7692f9009/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200916030750-2334cc1a136f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= +golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.15.0 h1:h1V/4gjBv8v9cjcR6+AR5+/cIYK5N/WAgiv4xlsEtAk= +golang.org/x/text v0.15.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324 h1:Hir2P/De0WpUhtrKGGjvSb2YxUgyZ7EFOSLIcSSpiwE= +golang.org/x/time v0.0.0-20201208040808-7e3f01d25324/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools 
v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/genproto/googleapis/api v0.0.0-20240513163218-0867130af1f8 h1:W5Xj/70xIA4x60O/IFyXivR5MGqblAb8R3w26pnD6No= +google.golang.org/genproto/googleapis/api v0.0.0-20240513163218-0867130af1f8/go.mod h1:vPrPUTsDCYxXWjP7clS81mZ6/803D8K4iM9Ma27VKas= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8 h1:mxSlqyb8ZAHsYDCfiXN1EDdNTdvjUJSLY+OnAUtYNYA= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240513163218-0867130af1f8/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM= +google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY= +google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg= +google.golang.org/protobuf v1.35.1 h1:m3LfL6/Ca+fqnjnlqQXNpFPABW1UD7mjh8KO2mKFytA= +google.golang.org/protobuf v1.35.1/go.mod h1:9fA7Ob0pmnwhb644+1+CVWFRbNajQ6iRojtC/QF5bRE= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/hlandau/service.v1 v1.0.7 h1:16G5AJ1Cp8Vr65QItJXpyAIzf/FWAWCZBsTgsc6eyA8= +gopkg.in/hlandau/service.v1 v1.0.7/go.mod h1:sZw6ksxcoafC04GoZtw32UeqqEuPSABX35lVBaJP/bE= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gopkg.in/yaml.v3 v3.0.1 
h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= +gotest.tools/v3 v3.0.3/go.mod h1:Z7Lb0S5l+klDB31fvDQX8ss/FlKDxtlFlw3Oa8Ymbl8= diff --git a/sysbox-runc/init.go b/sysbox-runc/init.go new file mode 100644 index 00000000..335f80a7 --- /dev/null +++ b/sysbox-runc/init.go @@ -0,0 +1,50 @@ +package main + +import ( + "fmt" + "os" + "runtime" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/logs" + _ "github.com/opencontainers/runc/libcontainer/nsenter" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + + level := os.Getenv("_LIBCONTAINER_LOGLEVEL") + logLevel, err := logrus.ParseLevel(level) + if err != nil { + panic(fmt.Sprintf("libcontainer: failed to parse log level: %q: %v", level, err)) + } + + err = logs.ConfigureLogging(logs.Config{ + LogPipeFd: os.Getenv("_LIBCONTAINER_LOGPIPE"), + LogFormat: "json", + LogLevel: logLevel, + }) + if err != nil { + panic(fmt.Sprintf("libcontainer: failed to configure logging: %v", err)) + } + logrus.Debug("child process in init()") + } +} + +var initCommand = cli.Command{ + Name: "init", + Usage: `initialize the namespaces and launch the process (do not call it outside of sysbox-runc)`, + Action: func(context *cli.Context) error { + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + // as the error is sent back to the parent there is no need to log + // or write it to stderr because the parent process will handle this + os.Exit(1) + } + panic("libcontainer: container init failed to exec") + }, +} diff --git a/sysbox-runc/kill.go b/sysbox-runc/kill.go new file mode 100644 index 00000000..1024dc42 --- /dev/null +++ b/sysbox-runc/kill.go @@ -0,0 +1,72 @@ +// +build linux + 
+package main + +import ( + "fmt" + "strconv" + "strings" + + "github.com/urfave/cli" + "golang.org/x/sys/unix" +) + +var killCommand = cli.Command{ + Name: "kill", + Usage: "kill sends the specified signal (default: SIGTERM) to the container's init process", + ArgsUsage: ` [signal] + +Where "" is the name for the instance of the container and +"[signal]" is the signal to be sent to the init process. + +EXAMPLE: +For example, if the container id is "ubuntu01" the following will send a "KILL" +signal to the init process of the "ubuntu01" container: + + # sysbox-runc kill ubuntu01 KILL`, + Flags: []cli.Flag{ + cli.BoolFlag{ + Name: "all, a", + Usage: "send the specified signal to all processes inside the container", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, minArgs); err != nil { + return err + } + if err := checkArgs(context, 2, maxArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + + sigstr := context.Args().Get(1) + if sigstr == "" { + sigstr = "SIGTERM" + } + + signal, err := parseSignal(sigstr) + if err != nil { + return err + } + return container.Signal(signal, context.Bool("all")) + }, +} + +func parseSignal(rawSignal string) (unix.Signal, error) { + s, err := strconv.Atoi(rawSignal) + if err == nil { + return unix.Signal(s), nil + } + sig := strings.ToUpper(rawSignal) + if !strings.HasPrefix(sig, "SIG") { + sig = "SIG" + sig + } + signal := unix.SignalNum(sig) + if signal == 0 { + return -1, fmt.Errorf("unknown signal %q", rawSignal) + } + return signal, nil +} diff --git a/sysbox-runc/libcontainer/README.md b/sysbox-runc/libcontainer/README.md new file mode 100644 index 00000000..b6adb284 --- /dev/null +++ b/sysbox-runc/libcontainer/README.md @@ -0,0 +1,331 @@ +# sysbox-runc libcontainer + +The sysbox-runc libcontainer is a fork of the OCI runc libcontainer library. It +has been modified to support creation and management of system containers. 
+ +Libcontainer provides a native Go implementation for creating containers +with namespaces, cgroups, capabilities, and filesystem access controls. +It allows you to manage the lifecycle of the container, performing additional operations +after the container is created. + + +#### Container +A container is a self-contained execution environment that shares the kernel of the +host system and which is (optionally) isolated from other containers in the system. + +#### Using libcontainer + +Because containers are spawned in a two-step process you will need a binary that +will be executed as the init process for the container. In libcontainer, we use +the current binary (/proc/self/exe) as the init process, invoked with the +arg "init". We call this first-step process "bootstrap", so you always need an "init" +function as the entry point of "bootstrap". + +In addition to the Go init function, the early-stage bootstrap is handled by importing +[nsenter](https://github.com/opencontainers/runc/blob/master/libcontainer/nsenter/README.md). + +```go +import ( + _ "github.com/opencontainers/runc/libcontainer/nsenter" +) + +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + logrus.Fatal(err) + } + panic("--this line should have never been executed, congratulations--") + } +} +``` + +Then, to create a container, you first have to initialize an instance of a factory +that will handle the creation and initialization for a container. + +```go +factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init")) +if err != nil { + logrus.Fatal(err) + return +} +``` + +Once you have created an instance of the factory, you can create a configuration +struct describing how the container is to be created. 
A sample would look similar to this: + +```go +defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV +config := &configs.Config{ + Rootfs: "/your/path/to/rootfs", + Capabilities: &configs.Capabilities{ + Bounding: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Effective: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Inheritable: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Permitted: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Ambient: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWUSER}, + {Type: configs.NEWNET}, + {Type: configs.NEWCGROUP}, + }), + Cgroups: &configs.Cgroup{ + Name: "test-container", + Parent: "system", + Resources: &configs.Resources{ + MemorySwappiness: nil, + Devices: 
specconv.AllowedDevices, + }, + }, + MaskPaths: []string{ + "/proc/kcore", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + }, + Devices: specconv.AllowedDevices, + Hostname: "testing", + Mounts: []*configs.Mount{ + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: unix.MS_NOSUID | unix.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: unix.MS_NOSUID | unix.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | unix.MS_RDONLY, + }, + }, + UidMappings: []configs.IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 65536, + }, + }, + GidMappings: []configs.IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 65536, + }, + }, + Networks: []*configs.Network{ + { + Type: "loopback", + Address: "127.0.0.1/0", + Gateway: "localhost", + }, + }, + Rlimits: []configs.Rlimit{ + { + Type: unix.RLIMIT_NOFILE, + Hard: uint64(1025), + Soft: uint64(1025), + }, + }, +} +``` + +Once you have the configuration populated you can create a container: + +```go +container, err := factory.Create("container-id", config) +if err != nil { + logrus.Fatal(err) + return +} +``` + +To spawn bash as the initial process inside the container and have the +processes pid returned in order to wait, signal, or kill the process: + +```go +process := &libcontainer.Process{ + Args: []string{"/bin/bash"}, + Env: []string{"PATH=/bin"}, + User: "daemon", + Stdin: os.Stdin, + Stdout: os.Stdout, + Stderr: os.Stderr, 
+ Init: true, +} + +err := container.Run(process) +if err != nil { + container.Destroy() + logrus.Fatal(err) + return +} + +// wait for the process to finish. +_, err := process.Wait() +if err != nil { + logrus.Fatal(err) +} + +// destroy the container. +container.Destroy() +``` + +Additional ways to interact with a running container are: + +```go +// return all the pids for all processes running inside the container. +processes, err := container.Processes() + +// get detailed cpu, memory, io, and network statistics for the container and +// it's processes. +stats, err := container.Stats() + +// pause all processes inside the container. +container.Pause() + +// resume all paused processes. +container.Resume() + +// send signal to container's init process. +container.Signal(signal) + +// update container resource constraints. +container.Set(config) + +// get current status of the container. +status, err := container.Status() + +// get current container's state information. +state, err := container.State() +``` + + +#### Checkpoint & Restore + +libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers. +This let's you save the state of a process running inside a container to disk, and then restore +that state into a new process, on the same machine or on another machine. + +`criu` version 1.5.2 or higher is required to use checkpoint and restore. +If you don't already have `criu` installed, you can build it from source, following the +[online instructions](http://criu.org/Installation). `criu` is also installed in the docker image +generated when building libcontainer with docker. + + +## Copyright and license + +Code and documentation copyright 2014 Docker, inc. +The code and documentation are released under the [Apache 2.0 license](../LICENSE). +The documentation is also released under Creative Commons Attribution 4.0 International License. 
+You may obtain a copy of the license, titled CC-BY-4.0, at http://creativecommons.org/licenses/by/4.0/. diff --git a/sysbox-runc/libcontainer/SPEC.md b/sysbox-runc/libcontainer/SPEC.md new file mode 100644 index 00000000..07ebdc12 --- /dev/null +++ b/sysbox-runc/libcontainer/SPEC.md @@ -0,0 +1,465 @@ +## Container Specification - v1 + +This is the standard configuration for version 1 containers. It includes +namespaces, standard filesystem setup, a default Linux capability set, and +information about resource reservations. It also has information about any +populated environment settings for the processes running inside a container. + +Along with the configuration of how a container is created the standard also +discusses actions that can be performed on a container to manage and inspect +information about the processes running inside. + +The v1 profile is meant to be able to accommodate the majority of applications +with a strong security configuration. + +### System Requirements and Compatibility + +Minimum requirements: +* Kernel version - 3.10 recommended 2.6.2x minimum(with backported patches) +* Mounted cgroups with each subsystem in its own hierarchy + + +### Namespaces + +| Flag | Enabled | +| --------------- | ------- | +| CLONE_NEWPID | 1 | +| CLONE_NEWUTS | 1 | +| CLONE_NEWIPC | 1 | +| CLONE_NEWNET | 1 | +| CLONE_NEWNS | 1 | +| CLONE_NEWUSER | 1 | +| CLONE_NEWCGROUP | 1 | + +Namespaces are created for the container via the `unshare` syscall. + + +### Filesystem + +A root filesystem must be provided to a container for execution. The container +will use this root filesystem (rootfs) to jail and spawn processes inside where +the binaries and system libraries are local to that directory. Any binaries +to be executed must be contained within this rootfs. 
+ +Mounts that happen inside the container are automatically cleaned up when the +container exits as the mount namespace is destroyed and the kernel will +unmount all the mounts that were setup within that namespace. + +For a container to execute properly there are certain filesystems that +are required to be mounted within the rootfs that the runtime will setup. + +| Path | Type | Flags | Data | +| ----------- | ------ | -------------------------------------- | ---------------------------------------- | +| /proc | proc | MS_NOEXEC,MS_NOSUID,MS_NODEV | | +| /dev | tmpfs | MS_NOEXEC,MS_STRICTATIME | mode=755 | +| /dev/shm | tmpfs | MS_NOEXEC,MS_NOSUID,MS_NODEV | mode=1777,size=65536k | +| /dev/mqueue | mqueue | MS_NOEXEC,MS_NOSUID,MS_NODEV | | +| /dev/pts | devpts | MS_NOEXEC,MS_NOSUID | newinstance,ptmxmode=0666,mode=620,gid=5 | +| /sys | sysfs | MS_NOEXEC,MS_NOSUID,MS_NODEV,MS_RDONLY | | + + +After a container's filesystems are mounted within the newly created +mount namespace `/dev` will need to be populated with a set of device nodes. +It is expected that a rootfs does not need to have any device nodes specified +for `/dev` within the rootfs as the container will setup the correct devices +that are required for executing a container's process. + +| Path | Mode | Access | +| ------------ | ---- | ---------- | +| /dev/null | 0666 | rwm | +| /dev/zero | 0666 | rwm | +| /dev/full | 0666 | rwm | +| /dev/tty | 0666 | rwm | +| /dev/random | 0666 | rwm | +| /dev/urandom | 0666 | rwm | + + +**ptmx** +`/dev/ptmx` will need to be a symlink to the host's `/dev/ptmx` within +the container. + +The use of a pseudo TTY is optional within a container and it should support both. +If a pseudo is provided to the container `/dev/console` will need to be +setup by binding the console in `/dev/` after it has been populated and mounted +in tmpfs. 
+ +| Source | Destination | UID GID | Mode | Type | +| --------------- | ------------ | ------- | ---- | ---- | +| *pty host path* | /dev/console | 0 0 | 0600 | bind | + + +After `/dev/null` has been setup we check for any external links between +the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing +to `/dev/null` outside the container we close and `dup2` the `/dev/null` +that is local to the container's rootfs. + + +After the container has `/proc` mounted a few standard symlinks are setup +within `/dev/` for the io. + +| Source | Destination | +| --------------- | ----------- | +| /proc/self/fd | /dev/fd | +| /proc/self/fd/0 | /dev/stdin | +| /proc/self/fd/1 | /dev/stdout | +| /proc/self/fd/2 | /dev/stderr | + +A `pivot_root` is used to change the root for the process, effectively +jailing the process inside the rootfs. + +```c +put_old = mkdir(...); +pivot_root(rootfs, put_old); +chdir("/"); +unmount(put_old, MS_DETACH); +rmdir(put_old); +``` + +For container's running with a rootfs inside `ramfs` a `MS_MOVE` combined +with a `chroot` is required as `pivot_root` is not supported in `ramfs`. + +```c +mount(rootfs, "/", NULL, MS_MOVE, NULL); +chroot("."); +chdir("/"); +``` + +The `umask` is set back to `0022` after the filesystem setup has been completed. + +### Resources + +Cgroups are used to handle resource allocation for containers. This includes +system resources like cpu, memory, and device access. + +| Subsystem | Enabled | +| ---------- | ------- | +| devices | 1 | +| memory | 1 | +| cpu | 1 | +| cpuacct | 1 | +| cpuset | 1 | +| blkio | 1 | +| perf_event | 1 | +| freezer | 1 | +| hugetlb | 1 | +| pids | 1 | + + +All cgroup subsystem are joined so that statistics can be collected from +each of the subsystems. Freezer does not expose any stats but is joined +so that containers can be paused and resumed. + +The parent process of the container's init must place the init pid inside +the correct cgroups before the initialization begins. 
This is done so +that no processes or threads escape the cgroups. This sync is +done via a pipe ( specified in the runtime section below ) that the container's +init process will block waiting for the parent to finish setup. + +### IntelRdt + +Intel platforms with new Xeon CPU support Resource Director Technology (RDT). +Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are +two sub-features of RDT. + +Cache Allocation Technology (CAT) provides a way for the software to restrict +cache allocation to a defined 'subset' of L3 cache which may be overlapping +with other 'subsets'. The different subsets are identified by class of +service (CLOS) and each CLOS has a capacity bitmask (CBM). + +Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle +over memory bandwidth for the software. A user controls the resource by +indicating the percentage of maximum memory bandwidth or memory bandwidth limit +in MBps unit if MBA Software Controller is enabled. + +It can be used to handle L3 cache and memory bandwidth resources allocation +for containers if hardware and kernel support Intel RDT CAT and MBA features. + +In Linux 4.10 kernel or newer, the interface is defined and exposed via +"resource control" filesystem, which is a "cgroup-like" interface. + +Comparing with cgroups, it has similar process management lifecycle and +interfaces in a container. But unlike cgroups' hierarchy, it has single level +filesystem layout. + +CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via +"resource control" filesystem. + +Intel RDT "resource control" filesystem hierarchy: +``` +mount -t resctrl resctrl /sys/fs/resctrl +tree /sys/fs/resctrl +/sys/fs/resctrl/ +|-- info +| |-- L3 +| | |-- cbm_mask +| | |-- min_cbm_bits +| | |-- num_closids +| |-- MB +| |-- bandwidth_gran +| |-- delay_linear +| |-- min_bandwidth +| |-- num_closids +|-- ... +|-- schemata +|-- tasks +|-- + |-- ... 
+ |-- schemata + |-- tasks +``` + +For runc, we can make use of `tasks` and `schemata` configuration for L3 +cache and memory bandwidth resources constraints. + +The file `tasks` has a list of tasks that belongs to this group (e.g., +" group). Tasks can be added to a group by writing the task ID +to the "tasks" file (which will automatically remove them from the previous +group to which they belonged). New tasks created by fork(2) and clone(2) are +added to the same group as their parent. + +The file `schemata` has a list of all the resources available to this group. +Each resource (L3 cache, memory bandwidth) has its own line and format. + +L3 cache schema: +It has allocation bitmasks/values for L3 cache on each socket, which +contains L3 cache id and capacity bitmask (CBM). +``` + Format: "L3:=;=;..." +``` +For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" +which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + +The valid L3 cache CBM is a *contiguous bits set* and number of bits that can +be set is less than the max bit. The max bits in the CBM is varied among +supported Intel CPU models. Kernel will check if it is valid when writing. +e.g., default value 0xfffff in root indicates the max bits of CBM is 20 +bits, which mapping to entire L3 cache capacity. Some valid CBM values to +set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + +Memory bandwidth schema: +It has allocation values for memory bandwidth on each socket, which contains +L3 cache id and memory bandwidth. +``` + Format: "MB:=bandwidth0;=bandwidth1;..." +``` +For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + +The minimum bandwidth percentage value for each CPU model is predefined and +can be looked up through "info/MB/min_bandwidth". The bandwidth granularity +that is allocated is also dependent on the CPU model and can be looked up at +"info/MB/bandwidth_gran". 
The available bandwidth control steps are: +min_bw + N * bw_gran. Intermediate values are rounded to the next control +step available on the hardware. + +If MBA Software Controller is enabled through mount option "-o mba_MBps" +mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl +We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit +instead of "percentages". The kernel underneath would use a software feedback +mechanism or a "Software Controller" which reads the actual bandwidth using +MBM counters and adjust the memory bandwidth percentages to ensure: +"actual memory bandwidth < user specified memory bandwidth". + +For example, on a two-socket machine, the schema line could be +"MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0 +and 7000 MBps memory bandwidth limit on socket 1. + +For more information about Intel RDT kernel interface: +https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + +``` +An example for runc: +Consider a two-socket machine with two L3 caches where the default CBM is +0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% +with a memory bandwidth granularity of 10%. + +Tasks inside the container only have access to the "upper" 7/11 of L3 cache +on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a +maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. + +"linux": { + "intelRdt": { + "closID": "guaranteed_group", + "l3CacheSchema": "L3:0=7f0;1=1f", + "memBwSchema": "MB:0=20;1=70" + } +} +``` + +### Security + +The standard set of Linux capabilities that are set in a container +provide a good default for security and flexibility for the applications. 
+ + +| Capability | Enabled | +| -------------------- | ------- | +| CAP_NET_RAW | 1 | +| CAP_NET_BIND_SERVICE | 1 | +| CAP_AUDIT_READ | 1 | +| CAP_AUDIT_WRITE | 1 | +| CAP_DAC_OVERRIDE | 1 | +| CAP_SETFCAP | 1 | +| CAP_SETPCAP | 1 | +| CAP_SETGID | 1 | +| CAP_SETUID | 1 | +| CAP_MKNOD | 1 | +| CAP_CHOWN | 1 | +| CAP_FOWNER | 1 | +| CAP_FSETID | 1 | +| CAP_KILL | 1 | +| CAP_SYS_CHROOT | 1 | +| CAP_NET_BROADCAST | 0 | +| CAP_SYS_MODULE | 0 | +| CAP_SYS_RAWIO | 0 | +| CAP_SYS_PACCT | 0 | +| CAP_SYS_ADMIN | 0 | +| CAP_SYS_NICE | 0 | +| CAP_SYS_RESOURCE | 0 | +| CAP_SYS_TIME | 0 | +| CAP_SYS_TTY_CONFIG | 0 | +| CAP_AUDIT_CONTROL | 0 | +| CAP_MAC_OVERRIDE | 0 | +| CAP_MAC_ADMIN | 0 | +| CAP_NET_ADMIN | 0 | +| CAP_SYSLOG | 0 | +| CAP_DAC_READ_SEARCH | 0 | +| CAP_LINUX_IMMUTABLE | 0 | +| CAP_IPC_LOCK | 0 | +| CAP_IPC_OWNER | 0 | +| CAP_SYS_PTRACE | 0 | +| CAP_SYS_BOOT | 0 | +| CAP_LEASE | 0 | +| CAP_WAKE_ALARM | 0 | +| CAP_BLOCK_SUSPEND | 0 | + + +Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor) +and [selinux](http://selinuxproject.org/page/Main_Page) can be used with +the containers. A container should support setting an apparmor profile or +selinux process and mount labels if provided in the configuration. 
+ +Standard apparmor profile: +```c +#include +profile flags=(attach_disconnected,mediate_deleted) { + #include + network, + capability, + file, + umount, + + deny @{PROC}/sys/fs/** wklx, + deny @{PROC}/sysrq-trigger rwklx, + deny @{PROC}/mem rwklx, + deny @{PROC}/kmem rwklx, + deny @{PROC}/sys/kernel/[^s][^h][^m]* wklx, + deny @{PROC}/sys/kernel/*/** wklx, + + deny mount, + + deny /sys/[^f]*/** wklx, + deny /sys/f[^s]*/** wklx, + deny /sys/fs/[^c]*/** wklx, + deny /sys/fs/c[^g]*/** wklx, + deny /sys/fs/cg[^r]*/** wklx, + deny /sys/firmware/efi/efivars/** rwklx, + deny /sys/kernel/security/** rwklx, +} +``` + +*TODO: seccomp work is being done to find a good default config* + +### Runtime and Init Process + +During container creation the parent process needs to talk to the container's init +process and have a form of synchronization. This is accomplished by creating +a pipe that is passed to the container's init. When the init process first spawns +it will block on its side of the pipe until the parent closes its side. This +allows the parent to have time to set the new process inside a cgroup hierarchy +and/or write any uid/gid mappings required for user namespaces. +The pipe is passed to the init process via FD 3. + +The application consuming libcontainer should be compiled statically. libcontainer +does not define any init process and the arguments provided are used to `exec` the +process inside the application. There should be no long running init within the +container spec. + +If a pseudo tty is provided to a container it will open and `dup2` the console +as the container's STDIN, STDOUT, STDERR as well as mounting the console +as `/dev/console`. + +An extra set of mounts are provided to a container and setup for use. A container's +rootfs can contain some non portable files inside that can cause side effects during +execution of a process. These files are usually created and populated with the container +specific information via the runtime. 
+ +**Extra runtime files:** +* /etc/hosts +* /etc/resolv.conf +* /etc/hostname +* /etc/localtime + + +#### Defaults + +There are a few defaults that can be overridden by users, but in their omission +these apply to processes within a container. + +| Type | Value | +| ------------------- | ------------------------------ | +| Parent Death Signal | SIGKILL | +| UID | 0 | +| GID | 0 | +| GROUPS | 0, NULL | +| CWD | "/" | +| $HOME | Current user's home dir or "/" | +| Readonly rootfs | false | +| Pseudo TTY | false | + + +## Actions + +After a container is created there is a standard set of actions that can +be done to the container. These actions are part of the public API for +a container. + +| Action | Description | +| -------------- | ------------------------------------------------------------------ | +| Get processes | Return all the pids for processes running inside a container | +| Get Stats | Return resource statistics for the container as a whole | +| Wait | Waits on the container's init process ( pid 1 ) | +| Wait Process | Wait on any of the container's processes returning the exit status | +| Destroy | Kill the container's init process and remove any filesystem state | +| Signal | Send a signal to the container's init process | +| Signal Process | Send a signal to any of the container's processes | +| Pause | Pause all processes inside the container | +| Resume | Resume all processes inside the container if paused | +| Exec | Execute a new process inside of the container ( requires setns ) | +| Set | Setup configs of the container after it's created | + +### Execute a new process inside of a running container + +User can execute a new process inside of a running container. Any binaries to be +executed must be accessible within the container's rootfs. + +The started process will run inside the container's rootfs. Any changes +made by the process to the container's filesystem will persist after the +process finished executing. 
+ +The started process will join all the container's existing namespaces. When the +container is paused, the process will also be paused and will resume when +the container is unpaused. The started process will only run when the container's +primary process (PID 1) is running, and will not be restarted when the container +is restarted. + +#### Planned additions + +The started process will have its own cgroups nested inside the container's +cgroups. This is used for process tracking and optionally resource allocation +handling for the new process. Freezer cgroup is required, the rest of the cgroups +are optional. The process executor must place its pid inside the correct +cgroups before starting the process. This is done so that no child processes or +threads can escape the cgroups. + +When the process is stopped, the process executor will try (in a best-effort way) +to stop all its children and remove the sub-cgroups. diff --git a/sysbox-runc/libcontainer/apparmor/apparmor_linux.go b/sysbox-runc/libcontainer/apparmor/apparmor_linux.go new file mode 100644 index 00000000..73965f12 --- /dev/null +++ b/sysbox-runc/libcontainer/apparmor/apparmor_linux.go @@ -0,0 +1,54 @@ +package apparmor + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +// IsEnabled returns true if apparmor is enabled for the host. 
+func IsEnabled() bool { + if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil { + buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled") + return err == nil && bytes.HasPrefix(buf, []byte("Y")) + } + return false +} + +func setProcAttr(attr, value string) error { + // Under AppArmor you can only change your own attr, so use /proc/self/ + // instead of /proc// like libapparmor does + f, err := os.OpenFile("/proc/self/attr/"+attr, os.O_WRONLY, 0) + if err != nil { + return err + } + defer f.Close() + + if err := utils.EnsureProcHandle(f); err != nil { + return err + } + + _, err = f.WriteString(value) + return err +} + +// changeOnExec reimplements aa_change_onexec from libapparmor in Go +func changeOnExec(name string) error { + if err := setProcAttr("exec", "exec "+name); err != nil { + return fmt.Errorf("apparmor failed to apply profile: %s", err) + } + return nil +} + +// ApplyProfile will apply the profile with the specified name to the process after +// the next exec. 
+func ApplyProfile(name string) error { + if name == "" { + return nil + } + + return changeOnExec(name) +} diff --git a/sysbox-runc/libcontainer/apparmor/apparmor_unsupported.go b/sysbox-runc/libcontainer/apparmor/apparmor_unsupported.go new file mode 100644 index 00000000..0bc473f8 --- /dev/null +++ b/sysbox-runc/libcontainer/apparmor/apparmor_unsupported.go @@ -0,0 +1,20 @@ +// +build !linux + +package apparmor + +import ( + "errors" +) + +var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") + +func IsEnabled() bool { + return false +} + +func ApplyProfile(name string) error { + if name != "" { + return ErrApparmorNotEnabled + } + return nil +} diff --git a/sysbox-runc/libcontainer/capabilities_linux.go b/sysbox-runc/libcontainer/capabilities_linux.go new file mode 100644 index 00000000..7e41f18b --- /dev/null +++ b/sysbox-runc/libcontainer/capabilities_linux.go @@ -0,0 +1,94 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "strings" + + "github.com/nestybox/sysbox-libs/capability" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS + +var capabilityMap map[string]capability.Cap + +func init() { + capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1) + for _, c := range capability.List() { + if c > capability.CAP_LAST_CAP { + continue + } + capabilityMap["CAP_"+strings.ToUpper(c.String())] = c + } +} + +func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) { + var ( + err error + caps containerCapabilities + ) + + if caps.bounding, err = capSlice(capConfig.Bounding); err != nil { + return nil, err + } + if caps.effective, err = capSlice(capConfig.Effective); err != nil { + return nil, err + } + if caps.inheritable, err = capSlice(capConfig.Inheritable); err != nil { + return nil, err + } + if caps.permitted, err = capSlice(capConfig.Permitted); err != nil { + 
return nil, err + } + if caps.ambient, err = capSlice(capConfig.Ambient); err != nil { + return nil, err + } + if caps.pid, err = capability.NewPid2(0); err != nil { + return nil, err + } + if err = caps.pid.Load(); err != nil { + return nil, err + } + return &caps, nil +} + +func capSlice(caps []string) ([]capability.Cap, error) { + out := make([]capability.Cap, len(caps)) + for i, c := range caps { + v, ok := capabilityMap[c] + if !ok { + return nil, fmt.Errorf("unknown capability %q", c) + } + out[i] = v + } + return out, nil +} + +type containerCapabilities struct { + pid capability.Capabilities + bounding []capability.Cap + effective []capability.Cap + inheritable []capability.Cap + permitted []capability.Cap + ambient []capability.Cap +} + +// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist. +func (c *containerCapabilities) ApplyBoundingSet() error { + c.pid.Clear(capability.BOUNDS) + c.pid.Set(capability.BOUNDS, c.bounding...) + return c.pid.Apply(capability.BOUNDS) +} + +// Apply sets all the capabilities for the current process in the config. +func (c *containerCapabilities) ApplyCaps() error { + c.pid.Clear(allCapabilityTypes) + c.pid.Set(capability.BOUNDS, c.bounding...) + c.pid.Set(capability.PERMITTED, c.permitted...) + c.pid.Set(capability.INHERITABLE, c.inheritable...) + c.pid.Set(capability.EFFECTIVE, c.effective...) + c.pid.Set(capability.AMBIENT, c.ambient...) + return c.pid.Apply(allCapabilityTypes) +} diff --git a/sysbox-runc/libcontainer/cgroups/cgroups.go b/sysbox-runc/libcontainer/cgroups/cgroups.go new file mode 100644 index 00000000..393a6d9e --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/cgroups.go @@ -0,0 +1,87 @@ +// +build linux + +package cgroups + +import ( + "github.com/opencontainers/runc/libcontainer/configs" +) + +// syscontCgroupRoot is the name of the host's cgroup subtree that is exposed / +// delegated inside the system container. 
This subtree lives under the cgroup +// hierarchy associated with the container itself. For example: +// +// /sys/fs/cgroup//docker//syscont-group-root + +var SyscontCgroupRoot string = "syscont-cgroup-root" + +type CgroupType int + +const ( + Cgroup_v1_fs CgroupType = iota + Cgroup_v1_systemd + Cgroup_v2_fs + Cgroup_v2_systemd +) + +type Manager interface { + // Applies cgroup configuration to the process with the specified pid + Apply(pid int) error + + // Returns the PIDs inside the cgroup set + GetPids() ([]int, error) + + // Returns the PIDs inside the cgroup set & all sub-cgroups + GetAllPids() ([]int, error) + + // Returns statistics for the cgroup set + GetStats() (*Stats, error) + + // Toggles the freezer cgroup according with specified state + Freeze(state configs.FreezerState) error + + // Destroys the cgroup set & all sub-cgroups + Destroy() error + + // Path returns a cgroup path to the specified controller/subsystem. + // For cgroupv2, the argument is unused and can be empty. + Path(string) string + + // Sets the cgroup as configured. + Set(container *configs.Config) error + + // GetPaths returns cgroup path(s) to save in a state file in order to restore later. + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the path + // to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the unified path. + GetPaths() map[string]string + + // GetCgroups returns the cgroup data as configured. + GetCgroups() (*configs.Cgroup, error) + + // GetFreezerState retrieves the current FreezerState of the cgroup. + GetFreezerState() (configs.FreezerState, error) + + // Whether the cgroup path exists or not + Exists() bool + + // sysbox-runc: creates a child cgroup that will serve as the cgroup root + // exposed inside the system container. We don't need a corresponding + // destroy method because the existing Destroy() method will destroy the + // child cgroup. 
+ CreateChildCgroup(container *configs.Config) error + + // sysbox-runc: applies child cgroup configuration to the process with the specified + // pid. Must be called after Apply() has been called because Apply() configures + // internal state in the cgroup manager that ApplyChildCgroup() does not. This + // awkwardness could be avoided if this interface had a separate Create() method as + // currently Apply() serves as both create and apply. + ApplyChildCgroup(pid int) error + + // sysbox-runc: same as GetPaths(), but returns child cgroup paths + GetChildCgroupPaths() map[string]string + + // sysbox-runc: get the type of the cgroup manager + GetType() CgroupType +} diff --git a/sysbox-runc/libcontainer/cgroups/cgroups_test.go b/sysbox-runc/libcontainer/cgroups/cgroups_test.go new file mode 100644 index 00000000..9efb83ec --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/cgroups_test.go @@ -0,0 +1,20 @@ +// +build linux + +package cgroups + +import ( + "testing" +) + +func TestParseCgroups(t *testing.T) { + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + t.Fatal(err) + } + if IsCgroup2UnifiedMode() { + return + } + if _, ok := cgroups["cpu"]; !ok { + t.Fail() + } +} diff --git a/sysbox-runc/libcontainer/cgroups/cgroups_unsupported.go b/sysbox-runc/libcontainer/cgroups/cgroups_unsupported.go new file mode 100644 index 00000000..278d507e --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/cgroups_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package cgroups diff --git a/sysbox-runc/libcontainer/cgroups/devices/devices_emulator.go b/sysbox-runc/libcontainer/cgroups/devices/devices_emulator.go new file mode 100644 index 00000000..3572a5ea --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/devices/devices_emulator.go @@ -0,0 +1,373 @@ +// +build linux + +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); 
+ * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bufio" + "io" + "regexp" + "sort" + "strconv" + + "github.com/opencontainers/runc/libcontainer/devices" + + "github.com/pkg/errors" +) + +// deviceMeta is a Rule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node devices.Type + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, Permissions). +type deviceRule struct { + meta deviceMeta + perms devices.Permissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]devices.Permissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). 
+ a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type Emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *Emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`) + +func parseLine(line string) (*deviceRule, error) { + matches := devicesListRegexp.FindStringSubmatch(line) + if matches == nil { + return nil, errors.Errorf("line doesn't match devices.list format") + } + var ( + rule deviceRule + node = matches[1] + major = matches[2] + minor = matches[3] + perms = matches[4] + ) + + // Parse the node type. + switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = devices.BlockDevice + case "c": + rule.meta.node = devices.CharDevice + default: + // Should never happen! + return nil, errors.Errorf("unknown device type %q", node) + } + + // Parse the major number. + if major == "*" { + rule.meta.major = devices.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, errors.Wrap(err, "parse major number") + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = devices.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, errors.Wrap(err, "parse minor number") + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. 
+ rule.perms = devices.Permissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + // Should never happen! + return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *Emulator) addRule(rule deviceRule) error { + if e.rules == nil { + e.rules = make(map[deviceMeta]devices.Permissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *Emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). + for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, + {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. 
+ partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return errors.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *Emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = errors.Wrap(e.rmRule(*rule), "remove 'deny' exception") + } else { + err = errors.Wrap(e.addRule(*rule), "add 'allow' exception") + } + return err +} + +func (e *Emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. 
+ if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = errors.Wrap(e.addRule(*rule), "add 'deny' exception") + } else { + err = errors.Wrap(e.rmRule(*rule), "remove 'allow' exception") + } + return err +} + +func (e *Emulator) Apply(rule devices.Rule) error { + if !rule.Type.CanCgroup() { + return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == devices.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } else { + return e.deny(innerRule) + } +} + +// EmulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func EmulatorFromList(list io.Reader) (*Emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &Emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, errors.Wrapf(err, "parsing line %q", line) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). 
+ if err := e.allow(deviceRule); err != nil { + return nil, errors.Wrapf(err, "adding devices.list rule") + } + } + if err := s.Err(); err != nil { + return nil, errors.Wrap(err, "reading devices.list lines") + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurrious +// device errors (if possible). +func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) { + var transitionRules []*devices.Rule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. + if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &devices.Rule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: devices.Permissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. 
This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} diff --git a/sysbox-runc/libcontainer/cgroups/devices/devices_emulator_test.go b/sysbox-runc/libcontainer/cgroups/devices/devices_emulator_test.go new file mode 100644 index 00000000..5dbcbdee --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/devices/devices_emulator_test.go @@ -0,0 +1,1114 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bytes" + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/devices" +) + +func TestDeviceEmulatorLoad(t *testing.T) { + tests := []struct { + name, list string + expected *Emulator + }{ + { + name: "BlacklistMode", + list: `a *:* rwm`, + expected: &Emulator{ + defaultAllow: true, + }, + }, + { + name: "WhitelistBasic", + list: `c 4:2 rw`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 4, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + }, + { + name: "WhitelistWildcard", + list: `b 0:* m`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 0, + minor: devices.Wildcard, + }: devices.Permissions("m"), + }, + }, + }, + { + name: "WhitelistDuplicate", + list: `c *:* rwm +c 1:1 r`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "WhitelistComplicated", + list: `c *:* m +b *:* m +c 1:3 rwm +c 1:5 rwm +c 1:7 rwm +c 1:8 rwm +c 1:9 rwm +c 5:0 rwm +c 5:2 rwm +c 136:* rwm +c 10:200 rwm`, + expected: &Emulator{ + defaultAllow: false, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("m"), + { + node: devices.CharDevice, + major: 1, + minor: 3, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 5, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 7, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 8, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 9, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 0, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 5, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 136, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 10, + minor: 200, + }: devices.Permissions("rwm"), + }, + }, + }, + // Some invalid lists. 
+ { + name: "InvalidFieldNumber", + list: `b 1:0`, + expected: nil, + }, + { + name: "InvalidDeviceType", + list: `p *:* rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber1", + list: `p -1:3 rwm`, + expected: nil, + }, + { + name: "InvalidMajorNumber2", + list: `c foo:27 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber1", + list: `b 1:-4 rwm`, + expected: nil, + }, + { + name: "InvalidMinorNumber2", + list: `b 1:foo rwm`, + expected: nil, + }, + { + name: "InvalidPermissions", + list: `b 1:7 rwk`, + expected: nil, + }, + } + + for _, test := range tests { + test := test // capture range variable + t.Run(test.name, func(t *testing.T) { + list := bytes.NewBufferString(test.list) + emu, err := EmulatorFromList(list) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when creating emulator: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when creating emulator: %#v", emu) + } + + if !reflect.DeepEqual(emu, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", emu, test.expected) + } + }) + } +} + +func testDeviceEmulatorApply(t *testing.T, baseDefaultAllow bool) { + tests := []struct { + name string + rule devices.Rule + base, expected *Emulator + }{ + // Switch between default modes. 
+ { + name: "SwitchToOtherMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: !baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameModeNoop", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + { + name: "SwitchToSameMode", + rule: devices.Rule{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: devices.Wildcard, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + { + node: devices.CharDevice, + major: 1, + minor: 1, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: nil, + }, + }, + // Rule addition logic. 
+ { + name: "RuleAdditionBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicate", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + // To match the kernel, we allow redundant rules. 
+ { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionBasicDuplicateNoop", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + }, + }, + }, + { + name: "RuleAdditionMerge", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeWildcard", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rm"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + 
minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: devices.Wildcard, + }: devices.Permissions("rwm"), + }, + }, + }, + { + name: "RuleAdditionMergeNoop", + rule: devices.Rule{ + Type: devices.BlockDevice, + Major: 5, + Minor: 12, + Permissions: devices.Permissions("r"), + Allow: !baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 2, + minor: 1, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 5, + minor: 12, + }: devices.Permissions("rw"), + }, + }, + }, + // Rule removal logic. + { + name: "RuleRemovalBasic", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rm"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalNonexistent", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 4, + Minor: 1, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: 
devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalFull", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rw"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("w"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + { + name: "RuleRemovalPartial", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rm"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("m"), + { + node: devices.BlockDevice, + major: 1, + minor: 5, + }: devices.Permissions("r"), + }, + }, + }, + // Check our non-canonical behaviour when it comes to try to "punch + // out" holes in a wildcard rule. 
+ { + name: "RuleRemovalWildcardPunchoutImpossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("rm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: nil, + }, + { + name: "RuleRemovalWildcardPunchoutPossible", + rule: devices.Rule{ + Type: devices.CharDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("r"), + Allow: baseDefaultAllow, + }, + base: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + { + node: devices.CharDevice, + major: 42, + minor: 1337, + }: devices.Permissions("r"), + }, + }, + expected: &Emulator{ + defaultAllow: baseDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + err := test.base.Apply(test.rule) + if err != nil && test.expected != nil { + t.Fatalf("unexpected failure when applying apply rule: %v", err) + } else if err == nil && test.expected == nil { + t.Fatalf("unexpected success when applying apply rule: %#v", test.base) + } + + if test.expected != nil && !reflect.DeepEqual(test.base, test.expected) { + t.Errorf("final emulator state mismatch: %#v != %#v", test.base, test.expected) + } + }) + } +} + +func TestDeviceEmulatorWhitelistApply(t *testing.T) { + testDeviceEmulatorApply(t, false) +} + +func TestDeviceEmulatorBlacklistApply(t *testing.T) { + testDeviceEmulatorApply(t, true) +} + +func testDeviceEmulatorTransition(t *testing.T, sourceDefaultAllow bool) { + 
tests := []struct { + name string + source, target *Emulator + expected []*devices.Rule + }{ + // No-op changes. + { + name: "Noop", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + // Identical white-lists produce no extra rules. + expected: nil, + }, + // Switching modes. + { + name: "SwitchToOtherMode", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: !sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.BlockDevice, + major: 42, + minor: devices.Wildcard, + }: devices.Permissions("wm"), + }, + }, + expected: []*devices.Rule{ + // Clear-all rule. + { + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + // The actual rule-set. + { + Type: devices.BlockDevice, + Major: 42, + Minor: devices.Wildcard, + Permissions: devices.Permissions("wm"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Rule changes. 
+ { + name: "RuleAddition", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RuleRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 42, + minor: 1337, + }: devices.Permissions("rwm"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 42, + Minor: 1337, + Permissions: devices.Permissions("rwm"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RuleMultipleAdditionRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + { + node: devices.BlockDevice, + major: 3, + minor: 9, + }: devices.Permissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.BlockDevice, + Major: 3, + Minor: 9, + Permissions: devices.Permissions("rw"), + Allow: sourceDefaultAllow, + }, + }, + }, + // Modifying the access 
permissions. + { + name: "RulePartialAddition", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("r"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rwm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("wm"), + Allow: !sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialRemoval", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("w"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("r"), + Allow: sourceDefaultAllow, + }, + }, + }, + { + name: "RulePartialBoth", + source: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rw"), + }, + }, + target: &Emulator{ + defaultAllow: sourceDefaultAllow, + rules: deviceRules{ + { + node: devices.CharDevice, + major: 1, + minor: 2, + }: devices.Permissions("rm"), + }, + }, + expected: []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("w"), + Allow: sourceDefaultAllow, + }, + { + Type: devices.CharDevice, + Major: 1, + Minor: 2, + Permissions: devices.Permissions("m"), + Allow: !sourceDefaultAllow, + }, + }, + }, + } + + for _, test := range tests { + test := test + t.Run(test.name, func(t *testing.T) { + // If we are in black-list mode, we need to prepend the relevant + // clear-all rule 
(the expected rule lists are written with + // white-list mode in mind), and then make a full copy of the + // target rules. + if sourceDefaultAllow && test.source.defaultAllow == test.target.defaultAllow { + test.expected = []*devices.Rule{{ + Type: devices.WildcardDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: devices.Permissions("rwm"), + Allow: test.target.defaultAllow, + }} + for _, rule := range test.target.rules.orderedEntries() { + test.expected = append(test.expected, &devices.Rule{ + Type: rule.meta.node, + Major: rule.meta.major, + Minor: rule.meta.minor, + Permissions: rule.perms, + Allow: !test.target.defaultAllow, + }) + } + } + + rules, err := test.source.Transition(test.target) + if err != nil { + t.Fatalf("unexpected error while calculating transition rules: %#v", err) + } + + if !reflect.DeepEqual(rules, test.expected) { + t.Errorf("rules don't match expected set: %#v != %#v", rules, test.expected) + } + + // Apply the rules to the source to see if it actually transitions + // correctly. This is all emulated but it's a good thing to + // double-check. 
+ for _, rule := range rules { + if err := test.source.Apply(*rule); err != nil { + t.Fatalf("error while applying transition rule [%#v]: %v", rule, err) + } + } + if !reflect.DeepEqual(test.source, test.target) { + t.Errorf("transition incomplete after applying all rules: %#v != %#v", test.source, test.target) + } + }) + } +} + +func TestDeviceEmulatorTransitionFromBlacklist(t *testing.T) { + testDeviceEmulatorTransition(t, true) +} + +func TestDeviceEmulatorTransitionFromWhitelist(t *testing.T) { + testDeviceEmulatorTransition(t, false) +} diff --git a/sysbox-runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/sysbox-runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go new file mode 100644 index 00000000..a173fd4a --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -0,0 +1,183 @@ +// Package devicefilter contains eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devicefilter + +import ( + "math" + "strconv" + + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns eBPF device filter program and its license string +func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) { + p := &program{} + p.init() + for i := len(devices) - 1; i >= 0; i-- { + if err := p.appendDevice(devices[i]); err != nil { + return nil, "", err + } + } + insts, err := p.finalize() + return insts, license, err +} + +type program struct { + insts asm.Instructions + hasWildCard bool + blockID 
int
+}
+
+func (p *program) init() {
+	// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
+	/*
+		u32 access_type
+		u32 major
+		u32 minor
+	*/
+	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
+		asm.And.Imm32(asm.R2, 0xFFFF))
+
+	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
+		// RSh: bitwise shift right
+		asm.RSh.Imm32(asm.R3, 16))
+
+	// R4 <- major (u32 major at R1[4])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
+
+	// R5 <- minor (u32 minor at R1[8])
+	p.insts = append(p.insts,
+		asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
+}
+
+// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
+func (p *program) appendDevice(dev *devices.Rule) error {
+	if p.blockID < 0 {
+		return errors.New("the program is finalized")
+	}
+	if p.hasWildCard {
+		// All entries after wildcard entry are ignored
+		return nil
+	}
+
+	bpfType := int32(-1)
+	hasType := true
+	switch dev.Type {
+	case 'c':
+		bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
+	case 'b':
+		bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
+	case 'a':
+		hasType = false
+	default:
+		// if not specified in OCI json, typ is set to DeviceTypeAll
+		return errors.Errorf("invalid Type %q", string(dev.Type))
+	}
+	if dev.Major > math.MaxUint32 {
+		return errors.Errorf("invalid major %d", dev.Major)
+	}
+	if dev.Minor > math.MaxUint32 {
+		return errors.Errorf("invalid minor %d", dev.Minor)
+	}
+	hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1
+	hasMinor := dev.Minor >= 0
+	bpfAccess := int32(0)
+	for _, r := range dev.Permissions {
+		switch r {
+		case 'r':
+			bpfAccess |= unix.BPF_DEVCG_ACC_READ
+		case 'w':
+			bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
+		case 'm':
+			bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
+		default:
+			
return errors.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. + hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + var ( + blockSym = "block-" + strconv.Itoa(p.blockID) + nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) + prevBlockLastIdx = len(p.insts) - 1 + ) + if hasType { + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + } + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JEq.Imm(asm.R1, 0, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym), + ) + } + if !hasType && !hasAccess && !hasMajor && !hasMinor { + p.hasWildCard = true + } + p.insts = append(p.insts, acceptBlock(dev.Allow)...) 
+ // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() (asm.Instructions, error) { + if p.hasWildCard { + // acceptBlock with asm.Return() is already inserted + return p.insts, nil + } + blockSym := "block-" + strconv.Itoa(p.blockID) + p.insts = append(p.insts, + // R0 <- 0 + asm.Mov.Imm32(asm.R0, 0).Sym(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts, nil +} + +func acceptBlock(accept bool) asm.Instructions { + v := int32(0) + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/sysbox-runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/sysbox-runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go new file mode 100644 index 00000000..f714bcac --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go @@ -0,0 +1,260 @@ +package devicefilter + +import ( + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/specconv" +) + +func hash(s, comm string) string { + var res []string + for _, l := range strings.Split(s, "\n") { + trimmed := strings.TrimSpace(l) + if trimmed == "" || strings.HasPrefix(trimmed, comm) { + continue + } + res = append(res, trimmed) + } + return strings.Join(res, "\n") +} + +func testDeviceFilter(t testing.TB, devices []*devices.Rule, expectedStr string) { + insts, _, err := DeviceFilter(devices) + if err != nil { + t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) + } + s := insts.String() + if expectedStr != "" { + hashed := hash(s, "//") + expectedHashed := hash(expectedStr, "//") + if expectedHashed != hashed { + t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed) + } + } +} + +func TestDeviceFilter_Nil(t *testing.T) { + expected := ` +// load parameters into 
registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) + 6: Mov32Imm dst: r0 imm: 0 + 7: Exit + ` + testDeviceFilter(t, nil, expected) +} + +func TestDeviceFilter_BuiltInAllowList(t *testing.T) { + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// tuntap (c, 10, 200, rwm, allow) + 6: JNEImm dst: r2 off: -1 imm: 2 + 7: JNEImm dst: r4 off: -1 imm: 10 + 8: JNEImm dst: r5 off: -1 imm: 200 + 9: Mov32Imm dst: r0 imm: 1 + 10: Exit +block-1: + 11: JNEImm dst: r2 off: -1 imm: 2 + 12: JNEImm dst: r4 off: -1 imm: 5 + 13: JNEImm dst: r5 off: -1 imm: 2 + 14: Mov32Imm dst: r0 imm: 1 + 15: Exit +block-2: +// /dev/pts (c, 136, wildcard, rwm, true) + 16: JNEImm dst: r2 off: -1 imm: 2 + 17: JNEImm dst: r4 off: -1 imm: 136 + 18: Mov32Imm dst: r0 imm: 1 + 19: Exit +block-3: + 20: JNEImm dst: r2 off: -1 imm: 2 + 21: JNEImm dst: r4 off: -1 imm: 1 + 22: JNEImm dst: r5 off: -1 imm: 9 + 23: Mov32Imm dst: r0 imm: 1 + 24: Exit +block-4: + 25: JNEImm dst: r2 off: -1 imm: 2 + 26: JNEImm dst: r4 off: -1 imm: 1 + 27: JNEImm dst: r5 off: -1 imm: 5 + 28: Mov32Imm dst: r0 imm: 1 + 29: Exit +block-5: + 30: JNEImm dst: r2 off: -1 imm: 2 + 31: JNEImm dst: r4 off: -1 imm: 5 + 32: JNEImm dst: r5 off: -1 imm: 0 + 33: Mov32Imm dst: r0 imm: 1 + 34: Exit +block-6: + 35: JNEImm dst: r2 off: -1 imm: 2 + 36: JNEImm dst: r4 off: -1 imm: 1 + 37: JNEImm dst: r5 off: -1 imm: 7 + 38: Mov32Imm dst: r0 imm: 1 + 39: Exit +block-7: + 40: JNEImm dst: r2 off: -1 imm: 2 + 41: JNEImm dst: r4 off: -1 imm: 1 + 42: JNEImm dst: r5 off: -1 imm: 8 + 43: Mov32Imm dst: 
r0 imm: 1 + 44: Exit +block-8: + 45: JNEImm dst: r2 off: -1 imm: 2 + 46: JNEImm dst: r4 off: -1 imm: 1 + 47: JNEImm dst: r5 off: -1 imm: 3 + 48: Mov32Imm dst: r0 imm: 1 + 49: Exit +block-9: +// (b, wildcard, wildcard, m, true) + 50: JNEImm dst: r2 off: -1 imm: 1 + 51: Mov32Reg dst: r1 src: r3 + 52: And32Imm dst: r1 imm: 1 + 53: JEqImm dst: r1 off: -1 imm: 0 + 54: Mov32Imm dst: r0 imm: 1 + 55: Exit +block-10: +// (c, wildcard, wildcard, m, true) + 56: JNEImm dst: r2 off: -1 imm: 2 + 57: Mov32Reg dst: r1 src: r3 + 58: And32Imm dst: r1 imm: 1 + 59: JEqImm dst: r1 off: -1 imm: 0 + 60: Mov32Imm dst: r0 imm: 1 + 61: Exit +block-11: + 62: Mov32Imm dst: r0 imm: 0 + 63: Exit +` + var devices []*devices.Rule + for _, device := range specconv.AllowedDevices { + devices = append(devices, &device.Rule) + } + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Privileged(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + } + expected := + ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 1 (accept) + 6: Mov32Imm dst: r0 imm: 1 + 7: Exit + ` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 0, + Permissions: "rwm", + Allow: false, + }, + } + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// 
return 0 (reject) if type==b && major == 8 && minor == 0 + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 0 + 9: Mov32Imm dst: r0 imm: 0 + 10: Exit +block-1: +// return 1 (accept) + 11: Mov32Imm dst: r0 imm: 1 + 12: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Weird(t *testing.T) { + devices := []*devices.Rule{ + { + Type: 'b', + Major: 8, + Minor: 1, + Permissions: "rwm", + Allow: false, + }, + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 2, + Permissions: "rwm", + Allow: false, + }, + } + // 8/1 is allowed, 8/2 is not allowed. + // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. + expected := ` +// load parameters into registers + 0: LdXMemW dst: r2 src: r1 off: 0 imm: 0 + 1: And32Imm dst: r2 imm: 65535 + 2: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 3: RSh32Imm dst: r3 imm: 16 + 4: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 5: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 2 + 6: JNEImm dst: r2 off: -1 imm: 1 + 7: JNEImm dst: r4 off: -1 imm: 8 + 8: JNEImm dst: r5 off: -1 imm: 2 + 9: Mov32Imm dst: r0 imm: 0 + 10: Exit +block-1: +// return 1 (accept) + 11: Mov32Imm dst: r0 imm: 1 + 12: Exit +` + testDeviceFilter(t, devices, expected) +} diff --git a/sysbox-runc/libcontainer/cgroups/ebpf/ebpf.go b/sysbox-runc/libcontainer/cgroups/ebpf/ebpf.go new file mode 100644 index 00000000..4795e0aa --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/ebpf/ebpf.go @@ -0,0 +1,45 @@ +package ebpf + +import ( + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . 
+// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { + nilCloser := func() error { + return nil + } + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. + memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + closer := func() error { + if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + return nil + } + return closer, nil +} diff --git a/sysbox-runc/libcontainer/cgroups/file.go b/sysbox-runc/libcontainer/cgroups/file.go new file mode 100644 index 00000000..a2ece77a --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/file.go @@ -0,0 +1,187 @@ +package cgroups + +import ( + "bytes" + "fmt" + "io/ioutil" + "os" + "strings" + "sync" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only. +func OpenFile(dir, file string, flags int) (*os.File, error) { + if dir == "" { + return nil, errors.Errorf("no directory specified for %s", file) + } + return openFile(dir, file, flags) +} + +// ReadFile reads data from a cgroup file in dir. 
+// It is supposed to be used for cgroup files only.
+func ReadFile(dir, file string) (string, error) {
+	fd, err := OpenFile(dir, file, unix.O_RDONLY)
+	if err != nil {
+		return "", err
+	}
+	defer fd.Close()
+	var buf bytes.Buffer
+
+	_, err = buf.ReadFrom(fd)
+	return buf.String(), err
+}
+
+// WriteFile writes data to a cgroup file in dir.
+// It is supposed to be used for cgroup files only.
+func WriteFile(dir, file, data string) error {
+	fd, err := OpenFile(dir, file, unix.O_WRONLY)
+	if err != nil {
+		return err
+	}
+	defer fd.Close()
+	if err := retryingWriteFile(fd, data); err != nil {
+		return errors.Wrapf(err, "failed to write %q", data)
+	}
+	return nil
+}
+
+func CopyFile(source, dest string) error {
+	var (
+		srcF *os.File
+		dstF *os.File
+		data []byte
+		err  error
+	)
+
+	srcF, err = os.Open(source)
+	if err != nil {
+		return fmt.Errorf("failed to open %s: %s", source, err)
+	}
+	defer srcF.Close()
+
+	dstF, err = os.Open(dest)
+	if err != nil {
+		dstF.Close()
+		return fmt.Errorf("failed to open %s: %s", dest, err)
+	}
+	defer dstF.Close()
+
+	data, err = ioutil.ReadFile(source)
+	if err != nil {
+		return fmt.Errorf("failed to read %s: %s", source, err)
+	}
+
+	err = ioutil.WriteFile(dest, data, 0)
+	if err != nil {
+		return fmt.Errorf("failed to write %s: %s", dest, err)
+	}
+
+	return nil
+}
+
+func retryingWriteFile(fd *os.File, data string) error {
+	for {
+		_, err := fd.Write([]byte(data))
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
+			continue
+		}
+		return err
+	}
+}
+
+const (
+	cgroupfsDir    = "/sys/fs/cgroup"
+	cgroupfsPrefix = cgroupfsDir + "/"
+)
+
+var (
+	// Set to true by fs unit tests
+	TestMode bool
+
+	cgroupRootHandle *os.File
+	prepOnce         sync.Once
+	prepErr          error
+	resolveFlags     uint64
+)
+
+func prepareOpenat2() error {
+	prepOnce.Do(func() {
+		fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
+			Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC,
+		})
+		if err != nil 
{ + prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} + if err != unix.ENOSYS { + logrus.Warnf("falling back to securejoin: %s", prepErr) + } else { + logrus.Debug("openat2 not available, falling back to securejoin") + } + return + } + file := os.NewFile(uintptr(fd), cgroupfsDir) + + var st unix.Statfs_t + if err = unix.Fstatfs(int(file.Fd()), &st); err != nil { + prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} + logrus.Warnf("falling back to securejoin: %s", prepErr) + return + } + + cgroupRootHandle = file + resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS + if st.Type == unix.CGROUP2_SUPER_MAGIC { + // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks + resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS + } + + }) + + return prepErr +} + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only. +func openFile(dir, file string, flags int) (*os.File, error) { + mode := os.FileMode(0) + if TestMode && flags&os.O_WRONLY != 0 { + // "emulate" cgroup fs for unit tests + flags |= os.O_TRUNC | os.O_CREATE + mode = 0o600 + } + reldir := strings.TrimPrefix(dir, cgroupfsPrefix) + if len(reldir) == len(dir) { // non-standard path, old system? 
+ return openWithSecureJoin(dir, file, flags, mode) + } + if prepareOpenat2() != nil { + return openWithSecureJoin(dir, file, flags, mode) + } + + relname := reldir + "/" + file + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relname, + &unix.OpenHow{ + Resolve: resolveFlags, + Flags: uint64(flags) | unix.O_CLOEXEC, + Mode: uint64(mode), + }) + if err != nil { + return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err} + } + + return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil +} + +func openWithSecureJoin(dir, file string, flags int, mode os.FileMode) (*os.File, error) { + path, err := securejoin.SecureJoin(dir, file) + if err != nil { + return nil, err + } + + return os.OpenFile(path, flags, mode) +} diff --git a/sysbox-runc/libcontainer/cgroups/file_test.go b/sysbox-runc/libcontainer/cgroups/file_test.go new file mode 100644 index 00000000..be07c90e --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/file_test.go @@ -0,0 +1,42 @@ +// +build linux + +package cgroups + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "testing" + "time" +) + +func TestWriteCgroupFileHandlesInterrupt(t *testing.T) { + const ( + memoryCgroupMount = "/sys/fs/cgroup/memory" + memoryLimit = "memory.limit_in_bytes" + ) + if _, err := os.Stat(memoryCgroupMount); err != nil { + // most probably cgroupv2 + t.Skip(err) + } + + cgroupName := fmt.Sprintf("test-eint-%d", time.Now().Nanosecond()) + cgroupPath := filepath.Join(memoryCgroupMount, cgroupName) + if err := os.MkdirAll(cgroupPath, 0755); err != nil { + t.Fatal(err) + } + defer os.RemoveAll(cgroupPath) + + if _, err := os.Stat(filepath.Join(cgroupPath, memoryLimit)); err != nil { + // either cgroupv2, or memory controller is not available + t.Skip(err) + } + + for i := 0; i < 100000; i++ { + limit := 1024*1024 + i + if err := WriteFile(cgroupPath, memoryLimit, strconv.Itoa(limit)); err != nil { + t.Fatalf("Failed to write %d on attempt %d: %+v", limit, i, err) + } + } +} diff --git 
a/sysbox-runc/libcontainer/cgroups/fs/blkio.go b/sysbox-runc/libcontainer/cgroups/fs/blkio.go new file mode 100644 index 00000000..e4a3b613 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/blkio.go @@ -0,0 +1,303 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type BlkioGroup struct { +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.BlkioWeight != 0 { + if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + + if cgroup.Resources.BlkioLeafWeight != 0 { + if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range cgroup.Resources.BlkioWeightDevice { + if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil { + return err + } + if err := fscommon.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err 
+ } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +func (s *BlkioGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := fscommon.OpenFile(dir, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, fmt.Errorf("Invalid line found while parsing %s/%s: %s", dir, file, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, err + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, err + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + 
return nil, err + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + var bfqDebugStats = []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + var bfqStats = []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + var cfqStats = []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: 
"blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + var throttleRecursiveStats = []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + var baseStats = []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + var orderedStats = [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, + throttleRecursiveStats, + baseStats, + } + + var blkioStats []cgroups.BlkioStatEntry + var err error + + for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + //finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/blkio_test.go b/sysbox-runc/libcontainer/cgroups/fs/blkio_test.go new file mode 100644 index 00000000..23564805 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/blkio_test.go @@ -0,0 +1,851 @@ +// 
+build linux + +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + sectorsRecursiveContents = `8:0 1024` + sectorsRecursiveContentsBFQ = `8:0 2048` + serviceBytesRecursiveContents = `8:0 Read 100 +8:0 Write 200 +8:0 Sync 300 +8:0 Async 500 +8:0 Total 500 +Total 500` + + serviceBytesRecursiveContentsBFQ = `8:0 Read 1100 +8:0 Write 1200 +8:0 Sync 1300 +8:0 Async 1500 +8:0 Total 1500 +Total 1500` + servicedRecursiveContents = `8:0 Read 10 +8:0 Write 40 +8:0 Sync 20 +8:0 Async 30 +8:0 Total 50 +Total 50` + servicedRecursiveContentsBFQ = `8:0 Read 11 +8:0 Write 41 +8:0 Sync 21 +8:0 Async 31 +8:0 Total 51 +Total 51` + queuedRecursiveContents = `8:0 Read 1 +8:0 Write 4 +8:0 Sync 2 +8:0 Async 3 +8:0 Total 5 +Total 5` + queuedRecursiveContentsBFQ = `8:0 Read 2 +8:0 Write 3 +8:0 Sync 4 +8:0 Async 5 +8:0 Total 6 +Total 6` + serviceTimeRecursiveContents = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173959 +8:0 Total 17395 +Total 17395` + serviceTimeRecursiveContentsBFQ = `8:0 Read 173959 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 173 +8:0 Total 174 +Total 174` + waitTimeRecursiveContents = `8:0 Read 15571 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 15571 +8:0 Total 15571` + waitTimeRecursiveContentsBFQ = `8:0 Read 1557 +8:0 Write 0 +8:0 Sync 0 +8:0 Async 1557 +8:0 Total 1557` + mergedRecursiveContents = `8:0 Read 5 +8:0 Write 10 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 15 +Total 15` + mergedRecursiveContentsBFQ = `8:0 Read 51 +8:0 Write 101 +8:0 Sync 0 +8:0 Async 0 +8:0 Total 151 +Total 151` + timeRecursiveContents = `8:0 8` + timeRecursiveContentsBFQ = `8:0 16` + throttleServiceBytes = `8:0 Read 11030528 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 11030528 +8:0 Total 11030528 +252:0 Read 11030528 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 11030528 +252:0 Total 11030528 +Total 22061056` + 
throttleServiceBytesRecursive = `8:0 Read 110305281 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 110305281 +8:0 Total 110305281 +252:0 Read 110305281 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 110305281 +252:0 Total 110305281 +Total 220610561` + throttleServiced = `8:0 Read 164 +8:0 Write 23 +8:0 Sync 42 +8:0 Async 164 +8:0 Total 164 +252:0 Read 164 +252:0 Write 23 +252:0 Sync 42 +252:0 Async 164 +252:0 Total 164 +Total 328` + throttleServicedRecursive = `8:0 Read 1641 +8:0 Write 231 +8:0 Sync 421 +8:0 Async 1641 +8:0 Total 1641 +252:0 Read 1641 +252:0 Write 231 +252:0 Sync 421 +252:0 Async 1641 +252:0 Total 1641 +Total 3281` +) + +var blkioBFQDebugStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, + "blkio.bfq.io_queued_recursive": queuedRecursiveContentsBFQ, + "blkio.bfq.io_service_time_recursive": serviceTimeRecursiveContentsBFQ, + "blkio.bfq.io_wait_time_recursive": waitTimeRecursiveContentsBFQ, + "blkio.bfq.io_merged_recursive": mergedRecursiveContentsBFQ, + "blkio.bfq.time_recursive": timeRecursiveContentsBFQ, + "blkio.bfq.sectors_recursive": sectorsRecursiveContentsBFQ, +} + +var blkioBFQStatsTestFiles = map[string]string{ + "blkio.bfq.io_service_bytes_recursive": serviceBytesRecursiveContentsBFQ, + "blkio.bfq.io_serviced_recursive": servicedRecursiveContentsBFQ, +} + +var blkioCFQStatsTestFiles = map[string]string{ + "blkio.io_service_bytes_recursive": serviceBytesRecursiveContents, + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, +} + +type blkioStatFailureTestCase struct { + desc 
string + filename string +} + +func appendBlkioStatEntry(blkioStatEntries *[]cgroups.BlkioStatEntry, major, minor, value uint64, op string) { + *blkioStatEntries = append(*blkioStatEntries, cgroups.BlkioStatEntry{Major: major, Minor: minor, Value: value, Op: op}) +} + +func TestBlkioSetWeight(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + weightBefore = 100 + weightAfter = 200 + ) + + helper.writeFileContents(map[string]string{ + "blkio.weight": strconv.Itoa(weightBefore), + }) + + helper.CgroupData.config.Resources.BlkioWeight = weightAfter + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "blkio.weight") + if err != nil { + t.Fatalf("Failed to parse blkio.weight - %s", err) + } + + if value != weightAfter { + t.Fatal("Got the wrong value, set blkio.weight failed.") + } +} + +func TestBlkioSetWeightDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + weightDeviceBefore = "8:0 400" + ) + + wd := configs.NewWeightDevice(8, 0, 500, 0) + weightDeviceAfter := wd.WeightString() + + helper.writeFileContents(map[string]string{ + "blkio.weight_device": weightDeviceBefore, + }) + + helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device") + if err != nil { + t.Fatalf("Failed to parse blkio.weight_device - %s", err) + } + + if value != weightDeviceAfter { + t.Fatal("Got the wrong value, set blkio.weight_device failed.") + } +} + +// regression #274 +func TestBlkioSetMultipleWeightDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + weightDeviceBefore = 
"8:0 400" + ) + + wd1 := configs.NewWeightDevice(8, 0, 500, 0) + wd2 := configs.NewWeightDevice(8, 16, 500, 0) + // we cannot actually set and check both because normal ioutil.WriteFile + // when writing to cgroup file will overwrite the whole file content instead + // of updating it as the kernel is doing. Just check the second device + // is present will suffice for the test to ensure multiple writes are done. + weightDeviceAfter := wd2.WeightString() + + helper.writeFileContents(map[string]string{ + "blkio.weight_device": weightDeviceBefore, + }) + + helper.CgroupData.config.Resources.BlkioWeightDevice = []*configs.WeightDevice{wd1, wd2} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.weight_device") + if err != nil { + t.Fatalf("Failed to parse blkio.weight_device - %s", err) + } + + if value != weightDeviceAfter { + t.Fatal("Got the wrong value, set blkio.weight_device failed.") + } +} + +func TestBlkioBFQDebugStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(blkioBFQDebugStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + 
appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, 
actualStats.BlkioStats) +} + +func TestBlkioMultipleStatsFiles(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(blkioBFQDebugStatsTestFiles) + helper.writeFileContents(blkioCFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 2048, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 6, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + 
appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 174, "Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 1557, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 51, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 101, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 151, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 16, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioBFQStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(blkioBFQStatsTestFiles) + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 1500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 11, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 
0, 41, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 21, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 31, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 51, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesBFQDebug(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.bfq.io_service_bytes_recursive file", + filename: "blkio.bfq.io_service_bytes_recursive", + }, + { + desc: "missing blkio.bfq.io_serviced_recursive file", + filename: "blkio.bfq.io_serviced_recursive", + }, + { + desc: "missing blkio.bfq.io_queued_recursive file", + filename: "blkio.bfq.io_queued_recursive", + }, + { + desc: "missing blkio.bfq.sectors_recursive file", + filename: "blkio.bfq.sectors_recursive", + }, + { + desc: "missing blkio.bfq.io_service_time_recursive file", + filename: "blkio.bfq.io_service_time_recursive", + }, + { + desc: "missing blkio.bfq.io_wait_time_recursive file", + filename: "blkio.bfq.io_wait_time_recursive", + }, + { + desc: "missing blkio.bfq.io_merged_recursive file", + filename: "blkio.bfq.io_merged_recursive", + }, + { + desc: "missing blkio.bfq.time_recursive file", + filename: "blkio.bfq.time_recursive", + }, + } + + for _, testCase := range testCases { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioBFQDebugStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + helper.writeFileContents(tempBlkioTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(helper.CgroupPath, &actualStats) + + if err != nil { + t.Errorf(fmt.Sprintf("test case '%s' failed unexpectedly: %s", testCase.desc, err)) + } + } +} + +func TestBlkioCFQStats(t *testing.T) { + helper 
:= NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(blkioCFQStatsTestFiles) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + appendBlkioStatEntry(&expectedStats.SectorsRecursive, 8, 0, 1024, "") + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 100, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 200, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 300, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 500, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 10, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 40, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 20, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 30, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 50, "Total") + + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 1, "Read") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 4, "Write") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 2, "Sync") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 3, "Async") + appendBlkioStatEntry(&expectedStats.IoQueuedRecursive, 8, 0, 5, "Total") + + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 173959, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceTimeRecursive, 8, 0, 17395, 
"Total") + + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Read") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Write") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Async") + appendBlkioStatEntry(&expectedStats.IoWaitTimeRecursive, 8, 0, 15571, "Total") + + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 5, "Read") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 10, "Write") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Sync") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 0, "Async") + appendBlkioStatEntry(&expectedStats.IoMergedRecursive, 8, 0, 15, "Total") + + appendBlkioStatEntry(&expectedStats.IoTimeRecursive, 8, 0, 8, "") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioStatsNoFilesCFQ(t *testing.T) { + if testing.Short() { + t.Skip("skipping test in short mode.") + } + testCases := []blkioStatFailureTestCase{ + { + desc: "missing blkio.io_service_bytes_recursive file", + filename: "blkio.io_service_bytes_recursive", + }, + { + desc: "missing blkio.io_serviced_recursive file", + filename: "blkio.io_serviced_recursive", + }, + { + desc: "missing blkio.io_queued_recursive file", + filename: "blkio.io_queued_recursive", + }, + { + desc: "missing blkio.sectors_recursive file", + filename: "blkio.sectors_recursive", + }, + { + desc: "missing blkio.io_service_time_recursive file", + filename: "blkio.io_service_time_recursive", + }, + { + desc: "missing blkio.io_wait_time_recursive file", + filename: "blkio.io_wait_time_recursive", + }, + { + desc: "missing blkio.io_merged_recursive file", + filename: "blkio.io_merged_recursive", + }, + { + desc: "missing blkio.time_recursive file", + filename: "blkio.time_recursive", + }, + } + + for _, testCase := range testCases { + helper := NewCgroupTestUtil("cpuset", t) + 
defer helper.cleanup() + + tempBlkioTestFiles := map[string]string{} + for i, v := range blkioCFQStatsTestFiles { + tempBlkioTestFiles[i] = v + } + delete(tempBlkioTestFiles, testCase.filename) + + helper.writeFileContents(tempBlkioTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(helper.CgroupPath, &actualStats) + + if err != nil { + t.Errorf(fmt.Sprintf("test case '%s' failed unexpectedly: %s", testCase.desc, err)) + } + } +} + +func TestBlkioStatsUnexpectedNumberOfFields(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read 100 100", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestBlkioStatsUnexpectedFieldType(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "8:0 Read Write", + "blkio.io_serviced_recursive": servicedRecursiveContents, + "blkio.io_queued_recursive": queuedRecursiveContents, + "blkio.sectors_recursive": sectorsRecursiveContents, + "blkio.io_service_time_recursive": serviceTimeRecursiveContents, + "blkio.io_wait_time_recursive": waitTimeRecursiveContents, + "blkio.io_merged_recursive": mergedRecursiveContents, + "blkio.time_recursive": timeRecursiveContents, + }) + + blkio := &BlkioGroup{} + actualStats := 
*cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected to fail, but did not") + } +} + +func TestThrottleRecursiveBlkioStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + "blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes_recursive": throttleServiceBytesRecursive, + "blkio.throttle.io_serviced_recursive": throttleServicedRecursive, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. + expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 110305281, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 110305281, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 
8, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 1641, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 231, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 421, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 1641, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestThrottleBlkioStats(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "blkio.io_service_bytes_recursive": "", + "blkio.io_serviced_recursive": "", + "blkio.io_queued_recursive": "", + "blkio.sectors_recursive": "", + "blkio.io_service_time_recursive": "", + "blkio.io_wait_time_recursive": "", + "blkio.io_merged_recursive": "", + "blkio.time_recursive": "", + "blkio.throttle.io_service_bytes": throttleServiceBytes, + "blkio.throttle.io_serviced": throttleServiced, + }) + + blkio := &BlkioGroup{} + actualStats := *cgroups.NewStats() + err := blkio.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + // Verify expected stats. 
+ expectedStats := cgroups.BlkioStats{} + + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 8, 0, 11030528, "Total") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Read") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Async") + appendBlkioStatEntry(&expectedStats.IoServiceBytesRecursive, 252, 0, 11030528, "Total") + + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 8, 0, 164, "Total") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Read") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 23, "Write") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 42, "Sync") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Async") + appendBlkioStatEntry(&expectedStats.IoServicedRecursive, 252, 0, 164, "Total") + + expectBlkioStatsEquals(t, expectedStats, actualStats.BlkioStats) +} + +func TestBlkioSetThrottleReadBpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + 
throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.read_bps_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleReadBpsDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_bps_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.read_bps_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_bps_device failed.") + } +} +func TestBlkioSetThrottleWriteBpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.write_bps_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleWriteBpsDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_bps_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.write_bps_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_bps_device failed.") + } +} +func TestBlkioSetThrottleReadIOpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.read_iops_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleReadIOPSDevice = 
[]*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.read_iops_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.read_iops_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.read_iops_device failed.") + } +} +func TestBlkioSetThrottleWriteIOpsDevice(t *testing.T) { + helper := NewCgroupTestUtil("blkio", t) + defer helper.cleanup() + + const ( + throttleBefore = `8:0 1024` + ) + + td := configs.NewThrottleDevice(8, 0, 2048) + throttleAfter := td.String() + + helper.writeFileContents(map[string]string{ + "blkio.throttle.write_iops_device": throttleBefore, + }) + + helper.CgroupData.config.Resources.BlkioThrottleWriteIOPSDevice = []*configs.ThrottleDevice{td} + blkio := &BlkioGroup{} + if err := blkio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "blkio.throttle.write_iops_device") + if err != nil { + t.Fatalf("Failed to parse blkio.throttle.write_iops_device - %s", err) + } + + if value != throttleAfter { + t.Fatal("Got the wrong value, set blkio.throttle.write_iops_device failed.") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/cpu.go b/sysbox-runc/libcontainer/cgroups/fs/cpu.go new file mode 100644 index 00000000..9ee46c17 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/cpu.go @@ -0,0 +1,129 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpuGroup struct { +} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(path string, d *cgroupData) 
error { + // This might happen if we have no cpu cgroup mounted. + // Just do nothing and don't fail. + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, d.config); err != nil { + return err + } + // Since we are not using join(), we need to place the pid + // into the procs file unlike other subsystems. + return cgroups.WriteCgroupProc(path, d.pid) +} + +func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuRtPeriod != 0 { + if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuRtRuntime != 0 { + if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil { + return err + } + } + return nil +} + +func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpuShares != 0 { + shares := cgroup.Resources.CpuShares + if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { + return err + } + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + return err + } + // ... 
and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) + } + } + if cgroup.Resources.CpuPeriod != 0 { + if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil { + return err + } + } + if cgroup.Resources.CpuQuota != 0 { + if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil { + return err + } + } + return s.SetRtSched(path, cgroup) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + f, err := fscommon.OpenFile(path, "cpu.stat", os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} + +func (s *CpuGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/cpu_test.go b/sysbox-runc/libcontainer/cgroups/fs/cpu_test.go new file mode 100644 index 00000000..4a8ecf99 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/cpu_test.go @@ -0,0 +1,212 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +func 
TestCpuSetShares(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + sharesBefore = 1024 + sharesAfter = 512 + ) + + helper.writeFileContents(map[string]string{ + "cpu.shares": strconv.Itoa(sharesBefore), + }) + + helper.CgroupData.config.Resources.CpuShares = sharesAfter + cpu := &CpuGroup{} + if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.shares") + if err != nil { + t.Fatalf("Failed to parse cpu.shares - %s", err) + } + + if value != sharesAfter { + t.Fatal("Got the wrong value, set cpu.shares failed.") + } +} + +func TestCpuSetBandWidth(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + quotaBefore = 8000 + quotaAfter = 5000 + periodBefore = 10000 + periodAfter = 7000 + rtRuntimeBefore = 8000 + rtRuntimeAfter = 5000 + rtPeriodBefore = 10000 + rtPeriodAfter = 7000 + ) + + helper.writeFileContents(map[string]string{ + "cpu.cfs_quota_us": strconv.Itoa(quotaBefore), + "cpu.cfs_period_us": strconv.Itoa(periodBefore), + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + helper.CgroupData.config.Resources.CpuQuota = quotaAfter + helper.CgroupData.config.Resources.CpuPeriod = periodAfter + helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter + helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter + cpu := &CpuGroup{} + if err := cpu.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + quota, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_quota_us") + if err != nil { + t.Fatalf("Failed to parse cpu.cfs_quota_us - %s", err) + } + if quota != quotaAfter { + t.Fatal("Got the wrong value, set cpu.cfs_quota_us failed.") + } + + period, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.cfs_period_us") + if err != nil { + 
t.Fatalf("Failed to parse cpu.cfs_period_us - %s", err) + } + if period != periodAfter { + t.Fatal("Got the wrong value, set cpu.cfs_period_us failed.") + } + rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_period_us - %s", err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } +} + +func TestCpuStats(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + nrPeriods = 2000 + nrThrottled = 200 + throttledTime = uint64(18446744073709551615) + ) + + cpuStatContent := fmt.Sprintf("nr_periods %d\n nr_throttled %d\n throttled_time %d\n", + nrPeriods, nrThrottled, throttledTime) + helper.writeFileContents(map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.ThrottlingData{ + Periods: nrPeriods, + ThrottledPeriods: nrThrottled, + ThrottledTime: throttledTime} + + expectThrottlingDataEquals(t, expectedStats, actualStats.CpuStats.ThrottlingData) +} + +func TestNoCpuStatFile(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal("Expected not to fail, but did") + } +} + +func TestInvalidCpuStat(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + cpuStatContent := `nr_periods 2000 + nr_throttled 200 + throttled_time fortytwo` + 
helper.writeFileContents(map[string]string{ + "cpu.stat": cpuStatContent, + }) + + cpu := &CpuGroup{} + actualStats := *cgroups.NewStats() + err := cpu.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failed stat parsing.") + } +} + +func TestCpuSetRtSchedAtApply(t *testing.T) { + helper := NewCgroupTestUtil("cpu", t) + defer helper.cleanup() + + const ( + rtRuntimeBefore = 0 + rtRuntimeAfter = 5000 + rtPeriodBefore = 0 + rtPeriodAfter = 7000 + ) + + helper.writeFileContents(map[string]string{ + "cpu.rt_runtime_us": strconv.Itoa(rtRuntimeBefore), + "cpu.rt_period_us": strconv.Itoa(rtPeriodBefore), + }) + + helper.CgroupData.config.Resources.CpuRtRuntime = rtRuntimeAfter + helper.CgroupData.config.Resources.CpuRtPeriod = rtPeriodAfter + cpu := &CpuGroup{} + + helper.CgroupData.pid = 1234 + if err := cpu.Apply(helper.CgroupPath, helper.CgroupData); err != nil { + t.Fatal(err) + } + + rtRuntime, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_runtime_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_runtime_us - %s", err) + } + if rtRuntime != rtRuntimeAfter { + t.Fatal("Got the wrong value, set cpu.rt_runtime_us failed.") + } + rtPeriod, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cpu.rt_period_us") + if err != nil { + t.Fatalf("Failed to parse cpu.rt_period_us - %s", err) + } + if rtPeriod != rtPeriodAfter { + t.Fatal("Got the wrong value, set cpu.rt_period_us failed.") + } + pid, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "cgroup.procs") + if err != nil { + t.Fatalf("Failed to parse cgroup.procs - %s", err) + } + if pid != 1234 { + t.Fatal("Got the wrong value, set cgroup.procs failed.") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/cpuacct.go b/sysbox-runc/libcontainer/cgroups/fs/cpuacct.go new file mode 100644 index 00000000..38807e4e --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/cpuacct.go @@ -0,0 +1,186 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + 
"os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + cgroupCpuacctUsageAll = "cpuacct.usage_all" + + nanosecondsInSecond = 1000000000 + + userModeColumn = 1 + kernelModeColumn = 2 + cuacctUsageAllColumnsNumber = 3 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 +) + +type CpuacctGroup struct { +} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +func (s *CpuacctGroup) Clone(source, dest string) error { + + if err := 
fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + var userModeUsage, kernelModeUsage uint64 + const ( + userField = "user" + systemField = "system" + ) + + // Expected format: + // user + // system + data, err := fscommon.ReadFile(path, cgroupCpuacctStat) + if err != nil { + return 0, 0, err + } + fields := strings.Fields(data) + if len(fields) < 4 { + return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat)) + } + if fields[0] != userField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField) + } + if fields[2] != systemField { + return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, err + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, err + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + percpuUsage := []uint64{} + data, err := fscommon.ReadFile(path, "cpuacct.usage_percpu") + if err != nil { + return percpuUsage, err + } + for _, value := range strings.Fields(data) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err) + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} 
+ + file, err := fscommon.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer file.Close() + + scanner := bufio.NewScanner(file) + scanner.Scan() //skipping header line + + for scanner.Scan() { + lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1) + if len(lineFields) != cuacctUsageAllColumnsNumber { + continue + } + + usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64) + if err != nil { + return nil, nil, fmt.Errorf("Unable to convert CPU usage in kernel mode to uint64: %s", err) + } + usageKernelMode = append(usageKernelMode, usageInKernelMode) + + usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64) + if err != nil { + return nil, nil, fmt.Errorf("Unable to convert CPU usage in user mode to uint64: %s", err) + } + usageUserMode = append(usageUserMode, usageInUserMode) + } + + if err := scanner.Err(); err != nil { + return nil, nil, fmt.Errorf("Problem in reading %s line by line, %s", cgroupCpuacctUsageAll, err) + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/cpuacct_test.go b/sysbox-runc/libcontainer/cgroups/fs/cpuacct_test.go new file mode 100644 index 00000000..bb69d9be --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/cpuacct_test.go @@ -0,0 +1,93 @@ +// +build linux + +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const ( + cpuAcctUsageContents = "12262454190222160" + cpuAcctUsagePerCPUContents = "1564936537989058 1583937096487821 1604195415465681 1596445226820187 1481069084155629 1478735613864327 1477610593414743 1476362015778086" + cpuAcctStatContents = "user 452278264\nsystem 291429664" + cpuAcctUsageAll = `cpu user system + 0 962250696038415 637727786389114 + 1 981956408513304 638197595421064 + 2 1002658817529022 
638956774598358 + 3 994937703492523 637985531181620 + 4 874843781648690 638837766495476 + 5 872544369885276 638763309884944 + 6 870104915696359 640081778921247 + 7 870202363887496 638716766259495 + ` +) + +func TestCpuacctStats(t *testing.T) { + helper := NewCgroupTestUtil("cpuacct.", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + "cpuacct.usage_all": cpuAcctUsageAll, + }) + + cpuacct := &CpuacctGroup{} + actualStats := *cgroups.NewStats() + err := cpuacct.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086}, + PercpuUsageInKernelmode: []uint64{637727786389114, 638197595421064, 638956774598358, 637985531181620, + 638837766495476, 638763309884944, 640081778921247, 638716766259495}, + PercpuUsageInUsermode: []uint64{962250696038415, 981956408513304, 1002658817529022, 994937703492523, + 874843781648690, 872544369885276, 870104915696359, 870202363887496}, + UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks, + UsageInUsermode: (uint64(452278264) * nanosecondsInSecond) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} + +func TestCpuacctStatsWithoutUsageAll(t *testing.T) { + helper := NewCgroupTestUtil("cpuacct.", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "cpuacct.usage": cpuAcctUsageContents, + "cpuacct.usage_percpu": cpuAcctUsagePerCPUContents, + "cpuacct.stat": cpuAcctStatContents, + }) + + cpuacct := &CpuacctGroup{} + actualStats := 
*cgroups.NewStats() + err := cpuacct.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + + expectedStats := cgroups.CpuUsage{ + TotalUsage: uint64(12262454190222160), + PercpuUsage: []uint64{1564936537989058, 1583937096487821, 1604195415465681, 1596445226820187, + 1481069084155629, 1478735613864327, 1477610593414743, 1476362015778086}, + PercpuUsageInKernelmode: []uint64{}, + PercpuUsageInUsermode: []uint64{}, + UsageInKernelmode: (uint64(291429664) * nanosecondsInSecond) / clockTicks, + UsageInUsermode: (uint64(452278264) * nanosecondsInSecond) / clockTicks, + } + + if !reflect.DeepEqual(expectedStats, actualStats.CpuStats.CpuUsage) { + t.Errorf("Expected CPU usage %#v but found %#v\n", + expectedStats, actualStats.CpuStats.CpuUsage) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/cpuset.go b/sysbox-runc/libcontainer/cgroups/fs/cpuset.go new file mode 100644 index 00000000..f5558eba --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/cpuset.go @@ -0,0 +1,285 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" +) + +type CpusetGroup struct { +} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(path string, d *cgroupData) error { + return s.ApplyDir(path, d.config, d.pid) +} + +func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.CpusetCpus != "" { + if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + } + if cgroup.Resources.CpusetMems != "" { + if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != 
nil { + return err + } + } + return nil +} + +func (s *CpusetGroup) Clone(source, dest string) error { + + // For the cpuset cgroup, cloning is done by simply setting cgroup.clone_children on the source + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} + +func getCpusetStat(path string, filename string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, filename) + if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, fmt.Errorf("%s found to be empty", filepath.Join(path, filename)) + } + + for _, s := range strings.Split(fileContent, ",") { + splitted := strings.SplitN(s, "-", 3) + switch len(splitted) { + case 3: + return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename)) + case 2: + min, err := strconv.ParseUint(splitted[0], 10, 16) + if err != nil { + return extracted, err + } + max, err := strconv.ParseUint(splitted[1], 10, 16) + if err != nil { + return extracted, err + } + if min > max { + return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename)) + } + for i := min; i <= max; i++ { + extracted = append(extracted, uint16(i)) + } + case 1: + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return extracted, err + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = 
getCpusetStat(path, "cpuset.mems") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + return nil +} + +// Get the source mount point of directory passed in as argument. 
+func getMount(dir string) (string, error) { + mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(dir)) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.Errorf("Can't find mount point of %s", dir) + } + + // find the longest mount point + var idx, maxlen int + for i := range mi { + if len(mi[i].Mountpoint) > maxlen { + maxlen = len(mi[i].Mountpoint) + idx = i + } + } + + return mi[idx].Mountpoint, nil +} + +func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + root, err := getMount(dir) + if err != nil { + return err + } + root = filepath.Dir(root) + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(filepath.Dir(dir), root); err != nil { + return err + } + if err := os.MkdirAll(dir, 0755); err != nil { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. 
+ if err := s.ensureCpusAndMems(dir, cgroup); err != nil { + return err + } + + // because we are not using d.join we need to place the pid into the procs file + // unlike the other subsystems + return cgroups.WriteCgroupProc(dir, pid) +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directory of current is created +// and populated with the proper cpus and mems files copied from +// its parent. +func cpusetEnsureParent(current, root string) error { + parent := filepath.Dir(current) + if libcontainerUtils.CleanPath(parent) == root { + return nil + } + // Avoid infinite recursion. + if parent == current { + return errors.New("cpuset: cgroup parent path outside cgroup root") + } + if err := cpusetEnsureParent(parent, root); err != nil { + return err + } + if err := os.MkdirAll(current, 0755); err != nil { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func (s *CpusetGroup) 
ensureCpusAndMems(path string, cgroup *configs.Cgroup) error { + if err := s.Set(path, cgroup); err != nil { + return err + } + return cpusetCopyIfNeeded(path, filepath.Dir(path)) +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/cpuset_test.go b/sysbox-runc/libcontainer/cgroups/fs/cpuset_test.go new file mode 100644 index 00000000..8a49e440 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/cpuset_test.go @@ -0,0 +1,246 @@ +// +build linux + +package fs + +import ( + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + cpus = "0-2,7,12-14\n" + cpuExclusive = "1\n" + mems = "1-4,6,9\n" + memHardwall = "0\n" + memExclusive = "0\n" + memoryMigrate = "1\n" + memorySpreadPage = "0\n" + memorySpeadSlab = "1\n" + memoryPressure = "34377\n" + schedLoadBalance = "1\n" + schedRelaxDomainLevel = "-1\n" +) + +var cpusetTestFiles = map[string]string{ + "cpuset.cpus": cpus, + "cpuset.cpu_exclusive": cpuExclusive, + "cpuset.mems": mems, + "cpuset.mem_hardwall": memHardwall, + "cpuset.mem_exclusive": memExclusive, + "cpuset.memory_migrate": memoryMigrate, + "cpuset.memory_spread_page": memorySpreadPage, + "cpuset.memory_spread_slab": memorySpeadSlab, + "cpuset.memory_pressure": memoryPressure, + "cpuset.sched_load_balance": schedLoadBalance, + "cpuset.sched_relax_domain_level": schedRelaxDomainLevel, +} + +func TestCPUSetSetCpus(t *testing.T) { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + + const ( + cpusBefore = "0" + cpusAfter = "1-3" + ) + + helper.writeFileContents(map[string]string{ + "cpuset.cpus": cpusBefore, + }) + + helper.CgroupData.config.Resources.CpusetCpus = cpusAfter + cpuset := &CpusetGroup{} + if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.cpus") + if err != nil { + t.Fatalf("Failed to parse 
cpuset.cpus - %s", err) + } + + if value != cpusAfter { + t.Fatal("Got the wrong value, set cpuset.cpus failed.") + } +} + +func TestCPUSetSetMems(t *testing.T) { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + + const ( + memsBefore = "0" + memsAfter = "1" + ) + + helper.writeFileContents(map[string]string{ + "cpuset.mems": memsBefore, + }) + + helper.CgroupData.config.Resources.CpusetMems = memsAfter + cpuset := &CpusetGroup{} + if err := cpuset.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "cpuset.mems") + if err != nil { + t.Fatalf("Failed to parse cpuset.mems - %s", err) + } + + if value != memsAfter { + t.Fatal("Got the wrong value, set cpuset.mems failed.") + } +} + +func TestCPUSetStatsCorrect(t *testing.T) { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + helper.writeFileContents(cpusetTestFiles) + + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.CPUSetStats{ + CPUs: []uint16{0, 1, 2, 7, 12, 13, 14}, + CPUExclusive: 1, + Mems: []uint16{1, 2, 3, 4, 6, 9}, + MemoryMigrate: 1, + MemHardwall: 0, + MemExclusive: 0, + MemorySpreadPage: 0, + MemorySpreadSlab: 1, + MemoryPressure: 34377, + SchedLoadBalance: 1, + SchedRelaxDomainLevel: -1} + if !reflect.DeepEqual(expectedStats, actualStats.CPUSetStats) { + t.Fatalf("Expected Cpuset stats usage %#v but found %#v", + expectedStats, actualStats.CPUSetStats) + } + +} + +func TestCPUSetStatsMissingFiles(t *testing.T) { + for _, testCase := range []struct { + desc string + filename, contents string + removeFile bool + }{ + { + desc: "empty cpus file", + filename: "cpuset.cpus", + contents: "", + removeFile: false, + }, + { + desc: "empty mems file", + filename: "cpuset.mems", + contents: "", + removeFile: false, + }, + { + desc: "corrupted 
cpus file", + filename: "cpuset.cpus", + contents: "0-3,*4^2", + removeFile: false, + }, + { + desc: "corrupted mems file", + filename: "cpuset.mems", + contents: "0,1,2-5,8-7", + removeFile: false, + }, + { + desc: "missing cpu_exclusive file", + filename: "cpuset.cpu_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_migrate file", + filename: "cpuset.memory_migrate", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_hardwall file", + filename: "cpuset.mem_hardwall", + contents: "", + removeFile: true, + }, + { + desc: "missing mem_exclusive file", + filename: "cpuset.mem_exclusive", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_page file", + filename: "cpuset.memory_spread_page", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_spread_slab file", + filename: "cpuset.memory_spread_slab", + contents: "", + removeFile: true, + }, + { + desc: "missing memory_pressure file", + filename: "cpuset.memory_pressure", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_load_balance file", + filename: "cpuset.sched_load_balance", + contents: "", + removeFile: true, + }, + { + desc: "missing sched_relax_domain_level file", + filename: "cpuset.sched_relax_domain_level", + contents: "", + removeFile: true, + }, + } { + t.Run(testCase.desc, func(t *testing.T) { + helper := NewCgroupTestUtil("cpuset", t) + defer helper.cleanup() + + tempCpusetTestFiles := map[string]string{} + for i, v := range cpusetTestFiles { + tempCpusetTestFiles[i] = v + } + + if testCase.removeFile { + delete(tempCpusetTestFiles, testCase.filename) + helper.writeFileContents(tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(helper.CgroupPath, &actualStats) + + if err != nil { + t.Errorf("failed unexpectedly: %q", err) + } + } else { + tempCpusetTestFiles[testCase.filename] = testCase.contents + 
helper.writeFileContents(tempCpusetTestFiles) + cpuset := &CpusetGroup{} + actualStats := *cgroups.NewStats() + err := cpuset.GetStats(helper.CgroupPath, &actualStats) + + if err == nil { + t.Error("failed to return expected error") + } + } + }) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/devices.go b/sysbox-runc/libcontainer/cgroups/fs/devices.go new file mode 100644 index 00000000..33e07d52 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/devices.go @@ -0,0 +1,126 @@ +// +build linux + +package fs + +import ( + "bytes" + "errors" + "fmt" + "os" + "reflect" + + "github.com/opencontainers/runc/libcontainer/cgroups" + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/system" +) + +type DevicesGroup struct { + testingSkipFinalCheck bool +} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(path string, d *cgroupData) error { + if d.config.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. + return errSubsystemDoesNotExist + } + return join(path, d.pid) +} + +func loadEmulator(path string) (*cgroupdevices.Emulator, error) { + list, err := fscommon.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) { + // This defaults to a white-list -- which is what we want! 
+ emu := &cgroupdevices.Emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} + +func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { + if system.RunningInUserNS() || cgroup.SkipDevices { + return nil + } + + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(cgroup.Resources.Devices) + if err != nil { + return err + } + + // Compute the minimal set of transition rules needed to achieve the + // requested state. + transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" + } + if err := fscommon.WriteFile(path, file, rule.CgroupString()); err != nil { + return err + } + } + + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. 
+ if !s.testingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { + return err + } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } + } + return nil +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *DevicesGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/devices_test.go b/sysbox-runc/libcontainer/cgroups/fs/devices_test.go new file mode 100644 index 00000000..752fadd9 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/devices_test.go @@ -0,0 +1,52 @@ +// +build linux + +package fs + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/devices" +) + +func TestDevicesSetAllow(t *testing.T) { + helper := NewCgroupTestUtil("devices", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "devices.allow": "", + "devices.deny": "", + "devices.list": "a *:* rwm", + }) + + helper.CgroupData.config.Resources.Devices = []*devices.Rule{ + { + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: devices.Permissions("rwm"), + Allow: true, + }, + } + + d := &DevicesGroup{testingSkipFinalCheck: true} + if err := d.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + // The default deny rule must be written. 
+ value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.deny") + if err != nil { + t.Fatalf("Failed to parse devices.deny: %s", err) + } + if value[0] != 'a' { + t.Errorf("Got the wrong value (%q), set devices.deny failed.", value) + } + + // Permitted rule must be written. + if value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "devices.allow"); err != nil { + t.Fatalf("Failed to parse devices.allow: %s", err) + } else if value != "c 1:5 rwm" { + t.Errorf("Got the wrong value (%q), set devices.allow failed.", value) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/freezer.go b/sysbox-runc/libcontainer/cgroups/fs/freezer.go new file mode 100644 index 00000000..145faceb --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/freezer.go @@ -0,0 +1,102 @@ +// +build linux + +package fs + +import ( + "errors" + "fmt" + "os" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +type FreezerGroup struct { +} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { + switch cgroup.Resources.Freezer { + case configs.Frozen, configs.Thawed: + for { + // In case this loop does not exit because it doesn't get the expected + // state, let's write again this state, hoping it's going to be properly + // set this time. Otherwise, this loop could run infinitely, waiting for + // a state change that would never happen. 
+ if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { + return err + } + + state, err := s.GetState(path) + if err != nil { + return err + } + if state == cgroup.Resources.Freezer { + break + } + + time.Sleep(1 * time.Millisecond) + } + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) + } + + return nil +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { + for { + state, err := fscommon.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return configs.Thawed, nil + case "FROZEN": + return configs.Frozen, nil + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. 
+ time.Sleep(1 * time.Millisecond) + continue + default: + return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} + +func (s *FreezerGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/freezer_test.go b/sysbox-runc/libcontainer/cgroups/fs/freezer_test.go new file mode 100644 index 00000000..ad80261c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/freezer_test.go @@ -0,0 +1,48 @@ +// +build linux + +package fs + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestFreezerSetState(t *testing.T) { + helper := NewCgroupTestUtil("freezer", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "freezer.state": string(configs.Frozen), + }) + + helper.CgroupData.config.Resources.Freezer = configs.Thawed + freezer := &FreezerGroup{} + if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "freezer.state") + if err != nil { + t.Fatalf("Failed to parse freezer.state - %s", err) + } + if value != string(configs.Thawed) { + t.Fatal("Got the wrong value, set freezer.state failed.") + } +} + +func TestFreezerSetInvalidState(t *testing.T) { + helper := NewCgroupTestUtil("freezer", t) + defer helper.cleanup() + + const ( + invalidArg configs.FreezerState = "Invalid" + ) + + helper.CgroupData.config.Resources.Freezer = invalidArg + freezer := &FreezerGroup{} + if err := freezer.Set(helper.CgroupPath, helper.CgroupData.config); err == nil { + t.Fatal("Failed to return invalid argument error") + } +} diff --git 
a/sysbox-runc/libcontainer/cgroups/fs/fs.go b/sysbox-runc/libcontainer/cgroups/fs/fs.go new file mode 100644 index 00000000..141b70ee --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/fs.go @@ -0,0 +1,555 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +var ( + subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, + } + HugePageSizes, _ = cgroups.GetHugePageSize() +) + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Creates and joins the cgroup represented by 'cgroupData'. + Apply(path string, c *cgroupData) error + // Set the cgroup represented by cgroup. 
+ Set(path string, cgroup *configs.Cgroup) error + // Copy cgroup settings to from a given cgroup to another + Clone(source, dest string) error +} + +type manager struct { + mu sync.Mutex + cgroups *configs.Cgroup + rootless bool // ignore permission-related errors + paths map[string]string + childCgroupCreated bool +} + +func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgroups.Manager { + + childCgroupCreated := false + if paths != nil { + childCgroupCreated = true + } + + return &manager{ + cgroups: cg, + paths: paths, + rootless: rootless, + childCgroupCreated: childCgroupCreated, + } +} + +// The absolute path to the root of the cgroup hierarchies. +var cgroupRootLock sync.Mutex +var cgroupRoot string + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. + var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// Gets the cgroupRoot. 
+func getCgroupRoot() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo, find the first mount where fs=cgroup + // (e.g. "/sys/fs/cgroup/memory"), use its parent. + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + var root string + scanner := bufio.NewScanner(f) + for scanner.Scan() { + text := scanner.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "cgroup" + if numPostFields == 0 { + return "", fmt.Errorf("mountinfo: found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "cgroup" { + // Check that the mount is properly formatted. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + root = filepath.Dir(fields[4]) + break + } + } + if err := scanner.Err(); err != nil { + return "", err + } + if root == "" { + return "", errors.New("no cgroup mount found in mountinfo") + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +type cgroupData struct { + root string + innerPath string + config *configs.Cgroup + pid int +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. 
+ if !rootless { + return false + } + // TODO: rm errors.Cause once we switch to %w everywhere + err = errors.Cause(err) + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. + var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *manager) CreateChildCgroup(config *configs.Config) error { + paths := m.GetPaths() + for _, sys := range subsystems { + cgroupPath := paths[sys.Name()] + + if cgroupPath != "" { + childPath := filepath.Join(cgroupPath, cgroups.SyscontCgroupRoot) + + if err := sys.Clone(cgroupPath, childPath); err != nil { + return fmt.Errorf("Failed to clone cgroup %s to %s", cgroupPath, childPath) + } + + // Change child cgroup ownership to match the root user in the system container + rootuid, err := config.HostRootUID() + if err != nil { + return err + } + rootgid, err := config.HostRootGID() + if err != nil { + return err + } + if err := os.Chown(childPath, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner of sub cgroup %s", childPath) + } + + // Change ownership of the files inside the child cgroup + files, err := ioutil.ReadDir(childPath) + if err != nil { + return err + } + for _, file := range files { + absFileName := filepath.Join(childPath, file.Name()) + if err := os.Chown(absFileName, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner for file %s", absFileName) + } + } + } + } + + m.childCgroupCreated = true + return nil +} + +func (m *manager) Apply(pid int) (err error) { + if m.cgroups == nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + if c.Resources.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.paths = make(map[string]string) + if c.Paths != nil { + cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return err + } + for name, path := range 
c.Paths { + // XXX(kolyshkin@): why this check is needed? + if _, ok := cgMap[name]; ok { + m.paths[name] = path + } + } + return cgroups.EnterPid(m.paths, pid) + } + + d, err := getCgroupData(m.cgroups, pid) + if err != nil { + return err + } + + for _, sys := range subsystems { + p, err := d.path(sys.Name()) + if err != nil { + // The non-presence of the devices subsystem is + // considered fatal for security reasons. + if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") { + continue + } + return err + } + m.paths[sys.Name()] = p + + if err := sys.Apply(p, d); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems. Cases where limits have been set + // (and we couldn't create our own cgroup) are handled by Set. + if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" { + delete(m.paths, sys.Name()) + continue + } + return err + } + + } + return nil +} + +func (m *manager) ApplyChildCgroup(pid int) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + if m.cgroups == nil { + return nil + } + + if !m.childCgroupCreated { + return errors.New("can't place process in child cgroup because child cgroup has not been created") + } + + if m.paths == nil { + return errors.New("can't place pid in delegated cgroup unless it was placed in container cgroup first") + } + + childCgroupPaths := make(map[string]string) + + for name, path := range m.paths { + childCgroupPaths[name] = filepath.Join(path, cgroups.SyscontCgroupRoot) + } + + return cgroups.EnterPid(childCgroupPaths, pid) +} + +func (m *manager) Destroy() error { + if m.cgroups == nil || m.cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *manager) GetStats() (*cgroups.Stats, 
error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *manager) Set(container *configs.Config) error { + if container.Cgroups == nil { + return nil + } + + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.cgroups != nil && m.cgroups.Paths != nil { + return nil + } + if container.Cgroups.Resources.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, container.Cgroups); err != nil { + if m.rootless && sys.Name() == "devices" { + continue + } + // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). 
+ return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *manager) Freeze(state configs.FreezerState) error { + path := m.Path("freezer") + if m.cgroups == nil || path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + // sysbox-runc: return the pids starting from the system container root + // (all sys container pids start at this level) + paths := m.GetChildCgroupPaths() + return cgroups.GetPids(paths["devices"]) +} + +func (m *manager) GetAllPids() ([]int, error) { + // sysbox-runc: return the pids starting from the system container root + // (all sys container pids start at this level) + paths := m.GetChildCgroupPaths() + return cgroups.GetAllPids(paths["devices"]) +} + +func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { + root, err := getCgroupRoot() + if err != nil { + return nil, err + } + + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return nil, errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove this code. Path safety is important! 
-- cyphar + cgPath := libcontainerUtils.CleanPath(c.Path) + cgParent := libcontainerUtils.CleanPath(c.Parent) + cgName := libcontainerUtils.CleanPath(c.Name) + + innerPath := cgPath + if innerPath == "" { + innerPath = filepath.Join(cgParent, cgName) + } + + return &cgroupData{ + root: root, + innerPath: innerPath, + config: c, + pid: pid, + }, nil +} + +func (raw *cgroupData) path(subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(raw.innerPath) { + mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil + } + + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. 
+ parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, raw.innerPath), nil +} + +func join(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} + +func (m *manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +// sysbox-runc +func (m *manager) GetChildCgroupPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + + childCgroupPaths := make(map[string]string) + for k, v := range m.paths { + childCgroupPaths[k] = filepath.Join(v, cgroups.SyscontCgroupRoot) + } + + return childCgroupPaths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. 
+ if dir == "" { + return configs.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func (m *manager) GetType() cgroups.CgroupType { + return cgroups.Cgroup_v1_fs +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/fs_test.go b/sysbox-runc/libcontainer/cgroups/fs/fs_test.go new file mode 100644 index 00000000..c899ff8f --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/fs_test.go @@ -0,0 +1,106 @@ +// +build linux + +package fs + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestInvalidCgroupPath(t *testing.T) { + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 is not supported") + } + + root, err := getCgroupRoot() + if err != nil { + t.Fatalf("couldn't get cgroup root: %v", err) + } + + testCases := []struct { + test string + path, name, parent string + }{ + { + test: "invalid cgroup path", + path: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup path", + path: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup parent", + parent: "../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid absolute cgroup parent", + parent: "/../../../../../../../../../../some/path", + name: "name", + }, + { + test: "invalid cgroup name", + parent: "parent", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name", + parent: "parent", + name: "/../../../../../../../../../../some/path", + }, + { + test: "invalid cgroup name and parent", + parent: "../../../../../../../../../../some/path", + name: "../../../../../../../../../../some/path", + }, + { + test: "invalid absolute cgroup name and parent", + parent: "/../../../../../../../../../../some/path", + name: 
"/../../../../../../../../../../some/path", + }, + } + + for _, tc := range testCases { + t.Run(tc.test, func(t *testing.T) { + config := &configs.Cgroup{Path: tc.path, Name: tc.name, Parent: tc.parent} + + data, err := getCgroupData(config, 0) + if err != nil { + t.Fatalf("couldn't get cgroup data: %v", err) + } + + // Make sure the final innerPath doesn't go outside the cgroup mountpoint. + if strings.HasPrefix(data.innerPath, "..") { + t.Errorf("SECURITY: cgroup innerPath is outside cgroup mountpoint!") + } + + // Double-check, using an actual cgroup. + deviceRoot := filepath.Join(root, "devices") + devicePath, err := data.path("devices") + if err != nil { + t.Fatalf("couldn't get cgroup path: %v", err) + } + if !strings.HasPrefix(devicePath, deviceRoot) { + t.Errorf("SECURITY: cgroup path() is outside cgroup mountpoint!") + } + }) + } +} + +func TestTryDefaultCgroupRoot(t *testing.T) { + res := tryDefaultCgroupRoot() + exp := defaultCgroupRoot + if cgroups.IsCgroup2UnifiedMode() { + // checking that tryDefaultCgroupRoot does return "" + // in case /sys/fs/cgroup is not cgroup v1 root dir. 
+ exp = "" + } + if res != exp { + t.Errorf("tryDefaultCgroupRoot: want %q, got %q", exp, res) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/hugetlb.go b/sysbox-runc/libcontainer/cgroups/fs/hugetlb.go new file mode 100644 index 00000000..1be54dca --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/hugetlb.go @@ -0,0 +1,80 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type HugetlbGroup struct { +} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { + for _, hugetlb := range cgroup.Resources.HugetlbLimit { + if err := fscommon.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + if !cgroups.PathExists(path) { + return nil + } + for _, pageSize := range HugePageSizes { + usage := "hugetlb." + pageSize + ".usage_in_bytes" + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", usage, err) + } + hugetlbStats.Usage = value + + maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", maxUsage, err) + } + hugetlbStats.MaxUsage = value + + failcnt := "hugetlb." 
+ pageSize + ".failcnt" + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return fmt.Errorf("failed to parse %s - %v", failcnt, err) + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} + +func (s *HugetlbGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/hugetlb_test.go b/sysbox-runc/libcontainer/cgroups/fs/hugetlb_test.go new file mode 100644 index 00000000..2d17ca67 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/hugetlb_test.go @@ -0,0 +1,155 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + hugetlbUsageContents = "128\n" + hugetlbMaxUsageContents = "256\n" + hugetlbFailcnt = "100\n" +) + +const ( + usage = "hugetlb.%s.usage_in_bytes" + limit = "hugetlb.%s.limit_in_bytes" + maxUsage = "hugetlb.%s.max_usage_in_bytes" + failcnt = "hugetlb.%s.failcnt" +) + +func TestHugetlbSetHugetlb(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + + const ( + hugetlbBefore = 256 + hugetlbAfter = 512 + ) + + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(limit, pageSize): strconv.Itoa(hugetlbBefore), + }) + } + + for _, pageSize := range HugePageSizes { + helper.CgroupData.config.Resources.HugetlbLimit = []*configs.HugepageLimit{ + { + Pagesize: pageSize, + Limit: hugetlbAfter, + }, + } + hugetlb := &HugetlbGroup{} + if err := hugetlb.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + } 
+ + for _, pageSize := range HugePageSizes { + limit := fmt.Sprintf(limit, pageSize) + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, limit) + if err != nil { + t.Fatalf("Failed to parse %s - %s", limit, err) + } + if value != hugetlbAfter { + t.Fatalf("Set hugetlb.limit_in_bytes failed. Expected: %v, Got: %v", hugetlbAfter, value) + } + } +} + +func TestHugetlbStats(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + fmt.Sprintf(maxUsage, pageSize): hugetlbMaxUsageContents, + fmt.Sprintf(failcnt, pageSize): hugetlbFailcnt, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.HugetlbStats{Usage: 128, MaxUsage: 256, Failcnt: 100} + for _, pageSize := range HugePageSizes { + expectHugetlbStatEquals(t, expectedStats, actualStats.HugetlbStats[pageSize]) + } +} + +func TestHugetlbStatsNoUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + maxUsage: hugetlbMaxUsageContents, + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsNoMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(usage, pageSize): hugetlbUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadUsageFile(t 
*testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + for _, pageSize := range HugePageSizes { + helper.writeFileContents(map[string]string{ + fmt.Sprintf(usage, pageSize): "bad", + maxUsage: hugetlbMaxUsageContents, + }) + } + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestHugetlbStatsBadMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("hugetlb", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + usage: hugetlbUsageContents, + maxUsage: "bad", + }) + + hugetlb := &HugetlbGroup{} + actualStats := *cgroups.NewStats() + err := hugetlb.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/memory.go b/sysbox-runc/libcontainer/cgroups/fs/memory.go new file mode 100644 index 00000000..856d5cdf --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/memory.go @@ -0,0 +1,348 @@ +// +build linux + +package fs + +import ( + "bufio" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + numaNodeSymbol = "N" + numaStatColumnSeparator = " " + numaStatKeyValueSeparator = "=" + numaStatMaxColumns = math.MaxUint8 + 1 + numaStatValueIndex = 1 + numaStatTypeIndex = 0 + numaStatColumnSliceLength = 2 + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryPagesByNuma = "memory.numa_stat" +) + +type MemoryGroup struct { +} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) { + return join(path, d.pid) +} + +func setMemoryAndSwap(path string, 
cgroup *configs.Cgroup) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. + if cgroup.Resources.Memory == -1 && cgroup.Resources.MemorySwap == 0 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + cgroup.Resources.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 { + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) { + if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } else { + if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } else { + if cgroup.Resources.Memory != 0 { + if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { + return err + } + } + if cgroup.Resources.MemorySwap != 0 { + if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { + return err + } + } + } + + return nil +} + +func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { + if 
err := setMemoryAndSwap(path, cgroup); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if cgroup.Resources.MemoryReservation != 0 { + if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return err + } + } + + if cgroup.Resources.OomKillDisable { + if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 { + return nil + } else if *cgroup.Resources.MemorySwappiness <= 100 { + if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + // Set stats from memory.stat. 
+ statsFile, err := fscommon.OpenFile(path, "memory.stat", os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil +} + +func memoryAssigned(cgroup *configs.Cgroup) bool { + return cgroup.Resources.Memory != 0 || + cgroup.Resources.MemoryReservation != 0 || + cgroup.Resources.MemorySwap > 0 || + cgroup.Resources.OomKillDisable || + (cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1) +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." 
+ name + } + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err) + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err) + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err) + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err) + } + memoryData.Limit = value + + return memoryData, nil +} + +func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) { + stats := cgroups.PageUsageByNUMA{} + + file, err := fscommon.OpenFile(cgroupPath, cgroupMemoryPagesByNuma, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + + scanner := bufio.NewScanner(file) + for scanner.Scan() { + var statsType string + statsByType := cgroups.PageStats{Nodes: map[uint8]uint64{}} + columns := strings.SplitN(scanner.Text(), numaStatColumnSeparator, numaStatMaxColumns) + + for _, column := range columns { + pagesByNode := strings.SplitN(column, numaStatKeyValueSeparator, 
numaStatColumnSliceLength) + + if strings.HasPrefix(pagesByNode[numaStatTypeIndex], numaNodeSymbol) { + nodeID, err := strconv.ParseUint(pagesByNode[numaStatTypeIndex][1:], 10, 8) + if err != nil { + return cgroups.PageUsageByNUMA{}, err + } + + statsByType.Nodes[uint8(nodeID)], err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64) + if err != nil { + return cgroups.PageUsageByNUMA{}, err + } + } else { + statsByType.Total, err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64) + if err != nil { + return cgroups.PageUsageByNUMA{}, err + } + + statsType = pagesByNode[numaStatTypeIndex] + } + + err := addNUMAStatsByType(&stats, statsByType, statsType) + if err != nil { + return cgroups.PageUsageByNUMA{}, err + } + } + } + err = scanner.Err() + if err != nil { + return cgroups.PageUsageByNUMA{}, err + } + + return stats, nil +} + +func addNUMAStatsByType(stats *cgroups.PageUsageByNUMA, byTypeStats cgroups.PageStats, statsType string) error { + switch statsType { + case "total": + stats.Total = byTypeStats + case "file": + stats.File = byTypeStats + case "anon": + stats.Anon = byTypeStats + case "unevictable": + stats.Unevictable = byTypeStats + case "hierarchical_total": + stats.Hierarchical.Total = byTypeStats + case "hierarchical_file": + stats.Hierarchical.File = byTypeStats + case "hierarchical_anon": + stats.Hierarchical.Anon = byTypeStats + case "hierarchical_unevictable": + stats.Hierarchical.Unevictable = byTypeStats + default: + return fmt.Errorf("unsupported NUMA page type found: %s", statsType) + } + return nil +} + +func (s *MemoryGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + // Copy the memory cgroup limits from source to dest; this helps in the scenario where + // "dest" is the cgroup associated with the container's init 
process, as it allows some + // tools that collect container stats (e.g., "docker stats") to collect the appropriate + // mem limits for the container. + files := []string{ + "memory.limit_in_bytes", + "memory.soft_limit_in_bytes", + } + + for _, f := range files { + srcPath := filepath.Join(source, f) + dstPath := filepath.Join(dest, f) + + if err := fscommon.CopyFile(srcPath, dstPath); err != nil { + return fmt.Errorf("failed to copy %s to %s: %s", srcPath, dstPath, err) + } + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/memory_test.go b/sysbox-runc/libcontainer/cgroups/fs/memory_test.go new file mode 100644 index 00000000..a1118d34 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/memory_test.go @@ -0,0 +1,461 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + memoryStatContents = `cache 512 +rss 1024` + memoryUsageContents = "2048\n" + memoryMaxUsageContents = "4096\n" + memoryFailcnt = "100\n" + memoryLimitContents = "8192\n" + memoryUseHierarchyContents = "1\n" + memoryNUMAStatContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0 +hierarchical_total=768133 N0=509113 N1=138887 N2=20464 N3=99669 +hierarchical_file=722017 N0=496516 N1=119997 N2=20181 N3=85323 +hierarchical_anon=46096 N0=12597 N1=18890 N2=283 N3=14326 +hierarchical_unevictable=20 N0=0 N1=0 N2=0 N3=20` + memoryNUMAStatNoHierarchyContents = `total=44611 N0=32631 N1=7501 N2=1982 N3=2497 +file=44428 N0=32614 N1=7335 N2=1982 N3=2497 +anon=183 N0=17 N1=166 N2=0 N3=0 +unevictable=0 N0=0 N1=0 N2=0 N3=0` +) + +func TestMemorySetMemory(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryBefore = 314572800 // 300M + memoryAfter = 524288000 // 500M + 
reservationBefore = 209715200 // 200M + reservationAfter = 314572800 // 300M + ) + + helper.writeFileContents(map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.soft_limit_in_bytes": strconv.Itoa(reservationBefore), + }) + + helper.CgroupData.config.Resources.Memory = memoryAfter + helper.CgroupData.config.Resources.MemoryReservation = reservationAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + + value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.soft_limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.soft_limit_in_bytes - %s", err) + } + if value != reservationAfter { + t.Fatal("Got the wrong value, set memory.soft_limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryswap(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryswapBefore = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + helper.writeFileContents(map[string]string{ + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + }) + + helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetMemoryLargerThanSwap(t *testing.T) { + helper := 
NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryBefore = 314572800 // 300M + memoryswapBefore = 524288000 // 500M + memoryAfter = 629145600 // 600M + memoryswapAfter = 838860800 // 800M + ) + + helper.writeFileContents(map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + // Set will call getMemoryData when memory and swap memory are + // both set, fake these fields so we don't get error. + "memory.usage_in_bytes": "0", + "memory.max_usage_in_bytes": "0", + "memory.failcnt": "0", + }) + + helper.CgroupData.config.Resources.Memory = memoryAfter + helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetSwapSmallerThanMemory(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + memoryBefore = 629145600 // 600M + memoryswapBefore = 838860800 // 800M + memoryAfter = 314572800 // 300M + memoryswapAfter = 524288000 // 500M + ) + + helper.writeFileContents(map[string]string{ + "memory.limit_in_bytes": strconv.Itoa(memoryBefore), + "memory.memsw.limit_in_bytes": strconv.Itoa(memoryswapBefore), + // Set will call getMemoryData when memory and swap memory are + // both set, fake these fields so we don't get 
error. + "memory.usage_in_bytes": "0", + "memory.max_usage_in_bytes": "0", + "memory.failcnt": "0", + }) + + helper.CgroupData.config.Resources.Memory = memoryAfter + helper.CgroupData.config.Resources.MemorySwap = memoryswapAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.limit_in_bytes - %s", err) + } + if value != memoryAfter { + t.Fatal("Got the wrong value, set memory.limit_in_bytes failed.") + } + value, err = fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.memsw.limit_in_bytes") + if err != nil { + t.Fatalf("Failed to parse memory.memsw.limit_in_bytes - %s", err) + } + if value != memoryswapAfter { + t.Fatal("Got the wrong value, set memory.memsw.limit_in_bytes failed.") + } +} + +func TestMemorySetMemorySwappinessDefault(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + swappinessBefore := 60 //default is 60 + swappinessAfter := uint64(0) + + helper.writeFileContents(map[string]string{ + "memory.swappiness": strconv.Itoa(swappinessBefore), + }) + + helper.CgroupData.config.Resources.MemorySwappiness = &swappinessAfter + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.swappiness") + if err != nil { + t.Fatalf("Failed to parse memory.swappiness - %s", err) + } + if value != swappinessAfter { + t.Fatalf("Got the wrong value (%d), set memory.swappiness = %d failed.", value, swappinessAfter) + } +} + +func TestMemoryStats(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + 
"memory.limit_in_bytes": memoryLimitContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.failcnt": memoryFailcnt, + "memory.memsw.usage_in_bytes": memoryUsageContents, + "memory.memsw.max_usage_in_bytes": memoryMaxUsageContents, + "memory.memsw.failcnt": memoryFailcnt, + "memory.memsw.limit_in_bytes": memoryLimitContents, + "memory.kmem.usage_in_bytes": memoryUsageContents, + "memory.kmem.max_usage_in_bytes": memoryMaxUsageContents, + "memory.kmem.failcnt": memoryFailcnt, + "memory.kmem.limit_in_bytes": memoryLimitContents, + "memory.use_hierarchy": memoryUseHierarchyContents, + "memory.numa_stat": memoryNUMAStatContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } + expectedStats := cgroups.MemoryStats{Cache: 512, Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, Stats: map[string]uint64{"cache": 512, "rss": 1024}, UseHierarchy: true, + PageUsageByNUMA: cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 768133, Nodes: map[uint8]uint64{0: 509113, 1: 138887, 2: 20464, 3: 99669}}, + File: cgroups.PageStats{Total: 722017, Nodes: map[uint8]uint64{0: 496516, 1: 119997, 2: 20181, 3: 85323}}, + Anon: cgroups.PageStats{Total: 46096, 
Nodes: map[uint8]uint64{0: 12597, 1: 18890, 2: 283, 3: 14326}}, + Unevictable: cgroups.PageStats{Total: 20, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 20}}, + }, + }} + expectMemoryStatEquals(t, expectedStats, actualStats.MemoryStats) +} + +func TestMemoryStatsNoStatFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err != nil { + t.Fatal(err) + } +} + +func TestMemoryStatsNoUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsNoLimitInBytesFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": 
memoryMaxUsageContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadStatFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": "rss rss", + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": "bad", + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadMaxUsageFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + "memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": "bad", + "memory.limit_in_bytes": memoryLimitContents, + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemoryStatsBadLimitInBytesFile(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.stat": memoryStatContents, + 
"memory.usage_in_bytes": memoryUsageContents, + "memory.max_usage_in_bytes": memoryMaxUsageContents, + "memory.limit_in_bytes": "bad", + }) + + memory := &MemoryGroup{} + actualStats := *cgroups.NewStats() + err := memory.GetStats(helper.CgroupPath, &actualStats) + if err == nil { + t.Fatal("Expected failure") + } +} + +func TestMemorySetOomControl(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + const ( + oomKillDisable = 1 // disable oom killer, default is 0 + ) + + helper.writeFileContents(map[string]string{ + "memory.oom_control": strconv.Itoa(oomKillDisable), + }) + + memory := &MemoryGroup{} + if err := memory.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "memory.oom_control") + if err != nil { + t.Fatalf("Failed to parse memory.oom_control - %s", err) + } + + if value != oomKillDisable { + t.Fatalf("Got the wrong value, set memory.oom_control failed.") + } +} + +func TestNoHierarchicalNumaStat(t *testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + helper.writeFileContents(map[string]string{ + "memory.numa_stat": memoryNUMAStatNoHierarchyContents, + }) + + actualStats, err := getPageUsageByNUMA(helper.CgroupPath) + if err != nil { + t.Fatal(err) + } + pageUsageByNUMA := cgroups.PageUsageByNUMA{ + PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ + Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, + File: cgroups.PageStats{Total: 44428, Nodes: map[uint8]uint64{0: 32614, 1: 7335, 2: 1982, 3: 2497}}, + Anon: cgroups.PageStats{Total: 183, Nodes: map[uint8]uint64{0: 17, 1: 166, 2: 0, 3: 0}}, + Unevictable: cgroups.PageStats{Total: 0, Nodes: map[uint8]uint64{0: 0, 1: 0, 2: 0, 3: 0}}, + }, + Hierarchical: cgroups.PageUsageByNUMAInner{}, + } + expectPageUsageByNUMAEquals(t, pageUsageByNUMA, actualStats) +} + +func TestWithoutNumaStat(t 
*testing.T) { + helper := NewCgroupTestUtil("memory", t) + defer helper.cleanup() + + actualStats, err := getPageUsageByNUMA(helper.CgroupPath) + if err != nil { + t.Fatal(err) + } + expectPageUsageByNUMAEquals(t, cgroups.PageUsageByNUMA{}, actualStats) +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/name.go b/sysbox-runc/libcontainer/cgroups/fs/name.go new file mode 100644 index 00000000..6967e5c4 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/name.go @@ -0,0 +1,50 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(path string, d *cgroupData) error { + if s.Join { + // ignore errors if the named cgroup does not exist + join(path, d.pid) + } + return nil +} + +func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *NameGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/net_cls.go b/sysbox-runc/libcontainer/cgroups/fs/net_cls.go new file mode 100644 index 00000000..71c0b700 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/net_cls.go @@ -0,0 +1,51 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type 
NetClsGroup struct { +} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.NetClsClassid != 0 { + if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *NetClsGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/net_cls_test.go b/sysbox-runc/libcontainer/cgroups/fs/net_cls_test.go new file mode 100644 index 00000000..602133a2 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/net_cls_test.go @@ -0,0 +1,41 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + classidBefore = 0x100002 + classidAfter = 0x100001 +) + +func TestNetClsSetClassid(t *testing.T) { + helper := NewCgroupTestUtil("net_cls", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "net_cls.classid": strconv.FormatUint(classidBefore, 10), + }) + + helper.CgroupData.config.Resources.NetClsClassid = classidAfter + netcls := &NetClsGroup{} + if err := netcls.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + // As we are in mock environment, we can't get correct value of classid from + // net_cls.classid. + // So. 
we just judge if we successfully write classid into file + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "net_cls.classid") + if err != nil { + t.Fatalf("Failed to parse net_cls.classid - %s", err) + } + if value != classidAfter { + t.Fatal("Got the wrong value, set net_cls.classid failed.") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/net_prio.go b/sysbox-runc/libcontainer/cgroups/fs/net_prio.go new file mode 100644 index 00000000..ac5ff948 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/net_prio.go @@ -0,0 +1,50 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetPrioGroup struct { +} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { + for _, prioMap := range cgroup.Resources.NetPrioIfpriomap { + if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *NetPrioGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/net_prio_test.go b/sysbox-runc/libcontainer/cgroups/fs/net_prio_test.go new file mode 100644 index 00000000..2ce8e192 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/net_prio_test.go @@ -0,0 +1,39 @@ +// +build linux + +package fs + +import ( + "strings" + "testing" + + 
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + prioMap = []*configs.IfPrioMap{ + { + Interface: "test", + Priority: 5, + }, + } +) + +func TestNetPrioSetIfPrio(t *testing.T) { + helper := NewCgroupTestUtil("net_prio", t) + defer helper.cleanup() + + helper.CgroupData.config.Resources.NetPrioIfpriomap = prioMap + netPrio := &NetPrioGroup{} + if err := netPrio.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "net_prio.ifpriomap") + if err != nil { + t.Fatalf("Failed to parse net_prio.ifpriomap - %s", err) + } + if !strings.Contains(value, "test 5") { + t.Fatal("Got the wrong value, set net_prio.ifpriomap failed.") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/perf_event.go b/sysbox-runc/libcontainer/cgroups/fs/perf_event.go new file mode 100644 index 00000000..749b2c78 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/perf_event.go @@ -0,0 +1,44 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PerfEventGroup struct { +} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error { + return nil +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *PerfEventGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git 
a/sysbox-runc/libcontainer/cgroups/fs/pids.go b/sysbox-runc/libcontainer/cgroups/fs/pids.go new file mode 100644 index 00000000..5f926194 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,83 @@ +// +build linux + +package fs + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct { +} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := fscommon.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + maxString, err := fscommon.GetCgroupParamString(path, "pids.max") + if err != nil { + return fmt.Errorf("failed to parse pids.max - %s", err) + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". 
+ var max uint64 + if maxString != "max" { + max, err = fscommon.ParseUint(maxString, 10, 64) + if err != nil { + return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} + +func (s *PidsGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/pids_test.go b/sysbox-runc/libcontainer/cgroups/fs/pids_test.go new file mode 100644 index 00000000..66f3aa33 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/pids_test.go @@ -0,0 +1,112 @@ +// +build linux + +package fs + +import ( + "strconv" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +const ( + maxUnlimited = -1 + maxLimited = 1024 +) + +func TestPidsSetMax(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": "max", + }) + + helper.CgroupData.config.Resources.PidsLimit = maxLimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamUint(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != maxLimited { + t.Fatalf("Expected %d, got %d for setting pids.max - limited", maxLimited, value) + } +} + +func TestPidsSetUnlimited(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.max": strconv.Itoa(maxLimited), + }) + + 
helper.CgroupData.config.Resources.PidsLimit = maxUnlimited + pids := &PidsGroup{} + if err := pids.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := fscommon.GetCgroupParamString(helper.CgroupPath, "pids.max") + if err != nil { + t.Fatalf("Failed to parse pids.max - %s", err) + } + + if value != "max" { + t.Fatalf("Expected %s, got %s for setting pids.max - unlimited", "max", value) + } +} + +func TestPidsStats(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.current": strconv.Itoa(1337), + "pids.max": strconv.Itoa(maxLimited), + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 1337 { + t.Fatalf("Expected %d, got %d for pids.current", 1337, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != maxLimited { + t.Fatalf("Expected %d, got %d for pids.max", maxLimited, stats.PidsStats.Limit) + } +} + +func TestPidsStatsUnlimited(t *testing.T) { + helper := NewCgroupTestUtil("pids", t) + defer helper.cleanup() + + helper.writeFileContents(map[string]string{ + "pids.current": strconv.Itoa(4096), + "pids.max": "max", + }) + + pids := &PidsGroup{} + stats := *cgroups.NewStats() + if err := pids.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.PidsStats.Current != 4096 { + t.Fatalf("Expected %d, got %d for pids.current", 4096, stats.PidsStats.Current) + } + + if stats.PidsStats.Limit != 0 { + t.Fatalf("Expected %d, got %d for pids.max", 0, stats.PidsStats.Limit) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/rdma.go b/sysbox-runc/libcontainer/cgroups/fs/rdma.go new file mode 100644 index 00000000..a837111d --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/rdma.go @@ -0,0 +1,41 @@ +package fs + +import ( + "fmt" + "os" + + 
"github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, d *cgroupData) error { + return join(path, d.pid) +} + +func (s *RdmaGroup) Set(path string, cgroup *configs.Cgroup) error { + return fscommon.RdmaSet(path, cgroup) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} + +func (s *RdmaGroup) Clone(source, dest string) error { + + if err := fscommon.WriteFile(source, "cgroup.clone_children", "1"); err != nil { + return err + } + + if err := os.MkdirAll(dest, 0755); err != nil { + return fmt.Errorf("Failed to create cgroup %s", dest) + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/stats_util_test.go b/sysbox-runc/libcontainer/cgroups/fs/stats_util_test.go new file mode 100644 index 00000000..08f717ea --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/stats_util_test.go @@ -0,0 +1,134 @@ +// +build linux + +package fs + +import ( + "errors" + "fmt" + "reflect" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +func blkioStatEntryEquals(expected, actual []cgroups.BlkioStatEntry) error { + if len(expected) != len(actual) { + return errors.New("blkioStatEntries length do not match") + } + for i, expValue := range expected { + actValue := actual[i] + if expValue != actValue { + return fmt.Errorf("Expected blkio stat entry %v but found %v", expValue, actValue) + } + } + return nil +} + +func expectBlkioStatsEquals(t *testing.T, expected, actual cgroups.BlkioStats) { + if err := blkioStatEntryEquals(expected.IoServiceBytesRecursive, actual.IoServiceBytesRecursive); err != nil { + t.Errorf("blkio IoServiceBytesRecursive do not match - %s\n", err) + } + + if err := 
blkioStatEntryEquals(expected.IoServicedRecursive, actual.IoServicedRecursive); err != nil { + t.Errorf("blkio IoServicedRecursive do not match - %s\n", err) + } + + if err := blkioStatEntryEquals(expected.IoQueuedRecursive, actual.IoQueuedRecursive); err != nil { + t.Errorf("blkio IoQueuedRecursive do not match - %s\n", err) + } + + if err := blkioStatEntryEquals(expected.SectorsRecursive, actual.SectorsRecursive); err != nil { + t.Errorf("blkio SectorsRecursive do not match - %s\n", err) + } + + if err := blkioStatEntryEquals(expected.IoServiceTimeRecursive, actual.IoServiceTimeRecursive); err != nil { + t.Errorf("blkio IoServiceTimeRecursive do not match - %s\n", err) + } + + if err := blkioStatEntryEquals(expected.IoWaitTimeRecursive, actual.IoWaitTimeRecursive); err != nil { + t.Errorf("blkio IoWaitTimeRecursive do not match - %s\n", err) + } + + if err := blkioStatEntryEquals(expected.IoMergedRecursive, actual.IoMergedRecursive); err != nil { + t.Errorf("blkio IoMergedRecursive do not match - %v vs %v\n", expected.IoMergedRecursive, actual.IoMergedRecursive) + } + + if err := blkioStatEntryEquals(expected.IoTimeRecursive, actual.IoTimeRecursive); err != nil { + t.Errorf("blkio IoTimeRecursive do not match - %s\n", err) + } +} + +func expectThrottlingDataEquals(t *testing.T, expected, actual cgroups.ThrottlingData) { + if expected != actual { + t.Errorf("Expected throttling data %v but found %v\n", expected, actual) + } +} + +func expectHugetlbStatEquals(t *testing.T, expected, actual cgroups.HugetlbStats) { + if expected != actual { + t.Errorf("Expected hugetlb stats %v but found %v\n", expected, actual) + } +} + +func expectMemoryStatEquals(t *testing.T, expected, actual cgroups.MemoryStats) { + expectMemoryDataEquals(t, expected.Usage, actual.Usage) + expectMemoryDataEquals(t, expected.SwapUsage, actual.SwapUsage) + expectMemoryDataEquals(t, expected.KernelUsage, actual.KernelUsage) + expectPageUsageByNUMAEquals(t, expected.PageUsageByNUMA, 
actual.PageUsageByNUMA) + + if expected.UseHierarchy != actual.UseHierarchy { + t.Errorf("Expected memory use hierarchy %v, but found %v\n", expected.UseHierarchy, actual.UseHierarchy) + } + + for key, expValue := range expected.Stats { + actValue, ok := actual.Stats[key] + if !ok { + t.Errorf("Expected memory stat key %s not found\n", key) + } + if expValue != actValue { + t.Errorf("Expected memory stat value %d but found %d\n", expValue, actValue) + } + } +} + +func expectMemoryDataEquals(t *testing.T, expected, actual cgroups.MemoryData) { + if expected.Usage != actual.Usage { + t.Errorf("Expected memory usage %d but found %d\n", expected.Usage, actual.Usage) + } + if expected.MaxUsage != actual.MaxUsage { + t.Errorf("Expected memory max usage %d but found %d\n", expected.MaxUsage, actual.MaxUsage) + } + if expected.Failcnt != actual.Failcnt { + t.Errorf("Expected memory failcnt %d but found %d\n", expected.Failcnt, actual.Failcnt) + } + if expected.Limit != actual.Limit { + t.Errorf("Expected memory limit %d but found %d\n", expected.Limit, actual.Limit) + } +} + +func expectPageUsageByNUMAEquals(t *testing.T, expected, actual cgroups.PageUsageByNUMA) { + if !reflect.DeepEqual(expected.Total, actual.Total) { + t.Errorf("Expected total page usage by NUMA %#v but found %#v", expected.Total, actual.Total) + } + if !reflect.DeepEqual(expected.File, actual.File) { + t.Errorf("Expected file page usage by NUMA %#v but found %#v", expected.File, actual.File) + } + if !reflect.DeepEqual(expected.Anon, actual.Anon) { + t.Errorf("Expected anon page usage by NUMA %#v but found %#v", expected.Anon, actual.Anon) + } + if !reflect.DeepEqual(expected.Unevictable, actual.Unevictable) { + t.Errorf("Expected unevictable page usage by NUMA %#v but found %#v", expected.Unevictable, actual.Unevictable) + } + if !reflect.DeepEqual(expected.Hierarchical.Total, actual.Hierarchical.Total) { + t.Errorf("Expected hierarchical total page usage by NUMA %#v but found %#v", 
expected.Hierarchical.Total, actual.Hierarchical.Total) + } + if !reflect.DeepEqual(expected.Hierarchical.File, actual.Hierarchical.File) { + t.Errorf("Expected hierarchical file page usage by NUMA %#v but found %#v", expected.Hierarchical.File, actual.Hierarchical.File) + } + if !reflect.DeepEqual(expected.Hierarchical.Anon, actual.Hierarchical.Anon) { + t.Errorf("Expected hierarchical anon page usage by NUMA %#v but found %#v", expected.Hierarchical.Anon, actual.Hierarchical.Anon) + } + if !reflect.DeepEqual(expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) { + t.Errorf("Expected hierarchical total page usage by NUMA %#v but found %#v", expected.Hierarchical.Unevictable, actual.Hierarchical.Unevictable) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs/unsupported.go b/sysbox-runc/libcontainer/cgroups/fs/unsupported.go new file mode 100644 index 00000000..3ef9e031 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/unsupported.go @@ -0,0 +1,3 @@ +// +build !linux + +package fs diff --git a/sysbox-runc/libcontainer/cgroups/fs/util_test.go b/sysbox-runc/libcontainer/cgroups/fs/util_test.go new file mode 100644 index 00000000..403ab6ec --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs/util_test.go @@ -0,0 +1,73 @@ +// +build linux + +/* +Utility for testing cgroup operations. + +Creates a mock of the cgroup filesystem for the duration of the test. +*/ +package fs + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func init() { + cgroups.TestMode = true +} + +type cgroupTestUtil struct { + // cgroup data to use in tests. + CgroupData *cgroupData + + // Path to the mock cgroup directory. + CgroupPath string + + // Temporary directory to store mock cgroup filesystem. 
+ tempDir string + t *testing.T +} + +// Creates a new test util for the specified subsystem +func NewCgroupTestUtil(subsystem string, t *testing.T) *cgroupTestUtil { + d := &cgroupData{ + config: &configs.Cgroup{}, + } + d.config.Resources = &configs.Resources{} + tempDir, err := ioutil.TempDir("", "cgroup_test") + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testCgroupPath := filepath.Join(d.root, subsystem) + if err != nil { + t.Fatal(err) + } + + // Ensure the full mock cgroup path exists. + err = os.MkdirAll(testCgroupPath, 0755) + if err != nil { + t.Fatal(err) + } + return &cgroupTestUtil{CgroupData: d, CgroupPath: testCgroupPath, tempDir: tempDir, t: t} +} + +func (c *cgroupTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified cgroup files. +func (c *cgroupTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := fscommon.WriteFile(c.CgroupPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/cpu.go b/sysbox-runc/libcontainer/cgroups/fs2/cpu.go new file mode 100644 index 00000000..0dffe664 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/cpu.go @@ -0,0 +1,76 @@ +// +build linux + +package fs2 + +import ( + "bufio" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isCpuSet(cgroup *configs.Cgroup) bool { + return cgroup.Resources.CpuWeight != 0 || cgroup.Resources.CpuQuota != 0 || cgroup.Resources.CpuPeriod != 0 +} + +func setCpu(dirPath string, cgroup *configs.Cgroup) error { + if !isCpuSet(cgroup) { + return nil + } + r := cgroup.Resources + + // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. 
+ if r.CpuWeight != 0 { + if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { + return err + } + } + + if r.CpuQuota != 0 || r.CpuPeriod != 0 { + str := "max" + if r.CpuQuota > 0 { + str = strconv.FormatInt(r.CpuQuota, 10) + } + period := r.CpuPeriod + if period == 0 { + // This default value is documented in + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + period = 100000 + } + str += " " + strconv.FormatUint(period, 10) + if err := fscommon.WriteFile(dirPath, "cpu.max", str); err != nil { + return err + } + } + + return nil +} +func statCpu(dirPath string, stats *cgroups.Stats) error { + f, err := fscommon.OpenFile(dirPath, "cpu.stat", os.O_RDONLY) + if err != nil { + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return err + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/cpuset.go b/sysbox-runc/libcontainer/cgroups/fs2/cpuset.go new file mode 100644 index 00000000..fb4456b4 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/cpuset.go @@ -0,0 +1,30 @@ +// +build linux + +package fs2 + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isCpusetSet(cgroup *configs.Cgroup) bool { + return cgroup.Resources.CpusetCpus != "" || cgroup.Resources.CpusetMems != "" +} + +func setCpuset(dirPath string, cgroup *configs.Cgroup) error { + if !isCpusetSet(cgroup) { + return nil + } + + if cgroup.Resources.CpusetCpus != "" { + if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { + return err + } + 
} + if cgroup.Resources.CpusetMems != "" { + if err := fscommon.WriteFile(dirPath, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { + return err + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/create.go b/sysbox-runc/libcontainer/cgroups/fs2/create.go new file mode 100644 index 00000000..06778e0c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/create.go @@ -0,0 +1,153 @@ +package fs2 + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func supportedControllers(cgroup *configs.Cgroup) (string, error) { + return fscommon.ReadFile(UnifiedMountpoint, "/cgroup.controllers") +} + +// needAnyControllers returns whether we enable some supported controllers or not, +// based on (1) controllers available and (2) resources that are being set. +// We don't check "pseudo" controllers such as +// "freezer" and "devices". +func needAnyControllers(cgroup *configs.Cgroup) (bool, error) { + if cgroup == nil { + return false, nil + } + + // list of all available controllers + content, err := supportedControllers(cgroup) + if err != nil { + return false, err + } + avail := make(map[string]struct{}) + for _, ctr := range strings.Fields(content) { + avail[ctr] = struct{}{} + } + + // check whether the controller if available or not + have := func(controller string) bool { + _, ok := avail[controller] + return ok + } + + if isPidsSet(cgroup) && have("pids") { + return true, nil + } + if isMemorySet(cgroup) && have("memory") { + return true, nil + } + if isIoSet(cgroup) && have("io") { + return true, nil + } + if isCpuSet(cgroup) && have("cpu") { + return true, nil + } + if isCpusetSet(cgroup) && have("cpuset") { + return true, nil + } + if isHugeTlbSet(cgroup) && have("hugetlb") { + return true, nil + } + + return false, nil +} + +// containsDomainController returns whether the current config contains domain 
controller or not. +// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html +// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. +func containsDomainController(cg *configs.Cgroup) bool { + return isMemorySet(cg) || isIoSet(cg) || isCpuSet(cg) || isHugeTlbSet(cg) +} + +// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. +func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) { + if !strings.HasPrefix(path, UnifiedMountpoint) { + return fmt.Errorf("invalid cgroup path %s", path) + } + + content, err := supportedControllers(c) + if err != nil { + return err + } + + const ( + cgTypeFile = "cgroup.type" + cgStCtlFile = "cgroup.subtree_control" + ) + ctrs := strings.Fields(content) + res := "+" + strings.Join(ctrs, " +") + + elements := strings.Split(path, "/") + elements = elements[3:] + current := "/sys/fs" + for i, e := range elements { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + current := current + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + cgType, _ := fscommon.ReadFile(current, cgTypeFile) + cgType = strings.TrimSpace(cgType) + + switch cgType { + // If the cgroup is in an invalid mode (usually this means there's an internal + // process in the cgroup tree, because we created a cgroup under an + // already-populated-by-other-processes cgroup), then we have to error out if + // the user requested controllers which are not thread-aware. However, if all + // the controllers requested are thread-aware we can simply put the cgroup into + // threaded mode. 
+ case "domain invalid": + if containsDomainController(c) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) + } else { + // Not entirely correct (in theory we'd always want to be a domain -- + // since that means we're a properly delegated cgroup subtree) but in + // this case there's not much we can do and it's better than giving an + // error. + _ = fscommon.WriteFile(current, cgTypeFile, "threaded") + } + // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers + // (and you cannot usually take a cgroup out of threaded mode). + case "domain threaded": + fallthrough + case "threaded": + if containsDomainController(c) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) + } + } + } + // enable all supported controllers + if i < len(elements)-1 { + if err := fscommon.WriteFile(current, cgStCtlFile, res); err != nil { + // try write one by one + allCtrs := strings.Split(res, " ") + for _, ctr := range allCtrs { + _ = fscommon.WriteFile(current, cgStCtlFile, ctr) + } + } + // Some controllers might not be enabled when rootless or containerized, + // but we don't catch the error here. (Caught in setXXX() functions.) + } + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/defaultpath.go b/sysbox-runc/libcontainer/cgroups/fs2/defaultpath.go new file mode 100644 index 00000000..00912645 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/defaultpath.go @@ -0,0 +1,105 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "bufio" + "io" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" +) + +const UnifiedMountpoint = "/sys/fs/cgroup" + +func defaultDirPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", errors.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) + } + if len(c.Paths) != 0 { + // never set by specconv + return "", errors.Errorf("cgroup: Paths is unsupported, use Path, got %+v", c) + } + + // XXX: Do not remove this code. Path safety is important! -- cyphar + cgPath := libcontainerUtils.CleanPath(c.Path) + cgParent := libcontainerUtils.CleanPath(c.Parent) + cgName := libcontainerUtils.CleanPath(c.Name) + + return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName) +} + +func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { + if (cgName != "" || cgParent != "") && cgPath != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + innerPath := cgPath + if innerPath == "" { + innerPath = filepath.Join(cgParent, cgName) + } + if filepath.IsAbs(innerPath) { + return filepath.Join(root, innerPath), nil + } + + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + // The current user scope most probably has tasks in it already, + // making it impossible to enable controllers for its sub-cgroup. 
+ // A parent cgroup (with no tasks in it) is what we need. + ownCgroup = filepath.Dir(ownCgroup) + + return filepath.Join(root, ownCgroup, innerPath), nil +} + +// parseCgroupFile parses /proc/PID/cgroup file and return string +func parseCgroupFile(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (string, error) { + var ( + s = bufio.NewScanner(r) + ) + for s.Scan() { + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return "", errors.Errorf("invalid cgroup entry: %q", text) + } + // text is like "0::/user.slice/user-1001.slice/session-1.scope" + if parts[0] == "0" && parts[1] == "" { + return parts[2], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + return "", errors.New("cgroup path not found") +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/defaultpath_test.go b/sysbox-runc/libcontainer/cgroups/fs2/defaultpath_test.go new file mode 100644 index 00000000..30f1c622 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/defaultpath_test.go @@ -0,0 +1,87 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package fs2 + +import ( + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +func TestParseCgroupFromReader(t *testing.T) { + cases := map[string]string{ + "0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + "2:cpuset:/foo\n1:name=systemd:/\n": "", + "2:cpuset:/foo\n1:name=systemd:/\n0::/user.slice/user-1001.slice/session-1.scope\n": "/user.slice/user-1001.slice/session-1.scope", + } + for s, expected := range cases { + g, err := parseCgroupFromReader(strings.NewReader(s)) + if expected != "" { + if g != expected { + t.Errorf("expected %q, got %q", expected, g) + } + if err != nil { + t.Error(err) + } + } else { + if err == nil { + t.Error("error is expected") + } + } + } +} + +func TestDefaultDirPath(t *testing.T) { + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("need cgroupv2") + } + // same code as in defaultDirPath() + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + // Not a test failure, but rather some weird + // environment so we can't run this test. 
+ t.Skipf("can't get own cgroup: %v", err) + } + ownCgroup = filepath.Dir(ownCgroup) + + cases := []struct { + cgPath string + cgParent string + cgName string + expected string + }{ + { + cgPath: "/foo/bar", + expected: "/sys/fs/cgroup/foo/bar", + }, + { + cgPath: "foo/bar", + expected: filepath.Join(UnifiedMountpoint, ownCgroup, "foo/bar"), + }, + } + for _, c := range cases { + got, err := _defaultDirPath(UnifiedMountpoint, c.cgPath, c.cgParent, c.cgName) + if err != nil { + t.Fatal(err) + } + if got != c.expected { + t.Fatalf("expected %q, got %q", c.expected, got) + } + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/devices.go b/sysbox-runc/libcontainer/cgroups/fs2/devices.go new file mode 100644 index 00000000..4c793a1c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/devices.go @@ -0,0 +1,74 @@ +// +build linux + +package fs2 + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +func isRWM(perms devices.Permissions) bool { + var r, w, m bool + for _, perm := range perms { + switch perm { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// the logic is from crun +// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652 +func canSkipEBPFError(cgroup *configs.Cgroup) bool { + for _, dev := range cgroup.Resources.Devices { + if dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func setDevices(dirPath string, cgroup *configs.Cgroup) error { + if cgroup.SkipDevices { + return nil + } + // XXX: This is currently a white-list (but all callers pass a blacklist of + // devices). 
This is bad for a whole variety of reasons, but will need + // to be fixed with co-ordinated effort with downstreams. + devices := cgroup.Devices + insts, license, err := devicefilter.DeviceFilter(devices) + if err != nil { + return err + } + dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0600) + if err != nil { + return errors.Errorf("cannot get dir FD for %s", dirPath) + } + defer unix.Close(dirFD) + // XXX: This code is currently incorrect when it comes to updating an + // existing cgroup with new rules (new rulesets are just appended to + // the program list because this uses BPF_F_ALLOW_MULTI). If we didn't + // use BPF_F_ALLOW_MULTI we could actually atomically swap the + // programs. + // + // The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a + // race-free blacklist because it acts as a whitelist by default, and + // having a deny-everything program cannot be overridden by other + // programs. You could temporarily insert a deny-everything program + // but that would result in spurious failures during updates.
+ if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(cgroup) { + return err + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/freezer.go b/sysbox-runc/libcontainer/cgroups/fs2/freezer.go new file mode 100644 index 00000000..441531fd --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/freezer.go @@ -0,0 +1,74 @@ +// +build linux + +package fs2 + +import ( + stdErrors "errors" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +func setFreezer(dirPath string, state configs.FreezerState) error { + if err := supportsFreezer(dirPath); err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state == configs.Undefined || state == configs.Thawed { + return nil + } + return errors.Wrap(err, "freezer not supported") + } + + var stateStr string + switch state { + case configs.Undefined: + return nil + case configs.Frozen: + stateStr = "1" + case configs.Thawed: + stateStr = "0" + default: + return errors.Errorf("invalid freezer state %q requested", state) + } + + if err := fscommon.WriteFile(dirPath, "cgroup.freeze", stateStr); err != nil { + return err + } + // Confirm that the cgroup did actually change states. 
+ if actualState, err := getFreezer(dirPath); err != nil { + return err + } else if actualState != state { + return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) + } + return nil +} + +func supportsFreezer(dirPath string) error { + _, err := fscommon.ReadFile(dirPath, "cgroup.freeze") + return err +} + +func getFreezer(dirPath string) (configs.FreezerState, error) { + state, err := fscommon.ReadFile(dirPath, "cgroup.freeze") + if err != nil { + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. + if os.IsNotExist(err) || stdErrors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "0": + return configs.Thawed, nil + case "1": + return configs.Frozen, nil + default: + return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state) + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/fs2.go b/sysbox-runc/libcontainer/cgroups/fs2/fs2.go new file mode 100644 index 00000000..6e59416c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/fs2.go @@ -0,0 +1,372 @@ +// +build linux + +package fs2 + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +type manager struct { + config *configs.Cgroup + // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + dirPath string + // controllers is content of "cgroup.controllers" file. + // excludes pseudo-controllers ("devices" and "freezer"). + controllers map[string]struct{} + rootless bool +} + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". 
+// If dirPath is empty, it is automatically set using config. +func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) { + if config == nil { + config = &configs.Cgroup{} + } + if dirPath == "" { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + + m := &manager{ + config: config, + dirPath: dirPath, + rootless: rootless, + } + return m, nil +} + +func (m *manager) getControllers() error { + if m.controllers != nil { + return nil + } + + data, err := fscommon.ReadFile(m.dirPath, "cgroup.controllers") + if err != nil { + if m.rootless && m.config.Path == "" { + return nil + } + return err + } + fields := strings.Fields(data) + m.controllers = make(map[string]struct{}, len(fields)) + for _, c := range fields { + m.controllers[c] = struct{}{} + } + + return nil +} + +func (m *manager) Apply(pid int) error { + if err := CreateCgroupPath(m.dirPath, m.config); err != nil { + // Related tests: + // - "runc create (no limits + no cgrouppath + no permission) succeeds" + // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" + // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if m.rootless { + if m.config.Path == "" { + if blNeed, nErr := needAnyControllers(m.config); nErr == nil && !blNeed { + return nil + } + return errors.Wrap(err, "rootless needs no limits + no cgrouppath when no permission is granted for cgroups") + } + } + return err + } + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.dirPath) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.dirPath) +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + var ( + errs []error + ) + + st := cgroups.NewStats() + if err := m.getControllers(); err != nil { + return st, err 
+ } + + // pids (since kernel 4.5) + if _, ok := m.controllers["pids"]; ok { + if err := statPids(m.dirPath, st); err != nil { + errs = append(errs, err) + } + } else { + if err := statPidsWithoutController(m.dirPath, st); err != nil { + errs = append(errs, err) + } + } + // memory (since kernel 4.5) + if _, ok := m.controllers["memory"]; ok { + if err := statMemory(m.dirPath, st); err != nil { + errs = append(errs, err) + } + } + // io (since kernel 4.5) + if _, ok := m.controllers["io"]; ok { + if err := statIo(m.dirPath, st); err != nil { + errs = append(errs, err) + } + } + // cpu (since kernel 4.15) + if _, ok := m.controllers["cpu"]; ok { + if err := statCpu(m.dirPath, st); err != nil { + errs = append(errs, err) + } + } + // hugetlb (since kernel 5.6) + if _, ok := m.controllers["hugetlb"]; ok { + if err := statHugeTlb(m.dirPath, st); err != nil { + errs = append(errs, err) + } + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + if len(errs) > 0 && !m.rootless { + return st, errors.Errorf("error while statting cgroup v2: %+v", errs) + } + return st, nil +} + +func (m *manager) Freeze(state configs.FreezerState) error { + if err := setFreezer(m.dirPath, state); err != nil { + return err + } + m.config.Resources.Freezer = state + return nil +} + +func (m *manager) Destroy() error { + return cgroups.RemovePath(m.dirPath) +} + +func (m *manager) Path(_ string) string { + return m.dirPath +} + +func (m *manager) Set(container *configs.Config) error { + if container == nil || container.Cgroups == nil { + return nil + } + if err := m.getControllers(); err != nil { + return err + } + // pids (since kernel 4.5) + if err := setPids(m.dirPath, container.Cgroups); err != nil { + return err + } + // memory (since kernel 4.5) + if err := setMemory(m.dirPath, container.Cgroups); err != nil { + return err + } + // io (since kernel 4.5) + if err := setIo(m.dirPath, 
container.Cgroups); err != nil { + return err + } + // cpu (since kernel 4.15) + if err := setCpu(m.dirPath, container.Cgroups); err != nil { + return err + } + // devices (since kernel 4.15, pseudo-controller) + // + // When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, container.Cgroups); err != nil && !m.rootless { + return err + } + // cpuset (since kernel 5.0) + if err := setCpuset(m.dirPath, container.Cgroups); err != nil { + return err + } + // hugetlb (since kernel 5.6) + if err := setHugeTlb(m.dirPath, container.Cgroups); err != nil { + return err + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, container.Cgroups); err != nil { + return err + } + // freezer (since kernel 5.2, pseudo-controller) + if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil { + return err + } + if err := m.setUnified(container.Cgroups.Unified); err != nil { + return err + } + m.config = container.Cgroups + return nil +} + +func (m *manager) setUnified(res map[string]string) error { + for k, v := range res { + if strings.Contains(k, "/") { + return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if err := fscommon.WriteFile(m.dirPath, k, v); err != nil { + errC := errors.Cause(err) + // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. + if errors.Is(errC, os.ErrPermission) || errors.Is(errC, os.ErrNotExist) { + // Check if a controller is available, + // to give more specific error if not. 
+ sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + c := sk[0] + if _, ok := m.controllers[c]; !ok && c != "cgroup" { + return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) + } + } + return errors.Wrapf(err, "can't set unified resource %q", k) + } + } + + return nil +} + +func (m *manager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.dirPath + return paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.config, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + return getFreezer(m.dirPath) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.dirPath) +} + +func (m *manager) CreateChildCgroup(config *configs.Config) error { + + // Change the cgroup ownership to match the root user in the system + // container (needed for delegation). + path := m.dirPath + + rootuid, err := config.HostRootUID() + if err != nil { + return err + } + rootgid, err := config.HostRootGID() + if err != nil { + return err + } + + if err := os.Chown(path, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner of cgroup %s", path) + } + + // Change ownership of some of the files inside the sys container's cgroup; + // for cgroups v2 we only change the ownership of a subset of the files, as + // specified in section "Cgroups Delegation: Delegating a Hierarchy to a Less + // Privileged User" in cgroups(7). 
+ files, err := ioutil.ReadDir(path) + if err != nil { + return err + } + for _, file := range files { + fname := file.Name() + + if fname == "cgroup.procs" || + fname == "cgroup.subtree_control" || + fname == "cgroup.threads" { + + absFileName := filepath.Join(path, fname) + if err := os.Chown(absFileName, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner for file %s", absFileName) + } + } + } + + // Create a leaf cgroup to be used for the sys container's init process (and + // for all its child processes). Its purpose is to prevent processes from + // living in the sys container's cgroup root, because once inner sub-cgroups + // are created, the kernel considers the sys container's cgroup root an + // intermediate node in the global cgroup hierarchy. This in turn forces all + // sub-groups inside the sys container to be of "domain-invalid" type (and + // thus prevents domain cgroup controllers such as the memory controller + // from being applied inside the sys container). + // + // We choose the name "init.scope" for the leaf cgroup because it works well + // in sys containers that carry systemd, as well as those that don't. In both + // cases, the sys container's init processes are placed in the init.scope + // cgroup. For sys container's with systemd, systemd then moves the processes + // to other sub-cgroups it manages. + // + // Note that processes that enter the sys container via "exec" will also + // be placed in this sub-cgroup. 
+ + leafPath := filepath.Join(path, "init.scope") + if err = os.MkdirAll(leafPath, 0755); err != nil { + return err + } + + if err := os.Chown(leafPath, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner of cgroup %s", leafPath) + } + + files, err = ioutil.ReadDir(leafPath) + if err != nil { + return err + } + for _, file := range files { + fname := file.Name() + + if fname == "cgroup.procs" || + fname == "cgroup.subtree_control" || + fname == "cgroup.threads" { + + absFileName := filepath.Join(leafPath, fname) + if err := os.Chown(absFileName, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner for file %s", absFileName) + } + } + } + + return nil +} + +func (m *manager) ApplyChildCgroup(pid int) error { + paths := make(map[string]string, 1) + paths[""] = filepath.Join(m.dirPath, "init.scope") + return cgroups.EnterPid(paths, pid) +} + +func (m *manager) GetChildCgroupPaths() map[string]string { + return m.GetPaths() +} + +func (m *manager) GetType() cgroups.CgroupType { + return cgroups.Cgroup_v2_fs +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/hugetlb.go b/sysbox-runc/libcontainer/cgroups/fs2/hugetlb.go new file mode 100644 index 00000000..18cd411c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/hugetlb.go @@ -0,0 +1,61 @@ +// +build linux + +package fs2 + +import ( + "strconv" + + "github.com/pkg/errors" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isHugeTlbSet(cgroup *configs.Cgroup) bool { + return len(cgroup.Resources.HugetlbLimit) > 0 +} + +func setHugeTlb(dirPath string, cgroup *configs.Cgroup) error { + if !isHugeTlbSet(cgroup) { + return nil + } + for _, hugetlb := range cgroup.Resources.HugetlbLimit { + if err := fscommon.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + 
} + + return nil +} + +func statHugeTlb(dirPath string, stats *cgroups.Stats) error { + hugePageSizes, err := cgroups.GetHugePageSize() + if err != nil { + return errors.Wrap(err, "failed to fetch hugetlb info") + } + hugetlbStats := cgroups.HugetlbStats{} + + for _, pagesize := range hugePageSizes { + value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") + if err != nil { + return err + } + hugetlbStats.Usage = value + + fileName := "hugetlb." + pagesize + ".events" + contents, err := fscommon.ReadFile(dirPath, fileName) + if err != nil { + return errors.Wrap(err, "failed to read stats") + } + _, value, err = fscommon.GetCgroupParamKeyValue(contents) + if err != nil { + return errors.Wrap(err, "failed to parse "+fileName) + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pagesize] = hugetlbStats + } + + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/io.go b/sysbox-runc/libcontainer/cgroups/fs2/io.go new file mode 100644 index 00000000..e01ea001 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/io.go @@ -0,0 +1,134 @@ +// +build linux + +package fs2 + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isIoSet(cgroup *configs.Cgroup) bool { + return cgroup.Resources.BlkioWeight != 0 || + len(cgroup.Resources.BlkioThrottleReadBpsDevice) > 0 || + len(cgroup.Resources.BlkioThrottleWriteBpsDevice) > 0 || + len(cgroup.Resources.BlkioThrottleReadIOPSDevice) > 0 || + len(cgroup.Resources.BlkioThrottleWriteIOPSDevice) > 0 +} + +func setIo(dirPath string, cgroup *configs.Cgroup) error { + if !isIoSet(cgroup) { + return nil + } + + if cgroup.Resources.BlkioWeight != 0 { + filename := "io.bfq.weight" + if err := fscommon.WriteFile(dirPath, filename, + 
strconv.FormatUint(cgroups.ConvertBlkIOToCgroupV2Value(cgroup.Resources.BlkioWeight), 10)); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { + if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { + ret := map[string][]string{} + f, err := fscommon.OpenFile(dirPath, name, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, err + } + return ret, nil +} + +func statIo(dirPath string, stats *cgroups.Stats) error { + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var ioServiceBytesRecursive []cgroups.BlkioStatEntry + values, err := readCgroup2MapFile(dirPath, "io.stat") + if err != nil { + return err + } + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + major, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + minor, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + 
continue + } + op := d[0] + + // Accommodate the cgroup v1 naming + switch op { + case "rbytes": + op = "read" + case "wbytes": + op = "write" + } + + value, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + } + stats.BlkioStats = cgroups.BlkioStats{IoServiceBytesRecursive: ioServiceBytesRecursive} + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/memory.go b/sysbox-runc/libcontainer/cgroups/fs2/memory.go new file mode 100644 index 00000000..1c6913bf --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/memory.go @@ -0,0 +1,137 @@ +// +build linux + +package fs2 + +import ( + "bufio" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" +) + +// numToStr converts an int64 value to a string for writing to a +// cgroupv2 files with .min, .max, .low, or .high suffix. +// The value of -1 is converted to "max" for cgroupv1 compatibility +// (which used to write -1 to remove the limit). 
+func numToStr(value int64) (ret string) { + switch { + case value == 0: + ret = "" + case value == -1: + ret = "max" + default: + ret = strconv.FormatInt(value, 10) + } + + return ret +} + +func isMemorySet(cgroup *configs.Cgroup) bool { + return cgroup.Resources.MemoryReservation != 0 || + cgroup.Resources.Memory != 0 || cgroup.Resources.MemorySwap != 0 +} + +func setMemory(dirPath string, cgroup *configs.Cgroup) error { + if !isMemorySet(cgroup) { + return nil + } + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(cgroup.Resources.MemorySwap, cgroup.Resources.Memory) + if err != nil { + return err + } + swapStr := numToStr(swap) + if swapStr == "" && swap == 0 && cgroup.Resources.MemorySwap > 0 { + // memory and memorySwap set to the same value -- disable swap + swapStr = "0" + } + // never write empty string to `memory.swap.max`, it means set to 0. + if swapStr != "" { + if err := fscommon.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { + return err + } + } + + if val := numToStr(cgroup.Resources.Memory); val != "" { + if err := fscommon.WriteFile(dirPath, "memory.max", val); err != nil { + return err + } + } + + // cgroup.Resources.KernelMemory is ignored + + if val := numToStr(cgroup.Resources.MemoryReservation); val != "" { + if err := fscommon.WriteFile(dirPath, "memory.low", val); err != nil { + return err + } + } + + return nil +} + +func statMemory(dirPath string, stats *cgroups.Stats) error { + // Set stats from memory.stat. 
+ statsFile, err := fscommon.OpenFile(dirPath, "memory.stat", os.O_RDONLY) + if err != nil { + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text()) + if err != nil { + return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text()) + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryDataV2(dirPath, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryDataV2(dirPath, "swap") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + + stats.MemoryStats.UseHierarchy = true + return nil +} + +func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." + name + } + usage := moduleName + ".current" + limit := moduleName + ".max" + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage) + } + memoryData.Usage = value + + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + if moduleName != "memory" && os.IsNotExist(err) { + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit) + } + memoryData.Limit = value + + return memoryData, nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fs2/pids.go b/sysbox-runc/libcontainer/cgroups/fs2/pids.go new file mode 100644 index 00000000..16e1c219 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fs2/pids.go @@ -0,0 +1,78 @@ +// +build linux + +package fs2 + +import ( + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + 
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +func isPidsSet(cgroup *configs.Cgroup) bool { + return cgroup.Resources.PidsLimit != 0 +} + +func setPids(dirPath string, cgroup *configs.Cgroup) error { + if !isPidsSet(cgroup) { + return nil + } + if val := numToStr(cgroup.Resources.PidsLimit); val != "" { + if err := fscommon.WriteFile(dirPath, "pids.max", val); err != nil { + return err + } + } + + return nil +} + +func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error { + // if the controller is not enabled, let's read PIDS from cgroups.procs + // (or threads if cgroup.threads is enabled) + contents, err := fscommon.ReadFile(dirPath, "cgroup.procs") + if errors.Is(err, unix.ENOTSUP) { + contents, err = fscommon.ReadFile(dirPath, "cgroup.threads") + } + if err != nil { + return err + } + pids := make(map[string]string) + for _, i := range strings.Split(contents, "\n") { + if i != "" { + pids[i] = i + } + } + stats.PidsStats.Current = uint64(len(pids)) + stats.PidsStats.Limit = 0 + return nil +} + +func statPids(dirPath string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") + if err != nil { + return errors.Wrap(err, "failed to parse pids.current") + } + + maxString, err := fscommon.GetCgroupParamString(dirPath, "pids.max") + if err != nil { + return errors.Wrap(err, "failed to parse pids.max") + } + + // Default if pids.max == "max" is 0 -- which represents "no limit". 
+ var max uint64 + if maxString != "max" { + max, err = fscommon.ParseUint(maxString, 10, 64) + if err != nil { + return errors.Wrapf(err, "failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", + maxString, filepath.Join(dirPath, "pids.max")) + } + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fscommon/rdma.go b/sysbox-runc/libcontainer/cgroups/fscommon/rdma.go new file mode 100644 index 00000000..9a74ebf7 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fscommon/rdma.go @@ -0,0 +1,121 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + parts := strings.SplitN(raw, "=", 3) + + if len(parts) != 2 { + return errors.New("Unable to parse RDMA entry") + } + + k, v := parts[0], parts[1] + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + if k == "hca_handle" { + entry.HcaHandles = value + } else if k == "hca_object" { + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of raw strings to RdmaEntries from file. 
+// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits configs.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. 
+func RdmaSet(path string, cgroup *configs.Cgroup) error { + for device, limits := range cgroup.Resources.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fscommon/rdma_test.go b/sysbox-runc/libcontainer/cgroups/fscommon/rdma_test.go new file mode 100644 index 00000000..c61113f2 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fscommon/rdma_test.go @@ -0,0 +1,67 @@ +package fscommon + +import ( + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* Roadmap for future */ +// (Low-priority) TODO: Check if it is possible to virtually mimic an actual RDMA device. +// TODO: Think of more edge-cases to add. + +// TestRdmaSet performs an E2E test of RdmaSet(), parseRdmaKV() using a dummy device and a dummy cgroup file-system. +// Note: The following test does not guarantee that your host supports RDMA, since this mocks the underlying infrastructure. +func TestRdmaSet(t *testing.T) { + + // XXX: sysbox-runc: + // Test fails with: rdma_test.go:44: open /tmp/TestRdmaSet933593826/001/rdma/rdma.max: no such file or directory + // Skip for now. + return + + testCgroupPath := filepath.Join(t.TempDir(), "rdma") + + // Ensure the full mock cgroup path exists. + err := os.Mkdir(testCgroupPath, 0o755) + if err != nil { + t.Fatal(err) + } + + rdmaDevice := "mlx5_1" + maxHandles := uint32(100) + maxObjects := uint32(300) + + rdmaStubResource := &configs.Resources{ + Rdma: map[string]configs.LinuxRdma{ + rdmaDevice: { + HcaHandles: &maxHandles, + HcaObjects: &maxObjects, + }, + }, + } + + cgroup := &configs.Cgroup{ + Resources: rdmaStubResource, + } + + if err := RdmaSet(testCgroupPath, cgroup); err != nil { + t.Fatal(err) + } + + // The default rdma.max must be written. 
+ rdmaEntries, err := readRdmaEntries(testCgroupPath, "rdma.max") + if err != nil { + t.Fatal(err) + } + if len(rdmaEntries) != 1 { + t.Fatal("rdma_test: Got the wrong values while parsing entries from rdma.max") + } + if rdmaEntries[0].HcaHandles != maxHandles { + t.Fatalf("rdma_test: Got the wrong value for hca_handles") + } + if rdmaEntries[0].HcaObjects != maxObjects { + t.Fatalf("rdma_test: Got the wrong value for hca_Objects") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/fscommon/utils.go b/sysbox-runc/libcontainer/cgroups/fscommon/utils.go new file mode 100644 index 00000000..1590b8cd --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fscommon/utils.go @@ -0,0 +1,109 @@ +// +build linux + +package fscommon + +import ( + "errors" + "fmt" + "math" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +var ( + ErrNotValidFormat = errors.New("line is not a valid key value format") + + OpenFile = cgroups.OpenFile + ReadFile = cgroups.ReadFile + WriteFile = cgroups.WriteFile + CopyFile = cgroups.CopyFile +) + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// GetCgroupParamKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its components. For example, "io_service_bytes 1234" +// will return as "io_service_bytes", 1234. 
+func GetCgroupParamKeyValue(t string) (string, uint64, error) { + parts := strings.Fields(t) + switch len(parts) { + case 2: + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, fmt.Errorf("unable to convert to uint64: %v", err) + } + + return parts[0], value, nil + default: + return "", 0, ErrNotValidFormat + } +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. +func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse file %q", path+"/"+file) + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file) + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. 
+func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/sysbox-runc/libcontainer/cgroups/fscommon/utils_test.go b/sysbox-runc/libcontainer/cgroups/fscommon/utils_test.go new file mode 100644 index 00000000..d0c5668b --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/fscommon/utils_test.go @@ -0,0 +1,97 @@ +// +build linux + +package fscommon + +import ( + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "testing" +) + +const ( + cgroupFile = "cgroup.file" + floatValue = 2048.0 + floatString = "2048" +) + +func TestGetCgroupParamsInt(t *testing.T) { + // Setup tempdir. + tempDir, err := ioutil.TempDir("", "cgroup_utils_test") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tempDir) + tempFile := filepath.Join(tempDir, cgroupFile) + + // Success. + err = ioutil.WriteFile(tempFile, []byte(floatString), 0755) + if err != nil { + t.Fatal(err) + } + value, err := GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with new line. 
+ err = ioutil.WriteFile(tempFile, []byte(floatString+"\n"), 0755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != floatValue { + t.Fatalf("Expected %d to equal %f", value, floatValue) + } + + // Success with negative values + err = ioutil.WriteFile(tempFile, []byte("-12345"), 0755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Success with negative values lesser than min int64 + s := strconv.FormatFloat(math.MinInt64, 'f', -1, 64) + err = ioutil.WriteFile(tempFile, []byte(s), 0755) + if err != nil { + t.Fatal(err) + } + value, err = GetCgroupParamUint(tempDir, cgroupFile) + if err != nil { + t.Fatal(err) + } else if value != 0 { + t.Fatalf("Expected %d to equal %d", value, 0) + } + + // Not a float. + err = ioutil.WriteFile(tempFile, []byte("not-a-float"), 0755) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } + + // Unknown file. + err = os.Remove(tempFile) + if err != nil { + t.Fatal(err) + } + _, err = GetCgroupParamUint(tempDir, cgroupFile) + if err == nil { + t.Fatal("Expecting error, got none") + } +} diff --git a/sysbox-runc/libcontainer/cgroups/stats.go b/sysbox-runc/libcontainer/cgroups/stats.go new file mode 100644 index 00000000..8f7301da --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/stats.go @@ -0,0 +1,175 @@ +// +build linux + +package cgroups + +type ThrottlingData struct { + // Number of periods with throttling active + Periods uint64 `json:"periods,omitempty"` + // Number of periods when the container hit its throttling limit. + ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` + // Aggregate time the container was throttled for in nanoseconds. 
+ ThrottledTime uint64 `json:"throttled_time,omitempty"` +} + +// CpuUsage denotes the usage of a CPU. +// All CPU stats are aggregate since container inception. +type CpuUsage struct { + // Total CPU time consumed. + // Units: nanoseconds. + TotalUsage uint64 `json:"total_usage,omitempty"` + // Total CPU time consumed per core. + // Units: nanoseconds. + PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // CPU time consumed per core in kernel mode + // Units: nanoseconds. + PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` + // CPU time consumed per core in user mode + // Units: nanoseconds. + PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` +} + +type CPUSetStats struct { + // List of the physical numbers of the CPUs on which processes + // in that cpuset are allowed to execute + CPUs []uint16 `json:"cpus,omitempty"` + // cpu_exclusive flag + CPUExclusive uint64 `json:"cpu_exclusive"` + // List of memory nodes on which processes in that cpuset + // are allowed to allocate memory + Mems []uint16 `json:"mems,omitempty"` + // mem_hardwall flag + MemHardwall uint64 `json:"mem_hardwall"` + // mem_exclusive flag + MemExclusive uint64 `json:"mem_exclusive"` + // memory_migrate flag + MemoryMigrate uint64 `json:"memory_migrate"` + // memory_spread page flag + MemorySpreadPage uint64 `json:"memory_spread_page"` + // memory_spread slab flag + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + // memory_pressure + MemoryPressure uint64 `json:"memory_pressure"` + // sched_load balance flag + SchedLoadBalance uint64 
`json:"sched_load_balance"` + // sched_relax_domain_level + SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + +type MemoryData struct { + Usage uint64 `json:"usage,omitempty"` + MaxUsage uint64 `json:"max_usage,omitempty"` + Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` +} + +type MemoryStats struct { + // memory used for cache + Cache uint64 `json:"cache,omitempty"` + // usage of memory + Usage MemoryData `json:"usage,omitempty"` + // usage of memory + swap + SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // usage of memory pages by NUMA node + // see chapter 5.6 of memory controller documentation + PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` + // if true, memory usage is accounted for throughout a hierarchy of cgroups. + UseHierarchy bool `json:"use_hierarchy"` + + Stats map[string]uint64 `json:"stats,omitempty"` +} + +type PageUsageByNUMA struct { + // Embedding is used as types can't be recursive. 
+ PageUsageByNUMAInner + Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` +} + +type PageUsageByNUMAInner struct { + Total PageStats `json:"total,omitempty"` + File PageStats `json:"file,omitempty"` + Anon PageStats `json:"anon,omitempty"` + Unevictable PageStats `json:"unevictable,omitempty"` +} + +type PageStats struct { + Total uint64 `json:"total,omitempty"` + Nodes map[uint8]uint64 `json:"nodes,omitempty"` +} + +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes transferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` + IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` + IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` + IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` +} + +type HugetlbStats struct { + // current res_counter usage for hugetlb + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // number of hugetlb usage allocation failures. 
+ Failcnt uint64 `json:"failcnt"` +} + +type RdmaEntry struct { + Device string `json:"device,omitempty"` + HcaHandles uint32 `json:"hca_handles,omitempty"` + HcaObjects uint32 `json:"hca_objects,omitempty"` +} + +type RdmaStats struct { + RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` + RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` + // the map is in the format "size of hugepage: stats of the hugepage" + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + RdmaStats RdmaStats `json:"rdma_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + hugetlbStats := make(map[string]HugetlbStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/common.go b/sysbox-runc/libcontainer/cgroups/systemd/common.go new file mode 100644 index 00000000..6d5def71 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/common.go @@ -0,0 +1,471 @@ +package systemd + +import ( + "bufio" + "fmt" + "math" + "os" + "regexp" + "strconv" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +const ( + // Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2. 
+ // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and + // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + defCPUQuotaPeriod = uint64(100000) +) + +var ( + connOnce sync.Once + connDbus *systemdDbus.Conn + connErr error + + versionOnce sync.Once + version int + + isRunningSystemdOnce sync.Once + isRunningSystemd bool +) + +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func IsRunningSystemd() bool { + isRunningSystemdOnce.Do(func() { + fi, err := os.Lstat("/run/systemd/system") + isRunningSystemd = err == nil && fi.IsDir() + }) + return isRunningSystemd +} + +// systemd represents slice hierarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// /test.slice/test-a.slice/test-a-b.slice. +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. 
+ if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} + +func groupPrefix(ruleType devices.Type) (string, error) { + switch ruleType { + case devices.BlockDevice: + return "block-", nil + case devices.CharDevice: + return "char-", nil + default: + return "", errors.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as required for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType devices.Type + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = devices.BlockDevice + continue + case "Character devices:": + currentType = devices.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). 
+ var (
+ currMajor int64
+ currName string
+ )
+ if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
+ if err == nil {
+ err = errors.Errorf("wrong number of fields")
+ }
+ return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
+ }
+
+ if currMajor == ruleMajor {
+ return prefix + currName, nil
+ }
+ }
+ if err := scanner.Err(); err != nil {
+ return "", errors.Wrap(err, "reading /proc/devices")
+ }
+ // Couldn't find the device group.
+ return "", nil
+}
+
+// generateDeviceProperties takes the configured device rules and generates a
+// corresponding set of systemd properties to configure the devices correctly.
+func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, error) {
+ // DeviceAllow is the type "a(ss)" which means we need a temporary struct
+ // to represent it in Go.
+ type deviceAllowEntry struct {
+ Path string
+ Perms string
+ }
+
+ properties := []systemdDbus.Property{
+ // Always run in the strictest white-list mode.
+ newProp("DevicePolicy", "strict"),
+ // Empty the DeviceAllow array before filling it.
+ newProp("DeviceAllow", []deviceAllowEntry{}),
+ }
+
+ // Figure out the set of rules.
+ configEmu := &cgroupdevices.Emulator{}
+ for _, rule := range rules {
+ if err := configEmu.Apply(*rule); err != nil {
+ return nil, errors.Wrap(err, "apply rule for systemd")
+ }
+ }
+ // systemd doesn't support blacklists. So we log a warning, and tell
+ // systemd to act as a deny-all whitelist. This ruleset will be replaced
+ // with our normal fallback code. This may result in spurious errors, but
+ // the only other option is to error out here.
+ if configEmu.IsBlacklist() {
+ // However, if we're dealing with an allow-all rule then we can do it.
+ if configEmu.IsAllowAll() {
+ return []systemdDbus.Property{
+ // Run in white-list mode by setting to "auto" and removing all
+ // DeviceAllow rules. 
+ newProp("DevicePolicy", "auto"),
+ newProp("DeviceAllow", []deviceAllowEntry{}),
+ }, nil
+ }
+ logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
+ return properties, nil
+ }
+
+ // Now generate the set of rules we actually need to apply. Unlike the
+ // normal devices cgroup, in "strict" mode systemd defaults to a deny-all
+ // whitelist which is the default for devices.Emulator.
+ baseEmu := &cgroupdevices.Emulator{}
+ finalRules, err := baseEmu.Transition(configEmu)
+ if err != nil {
+ return nil, errors.Wrap(err, "get simplified rules for systemd")
+ }
+ var deviceAllowList []deviceAllowEntry
+ for _, rule := range finalRules {
+ if !rule.Allow {
+ // Should never happen.
+ return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
+ }
+ switch rule.Type {
+ case devices.BlockDevice, devices.CharDevice:
+ default:
+ // Should never happen.
+ return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
+ }
+
+ entry := deviceAllowEntry{
+ Perms: string(rule.Permissions),
+ }
+
+ // systemd has a fairly odd (though understandable) syntax here, and
+ // because of the OCI configuration format we have to do quite a bit of
+ // trickery to convert things:
+ //
+ // * Concrete rules with non-wildcard major/minor numbers have to use
+ // /dev/{block,char} paths. This is slightly odd because it means
+ // that we cannot add whitelist rules for devices that don't exist,
+ // but there's not too much we can do about that.
+ //
+ // However, path globbing is not supported for path-based rules so we
+ // need to handle wildcards in some other manner.
+ //
+ // * Wildcard-minor rules have to specify a "device group name" (the
+ // second column in /proc/devices).
+ //
+ // * Wildcard (major and minor) rules can just specify a glob with the
+ // type ("char-*" or "block-*"). 
+ // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == devices.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != devices.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == devices.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. 
+ switch rule.Type { + case devices.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case devices.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + +// getDbusConnection lazy initializes systemd dbus connection +// and returns it +func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) { + connOnce.Do(func() { + if rootless { + connDbus, connErr = NewUserSystemdDbus() + } else { + connDbus, connErr = systemdDbus.New() + } + }) + return connDbus, connErr +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func getUnitName(c *configs.Cgroup) string { + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return c.ScopePrefix + "-" + c.Name + ".scope" + } + return c.Name +} + +// isUnitExists returns true if the error is that a systemd unit already exists. 
+func isUnitExists(err error) bool { + if err != nil { + if dbusError, ok := err.(dbus.Error); ok { + return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists") + } + } + return false +} + +func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error { + statusChan := make(chan string, 1) + if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit + if s != "done" { + dbusConnection.ResetFailedUnit(unitName) + return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + } + case <-timeout.C: + dbusConnection.ResetFailedUnit(unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) + } + } else if !isUnitExists(err) { + return err + } + + return nil +} + +func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error { + statusChan := make(chan string, 1) + if _, err := dbusConnection.StopUnit(unitName, "replace", statusChan); err == nil { + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit + if s != "done" { + logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s) + } + case <-time.After(time.Second): + logrus.Warnf("Timed out while waiting for StopUnit(%s) completion signal from dbus. 
Continuing...", unitName) + } + } + return nil +} + +func systemdVersion(conn *systemdDbus.Conn) int { + versionOnce.Do(func() { + version = -1 + verStr, err := conn.GetManagerProperty("Version") + if err == nil { + version, err = systemdVersionAtoi(verStr) + } + + if err != nil { + logrus.WithError(err).Error("unable to get systemd version") + } + }) + + return version +} + +func systemdVersionAtoi(verStr string) (int, error) { + // verStr should be of the form: + // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" + // all the input strings include quotes, and the output int should be 245 + // thus, we unconditionally remove the `"v` + // and then match on the first integer we can grab + re := regexp.MustCompile(`"?v?([0-9]+)`) + matches := re.FindStringSubmatch(verStr) + if len(matches) < 2 { + return 0, errors.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches) + } + ver, err := strconv.Atoi(matches[1]) + return ver, errors.Wrapf(err, "can't parse version %s", verStr) +} + +func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) { + if period != 0 { + // systemd only supports CPUQuotaPeriodUSec since v242 + sdVer := systemdVersion(conn) + if sdVer >= 242 { + *properties = append(*properties, + newProp("CPUQuotaPeriodUSec", period)) + } else { + logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+ + " (setting will still be applied to cgroupfs)", sdVer) + } + } + if quota != 0 || period != 0 { + // corresponds to USEC_INFINITY in systemd + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + if period == 0 { + // assume the default + period = defCPUQuotaPeriod + } + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. 
This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + *properties = append(*properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } +} + +func addCpuset(conn *systemdDbus.Conn, props *[]systemdDbus.Property, cpus, mems string) error { + if cpus == "" && mems == "" { + return nil + } + + // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 + sdVer := systemdVersion(conn) + if sdVer < 244 { + logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ + " (settings will still be applied to cgroupfs)", sdVer) + return nil + } + + if cpus != "" { + bits, err := rangeToBits(cpus) + if err != nil { + return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", + cpus, err) + } + *props = append(*props, + newProp("AllowedCPUs", bits)) + } + if mems != "" { + bits, err := rangeToBits(mems) + if err != nil { + return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", + mems, err) + } + *props = append(*props, + newProp("AllowedMemoryNodes", bits)) + } + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/cpuset.go b/sysbox-runc/libcontainer/cgroups/systemd/cpuset.go new file mode 100644 index 00000000..07098218 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/cpuset.go @@ -0,0 +1,67 @@ +package systemd + +import ( + "encoding/binary" + "strconv" + "strings" + + "github.com/pkg/errors" + "github.com/willf/bitset" +) + +// rangeToBits converts a text representation of a CPU mask (as written to +// or read from cgroups' cpuset.* files, e.g. 
"1,3-5") to a slice of bytes +// with the corresponding bits set (as consumed by systemd over dbus as +// AllowedCPUs/AllowedMemoryNodes unit property value). +func rangeToBits(str string) ([]byte, error) { + bits := &bitset.BitSet{} + + for _, r := range strings.Split(str, ",") { + // allow extra spaces around + r = strings.TrimSpace(r) + // allow empty elements (extra commas) + if r == "" { + continue + } + ranges := strings.SplitN(r, "-", 2) + if len(ranges) > 1 { + start, err := strconv.ParseUint(ranges[0], 10, 32) + if err != nil { + return nil, err + } + end, err := strconv.ParseUint(ranges[1], 10, 32) + if err != nil { + return nil, err + } + if start > end { + return nil, errors.New("invalid range: " + r) + } + for i := uint(start); i <= uint(end); i++ { + bits.Set(i) + } + } else { + val, err := strconv.ParseUint(ranges[0], 10, 32) + if err != nil { + return nil, err + } + bits.Set(uint(val)) + } + } + + val := bits.Bytes() + if len(val) == 0 { + // do not allow empty values + return nil, errors.New("empty value") + } + ret := make([]byte, len(val)*8) + for i := range val { + // bitset uses BigEndian internally + binary.BigEndian.PutUint64(ret[i*8:], val[len(val)-1-i]) + } + // remove upper all-zero bytes + for ret[0] == 0 { + ret = ret[1:] + } + + return ret, nil +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/cpuset_test.go b/sysbox-runc/libcontainer/cgroups/systemd/cpuset_test.go new file mode 100644 index 00000000..7dda1e3c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/cpuset_test.go @@ -0,0 +1,55 @@ +package systemd + +import ( + "bytes" + "testing" +) + +func TestRangeToBits(t *testing.T) { + testCases := []struct { + in string + out []byte + isErr bool + }{ + {in: "", isErr: true}, + {in: "0", out: []byte{1}}, + {in: "1", out: []byte{2}}, + {in: "0-1", out: []byte{3}}, + {in: "0,1", out: []byte{3}}, + {in: ",0,1,", out: []byte{3}}, + {in: "0-3", out: []byte{0x0f}}, + {in: "0,1,2-3", out: []byte{0x0f}}, + {in: "4-7", out: 
[]byte{0xf0}}, + {in: "0-7", out: []byte{0xff}}, + {in: "0-15", out: []byte{0xff, 0xff}}, + {in: "16", out: []byte{1, 0, 0}}, + {in: "0-3,32-33", out: []byte{3, 0, 0, 0, 0x0f}}, + // extra spaces and tabs are ok + {in: "1, 2, 1-2", out: []byte{6}}, + {in: " , 1 , 3 , 5-7, ", out: []byte{0xea}}, + // somewhat large values + {in: "128-130,1", out: []byte{7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2}}, + + {in: "-", isErr: true}, + {in: "1-", isErr: true}, + {in: "-3", isErr: true}, + // bad range (start > end) + {in: "54-53", isErr: true}, + // kernel does not allow extra spaces inside a range + {in: "1 - 2", isErr: true}, + } + + for _, tc := range testCases { + out, err := rangeToBits(tc.in) + if err != nil { + if !tc.isErr { + t.Errorf("case %q: unexpected error: %v", tc.in, err) + } + + continue + } + if !bytes.Equal(out, tc.out) { + t.Errorf("case %q: expected %v, got %v", tc.in, tc.out, out) + } + } +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/systemd_test.go b/sysbox-runc/libcontainer/cgroups/systemd/systemd_test.go new file mode 100644 index 00000000..0c98a117 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/systemd_test.go @@ -0,0 +1,32 @@ +package systemd + +import ( + "testing" +) + +func TestSystemdVersion(t *testing.T) { + var systemdVersionTests = []struct { + verStr string + expectedVer int + expectErr bool + }{ + {`"219"`, 219, false}, + {`"v245.4-1.fc32"`, 245, false}, + {`"241-1"`, 241, false}, + {`"v241-1"`, 241, false}, + {"NaN", 0, true}, + {"", 0, true}, + } + for _, sdTest := range systemdVersionTests { + ver, err := systemdVersionAtoi(sdTest.verStr) + if !sdTest.expectErr && err != nil { + t.Errorf("systemdVersionAtoi(%s); want nil; got %v", sdTest.verStr, err) + } + if sdTest.expectErr && err == nil { + t.Errorf("systemdVersionAtoi(%s); wanted failure; got nil", sdTest.verStr) + } + if ver != sdTest.expectedVer { + t.Errorf("systemdVersionAtoi(%s); want %d; got %d", sdTest.verStr, sdTest.expectedVer, ver) + } + } 
+} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/unsupported.go b/sysbox-runc/libcontainer/cgroups/systemd/unsupported.go new file mode 100644 index 00000000..ed5da692 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/unsupported.go @@ -0,0 +1,84 @@ +// +build !linux + +package systemd + +import ( + "errors" + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func IsRunningSystemd() bool { + return false +} + +func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) { + return nil, errors.New("Systemd not supported") +} + +func (m *Manager) Apply(pid int) error { + return errors.New("Systemd not supported") +} + +func (m *Manager) GetPids() ([]int, error) { + return nil, errors.New("Systemd not supported") +} + +func (m *Manager) GetAllPids() ([]int, error) { + return nil, errors.New("Systemd not supported") +} + +func (m *Manager) Destroy() error { + return errors.New("Systemd not supported") +} + +func (m *Manager) GetPaths() map[string]string { + return nil +} + +func (m *Manager) Path(_ string) string { + return "" +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + return nil, errors.New("Systemd not supported") +} + +func (m *Manager) Set(container *configs.Config) error { + return errors.New("Systemd not supported") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + return errors.New("Systemd not supported") +} + +func Freeze(c *configs.Cgroup, state configs.FreezerState) error { + return errors.New("Systemd not supported") +} + +func (m *Manager) GetCgroups() (*configs.Cgroup, error) { + return nil, errors.New("Systemd not supported") +} + +func (m *Manager) Exists() bool { + return false +} + +func (m *Manager) CreateChildCgroup(container *configs.Config) error { + return fmt.Errorf("Systemd not 
supported") +} + +func (m *Manager) ApplyChildCgroup(pid int) error { + return fmt.Errorf("Systemd not supported") +} + +func (m *Manager) GetChildCgroupPaths() map[string]string { + return nil +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/user.go b/sysbox-runc/libcontainer/cgroups/systemd/user.go new file mode 100644 index 00000000..8fe91688 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/user.go @@ -0,0 +1,106 @@ +// +build linux + +package systemd + +import ( + "bufio" + "bytes" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/pkg/errors" +) + +// NewUserSystemdDbus creates a connection for systemd user-instance. +func NewUserSystemdDbus() (*systemdDbus.Conn, error) { + addr, err := DetectUserDbusSessionBusAddress() + if err != nil { + return nil, err + } + uid, err := DetectUID() + if err != nil { + return nil, err + } + + return systemdDbus.NewConnection(func() (*dbus.Conn, error) { + conn, err := dbus.Dial(addr) + if err != nil { + return nil, errors.Wrapf(err, "error while dialing %q", addr) + } + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, errors.Wrapf(err, "error while authenticating connection, address=%q, UID=%d", addr, uid) + } + if err = conn.Hello(); err != nil { + conn.Close() + return nil, errors.Wrapf(err, "error while sending Hello message, address=%q, UID=%d", addr, uid) + } + return conn, nil + }) +} + +// DetectUID detects UID from the OwnerUID field of `busctl --user status` +// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) . +// +// Otherwise returns os.Getuid() . 
+func DetectUID() (int, error) { + if !system.RunningInUserNS() { + return os.Getuid(), nil + } + b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() + if err != nil { + return -1, errors.Wrapf(err, "could not execute `busctl --user --no-pager status`: %q", string(b)) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(s, "OwnerUID=") { + uidStr := strings.TrimPrefix(s, "OwnerUID=") + i, err := strconv.Atoi(uidStr) + if err != nil { + return -1, errors.Wrapf(err, "could not detect the OwnerUID: %s", s) + } + return i, nil + } + } + if err := scanner.Err(); err != nil { + return -1, err + } + return -1, errors.New("could not detect the OwnerUID") +} + +// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set. +// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists. +// Otherwise parses the value from `systemctl --user show-environment` . +func DetectUserDbusSessionBusAddress() (string, error) { + if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { + return env, nil + } + if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { + busPath := filepath.Join(xdr, "bus") + if _, err := os.Stat(busPath); err == nil { + busAddress := "unix:path=" + busPath + return busAddress, nil + } + } + b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput() + if err != nil { + return "", errors.Wrapf(err, "could not execute `systemctl --user --no-pager show-environment`, output=%q", string(b)) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") { + return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil + } + } + return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. 
Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`") +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/v1.go b/sysbox-runc/libcontainer/cgroups/systemd/v1.go new file mode 100644 index 00000000..4a9605e8 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/v1.go @@ -0,0 +1,494 @@ +// +build linux + +package systemd + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +type legacyManager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string + childCgroupCreated bool +} + +func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) cgroups.Manager { + + childCgroupCreated := false + if paths != nil { + childCgroupCreated = true + } + + return &legacyManager{ + cgroups: cg, + paths: paths, + childCgroupCreated: childCgroupCreated, + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set the cgroup represented by cgroup. 
+ Set(path string, cgroup *configs.Cgroup) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +var legacySubsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, + &fs.RdmaGroup{}, +} + +func genV1ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + r := c.Resources + + deviceProperties, err := generateDeviceProperties(r.Devices) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(r.Memory))) + } + + if r.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", r.CpuShares)) + } + + addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod) + + if r.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(r.BlkioWeight))) + } + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + return properties, nil +} + +func (m *legacyManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + if c.Resources.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + if c.Paths != nil { + paths := make(map[string]string) + cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return err + } + // XXX(kolyshkin@): why this check is needed? 
+ for name, path := range c.Paths { + if _, ok := cgMap[name]; ok { + paths[name] = path + } + } + m.paths = paths + return cgroups.EnterPid(m.paths, pid) + } + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // sysbox-runc requires service or scope units for the container, as otherwise delegation won't work. + if strings.HasSuffix(unitName, ".slice") { + return fmt.Errorf("container cgroup is on systemd slice unit %s; sysbox-runc requires it to be on systemd service or scope units in order for cgroup delegation to work", unitName) + } + + // NOTE: sysbox-runc requires cgroup delegation, which is supported on systemd versions >= 218. + dbusConnection, err := getDbusConnection(false) + if err != nil { + return err + } + + sdVer := systemdVersion(dbusConnection) + if sdVer < 218 { + return fmt.Errorf("systemd version is < 218; sysbox-runc requires version >= 218 for cgroup delegation.") + } + + properties = append(properties, newProp("Delegate", true)) + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true)) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) 
+ properties = append(properties, + newProp("DefaultDependencies", false)) + + resourcesProperties, err := genV1ResourcesProperties(c, dbusConnection) + if err != nil { + return err + } + properties = append(properties, resourcesProperties...) + properties = append(properties, c.SystemdProps...) + + if err := startUnit(dbusConnection, unitName, properties); err != nil { + return err + } + + paths := make(map[string]string) + for _, s := range legacySubsystems { + subsystemPath, err := getSubsystemPath(m.cgroups, s.Name()) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if s.Name() == "devices" { + return err + } + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[s.Name()] = subsystemPath + } + m.paths = paths + + if err := m.joinCgroups(pid); err != nil { + return err + } + + return nil +} + +func (m *legacyManager) Destroy() error { + if m.cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + + dbusConnection, err := getDbusConnection(false) + if err != nil { + return err + } + unitName := getUnitName(m.cgroups) + + stopErr := stopUnit(dbusConnection, unitName) + // Both on success and on error, cleanup all the cgroups we are aware of. + // Some of them were created directly by Apply() and are not managed by systemd. 
+ if err := cgroups.RemovePaths(m.paths); err != nil { + return err + } + + return stopErr +} + +func (m *legacyManager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *legacyManager) joinCgroups(pid int) error { + for _, sys := range legacySubsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + case "cpuset": + if path, ok := m.paths[name]; ok { + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, m.cgroups, pid); err != nil { + return err + } + } + default: + if path, ok := m.paths[name]; ok { + if err := os.MkdirAll(path, 0755); err != nil { + return err + } + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + } + + return nil +} + +func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) + if err != nil { + return "", err + } + + initPath, err := cgroups.GetInitCgroup(subsystem) + if err != nil { + return "", err + } + // if pid 1 is systemd 226 or later, it will be in init.scope, not the root + initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") + + slice := "system.slice" + if c.Parent != "" { + slice = c.Parent + } + + slice, err = ExpandSlice(slice) + if err != nil { + return "", err + } + + return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil +} + +func (m *legacyManager) Freeze(state configs.FreezerState) error { + path, ok := m.paths["freezer"] + if !ok { + return errSubsystemDoesNotExist + } + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &fs.FreezerGroup{} + if err := freezer.Set(path, m.cgroups); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *legacyManager) GetPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return 
cgroups.GetPids(path) +} + +func (m *legacyManager) GetAllPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetAllPids(path) +} + +func (m *legacyManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range legacySubsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +func (m *legacyManager) Set(container *configs.Config) error { + // If Paths are set, then we are just joining cgroups paths + // and there is no need to set any values. + if m.cgroups.Paths != nil { + return nil + } + if container.Cgroups.Resources.Unified != nil { + return cgroups.ErrV1NoUnified + } + dbusConnection, err := getDbusConnection(false) + if err != nil { + return err + } + properties, err := genV1ResourcesProperties(container.Cgroups, dbusConnection) + if err != nil { + return err + } + + // We have to freeze the container while systemd sets the cgroup settings. + // The reason for this is that systemd's application of DeviceAllow rules + // is done disruptively, resulting in spurrious errors to common devices + // (unlike our fs driver, they will happily write deny-all rules to running + // containers). So we freeze the container to avoid them hitting the cgroup + // error. But if the freezer cgroup isn't supported, we just warn about it. + targetFreezerState := configs.Undefined + if !m.cgroups.SkipDevices { + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. 
+ targetFreezerState, err = m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + } + + if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil { + _ = m.Freeze(targetFreezerState) + return err + } + + // Reset freezer state before we apply the configuration, to avoid clashing + // with the freezer setting in the configuration. + _ = m.Freeze(targetFreezerState) + + for _, sys := range legacySubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, ok := m.paths[sys.Name()] + if !ok { + continue + } + if err := sys.Set(path, container.Cgroups); err != nil { + return err + } + } + + return nil +} + +func (m *legacyManager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) { + path, ok := m.paths["freezer"] + if !ok { + return configs.Undefined, nil + } + freezer := &fs.FreezerGroup{} + return freezer.GetState(path) +} + +func (m *legacyManager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func (m *legacyManager) CreateChildCgroup(container *configs.Config) error { + m.mu.Lock() + defer m.mu.Unlock() + + // The child cgroups will not be visible to systemd (due to delegation); thus + // we create them directly on the filesystem using the fs cgroup manager. 
+ childMgr := fs.NewManager(m.cgroups, m.paths, false) + + if err := childMgr.CreateChildCgroup(container); err != nil { + return fmt.Errorf("failed to create child cgroup: %s", err) + } + + m.childCgroupCreated = true + return nil +} + +func (m *legacyManager) ApplyChildCgroup(pid int) error { + m.mu.Lock() + defer m.mu.Unlock() + + if m.cgroups == nil { + return nil + } + + if !m.childCgroupCreated { + return fmt.Errorf("can't place process in child cgroup because child cgroup has not been created") + } + + if m.paths == nil { + return errors.New("can't place pid in delegated cgroup unless it was placed in container cgroup first") + } + + childMgr := fs.NewManager(m.cgroups, m.paths, false) + if err := childMgr.ApplyChildCgroup(pid); err != nil { + return fmt.Errorf("failed to apply child cgroup: %s", err) + } + + return nil +} + +func (m *legacyManager) GetChildCgroupPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + + childMgr := fs.NewManager(m.cgroups, m.paths, false) + return childMgr.GetChildCgroupPaths() +} + +func (m *legacyManager) GetType() cgroups.CgroupType { + return cgroups.Cgroup_v1_systemd +} diff --git a/sysbox-runc/libcontainer/cgroups/systemd/v2.go b/sysbox-runc/libcontainer/cgroups/systemd/v2.go new file mode 100644 index 00000000..9bfeae7b --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/systemd/v2.go @@ -0,0 +1,615 @@ +// +build linux + +package systemd + +import ( + "fmt" + "io/ioutil" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +type unifiedManager struct { + mu sync.Mutex + cgroups *configs.Cgroup + // path is like 
"/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + path string + rootless bool +} + +func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgroups.Manager { + return &unifiedManager{ + cgroups: config, + path: path, + rootless: rootless, + } +} + +// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified +// key/value map (where key is cgroupfs file name) to systemd unit properties. +// This is on a best-effort basis, so the properties that are not known +// (to this function and/or systemd) are ignored (but logged with "debug" +// log level). +// +// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt +// +// For the list of systemd unit properties, see systemd.resource-control(5). +func unifiedResToSystemdProps(conn *systemdDbus.Conn, res map[string]string) (props []systemdDbus.Property, _ error) { + var err error + + for k, v := range res { + if strings.Contains(k, "/") { + return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + // Kernel is quite forgiving to extra whitespace + // around the value, and so should we. + v = strings.TrimSpace(v) + // Please keep cases in alphabetical order. 
+ switch k { + case "cpu.max": + // value: quota [period] + quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set + period := defCPUQuotaPeriod + sv := strings.Fields(v) + if len(sv) < 1 || len(sv) > 2 { + return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) + } + // quota + if sv[0] != "max" { + quota, err = strconv.ParseInt(sv[0], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) + } + } + // period + if len(sv) == 2 { + period, err = strconv.ParseUint(sv[1], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) + } + } + addCpuQuota(conn, &props, quota, period) + + case "cpu.weight": + num, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + props = append(props, + newProp("CPUWeight", num)) + + case "cpuset.cpus", "cpuset.mems": + bits, err := rangeToBits(v) + if err != nil { + return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) + } + m := map[string]string{ + "cpuset.cpus": "AllowedCPUs", + "cpuset.mems": "AllowedMemoryNodes", + } + // systemd only supports these properties since v244 + sdVer := systemdVersion(conn) + if sdVer >= 244 { + props = append(props, + newProp(m[k], bits)) + } else { + logrus.Debugf("systemd v%d is too old to support %s"+ + " (setting will still be applied to cgroupfs)", + sdVer, m[k]) + } + + case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": + num := uint64(math.MaxUint64) + if v != "max" { + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + m := map[string]string{ + "memory.high": "MemoryHigh", + "memory.low": "MemoryLow", + "memory.min": "MemoryMin", + "memory.max": "MemoryMax", + "memory.swap.max": "MemorySwapMax", + } + 
props = append(props, + newProp(m[k], num)) + + case "pids.max": + num := uint64(math.MaxUint64) + if v != "max" { + var err error + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + props = append(props, + newProp("TasksAccounting", true), + newProp("TasksMax", num)) + + case "memory.oom.group": + // Setting this to 1 is roughly equivalent to OOMPolicy=kill + // (as per systemd.service(5) and + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), + // but it's not clear what to do if it is unset or set + // to 0 in runc update, as there are two other possible + // values for OOMPolicy (continue/stop). + fallthrough + + default: + // Ignore the unknown resource here -- will still be + // applied in Set which calls fs2.Set. + logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) + } + } + + return props, nil +} + +func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + r := c.Resources + + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(r.Devices) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) 
+ + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryMax", uint64(r.Memory))) + } + if r.MemoryReservation != 0 { + properties = append(properties, + newProp("MemoryLow", uint64(r.MemoryReservation))) + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return nil, err + } + if swap != 0 { + properties = append(properties, + newProp("MemorySwapMax", uint64(swap))) + } + + if r.CpuWeight != 0 { + properties = append(properties, + newProp("CPUWeight", r.CpuWeight)) + } + + addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod) + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksAccounting", true), + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + // ignore r.KernelMemory + + // convert Resources.Unified map to systemd properties + if r.Unified != nil { + unifiedProps, err := unifiedResToSystemdProps(conn, r.Unified) + if err != nil { + return nil, err + } + properties = append(properties, unifiedProps...) + } + + return properties, nil +} + +func (m *unifiedManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + properties []systemdDbus.Property + ) + + if c.Paths != nil { + return cgroups.WriteCgroupProc(m.path, pid) + } + + slice := "system.slice" + if m.rootless { + slice = "user.slice" + } + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + // if we create a slice, the parent is defined via a Wants= + if strings.HasSuffix(unitName, ".slice") { + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // otherwise, we use Slice= + properties = append(properties, systemdDbus.PropSlice(slice)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. 
+ if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // sysbox-runc requires service or scope units for the container, as otherwise delegation won't work. + if strings.HasSuffix(unitName, ".slice") { + return fmt.Errorf("container cgroup is on systemd slice unit %s; sysbox-runc requires it to be on systemd service or scope units in order for cgroup delegation to work", unitName) + } + + // sysbox-runc requires cgroup delegation, which is supported on systemd versions >= 218. + dbusConnection, err := getDbusConnection(false) + if err != nil { + return err + } + + sdVer := systemdVersion(dbusConnection) + if sdVer < 218 { + return fmt.Errorf("systemd version is < 218; sysbox-runc requires version >= 218 for cgroup delegation.") + } + + properties = append(properties, newProp("Delegate", true)) + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("IOAccounting", true)) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + resourcesProperties, err := genV2ResourcesProperties(c, dbusConnection) + if err != nil { + return err + } + properties = append(properties, resourcesProperties...) + properties = append(properties, c.SystemdProps...) 
+ + if err := startUnit(dbusConnection, unitName, properties); err != nil { + return errors.Wrapf(err, "error while starting unit %q with properties %+v", unitName, properties) + } + + if err = m.initPath(); err != nil { + return err + } + if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { + return err + } + return nil +} + +func (m *unifiedManager) Destroy() error { + if m.cgroups.Paths != nil { + return nil + } + m.mu.Lock() + defer m.mu.Unlock() + + dbusConnection, err := getDbusConnection(m.rootless) + if err != nil { + return err + } + unitName := getUnitName(m.cgroups) + if err := stopUnit(dbusConnection, unitName); err != nil { + return err + } + + // XXX this is probably not needed, systemd should handle it + err = os.Remove(m.path) + if err != nil && !os.IsNotExist(err) { + return err + } + + return nil +} + +func (m *unifiedManager) Path(_ string) string { + return m.path +} + +// getSliceFull value is used in initPath. +// The value is incompatible with systemdDbus.PropSlice. +func (m *unifiedManager) getSliceFull() (string, error) { + c := m.cgroups + slice := "system.slice" + if m.rootless { + slice = "user.slice" + } + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return "", err + } + } + + if m.rootless { + dbusConnection, err := getDbusConnection(m.rootless) + if err != nil { + return "", err + } + // managerCGQuoted is typically "/user.slice/user-${uid}.slice/user@${uid}.service" including the quote symbols + managerCGQuoted, err := dbusConnection.GetManagerProperty("ControlGroup") + if err != nil { + return "", err + } + managerCG, err := strconv.Unquote(managerCGQuoted) + if err != nil { + return "", err + } + slice = filepath.Join(managerCG, slice) + } + + // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" + // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. 
+ return slice, nil +} + +func (m *unifiedManager) initPath() error { + if m.path != "" { + return nil + } + + sliceFull, err := m.getSliceFull() + if err != nil { + return err + } + + c := m.cgroups + path := filepath.Join(sliceFull, getUnitName(c)) + path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) + if err != nil { + return err + } + + // an example of the final path in rootless: + // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + m.path = path + + return nil +} + +func (m *unifiedManager) fsManager() (cgroups.Manager, error) { + if err := m.initPath(); err != nil { + return nil, err + } + return fs2.NewManager(m.cgroups, m.path, m.rootless) +} + +func (m *unifiedManager) Freeze(state configs.FreezerState) error { + fsMgr, err := m.fsManager() + if err != nil { + return err + } + return fsMgr.Freeze(state) +} + +func (m *unifiedManager) GetPids() ([]int, error) { + if err := m.initPath(); err != nil { + return nil, err + } + return cgroups.GetPids(m.path) +} + +func (m *unifiedManager) GetAllPids() ([]int, error) { + if err := m.initPath(); err != nil { + return nil, err + } + return cgroups.GetAllPids(m.path) +} + +func (m *unifiedManager) GetStats() (*cgroups.Stats, error) { + fsMgr, err := m.fsManager() + if err != nil { + return nil, err + } + return fsMgr.GetStats() +} + +func (m *unifiedManager) Set(container *configs.Config) error { + dbusConnection, err := getDbusConnection(m.rootless) + if err != nil { + return err + } + properties, err := genV2ResourcesProperties(m.cgroups, dbusConnection) + if err != nil { + return err + } + + // We have to freeze the container while systemd sets the cgroup settings. 
+ // The reason for this is that systemd's application of DeviceAllow rules + // is done disruptively, resulting in spurrious errors to common devices + // (unlike our fs driver, they will happily write deny-all rules to running + // containers). So we freeze the container to avoid them hitting the cgroup + // error. But if the freezer cgroup isn't supported, we just warn about it. + targetFreezerState := configs.Undefined + if !m.cgroups.SkipDevices { + // Figure out the current freezer state, so we can revert to it after we + // temporarily freeze the container. + targetFreezerState, err = m.GetFreezerState() + if err != nil { + return err + } + if targetFreezerState == configs.Undefined { + targetFreezerState = configs.Thawed + } + + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } + } + + if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil { + _ = m.Freeze(targetFreezerState) + return errors.Wrap(err, "error while setting unit properties") + } + + // Reset freezer state before we apply the configuration, to avoid clashing + // with the freezer setting in the configuration. 
+ _ = m.Freeze(targetFreezerState) + + fsMgr, err := m.fsManager() + if err != nil { + return err + } + return fsMgr.Set(container) +} + +func (m *unifiedManager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.path + return paths +} + +func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) { + fsMgr, err := m.fsManager() + if err != nil { + return configs.Undefined, err + } + return fsMgr.GetFreezerState() +} + +func (m *unifiedManager) Exists() bool { + return cgroups.PathExists(m.path) +} + +func (m *unifiedManager) CreateChildCgroup(config *configs.Config) error { + + // Change the cgroup ownership to match the root user in the system + // container (needed for delegation). + path := m.path + + rootuid, err := config.HostRootUID() + if err != nil { + return err + } + rootgid, err := config.HostRootGID() + if err != nil { + return err + } + + if err := os.Chown(path, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner of cgroup %s", path) + } + + // Change ownership of some of the files inside the sys container's cgroup; + // for cgroups v2 we only change the ownership of a subset of the files, as + // specified in section "Cgroups Delegation: Delegating a Hierarchy to a Less + // Privileged User" in cgroups(7). + files, err := ioutil.ReadDir(path) + if err != nil { + return err + } + for _, file := range files { + fname := file.Name() + + if fname == "cgroup.procs" || + fname == "cgroup.subtree_control" || + fname == "cgroup.threads" { + + absFileName := filepath.Join(path, fname) + if err := os.Chown(absFileName, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner for file %s", absFileName) + } + } + } + + // Create a leaf cgroup to be used for the sys container's init process (and + // for all its child processes). 
Its purpose is to prevent processes from + // living in the sys container's cgroup root, because once inner sub-cgroups are + // created, the kernel considers the sys container's cgroup root an + // intermediate node in the global cgroup hierarchy. This in turn forces all + // sub-groups inside the sys container to be of "domain-invalid" type (and + // thus prevents domain cgroup controllers such as the memory controller + // from being applied inside the sys container). + // + // We choose the name "init.scope" for the leaf cgroup because it works well + // in sys containers that carry systemd, as well as those that don't. In both + // cases, the sys container's init processes are placed in the init.scope + // cgroup. For sys container's with systemd, systemd then moves the processes + // to other sub-cgroups it manages. + // + // Note that processes that enter the sys container via "exec" will also + // be placed in this sub-cgroup. + + leafPath := filepath.Join(path, "init.scope") + if err = os.MkdirAll(leafPath, 0755); err != nil { + return err + } + + if err := os.Chown(leafPath, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner of cgroup %s", leafPath) + } + + files, err = ioutil.ReadDir(leafPath) + if err != nil { + return err + } + for _, file := range files { + fname := file.Name() + + if fname == "cgroup.procs" || + fname == "cgroup.subtree_control" || + fname == "cgroup.threads" { + + absFileName := filepath.Join(leafPath, fname) + if err := os.Chown(absFileName, rootuid, rootgid); err != nil { + return fmt.Errorf("Failed to change owner for file %s", absFileName) + } + } + } + + return nil +} + +func (m *unifiedManager) ApplyChildCgroup(pid int) error { + paths := make(map[string]string, 1) + paths[""] = filepath.Join(m.path, "init.scope") + return cgroups.EnterPid(paths, pid) +} + +func (m *unifiedManager) GetChildCgroupPaths() map[string]string { + return m.GetPaths() +} + +func (m *unifiedManager) GetType() 
cgroups.CgroupType { + return cgroups.Cgroup_v2_systemd +} diff --git a/sysbox-runc/libcontainer/cgroups/utils.go b/sysbox-runc/libcontainer/cgroups/utils.go new file mode 100644 index 00000000..b2d35027 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/utils.go @@ -0,0 +1,451 @@ +// +build linux + +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + "github.com/opencontainers/runc/libcontainer/system" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + CgroupProcesses = "cgroup.procs" + unifiedMountpoint = "/sys/fs/cgroup" +) + +var ( + isUnifiedOnce sync.Once + isUnified bool +) + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(unifiedMountpoint, &st) + if err != nil { + if os.IsNotExist(err) && system.RunningInUserNS() { + // ignore the "not found" error if running in userns + logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint) + isUnified = false + return + } + panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + +type Mount struct { + Mountpoint string + Root string + Subsystems []string +} + +// GetCgroupMounts returns the mounts for the cgroup subsystems. +// all indicates whether to return just the first instance or all the mounts. +// This function should not be used from cgroupv2 code, as in this case +// all the controllers are available under the constant unifiedMountpoint. 
+func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + // TODO: remove cgroupv2 case once all external users are converted + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: unifiedMountpoint, + Root: unifiedMountpoint, + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + + return getCgroupMountsV1(all) +} + +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + // /proc/cgroups is meaningless for v2 + // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features + if IsCgroup2UnifiedMode() { + // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + // - devices: implemented in kernel 4.15 + // - freezer: implemented in kernel 5.2 + // We assume these are always available, as it is hard to detect availability. + pseudo := []string{"devices", "freezer"} + data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") + if err != nil { + return nil, err + } + subsystems := append(pseudo, strings.Fields(data)...) 
+ return subsystems, nil + } + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil +} + +func readProcsFile(file string) ([]int, error) { + f, err := os.Open(file) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, s.Err() +} + +// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup +// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// etc. +// +// Note that for cgroup v2 unified hierarchy, there are no per-controller +// cgroup paths, so the resulting map will have a single element where the key +// is empty string ("") and the value is the cgroup path the is in. +func ParseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) + cgroups := make(map[string]string) + + for s.Scan() { + text := s.Text() + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... 
there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } + + for _, subs := range strings.Split(parts[1], ",") { + cgroups[subs] = parts[2] + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return cgroups, nil +} + +func PathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +func EnterPid(cgroupPaths map[string]string, pid int) error { + for _, path := range cgroupPaths { + if PathExists(path) { + if err := WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + return nil +} + +func rmdir(path string) error { + err := unix.Rmdir(path) + if err == nil || err == unix.ENOENT { + return nil + } + return &os.PathError{Op: "rmdir", Path: path, Err: err} +} + +// RemovePath aims to remove cgroup path. It does so recursively, +// by removing any subdirectories (sub-cgroups) first. +func RemovePath(path string) error { + // try the fast path first + if err := rmdir(path); err == nil { + return nil + } + + infos, err := ioutil.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + err = nil + } + return err + } + for _, info := range infos { + if info.IsDir() { + // We should remove subcgroups dir first + if err = RemovePath(filepath.Join(path, info.Name())); err != nil { + break + } + } + } + if err == nil { + err = rmdir(path) + } + return err +} + +// RemovePaths iterates over the provided paths removing them. +// We trying to remove all paths five times with increasing delay between tries. +// If after all there are not removed cgroups - appropriate error will be +// returned. 
+func RemovePaths(paths map[string]string) (err error) { + const retries = 5 + delay := 10 * time.Millisecond + for i := 0; i < retries; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + for s, p := range paths { + if err := RemovePath(p); err != nil { + // do not log intermediate iterations + switch i { + case 0: + logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") + case retries - 1: + logrus.WithError(err).Error("Failed to remove cgroup") + } + + } + _, err := os.Stat(p) + // We need this strange way of checking cgroups existence because + // RemoveAll almost always returns error, even on already removed + // cgroups + if os.IsNotExist(err) { + delete(paths, s) + } + } + if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + paths = make(map[string]string) + return nil + } + } + return fmt.Errorf("Failed to remove paths: %v", paths) +} + +func GetHugePageSize() ([]string, error) { + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return nil, err + } + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return nil, err + } + + return getHugePageSizeFromFilenames(files) +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + pageSizes := make([]string, 0, len(fileNames)) + + for _, file := range fileNames { + // example: hugepages-1048576kB + val := strings.TrimPrefix(file, "hugepages-") + if len(val) == len(file) { + // unexpected file name: no prefix found + continue + } + // The suffix is always "kB" (as of Linux 5.9) + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file) + continue + } + size, err := strconv.Atoi(val) + if err != nil { + return nil, err + } + // Model after 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. + if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) + } + + return pageSizes, nil +} + +// GetPids returns all pids, that were added to cgroup at path. +func GetPids(dir string) ([]int, error) { + return readProcsFile(filepath.Join(dir, CgroupProcesses)) +} + +// GetAllPids returns all pids, that were added to cgroup at path and to all its +// subcgroups. +func GetAllPids(path string) ([]int, error) { + var pids []int + // collect pids from all sub-cgroups + err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { + if iErr != nil { + return iErr + } + if info.IsDir() || info.Name() != CgroupProcesses { + return nil + } + cPids, err := readProcsFile(p) + if err != nil { + return err + } + pids = append(pids, cPids...) + return nil + }) + return pids, err +} + +// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file +func WriteCgroupProc(dir string, pid int) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", CgroupProcesses) + } + + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid == -1 { + return nil + } + + file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) + if err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + defer file.Close() + + for i := 0; i < 5; i++ { + _, err = file.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. 
We should attempt to do so again. + if errors.Is(err, unix.EINVAL) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) + } + return err +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990) +// convert linearly from [10-1000] to [1-10000] +func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 { + if blkIoWeight == 0 { + return 0 + } + return uint64(1 + (uint64(blkIoWeight)-10)*9999/990) +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) +// convert from [2-262144] to [1-10000] +// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" +func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { + if cpuShares == 0 { + return 0 + } + return (1 + ((cpuShares-2)*9999)/262142) +} + +// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec +// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap +// is defined as memory+swap combined, while in cgroup v2 swap is a separate value. +func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { + // for compatibility with cgroup1 controller, set swap to unlimited in + // case the memory is set to unlimited, and swap is not explicitly set, + // treating the request as "set both memory and swap to unlimited". 
+ if memory == -1 && memorySwap == 0 { + return -1, nil + } + if memorySwap == -1 || memorySwap == 0 { + // -1 is "max", 0 is "unset", so treat as is + return memorySwap, nil + } + // sanity checks + if memory == 0 || memory == -1 { + return 0, errors.New("unable to set swap limit without memory limit") + } + if memory < 0 { + return 0, fmt.Errorf("invalid memory value: %d", memory) + } + if memorySwap < memory { + return 0, errors.New("memory+swap limit should be >= memory limit") + } + + return memorySwap - memory, nil +} diff --git a/sysbox-runc/libcontainer/cgroups/utils_test.go b/sysbox-runc/libcontainer/cgroups/utils_test.go new file mode 100644 index 00000000..11bc7d88 --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/utils_test.go @@ -0,0 +1,636 @@ +// +build linux + +package cgroups + +import ( + "bytes" + "fmt" + "reflect" + "strings" + "testing" + + "github.com/sirupsen/logrus" +) + +const fedoraMountinfo = `15 35 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +16 35 0:14 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +17 35 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8056484k,nr_inodes=2014121,mode=755 +18 16 0:15 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +19 16 0:13 / /sys/fs/selinux rw,relatime shared:8 - selinuxfs selinuxfs rw +20 17 0:16 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +21 17 0:10 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,seclabel,gid=5,mode=620,ptmxmode=000 +22 35 0:17 / /run rw,nosuid,nodev shared:21 - tmpfs tmpfs rw,seclabel,mode=755 +23 16 0:18 / /sys/fs/cgroup rw,nosuid,nodev,noexec shared:9 - tmpfs tmpfs rw,seclabel,mode=755 +24 23 0:19 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +25 16 0:20 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore 
pstore rw +26 23 0:21 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuset,clone_children +27 23 0:22 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,cpuacct,cpu,clone_children +28 23 0:23 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,memory,clone_children +29 23 0:24 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,devices,clone_children +30 23 0:25 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,freezer,clone_children +31 23 0:26 / /sys/fs/cgroup/net_cls rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,net_cls,clone_children +32 23 0:27 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,blkio,clone_children +33 23 0:28 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,perf_event,clone_children +34 23 0:29 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,hugetlb,clone_children +35 1 253:2 / / rw,relatime shared:1 - ext4 /dev/mapper/ssd-root--f20 rw,seclabel,data=ordered +36 15 0:30 / /proc/sys/fs/binfmt_misc rw,relatime shared:22 - autofs systemd-1 rw,fd=38,pgrp=1,timeout=300,minproto=5,maxproto=5,direct +37 17 0:12 / /dev/mqueue rw,relatime shared:23 - mqueue mqueue rw,seclabel +38 35 0:31 / /tmp rw shared:24 - tmpfs tmpfs rw,seclabel +39 17 0:32 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw,seclabel +40 16 0:7 / /sys/kernel/debug rw,relatime shared:26 - debugfs debugfs rw +41 16 0:33 / /sys/kernel/config rw,relatime shared:27 - configfs configfs rw +42 35 0:34 / /var/lib/nfs/rpc_pipefs rw,relatime shared:28 - rpc_pipefs sunrpc rw +43 15 0:35 / /proc/fs/nfsd rw,relatime shared:29 - nfsd sunrpc rw +45 35 8:17 / /boot rw,relatime shared:30 - ext4 /dev/sdb1 rw,seclabel,data=ordered +46 35 253:4 / /home rw,relatime shared:31 - ext4 
/dev/mapper/ssd-home rw,seclabel,data=ordered +47 35 253:5 / /var/lib/libvirt/images rw,noatime,nodiratime shared:32 - ext4 /dev/mapper/ssd-virt rw,seclabel,discard,data=ordered +48 35 253:12 / /mnt/old rw,relatime shared:33 - ext4 /dev/mapper/HelpDeskRHEL6-FedoraRoot rw,seclabel,data=ordered +121 22 0:36 / /run/user/1000/gvfs rw,nosuid,nodev,relatime shared:104 - fuse.gvfsd-fuse gvfsd-fuse rw,user_id=1000,group_id=1000 +124 16 0:37 / /sys/fs/fuse/connections rw,relatime shared:107 - fusectl fusectl rw +165 38 253:3 / /tmp/mnt rw,relatime shared:147 - ext4 /dev/mapper/ssd-root rw,seclabel,data=ordered +167 35 253:15 / /var/lib/docker/devicemapper/mnt/aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,relatime shared:149 - ext4 /dev/mapper/docker-253:2-425882-aae4076022f0e2b80a2afbf8fc6df450c52080191fcef7fb679a73e6f073e5c2 rw,seclabel,discard,stripe=16,data=ordered +171 35 253:16 / /var/lib/docker/devicemapper/mnt/c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,relatime shared:153 - ext4 /dev/mapper/docker-253:2-425882-c71be651f114db95180e472f7871b74fa597ee70a58ccc35cb87139ddea15373 rw,seclabel,discard,stripe=16,data=ordered +175 35 253:17 / /var/lib/docker/devicemapper/mnt/1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,relatime shared:157 - ext4 /dev/mapper/docker-253:2-425882-1bac6ab72862d2d5626560df6197cf12036b82e258c53d981fa29adce6f06c3c rw,seclabel,discard,stripe=16,data=ordered +179 35 253:18 / /var/lib/docker/devicemapper/mnt/d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,relatime shared:161 - ext4 /dev/mapper/docker-253:2-425882-d710a357d77158e80d5b2c55710ae07c94e76d34d21ee7bae65ce5418f739b09 rw,seclabel,discard,stripe=16,data=ordered +183 35 253:19 / /var/lib/docker/devicemapper/mnt/6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 rw,relatime shared:165 - ext4 /dev/mapper/docker-253:2-425882-6479f52366114d5f518db6837254baab48fab39f2ac38d5099250e9a6ceae6c7 
rw,seclabel,discard,stripe=16,data=ordered +187 35 253:20 / /var/lib/docker/devicemapper/mnt/8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,relatime shared:169 - ext4 /dev/mapper/docker-253:2-425882-8d9df91c4cca5aef49eeb2725292aab324646f723a7feab56be34c2ad08268e1 rw,seclabel,discard,stripe=16,data=ordered +191 35 253:21 / /var/lib/docker/devicemapper/mnt/c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,relatime shared:173 - ext4 /dev/mapper/docker-253:2-425882-c8240b768603d32e920d365dc9d1dc2a6af46cd23e7ae819947f969e1b4ec661 rw,seclabel,discard,stripe=16,data=ordered +195 35 253:22 / /var/lib/docker/devicemapper/mnt/2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,relatime shared:177 - ext4 /dev/mapper/docker-253:2-425882-2eb3a01278380bbf3ed12d86ac629eaa70a4351301ee307a5cabe7b5f3b1615f rw,seclabel,discard,stripe=16,data=ordered +199 35 253:23 / /var/lib/docker/devicemapper/mnt/37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,relatime shared:181 - ext4 /dev/mapper/docker-253:2-425882-37a17fb7c9d9b80821235d5f2662879bd3483915f245f9b49cdaa0e38779b70b rw,seclabel,discard,stripe=16,data=ordered +203 35 253:24 / /var/lib/docker/devicemapper/mnt/aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,relatime shared:185 - ext4 /dev/mapper/docker-253:2-425882-aea459ae930bf1de913e2f29428fd80ee678a1e962d4080019d9f9774331ee2b rw,seclabel,discard,stripe=16,data=ordered +207 35 253:25 / /var/lib/docker/devicemapper/mnt/928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,relatime shared:189 - ext4 /dev/mapper/docker-253:2-425882-928ead0bc06c454bd9f269e8585aeae0a6bd697f46dc8754c2a91309bc810882 rw,seclabel,discard,stripe=16,data=ordered +211 35 253:26 / /var/lib/docker/devicemapper/mnt/0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 rw,relatime shared:193 - ext4 /dev/mapper/docker-253:2-425882-0f284d18481d671644706e7a7244cbcf63d590d634cc882cb8721821929d0420 
rw,seclabel,discard,stripe=16,data=ordered +215 35 253:27 / /var/lib/docker/devicemapper/mnt/d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,relatime shared:197 - ext4 /dev/mapper/docker-253:2-425882-d9dd16722ab34c38db2733e23f69e8f4803ce59658250dd63e98adff95d04919 rw,seclabel,discard,stripe=16,data=ordered +219 35 253:28 / /var/lib/docker/devicemapper/mnt/bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,relatime shared:201 - ext4 /dev/mapper/docker-253:2-425882-bc4500479f18c2c08c21ad5282e5f826a016a386177d9874c2764751c031d634 rw,seclabel,discard,stripe=16,data=ordered +223 35 253:29 / /var/lib/docker/devicemapper/mnt/7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,relatime shared:205 - ext4 /dev/mapper/docker-253:2-425882-7770c8b24eb3d5cc159a065910076938910d307ab2f5d94e1dc3b24c06ee2c8a rw,seclabel,discard,stripe=16,data=ordered +227 35 253:30 / /var/lib/docker/devicemapper/mnt/c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,relatime shared:209 - ext4 /dev/mapper/docker-253:2-425882-c280cd3d0bf0aa36b478b292279671624cceafc1a67eaa920fa1082601297adf rw,seclabel,discard,stripe=16,data=ordered +231 35 253:31 / /var/lib/docker/devicemapper/mnt/8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,relatime shared:213 - ext4 /dev/mapper/docker-253:2-425882-8b59a7d9340279f09fea67fd6ad89ddef711e9e7050eb647984f8b5ef006335f rw,seclabel,discard,stripe=16,data=ordered +235 35 253:32 / /var/lib/docker/devicemapper/mnt/1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,relatime shared:217 - ext4 /dev/mapper/docker-253:2-425882-1a28059f29eda821578b1bb27a60cc71f76f846a551abefabce6efd0146dce9f rw,seclabel,discard,stripe=16,data=ordered +239 35 253:33 / /var/lib/docker/devicemapper/mnt/e9aa60c60128cad1 rw,relatime shared:221 - ext4 /dev/mapper/docker-253:2-425882-e9aa60c60128cad1 rw,seclabel,discard,stripe=16,data=ordered +243 35 253:34 / 
/var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,relatime shared:225 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d-init rw,seclabel,discard,stripe=16,data=ordered +247 35 253:35 / /var/lib/docker/devicemapper/mnt/5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,relatime shared:229 - ext4 /dev/mapper/docker-253:2-425882-5fec11304b6f4713fea7b6ccdcc1adc0a1966187f590fe25a8227428a8df275d rw,seclabel,discard,stripe=16,data=ordered +31 21 0:23 / /DATA/foo_bla_bla rw,relatime - cifs //foo/BLA\040BLA\040BLA/ rw,sec=ntlm,cache=loose,unc=\\foo\BLA BLA BLA,username=my_login,domain=mydomain.com,uid=12345678,forceuid,gid=12345678,forcegid,addr=10.1.30.10,file_mode=0755,dir_mode=0755,nounix,rsize=61440,wsize=65536,actimeo=1` + +const systemdMountinfo = `115 83 0:32 / / rw,relatime - aufs none rw,si=c0bd3d3,dio,dirperm1 +116 115 0:35 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw +117 115 0:36 / /dev rw,nosuid - tmpfs tmpfs rw,mode=755 +118 117 0:37 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666 +119 115 0:38 / /sys rw,nosuid,nodev,noexec,relatime - sysfs sysfs rw +120 119 0:39 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime - tmpfs tmpfs rw,mode=755 +121 120 0:19 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +122 120 0:20 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,devices +123 120 0:21 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,freezer +124 120 0:22 
/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,memory +125 120 0:23 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,net_cls,net_prio +126 120 0:24 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,blkio +127 120 0:25 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpuset,clone_children +128 120 0:26 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,cpu,cpuacct +129 120 0:27 /system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime - cgroup cgroup rw,perf_event,release_agent=/run/cgmanager/agents/cgm-release-agent.perf_event +130 115 43:0 /var/lib/docker/volumes/a44a712176377f57c094397330ee04387284c478364eb25f4c3d25f775f25c26/_data /var/lib/docker rw,relatime - ext4 /dev/nbd0 rw,data=ordered +131 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/resolv.conf /etc/resolv.conf rw,relatime - ext4 /dev/nbd0 rw,data=ordered +132 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hostname /etc/hostname rw,relatime - ext4 /dev/nbd0 rw,data=ordered +133 115 43:0 /var/lib/docker/containers/dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e/hosts /etc/hosts rw,relatime - ext4 /dev/nbd0 rw,data=ordered +134 117 0:33 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k +135 
117 0:13 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw +136 117 0:12 /1 /dev/console rw,nosuid,noexec,relatime - devpts none rw,gid=5,mode=620,ptmxmode=000 +84 115 0:40 / /tmp rw,relatime - tmpfs none rw` + +const bedrockMountinfo = `120 17 0:28 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +124 28 0:28 / /bedrock/strata/arch/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +123 53 0:28 / /bedrock/strata/fallback/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +122 71 0:28 / /bedrock/strata/gentoo/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +121 89 0:28 / /bedrock/strata/kde/sys/fs/cgroup rw,nosuid,nodev,noexec shared:16 - tmpfs tmpfs ro,mode=755 +125 120 0:29 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +129 124 0:29 / /bedrock/strata/arch/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +128 123 0:29 / /bedrock/strata/fallback/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +127 122 0:29 / /bedrock/strata/gentoo/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +126 121 0:29 / /bedrock/strata/kde/sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,xattr,release_agent=/lib/systemd/systemd-cgroups-agent,name=systemd +140 120 0:32 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +144 124 0:32 / /bedrock/strata/arch/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +143 123 0:32 / 
/bedrock/strata/fallback/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +142 122 0:32 / /bedrock/strata/gentoo/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +141 121 0:32 / /bedrock/strata/kde/sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:48 - cgroup cgroup rw,net_cls,net_prio +145 120 0:33 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +149 124 0:33 / /bedrock/strata/arch/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +148 123 0:33 / /bedrock/strata/fallback/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +147 122 0:33 / /bedrock/strata/gentoo/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +146 121 0:33 / /bedrock/strata/kde/sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:49 - cgroup cgroup rw,blkio +150 120 0:34 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +154 124 0:34 / /bedrock/strata/arch/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +153 123 0:34 / /bedrock/strata/fallback/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +152 122 0:34 / /bedrock/strata/gentoo/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +151 121 0:34 / /bedrock/strata/kde/sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:50 - cgroup cgroup rw,cpu,cpuacct +155 120 0:35 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +159 124 0:35 / /bedrock/strata/arch/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +158 123 0:35 / /bedrock/strata/fallback/sys/fs/cgroup/cpuset 
rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +157 122 0:35 / /bedrock/strata/gentoo/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +156 121 0:35 / /bedrock/strata/kde/sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:51 - cgroup cgroup rw,cpuset +160 120 0:36 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +164 124 0:36 / /bedrock/strata/arch/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +163 123 0:36 / /bedrock/strata/fallback/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +162 122 0:36 / /bedrock/strata/gentoo/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +161 121 0:36 / /bedrock/strata/kde/sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:52 - cgroup cgroup rw,devices +165 120 0:37 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +169 124 0:37 / /bedrock/strata/arch/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +168 123 0:37 / /bedrock/strata/fallback/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +167 122 0:37 / /bedrock/strata/gentoo/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +166 121 0:37 / /bedrock/strata/kde/sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:53 - cgroup cgroup rw,memory +170 120 0:38 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +174 124 0:38 / /bedrock/strata/arch/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +173 123 0:38 / /bedrock/strata/fallback/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +172 122 0:38 / /bedrock/strata/gentoo/sys/fs/cgroup/freezer 
rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +171 121 0:38 / /bedrock/strata/kde/sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:54 - cgroup cgroup rw,freezer +175 120 0:39 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +179 124 0:39 / /bedrock/strata/arch/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +178 123 0:39 / /bedrock/strata/fallback/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +177 122 0:39 / /bedrock/strata/gentoo/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +176 121 0:39 / /bedrock/strata/kde/sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:55 - cgroup cgroup rw,pids +180 120 0:40 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +184 124 0:40 / /bedrock/strata/arch/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +183 123 0:40 / /bedrock/strata/fallback/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +182 122 0:40 / /bedrock/strata/gentoo/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event +181 121 0:40 / /bedrock/strata/kde/sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:56 - cgroup cgroup rw,perf_event` + +const cgroup2Mountinfo = `18 64 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw,seclabel +19 64 0:4 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 64 0:6 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,seclabel,size=8171204k,nr_inodes=2042801,mode=755 +21 18 0:19 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:20 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw,seclabel +23 20 0:21 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts 
rw,seclabel,gid=5,mode=620,ptmxmode=000 +24 64 0:22 / /run rw,nosuid,nodev shared:24 - tmpfs tmpfs rw,seclabel,mode=755 +25 18 0:23 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,seclabel,mode=755 +26 25 0:24 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup2 cgroup rw +27 18 0:25 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw,seclabel +28 18 0:26 / /sys/firmware/efi/efivars rw,nosuid,nodev,noexec,relatime shared:21 - efivarfs efivarfs rw +29 25 0:27 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,cpu,cpuacct +30 25 0:28 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,memory +31 25 0:29 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,net_cls,net_prio +32 25 0:30 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,blkio +33 25 0:31 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,perf_event +34 25 0:32 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,hugetlb +35 25 0:33 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,freezer +36 25 0:34 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +37 25 0:35 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +38 25 0:36 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,pids +61 18 0:37 / /sys/kernel/config rw,relatime shared:22 - configfs configfs rw +64 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/fedora_dhcp--16--129-root rw,seclabel,data=ordered +39 18 0:17 / /sys/fs/selinux rw,relatime shared:23 - selinuxfs selinuxfs rw +40 20 0:16 / /dev/mqueue rw,relatime shared:25 - mqueue mqueue rw,seclabel +41 20 0:39 / /dev/hugepages rw,relatime shared:26 - hugetlbfs 
hugetlbfs rw,seclabel +` + +func TestGetCgroupMounts(t *testing.T) { + type testData struct { + mountInfo string + root string + // all is the total number of records expected with all=true, + // or 0 for no extra records expected (most cases). + all int + subsystems map[string]bool + } + testTable := []testData{ + { + mountInfo: fedoraMountinfo, + root: "/", + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + }, + }, + { + mountInfo: systemdMountinfo, + root: "/system.slice/docker-dc4eaa1a34ec4d593bc0125d31eea823a1d76ae483aeb1409cca80304e34da2e.scope", + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + }, + }, + { + mountInfo: bedrockMountinfo, + root: "/", + all: 50, + subsystems: map[string]bool{ + "name=systemd": false, + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "net_prio": false, + "blkio": false, + "perf_event": false, + "pids": false, + }, + }, + } + for _, td := range testTable { + mi := bytes.NewBufferString(td.mountInfo) + cgMounts, err := getCgroupMountsHelper(td.subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + cgMap := make(map[string]Mount) + for _, m := range cgMounts { + for _, ss := range m.Subsystems { + cgMap[ss] = m + } + } + for ss := range td.subsystems { + ss = strings.TrimPrefix(ss, CgroupNamePrefix) + m, ok := cgMap[ss] + if !ok { + t.Fatalf("%s not found", ss) + } + if m.Root != td.root { + t.Fatalf("unexpected root for %s: %s", ss, m.Root) + } + if !strings.HasPrefix(m.Mountpoint, "/sys/fs/cgroup/") && !strings.Contains(m.Mountpoint, ss) { 
+ t.Fatalf("unexpected mountpoint for %s: %s", ss, m.Mountpoint) + } + var ssFound bool + for _, mss := range m.Subsystems { + if mss == ss { + ssFound = true + break + } + } + if !ssFound { + t.Fatalf("subsystem %s not found in Subsystems field %v", ss, m.Subsystems) + } + } + // Test the all=true case. + + // Reset the test input. + mi = bytes.NewBufferString(td.mountInfo) + for k := range td.subsystems { + td.subsystems[k] = false + } + cgMountsAll, err := getCgroupMountsHelper(td.subsystems, mi, true) + if err != nil { + t.Fatal(err) + } + if td.all == 0 { + // Results with and without "all" should be the same. + if len(cgMounts) != len(cgMountsAll) || !reflect.DeepEqual(cgMounts, cgMountsAll) { + t.Errorf("expected same results, got (all=false) %v, (all=true) %v", cgMounts, cgMountsAll) + } + } else { + // Make sure we got all records. + if len(cgMountsAll) != td.all { + t.Errorf("expected %d records, got %d (%+v)", td.all, len(cgMountsAll), cgMountsAll) + } + } + + } +} + +func BenchmarkGetCgroupMounts(b *testing.B) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "hugetlb": false, + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + b.StopTimer() + mi := bytes.NewBufferString(fedoraMountinfo) + b.StartTimer() + if _, err := getCgroupMountsHelper(subsystems, mi, false); err != nil { + b.Fatal(err) + } + } +} + +func TestParseCgroupString(t *testing.T) { + testCases := []struct { + input string + expectedError error + expectedOutput map[string]string + }{ + { + // Taken from a CoreOS instance running systemd 225 with CPU/Mem + // accounting enabled in systemd + input: `9:blkio:/ +8:freezer:/ +7:perf_event:/ +6:devices:/system.slice/system-sshd.slice +5:cpuset:/ +4:cpu,cpuacct:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +3:net_cls,net_prio:/ 
+2:memory:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service +1:name=systemd:/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service`, + expectedOutput: map[string]string{ + "name=systemd": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "blkio": "/", + "freezer": "/", + "perf_event": "/", + "devices": "/system.slice/system-sshd.slice", + "cpuset": "/", + "cpu": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "cpuacct": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + "net_cls": "/", + "net_prio": "/", + "memory": "/system.slice/system-sshd.slice/sshd@126-10.240.0.15:22-xxx.yyy.zzz.aaa:33678.service", + }, + }, + { + input: `malformed input`, + expectedError: fmt.Errorf(`invalid cgroup entry: must contain at least two colons: malformed input`), + }, + } + + for ndx, testCase := range testCases { + out, err := parseCgroupFromReader(strings.NewReader(testCase.input)) + if err != nil { + if testCase.expectedError == nil || testCase.expectedError.Error() != err.Error() { + t.Errorf("%v: expected error %v, got error %v", ndx, testCase.expectedError, err) + } + } else { + if !reflect.DeepEqual(testCase.expectedOutput, out) { + t.Errorf("%v: expected output %v, got error %v", ndx, testCase.expectedOutput, out) + } + } + } + +} + +func TestIgnoreCgroup2Mount(t *testing.T) { + subsystems := map[string]bool{ + "cpuset": false, + "cpu": false, + "cpuacct": false, + "memory": false, + "devices": false, + "freezer": false, + "net_cls": false, + "blkio": false, + "perf_event": false, + "pids": false, + "name=systemd": false, + } + + mi := bytes.NewBufferString(cgroup2Mountinfo) + cgMounts, err := getCgroupMountsHelper(subsystems, mi, false) + if err != nil { + t.Fatal(err) + } + for _, m := range cgMounts { + if m.Mountpoint == "/sys/fs/cgroup/systemd" { + t.Errorf("parsed a cgroup2 mount 
at /sys/fs/cgroup/systemd instead of ignoring it") + } + } +} + +func TestFindCgroupMountpointAndRoot(t *testing.T) { + fakeMountInfo := ` +35 27 0:29 / /foo rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +35 27 0:29 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,devices +` + testCases := []struct { + cgroupPath string + output string + }{ + {cgroupPath: "/sys/fs", output: "/sys/fs/cgroup/devices"}, + {cgroupPath: "", output: "/foo"}, + } + + for _, c := range testCases { + mountpoint, _, _ := findCgroupMountpointAndRootFromReader(strings.NewReader(fakeMountInfo), c.cgroupPath, "devices") + if mountpoint != c.output { + t.Errorf("expected %s, got %s", c.output, mountpoint) + } + } +} + +func BenchmarkGetHugePageSize(b *testing.B) { + var ( + output []string + err error + ) + for i := 0; i < b.N; i++ { + output, err = GetHugePageSize() + } + if err != nil || len(output) == 0 { + b.Fatal("unexpected results") + } +} + +func BenchmarkGetHugePageSizeImpl(b *testing.B) { + var ( + input = []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"} + output []string + err error + ) + for i := 0; i < b.N; i++ { + output, err = getHugePageSizeFromFilenames(input) + } + if err != nil || len(output) != len(input) { + b.Fatal("unexpected results") + } +} + +func TestGetHugePageSizeImpl(t *testing.T) { + testCases := []struct { + input []string + output []string + isErr bool + isWarn bool + }{ + { + input: []string{"hugepages-1048576kB", "hugepages-2048kB", "hugepages-32768kB", "hugepages-64kB"}, + output: []string{"1GB", "2MB", "32MB", "64KB"}, + }, + { + input: []string{}, + output: []string{}, + }, + { // not a number + input: []string{"hugepages-akB"}, + isErr: true, + }, + { // no prefix (silently skipped) + input: []string{"1024kB"}, + }, + { // invalid prefix (silently skipped) + input: []string{"whatever-1024kB"}, + }, + { // invalid suffix (skipped with a warning) + input: 
[]string{"hugepages-1024gB"}, + isWarn: true, + }, + { // no suffix (skipped with a warning) + input: []string{"hugepages-1024"}, + isWarn: true, + }, + } + + // Need to catch warnings. + savedOut := logrus.StandardLogger().Out + defer logrus.SetOutput(savedOut) + warns := new(bytes.Buffer) + logrus.SetOutput(warns) + + for _, c := range testCases { + warns.Reset() + output, err := getHugePageSizeFromFilenames(c.input) + if err != nil { + if !c.isErr { + t.Errorf("input %v, expected nil, got error: %v", c.input, err) + } + // no more checks + continue + } + if c.isErr { + t.Errorf("input %v, expected error, got error: nil, output: %v", c.input, output) + // no more checks + continue + } + // check for warnings + if c.isWarn && warns.Len() == 0 { + t.Errorf("input %v, expected a warning, got none", c.input) + } + if !c.isWarn && warns.Len() > 0 { + t.Errorf("input %v, unexpected warning: %s", c.input, warns.String()) + } + // check output + if len(output) != len(c.output) || (len(output) > 0 && !reflect.DeepEqual(output, c.output)) { + t.Errorf("input %v, expected %v, got %v", c.input, c.output, output) + } + } +} + +func TestConvertBlkIOToCgroupV2Value(t *testing.T) { + cases := map[uint16]uint64{ + 0: 0, + 10: 1, + 1000: 10000, + } + for i, expected := range cases { + got := ConvertBlkIOToCgroupV2Value(i) + if got != expected { + t.Errorf("expected ConvertBlkIOToCgroupV2Value(%d) to be %d, got %d", i, expected, got) + } + } +} + +func TestConvertCPUSharesToCgroupV2Value(t *testing.T) { + cases := map[uint64]uint64{ + 0: 0, + 2: 1, + 262144: 10000, + } + for i, expected := range cases { + got := ConvertCPUSharesToCgroupV2Value(i) + if got != expected { + t.Errorf("expected ConvertCPUSharesToCgroupV2Value(%d) to be %d, got %d", i, expected, got) + } + } +} + +func TestConvertMemorySwapToCgroupV2Value(t *testing.T) { + cases := []struct { + memswap, memory int64 + expected int64 + expErr bool + }{ + { + memswap: 0, + memory: 0, + expected: 0, + }, + { + memswap: -1, 
+ memory: 0, + expected: -1, + }, + { + memswap: -1, + memory: -1, + expected: -1, + }, + { + memswap: -2, + memory: 0, + expErr: true, + }, + { + memswap: -1, + memory: 1000, + expected: -1, + }, + { + memswap: 1000, + memory: 1000, + expected: 0, + }, + { + memswap: 500, + memory: 200, + expected: 300, + }, + { + memswap: 300, + memory: 400, + expErr: true, + }, + { + memswap: 300, + memory: 0, + expErr: true, + }, + { + memswap: 300, + memory: -300, + expErr: true, + }, + { + memswap: 300, + memory: -1, + expErr: true, + }, + } + + for _, c := range cases { + swap, err := ConvertMemorySwapToCgroupV2Value(c.memswap, c.memory) + if c.expErr { + if err == nil { + t.Errorf("memswap: %d, memory %d, expected error, got %d, nil", c.memswap, c.memory, swap) + } + // no more checks + continue + } + if err != nil { + t.Errorf("memswap: %d, memory %d, expected success, got error %s", c.memswap, c.memory, err) + } + if swap != c.expected { + t.Errorf("memswap: %d, memory %d, expected %d, got %d", c.memswap, c.memory, c.expected, swap) + } + } +} diff --git a/sysbox-runc/libcontainer/cgroups/v1_utils.go b/sysbox-runc/libcontainer/cgroups/v1_utils.go new file mode 100644 index 00000000..f610ed8c --- /dev/null +++ b/sysbox-runc/libcontainer/cgroups/v1_utils.go @@ -0,0 +1,302 @@ +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + "syscall" + + securejoin "github.com/cyphar/filepath-securejoin" + "golang.org/x/sys/unix" +) + +// Code in this source file are specific to cgroup v1, +// and must not be used from any cgroup v2 code. 
+ +const ( + CgroupNamePrefix = "name=" + defaultPrefix = "/sys/fs/cgroup" +) + +var ( + errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") +) + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} + +func tryDefaultPath(cgroupPath, subsystem string) string { + if !strings.HasPrefix(defaultPrefix, cgroupPath) { + return "" + } + + // remove possible prefix + subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) + + // Make sure we're still under defaultPrefix, and resolve + // a possible symlink (like cpu -> cpu,cpuacct). + path, err := securejoin.SecureJoin(defaultPrefix, subsystem) + if err != nil { + return "" + } + + // (1) path should be a directory. + st, err := os.Lstat(path) + if err != nil || !st.IsDir() { + return "" + } + + // (2) path should be a mount point. + pst, err := os.Lstat(filepath.Dir(path)) + if err != nil { + return "" + } + + if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { + // parent dir has the same dev -- path is not a mount point + return "" + } + + // (3) path should have 'cgroup' fs type. + fst := unix.Statfs_t{} + err = unix.Statfs(path, &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return path +} + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // Avoid parsing mountinfo by trying the default path first, if possible. 
+ if path := tryDefaultPath(cgroupPath, subsystem); path != "" { + return path, nil + } + + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) + return mnt, err +} + +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { + if IsCgroup2UnifiedMode() { + return "", "", errUnified + } + + // Avoid parsing mountinfo by checking if subsystem is valid/available. + if !isSubsystemAvailable(subsystem) { + return "", "", NewNotFoundError(subsystem) + } + + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", "", err + } + defer f.Close() + + return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { + scanner := bufio.NewScanner(reader) + for scanner.Scan() { + txt := scanner.Text() + fields := strings.Fields(txt) + if len(fields) < 9 { + continue + } + if strings.HasPrefix(fields[4], cgroupPath) { + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + if opt == subsystem { + return fields[4], fields[3], nil + } + } + } + } + if err := scanner.Err(); err != nil { + return "", "", err + } + + return "", "", NewNotFoundError(subsystem) +} + +func isSubsystemAvailable(subsystem string) bool { + if IsCgroup2UnifiedMode() { + panic("don't call isSubsystemAvailable from cgroupv2 code") + } + + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return false + } + _, avail := cgroups[subsystem] + return avail +} + +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", fmt.Errorf("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + scanner := bufio.NewScanner(mi) + numFound := 0 + for scanner.Scan() && (all || numFound < 
len(ss)) { + txt := scanner.Text() + sepIdx := strings.Index(txt, " - ") + if sepIdx == -1 { + return nil, fmt.Errorf("invalid mountinfo format") + } + if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" { + continue + } + fields := strings.Split(txt, " ") + m := Mount{ + Mountpoint: fields[4], + Root: fields[3], + } + for _, opt := range strings.Split(fields[len(fields)-1], ",") { + seen, known := ss[opt] + if !known || (!all && seen) { + continue + } + ss[opt] = true + opt = strings.TrimPrefix(opt, CgroupNamePrefix) + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) + } + } + if err := scanner.Err(); err != nil { + return nil, err + } + return res, nil +} + +func getCgroupMountsV1(all bool) ([]Mount, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return nil, err + } + defer f.Close() + + allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range allSubsystems { + allMap[s] = false + } + return getCgroupMountsHelper(allMap, f, all) +} + +// GetOwnCgroup returns the relative path to the cgroup docker is running in. 
+func GetOwnCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + cgroups, err := ParseCgroupFile("/proc/1/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. 
+ relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +} diff --git a/sysbox-runc/libcontainer/configs/blkio_device.go b/sysbox-runc/libcontainer/configs/blkio_device.go new file mode 100644 index 00000000..fa195bf9 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/blkio_device.go @@ -0,0 +1,66 @@ +package configs + +import "fmt" + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) 
LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/sysbox-runc/libcontainer/configs/cgroup_linux.go b/sysbox-runc/libcontainer/configs/cgroup_linux.go new file mode 100644 index 00000000..66e5892c --- /dev/null +++ b/sysbox-runc/libcontainer/configs/cgroup_linux.go @@ -0,0 +1,137 @@ +package configs + +import ( + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/runc/libcontainer/devices" +) + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +type Cgroup struct { + // Deprecated, use Path instead + Name string `json:"name,omitempty"` + + // name of parent of cgroup or slice + // Deprecated, use Path instead + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. 
+ Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Paths represent the absolute cgroups paths to join. + // This takes precedence over Path. + Paths map[string]string + + // Resources contains various cgroups settings to apply + *Resources + + // SystemdProps are any additional properties for systemd, + // derived from org.systemd.property.xxx annotations. + // Ignored unless systemd is used for managing cgroups. + SystemdProps []systemdDbus.Property `json:"-"` +} + +type Resources struct { + // Devices is the set of access rules for devices in the container. + Devices []*devices.Rule `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. 
+ BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. + BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Rdma resource restriction configuration + Rdma map[string]LinuxRdma `json:"rdma"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // Unified is cgroupv2-only key-value map. + Unified map[string]string `json:"unified"` + + // SkipDevices allows to skip configuring device permissions. + // Used by e.g. 
kubelet while creating a parent cgroup (kubepods) + // common for many containers. + // + // NOTE it is impossible to start a container which has this flag set. + SkipDevices bool `json:"skip_devices"` +} diff --git a/sysbox-runc/libcontainer/configs/cgroup_unsupported.go b/sysbox-runc/libcontainer/configs/cgroup_unsupported.go new file mode 100644 index 00000000..c0c23d70 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/cgroup_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux + +package configs + +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct { +} diff --git a/sysbox-runc/libcontainer/configs/config.go b/sysbox-runc/libcontainer/configs/config.go new file mode 100644 index 00000000..1295290a --- /dev/null +++ b/sysbox-runc/libcontainer/configs/config.go @@ -0,0 +1,432 @@ +package configs + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "time" + + sh "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/nestybox/sysbox-libs/shiftfs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" +) + +type Rlimit struct { + Type int `json:"type"` + Hard uint64 `json:"hard"` + Soft uint64 `json:"soft"` +} + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id"` + HostID int `json:"host_id"` + Size int `json:"size"` +} + +// Seccomp represents syscall restrictions +// By default, only the native architecture of the kernel is allowed to be used +// for syscalls. Additional architectures can be added by specifying them in +// Architectures. 
+type Seccomp struct { + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Flags []specs.LinuxSeccompFlag `json:"flags"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` + ListenerPath string `json:"listener_path,omitempty"` + ListenerMetadata string `json:"listener_metadata,omitempty"` +} + +// Action is taken upon rule match in Seccomp +type Action int + +const ( + Kill Action = iota + 1 + Errno + Trap + Allow + Trace + Log + Notify + KillThread + KillProcess +) + +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp +type Operator int + +const ( + EqualTo Operator = iota + 1 + NotEqualTo + GreaterThan + GreaterThanOrEqualTo + LessThan + LessThanOrEqualTo + MaskEqualTo +) + +// Arg is a rule to match a specific syscall argument in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"value_two"` + Op Operator `json:"op"` +} + +// Syscall is a rule to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + ErrnoRet *uint `json:"errnoRet"` + Args []*Arg `json:"args"` +} + +// TODO Windows. Many of these fields should be factored out into those parts +// which are common across platforms, and those which are platform specific. + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root"` + + // ParentDeathSignal specifies the signal that is sent to the container's process in the case + // that the parent process dies. + ParentDeathSignal int `json:"parent_death_signal"` + + // Path to a directory containing the container's root filesystem. 
+ Rootfs string `json:"rootfs"` + + // Umask is the umask to use inside of the container. + Umask *uint32 `json:"umask"` + + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable. + Readonlyfs bool `json:"readonlyfs"` + + // Specifies the mount propagation flags to be applied to /. + RootPropagation int `json:"rootPropagation"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + Devices []*devices.Device `json:"devices"` + + MountLabel string `json:"mount_label"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces Namespaces `json:"namespaces"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *Capabilities `json:"capabilities"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *Cgroup `json:"cgroups"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at 
the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []Rlimit `json:"rlimits,omitempty"` + + // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores + // for a process. Valid values are between the range [-1000, '1000'], where processes with + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. + // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ + OomScoreAdj *int `json:"oom_score_adj,omitempty"` + + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings"` + + // MaskPaths specifies paths within the container's rootfs to mask over with a bind + // mount pointing to /dev/null as to prevent reads of the file. + MaskPaths []string `json:"mask_paths"` + + // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only + // so that these files prevent any writes. + ReadonlyPaths []string `json:"readonly_paths"` + + // Sysctl is a map of properties and their values. It is the equivalent of using + // sysctl -w my.property.name value in Linux. + Sysctl map[string]string `json:"sysctl"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // A number of rules are given, each having an action to be taken if a syscall matches it. + // A default action to be taken if no rules match is also given. 
+ Seccomp *Seccomp `json:"seccomp"` + + // sysbox-runc: Seccomp notification actions for syscall trapping inside the sys container. + SeccompNotif *Seccomp `json:"seccomp_notif"` + + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + + // Hooks are a collection of actions to perform at various container lifecycle events. + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks Hooks + + // Version is the version of opencontainer specification that is supported. + Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` + + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. + // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. 
+ RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + + // RootfsUidShiftType indicates the type of fs ID shifting to do on the rootfs + RootfsUidShiftType sh.IDShiftType `json:"rootfs_uid_shift_type,omitempty"` + + // BindMntUidShiftType indicates the type of fs ID shifting to do on container bind-mounts + BindMntUidShiftType sh.IDShiftType `json:"bindmnt_uid_shift_type,omitempty"` + + // ShiftfsMounts is a list of directories on which shiftfs needs to be mounted + ShiftfsMounts []shiftfs.MountPoint `json:"shiftfs_mounts,omitempty"` + + // SwitchDockerDns indicates if the containers should change the IP address + // of Docker DNS hosts with localhost addresses. + SwitchDockerDns bool `json:"switch_docker_dns,omitempty"` + + // FsState slice is utilized to host file-system state (e.g. dir, file, softlinks, + // etc) to be created in container's rootfs during initialization. + FsState []FsEntry `json:"fs_state,omitempty"` + + // Indicates if the rootfs was cloned by the sysbox-mgr. + RootfsCloned bool `json:"rootfs_cloned"` + + // Indicates if container setup should fail when we detect a filesystem uid mapping error + FsuidMapFailOnErr bool `json:"fsuid_map_fail_on_err"` + + // IDShiftIgnoreList is a list of container paths over which no UID/GID shifting mechanism must + // be applied. + IDshiftIgnoreList []string `json:"idshift_ignore_list,omitempty"` +} + +type HookName string +type HookList []Hook +type Hooks map[HookName]HookList + +const ( + // Prestart commands are executed after the container namespaces are created, + // but before the user supplied command is executed from init. + // Note: This hook is now deprecated + // Prestart commands are called in the Runtime namespace. + Prestart HookName = "prestart" + + // CreateRuntime commands MUST be called as part of the create operation after + // the runtime environment has been created but before the pivot_root has been executed. 
+ // CreateRuntime is called immediately after the deprecated Prestart hook. + // CreateRuntime commands are called in the Runtime Namespace. + CreateRuntime = "createRuntime" + + // CreateContainer commands MUST be called as part of the create operation after + // the runtime environment has been created but before the pivot_root has been executed. + // CreateContainer commands are called in the Container namespace. + CreateContainer = "createContainer" + + // StartContainer commands MUST be called as part of the start operation and before + // the container process is started. + // StartContainer commands are called in the Container namespace. + StartContainer = "startContainer" + + // Poststart commands are executed after the container init process starts. + // Poststart commands are called in the Runtime Namespace. + Poststart = "poststart" + + // Poststop commands are executed after the container init process exits. + // Poststop commands are called in the Runtime Namespace. + Poststop = "poststop" +) + +type Capabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string + // Effective is the set of capabilities checked by the kernel. + Effective []string + // Inheritable is the capabilities preserved across execve. + Inheritable []string + // Permitted is the limiting superset for effective capabilities. + Permitted []string + // Ambient is the ambient set of capabilities that are kept. 
+ Ambient []string +} + +func (hooks HookList) RunHooks(state *specs.State) error { + for i, h := range hooks { + if err := h.Run(state); err != nil { + return errors.Wrapf(err, "Running hook #%d:", i) + } + } + + return nil +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state map[HookName][]CommandHook + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + *hooks = Hooks{} + for n, commandHooks := range state { + if len(commandHooks) == 0 { + continue + } + + (*hooks)[n] = HookList{} + for _, h := range commandHooks { + (*hooks)[n] = append((*hooks)[n], h) + } + } + + return nil +} + +func (hooks *Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize((*hooks)[Prestart]), + "createRuntime": serialize((*hooks)[CreateRuntime]), + "createContainer": serialize((*hooks)[CreateContainer]), + "startContainer": serialize((*hooks)[StartContainer]), + "poststart": serialize((*hooks)[Poststart]), + "poststop": serialize((*hooks)[Poststop]), + }) +} + +type Hook interface { + // Run executes the hook with the provided state. + Run(*specs.State) error +} + +// NewFunctionHook will call the provided function when the hook is run. 
+func NewFunctionHook(f func(*specs.State) error) FuncHook { + return FuncHook{ + run: f, + } +} + +type FuncHook struct { + run func(*specs.State) error +} + +func (f FuncHook) Run(s *specs.State) error { + return f.run(s) +} + +type Command struct { + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` +} + +// NewCommandHook will execute the provided command when the hook is run. +func NewCommandHook(cmd Command) CommandHook { + return CommandHook{ + Command: cmd, + } +} + +type CommandHook struct { + Command +} + +func (c Command) Run(s *specs.State) error { + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: c.Path, + Args: c.Args, + Env: c.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + errC := make(chan error, 1) + go func() { + err := cmd.Wait() + if err != nil { + err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + } + errC <- err + }() + var timerCh <-chan time.Time + if c.Timeout != nil { + timer := time.NewTimer(*c.Timeout) + defer timer.Stop() + timerCh = timer.C + } + select { + case err := <-errC: + return err + case <-timerCh: + cmd.Process.Kill() + cmd.Wait() + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } +} diff --git a/sysbox-runc/libcontainer/configs/config_linux.go b/sysbox-runc/libcontainer/configs/config_linux.go new file mode 100644 index 00000000..07da1080 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/config_linux.go @@ -0,0 +1,61 @@ +package configs + +import "fmt" + +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. 
+func (c Config) HostUID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.UidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.UidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootUID gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostGID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.GidMappings == nil { + return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") + } + id, found := c.hostIDFromMapping(containerId, c.GidMappings) + if !found { + return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. 
+func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} diff --git a/sysbox-runc/libcontainer/configs/config_linux_test.go b/sysbox-runc/libcontainer/configs/config_linux_test.go new file mode 100644 index 00000000..7b8dc57f --- /dev/null +++ b/sysbox-runc/libcontainer/configs/config_linux_test.go @@ -0,0 +1,83 @@ +package configs + +import ( + "testing" +) + +func TestRemoveNamespace(t *testing.T) { + ns := Namespaces{ + {Type: NEWNET}, + } + if !ns.Remove(NEWNET) { + t.Fatal("NEWNET was not removed") + } + if len(ns) != 0 { + t.Fatalf("namespaces should have 0 items but reports %d", len(ns)) + } +} + +func TestHostRootUIDNoUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{}, + } + uid, err := config.HostRootUID() + if err != nil { + t.Fatal(err) + } + if uid != 0 { + t.Fatalf("expected uid 0 with no USERNS but received %d", uid) + } +} + +func TestHostRootUIDWithUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{{Type: NEWUSER}}, + UidMappings: []IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 1, + }, + }, + } + uid, err := config.HostRootUID() + if err != nil { + t.Fatal(err) + } + if uid != 1000 { + t.Fatalf("expected uid 1000 with no USERNS but received %d", uid) + } +} + +func TestHostRootGIDNoUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{}, + } + uid, err := config.HostRootGID() + if err != nil { + t.Fatal(err) + } + if uid != 0 { + t.Fatalf("expected gid 0 with no USERNS but received %d", uid) + } +} + +func TestHostRootGIDWithUSERNS(t *testing.T) { + config := &Config{ + Namespaces: Namespaces{{Type: NEWUSER}}, + GidMappings: []IDMap{ + { + ContainerID: 0, + HostID: 1000, + Size: 1, + }, + }, + } + uid, err := config.HostRootGID() + if err != nil { + 
t.Fatal(err) + } + if uid != 1000 { + t.Fatalf("expected gid 1000 with no USERNS but received %d", uid) + } +} diff --git a/sysbox-runc/libcontainer/configs/config_test.go b/sysbox-runc/libcontainer/configs/config_test.go new file mode 100644 index 00000000..1eb87433 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/config_test.go @@ -0,0 +1,237 @@ +package configs_test + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "os" + "reflect" + "testing" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" +) + +var ( + hookNameList = []configs.HookName{ + configs.Prestart, + configs.CreateRuntime, + configs.CreateContainer, + configs.StartContainer, + configs.Poststart, + configs.Poststop, + } +) + +func TestUnmarshalHooks(t *testing.T) { + timeout := time.Second + + hookCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/hook", + Args: []string{"--pid=123"}, + Env: []string{"FOO=BAR"}, + Dir: "/var/vcap", + Timeout: &timeout, + }) + + hookJson, err := json.Marshal(hookCmd) + if err != nil { + t.Fatal(err) + } + + for _, hookName := range hookNameList { + hooks := configs.Hooks{} + err = hooks.UnmarshalJSON([]byte(fmt.Sprintf(`{"%s" :[%s]}`, hookName, hookJson))) + if err != nil { + t.Fatal(err) + } + + if !reflect.DeepEqual(hooks[hookName], configs.HookList{hookCmd}) { + t.Errorf("Expected %s to equal %+v but it was %+v", hookName, hookCmd, hooks[hookName]) + } + } +} + +func TestUnmarshalHooksWithInvalidData(t *testing.T) { + hook := configs.Hooks{} + err := hook.UnmarshalJSON([]byte(`{invalid-json}`)) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestMarshalHooks(t *testing.T) { + timeout := time.Second + + hookCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/hook", + Args: []string{"--pid=123"}, + Env: []string{"FOO=BAR"}, + Dir: "/var/vcap", + Timeout: &timeout, + }) + + hook := configs.Hooks{ + configs.Prestart: 
configs.HookList{hookCmd}, + configs.CreateRuntime: configs.HookList{hookCmd}, + configs.CreateContainer: configs.HookList{hookCmd}, + configs.StartContainer: configs.HookList{hookCmd}, + configs.Poststart: configs.HookList{hookCmd}, + configs.Poststop: configs.HookList{hookCmd}, + } + hooks, err := hook.MarshalJSON() + if err != nil { + t.Fatal(err) + } + + // Note Marshal seems to output fields in alphabetical order + hookCmdJson := `[{"path":"/var/vcap/hooks/hook","args":["--pid=123"],"env":["FOO=BAR"],"dir":"/var/vcap","timeout":1000000000}]` + h := fmt.Sprintf(`{"createContainer":%[1]s,"createRuntime":%[1]s,"poststart":%[1]s,"poststop":%[1]s,"prestart":%[1]s,"startContainer":%[1]s}`, hookCmdJson) + if string(hooks) != h { + t.Errorf("Expected hooks %s to equal %s", string(hooks), h) + } +} + +func TestMarshalUnmarshalHooks(t *testing.T) { + timeout := time.Second + + hookCmd := configs.NewCommandHook(configs.Command{ + Path: "/var/vcap/hooks/hook", + Args: []string{"--pid=123"}, + Env: []string{"FOO=BAR"}, + Dir: "/var/vcap", + Timeout: &timeout, + }) + + hook := configs.Hooks{ + configs.Prestart: configs.HookList{hookCmd}, + configs.CreateRuntime: configs.HookList{hookCmd}, + configs.CreateContainer: configs.HookList{hookCmd}, + configs.StartContainer: configs.HookList{hookCmd}, + configs.Poststart: configs.HookList{hookCmd}, + configs.Poststop: configs.HookList{hookCmd}, + } + hooks, err := hook.MarshalJSON() + if err != nil { + t.Fatal(err) + } + + umMhook := configs.Hooks{} + err = umMhook.UnmarshalJSON(hooks) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(umMhook, hook) { + t.Errorf("Expected hooks to be equal after mashaling -> unmarshaling them: %+v, %+v", umMhook, hook) + } +} + +func TestMarshalHooksWithUnexpectedType(t *testing.T) { + fHook := configs.NewFunctionHook(func(*specs.State) error { + return nil + }) + hook := configs.Hooks{ + configs.CreateRuntime: configs.HookList{fHook}, + } + hooks, err := hook.MarshalJSON() + if err != 
nil { + t.Fatal(err) + } + + h := `{"createContainer":null,"createRuntime":null,"poststart":null,"poststop":null,"prestart":null,"startContainer":null}` + if string(hooks) != h { + t.Errorf("Expected hooks %s to equal %s", string(hooks), h) + } +} + +func TestFuncHookRun(t *testing.T) { + state := &specs.State{ + Version: "1", + ID: "1", + Status: "created", + Pid: 1, + Bundle: "/bundle", + } + + fHook := configs.NewFunctionHook(func(s *specs.State) error { + if !reflect.DeepEqual(state, s) { + t.Errorf("Expected state %+v to equal %+v", state, s) + } + return nil + }) + + fHook.Run(state) +} + +func TestCommandHookRun(t *testing.T) { + state := &specs.State{ + Version: "1", + ID: "1", + Status: "created", + Pid: 1, + Bundle: "/bundle", + } + + stateJson, err := json.Marshal(state) + if err != nil { + t.Fatal(err) + } + + verifyCommandTemplate := `#!/bin/sh +if [ "$1" != "testarg" ]; then + echo "Bad value for $1. Expected 'testarg', found '$1'" + exit 1 +fi +if [ -z "$FOO" ] || [ "$FOO" != BAR ]; then + echo "Bad value for FOO. Expected 'BAR', found '$FOO'" + exit 1 +fi +expectedJson=%q +read JSON +if [ "$JSON" != "$expectedJson" ]; then + echo "Bad JSON received. 
Expected '$expectedJson', found '$JSON'" + exit 1 +fi +exit 0 + ` + verifyCommand := fmt.Sprintf(verifyCommandTemplate, stateJson) + filename := "/tmp/runc-hooktest.sh" + os.Remove(filename) + if err := ioutil.WriteFile(filename, []byte(verifyCommand), 0700); err != nil { + t.Fatalf("Failed to create tmp file: %v", err) + } + defer os.Remove(filename) + + cmdHook := configs.NewCommandHook(configs.Command{ + Path: filename, + Args: []string{filename, "testarg"}, + Env: []string{"FOO=BAR"}, + Dir: "/", + }) + + if err := cmdHook.Run(state); err != nil { + t.Errorf(fmt.Sprintf("Expected error to not occur but it was %+v", err)) + } +} + +func TestCommandHookRunTimeout(t *testing.T) { + state := &specs.State{ + Version: "1", + ID: "1", + Status: "created", + Pid: 1, + Bundle: "/bundle", + } + timeout := 100 * time.Millisecond + + cmdHook := configs.NewCommandHook(configs.Command{ + Path: "/bin/sleep", + Args: []string{"/bin/sleep", "1"}, + Timeout: &timeout, + }) + + if err := cmdHook.Run(state); err == nil { + t.Error("Expected error to occur but it was nil") + } +} diff --git a/sysbox-runc/libcontainer/configs/config_windows_test.go b/sysbox-runc/libcontainer/configs/config_windows_test.go new file mode 100644 index 00000000..1a0c8fa2 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/config_windows_test.go @@ -0,0 +1,3 @@ +package configs + +// All current tests are for Unix-specific functionality diff --git a/sysbox-runc/libcontainer/configs/devices.go b/sysbox-runc/libcontainer/configs/devices.go new file mode 100644 index 00000000..b9e3664c --- /dev/null +++ b/sysbox-runc/libcontainer/configs/devices.go @@ -0,0 +1,17 @@ +package configs + +import "github.com/opencontainers/runc/libcontainer/devices" + +type ( + // Deprecated: use libcontainer/devices.Device + Device = devices.Device + + // Deprecated: use libcontainer/devices.Rule + DeviceRule = devices.Rule + + // Deprecated: use libcontainer/devices.Type + DeviceType = devices.Type + + // Deprecated: use 
libcontainer/devices.Permissions + DevicePermissions = devices.Permissions +) diff --git a/sysbox-runc/libcontainer/configs/fsentry.go b/sysbox-runc/libcontainer/configs/fsentry.go new file mode 100644 index 00000000..6cdfc43f --- /dev/null +++ b/sysbox-runc/libcontainer/configs/fsentry.go @@ -0,0 +1,116 @@ +// +// Copyright 2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package configs + +import "os" + +type FsEntryKind uint32 + +const ( + InvalidFsKind FsEntryKind = iota + FileFsKind + DirFsKind + SoftlinkFsKind +) + +// +// FsEntry type is utilized to hold file-system state (e.g. dir, file, softlinks, +// etc) to be created in container's rootfs. +// +type FsEntry struct { + Kind FsEntryKind + Path string // holds the path + name of the fsentry + Mode os.FileMode // regular filemode + Dst string // only relevant in SoftlinkFsKind types +} + +func NewFsEntry(path, dst string, mode os.FileMode, kind FsEntryKind) *FsEntry { + + entry := &FsEntry{ + Kind: kind, + Path: path, + Mode: mode, + Dst: dst, + } + + return entry +} + +func (e *FsEntry) Add() error { + + switch e.Kind { + + case FileFsKind: + // Check if file exists. + var _, err = os.Stat(e.Path) + + // Create file if not exits. 
+ if os.IsNotExist(err) { + file, err := os.OpenFile(e.Path, os.O_RDWR|os.O_CREATE, e.Mode) + if err != nil { + return err + } + defer file.Close() + } + + case DirFsKind: + if err := os.MkdirAll(e.Path, e.Mode); err != nil { + return err + } + + case SoftlinkFsKind: + // Check if softlink exists. + var _, err = os.Lstat(e.Path) + + // Create softlink if not present. + if os.IsNotExist(err) { + // In Linux softlink permissions are irrelevant; i.e. changing a + // permission on a symbolic link by chmod() will simply act as if it + // was performed against the target of the symbolic link, so we are + // obviating it here. + if err := os.Symlink(e.Dst, e.Path); err != nil { + return err + } + } + } + + return nil +} + +func (e *FsEntry) Remove() error { + if err := os.RemoveAll(e.Path); err != nil { + return err + } + + return nil +} + +func (e *FsEntry) GetPath() string { + return e.Path +} + +func (e *FsEntry) GetMode() os.FileMode { + return e.Mode +} + +func (e *FsEntry) GetKind() FsEntryKind { + return e.Kind +} + +func (e *FsEntry) GetDest() string { + return e.Dst +} diff --git a/sysbox-runc/libcontainer/configs/hugepage_limit.go b/sysbox-runc/libcontainer/configs/hugepage_limit.go new file mode 100644 index 00000000..d3021638 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/hugepage_limit.go @@ -0,0 +1,9 @@ +package configs + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/sysbox-runc/libcontainer/configs/intelrdt.go b/sysbox-runc/libcontainer/configs/intelrdt.go new file mode 100644 index 00000000..57e9f037 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/intelrdt.go @@ -0,0 +1,13 @@ +package configs + +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." 
+ L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth per L3 cache id + // Format: "MB:=bandwidth0;=bandwidth1;..." + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` +} diff --git a/sysbox-runc/libcontainer/configs/interface_priority_map.go b/sysbox-runc/libcontainer/configs/interface_priority_map.go new file mode 100644 index 00000000..9a0395ea --- /dev/null +++ b/sysbox-runc/libcontainer/configs/interface_priority_map.go @@ -0,0 +1,14 @@ +package configs + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/sysbox-runc/libcontainer/configs/mount.go b/sysbox-runc/libcontainer/configs/mount.go new file mode 100644 index 00000000..04847b8e --- /dev/null +++ b/sysbox-runc/libcontainer/configs/mount.go @@ -0,0 +1,45 @@ +package configs + +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota +) + +type BindSrcInfo struct { + IsDir bool `json:"is_dir,omitempty"` + Uid uint32 `json:"uid,omitempty"` + Gid uint32 `json:"gid,omitempty"` +} + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. 
+ Relabel string `json:"relabel"` + + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + + // Bind mount source info + BindSrcInfo BindSrcInfo `json:"bind_src_info,omitempty"` + + // Indicates if mounts is to be ID-mapped (see mount_setattr(2) in Linux >= 5.12). + IDMappedMount bool `json:"idmap_mount"` +} diff --git a/sysbox-runc/libcontainer/configs/namespaces.go b/sysbox-runc/libcontainer/configs/namespaces.go new file mode 100644 index 00000000..a3329a31 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/namespaces.go @@ -0,0 +1,5 @@ +package configs + +type NamespaceType string + +type Namespaces []Namespace diff --git a/sysbox-runc/libcontainer/configs/namespaces_linux.go b/sysbox-runc/libcontainer/configs/namespaces_linux.go new file mode 100644 index 00000000..d52d6fcd --- /dev/null +++ b/sysbox-runc/libcontainer/configs/namespaces_linux.go @@ -0,0 +1,126 @@ +package configs + +import ( + "fmt" + "os" + "sync" +) + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" + NEWCGROUP NamespaceType = "NEWCGROUP" +) + +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + case NEWCGROUP: + return "cgroup" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := NsName(ns) + // if the namespace type is unknown, just return 
false + if nsFile == "" { + return false + } + _, err := os.Stat("/proc/self/ns/" + nsFile) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + +func NamespaceTypes() []NamespaceType { + return []NamespaceType{ + NEWUSER, // Keep user NS always first, don't move it. + NEWIPC, + NEWUTS, + NEWNET, + NEWPID, + NEWNS, + NEWCGROUP, + } +} + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { + Type NamespaceType `json:"type"` + Path string `json:"path"` +} + +func (n *Namespace) GetPath(pid int) string { + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) +} + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) + return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 +} + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/sysbox-runc/libcontainer/configs/namespaces_syscall.go b/sysbox-runc/libcontainer/configs/namespaces_syscall.go new file mode 100644 index 00000000..2dc7adfc --- /dev/null +++ b/sysbox-runc/libcontainer/configs/namespaces_syscall.go @@ -0,0 +1,32 @@ +// +build linux + +package configs + +import "golang.org/x/sys/unix" + +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: 
unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, + NEWCGROUP: unix.CLONE_NEWCGROUP, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/sysbox-runc/libcontainer/configs/namespaces_syscall_unsupported.go b/sysbox-runc/libcontainer/configs/namespaces_syscall_unsupported.go new file mode 100644 index 00000000..5d9a5c81 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -0,0 +1,13 @@ +// +build !linux,!windows + +package configs + +func (n *Namespace) Syscall() int { + panic("No namespace syscall support") +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + panic("No namespace syscall support") +} diff --git a/sysbox-runc/libcontainer/configs/namespaces_unsupported.go b/sysbox-runc/libcontainer/configs/namespaces_unsupported.go new file mode 100644 index 00000000..19bf713d --- /dev/null +++ b/sysbox-runc/libcontainer/configs/namespaces_unsupported.go @@ -0,0 +1,8 @@ +// +build !linux + +package configs + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. 
+type Namespace struct { +} diff --git a/sysbox-runc/libcontainer/configs/network.go b/sysbox-runc/libcontainer/configs/network.go new file mode 100644 index 00000000..ccdb228e --- /dev/null +++ b/sysbox-runc/libcontainer/configs/network.go @@ -0,0 +1,72 @@ +package configs + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omitted from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type"` + + // Name of the network interface + Name string `json:"name"` + + // The bridge to use. + Bridge string `json:"bridge"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu"` + + // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + TxQueueLen int `json:"txqueuelen"` + + // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the + // container. 
+ HostInterfaceName string `json:"host_interface_name"` + + // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface + // bridge port in the case of type veth + // Note: This is unsupported on some systems. + // Note: This does not apply to loopback interfaces. + HairpinMode bool `json:"hairpin_mode"` +} + +// Routes can be specified to create entries in the route table as the container is started +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and omitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Sets the destination and mask, should be a CIDR. Accepts IPv4 and IPv6 + Destination string `json:"destination"` + + // Sets the source and mask, should be a CIDR. Accepts IPv4 and IPv6 + Source string `json:"source"` + + // Sets the gateway. Accepts IPv4 and IPv6 + Gateway string `json:"gateway"` + + // The device to set this route up for, for example: eth0 + InterfaceName string `json:"interface_name"` +} diff --git a/sysbox-runc/libcontainer/configs/rdma.go b/sysbox-runc/libcontainer/configs/rdma.go new file mode 100644 index 00000000..c69f2c80 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/rdma.go @@ -0,0 +1,9 @@ +package configs + +// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) +type LinuxRdma struct { + // Maximum number of HCA handles that can be opened. Default is "no limit". + HcaHandles *uint32 `json:"hca_handles,omitempty"` + // Maximum number of HCA objects that can be created. Default is "no limit". 
+ HcaObjects *uint32 `json:"hca_objects,omitempty"` +} diff --git a/sysbox-runc/libcontainer/configs/validate/rootless.go b/sysbox-runc/libcontainer/configs/validate/rootless.go new file mode 100644 index 00000000..717d0f00 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/validate/rootless.go @@ -0,0 +1,90 @@ +package validate + +import ( + "errors" + "fmt" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +// rootlessEUID makes sure that the config can be applied when runc +// is being executed as a non-root user (euid != 0) in the current user namespace. +func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { + if err := rootlessEUIDMappings(config); err != nil { + return err + } + if err := rootlessEUIDMount(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. + + return nil +} + +func hasIDMapping(id int, mappings []configs.IDMap) bool { + for _, m := range mappings { + if id >= m.ContainerID && id < m.ContainerID+m.Size { + return true + } + } + return false +} + +func rootlessEUIDMappings(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWUSER) { + return errors.New("rootless container requires user namespaces") + } + + if len(config.UidMappings) == 0 { + return errors.New("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return errors.New("rootless containers requires at least one GID mapping") + } + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. 
+func rootlessEUIDMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") { + var uid int + n, err := fmt.Sscanf(opt, "uid=%d", &uid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(uid, config.UidMappings) { + return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers") + } + } + + if strings.HasPrefix(opt, "gid=") { + var gid int + n, err := fmt.Sscanf(opt, "gid=%d", &gid) + if n != 1 || err != nil { + // Ignore unknown mount options. + continue + } + if !hasIDMapping(gid, config.GidMappings) { + return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers") + } + } + } + } + + return nil +} diff --git a/sysbox-runc/libcontainer/configs/validate/rootless_test.go b/sysbox-runc/libcontainer/configs/validate/rootless_test.go new file mode 100644 index 00000000..59d15575 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/validate/rootless_test.go @@ -0,0 +1,155 @@ +package validate + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func rootlessEUIDConfig() *configs.Config { + return &configs.Config{ + Rootfs: "/var", + RootlessEUID: true, + RootlessCgroups: true, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUSER}, + }, + ), + UidMappings: []configs.IDMap{ + { + HostID: 1337, + ContainerID: 0, + Size: 1, + }, + }, + GidMappings: []configs.IDMap{ + { + HostID: 7331, + ContainerID: 0, + Size: 1, + }, + }, + } +} + +func TestValidateRootlessEUID(t *testing.T) { + validator := New() + + config := 
rootlessEUIDConfig() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +/* rootlessEUIDMappings */ + +func TestValidateRootlessEUIDUserns(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + config.Namespaces = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if user namespaces not set") + } +} + +func TestValidateRootlessEUIDMappingUid(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + config.UidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no uid mappings provided") + } +} + +func TestValidateNonZeroEUIDMappingGid(t *testing.T) { + validator := New() + + config := rootlessEUIDConfig() + config.GidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no gid mappings provided") + } +} + +/* rootlessEUIDMount() */ + +func TestValidateRootlessEUIDMountUid(t *testing.T) { + config := rootlessEUIDConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "uid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting uid=5 in mount options") + } + + config.Mounts[0].Data = "uid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err) + } + + config.Mounts[0].Data = "uid=2" + config.UidMappings[0].Size = 10 + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting uid=2 in mount options and UidMapping[0].size is 10") + } + + config.Mounts[0].Data = "uid=20" + 
config.UidMappings[0].Size = 10 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting uid=20 in mount options and UidMapping[0].size is 10") + } +} + +func TestValidateRootlessEUIDMountGid(t *testing.T) { + config := rootlessEUIDConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "gid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting gid=5 in mount options") + } + + config.Mounts[0].Data = "gid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err) + } + + config.Mounts[0].Data = "gid=5" + config.GidMappings[0].Size = 10 + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting gid=5 in mount options and GidMapping[0].size is 10") + } + + config.Mounts[0].Data = "gid=11" + config.GidMappings[0].Size = 10 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting gid=11 in mount options and GidMapping[0].size is 10") + } +} diff --git a/sysbox-runc/libcontainer/configs/validate/validator.go b/sysbox-runc/libcontainer/configs/validate/validator.go new file mode 100644 index 00000000..63abdb00 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/validate/validator.go @@ -0,0 +1,239 @@ +package validate + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + selinux "github.com/opencontainers/selinux/go-selinux" + "golang.org/x/sys/unix" +) + +type Validator interface { + 
Validate(*configs.Config) error +} + +func New() Validator { + return &ConfigValidator{} +} + +type ConfigValidator struct { +} + +func (v *ConfigValidator) Validate(config *configs.Config) error { + if err := v.rootfs(config); err != nil { + return err + } + if err := v.network(config); err != nil { + return err + } + if err := v.hostname(config); err != nil { + return err + } + if err := v.security(config); err != nil { + return err + } + if err := v.usernamespace(config); err != nil { + return err + } + if err := v.cgroupnamespace(config); err != nil { + return err + } + if err := v.sysctl(config); err != nil { + return err + } + if err := v.intelrdt(config); err != nil { + return err + } + if config.RootlessEUID { + if err := v.rootlessEUID(config); err != nil { + return err + } + } + return nil +} + +// rootfs validates if the rootfs is an absolute path and is not a symlink +// to the container's root filesystem. +func (v *ConfigValidator) rootfs(config *configs.Config) error { + if _, err := os.Stat(config.Rootfs); err != nil { + if os.IsNotExist(err) { + return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs) + } + return err + } + cleaned, err := filepath.Abs(config.Rootfs) + if err != nil { + return err + } + if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil { + return err + } + if filepath.Clean(config.Rootfs) != cleaned { + return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs) + } + return nil +} + +func (v *ConfigValidator) network(config *configs.Config) error { + if !config.Namespaces.Contains(configs.NEWNET) { + if len(config.Networks) > 0 || len(config.Routes) > 0 { + return errors.New("unable to apply network settings without a private NET namespace") + } + } + return nil +} + +func (v *ConfigValidator) hostname(config *configs.Config) error { + if config.Hostname != "" && !config.Namespaces.Contains(configs.NEWUTS) { + return errors.New("unable to set hostname without a private UTS namespace") + } + 
return nil +} + +func (v *ConfigValidator) security(config *configs.Config) error { + // restrict sys without mount namespace + if (len(config.MaskPaths) > 0 || len(config.ReadonlyPaths) > 0) && + !config.Namespaces.Contains(configs.NEWNS) { + return errors.New("unable to restrict sys entries without a private MNT namespace") + } + if config.ProcessLabel != "" && !selinux.GetEnabled() { + return errors.New("selinux label is specified in config, but selinux is disabled or not supported") + } + + return nil +} + +func (v *ConfigValidator) usernamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWUSER) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return errors.New("USER namespaces aren't enabled in the kernel") + } + } else { + if config.UidMappings != nil || config.GidMappings != nil { + return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config") + } + } + return nil +} + +func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error { + if config.Namespaces.Contains(configs.NEWCGROUP) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + return errors.New("cgroup namespaces aren't enabled in the kernel") + } + } + return nil +} + +// sysctl validates that the specified sysctl keys are valid or not. +// /proc/sys isn't completely namespaced and depending on which namespaces +// are specified, a subset of sysctls are permitted. 
+func (v *ConfigValidator) sysctl(config *configs.Config) error { + validSysctlMap := map[string]bool{ + "kernel.msgmax": true, + "kernel.msgmnb": true, + "kernel.msgmni": true, + "kernel.sem": true, + "kernel.shmall": true, + "kernel.shmmax": true, + "kernel.shmmni": true, + "kernel.shm_rmid_forced": true, + } + + var ( + netOnce sync.Once + hostnet bool + hostnetErr error + ) + + for s := range config.Sysctl { + if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") { + if config.Namespaces.Contains(configs.NEWIPC) { + continue + } else { + return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s) + } + } + if strings.HasPrefix(s, "net.") { + // Is container using host netns? + // Here "host" means "current", not "initial". + netOnce.Do(func() { + if !config.Namespaces.Contains(configs.NEWNET) { + hostnet = true + return + } + path := config.Namespaces.PathOf(configs.NEWNET) + if path == "" { + // own netns, so hostnet = false + return + } + hostnet, hostnetErr = isHostNetNS(path) + }) + if hostnetErr != nil { + return hostnetErr + } + if hostnet { + return fmt.Errorf("sysctl %q not allowed in host network namespace", s) + } + continue + } + if config.Namespaces.Contains(configs.NEWUTS) { + switch s { + case "kernel.domainname": + // This is namespaced and there's no explicit OCI field for it. + continue + case "kernel.hostname": + // This is namespaced but there's a conflicting (dedicated) OCI field for it. 
+ return fmt.Errorf("sysctl %q is not allowed as it conflicts with the OCI %q field", s, "hostname") + } + } + return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s) + } + + return nil +} + +func (v *ConfigValidator) intelrdt(config *configs.Config) error { + if config.IntelRdt != nil { + if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() { + return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled") + } + + if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" { + return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled") + } + if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" { + return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled") + } + + if intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema == "" { + return errors.New("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + if intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema == "" { + return errors.New("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty") + } + } + + return nil +} + +func isHostNetNS(path string) (bool, error) { + const currentProcessNetns = "/proc/self/ns/net" + + var st1, st2 unix.Stat_t + + if err := unix.Stat(currentProcessNetns, &st1); err != nil { + return false, fmt.Errorf("unable to stat %q: %s", currentProcessNetns, err) + } + if err := unix.Stat(path, &st2); err != nil { + return false, fmt.Errorf("unable to stat %q: %s", path, err) + } + + return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil +} diff --git a/sysbox-runc/libcontainer/configs/validate/validator_test.go b/sysbox-runc/libcontainer/configs/validate/validator_test.go new file mode 100644 index 00000000..7c5c7138 --- /dev/null +++ b/sysbox-runc/libcontainer/configs/validate/validator_test.go @@ -0,0 +1,319 @@ +package validate_test + +import ( 
+ "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "golang.org/x/sys/unix" +) + +func TestValidate(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateWithInvalidRootfs(t *testing.T) { + dir := "rootfs" + if err := os.Symlink("/var", dir); err != nil { + t.Fatal(err) + } + defer os.Remove(dir) + + config := &configs.Config{ + Rootfs: dir, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateNetworkWithoutNETNamespace(t *testing.T) { + network := &configs.Network{Type: "loopback"} + config := &configs.Config{ + Rootfs: "/var", + Namespaces: []configs.Namespace{}, + Networks: []*configs.Network{network}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateNetworkRoutesWithoutNETNamespace(t *testing.T) { + route := &configs.Route{Gateway: "255.255.255.0"} + config := &configs.Config{ + Rootfs: "/var", + Namespaces: []configs.Namespace{}, + Routes: []*configs.Route{route}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateHostname(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Hostname: "runc", + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUTS}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateHostnameWithoutUTSNamespace(t *testing.T) { + config := 
&configs.Config{ + Rootfs: "/var", + Hostname: "runc", + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSecurityWithMaskPaths(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + MaskPaths: []string{"/proc/kcore"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWNS}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateSecurityWithROPaths(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + ReadonlyPaths: []string{"/proc/sys"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWNS}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +func TestValidateSecurityWithoutNEWNS(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + MaskPaths: []string{"/proc/kcore"}, + ReadonlyPaths: []string{"/proc/sys"}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateUsernamespace(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + config := &configs.Config{ + Rootfs: "/var", + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUSER}, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("expected error to not occur %+v", err) + } +} + +func TestValidateUsernamespaceWithoutUserNS(t *testing.T) { + uidMap := configs.IDMap{ContainerID: 123} + config := &configs.Config{ + Rootfs: "/var", + UidMappings: []configs.IDMap{uidMap}, + } + + validator := validate.New() + err := 
validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSysctl(t *testing.T) { + sysctl := map[string]string{ + "fs.mqueue.ctl": "ctl", + "net.ctl": "ctl", + "kernel.ctl": "ctl", + } + + for k, v := range sysctl { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{k: v}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } + } +} + +func TestValidateValidSysctl(t *testing.T) { + sysctl := map[string]string{ + "fs.mqueue.ctl": "ctl", + "net.ctl": "ctl", + "kernel.msgmax": "ctl", + } + + for k, v := range sysctl { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{k: v}, + Namespaces: []configs.Namespace{ + { + Type: configs.NEWNET, + }, + { + Type: configs.NEWIPC, + }, + }, + } + + validator := validate.New() + err := validator.Validate(config) + if err != nil { + t.Errorf("Expected error to not occur with {%s=%s} but got: %q", k, v, err) + } + } +} + +func TestValidateSysctlWithSameNs(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{"net.ctl": "ctl"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + { + Type: configs.NEWNET, + Path: "/proc/self/ns/net", + }, + }, + ), + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSysctlWithBindHostNetNS(t *testing.T) { + if os.Getuid() != 0 { + t.Skip("requires root") + } + + const selfnet = "/proc/self/ns/net" + + dir, err := ioutil.TempDir("", t.Name()+"-*") + if err != nil { + t.Fatal(err) + } + defer os.Remove(dir) + + file := filepath.Join(dir, "default") + fd, err := os.Create(file) + if err != nil { + t.Fatal(err) + } + defer os.Remove(file) + fd.Close() + + if err := unix.Mount(selfnet, file, "bind", unix.MS_BIND, ""); err != nil { + 
t.Fatalf("can't bind-mount %s to %s: %s", selfnet, file, err) + } + defer func() { + _ = unix.Unmount(file, unix.MNT_DETACH) + }() + + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{"net.ctl": "ctl", "net.foo": "bar"}, + Namespaces: configs.Namespaces( + []configs.Namespace{ + { + Type: configs.NEWNET, + Path: file, + }, + }, + ), + } + + validator := validate.New() + if err := validator.Validate(config); err == nil { + t.Error("Expected error to occur but it was nil") + } +} + +func TestValidateSysctlWithoutNETNamespace(t *testing.T) { + config := &configs.Config{ + Rootfs: "/var", + Sysctl: map[string]string{"net.ctl": "ctl"}, + Namespaces: []configs.Namespace{}, + } + + validator := validate.New() + err := validator.Validate(config) + if err == nil { + t.Error("Expected error to occur but it was nil") + } +} diff --git a/sysbox-runc/libcontainer/console_linux.go b/sysbox-runc/libcontainer/console_linux.go new file mode 100644 index 00000000..9997e93e --- /dev/null +++ b/sysbox-runc/libcontainer/console_linux.go @@ -0,0 +1,41 @@ +package libcontainer + +import ( + "os" + + "golang.org/x/sys/unix" +) + +// mount initializes the console inside the rootfs mounting with the specified mount label +// and applying the correct ownership of the console. +func mountConsole(slavePath string) error { + oldMask := unix.Umask(0000) + defer unix.Umask(oldMask) + f, err := os.Create("/dev/console") + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "") +} + +// dupStdio opens the slavePath for the console and dups the fds to the current +// processes stdio, fd 0,1,2. 
+func dupStdio(slavePath string) error { + fd, err := unix.Open(slavePath, unix.O_RDWR, 0) + if err != nil { + return &os.PathError{ + Op: "open", + Path: slavePath, + Err: err, + } + } + for _, i := range []int{0, 1, 2} { + if err := unix.Dup3(fd, i, 0); err != nil { + return err + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/container.go b/sysbox-runc/libcontainer/container.go new file mode 100644 index 00000000..ba7541c5 --- /dev/null +++ b/sysbox-runc/libcontainer/container.go @@ -0,0 +1,173 @@ +// Package libcontainer provides a native Go implementation for creating containers +// with namespaces, cgroups, capabilities, and filesystem access controls. +// It allows you to manage the lifecycle of the container performing additional operations +// after the container is created. +package libcontainer + +import ( + "os" + "time" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Status is the status of a container. +type Status int + +const ( + // Created is the status that denotes the container exists but has not been run yet. + Created Status = iota + // Running is the status that denotes the container exists and is running. + Running + // Pausing is the status that denotes the container exists, it is in the process of being paused. + Pausing + // Paused is the status that denotes the container exists, but all its processes are paused. + Paused + // Stopped is the status that denotes the container does not have a created or running process. + Stopped +) + +func (s Status) String() string { + switch s { + case Created: + return "created" + case Running: + return "running" + case Pausing: + return "pausing" + case Paused: + return "paused" + case Stopped: + return "stopped" + default: + return "unknown" + } +} + +// BaseState represents the platform agnostic pieces relating to a +// running container's state +type BaseState struct { + // ID is the container ID. 
+ ID string `json:"id"` + + // InitProcessPid is the init process id in the parent namespace. + InitProcessPid int `json:"init_process_pid"` + + // InitProcessStartTime is the init process start time in clock cycles since boot time. + InitProcessStartTime uint64 `json:"init_process_start"` + + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + + // Config is the container's configuration. + Config configs.Config `json:"config"` +} + +// BaseContainer is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. BaseContainer includes methods that are platform agnostic. +type BaseContainer interface { + // Returns the ID of the container + ID() string + + // Returns the current status of the container. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + Status() (Status, error) + + // State returns the current container's state information. + // + // errors: + // SystemError - System error. + State() (*State, error) + + // OCIState returns the current container's state information. + // + // errors: + // SystemError - System error. + OCIState() (*specs.State, error) + + // Returns the current config of the container. + Config() configs.Config + + // Returns the PIDs inside this container. The PIDs are in the namespace of the calling process. + // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + // + // Some of the returned PIDs may no longer refer to processes in the Container, unless + // the Container state is PAUSED in which case every PID in the slice is valid. + Processes() ([]int, error) + + // Returns statistics for the container. 
+ // + // errors: + // ContainerNotExists - Container no longer exists, + // Systemerror - System error. + Stats() (*Stats, error) + + // Set resources of container as configured + // + // We can use this to change resources when containers are running. + // + // errors: + // SystemError - System error. + Set(config configs.Config) error + + // Start a process inside the container. Returns error if process fails to + // start. You can track process lifecycle with passed Process structure. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Start(process *Process) (err error) + + // Run immediately starts the process inside the container. Returns error if process + // fails to start. It does not block waiting for the exec fifo after start returns but + // opens the fifo after start returns. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ConfigInvalid - config is invalid, + // ContainerPaused - Container is paused, + // SystemError - System error. + Run(process *Process) (err error) + + // Destroys the container, if its in a valid state, after killing any + // remaining running processes. + // + // Any event registrations are removed before the container is destroyed. + // No error is returned if the container is already destroyed. + // + // Running containers must first be stopped using Signal(..). + // Paused containers must first be resumed using Resume(..). + // + // errors: + // ContainerNotStopped - Container is still running, + // ContainerPaused - Container is paused, + // SystemError - System error. + Destroy() error + + // Signal sends the provided signal code to the container's initial process. + // + // If all is specified the signal is sent to all processes in the container + // including the initial process. + // + // errors: + // SystemError - System error. 
+ Signal(s os.Signal, all bool) error + + // Exec signals the container to exec the users process at the end of the init. + // + // errors: + // SystemError - System error. + Exec() error +} diff --git a/sysbox-runc/libcontainer/container_linux.go b/sysbox-runc/libcontainer/container_linux.go new file mode 100644 index 00000000..a7d4f04b --- /dev/null +++ b/sysbox-runc/libcontainer/container_linux.go @@ -0,0 +1,2994 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "bufio" + "bytes" + "encoding/json" + "errors" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "os/exec" + "path/filepath" + "reflect" + "strconv" + "strings" + "sync" + "time" + + securejoin "github.com/cyphar/filepath-securejoin" + + "github.com/nestybox/sysbox-libs/mount" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/logs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/opencontainers/runc/libsysbox/syscont" + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/checkpoint-restore/go-criu/v4" + criurpc "github.com/checkpoint-restore/go-criu/v4/rpc" + + "github.com/golang/protobuf/proto" + + errorsf "github.com/pkg/errors" + + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" + + "github.com/nestybox/sysbox-libs/idMap" + sh "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/nestybox/sysbox-libs/shiftfs" + sysboxLibsUtils "github.com/nestybox/sysbox-libs/utils" +) + +const stdioFdCount = 3 + +type linuxContainer struct { + id string + root string + config *configs.Config + cgroupManager cgroups.Manager + intelRdtManager intelrdt.Manager + initPath string + initArgs []string + initProcess parentProcess + initProcessStartTime 
uint64 + criuPath string + newuidmapPath string + newgidmapPath string + m sync.Mutex + criuVersion int + state containerState + created time.Time + sysbox *sysbox.Sysbox +} + +// State represents a running container's state +type State struct { + BaseState + + // Platform specific fields below here + + // Specified if the container was started under the rootless mode. + // Set to true if BaseState.Config.RootlessEUID && BaseState.Config.RootlessCgroups + Rootless bool `json:"rootless"` + + // Paths to all the container's cgroups, as returned by (*cgroups.Manager).GetPaths + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the path + // to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the unified path. + CgroupPaths map[string]string `json:"cgroup_paths"` + + // NamespacePaths are filepaths to the container's namespaces. Key is the namespace type + // with the value as the path. + NamespacePaths map[configs.NamespaceType]string `json:"namespace_paths"` + + // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore + ExternalDescriptors []string `json:"external_descriptors,omitempty"` + + // Intel RDT "resource control" filesystem path + IntelRdtPath string `json:"intel_rdt_path"` + + // Sysbox contains sysbox-specific config + Sysbox sysbox.Sysbox `json:"sysbox,omitempty"` + + // SysFs contains info about resources obtained from sysbox-fs + SysFs sysbox.Fs `json:"sys_fs,omitempty"` + + // SysMgr contains info about resources obtained from sysbox-mgr + SysMgr sysbox.Mgr `json:"sys_mgr,omitempty"` +} + +// Container is a libcontainer container object. +// +// Each container is thread-safe within the same process. Since a container can +// be destroyed by a separate process, any function may return that the container +// was not found. 
+type Container interface { + BaseContainer + + // Methods below here are platform specific + + // Checkpoint checkpoints the running container's state to disk using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Checkpoint(criuOpts *CriuOpts) error + + // Restore restores the checkpointed container to a running state using the criu(8) utility. + // + // errors: + // Systemerror - System error. + Restore(process *Process, criuOpts *CriuOpts) error + + // If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses + // the execution of any user processes. Asynchronously, when the container finished being paused the + // state is changed to PAUSED. + // If the Container state is PAUSED, do nothing. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ContainerNotRunning - Container not running or created, + // Systemerror - System error. + Pause() error + + // If the Container state is PAUSED, resumes the execution of any user processes in the + // Container before setting the Container state to RUNNING. + // If the Container state is RUNNING, do nothing. + // + // errors: + // ContainerNotExists - Container no longer exists, + // ContainerNotPaused - Container is not paused, + // Systemerror - System error. + Resume() error + + // NotifyOOM returns a read-only channel signaling when the container receives an OOM notification. + // + // errors: + // Systemerror - System error. + NotifyOOM() (<-chan struct{}, error) + + // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level + // + // errors: + // Systemerror - System error. 
+ NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) +} + +// ID returns the container's unique ID +func (c *linuxContainer) ID() string { + return c.id +} + +// Config returns the container's configuration +func (c *linuxContainer) Config() configs.Config { + return *c.config +} + +func (c *linuxContainer) Status() (Status, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentStatus() +} + +func (c *linuxContainer) State() (*State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentState() +} + +func (c *linuxContainer) OCIState() (*specs.State, error) { + c.m.Lock() + defer c.m.Unlock() + return c.currentOCIState() +} + +func (c *linuxContainer) Processes() ([]int, error) { + var pids []int + status, err := c.currentStatus() + if err != nil { + return pids, err + } + // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited + if status == Stopped && !c.cgroupManager.Exists() { + return pids, nil + } + + pids, err = c.cgroupManager.GetAllPids() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups") + } + return pids, nil +} + +func (c *linuxContainer) Stats() (*Stats, error) { + var ( + err error + stats = &Stats{} + ) + if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") + } + if c.intelRdtManager != nil { + if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats") + } + } + for _, iface := range c.config.Networks { + switch iface.Type { + case "veth": + istats, err := getNetworkInterfaceStats(iface.HostInterfaceName) + if err != nil { + return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName) + } + stats.Interfaces = append(stats.Interfaces, istats) + } + } + return stats, nil +} + 
// Set updates the container's cgroup (and, if present, Intel RDT) resources
// at runtime to match the given config. On failure it attempts to roll the
// managers back to the previous config (best effort; rollback failures are
// only logged).
func (c *linuxContainer) Set(config configs.Config) error {
	c.m.Lock()
	defer c.m.Unlock()
	status, err := c.currentStatus()
	if err != nil {
		return err
	}
	if status == Stopped {
		return newGenericError(errors.New("container not running"), ContainerNotRunning)
	}
	if err := c.cgroupManager.Set(&config); err != nil {
		logrus.Warnf("Setting cgroup configs failed due to error: %v", err)
		// Set configs back
		if err2 := c.cgroupManager.Set(c.config); err2 != nil {
			logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
		}
		return err
	}
	if c.intelRdtManager != nil {
		if err := c.intelRdtManager.Set(&config); err != nil {
			// Set configs back
			if err2 := c.cgroupManager.Set(c.config); err2 != nil {
				logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
			}
			if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
				logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
			}
			return err
		}
	}
	// After config setting succeed, update config and states
	c.config = &config
	_, err = c.updateState(nil)
	return err
}

// Start creates the exec fifo and launches the container process. For the
// init process it first sets up rootfs ID-shifting (chown, ID-mapped mounts,
// then shiftfs marks). On start failure the fifo is removed again.
func (c *linuxContainer) Start(process *Process) error {
	c.m.Lock()
	defer c.m.Unlock()

	config := c.config

	if config.Cgroups.Resources.SkipDevices {
		return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
	}

	if process.Init {
		if err := c.createExecFifo(); err != nil {
			return err
		}

		//
		// Set up ID-shifting for the rootfs and bind-mounts
		//

		// Chown (rootfs only)
		if config.RootfsUidShiftType == sh.Chown {
			if config.RootfsCloned {
				// NOTE(review): assumes UidMappings/GidMappings are non-empty
				// here; presumably config validation elsewhere guarantees it.
				uidOffset := int32(config.UidMappings[0].HostID)
				gidOffset := int32(config.GidMappings[0].HostID)
				if err := c.sysbox.Mgr.ChownClonedRootfs(uidOffset, gidOffset); err != nil {
					return newSystemErrorWithCause(err, "failed to chown rootfs clone")
				}
			} else {
				if err := c.chownRootfs(); err != nil {
					return err
				}
			}
		}

		// ID-mapping
		if err := c.setupIDMappedMounts(); err != nil {
			return err
		}

		// Shiftfs (will only act if mount is not marked for ID-mapping already)
		if err := c.setupShiftfsMarks(); err != nil {
			return err
		}
	}

	if err := c.start(process); err != nil {
		if process.Init {
			// Don't leave a stale fifo behind if the init process failed to start.
			c.deleteExecFifo()
		}
		return err
	}
	return nil
}

// Run starts the container and, for the init process, immediately unblocks
// it (i.e., create + start in a single step).
func (c *linuxContainer) Run(process *Process) error {
	if err := c.Start(process); err != nil {
		return err
	}
	if process.Init {
		return c.exec()
	}
	return nil
}

// Exec unblocks the container's init process via the exec fifo (see exec()).
func (c *linuxContainer) Exec() error {
	c.m.Lock()
	defer c.m.Unlock()
	return c.exec()
}

// exec opens the exec fifo, which unblocks the init process waiting on the
// fifo's write side. It polls the init process every 100ms so it can detect
// that init died before (or while) writing to the fifo.
func (c *linuxContainer) exec() error {
	path := filepath.Join(c.root, execFifoFilename)
	pid := c.initProcess.pid()
	blockingFifoOpenCh := awaitFifoOpen(path)
	for {
		select {
		case result := <-blockingFifoOpenCh:
			return handleFifoResult(result)

		case <-time.After(time.Millisecond * 100):
			stat, err := system.Stat(pid)
			if err != nil || stat.State == system.Zombie {
				// could be because process started, ran, and completed between our 100ms timeout and our system.Stat() check.
				// see if the fifo exists and has data (with a non-blocking open, which will succeed if the writing process is complete).
+ if err := handleFifoResult(fifoOpen(path, false)); err != nil { + return errors.New("container process is already dead") + } + return nil + } + } + } +} + +func readFromExecFifo(execFifo io.Reader) error { + data, err := ioutil.ReadAll(execFifo) + if err != nil { + return err + } + if len(data) <= 0 { + return errors.New("cannot start an already running container") + } + return nil +} + +func awaitFifoOpen(path string) <-chan openResult { + fifoOpened := make(chan openResult) + go func() { + result := fifoOpen(path, true) + fifoOpened <- result + }() + return fifoOpened +} + +func fifoOpen(path string, block bool) openResult { + flags := os.O_RDONLY + if !block { + flags |= unix.O_NONBLOCK + } + f, err := os.OpenFile(path, flags, 0) + if err != nil { + return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")} + } + return openResult{file: f} +} + +func handleFifoResult(result openResult) error { + if result.err != nil { + return result.err + } + f := result.file + defer f.Close() + if err := readFromExecFifo(f); err != nil { + return err + } + return os.Remove(f.Name()) +} + +type openResult struct { + file *os.File + err error +} + +func (c *linuxContainer) start(process *Process) error { + parent, err := c.newParentProcess(process) + if err != nil { + return newSystemErrorWithCause(err, "creating new parent process") + } + parent.forwardChildLogs() + + // Before starting "sysbox-runc init", mark all non-stdio open files as + // O_CLOEXEC to make sure we don't leak any files into "sysbox-runc + // init". Any files to be passed to "sysbox-runc init" through ExtraFiles + // will get dup2'd by the Go runtime and thus their O_CLOEXEC flag will be + // cleared. This is some additional protection against runc attacks like + // CVE-2024-21626, by making sure we never leak files to "runc init" we + // didn't intend to. 
+ if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) + } + + if err := parent.start(); err != nil { + return newSystemErrorWithCause(err, "starting container process") + } + + // generate a timestamp indicating when the container was started + c.created = time.Now().UTC() + + // sysbox-runc: send the creation-timestamp to sysbox-fs. + if process.Init && c.sysbox.Fs.Enabled() { + if err := c.sysbox.Fs.SendCreationTime(c.created); err != nil { + return newSystemErrorWithCause(err, "sending creation timestamp to sysbox-fs") + } + } + + if process.Init { + c.state = &createdState{ + c: c, + } + state, err := c.updateState(parent) + if err != nil { + return err + } + c.initProcessStartTime = state.InitProcessStartTime + + if c.config.Hooks != nil { + s, err := c.currentOCIState() + if err != nil { + return err + } + + if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil { + if err := ignoreTerminateErrors(parent.terminate()); err != nil { + logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook")) + } + return err + } + } + + // sysbox-runc: send an update to sysbox-mgr with the container's config + if c.sysbox.Mgr.Enabled() { + userns := state.NamespacePaths[configs.NEWUSER] + netns := state.NamespacePaths[configs.NEWNET] + + // Cast IDMap to LinuxIDMapping + cast := func(m configs.IDMap) specs.LinuxIDMapping { + return specs.LinuxIDMapping{ + ContainerID: uint32(m.ContainerID), + HostID: uint32(m.HostID), + Size: uint32(m.Size), + } + } + + uidMappings := []specs.LinuxIDMapping{} + for _, m := range state.BaseState.Config.UidMappings { + uidMappings = append(uidMappings, cast(m)) + } + + gidMappings := []specs.LinuxIDMapping{} + for _, m := range state.BaseState.Config.GidMappings { + gidMappings = append(gidMappings, cast(m)) + } + + rootfsUidShiftType := c.config.RootfsUidShiftType + + if err := c.sysbox.Mgr.Update(userns, netns, uidMappings, gidMappings, rootfsUidShiftType); err != 
nil { + return newSystemErrorWithCause(err, "sending creation timestamp to sysbox-fs") + } + } + } + + return nil +} + +func (c *linuxContainer) Signal(s os.Signal, all bool) error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if all { + // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited + if status == Stopped && !c.cgroupManager.Exists() { + return nil + } + return signalAllProcesses(c.cgroupManager, s) + } + // to avoid a PID reuse attack + if status == Running || status == Created || status == Paused { + if err := c.initProcess.signal(s); err != nil { + return newSystemErrorWithCause(err, "signaling init process") + } + return nil + } + return newGenericError(errors.New("container not running"), ContainerNotRunning) +} + +func (c *linuxContainer) createExecFifo() error { + rootuid, err := c.Config().HostRootUID() + if err != nil { + return err + } + rootgid, err := c.Config().HostRootGID() + if err != nil { + return err + } + + fifoName := filepath.Join(c.root, execFifoFilename) + if _, err := os.Stat(fifoName); err == nil { + return fmt.Errorf("exec fifo %s already exists", fifoName) + } + oldMask := unix.Umask(0000) + if err := unix.Mkfifo(fifoName, 0622); err != nil { + unix.Umask(oldMask) + return err + } + unix.Umask(oldMask) + return os.Chown(fifoName, rootuid, rootgid) +} + +func (c *linuxContainer) deleteExecFifo() { + fifoName := filepath.Join(c.root, execFifoFilename) + os.Remove(fifoName) +} + +// includeExecFifo opens the container's execfifo as a pathfd, so that the +// container cannot access the statedir (and the FIFO itself remains +// un-opened). It then adds the FifoFd to the given exec.Cmd as an inherited +// fd, with _LIBCONTAINER_FIFOFD set to its fd number. 
+func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error { + fifoName := filepath.Join(c.root, execFifoFilename) + fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return err + } + + cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName)) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1)) + return nil +} + +func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) { + parentInitPipe, childInitPipe, err := utils.NewSockPair("init") + if err != nil { + return nil, newSystemErrorWithCause(err, "creating new init pipe") + } + messageSockPair := filePair{parentInitPipe, childInitPipe} + + parentLogPipe, childLogPipe, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("Unable to create the log pipe: %s", err) + } + logFilePair := filePair{parentLogPipe, childLogPipe} + + cmd := c.commandTemplate(p, childInitPipe, childLogPipe) + if !p.Init { + return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) + } + + // We only set up fifoFd if we're not doing a `runc exec`. The historic + // reason for this is that previously we would pass a dirfd that allowed + // for container rootfs escape (and not doing it in `runc exec` avoided + // that problem), but we no longer do that. However, there's no need to do + // this for `runc exec` so we just keep it this way to be safe. + if err := c.includeExecFifo(cmd); err != nil { + return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup") + } + return c.newInitProcess(p, cmd, messageSockPair, logFilePair) +} + +func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, childLogPipe *os.File) *exec.Cmd { + cmd := exec.Command(c.initPath, c.initArgs[1:]...) 
+ cmd.Args[0] = c.initArgs[0] + cmd.Stdin = p.Stdin + cmd.Stdout = p.Stdout + cmd.Stderr = p.Stderr + cmd.Dir = c.config.Rootfs + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &unix.SysProcAttr{} + } + cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS")) + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...) + if p.ConsoleSocket != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + ) + } + cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + "_LIBCONTAINER_STATEDIR="+c.root, + ) + + cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe) + cmd.Env = append(cmd.Env, + "_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1), + "_LIBCONTAINER_LOGLEVEL="+p.LogLevel, + ) + + // NOTE: when running a container with no PID namespace and the parent process spawning the container is + // PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason + // even with the parent still running. 
+ if c.config.ParentDeathSignal > 0 { + cmd.SysProcAttr.Pdeathsig = unix.Signal(c.config.ParentDeathSignal) + } + return cmd +} + +func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path + } + } + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) + if err != nil { + return nil, err + } + init := &initProcess{ + cmd: cmd, + messageSockPair: messageSockPair, + logFilePair: logFilePair, + manager: c.cgroupManager, + intelRdtManager: c.intelRdtManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, + } + c.initProcess = init + return init, nil +} + +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*setnsProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + state, err := c.currentState() + if err != nil { + return nil, newSystemErrorWithCause(err, "getting container's current state") + } + // for setns process, we don't have to set cloneflags as the process namespaces + // will only be set via setns syscall + data, err := c.bootstrapData(0, state.NamespacePaths) + if err != nil { + return nil, err + } + // sysbox-runc: setns processes enter the child cgroup (i.e., the system + // container's cgroup root); this way they can't change the cgroup resources + // assigned to the system container itself. 
	return &setnsProcess{
		cmd:             cmd,
		cgroupPaths:     c.cgroupManager.GetChildCgroupPaths(),
		rootlessCgroups: c.config.RootlessCgroups,
		intelRdtPath:    state.IntelRdtPath,
		messageSockPair: messageSockPair,
		logFilePair:     logFilePair,
		config:          c.newInitConfig(p),
		process:         p,
		bootstrapData:   data,
		initProcessPid:  state.InitProcessPid,
		container:       c,
	}, nil
}

// sysbox-runc: create a new helper process command to perform rootfs mount initialization
func (c *linuxContainer) initHelperCmdTemplate(p *Process, childInitPipe, childLogPipe *os.File) *exec.Cmd {
	cmd := exec.Command(c.initPath, c.initArgs[1:]...)
	cmd.Args[0] = c.initArgs[0]
	// The helper has no stdio of its own; it communicates over the pipes.
	cmd.Stdin = nil
	cmd.Stdout = nil
	cmd.Stderr = nil
	cmd.Dir = c.config.Rootfs
	if cmd.SysProcAttr == nil {
		cmd.SysProcAttr = &unix.SysProcAttr{}
	}
	cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
	cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initMount))
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
		"_LIBCONTAINER_STATEDIR="+c.root,
	)
	cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
	cmd.Env = append(cmd.Env,
		"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
		"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
	)
	return cmd
}

// newInitConfig assembles the initConfig sent to the child over the init
// pipe; per-process settings override container-wide defaults.
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
	cfg := &initConfig{
		Config:           c.config,
		Args:             process.Args,
		Env:              process.Env,
		User:             process.User,
		AdditionalGroups: process.AdditionalGroups,
		Cwd:              process.Cwd,
		Capabilities:     process.Capabilities,
		PassedFilesCount: len(process.ExtraFiles),
		ContainerId:      c.ID(),
		NoNewPrivileges:  c.config.NoNewPrivileges,
		RootlessEUID:     c.config.RootlessEUID,
		RootlessCgroups:  c.config.RootlessCgroups,
		AppArmorProfile:  c.config.AppArmorProfile,
		ProcessLabel:     c.config.ProcessLabel,
		Rlimits:          c.config.Rlimits,
	}
	if process.NoNewPrivileges != nil {
		cfg.NoNewPrivileges = *process.NoNewPrivileges
	}
	if process.AppArmorProfile != "" {
		cfg.AppArmorProfile = process.AppArmorProfile
	}
	if process.Label != "" {
		cfg.ProcessLabel = process.Label
	}
	if len(process.Rlimits) > 0 {
		cfg.Rlimits = process.Rlimits
	}
	cfg.CreateConsole = process.ConsoleSocket != nil
	cfg.ConsoleWidth = process.ConsoleWidth
	cfg.ConsoleHeight = process.ConsoleHeight
	return cfg
}

// Destroy tears down container state: reverts any rootfs chown and
// unregisters with sysbox-fs / sysbox-mgr. The first error encountered is
// returned, but teardown continues past failures.
func (c *linuxContainer) Destroy() error {
	var err error

	c.m.Lock()
	defer c.m.Unlock()

	// If the rootfs was chowned, revert it back to its original uid & gid
	if c.config.RootfsUidShiftType == sh.Chown {
		if c.config.RootfsCloned {
			err = c.sysbox.Mgr.RevertClonedRootfsChown()
		} else {
			err = c.revertRootfsChown()
		}
	}

	if err2 := c.state.destroy(); err == nil {
		err = err2
	}

	if c.sysbox.Fs.Enabled() {
		if err2 := c.sysbox.Fs.Unregister(); err == nil {
			err = err2
		}
	}

	if c.sysbox.Mgr.Enabled() {
		if err2 := c.sysbox.Mgr.Unregister(); err == nil {
			err = err2
		}
	} else {
		// If sysbox-mgr is not present (i.e., unit testing), then we teardown
		// shiftfs marks here.
+ mounts, err := mount.GetMounts() + if err != nil { + return fmt.Errorf("failed to read mountinfo: %s", err) + } + + if err2 := c.teardownShiftfsMarkLocal(mounts); err == nil { + err = err2 + } + } + + return err +} + +func (c *linuxContainer) Pause() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + switch status { + case Running, Created: + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + + if !c.config.RootfsCloned && c.config.RootfsUidShiftType == sh.Chown { + if err := c.revertRootfsChown(); err == nil { + return err + } + } + + if c.sysbox.Mgr.Enabled() { + if err := c.sysbox.Mgr.Pause(); err != nil { + return err + } + } + + return c.state.transition(&pausedState{ + c: c, + }) + } + return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning) +} + +func (c *linuxContainer) Resume() error { + c.m.Lock() + defer c.m.Unlock() + status, err := c.currentStatus() + if err != nil { + return err + } + if status != Paused { + return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) + } + + if !c.config.RootfsCloned && c.config.RootfsUidShiftType == sh.Chown { + if err := c.chownRootfs(); err != nil { + return err + } + } + + if c.sysbox.Mgr.Enabled() { + if err := c.sysbox.Mgr.Resume(); err != nil { + return err + } + } + + if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return c.state.transition(&runningState{ + c: c, + }) +} + +func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. 
+ if c.config.RootlessCgroups { + logrus.Warn("getting OOM notifications may fail if you don't have the full access to cgroups") + } + path := c.cgroupManager.Path("memory") + if cgroups.IsCgroup2UnifiedMode() { + return notifyOnOOMV2(path) + } + return notifyOnOOM(path) +} + +func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.RootlessCgroups { + logrus.Warn("getting memory pressure notifications may fail if you don't have the full access to cgroups") + } + return notifyMemoryPressure(c.cgroupManager.Path("memory"), level) +} + +var criuFeatures *criurpc.CriuFeatures + +func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error { + + t := criurpc.CriuReqType_FEATURE_CHECK + + // make sure the features we are looking for are really not from + // some previous check + criuFeatures = nil + + req := &criurpc.CriuReq{ + Type: &t, + // Theoretically this should not be necessary but CRIU + // segfaults if Opts is empty. + // Fixed in CRIU 2.12 + Opts: rpcOpts, + Features: criuFeat, + } + + err := c.criuSwrk(nil, req, criuOpts, nil) + if err != nil { + logrus.Debugf("%s", err) + return errors.New("CRIU feature check failed") + } + + logrus.Debugf("Feature check says: %s", criuFeatures) + missingFeatures := false + + // The outer if checks if the fields actually exist + if (criuFeat.MemTrack != nil) && + (criuFeatures.MemTrack != nil) { + // The inner if checks if they are set to true + if *criuFeat.MemTrack && !*criuFeatures.MemTrack { + missingFeatures = true + logrus.Debugf("CRIU does not support MemTrack") + } + } + + // This needs to be repeated for every new feature check. + // Is there a way to put this in a function. Reflection? 
	if (criuFeat.LazyPages != nil) &&
		(criuFeatures.LazyPages != nil) {
		if *criuFeat.LazyPages && !*criuFeatures.LazyPages {
			missingFeatures = true
			logrus.Debugf("CRIU does not support LazyPages")
		}
	}

	if missingFeatures {
		return errors.New("CRIU is missing features")
	}

	return nil
}

// compareCriuVersion returns an error when criuVersion < minVersion.
func compareCriuVersion(criuVersion int, minVersion int) error {
	// simple function to perform the actual version compare
	if criuVersion < minVersion {
		return fmt.Errorf("CRIU version %d must be %d or higher", criuVersion, minVersion)
	}

	return nil
}

// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion int) error {

	// If the version of criu has already been determined there is no need
	// to ask criu for the version again. Use the value from c.criuVersion.
	if c.criuVersion != 0 {
		return compareCriuVersion(c.criuVersion, minVersion)
	}

	// NOTE: the local variable shadows the criu package for the rest of this
	// function; intentional here but easy to trip over when editing.
	criu := criu.MakeCriu()
	criu.SetCriuPath(c.criuPath)
	var err error
	c.criuVersion, err = criu.GetCriuVersion()
	if err != nil {
		return fmt.Errorf("CRIU version check failed: %s", err)
	}

	return compareCriuVersion(c.criuVersion, minVersion)
}

// descriptorsFilename is the file in the image directory that records the
// init process's external fd descriptors for restore.
const descriptorsFilename = "descriptors.json"

// addCriuDumpMount registers a bind mount as an "external mount" for the
// CRIU dump, keyed by its rootfs-relative destination.
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
	extMnt := &criurpc.ExtMountMap{
		Key: proto.String(mountDest),
		Val: proto.String(mountDest),
	}
	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}

// addMaskPaths maps each masked (non-directory) path to /dev/null as an
// external mount so CRIU can dump/restore the masking bind mounts.
func (c *linuxContainer) addMaskPaths(req *criurpc.CriuReq) error {
	for _, path := range c.config.MaskPaths {
		fi, err := os.Stat(fmt.Sprintf("/proc/%d/root/%s", c.initProcess.pid(), path))
		if err != nil {
			if os.IsNotExist(err) {
				continue
			}
			return err
		}
		if fi.IsDir() {
			continue
		}

		extMnt := &criurpc.ExtMountMap{
			Key: proto.String(path),
			Val: proto.String("/dev/null"),
		}
		req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
	}
	return nil
}

// handleCriuConfigurationFile points CRIU at a configuration file: either
// the one named by the 'org.criu.config' annotation, or the runc default
// /etc/criu/runc.conf when the annotation is absent.
func (c *linuxContainer) handleCriuConfigurationFile(rpcOpts *criurpc.CriuOpts) {
	// CRIU will evaluate a configuration starting with release 3.11.
	// Settings in the configuration file will overwrite RPC settings.
	// Look for annotations. The annotation 'org.criu.config'
	// specifies if CRIU should use a different, container specific
	// configuration file.
	_, annotations := utils.Annotations(c.config.Labels)
	configFile, exists := annotations["org.criu.config"]
	if exists {
		// If the annotation 'org.criu.config' exists and is set
		// to a non-empty string, tell CRIU to use that as a
		// configuration file. If the file does not exist, CRIU
		// will just ignore it.
		if configFile != "" {
			rpcOpts.ConfigFile = proto.String(configFile)
		}
		// If 'org.criu.config' exists and is set to an empty
		// string, a runc specific CRIU configuration file will
		// be not set at all.
	} else {
		// If the mentioned annotation has not been found, specify
		// a default CRIU configuration file.
		rpcOpts.ConfigFile = proto.String("/etc/criu/runc.conf")
	}
}

// criuSupportsExtNS reports whether the running CRIU supports treating the
// given namespace type as external (net: >= 3.11, pid: >= 3.15).
func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
	var minVersion int
	switch t {
	case configs.NEWNET:
		// CRIU supports different external namespace with different released CRIU versions.
		// For network namespaces to work we need at least criu 3.11.0 => 31100.
		minVersion = 31100
	case configs.NEWPID:
		// For PID namespaces criu 31500 is needed.
+ minVersion = 31500 + default: + return false + } + return c.checkCriuVersion(minVersion) == nil +} + +func criuNsToKey(t configs.NamespaceType) string { + return "extRoot" + strings.Title(configs.NsName(t)) + "NS" +} + +func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { + if !c.criuSupportsExtNS(t) { + return nil + } + + nsPath := c.config.Namespaces.PathOf(t) + if nsPath == "" { + return nil + } + // CRIU expects the information about an external namespace + // like this: --external []: + // This is always 'extRootNS'. + var ns unix.Stat_t + if err := unix.Stat(nsPath, &ns); err != nil { + return err + } + criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t)) + rpcOpts.External = append(rpcOpts.External, criuExternal) + + return nil +} + +func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error { + for _, ns := range c.config.Namespaces { + switch ns.Type { + case configs.NEWNET, configs.NEWPID: + // If the container is running in a network or PID namespace and has + // a path to the network or PID namespace configured, we will dump + // that network or PID namespace as an external namespace and we + // will expect that the namespace exists during restore. + // This basically means that CRIU will ignore the namespace + // and expect it to be setup correctly. 
			if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
				return err
			}
		default:
			// For all other namespaces except NET and PID CRIU has
			// a simpler way of joining the existing namespace if set
			nsPath := c.config.Namespaces.PathOf(ns.Type)
			if nsPath == "" {
				continue
			}
			if ns.Type == configs.NEWCGROUP {
				// CRIU has no code to handle NEWCGROUP
				return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
			}
			// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc

			// CRIU will issue a warning for NEWUSER:
			// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
			rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
				Ns:     proto.String(configs.NsName(ns.Type)),
				NsFile: proto.String(nsPath),
			})
		}
	}

	return nil
}

// handleRestoringExternalNamespaces passes an already-existing net/pid
// namespace fd to CRIU (via inherit-fd) so the restored container joins it
// rather than CRIU creating a new namespace.
func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
	if !c.criuSupportsExtNS(t) {
		return nil
	}

	nsPath := c.config.Namespaces.PathOf(t)
	if nsPath == "" {
		return nil
	}
	// CRIU wants the information about an existing namespace
	// like this: --inherit-fd fd[<fd>]:<key>
	// The <key> needs to be the same as during checkpointing.
	// We are always using 'extRoot<Type>NS' as the key in this.
	nsFd, err := os.Open(nsPath)
	if err != nil {
		logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
		return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
	}
	inheritFd := &criurpc.InheritFd{
		Key: proto.String(criuNsToKey(t)),
		// The offset of four is necessary because 0, 1, 2 and 3 are
		// already used by stdin, stdout, stderr, 'criu swrk' socket.
		Fd: proto.Int32(int32(4 + len(*extraFiles))),
	}
	rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
	// All open FDs need to be transferred to CRIU via extraFiles
	*extraFiles = append(*extraFiles, nsFd)

	return nil
}

// Checkpoint dumps the container via CRIU into criuOpts.ImagesDirectory.
// It supports pre-dump (iterative migration) and lazy-pages modes.
func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
	c.m.Lock()
	defer c.m.Unlock()

	// Checkpoint is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS().
	// (CLI prints a warning)
	// TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has
	// support for doing unprivileged dumps, but the setup of
	// rootless containers might make this complicated.

	// We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0
	if err := c.checkCriuVersion(30000); err != nil {
		return err
	}

	if criuOpts.ImagesDirectory == "" {
		return errors.New("invalid directory to save checkpoint")
	}

	// Since a container can be C/R'ed multiple times,
	// the checkpoint directory may already exist.
	if err := os.Mkdir(criuOpts.ImagesDirectory, 0700); err != nil && !os.IsExist(err) {
		return err
	}

	if criuOpts.WorkDirectory == "" {
		criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
	}

	if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) {
		return err
	}

	workDir, err := os.Open(criuOpts.WorkDirectory)
	if err != nil {
		return err
	}
	defer workDir.Close()

	imageDir, err := os.Open(criuOpts.ImagesDirectory)
	if err != nil {
		return err
	}
	defer imageDir.Close()

	rpcOpts := criurpc.CriuOpts{
		ImagesDirFd:     proto.Int32(int32(imageDir.Fd())),
		WorkDirFd:       proto.Int32(int32(workDir.Fd())),
		LogLevel:        proto.Int32(4),
		LogFile:         proto.String("dump.log"),
		Root:            proto.String(c.config.Rootfs),
		ManageCgroups:   proto.Bool(true),
		NotifyScripts:   proto.Bool(true),
		Pid:             proto.Int32(int32(c.initProcess.pid())),
		ShellJob:        proto.Bool(criuOpts.ShellJob),
		LeaveRunning:    proto.Bool(criuOpts.LeaveRunning),
		TcpEstablished:  proto.Bool(criuOpts.TcpEstablished),
		ExtUnixSk:       proto.Bool(criuOpts.ExternalUnixConnections),
		FileLocks:       proto.Bool(criuOpts.FileLocks),
		EmptyNs:         proto.Uint32(criuOpts.EmptyNs),
		OrphanPtsMaster: proto.Bool(true),
		AutoDedup:       proto.Bool(criuOpts.AutoDedup),
		LazyPages:       proto.Bool(criuOpts.LazyPages),
	}

	c.handleCriuConfigurationFile(&rpcOpts)

	// If the container is running in a network namespace and has
	// a path to the network namespace configured, we will dump
	// that network namespace as an external namespace and we
	// will expect that the namespace exists during restore.
	// This basically means that CRIU will ignore the namespace
	// and expect to be setup correctly.
	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWNET); err != nil {
		return err
	}

	// Same for possible external PID namespaces
	if err := c.handleCheckpointingExternalNamespaces(&rpcOpts, configs.NEWPID); err != nil {
		return err
	}

	// CRIU can use cgroup freezer; when rpcOpts.FreezeCgroup
	// is not set, CRIU uses ptrace() to pause the processes.
	// Note cgroup v2 freezer is only supported since CRIU release 3.14.
+ if !cgroups.IsCgroup2UnifiedMode() || c.checkCriuVersion(31400) == nil { + if fcg := c.cgroupManager.Path("freezer"); fcg != "" { + rpcOpts.FreezeCgroup = proto.String(fcg) + } + } + + // append optional criu opts, e.g., page-server and port + if criuOpts.PageServer.Address != "" && criuOpts.PageServer.Port != 0 { + rpcOpts.Ps = &criurpc.CriuPageServerInfo{ + Address: proto.String(criuOpts.PageServer.Address), + Port: proto.Int32(criuOpts.PageServer.Port), + } + } + + //pre-dump may need parentImage param to complete iterative migration + if criuOpts.ParentImage != "" { + rpcOpts.ParentImg = proto.String(criuOpts.ParentImage) + rpcOpts.TrackMem = proto.Bool(true) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + rpcOpts.ManageCgroupsMode = &mode + } + + var t criurpc.CriuReqType + if criuOpts.PreDump { + feat := criurpc.CriuFeatures{ + MemTrack: proto.Bool(true), + } + + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + t = criurpc.CriuReqType_PRE_DUMP + } else { + t = criurpc.CriuReqType_DUMP + } + + if criuOpts.LazyPages { + // lazy migration requested; check if criu supports it + feat := criurpc.CriuFeatures{ + LazyPages: proto.Bool(true), + } + if err := c.checkCriuFeatures(criuOpts, &rpcOpts, &feat); err != nil { + return err + } + + if fd := criuOpts.StatusFd; fd != -1 { + // check that the FD is valid + flags, err := unix.FcntlInt(uintptr(fd), unix.F_GETFL, 0) + if err != nil { + return fmt.Errorf("invalid --status-fd argument %d: %w", fd, err) + } + // and writable + if flags&unix.O_WRONLY == 0 { + return fmt.Errorf("invalid --status-fd argument %d: not writable", fd) + } + + if c.checkCriuVersion(31500) != nil { + // For criu 3.15+, use notifications (see case "status-ready" + // in criuNotifications). Otherwise, rely on criu status fd. 
				rpcOpts.StatusFd = proto.Int32(int32(fd))
			}
		}
	}

	req := &criurpc.CriuReq{
		Type: &t,
		Opts: &rpcOpts,
	}

	// no need to dump all this in pre-dump
	if !criuOpts.PreDump {
		hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP)
		for _, m := range c.config.Mounts {
			switch m.Device {
			case "bind":
				c.addCriuDumpMount(req, m)
			case "cgroup":
				if cgroups.IsCgroup2UnifiedMode() || hasCgroupns {
					// real mount(s)
					continue
				}
				// a set of "external" bind mounts
				binds, err := getCgroupMounts(m)
				if err != nil {
					return err
				}
				for _, b := range binds {
					c.addCriuDumpMount(req, b)
				}
			}
		}

		if err := c.addMaskPaths(req); err != nil {
			return err
		}

		// Device nodes are dumped as external bind mounts as well.
		for _, node := range c.config.Devices {
			m := &configs.Mount{Destination: node.Path, Source: node.Path}
			c.addCriuDumpMount(req, m)
		}

		// Write the FD info to a file in the image directory
		fdsJSON, err := json.Marshal(c.initProcess.externalDescriptors())
		if err != nil {
			return err
		}

		err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0600)
		if err != nil {
			return err
		}
	}

	err = c.criuSwrk(nil, req, criuOpts, nil)
	if err != nil {
		return err
	}
	return nil
}

// addCriuRestoreMount maps a dumped external mount key back to its source
// path for the CRIU restore.
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
	mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
	extMnt := &criurpc.ExtMountMap{
		Key: proto.String(mountDest),
		Val: proto.String(m.Source),
	}
	req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}

// restoreNetwork tells CRIU how to re-create the container's veth pairs,
// from both the container config and any pairs passed in criuOpts.
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
	for _, iface := range c.config.Networks {
		switch iface.Type {
		case "veth":
			veth := new(criurpc.CriuVethPair)
			veth.IfOut = proto.String(iface.HostInterfaceName)
			veth.IfIn = proto.String(iface.Name)
			req.Opts.Veths = append(req.Opts.Veths, veth)
		case "loopback":
			// Do nothing
		}
	}
	for _, i := range criuOpts.VethPairs {
		veth := new(criurpc.CriuVethPair)
		veth.IfOut = proto.String(i.HostInterfaceName)
		veth.IfIn = proto.String(i.ContainerInterfaceName)
		req.Opts.Veths = append(req.Opts.Veths, veth)
	}
}

// makeCriuRestoreMountpoints makes the actual mountpoints for the
// restore using CRIU. This function is inspired from the code in
// rootfs_linux.go
func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
	switch m.Device {
	case "cgroup":
		// No mount point(s) need to be created:
		//
		// * for v1, mount points are saved by CRIU because
		//   /sys/fs/cgroup is a tmpfs mount
		//
		// * for v2, /sys/fs/cgroup is a real mount, but
		//   the mountpoint appears as soon as /sys is mounted
		return nil
	case "bind":
		// The prepareBindDest() function checks if source
		// exists. So it cannot be used for other filesystem types.
		//
		// sysbox-runc: this is no longer the case; prepareBindDest() only checks the
		// mount destination; if we need to check the mount source we need to create a
		// function that explicitly does this.
		if err := prepareBindDest(m, true, c.config, nil); err != nil {
			return err
		}
	default:
		// for all other filesystems just create the mountpoints
		dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
		if err != nil {
			return err
		}
		// NOTE: mutates the caller's Mount to hold the resolved path.
		m.Destination = dest
		if err := os.MkdirAll(dest, 0755); err != nil {
			return err
		}
	}
	return nil
}

// isPathInPrefixList is a small function for CRIU restore to make sure
// mountpoints, which are on a tmpfs, are not created in the rootfs
func isPathInPrefixList(path string, prefix []string) bool {
	for _, p := range prefix {
		if strings.HasPrefix(path, p+"/") {
			return true
		}
	}
	return false
}

// prepareCriuRestoreMounts tries to set up the rootfs of the
// container to be restored in the same way runc does it for
// initial container creation. Even for a read-only rootfs container
// runc modifies the rootfs to add mountpoints which do not exist.
+// This function also creates missing mountpoints as long as they +// are not on top of a tmpfs, as CRIU will restore tmpfs content anyway. +func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error { + // First get a list of a all tmpfs mounts + tmpfs := []string{} + for _, m := range mounts { + switch m.Device { + case "tmpfs": + tmpfs = append(tmpfs, m.Destination) + } + } + // Now go through all mounts and create the mountpoints + // if the mountpoints are not on a tmpfs, as CRIU will + // restore the complete tmpfs content from its checkpoint. + for _, m := range mounts { + if !isPathInPrefixList(m.Destination, tmpfs) { + if err := c.makeCriuRestoreMountpoints(m); err != nil { + return err + } + } + } + return nil +} + +func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { + c.m.Lock() + defer c.m.Unlock() + + var extraFiles []*os.File + + // Restore is unlikely to work if os.Geteuid() != 0 || system.RunningInUserNS(). + // (CLI prints a warning) + // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have + // support for unprivileged restore at the moment. + + // We are relying on the CRIU version RPC which was introduced with CRIU 3.0.0 + if err := c.checkCriuVersion(30000); err != nil { + return err + } + if criuOpts.WorkDirectory == "" { + criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") + } + // Since a container can be C/R'ed multiple times, + // the work directory may already exist. 
+ if err := os.Mkdir(criuOpts.WorkDirectory, 0700); err != nil && !os.IsExist(err) { + return err + } + workDir, err := os.Open(criuOpts.WorkDirectory) + if err != nil { + return err + } + defer workDir.Close() + if criuOpts.ImagesDirectory == "" { + return errors.New("invalid directory to restore checkpoint") + } + imageDir, err := os.Open(criuOpts.ImagesDirectory) + if err != nil { + return err + } + defer imageDir.Close() + // CRIU has a few requirements for a root directory: + // * it must be a mount point + // * its parent must not be overmounted + // c.config.Rootfs is bind-mounted to a temporary directory + // to satisfy these requirements. + root := filepath.Join(c.root, "criu-root") + if err := os.Mkdir(root, 0755); err != nil { + return err + } + defer os.Remove(root) + root, err = filepath.EvalSymlinks(root) + if err != nil { + return err + } + err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "") + if err != nil { + return err + } + defer unix.Unmount(root, unix.MNT_DETACH) + t := criurpc.CriuReqType_RESTORE + req := &criurpc.CriuReq{ + Type: &t, + Opts: &criurpc.CriuOpts{ + ImagesDirFd: proto.Int32(int32(imageDir.Fd())), + WorkDirFd: proto.Int32(int32(workDir.Fd())), + EvasiveDevices: proto.Bool(true), + LogLevel: proto.Int32(4), + LogFile: proto.String("restore.log"), + RstSibling: proto.Bool(true), + Root: proto.String(root), + ManageCgroups: proto.Bool(true), + NotifyScripts: proto.Bool(true), + ShellJob: proto.Bool(criuOpts.ShellJob), + ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections), + TcpEstablished: proto.Bool(criuOpts.TcpEstablished), + FileLocks: proto.Bool(criuOpts.FileLocks), + EmptyNs: proto.Uint32(criuOpts.EmptyNs), + OrphanPtsMaster: proto.Bool(true), + AutoDedup: proto.Bool(criuOpts.AutoDedup), + LazyPages: proto.Bool(criuOpts.LazyPages), + }, + } + + c.handleCriuConfigurationFile(req.Opts) + + if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil { + return err + } + + // This will modify 
the rootfs of the container in the same way runc + // modifies the container during initial creation. + if err := c.prepareCriuRestoreMounts(c.config.Mounts); err != nil { + return err + } + + hasCgroupns := c.config.Namespaces.Contains(configs.NEWCGROUP) + for _, m := range c.config.Mounts { + switch m.Device { + case "bind": + c.addCriuRestoreMount(req, m) + case "cgroup": + if cgroups.IsCgroup2UnifiedMode() || hasCgroupns { + continue + } + // cgroup v1 is a set of bind mounts, unless cgroupns is used + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + for _, b := range binds { + c.addCriuRestoreMount(req, b) + } + } + } + + if len(c.config.MaskPaths) > 0 { + m := &configs.Mount{Destination: "/dev/null", Source: "/dev/null"} + c.addCriuRestoreMount(req, m) + } + + for _, node := range c.config.Devices { + m := &configs.Mount{Destination: node.Path, Source: node.Path} + c.addCriuRestoreMount(req, m) + } + + if criuOpts.EmptyNs&unix.CLONE_NEWNET == 0 { + c.restoreNetwork(req, criuOpts) + } + + // append optional manage cgroups mode + if criuOpts.ManageCgroupsMode != 0 { + mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode) + req.Opts.ManageCgroupsMode = &mode + } + + var ( + fds []string + fdJSON []byte + ) + if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { + return err + } + + if err := json.Unmarshal(fdJSON, &fds); err != nil { + return err + } + for i := range fds { + if s := fds[i]; strings.Contains(s, "pipe:") { + inheritFd := new(criurpc.InheritFd) + inheritFd.Key = proto.String(s) + inheritFd.Fd = proto.Int32(int32(i)) + req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) + } + } + err = c.criuSwrk(process, req, criuOpts, extraFiles) + + // Now that CRIU is done let's close all opened FDs CRIU needed. 
+ for _, fd := range extraFiles { + fd.Close() + } + + return err +} + +func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // need to apply cgroups only on restore + if req.GetType() != criurpc.CriuReqType_RESTORE { + return nil + } + + // XXX: Do we need to deal with this case? AFAIK criu still requires root. + if err := c.cgroupManager.Apply(pid); err != nil { + return err + } + // sysbox-runc: place the pid in the sys container's cgroup root. The prior call to + // Apply(pid) is necessary because Apply() populates the cgroup manager's internal + // state. + if err := c.cgroupManager.ApplyChildCgroup(pid); err != nil { + return err + } + + if err := c.cgroupManager.Set(c.config); err != nil { + return newSystemError(err) + } + + if cgroups.IsCgroup2UnifiedMode() { + return nil + } + + // the stuff below is cgroupv1-specific + path := fmt.Sprintf("/proc/%d/cgroup", pid) + cgroupsPaths, err := cgroups.ParseCgroupFile(path) + if err != nil { + return err + } + + for c, p := range cgroupsPaths { + cgroupRoot := &criurpc.CgroupRoot{ + Ctrl: proto.String(c), + Path: proto.String(p), + } + req.Opts.CgRoot = append(req.Opts.CgRoot, cgroupRoot) + } + + return nil +} + +func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *CriuOpts, extraFiles []*os.File) error { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_SEQPACKET|unix.SOCK_CLOEXEC, 0) + if err != nil { + return err + } + + var logPath string + if opts != nil { + logPath = filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile()) + } else { + // For the VERSION RPC 'opts' is set to 'nil' and therefore + // opts.WorkDirectory does not exist. Set logPath to "". 
+ logPath = "" + } + criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client") + criuClientFileCon, err := net.FileConn(criuClient) + criuClient.Close() + if err != nil { + return err + } + + criuClientCon := criuClientFileCon.(*net.UnixConn) + defer criuClientCon.Close() + + criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server") + defer criuServer.Close() + + args := []string{"swrk", "3"} + if c.criuVersion != 0 { + // If the CRIU Version is still '0' then this is probably + // the initial CRIU run to detect the version. Skip it. + logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath) + } + logrus.Debugf("Using CRIU with following args: %s", args) + cmd := exec.Command(c.criuPath, args...) + if process != nil { + cmd.Stdin = process.Stdin + cmd.Stdout = process.Stdout + cmd.Stderr = process.Stderr + } + cmd.ExtraFiles = append(cmd.ExtraFiles, criuServer) + if extraFiles != nil { + cmd.ExtraFiles = append(cmd.ExtraFiles, extraFiles...) + } + + if err := cmd.Start(); err != nil { + return err + } + // we close criuServer so that even if CRIU crashes or unexpectedly exits, runc will not hang. + criuServer.Close() + // cmd.Process will be replaced by a restored init. + criuProcess := cmd.Process + + var criuProcessState *os.ProcessState + defer func() { + if criuProcessState == nil { + criuClientCon.Close() + _, err := criuProcess.Wait() + if err != nil { + logrus.Warnf("wait on criuProcess returned %v", err) + } + } + }() + + if err := c.criuApplyCgroups(criuProcess.Pid, req); err != nil { + return err + } + + var extFds []string + if process != nil { + extFds, err = getPipeFds(criuProcess.Pid) + if err != nil { + return err + } + } + + logrus.Debugf("Using CRIU in %s mode", req.GetType().String()) + // In the case of criurpc.CriuReqType_FEATURE_CHECK req.GetOpts() + // should be empty. For older CRIU versions it still will be + // available but empty. criurpc.CriuReqType_VERSION actually + // has no req.GetOpts(). 
+ if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK || + req.GetType() == criurpc.CriuReqType_VERSION) { + + val := reflect.ValueOf(req.GetOpts()) + v := reflect.Indirect(val) + for i := 0; i < v.NumField(); i++ { + st := v.Type() + name := st.Field(i).Name + if strings.HasPrefix(name, "XXX_") { + continue + } + value := val.MethodByName("Get" + name).Call([]reflect.Value{}) + logrus.Debugf("CRIU option %s with value %v", name, value[0]) + } + } + data, err := proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClientCon.Write(data) + if err != nil { + return err + } + + buf := make([]byte, 10*4096) + oob := make([]byte, 4096) + for { + n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob) + if req.Opts != nil && req.Opts.StatusFd != nil { + // Close status_fd as soon as we got something back from criu, + // assuming it has consumed (reopened) it by this time. + // Otherwise it will might be left open forever and whoever + // is waiting on it will wait forever. 
+ fd := int(*req.Opts.StatusFd) + _ = unix.Close(fd) + req.Opts.StatusFd = nil + } + if err != nil { + return err + } + if n == 0 { + return errors.New("unexpected EOF") + } + if n == len(buf) { + return errors.New("buffer is too small") + } + + resp := new(criurpc.CriuResp) + err = proto.Unmarshal(buf[:n], resp) + if err != nil { + return err + } + if !resp.GetSuccess() { + typeString := req.GetType().String() + return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath) + } + + t := resp.GetType() + switch { + case t == criurpc.CriuReqType_FEATURE_CHECK: + logrus.Debugf("Feature check says: %s", resp) + criuFeatures = resp.GetFeatures() + case t == criurpc.CriuReqType_NOTIFY: + if err := c.criuNotifications(resp, process, cmd, opts, extFds, oob[:oobn]); err != nil { + return err + } + t = criurpc.CriuReqType_NOTIFY + req = &criurpc.CriuReq{ + Type: &t, + NotifySuccess: proto.Bool(true), + } + data, err = proto.Marshal(req) + if err != nil { + return err + } + _, err = criuClientCon.Write(data) + if err != nil { + return err + } + continue + case t == criurpc.CriuReqType_RESTORE: + case t == criurpc.CriuReqType_DUMP: + case t == criurpc.CriuReqType_PRE_DUMP: + default: + return fmt.Errorf("unable to parse the response %s", resp.String()) + } + + break + } + + criuClientCon.CloseWrite() + // cmd.Wait() waits cmd.goroutines which are used for proxying file descriptors. + // Here we want to wait only the CRIU process. + criuProcessState, err = criuProcess.Wait() + if err != nil { + return err + } + + // In pre-dump mode CRIU is in a loop and waits for + // the final DUMP command. 
+ // The current runc pre-dump approach, however, is + // start criu in PRE_DUMP once for a single pre-dump + // and not the whole series of pre-dump, pre-dump, ...m, dump + // If we got the message CriuReqType_PRE_DUMP it means + // CRIU was successful and we need to forcefully stop CRIU + if !criuProcessState.Success() && *req.Type != criurpc.CriuReqType_PRE_DUMP { + return fmt.Errorf("criu failed: %s\nlog file: %s", criuProcessState.String(), logPath) + } + return nil +} + +// block any external network activity +func lockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + + if err := strategy.detach(config); err != nil { + return err + } + } + return nil +} + +func unlockNetwork(config *configs.Config) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err = strategy.attach(config); err != nil { + return err + } + } + return nil +} + +func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Process, cmd *exec.Cmd, opts *CriuOpts, fds []string, oob []byte) error { + notify := resp.GetNotify() + if notify == nil { + return fmt.Errorf("invalid response: %s", resp.String()) + } + script := notify.GetScript() + logrus.Debugf("notify: %s\n", script) + switch script { + case "post-dump": + f, err := os.Create(filepath.Join(c.root, "checkpoint")) + if err != nil { + return err + } + f.Close() + case "network-unlock": + if err := unlockNetwork(c.config); err != nil { + return err + } + case "network-lock": + if err := lockNetwork(c.config); err != nil { + return err + } + case "setup-namespaces": + if c.config.Hooks != nil { + s, err := c.currentOCIState() + if err != nil { + return nil + } + s.Pid = int(notify.GetPid()) + + if err := c.config.Hooks[configs.Prestart].RunHooks(s); err != nil { + return err + } + if err := 
c.config.Hooks[configs.CreateRuntime].RunHooks(s); err != nil { + return err + } + } + case "post-restore": + pid := notify.GetPid() + + p, err := os.FindProcess(int(pid)) + if err != nil { + return err + } + cmd.Process = p + + r, err := newRestoredProcess(cmd, fds) + if err != nil { + return err + } + process.ops = r + if err := c.state.transition(&restoredState{ + imageDir: opts.ImagesDirectory, + c: c, + }); err != nil { + return err + } + // create a timestamp indicating when the restored checkpoint was started + c.created = time.Now().UTC() + if _, err := c.updateState(r); err != nil { + return err + } + if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + logrus.Error(err) + } + } + case "orphan-pts-master": + scm, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return err + } + fds, err := unix.ParseUnixRights(&scm[0]) + if err != nil { + return err + } + + master := os.NewFile(uintptr(fds[0]), "orphan-pts-master") + defer master.Close() + + // While we can access console.master, using the API is a good idea. 
+ if err := utils.SendFd(process.ConsoleSocket, master.Name(), master.Fd()); err != nil { + return err + } + case "status-ready": + if opts.StatusFd != -1 { + // write \0 to status fd to notify that lazy page server is ready + _, err := unix.Write(opts.StatusFd, []byte{0}) + if err != nil { + logrus.Warnf("can't write \\0 to status fd: %v", err) + } + _ = unix.Close(opts.StatusFd) + opts.StatusFd = -1 + } + } + return nil +} + +func (c *linuxContainer) updateState(process parentProcess) (*State, error) { + if process != nil { + c.initProcess = process + } + state, err := c.currentState() + if err != nil { + return nil, err + } + err = c.saveState(state) + if err != nil { + return nil, err + } + return state, nil +} + +func (c *linuxContainer) saveState(s *State) (retErr error) { + tmpFile, err := ioutil.TempFile(c.root, "state-") + if err != nil { + return err + } + + defer func() { + if retErr != nil { + tmpFile.Close() + os.Remove(tmpFile.Name()) + } + }() + + err = utils.WriteJSON(tmpFile, s) + if err != nil { + return err + } + err = tmpFile.Close() + if err != nil { + return err + } + + stateFilePath := filepath.Join(c.root, stateFilename) + return os.Rename(tmpFile.Name(), stateFilePath) +} + +func (c *linuxContainer) currentStatus() (Status, error) { + if err := c.refreshState(); err != nil { + return -1, err + } + return c.state.status(), nil +} + +// refreshState needs to be called to verify that the current state on the +// container is what is true. Because consumers of libcontainer can use it +// out of process we need to verify the container's status based on runtime +// information and not rely on our in process info. 
+func (c *linuxContainer) refreshState() error { + paused, err := c.isPaused() + if err != nil { + return err + } + if paused { + return c.state.transition(&pausedState{c: c}) + } + t := c.runType() + switch t { + case Created: + return c.state.transition(&createdState{c: c}) + case Running: + return c.state.transition(&runningState{c: c}) + } + return c.state.transition(&stoppedState{c: c}) +} + +func (c *linuxContainer) runType() Status { + if c.initProcess == nil { + return Stopped + } + pid := c.initProcess.pid() + stat, err := system.Stat(pid) + if err != nil { + return Stopped + } + if stat.StartTime != c.initProcessStartTime || stat.State == system.Zombie || stat.State == system.Dead { + return Stopped + } + // We'll create exec fifo and blocking on it after container is created, + // and delete it after start container. + if _, err := os.Stat(filepath.Join(c.root, execFifoFilename)); err == nil { + return Created + } + return Running +} + +func (c *linuxContainer) isPaused() (bool, error) { + state, err := c.cgroupManager.GetFreezerState() + if err != nil { + return false, err + } + return state == configs.Frozen, nil +} + +func (c *linuxContainer) currentState() (*State, error) { + var ( + startTime uint64 + externalDescriptors []string + pid = -1 + ) + if c.initProcess != nil { + pid = c.initProcess.pid() + startTime, _ = c.initProcess.startTime() + externalDescriptors = c.initProcess.externalDescriptors() + } + intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) + if err != nil { + intelRdtPath = "" + } + state := &State{ + BaseState: BaseState{ + ID: c.ID(), + Config: *c.config, + InitProcessPid: pid, + InitProcessStartTime: startTime, + Created: c.created, + }, + Rootless: c.config.RootlessEUID && c.config.RootlessCgroups, + CgroupPaths: c.cgroupManager.GetPaths(), + IntelRdtPath: intelRdtPath, + NamespacePaths: make(map[configs.NamespaceType]string), + ExternalDescriptors: externalDescriptors, + Sysbox: *c.sysbox, + SysMgr: *c.sysbox.Mgr, + SysFs: 
*c.sysbox.Fs, + } + + if pid > 0 { + for _, ns := range c.config.Namespaces { + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + for _, nsType := range configs.NamespaceTypes() { + if !configs.IsNamespaceSupported(nsType) { + continue + } + if _, ok := state.NamespacePaths[nsType]; !ok { + ns := configs.Namespace{Type: nsType} + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + } + } + return state, nil +} + +func (c *linuxContainer) currentOCIState() (*specs.State, error) { + bundle, annotations := utils.Annotations(c.config.Labels) + state := &specs.State{ + Version: specs.Version, + ID: c.ID(), + Bundle: bundle, + Annotations: annotations, + } + status, err := c.currentStatus() + if err != nil { + return nil, err + } + state.Status = specs.ContainerState(status.String()) + if status != Stopped { + if c.initProcess != nil { + state.Pid = c.initProcess.pid() + } + } + return state, nil +} + +// orderNamespacePaths sorts namespace paths into a list of paths that we +// can setns in order. +func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) { + paths := []string{} + for _, ns := range configs.NamespaceTypes() { + + // Remove namespaces that we don't need to join. 
+ if !c.config.Namespaces.Contains(ns) { + continue + } + + if p, ok := namespaces[ns]; ok && p != "" { + // check if the requested namespace is supported + if !configs.IsNamespaceSupported(ns) { + return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns)) + } + // only set to join this namespace if it exists + if _, err := os.Lstat(p); err != nil { + return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p) + } + // do not allow namespace path with comma as we use it to separate + // the namespace paths + if strings.ContainsRune(p, ',') { + return nil, newSystemError(fmt.Errorf("invalid path %s", p)) + } + paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p)) + } + + } + + return paths, nil +} + +func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) { + data := bytes.NewBuffer(nil) + for _, im := range idMap { + line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size) + if _, err := data.WriteString(line); err != nil { + return nil, err + } + } + return data.Bytes(), nil +} + +// bootstrapData encodes the necessary data in netlink binary format +// as a io.Reader. +// Consumer can write the data to a bootstrap program +// such as one that uses nsenter package to bootstrap the container's +// init process correctly, i.e. with correct namespaces, uid/gid +// mapping etc. 
+func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (io.Reader, error) { + // create the netlink message + r := nl.NewNetlinkRequest(int(InitMsg), 0) + + // write cloneFlags + r.AddData(&Int32msg{ + Type: CloneFlagsAttr, + Value: uint32(cloneFlags), + }) + + // write custom namespace paths + if len(nsMaps) > 0 { + nsPaths, err := c.orderNamespacePaths(nsMaps) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(nsPaths, ",")), + }) + } + + // write uid & gid mappings only when we create a new user-ns + _, joinExistingUser := nsMaps[configs.NEWUSER] + if !joinExistingUser { + // write uid mappings + if len(c.config.UidMappings) > 0 { + if c.config.RootlessEUID && c.newuidmapPath != "" { + r.AddData(&Bytemsg{ + Type: UidmapPathAttr, + Value: []byte(c.newuidmapPath), + }) + } + b, err := encodeIDMapping(c.config.UidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: UidmapAttr, + Value: b, + }) + } + + // write gid mappings + if len(c.config.GidMappings) > 0 { + b, err := encodeIDMapping(c.config.GidMappings) + if err != nil { + return nil, err + } + r.AddData(&Bytemsg{ + Type: GidmapAttr, + Value: b, + }) + if c.config.RootlessEUID && c.newgidmapPath != "" { + r.AddData(&Bytemsg{ + Type: GidmapPathAttr, + Value: []byte(c.newgidmapPath), + }) + } + if requiresRootOrMappingTool(c.config) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } + } + } + + if c.config.OomScoreAdj != nil { + // write the configured oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)), + }) + } else { + // Pass sysbox's oom_score_adj explicitly to nsenter; this is needed because nsenter + // initially sets the oom_score_adj to -999 and later reverts it to the given value + // (so as to allow child processes to set -999 if desired). 
By passing it here, we + // honor the OCI spec: "If oomScoreAdj is not set, the runtime MUST NOT change the + // value of oom_score_adj." + var err error + + f, err := os.Open("/proc/self/oom_score_adj") + if err != nil { + return nil, err + } + defer f.Close() + + str, err := bufio.NewReader(f).ReadString('\n') + if err != nil { + return nil, err + } + + str = strings.Trim(str, "\n") + + selfOomScoreAdj, err := strconv.Atoi(str) + if err != nil { + return nil, err + } + + // For sys containers we don't allow -1000 for the OOM score value, as this + // is not supported from within a user-ns. + if selfOomScoreAdj < -999 { + selfOomScoreAdj = -999 + } + + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(strconv.Itoa(selfOomScoreAdj)), + }) + } + + // write rootless + r.AddData(&Boolmsg{ + Type: RootlessEUIDAttr, + Value: c.config.RootlessEUID, + }) + + // sysbox-runc: request prep of the rootfs when we create a new mnt-ns + _, joinExistingMnt := nsMaps[configs.NEWNS] + if !joinExistingMnt { + + r.AddData(&Boolmsg{ + Type: PrepRootfsAttr, + Value: true, + }) + + makeParentPriv, parentMount, err := rootfsParentMountIsShared(c.config.Rootfs) + if err != nil { + return nil, err + } + + r.AddData(&Boolmsg{ + Type: MakeParentPrivAttr, + Value: makeParentPriv, + }) + + r.AddData(&Bytemsg{ + Type: ParentMountAttr, + Value: []byte(parentMount), + }) + + propFlag := unix.MS_SLAVE | unix.MS_REC + if c.config.RootPropagation != 0 { + propFlag = c.config.RootPropagation + } + + r.AddData(&Int32msg{ + Type: RootfsPropAttr, + Value: uint32(propFlag), + }) + + r.AddData(&Bytemsg{ + Type: RootfsAttr, + Value: []byte(c.config.Rootfs), + }) + + shiftfsMounts := []string{} + for _, m := range c.config.ShiftfsMounts { + shiftfsMounts = append(shiftfsMounts, m.Source) + } + + r.AddData(&Bytemsg{ + Type: ShiftfsMountsAttr, + Value: []byte(strings.Join(shiftfsMounts, ",")), + }) + + } + + return bytes.NewReader(r.Serialize()), nil +} + +// ignoreTerminateErrors returns nil if 
the given err matches an error known +// to indicate that the terminate occurred successfully or err was nil, otherwise +// err is returned unaltered. +func ignoreTerminateErrors(err error) error { + if err == nil { + return nil + } + // terminate() might return an error from ether Kill or Wait. + // The (*Cmd).Wait documentation says: "If the command fails to run + // or doesn't complete successfully, the error is of type *ExitError". + // Filter out such errors (like "exit status 1" or "signal: killed"). + var exitErr *exec.ExitError + if errors.As(err, &exitErr) { + return nil + } + // TODO: use errors.Is(err, os.ErrProcessDone) here and + // remove "process already finished" string comparison below + // once go 1.16 is minimally supported version. + + s := err.Error() + if strings.Contains(s, "process already finished") || + strings.Contains(s, "Wait was already called") { + return nil + } + return err +} + +func requiresRootOrMappingTool(c *configs.Config) bool { + gidMap := []configs.IDMap{ + {ContainerID: 0, HostID: os.Getegid(), Size: 1}, + } + return !reflect.DeepEqual(c.GidMappings, gidMap) +} + +// Borrowed from https://golang.org/src/syscall/exec_linux.go (BSD-license) +func formatIDMappings(idMap []configs.IDMap) []byte { + var data []byte + for _, im := range idMap { + data = append(data, []byte(strconv.Itoa(im.ContainerID)+" "+strconv.Itoa(im.HostID)+" "+strconv.Itoa(im.Size)+"\n")...) + } + return data +} + +// sysbox-runc: handleReqOp handles requests from the container's init process for actions +// that can't be done by it (e.g., due to lack of permissions, etc.). +func (c *linuxContainer) handleReqOp(childPid int, reqs []opReq) error { + + if len(reqs) == 0 { + return newSystemError(fmt.Errorf("no op requests!")) + } + + // If multiple requests are passed in the slice, they must all be + // of the same type. 
+ op := reqs[0].Op + + if op != bind && op != switchDockerDns && op != chown && op != mkdir && op != rootfsIDMap { + return newSystemError(fmt.Errorf("invalid opReq type %d", int(op))) + } + + return c.handleOp(op, childPid, reqs) +} + +// sysbox-runc: handleOp dispatches a helpter process that enters one or more of +// the container's namespaces and performs the given request. By virtue of only +// entering a subset of the container's namespaces, the helper can bypass restrictions +// that the container's init process would have in order to perform those same actions. +func (c *linuxContainer) handleOp(op opReqType, childPid int, reqs []opReq) error { + + // create the socket pairs for communication with the new nsenter child process + parentMsgPipe, childMsgPipe, err := utils.NewSockPair("initHelper") + if err != nil { + return newSystemErrorWithCause(err, "creating new initHelper pipe") + } + defer parentMsgPipe.Close() + + parentLogPipe, childLogPipe, err := os.Pipe() + if err != nil { + return newSystemErrorWithCause(err, "Unable to create the initMount log pipe") + } + + // create a new initMount command + initProc := c.initProcess.(*initProcess).process + cmd := c.initHelperCmdTemplate(initProc, childMsgPipe, childLogPipe) + + // Log error messages from the initMount child process + go logs.ForwardLogs(parentLogPipe) + + // start the command (creates parent, child, and grandchild + // processes; the granchild enters the go-runtime in the desired + // namespaces). 
+ err = cmd.Start() + childMsgPipe.Close() + childLogPipe.Close() + if err != nil { + return newSystemErrorWithCause(err, "starting initHelper child") + } + + // create the config payload + namespaces := []string{} + + switch op { + case bind, chown, mkdir, rootfsIDMap: + namespaces = append(namespaces, + fmt.Sprintf("mnt:/proc/%d/ns/mnt", childPid), + fmt.Sprintf("pid:/proc/%d/ns/pid", childPid), + ) + + case switchDockerDns: + namespaces = append(namespaces, + fmt.Sprintf("net:/proc/%d/ns/net", childPid), + ) + } + + r := nl.NewNetlinkRequest(int(InitMsg), 0) + r.AddData(&Bytemsg{ + Type: NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + + // send the config to the parent process + if _, err := io.Copy(parentMsgPipe, bytes.NewReader(r.Serialize())); err != nil { + return newSystemErrorWithCause(err, "copying initHelper bootstrap data to pipe") + } + + // wait for parent process to exit + status, err := cmd.Process.Wait() + if err != nil { + cmd.Wait() + return err + } + if !status.Success() { + cmd.Wait() + return newSystemError(&exec.ExitError{ProcessState: status}) + } + + // get the first child pid from the pipe + var pid pid + decoder := json.NewDecoder(parentMsgPipe) + if err := decoder.Decode(&pid); err != nil { + cmd.Wait() + return newSystemErrorWithCause(err, "getting the initHelper pid from pipe") + } + + firstChildProcess, err := os.FindProcess(pid.PidFirstChild) + if err != nil { + return err + } + + // wait for the first child to exit; ignore errors in case the child has + // already been reaped for any reason + _, _ = firstChildProcess.Wait() + + // grandchild remains and will enter the go runtime + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + cmd.Process = process + + // send the action requests to the grandchild + if err := utils.WriteJSON(parentMsgPipe, reqs); err != nil { + return newSystemErrorWithCause(err, "writing init mount info to pipe") + } + + // wait for msg from the grandchild 
indicating that it's done + ierr := parseSync(parentMsgPipe, func(sync *syncT) error { + switch sync.Type { + case opDone: + // no further action; parseSync will wait for pipe to be closed on the other side. + default: + return newSystemError(fmt.Errorf("invalid JSON payload from initSetRootfs child")) + } + return nil + }) + + // destroy the socket pair + if err := unix.Shutdown(int(parentMsgPipe.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "shutting down initHelper pipe") + } + + if ierr != nil { + cmd.Wait() + return ierr + } + + cmd.Wait() + return nil +} + +// Processes a seccomp notification file-descriptor for the sys container by passing it to +// sysbox-fs to setup syscall trapping. +func (c *linuxContainer) procSeccompInit(pid int, fd int32) error { + if c.sysbox.Fs.Enabled() { + if err := c.sysbox.Fs.SendSeccompInit(pid, c.id, fd); err != nil { + return newSystemErrorWithCause(err, "sending seccomp fd to sysbox-fs") + } + } + return nil +} + +// sysbox-runc: sets up the shiftfs marks for the container +func (c *linuxContainer) setupShiftfsMarks() error { + + mi, err := mount.GetMounts() + if err != nil { + return fmt.Errorf("failed to read mountinfo: %s", err) + } + + config := c.config + shiftfsMounts := []shiftfs.MountPoint{} + noShiftfsOnFuse := c.sysbox.Mgr.Config.NoShiftfsOnFuse + + // rootfs + if config.RootfsUidShiftType == sh.Shiftfs || + config.RootfsUidShiftType == sh.IDMappedMountOrShiftfs { + shiftfsMounts = append(shiftfsMounts, shiftfs.MountPoint{Source: config.Rootfs, Readonly: false}) + } + + // bind-mounts + if config.BindMntUidShiftType == sh.Shiftfs || + config.BindMntUidShiftType == sh.IDMappedMountOrShiftfs { + + for _, m := range config.Mounts { + if m.Device == "bind" { + + if ignoreIDshift(m, config) { + continue + } + + if m.IDMappedMount { + continue + } + + needShiftfs, err := needUidShiftOnBindSrc(m, config) + if err != nil { + return newSystemErrorWithCause(err, "checking uid shifting on bind 
source") + } + + if !needShiftfs { + continue + } + + // If the mount source is a file, it may itself be a bind-mount from + // another file. In this case, we need to mount shiftfs over the + // orig file (i.e., the source of the bind mount). + if !m.BindSrcInfo.IsDir { + + isBindMnt, origSrc, err := fileIsBindMount(mi, m.Source) + if err != nil { + return fmt.Errorf("failed to check if %s is a bind-mount: %s", m.Source, err) + } + + if isBindMnt { + m.Source = origSrc + } + } + + // shiftfs mounts must be on directories (not on files). But this + // does not mean that the directory on which shiftfs is mounted is + // necessarily fully exposed inside the container; it may be that + // only a file in that directory is exposed inside the container + // (via bind-mounts when setting up the container rootfs). + + var dir string + if !m.BindSrcInfo.IsDir { + dir = filepath.Dir(m.Source) + } else { + dir = m.Source + } + + allow, err := allowShiftfsBindSource(dir, noShiftfsOnFuse) + if err != nil { + return err + } + + if !allow { + continue + } + + duplicate := false + for _, sm := range shiftfsMounts { + if sm.Source == dir { + duplicate = true + } + } + + if !duplicate { + sm := shiftfs.MountPoint{ + Source: dir, + Readonly: m.Flags&unix.MS_RDONLY == unix.MS_RDONLY, + } + shiftfsMounts = append(shiftfsMounts, sm) + } + } + } + } + + if len(shiftfsMounts) == 0 { + return nil + } + + // Perform the shiftfs marks; normally this is done by sysbox-mgr as it can + // track shiftfs mark-points on the host. But for sysbox-runc unit testing + // the sysbox-mgr is not present, so we do the shiftfs marking locally (which + // only works when sys containers are not sharing mount points). 
+ if c.sysbox.Mgr.Enabled() { + + shiftfsMarks, err := c.sysbox.Mgr.ReqShiftfsMark(shiftfsMounts) + if err != nil { + return err + } + + if len(shiftfsMarks) != len(shiftfsMounts) { + return fmt.Errorf("Error creating shiftfs mark-mounts: shiftfsMounts = %v, shiftfsMarks = %v", + shiftfsMounts, shiftfsMarks) + } + + config.ShiftfsMounts = shiftfsMarks + + // Replace the container's mounts that have shiftfs with the shiftfs + // markpoint allocated by sysbox-mgr. + + if config.RootfsUidShiftType == sh.Shiftfs { + config.Rootfs = shiftfsMarks[0].Source + } + + for _, m := range config.Mounts { + if m.Device == "bind" { + if m.BindSrcInfo.IsDir { + for i, sm := range shiftfsMounts { + if m.Source == sm.Source { + m.Source = shiftfsMarks[i].Source + } + } + } else { + for i, sm := range shiftfsMounts { + if filepath.Dir(m.Source) == sm.Source { + m.Source = filepath.Join(shiftfsMarks[i].Source, filepath.Base(m.Source)) + } + } + } + } + } + + return nil + + } else { + config.ShiftfsMounts = shiftfsMounts + return c.setupShiftfsMarkLocal(mi) + } +} + +// Setup shiftfs marks; meant for testing only +func (c *linuxContainer) setupShiftfsMarkLocal(mi []*mount.Info) error { + + for _, m := range c.config.ShiftfsMounts { + mounted, err := mount.MountedWithFs(m.Source, "shiftfs", mi) + if err != nil { + return newSystemErrorWithCausef(err, "checking for shiftfs mount at %s", m.Source) + } + if !mounted { + if err := shiftfs.Mark(m.Source, m.Source); err != nil { + return newSystemErrorWithCausef(err, "marking shiftfs on %s", m.Source) + } + } + } + + return nil +} + +// Teardown shiftfs marks; meant for testing only +func (c *linuxContainer) teardownShiftfsMarkLocal(mi []*mount.Info) error { + + for _, m := range c.config.ShiftfsMounts { + mounted, err := mount.MountedWithFs(m.Source, "shiftfs", mi) + if err != nil { + return newSystemErrorWithCausef(err, "checking for shiftfs mount at %s", m.Source) + } + if mounted { + if err := shiftfs.Unmount(m.Source); err != nil { + 
return newSystemErrorWithCausef(err, "unmarking shiftfs on %s", m.Source) + } + } + + return nil +} + +// chowns the container's rootfs to match the user-ns uid & gid mappings. +func (c *linuxContainer) chownRootfs() error { + + rootfs := c.config.Rootfs + + uidOffset := int32(c.config.UidMappings[0].HostID) + gidOffset := int32(c.config.GidMappings[0].HostID) + + if err := sh.ShiftIdsWithChown(rootfs, uidOffset, gidOffset); err != nil { + return newSystemErrorWithCausef(err, "chowning rootfs at %s by offset %d, %d", rootfs, uidOffset, gidOffset) + } + + return nil +} + +// reverts the container's rootfs chown (back to its original value) +func (c *linuxContainer) revertRootfsChown() error { + + if c.sysbox.Mgr.IsRootfsCloned() { + c.config.Rootfs = c.sysbox.Mgr.GetClonedRootfs() + } + + uidOffset := 0 - int32(c.config.UidMappings[0].HostID) + gidOffset := 0 - int32(c.config.GidMappings[0].HostID) + + if err := sh.ShiftIdsWithChown(c.config.Rootfs, uidOffset, gidOffset); err != nil { + return newSystemErrorWithCausef(err, "chowning rootfs at %s by offset %d, %d", c.config.Rootfs, uidOffset, gidOffset) + } + + return nil +} + +// The following are host directories where we never mount shiftfs as it causes functional problems. +var shiftfsDevBlacklist = []string{"/dev"} +var shiftfsBlacklist = []string{"shiftfs"} + +// sysbox-runc: allowShiftfsBindSource indicates if a shiftfs mount is allowed on +// the given bind mount. 
+func allowShiftfsBindSource(source string, noShiftfsOnFuse bool) (bool, error) { + + // Don't mount shiftfs on top of some devices + if sysboxLibsUtils.StringSliceContains(shiftfsDevBlacklist, source) { + return false, nil + } + + // Don't mount shiftfs on cgroup v2 bind mounts either + if strings.HasPrefix(source, "/sys/fs/cgroup") { + return false, nil + } + + // Don't mount shiftfs on top of some filesystems + fsName, err := sysboxLibsUtils.GetFsName(source) + if err != nil { + return false, err + } + if sysboxLibsUtils.StringSliceContains(shiftfsBlacklist, fsName) { + return false, nil + } + if noShiftfsOnFuse && fsName == "fuse" { + return false, nil + } + + return true, nil +} + +// sysbox-runc: determines which mounts must be ID-mapped; does not actually +// perform the ID-mapped mounts (that's done inside the container, see +// rootfs_init_linux.go) but rather marks the mount for ID-mapping only. +func (c *linuxContainer) setupIDMappedMounts() error { + + config := c.config + + // rootfs + if config.RootfsUidShiftType == sh.IDMappedMount || + config.RootfsUidShiftType == sh.IDMappedMountOrShiftfs { + idMapMountAllowed, err := idMap.IDMapMountSupportedOnPath(config.Rootfs) + if err != nil { + return newSystemErrorWithCausef(err, "checking for ID-mapped mount support on rootfs %s", config.Rootfs) + } + if idMapMountAllowed { + config.RootfsUidShiftType = sh.IDMappedMount + } + } + + // bind-mounts + if config.BindMntUidShiftType == sh.IDMappedMount || + config.BindMntUidShiftType == sh.IDMappedMountOrShiftfs { + + for _, m := range config.Mounts { + if m.Device == "bind" { + + if ignoreIDshift(m, config) { + continue + } + + idMapMntAllowed, err := idMap.IDMapMountSupportedOnPath(m.Source) + if err != nil { + return newSystemErrorWithCausef(err, "checking for ID-mapped mount support on bind source %s", m.Source) + } + + if !idMapMntAllowed { + continue + } + + needIDMap, err := needUidShiftOnBindSrc(m, config) + if err != nil { + return 
newSystemErrorWithCause(err, "checking uid shifting on bind source") + } + + m.IDMappedMount = needIDMap + } + } + } + + return nil +} + +// needUidShiftOnBindSrc checks if uid/gid shifting on the given bind mount source path is +// required to run the system container. +func needUidShiftOnBindSrc(mount *configs.Mount, config *configs.Config) (bool, error) { + + // sysbox-fs handles uid(gid) shifting itself, so no need for mounting shiftfs on top + if strings.HasPrefix(mount.Source, syscont.SysboxFsDir+"/") { + return false, nil + } + + // Don't uid shift on bind sources under the container's rootfs + if strings.HasPrefix(mount.Source, config.Rootfs+"/") { + return false, nil + } + + // If the bind source has uid:gid ownership matching the container's user-ns + // mappings, uid shifting is not needed. + + var hostUid, hostGid uint32 + var uidSize, gidSize uint32 + + for _, mapping := range config.UidMappings { + if mapping.ContainerID == 0 { + hostUid = uint32(mapping.HostID) + } + uidSize += uint32(mapping.Size) + } + for _, mapping := range config.GidMappings { + if mapping.ContainerID == 0 { + hostGid = uint32(mapping.HostID) + } + gidSize += uint32(mapping.Size) + } + + if (mount.BindSrcInfo.Uid >= hostUid) && (mount.BindSrcInfo.Uid < hostUid+uidSize) && + (mount.BindSrcInfo.Gid >= hostGid) && (mount.BindSrcInfo.Gid < hostGid+gidSize) { + return false, nil + } + + return true, nil +} + +// Checks if the file at the given path is a bind-mount; if so, returns true and +// the path to the bind-mount's source. +func fileIsBindMount(mounts []*mount.Info, fpath string) (bool, string, error) { + var fpathMi *mount.Info + + // Since path corresponds to a file (not a directory), if it's a mountpoint + // then it must be a bind-mount (i.e., file mountpoints are only allowed for + // bind mounts). 
+ for _, mi := range mounts { + if mi.Mountpoint == fpath { + fpathMi = mi + break + } + } + + // If file is not a mountpoint, we are done + if fpathMi == nil { + return false, "", nil + } + + // Find the source of that bind mount. This is not as simple as looking at + // the fpathMi.Root, because the root itself may be a bind-mount. To resolve + // this, we find the device that backs the file, then find where that device + // is mounted at (the device's root mountpoint), and then use it to replace + // the corresponding prefix in the fpathMi.Root. + // + // For example: say fpath = /mnt/scratch/t1/f1 and the mount tree looks like: + // + // 1232 1303 0:60 / /mnt/scratch/tmpfs rw,relatime - tmpfs tmpfs rw,size=10240k + // 1233 1303 0:60 /f1-tmpfs /mnt/scratch/t1/f1 rw,relatime - tmpfs tmpfs rw,size=10240k + // + // Then we see that mount 1232 is the root mount for the device and it's mounted at "/mnt/scratch/tmpfs". + // Thus, we replace /f1-tmpfs -> /mnt/scratch/tmpfs/f1-tmpfs. + // + // Another example: say fpath = /mnt/scratch/t1/f3 and the mount tree looks like: + // + // 1302 1282 8:2 /var/tmp/sysbox-test-var-run /run rw,relatime - ext4 /dev/sda2 rw + // 1303 1282 8:2 /var/tmp/sysbox-test-scratch /mnt/scratch rw,relatime - ext4 /dev/sda2 rw + // 1234 1303 8:2 /var/tmp/sysbox-test-scratch/t1/f4 /mnt/scratch/t1/f3 rw,relatime - ext4 /dev/sda2 rw + // + // Then we see that mount 1303 is the root mount for the device and it's mounted at "/mnt/scratch". + // Thus, we replace /var/tmp/sysbox-test-scratch/t1/f4 -> /mnt/scratch/t1/f4. 
+ + devRoot := fpathMi.Root + devMp := "" + + for _, mi := range mounts { + if mi.Major == fpathMi.Major && mi.Minor == fpathMi.Minor { + if strings.HasPrefix(devRoot, mi.Root) { + devRoot = mi.Root + devMp = mi.Mountpoint + } + } + } + + // The extra "/" ensures we have a path separator in the resulting path + fpathMi.Root = strings.Replace(fpathMi.Root, devRoot, devMp+"/", 1) + return true, fpathMi.Root, nil +} + +// Iterate through the 'idShift-ignore-list' extracted by sysbox-mgr and skip all +// those mountpoints with a matching 'destination'. +func ignoreIDshift(mount *configs.Mount, config *configs.Config) bool { + for _, e := range config.IDshiftIgnoreList { + if e == mount.Destination { + return true + } + } + return false +} diff --git a/sysbox-runc/libcontainer/container_linux_test.go b/sysbox-runc/libcontainer/container_linux_test.go new file mode 100644 index 00000000..dd9f5d2a --- /dev/null +++ b/sysbox-runc/libcontainer/container_linux_test.go @@ -0,0 +1,420 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "fmt" + "io/ioutil" + "os" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libsysbox/sysbox" +) + +type mockCgroupManager struct { + pids []int + allPids []int + stats *cgroups.Stats + paths map[string]string +} + +type mockIntelRdtManager struct { + stats *intelrdt.Stats + path string +} + +func (m *mockCgroupManager) GetPids() ([]int, error) { + return m.pids, nil +} + +func (m *mockCgroupManager) GetAllPids() ([]int, error) { + return m.allPids, nil +} + +func (m *mockCgroupManager) GetStats() (*cgroups.Stats, error) { + return m.stats, nil +} + +func (m *mockCgroupManager) Apply(pid int) error { + return nil +} + +func (m *mockCgroupManager) Set(container *configs.Config) error { + return nil +} + 
+func (m *mockCgroupManager) Destroy() error { + return nil +} + +func (m *mockCgroupManager) Exists() bool { + _, err := os.Lstat(m.Path("devices")) + return err == nil +} + +func (m *mockCgroupManager) GetPaths() map[string]string { + return m.paths +} + +func (m *mockCgroupManager) Path(subsys string) string { + return m.paths[subsys] +} + +func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { + return nil +} + +func (m *mockCgroupManager) GetCgroups() (*configs.Cgroup, error) { + return nil, nil +} + +func (m *mockCgroupManager) GetFreezerState() (configs.FreezerState, error) { + return configs.Thawed, nil +} + +func (m *mockCgroupManager) CreateChildCgroup(container *configs.Config) error { + return nil +} + +func (m *mockCgroupManager) ApplyChildCgroup(pid int) error { + return nil +} + +func (m *mockCgroupManager) GetChildCgroupPaths() map[string]string { + return m.paths +} + +func (m *mockCgroupManager) GetType() cgroups.CgroupType { + return cgroups.Cgroup_v1_fs +} + +func (m *mockIntelRdtManager) Apply(pid int) error { + return nil +} + +func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) { + return m.stats, nil +} + +func (m *mockIntelRdtManager) Destroy() error { + return nil +} + +func (m *mockIntelRdtManager) GetPath() string { + return m.path +} + +func (m *mockIntelRdtManager) Set(container *configs.Config) error { + return nil +} + +func (m *mockIntelRdtManager) GetCgroups() (*configs.Cgroup, error) { + return nil, nil +} + +type mockProcess struct { + _pid int + started uint64 +} + +func (m *mockProcess) terminate() error { + return nil +} + +func (m *mockProcess) pid() int { + return m._pid +} + +func (m *mockProcess) startTime() (uint64, error) { + return m.started, nil +} + +func (m *mockProcess) start() error { + return nil +} + +func (m *mockProcess) wait() (*os.ProcessState, error) { + return nil, nil +} + +func (m *mockProcess) signal(_ os.Signal) error { + return nil +} + +func (m *mockProcess) 
externalDescriptors() []string { + return []string{} +} + +func (m *mockProcess) setExternalDescriptors(newFds []string) { +} + +func (m *mockProcess) forwardChildLogs() { +} + +func TestGetContainerPids(t *testing.T) { + pid := 1 + stat, err := system.Stat(pid) + if err != nil { + t.Fatalf("can't stat pid %d, got %v", pid, err) + } + container := &linuxContainer{ + id: "myid", + config: &configs.Config{}, + sysbox: sysbox.NewSysbox("myid", false, false), + cgroupManager: &mockCgroupManager{ + allPids: []int{1, 2, 3}, + paths: map[string]string{ + "device": "/proc/self/cgroups", + }, + }, + initProcess: &mockProcess{ + _pid: 1, + started: 10, + }, + initProcessStartTime: stat.StartTime, + } + container.state = &runningState{c: container} + pids, err := container.Processes() + if err != nil { + t.Fatal(err) + } + for i, expected := range []int{1, 2, 3} { + if pids[i] != expected { + t.Fatalf("expected pid %d but received %d", expected, pids[i]) + } + } +} + +func TestGetContainerStats(t *testing.T) { + container := &linuxContainer{ + id: "myid", + config: &configs.Config{}, + cgroupManager: &mockCgroupManager{ + pids: []int{1, 2, 3}, + stats: &cgroups.Stats{ + MemoryStats: cgroups.MemoryStats{ + Usage: cgroups.MemoryData{ + Usage: 1024, + }, + }, + }, + }, + intelRdtManager: &mockIntelRdtManager{ + stats: &intelrdt.Stats{ + L3CacheSchema: "L3:0=f;1=f0", + MemBwSchema: "MB:0=20;1=70", + }, + }, + sysbox: sysbox.NewSysbox("myid", false, false), + } + stats, err := container.Stats() + if err != nil { + t.Fatal(err) + } + if stats.CgroupStats == nil { + t.Fatal("cgroup stats are nil") + } + if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 { + t.Fatalf("expected memory usage 1024 but received %d", stats.CgroupStats.MemoryStats.Usage.Usage) + } + if intelrdt.IsCATEnabled() { + if stats.IntelRdtStats == nil { + t.Fatal("intel rdt stats are nil") + } + if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" { + t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but received 
%s", stats.IntelRdtStats.L3CacheSchema) + } + } + if intelrdt.IsMBAEnabled() { + if stats.IntelRdtStats == nil { + t.Fatal("intel rdt stats are nil") + } + if stats.IntelRdtStats.MemBwSchema != "MB:0=20;1=70" { + t.Fatalf("expected MemBwSchema MB:0=20;1=70 but received %s", stats.IntelRdtStats.MemBwSchema) + } + } +} + +func TestGetContainerState(t *testing.T) { + var ( + pid = os.Getpid() + expectedMemoryPath = "/sys/fs/cgroup/memory/myid" + expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid) + expectedIntelRdtPath = "/sys/fs/resctrl/myid" + ) + container := &linuxContainer{ + id: "myid", + config: &configs.Config{ + Namespaces: []configs.Namespace{ + {Type: configs.NEWPID}, + {Type: configs.NEWNS}, + {Type: configs.NEWNET, Path: expectedNetworkPath}, + {Type: configs.NEWUTS}, + // emulate host for IPC + //{Type: configs.NEWIPC}, + {Type: configs.NEWCGROUP}, + }, + }, + initProcess: &mockProcess{ + _pid: pid, + started: 10, + }, + cgroupManager: &mockCgroupManager{ + pids: []int{1, 2, 3}, + stats: &cgroups.Stats{ + MemoryStats: cgroups.MemoryStats{ + Usage: cgroups.MemoryData{ + Usage: 1024, + }, + }, + }, + paths: map[string]string{ + "memory": expectedMemoryPath, + }, + }, + intelRdtManager: &mockIntelRdtManager{ + stats: &intelrdt.Stats{ + L3CacheSchema: "L3:0=f0;1=f", + MemBwSchema: "MB:0=70;1=20", + }, + path: expectedIntelRdtPath, + }, + sysbox: sysbox.NewSysbox("myid", false, false), + } + container.state = &createdState{c: container} + state, err := container.State() + if err != nil { + t.Fatal(err) + } + if state.InitProcessPid != pid { + t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid) + } + if state.InitProcessStartTime != 10 { + t.Fatalf("expected process start time 10 but received %d", state.InitProcessStartTime) + } + paths := state.CgroupPaths + if paths == nil { + t.Fatal("cgroup paths should not be nil") + } + if memPath := paths["memory"]; memPath != expectedMemoryPath { + t.Fatalf("expected memory path %q but received 
%q", expectedMemoryPath, memPath) + } + if intelrdt.IsCATEnabled() || intelrdt.IsMBAEnabled() { + intelRdtPath := state.IntelRdtPath + if intelRdtPath == "" { + t.Fatal("intel rdt path should not be empty") + } + if intelRdtPath != expectedIntelRdtPath { + t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath) + } + } + for _, ns := range container.config.Namespaces { + path := state.NamespacePaths[ns.Type] + if path == "" { + t.Fatalf("expected non nil namespace path for %s", ns.Type) + } + if ns.Type == configs.NEWNET { + if path != expectedNetworkPath { + t.Fatalf("expected path %q but received %q", expectedNetworkPath, path) + } + } else { + file := "" + switch ns.Type { + case configs.NEWNET: + file = "net" + case configs.NEWNS: + file = "mnt" + case configs.NEWPID: + file = "pid" + case configs.NEWIPC: + file = "ipc" + case configs.NEWUSER: + file = "user" + case configs.NEWUTS: + file = "uts" + case configs.NEWCGROUP: + file = "cgroup" + } + expected := fmt.Sprintf("/proc/%d/ns/%s", pid, file) + if expected != path { + t.Fatalf("expected path %q but received %q", expected, path) + } + } + } +} + +func TestGetContainerStateAfterUpdate(t *testing.T) { + var ( + pid = os.Getpid() + ) + stat, err := system.Stat(pid) + if err != nil { + t.Fatal(err) + } + + rootDir, err := ioutil.TempDir("", "TestGetContainerStateAfterUpdate") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(rootDir) + + container := &linuxContainer{ + root: rootDir, + id: "myid", + config: &configs.Config{ + Namespaces: []configs.Namespace{ + {Type: configs.NEWPID}, + {Type: configs.NEWNS}, + {Type: configs.NEWNET}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + }, + Cgroups: &configs.Cgroup{ + Resources: &configs.Resources{ + Memory: 1024, + }, + }, + }, + initProcess: &mockProcess{ + _pid: pid, + started: stat.StartTime, + }, + cgroupManager: &mockCgroupManager{}, + sysbox: sysbox.NewSysbox("myid", false, false), + } + container.state = 
&createdState{c: container} + state, err := container.State() + if err != nil { + t.Fatal(err) + } + if state.InitProcessPid != pid { + t.Fatalf("expected pid %d but received %d", pid, state.InitProcessPid) + } + if state.InitProcessStartTime != stat.StartTime { + t.Fatalf("expected process start time %d but received %d", stat.StartTime, state.InitProcessStartTime) + } + if state.Config.Cgroups.Resources.Memory != 1024 { + t.Fatalf("expected Memory to be 1024 but received %q", state.Config.Cgroups.Memory) + } + + // Set initProcessStartTime so we fake to be running + container.initProcessStartTime = state.InitProcessStartTime + container.state = &runningState{c: container} + newConfig := container.Config() + newConfig.Cgroups.Resources.Memory = 2048 + if err := container.Set(newConfig); err != nil { + t.Fatal(err) + } + state, err = container.State() + if err != nil { + t.Fatal(err) + } + if state.Config.Cgroups.Resources.Memory != 2048 { + t.Fatalf("expected Memory to be 2048 but received %q", state.Config.Cgroups.Memory) + } +} diff --git a/sysbox-runc/libcontainer/criu_opts_linux.go b/sysbox-runc/libcontainer/criu_opts_linux.go new file mode 100644 index 00000000..11cbdb2d --- /dev/null +++ b/sysbox-runc/libcontainer/criu_opts_linux.go @@ -0,0 +1,32 @@ +package libcontainer + +import criu "github.com/checkpoint-restore/go-criu/v4/rpc" + +type CriuPageServerInfo struct { + Address string // IP address of CRIU page server + Port int32 // port number of CRIU page server +} + +type VethPairName struct { + ContainerInterfaceName string + HostInterfaceName string +} + +type CriuOpts struct { + ImagesDirectory string // directory for storing image files + WorkDirectory string // directory to cd and write logs/pidfiles/stats to + ParentImage string // directory for storing parent image files in pre-dump and dump + LeaveRunning bool // leave container in running state after checkpoint + TcpEstablished bool // checkpoint/restore established TCP connections + 
ExternalUnixConnections bool // allow external unix connections + ShellJob bool // allow to dump and restore shell jobs + FileLocks bool // handle file locks, for safety + PreDump bool // call criu predump to perform iterative checkpoint + PageServer CriuPageServerInfo // allow to dump to criu page server + VethPairs []VethPairName // pass the veth to criu when restore + ManageCgroupsMode criu.CriuCgMode // dump or restore cgroup mode + EmptyNs uint32 // don't c/r properties for namespace from this mask + AutoDedup bool // auto deduplication for incremental dumps + LazyPages bool // restore memory pages lazily using userfaultfd + StatusFd int // fd for feedback when lazy server is ready +} diff --git a/sysbox-runc/libcontainer/devices/device.go b/sysbox-runc/libcontainer/devices/device.go new file mode 100644 index 00000000..3eb73cc7 --- /dev/null +++ b/sysbox-runc/libcontainer/devices/device.go @@ -0,0 +1,170 @@ +package devices + +import ( + "fmt" + "os" + "strconv" +) + +const ( + Wildcard = -1 +) + +type Device struct { + Rule + + // Path to the device. + Path string `json:"path"` + + // FileMode permission bits for the device. + FileMode os.FileMode `json:"file_mode"` + + // Uid of the device. + Uid uint32 `json:"uid"` + + // Gid of the device. + Gid uint32 `json:"gid"` +} + +// Permissions is a cgroupv1-style string to represent device access. It +// has to be a string for backward compatibility reasons, hence why it has +// methods to do set operations. 
+type Permissions string + +const ( + deviceRead uint = (1 << iota) + deviceWrite + deviceMknod +) + +func (p Permissions) toSet() uint { + var set uint + for _, perm := range p { + switch perm { + case 'r': + set |= deviceRead + case 'w': + set |= deviceWrite + case 'm': + set |= deviceMknod + } + } + return set +} + +func fromSet(set uint) Permissions { + var perm string + if set&deviceRead == deviceRead { + perm += "r" + } + if set&deviceWrite == deviceWrite { + perm += "w" + } + if set&deviceMknod == deviceMknod { + perm += "m" + } + return Permissions(perm) +} + +// Union returns the union of the two sets of Permissions. +func (p Permissions) Union(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs | rhs) +} + +// Difference returns the set difference of the two sets of Permissions. +// In set notation, A.Difference(B) gives you A\B. +func (p Permissions) Difference(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs &^ rhs) +} + +// Intersection computes the intersection of the two sets of Permissions. +func (p Permissions) Intersection(o Permissions) Permissions { + lhs := p.toSet() + rhs := o.toSet() + return fromSet(lhs & rhs) +} + +// IsEmpty returns whether the set of permissions in a Permissions is +// empty. +func (p Permissions) IsEmpty() bool { + return p == Permissions("") +} + +// IsValid returns whether the set of permissions is a subset of valid +// permissions (namely, {r,w,m}). 
+func (p Permissions) IsValid() bool { + return p == fromSet(p.toSet()) +} + +type Type rune + +const ( + WildcardDevice Type = 'a' + BlockDevice Type = 'b' + CharDevice Type = 'c' // or 'u' + FifoDevice Type = 'p' +) + +func (t Type) IsValid() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanMknod() bool { + switch t { + case BlockDevice, CharDevice, FifoDevice: + return true + default: + return false + } +} + +func (t Type) CanCgroup() bool { + switch t { + case WildcardDevice, BlockDevice, CharDevice: + return true + default: + return false + } +} + +type Rule struct { + // Type of device ('c' for char, 'b' for block). If set to 'a', this rule + // acts as a wildcard and all fields other than Allow are ignored. + Type Type `json:"type"` + + // Major is the device's major number. + Major int64 `json:"major"` + + // Minor is the device's minor number. + Minor int64 `json:"minor"` + + // Permissions is the set of permissions that this rule applies to (in the + // cgroupv1 format -- any combination of "rwm"). + Permissions Permissions `json:"permissions"` + + // Allow specifies whether this rule is allowed. 
+ Allow bool `json:"allow"` +} + +func (d *Rule) CgroupString() string { + var ( + major = strconv.FormatInt(d.Major, 10) + minor = strconv.FormatInt(d.Minor, 10) + ) + if d.Major == Wildcard { + major = "*" + } + if d.Minor == Wildcard { + minor = "*" + } + return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) +} diff --git a/sysbox-runc/libcontainer/devices/device_unix.go b/sysbox-runc/libcontainer/devices/device_unix.go new file mode 100644 index 00000000..a400341e --- /dev/null +++ b/sysbox-runc/libcontainer/devices/device_unix.go @@ -0,0 +1,16 @@ +// +build !windows + +package devices + +import ( + "errors" + + "golang.org/x/sys/unix" +) + +func (d *Rule) Mkdev() (uint64, error) { + if d.Major == Wildcard || d.Minor == Wildcard { + return 0, errors.New("cannot mkdev() device with wildcards") + } + return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil +} diff --git a/sysbox-runc/libcontainer/devices/device_windows.go b/sysbox-runc/libcontainer/devices/device_windows.go new file mode 100644 index 00000000..8511bf00 --- /dev/null +++ b/sysbox-runc/libcontainer/devices/device_windows.go @@ -0,0 +1,5 @@ +package devices + +func (d *Rule) Mkdev() (uint64, error) { + return 0, nil +} diff --git a/sysbox-runc/libcontainer/devices/devices.go b/sysbox-runc/libcontainer/devices/devices.go new file mode 100644 index 00000000..5011f373 --- /dev/null +++ b/sysbox-runc/libcontainer/devices/devices.go @@ -0,0 +1,112 @@ +package devices + +import ( + "errors" + "io/ioutil" + "os" + "path/filepath" + + "golang.org/x/sys/unix" +) + +var ( + // ErrNotADevice denotes that a file is not a valid linux device. + ErrNotADevice = errors.New("not a device node") +) + +// Testing dependencies +var ( + unixLstat = unix.Lstat + ioutilReadDir = ioutil.ReadDir +) + +// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the +// information about a linux device and return that information as a Device struct. 
+func DeviceFromPath(path, permissions string) (*Device, error) { + var stat unix.Stat_t + err := unixLstat(path, &stat) + if err != nil { + return nil, err + } + + var ( + devType Type + mode = stat.Mode + devNumber = uint64(stat.Rdev) + major = unix.Major(devNumber) + minor = unix.Minor(devNumber) + ) + switch mode & unix.S_IFMT { + case unix.S_IFBLK: + devType = BlockDevice + case unix.S_IFCHR: + devType = CharDevice + case unix.S_IFIFO: + devType = FifoDevice + default: + return nil, ErrNotADevice + } + return &Device{ + Rule: Rule{ + Type: devType, + Major: int64(major), + Minor: int64(minor), + Permissions: Permissions(permissions), + }, + Path: path, + FileMode: os.FileMode(mode), + Uid: stat.Uid, + Gid: stat.Gid, + }, nil +} + +// HostDevices returns all devices that can be found under /dev directory. +func HostDevices() ([]*Device, error) { + return GetDevices("/dev") +} + +// GetDevices recursively traverses a directory specified by path +// and returns all devices found there. +func GetDevices(path string) ([]*Device, error) { + files, err := ioutilReadDir(path) + if err != nil { + return nil, err + } + var out []*Device + for _, f := range files { + switch { + case f.IsDir(): + switch f.Name() { + // ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825 + // ".udev" added to address https://github.com/opencontainers/runc/issues/2093 + case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev": + continue + default: + sub, err := GetDevices(filepath.Join(path, f.Name())) + if err != nil { + return nil, err + } + + out = append(out, sub...) 
+ continue + } + case f.Name() == "console": + continue + } + device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm") + if err != nil { + if err == ErrNotADevice { + continue + } + if os.IsNotExist(err) { + continue + } + return nil, err + } + if device.Type == FifoDevice { + continue + } + out = append(out, device) + } + return out, nil +} diff --git a/sysbox-runc/libcontainer/devices/devices_test.go b/sysbox-runc/libcontainer/devices/devices_test.go new file mode 100644 index 00000000..02ebb4b6 --- /dev/null +++ b/sysbox-runc/libcontainer/devices/devices_test.go @@ -0,0 +1,103 @@ +package devices + +import ( + "errors" + "io/ioutil" + "os" + "testing" + + "golang.org/x/sys/unix" +) + +func cleanupTest() { + unixLstat = unix.Lstat + ioutilReadDir = ioutil.ReadDir +} + +func TestDeviceFromPathLstatFailure(t *testing.T) { + testError := errors.New("test error") + + // Override unix.Lstat to inject error. + unixLstat = func(path string, stat *unix.Stat_t) error { + return testError + } + defer cleanupTest() + + _, err := DeviceFromPath("", "") + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesIoutilReadDirFailure(t *testing.T) { + testError := errors.New("test error") + + // Override ioutil.ReadDir to inject error. + ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { + return nil, testError + } + defer cleanupTest() + + _, err := HostDevices() + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesIoutilReadDirDeepFailure(t *testing.T) { + testError := errors.New("test error") + called := false + + // Override ioutil.ReadDir to inject error after the first call. + ioutilReadDir = func(dirname string) ([]os.FileInfo, error) { + if called { + return nil, testError + } + called = true + + // Provoke a second call. 
+ fi, err := os.Lstat("/tmp") + if err != nil { + t.Fatalf("Unexpected error %v", err) + } + + return []os.FileInfo{fi}, nil + } + defer cleanupTest() + + _, err := HostDevices() + if err != testError { + t.Fatalf("Unexpected error %v, expected %v", err, testError) + } +} + +func TestHostDevicesAllValid(t *testing.T) { + devices, err := HostDevices() + if err != nil { + t.Fatalf("failed to get host devices: %v", err) + } + + for _, device := range devices { + // Devices can't have major number 0. + if device.Major == 0 { + t.Errorf("device entry %+v has zero major number", device) + } + // Devices should only have file modes that correspond to their type. + var expectedType os.FileMode + switch device.Type { + case BlockDevice: + expectedType = unix.S_IFBLK + case CharDevice: + expectedType = unix.S_IFCHR + case FifoDevice: + t.Logf("fifo devices shouldn't show up from HostDevices") + fallthrough + default: + t.Errorf("device entry %+v has unexpected type %v", device, device.Type) + } + gotType := device.FileMode & unix.S_IFMT + if expectedType != gotType { + t.Errorf("device entry %+v has mismatched types (expected %#x, got %#x)", device, expectedType, gotType) + } + } +} diff --git a/sysbox-runc/libcontainer/error.go b/sysbox-runc/libcontainer/error.go new file mode 100644 index 00000000..21a3789b --- /dev/null +++ b/sysbox-runc/libcontainer/error.go @@ -0,0 +1,70 @@ +package libcontainer + +import "io" + +// ErrorCode is the API error code type. +type ErrorCode int + +// API error codes. 
+const ( + // Factory errors + IdInUse ErrorCode = iota + InvalidIdFormat + + // Container errors + ContainerNotExists + ContainerPaused + ContainerNotStopped + ContainerNotRunning + ContainerNotPaused + + // Process errors + NoProcessOps + + // Common errors + ConfigInvalid + ConsoleExists + SystemError +) + +func (c ErrorCode) String() string { + switch c { + case IdInUse: + return "Id already in use" + case InvalidIdFormat: + return "Invalid format" + case ContainerPaused: + return "Container paused" + case ConfigInvalid: + return "Invalid configuration" + case SystemError: + return "System error" + case ContainerNotExists: + return "Container does not exist" + case ContainerNotStopped: + return "Container is not stopped" + case ContainerNotRunning: + return "Container is not running" + case ConsoleExists: + return "Console exists for process" + case ContainerNotPaused: + return "Container is not paused" + case NoProcessOps: + return "No process operations" + default: + return "Unknown error" + } +} + +// Error is the API error type. +type Error interface { + error + + // Returns an error if it failed to write the detail of the Error to w. + // The detail of the Error may include the error message and a + // representation of the stack trace. + Detail(w io.Writer) error + + // Returns the error code for this error. 
+ Code() ErrorCode +} diff --git a/sysbox-runc/libcontainer/error_test.go b/sysbox-runc/libcontainer/error_test.go new file mode 100644 index 00000000..36841ad8 --- /dev/null +++ b/sysbox-runc/libcontainer/error_test.go @@ -0,0 +1,25 @@ +package libcontainer + +import "testing" + +func TestErrorCode(t *testing.T) { + codes := map[ErrorCode]string{ + IdInUse: "Id already in use", + InvalidIdFormat: "Invalid format", + ContainerPaused: "Container paused", + ConfigInvalid: "Invalid configuration", + SystemError: "System error", + ContainerNotExists: "Container does not exist", + ContainerNotStopped: "Container is not stopped", + ContainerNotRunning: "Container is not running", + ConsoleExists: "Console exists for process", + ContainerNotPaused: "Container is not paused", + NoProcessOps: "No process operations", + } + + for code, expected := range codes { + if actual := code.String(); actual != expected { + t.Fatalf("expected string %q but received %q", expected, actual) + } + } +} diff --git a/sysbox-runc/libcontainer/factory.go b/sysbox-runc/libcontainer/factory.go new file mode 100644 index 00000000..0986cd77 --- /dev/null +++ b/sysbox-runc/libcontainer/factory.go @@ -0,0 +1,44 @@ +package libcontainer + +import ( + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Factory interface { + // Creates a new container with the given id and starts the initial process inside it. + // id must be a string containing only letters, digits and underscores and must contain + // between 1 and 1024 characters, inclusive. + // + // The id must not already be in use by an existing container. Containers created using + // a factory with the same path (and filesystem) must have distinct ids. + // + // Returns the new container with a running process. 
+ // + // errors: + // IdInUse - id is already in use by a container + // InvalidIdFormat - id has incorrect format + // ConfigInvalid - config is invalid + // Systemerror - System error + // + // On error, any partially created container parts are cleaned up (the operation is atomic). + Create(id string, config *configs.Config) (Container, error) + + // Load takes an ID for an existing container and returns the container information + // from the state. This presents a read only view of the container. + // + // errors: + // Path does not exist + // System error + Load(id string) (Container, error) + + // StartInitialization is an internal API to libcontainer used during the reexec of the + // container. + // + // Errors: + // Pipe connection error + // System error + StartInitialization() error + + // Type returns info string about factory type (e.g. lxc, libcontainer...) + Type() string +} diff --git a/sysbox-runc/libcontainer/factory_linux.go b/sysbox-runc/libcontainer/factory_linux.go new file mode 100644 index 00000000..b04a319f --- /dev/null +++ b/sysbox-runc/libcontainer/factory_linux.go @@ -0,0 +1,472 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "regexp" + "runtime/debug" + "strconv" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/utils" + + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/pkg/errors" + + "golang.org/x/sys/unix" +) + +const ( + stateFilename = 
"state.json" + execFifoFilename = "exec.fifo" +) + +var idRegex = regexp.MustCompile(`^[\w+-\.]+$`) + +// InitArgs returns an options func to configure a LinuxFactory with the +// provided init binary path and arguments. +func InitArgs(args ...string) func(*LinuxFactory) error { + return func(l *LinuxFactory) (err error) { + if len(args) > 0 { + // Resolve relative paths to ensure that its available + // after directory changes. + if args[0], err = filepath.Abs(args[0]); err != nil { + return newGenericError(err, ConfigInvalid) + } + } + + l.InitArgs = args + return nil + } +} + +func getUnifiedPath(paths map[string]string) string { + path := "" + for k, v := range paths { + if path == "" { + path = v + } else if v != path { + panic(errors.Errorf("expected %q path to be unified path %q, got %q", k, path, v)) + } + } + // can be empty + if path != "" { + if filepath.Clean(path) != path || !filepath.IsAbs(path) { + panic(errors.Errorf("invalid dir path %q", path)) + } + } + + return path +} + +func systemdCgroupV2(l *LinuxFactory, rootless bool) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return systemd.NewUnifiedManager(config, getUnifiedPath(paths), rootless) + } + return nil +} + +// SystemdCgroups is an options func to configure a LinuxFactory to return +// containers that use systemd to create and manage cgroups. +func SystemdCgroups(l *LinuxFactory) error { + if !systemd.IsRunningSystemd() { + return fmt.Errorf("systemd not running on this host, can't use systemd as cgroups manager") + } + + if cgroups.IsCgroup2UnifiedMode() { + return systemdCgroupV2(l, false) + } + + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return systemd.NewLegacyManager(config, paths) + } + + return nil +} + +// RootlessSystemdCgroups is rootless version of SystemdCgroups. 
+func RootlessSystemdCgroups(l *LinuxFactory) error { + if !systemd.IsRunningSystemd() { + return fmt.Errorf("systemd not running on this host, can't use systemd as cgroups manager") + } + + if !cgroups.IsCgroup2UnifiedMode() { + return fmt.Errorf("cgroup v2 not enabled on this host, can't use systemd (rootless) as cgroups manager") + } + return systemdCgroupV2(l, true) +} + +func cgroupfs2(l *LinuxFactory, rootless bool) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + m, err := fs2.NewManager(config, getUnifiedPath(paths), rootless) + if err != nil { + panic(err) + } + return m + } + return nil +} + +func cgroupfs(l *LinuxFactory, rootless bool) error { + if cgroups.IsCgroup2UnifiedMode() { + return cgroupfs2(l, rootless) + } + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return fs.NewManager(config, paths, rootless) + } + return nil +} + +// Cgroupfs is an options func to configure a LinuxFactory to return containers +// that use the native cgroups filesystem implementation to create and manage +// cgroups. +func Cgroupfs(l *LinuxFactory) error { + return cgroupfs(l, false) +} + +// RootlessCgroupfs is an options func to configure a LinuxFactory to return +// containers that use the native cgroups filesystem implementation to create +// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is +// that RootlessCgroupfs can transparently handle permission errors that occur +// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if +// they've been set up properly). +func RootlessCgroupfs(l *LinuxFactory) error { + return cgroupfs(l, true) +} + +// IntelRdtfs is an options func to configure a LinuxFactory to return +// containers that use the Intel RDT "resource control" filesystem to +// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). 
+func IntelRdtFs(l *LinuxFactory) error { + if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() { + l.NewIntelRdtManager = nil + } else { + l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { + return intelrdt.NewManager(config, id, path) + } + } + return nil +} + +// TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. +func TmpfsRoot(l *LinuxFactory) error { + mounted, err := mountinfo.Mounted(l.Root) + if err != nil { + return err + } + if !mounted { + if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil { + return err + } + } + return nil +} + +// CriuPath returns an option func to configure a LinuxFactory with the +// provided criupath +func CriuPath(criupath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.CriuPath = criupath + return nil + } +} + +// Sysbox returns an option func to configure a LinuxFactory with the given +// Sysbox config. +func Sysbox(sysbox *sysbox.Sysbox) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.Sysbox = sysbox + return nil + } +} + +// New returns a linux based container factory based in the root directory and +// configures the factory with the provided option funcs. +func New(root string, options ...func(*LinuxFactory) error) (Factory, error) { + if root != "" { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, newGenericError(err, SystemError) + } + } + l := &LinuxFactory{ + Root: root, + InitPath: "/proc/self/exe", + InitArgs: []string{os.Args[0], "init"}, + Validator: validate.New(), + CriuPath: "criu", + } + Cgroupfs(l) + for _, opt := range options { + if opt == nil { + continue + } + if err := opt(l); err != nil { + return nil, err + } + } + + if l.Sysbox == nil { + l.Sysbox = sysbox.NewSysbox("", false, false) + } + + return l, nil +} + +// LinuxFactory implements the default factory interface for linux based systems. 
+type LinuxFactory struct { + // Root directory for the factory to store state. + Root string + + // InitPath is the path for calling the init responsibilities for spawning + // a container. + InitPath string + + // InitArgs are arguments for calling the init responsibilities for spawning + // a container. + InitArgs []string + + // CriuPath is the path to the criu binary used for checkpoint and restore of + // containers. + CriuPath string + + // New{u,g}uidmapPath is the path to the binaries used for mapping with + // rootless containers. + NewuidmapPath string + NewgidmapPath string + + // Validator provides validation to container configurations. + Validator validate.Validator + + // NewCgroupsManager returns an initialized cgroups manager for a single container. + NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager + + // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. + NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager + + // Sysbox config + Sysbox *sysbox.Sysbox +} + +func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + if err := l.validateID(id); err != nil { + return nil, err + } + if err := l.Validator.Validate(config); err != nil { + return nil, newGenericError(err, ConfigInvalid) + } + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } + if _, err := os.Stat(containerRoot); err == nil { + return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse) + } else if !os.IsNotExist(err) { + return nil, newGenericError(err, SystemError) + } + if err := os.MkdirAll(containerRoot, 0711); err != nil { + return nil, newGenericError(err, SystemError) + } + if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil { + return nil, 
newGenericError(err, SystemError) + } + c := &linuxContainer{ + id: id, + root: containerRoot, + config: config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, + cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), + sysbox: l.Sysbox, + } + if l.NewIntelRdtManager != nil { + c.intelRdtManager = l.NewIntelRdtManager(config, id, "") + } + c.state = &stoppedState{c: c} + return c, nil +} + +func (l *LinuxFactory) Load(id string) (Container, error) { + if l.Root == "" { + return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid) + } + //when load, we need to check id is valid or not. + if err := l.validateID(id); err != nil { + return nil, err + } + containerRoot, err := securejoin.SecureJoin(l.Root, id) + if err != nil { + return nil, err + } + state, err := l.loadState(containerRoot, id) + if err != nil { + return nil, err + } + r := &nonChildProcess{ + processPid: state.InitProcessPid, + processStartTime: state.InitProcessStartTime, + fds: state.ExternalDescriptors, + } + c := &linuxContainer{ + initProcess: r, + initProcessStartTime: state.InitProcessStartTime, + id: id, + config: &state.Config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + root: containerRoot, + created: state.Created, + sysbox: &state.Sysbox, + } + + c.sysbox.Mgr = &state.SysMgr + c.sysbox.Fs = &state.SysFs + + if l.NewIntelRdtManager != nil { + c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) + } + if l.NewIntelRdtManager != nil { + c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) + } + c.state = &loadedState{c: c} + if err := c.refreshState(); err != nil { + return nil, err + } + return c, nil +} + +func (l *LinuxFactory) Type() string { + return 
"libcontainer" +} + +// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state +// This is a low level implementation detail of the reexec and should not be consumed externally +func (l *LinuxFactory) StartInitialization() (err error) { + // Get the INITPIPE. + envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE") + pipefd, err := strconv.Atoi(envInitPipe) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err) + } + pipe := os.NewFile(uintptr(pipefd), "pipe") + defer pipe.Close() + + // Only init processes have FIFOFD. + fifofd := -1 + envInitType := os.Getenv("_LIBCONTAINER_INITTYPE") + it := initType(envInitType) + if it == initStandard { + envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD") + if fifofd, err = strconv.Atoi(envFifoFd); err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err) + } + } + + var consoleSocket *os.File + if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" { + console, err := strconv.Atoi(envConsole) + if err != nil { + return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err) + } + consoleSocket = os.NewFile(uintptr(console), "console-socket") + defer consoleSocket.Close() + } + + // clear the current process's environment to clean any libcontainer + // specific env vars. + os.Clearenv() + + defer func() { + // We have an error during the initialization of the container's init, + // send it back to the parent process in the form of an initError. 
+ if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil { + fmt.Fprintln(os.Stderr, err) + return + } + }() + defer func() { + if e := recover(); e != nil { + err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack())) + } + }() + + i, err := newContainerInit(it, pipe, consoleSocket, fifofd) + if err != nil { + return err + } + + // If Init succeeds, syscall.Exec will not return, hence none of the defers will be called. + return i.Init() +} + +func (l *LinuxFactory) loadState(root, id string) (*State, error) { + stateFilePath, err := securejoin.SecureJoin(root, stateFilename) + if err != nil { + return nil, err + } + f, err := os.Open(stateFilePath) + if err != nil { + if os.IsNotExist(err) { + return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists) + } + return nil, newGenericError(err, SystemError) + } + defer f.Close() + var state *State + if err := json.NewDecoder(f).Decode(&state); err != nil { + return nil, newGenericError(err, SystemError) + } + return state, nil +} + +func (l *LinuxFactory) validateID(id string) error { + if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) { + return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat) + } + + return nil +} + +// NewuidmapPath returns an option func to configure a LinuxFactory with the +// provided .. +func NewuidmapPath(newuidmapPath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.NewuidmapPath = newuidmapPath + return nil + } +} + +// NewgidmapPath returns an option func to configure a LinuxFactory with the +// provided .. 
+func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error { + return func(l *LinuxFactory) error { + l.NewgidmapPath = newgidmapPath + return nil + } +} diff --git a/sysbox-runc/libcontainer/factory_linux_test.go b/sysbox-runc/libcontainer/factory_linux_test.go new file mode 100644 index 00000000..2b40e24d --- /dev/null +++ b/sysbox-runc/libcontainer/factory_linux_test.go @@ -0,0 +1,230 @@ +// +build linux + +package libcontainer + +import ( + "io/ioutil" + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +func newTestRoot() (string, error) { + dir, err := ioutil.TempDir("", "libcontainer") + if err != nil { + return "", err + } + return dir, nil +} + +func TestFactoryNew(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } +} + +func TestFactoryNewIntelRdt(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs, IntelRdtFs) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + 
t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } +} + +func TestFactoryNewTmpfs(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs, TmpfsRoot) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } + mounted, err := mountinfo.Mounted(lfactory.Root) + if err != nil { + t.Fatal(err) + } + if !mounted { + t.Fatalf("Factory Root is not mounted") + } + mounts, err := mountinfo.GetMounts(mountinfo.SingleEntryFilter(lfactory.Root)) + if err != nil { + t.Fatal(err) + } + if len(mounts) != 1 { + t.Fatalf("Factory Root is not listed in mounts list") + } + m := mounts[0] + if m.FSType != "tmpfs" { + t.Fatalf("FSType of root: %s, expected %s", m.FSType, "tmpfs") + } + if m.Source != "tmpfs" { + t.Fatalf("Source of root: %s, expected %s", m.Source, "tmpfs") + } + unix.Unmount(root, unix.MNT_DETACH) +} + +func TestFactoryLoadNotExists(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs) + if err != nil { + t.Fatal(err) + } + _, err = factory.Load("nocontainer") + if err == nil { + t.Fatal("expected nil error loading non-existing container") + } + lerr, ok := err.(Error) + if !ok { + t.Fatal("expected libcontainer error type") + } + if lerr.Code() != ContainerNotExists { + t.Fatalf("expected error code %s but 
received %s", ContainerNotExists, lerr.Code()) + } +} + +func TestFactoryLoadContainer(t *testing.T) { + root, err := newTestRoot() + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(root) + // setup default container config and state for mocking + var ( + id = "1" + expectedHooks = configs.Hooks{ + configs.Prestart: configs.HookList{ + configs.CommandHook{Command: configs.Command{Path: "prestart-hook"}}, + }, + configs.Poststart: configs.HookList{ + configs.CommandHook{Command: configs.Command{Path: "poststart-hook"}}, + }, + configs.Poststop: configs.HookList{ + unserializableHook{}, + configs.CommandHook{Command: configs.Command{Path: "poststop-hook"}}, + }, + } + expectedConfig = &configs.Config{ + Rootfs: "/mycontainer/root", + Hooks: expectedHooks, + } + expectedState = &State{ + BaseState: BaseState{ + InitProcessPid: 1024, + Config: *expectedConfig, + }, + } + ) + if err := os.Mkdir(filepath.Join(root, id), 0700); err != nil { + t.Fatal(err) + } + if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil { + t.Fatal(err) + } + factory, err := New(root, Cgroupfs, IntelRdtFs) + if err != nil { + t.Fatal(err) + } + container, err := factory.Load(id) + if err != nil { + t.Fatal(err) + } + if container.ID() != id { + t.Fatalf("expected container id %q but received %q", id, container.ID()) + } + config := container.Config() + if config.Rootfs != expectedConfig.Rootfs { + t.Fatalf("expected rootfs %q but received %q", expectedConfig.Rootfs, config.Rootfs) + } + expectedHooks[configs.Poststop] = expectedHooks[configs.Poststop][1:] // expect unserializable hook to be skipped + if !reflect.DeepEqual(config.Hooks, expectedHooks) { + t.Fatalf("expects hooks %q but received %q", expectedHooks, config.Hooks) + } + lcontainer, ok := container.(*linuxContainer) + if !ok { + t.Fatal("expected linux container on linux based systems") + } + if lcontainer.initProcess.pid() != expectedState.InitProcessPid { + t.Fatalf("expected init pid %d but 
received %d", expectedState.InitProcessPid, lcontainer.initProcess.pid()) + } +} + +func marshal(path string, v interface{}) error { + f, err := os.Create(path) + if err != nil { + return err + } + defer f.Close() + return utils.WriteJSON(f, v) +} + +type unserializableHook struct{} + +func (unserializableHook) Run(*specs.State) error { + return nil +} diff --git a/sysbox-runc/libcontainer/generic_error.go b/sysbox-runc/libcontainer/generic_error.go new file mode 100644 index 00000000..d185ebd8 --- /dev/null +++ b/sysbox-runc/libcontainer/generic_error.go @@ -0,0 +1,92 @@ +package libcontainer + +import ( + "fmt" + "io" + "text/template" + "time" + + "github.com/opencontainers/runc/libcontainer/stacktrace" +) + +var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} +Code: {{.ECode}} +{{if .Message }} +Message: {{.Message}} +{{end}} +Frames:{{range $i, $frame := .Stack.Frames}} +--- +{{$i}}: {{$frame.Function}} +Package: {{$frame.Package}} +File: {{$frame.File}}@{{$frame.Line}}{{end}} +`)) + +func newGenericError(err error, c ErrorCode) Error { + if le, ok := err.(Error); ok { + return le + } + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: c, + Stack: stacktrace.Capture(1), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +func newSystemError(err error) Error { + return createSystemError(err, "") +} + +func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error { + return createSystemError(err, fmt.Sprintf(cause, v...)) +} + +func newSystemErrorWithCause(err error, cause string) Error { + return createSystemError(err, cause) +} + +// createSystemError creates the specified error with the correct number of +// stack frames skipped. This is only to be called by the other functions for +// formatting the error. 
+func createSystemError(err error, cause string) Error { + gerr := &genericError{ + Timestamp: time.Now(), + Err: err, + ECode: SystemError, + Cause: cause, + Stack: stacktrace.Capture(2), + } + if err != nil { + gerr.Message = err.Error() + } + return gerr +} + +type genericError struct { + Timestamp time.Time + ECode ErrorCode + Err error `json:"-"` + Cause string + Message string + Stack stacktrace.Stacktrace +} + +func (e *genericError) Error() string { + if e.Cause == "" { + return e.Message + } + frame := e.Stack.Frames[0] + return fmt.Sprintf("%s:%d: %s caused: %s", frame.File, frame.Line, e.Cause, e.Message) +} + +func (e *genericError) Code() ErrorCode { + return e.ECode +} + +func (e *genericError) Detail(w io.Writer) error { + return errorTemplate.Execute(w, e) +} diff --git a/sysbox-runc/libcontainer/generic_error_test.go b/sysbox-runc/libcontainer/generic_error_test.go new file mode 100644 index 00000000..8fbdd4d3 --- /dev/null +++ b/sysbox-runc/libcontainer/generic_error_test.go @@ -0,0 +1,49 @@ +package libcontainer + +import ( + "fmt" + "io/ioutil" + "testing" +) + +func TestErrorDetail(t *testing.T) { + err := newGenericError(fmt.Errorf("test error"), SystemError) + if derr := err.Detail(ioutil.Discard); derr != nil { + t.Fatal(derr) + } +} + +func TestErrorWithCode(t *testing.T) { + err := newGenericError(fmt.Errorf("test error"), SystemError) + if code := err.Code(); code != SystemError { + t.Fatalf("expected err code %q but %q", SystemError, code) + } +} + +func TestErrorWithError(t *testing.T) { + cc := []struct { + errmsg string + cause string + }{ + { + errmsg: "test error", + }, + { + errmsg: "test error", + cause: "test", + }, + } + + for _, v := range cc { + err := newSystemErrorWithCause(fmt.Errorf(v.errmsg), v.cause) + + msg := err.Error() + if v.cause == "" && msg != v.errmsg { + t.Fatalf("expected err(%q) equal errmsg(%q)", msg, v.errmsg) + } + if v.cause != "" && msg == v.errmsg { + t.Fatalf("unexpected err(%q) equal errmsg(%q)", msg, 
v.errmsg) + } + + } +} diff --git a/sysbox-runc/libcontainer/init_linux.go b/sysbox-runc/libcontainer/init_linux.go new file mode 100644 index 00000000..18671041 --- /dev/null +++ b/sysbox-runc/libcontainer/init_linux.go @@ -0,0 +1,672 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "path/filepath" + "strings" + "syscall" + "unsafe" + + "golang.org/x/sys/unix" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "github.com/vishvananda/netlink" +) + +type initType string + +const ( + initSetns initType = "setns" + initStandard initType = "standard" + initMount initType = "mount" +) + +type pid struct { + Pid int `json:"pid"` + PidFirstChild int `json:"pid_first"` +} + +// network is an internal struct used to setup container networks. +type network struct { + configs.Network + + // TempVethPeerName is a unique temporary veth peer name that was placed into + // the container's namespace. 
+ TempVethPeerName string `json:"temp_veth_peer_name"` +} + +// initConfig is used for transferring parameters from Exec() to Init() +type initConfig struct { + Args []string `json:"args"` + Env []string `json:"env"` + Cwd string `json:"cwd"` + Capabilities *configs.Capabilities `json:"capabilities"` + ProcessLabel string `json:"process_label"` + AppArmorProfile string `json:"apparmor_profile"` + NoNewPrivileges bool `json:"no_new_privileges"` + User string `json:"user"` + AdditionalGroups []string `json:"additional_groups"` + Config *configs.Config `json:"config"` + Networks []*network `json:"network"` + PassedFilesCount int `json:"passed_files_count"` + ContainerId string `json:"containerid"` + Rlimits []configs.Rlimit `json:"rlimits"` + CreateConsole bool `json:"create_console"` + ConsoleWidth uint16 `json:"console_width"` + ConsoleHeight uint16 `json:"console_height"` + RootlessEUID bool `json:"rootless_euid,omitempty"` + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` + SpecState *specs.State `json:"spec_state,omitempty"` +} + +type initer interface { + Init() error +} + +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) { + if t == initStandard || t == initSetns { + var config *initConfig + if err := json.NewDecoder(pipe).Decode(&config); err != nil { + return nil, err + } + if err := populateProcessEnvironment(config.Env); err != nil { + return nil, err + } + switch t { + case initSetns: + return &linuxSetnsInit{ + pipe: pipe, + consoleSocket: consoleSocket, + config: config, + }, nil + case initStandard: + return &linuxStandardInit{ + pipe: pipe, + consoleSocket: consoleSocket, + parentPid: unix.Getppid(), + config: config, + fifoFd: fifoFd, + }, nil + } + } else if t == initMount { + var reqs []opReq + if err := json.NewDecoder(pipe).Decode(&reqs); err != nil { + return nil, err + } + return &linuxRootfsInit{ + pipe: pipe, + reqs: reqs, + }, nil + } + + return nil, fmt.Errorf("unknown init type 
%q", t) +} + +// populateProcessEnvironment loads the provided environment variables into the +// current processes's environment. +func populateProcessEnvironment(env []string) error { + for _, pair := range env { + p := strings.SplitN(pair, "=", 2) + if len(p) < 2 { + return fmt.Errorf("invalid environment '%v'", pair) + } + if err := os.Setenv(p[0], p[1]); err != nil { + return err + } + } + return nil +} + +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %w", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. 
+ return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + +// finalizeNamespace drops the caps, sets the correct user +// and working dir, and closes any leaked file descriptors +// before executing the command inside the namespace +func finalizeNamespace(config *initConfig) error { + // Ensure that all unwanted fds we may have accidentally + // inherited are marked close-on-exec so they stay out of the + // container + // + // XXX: CloseExecFrom relies on the presence procfs being mounted inside the sys container. + // This means a setns entry into the sys container (e.g., via docker exec) would fail if + // /proc is not mounted in the container. + if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil { + return errors.Wrap(err, "close exec fds") + } + + capabilities := &configs.Capabilities{} + if config.Capabilities != nil { + capabilities = config.Capabilities + } else if config.Config.Capabilities != nil { + capabilities = config.Config.Capabilities + } + w, err := newContainerCapList(capabilities) + if err != nil { + return err + } + // drop capabilities in bounding set before changing user + if err := w.ApplyBoundingSet(); err != nil { + return errors.Wrap(err, "apply bounding set") + } + // preserve existing capabilities while we change users + if err := system.SetKeepCaps(); err != nil { + return errors.Wrap(err, "set keep caps") + } + if err := setupUser(config); err != nil { + return errors.Wrap(err, "setup user") + } + // Change working directory AFTER the user has been set up. + // Otherwise, if the cwd is also a volume that's been chowned to the container user (and not the user running runc), + // this command will EPERM. 
+ if config.Cwd != "" { + if err := unix.Chdir(config.Cwd); err != nil { + return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err) + } + } + // Make sure our final working directory is inside the container. + if err := verifyCwd(); err != nil { + return err + } + if err := system.ClearKeepCaps(); err != nil { + return errors.Wrap(err, "clear keep caps") + } + if err := w.ApplyCaps(); err != nil { + return errors.Wrap(err, "apply caps") + } + return nil +} + +// setupConsole sets up the console from inside the container, and sends the +// master pty fd to the config.Pipe (using cmsg). This is done to ensure that +// consoles are scoped to a container properly (see runc#814 and the many +// issues related to that). This has to be run *after* we've pivoted to the new +// rootfs (and the users' configuration is entirely set up). +func setupConsole(socket *os.File, config *initConfig, mount bool) error { + defer socket.Close() + // At this point, /dev/ptmx points to something that we would expect. We + // used to change the owner of the slave path, but since the /dev/pts mount + // can have gid=X set (at the users' option). So touching the owner of the + // slave PTY is not necessary, as the kernel will handle that for us. Note + // however, that setupUser (specifically fixStdioPermissions) *will* change + // the UID owner of the console to be the user the process will run as (so + // they can actually control their console). + + pty, slavePath, err := console.NewPty() + if err != nil { + return err + } + + // After we return from here, we don't need the console anymore. + defer pty.Close() + + if config.ConsoleHeight != 0 && config.ConsoleWidth != 0 { + err = pty.Resize(console.WinSize{ + Height: config.ConsoleHeight, + Width: config.ConsoleWidth, + }) + + if err != nil { + return err + } + } + + // Mount the console inside our rootfs. 
+ if mount { + if err := mountConsole(slavePath); err != nil { + return err + } + } + // While we can access console.master, using the API is a good idea. + if err := utils.SendFd(socket, pty.Name(), pty.Fd()); err != nil { + return err + } + // Now, dup over all the things. + return dupStdio(slavePath) +} + +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := writeSync(pipe, procReady); err != nil { + return err + } + + // Wait for parent to give the all-clear. + return readSync(pipe, procRun) +} + +// syncParentHooks sends to the given pipe a JSON payload which indicates that +// the parent should execute pre-start hooks. It then waits for the parent to +// indicate that it is cleared to resume. +func syncParentHooks(pipe io.ReadWriter) error { + // Tell parent. + if err := writeSync(pipe, procHooks); err != nil { + return err + } + + // Wait for parent to give the all-clear. + return readSync(pipe, procResume) +} + +// sysbox-runc: +// syncParentDoOp signals the parent runc to perform an operation on behalf of the +// sys container's init process; this is useful in cases where the container's init process +// can't do the operation because it may not have appropriate permissions. +// See sync.go for the sync sequence. +func syncParentDoOp(reqs []opReq, pipe io.ReadWriter) error { + if err := writeSync(pipe, reqOp); err != nil { + return err + } + if err := readSync(pipe, sendOpInfo); err != nil { + return err + } + if err := utils.WriteJSON(pipe, reqs); err != nil { + return err + } + if err := readSync(pipe, opDone); err != nil { + return err + } + return nil +} + +// sysbox-runc: +// syncParentSeccompFd sends a seccomp notification file-descriptor to the parent runc. 
+func syncParentSeccompFd(fi *os.File, pipe *os.File) error { + if err := writeSync(pipe, procFd); err != nil { + return err + } + if err := readSync(pipe, sendFd); err != nil { + return err + } + + // send fd using cmsg(3) + socket := int(pipe.Fd()) + scmRights := syscall.UnixRights(int(fi.Fd())) + if err := syscall.Sendmsg(socket, nil, scmRights, nil, 0); err != nil { + return err + } + + if err := readSync(pipe, procFdDone); err != nil { + return err + } + + return nil +} + +// sysbox-runc: +// syncParentRootfsReady sends a notification to parent to indicate that the +// container's rootfs is fully initialized, and that it's time for the parent +// to register the container with sysbox-fs component. +func syncParentRootfsReady(pipe *os.File) error { + if err := writeSync(pipe, rootfsReady); err != nil { + return err + } + + // Wait for parent to acknowledge rootfsReady msg. + return readSync(pipe, rootfsReadyAck) +} + +// setupUser changes the groups, gid, and uid for the user inside the container +func setupUser(config *initConfig) error { + // Set up defaults. + defaultExecUser := user.ExecUser{ + Uid: 0, + Gid: 0, + Home: "/", + } + + passwdPath, err := user.GetPasswdPath() + if err != nil { + return err + } + + groupPath, err := user.GetGroupPath() + if err != nil { + return err + } + + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) + if err != nil { + return err + } + + var addGroups []int + if len(config.AdditionalGroups) > 0 { + addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath) + if err != nil { + return err + } + } + + // Rather than just erroring out later in setuid(2) and setgid(2), check + // that the user is mapped here. 
+ if _, err := config.Config.HostUID(execUser.Uid); err != nil { + return errors.New("cannot set uid to unmapped user in user namespace") + } + if _, err := config.Config.HostGID(execUser.Gid); err != nil { + return errors.New("cannot set gid to unmapped user in user namespace") + } + + if config.RootlessEUID { + // We cannot set any additional groups in a rootless container and thus + // we bail if the user asked us to do so. TODO: We currently can't do + // this check earlier, but if libcontainer.Process.User was typesafe + // this might work. + if len(addGroups) > 0 { + return errors.New("cannot set any additional groups in a rootless container") + } + } + + // Before we change to the container's user make sure that the processes + // STDIO is correctly owned by the user that we are switching to. + if err := fixStdioPermissions(config, execUser); err != nil { + return err + } + + setgroups, err := ioutil.ReadFile("/proc/self/setgroups") + if err != nil && !os.IsNotExist(err) { + return err + } + + // This isn't allowed in an unprivileged user namespace since Linux 3.19. + // There's nothing we can do about /etc/group entries, so we silently + // ignore setting groups here (since the user didn't explicitly ask us to + // set the group). + allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny" + + if allowSupGroups { + suppGroups := append(execUser.Sgids, addGroups...) + if err := unix.Setgroups(suppGroups); err != nil { + return err + } + } + + if err := system.Setgid(execUser.Gid); err != nil { + return err + } + if err := system.Setuid(execUser.Uid); err != nil { + return err + } + + // if we didn't get HOME already, set it based on the user's HOME + if envHome := os.Getenv("HOME"); envHome == "" { + if err := os.Setenv("HOME", execUser.Home); err != nil { + return err + } + } + return nil +} + +// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. 
+// The ownership needs to match because it is created outside of the container and needs to be +// localized. +func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { + var null unix.Stat_t + if err := unix.Stat("/dev/null", &null); err != nil { + return err + } + for _, fd := range []uintptr{ + os.Stdin.Fd(), + os.Stderr.Fd(), + os.Stdout.Fd(), + } { + var s unix.Stat_t + if err := unix.Fstat(int(fd), &s); err != nil { + return err + } + + // Skip chown of /dev/null if it was used as one of the STDIO fds. + if s.Rdev == null.Rdev { + continue + } + + // We only change the uid owner (as it is possible for the mount to + // prefer a different gid, and there's no reason for us to change it). + // The reason why we don't just leave the default uid=X mount setup is + // that users expect to be able to actually use their console. Without + // this code, you couldn't effectively run as a non-root user inside a + // container and also have a console set up. + if err := unix.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil { + // If we've hit an EINVAL then s.Gid isn't mapped in the user + // namespace. If we've hit an EPERM then the inode's current owner + // is not mapped in our user namespace (in particular, + // privileged_wrt_inode_uidgid() has failed). In either case, we + // are in a configuration where it's better for us to just not + // touch the stdio rather than bail at this point. + if err == unix.EINVAL || err == unix.EPERM { + continue + } + return err + } + } + return nil +} + +// setupNetwork sets up and initializes any network interface inside the container. 
+func setupNetwork(config *initConfig) error { + for _, config := range config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.initialize(config); err != nil { + return err + } + } + return nil +} + +func setupRoute(config *configs.Config) error { + for _, config := range config.Routes { + _, dst, err := net.ParseCIDR(config.Destination) + if err != nil { + return err + } + src := net.ParseIP(config.Source) + if src == nil { + return fmt.Errorf("Invalid source for route: %s", config.Source) + } + gw := net.ParseIP(config.Gateway) + if gw == nil { + return fmt.Errorf("Invalid gateway for route: %s", config.Gateway) + } + l, err := netlink.LinkByName(config.InterfaceName) + if err != nil { + return err + } + route := &netlink.Route{ + Scope: netlink.SCOPE_UNIVERSE, + Dst: dst, + Src: src, + Gw: gw, + LinkIndex: l.Attrs().Index, + } + if err := netlink.RouteAdd(route); err != nil { + return err + } + } + return nil +} + +func setupRlimits(limits []configs.Rlimit, pid int) error { + for _, rlimit := range limits { + if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil { + return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err) + } + } + return nil +} + +const _P_PID = 1 + +//nolint:structcheck,unused +type siginfo struct { + si_signo int32 + si_errno int32 + si_code int32 + // below here is a union; si_pid is the only field we use + si_pid int32 + // Pad to 128 bytes as detailed in blockUntilWaitable + pad [96]byte +} + +// isWaitable returns true if the process has exited false otherwise. 
+// Its based off blockUntilWaitable in src/os/wait_waitid.go +func isWaitable(pid int) (bool, error) { + si := &siginfo{} + _, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0) + if e != 0 { + return false, os.NewSyscallError("waitid", e) + } + + return si.si_pid != 0, nil +} + +// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise +func isNoChildren(err error) bool { + switch err := err.(type) { + case unix.Errno: + if err == unix.ECHILD { + return true + } + case *os.SyscallError: + if err.Err == unix.ECHILD { + return true + } + } + return false +} + +// signalAllProcesses freezes then iterates over all the processes inside the +// manager's cgroups sending the signal s to them. +// If s is SIGKILL then it will wait for each process to exit. +// For all other signals it will check if the process is ready to report its +// exit status and only if it is will a wait be performed. +func signalAllProcesses(m cgroups.Manager, s os.Signal) error { + var procs []*os.Process + if err := m.Freeze(configs.Frozen); err != nil { + logrus.Warn(err) + } + pids, err := m.GetAllPids() + if err != nil { + if err := m.Freeze(configs.Thawed); err != nil { + logrus.Warn(err) + } + return err + } + for _, pid := range pids { + p, err := os.FindProcess(pid) + if err != nil { + logrus.Warn(err) + continue + } + procs = append(procs, p) + if err := p.Signal(s); err != nil { + logrus.Warn(err) + } + } + if err := m.Freeze(configs.Thawed); err != nil { + logrus.Warn(err) + } + + subreaper, err := system.GetSubreaper() + if err != nil { + // The error here means that PR_GET_CHILD_SUBREAPER is not + // supported because this code might run on a kernel older + // than 3.4. We don't want to throw an error in that case, + // and we simplify things, considering there is no subreaper + // set. 
+ subreaper = 0 + } + + for _, p := range procs { + if s != unix.SIGKILL { + if ok, err := isWaitable(p.Pid); err != nil { + if !isNoChildren(err) { + logrus.Warn("signalAllProcesses: ", p.Pid, err) + } + continue + } else if !ok { + // Not ready to report so don't wait + continue + } + } + + // In case a subreaper has been setup, this code must not + // wait for the process. Otherwise, we cannot be sure the + // current process will be reaped by the subreaper, while + // the subreaper might be waiting for this process in order + // to retrieve its exit code. + if subreaper == 0 { + if _, err := p.Wait(); err != nil { + if !isNoChildren(err) { + logrus.Warn("wait: ", err) + } + } + } + } + return nil +} + +// setupSyscallTraps sets up syscall trapping for the calling process, using seccomp. +func setupSyscallTraps(config *initConfig, pipe *os.File) error { + + // Load the seccomp notification filter here (for syscall trapping inside the container) + if config.Config.SeccompNotif != nil && len(config.Config.SeccompNotif.Syscalls) > 0 { + + fi, err := seccomp.InitSeccomp(config.Config.SeccompNotif) + if err != nil { + return newSystemErrorWithCause(err, "loading seccomp notification rules") + } + + if err := syncParentSeccompFd(fi, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to pass seccomp file-descriptor") + } + } + + return nil +} diff --git a/sysbox-runc/libcontainer/integration/checkpoint_test.go b/sysbox-runc/libcontainer/integration/checkpoint_test.go new file mode 100644 index 00000000..7cf85501 --- /dev/null +++ b/sysbox-runc/libcontainer/integration/checkpoint_test.go @@ -0,0 +1,255 @@ +package integration + +import ( + "bufio" + "bytes" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" +) + +func showFile(t *testing.T, fname string) error { + t.Logf("=== %s ===\n", fname) + + f, err := os.Open(fname) + if err != nil { + t.Log(err) + return err 
+ } + defer f.Close() + + scanner := bufio.NewScanner(f) + for scanner.Scan() { + t.Log(scanner.Text()) + } + + if err := scanner.Err(); err != nil { + return err + } + + t.Logf("=== END ===\n") + + return nil +} + +func TestUsernsCheckpoint(t *testing.T) { + + t.Skip("UNSUPPORTED") + + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + cmd := exec.Command("criu", "check", "--feature", "userns") + if err := cmd.Run(); err != nil { + t.Skip("Unable to c/r a container with userns") + } + testCheckpoint(t, true) +} + +func TestCheckpoint(t *testing.T) { + + t.Skip("UNSUPPORTED") + + testCheckpoint(t, false) +} + +func testCheckpoint(t *testing.T, userns bool) { + if testing.Short() { + return + } + + if _, err := exec.LookPath("criu"); err != nil { + t.Skipf("criu binary not found: %v", err) + } + + root, err := newTestRoot() + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(root) + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: userns, + }) + factory, err := libcontainer.New(root, libcontainer.Cgroupfs) + + if err != nil { + t.Fatal(err) + } + + container, err := factory.Create("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + + var stdout bytes.Buffer + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Stdout: &stdout, + Init: true, + } + + err = container.Run(&pconfig) + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + + pid, err := pconfig.Pid() + if err != nil { + t.Fatal(err) + } + + process, err := os.FindProcess(pid) + if err != nil { + t.Fatal(err) + } + + parentDir, err := ioutil.TempDir("", "criu-parent") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(parentDir) + + 
preDumpOpts := &libcontainer.CriuOpts{ + ImagesDirectory: parentDir, + WorkDirectory: parentDir, + PreDump: true, + } + preDumpLog := filepath.Join(preDumpOpts.WorkDirectory, "dump.log") + + if err := container.Checkpoint(preDumpOpts); err != nil { + showFile(t, preDumpLog) + t.Fatal(err) + } + + state, err := container.Status() + if err != nil { + t.Fatal(err) + } + + if state != libcontainer.Running { + t.Fatal("Unexpected preDump state: ", state) + } + + imagesDir, err := ioutil.TempDir("", "criu") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(imagesDir) + + checkpointOpts := &libcontainer.CriuOpts{ + ImagesDirectory: imagesDir, + WorkDirectory: imagesDir, + ParentImage: "../criu-parent", + } + dumpLog := filepath.Join(checkpointOpts.WorkDirectory, "dump.log") + restoreLog := filepath.Join(checkpointOpts.WorkDirectory, "restore.log") + + if err := container.Checkpoint(checkpointOpts); err != nil { + showFile(t, dumpLog) + t.Fatal(err) + } + + state, err = container.Status() + if err != nil { + t.Fatal(err) + } + + if state != libcontainer.Stopped { + t.Fatal("Unexpected state checkpoint: ", state) + } + + stdinW.Close() + _, err = process.Wait() + if err != nil { + t.Fatal(err) + } + + // reload the container + container, err = factory.Load("test") + if err != nil { + t.Fatal(err) + } + + restoreStdinR, restoreStdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + + restoreProcessConfig := &libcontainer.Process{ + Cwd: "/", + Stdin: restoreStdinR, + Stdout: &stdout, + Init: true, + } + + err = container.Restore(restoreProcessConfig, checkpointOpts) + restoreStdinR.Close() + defer restoreStdinW.Close() + if err != nil { + showFile(t, restoreLog) + t.Fatal(err) + } + + state, err = container.Status() + if err != nil { + t.Fatal(err) + } + if state != libcontainer.Running { + t.Fatal("Unexpected restore state: ", state) + } + + pid, err = restoreProcessConfig.Pid() + if err != nil { + t.Fatal(err) + } + + _, err = os.FindProcess(pid) + if err 
!= nil { + t.Fatal(err) + } + + _, err = restoreStdinW.WriteString("Hello!") + if err != nil { + t.Fatal(err) + } + + restoreStdinW.Close() + s, err := restoreProcessConfig.Wait() + if err != nil { + t.Fatal(err) + } + + if !s.Success() { + t.Fatal(s.String(), pid) + } + + output := stdout.String() + if !strings.Contains(output, "Hello!") { + t.Fatal("Did not restore the pipe correctly:", output) + } +} diff --git a/sysbox-runc/libcontainer/integration/doc.go b/sysbox-runc/libcontainer/integration/doc.go new file mode 100644 index 00000000..87545bc9 --- /dev/null +++ b/sysbox-runc/libcontainer/integration/doc.go @@ -0,0 +1,2 @@ +// integration is used for integration testing of libcontainer +package integration diff --git a/sysbox-runc/libcontainer/integration/exec_test.go b/sysbox-runc/libcontainer/integration/exec_test.go new file mode 100644 index 00000000..91e97ff9 --- /dev/null +++ b/sysbox-runc/libcontainer/integration/exec_test.go @@ -0,0 +1,1967 @@ +package integration + +import ( + "bytes" + "encoding/json" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "reflect" + "strconv" + "strings" + "syscall" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" + + "golang.org/x/sys/unix" +) + +func TestExecPS(t *testing.T) { + testExecPS(t, true) +} + +// sysbox-runc: sys container's always have the user-ns, so the following test is the same as TestExecPS + +// func TestUsernsExecPS(t *testing.T) { +// if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { +// t.Skip("userns is unsupported") +// } +// testExecPS(t, true) +// } + +func testExecPS(t *testing.T, userns bool) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := 
newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: userns, + }) + + buffers, exitCode, err := runContainer(config, "", "ps", "-o", "pid,user,comm") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + lines := strings.Split(buffers.Stdout.String(), "\n") + if len(lines) < 2 { + t.Fatalf("more than one process running for output %q", buffers.Stdout.String()) + } + expected := `1 root ps` + actual := strings.Trim(lines[1], "\n ") + if actual != expected { + t.Fatalf("expected output %q but received %q", expected, actual) + } +} + +func TestIPCPrivate(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + ok(t, err) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l { + t.Fatalf("ipc link should be private to the container but equals host %q %q", actual, l) + } +} + +func TestIPCHost(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + ok(t, err) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Remove(configs.NEWIPC) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. 
code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("ipc link not equal to host link %q %q", actual, l) + } +} + +func TestIPCJoinPath(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + ok(t, err) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipc") + + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/ipc") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("ipc link not equal to host link %q %q", actual, l) + } +} + +func TestIPCBadPath(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Add(configs.NEWIPC, "/proc/1/ns/ipcc") + + _, _, err = runContainer(config, "", "true") + if err == nil { + t.Fatal("container succeeded with bad ipc path") + } +} + +func TestRlimit(t *testing.T) { + testRlimit(t, false) +} + +func TestUsernsRlimit(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + + testRlimit(t, true) +} + +func testRlimit(t *testing.T, userns bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: userns, + }) + + // ensure limit is lower than what the config requests to test that in a user namespace + // the Setrlimit call happens early enough that we still have permissions to raise the limit. 
+ ok(t, unix.Setrlimit(unix.RLIMIT_NOFILE, &unix.Rlimit{ + Max: 1024, + Cur: 1024, + })) + + out, _, err := runContainer(config, "", "/bin/sh", "-c", "ulimit -n") + ok(t, err) + if limit := strings.TrimSpace(out.Stdout.String()); limit != "1025" { + t.Fatalf("expected rlimit to be 1025, got %s", limit) + } +} + +func TestEnter(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + var stdout, stdout2 bytes.Buffer + + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"}, + Env: standardEnvironment, + Stdin: stdinR, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + pid, err := pconfig.Pid() + ok(t, err) + + // Execute another process in the container + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + pconfig2 := libcontainer.Process{ + Cwd: "/", + Env: standardEnvironment, + } + pconfig2.Args = []string{"sh", "-c", "cat && readlink /proc/self/ns/pid"} + pconfig2.Stdin = stdinR2 + pconfig2.Stdout = &stdout2 + + err = container.Run(&pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + pid2, err := pconfig2.Pid() + ok(t, err) + + processes, err := container.Processes() + ok(t, err) + + n := 0 + for i := range processes { + if processes[i] == pid || processes[i] == pid2 { + n++ + } + } + if n != 2 { + t.Fatal("unexpected number of processes", processes, pid, pid2) + } + + // Wait processes + stdinW2.Close() + waitProcess(&pconfig2, t) + + stdinW.Close() + waitProcess(&pconfig, t) + + // Check that both processes live in the same pidns + pidns := stdout.String() + ok(t, err) + + pidns2 := stdout2.String() + 
ok(t, err) + + if pidns != pidns2 { + t.Fatal("The second process isn't in the required pid namespace", pidns, pidns2) + } +} + +func TestProcessEnv(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "env"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=integration", + "TERM=xterm", + "FOO=BAR", + }, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputEnv := stdout.String() + + // Check that the environment has the key/value pair we added + if !strings.Contains(outputEnv, "FOO=BAR") { + t.Fatal("Environment doesn't have the expected FOO=BAR key/value pair: ", outputEnv) + } + + // Make sure that HOME is set + if !strings.Contains(outputEnv, "HOME=/root") { + t.Fatal("Environment doesn't have HOME set: ", outputEnv) + } +} + +func TestProcessEmptyCaps(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Capabilities = nil + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/self/status"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputStatus := stdout.String() + + lines := strings.Split(outputStatus, "\n") + + effectiveCapsLine := "" + for _, l := range lines { + 
line := strings.TrimSpace(l) + if strings.Contains(line, "CapEff:") { + effectiveCapsLine = line + break + } + } + + if effectiveCapsLine == "" { + t.Fatal("Couldn't find effective caps: ", outputStatus) + } +} + +func TestProcessCaps(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/self/status"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Capabilities: &configs.Capabilities{}, + Init: true, + } + pconfig.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_NET_ADMIN") + pconfig.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_NET_ADMIN") + pconfig.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_NET_ADMIN") + pconfig.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_NET_ADMIN") + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputStatus := stdout.String() + + lines := strings.Split(outputStatus, "\n") + + effectiveCapsLine := "" + for _, l := range lines { + line := strings.TrimSpace(l) + if strings.Contains(line, "CapEff:") { + effectiveCapsLine = line + break + } + } + + if effectiveCapsLine == "" { + t.Fatal("Couldn't find effective caps: ", outputStatus) + } + + parts := strings.Split(effectiveCapsLine, ":") + effectiveCapsStr := strings.TrimSpace(parts[1]) + + effectiveCaps, err := strconv.ParseUint(effectiveCapsStr, 16, 64) + if err != nil { + t.Fatal("Could not parse effective caps", err) + } + + const netAdminMask = 1 << unix.CAP_NET_ADMIN + if effectiveCaps&netAdminMask != netAdminMask { + t.Fatal("CAP_NET_ADMIN is not set as expected") + } +} + +func TestAdditionalGroups(t 
*testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "id", "-Gn"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + AdditionalGroups: []string{"plugdev", "audio"}, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputGroups := stdout.String() + + // Check that the groups output has the groups that we specified + if !strings.Contains(outputGroups, "audio") { + t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups) + } + + if !strings.Contains(outputGroups, "plugdev") { + t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups) + } +} + +func TestFreeze(t *testing.T) { + testFreeze(t, false) +} + +func TestSystemdFreeze(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Systemd is unsupported") + } + testFreeze(t, true) +} + +func testFreeze(t *testing.T, systemd bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + systemd: systemd, + }) + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + err = container.Pause() + ok(t, err) + state, err := container.Status() + ok(t, err) + err = container.Resume() + ok(t, err) + if state != libcontainer.Paused { + 
t.Fatal("Unexpected state: ", state) + } + + stdinW.Close() + waitProcess(pconfig, t) +} + +func TestCpuShares(t *testing.T) { + testCpuShares(t, false) +} + +func TestCpuSharesSystemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Systemd is unsupported") + } + testCpuShares(t, true) +} + +func testCpuShares(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("cgroup v2 does not support CpuShares") + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + systemd: systemd, + }) + config.Cgroups.Resources.CpuShares = 1 + + _, _, err = runContainer(config, "", "ps") + if err == nil { + t.Fatalf("runContainer should failed with invalid CpuShares") + } +} + +func TestPids(t *testing.T) { + testPids(t, false) +} + +func TestPidsSystemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Systemd is unsupported") + } + testPids(t, true) +} + +func testPids(t *testing.T, systemd bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + systemd: systemd, + }) + config.Cgroups.Resources.PidsLimit = -1 + + // Running multiple processes. + _, ret, err := runContainer(config, "", "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with no pids limit") + } + + // Enforce a permissive limit. This needs to be fairly hand-wavey due to the + // issues with running Go binaries with pids restrictions (see below). 
+ config.Cgroups.Resources.PidsLimit = 64 + _, ret, err = runContainer(config, "", "/bin/sh", "-c", ` + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`) + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + ok(t, err) + + if ret != 0 { + t.Fatalf("expected fork() to succeed with permissive pids limit") + } + + // Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause this + // to fail reliability. + config.Cgroups.Resources.PidsLimit = 64 + out, _, err := runContainer(config, "", "/bin/sh", "-c", ` + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | + /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true`) + if err != nil && strings.Contains(err.Error(), "no such directory for pids.max") { + t.Skip("PIDs cgroup is unsupported") + } + if err != nil && !strings.Contains(out.String(), "sh: can't fork") { + ok(t, err) + } + + if err == nil { + t.Fatalf("expected fork() to fail with restrictive 
pids limit") + } + + // Minimal restrictions are not really supported, due to quirks in using Go + // due to the fact that it spawns random processes. While we do our best with + // late setting cgroup values, it's just too unreliable with very small pids.max. + // As such, we don't test that case. YMMV. +} + +func TestCgroupResourcesUnifiedErrorOnV1(t *testing.T) { + testCgroupResourcesUnifiedErrorOnV1(t, false) +} + +func TestCgroupResourcesUnifiedErrorOnV1Systemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Systemd is unsupported") + } + testCgroupResourcesUnifiedErrorOnV1(t, true) +} + +func testCgroupResourcesUnifiedErrorOnV1(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("requires cgroup v1") + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + systemd: systemd, + }) + config.Cgroups.Resources.Unified = map[string]string{ + "memory.min": "10240", + } + _, _, err = runContainer(config, "", "true") + if !strings.Contains(err.Error(), cgroups.ErrV1NoUnified.Error()) { + t.Fatalf("expected error to contain %v, got %v", cgroups.ErrV1NoUnified, err) + } +} + +func TestCgroupResourcesUnified(t *testing.T) { + testCgroupResourcesUnified(t, false) +} + +func TestCgroupResourcesUnifiedSystemd(t *testing.T) { + if !systemd.IsRunningSystemd() { + t.Skip("Systemd is unsupported") + } + testCgroupResourcesUnified(t, true) +} + +func testCgroupResourcesUnified(t *testing.T, systemd bool) { + if testing.Short() { + return + } + if !cgroups.IsCgroup2UnifiedMode() { + t.Skip("requires cgroup v2") + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + systemd: systemd, + }) + config.Cgroups.Resources.Memory = 536870912 // 512M + config.Cgroups.Resources.MemorySwap = 536870912 // 512M, i.e. 
no swap + config.Namespaces.Add(configs.NEWCGROUP, "") + + testCases := []struct { + name string + cfg map[string]string + expError string + cmd []string + exp string + }{ + { + name: "dummy", + cmd: []string{"true"}, + exp: "", + }, + { + name: "set memory.min", + cfg: map[string]string{"memory.min": "131072"}, + cmd: []string{"cat", "/sys/fs/cgroup/memory.min"}, + exp: "131072\n", + }, + { + name: "check memory.max", + cmd: []string{"cat", "/sys/fs/cgroup/memory.max"}, + exp: strconv.Itoa(int(config.Cgroups.Resources.Memory)) + "\n", + }, + + { + name: "overwrite memory.max", + cfg: map[string]string{"memory.max": "268435456"}, + cmd: []string{"cat", "/sys/fs/cgroup/memory.max"}, + exp: "268435456\n", + }, + { + name: "no such controller error", + cfg: map[string]string{"privet.vsem": "vam"}, + expError: "controller \"privet\" not available", + }, + { + name: "slash in key error", + cfg: map[string]string{"bad/key": "val"}, + expError: "must be a file name (no slashes)", + }, + { + name: "no dot in key error", + cfg: map[string]string{"badkey": "val"}, + expError: "must be in the form CONTROLLER.PARAMETER", + }, + { + name: "read-only parameter", + cfg: map[string]string{"pids.current": "42"}, + expError: "failed to write", + }, + } + + for _, tc := range testCases { + config.Cgroups.Resources.Unified = tc.cfg + buffers, ret, err := runContainer(config, "", tc.cmd...) 
+ if tc.expError != "" { + if err == nil { + t.Errorf("case %q failed: expected error, got nil", tc.name) + continue + } + if !strings.Contains(err.Error(), tc.expError) { + t.Errorf("case %q failed: expected error to contain %q, got %q", tc.name, tc.expError, err) + } + continue + } + if err != nil { + t.Errorf("case %q failed: expected no error, got %v (command: %v, status: %d, stderr: %q)", + tc.name, err, tc.cmd, ret, buffers.Stderr.String()) + continue + } + if tc.exp != "" { + out := buffers.Stdout.String() + if out != tc.exp { + t.Errorf("expected %q, got %q", tc.exp, out) + } + } + } +} + +func TestContainerState(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/ipc") + if err != nil { + t.Fatal(err) + } + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: true, + }) + + config.Namespaces = configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWUSER}, + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + // host for IPC + //{Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWNET}, + }) + + container, err := newContainerWithName("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + p := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(p) + if err != nil { + t.Fatal(err) + } + stdinR.Close() + defer stdinW.Close() + + st, err := container.State() + if err != nil { + t.Fatal(err) + } + + l1, err := os.Readlink(st.NamespacePaths[configs.NEWIPC]) + if err != nil { + t.Fatal(err) + } + if l1 != l { + t.Fatal("Container using non-host ipc namespace") + } + stdinW.Close() + waitProcess(p, t) +} + +func TestPassExtraFiles(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + 
if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + container, err := newContainerWithName("test", config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + var stdout bytes.Buffer + pipeout1, pipein1, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + pipeout2, pipein2, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + process := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, + Env: []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}, + ExtraFiles: []*os.File{pipein1, pipein2}, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&process) + if err != nil { + t.Fatal(err) + } + + waitProcess(&process, t) + + out := stdout.String() + // fd 5 is the directory handle for /proc/$$/fd + if out != "0 1 2 3 4 5" { + t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to init, got '%s'", out) + } + var buf = []byte{0} + _, err = pipeout1.Read(buf) + if err != nil { + t.Fatal(err) + } + out1 := string(buf) + if out1 != "1" { + t.Fatalf("expected first pipe to receive '1', got '%s'", out1) + } + + _, err = pipeout2.Read(buf) + if err != nil { + t.Fatal(err) + } + out2 := string(buf) + if out2 != "2" { + t.Fatalf("expected second pipe to receive '2', got '%s'", out2) + } +} + +func TestSysctl(t *testing.T) { + + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Sysctl = map[string]string{ + "kernel.shmmni": "8192", + } + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/sys/kernel/shmmni"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + 
+ err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + shmmniOutput := strings.TrimSpace(stdout.String()) + if shmmniOutput != "8192" { + t.Fatalf("kernel.shmmni property expected to be 8192, but is %s", shmmniOutput) + } +} + +func TestMountCgroupRO(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + buffers, exitCode, err := runContainer(config, "", "mount") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + mountInfo := buffers.Stdout.String() + lines := strings.Split(mountInfo, "\n") + for _, l := range lines { + if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") { + if !strings.Contains(l, "ro") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l) + } + if !strings.Contains(l, "mode=755") { + t.Fatalf("Mode expected to contain 'mode=755': %s", l) + } + continue + } + if !strings.HasPrefix(l, "cgroup") { + continue + } + if !strings.Contains(l, "ro") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'ro,nosuid,nodev,noexec': %s", l) + } + } +} + +func TestMountCgroupRW(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + // clear the RO flag from cgroup mount + for _, m := range config.Mounts { + if m.Device == "cgroup" { + m.Flags = defaultMountFlags + break + } + } + + buffers, exitCode, err := runContainer(config, "", "mount") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. 
code %d stderr %q", exitCode, buffers.Stderr) + } + mountInfo := buffers.Stdout.String() + lines := strings.Split(mountInfo, "\n") + for _, l := range lines { + if strings.HasPrefix(l, "tmpfs on /sys/fs/cgroup") { + if !strings.Contains(l, "rw") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l) + } + if !strings.Contains(l, "mode=755") { + t.Fatalf("Mode expected to contain 'mode=755': %s", l) + } + continue + } + if !strings.HasPrefix(l, "cgroup") { + continue + } + if !strings.Contains(l, "rw") || + !strings.Contains(l, "nosuid") || + !strings.Contains(l, "nodev") || + !strings.Contains(l, "noexec") { + t.Fatalf("Mode expected to contain 'rw,nosuid,nodev,noexec': %s", l) + } + } +} + +func TestOomScoreAdj(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.OomScoreAdj = ptrInt(200) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cat /proc/self/oom_score_adj"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + outputOomScoreAdj := strings.TrimSpace(stdout.String()) + + // Check that the oom_score_adj matches the value that was set as part of config. 
+ if outputOomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { + t.Fatalf("Expected oom_score_adj %d; got %q", *config.OomScoreAdj, outputOomScoreAdj) + } +} + +func TestHook(t *testing.T) { + if testing.Short() { + return + } + + bundle, err := newTestBundle() + ok(t, err) + defer remove(bundle) + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + expectedBundle := bundle + config.Labels = append(config.Labels, "bundle="+expectedBundle) + + getRootfsFromBundle := func(bundle string) (string, error) { + f, err := os.Open(filepath.Join(bundle, "config.json")) + if err != nil { + return "", err + } + + var config configs.Config + if err = json.NewDecoder(f).Decode(&config); err != nil { + return "", err + } + return config.Rootfs, nil + } + createFileFromBundle := func(filename, bundle string) error { + root, err := getRootfsFromBundle(bundle) + if err != nil { + return err + } + + f, err := os.Create(filepath.Join(root, filename)) + if err != nil { + return err + } + return f.Close() + } + + // Note FunctionHooks can't be serialized to json this means they won't be passed down to the container + // For CreateContainer and StartContainer which run in the container namespace, this means we need to pass Command Hooks. 
+ hookFiles := map[configs.HookName]string{ + configs.Prestart: "prestart", + configs.CreateRuntime: "createRuntime", + configs.CreateContainer: "createContainer", + configs.StartContainer: "startContainer", + configs.Poststart: "poststart", + } + + config.Hooks = configs.Hooks{ + configs.Prestart: configs.HookList{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected prestart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + return createFileFromBundle(hookFiles[configs.Prestart], s.Bundle) + }), + }, + configs.CreateRuntime: configs.HookList{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected createRuntime hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + return createFileFromBundle(hookFiles[configs.CreateRuntime], s.Bundle) + }), + }, + configs.CreateContainer: configs.HookList{ + configs.NewCommandHook(configs.Command{ + Path: "/bin/bash", + Args: []string{"/bin/bash", "-c", fmt.Sprintf("touch ./%s", hookFiles[configs.CreateContainer])}, + }), + }, + configs.StartContainer: configs.HookList{ + configs.NewCommandHook(configs.Command{ + Path: "/bin/sh", + Args: []string{"/bin/sh", "-c", fmt.Sprintf("touch /%s", hookFiles[configs.StartContainer])}, + }), + }, + configs.Poststart: configs.HookList{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected poststart hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + return createFileFromBundle(hookFiles[configs.Poststart], s.Bundle) + }), + }, + configs.Poststop: configs.HookList{ + configs.NewFunctionHook(func(s *specs.State) error { + if s.Bundle != expectedBundle { + t.Fatalf("Expected poststop hook bundlePath '%s'; got '%s'", expectedBundle, s.Bundle) + } + + root, err := getRootfsFromBundle(s.Bundle) + if err != nil { + return err + } + + for _, hook := range hookFiles { + if err = 
os.RemoveAll(filepath.Join(root, hook)); err != nil { + return err + } + } + return nil + }), + }, + } + + // write config of json format into config.json under bundle + f, err := os.OpenFile(filepath.Join(bundle, "config.json"), os.O_CREATE|os.O_RDWR, 0644) + ok(t, err) + ok(t, json.NewEncoder(f).Encode(config)) + + container, err := newContainerWithName("test", config) + ok(t, err) + + // e.g: 'ls /prestart ...' + cmd := "ls " + for _, hook := range hookFiles { + cmd += "/" + hook + " " + } + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", cmd}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + if err := container.Destroy(); err != nil { + t.Fatalf("container destroy %s", err) + } + + for _, hook := range []string{"prestart", "createRuntime", "poststart"} { + fi, err := os.Stat(filepath.Join(rootfs, hook)) + if err == nil || !os.IsNotExist(err) { + t.Fatalf("expected file '%s to not exists, but it does", fi.Name()) + } + } +} + +func TestSTDIOPermissions(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + buffers, exitCode, err := runContainer(config, "", "sh", "-c", "echo hi > /dev/stderr") + ok(t, err) + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stderr.String(), "\n"); actual != "hi" { + t.Fatalf("stderr should equal be equal %q %q", actual, "hi") + } +} + +func unmountOp(path string) error { + return unix.Unmount(path, unix.MNT_DETACH) +} + +// Launch container with rootfsPropagation in rslave mode. Also +// bind mount a volume /mnt1host at /mnt1cont at the time of launch. 
Now do +// another mount on host (/mnt1host/mnt2host) and this new mount should +// propagate to container (/mnt1cont/mnt2host) +func TestRootfsPropagationSlaveMount(t *testing.T) { + var mountPropagated bool + var dir1cont string + var dir2cont string + + dir1cont = "/root/mnt1cont" + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + config.RootPropagation = unix.MS_SLAVE | unix.MS_REC + + // Bind mount a volume + dir1host, err := ioutil.TempDir("", "mnt1host") + ok(t, err) + defer os.RemoveAll(dir1host) + + // Make this dir a "shared" mount point. This will make sure a + // slave relationship can be established in container. + err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") + ok(t, err) + err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "") + ok(t, err) + defer unmountOp(dir1host) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dir1host, + Destination: dir1cont, + Device: "bind", + Flags: unix.MS_BIND | unix.MS_REC, + BindSrcInfo: configs.BindSrcInfo{ + IsDir: true, + Uid: uint32(os.Geteuid()), + Gid: uint32(os.Getegid()), + }, + }) + + container, err := newContainerWithName("testSlaveMount", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + + err = container.Run(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + // Create mnt1host/mnt2host and bind mount itself on top of it. This + // should be visible in container. 
+ dir2host, err := ioutil.TempDir(dir1host, "mnt2host") + ok(t, err) + defer os.RemoveAll(dir2host) + + err = unix.Mount(dir2host, dir2host, "bind", unix.MS_BIND, "") + defer unmountOp(dir2host) + ok(t, err) + + // Run "cat /proc/self/mountinfo" in container and look at mount points. + var stdout2 bytes.Buffer + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + + pconfig2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat", "/proc/self/mountinfo"}, + Env: standardEnvironment, + Stdin: stdinR2, + Stdout: &stdout2, + } + + err = container.Run(pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + stdinW2.Close() + waitProcess(pconfig2, t) + stdinW.Close() + waitProcess(pconfig, t) + + mountPropagated = false + dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) + + propagationInfo := stdout2.String() + lines := strings.Split(propagationInfo, "\n") + for _, l := range lines { + linefields := strings.Split(l, " ") + if len(linefields) < 5 { + continue + } + + if linefields[4] == dir2cont { + mountPropagated = true + break + } + } + + if mountPropagated != true { + t.Fatalf("Mount on host %s did not propagate in container at %s\n", dir2host, dir2cont) + } +} + +// Launch container with rootfsPropagation 0 so no propagation flags are applied. Also +// bind mount a volume /mnt1host at /mnt1cont at the time of launch. The /mnt1host volume +// has shared propagation. Now do a mount in container (/mnt1cont/mnt2cont) and this new +// mount should propagate to host (/mnt1host/mnt2cont) + +func TestRootfsPropagationSharedMount(t *testing.T) { + + // sysbox-runc: sys containers always use the user-ns; this test is + // not applicable as it creates a bind-mount with shared + // propagation, which is not possible when using user-ns (see + // snippet below on mount_namespaces(7)). 
+ + t.Skip("not applicable") + + var dir1cont string + var dir2cont string + + dir1cont = "/root/mnt1cont" + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.RootPropagation = unix.MS_PRIVATE + + // Shared mounts only work without user-ns. Per mount_namespaces(7): + // + // * A mount namespace has an owner user namespace. A mount namespace whose + // owner user namespace is different from the owner user namespace of its + // parent mount namespace is considered a less privileged mount namespace. + // + // * When creating a less privileged mount namespace, shared mounts are + // reduced to slave mounts. (Shared and slave mounts are discussed below.) + // This ensures that mappings performed in less privileged mount namespaces + // will not propagate to more privileged mount namespaces. + // + // Thus, we must remove the user-ns that comes in the template config. + + config.Namespaces.Remove(configs.NEWUSER) + config.UidMappings = nil + config.GidMappings = nil + + // Bind mount a volume + dir1host, err := ioutil.TempDir("", "mnt1host") + ok(t, err) + defer os.RemoveAll(dir1host) + + // Make this dir a "shared" mount point. This will make sure a + // shared relationship can be established in container. 
+ err = unix.Mount(dir1host, dir1host, "bind", unix.MS_BIND|unix.MS_REC, "") + ok(t, err) + err = unix.Mount("", dir1host, "", unix.MS_SHARED|unix.MS_REC, "") + ok(t, err) + defer unmountOp(dir1host) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: dir1host, + Destination: dir1cont, + Device: "bind", + Flags: unix.MS_BIND | unix.MS_REC, + BindSrcInfo: configs.BindSrcInfo{ + IsDir: true, + Uid: uint32(os.Geteuid()), + Gid: uint32(os.Getegid()), + }, + }) + + container, err := newContainerWithName("testSharedMount", config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + pconfig := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + + err = container.Run(pconfig) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + // Create mnt1host/mnt2cont. This will become visible inside container + // at mnt1cont/mnt2cont. Bind mount itself on top of it. This + // should be visible on host now. + dir2host, err := ioutil.TempDir(dir1host, "mnt2cont") + ok(t, err) + defer os.RemoveAll(dir2host) + + dir2cont = filepath.Join(dir1cont, filepath.Base(dir2host)) + + // Mount something in container and see if it is visible on host. 
+ var stdout2 bytes.Buffer + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + + pconfig2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"mount", "--bind", dir2cont, dir2cont}, + Env: standardEnvironment, + Stdin: stdinR2, + Stdout: &stdout2, + Capabilities: &configs.Capabilities{}, + } + + // Provide CAP_SYS_ADMIN + pconfig2.Capabilities.Bounding = append(config.Capabilities.Bounding, "CAP_SYS_ADMIN") + pconfig2.Capabilities.Permitted = append(config.Capabilities.Permitted, "CAP_SYS_ADMIN") + pconfig2.Capabilities.Effective = append(config.Capabilities.Effective, "CAP_SYS_ADMIN") + pconfig2.Capabilities.Inheritable = append(config.Capabilities.Inheritable, "CAP_SYS_ADMIN") + + err = container.Run(pconfig2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // Wait for process + stdinW2.Close() + waitProcess(pconfig2, t) + stdinW.Close() + waitProcess(pconfig, t) + + defer unmountOp(dir2host) + + // Check if mount is visible on host or not. + out, err := exec.Command("findmnt", "-n", "-f", "-oTARGET", dir2host).CombinedOutput() + outtrim := string(bytes.TrimSpace(out)) + if err != nil { + t.Logf("findmnt error %q: %q", err, outtrim) + } + + if outtrim != dir2host { + t.Fatalf("Mount in container on %s did not propagate to host on %s. finmnt output=%s", dir2cont, dir2host, outtrim) + } +} + +func TestPIDHost(t *testing.T) { + + // sysbox-runc: sys containers always use all namespaces; this test is not applicable + // as it spawns a container without the pid ns. + t.Skip("not applicable") + + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/pid") + ok(t, err) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Remove(configs.NEWPID) + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/pid") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. 
code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("ipc link not equal to host link %q %q", actual, l) + } +} + +func TestPIDHostInitProcessWait(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + pidns := "/proc/1/ns/pid" + + // Run a container with two long-running processes. + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Add(configs.NEWPID, pidns) + container, err := newContainerWithName("test", config) + ok(t, err) + defer func() { + _ = container.Destroy() + }() + + process1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"sleep", "100"}, + Env: standardEnvironment, + Init: true, + } + err = container.Run(process1) + ok(t, err) + + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"sleep", "100"}, + Env: standardEnvironment, + Init: false, + } + err = container.Run(process2) + ok(t, err) + + // Kill the init process and Wait for it. + err = process1.Signal(syscall.SIGKILL) + ok(t, err) + _, err = process1.Wait() + if err == nil { + t.Fatal("expected Wait to indicate failure") + } + + // The non-init process must've been killed. 
+ err = process2.Signal(syscall.Signal(0)) + if err == nil || err.Error() != "no such process" { + t.Fatalf("expected process to have been killed: %v", err) + } +} + +func TestInitJoinPID(t *testing.T) { + + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + container1, err := newContainer(newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: true, + })) + + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + Init: true, + } + err = container1.Run(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of the first container + state1, err := container1.State() + ok(t, err) + pidns1 := state1.NamespacePaths[configs.NEWPID] + userns1 := state1.NamespacePaths[configs.NEWUSER] + + // Run a container inside the existing pidns but with different cgroups + config2 := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: true, + }) + + // sysbox-runc: since sys containers always have user-ns, we must also join it (we + // can't just joint the pid-ns and not the user-ns as the kernel balks with "operation + // not permitted") + config2.Namespaces.Add(configs.NEWPID, pidns1) + config2.Namespaces.Add(configs.NEWUSER, userns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + Init: true, + } + err = container2.Run(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", 
state1.InitProcessPid)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", state2.InitProcessPid)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("pidns(%s), wanted %s", ns2, ns1) + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // check that pidns is joined correctly. The initial container process list + // should contain the second container's init process + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + Stdout: buffers.Stdout, + } + err = container1.Run(ps) + ok(t, err) + waitProcess(ps, t) + + // Stop init processes one by one. Stop the second container should + // not stop the first. + stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) + + out := strings.TrimSpace(buffers.Stdout.String()) + // output of ps inside the initial PID namespace should have + // 1 line of header, + // 2 lines of init processes, + // 1 line of ps process + if len(strings.Split(out, "\n")) != 4 { + t.Errorf("unexpected running process, output %q", out) + } +} + +func TestInitJoinNetworkAndUser(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + // Execute a long-running container + config1 := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: true, + }) + container1, err := newContainer(config1) + ok(t, err) + defer container1.Destroy() + + stdinR1, stdinW1, err := os.Pipe() + ok(t, err) + init1 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR1, + Init: true, + } + err = container1.Run(init1) + stdinR1.Close() + defer stdinW1.Close() + ok(t, err) + + // get the state of 
the first container + state1, err := container1.State() + ok(t, err) + netns1 := state1.NamespacePaths[configs.NEWNET] + userns1 := state1.NamespacePaths[configs.NEWUSER] + + // Run a container inside the existing pidns but with different cgroups + rootfs2, err := newRootfs() + ok(t, err) + defer remove(rootfs2) + + config2 := newTemplateConfig(&tParam{ + rootfs: rootfs2, + userns: true, + }) + config2.Namespaces.Add(configs.NEWNET, netns1) + config2.Namespaces.Add(configs.NEWUSER, userns1) + config2.Cgroups.Path = "integration/test2" + container2, err := newContainerWithName("testCT2", config2) + ok(t, err) + defer container2.Destroy() + + stdinR2, stdinW2, err := os.Pipe() + ok(t, err) + init2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR2, + Init: true, + } + err = container2.Run(init2) + stdinR2.Close() + defer stdinW2.Close() + ok(t, err) + + // get the state of the second container + state2, err := container2.State() + ok(t, err) + + for _, ns := range []string{"net", "user"} { + ns1, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state1.InitProcessPid, ns)) + ok(t, err) + ns2, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/%s", state2.InitProcessPid, ns)) + ok(t, err) + if ns1 != ns2 { + t.Errorf("%s(%s), wanted %s", ns, ns2, ns1) + } + } + + // check that namespaces are not the same + if reflect.DeepEqual(state2.NamespacePaths, state1.NamespacePaths) { + t.Errorf("Namespaces(%v), original %v", state2.NamespacePaths, + state1.NamespacePaths) + } + // Stop init processes one by one. Stop the second container should + // not stop the first. 
+ stdinW2.Close() + waitProcess(init2, t) + stdinW1.Close() + waitProcess(init1, t) +} + +func TestTmpfsCopyUp(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + + config.Mounts = append(config.Mounts, &configs.Mount{ + Source: "tmpfs", + Destination: "/etc", + Device: "tmpfs", + Extensions: configs.EXT_COPYUP, + }) + + container, err := newContainerWithName("test", config) + ok(t, err) + defer container.Destroy() + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Args: []string{"ls", "/etc/passwd"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + Init: true, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + outputLs := stdout.String() + + // Check that the ls output has /etc/passwd + if !strings.Contains(outputLs, "/etc/passwd") { + t.Fatalf("/etc/passwd not copied up as expected: %v", outputLs) + } +} + +func TestCGROUPPrivate(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + t.Skip("cgroupns is unsupported") + } + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/cgroup") + ok(t, err) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Add(configs.NEWCGROUP, "") + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. 
code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual == l { + t.Fatalf("cgroup link should be private to the container but equals host %q %q", actual, l) + } +} + +func TestCGROUPHost(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/cgroup"); os.IsNotExist(err) { + t.Skip("cgroupns is unsupported") + } + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + l, err := os.Readlink("/proc/1/ns/cgroup") + ok(t, err) + + // This test only makes sense when the container is not using the cgroup-ns. + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Namespaces.Remove(configs.NEWCGROUP) + + buffers, exitCode, err := runContainer(config, "", "readlink", "/proc/self/ns/cgroup") + ok(t, err) + + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d stderr %q", exitCode, buffers.Stderr) + } + + if actual := strings.Trim(buffers.Stdout.String(), "\n"); actual != l { + t.Fatalf("cgroup link not equal to host link %q %q", actual, l) + } +} diff --git a/sysbox-runc/libcontainer/integration/execin_test.go b/sysbox-runc/libcontainer/integration/execin_test.go new file mode 100644 index 00000000..07a62090 --- /dev/null +++ b/sysbox-runc/libcontainer/integration/execin_test.go @@ -0,0 +1,606 @@ +package integration + +import ( + "bytes" + "fmt" + "io" + "os" + "strconv" + "strings" + "testing" + "time" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" + + "golang.org/x/sys/unix" +) + +func TestExecIn(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, 
stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + } + + err = container.Run(ps) + ok(t, err) + waitProcess(ps, t) + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") { + t.Fatalf("unexpected running process, output %q", out) + } + if strings.Contains(out, "\r") { + t.Fatalf("unexpected carriage-return in output %q", out) + } +} + +func TestExecInUsernsRlimit(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + + testExecInRlimit(t, true) +} + +func TestExecInRlimit(t *testing.T) { + testExecInRlimit(t, false) +} + +func testExecInRlimit(t *testing.T, userns bool) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: userns, + }) + + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"/bin/sh", "-c", "ulimit -n"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Rlimits: []configs.Rlimit{ + // increase process rlimit higher than container rlimit 
to test per-process limit + {Type: unix.RLIMIT_NOFILE, Hard: 1026, Soft: 1026}, + }, + Init: false, // not the first process in container + } + err = container.Run(ps) + ok(t, err) + waitProcess(ps, t) + + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + if limit := strings.TrimSpace(out); limit != "1026" { + t.Fatalf("expected rlimit to be 1026, got %s", limit) + } +} + +func TestExecInAdditionalGroups(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + var stdout bytes.Buffer + pconfig := libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "id", "-Gn"}, + Env: standardEnvironment, + Stdin: nil, + Stdout: &stdout, + AdditionalGroups: []string{"plugdev", "audio"}, + } + err = container.Run(&pconfig) + ok(t, err) + + // Wait for process + waitProcess(&pconfig, t) + + stdinW.Close() + waitProcess(process, t) + + outputGroups := stdout.String() + + // Check that the groups output has the groups that we specified + if !strings.Contains(outputGroups, "audio") { + t.Fatalf("Listed groups do not contain the audio group as expected: %v", outputGroups) + } + + if !strings.Contains(outputGroups, "plugdev") { + t.Fatalf("Listed groups do not contain the plugdev group as expected: %v", outputGroups) + } +} + +func TestExecInError(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + container, err := 
newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer func() { + stdinW.Close() + if _, err := process.Wait(); err != nil { + t.Log(err) + } + }() + ok(t, err) + + for i := 0; i < 42; i++ { + var out bytes.Buffer + unexistent := &libcontainer.Process{ + Cwd: "/", + Args: []string{"unexistent"}, + Env: standardEnvironment, + Stderr: &out, + } + err = container.Run(unexistent) + if err == nil { + t.Fatal("Should be an error") + } + if !strings.Contains(err.Error(), "executable file not found") { + t.Fatalf("Should be error about not found executable, got %s", err) + } + if !bytes.Contains(out.Bytes(), []byte("executable file not found")) { + t.Fatalf("executable file not found error not delivered to stdio:\n%s", out.String()) + } + } +} + +func TestExecInTTY(t *testing.T) { + if testing.Short() { + return + } + t.Skip("racy; see https://github.com/opencontainers/runc/issues/2425") + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer func() { + stdinW.Close() + if _, err := process.Wait(); err != nil { + t.Log(err) + } + }() + ok(t, err) + + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"ps"}, + Env: standardEnvironment, + } + + // Repeat to increase chances to catch a race; see + // https://github.com/opencontainers/runc/issues/2425. 
+ for i := 0; i < 300; i++ { + var stdout bytes.Buffer + + parent, child, err := utils.NewSockPair("console") + if err != nil { + ok(t, err) + } + ps.ConsoleSocket = child + + done := make(chan (error)) + go func() { + f, err := utils.RecvFd(parent) + if err != nil { + done <- fmt.Errorf("RecvFd: %w", err) + return + } + c, err := console.ConsoleFromFile(f) + if err != nil { + done <- fmt.Errorf("ConsoleFromFile: %w", err) + return + } + err = console.ClearONLCR(c.Fd()) + if err != nil { + done <- fmt.Errorf("ClearONLCR: %w", err) + return + } + // An error from io.Copy is expected once the terminal + // is gone, so we deliberately ignore it. + _, _ = io.Copy(&stdout, c) + done <- nil + }() + + err = container.Run(ps) + ok(t, err) + + select { + case <-time.After(5 * time.Second): + t.Fatal("Waiting for copy timed out") + case err := <-done: + ok(t, err) + } + + waitProcess(ps, t) + parent.Close() + child.Close() + + out := stdout.String() + if !strings.Contains(out, "cat") || !strings.Contains(out, "ps") { + t.Fatalf("unexpected running process, output %q", out) + } + if strings.Contains(out, "\r") { + t.Fatalf("unexpected carriage-return in output %q", out) + } + } +} + +func TestExecInEnvironment(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"env"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "DEBUG=true", + 
"DEBUG=false", + "ENV=test", + }, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: false, // not the first process in container + } + err = container.Run(process2) + ok(t, err) + waitProcess(process2, t) + + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + // check execin's process environment + if !strings.Contains(out, "DEBUG=false") || + !strings.Contains(out, "ENV=test") || + !strings.Contains(out, "HOME=/root") || + !strings.Contains(out, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin") || + strings.Contains(out, "DEBUG=true") { + t.Fatalf("unexpected running process, output %q", out) + } +} + +func TestExecinPassExtraFiles(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + if err != nil { + t.Fatal(err) + } + + var stdout bytes.Buffer + pipeout1, pipein1, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + pipeout2, pipein2, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + inprocess := &libcontainer.Process{ + Cwd: "/", + Args: []string{"sh", "-c", "cd /proc/$$/fd; echo -n *; echo -n 1 >3; echo -n 2 >4"}, + Env: []string{"PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"}, + ExtraFiles: []*os.File{pipein1, pipein2}, + Stdin: nil, + Stdout: &stdout, + } + err = container.Run(inprocess) + if err != nil { + t.Fatal(err) + } + + waitProcess(inprocess, t) + stdinW.Close() + 
waitProcess(process, t) + + out := stdout.String() + // fd 5 is the directory handle for /proc/$$/fd + if out != "0 1 2 3 4 5" { + t.Fatalf("expected to have the file descriptors '0 1 2 3 4 5' passed to exec, got '%s'", out) + } + var buf = []byte{0} + _, err = pipeout1.Read(buf) + if err != nil { + t.Fatal(err) + } + out1 := string(buf) + if out1 != "1" { + t.Fatalf("expected first pipe to receive '1', got '%s'", out1) + } + + _, err = pipeout2.Read(buf) + if err != nil { + t.Fatal(err) + } + out2 := string(buf) + if out2 != "2" { + t.Fatalf("expected second pipe to receive '2', got '%s'", out2) + } +} + +func TestExecInOomScoreAdj(t *testing.T) { + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.OomScoreAdj = ptrInt(200) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + stdinR, stdinW, err := os.Pipe() + ok(t, err) + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + buffers := newStdBuffers() + ps := &libcontainer.Process{ + Cwd: "/", + Args: []string{"/bin/sh", "-c", "cat /proc/self/oom_score_adj"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + } + err = container.Run(ps) + ok(t, err) + waitProcess(ps, t) + + stdinW.Close() + waitProcess(process, t) + + out := buffers.Stdout.String() + if oomScoreAdj := strings.TrimSpace(out); oomScoreAdj != strconv.Itoa(*config.OomScoreAdj) { + t.Fatalf("expected oomScoreAdj to be %d, got %s", *config.OomScoreAdj, oomScoreAdj) + } +} + +func TestExecInUserns(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + if testing.Short() { + return + } + rootfs, err := newRootfs() + ok(t, 
err) + defer remove(rootfs) + config := newTemplateConfig(&tParam{ + rootfs: rootfs, + userns: true, + }) + container, err := newContainer(config) + ok(t, err) + defer container.Destroy() + + // Execute a first process in the container + stdinR, stdinW, err := os.Pipe() + ok(t, err) + + process := &libcontainer.Process{ + Cwd: "/", + Args: []string{"cat"}, + Env: standardEnvironment, + Stdin: stdinR, + Init: true, + } + err = container.Run(process) + stdinR.Close() + defer stdinW.Close() + ok(t, err) + + initPID, err := process.Pid() + ok(t, err) + initUserns, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/user", initPID)) + ok(t, err) + + buffers := newStdBuffers() + process2 := &libcontainer.Process{ + Cwd: "/", + Args: []string{"readlink", "/proc/self/ns/user"}, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + }, + Stdout: buffers.Stdout, + Stderr: os.Stderr, + } + err = container.Run(process2) + ok(t, err) + waitProcess(process2, t) + stdinW.Close() + waitProcess(process, t) + + if out := strings.TrimSpace(buffers.Stdout.String()); out != initUserns { + t.Errorf("execin userns(%s), wanted %s", out, initUserns) + } +} diff --git a/sysbox-runc/libcontainer/integration/init_test.go b/sysbox-runc/libcontainer/integration/init_test.go new file mode 100644 index 00000000..f5180eac --- /dev/null +++ b/sysbox-runc/libcontainer/integration/init_test.go @@ -0,0 +1,46 @@ +package integration + +import ( + "os" + "runtime" + "testing" + + "github.com/opencontainers/runc/libcontainer" + _ "github.com/opencontainers/runc/libcontainer/nsenter" + + "github.com/sirupsen/logrus" +) + +// init runs the libcontainer initialization code because of the busybox style needs +// to work around the go runtime and the issues with forking +func init() { + if len(os.Args) < 2 || os.Args[1] != "init" { + return + } + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, err := libcontainer.New("") + if err != nil { + logrus.Fatalf("unable to 
initialize for container: %s", err) + } + if err := factory.StartInitialization(); err != nil { + logrus.Fatal(err) + } +} + +var testRoots []string + +func TestMain(m *testing.M) { + logrus.SetOutput(os.Stderr) + logrus.SetLevel(logrus.InfoLevel) + + // Clean up roots after running everything. + defer func() { + for _, root := range testRoots { + os.RemoveAll(root) + } + }() + + ret := m.Run() + os.Exit(ret) +} diff --git a/sysbox-runc/libcontainer/integration/seccomp_test.go b/sysbox-runc/libcontainer/integration/seccomp_test.go new file mode 100644 index 00000000..8d246efd --- /dev/null +++ b/sysbox-runc/libcontainer/integration/seccomp_test.go @@ -0,0 +1,495 @@ +//go:build linux && cgo && seccomp +// +build linux,cgo,seccomp + +package integration + +import ( + "strings" + "syscall" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" + libseccomp "github.com/seccomp/libseccomp-golang" +) + +func TestSeccompDenyGetcwdWithErrno(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + errnoRet := uint(syscall.ESRCH) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "getcwd", + Action: configs.Errno, + ErrnoRet: &errnoRet, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + pwd := &libcontainer.Process{ + Cwd: "/", + Args: []string{"pwd"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(pwd) + if err != nil { + t.Fatal(err) + } + ps, err := pwd.Wait() + if err == nil { + t.Fatal("Expecting error (negative return code); instead exited cleanly!") + } + + var exitCode int + status := 
ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + t.Fatalf("Unrecognized exit reason!") + } + + if exitCode == 0 { + t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + } + + expected := "pwd: getcwd: No such process" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompDenyGetcwd(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "getcwd", + Action: configs.Errno, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + pwd := &libcontainer.Process{ + Cwd: "/", + Args: []string{"pwd"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(pwd) + if err != nil { + t.Fatal(err) + } + ps, err := pwd.Wait() + if err == nil { + t.Fatal("Expecting error (negative return code); instead exited cleanly!") + } + + var exitCode int + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + t.Fatalf("Unrecognized exit reason!") + } + + if exitCode == 0 { + t.Fatalf("Getcwd should fail with negative exit code, instead got %d!", exitCode) + } + + expected := "pwd: getcwd: Operation not permitted" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func 
TestSeccompPermitWriteConditional(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + dmesg := &libcontainer.Process{ + Cwd: "/", + Args: []string{"busybox", "ls", "/"}, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(dmesg) + if err != nil { + t.Fatal(err) + } + if _, err := dmesg.Wait(); err != nil { + t.Fatalf("%s: %s", err, buffers.Stderr) + } +} + +func TestSeccompDenyWriteConditional(t *testing.T) { + if testing.Short() { + return + } + + // Only test if library version is v2.2.1 or higher + // Conditional filtering will always error in v2.2.0 and lower + major, minor, micro := libseccomp.GetLibraryVersion() + if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + container, err := newContainer(config) + if err != nil { + t.Fatal(err) + } + defer container.Destroy() + + buffers := newStdBuffers() + dmesg := &libcontainer.Process{ + Cwd: "/", + Args: []string{"busybox", "ls", "does_not_exist"}, + Env: 
standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(dmesg) + if err != nil { + t.Fatal(err) + } + + ps, err := dmesg.Wait() + if err == nil { + t.Fatal("Expecting negative return, instead got 0!") + } + + var exitCode int + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + t.Fatalf("Unrecognized exit reason!") + } + + if exitCode == 0 { + t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode) + } + + // We're denying write to stderr, so we expect an empty buffer + expected := "" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompPermitWriteMultipleConditions(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + { + Index: 2, + Value: 0, + Op: configs.NotEqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. 
code %d buffers %s", exitCode, buffers) + } + // We don't need to verify the actual thing printed + // Just that something was written to stdout + if len(buffers.Stdout.String()) == 0 { + t.Fatalf("Nothing was written to stdout, write call failed!\n") + } +} + +func TestSeccompDenyWriteMultipleConditions(t *testing.T) { + if testing.Short() { + return + } + + // Only test if library version is v2.2.1 or higher + // Conditional filtering will always error in v2.2.0 and lower + major, minor, micro := libseccomp.GetLibraryVersion() + if (major == 2 && minor < 2) || (major == 2 && minor == 2 && micro < 1) { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + { + Index: 2, + Value: 0, + Op: configs.NotEqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist") + if err == nil { + t.Fatalf("Expecting error return, instead got 0") + } + if exitCode == 0 { + t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode) + } + + expected := "" + actual := strings.Trim(buffers.Stderr.String(), "\n") + if actual != expected { + t.Fatalf("Expected output %s but got %s\n", expected, actual) + } +} + +func TestSeccompMultipleConditionSameArgDeniesStdout(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + // Prevent writing to both stdout and stderr + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 
0, + Value: 1, + Op: configs.EqualTo, + }, + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/") + if err != nil { + t.Fatalf("%s: %s", buffers, err) + } + if exitCode != 0 { + t.Fatalf("exit code not 0. code %d buffers %s", exitCode, buffers) + } + // Verify that nothing was printed + if len(buffers.Stdout.String()) != 0 { + t.Fatalf("Something was written to stdout, write call succeeded!\n") + } +} + +func TestSeccompMultipleConditionSameArgDeniesStderr(t *testing.T) { + if testing.Short() { + return + } + + rootfs, err := newRootfs() + if err != nil { + t.Fatal(err) + } + defer remove(rootfs) + + // Prevent writing to both stdout and stderr + config := newTemplateConfig(&tParam{rootfs: rootfs}) + config.Seccomp = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + Name: "write", + Action: configs.Errno, + Args: []*configs.Arg{ + { + Index: 0, + Value: 1, + Op: configs.EqualTo, + }, + { + Index: 0, + Value: 2, + Op: configs.EqualTo, + }, + }, + }, + }, + } + + buffers, exitCode, err := runContainer(config, "", "ls", "/does_not_exist") + if err == nil { + t.Fatalf("Expecting error return, instead got 0") + } + if exitCode == 0 { + t.Fatalf("Busybox should fail with negative exit code, instead got %d!", exitCode) + } + // Verify nothing was printed + if len(buffers.Stderr.String()) != 0 { + t.Fatalf("Something was written to stderr, write call succeeded!\n") + } +} diff --git a/sysbox-runc/libcontainer/integration/template_test.go b/sysbox-runc/libcontainer/integration/template_test.go new file mode 100644 index 00000000..c1fbdab3 --- /dev/null +++ b/sysbox-runc/libcontainer/integration/template_test.go @@ -0,0 +1,195 @@ +package integration + +import ( + "math/rand" + "strconv" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + 
"github.com/opencontainers/runc/libcontainer/specconv" + "golang.org/x/sys/unix" +) + +var standardEnvironment = []string{ + "HOME=/root", + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "HOSTNAME=integration", + "TERM=xterm", +} + +const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV + +type tParam struct { + rootfs string + userns bool + systemd bool +} + +var linuxCaps = []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + "CAP_DAC_READ_SEARCH", + "CAP_LINUX_IMMUTABLE", + "CAP_NET_BROADCAST", + "CAP_NET_ADMIN", + "CAP_IPC_LOCK", + "CAP_IPC_OWNER", + "CAP_SYS_MODULE", + "CAP_SYS_RAWIO", + "CAP_SYS_PTRACE", + "CAP_SYS_PACCT", + "CAP_SYS_ADMIN", + "CAP_SYS_BOOT", + "CAP_SYS_NICE", + "CAP_SYS_RESOURCE", + "CAP_SYS_TIME", + "CAP_SYS_TTY_CONFIG", + "CAP_LEASE", + "CAP_AUDIT_CONTROL", + "CAP_MAC_OVERRIDE", + "CAP_MAC_ADMIN", + "CAP_SYSLOG", + "CAP_WAKE_ALARM", + "CAP_BLOCK_SUSPEND", + "CAP_AUDIT_READ", +} + +// newTemplateConfig returns a base template for running a container +// +// it uses a network strategy of just setting a loopback interface +// and the default setup for devices +func newTemplateConfig(p *tParam) *configs.Config { + var allowedDevices []*devices.Rule + for _, device := range specconv.AllowedDevices { + allowedDevices = append(allowedDevices, &device.Rule) + } + config := &configs.Config{ + Rootfs: p.rootfs, + Capabilities: &configs.Capabilities{ + Bounding: linuxCaps, + Effective: linuxCaps, + Inheritable: linuxCaps, + Permitted: linuxCaps, + Ambient: linuxCaps, + }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWNET}, + {Type: configs.NEWCGROUP}, + }), + + Cgroups: 
&configs.Cgroup{ + Resources: &configs.Resources{ + MemorySwappiness: nil, + Devices: allowedDevices, + }, + }, + MaskPaths: []string{ + "/proc/kcore", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + // sysbox-runc: /proc/sys is never read-only in sysbox containers + "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + }, + Devices: specconv.AllowedDevices, + Hostname: "integration", + Mounts: []*configs.Mount{ + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: unix.MS_NOSUID | unix.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: unix.MS_NOSUID | unix.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + /* + CI is broken on the debian based kernels with this + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + */ + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | unix.MS_RDONLY, + }, + }, + Networks: []*configs.Network{ + { + Type: "loopback", + Address: "127.0.0.1/0", + Gateway: "localhost", + }, + }, + Rlimits: []configs.Rlimit{ + { + Type: unix.RLIMIT_NOFILE, + Hard: uint64(1025), + Soft: uint64(1025), + }, + }, + } + + if p.userns { + config.UidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.GidMappings = []configs.IDMap{{HostID: 0, ContainerID: 0, Size: 1000}} + config.Namespaces = append(config.Namespaces, configs.Namespace{Type: configs.NEWUSER}) + } else { + config.Mounts = append(config.Mounts, &configs.Mount{ + Destination: "/sys/fs/cgroup", + Device: "cgroup", + Flags: defaultMountFlags | unix.MS_RDONLY, + }) + } + + if p.systemd { + id := strconv.FormatUint(rand.Uint64(), 36) + 
config.Cgroups.Name = "test" + id + // do not change Parent (see newContainerWithName) + config.Cgroups.Parent = "system.slice" + config.Cgroups.ScopePrefix = "runc-test" + } else { + config.Cgroups.Path = "/test/integration" + } + + return config +} diff --git a/sysbox-runc/libcontainer/integration/utils_test.go b/sysbox-runc/libcontainer/integration/utils_test.go new file mode 100644 index 00000000..2f501f4b --- /dev/null +++ b/sysbox-runc/libcontainer/integration/utils_test.go @@ -0,0 +1,186 @@ +package integration + +import ( + "bytes" + "crypto/md5" + "encoding/hex" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "syscall" + "testing" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func ptrInt(v int) *int { + return &v +} + +func newStdBuffers() *stdBuffers { + return &stdBuffers{ + Stdin: bytes.NewBuffer(nil), + Stdout: bytes.NewBuffer(nil), + Stderr: bytes.NewBuffer(nil), + } +} + +type stdBuffers struct { + Stdin *bytes.Buffer + Stdout *bytes.Buffer + Stderr *bytes.Buffer +} + +func (b *stdBuffers) String() string { + s := []string{} + if b.Stderr != nil { + s = append(s, b.Stderr.String()) + } + if b.Stdout != nil { + s = append(s, b.Stdout.String()) + } + return strings.Join(s, "|") +} + +// ok fails the test if an err is not nil. 
+func ok(t testing.TB, err error) { + if err != nil { + _, file, line, _ := runtime.Caller(1) + t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error()) + } +} + +func waitProcess(p *libcontainer.Process, t *testing.T) { + _, file, line, _ := runtime.Caller(1) + status, err := p.Wait() + + if err != nil { + t.Fatalf("%s:%d: unexpected error: %s\n\n", filepath.Base(file), line, err.Error()) + } + + if !status.Success() { + t.Fatalf("%s:%d: unexpected status: %s\n\n", filepath.Base(file), line, status.String()) + } +} + +func newTestRoot() (string, error) { + dir, err := ioutil.TempDir("", "libcontainer") + if err != nil { + return "", err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return "", err + } + testRoots = append(testRoots, dir) + return dir, nil +} + +func newTestBundle() (string, error) { + dir, err := ioutil.TempDir("", "bundle") + if err != nil { + return "", err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return "", err + } + return dir, nil +} + +// newRootfs creates a new tmp directory and copies the busybox root filesystem +func newRootfs() (string, error) { + dir, err := ioutil.TempDir("", "") + if err != nil { + return "", err + } + if err := os.MkdirAll(dir, 0700); err != nil { + return "", err + } + if err := copyBusybox(dir); err != nil { + return "", err + } + return dir, nil +} + +func remove(dir string) { + os.RemoveAll(dir) +} + +// copyBusybox copies the rootfs for a busybox container created for the test image +// into the new directory for the specific test +func copyBusybox(dest string) error { + out, err := exec.Command("sh", "-c", fmt.Sprintf("cp -a /busybox/* %s/", dest)).CombinedOutput() + if err != nil { + return fmt.Errorf("copy error %q: %q", err, out) + } + return nil +} + +func newContainer(config *configs.Config) (libcontainer.Container, error) { + h := md5.New() + h.Write([]byte(time.Now().String())) + return newContainerWithName(hex.EncodeToString(h.Sum(nil)), config) +} + 
+func newContainerWithName(name string, config *configs.Config) (libcontainer.Container, error) { + root, err := newTestRoot() + if err != nil { + return nil, err + } + f, err := libcontainer.New(root, libcontainer.Cgroupfs) + if err != nil { + return nil, err + } + if config.Cgroups != nil && config.Cgroups.Parent == "system.slice" { + f, err = libcontainer.New(root, libcontainer.SystemdCgroups) + if err != nil { + return nil, err + } + } + return f.Create(name, config) +} + +// runContainer runs the container with the specific config and arguments +// +// buffers are returned containing the STDOUT and STDERR output for the run +// along with the exit code and any go error +func runContainer(config *configs.Config, console string, args ...string) (buffers *stdBuffers, exitCode int, err error) { + container, err := newContainer(config) + if err != nil { + return nil, -1, err + } + defer container.Destroy() + buffers = newStdBuffers() + process := &libcontainer.Process{ + Cwd: "/", + Args: args, + Env: standardEnvironment, + Stdin: buffers.Stdin, + Stdout: buffers.Stdout, + Stderr: buffers.Stderr, + Init: true, + } + + err = container.Run(process) + if err != nil { + return buffers, -1, err + } + ps, err := process.Wait() + if err != nil { + return buffers, -1, err + } + status := ps.Sys().(syscall.WaitStatus) + if status.Exited() { + exitCode = status.ExitStatus() + } else if status.Signaled() { + exitCode = -int(status.Signal()) + } else { + return buffers, -1, err + } + return +} diff --git a/sysbox-runc/libcontainer/intelrdt/cmt.go b/sysbox-runc/libcontainer/intelrdt/cmt.go new file mode 100644 index 00000000..ed950973 --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/cmt.go @@ -0,0 +1,25 @@ +package intelrdt + +var ( + cmtEnabled bool +) + +// Check if Intel RDT/CMT is enabled. 
+func IsCMTEnabled() bool { + featuresInit() + return cmtEnabled +} + +func getCMTNumaNodeStats(numaPath string) (*CMTNumaNodeStats, error) { + stats := &CMTNumaNodeStats{} + + if enabledMonFeatures.llcOccupancy { + llcOccupancy, err := getIntelRdtParamUint(numaPath, "llc_occupancy") + if err != nil { + return nil, err + } + stats.LLCOccupancy = llcOccupancy + } + + return stats, nil +} diff --git a/sysbox-runc/libcontainer/intelrdt/cmt_test.go b/sysbox-runc/libcontainer/intelrdt/cmt_test.go new file mode 100644 index 00000000..e061695e --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/cmt_test.go @@ -0,0 +1,56 @@ +package intelrdt + +import ( + "os" + "path/filepath" + "testing" +) + +func TestGetCMTNumaNodeStats(t *testing.T) { + mocksNUMANodesToCreate := []string{"mon_l3_00", "mon_l3_01"} + + mocksFilesToCreate := map[string]uint64{ + "llc_occupancy": 9123911, + } + + mockedL3_MON, err := mockResctrlL3_MON(mocksNUMANodesToCreate, mocksFilesToCreate) + + defer func() { + err := os.RemoveAll(mockedL3_MON) + if err != nil { + t.Fatal(err) + } + }() + + if err != nil { + t.Fatal(err) + } + + t.Run("Gather mbm", func(t *testing.T) { + enabledMonFeatures.llcOccupancy = true + + stats := make([]CMTNumaNodeStats, 0, len(mocksNUMANodesToCreate)) + for _, numa := range mocksNUMANodesToCreate { + other, err := getCMTNumaNodeStats(filepath.Join(mockedL3_MON, "mon_data", numa)) + if err != nil { + t.Fatal(err) + } + stats = append(stats, *other) + } + + expectedStats := CMTNumaNodeStats{ + LLCOccupancy: mocksFilesToCreate["llc_occupancy"], + } + + checkCMTStatCorrection(stats[0], expectedStats, t) + checkCMTStatCorrection(stats[1], expectedStats, t) + }) +} + +func checkCMTStatCorrection(got CMTNumaNodeStats, expected CMTNumaNodeStats, t *testing.T) { + if got.LLCOccupancy != expected.LLCOccupancy { + t.Fatalf("Wrong value of `llc_occupancy`. 
Expected: %v but got: %v", + expected.LLCOccupancy, + got.LLCOccupancy) + } +} diff --git a/sysbox-runc/libcontainer/intelrdt/intelrdt.go b/sysbox-runc/libcontainer/intelrdt/intelrdt.go new file mode 100644 index 00000000..75d37c20 --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/intelrdt.go @@ -0,0 +1,816 @@ +// +build linux + +package intelrdt + +import ( + "bufio" + "bytes" + "fmt" + "io" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* + * About Intel RDT features: + * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). + * Cache Allocation Technology (CAT) and Memory Bandwidth Allocation (MBA) are + * two sub-features of RDT. + * + * Cache Allocation Technology (CAT) provides a way for the software to restrict + * cache allocation to a defined 'subset' of L3 cache which may be overlapping + * with other 'subsets'. The different subsets are identified by class of + * service (CLOS) and each CLOS has a capacity bitmask (CBM). + * + * Memory Bandwidth Allocation (MBA) provides indirect and approximate throttle + * over memory bandwidth for the software. A user controls the resource by + * indicating the percentage of maximum memory bandwidth or memory bandwidth + * limit in MBps unit if MBA Software Controller is enabled. + * + * More details about Intel RDT CAT and MBA can be found in the section 17.18 + * of Intel Software Developer Manual: + * https://software.intel.com/en-us/articles/intel-sdm + * + * About Intel RDT kernel interface: + * In Linux 4.10 kernel or newer, the interface is defined and exposed via + * "resource control" filesystem, which is a "cgroup-like" interface. + * + * Comparing with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. 
+ * + * CAT and MBA features are introduced in Linux 4.10 and 4.12 kernel via + * "resource control" filesystem. + * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | | |-- cbm_mask + * | | |-- min_cbm_bits + * | | |-- num_closids + * | |-- L3_MON + * | | |-- max_threshold_occupancy + * | | |-- mon_features + * | | |-- num_rmids + * | |-- MB + * | |-- bandwidth_gran + * | |-- delay_linear + * | |-- min_bandwidth + * | |-- num_closids + * |-- ... + * |-- schemata + * |-- tasks + * |-- + * |-- ... + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 + * cache and memory bandwidth resources constraints. + * + * The file `tasks` has a list of tasks that belongs to this group (e.g., + * " group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. + * + * The file `schemata` has a list of all the resources available to this group. + * Each resource (L3 cache, memory bandwidth) has its own line and format. + * + * L3 cache schema: + * It has allocation bitmasks/values for L3 cache on each socket, which + * contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:=;=;..." + * For example, on a two-socket machine, the schema line could be "L3:0=ff;1=c0" + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can + * be set is less than the max bit. The max bits in the CBM is varied among + * supported Intel CPU models. Kernel will check if it is valid when writing. 
+ * e.g., default value 0xfffff in root indicates the max bits of CBM is 20 + * bits, which mapping to entire L3 cache capacity. Some valid CBM values to + * set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * Memory bandwidth schema: + * It has allocation values for memory bandwidth on each socket, which contains + * L3 cache id and memory bandwidth. + * Format: "MB:=bandwidth0;=bandwidth1;..." + * For example, on a two-socket machine, the schema line could be "MB:0=20;1=70" + * + * The minimum bandwidth percentage value for each CPU model is predefined and + * can be looked up through "info/MB/min_bandwidth". The bandwidth granularity + * that is allocated is also dependent on the CPU model and can be looked up at + * "info/MB/bandwidth_gran". The available bandwidth control steps are: + * min_bw + N * bw_gran. Intermediate values are rounded to the next control + * step available on the hardware. + * + * If MBA Software Controller is enabled through mount option "-o mba_MBps": + * mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + * We could specify memory bandwidth in "MBps" (Mega Bytes per second) unit + * instead of "percentages". The kernel underneath would use a software feedback + * mechanism or a "Software Controller" which reads the actual bandwidth using + * MBM counters and adjust the memory bandwidth percentages to ensure: + * "actual memory bandwidth < user specified memory bandwidth". + * + * For example, on a two-socket machine, the schema line could be + * "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on socket 0 + * and 7000 MBps memory bandwidth limit on socket 1. 
+ * + * For more information about Intel RDT kernel interface: + * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + * + * An example for runc: + * Consider a two-socket machine with two L3 caches where the default CBM is + * 0x7ff and the max CBM length is 11 bits, and minimum memory bandwidth of 10% + * with a memory bandwidth granularity of 10%. + * + * Tasks inside the container only have access to the "upper" 7/11 of L3 cache + * on socket 0 and the "lower" 5/11 L3 cache on socket 1, and may use a + * maximum memory bandwidth of 20% on socket 0 and 70% on socket 1. + * + * "linux": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=7f0;1=1f", + * "memBwSchema": "MB:0=20;1=70" + * } + * } + */ + +type Manager interface { + // Applies Intel RDT configuration to the process with the specified pid + Apply(pid int) error + + // Returns statistics for Intel RDT + GetStats() (*Stats, error) + + // Destroys the Intel RDT 'container_id' group + Destroy() error + + // Returns Intel RDT path to save in a state file and to be able to + // restore the object later + GetPath() string + + // Set Intel RDT "resource control" filesystem as configured. 
+ Set(container *configs.Config) error +} + +// This implements interface Manager +type intelRdtManager struct { + mu sync.Mutex + config *configs.Config + id string + path string +} + +func NewManager(config *configs.Config, id string, path string) Manager { + return &intelRdtManager{ + config: config, + id: id, + path: path, + } +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + // The absolute root path of the Intel RDT "resource control" filesystem + intelRdtRoot string + intelRdtRootLock sync.Mutex + + // The flag to indicate if Intel RDT/CAT is enabled + catEnabled bool + // The flag to indicate if Intel RDT/MBA is enabled + mbaEnabled bool + // The flag to indicate if Intel RDT/MBA Software Controller is enabled + mbaScEnabled bool + + // For Intel RDT initialization + initOnce sync.Once +) + +type intelRdtData struct { + root string + config *configs.Config + pid int +} + +// Check if Intel RDT sub-features are enabled in featuresInit() +func featuresInit() { + initOnce.Do(func() { + // 1. Check if hardware and kernel support Intel RDT sub-features + flagsSet, err := parseCpuInfoFile("/proc/cpuinfo") + if err != nil { + return + } + + // 2. Check if Intel RDT "resource control" filesystem is mounted + // The user guarantees to mount the filesystem + if !isIntelRdtMounted() { + return + } + + // 3. Double check if Intel RDT sub-features are available in + // "resource control" filesystem. 
Intel RDT sub-features can be + // selectively disabled or enabled by kernel command line + // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel + if flagsSet.CAT { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil { + catEnabled = true + } + } + if mbaScEnabled { + // We confirm MBA Software Controller is enabled in step 2, + // MBA should be enabled because MBA Software Controller + // depends on MBA + mbaEnabled = true + } else if flagsSet.MBA { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil { + mbaEnabled = true + } + } + if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT { + if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3_MON")); err != nil { + return + } + enabledMonFeatures, err = getMonFeatures(intelRdtRoot) + if err != nil { + return + } + if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes { + mbmEnabled = true + } + if enabledMonFeatures.llcOccupancy { + cmtEnabled = true + } + } + }) +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir(f io.Reader) (string, error) { + mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) { + // similar to mountinfo.FSTypeFilter but stops after the first match + if m.FSType == "resctrl" { + return false, true // don't skip, stop + } + return true, false // skip, keep going + }) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", NewNotFoundError("Intel RDT") + } + + // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" + if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") { + mbaScEnabled = true + } + + return mi[0].Mountpoint, nil +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + intelRdtRootLock.Lock() + defer intelRdtRootLock.Unlock() + + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + f, err := 
os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + root, err := findIntelRdtMountpointDir(f) + f.Close() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + return err == nil +} + +type cpuInfoFlags struct { + CAT bool // Cache Allocation Technology + MBA bool // Memory Bandwidth Allocation + + // Memory Bandwidth Monitoring related. + MBMTotal bool + MBMLocal bool + + CMT bool // Cache Monitoring Technology +} + +func parseCpuInfoFile(path string) (cpuInfoFlags, error) { + infoFlags := cpuInfoFlags{} + + f, err := os.Open(path) + if err != nil { + return infoFlags, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + line := s.Text() + + // Search "cat_l3" and "mba" flags in first "flags" line + if strings.HasPrefix(line, "flags") { + flags := strings.Split(line, " ") + // "cat_l3" flag for CAT and "mba" flag for MBA + for _, flag := range flags { + switch flag { + case "cat_l3": + infoFlags.CAT = true + case "mba": + infoFlags.MBA = true + case "cqm_mbm_total": + infoFlags.MBMTotal = true + case "cqm_mbm_local": + infoFlags.MBMLocal = true + case "cqm_occup_llc": + infoFlags.CMT = true + } + } + return infoFlags, nil + } + } + if err := s.Err(); err != nil { + return infoFlags, err + } + + return infoFlags, nil +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. 
Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Gets a single uint64 value from the specified file. +func getIntelRdtParamUint(path, file string) (uint64, error) { + fileName := filepath.Join(path, file) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(string(bytes.TrimSpace(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified file +func getIntelRdtParamString(path, file string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(path, file)) + if err != nil { + return "", err + } + + return string(bytes.TrimSpace(contents)), nil +} + +func writeFile(dir, file, data string) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + }, nil +} + +// Get the read-only L3 cache information +func getL3CacheInfo() (*L3CacheInfo, error) { + l3CacheInfo := &L3CacheInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return l3CacheInfo, err + } + + path := filepath.Join(rootPath, "info", "L3") + cbmMask, err := getIntelRdtParamString(path, "cbm_mask") + if err != nil { + return l3CacheInfo, err + } + minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits") + if err != nil { + return 
l3CacheInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return l3CacheInfo, err + } + + l3CacheInfo.CbmMask = cbmMask + l3CacheInfo.MinCbmBits = minCbmBits + l3CacheInfo.NumClosids = numClosids + + return l3CacheInfo, nil +} + +// Get the read-only memory bandwidth information +func getMemBwInfo() (*MemBwInfo, error) { + memBwInfo := &MemBwInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return memBwInfo, err + } + + path := filepath.Join(rootPath, "info", "MB") + bandwidthGran, err := getIntelRdtParamUint(path, "bandwidth_gran") + if err != nil { + return memBwInfo, err + } + delayLinear, err := getIntelRdtParamUint(path, "delay_linear") + if err != nil { + return memBwInfo, err + } + minBandwidth, err := getIntelRdtParamUint(path, "min_bandwidth") + if err != nil { + return memBwInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return memBwInfo, err + } + + memBwInfo.BandwidthGran = bandwidthGran + memBwInfo.DelayLinear = delayLinear + memBwInfo.MinBandwidth = minBandwidth + memBwInfo.NumClosids = numClosids + + return memBwInfo, nil +} + +// Get diagnostics for last filesystem operation error from file info/last_cmd_status +func getLastCmdStatus() (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, "info") + lastCmdStatus, err := getIntelRdtParamString(path, "last_cmd_status") + if err != nil { + return "", err + } + + return lastCmdStatus, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Don't attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to 
%v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +// Check if Intel RDT/CAT is enabled +func IsCATEnabled() bool { + featuresInit() + return catEnabled +} + +// Check if Intel RDT/MBA is enabled +func IsMBAEnabled() bool { + featuresInit() + return mbaEnabled +} + +// Check if Intel RDT/MBA Software Controller is enabled +func IsMBAScEnabled() bool { + featuresInit() + return mbaScEnabled +} + +// Get the 'container_id' path in Intel RDT "resource control" filesystem +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Applies Intel RDT configuration to the process with the specified pid +func (m *intelRdtManager) Apply(pid int) (err error) { + // If intelRdt is not specified in config, we do nothing + if m.config.IntelRdt == nil { + return nil + } + d, err := getIntelRdtData(m.config, pid) + if err != nil && !IsNotFound(err) { + return err + } + + m.mu.Lock() + defer m.mu.Unlock() + path, err := d.join(m.id) + if err != nil { + return err + } + + m.path = path + return nil +} + +// Destroys the Intel RDT 'container_id' group +func (m *intelRdtManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.GetPath()); err != nil { + return err + } + m.path = "" + return nil +} + +// Returns Intel RDT path to save in a state file and to be able to +// restore the object later +func (m *intelRdtManager) GetPath() string { + if m.path == "" { + m.path, _ = GetIntelRdtPath(m.id) + } + return m.path +} + +// Returns statistics for Intel RDT +func (m *intelRdtManager) GetStats() (*Stats, error) { + // If intelRdt is not specified in config + if m.config.IntelRdt == nil { + return nil, nil + } + + m.mu.Lock() + defer m.mu.Unlock() + stats := NewStats() + + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + // The read-only L3 cache and memory bandwidth schemata in root + 
tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") + if err != nil { + return nil, err + } + schemaRootStrings := strings.Split(tmpRootStrings, "\n") + + // The L3 cache and memory bandwidth schemata in 'container_id' group + containerPath := m.GetPath() + tmpStrings, err := getIntelRdtParamString(containerPath, "schemata") + if err != nil { + return nil, err + } + schemaStrings := strings.Split(tmpStrings, "\n") + + if IsCATEnabled() { + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "L3") { + stats.L3CacheSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The L3 cache schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "L3") { + stats.L3CacheSchema = strings.TrimSpace(schema) + } + } + } + + if IsMBAEnabled() { + // The read-only memory bandwidth information + memBwInfo, err := getMemBwInfo() + if err != nil { + return nil, err + } + stats.MemBwInfo = memBwInfo + + // The read-only memory bandwidth information + for _, schemaRoot := range schemaRootStrings { + if strings.Contains(schemaRoot, "MB") { + stats.MemBwSchemaRoot = strings.TrimSpace(schemaRoot) + } + } + + // The memory bandwidth schema in 'container_id' group + for _, schema := range schemaStrings { + if strings.Contains(schema, "MB") { + stats.MemBwSchema = strings.TrimSpace(schema) + } + } + } + + if IsMBMEnabled() || IsCMTEnabled() { + err = getMonitoringStats(containerPath, stats) + if err != nil { + return nil, err + } + } + + return stats, nil +} + +// Set Intel RDT "resource control" filesystem as configured. 
+func (m *intelRdtManager) Set(container *configs.Config) error { + // About L3 cache schema: + // It has allocation bitmasks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." + // For example, on a two-socket machine, the schema line could be: + // L3:0=ff;1=c0 + // which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM + // is 0xc0. + // + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel CPU models. Kernel will check + // if it is valid when writing. e.g., default value 0xfffff in root + // indicates the max bits of CBM is 20 bits, which mapping to entire + // L3 cache capacity. Some valid CBM values to set in a group: + // 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + // + // + // About memory bandwidth schema: + // It has allocation values for memory bandwidth on each socket, which + // contains L3 cache id and memory bandwidth. + // Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..." + // For example, on a two-socket machine, the schema line could be: + // "MB:0=20;1=70" + // + // The minimum bandwidth percentage value for each CPU model is + // predefined and can be looked up through "info/MB/min_bandwidth". + // The bandwidth granularity that is allocated is also dependent on + // the CPU model and can be looked up at "info/MB/bandwidth_gran". + // The available bandwidth control steps are: min_bw + N * bw_gran. + // Intermediate values are rounded to the next control step available + // on the hardware. + // + // If MBA Software Controller is enabled through mount option + // "-o mba_MBps": mount -t resctrl resctrl -o mba_MBps /sys/fs/resctrl + // We could specify memory bandwidth in "MBps" (Mega Bytes per second) + // unit instead of "percentages". 
The kernel underneath would use a + // software feedback mechanism or a "Software Controller" which reads + // the actual bandwidth using MBM counters and adjust the memory + // bandwidth percentages to ensure: + // "actual memory bandwidth < user specified memory bandwidth". + // + // For example, on a two-socket machine, the schema line could be + // "MB:0=5000;1=7000" which means 5000 MBps memory bandwidth limit on + // socket 0 and 7000 MBps memory bandwidth limit on socket 1. + if container.IntelRdt != nil { + path := m.GetPath() + l3CacheSchema := container.IntelRdt.L3CacheSchema + memBwSchema := container.IntelRdt.MemBwSchema + + // Write a single joint schema string to schemata file + if l3CacheSchema != "" && memBwSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil { + return NewLastCmdError(err) + } + } + + // Write only L3 cache schema string to schemata file + if l3CacheSchema != "" && memBwSchema == "" { + if err := writeFile(path, "schemata", l3CacheSchema); err != nil { + return NewLastCmdError(err) + } + } + + // Write only memory bandwidth schema string to schemata file + if l3CacheSchema == "" && memBwSchema != "" { + if err := writeFile(path, "schemata", memBwSchema); err != nil { + return NewLastCmdError(err) + } + } + } + + return nil +} + +func (raw *intelRdtData) join(id string) (string, error) { + path := filepath.Join(raw.root, id) + if err := os.MkdirAll(path, 0755); err != nil { + return "", NewLastCmdError(err) + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", NewLastCmdError(err) + } + return path, nil +} + +type NotFoundError struct { + ResourceControl string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) +} + +func NewNotFoundError(res string) error { + return &NotFoundError{ + ResourceControl: res, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := 
err.(*NotFoundError) + return ok +} + +type LastCmdError struct { + LastCmdStatus string + Err error +} + +func (e *LastCmdError) Error() string { + return e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus +} + +func NewLastCmdError(err error) error { + lastCmdStatus, err1 := getLastCmdStatus() + if err1 == nil { + return &LastCmdError{ + LastCmdStatus: lastCmdStatus, + Err: err, + } + } + return err +} diff --git a/sysbox-runc/libcontainer/intelrdt/intelrdt_test.go b/sysbox-runc/libcontainer/intelrdt/intelrdt_test.go new file mode 100644 index 00000000..485442d7 --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/intelrdt_test.go @@ -0,0 +1,252 @@ +// +build linux + +package intelrdt + +import ( + "io" + "strings" + "testing" +) + +func TestIntelRdtSetL3CacheSchema(t *testing.T) { + if !IsCATEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + l3CacheSchemaBefore = "L3:0=f;1=f0" + l3CacheSchemeAfter = "L3:0=f0;1=f" + ) + + helper.writeFileContents(map[string]string{ + "schemata": l3CacheSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter + intelrdt := NewManager(helper.IntelRdtData.config, "", helper.IntelRdtPath) + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != l3CacheSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} + +func TestIntelRdtSetMemBwSchema(t *testing.T) { + if !IsMBAEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + memBwSchemaBefore = "MB:0=20;1=70" + memBwSchemeAfter = "MB:0=70;1=20" + ) + + helper.writeFileContents(map[string]string{ + "schemata": memBwSchemaBefore + "\n", + }) + + 
helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwSchemeAfter + intelrdt := NewManager(helper.IntelRdtData.config, "", helper.IntelRdtPath) + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != memBwSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} + +func TestIntelRdtSetMemBwScSchema(t *testing.T) { + if !IsMBAScEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + memBwScSchemaBefore = "MB:0=5000;1=7000" + memBwScSchemeAfter = "MB:0=9000;1=4000" + ) + + helper.writeFileContents(map[string]string{ + "schemata": memBwScSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.MemBwSchema = memBwScSchemeAfter + intelrdt := NewManager(helper.IntelRdtData.config, "", helper.IntelRdtPath) + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != memBwScSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} + +const ( + mountinfoValid = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw +19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755 +21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw +23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts 
rw,gid=5,mode=620,ptmxmode=000 +24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755 +25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755 +26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event +29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu +30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory +31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices +32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb +33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio +34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids +35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer +37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls +38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw +40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw +41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw +42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw +43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 
rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492 +44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw +45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered +46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered +47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw +125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755 +123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009` + + mountinfoMbaSc = `18 40 0:18 / /sys rw,nosuid,nodev,noexec,relatime shared:6 - sysfs sysfs rw +19 40 0:3 / /proc rw,nosuid,nodev,noexec,relatime shared:5 - proc proc rw +20 40 0:5 / /dev rw,nosuid shared:2 - devtmpfs devtmpfs rw,size=131927256k,nr_inodes=32981814,mode=755 +21 18 0:17 / /sys/kernel/security rw,nosuid,nodev,noexec,relatime shared:7 - securityfs securityfs rw +22 20 0:19 / /dev/shm rw,nosuid,nodev shared:3 - tmpfs tmpfs rw +23 20 0:12 / /dev/pts rw,nosuid,noexec,relatime shared:4 - devpts devpts rw,gid=5,mode=620,ptmxmode=000 +24 40 0:20 / /run rw,nosuid,nodev shared:22 - tmpfs tmpfs rw,mode=755 +25 18 0:21 / /sys/fs/cgroup ro,nosuid,nodev,noexec shared:8 - tmpfs tmpfs ro,mode=755 +26 25 0:22 / /sys/fs/cgroup/systemd rw,nosuid,nodev,noexec,relatime shared:9 - cgroup cgroup rw,xattr,release_agent=/usr/lib/systemd/systemd-cgroups-agent,name=systemd +27 18 0:23 / /sys/fs/pstore rw,nosuid,nodev,noexec,relatime shared:20 - pstore pstore rw +28 25 0:24 / /sys/fs/cgroup/perf_event rw,nosuid,nodev,noexec,relatime shared:10 - cgroup cgroup rw,perf_event +29 25 0:25 / /sys/fs/cgroup/cpu,cpuacct 
rw,nosuid,nodev,noexec,relatime shared:11 - cgroup cgroup rw,cpuacct,cpu +30 25 0:26 / /sys/fs/cgroup/memory rw,nosuid,nodev,noexec,relatime shared:12 - cgroup cgroup rw,memory +31 25 0:27 / /sys/fs/cgroup/devices rw,nosuid,nodev,noexec,relatime shared:13 - cgroup cgroup rw,devices +32 25 0:28 / /sys/fs/cgroup/hugetlb rw,nosuid,nodev,noexec,relatime shared:14 - cgroup cgroup rw,hugetlb +33 25 0:29 / /sys/fs/cgroup/blkio rw,nosuid,nodev,noexec,relatime shared:15 - cgroup cgroup rw,blkio +34 25 0:30 / /sys/fs/cgroup/pids rw,nosuid,nodev,noexec,relatime shared:16 - cgroup cgroup rw,pids +35 25 0:31 / /sys/fs/cgroup/cpuset rw,nosuid,nodev,noexec,relatime shared:17 - cgroup cgroup rw,cpuset +36 25 0:32 / /sys/fs/cgroup/freezer rw,nosuid,nodev,noexec,relatime shared:18 - cgroup cgroup rw,freezer +37 25 0:33 / /sys/fs/cgroup/net_cls,net_prio rw,nosuid,nodev,noexec,relatime shared:19 - cgroup cgroup rw,net_prio,net_cls +38 18 0:34 / /sys/kernel/config rw,relatime shared:21 - configfs configfs rw +40 0 253:0 / / rw,relatime shared:1 - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +16 18 0:6 / /sys/kernel/debug rw,relatime shared:23 - debugfs debugfs rw +41 18 0:16 / /sys/fs/resctrl rw,relatime shared:24 - resctrl resctrl rw,mba_MBps +42 20 0:36 / /dev/hugepages rw,relatime shared:25 - hugetlbfs hugetlbfs rw +43 19 0:37 / /proc/sys/fs/binfmt_misc rw,relatime shared:26 - autofs systemd-1 rw,fd=32,pgrp=1,timeout=0,minproto=5,maxproto=5,direct,pipe_ino=35492 +44 20 0:15 / /dev/mqueue rw,relatime shared:27 - mqueue mqueue rw +45 40 8:1 / /boot rw,relatime shared:28 - ext4 /dev/sda1 rw,stripe=4,data=ordered +46 40 253:1 / /home rw,relatime shared:29 - ext4 /dev/mapper/vvhg-vvhg rw,data=ordered +47 40 0:38 / /var/lib/nfs/rpc_pipefs rw,relatime shared:30 - rpc_pipefs sunrpc rw +125 24 0:20 /mesos/containers /run/mesos/containers rw,nosuid shared:22 - tmpfs tmpfs rw,mode=755 +123 40 253:0 /var/lib/docker/containers /var/lib/docker/containers rw,relatime - ext4 /dev/mapper/vvrg-vvrg 
rw,data=ordered +129 40 253:0 /var/lib/docker/overlay2 /var/lib/docker/overlay2 rw,relatime - ext4 /dev/mapper/vvrg-vvrg rw,data=ordered +119 24 0:39 / /run/user/1009 rw,nosuid,nodev,relatime shared:100 - tmpfs tmpfs rw,size=26387788k,mode=700,uid=1009,gid=1009` +) + +func TestFindIntelRdtMountpointDir(t *testing.T) { + testCases := []struct { + name string + input io.Reader + isNotFoundError bool + isError bool + mbaScEnabled bool + mountpoint string + }{ + { + name: "Valid mountinfo with MBA Software Controller disabled", + input: strings.NewReader(mountinfoValid), + mountpoint: "/sys/fs/resctrl", + }, + { + name: "Valid mountinfo with MBA Software Controller enabled", + input: strings.NewReader(mountinfoMbaSc), + mbaScEnabled: true, + mountpoint: "/sys/fs/resctrl", + }, + { + name: "Empty mountinfo", + input: strings.NewReader(""), + isNotFoundError: true, + }, + { + name: "Broken mountinfo", + input: strings.NewReader("baa"), + isError: true, + }, + } + + t.Parallel() + for _, tc := range testCases { + tc := tc + t.Run(tc.name, func(t *testing.T) { + mp, err := findIntelRdtMountpointDir(tc.input) + if tc.isNotFoundError { + if !IsNotFound(err) { + t.Errorf("expected IsNotFound error, got %+v", err) + } + return + } + if tc.isError { + if err == nil { + t.Error("expected error, got nil") + } + return + } + if err != nil { + t.Errorf("expected nil, got %+v", err) + return + } + // no errors, check the results + if tc.mbaScEnabled != mbaScEnabled { + t.Errorf("expected mbaScEnabled=%v, got %v", + tc.mbaScEnabled, mbaScEnabled) + } + if tc.mountpoint != mp { + t.Errorf("expected mountpoint=%q, got %q", + tc.mountpoint, mp) + } + }) + } +} diff --git a/sysbox-runc/libcontainer/intelrdt/mbm.go b/sysbox-runc/libcontainer/intelrdt/mbm.go new file mode 100644 index 00000000..93063ee0 --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/mbm.go @@ -0,0 +1,35 @@ +// +build linux + +package intelrdt + +var ( + // The flag to indicate if Intel RDT/MBM is enabled + 
mbmEnabled bool +) + +// Check if Intel RDT/MBM is enabled. +func IsMBMEnabled() bool { + featuresInit() + return mbmEnabled +} + +func getMBMNumaNodeStats(numaPath string) (*MBMNumaNodeStats, error) { + stats := &MBMNumaNodeStats{} + if enabledMonFeatures.mbmTotalBytes { + mbmTotalBytes, err := getIntelRdtParamUint(numaPath, "mbm_total_bytes") + if err != nil { + return nil, err + } + stats.MBMTotalBytes = mbmTotalBytes + } + + if enabledMonFeatures.mbmLocalBytes { + mbmLocalBytes, err := getIntelRdtParamUint(numaPath, "mbm_local_bytes") + if err != nil { + return nil, err + } + stats.MBMLocalBytes = mbmLocalBytes + } + + return stats, nil +} diff --git a/sysbox-runc/libcontainer/intelrdt/mbm_test.go b/sysbox-runc/libcontainer/intelrdt/mbm_test.go new file mode 100644 index 00000000..ae0f202c --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/mbm_test.go @@ -0,0 +1,68 @@ +// +build linux + +package intelrdt + +import ( + "os" + "path/filepath" + "testing" +) + +func TestGetMBMNumaNodeStats(t *testing.T) { + mocksNUMANodesToCreate := []string{"mon_l3_00", "mon_l3_01"} + + mocksFilesToCreate := map[string]uint64{ + "mbm_total_bytes": 9123911, + "mbm_local_bytes": 2361361, + } + + mockedL3_MON, err := mockResctrlL3_MON(mocksNUMANodesToCreate, mocksFilesToCreate) + + defer func() { + err := os.RemoveAll(mockedL3_MON) + if err != nil { + t.Fatal(err) + } + }() + + if err != nil { + t.Fatal(err) + } + + t.Run("Gather mbm", func(t *testing.T) { + enabledMonFeatures.mbmTotalBytes = true + enabledMonFeatures.mbmLocalBytes = true + + stats := make([]MBMNumaNodeStats, 0, len(mocksNUMANodesToCreate)) + for _, numa := range mocksNUMANodesToCreate { + other, err := getMBMNumaNodeStats(filepath.Join(mockedL3_MON, "mon_data", numa)) + if err != nil { + t.Fatal(err) + } + stats = append(stats, *other) + } + + expectedStats := MBMNumaNodeStats{ + MBMTotalBytes: mocksFilesToCreate["mbm_total_bytes"], + MBMLocalBytes: mocksFilesToCreate["mbm_local_bytes"], + } + + 
checkMBMStatCorrection(stats[0], expectedStats, t) + checkMBMStatCorrection(stats[1], expectedStats, t) + }) +} + +func checkMBMStatCorrection(got MBMNumaNodeStats, expected MBMNumaNodeStats, t *testing.T) { + if got.MBMTotalBytes != expected.MBMTotalBytes { + t.Fatalf("Wrong value of mbm_total_bytes. Expected: %v but got: %v", + expected.MBMTotalBytes, + got.MBMTotalBytes) + } + + if got.MBMLocalBytes != expected.MBMLocalBytes { + t.Fatalf("Wrong value of mbm_local_bytes. Expected: %v but got: %v", + expected.MBMLocalBytes, + got.MBMLocalBytes) + } + +} diff --git a/sysbox-runc/libcontainer/intelrdt/monitoring.go b/sysbox-runc/libcontainer/intelrdt/monitoring.go new file mode 100644 index 00000000..78c2f624 --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/monitoring.go @@ -0,0 +1,86 @@ +package intelrdt + +import ( + "bufio" + "io" + "io/ioutil" + "os" + "path/filepath" + + "github.com/sirupsen/logrus" +) + +var ( + enabledMonFeatures monFeatures +) + +type monFeatures struct { + mbmTotalBytes bool + mbmLocalBytes bool + llcOccupancy bool +} + +func getMonFeatures(intelRdtRoot string) (monFeatures, error) { + file, err := os.Open(filepath.Join(intelRdtRoot, "info", "L3_MON", "mon_features")) + if err != nil { + return monFeatures{}, err + } + defer file.Close() + return parseMonFeatures(file) +} + +func parseMonFeatures(reader io.Reader) (monFeatures, error) { + scanner := bufio.NewScanner(reader) + + monFeatures := monFeatures{} + + for scanner.Scan() { + switch feature := scanner.Text(); feature { + case "mbm_total_bytes": + monFeatures.mbmTotalBytes = true + case "mbm_local_bytes": + monFeatures.mbmLocalBytes = true + case "llc_occupancy": + monFeatures.llcOccupancy = true + default: + logrus.Warnf("Unsupported Intel RDT monitoring feature: %s", feature) + } + } + + return monFeatures, scanner.Err() +} + +func getMonitoringStats(containerPath string, stats *Stats) error { + numaFiles, err := ioutil.ReadDir(filepath.Join(containerPath, "mon_data")) + if err 
!= nil { + return err + } + + var mbmStats []MBMNumaNodeStats + var cmtStats []CMTNumaNodeStats + + for _, file := range numaFiles { + if file.IsDir() { + numaPath := filepath.Join(containerPath, "mon_data", file.Name()) + if IsMBMEnabled() { + numaMBMStats, err := getMBMNumaNodeStats(numaPath) + if err != nil { + return err + } + mbmStats = append(mbmStats, *numaMBMStats) + } + if IsCMTEnabled() { + numaCMTStats, err := getCMTNumaNodeStats(numaPath) + if err != nil { + return err + } + cmtStats = append(cmtStats, *numaCMTStats) + } + } + } + + stats.MBMStats = &mbmStats + stats.CMTStats = &cmtStats + + return err +} diff --git a/sysbox-runc/libcontainer/intelrdt/monitoring_test.go b/sysbox-runc/libcontainer/intelrdt/monitoring_test.go new file mode 100644 index 00000000..ad8afcff --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/monitoring_test.go @@ -0,0 +1,118 @@ +package intelrdt + +import ( + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "testing" +) + +func TestParseMonFeatures(t *testing.T) { + t.Run("All features available", func(t *testing.T) { + parsedMonFeatures, err := parseMonFeatures( + strings.NewReader("mbm_total_bytes\nmbm_local_bytes\nllc_occupancy")) + if err != nil { + t.Errorf("Error while parsing mon features err = %v", err) + } + + expectedMonFeatures := monFeatures{true, true, true} + + if parsedMonFeatures != expectedMonFeatures { + t.Error("Cannot gather all features!") + } + }) + + t.Run("No features available", func(t *testing.T) { + parsedMonFeatures, err := parseMonFeatures(strings.NewReader("")) + + if err != nil { + t.Errorf("Error while parsing mon features err = %v", err) + } + + expectedMonFeatures := monFeatures{false, false, false} + + if parsedMonFeatures != expectedMonFeatures { + t.Error("Expected no features available but there is any!") + } + }) +} + +func mockResctrlL3_MON(NUMANodes []string, mocks map[string]uint64) (string, error) { + testDir, err := ioutil.TempDir("", "rdt_mbm_test") + if err != nil 
{ + return "", err + } + monDataPath := filepath.Join(testDir, "mon_data") + + for _, numa := range NUMANodes { + numaPath := filepath.Join(monDataPath, numa) + err = os.MkdirAll(numaPath, os.ModePerm) + if err != nil { + return "", err + } + + for fileName, value := range mocks { + err := ioutil.WriteFile(filepath.Join(numaPath, fileName), []byte(strconv.FormatUint(value, 10)), 0o644) + if err != nil { + return "", err + } + } + + } + + return testDir, nil +} + +func TestGetMonitoringStats(t *testing.T) { + enabledMonFeatures.mbmTotalBytes = true + enabledMonFeatures.mbmLocalBytes = true + enabledMonFeatures.llcOccupancy = true + mbmEnabled = true + cmtEnabled = true + + mocksNUMANodesToCreate := []string{"mon_l3_00", "mon_l3_01"} + + mocksFilesToCreate := map[string]uint64{ + "mbm_total_bytes": 9123911, + "mbm_local_bytes": 2361361, + "llc_occupancy": 123331, + } + + mockedL3_MON, err := mockResctrlL3_MON(mocksNUMANodesToCreate, mocksFilesToCreate) + + defer func() { + err := os.RemoveAll(mockedL3_MON) + if err != nil { + t.Fatal(err) + } + }() + + if err != nil { + t.Fatal(err) + } + + t.Run("Gather monitoring stats", func(t *testing.T) { + var stats Stats + err := getMonitoringStats(mockedL3_MON, &stats) + if err != nil { + t.Fatal(err) + } + + expectedMBMStats := MBMNumaNodeStats{ + MBMTotalBytes: mocksFilesToCreate["mbm_total_bytes"], + MBMLocalBytes: mocksFilesToCreate["mbm_local_bytes"], + } + + expectedCMTStats := CMTNumaNodeStats{LLCOccupancy: mocksFilesToCreate["llc_occupancy"]} + + for _, gotMBMStat := range *stats.MBMStats { + checkMBMStatCorrection(gotMBMStat, expectedMBMStats, t) + } + + for _, gotCMTStat := range *stats.CMTStats { + checkCMTStatCorrection(gotCMTStat, expectedCMTStats, t) + } + }) +} diff --git a/sysbox-runc/libcontainer/intelrdt/stats.go b/sysbox-runc/libcontainer/intelrdt/stats.go new file mode 100644 index 00000000..70df0d14 --- /dev/null +++ b/sysbox-runc/libcontainer/intelrdt/stats.go @@ -0,0 +1,59 @@ +// +build linux + +package 
intelrdt + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type MemBwInfo struct { + BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` + DelayLinear uint64 `json:"delay_linear,omitempty"` + MinBandwidth uint64 `json:"min_bandwidth,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type MBMNumaNodeStats struct { + // The 'mbm_total_bytes' in 'container_id' group. + MBMTotalBytes uint64 `json:"mbm_total_bytes"` + + // The 'mbm_local_bytes' in 'container_id' group. + MBMLocalBytes uint64 `json:"mbm_local_bytes"` +} + +type CMTNumaNodeStats struct { + // The 'llc_occupancy' in 'container_id' group. + LLCOccupancy uint64 `json:"llc_occupancy"` +} + +type Stats struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The read-only memory bandwidth information + MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` + + // The read-only memory bandwidth schema in root + MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` + + // The memory bandwidth schema in 'container_id' group + MemBwSchema string `json:"mem_bw_schema,omitempty"` + + // The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group + MBMStats *[]MBMNumaNodeStats `json:"mbm_stats,omitempty"` + + // The cache monitoring technology statistics from NUMA nodes in 'container_id' group + CMTStats *[]CMTNumaNodeStats `json:"cmt_stats,omitempty"` +} + +func NewStats() *Stats { + return &Stats{} +} diff --git a/sysbox-runc/libcontainer/intelrdt/util_test.go b/sysbox-runc/libcontainer/intelrdt/util_test.go new file mode 100644 index 00000000..970b6ce3 --- 
/dev/null +++ b/sysbox-runc/libcontainer/intelrdt/util_test.go @@ -0,0 +1,67 @@ +// +build linux + +/* + * Utility for testing Intel RDT operations. + * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test. + */ +package intelrdt + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type intelRdtTestUtil struct { + // intelRdt data to use in tests + IntelRdtData *intelRdtData + + // Path to the mock Intel RDT "resource control" filesystem directory + IntelRdtPath string + + // Temporary directory to store mock Intel RDT "resource control" filesystem + tempDir string + t *testing.T +} + +// Creates a new test util +func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil { + d := &intelRdtData{ + config: &configs.Config{ + IntelRdt: &configs.IntelRdt{}, + }, + } + tempDir, err := ioutil.TempDir("", "intelrdt_test") + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testIntelRdtPath := filepath.Join(d.root, "resctrl") + if err != nil { + t.Fatal(err) + } + + // Ensure the full mock Intel RDT "resource control" filesystem path exists + err = os.MkdirAll(testIntelRdtPath, 0755) + if err != nil { + t.Fatal(err) + } + return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t} +} + +func (c *intelRdtTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified Intel RDT "resource control" files +func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := writeFile(c.IntelRdtPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/sysbox-runc/libcontainer/keys/keyctl.go b/sysbox-runc/libcontainer/keys/keyctl.go new file mode 100644 index 00000000..e73af7ae --- /dev/null +++ b/sysbox-runc/libcontainer/keys/keyctl.go @@ -0,0 +1,47 @@ +// +build linux + +package keys + 
+import ( + "strconv" + "strings" + + "github.com/pkg/errors" + + "golang.org/x/sys/unix" +) + +type KeySerial uint32 + +func JoinSessionKeyring(name string) (KeySerial, error) { + sessKeyId, err := unix.KeyctlJoinSessionKeyring(name) + if err != nil { + return 0, errors.Wrap(err, "create session key") + } + return KeySerial(sessKeyId), nil +} + +// ModKeyringPerm modifies permissions on a keyring by reading the current permissions, +// anding the bits with the given mask (clearing permissions) and setting +// additional permission bits +func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error { + dest, err := unix.KeyctlString(unix.KEYCTL_DESCRIBE, int(ringId)) + if err != nil { + return err + } + + res := strings.Split(dest, ";") + if len(res) < 5 { + return errors.New("Destination buffer for key description is too small") + } + + // parse permissions + perm64, err := strconv.ParseUint(res[3], 16, 32) + if err != nil { + return err + } + + perm := (uint32(perm64) & mask) | setbits + + return unix.KeyctlSetperm(int(ringId), perm) +} diff --git a/sysbox-runc/libcontainer/logs/logs.go b/sysbox-runc/libcontainer/logs/logs.go new file mode 100644 index 00000000..1077e7b0 --- /dev/null +++ b/sysbox-runc/libcontainer/logs/logs.go @@ -0,0 +1,102 @@ +package logs + +import ( + "bufio" + "encoding/json" + "fmt" + "io" + "os" + "strconv" + "sync" + + "github.com/sirupsen/logrus" +) + +var ( + configureMutex = sync.Mutex{} + // loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`. 
+ // Subsequent invocations of `ConfigureLogging` would be no-op + loggingConfigured = false +) + +type Config struct { + LogLevel logrus.Level + LogFormat string + LogFilePath string + LogPipeFd string +} + +func ForwardLogs(logPipe io.Reader) { + lineReader := bufio.NewReader(logPipe) + for { + line, err := lineReader.ReadBytes('\n') + if len(line) > 0 { + processEntry(line) + } + if err == io.EOF { + logrus.Debugf("log pipe has been closed: %+v", err) + return + } + if err != nil { + logrus.Errorf("log pipe read error: %+v", err) + } + } +} + +func processEntry(text []byte) { + type jsonLog struct { + Level string `json:"level"` + Msg string `json:"msg"` + } + + var jl jsonLog + if err := json.Unmarshal(text, &jl); err != nil { + logrus.Errorf("failed to decode %q to json: %+v", text, err) + return + } + + lvl, err := logrus.ParseLevel(jl.Level) + if err != nil { + logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err) + return + } + logrus.StandardLogger().Logf(lvl, jl.Msg) +} + +func ConfigureLogging(config Config) error { + configureMutex.Lock() + defer configureMutex.Unlock() + + if loggingConfigured { + logrus.Debug("logging has already been configured") + return nil + } + + logrus.SetLevel(config.LogLevel) + + if config.LogPipeFd != "" { + logPipeFdInt, err := strconv.Atoi(config.LogPipeFd) + if err != nil { + return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err) + } + logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe")) + } else if config.LogFilePath != "" { + f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644) + if err != nil { + return err + } + logrus.SetOutput(f) + } + + switch config.LogFormat { + case "text": + // retain logrus's default. 
+ case "json": + logrus.SetFormatter(new(logrus.JSONFormatter)) + default: + return fmt.Errorf("unknown log-format %q", config.LogFormat) + } + + loggingConfigured = true + return nil +} diff --git a/sysbox-runc/libcontainer/logs/logs_linux_test.go b/sysbox-runc/libcontainer/logs/logs_linux_test.go new file mode 100644 index 00000000..ec9ae4fc --- /dev/null +++ b/sysbox-runc/libcontainer/logs/logs_linux_test.go @@ -0,0 +1,160 @@ +package logs + +import ( + "errors" + "io/ioutil" + "os" + "strings" + "testing" + "time" + + "github.com/sirupsen/logrus" +) + +func TestLoggingToFile(t *testing.T) { + logW, logFile, _ := runLogForwarding(t) + defer os.Remove(logFile) + defer logW.Close() + + logToLogWriter(t, logW, `{"level": "info","msg":"kitten"}`) + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(logFileContent, "kitten") { + t.Fatalf("%s does not contain kitten", logFileContent) + } +} + +func TestLogForwardingDoesNotStopOnJsonDecodeErr(t *testing.T) { + logW, logFile, _ := runLogForwarding(t) + defer os.Remove(logFile) + defer logW.Close() + + logToLogWriter(t, logW, "invalid-json-with-kitten") + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(logFileContent, "failed to decode") { + t.Fatalf("%q does not contain decoding error", logFileContent) + } + + truncateLogFile(t, logFile) + + logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) + + logFileContent = waitForLogContent(t, logFile) + if !strings.Contains(logFileContent, "puppy") { + t.Fatalf("%s does not contain puppy", logFileContent) + } +} + +func TestLogForwardingDoesNotStopOnLogLevelParsingErr(t *testing.T) { + logW, logFile, _ := runLogForwarding(t) + defer os.Remove(logFile) + defer logW.Close() + + logToLogWriter(t, logW, `{"level": "alert","msg":"puppy"}`) + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(logFileContent, "failed to parse log level") { + t.Fatalf("%q does not contain log level parsing error", 
logFileContent) + } + + truncateLogFile(t, logFile) + + logToLogWriter(t, logW, `{"level": "info","msg":"puppy"}`) + + logFileContent = waitForLogContent(t, logFile) + if !strings.Contains(logFileContent, "puppy") { + t.Fatalf("%s does not contain puppy", logFileContent) + } +} + +func TestLogForwardingStopsAfterClosingTheWriter(t *testing.T) { + logW, logFile, doneForwarding := runLogForwarding(t) + defer os.Remove(logFile) + + logToLogWriter(t, logW, `{"level": "info","msg":"sync"}`) + + logFileContent := waitForLogContent(t, logFile) + if !strings.Contains(logFileContent, "sync") { + t.Fatalf("%q does not contain sync message", logFileContent) + } + + logW.Close() + select { + case <-doneForwarding: + case <-time.After(10 * time.Second): + t.Fatal("log forwarding did not stop after closing the pipe") + } +} + +func logToLogWriter(t *testing.T, logW *os.File, message string) { + _, err := logW.Write([]byte(message + "\n")) + if err != nil { + t.Fatalf("failed to write %q to log writer: %v", message, err) + } +} + +func runLogForwarding(t *testing.T) (*os.File, string, chan struct{}) { + logR, logW, err := os.Pipe() + if err != nil { + t.Fatal(err) + } + + tempFile, err := ioutil.TempFile("", "") + if err != nil { + t.Fatal(err) + } + logFile := tempFile.Name() + + logConfig := Config{LogLevel: logrus.InfoLevel, LogFormat: "json", LogFilePath: logFile} + return logW, logFile, startLogForwarding(t, logConfig, logR) +} + +func startLogForwarding(t *testing.T, logConfig Config, logR *os.File) chan struct{} { + loggingConfigured = false + if err := ConfigureLogging(logConfig); err != nil { + t.Fatal(err) + } + doneForwarding := make(chan struct{}) + go func() { + ForwardLogs(logR) + close(doneForwarding) + }() + return doneForwarding +} + +func waitForLogContent(t *testing.T, logFile string) string { + startTime := time.Now() + + for { + if time.Now().After(startTime.Add(10 * time.Second)) { + t.Fatal(errors.New("No content in log file after 10 seconds")) + break + } 
+ + fileContent, err := ioutil.ReadFile(logFile) + if err != nil { + t.Fatal(err) + } + if len(fileContent) == 0 { + continue + } + return string(fileContent) + } + + return "" +} + +func truncateLogFile(t *testing.T, logFile string) { + file, err := os.OpenFile(logFile, os.O_RDWR, 0666) + if err != nil { + t.Fatalf("failed to open log file: %v", err) + return + } + defer file.Close() + + err = file.Truncate(0) + if err != nil { + t.Fatalf("failed to truncate log file: %v", err) + } +} diff --git a/sysbox-runc/libcontainer/message_linux.go b/sysbox-runc/libcontainer/message_linux.go new file mode 100644 index 00000000..1364e761 --- /dev/null +++ b/sysbox-runc/libcontainer/message_linux.go @@ -0,0 +1,98 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" +) + +// list of known message types we want to send to bootstrap program +// The number is randomly chosen to not conflict with known netlink types +const ( + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessEUIDAttr uint16 = 27287 + UidmapPathAttr uint16 = 27288 + GidmapPathAttr uint16 = 27289 + + // sysbox-runc + PrepRootfsAttr uint16 = 27290 + MakeParentPrivAttr uint16 = 27291 + RootfsPropAttr uint16 = 27292 + RootfsAttr uint16 = 27293 + ParentMountAttr uint16 = 27294 + ShiftfsMountsAttr uint16 = 27295 +) + +type Int32msg struct { + Type uint16 + Value uint32 +} + +// Serialize serializes the message. 
// Int32msg is a netlink attribute carrying a 32-bit unsigned value.
type Int32msg struct {
	Type  uint16
	Value uint32
}

// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value             |
func (msg *Int32msg) Serialize() []byte {
	buf := make([]byte, msg.Len())
	native := nl.NativeEndian()
	// nlattr header: total length (header + payload), then attribute type.
	native.PutUint16(buf[0:2], uint16(msg.Len()))
	native.PutUint16(buf[2:4], msg.Type)
	native.PutUint32(buf[4:8], msg.Value)
	return buf
}

// Len returns the encoded size: netlink attribute header plus 4-byte payload.
func (msg *Int32msg) Len() int {
	return unix.NLA_HDRLEN + 4
}

// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
	Type  uint16
	Value []byte
}

// Serialize encodes the attribute. The buffer is rounded up to NLA_ALIGNTO,
// while the length recorded in the header stays the unpadded length.
func (msg *Bytemsg) Serialize() []byte {
	l := msg.Len()
	buf := make([]byte, (l+unix.NLA_ALIGNTO-1) & ^(unix.NLA_ALIGNTO-1))
	native := nl.NativeEndian()
	native.PutUint16(buf[0:2], uint16(l))
	native.PutUint16(buf[2:4], msg.Type)
	copy(buf[4:], msg.Value)
	return buf
}

// Len returns header plus payload plus one byte for the terminating NUL.
func (msg *Bytemsg) Len() int {
	return unix.NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}

// Boolmsg is a netlink attribute carrying a boolean, encoded as 4 bytes.
type Boolmsg struct {
	Type  uint16
	Value bool
}

// Serialize encodes the boolean as a 32-bit 0/1 payload.
func (msg *Boolmsg) Serialize() []byte {
	buf := make([]byte, msg.Len())
	native := nl.NativeEndian()
	native.PutUint16(buf[0:2], uint16(msg.Len()))
	native.PutUint16(buf[2:4], msg.Type)
	if msg.Value {
		native.PutUint32(buf[4:8], uint32(1))
	} else {
		native.PutUint32(buf[4:8], uint32(0))
	}
	return buf
}

func (msg *Boolmsg) Len() int {
	return unix.NLA_HDRLEN + 4 // alignment
}
represents a specific network configuration for +// a container's networking stack +type networkStrategy interface { + create(*network, int) error + initialize(*network) error + detach(*configs.Network) error + attach(*configs.Network) error +} + +// getStrategy returns the specific network strategy for the +// provided type. +func getStrategy(tpe string) (networkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, fmt.Errorf("unknown strategy type %q", tpe) + } + return s, nil +} + +// Returns the network statistics for the network interfaces represented by the NetworkRuntimeInfo. +func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, error) { + out := &types.NetworkInterface{Name: interfaceName} + // This can happen if the network runtime information is missing - possible if the + // container was created by an old version of libcontainer. + if interfaceName == "" { + return out, nil + } + type netStatsPair struct { + // Where to write the output. + Out *uint64 + // The network stats file to read. + File string + } + // Ingress for host veth is from the container. Hence tx_bytes stat on the host veth is actually number of bytes received by the container. 
// readSysfsNetworkStats reads one counter for the given interface from
// /sys/class/net/<iface>/statistics/<statsFile> and parses it as a uint64.
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
	statsPath := filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile)
	data, err := ioutil.ReadFile(statsPath)
	if err != nil {
		return 0, err
	}
	value := bytes.TrimSpace(data)
	return strconv.ParseUint(string(value), 10, 64)
}
	// registerMemoryEvent hooks into the cgroup-v1 event notification API:
	// it opens the cgroup file evName, creates an eventfd, and registers the
	// pair (plus an optional argument) in cgroup.event_control so the kernel
	// signals the eventfd on each event. The returned channel delivers one
	// struct{} per event and is closed when the cgroup goes away or the
	// eventfd read fails.
	evFile, err := os.Open(filepath.Join(cgDir, evName))
	if err != nil {
		return nil, err
	}
	fd, err := unix.Eventfd(0, unix.EFD_CLOEXEC)
	if err != nil {
		// Clean up the already-open cgroup file on failure.
		evFile.Close()
		return nil, err
	}

	eventfd := os.NewFile(uintptr(fd), "eventfd")

	// Registration format: "<eventfd> <target fd> <args>".
	eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
	data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
	if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
		eventfd.Close()
		evFile.Close()
		return nil, err
	}
	ch := make(chan struct{})
	go func() {
		// Both files must stay open for the lifetime of the watch; close
		// them (and the channel) only when the goroutine exits.
		defer func() {
			eventfd.Close()
			evFile.Close()
			close(ch)
		}()
		// eventfd reads are always 8 bytes (a uint64 counter).
		buf := make([]byte, 8)
		for {
			if _, err := eventfd.Read(buf); err != nil {
				return
			}
			// When a cgroup is destroyed, an event is sent to eventfd.
			// So if the control path is gone, return instead of notifying.
			if _, err := os.Lstat(eventControlPath); os.IsNotExist(err) {
				return
			}
			ch <- struct{}{}
		}
	}()
	return ch, nil
}
+func notifyOnOOM(dir string) (<-chan struct{}, error) { + if dir == "" { + return nil, errors.New("memory controller missing") + } + + return registerMemoryEvent(dir, "memory.oom_control", "") +} + +func notifyMemoryPressure(dir string, level PressureLevel) (<-chan struct{}, error) { + if dir == "" { + return nil, errors.New("memory controller missing") + } + + if level > CriticalPressure { + return nil, fmt.Errorf("invalid pressure level %d", level) + } + + levelStr := []string{"low", "medium", "critical"}[level] + return registerMemoryEvent(dir, "memory.pressure_level", levelStr) +} diff --git a/sysbox-runc/libcontainer/notify_linux_test.go b/sysbox-runc/libcontainer/notify_linux_test.go new file mode 100644 index 00000000..6f8b6d30 --- /dev/null +++ b/sysbox-runc/libcontainer/notify_linux_test.go @@ -0,0 +1,123 @@ +// +build linux + +package libcontainer + +import ( + "encoding/binary" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "testing" + "time" + + "golang.org/x/sys/unix" +) + +type notifyFunc func(path string) (<-chan struct{}, error) + +func testMemoryNotification(t *testing.T, evName string, notify notifyFunc, targ string) { + memoryPath, err := ioutil.TempDir("", "testmemnotification-"+evName) + if err != nil { + t.Fatal(err) + } + evFile := filepath.Join(memoryPath, evName) + eventPath := filepath.Join(memoryPath, "cgroup.event_control") + if err := ioutil.WriteFile(evFile, []byte{}, 0700); err != nil { + t.Fatal(err) + } + if err := ioutil.WriteFile(eventPath, []byte{}, 0700); err != nil { + t.Fatal(err) + } + ch, err := notify(memoryPath) + if err != nil { + t.Fatal("expected no error, got:", err) + } + + data, err := ioutil.ReadFile(eventPath) + if err != nil { + t.Fatal("couldn't read event control file:", err) + } + + var eventFd, evFd int + var arg string + if targ != "" { + _, err = fmt.Sscanf(string(data), "%d %d %s", &eventFd, &evFd, &arg) + } else { + _, err = fmt.Sscanf(string(data), "%d %d", &eventFd, &evFd) + } + if err != nil || arg 
!= targ { + t.Fatalf("invalid control data %q: %s", data, err) + } + + // dup the eventfd + efd, err := unix.Dup(eventFd) + if err != nil { + t.Fatal("unable to dup eventfd:", err) + } + defer unix.Close(efd) + + buf := make([]byte, 8) + binary.LittleEndian.PutUint64(buf, 1) + + if _, err := unix.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + select { + case <-ch: + case <-time.After(100 * time.Millisecond): + t.Fatal("no notification on channel after 100ms") + } + + // simulate what happens when a cgroup is destroyed by cleaning up and then + // writing to the eventfd. + if err := os.RemoveAll(memoryPath); err != nil { + t.Fatal(err) + } + if _, err := unix.Write(efd, buf); err != nil { + t.Fatal("unable to write to eventfd:", err) + } + + // give things a moment to shut down + select { + case _, ok := <-ch: + if ok { + t.Fatal("expected no notification to be triggered") + } + case <-time.After(100 * time.Millisecond): + t.Fatal("channel not closed after 100ms") + } + + if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(evFd), unix.F_GETFD, 0); err != unix.EBADF { + t.Errorf("expected event control to be closed, but received error %s", err.Error()) + } + + if _, _, err := unix.Syscall(unix.SYS_FCNTL, uintptr(eventFd), unix.F_GETFD, 0); err != unix.EBADF { + t.Errorf("expected event fd to be closed, but received error %s", err.Error()) + } +} + +func TestNotifyOnOOM(t *testing.T) { + f := func(path string) (<-chan struct{}, error) { + return notifyOnOOM(path) + } + + testMemoryNotification(t, "memory.oom_control", f, "") +} + +func TestNotifyMemoryPressure(t *testing.T) { + tests := map[PressureLevel]string{ + LowPressure: "low", + MediumPressure: "medium", + CriticalPressure: "critical", + } + + for level, arg := range tests { + f := func(path string) (<-chan struct{}, error) { + return notifyMemoryPressure(path, level) + } + + testMemoryNotification(t, "memory.pressure_level", f, arg) + } +} diff --git 
a/sysbox-runc/libcontainer/notify_linux_v2.go b/sysbox-runc/libcontainer/notify_linux_v2.go new file mode 100644 index 00000000..cdab10ed --- /dev/null +++ b/sysbox-runc/libcontainer/notify_linux_v2.go @@ -0,0 +1,102 @@ +// +build linux + +package libcontainer + +import ( + "io/ioutil" + "path/filepath" + "strconv" + "strings" + "unsafe" + + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func getValueFromCgroup(path, key string) (int, error) { + content, err := ioutil.ReadFile(path) + if err != nil { + return 0, err + } + + lines := strings.Split(string(content), "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + return strconv.Atoi(arr[1]) + } + } + return 0, nil +} + +func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) { + eventControlPath := filepath.Join(cgDir, evName) + cgEvPath := filepath.Join(cgDir, cgEvName) + fd, err := unix.InotifyInit() + if err != nil { + return nil, errors.Wrap(err, "unable to init inotify") + } + // watching oom kill + evFd, err := unix.InotifyAddWatch(fd, eventControlPath, unix.IN_MODIFY) + if err != nil { + unix.Close(fd) + return nil, errors.Wrap(err, "unable to add inotify watch") + } + // Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited + cgFd, err := unix.InotifyAddWatch(fd, cgEvPath, unix.IN_MODIFY) + if err != nil { + unix.Close(fd) + return nil, errors.Wrap(err, "unable to add inotify watch") + } + ch := make(chan struct{}) + go func() { + var ( + buffer [unix.SizeofInotifyEvent + unix.PathMax + 1]byte + offset uint32 + ) + defer func() { + unix.Close(fd) + close(ch) + }() + + for { + n, err := unix.Read(fd, buffer[:]) + if err != nil { + logrus.Warnf("unable to read event data from inotify, got error: %v", err) + return + } + if n < unix.SizeofInotifyEvent { + logrus.Warnf("we should read at least %d bytes from inotify, but got %d 
// notifyOnOOMV2 returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
// It is the cgroup-v2 counterpart of notifyOnOOM: OOM kills are observed by
// inotify-watching "memory.events", and the channel is closed once
// "cgroup.events" reports the cgroup is no longer populated.
func notifyOnOOMV2(path string) (<-chan struct{}, error) {
	return registerMemoryEventV2(path, "memory.events", "cgroup.events")
}
the parent `nsexec()` will exit and the child `nsexec()` process will
return to allow the Go runtime to take over.
+ * + * === Apache-2.0 === + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * === LGPL-2.1-or-later === + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library. If not, see + * . + * + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Use our own wrapper for memfd_create. */ +#ifndef SYS_memfd_create +# ifdef __NR_memfd_create +# define SYS_memfd_create __NR_memfd_create +# else +/* These values come from . 
*/ +# warning "libc is outdated -- using hard-coded SYS_memfd_create" +# if defined(__x86_64__) // x86_64 +# define SYS_memfd_create 319 +# elif defined(__i386__) // i386 +# define SYS_memfd_create 356 +# elif defined(__ia64__) // ia64 +# define SYS_memfd_create 1340 +# elif defined(__arm__) // arm +# define SYS_memfd_create 385 +# elif defined(__aarch64__) // arm64 +# define SYS_memfd_create 279 +# elif defined(__ppc__) || defined(__ppc64__) // ppc + ppc64 +# define SYS_memfd_create 360 +# elif defined(__s390__) || defined(__s390x__) // s390(x) +# define SYS_memfd_create 350 +# else +# error "unknown architecture -- cannot hard-code SYS_memfd_create" +# endif +# endif +#endif + +/* memfd_create(2) flags -- copied from . */ +#ifndef MFD_CLOEXEC +# define MFD_CLOEXEC 0x0001U +# define MFD_ALLOW_SEALING 0x0002U +#endif + +int memfd_create(const char *name, unsigned int flags) +{ +#ifdef SYS_memfd_create + return syscall(SYS_memfd_create, name, flags); +#else + errno = ENOSYS; + return -1; +#endif +} + + +/* This comes directly from . */ +#ifndef F_LINUX_SPECIFIC_BASE +# define F_LINUX_SPECIFIC_BASE 1024 +#endif +#ifndef F_ADD_SEALS +# define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9) +# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10) +#endif +#ifndef F_SEAL_SEAL +# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */ +# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ +# define F_SEAL_GROW 0x0004 /* prevent file from growing */ +# define F_SEAL_WRITE 0x0008 /* prevent writes */ +#endif + +#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY" +#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe" +#define RUNC_MEMFD_SEALS \ + (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE) + +static void *must_realloc(void *ptr, size_t size) +{ + void *old = ptr; + do { + ptr = realloc(old, size); + } while(!ptr); + return ptr; +} + +/* + * Verify whether we are currently in a self-cloned program (namely, is + * /proc/self/exe a memfd). 
F_GET_SEALS will only succeed for memfds (or rather + * for shmem files), and we want to be sure it's actually sealed. + */ +static int is_self_cloned(void) +{ + int fd, ret, is_cloned = 0; + struct stat statbuf = {}; + struct statfs fsbuf = {}; + + fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "you have no read access to runc binary file\n"); + return -ENOTRECOVERABLE; + } + + /* + * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for + * this, because you cannot write to a sealed memfd no matter what (so + * sharing it isn't a bad thing -- and an admin could bind-mount a sealed + * memfd to /usr/bin/runc to allow re-use). + */ + ret = fcntl(fd, F_GET_SEALS); + if (ret >= 0) { + is_cloned = (ret == RUNC_MEMFD_SEALS); + goto out; + } + + /* + * All other forms require CLONED_BINARY_ENV, since they are potentially + * writeable (or we can't tell if they're fully safe) and thus we must + * check the environment as an extra layer of defence. + */ + if (!getenv(CLONED_BINARY_ENV)) { + is_cloned = false; + goto out; + } + + /* + * Is the binary on a read-only filesystem? We can't detect bind-mounts in + * particular (in-kernel they are identical to regular mounts) but we can + * at least be sure that it's read-only. In addition, to make sure that + * it's *our* bind-mount we check CLONED_BINARY_ENV. + */ + if (fstatfs(fd, &fsbuf) >= 0) + is_cloned |= (fsbuf.f_flags & MS_RDONLY); + + /* + * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6 + * which appears to have a borked backport of F_GET_SEALS. Either way, + * having a file which has no hardlinks indicates that we aren't using + * a host-side "runc" binary and this is something that a container + * cannot fake (because unlinking requires being able to resolve the + * path that you want to unlink). 
/* Read a given file into a new buffer, and providing the length.
 * Returns a heap buffer the caller must free(), with *length set to the
 * number of bytes read; returns NULL on any open/read error (and when the
 * file is empty, since no buffer is ever allocated in that case). */
static char *read_file(char *path, size_t *length)
{
	int fd;
	char buf[4096], *copy = NULL;

	if (!length)
		return NULL;

	fd = open(path, O_RDONLY | O_CLOEXEC);
	if (fd < 0)
		return NULL;

	*length = 0;
	for (;;) {
		ssize_t n;

		n = read(fd, buf, sizeof(buf));
		if (n < 0)
			goto error;
		if (!n)
			break;

		/* Grow the buffer by exactly the bytes just read and append them.
		 * must_realloc() loops until the allocation succeeds. */
		copy = must_realloc(copy, (*length + n) * sizeof(*copy));
		memcpy(copy + *length, buf, n);
		*length += n;
	}
	close(fd);
	return copy;

error:
	close(fd);
	free(copy);
	return NULL;
}
If we don't have O_TMPFILE we always + * have the mkostemp(3) fallback. + */ +#ifndef O_TMPFILE +# if defined(__O_TMPFILE) && defined(O_DIRECTORY) +# define O_TMPFILE (__O_TMPFILE | O_DIRECTORY) +# endif +#endif + +static int make_execfd(int *fdtype) +{ + int fd = -1; + char template[PATH_MAX] = {0}; + char *prefix = getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return -1; + + /* + * Now try memfd, it's much nicer than actually creating a file in STATEDIR + * since it's easily detected thanks to sealing and also doesn't require + * assumptions about STATEDIR. + */ + *fdtype = EFD_MEMFD; + fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING); + if (fd >= 0) + return fd; + if (errno != ENOSYS && errno != EINVAL) + goto error; + +#ifdef O_TMPFILE + /* + * Try O_TMPFILE to avoid races where someone might snatch our file. Note + * that O_EXCL isn't actually a security measure here (since you can just + * fd re-open it and clear O_EXCL). + */ + *fdtype = EFD_FILE; + fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); + if (fd >= 0) { + struct stat statbuf = {}; + bool working_otmpfile = false; + + /* + * open(2) ignores unknown O_* flags -- yeah, I was surprised when I + * found this out too. As a result we can't check for EINVAL. However, + * if we get nlink != 0 (or EISDIR) then we know that this kernel + * doesn't support O_TMPFILE. + */ + if (fstat(fd, &statbuf) >= 0) + working_otmpfile = (statbuf.st_nlink == 0); + + if (working_otmpfile) + return fd; + + /* Pretend that we got EISDIR since O_TMPFILE failed. */ + close(fd); + errno = EISDIR; + } + if (errno != EISDIR) + goto error; +#endif /* defined(O_TMPFILE) */ + + /* + * Our final option is to create a temporary file the old-school way, and + * then unlink it so that nothing else sees it by accident. 
+ */ + *fdtype = EFD_FILE; + fd = mkostemp(template, O_CLOEXEC); + if (fd >= 0) { + if (unlink(template) >= 0) + return fd; + close(fd); + } + +error: + *fdtype = EFD_NONE; + return -1; +} + +static int seal_execfd(int *fd, int fdtype) +{ + switch (fdtype) { + case EFD_MEMFD: + return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); + case EFD_FILE: { + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = {0}; + + if (fchmod(*fd, 0100) < 0) + return -1; + + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; + + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; + + close(*fd); + *fd = newfd; + return 0; + } + default: + break; + } + return -1; +} + +static int try_bindfd(void) +{ + int fd, ret = -1; + char template[PATH_MAX] = {0}; + char *prefix = getenv("_LIBCONTAINER_STATEDIR"); + + if (!prefix || *prefix != '/') + prefix = "/tmp"; + if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0) + return ret; + + /* + * We need somewhere to mount it, mounting anything over /proc/self is a + * BAD idea on the host -- even if we do it temporarily. + */ + fd = mkstemp(template); + if (fd < 0) + return ret; + close(fd); + + /* + * For obvious reasons this won't work in rootless mode because we haven't + * created a userns+mntns -- but getting that to work will be a bit + * complicated and it's only worth doing if someone actually needs it. + */ + ret = -EPERM; + if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0) + goto out; + if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) + goto out_umount; + + + /* Get read-only handle that we're sure can't be made read-write. */ + ret = open(template, O_PATH | O_CLOEXEC); + +out_umount: + /* + * Make sure the MNT_DETACH works, otherwise we could get remounted + * read-write and that would be quite bad (the fd would be made read-write + * too, invalidating the protection). 
+ */ + if (umount2(template, MNT_DETACH) < 0) { + if (ret >= 0) + close(ret); + ret = -ENOTRECOVERABLE; + } + +out: + /* + * We don't care about unlink errors, the worst that happens is that + * there's an empty file left around in STATEDIR. + */ + unlink(template); + return ret; +} + +static ssize_t fd_to_fd(int outfd, int infd) +{ + ssize_t total = 0; + char buffer[4096]; + + for (;;) { + ssize_t nread, nwritten = 0; + + nread = read(infd, buffer, sizeof(buffer)); + if (nread < 0) + return -1; + if (!nread) + break; + + do { + ssize_t n = write(outfd, buffer + nwritten, nread - nwritten); + if (n < 0) + return -1; + nwritten += n; + } while(nwritten < nread); + + total += nwritten; + } + + return total; +} + +static int clone_binary(void) +{ + int binfd, execfd; + struct stat statbuf = {}; + size_t sent = 0; + int fdtype = EFD_NONE; + + /* + * Before we resort to copying, let's try creating an ro-binfd in one shot + * by getting a handle for a read-only bind-mount of the execfd. + */ + execfd = try_bindfd(); + if (execfd >= 0) + return execfd; + + /* + * Dammit, that didn't work -- time to copy the binary to a safe place we + * can seal the contents. + */ + execfd = make_execfd(&fdtype); + if (execfd < 0 || fdtype == EFD_NONE) + return -ENOTRECOVERABLE; + + binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); + if (binfd < 0) + goto error; + + if (fstat(binfd, &statbuf) < 0) + goto error_binfd; + + while (sent < statbuf.st_size) { + int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent); + if (n < 0) { + /* sendfile can fail so we fallback to a dumb user-space copy. */ + n = fd_to_fd(execfd, binfd); + if (n < 0) + goto error_binfd; + } + sent += n; + } + close(binfd); + if (sent != statbuf.st_size) + goto error; + + if (seal_execfd(&execfd, fdtype) < 0) + goto error; + + return execfd; + +error_binfd: + close(binfd); +error: + close(execfd); + return -EIO; +} + +/* Get cheap access to the environment. 
*/ +extern char **environ; + +int ensure_cloned_binary(void) +{ + int execfd; + char **argv = NULL; + + /* Check that we're not self-cloned, and if we are then bail. */ + int cloned = is_self_cloned(); + if (cloned > 0 || cloned == -ENOTRECOVERABLE) + return cloned; + + if (fetchve(&argv) < 0) + return -EINVAL; + + execfd = clone_binary(); + if (execfd < 0) + return -EIO; + + if (putenv(CLONED_BINARY_ENV "=1")) + goto error; + + fexecve(execfd, argv, environ); +error: + close(execfd); + return -ENOEXEC; +} diff --git a/sysbox-runc/libcontainer/nsenter/namespace.h b/sysbox-runc/libcontainer/nsenter/namespace.h new file mode 100644 index 00000000..9e9bdca0 --- /dev/null +++ b/sysbox-runc/libcontainer/nsenter/namespace.h @@ -0,0 +1,32 @@ +#ifndef NSENTER_NAMESPACE_H +#define NSENTER_NAMESPACE_H + +#ifndef _GNU_SOURCE +# define _GNU_SOURCE +#endif +#include + +/* All of these are taken from include/uapi/linux/sched.h */ +#ifndef CLONE_NEWNS +# define CLONE_NEWNS 0x00020000 /* New mount namespace group */ +#endif +#ifndef CLONE_NEWCGROUP +# define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */ +#endif +#ifndef CLONE_NEWUTS +# define CLONE_NEWUTS 0x04000000 /* New utsname namespace */ +#endif +#ifndef CLONE_NEWIPC +# define CLONE_NEWIPC 0x08000000 /* New ipc namespace */ +#endif +#ifndef CLONE_NEWUSER +# define CLONE_NEWUSER 0x10000000 /* New user namespace */ +#endif +#ifndef CLONE_NEWPID +# define CLONE_NEWPID 0x20000000 /* New pid namespace */ +#endif +#ifndef CLONE_NEWNET +# define CLONE_NEWNET 0x40000000 /* New network namespace */ +#endif + +#endif /* NSENTER_NAMESPACE_H */ diff --git a/sysbox-runc/libcontainer/nsenter/nsenter.go b/sysbox-runc/libcontainer/nsenter/nsenter.go new file mode 100644 index 00000000..7e294a0f --- /dev/null +++ b/sysbox-runc/libcontainer/nsenter/nsenter.go @@ -0,0 +1,13 @@ +// +build linux,!gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ 
+import "C" + diff --git a/sysbox-runc/libcontainer/nsenter/nsenter_gccgo.go b/sysbox-runc/libcontainer/nsenter/nsenter_gccgo.go new file mode 100644 index 00000000..63c7a3ec --- /dev/null +++ b/sysbox-runc/libcontainer/nsenter/nsenter_gccgo.go @@ -0,0 +1,25 @@ +// +build linux,gccgo + +package nsenter + +/* +#cgo CFLAGS: -Wall +extern void nsexec(); +void __attribute__((constructor)) init(void) { + nsexec(); +} +*/ +import "C" + +// AlwaysFalse is here to stay false +// (and be exported so the compiler doesn't optimize out its reference) +var AlwaysFalse bool + +func init() { + if AlwaysFalse { + // by referencing this C init() in a noop test, it will ensure the compiler + // links in the C function. + // https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65134 + C.init() + } +} diff --git a/sysbox-runc/libcontainer/nsenter/nsenter_test.go b/sysbox-runc/libcontainer/nsenter/nsenter_test.go new file mode 100644 index 00000000..86aeb042 --- /dev/null +++ b/sysbox-runc/libcontainer/nsenter/nsenter_test.go @@ -0,0 +1,237 @@ +package nsenter + +import ( + "bytes" + "encoding/json" + "fmt" + "io" + "io/ioutil" + "os" + "os/exec" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer" + "github.com/vishvananda/netlink/nl" + "golang.org/x/sys/unix" +) + +type pid struct { + Pid int `json:"Pid"` +} + +type logentry struct { + Msg string `json:"msg"` + Level string `json:"level"` +} + +func TestNsenterValidPaths(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + Stdout: os.Stdout, + Stderr: os.Stderr, + } + + if err := cmd.Start(); err != nil { + t.Fatalf("nsenter failed to start %v", err) + } + // write 
cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + decoder := json.NewDecoder(parent) + var pid *pid + + if err := cmd.Wait(); err != nil { + t.Fatalf("nsenter exits with a non-zero exit status") + } + if err := decoder.Decode(&pid); err != nil { + dir, _ := ioutil.ReadDir(fmt.Sprintf("/proc/%d/ns", os.Getpid())) + for _, d := range dir { + t.Log(d.Name()) + } + t.Fatalf("%v", err) + } + + p, err := os.FindProcess(pid.Pid) + if err != nil { + t.Fatalf("%v", err) + } + p.Wait() +} + +func TestNsenterInvalidPaths(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", -1), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func TestNsenterIncorrectPathType(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed 
to create pipe %v", err) + } + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child}, + Env: []string{"_LIBCONTAINER_INITPIPE=3"}, + } + + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + if err := cmd.Wait(); err == nil { + t.Fatalf("nsenter exits with a zero exit status") + } +} + +func TestNsenterChildLogging(t *testing.T) { + args := []string{"nsenter-exec"} + parent, child, err := newPipe() + if err != nil { + t.Fatalf("failed to create exec pipe %v", err) + } + logread, logwrite, err := os.Pipe() + if err != nil { + t.Fatalf("failed to create log pipe %v", err) + } + defer logread.Close() + defer logwrite.Close() + + namespaces := []string{ + // join pid ns of the current process + fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()), + } + cmd := &exec.Cmd{ + Path: os.Args[0], + Args: args, + ExtraFiles: []*os.File{child, logwrite}, + Env: []string{"_LIBCONTAINER_INITPIPE=3", "_LIBCONTAINER_LOGPIPE=4"}, + Stdout: os.Stdout, + Stderr: os.Stderr, + } + + if err := cmd.Start(); err != nil { + t.Fatalf("nsenter failed to start %v", err) + } + // write cloneFlags + r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0) + r.AddData(&libcontainer.Int32msg{ + Type: libcontainer.CloneFlagsAttr, + Value: uint32(unix.CLONE_NEWNET), + }) + r.AddData(&libcontainer.Bytemsg{ + Type: libcontainer.NsPathsAttr, + Value: []byte(strings.Join(namespaces, ",")), + }) + if _, err := io.Copy(parent, 
bytes.NewReader(r.Serialize())); err != nil { + t.Fatal(err) + } + + logsDecoder := json.NewDecoder(logread) + var logentry *logentry + + err = logsDecoder.Decode(&logentry) + if err != nil { + t.Fatalf("child log: %v", err) + } + if logentry.Level == "" || logentry.Msg == "" { + t.Fatalf("child log: empty log fileds: level=\"%s\" msg=\"%s\"", logentry.Level, logentry.Msg) + } + + if err := cmd.Wait(); err != nil { + t.Fatalf("nsenter exits with a non-zero exit status") + } +} + +func init() { + if strings.HasPrefix(os.Args[0], "nsenter-") { + os.Exit(0) + } +} + +func newPipe() (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), "parent"), os.NewFile(uintptr(fds[0]), "child"), nil +} diff --git a/sysbox-runc/libcontainer/nsenter/nsenter_unsupported.go b/sysbox-runc/libcontainer/nsenter/nsenter_unsupported.go new file mode 100644 index 00000000..2459c636 --- /dev/null +++ b/sysbox-runc/libcontainer/nsenter/nsenter_unsupported.go @@ -0,0 +1,3 @@ +// +build !linux !cgo + +package nsenter diff --git a/sysbox-runc/libcontainer/nsenter/nsexec.c b/sysbox-runc/libcontainer/nsenter/nsexec.c new file mode 100644 index 00000000..9bb8cfed --- /dev/null +++ b/sysbox-runc/libcontainer/nsenter/nsexec.c @@ -0,0 +1,1214 @@ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Get all of the CLONE_NEW* flags. */ +#include "namespace.h" + +/* Synchronisation values. */ +enum sync_t { + SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ + SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */ + SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. 
*/ + SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ + SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ + SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ +}; + +/* + * Synchronisation value for cgroup namespace setup. + * The same constant is defined in process_linux.go as "createCgroupns". + */ +#define CREATECGROUPNS 0x80 + +/* longjmp() arguments. */ +#define JUMP_PARENT 0x00 +#define JUMP_CHILD 0xA0 +#define JUMP_INIT 0xA1 + +/* Assume the stack grows down, so arguments should be above it. */ +struct clone_t { + /* + * Reserve some space for clone() to locate arguments + * and retcode in this place + */ + char stack[4096] __attribute__ ((aligned(16))); + char stack_ptr[0]; + + /* There's two children. This is used to execute the different code. */ + jmp_buf *env; + int jmpval; +}; + +struct nlconfig_t { + char *data; + + /* Process settings. */ + uint32_t cloneflags; + char *oom_score_adj; + size_t oom_score_adj_len; + + /* User namespace settings. */ + char *uidmap; + size_t uidmap_len; + char *gidmap; + size_t gidmap_len; + char *namespaces; + size_t namespaces_len; + uint8_t is_setgroup; + + /* Rootless container settings. */ + uint8_t is_rootless_euid; /* boolean */ + char *uidmappath; + size_t uidmappath_len; + char *gidmappath; + size_t gidmappath_len; + + /* sysbox-runc: rootfs prep */ + uint8_t prep_rootfs; /* boolean */ + uint8_t make_parent_priv; /* boolean */ + uint32_t rootfs_prop; + char *rootfs; + size_t rootfs_len; + char *parent_mount; + size_t parent_mount_len; + char *shiftfs_mounts; + size_t shiftfs_mounts_len; +}; + +#define PANIC "panic" +#define FATAL "fatal" +#define ERROR "error" +#define WARNING "warning" +#define INFO "info" +#define DEBUG "debug" + +static int logfd = -1; + +/* + * List of netlink message types sent to us as part of bootstrapping the init. + * These constants are defined in libcontainer/message_linux.go. 
+ */ +#define INIT_MSG 62000 +#define CLONE_FLAGS_ATTR 27281 +#define NS_PATHS_ATTR 27282 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 +#define SETGROUP_ATTR 27285 +#define OOM_SCORE_ADJ_ATTR 27286 +#define ROOTLESS_EUID_ATTR 27287 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 +#define PREP_ROOTFS_ATTR 27290 +#define MAKE_PARENT_PRIV_ATTR 27291 +#define ROOTFS_PROP_ATTR 27292 +#define ROOTFS_ATTR 27293 +#define PARENT_MOUNT_ATTR 27294 +#define SHIFTFS_MOUNTS_ATTR 27295 + +/* + * Use the raw syscall for versions of glibc which don't include a function for + * it, namely (glibc 2.12). + */ +#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif + +#ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +#endif + +int setns(int fd, int nstype) +{ + return syscall(SYS_setns, fd, nstype); +} +#endif + +static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) +{ + char message[1024] = {}; + + va_list args; + + if (logfd < 0 || level == NULL) + return; + + va_start(args, format); + if (vsnprintf(message, sizeof(message), format, args) < 0) + goto done; + + dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message); +done: + va_end(args); +} + +#define write_log(level, fmt, ...) \ + write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) + +/* XXX: This is ugly. */ +static int syncfd = -1; + +#define bail(fmt, ...) \ + do { \ + write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ + exit(1); \ + } while(0) + +static int write_file(char *data, size_t data_len, char *pathfmt, ...) 
+{ + int fd, len, ret = 0; + char path[PATH_MAX]; + + va_list ap; + va_start(ap, pathfmt); + len = vsnprintf(path, PATH_MAX, pathfmt, ap); + va_end(ap); + if (len < 0) + return -1; + + fd = open(path, O_RDWR); + if (fd < 0) { + return -1; + } + + len = write(fd, data, data_len); + if (len != data_len) { + ret = -1; + goto out; + } + + out: + close(fd); + return ret; +} + +enum policy_t { + SETGROUPS_DEFAULT = 0, + SETGROUPS_ALLOW, + SETGROUPS_DENY, +}; + +/* This *must* be called before we touch gid_map. */ +static void update_setgroups(int pid, enum policy_t setgroup) +{ + char *policy; + + switch (setgroup) { + case SETGROUPS_ALLOW: + policy = "allow"; + break; + case SETGROUPS_DENY: + policy = "deny"; + break; + case SETGROUPS_DEFAULT: + default: + /* Nothing to do. */ + return; + } + + if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) { + /* + * If the kernel is too old to support /proc/pid/setgroups, + * open(2) or write(2) will return ENOENT. This is fine. + */ + if (errno != ENOENT) + bail("failed to write '%s' to /proc/%d/setgroups", policy, pid); + } +} + +/* + * In sysbox-runc, nsexec must not set the user-ns ID mappings as + * otherwise the container's init process looses permissions required + * to setup the container's context (e.g., mounts, etc). Instead, the + * ID mappings are setup later in the sysbox-runc Go runtime that + * performs the container's rootfs setup. + */ + +static int try_mapping_tool(const char *app, int pid, char *map, size_t map_len) +{ + int child; + + /* + * If @app is NULL, execve will segfault. Just check it here and bail (if + * we're in this path, the caller is already getting desperate and there + * isn't a backup to this failing). This usually would be a configuration + * or programming issue. 
+ */ + if (!app) + bail("mapping tool not present"); + + child = fork(); + if (child < 0) + bail("failed to fork"); + + if (!child) { +#define MAX_ARGV 20 + char *argv[MAX_ARGV]; + char *envp[] = { NULL }; + char pid_fmt[16]; + int argc = 0; + char *next; + + snprintf(pid_fmt, 16, "%d", pid); + + argv[argc++] = (char *)app; + argv[argc++] = pid_fmt; + /* + * Convert the map string into a list of argument that + * newuidmap/newgidmap can understand. + */ + + while (argc < MAX_ARGV) { + if (*map == '\0') { + argv[argc++] = NULL; + break; + } + argv[argc++] = map; + next = strpbrk(map, "\n "); + if (next == NULL) + break; + *next++ = '\0'; + map = next + strspn(next, "\n "); + } + + execve(app, argv, envp); + bail("failed to execv"); + } else { + int status; + + while (true) { + if (waitpid(child, &status, 0) < 0) { + if (errno == EINTR) + continue; + bail("failed to waitpid"); + } + if (WIFEXITED(status) || WIFSIGNALED(status)) + return WEXITSTATUS(status); + } + } + + return -1; +} + +static void update_uidmap(const char *path, int pid, char *map, size_t map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/uid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newuid map on %d", pid); + } +} + +static void update_gidmap(const char *path, int pid, char *map, size_t map_len) +{ + if (map == NULL || map_len <= 0) + return; + + if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { + if (errno != EPERM) + bail("failed to update /proc/%d/gid_map", pid); + if (try_mapping_tool(path, pid, map, map_len)) + bail("failed to use newgid map on %d", pid); + } +} + +static void update_oom_score_adj(char *data, size_t len) +{ + if (data == NULL || len <= 0) + return; + + if (write_file(data, len, "/proc/self/oom_score_adj") < 0) + bail("failed to update /proc/self/oom_score_adj"); +} + +/* A dummy function that just 
jumps to the given jumpval. */ +static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) +{ + struct clone_t *ca = (struct clone_t *)arg; + longjmp(*ca->env, ca->jmpval); +} + +static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) +{ + struct clone_t ca = { + .env = env, + .jmpval = jmpval, + }; + + return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca); +} + +/* + * Gets the init pipe fd from the environment, which is used to read the + * bootstrap data and tell the parent what the new pid is after we finish + * setting up the environment. + */ +static int initpipe(void) +{ + int pipenum; + char *initpipe, *endptr; + + initpipe = getenv("_LIBCONTAINER_INITPIPE"); + if (initpipe == NULL || *initpipe == '\0') + return -1; + + pipenum = strtol(initpipe, &endptr, 10); + if (*endptr != '\0') + bail("unable to parse _LIBCONTAINER_INITPIPE"); + + return pipenum; +} + +static void setup_logpipe(void) +{ + char *logpipe, *endptr; + + logpipe = getenv("_LIBCONTAINER_LOGPIPE"); + if (logpipe == NULL || *logpipe == '\0') { + return; + } + + logfd = strtol(logpipe, &endptr, 10); + if (logpipe == endptr || *endptr != '\0') { + fprintf(stderr, "unable to parse _LIBCONTAINER_LOGPIPE, value: %s\n", logpipe); + /* It is too early to use bail */ + exit(1); + } +} + +/* Returns the clone(2) flag for a namespace, given the name of a namespace. */ +static int nsflag(char *name) +{ + if (!strcmp(name, "cgroup")) + return CLONE_NEWCGROUP; + else if (!strcmp(name, "ipc")) + return CLONE_NEWIPC; + else if (!strcmp(name, "mnt")) + return CLONE_NEWNS; + else if (!strcmp(name, "net")) + return CLONE_NEWNET; + else if (!strcmp(name, "pid")) + return CLONE_NEWPID; + else if (!strcmp(name, "user")) + return CLONE_NEWUSER; + else if (!strcmp(name, "uts")) + return CLONE_NEWUTS; + + /* If we don't recognise a name, fallback to 0. 
*/ + return 0; +} + +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +static uint8_t readint8(char *buf) +{ + return *(uint8_t *) buf; +} + +static void nl_parse(int fd, struct nlconfig_t *config) +{ + size_t len, size; + struct nlmsghdr hdr; + char *data, *current; + + /* Retrieve the netlink header. */ + len = read(fd, &hdr, NLMSG_HDRLEN); + if (len != NLMSG_HDRLEN) + bail("invalid netlink header length %zu", len); + + if (hdr.nlmsg_type == NLMSG_ERROR) + bail("failed to read netlink message"); + + if (hdr.nlmsg_type != INIT_MSG) + bail("unexpected msg type %d", hdr.nlmsg_type); + + /* Retrieve data. */ + size = NLMSG_PAYLOAD(&hdr, 0); + current = data = malloc(size); + if (!data) + bail("failed to allocate %zu bytes of memory for nl_payload", size); + + len = read(fd, data, size); + if (len != size) + bail("failed to read netlink payload, %zu != %zu", len, size); + + /* Parse the netlink payload. */ + config->data = data; + while (current < data + size) { + struct nlattr *nlattr = (struct nlattr *)current; + size_t payload_len = nlattr->nla_len - NLA_HDRLEN; + + /* Advance to payload. */ + current += NLA_HDRLEN; + + /* Handle payload. 
*/ + switch (nlattr->nla_type) { + case CLONE_FLAGS_ATTR: + config->cloneflags = readint32(current); + break; + case ROOTLESS_EUID_ATTR: + config->is_rootless_euid = readint8(current); /* boolean */ + break; + case OOM_SCORE_ADJ_ATTR: + config->oom_score_adj = current; + config->oom_score_adj_len = payload_len; + break; + case NS_PATHS_ATTR: + config->namespaces = current; + config->namespaces_len = payload_len; + break; + case UIDMAP_ATTR: + config->uidmap = current; + config->uidmap_len = payload_len; + break; + case GIDMAP_ATTR: + config->gidmap = current; + config->gidmap_len = payload_len; + break; + case UIDMAPPATH_ATTR: + config->uidmappath = current; + config->uidmappath_len = payload_len; + break; + case GIDMAPPATH_ATTR: + config->gidmappath = current; + config->gidmappath_len = payload_len; + break; + case SETGROUP_ATTR: + config->is_setgroup = readint8(current); + break; + + /* sysbox-runc */ + case PREP_ROOTFS_ATTR: + config->prep_rootfs = readint8(current); + break; + case MAKE_PARENT_PRIV_ATTR: + config->make_parent_priv = readint8(current); + break; + case ROOTFS_PROP_ATTR: + config->rootfs_prop = readint32(current); + break; + case ROOTFS_ATTR: + config->rootfs = current; + config->rootfs_len = payload_len; + break; + case PARENT_MOUNT_ATTR: + config->parent_mount = current; + config->parent_mount_len = payload_len; + break; + case SHIFTFS_MOUNTS_ATTR: + config->shiftfs_mounts = current; + config->shiftfs_mounts_len = payload_len; + break; + default: + bail("unknown netlink message type %d", nlattr->nla_type); + } + + current += NLA_ALIGN(payload_len); + } +} + +void nl_free(struct nlconfig_t *config) +{ + free(config->data); +} + +void join_namespaces(char *nslist) +{ + int num = 0, i; + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); + struct namespace_t { + int fd; + int ns; + char type[PATH_MAX]; + char path[PATH_MAX]; + } *namespaces = NULL; + + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths 
are empty"); + + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); + if (!namespaces) + bail("failed to reallocate namespace array"); + ns = &namespaces[num - 1]; + + /* Split 'ns:path'. */ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", path); + + ns->fd = fd; + ns->ns = nsflag(namespace); + strncpy(ns->path, path, PATH_MAX - 1); + ns->path[PATH_MAX - 1] = '\0'; + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + + /* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ + + for (i = 0; i < num; i++) { + struct namespace_t ns = namespaces[i]; + + if (setns(ns.fd, ns.ns) < 0) + bail("failed to setns to %s", ns.path); + + close(ns.fd); + } + + free(namespaces); +} + +int mount_shiftfs(struct nlconfig_t *config) { + char *saveptr = NULL; + char *mntlist = config->shiftfs_mounts; + char *mntpath = strtok_r(mntlist, ",", &saveptr); + + if (!mntpath || !strlen(mntpath) || !strlen(mntlist)) + return 0; + + do { + // For shiftfs mounts over the container's rootfs, we use "." (cwd) + // instead of the mount path because the container may no longer have + // search permissions into the full path of the rootfs (i.e., may have + // lost permissions when it entered the user-ns). Note that by design, the + // nsenter process' cwd is the container's rootfs. 
+ + if (strcmp(mntpath, config->rootfs) == 0) { + if (mount(".", ".", "shiftfs", 0, "") < 0) + return -1; + } else { + if (mount(mntpath, mntpath, "shiftfs", 0, "") < 0) + return -1; + } + + } while ((mntpath = strtok_r(NULL, ",", &saveptr)) != NULL); + + return 0; +} + +/* Defined in cloned_binary.c. */ +extern int ensure_cloned_binary(void); + +void nsexec(void) +{ + int pipenum; + jmp_buf env; + int sync_child_pipe[2], sync_grandchild_pipe[2]; + struct nlconfig_t config = { 0 }; + + /* + * Setup a pipe to send logs to the parent. This should happen + * first, because bail will use that pipe. + */ + setup_logpipe(); + + /* + * If we don't have an init pipe, just return to the go routine. + * We'll only get an init pipe for start or exec. + */ + pipenum = initpipe(); + if (pipenum == -1) + return; + + /* + * We need to re-exec if we are not in a cloned binary. This is necessary + * to ensure that containers won't be able to access the host binary + * through /proc/self/exe. See CVE-2019-5736. + */ + if (ensure_cloned_binary() < 0) + bail("could not ensure we are a cloned binary"); + + write_log(DEBUG, "nsexec started"); + + /* Parse all of the netlink configuration. */ + nl_parse(pipenum, &config); + + /* Set oom_score_adj. This has to be done before !dumpable because + * /proc/self/oom_score_adj is not writeable unless you're an privileged + * user (if !dumpable is set). All children inherit their parent's + * oom_score_adj value on fork(2) so this will always be propagated + * properly. + */ + + /* sysbox-runc: initially set oom_score_adj to "-999" for the + * container's init process. It will later be increased to the + * configured value. The goal here is to allow child processes to + * decrease their oom_score down to "-999", yet have the init + * process start with it's configured oom score adjustment. See + * sysbox issue #381. 
+ */ + update_oom_score_adj("-999", 4); + + /* + * Make the process non-dumpable, to avoid various race conditions that + * could cause processes in namespaces we're joining to access host + * resources (or potentially execute code). + * + * However, if the number of namespaces we are joining is 0, we are not + * going to be switching to a different security context. Thus setting + * ourselves to be non-dumpable only breaks things (like rootless + * containers), which is the recommendation from the kernel folks. + */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as non-dumpable"); + } + + /* Pipe so we can tell the child when we've finished setting up. */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) + bail("failed to setup sync pipe between parent and child"); + + /* + * We need a new socketpair to sync with grandchild so we don't have + * race condition with child. + */ + if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_grandchild_pipe) < 0) + bail("failed to setup sync pipe between parent and grandchild"); + + /* TODO: Currently we aren't dealing with child deaths properly. */ + + /* + * Okay, so this is quite annoying. + * + * In order for this unsharing code to be more extensible we need to split + * up unshare(CLONE_NEWUSER) and clone() in various ways. The ideal case + * would be if we did clone(CLONE_NEWUSER) and the other namespaces + * separately, but because of SELinux issues we cannot really do that. But + * we cannot just dump the namespace flags into clone(...) because several + * usecases (such as rootless containers) require more granularity around + * the namespace setup. In addition, some older kernels had issues where + * CLONE_NEWUSER wasn't handled before other namespaces (but we cannot + * handle this while also dealing with SELinux so we choose SELinux support + * over broken kernel support). 
+ * + * However, if we unshare(2) the user namespace *before* we clone(2), then + * all hell breaks loose. + * + * The parent no longer has permissions to do many things (unshare(2) drops + * all capabilities in your old namespace), and the container cannot be set + * up to have more than one {uid,gid} mapping. This is obviously less than + * ideal. In order to fix this, we have to first clone(2) and then unshare. + * + * Unfortunately, it's not as simple as that. We have to fork to enter the + * PID namespace (the PID namespace only applies to children). Since we'll + * have to double-fork, this clone_parent() call won't be able to get the + * PID of the _actual_ init process (without doing more synchronisation than + * I can deal with at the moment). So we'll just get the parent to send it + * for us, the only job of this process is to update + * /proc/pid/{setgroups,uid_map,gid_map}. + * + * And as a result of the above, we also need to setns(2) in the first child + * because if we join a PID namespace in the topmost parent then our child + * will be in that namespace (and it will not be able to give us a PID value + * that makes sense without resorting to sending things with cmsg). + * + * This also deals with an older issue caused by dumping cloneflags into + * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so + * we have to unshare(2) before clone(2) in order to do this. This was fixed + * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was + * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're + * aware, the last mainline kernel which had this bug was Linux 3.12. + * However, we cannot comment on which kernels the broken patch was + * backported to. + * + * -- Aleksa "what has my life come to?" Sarai + */ + + switch (setjmp(env)) { + /* + * Stage 0: We're in the parent. Our job is just to create a new child + * (stage 1: JUMP_CHILD) process and write its uid_map and + * gid_map. 
+		 * The parent only returns when both the child and the
+		 * grandchild are ready, so we can receive all possible
+		 * error codes generated by the children.
*/ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(first_child, SIGKILL); + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); + } + + /* Send the init_func pid back to our parent. + * + * Send the init_func pid and the pid of the first child back to our parent. + * We need to send both back because we can't reap the first child we created (CLONE_PARENT). + * It becomes the responsibility of our parent to reap the first child. + */ + len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); + if (len < 0) { + kill(child, SIGKILL); + bail("unable to generate JSON for child pid"); + } + } + break; + case SYNC_CHILD_READY: + ready = true; + break; + default: + bail("unexpected sync value: %u", s); + } + } + + /* Now sync with grandchild. */ + + syncfd = sync_grandchild_pipe[1]; + close(sync_grandchild_pipe[0]); + + ready = false; + while (!ready) { + enum sync_t s; + + s = SYNC_GRANDCHILD; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with child: write(SYNC_GRANDCHILD)"); + } + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with child: next state"); + + switch (s) { + case SYNC_CHILD_READY: + ready = true; + break; + default: + bail("unexpected sync value: %u", s); + } + } + + exit(0); + } + + /* + * Stage 1: We're in the first child process. Our job is to join any + * provided namespaces in the netlink payload and unshare all + * of the requested namespaces. If we've been asked to + * CLONE_NEWUSER, we will ask our parent (stage 0) to set up + * our user mappings for us. Then, we create a new child + * (stage 2: JUMP_INIT) for PID namespace. We then send the + * child's PID to our parent (stage 0). 
+ */ + case JUMP_CHILD:{ + pid_t child; + enum sync_t s; + bool new_userns = false; + bool make_parent_priv_done = false; + bool shiftfs_mounts_done = false; + + /* We're in a child and thus need to tell the parent if we die. */ + syncfd = sync_child_pipe[0]; + close(sync_child_pipe[1]); + + /* For debugging. */ + prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); + + /* + * We need to setns first. We cannot do this earlier (in stage 0) + * because of the fact that we forked to get here (the PID of + * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * using cmsg(3) but that's just annoying. + */ + if (config.namespaces) + join_namespaces(config.namespaces); + + /* + * Deal with user namespaces first. They are quite special, as they + * affect our ability to unshare other namespaces and are used as + * context for privilege checks. + * + * We don't unshare all namespaces in one go. The reason for this + * is that, while the kernel documentation may claim otherwise, + * there are certain cases where unsharing all namespaces at once + * will result in namespace objects being owned incorrectly. + * Ideally we should just fix these kernel bugs, but it's better to + * be safe than sorry, and fix them separately. + * + * A specific case of this is that the SELinux label of the + * internal kern-mount that mqueue uses will be incorrect if the + * UTS namespace is cloned before the USER namespace is mapped. + * I've also heard of similar problems with the network namespace + * in some scenarios. This also mirrors how LXC deals with this + * problem. + */ + if (config.cloneflags & CLONE_NEWUSER) { + if (unshare(CLONE_NEWUSER) < 0) + bail("failed to unshare user namespace"); + + config.cloneflags &= ~CLONE_NEWUSER; + new_userns = true; + } + + /* + * Unshare the mount ns before preparing the rootfs (next + * step). 
+	// *before* uid(gid) mappings for the container's user-ns are set, as
+	// otherwise we may lose permission to perform the mounts (i.e., the
+	// bind mount sources may no longer be accessible once the user-ns
+	// mappings are configured).
+			if (config.namespaces) {
+				if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
+					bail("failed to set process as non-dumpable");
+			}
+ * + * We fork again because of PID namespace, setns(2) or unshare(2) don't + * change the PID namespace of the calling process, because doing so + * would change the caller's idea of its own PID (as reported by getpid()), + * which would break many applications and libraries, so we must fork + * to actually enter the new PID namespace. + */ + child = clone_parent(&env, JUMP_INIT); + if (child < 0) + bail("unable to fork: init_func"); + + /* Send the child to our parent, which knows what it's doing. */ + s = SYNC_RECVPID_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); + } + if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(childpid)"); + } + + /* ... wait for parent to get the pid ... */ + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); + } + if (s != SYNC_RECVPID_ACK) { + kill(child, SIGKILL); + bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); + } + + s = SYNC_CHILD_READY; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + kill(child, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + } + + /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + exit(0); + } + + /* + * Stage 2: We're the final child process, and the only process that will + * actually return to the Go runtime. Our job is to just do the + * final cleanup steps and then return to the Go runtime to allow + * init_linux.go to run. + */ + case JUMP_INIT:{ + /* + * We're inside the child now, having jumped from the + * start_child() code after forking in the parent. + */ + enum sync_t s; + + /* We're in a child and thus need to tell the parent if we die. 
+		if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
+			bail("failed to set process as non-dumpable");
+			bail("failed to sync with parent: write(SYNC_CHILD_READY)");
+ Cwd string + + // Stdin is a pointer to a reader which provides the standard input stream. + Stdin io.Reader + + // Stdout is a pointer to a writer which receives the standard output stream. + Stdout io.Writer + + // Stderr is a pointer to a writer which receives the standard error stream. + Stderr io.Writer + + // ExtraFiles specifies additional open files to be inherited by the container + ExtraFiles []*os.File + + // Initial sizings for the console + ConsoleWidth uint16 + ConsoleHeight uint16 + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *configs.Capabilities + + // AppArmorProfile specifies the profile to apply to the process and is + // changed at the time the process is execed + AppArmorProfile string + + // Label specifies the label to apply to the process. It is commonly used by selinux + Label string + + // NoNewPrivileges controls whether processes can gain additional privileges. + NoNewPrivileges *bool + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []configs.Rlimit + + // ConsoleSocket provides the masterfd console. + ConsoleSocket *os.File + + // Init specifies whether the process is the first process in the container. + Init bool + + ops processOperations + + LogLevel string +} + +// Wait waits for the process to exit. +// Wait releases any resources associated with the Process +func (p Process) Wait() (*os.ProcessState, error) { + if p.ops == nil { + return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.wait() +} + +// Pid returns the process ID +func (p Process) Pid() (int, error) { + // math.MinInt32 is returned here, because it's invalid value + // for the kill() system call. 
+ if p.ops == nil { + return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.pid(), nil +} + +// Signal sends a signal to the Process. +func (p Process) Signal(sig os.Signal) error { + if p.ops == nil { + return newGenericError(fmt.Errorf("invalid process"), NoProcessOps) + } + return p.ops.signal(sig) +} + +// IO holds the process's STDIO +type IO struct { + Stdin io.WriteCloser + Stdout io.ReadCloser + Stderr io.ReadCloser +} diff --git a/sysbox-runc/libcontainer/process_linux.go b/sysbox-runc/libcontainer/process_linux.go new file mode 100644 index 00000000..cf6034a3 --- /dev/null +++ b/sysbox-runc/libcontainer/process_linux.go @@ -0,0 +1,860 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "encoding/json" + "errors" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/libcontainer/logs" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runc/libsysbox/sysbox" + + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + + "golang.org/x/sys/unix" +) + +// Synchronisation value for cgroup namespace setup. +// The same constant is defined in nsexec.c as "CREATECGROUPNS". +const createCgroupns = 0x80 + +type parentProcess interface { + // pid returns the pid for the running process. + pid() int + + // start starts the process execution. + start() error + + // send a SIGKILL to the process and wait for the exit. + terminate() error + + // wait waits on the process returning the process state. + wait() (*os.ProcessState, error) + + // startTime returns the process start time. 
+ startTime() (uint64, error) + + signal(os.Signal) error + + externalDescriptors() []string + + setExternalDescriptors(fds []string) + + forwardChildLogs() +} + +type filePair struct { + parent *os.File + child *os.File +} + +type setnsProcess struct { + cmd *exec.Cmd + messageSockPair filePair + logFilePair filePair + cgroupPaths map[string]string + rootlessCgroups bool + intelRdtPath string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader + initProcessPid int + container *linuxContainer +} + +func (p *setnsProcess) startTime() (uint64, error) { + stat, err := system.Stat(p.pid()) + return stat.StartTime, err +} + +func (p *setnsProcess) signal(sig os.Signal) error { + s, ok := sig.(unix.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return unix.Kill(p.pid(), s) +} + +func (p *setnsProcess) start() (retErr error) { + defer p.messageSockPair.parent.Close() + err := p.cmd.Start() + // close the write-side of the pipes (controlled by child) + p.messageSockPair.child.Close() + p.logFilePair.child.Close() + if err != nil { + return newSystemErrorWithCause(err, "starting setns process") + } + defer func() { + if retErr != nil { + err := ignoreTerminateErrors(p.terminate()) + if err != nil { + logrus.WithError(err).Warn("unable to terminate setnsProcess") + } + } + }() + if p.bootstrapData != nil { + if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + } + if err := p.execSetns(); err != nil { + return newSystemErrorWithCause(err, "executing setns process") + } + if len(p.cgroupPaths) > 0 { + if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups { + // On cgroup v2 + nesting + domain controllers, EnterPid may fail with EBUSY. + // https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643 + // Try to join the cgroup of InitProcessPid. 
+ if cgroups.IsCgroup2UnifiedMode() { + initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid) + initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile) + if initCgErr == nil { + if initCgPath, ok := initCg[""]; ok { + initCgDirpath := filepath.Join(fs2.UnifiedMountpoint, initCgPath) + logrus.Debugf("adding pid %d to cgroups %v failed (%v), attempting to join %q (obtained from %s)", + p.pid(), p.cgroupPaths, err, initCg, initCgDirpath) + // NOTE: initCgDirPath is not guaranteed to exist because we didn't pause the container. + err = cgroups.WriteCgroupProc(initCgDirpath, p.pid()) + } + } + } + if err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) + } + } + } + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) + if err == nil { + if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + } + } + } + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for process") + } + if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil { + return newSystemErrorWithCause(err, "writing config to pipe") + } + + ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + switch sync.Type { + case procReady: + // This shouldn't happen. + panic("unexpected procReady in setns") + + case procHooks: + // This shouldn't happen. + panic("unexpected procHooks in setns") + + case reqOp: + // This shouldn't happen. 
+ panic("unexpected reqOp in setns") + + case procFd: + if err := writeSync(p.messageSockPair.parent, sendFd); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'sendFd'") + } + fd, err := recvSeccompFd(p.messageSockPair.parent) + if err != nil { + return newSystemErrorWithCause(err, "receiving seccomp fd") + } + if err := p.container.procSeccompInit(p.pid(), fd); err != nil { + return newSystemErrorWithCausef(err, "processing seccomp fd") + } + if err := writeSync(p.messageSockPair.parent, procFdDone); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'procFdDone'") + } + + default: + return newSystemError(errors.New("invalid JSON payload from child")) + } + return nil + }) + + if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "calling shutdown on init pipe") + } + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + return nil +} + +// execSetns runs the process that executes C code to perform the setns calls +// because setns support requires the C process to fork off a child and perform the setns +// before the go runtime boots, we wait on the process to die and receive the child's pid +// over the provided pipe. +func (p *setnsProcess) execSetns() error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "waiting on setns process to finish") + } + if !status.Success() { + p.cmd.Wait() + return newSystemError(&exec.ExitError{ProcessState: status}) + } + var pid *pid + if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + p.cmd.Wait() + return newSystemErrorWithCause(err, "reading pid from init pipe") + } + + // Clean up the zombie parent process + // On Unix systems FindProcess always succeeds. 
+ firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + + process, err := os.FindProcess(pid.Pid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +// terminate sends a SIGKILL to the forked process for the setns routine then waits to +// avoid the process becoming a zombie. +func (p *setnsProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *setnsProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + + // Return actual ProcessState even on Wait error + return p.cmd.ProcessState, err +} + +func (p *setnsProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *setnsProcess) externalDescriptors() []string { + return p.fds +} + +func (p *setnsProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *setnsProcess) forwardChildLogs() { + go logs.ForwardLogs(p.logFilePair.parent) +} + +type initProcess struct { + cmd *exec.Cmd + messageSockPair filePair + logFilePair filePair + config *initConfig + manager cgroups.Manager + intelRdtManager intelrdt.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool +} + +func (p *initProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *initProcess) externalDescriptors() []string { + return p.fds +} + +// getChildPid receives the final child's pid over the provided pipe. +func (p *initProcess) getChildPid() (int, error) { + var pid pid + if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil { + p.cmd.Wait() + return -1, err + } + + // Clean up the zombie parent process + // On Unix systems FindProcess always succeeds. 
+ firstChildProcess, _ := os.FindProcess(pid.PidFirstChild) + + // Ignore the error in case the child has already been reaped for any reason + _, _ = firstChildProcess.Wait() + + return pid.Pid, nil +} + +func (p *initProcess) waitForChildExit(childPid int) error { + status, err := p.cmd.Process.Wait() + if err != nil { + p.cmd.Wait() + return err + } + if !status.Success() { + p.cmd.Wait() + return &exec.ExitError{ProcessState: status} + } + + process, err := os.FindProcess(childPid) + if err != nil { + return err + } + p.cmd.Process = process + p.process.ops = p + return nil +} + +func (p *initProcess) start() (retErr error) { + defer p.messageSockPair.parent.Close() + err := p.cmd.Start() + p.process.ops = p + // close the write-side of the pipes (controlled by child) + p.messageSockPair.child.Close() + p.logFilePair.child.Close() + if err != nil { + p.process.ops = nil + return newSystemErrorWithCause(err, "starting init process command") + } + defer func() { + if retErr != nil { + // terminate the process to ensure we can remove cgroups + if err := ignoreTerminateErrors(p.terminate()); err != nil { + logrus.WithError(err).Warn("unable to terminate initProcess") + } + + p.manager.Destroy() + if p.intelRdtManager != nil { + p.intelRdtManager.Destroy() + } + } + }() + + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case. + if err := p.manager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + + // sysbox-runc: set the cgroup resources before creating a child cgroup for + // the system container's cgroup root. This way the child cgroup will inherit + // the cgroup resources. Also, do this before the prestart hook so that the + // prestart hook may apply cgroup permissions. 
+ if err := p.manager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting cgroup config for ready process") + } + + // sysbox-runc: create a child cgroup that will serve as the system container's + // cgroup root. + cgType := p.manager.GetType() + + if cgType == cgroups.Cgroup_v1_fs || cgType == cgroups.Cgroup_v1_systemd { + if err := p.manager.CreateChildCgroup(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "creating container child cgroup") + } + } + + if err := p.setupDevSubdir(); err != nil { + return newSystemErrorWithCause(err, "setup up dev subdir under rootfs") + } + + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } + if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil { + return newSystemErrorWithCause(err, "copying bootstrap data to pipe") + } + + childPid, err := p.getChildPid() + if err != nil { + return newSystemErrorWithCause(err, "getting the final child's pid from pipe") + } + + // Save the standard descriptor names before the container process + // can potentially move them (e.g., via dup2()). If we don't do this now, + // we won't know at checkpoint time which file descriptor to look up. + fds, err := getPipeFds(childPid) + if err != nil { + return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid) + } + p.setExternalDescriptors(fds) + + // sysbox-runc: place the system container's init process in the child cgroup. 
Do + // this before syncing with child so that no children can escape the cgroup + if cgType == cgroups.Cgroup_v1_fs || cgType == cgroups.Cgroup_v1_systemd { + if err := p.manager.ApplyChildCgroup(childPid); err != nil { + return newSystemErrorWithCause(err, "applying cgroup configuration for process") + } + } + + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(childPid); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } + + // Now it's time to setup cgroup namespace + if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" { + if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil { + return newSystemErrorWithCause(err, "sending synchronization value to init process") + } + } + + // Wait for our first child to exit + if err := p.waitForChildExit(childPid); err != nil { + return newSystemErrorWithCause(err, "waiting for our first child to exit") + } + + if err := p.createNetworkInterfaces(); err != nil { + return newSystemErrorWithCause(err, "creating network interfaces") + } + + if err := p.updateSpecState(); err != nil { + return newSystemErrorWithCause(err, "updating the spec state") + } + + if err := p.sendConfig(); err != nil { + return newSystemErrorWithCause(err, "sending config to init process") + } + + var ( + sentRun bool + sentResume bool + ) + + ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { + switch sync.Type { + case procReady: + // set rlimits, this has to be done here because we lose permissions + // to raise the limits once we enter a user-namespace + if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { + return newSystemErrorWithCause(err, "setting rlimits for ready process") + } + // call prestart and CreateRuntime hooks + if !p.config.Config.Namespaces.Contains(configs.NEWNS) { + if p.intelRdtManager != nil { + if err := 
+				state, uerr := p.container.updateState(p)
+				if uerr != nil {
+					return newSystemErrorWithCause(uerr, "store init state")
+				}
+ if err = p.registerWithSysboxfs(childPid); err != nil { + return err + } + // Sync with child. + if err := writeSync(p.messageSockPair.parent, rootfsReadyAck); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'rootfsReadyAck'") + } + + case procHooks: + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process") + } + } + if p.config.Config.Hooks != nil { + s, err := p.container.currentOCIState() + if err != nil { + return err + } + // initProcessStartTime hasn't been set yet. + s.Pid = p.cmd.Process.Pid + s.Status = specs.StateCreating + hooks := p.config.Config.Hooks + + if err := hooks[configs.Prestart].RunHooks(s); err != nil { + return err + } + if err := hooks[configs.CreateRuntime].RunHooks(s); err != nil { + return err + } + } + // Sync with child. + if err := writeSync(p.messageSockPair.parent, procResume); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'resume'") + } + sentResume = true + + case reqOp: + var reqs []opReq + if err := writeSync(p.messageSockPair.parent, sendOpInfo); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'sendOpInfo'") + } + if err := json.NewDecoder(p.messageSockPair.parent).Decode(&reqs); err != nil { + return newSystemErrorWithCause(err, "receiving / decoding reqOp'") + } + if err := p.container.handleReqOp(childPid, reqs); err != nil { + return newSystemErrorWithCause(err, "handleReqOp") + } + if err := writeSync(p.messageSockPair.parent, opDone); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'opDone'") + } + + case procFd: + if err := writeSync(p.messageSockPair.parent, sendFd); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'sendFd'") + } + fd, err := recvSeccompFd(p.messageSockPair.parent) + if err != nil { + return newSystemErrorWithCause(err, "receiving seccomp fd") + } + if err := 
p.container.procSeccompInit(childPid, fd); err != nil { + return newSystemErrorWithCausef(err, "processing seccomp fd") + } + if err := writeSync(p.messageSockPair.parent, procFdDone); err != nil { + return newSystemErrorWithCause(err, "writing syncT 'procFdDone'") + } + + default: + return newSystemError(errors.New("invalid JSON payload from child")) + } + + return nil + }) + + if !sentRun { + return newSystemErrorWithCause(ierr, "container init") + } + if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume { + return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process")) + } + if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil { + return newSystemErrorWithCause(err, "shutting down init pipe") + } + + // Must be done after Shutdown so the child will exit and we can wait for it. + if ierr != nil { + p.wait() + return ierr + } + + return nil +} + +// sysbox-runc: register the container with sysbox-fs. This must be done after +// childPid is obtained and all container mounts are present, but before prestart +// hooks so that sysbox-fs is ready to respond by the time the hooks run. +func (p *initProcess) registerWithSysboxfs(childPid int) error { + + sysFs := p.container.sysbox.Fs + if !sysFs.Enabled() { + return nil + } + + c := p.container + + procRoPaths := []string{} + for _, p := range c.config.ReadonlyPaths { + if strings.HasPrefix(p, "/proc") { + procRoPaths = append(procRoPaths, p) + } + } + + procMaskPaths := []string{} + for _, p := range c.config.MaskPaths { + if strings.HasPrefix(p, "/proc") { + procMaskPaths = append(procMaskPaths, p) + } + } + + info := &sysbox.FsRegInfo{ + Hostname: c.config.Hostname, + Pid: childPid, + Uid: c.config.UidMappings[0].HostID, + Gid: c.config.GidMappings[0].HostID, + IdSize: c.config.UidMappings[0].Size, + ProcRoPaths: procRoPaths, + ProcMaskPaths: procMaskPaths, + } + + // Launch registration process. 
+ if err := sysFs.Register(info); err != nil { + return newSystemErrorWithCause(err, "registering with sysbox-fs") + } + + return nil +} + +func (p *initProcess) wait() (*os.ProcessState, error) { + err := p.cmd.Wait() + // we should kill all processes in cgroup when init is died if we use host PID namespace + if p.sharePidns { + signalAllProcesses(p.manager, unix.SIGKILL) + } + return p.cmd.ProcessState, err +} + +func (p *initProcess) terminate() error { + if p.cmd.Process == nil { + return nil + } + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *initProcess) startTime() (uint64, error) { + stat, err := system.Stat(p.pid()) + return stat.StartTime, err +} + +func (p *initProcess) updateSpecState() error { + s, err := p.container.currentOCIState() + if err != nil { + return err + } + + p.config.SpecState = s + return nil +} + +func (p *initProcess) sendConfig() error { + // send the config to the container's init process, we don't use JSON Encode + // here because there might be a problem in JSON decoder in some cases, see: + // https://github.com/docker/docker/issues/14203#issuecomment-174177790 + return utils.WriteJSON(p.messageSockPair.parent, p.config) +} + +func (p *initProcess) createNetworkInterfaces() error { + for _, config := range p.config.Config.Networks { + strategy, err := getStrategy(config.Type) + if err != nil { + return err + } + n := &network{ + Network: *config, + } + if err := strategy.create(n, p.pid()); err != nil { + return err + } + p.config.Networks = append(p.config.Networks, n) + } + return nil +} + +func (p *initProcess) signal(sig os.Signal) error { + s, ok := sig.(unix.Signal) + if !ok { + return errors.New("os: unsupported signal type") + } + return unix.Kill(p.pid(), s) +} + +func (p *initProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *initProcess) forwardChildLogs() { + go logs.ForwardLogs(p.logFilePair.parent) +} + +func (p 
*initProcess) setupDevSubdir() error { + + // sysbox-runc: create target dir for the sys container's "dev" + // mount. Normally this should be done by the container's init + // process, but we do it here to work-around a problem in which the + // container's init process must have a subdir under the rootfs + // that it can chdir into and back to the rootfs in order to "feel" + // the effect of mounts that it performs on the container's rootfs + // (e.g., shiftfs mounts). And without feeling the effect of those + // mounts it may not have permission to create the subdir itself. + // See function effectRootfsMount() in rootfs_linux.go. + // + // Note also that normally containers have the "dev" subdir, but in + // some cases (e.g., k8s "pause" container) they do not. + devSubdir := filepath.Join(p.config.Config.Rootfs, "dev") + + // The dir mode must match the corresponding mode in libsysbox/spec/spec.go. + // See that there is no need to chown() this dir to match the container's + // root uid & gid as we are expecting a tmpfs mount over this node to take + // care of that. + if err := os.MkdirAll(devSubdir, 0755); err != nil { + return newSystemErrorWithCause(err, "creating dev subdir under rootfs") + } + + return nil +} + +func getPipeFds(pid int) ([]string, error) { + fds := make([]string, 3) + + dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd") + for i := 0; i < 3; i++ { + // XXX: This breaks if the path is not a valid symlink (which can + // happen in certain particularly unlucky mount namespace setups). + f := filepath.Join(dirPath, strconv.Itoa(i)) + target, err := os.Readlink(f) + if err != nil { + // Ignore permission errors, for rootless containers and other + // non-dumpable processes. if we can't get the fd for a particular + // file, there's not much we can do. 
+ if os.IsPermission(err) { + continue + } + return fds, err + } + fds[i] = target + } + return fds, nil +} + +// InitializeIO creates pipes for use with the process's stdio and returns the +// opposite side for each. Do not use this if you want to have a pseudoterminal +// set up for you by libcontainer (TODO: fix that too). +// TODO: This is mostly unnecessary, and should be handled by clients. +func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) { + var fds []uintptr + i = &IO{} + // cleanup in case of an error + defer func() { + if err != nil { + for _, fd := range fds { + unix.Close(int(fd)) + } + } + }() + // STDIN + r, w, err := os.Pipe() + if err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdin, i.Stdin = r, w + // STDOUT + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdout, i.Stdout = w, r + // STDERR + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stderr, i.Stderr = w, r + // change ownership of the pipes in case we are in a user namespace + for _, fd := range fds { + if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil { + return nil, err + } + } + return i, nil +} + +// Receives a seccomp file descriptor from the given pipe using cmsg(3) +func recvSeccompFd(pipe *os.File) (int32, error) { + var msgs []syscall.SocketControlMessage + + socket := int(pipe.Fd()) + + buf := make([]byte, syscall.CmsgSpace(4)) + if _, _, _, _, err := syscall.Recvmsg(socket, nil, buf, 0); err != nil { + return -1, fmt.Errorf("recvmsg() failed: %s", err) + } + + msgs, err := syscall.ParseSocketControlMessage(buf) + if err != nil || len(msgs) != 1 { + return -1, fmt.Errorf("parsing socket control msg failed: %s", err) + } + + fd, err := syscall.ParseUnixRights(&msgs[0]) + if err != nil { + return -1, fmt.Errorf("parsing unix rights msg failed: %s", err) + } + + return int32(fd[0]), nil +} diff --git 
a/sysbox-runc/libcontainer/restored_process.go b/sysbox-runc/libcontainer/restored_process.go new file mode 100644 index 00000000..f861e82d --- /dev/null +++ b/sysbox-runc/libcontainer/restored_process.go @@ -0,0 +1,129 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "os/exec" + + "github.com/opencontainers/runc/libcontainer/system" +) + +func newRestoredProcess(cmd *exec.Cmd, fds []string) (*restoredProcess, error) { + var ( + err error + ) + pid := cmd.Process.Pid + stat, err := system.Stat(pid) + if err != nil { + return nil, err + } + return &restoredProcess{ + cmd: cmd, + processStartTime: stat.StartTime, + fds: fds, + }, nil +} + +type restoredProcess struct { + cmd *exec.Cmd + processStartTime uint64 + fds []string +} + +func (p *restoredProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *restoredProcess) pid() int { + return p.cmd.Process.Pid +} + +func (p *restoredProcess) terminate() error { + err := p.cmd.Process.Kill() + if _, werr := p.wait(); err == nil { + err = werr + } + return err +} + +func (p *restoredProcess) wait() (*os.ProcessState, error) { + // TODO: how do we wait on the actual process? + // maybe use --exec-cmd in criu + err := p.cmd.Wait() + if err != nil { + if _, ok := err.(*exec.ExitError); !ok { + return nil, err + } + } + st := p.cmd.ProcessState + return st, nil +} + +func (p *restoredProcess) startTime() (uint64, error) { + return p.processStartTime, nil +} + +func (p *restoredProcess) signal(s os.Signal) error { + return p.cmd.Process.Signal(s) +} + +func (p *restoredProcess) externalDescriptors() []string { + return p.fds +} + +func (p *restoredProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *restoredProcess) forwardChildLogs() { +} + +// nonChildProcess represents a process where the calling process is not +// the parent process. 
This process is created when a factory loads a container from +// a persisted state. +type nonChildProcess struct { + processPid int + processStartTime uint64 + fds []string +} + +func (p *nonChildProcess) start() error { + return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError) +} + +func (p *nonChildProcess) pid() int { + return p.processPid +} + +func (p *nonChildProcess) terminate() error { + return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError) +} + +func (p *nonChildProcess) wait() (*os.ProcessState, error) { + return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError) +} + +func (p *nonChildProcess) startTime() (uint64, error) { + return p.processStartTime, nil +} + +func (p *nonChildProcess) signal(s os.Signal) error { + proc, err := os.FindProcess(p.processPid) + if err != nil { + return err + } + return proc.Signal(s) +} + +func (p *nonChildProcess) externalDescriptors() []string { + return p.fds +} + +func (p *nonChildProcess) setExternalDescriptors(newFds []string) { + p.fds = newFds +} + +func (p *nonChildProcess) forwardChildLogs() { +} diff --git a/sysbox-runc/libcontainer/rootfs_init_linux.go b/sysbox-runc/libcontainer/rootfs_init_linux.go new file mode 100644 index 00000000..5c2ec05d --- /dev/null +++ b/sysbox-runc/libcontainer/rootfs_init_linux.go @@ -0,0 +1,479 @@ +package libcontainer + +import ( + "bytes" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "time" + + "github.com/Masterminds/semver" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/nestybox/sysbox-libs/idMap" + "github.com/nestybox/sysbox-libs/idShiftUtils" + mount "github.com/nestybox/sysbox-libs/mount" + overlayUtils "github.com/nestybox/sysbox-libs/overlayUtils" + utils 
"github.com/nestybox/sysbox-libs/utils" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" +) + +type linuxRootfsInit struct { + pipe *os.File + reqs []opReq +} + +// getDir returns the path to the directory that contains the file at the given path +func getDir(file string) (string, error) { + fi, err := os.Stat(file) + if err != nil { + return "", fmt.Errorf("stat %s: %v", file, err) + } + if !fi.IsDir() { + return filepath.Dir(file), nil + } else { + return file, nil + } +} + +// iptablesRestoreHasWait determines if the version of iptables-restore on the +// host has "--wait" option. +func iptablesRestoreHasWait() (bool, error) { + var cmd *exec.Cmd + + if _, err := os.Stat("/usr/sbin/iptables"); os.IsNotExist(err) { + cmd = exec.Command("/sbin/iptables", "--version") + } else { + cmd = exec.Command("/usr/sbin/iptables", "--version") + } + + bytes, err := cmd.CombinedOutput() + if err != nil { + return false, fmt.Errorf("failed to start %v: %s", cmd.Args, err) + } + + // output is "iptables "; we are looking for version >= v1.6.2 + output := strings.Fields(string(bytes)) + if len(output) < 2 { + return false, fmt.Errorf("failed to get iptables version: got %v", output) + } + + // The iptables "--wait" option shows up in v1.6.2 and above + // (see iptables commit 999eaa241212d3952ddff39a99d0d55a74e3639e on 03/16/2017) + + verStr := strings.TrimPrefix(output[1], "v") + + verConstraint, _ := semver.NewConstraint(">= 1.6.2") + + ver, err := semver.NewVersion(verStr) + if err != nil { + return false, fmt.Errorf("failed to parse iptables version: %s", err) + } + + return verConstraint.Check(ver), nil +} + +func doBindMount(rootfs string, m *configs.Mount) error { + + // sysbox-runc: For some reason, when the rootfs is on shiftfs, we need to do + // an Lstat() of the source path prior to doing the mount. Otherwise we get a + // "permission denied" error. It took me a while to figure this out. 
I found + // out by noticing that the mount utility (not the syscall) would not hit the + // permission error, and then did an strace of the syscalls being done by it, + // which led me to realize that the Lstat() was solving the problem. + + src := m.Source + if !m.BindSrcInfo.IsDir { + src = filepath.Dir(m.Source) + } + os.Lstat(src) + + // Bind-mount with procfd to mitigate symlink exchange attacks. + if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + if err := unix.Mount(m.Source, procfd, "", unix.MS_BIND|unix.MS_REC, ""); os.IsPermission(err) { + + // We've noticed that the lstat and/or mount syscall fails with EPERM when + // bind-mounting a source dir that is on a shiftfs mount on top of a tmpfs + // mount. For some reason the Linux "mount" command does not fail in this case, + // so let's try it. + cmd := exec.Command("/bin/mount", "--rbind", m.Source, procfd) + err := cmd.Run() + if err != nil { + realpath, _ := os.Readlink(procfd) + return fmt.Errorf("bind-mount of %s to %s failed: %v", m.Source, realpath, err) + } + } else if err != nil { + return err + } + return nil + }); err != nil { + return fmt.Errorf("bind mount through procfd of %s -> %s: %w", m.Source, m.Destination, err) + } + + if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + for _, pflag := range m.PropagationFlags { + if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil + }); err != nil { + return fmt.Errorf("change bind mount propagation through procfd: %w", err) + } + + return nil +} + +// Creates an alias for the Docker DNS via iptables. 
+func doDockerDnsSwitch(oldDns, newDns string) error { + var ( + cmdOut, cmdErr bytes.Buffer + cmd *exec.Cmd + ) + + // Get current iptables + if _, err := os.Stat("/usr/sbin/iptables-save"); os.IsNotExist(err) { + cmd = exec.Command("/sbin/iptables-save") + } else { + cmd = exec.Command("/usr/sbin/iptables-save") + } + + cmd.Stdout = &cmdOut + cmd.Stderr = &cmdErr + + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to start %v: %s", cmd.Args, err) + } + + // Create the alias for the Docker DNS (it's at oldDns (e.g., 127.0.0.11), + // but we will alias it to newDns (e.g., 172.20.0.1)). + // + // That is, inside the container, all processes will think the Docker DNS is + // at newDns, but iptables will send the packet to oldDns. Similarly, when + // oldDns responds, iptables will make it seem like newDns is responding. + + iptables := cmdOut.String() + + // All packets destined to oldDns now go to newDns + iptables = strings.Replace(iptables, fmt.Sprintf("-d %s", oldDns), fmt.Sprintf("-d %s", newDns), -1) + + // Source NATing from oldDns is now from newDns + iptables = strings.Replace(iptables, "--to-source :53", fmt.Sprintf("--to-source %s:53", newDns), -1) + + // Add pre-routing rule so that packets from inner containers go through DOCKER_OUTPUT rule (DNAT) + rule := fmt.Sprintf("-A OUTPUT -d %s/32 -j DOCKER_OUTPUT", newDns) + newRule := rule + "\n" + fmt.Sprintf("-A PREROUTING -d %s/32 -j DOCKER_OUTPUT", newDns) + iptables = strings.Replace(iptables, rule, newRule, 1) + + // Commit the changed iptables + // + // The iptables-restore command holds the xtables lock to ensure consistency + // in case multiple processes try to restore iptables concurrently. Recent + // versions of this command (e.g., iptables 1.8.3) support the "--wait" flag + // to deal with xtables lock contention. However, older versions (e.g., + // iptables 1.6.1) don't. For those older versions, we do the wait ourselves. 
+ + xtablesWait := 30 // wait up to 30 secs for the xtables lock + xtablesWaitInterval := 100000 // poll the lock every 100ms when waiting + + iptablesRestoreHasWait, err := iptablesRestoreHasWait() + if err != nil { + return err + } + + iptablesRestorePath := "/usr/sbin/iptables-restore" + if _, err = os.Stat(iptablesRestorePath); os.IsNotExist(err) { + iptablesRestorePath = "/sbin/iptables-restore" + } + + if iptablesRestoreHasWait { + + wait := strconv.Itoa(xtablesWait) + waitInterval := strconv.Itoa(xtablesWaitInterval) + + cmd = exec.Command(iptablesRestorePath, "--wait", wait, "--wait-interval", waitInterval) + cmd.Stdin = strings.NewReader(iptables) + + if err := cmd.Run(); err != nil { + return fmt.Errorf("failed to start %v: %s", cmd.Args, err) + } + + } else { + + // If we are here, iptables-restore is old and does not support concurrent + // accesses (does not have the "--wait") option. This means that if + // multiple processes do iptables-restore concurrently, the command may + // return exit status "4" (resource unavailable) (see iptables/include/xtables.h). + // Here we do our best to deal with this by retrying the operation whenever + // we get this error. + + var err error + + exitCodeResourceUnavailable := 4 + success := false + + for start := time.Now(); time.Since(start) < (time.Duration(xtablesWait) * time.Second); { + + cmd = exec.Command(iptablesRestorePath) + cmd.Stdin = strings.NewReader(iptables) + + err := cmd.Run() + if err == nil { + success = true + break + } + + if exitError, ok := err.(*exec.ExitError); ok { + exitCode := exitError.ExitCode() + if exitCode != exitCodeResourceUnavailable { + break + } + } + + time.Sleep(time.Duration(xtablesWaitInterval) * time.Microsecond) + } + + if !success { + return fmt.Errorf("failed to run %v: %s", cmd.Args, err) + } + } + + return nil +} + +// sysbox-runc: Init performs container's rootfs initialization actions from +// within specific container namespaces. 
By virtue of entering to an individual +// namespace (e.g. 'mount' or 'network' ns), Init has true root-level access to +// the host and thus can perform operations that the container's init process +// may not have permissions to do. +func (l *linuxRootfsInit) Init() error { + + if len(l.reqs) == 0 { + return newSystemError(fmt.Errorf("no op requests!")) + } + + // If multiple requests are passed in the slice, they must all be + // of the same type. + switch l.reqs[0].Op { + + case rootfsIDMap: + rootfs := l.reqs[0].Rootfs + uid := l.reqs[0].Uid + gid := l.reqs[0].Gid + + usernsPath := "/proc/1/ns/user" + + // Move current dir away from rootfs since we will remount it + if err := unix.Chdir("/"); err != nil { + return newSystemErrorWithCause(err, "chdir to /") + } + + // We are in the pid and mount ns of the container's init process; remount + // /proc so that it picks up this fact. + os.Lstat("/proc") + if err := unix.Mount("proc", "/proc", "proc", 0, ""); err != nil { + return newSystemErrorWithCause(err, "re-mounting procfs") + } + defer unix.Unmount("/proc", unix.MNT_DETACH) + + fsName, err := utils.GetFsName(rootfs) + if err != nil { + return err + } + + if fsName == "overlayfs" { + + // Get info about the ovfs mount (layers, mount opts, propagation, etc.) 
+ mounts, err := mount.GetMountsPid(uint32(os.Getpid())) + if err != nil { + return err + } + + mi, err := mount.GetMountAt(rootfs, mounts) + if err != nil { + return err + } + + ovfsMntOpts := overlayUtils.GetMountOpt(mi) + ovfsUpperLayer := overlayUtils.GetUpperLayer(ovfsMntOpts) + ovfsLowerLayers := overlayUtils.GetLowerLayers(ovfsMntOpts) + + // Remove the current overlayfs mount + if err := unix.Unmount(rootfs, unix.MNT_DETACH); err != nil { + return err + } + + // ID-map each of the ovfs lower layers + for _, layer := range ovfsLowerLayers { + if err := idMap.IDMapMount(usernsPath, layer, false); err != nil { + fsName, _ := utils.GetFsName(layer) + return newSystemErrorWithCausef(err, + "setting up ID-mapped mount on path %s (likely means idmapped mounts are not supported on the filesystem at this path (%s))", + layer, fsName) + } + } + + // The overlayfs upper layer can't be ID-mapped, so it needs to be chowned. + if err := idShiftUtils.ShiftIdsWithChown(ovfsUpperLayer, int32(uid), int32(gid)); err != nil { + return newSystemErrorWithCausef(err, "chown overlayfs upper layet at %s", ovfsUpperLayer) + } + + // Recreate the rootfs overlayfs mount (using the ID-mapped lower layers) + if err := unix.Mount("overlay", rootfs, "overlay", uintptr(ovfsMntOpts.Flags), ovfsMntOpts.Opts); err != nil { + return fmt.Errorf("failed to mount %s: %s", rootfs, err) + } + if err := unix.Mount("", rootfs, "", uintptr(ovfsMntOpts.PropFlags), ""); err != nil { + return fmt.Errorf("failed to set mount prop flags %s: %s", rootfs, err) + } + + } else { + if err := idMap.IDMapMount(usernsPath, rootfs, true); err != nil { + return newSystemErrorWithCausef(err, + "setting up ID-mapped mount on path %s (likely means idmapped mounts are not supported on the filesystem at this path (%s))", + rootfs, fsName) + } + } + + // ID-mapping by itself won't allow the container to write to "/"; must + // chown the rootfs dir so that it can write there. 
+ if err := unix.Chown(rootfs, uid, gid); err != nil { + return newSystemErrorWithCausef(err, "failed to chown %s to %v:%v", rootfs, uid, gid) + } + + case bind: + // The mount requests assume that the process cwd is the rootfs directory + rootfs := l.reqs[0].Rootfs + if err := unix.Chdir(rootfs); err != nil { + return newSystemErrorWithCausef(err, "chdir to rootfs %s", rootfs) + } + + // We are in the pid and mount ns of the container's init process; remount + // /proc so that it picks up this fact. + os.Lstat("/proc") + if err := unix.Mount("proc", "/proc", "proc", 0, ""); err != nil { + return newSystemErrorWithCause(err, "re-mounting procfs") + } + defer unix.Unmount("/proc", unix.MNT_DETACH) + + usernsPath := "/proc/1/ns/user" + fsuidMapFailOnErr := l.reqs[0].FsuidMapFailOnErr + + for _, req := range l.reqs { + + m := &req.Mount + mountLabel := req.Label + + if err := doBindMount(rootfs, m); err != nil { + return newSystemErrorWithCausef(err, "bind mounting %s to %s", m.Source, m.Destination) + } + + // The bind mount won't change mount options, we need remount to make mount options effective. 
+ // first check that we have non-default options required before attempting a remount + if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 { + // only remount if unique mount options are set + if err := remount(m); err != nil { + return newSystemErrorWithCausef(err, "remount of %s with flags %#x", + m.Destination, m.Flags) + } + } + + // Apply label + if m.Relabel != "" { + if err := label.Validate(m.Relabel); err != nil { + return newSystemErrorWithCausef(err, "validating label %s", m.Relabel) + } + shared := label.IsShared(m.Relabel) + if err := label.Relabel(m.Source, mountLabel, shared); err != nil { + return newSystemErrorWithCausef(err, "relabeling %s to %s", m.Source, mountLabel) + } + } + + // Set up the ID-mapping as needed + if m.IDMappedMount { + if err := libcontainerUtils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + if err := idMap.IDMapMount(usernsPath, procfd, true); err != nil { + fsName, _ := utils.GetFsName(procfd) + realpath, _ := os.Readlink(procfd) + + errMsg := fmt.Sprintf("setting up ID-mapped mount on path %s failed with %s "+ + "(likely means idmapped mounts are not supported on the filesystem at this path (%s))", + realpath, err, fsName) + + if fsuidMapFailOnErr { + return fmt.Errorf(errMsg) + } else { + logrus.Warnf(errMsg) + } + } + return nil + }); err != nil { + return newSystemErrorWithCausef(err, "ID-map mount on %s", m.Destination) + } + } + } + + case switchDockerDns: + oldDns := l.reqs[0].OldDns + newDns := l.reqs[0].NewDns + + if err := doDockerDnsSwitch(oldDns, newDns); err != nil { + return newSystemErrorWithCausef(err, "Docker DNS switch from %s to %s", oldDns, newDns) + } + + case chown: + rootfs := l.reqs[0].Rootfs + + for _, req := range l.reqs { + path, err := securejoin.SecureJoin(rootfs, req.Path) + if err != nil { + return newSystemErrorWithCausef(err, "secure join of %s and %s failed: %s", rootfs, req.Path, err) + } + + uid := req.Uid + gid := req.Gid + + if err := unix.Chown(path, uid, 
gid); err != nil { + return newSystemErrorWithCausef(err, "failed to chown %s to %v:%v", path, uid, gid) + } + } + + case mkdir: + rootfs := l.reqs[0].Rootfs + + for _, req := range l.reqs { + path, err := securejoin.SecureJoin(rootfs, req.Path) + if err != nil { + return newSystemErrorWithCausef(err, "secure join of %s and %s failed: %s", rootfs, req.Path, err) + } + + mode := req.Mode + uid := req.Uid + gid := req.Gid + + if err := os.MkdirAll(path, mode); err != nil { + return newSystemErrorWithCausef(err, "failed to mkdirall %s: %s", path, err) + } + if err := unix.Chown(path, uid, gid); err != nil { + return newSystemErrorWithCausef(err, "failed to chown %s to %v:%v", path, uid, gid) + } + + } + + default: + return newSystemError(fmt.Errorf("invalid init type")) + } + + if err := writeSync(l.pipe, opDone); err != nil { + return err + } + + l.pipe.Close() + return nil +} diff --git a/sysbox-runc/libcontainer/rootfs_linux.go b/sysbox-runc/libcontainer/rootfs_linux.go new file mode 100644 index 00000000..3a86c3ea --- /dev/null +++ b/sysbox-runc/libcontainer/rootfs_linux.go @@ -0,0 +1,1460 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "bufio" + "encoding/binary" + "fmt" + "io" + "io/ioutil" + "net" + "os" + "os/exec" + "path" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" + + securejoin "github.com/cyphar/filepath-securejoin" + sh "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/nestybox/sysbox-libs/mount" + + "github.com/moby/sys/mountinfo" + + "github.com/mrunalp/fileutils" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + 
	"golang.org/x/sys/unix"
)

const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV

// needsSetupDev returns true if /dev needs to be set up.
// If the user supplied their own bind mount over /dev, we skip setup.
func needsSetupDev(config *configs.Config) bool {
	for _, m := range config.Mounts {
		if m.Device == "bind" && utils.CleanPath(m.Destination) == "/dev" {
			return false
		}
	}
	return true
}

// prepareRootfs sets up the devices, mount points, and filesystems for use inside a new
// mount namespace. It must be called from the container's rootfs. It doesn't set anything
// as ro. You must call finalizeRootfs after this function to finish setting up the
// rootfs.
//
// The steps below are strictly ordered: ID-mapping (if any) must happen before
// the rootfs mount takes effect, mounts before device setup, and hooks before
// the pivot/chroot (so the old root is still reachable from the hooks).
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
	config := iConfig.Config

	// sysbox-runc: when the rootfs uid/gid shift is done via ID-mapped mounts,
	// ask the parent (over the sync pipe) to set that up first.
	if config.RootfsUidShiftType == sh.IDMappedMount {
		if err := doRootfsIDMapping(config, pipe); err != nil {
			return newSystemErrorWithCause(err, "ID-mapping rootfs")
		}
	}

	if err := effectRootfsMount(); err != nil {
		return newSystemErrorWithCause(err, "effecting rootfs mount")
	}

	if err := doMounts(config, pipe); err != nil {
		return newSystemErrorWithCause(err, "setting up rootfs mounts")
	}

	// Only create device nodes / ptmx / symlinks when the user didn't bind
	// mount their own /dev (see needsSetupDev).
	setupDev := needsSetupDev(config)
	if setupDev {
		if err := createDevices(config, pipe); err != nil {
			return newSystemErrorWithCause(err, "creating device nodes")
		}
		if err := setupPtmx(config); err != nil {
			return newSystemErrorWithCause(err, "setting up ptmx")
		}
		if err := setupDevSymlinks(config.Rootfs); err != nil {
			return newSystemErrorWithCause(err, "setting up /dev symlinks")
		}
	}

	// Signal the parent to run the pre-start hooks.
	// The hooks are run after the mounts are setup, but before we switch to the new
	// root, so that the old root is still available in the hooks for any mount
	// manipulations.
	// Note that iConfig.Cwd is not guaranteed to exist here.
	if err := syncParentHooks(pipe); err != nil {
		return err
	}

	// The reason these operations are done here rather than in finalizeRootfs
	// is because the console-handling code gets quite sticky if we have to set
	// up the console before doing the pivot_root(2). This is because the
	// Console API has to also work with the ExecIn case, which means that the
	// API must be able to deal with being inside as well as outside the
	// container. It's just cleaner to do this here (at the expense of the
	// operation not being perfectly split).

	s := iConfig.SpecState
	s.Pid = unix.Getpid()
	s.Status = specs.StateCreating
	if err := iConfig.Config.Hooks[configs.CreateContainer].RunHooks(s); err != nil {
		return err
	}

	// Enter the new root: pivot_root when we own a mount namespace,
	// ms-move when pivoting is disabled, plain chroot otherwise.
	if config.NoPivotRoot {
		err = msMoveRoot(config.Rootfs)
	} else if config.Namespaces.Contains(configs.NEWNS) {
		err = pivotRoot(config.Rootfs)
	} else {
		err = chroot()
	}
	if err != nil {
		return newSystemErrorWithCause(err, "jailing process inside rootfs")
	}

	// Add pending fsState to container's rootfs.
	if err := addFsState(config); err != nil {
		return newSystemErrorWithCause(err, "adding rootfs state")
	}

	if setupDev {
		if err := reOpenDevNull(); err != nil {
			return newSystemErrorWithCause(err, "reopening /dev/null inside container")
		}
	}

	if cwd := iConfig.Cwd; cwd != "" {
		// Note that spec.Process.Cwd can contain unclean value like "../../../../foo/bar...".
		// However, we are safe to call MkDirAll directly because we are in the jail here.
		if err := os.MkdirAll(cwd, 0755); err != nil {
			return err
		}
	}

	return nil
}

// finalizeRootfs sets anything to ro if necessary.
+func finalizeRootfs(config *configs.Config) (err error) { + // remount dev as ro if specified + for _, m := range config.Mounts { + if utils.CleanPath(m.Destination) == "/dev" { + if m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + if err := remountReadonly(m); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination) + } + } + break + } + } + + // set rootfs ( / ) as readonly + if config.Readonlyfs { + if err := setReadonly(); err != nil { + return newSystemErrorWithCause(err, "setting rootfs as readonly") + } + } + + if config.Umask != nil { + unix.Umask(int(*config.Umask)) + } else { + unix.Umask(0022) + } + return nil +} + +// /tmp has to be mounted as private to allow MS_MOVE to work in all situations +func prepareTmp(topTmpDir string) (string, error) { + tmpdir, err := ioutil.TempDir(topTmpDir, "runctop") + if err != nil { + return "", err + } + if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil { + return "", err + } + if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil { + return "", err + } + return tmpdir, nil +} + +func cleanupTmp(tmpdir string) error { + unix.Unmount(tmpdir, 0) + return os.RemoveAll(tmpdir) +} + +func mountCmd(cmd configs.Command) error { + command := exec.Command(cmd.Path, cmd.Args[:]...) + command.Env = cmd.Env + command.Dir = cmd.Dir + if out, err := command.CombinedOutput(); err != nil { + return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err) + } + return nil +} + +func prepareBindDest(m *configs.Mount, absDestPath bool, config *configs.Config, pipe io.ReadWriter) (err error) { + var base, dest string + + // ensure that the destination of the bind mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. 
+ + if absDestPath { + base = config.Rootfs + } else { + base = "." + } + + if dest, err = securejoin.SecureJoin(base, m.Destination); err != nil { + return err + } + + // update the mount with the correct dest after symlinks are resolved. + m.Destination = dest + if err = createIfNotExists(dest, m.BindSrcInfo.IsDir, config, pipe); err != nil { + return err + } + + return nil +} + +func mountCgroupV1(m *configs.Mount, enableCgroupns bool, config *configs.Config, pipe io.ReadWriter) error { + + binds, err := getCgroupMounts(m) + if err != nil { + return err + } + var merged []string + for _, b := range binds { + ss := filepath.Base(b.Destination) + if strings.Contains(ss, ",") { + merged = append(merged, ss) + } + } + tmpfs := &configs.Mount{ + Source: "tmpfs", + Device: "tmpfs", + Destination: m.Destination, + Flags: defaultMountFlags, + Data: "mode=755", + PropagationFlags: m.PropagationFlags, + } + + if err := mountToRootfs(tmpfs, config, enableCgroupns, pipe); err != nil { + return err + } + + for _, b := range binds { + if enableCgroupns { + + // sysbox-runc: use relative path (as otherwise we may not have permission to mkdir) + subsystemPath, err := securejoin.SecureJoin(".", b.Destination) + if err != nil { + return err + } + + if err := mkdirall(subsystemPath, 0755, config, pipe); err != nil { + return err + } + + if err := utils.WithProcfd(".", subsystemPath, func(procfd string) error { + flags := defaultMountFlags + if m.Flags&unix.MS_RDONLY != 0 { + flags = flags | unix.MS_RDONLY + } + var ( + source = "cgroup" + data = filepath.Base(subsystemPath) + ) + if data == "systemd" { + data = cgroups.CgroupNamePrefix + data + source = "systemd" + } + return unix.Mount(source, procfd, "cgroup", uintptr(flags), data) + }); err != nil { + return err + } + } else { + if err := mountToRootfs(b, config, enableCgroupns, pipe); err != nil { + return err + } + } + } + for _, mc := range merged { + for _, ss := range strings.Split(mc, ",") { + // symlink(2) is very dumb, 
it will just shove the path into + // the link and doesn't do any checks or relative path + // conversion. Also, don't error out if the cgroup already exists. + dest, err := securejoin.SecureJoin(".", filepath.Join(m.Destination, ss)) + if err != nil { + return err + } + if err := os.Symlink(mc, dest); err != nil && !os.IsExist(err) { + return err + } + } + } + return nil +} + +func mountCgroupV2(m *configs.Mount, enableCgroupns bool, config *configs.Config, pipe io.ReadWriter) error { + + // sysbox-runc: use relative path (as otherwise we may not have permission to mkdir) + cgroupPath, err := securejoin.SecureJoin(".", m.Destination) + if err != nil { + return err + } + + if err := mkdirall(cgroupPath, 0755, config, pipe); err != nil { + return err + } + + return utils.WithProcfd(".", cgroupPath, func(procfd string) error { + if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil { + // when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158) + if err == unix.EPERM || err == unix.EBUSY { + return unix.Mount("/sys/fs/cgroup", procfd, "", uintptr(m.Flags)|unix.MS_BIND, "") + } + return nil + } + return nil + }) +} + +func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) { + // Set up a scratch dir for the tmpfs on the host. + tmpdir, err := prepareTmp("/tmp") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir") + } + defer cleanupTmp(tmpdir) + tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir") + if err != nil { + return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir") + } + defer os.RemoveAll(tmpDir) + + // Configure the *host* tmpdir as if it's the container mount. We change + // m.Destination since we are going to mount *on the host*. 
+ oldDest := m.Destination + m.Destination = tmpDir + err = mountPropagate(m, "/", mountLabel) + m.Destination = oldDest + if err != nil { + return err + } + defer func() { + if Err != nil { + if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil { + logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err) + } + } + }() + return utils.WithProcfd(".", m.Destination, func(procfd string) (Err error) { + // Copy the container data to the host tmpdir. We append "/" to force + // CopyDirectory to resolve the symlink rather than trying to copy the + // symlink itself. + if err := fileutils.CopyDirectory(procfd+"/", tmpDir); err != nil { + return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err) + } + // Now move the mount into the container. + if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil { + return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %w", tmpDir, procfd, m.Destination, err) + } + return nil + }) +} + +// mkdirall calls into os.Mkdirall(), but precedes the call with an open of the current +// working directory (cwd). This avoids permission-denied problems on the Mkdirall call +// when shiftfs is mounted on the cwd. The exact cause of the permission problem is not +// clear and needs further investigation. +func mkdirall(path string, mode os.FileMode, config *configs.Config, pipe io.ReadWriter) error { + + fd, err := syscall.Open(".", unix.O_PATH|unix.O_CLOEXEC|unix.O_DIRECTORY, 0) + if err != nil { + return fmt.Errorf("failed to open current dir.") + } + + if err := syscall.Fchdir(fd); err != nil { + return fmt.Errorf("fchdir %s failed: %v", path, err) + } + + if err := os.MkdirAll(path, mode); err != nil { + + // In some cases the container's init process process won't have + // permission to perform the mkdir (e.g., if the parent directory in the + // image is owned by root:root on the host). 
In this case, we ask the + // parent sysbox-runc process to do this for us. + + req := opReq{ + Op: mkdir, + Rootfs: config.Rootfs, + Path: path, + Mode: mode, + Uid: config.UidMappings[0].HostID, + Gid: config.GidMappings[0].HostID, + } + + if err := syncParentDoOp([]opReq{req}, pipe); err != nil { + return fmt.Errorf("mkdirall %s with mode %o failed: %v", path, mode, err) + } + } + + if err := syscall.Close(fd); err != nil { + return fmt.Errorf("failed to close fd %d", fd) + } + + return nil +} + +func mountToRootfs(m *configs.Mount, config *configs.Config, enableCgroupns bool, pipe io.ReadWriter) error { + + mountLabel := config.MountLabel + + // sysbox-runc: use relative path for the rootfs as we may not have access to it via the abs path. + rootfs := "." + + // Ensure the mount destination is within the container's rootfs + dest, err := securejoin.SecureJoin(rootfs, m.Destination) + if err != nil { + return err + } + m.Destination = dest + + switch m.Device { + case "proc", "sysfs": + // If the destination already exists and is not a directory, we bail + // out This is to avoid mounting through a symlink or similar -- which + // has been a "fun" attack scenario in the past. + // TODO: This won't be necessary once we switch to libpathrs and we can + // stop all of these symlink-exchange attacks. 
+ if fi, err := os.Lstat(dest); err != nil { + if !os.IsNotExist(err) { + return err + } + } else if fi.Mode()&os.ModeDir == 0 { + return fmt.Errorf("filesystem %q must be mounted on ordinary directory", m.Device) + } + if err := mkdirall(dest, 0755, config, pipe); err != nil { + return fmt.Errorf("failed to created dir for %s mount: %v", m.Device, err) + } + // Selinux kernels do not support labeling of /proc or /sys + return mountPropagate(m, ".", "") + case "mqueue": + if err := mkdirall(dest, 0755, config, pipe); err != nil { + return err + } + if err := mountPropagate(m, rootfs, ""); err != nil { + return err + } + return label.SetFileLabel(dest, mountLabel) + case "tmpfs": + stat, err := os.Stat(dest) + if err != nil { + if err := mkdirall(dest, 0755, config, pipe); err != nil { + return err + } + } + if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP { + err = doTmpfsCopyUp(m, rootfs, mountLabel) + } else { + err = mountPropagate(m, rootfs, mountLabel) + } + if err != nil { + return err + } + if stat != nil { + if err = os.Chmod(dest, stat.Mode()); err != nil { + return err + } + } + // Initially mounted rw in mountPropagate, remount to ro if flag set. + if m.Flags&unix.MS_RDONLY != 0 { + if err := remount(m); err != nil { + return err + } + } + return nil + case "cgroup": + if cgroups.IsCgroup2UnifiedMode() { + return mountCgroupV2(m, enableCgroupns, config, pipe) + } + return mountCgroupV1(m, enableCgroupns, config, pipe) + default: + // ensure that the destination of the mount is resolved of symlinks at mount time because + // any previous mounts can invalidate the next mount's destination. + // this can happen when a user specifies mounts within other mounts to cause breakouts or other + // evil stuff to try to escape the container's rootfs. 
+ if err := mkdirall(dest, 0755, config, pipe); err != nil { + return err + } + return mountPropagate(m, rootfs, mountLabel) + } +} + +func doBindMounts(config *configs.Config, pipe io.ReadWriter) error { + + // sysbox-runc: the sys container's init process is in a dedicated + // user-ns, so it may not have search permission to the bind mount + // sources (and thus can't perform the bind mount itself). As a + // result, we perform the bind mounts by asking the parent + // sysbox-runc to spawn a helper child process which enters the + // container's mount namespace (only) and performs the mounts. That + // helper process has true root credentials (because it's in the + // initial user-ns rather than the sys container's user-ns) yet it + // can perform mounts inside the container. + // + // Also, to avoid sending too many requests to our parent + // sysbox-runc, we group bind mounts and send a bulk request, with + // one exception: when a bind mount depends on a prior one, we must + // ask the parent sysbox-runc to perform the prior ones before we + // can prepare the bind destination and perform the current one. + + mntReqs := []opReq{} + + for _, m := range config.Mounts { + + if m.Device != "bind" { + continue + } + + // Determine if the current mount is dependent on a prior one. + mntDependsOnPrior := false + for _, mr := range mntReqs { + + // Mount destinations in mntReqs are relative to the rootfs + // (see prepareBindDest()); thus we need to prepend "/" for a + // proper comparison. + if strings.HasPrefix(m.Destination, filepath.Join("/", mr.Mount.Destination)) { + mntDependsOnPrior = true + } + } + + // If the current mount depends on a prior one, ask our parent + // runc to actually do the prior mount(s). 
+ if mntDependsOnPrior { + if len(mntReqs) > 0 { + + mntReqs[0].Op = bind + mntReqs[0].Rootfs = config.Rootfs + mntReqs[0].FsuidMapFailOnErr = config.FsuidMapFailOnErr + + if err := syncParentDoOp(mntReqs, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to perform bind mounts") + } + mntReqs = mntReqs[:0] + } + } + + if err := prepareBindDest(m, false, config, pipe); err != nil { + return err + } + + req := opReq{ + Op: bind, + Mount: *m, + Label: config.MountLabel, + Rootfs: config.Rootfs, + FsuidMapFailOnErr: config.FsuidMapFailOnErr, + } + + mntReqs = append(mntReqs, req) + } + + if len(mntReqs) > 0 { + + mntReqs[0].Op = bind + mntReqs[0].Rootfs = config.Rootfs + mntReqs[0].FsuidMapFailOnErr = config.FsuidMapFailOnErr + + if err := syncParentDoOp(mntReqs, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to perform bind mounts") + } + } + + return nil +} + +func chownMounts(config *configs.Config, pipe io.ReadWriter, chownList []string) error { + chownReqs := []opReq{} + + if config.UidMappings != nil && config.GidMappings != nil { + for _, path := range chownList { + req := opReq{ + Op: chown, + Rootfs: config.Rootfs, + Path: path, + Uid: config.UidMappings[0].HostID, + Gid: config.GidMappings[0].HostID, + } + chownReqs = append(chownReqs, req) + } + } + + if len(chownReqs) > 0 { + if err := syncParentDoOp(chownReqs, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to chown mounts") + } + } + + return nil +} + +func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { + mounts, err := cgroups.GetCgroupMounts(false) + if err != nil { + return nil, err + } + + cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + var binds []*configs.Mount + + for _, mm := range mounts { + dir, err := mm.GetOwnCgroup(cgroupPaths) + if err != nil { + return nil, err + } + relDir, err := filepath.Rel(mm.Root, dir) + 
if err != nil { + return nil, err + } + binds = append(binds, &configs.Mount{ + Device: "bind", + Source: filepath.Join(mm.Mountpoint, relDir), + Destination: filepath.Join(m.Destination, filepath.Base(mm.Mountpoint)), + Flags: unix.MS_BIND | unix.MS_REC | m.Flags, + PropagationFlags: m.PropagationFlags, + }) + } + + return binds, nil +} + +func setupDevSymlinks(rootfs string) error { + var links = [][2]string{ + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } + // kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink + // in /dev if it exists in /proc. + if _, err := os.Stat("/proc/kcore"); err == nil { + links = append(links, [2]string{"/proc/kcore", "/dev/core"}) + } + for _, link := range links { + var ( + src = link[0] + dst = filepath.Join(".", link[1]) + ) + if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) { + return fmt.Errorf("symlink %s %s %s", src, dst, err) + } + } + return nil +} + +// If stdin, stdout, and/or stderr are pointing to `/dev/null` in the parent's rootfs +// this method will make them point to `/dev/null` in this container's rootfs. This +// needs to be called after we chroot/pivot into the container's rootfs so that any +// symlinks are resolved locally. +func reOpenDevNull() error { + var stat, devNullStat unix.Stat_t + file, err := os.OpenFile("/dev/null", os.O_RDWR, 0) + if err != nil { + return fmt.Errorf("Failed to open /dev/null - %s", err) + } + defer file.Close() + if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil { + return err + } + for fd := 0; fd < 3; fd++ { + if err := unix.Fstat(fd, &stat); err != nil { + return err + } + if stat.Rdev == devNullStat.Rdev { + // Close and re-open the fd. + if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil { + return err + } + } + } + return nil +} + +// Create the device nodes in the container. 
+func createDevices(config *configs.Config, pipe io.ReadWriter) error { + useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) + oldMask := unix.Umask(0000) + for _, node := range config.Devices { + + // The /dev/ptmx device is setup by setupPtmx() + if utils.CleanPath(node.Path) == "/dev/ptmx" { + continue + } + + // containers running in a user namespace are not allowed to mknod + // devices so we can just bind mount it from the host. + if err := createDeviceNode(node, useBindMount, config, pipe); err != nil { + unix.Umask(oldMask) + return err + } + } + unix.Umask(oldMask) + return nil +} + +func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error { + f, err := os.Create(dest) + if err != nil && !os.IsExist(err) { + return err + } + if f != nil { + f.Close() + } + return utils.WithProcfd(rootfs, dest, func(procfd string) error { + return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "") + }) +} + +// Creates the device node in the rootfs of the container. +func createDeviceNode(node *devices.Device, bind bool, config *configs.Config, pipe io.ReadWriter) error { + if node.Path == "" { + // The node only exists for cgroup reasons, ignore it here. + return nil + } + + // sysbox-runc: use relative path for the rootfs as we may not have access to it via the abs path. + rootfs := "." 
+ + // Verify the device node path is within the container's rootfs + dest, err := securejoin.SecureJoin(rootfs, node.Path) + if err != nil { + return err + } + if err := mkdirall(filepath.Dir(dest), 0755, config, pipe); err != nil { + return err + } + if bind { + return bindMountDeviceNode(rootfs, dest, node) + } + if err := mknodDevice(dest, node); err != nil { + if os.IsExist(err) { + return nil + } else if os.IsPermission(err) { + return bindMountDeviceNode(rootfs, dest, node) + } + return err + } + return nil +} + +func mknodDevice(dest string, node *devices.Device) error { + fileMode := node.FileMode + switch node.Type { + case devices.BlockDevice: + fileMode |= unix.S_IFBLK + case devices.CharDevice: + fileMode |= unix.S_IFCHR + case devices.FifoDevice: + fileMode |= unix.S_IFIFO + default: + return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path) + } + dev, err := node.Mkdev() + if err != nil { + return err + } + if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil { + return err + } + return unix.Chown(dest, int(node.Uid), int(node.Gid)) +} + +// Get the parent mount point of directory passed in as argument. Also return +// optional fields. 
+func getParentMount(rootfs string) (string, string, error) { + mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(rootfs)) + if err != nil { + return "", "", err + } + if len(mi) < 1 { + return "", "", fmt.Errorf("could not find parent mount of %s", rootfs) + } + + // find the longest mount point + var idx, maxlen int + for i := range mi { + if len(mi[i].Mountpoint) > maxlen { + maxlen = len(mi[i].Mountpoint) + idx = i + } + } + return mi[idx].Mountpoint, mi[idx].Optional, nil +} + +// Indicates if our parent mount has shared propagation +func rootfsParentMountIsShared(rootfs string) (bool, string, error) { + sharedMount := false + + parentMount, optionalOpts, err := getParentMount(rootfs) + if err != nil { + return false, "", err + } + + optsSplit := strings.Split(optionalOpts, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + sharedMount = true + break + } + } + + return sharedMount, parentMount, nil +} + +func setReadonly() error { + flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY) + + err := unix.Mount("", "/", "", flags, "") + if err == nil { + return nil + } + var s unix.Statfs_t + if err := unix.Statfs("/", &s); err != nil { + return &os.PathError{Op: "statfs", Path: "/", Err: err} + } + flags |= uintptr(s.Flags) + return unix.Mount("", "/", "", flags, "") + +} + +func setupPtmx(config *configs.Config) error { + ptmx := "dev/ptmx" + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + return nil +} + +// pivotRoot will call pivot_root such that rootfs becomes the new root +// filesystem, and everything else is cleaned up. +func pivotRoot(rootfs string) error { + // While the documentation may claim otherwise, pivot_root(".", ".") is + // actually valid. What this results in is / being the new root but + // /proc/self/cwd being the old root. 
Since we can play around with the cwd + // with pivot_root this allows us to pivot without creating directories in + // the rootfs. Shout-outs to the LXC developers for giving us this idea. + + oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer unix.Close(oldroot) + + newroot, err := unix.Open(".", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return err + } + defer unix.Close(newroot) + + // Change to the new root so that the pivot_root actually acts on it. + if err := unix.Fchdir(newroot); err != nil { + return err + } + + if err := unix.PivotRoot(".", "."); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + + // Currently our "." is oldroot (according to the current kernel code). + // However, purely for safety, we will fchdir(oldroot) since there isn't + // really any guarantee from the kernel what /proc/self/cwd will be after a + // pivot_root(2). + + if err := unix.Fchdir(oldroot); err != nil { + return err + } + + // Make oldroot rslave to make sure our unmounts don't propagate to the + // host (and thus bork the machine). We don't use rprivate because this is + // known to cause issues due to races where we still have a reference to a + // mount while a process in the host namespace are trying to operate on + // something they think has no mounts (devicemapper in particular). + if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + return err + } + + // Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd. + if err := unix.Unmount(".", unix.MNT_DETACH); err != nil { + return err + } + + // Switch back to our shiny new root. + if err := unix.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + return nil +} + +func msMoveRoot(rootfs string) error { + // Before we move the root and chroot we have to mask all "full" sysfs and + // procfs mounts which exist on the host. 
This is because while the kernel + // has protections against mounting procfs if it has masks, when using + // chroot(2) the *host* procfs mount is still reachable in the mount + // namespace and the kernel permits procfs mounts inside --no-pivot + // containers. + // + // Users shouldn't be using --no-pivot except in exceptional circumstances, + // but to avoid such a trivial security flaw we apply a best-effort + // protection here. The kernel only allows a mount of a pseudo-filesystem + // like procfs or sysfs if there is a *full* mount (the root of the + // filesystem is mounted) without any other locked mount points covering a + // subtree of the mount. + // + // So we try to unmount (or mount tmpfs on top of) any mountpoint which is + // a full mount of either sysfs or procfs (since those are the most + // concerning filesystems to us). + mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) { + // Collect every sysfs and procfs filesystem, except for those which + // are non-full mounts or are inside the rootfs of the container. + if info.Root != "/" || + (info.FSType != "proc" && info.FSType != "sysfs") || + strings.HasPrefix(info.Mountpoint, rootfs) { + skip = true + } + return + }) + if err != nil { + return err + } + for _, info := range mountinfos { + p := info.Mountpoint + // Be sure umount events are not propagated to the host. + if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil { + if err == unix.ENOENT { + // If the mountpoint doesn't exist that means that we've + // already blasted away some parent directory of the mountpoint + // and so we don't care about this error. + continue + } + return err + } + if err := unix.Unmount(p, unix.MNT_DETACH); err != nil { + if err != unix.EINVAL && err != unix.EPERM { + return err + } else { + // If we have not privileges for umounting (e.g. rootless), then + // cover the path. 
+ if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil { + return err + } + } + } + } + + // Move the rootfs on top of "/" in our mount namespace. + if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil { + return err + } + return chroot() +} + +func chroot() error { + if err := unix.Chroot("."); err != nil { + return err + } + return unix.Chdir("/") +} + +// createIfNotExists creates a file or a directory only if it does not already exist. +func createIfNotExists(path string, isDir bool, config *configs.Config, pipe io.ReadWriter) error { + if _, err := os.Stat(path); err != nil { + if os.IsNotExist(err) { + if isDir { + return mkdirall(path, 0755, config, pipe) + } + if err := mkdirall(filepath.Dir(path), 0755, config, pipe); err != nil { + return err + } + f, err := os.OpenFile(path, os.O_CREATE, 0755) + if err != nil { + return err + } + f.Close() + } + } + return nil +} + +// readonlyPath will make a path read only. +func readonlyPath(path string, mounts []*mount.Info) error { + isMountpoint := mount.FindMount(path, mounts) + + if !isMountpoint { + if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + } + + m := &configs.Mount{ + Destination: path, + Flags: unix.MS_BIND | unix.MS_RDONLY | unix.MS_REC, + } + + return remount(m) +} + +// remountReadonly will remount an existing mount point and ensure that it is read-only. +func remountReadonly(m *configs.Mount) error { + var ( + dest = m.Destination + flags = m.Flags + ) + for i := 0; i < 5; i++ { + // There is a special case in the kernel for + // MS_REMOUNT | MS_BIND, which allows us to change only the + // flags even as an unprivileged user (i.e. user namespace) + // assuming we don't drop any security related flags (nodev, + // nosuid, etc.). So, let's use that case so that we can do + // this re-mount without failing in a userns. 
+ flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY + if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil { + switch err { + case unix.EBUSY: + time.Sleep(100 * time.Millisecond) + continue + default: + return err + } + } + return nil + } + return fmt.Errorf("unable to mount %s as readonly max retries reached", dest) +} + +// remountReadwrite will remount an existing mount point with read-write permissions. +func remountReadwrite(m *configs.Mount) error { + var ( + dest = m.Destination + flags = m.Flags + ) + + for i := 0; i < 5; i++ { + // There is a special case in the kernel for + // MS_REMOUNT | MS_BIND, which allows us to change only the + // flags even as an unprivileged user (i.e. user namespace) + // assuming we don't drop any security related flags (nodev, + // nosuid, etc.). So, let's use that case so that we can do + // this re-mount without failing in a userns. + flags = (flags &^ unix.MS_RDONLY) | unix.MS_REMOUNT | unix.MS_BIND + + if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil { + switch err { + case unix.EBUSY: + time.Sleep(100 * time.Millisecond) + continue + default: + return err + } + } + return nil + } + + return fmt.Errorf("unable to mount %s as readwrite max retries reached", dest) +} + +// maskPath masks the top of the specified path inside a container to avoid +// security issues from processes reading information from non-namespace aware +// mounts ( proc/kcore ). +// For files, maskPath bind mounts /dev/null over the top of the specified path. +// For directories, maskPath mounts read-only tmpfs over the top of the specified path. 
+func maskPath(path string, mountLabel string) error { + if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) { + if err == unix.ENOTDIR { + return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel)) + } + return err + } + return nil +} + +// writeSystemProperty writes the value to a path under /proc/sys as determined from the key. +// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward. +func writeSystemProperty(key, value string) error { + keyPath := strings.Replace(key, ".", "/", -1) + return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0644) +} + +func remount(m *configs.Mount) error { + flags := uintptr(m.Flags | unix.MS_REMOUNT) + + // Per mount(2): remounting must keep original mount flags, except the flags being changed + var s unix.Statfs_t + if err := unix.Statfs(m.Destination, &s); err != nil { + return &os.PathError{Op: "statfs", Path: m.Destination, Err: err} + } + flags |= uintptr(s.Flags) + + return utils.WithProcfd(".", m.Destination, func(procfd string) error { + return unix.Mount("", procfd, "", flags, "") + }) +} + +// Do the mount operation followed by additional mounts required to take care +// of propagation flags. This will always be scoped inside the container rootfs. +func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error { + var ( + data = label.FormatMountLabel(m.Data, mountLabel) + flags = m.Flags + ) + + // Delay mounting the filesystem read-only if we need to do further + // operations on it. We need to set up files in "/dev" and tmpfs mounts may + // need to be chmod-ed after mounting. The mount will be remounted ro later + // in finalizeRootfs() if necessary. 
+ if utils.CleanPath(m.Destination) == "/dev" || m.Device == "tmpfs" { + flags &= ^unix.MS_RDONLY + } + + // Because the destination is inside a container path which might be + // mutating underneath us, we verify that we are actually going to mount + // inside the container with WithProcfd() -- mounting through a procfd + // mounts on the target. + if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data) + }); err != nil { + return fmt.Errorf("mount through procfd: %w", err) + } + // We have to apply mount propagation flags in a separate WithProcfd() call + // because the previous call invalidates the passed procfd -- the mount + // target needs to be re-opened. + if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error { + for _, pflag := range m.PropagationFlags { + if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil { + return err + } + } + return nil + }); err != nil { + return fmt.Errorf("change mount propagation through procfd: %w", err) + } + return nil +} + +// sysbox-runc: doRootfsIDMapping sets up ID-mapping on the container's rootfs +func doRootfsIDMapping(config *configs.Config, pipe io.ReadWriter) error { + reqs := []opReq{ + { + Op: rootfsIDMap, + Rootfs: config.Rootfs, + Uid: config.UidMappings[0].HostID, + Gid: config.GidMappings[0].HostID, + }, + } + + if err := syncParentDoOp(reqs, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to perform rootfs ID-mapping") + } + + return nil +} + +// sysbox-runc: doMounts sets up all of the container's mounts as specified in the given config. 
+func doMounts(config *configs.Config, pipe io.ReadWriter) error { + + chownList := []string{} + + // Do non-bind mounts + for _, m := range config.Mounts { + if m.Device != "bind" { + if err := mountToRootfs(m, config, true, pipe); err != nil { + return newSystemErrorWithCausef(err, "mounting %q to rootfs %q at %q", m.Source, config.Rootfs, m.Destination) + } + + // Change ownership of the container's /proc to match the container's + // root user. This prevents /proc showing up as nobody:nogroup + // yet does not give any extra permissions to the container. It not only + // looks better, but helps prevents problems such as + // https://github.com/nestybox/sysbox/issues/130. + // + // Note: ideally we would do the same for "/sys", but we can't because + // changing ownership of any sysfs mountpoint causes the ownership + // change to propagate to all other sysfs mountpoints in the system. + + if m.Device == "proc" { + chownList = append(chownList, "proc") + } + } + } + + if err := doBindMounts(config, pipe); err != nil { + return err + } + + if err := chownMounts(config, pipe, chownList); err != nil { + return err + } + + return nil +} + +// sysbox-runc: validateCwd verifies that the current working directory is the container's +// rootfs +func validateCwd(rootfs string) error { + cwd, err := os.Getwd() + if err != nil { + return newSystemErrorWithCause(err, "getting cwd") + } + if cwd != rootfs { + return newSystemErrorWithCausef(err, "cwd %s is not container's rootfs %s", cwd, rootfs) + } + return nil +} + +// sysbox-runc: effectRootfsMount ensure the calling process sees the effects of a previous rootfs +// mount. It does this by reopening the rootfs directory. +func effectRootfsMount() error { + + // @ctalledo: the method for reopening the rootfs directory is pretty lame, + // but I could not find any other. Note that the "dev" subdirectory is + // guaranteed to be present, as it's always created by our parent + // sysbox-runc. 
+ + if err := os.Chdir("dev"); err != nil { + return newSystemErrorWithCause(err, "chdir dev") + } + if err := os.Chdir(".."); err != nil { + return newSystemErrorWithCause(err, "chdir ..") + } + + return nil +} + +// Returns the IP address(es) of the nameserver(ers) in the +// DNS resolver configuration file +func getDnsNameservers(resolvconf string) ([]string, error) { + + file, err := os.Open(resolvconf) + if err != nil { + return nil, newSystemErrorWithCausef(err, "opening %s", resolvconf) + } + defer file.Close() + + nameservers := []string{} + scanner := bufio.NewScanner(file) + + for scanner.Scan() { + line := scanner.Text() + words := strings.Fields(line) + if len(words) > 1 { + if words[0] == "nameserver" { + nameservers = append(nameservers, words[1]) + } + } + } + + if err := scanner.Err(); err != nil { + return nil, newSystemErrorWithCausef(err, "scanning %s", resolvconf) + } + + return nameservers, nil +} + +// Returns the IP address of the container's default gateway +func getDefaultRoute() (string, error) { + var ipStr string + + file, err := os.Open("/proc/net/route") + if err != nil { + return "", newSystemErrorWithCause(err, "opening /proc/net/route") + } + defer file.Close() + + // /proc/net/route: + // + // Iface Destination Gateway Flags RefCnt Use Metric Mask MTU Window IRTT + // eth0 00000000 010011AC 0003 0 0 0 00000000 0 0 0 + // eth0 000011AC 00000000 0001 0 0 0 0000FFFF 0 0 0 + + scanner := bufio.NewScanner(file) + line := 0 + + for scanner.Scan() { + + // Skip header line + if line < 1 { + line++ + continue + } + + // Skip if this is not a default route + tokens := strings.Fields(scanner.Text()) + destIP := tokens[1] + if destIP != "00000000" { + continue + } + + // Gateway address is field 2 + tokens = strings.Fields(scanner.Text()) + hexIP := "0x" + tokens[2] + + intIP, err := strconv.ParseInt(hexIP, 0, 64) + if err != nil { + return "", newSystemErrorWithCausef(err, "converting %s to int", hexIP) + } + uintIP := uint32(intIP) + + // 
Generate the IP address string + // + // TODO: ideally we should use host byte-order; the binary conversion + // below is x86-specific. + + ip := make(net.IP, 4) + binary.LittleEndian.PutUint32(ip, uintIP) + ipStr = net.IP(ip).String() + + break + } + + return ipStr, nil +} + +// Switches the IP address of the Docker DNS nameserver inside the container +// when it has localhost address (e.g., 127.0.0.11). This avoids DNS resolution +// problems with inner Docker containers. See Sysbox issue #679. +func switchDockerDnsIP(config *configs.Config, pipe io.ReadWriter) error { + + // Docker places a DNS resolver in containers deployed on custom bridge networks + dockerDns := "127.0.0.11" + + resolvconf := "/etc/resolv.conf" + if _, err := os.Stat(resolvconf); os.IsNotExist(err) { + return nil + } + + nameservers, err := getDnsNameservers(resolvconf) + if err != nil { + return err + } + + needSwitch := false + for _, ns := range nameservers { + if ns == dockerDns { + needSwitch = true + } + } + + if !needSwitch { + return nil + } + + defRoute, err := getDefaultRoute() + if err != nil { + return err + } + + // https://github.com/nestybox/sysbox/issues/834 + // Skip the DNS change if a default gateway isn't available. + if defRoute == "" { + return nil + } + + // Request the parent runc to enter the container's net-ns and change the DNS + // in the iptables (can't do this from within the container as we may not + // have the required / compatible iptables package in the container). 
+ reqs := []opReq{ + { + Op: switchDockerDns, + OldDns: dockerDns, + NewDns: defRoute, + }, + } + + if err := syncParentDoOp(reqs, pipe); err != nil { + return newSystemErrorWithCause(err, "syncing with parent runc to switch DNS IP") + } + + oldData, err := ioutil.ReadFile(resolvconf) + if err != nil { + return newSystemErrorWithCausef(err, "reading %s", resolvconf) + } + + newData := strings.Replace(string(oldData), dockerDns, defRoute, -1) + + // As we are about to write to resolv.conf, we should ensure that this one + // is writable, which is not necessarily the case as file could have been + // bind-mounted in RO mode (usually the case when 'readonly' spec attribute + // is present). In these scenarios we will first remount the resource as RW, + // and will remount it back to RO once the write operation is completed. + var resolvconfMount *configs.Mount + resolvcontDest := resolvconf[1:] + for _, m := range config.Mounts { + if m.Destination == resolvcontDest && + m.Flags&unix.MS_RDONLY == unix.MS_RDONLY { + if err := remountReadwrite(m); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readwrite", + m.Destination) + } + + resolvconfMount = m + break + } + } + + err = ioutil.WriteFile(resolvconf, []byte(newData), 0644) + if err != nil { + return newSystemErrorWithCausef(err, "writing %s", resolvconf) + } + + // If applicable, flip resolvconfMount back to RO mode. + if resolvconfMount != nil { + if err := remountReadonly(resolvconfMount); err != nil { + return newSystemErrorWithCausef(err, "remounting %q as readonly", + resolvconfMount.Destination) + } + } + + // Enable routing of local-host addresses to ensure packets make it to the + // Docker DNS (127.0.0.11:53). + + if err := ioutil.WriteFile("/proc/sys/net/ipv4/conf/all/route_localnet", []byte("1"), 0644); err != nil { + return fmt.Errorf("failed to enble routing of local-host addresses: %s", err) + } + + return nil +} + +// Function creates file-system state (i.e. 
files, dirs, softlinks) inside the +// container's rootfs, as required by Sysbox. Notice that the path of these fs +// components is with respect to the container's rootfs, so this instruction +// should be only called after pivot-root invokation. +func addFsState(config *configs.Config) error { + + for _, entry := range config.FsState { + if err := entry.Add(); err != nil { + return newSystemErrorWithCausef(err, "unable to create fsEntry %s", entry.GetPath()) + } + } + + return nil +} diff --git a/sysbox-runc/libcontainer/rootfs_linux_test.go b/sysbox-runc/libcontainer/rootfs_linux_test.go new file mode 100644 index 00000000..b17bcbdc --- /dev/null +++ b/sysbox-runc/libcontainer/rootfs_linux_test.go @@ -0,0 +1,69 @@ +// +build linux + +package libcontainer + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestNeedsSetupDev(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/dev", + Destination: "/dev", + }, + }, + } + if needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be false, got true") + } +} + +func TestNeedsSetupDevStrangeSource(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/devx", + Destination: "/dev", + }, + }, + } + if needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be false, got true") + } +} + +func TestNeedsSetupDevStrangeDest(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/dev", + Destination: "/devx", + }, + }, + } + if !needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be true, got false") + } +} + +func TestNeedsSetupDevStrangeSourceDest(t *testing.T) { + config := &configs.Config{ + Mounts: []*configs.Mount{ + { + Device: "bind", + Source: "/devx", + Destination: "/devx", + }, + }, + } + if !needsSetupDev(config) { + t.Fatal("expected needsSetupDev to be true, got false") + } +} diff --git 
a/sysbox-runc/libcontainer/seccomp/config.go b/sysbox-runc/libcontainer/seccomp/config.go new file mode 100644 index 00000000..3ca03ed8 --- /dev/null +++ b/sysbox-runc/libcontainer/seccomp/config.go @@ -0,0 +1,150 @@ +package seccomp + +import ( + "fmt" + "sort" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// flagTsync is recognized but ignored by runc, and it is not defined +// in the runtime-spec. +const flagTsync = "SECCOMP_FILTER_FLAG_TSYNC" + +var operators = map[string]configs.Operator{ + "SCMP_CMP_NE": configs.NotEqualTo, + "SCMP_CMP_LT": configs.LessThan, + "SCMP_CMP_LE": configs.LessThanOrEqualTo, + "SCMP_CMP_EQ": configs.EqualTo, + "SCMP_CMP_GE": configs.GreaterThanOrEqualTo, + "SCMP_CMP_GT": configs.GreaterThan, + "SCMP_CMP_MASKED_EQ": configs.MaskEqualTo, +} + +// KnownOperators returns the list of the known operations. +// Used by `runc features`. +func KnownOperators() []string { + var res []string + for k := range operators { + res = append(res, k) + } + sort.Strings(res) + return res +} + +var actions = map[string]configs.Action{ + "SCMP_ACT_KILL": configs.Kill, + "SCMP_ACT_ERRNO": configs.Errno, + "SCMP_ACT_TRAP": configs.Trap, + "SCMP_ACT_ALLOW": configs.Allow, + "SCMP_ACT_TRACE": configs.Trace, + "SCMP_ACT_LOG": configs.Log, + "SCMP_ACT_NOTIFY": configs.Notify, + "SCMP_ACT_KILL_THREAD": configs.KillThread, + "SCMP_ACT_KILL_PROCESS": configs.KillProcess, +} + +// KnownActions returns the list of the known actions. +// Used by `runc features`. 
+func KnownActions() []string { + var res []string + for k := range actions { + res = append(res, k) + } + sort.Strings(res) + return res +} + +var archs = map[string]string{ + "SCMP_ARCH_X86": "x86", + "SCMP_ARCH_X86_64": "amd64", + "SCMP_ARCH_X32": "x32", + "SCMP_ARCH_ARM": "arm", + "SCMP_ARCH_AARCH64": "arm64", + "SCMP_ARCH_MIPS": "mips", + "SCMP_ARCH_MIPS64": "mips64", + "SCMP_ARCH_MIPS64N32": "mips64n32", + "SCMP_ARCH_MIPSEL": "mipsel", + "SCMP_ARCH_MIPSEL64": "mipsel64", + "SCMP_ARCH_MIPSEL64N32": "mipsel64n32", + "SCMP_ARCH_PPC": "ppc", + "SCMP_ARCH_PPC64": "ppc64", + "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_RISCV64": "riscv64", + "SCMP_ARCH_S390": "s390", + "SCMP_ARCH_S390X": "s390x", +} + +// KnownArchs returns the list of the known archs. +// Used by `runc features`. +func KnownArchs() []string { + var res []string + for k := range archs { + res = append(res, k) + } + sort.Strings(res) + return res +} + +// ConvertStringToOperator converts a string into a Seccomp comparison operator. +// Comparison operators use the names they are assigned by Libseccomp's header. +// Attempting to convert a string that is not a valid operator results in an +// error. +func ConvertStringToOperator(in string) (configs.Operator, error) { + if op, ok := operators[in]; ok { + return op, nil + } + return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in) +} + +// ConvertStringToAction converts a string into a Seccomp rule match action. +// Actions use the names they are assigned in Libseccomp's header. +// Attempting to convert a string that is not a valid action results in an +// error. +func ConvertStringToAction(in string) (configs.Action, error) { + if act, ok := actions[in]; ok { + return act, nil + } + return 0, fmt.Errorf("string %s is not a valid action for seccomp", in) +} + +// ConvertStringToArch converts a string into a Seccomp comparison arch. 
+func ConvertStringToArch(in string) (string, error) { + if arch, ok := archs[in]; ok { + return arch, nil + } + return "", fmt.Errorf("string %s is not a valid arch for seccomp", in) +} + +// List of flags known to this version of runc. +var flags = []string{ + flagTsync, + string(specs.LinuxSeccompFlagSpecAllow), + string(specs.LinuxSeccompFlagLog), +} + +// KnownFlags returns the list of the known filter flags. +// Used by `runc features`. +func KnownFlags() []string { + return flags +} + +// SupportedFlags returns the list of the supported filter flags. +// This list may be a subset of one returned by KnownFlags due to +// some flags not supported by the current kernel and/or libseccomp. +// Used by `runc features`. +func SupportedFlags() []string { + if !Enabled { + return nil + } + + var res []string + for _, flag := range flags { + if FlagSupported(specs.LinuxSeccompFlag(flag)) == nil { + res = append(res, flag) + } + } + + return res +} diff --git a/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_linux.go b/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_linux.go new file mode 100644 index 00000000..66513860 --- /dev/null +++ b/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -0,0 +1,721 @@ +//go:build cgo && seccomp +// +build cgo,seccomp + +package patchbpf + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "os" + "runtime" + "unsafe" + + libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" + "golang.org/x/net/bpf" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +// #cgo pkg-config: libseccomp +/* +#include <errno.h> +#include <linux/audit.h> +#include <linux/filter.h> +#include <seccomp.h> + +const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS); + +// Copied from <linux/seccomp.h>.
+ +#ifndef SECCOMP_SET_MODE_FILTER +# define SECCOMP_SET_MODE_FILTER 1 +#endif +const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER; + +#ifndef SECCOMP_FILTER_FLAG_LOG +# define SECCOMP_FILTER_FLAG_LOG (1UL << 1) +#endif +const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG; + +#ifndef SECCOMP_FILTER_FLAG_SPEC_ALLOW +# define SECCOMP_FILTER_FLAG_SPEC_ALLOW (1UL << 2) +#endif +const uintptr_t C_FILTER_FLAG_SPEC_ALLOW = SECCOMP_FILTER_FLAG_SPEC_ALLOW; + +#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER +# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3) +#endif +const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER; + +#ifndef AUDIT_ARCH_RISCV64 +#ifndef EM_RISCV +#define EM_RISCV 243 +#endif +#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#endif + +// We use the AUDIT_ARCH_* values because those are the ones used by the kernel +// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we +// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
+ +const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386; +const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64; +const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM; +const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64; +const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS; +const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64; +const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32; +const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL; +const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64; +const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32; +const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC; +const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64; +const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE; +const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390; +const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X; +const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64; +*/ +import "C" + +var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS) + +// Assume sizeof(int) == 4 in the BPF program. +const bpfSizeofInt = 4 + +// This syscall is used for multiplexing "large" syscalls on s390(x). Unknown +// syscalls will end up with this syscall number, so we need to explicitly +// return -ENOSYS for this syscall on those architectures. +const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0 + +func isAllowAction(action configs.Action) bool { + switch action { + // Trace is considered an "allow" action because a good tracer should + // support future syscalls (by handling -ENOSYS on its own), and giving + // -ENOSYS will be disruptive for emulation. + case configs.Allow, configs.Log, configs.Trace: + return true + default: + return false + } +} + +func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) { + var program []bpf.RawInstruction + for { + // Read the next instruction. We have to use NativeEndian because + // seccomp_export_bpf outputs the program in *host* endian-ness. 
+ var insn unix.SockFilter + if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil { + if errors.Is(err, io.EOF) { + // Parsing complete. + break + } + if errors.Is(err, io.ErrUnexpectedEOF) { + // Parsing stopped mid-instruction. + return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err) + } + // All other errors. + return nil, fmt.Errorf("error parsing instructions: %w", err) + } + program = append(program, bpf.RawInstruction{ + Op: insn.Code, + Jt: insn.Jt, + Jf: insn.Jf, + K: insn.K, + }) + } + return program, nil +} + +func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("error creating scratch pipe: %w", err) + } + defer wtr.Close() + defer rdr.Close() + + readerBuffer := new(bytes.Buffer) + errChan := make(chan error, 1) + go func() { + _, err := io.Copy(readerBuffer, rdr) + errChan <- err + close(errChan) + }() + + if err := filter.ExportBPF(wtr); err != nil { + return nil, fmt.Errorf("error exporting BPF: %w", err) + } + // Close so that the reader actually gets EOF. + _ = wtr.Close() + + if copyErr := <-errChan; copyErr != nil { + return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr) + } + + // Parse the instructions. + rawProgram, err := parseProgram(readerBuffer) + if err != nil { + return nil, fmt.Errorf("parsing generated BPF filter: %w", err) + } + program, ok := bpf.Disassemble(rawProgram) + if !ok { + return nil, errors.New("could not disassemble entire BPF filter") + } + return program, nil +} + +type nativeArch uint32 + +const invalidArch nativeArch = 0 + +func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { + switch arch { + case libseccomp.ArchNative: + // Convert to actual native architecture. 
+ arch, err := libseccomp.GetNativeArch() + if err != nil { + return invalidArch, fmt.Errorf("unable to get native arch: %w", err) + } + return archToNative(arch) + case libseccomp.ArchX86: + return nativeArch(C.C_AUDIT_ARCH_I386), nil + case libseccomp.ArchAMD64, libseccomp.ArchX32: + // NOTE: x32 is treated like x86_64 except all x32 syscalls have the + // 30th bit of the syscall number set to indicate that it's not a + // normal x86_64 syscall. + return nativeArch(C.C_AUDIT_ARCH_X86_64), nil + case libseccomp.ArchARM: + return nativeArch(C.C_AUDIT_ARCH_ARM), nil + case libseccomp.ArchARM64: + return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil + case libseccomp.ArchMIPS: + return nativeArch(C.C_AUDIT_ARCH_MIPS), nil + case libseccomp.ArchMIPS64: + return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil + case libseccomp.ArchMIPS64N32: + return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil + case libseccomp.ArchMIPSEL: + return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil + case libseccomp.ArchMIPSEL64: + return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil + case libseccomp.ArchMIPSEL64N32: + return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil + case libseccomp.ArchPPC: + return nativeArch(C.C_AUDIT_ARCH_PPC), nil + case libseccomp.ArchPPC64: + return nativeArch(C.C_AUDIT_ARCH_PPC64), nil + case libseccomp.ArchPPC64LE: + return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil + case libseccomp.ArchS390: + return nativeArch(C.C_AUDIT_ARCH_S390), nil + case libseccomp.ArchS390X: + return nativeArch(C.C_AUDIT_ARCH_S390X), nil + case libseccomp.ArchRISCV64: + return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil + default: + return invalidArch, fmt.Errorf("unknown architecture: %v", arch) + } +} + +type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall + +// Figure out largest syscall number referenced in the filter for each +// architecture. 
We will be generating code based on the native architecture +// representation, but SCMP_ARCH_X32 means we have to track cases where the +// same architecture has different largest syscalls based on the mode. +func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) { + lastSyscalls := make(lastSyscallMap) + // Only loop over architectures which are present in the filter. Any other + // architectures will get the libseccomp bad architecture action anyway. + for _, ociArch := range config.Architectures { + arch, err := libseccomp.GetArchFromString(ociArch) + if err != nil { + return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err) + } + + // Figure out native architecture representation of the architecture. + nativeArch, err := archToNative(arch) + if err != nil { + return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err) + } + + if _, ok := lastSyscalls[nativeArch]; !ok { + lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{} + } + if _, ok := lastSyscalls[nativeArch][arch]; ok { + // Because of ArchNative we may hit the same entry multiple times. + // Just skip it if we've seen this (nativeArch, ScmpArch) + // combination before. + continue + } + + // Find the largest syscall in the filter for this architecture. + var largestSyscall libseccomp.ScmpSyscall + for _, rule := range config.Syscalls { + sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch) + if err != nil { + // Ignore unknown syscalls. + continue + } + if sysno > largestSyscall { + largestSyscall = sysno + } + } + if largestSyscall != 0 { + lastSyscalls[nativeArch][arch] = largestSyscall + } else { + logrus.Warnf("could not find any syscalls for arch %s", ociArch) + delete(lastSyscalls[nativeArch], arch) + } + } + return lastSyscalls, nil +} + +// FIXME FIXME FIXME +// +// This solution is less than ideal. 
In the future it would be great to have +// per-arch information about which syscalls were added in which kernel +// versions so we can create far more accurate filter rules (handling holes in +// the syscall table and determining -ENOSYS requirements based on kernel +// minimum version alone. +// +// This implementation can in principle cause issues with syscalls like +// close_range(2) which were added out-of-order in the syscall table between +// kernel releases. +func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) { + // A jump-table for each nativeArch used to generate the initial + // conditional jumps -- measured from the *END* of the program so they + // remain valid after prepending to the tail. + archJumpTable := map[nativeArch]uint32{} + + // Generate our own -ENOSYS rules for each architecture. They have to be + // generated in reverse (prepended to the tail of the program) because the + // JumpIf jumps need to be computed from the end of the program. + programTail := []bpf.Instruction{ + // Fall-through rules jump into the filter. + bpf.Jump{Skip: 1}, + // Rules which jump to here get -ENOSYS. + bpf.RetConstant{Val: retErrnoEnosys}, + } + + // Generate the syscall -ENOSYS rules. + for nativeArch, maxSyscalls := range lastSyscalls { + // The number of instructions from the tail of this section which need + // to be jumped in order to reach the -ENOSYS return. If the section + // does not jump, it will fall through to the actual filter. + baseJumpEnosys := uint32(len(programTail) - 1) + baseJumpFilter := baseJumpEnosys + 1 + + // Add the load instruction for the syscall number -- we jump here + // directly from the arch code so we need to do it here. Sadly we can't + // share this code between architecture branches. 
+ section := []bpf.Instruction{ + // load [0] (syscall number) + bpf.LoadAbsolute{Off: 0, Size: bpfSizeofInt}, + } + + switch len(maxSyscalls) { + case 0: + // No syscalls found for this arch -- skip it and move on. + continue + case 1: + // Get the only syscall and scmpArch in the map. + var ( + scmpArch libseccomp.ScmpArch + sysno libseccomp.ScmpSyscall + ) + for arch, no := range maxSyscalls { + sysno = no + scmpArch = arch + } + + switch scmpArch { + // Return -ENOSYS for setup(2) on s390(x). This syscall is used for + // multiplexing "large syscall number" syscalls, but if the syscall + // number is not known to the kernel then the syscall number is + // left unchanged (and because it is sysno=0, you'll end up with + // EPERM for syscalls the kernel doesn't know about). + // + // The actual setup(2) syscall is never used by userspace anymore + // (and hasn't existed for decades) outside of this multiplexing + // scheme so returning -ENOSYS is fine. + case libseccomp.ArchS390, libseccomp.ArchS390X: + section = append(section, []bpf.Instruction{ + // jne [setup=0],1 + bpf.JumpIf{ + Cond: bpf.JumpNotEqual, + Val: uint32(s390xMultiplexSyscall), + SkipTrue: 1, + }, + // ret [ENOSYS] + bpf.RetConstant{Val: retErrnoEnosys}, + }...) + } + + // The simplest case just boils down to a single jgt instruction, + // with special handling if baseJumpEnosys is larger than 255 (and + // thus a long jump is required). 
+ var sectionTail []bpf.Instruction + if baseJumpEnosys+1 <= 255 { + sectionTail = []bpf.Instruction{ + // jgt [syscall],[baseJumpEnosys+1] + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(sysno), + SkipTrue: uint8(baseJumpEnosys + 1), + }, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + } + } else { + sectionTail = []bpf.Instruction{ + // jle [syscall],1 + bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1}, + // ret [ENOSYS] + bpf.RetConstant{Val: retErrnoEnosys}, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + } + } + + // If we're on x86 we need to add a check for x32 and if we're in + // the wrong mode we jump over the section. + if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) { + // Generate a prefix to check the mode. + switch scmpArch { + case libseccomp.ArchAMD64: + sectionTail = append([]bpf.Instruction{ + // jset (1<<30),[len(tail)-1] + bpf.JumpIf{ + Cond: bpf.JumpBitsSet, + Val: 1 << 30, + SkipTrue: uint8(len(sectionTail) - 1), + }, + }, sectionTail...) + case libseccomp.ArchX32: + sectionTail = append([]bpf.Instruction{ + // jset (1<<30),0,[len(tail)-1] + bpf.JumpIf{ + Cond: bpf.JumpBitsNotSet, + Val: 1 << 30, + SkipTrue: uint8(len(sectionTail) - 1), + }, + }, sectionTail...) + default: + return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch) + } + } + + section = append(section, sectionTail...) + case 2: + // x32 and x86_64 are a unique case, we can't handle any others. 
+ if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) { + return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch) + } + + x32sysno, ok := maxSyscalls[libseccomp.ArchX32] + if !ok { + return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls) + } + x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64] + if !ok { + return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls) + } + + // The x32 ABI indicates that a syscall is being made by an x32 + // process by setting the 30th bit of the syscall number, but we + // need to do some special-casing depending on whether we need to + // do long jumps. + if baseJumpEnosys+2 <= 255 { + // For the simple case we want to have something like: + // jset (1<<30),1 + // jgt [x86 syscall],[baseJumpEnosys+2],1 + // jgt [x32 syscall],[baseJumpEnosys+1] + // ja [baseJumpFilter] + section = append(section, []bpf.Instruction{ + // jset (1<<30),1 + bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1}, + // jgt [x86 syscall],[baseJumpEnosys+1],1 + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(x86sysno), + SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1, + }, + // jgt [x32 syscall],[baseJumpEnosys] + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(x32sysno), + SkipTrue: uint8(baseJumpEnosys + 1), + }, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + }...) 
+ } else { + // But if the [baseJumpEnosys+2] jump is larger than 255 we + // need to do a long jump like so: + // jset (1<<30),1 + // jgt [x86 syscall],1,2 + // jle [x32 syscall],1 + // ret [ENOSYS] + // ja [baseJumpFilter] + section = append(section, []bpf.Instruction{ + // jset (1<<30),1 + bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1}, + // jgt [x86 syscall],1,2 + bpf.JumpIf{ + Cond: bpf.JumpGreaterThan, + Val: uint32(x86sysno), + SkipTrue: 1, SkipFalse: 2, + }, + // jle [x32 syscall],1 + bpf.JumpIf{ + Cond: bpf.JumpLessOrEqual, + Val: uint32(x32sysno), + SkipTrue: 1, + }, + // ret [ENOSYS] + bpf.RetConstant{Val: retErrnoEnosys}, + // ja [baseJumpFilter] + bpf.Jump{Skip: baseJumpFilter}, + }...) + } + default: + return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls)) + } + + // Prepend this section to the tail. + programTail = append(section, programTail...) + + // Update jump table. + archJumpTable[nativeArch] = uint32(len(programTail)) + } + + // Add a dummy "jump to filter" for any architecture we might miss below. + // Such architectures will probably get the BadArch action of the filter + // regardless. + programTail = append([]bpf.Instruction{ + // ja [end of stub and start of filter] + bpf.Jump{Skip: uint32(len(programTail))}, + }, programTail...) + + // Generate the jump rules for each architecture. This has to be done in + // reverse as well for the same reason as above. We add to programTail + // directly because the jumps are impacted by each architecture rule we add + // as well. + // + // TODO: Maybe we want to optimise to avoid long jumps here? So sort the + // architectures based on how large the jumps are going to be, or + // re-sort the candidate architectures each time to make sure that we + // pick the largest jump which is going to be smaller than 255. + for nativeArch := range lastSyscalls { + // We jump forwards but the jump table is calculated from the *END*. 
+ jump := uint32(len(programTail)) - archJumpTable[nativeArch] + + // Same routine as above -- this is a basic jeq check, complicated + // slightly if it turns out that we need to do a long jump. + if jump <= 255 { + programTail = append([]bpf.Instruction{ + // jeq [arch],[jump] + bpf.JumpIf{ + Cond: bpf.JumpEqual, + Val: uint32(nativeArch), + SkipTrue: uint8(jump), + }, + }, programTail...) + } else { + programTail = append([]bpf.Instruction{ + // jne [arch],1 + bpf.JumpIf{ + Cond: bpf.JumpNotEqual, + Val: uint32(nativeArch), + SkipTrue: 1, + }, + // ja [jump] + bpf.Jump{Skip: jump}, + }, programTail...) + } + } + + // Prepend the load instruction for the architecture. + programTail = append([]bpf.Instruction{ + // load [4] (architecture) + bpf.LoadAbsolute{Off: bpfSizeofInt, Size: bpfSizeofInt}, + }, programTail...) + + // And that's all folks! + return programTail, nil +} + +func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) { + rawProgram, err := bpf.Assemble(program) + if err != nil { + return nil, fmt.Errorf("error assembling program: %w", err) + } + + // Convert to []unix.SockFilter for unix.SockFilter. + var filter []unix.SockFilter + for _, insn := range rawProgram { + filter = append(filter, unix.SockFilter{ + Code: insn.Op, + Jt: insn.Jt, + Jf: insn.Jf, + K: insn.K, + }) + } + return filter, nil +} + +func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) { + // Patch the generated cBPF only when there is not a defaultErrnoRet set + // and it is different from ENOSYS + if config.DefaultErrnoRet != nil && *config.DefaultErrnoRet == uint(retErrnoEnosys) { + return nil, nil + } + // We only add the stub if the default action is not permissive. 
+ if isAllowAction(config.DefaultAction) { + logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation") + return nil, nil + } + + lastSyscalls, err := findLastSyscalls(config) + if err != nil { + return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err) + } + stubProgram, err := generateEnosysStub(lastSyscalls) + if err != nil { + return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err) + } + return stubProgram, nil +} + +func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) { + program, err := disassembleFilter(filter) + if err != nil { + return nil, fmt.Errorf("error disassembling original filter: %w", err) + } + + patch, err := generatePatch(config) + if err != nil { + return nil, fmt.Errorf("error generating patch for filter: %w", err) + } + fullProgram := append(patch, program...) + + logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...") + for idx, insn := range patch { + logrus.Debugf(" [%4.1d] %s", idx, insn) + } + logrus.Debugf(" [....] --- original filter ---") + + fprog, err := assemble(fullProgram) + if err != nil { + return nil, fmt.Errorf("error assembling modified filter: %w", err) + } + return fprog, nil +} + +func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) { + // Ignore the error since pre-2.4 libseccomp is treated as API level 0. 
+ apiLevel, _ := libseccomp.GetAPI() + + noNewPrivs, err = filter.GetNoNewPrivsBit() + if err != nil { + return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err) + } + + if apiLevel >= 3 { + if logBit, err := filter.GetLogBit(); err != nil { + return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err) + } else if logBit { + flags |= uint(C.C_FILTER_FLAG_LOG) + } + } + if apiLevel >= 4 { + if ssb, err := filter.GetSSB(); err != nil { + return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_SPEC_ALLOW bit: %w", err) + } else if ssb { + flags |= uint(C.C_FILTER_FLAG_SPEC_ALLOW) + } + } + // XXX: add newly supported filter flags above this line. + + for _, call := range config.Syscalls { + if call.Action == configs.Notify { + flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER) + break + } + } + + return +} + +func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) { + // This debug output is validated in tests/integration/seccomp.bats + // by the SECCOMP_FILTER_FLAG_* test. + logrus.Debugf("seccomp filter flags: %d", flags) + fprog := unix.SockFprog{ + Len: uint16(len(filter)), + Filter: &filter[0], + } + fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set + // If no seccomp flags were requested we can use the old-school prctl(2). + if flags == 0 { + err = unix.Prctl(unix.PR_SET_SECCOMP, + unix.SECCOMP_MODE_FILTER, + uintptr(unsafe.Pointer(&fprog)), 0, 0) + } else { + fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP, + uintptr(C.C_SET_MODE_FILTER), + uintptr(flags), uintptr(unsafe.Pointer(&fprog))) + if errno != 0 { + err = errno + } + if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 { + fd = int(fdptr) + } + } + runtime.KeepAlive(filter) + runtime.KeepAlive(fprog) + return +} + +// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has +// been pre-configured with the set of rules in the seccomp config. 
It then +// patches said filter to handle -ENOSYS in a much nicer manner than the +// default libseccomp default action behaviour, and loads the patched filter +// into the kernel for the current process. +func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (*os.File, error) { + // Generate a patched filter. + fprog, err := enosysPatchFilter(config, filter) + if err != nil { + return nil, fmt.Errorf("error patching filter: %w", err) + } + + // Get the set of libseccomp flags set. + seccompFlags, noNewPrivs, err := filterFlags(config, filter) + if err != nil { + return nil, fmt.Errorf("unable to fetch seccomp filter flags: %w", err) + } + + // Set no_new_privs if it was requested, though in runc we handle + // no_new_privs separately so warn if we hit this path. + if noNewPrivs { + logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path") + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return nil, fmt.Errorf("error enabling no_new_privs bit: %w", err) + } + } + + // Finally, load the filter. + fd, err := sysSeccompSetFilter(seccompFlags, fprog) + if err != nil { + return nil, fmt.Errorf("error loading seccomp filter: %w", err) + } + return os.NewFile(uintptr(fd), "[seccomp filter]"), nil +} diff --git a/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_linux_test.go b/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_linux_test.go new file mode 100644 index 00000000..e2d363a4 --- /dev/null +++ b/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_linux_test.go @@ -0,0 +1,309 @@ +//go:build cgo && seccomp +// +build cgo,seccomp + +package patchbpf + +import ( + "bytes" + "encoding/binary" + "fmt" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + + libseccomp "github.com/seccomp/libseccomp-golang" + "golang.org/x/net/bpf" +) + +type seccompData struct { + Syscall uint32 // NOTE: We assume sizeof(int) == 4. 
+ Arch uint32 + IP uint64 + Args [6]uint64 +} + +// mockSyscallPayload creates a fake seccomp_data struct with the given data. +func mockSyscallPayload(t *testing.T, sysno libseccomp.ScmpSyscall, arch nativeArch, args ...uint64) []byte { + var buf bytes.Buffer + + data := seccompData{ + Syscall: uint32(sysno), + Arch: uint32(arch), + IP: 0xDEADBEEFCAFE, + } + + copy(data.Args[:], args) + if len(args) > 6 { + t.Fatalf("bad syscall payload: linux only supports 6-argument syscalls") + } + + // NOTE: We use BigEndian here because golang.org/x/net/bpf assumes that + // all payloads are big-endian while seccomp uses host endianness. + if err := binary.Write(&buf, binary.BigEndian, data); err != nil { + t.Fatalf("bad syscall payload: cannot write data: %v", err) + } + return buf.Bytes() +} + +// retFallthrough is returned by the mockFilter. If a the mock filter returns +// this value, it indicates "fallthrough to libseccomp-generated filter". +const retFallthrough uint32 = 0xDEADBEEF + +// mockFilter returns a BPF VM that contains a mock filter with an -ENOSYS +// stub. If the filter returns retFallthrough, the stub filter has permitted +// the syscall to pass. +func mockFilter(t *testing.T, config *configs.Seccomp) (*bpf.VM, []bpf.Instruction) { + patch, err := generatePatch(config) + if err != nil { + t.Fatalf("mock filter: generate enosys patch: %v", err) + } + + program := append(patch, bpf.RetConstant{Val: retFallthrough}) + + vm, err := bpf.NewVM(program) + if err != nil { + t.Fatalf("mock filter: compile BPF VM: %v", err) + } + return vm, program +} + +// fakeConfig generates a fake libcontainer seccomp configuration. The syscalls +// are added with an action distinct from the default action. 
+func fakeConfig(defaultAction configs.Action, explicitSyscalls []string, arches []string) *configs.Seccomp { + config := configs.Seccomp{ + DefaultAction: defaultAction, + Architectures: arches, + } + syscallAction := configs.Allow + if syscallAction == defaultAction { + syscallAction = configs.Kill + } + for _, syscall := range explicitSyscalls { + config.Syscalls = append(config.Syscalls, &configs.Syscall{ + Name: syscall, + Action: syscallAction, + }) + } + return &config +} + +// List copied from . +var testArches = []string{ + "x86", + "amd64", + "x32", + "arm", + "arm64", + "mips", + "mips64", + "mips64n32", + "mipsel", + "mipsel64", + "mipsel64n32", + "ppc", + "ppc64", + "ppc64le", + "s390", + "s390x", +} + +func testEnosysStub(t *testing.T, defaultAction configs.Action, arches []string) { + explicitSyscalls := []string{ + "setns", + "kcmp", + "renameat2", + "copy_file_range", + } + + implicitSyscalls := []string{ + "clone", + "openat", + "read", + "write", + } + + futureSyscalls := []libseccomp.ScmpSyscall{1000, 7331} + + // Quick lookups for which arches are enabled. + archSet := map[string]bool{} + for _, arch := range arches { + archSet[arch] = true + } + + for _, test := range []struct { + start, end int + }{ + {0, 1}, // [setns] + {0, 2}, // [setns, process_vm_readv] + {1, 2}, // [process_vm_readv] + {1, 3}, // [process_vm_readv, renameat2, copy_file_range] + {1, 4}, // [process_vm_readv, renameat2, copy_file_range] + {3, 4}, // [copy_file_range] + } { + allowedSyscalls := explicitSyscalls[test.start:test.end] + config := fakeConfig(defaultAction, allowedSyscalls, arches) + filter, program := mockFilter(t, config) + + // The syscalls are in increasing order of newness, so all syscalls + // after the last allowed syscall will give -ENOSYS. 
+ enosysStart := test.end + + for _, arch := range testArches { + type syscallTest struct { + syscall string + sysno libseccomp.ScmpSyscall + expected uint32 + } + + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + t.Fatalf("unknown libseccomp architecture %q: %v", arch, err) + } + + nativeArch, err := archToNative(scmpArch) + if err != nil { + t.Fatalf("unknown audit architecture %q: %v", arch, err) + } + + var syscallTests []syscallTest + + // Add explicit syscalls (whether they will return -ENOSYS + // depends on the filter rules). + for idx, syscall := range explicitSyscalls { + expected := retFallthrough + if idx >= enosysStart { + expected = retErrnoEnosys + } + sysno, err := libseccomp.GetSyscallFromNameByArch(syscall, scmpArch) + if err != nil { + t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) + } + syscallTests = append(syscallTests, syscallTest{ + syscall, + sysno, + expected, + }) + } + + // Add implicit syscalls. + for _, syscall := range implicitSyscalls { + sysno, err := libseccomp.GetSyscallFromNameByArch(syscall, scmpArch) + if err != nil { + t.Fatalf("unknown syscall %q on arch %q: %v", syscall, arch, err) + } + syscallTests = append(syscallTests, syscallTest{ + sysno: sysno, + syscall: syscall, + expected: retFallthrough, + }) + } + + // Add future syscalls. + for _, sysno := range futureSyscalls { + baseSysno, err := libseccomp.GetSyscallFromNameByArch("copy_file_range", scmpArch) + if err != nil { + t.Fatalf("unknown syscall 'copy_file_range' on arch %q: %v", arch, err) + } + sysno += baseSysno + + syscallTests = append(syscallTests, syscallTest{ + sysno: sysno, + syscall: fmt.Sprintf("syscall_%#x", sysno), + expected: retErrnoEnosys, + }) + } + + // If we're on s390(x) make sure you get -ENOSYS for the "setup" + // syscall (this is done to work around an issue with s390x's + // syscall multiplexing which results in unknown syscalls being a + // setup(2) invocation). 
+ switch scmpArch { + case libseccomp.ArchS390, libseccomp.ArchS390X: + syscallTests = append(syscallTests, syscallTest{ + sysno: s390xMultiplexSyscall, + syscall: "setup", + expected: retErrnoEnosys, + }) + } + + // Test syscalls in the explicit list. + for _, test := range syscallTests { + // Override the expected value in the two special cases. + if !archSet[arch] || isAllowAction(defaultAction) { + test.expected = retFallthrough + } + + payload := mockSyscallPayload(t, test.sysno, nativeArch, 0x1337, 0xF00BA5) + // NOTE: golang.org/x/net/bpf returns int here rather + // than uint32. + rawRet, err := filter.Run(payload) + if err != nil { + t.Fatalf("error running filter: %v", err) + } + ret := uint32(rawRet) + if ret != test.expected { + t.Logf("mock filter for %v %v:", arches, allowedSyscalls) + for idx, insn := range program { + t.Logf(" [%4.1d] %s", idx, insn) + } + t.Logf("payload: %#v", payload) + t.Errorf("filter %s(%d) %q(%d): got %#x, want %#x", arch, nativeArch, test.syscall, test.sysno, ret, test.expected) + } + } + } + } +} + +var testActions = map[string]configs.Action{ + "allow": configs.Allow, + "log": configs.Log, + "errno": configs.Errno, + "kill": configs.Kill, +} + +func TestEnosysStub_SingleArch(t *testing.T) { + for _, arch := range testArches { + arches := []string{arch} + t.Run("arch="+arch, func(t *testing.T) { + for name, action := range testActions { + t.Run("action="+name, func(t *testing.T) { + testEnosysStub(t, action, arches) + }) + } + }) + } +} + +func TestEnosysStub_MultiArch(t *testing.T) { + for end := 0; end < len(testArches); end++ { + for start := 0; start < end; start++ { + arches := testArches[start:end] + if len(arches) <= 1 { + continue + } + for _, action := range testActions { + testEnosysStub(t, action, arches) + } + } + } +} + +func TestDisassembleHugeFilterDoesNotHang(t *testing.T) { + hugeFilter, err := libseccomp.NewFilter(libseccomp.ActAllow) + if err != nil { + t.Fatalf("failed to create seccomp filter: %v", err) 
+ } + + for i := 1; i < 10000; i++ { + if err := hugeFilter.AddRule(libseccomp.ScmpSyscall(i), libseccomp.ActKillThread); err != nil { + t.Fatalf("failed to add rule to filter %d: %v", i, err) + } + } + + _, err = disassembleFilter(hugeFilter) + if err != nil { + t.Fatalf("failed to disassembleFilter: %v", err) + } + + // if we exit, we did not hang +} diff --git a/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_unsupported.go b/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_unsupported.go new file mode 100644 index 00000000..d23167ae --- /dev/null +++ b/sysbox-runc/libcontainer/seccomp/patchbpf/enosys_unsupported.go @@ -0,0 +1,4 @@ +//go:build !linux || !cgo || !seccomp +// +build !linux !cgo !seccomp + +package patchbpf diff --git a/sysbox-runc/libcontainer/seccomp/seccomp_linux.go b/sysbox-runc/libcontainer/seccomp/seccomp_linux.go new file mode 100644 index 00000000..2c64ebbb --- /dev/null +++ b/sysbox-runc/libcontainer/seccomp/seccomp_linux.go @@ -0,0 +1,350 @@ +//go:build cgo && seccomp +// +build cgo,seccomp + +package seccomp + +import ( + "errors" + "fmt" + "os" + + libseccomp "github.com/seccomp/libseccomp-golang" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/seccomp/patchbpf" + "github.com/opencontainers/runtime-spec/specs-go" +) + +var ( + actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM)) + actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM)) +) + +const ( + // Linux system calls can have at most 6 arguments + syscallMaxArguments int = 6 +) + +// InitSeccomp installs the seccomp filters to be used in the container as +// specified in config. Returns the seccomp file descriptor if any of the +// filters include a SCMP_ACT_NOTIFY action. 
+func InitSeccomp(config *configs.Seccomp) (*os.File, error) { + if config == nil { + return nil, errors.New("cannot initialize Seccomp - nil config passed") + } + + defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet) + if err != nil { + return nil, errors.New("error initializing seccomp - invalid default action") + } + + // Ignore the error since pre-2.4 libseccomp is treated as API level 0. + apiLevel, _ := libseccomp.GetAPI() + for _, call := range config.Syscalls { + if call.Action == configs.Notify { + if apiLevel < 6 { + return nil, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel) + } + + // We can't allow the write syscall to notify to the seccomp agent. + // After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain + // number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we + // never can write the seccomp fd to the parent and therefore the seccomp agent never receives + // the seccomp fd and runc is hang during initialization. + // + // Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY. + // Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and + // send the seccomp fd to the agent (it is another process and not subject to the seccomp + // filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp + // agent allows those syscalls to proceed, initialization works just fine and the agent can + // handle future read()/close() syscalls as it wanted. + if call.Name == "write" { + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall") + } + } + } + + // See comment on why write is not allowed. The same reason applies, as this can mean handling write too. 
+ if defaultAction == libseccomp.ActNotify { + return nil, errors.New("SCMP_ACT_NOTIFY cannot be used as default action") + } + + filter, err := libseccomp.NewFilter(defaultAction) + if err != nil { + return nil, fmt.Errorf("error creating filter: %w", err) + } + + // Add extra architectures + for _, arch := range config.Architectures { + scmpArch, err := libseccomp.GetArchFromString(arch) + if err != nil { + return nil, fmt.Errorf("error validating Seccomp architecture: %w", err) + } + if err := filter.AddArch(scmpArch); err != nil { + return nil, fmt.Errorf("error adding architecture to seccomp filter: %w", err) + } + } + + // Add extra flags. + for _, flag := range config.Flags { + if err := setFlag(filter, flag); err != nil { + return nil, err + } + } + + // Enable libseccomp binary tree optimization for longer rulesets. + // + // The number below chosen semi-arbitrarily, considering the following: + // 1. libseccomp <= 2.5.4 misbehaves when binary tree optimization + // is enabled and there are 0 rules. + // 2. All known libseccomp versions (2.5.0 to 2.5.4) generate a binary + // tree with 4 syscalls per node. + if len(config.Syscalls) > 32 { + if err := filter.SetOptimize(2); err != nil { + // The error is not fatal and is probably means we have older libseccomp. 
+ logrus.Debugf("seccomp binary tree optimization not available: %v", err) + } + } + + // Unset no new privs bit + if err := filter.SetNoNewPrivsBit(false); err != nil { + return nil, fmt.Errorf("error setting no new privileges: %w", err) + } + + // Add a rule for each syscall + for _, call := range config.Syscalls { + if call == nil { + return nil, errors.New("encountered nil syscall while initializing Seccomp") + } + + if err := matchCall(filter, call, defaultAction); err != nil { + return nil, err + } + } + + seccompFd, err := patchbpf.PatchAndLoad(config, filter) + if err != nil { + return nil, fmt.Errorf("error loading seccomp filter into kernel: %w", err) + } + return seccompFd, nil +} + +type unknownFlagError struct { + flag specs.LinuxSeccompFlag +} + +func (e *unknownFlagError) Error() string { + return "seccomp flag " + string(e.flag) + " is not known to runc" +} + +func setFlag(filter *libseccomp.ScmpFilter, flag specs.LinuxSeccompFlag) error { + switch flag { + case flagTsync: + // libseccomp-golang always use filterAttrTsync when + // possible so all goroutines will receive the same + // rules, so there is nothing to do. It does not make + // sense to apply the seccomp filter on only one + // thread; other threads will be terminated after exec + // anyway. + return nil + case specs.LinuxSeccompFlagLog: + if err := filter.SetLogBit(true); err != nil { + return fmt.Errorf("error adding log flag to seccomp filter: %w", err) + } + return nil + case specs.LinuxSeccompFlagSpecAllow: + if err := filter.SetSSB(true); err != nil { + return fmt.Errorf("error adding SSB flag to seccomp filter: %w", err) + } + return nil + } + // NOTE when adding more flags above, do not forget to also: + // - add new flags to `flags` slice in config.go; + // - add new flag values to flags_value() in tests/integration/seccomp.bats; + // - modify func filterFlags in patchbpf/ accordingly. 
+ + return &unknownFlagError{flag: flag} +} + +// FlagSupported checks if the flag is known to runc and supported by +// currently used libseccomp and kernel (i.e. it can be set). +func FlagSupported(flag specs.LinuxSeccompFlag) error { + filter := &libseccomp.ScmpFilter{} + err := setFlag(filter, flag) + + // For flags we don't know, setFlag returns unknownFlagError. + var uf *unknownFlagError + if errors.As(err, &uf) { + return err + } + // For flags that are known to runc and libseccomp-golang but can not + // be applied because either libseccomp or the kernel is too old, + // seccomp.VersionError is returned. + var verErr *libseccomp.VersionError + if errors.As(err, &verErr) { + // Not supported by libseccomp or the kernel. + return err + } + + // All other flags are known and supported. + return nil +} + +// Convert Libcontainer Action to Libseccomp ScmpAction +func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) { + switch act { + case configs.Kill, configs.KillThread: + return libseccomp.ActKillThread, nil + case configs.Errno: + if errnoRet != nil { + return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil + } + return actErrno, nil + case configs.Trap: + return libseccomp.ActTrap, nil + case configs.Allow: + return libseccomp.ActAllow, nil + case configs.Trace: + if errnoRet != nil { + return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil + } + return actTrace, nil + case configs.Log: + return libseccomp.ActLog, nil + case configs.Notify: + return libseccomp.ActNotify, nil + case configs.KillProcess: + return libseccomp.ActKillProcess, nil + default: + return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule") + } +} + +// Convert Libcontainer Operator to Libseccomp ScmpCompareOp +func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) { + switch op { + case configs.EqualTo: + return libseccomp.CompareEqual, nil + case configs.NotEqualTo: + return libseccomp.CompareNotEqual, 
nil + case configs.GreaterThan: + return libseccomp.CompareGreater, nil + case configs.GreaterThanOrEqualTo: + return libseccomp.CompareGreaterEqual, nil + case configs.LessThan: + return libseccomp.CompareLess, nil + case configs.LessThanOrEqualTo: + return libseccomp.CompareLessOrEqual, nil + case configs.MaskEqualTo: + return libseccomp.CompareMaskedEqual, nil + default: + return libseccomp.CompareInvalid, errors.New("invalid operator, cannot use in rule") + } +} + +// Convert Libcontainer Arg to Libseccomp ScmpCondition +func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) { + cond := libseccomp.ScmpCondition{} + + if arg == nil { + return cond, errors.New("cannot convert nil to syscall condition") + } + + op, err := getOperator(arg.Op) + if err != nil { + return cond, err + } + + return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo) +} + +// Add a rule to match a single syscall +func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libseccomp.ScmpAction) error { + if call == nil || filter == nil { + return errors.New("cannot use nil as syscall to block") + } + + if len(call.Name) == 0 { + return errors.New("empty string is not a valid syscall") + } + + // Convert the call's action to the libseccomp equivalent + callAct, err := getAction(call.Action, call.ErrnoRet) + if err != nil { + return fmt.Errorf("action in seccomp profile is invalid: %w", err) + } + if callAct == defAct { + // This rule is redundant, silently skip it + // to avoid error from AddRule. + return nil + } + + // If we can't resolve the syscall, assume it is not supported + // by this kernel. Warn about it, don't error out. 
+ callNum, err := libseccomp.GetSyscallFromName(call.Name) + if err != nil { + logrus.Debugf("unknown seccomp syscall %q ignored", call.Name) + return nil + } + + // Unconditional match - just add the rule + if len(call.Args) == 0 { + if err := filter.AddRule(callNum, callAct); err != nil { + return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err) + } + } else { + // If two or more arguments have the same condition, + // Revert to old behavior, adding each condition as a separate rule + argCounts := make([]uint, syscallMaxArguments) + conditions := []libseccomp.ScmpCondition{} + + for _, cond := range call.Args { + newCond, err := getCondition(cond) + if err != nil { + return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err) + } + + argCounts[cond.Index] += 1 + + conditions = append(conditions, newCond) + } + + hasMultipleArgs := false + for _, count := range argCounts { + if count > 1 { + hasMultipleArgs = true + break + } + } + + if hasMultipleArgs { + // Revert to old behavior + // Add each condition attached to a separate rule + for _, cond := range conditions { + condArr := []libseccomp.ScmpCondition{cond} + + if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err) + } + } + } else { + // No conditions share same argument + // Use new, proper behavior + if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil { + return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err) + } + } + } + + return nil +} + +// Version returns major, minor, and micro. +func Version() (uint, uint, uint) { + return libseccomp.GetLibraryVersion() +} + +// Enabled is true if seccomp support is compiled in. 
+const Enabled = true diff --git a/sysbox-runc/libcontainer/seccomp/seccomp_unsupported.go b/sysbox-runc/libcontainer/seccomp/seccomp_unsupported.go new file mode 100644 index 00000000..b08a3498 --- /dev/null +++ b/sysbox-runc/libcontainer/seccomp/seccomp_unsupported.go @@ -0,0 +1,35 @@ +//go:build !linux || !cgo || !seccomp +// +build !linux !cgo !seccomp + +package seccomp + +import ( + "errors" + "os" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" +) + +var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported") + +// InitSeccomp does nothing because seccomp is not supported. +func InitSeccomp(config *configs.Seccomp) (*os.File, error) { + if config != nil { + return nil, ErrSeccompNotEnabled + } + return nil, nil +} + +// FlagSupported tells if a provided seccomp flag is supported. +func FlagSupported(_ specs.LinuxSeccompFlag) error { + return ErrSeccompNotEnabled +} + +// Version returns major, minor, and micro. +func Version() (uint, uint, uint) { + return 0, 0, 0 +} + +// Enabled is true if seccomp support is compiled in. +const Enabled = false diff --git a/sysbox-runc/libcontainer/setns_init_linux.go b/sysbox-runc/libcontainer/setns_init_linux.go new file mode 100644 index 00000000..cd6b298c --- /dev/null +++ b/sysbox-runc/libcontainer/setns_init_linux.go @@ -0,0 +1,134 @@ +//go:build linux +// +build linux + +package libcontainer + +import ( + "os" + "runtime" + + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/selinux/go-selinux" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// linuxSetnsInit performs the container's initialization for running a new process +// inside an existing container. 
+type linuxSetnsInit struct { + pipe *os.File + consoleSocket *os.File + config *initConfig +} + +func (l *linuxSetnsInit) getSessionRingName() string { + return "_ses." + l.config.ContainerId +} + +func (l *linuxSetnsInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if !l.config.Config.NoNewKeyring { + if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil { + return err + } + defer selinux.SetKeyLabel("") + // Do not inherit the parent's session keyring. + if _, err := keys.JoinSessionKeyring(l.getSessionRingName()); err != nil { + // Same justification as in standart_init_linux.go as to why we + // don't bail on ENOSYS. + // + // TODO(cyphar): And we should have logging here too. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } + } + } + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, false); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return err + } + } + if l.config.NoNewPrivileges { + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return err + } + } + if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil { + return err + } + defer selinux.SetExecLabel("") + + // Normally we enable seccomp just before exec'ing into the sys container's so as few + // syscalls take place after enabling seccomp. However, if the process does not have + // CAP_SYS_ADMIN (e.g., the process is non-root) and NoNewPrivileges is cleared, then + // we must enable seccomp here (before we drop the process caps in finalizeNamespace() + // below). Otherwise we get a permission denied error. 
+ + seccompNotifDone := false + seccompFiltDone := false + + if !l.config.NoNewPrivileges && + (l.config.Capabilities != nil && !utils.StringSliceContains(l.config.Capabilities.Effective, "CAP_SYS_ADMIN")) || + (l.config.Config.Capabilities != nil && !utils.StringSliceContains(l.config.Config.Capabilities.Effective, "CAP_SYS_ADMIN")) { + + if l.config.Config.SeccompNotif != nil { + if err := setupSyscallTraps(l.config, l.pipe); err != nil { + return newSystemErrorWithCause(err, "loading seccomp notification rules") + } + seccompNotifDone = true + } + + if l.config.Config.Seccomp != nil { + if _, err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "loading seccomp filtering rules") + } + seccompFiltDone = true + } + } + + if err := finalizeNamespace(l.config); err != nil { + return err + } + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return err + } + + // Set seccomp as close to execve as possible, so as few syscalls take + // place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). + if l.config.Config.SeccompNotif != nil && !seccompNotifDone { + if err := setupSyscallTraps(l.config, l.pipe); err != nil { + return newSystemErrorWithCause(err, "loading seccomp notification rules") + } + } + if l.config.Config.Seccomp != nil && !seccompFiltDone { + if _, err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "loading seccomp filtering rules") + } + } + + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal sysbox-runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). 
Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // runc CVE-2024-21626 for more information as to why this protection is + // necessary. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } + + return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) +} diff --git a/sysbox-runc/libcontainer/specconv/example.go b/sysbox-runc/libcontainer/specconv/example.go new file mode 100644 index 00000000..8a201bc7 --- /dev/null +++ b/sysbox-runc/libcontainer/specconv/example.go @@ -0,0 +1,230 @@ +package specconv + +import ( + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Example returns an example spec file, with many options set so a user can +// see what a standard spec file looks like. 
+func Example() *specs.Spec { + spec := &specs.Spec{ + Version: specs.Version, + Root: &specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: &specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Ambient: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Hostname: "runc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: "shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }, + }, + Linux: 
&specs.Linux{ + MaskedPaths: []string{ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi", + }, + ReadonlyPaths: []string{ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + }, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: specs.PIDNamespace, + }, + { + Type: specs.NetworkNamespace, + }, + { + Type: specs.IPCNamespace, + }, + { + Type: specs.UTSNamespace, + }, + { + Type: specs.MountNamespace, + }, + }, + }, + } + if cgroups.IsCgroup2UnifiedMode() { + spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{ + Type: specs.CgroupNamespace, + }) + } + return spec +} + +// ToRootless converts the given spec file into one that should work with +// rootless containers (euid != 0), by removing incompatible options and adding others that +// are needed. +func ToRootless(spec *specs.Spec) { + var namespaces []specs.LinuxNamespace + + // Remove networkns from the spec. + for _, ns := range spec.Linux.Namespaces { + switch ns.Type { + case specs.NetworkNamespace, specs.UserNamespace: + // Do nothing. + default: + namespaces = append(namespaces, ns) + } + } + // Add userns to the spec. + namespaces = append(namespaces, specs.LinuxNamespace{ + Type: specs.UserNamespace, + }) + spec.Linux.Namespaces = namespaces + + // Add mappings for the current user. + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Geteuid()), + ContainerID: 0, + Size: 1, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Getegid()), + ContainerID: 0, + Size: 1, + }} + + // Fix up mounts. + var mounts []specs.Mount + for _, mount := range spec.Mounts { + // Ignore all mounts that are under /sys. 
+ if strings.HasPrefix(mount.Destination, "/sys") { + continue + } + + // Remove all gid= and uid= mappings. + var options []string + for _, option := range mount.Options { + if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") { + options = append(options, option) + } + } + + mount.Options = options + mounts = append(mounts, mount) + } + // Add the sysfs mount as an rbind. + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) + spec.Mounts = mounts + + // Remove cgroup settings. + spec.Linux.Resources = nil +} diff --git a/sysbox-runc/libcontainer/specconv/spec_linux.go b/sysbox-runc/libcontainer/specconv/spec_linux.go new file mode 100644 index 00000000..ee1dc8fd --- /dev/null +++ b/sysbox-runc/libcontainer/specconv/spec_linux.go @@ -0,0 +1,1031 @@ +//go:build linux +// +build linux + +// Package specconv implements conversion of specifications to libcontainer +// configurations +package specconv + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + "syscall" + "time" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + sh "github.com/nestybox/sysbox-libs/idShiftUtils" + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/seccomp" + libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + + "golang.org/x/sys/unix" +) + +var namespaceMapping = map[specs.LinuxNamespaceType]configs.NamespaceType{ + specs.PIDNamespace: configs.NEWPID, + specs.NetworkNamespace: configs.NEWNET, + specs.MountNamespace: configs.NEWNS, + specs.UserNamespace: configs.NEWUSER, + specs.IPCNamespace: configs.NEWIPC, + specs.UTSNamespace: 
configs.NEWUTS, + specs.CgroupNamespace: configs.NEWCGROUP, +} + +var mountPropagationMapping = map[string]int{ + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "private": unix.MS_PRIVATE, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "slave": unix.MS_SLAVE, + "rshared": unix.MS_SHARED | unix.MS_REC, + "shared": unix.MS_SHARED, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + "unbindable": unix.MS_UNBINDABLE, + "": 0, +} + +// AllowedDevices is the set of devices which are automatically included for +// all containers. +// +// XXX (cyphar) +// +// This behaviour is at the very least "questionable" (if not outright +// wrong) according to the runtime-spec. +// +// Yes, we have to include certain devices other than the ones the user +// specifies, but several devices listed here are not part of the spec +// (including "mknod for any device"?!). In addition, these rules are +// appended to the user-provided set which means that users *cannot disable +// this behaviour*. +// +// ... unfortunately I'm too scared to change this now because who knows how +// many people depend on this (incorrect and arguably insecure) behaviour. 
+var AllowedDevices = []*devices.Device{ + // allow mknod for any device + { + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, + }, + { + Rule: devices.Rule{ + Type: devices.BlockDevice, + Major: devices.Wildcard, + Minor: devices.Wildcard, + Permissions: "m", + Allow: true, + }, + }, + { + Path: "/dev/null", + FileMode: 0666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + }, + { + Path: "/dev/random", + FileMode: 0666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 8, + Permissions: "rwm", + Allow: true, + }, + }, + { + Path: "/dev/full", + FileMode: 0666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 7, + Permissions: "rwm", + Allow: true, + }, + }, + { + Path: "/dev/tty", + FileMode: 0666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 5, + Minor: 0, + Permissions: "rwm", + Allow: true, + }, + }, + { + Path: "/dev/zero", + FileMode: 0666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 5, + Permissions: "rwm", + Allow: true, + }, + }, + { + Path: "/dev/urandom", + FileMode: 0666, + Uid: 0, + Gid: 0, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 1, + Minor: 9, + Permissions: "rwm", + Allow: true, + }, + }, + // /dev/pts/ - pts namespaces are "coming soon" + { + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 136, + Minor: devices.Wildcard, + Permissions: "rwm", + Allow: true, + }, + }, + { + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 5, + Minor: 2, + Permissions: "rwm", + Allow: true, + }, + }, + // tuntap + { + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 10, + Minor: 200, + Permissions: "rwm", + Allow: true, + }, + }, +} + +type CreateOpts struct { + CgroupName string + 
UseSystemdCgroup bool + NoPivotRoot bool + NoNewKeyring bool + Spec *specs.Spec + RootlessEUID bool + RootlessCgroups bool + RootfsUidShiftType sh.IDShiftType + BindMntUidShiftType sh.IDShiftType + SwitchDockerDns bool + RootfsCloned bool + FsuidMapFailOnErr bool + IDshiftIgnoreList []string +} + +// CreateLibcontainerConfig creates a new libcontainer configuration from a +// given specification and a cgroup name +func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { + // runc's cwd will always be the bundle path + rcwd, err := os.Getwd() + if err != nil { + return nil, err + } + cwd, err := filepath.Abs(rcwd) + if err != nil { + return nil, err + } + spec := opts.Spec + if spec.Root == nil { + return nil, fmt.Errorf("Root must be specified") + } + rootfsPath := spec.Root.Path + if !filepath.IsAbs(rootfsPath) { + rootfsPath = filepath.Join(cwd, rootfsPath) + } + labels := []string{} + for k, v := range spec.Annotations { + labels = append(labels, k+"="+v) + } + config := &configs.Config{ + Rootfs: rootfsPath, + NoPivotRoot: opts.NoPivotRoot, + Readonlyfs: spec.Root.Readonly, + Hostname: spec.Hostname, + Labels: append(labels, "bundle="+cwd), + NoNewKeyring: opts.NoNewKeyring, + RootlessEUID: opts.RootlessEUID, + RootlessCgroups: opts.RootlessCgroups, + RootfsUidShiftType: opts.RootfsUidShiftType, + BindMntUidShiftType: opts.BindMntUidShiftType, + SwitchDockerDns: opts.SwitchDockerDns, + RootfsCloned: opts.RootfsCloned, + FsuidMapFailOnErr: opts.FsuidMapFailOnErr, + IDshiftIgnoreList: opts.IDshiftIgnoreList, + } + + for _, m := range spec.Mounts { + mount, err := createLibcontainerMount(cwd, m) + if err != nil { + return nil, fmt.Errorf("failed to create lib container mount: %v", err) + } + config.Mounts = append(config.Mounts, mount) + } + + defaultDevs, err := createDevices(spec, config) + if err != nil { + return nil, err + } + + c, err := CreateCgroupConfig(opts, defaultDevs) + if err != nil { + return nil, err + } + + config.Cgroups = c + + 
// set linux-specific config + if spec.Linux != nil { + var exists bool + if config.RootPropagation, exists = mountPropagationMapping[spec.Linux.RootfsPropagation]; !exists { + return nil, fmt.Errorf("rootfsPropagation=%v is not supported", spec.Linux.RootfsPropagation) + } + if config.NoPivotRoot && (config.RootPropagation&unix.MS_PRIVATE != 0) { + return nil, fmt.Errorf("rootfsPropagation of [r]private is not safe without pivot_root") + } + + for _, ns := range spec.Linux.Namespaces { + t, exists := namespaceMapping[ns.Type] + if !exists { + return nil, fmt.Errorf("namespace %q does not exist", ns) + } + if config.Namespaces.Contains(t) { + return nil, fmt.Errorf("malformed spec file: duplicated ns %q", ns) + } + config.Namespaces.Add(t, ns.Path) + } + if config.Namespaces.Contains(configs.NEWNET) && config.Namespaces.PathOf(configs.NEWNET) == "" { + config.Networks = []*configs.Network{ + { + Type: "loopback", + }, + } + } + if config.Namespaces.Contains(configs.NEWUSER) { + if err := setupUserNamespace(spec, config); err != nil { + return nil, err + } + } + config.MaskPaths = spec.Linux.MaskedPaths + config.ReadonlyPaths = spec.Linux.ReadonlyPaths + config.MountLabel = spec.Linux.MountLabel + config.Sysctl = spec.Linux.Sysctl + if spec.Linux.Seccomp != nil { + seccomp, err := SetupSeccomp(spec.Linux.Seccomp) + if err != nil { + return nil, err + } + config.Seccomp = seccomp + } + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + if spec.Linux.IntelRdt.MemBwSchema != "" { + config.IntelRdt.MemBwSchema = spec.Linux.IntelRdt.MemBwSchema + } + } + } + if spec.Process != nil { + config.OomScoreAdj = spec.Process.OOMScoreAdj + config.NoNewPrivileges = spec.Process.NoNewPrivileges + config.Umask = spec.Process.User.Umask + if spec.Process.SelinuxLabel != "" { + config.ProcessLabel = spec.Process.SelinuxLabel + } + if 
spec.Process.Capabilities != nil { + config.Capabilities = &configs.Capabilities{ + Bounding: spec.Process.Capabilities.Bounding, + Effective: spec.Process.Capabilities.Effective, + Permitted: spec.Process.Capabilities.Permitted, + Inheritable: spec.Process.Capabilities.Inheritable, + Ambient: spec.Process.Capabilities.Ambient, + } + } + } + createHooks(spec, config) + config.Version = specs.Version + return config, nil +} + +func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error) { + flags, pgflags, data, ext := parseMountOptions(m.Options) + source := m.Source + device := m.Type + if flags&unix.MS_BIND != 0 { + // Any "type" the user specified is meaningless (and ignored) for + // bind-mounts -- so we set it to "bind" because rootfs_linux.go + // (incorrectly) relies on this for some checks. + device = "bind" + if !filepath.IsAbs(source) { + source = filepath.Join(cwd, m.Source) + } + } + + // sysbox-runc: for bind mounts, collect some info on the mount source. We do this + // here so that we don't have to do this from within the container's init process (as + // the latter may not have search permission into the bind source). 
+ var bindSrcInfo configs.BindSrcInfo + + if device == "bind" { + var err error + var fi os.FileInfo + + // check if the bind source is a directory + fi, err = os.Stat(source) + if err != nil { + return nil, fmt.Errorf("failed to stat mount source at %s: %v", source, err) + } + + // collect the bind source ownership info + st, ok := fi.Sys().(*syscall.Stat_t) + if !ok { + return nil, fmt.Errorf("failed to convert to syscall.Stat_t") + } + + bindSrcInfo = configs.BindSrcInfo{ + IsDir: fi.IsDir(), + Uid: st.Uid, + Gid: st.Gid, + } + } + + return &configs.Mount{ + Device: device, + Source: source, + Destination: m.Destination, + Data: data, + Flags: flags, + PropagationFlags: pgflags, + Extensions: ext, + BindSrcInfo: bindSrcInfo, + }, nil +} + +// systemd property name check: latin letters only, at least 3 of them +var isValidName = regexp.MustCompile(`^[a-zA-Z]{3,}$`).MatchString + +var isSecSuffix = regexp.MustCompile(`[a-z]Sec$`).MatchString + +// Some systemd properties are documented as having "Sec" suffix +// (e.g. TimeoutStopSec) but are expected to have "USec" suffix +// here, so let's provide conversion to improve compatibility. +func convertSecToUSec(value dbus.Variant) (dbus.Variant, error) { + var sec uint64 + const M = 1000000 + vi := value.Value() + switch value.Signature().String() { + case "y": + sec = uint64(vi.(byte)) * M + case "n": + sec = uint64(vi.(int16)) * M + case "q": + sec = uint64(vi.(uint16)) * M + case "i": + sec = uint64(vi.(int32)) * M + case "u": + sec = uint64(vi.(uint32)) * M + case "x": + sec = uint64(vi.(int64)) * M + case "t": + sec = vi.(uint64) * M + case "d": + sec = uint64(vi.(float64) * M) + default: + return value, errors.New("not a number") + } + return dbus.MakeVariant(sec), nil +} + +func initSystemdProps(spec *specs.Spec) ([]systemdDbus.Property, error) { + const keyPrefix = "org.systemd.property." 
+ var sp []systemdDbus.Property + + for k, v := range spec.Annotations { + name := strings.TrimPrefix(k, keyPrefix) + if len(name) == len(k) { // prefix not there + continue + } + if !isValidName(name) { + return nil, fmt.Errorf("Annotation %s name incorrect: %s", k, name) + } + value, err := dbus.ParseVariant(v, dbus.Signature{}) + if err != nil { + return nil, fmt.Errorf("Annotation %s=%s value parse error: %v", k, v, err) + } + if isSecSuffix(name) { + name = strings.TrimSuffix(name, "Sec") + "USec" + value, err = convertSecToUSec(value) + if err != nil { + return nil, fmt.Errorf("Annotation %s=%s value parse error: %v", k, v, err) + } + } + sp = append(sp, systemdDbus.Property{Name: name, Value: value}) + } + + return sp, nil +} + +func CreateCgroupConfig(opts *CreateOpts, defaultDevs []*devices.Device) (*configs.Cgroup, error) { + var ( + myCgroupPath string + + spec = opts.Spec + useSystemdCgroup = opts.UseSystemdCgroup + name = opts.CgroupName + ) + + c := &configs.Cgroup{ + Resources: &configs.Resources{}, + } + + if useSystemdCgroup { + sp, err := initSystemdProps(spec) + if err != nil { + return nil, err + } + c.SystemdProps = sp + } + + if spec.Linux != nil && spec.Linux.CgroupsPath != "" { + myCgroupPath = libcontainerUtils.CleanPath(spec.Linux.CgroupsPath) + if useSystemdCgroup { + myCgroupPath = spec.Linux.CgroupsPath + } + } + + if useSystemdCgroup { + if myCgroupPath == "" { + c.Parent = "system.slice" + c.ScopePrefix = "runc" + c.Name = name + } else { + // Parse the path from expected "slice:prefix:name" + // for e.g. 
"system.slice:docker:1234" + parts := strings.Split(myCgroupPath, ":") + if len(parts) != 3 { + return nil, fmt.Errorf("expected cgroupsPath to be of format \"slice:prefix:name\" for systemd cgroups, got %q instead", myCgroupPath) + } + c.Parent = parts[0] + c.ScopePrefix = parts[1] + c.Name = parts[2] + } + } else { + if myCgroupPath == "" { + c.Name = name + } + c.Path = myCgroupPath + } + + // In rootless containers, any attempt to make cgroup changes is likely to fail. + // libcontainer will validate this but ignores the error. + if spec.Linux != nil { + r := spec.Linux.Resources + if r != nil { + for i, d := range spec.Linux.Resources.Devices { + var ( + t = "a" + major = int64(-1) + minor = int64(-1) + ) + if d.Type != "" { + t = d.Type + } + if d.Major != nil { + major = *d.Major + } + if d.Minor != nil { + minor = *d.Minor + } + if d.Access == "" { + return nil, fmt.Errorf("device access at %d field cannot be empty", i) + } + dt, err := stringToCgroupDeviceRune(t) + if err != nil { + return nil, err + } + c.Resources.Devices = append(c.Resources.Devices, &devices.Rule{ + Type: dt, + Major: major, + Minor: minor, + Permissions: devices.Permissions(d.Access), + Allow: d.Allow, + }) + } + if r.Memory != nil { + if r.Memory.Limit != nil { + c.Resources.Memory = *r.Memory.Limit + } + if r.Memory.Reservation != nil { + c.Resources.MemoryReservation = *r.Memory.Reservation + } + if r.Memory.Swap != nil { + c.Resources.MemorySwap = *r.Memory.Swap + } + if r.Memory.Kernel != nil || r.Memory.KernelTCP != nil { + logrus.Warn("Kernel memory settings are ignored and will be removed") + } + if r.Memory.Swappiness != nil { + c.Resources.MemorySwappiness = r.Memory.Swappiness + } + if r.Memory.DisableOOMKiller != nil { + c.Resources.OomKillDisable = *r.Memory.DisableOOMKiller + } + } + if r.CPU != nil { + if r.CPU.Shares != nil { + c.Resources.CpuShares = *r.CPU.Shares + + //CpuWeight is used for cgroupv2 and should be converted + c.Resources.CpuWeight = 
cgroups.ConvertCPUSharesToCgroupV2Value(c.Resources.CpuShares) + } + if r.CPU.Quota != nil { + c.Resources.CpuQuota = *r.CPU.Quota + } + if r.CPU.Period != nil { + c.Resources.CpuPeriod = *r.CPU.Period + } + if r.CPU.RealtimeRuntime != nil { + c.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + } + if r.CPU.RealtimePeriod != nil { + c.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + } + if r.CPU.Cpus != "" { + c.Resources.CpusetCpus = r.CPU.Cpus + } + if r.CPU.Mems != "" { + c.Resources.CpusetMems = r.CPU.Mems + } + } + if r.Pids != nil { + c.Resources.PidsLimit = r.Pids.Limit + } + if r.BlockIO != nil { + if r.BlockIO.Weight != nil { + c.Resources.BlkioWeight = *r.BlockIO.Weight + } + if r.BlockIO.LeafWeight != nil { + c.Resources.BlkioLeafWeight = *r.BlockIO.LeafWeight + } + if r.BlockIO.WeightDevice != nil { + for _, wd := range r.BlockIO.WeightDevice { + var weight, leafWeight uint16 + if wd.Weight != nil { + weight = *wd.Weight + } + if wd.LeafWeight != nil { + leafWeight = *wd.LeafWeight + } + weightDevice := configs.NewWeightDevice(wd.Major, wd.Minor, weight, leafWeight) + c.Resources.BlkioWeightDevice = append(c.Resources.BlkioWeightDevice, weightDevice) + } + } + if r.BlockIO.ThrottleReadBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleReadBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleReadBpsDevice = append(c.Resources.BlkioThrottleReadBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteBpsDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteBpsDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteBpsDevice = append(c.Resources.BlkioThrottleWriteBpsDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleReadIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleReadIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + 
c.Resources.BlkioThrottleReadIOPSDevice = append(c.Resources.BlkioThrottleReadIOPSDevice, throttleDevice) + } + } + if r.BlockIO.ThrottleWriteIOPSDevice != nil { + for _, td := range r.BlockIO.ThrottleWriteIOPSDevice { + rate := td.Rate + throttleDevice := configs.NewThrottleDevice(td.Major, td.Minor, rate) + c.Resources.BlkioThrottleWriteIOPSDevice = append(c.Resources.BlkioThrottleWriteIOPSDevice, throttleDevice) + } + } + } + for _, l := range r.HugepageLimits { + c.Resources.HugetlbLimit = append(c.Resources.HugetlbLimit, &configs.HugepageLimit{ + Pagesize: l.Pagesize, + Limit: l.Limit, + }) + } + if len(r.Rdma) > 0 { + c.Resources.Rdma = make(map[string]configs.LinuxRdma, len(r.Rdma)) + for k, v := range r.Rdma { + c.Resources.Rdma[k] = configs.LinuxRdma{ + HcaHandles: v.HcaHandles, + HcaObjects: v.HcaObjects, + } + } + } + if r.Network != nil { + if r.Network.ClassID != nil { + c.Resources.NetClsClassid = *r.Network.ClassID + } + for _, m := range r.Network.Priorities { + c.Resources.NetPrioIfpriomap = append(c.Resources.NetPrioIfpriomap, &configs.IfPrioMap{ + Interface: m.Name, + Priority: int64(m.Priority), + }) + } + } + if len(r.Unified) > 0 { + // copy the map + c.Resources.Unified = make(map[string]string, len(r.Unified)) + for k, v := range r.Unified { + c.Resources.Unified[k] = v + } + } + } + } + + // Append the default allowed devices to the end of the list. 
+ for _, device := range defaultDevs { + c.Resources.Devices = append(c.Resources.Devices, &device.Rule) + } + return c, nil +} + +func stringToCgroupDeviceRune(s string) (devices.Type, error) { + switch s { + case "a": + return devices.WildcardDevice, nil + case "b": + return devices.BlockDevice, nil + case "c": + return devices.CharDevice, nil + default: + return 0, fmt.Errorf("invalid cgroup device type %q", s) + } +} + +func stringToDeviceRune(s string) (devices.Type, error) { + switch s { + case "p": + return devices.FifoDevice, nil + case "u", "c": + return devices.CharDevice, nil + case "b": + return devices.BlockDevice, nil + default: + return 0, fmt.Errorf("invalid device type %q", s) + } +} + +func createDevices(spec *specs.Spec, config *configs.Config) ([]*devices.Device, error) { + // If a spec device is redundant with a default device, remove that default + // device (the spec one takes priority). + dedupedAllowDevs := []*devices.Device{} + +next: + for _, ad := range AllowedDevices { + if ad.Path != "" { + for _, sd := range spec.Linux.Devices { + if sd.Path == ad.Path { + continue next + } + } + } + dedupedAllowDevs = append(dedupedAllowDevs, ad) + if ad.Path != "" { + config.Devices = append(config.Devices, ad) + } + } + + // Merge in additional devices from the spec. 
+ if spec.Linux != nil { + for _, d := range spec.Linux.Devices { + var uid, gid uint32 + var filemode os.FileMode = 0666 + + if d.UID != nil { + uid = *d.UID + } + if d.GID != nil { + gid = *d.GID + } + dt, err := stringToDeviceRune(d.Type) + if err != nil { + return nil, err + } + if d.FileMode != nil { + filemode = *d.FileMode + } + device := &devices.Device{ + Rule: devices.Rule{ + Type: dt, + Major: d.Major, + Minor: d.Minor, + }, + Path: d.Path, + FileMode: filemode, + Uid: uid, + Gid: gid, + } + config.Devices = append(config.Devices, device) + } + } + + return dedupedAllowDevs, nil +} + +func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { + create := func(m specs.LinuxIDMapping) configs.IDMap { + return configs.IDMap{ + HostID: int(m.HostID), + ContainerID: int(m.ContainerID), + Size: int(m.Size), + } + } + if spec.Linux != nil { + for _, m := range spec.Linux.UIDMappings { + config.UidMappings = append(config.UidMappings, create(m)) + } + for _, m := range spec.Linux.GIDMappings { + config.GidMappings = append(config.GidMappings, create(m)) + } + } + rootUID, err := config.HostRootUID() + if err != nil { + return err + } + rootGID, err := config.HostRootGID() + if err != nil { + return err + } + for _, node := range config.Devices { + node.Uid = uint32(rootUID) + node.Gid = uint32(rootGID) + } + return nil +} + +// parseMountOptions parses the string and returns the flags, propagation +// flags and any mount data that it contains. 
+func parseMountOptions(options []string) (int, []int, string, int) { + var ( + flag int + pgflag []int + data []string + extFlags int + ) + flags := map[string]struct { + clear bool + flag int + }{ + "acl": {false, unix.MS_POSIXACL}, + "async": {true, unix.MS_SYNCHRONOUS}, + "atime": {true, unix.MS_NOATIME}, + "bind": {false, unix.MS_BIND}, + "defaults": {false, 0}, + "dev": {true, unix.MS_NODEV}, + "diratime": {true, unix.MS_NODIRATIME}, + "dirsync": {false, unix.MS_DIRSYNC}, + "exec": {true, unix.MS_NOEXEC}, + "iversion": {false, unix.MS_I_VERSION}, + "lazytime": {false, unix.MS_LAZYTIME}, + "loud": {true, unix.MS_SILENT}, + "mand": {false, unix.MS_MANDLOCK}, + "noacl": {true, unix.MS_POSIXACL}, + "noatime": {false, unix.MS_NOATIME}, + "nodev": {false, unix.MS_NODEV}, + "nodiratime": {false, unix.MS_NODIRATIME}, + "noexec": {false, unix.MS_NOEXEC}, + "noiversion": {true, unix.MS_I_VERSION}, + "nolazytime": {true, unix.MS_LAZYTIME}, + "nomand": {true, unix.MS_MANDLOCK}, + "norelatime": {true, unix.MS_RELATIME}, + "nostrictatime": {true, unix.MS_STRICTATIME}, + "nosuid": {false, unix.MS_NOSUID}, + "rbind": {false, unix.MS_BIND | unix.MS_REC}, + "relatime": {false, unix.MS_RELATIME}, + "remount": {false, unix.MS_REMOUNT}, + "ro": {false, unix.MS_RDONLY}, + "rw": {true, unix.MS_RDONLY}, + "silent": {false, unix.MS_SILENT}, + "strictatime": {false, unix.MS_STRICTATIME}, + "suid": {true, unix.MS_NOSUID}, + "sync": {false, unix.MS_SYNCHRONOUS}, + } + propagationFlags := map[string]int{ + "private": unix.MS_PRIVATE, + "shared": unix.MS_SHARED, + "slave": unix.MS_SLAVE, + "unbindable": unix.MS_UNBINDABLE, + "rprivate": unix.MS_PRIVATE | unix.MS_REC, + "rshared": unix.MS_SHARED | unix.MS_REC, + "rslave": unix.MS_SLAVE | unix.MS_REC, + "runbindable": unix.MS_UNBINDABLE | unix.MS_REC, + } + extensionFlags := map[string]struct { + clear bool + flag int + }{ + "tmpcopyup": {false, configs.EXT_COPYUP}, + } + for _, o := range options { + // If the option does not exist in the 
flags table or the flag + // is not supported on the platform, + // then it is a data value for a specific fs type + if f, exists := flags[o]; exists && f.flag != 0 { + if f.clear { + flag &= ^f.flag + } else { + flag |= f.flag + } + } else if f, exists := propagationFlags[o]; exists && f != 0 { + pgflag = append(pgflag, f) + } else if f, exists := extensionFlags[o]; exists && f.flag != 0 { + if f.clear { + extFlags &= ^f.flag + } else { + extFlags |= f.flag + } + } else { + data = append(data, o) + } + } + return flag, pgflag, strings.Join(data, ","), extFlags +} + +func SetupSeccomp(config *specs.LinuxSeccomp) (*configs.Seccomp, error) { + if config == nil { + return nil, nil + } + + // No default action specified, no syscalls listed, assume seccomp disabled + if config.DefaultAction == "" && len(config.Syscalls) == 0 { + return nil, nil + } + + newConfig := new(configs.Seccomp) + newConfig.Syscalls = []*configs.Syscall{} + + // The list of flags defined in runtime-spec is a subset of the flags + // in the seccomp() syscall. + if config.Flags == nil { + // No flags are set explicitly (not even the empty set); + // set the default of specs.LinuxSeccompFlagSpecAllow, + // if it is supported by the libseccomp and the kernel. + if err := seccomp.FlagSupported(specs.LinuxSeccompFlagSpecAllow); err == nil { + newConfig.Flags = []specs.LinuxSeccompFlag{specs.LinuxSeccompFlagSpecAllow} + } + } else { + // Fail early if some flags are unknown or unsupported. 
+ for _, flag := range config.Flags { + if err := seccomp.FlagSupported(flag); err != nil { + return nil, err + } + newConfig.Flags = append(newConfig.Flags, flag) + } + } + + if len(config.Architectures) > 0 { + newConfig.Architectures = []string{} + for _, arch := range config.Architectures { + newArch, err := seccomp.ConvertStringToArch(string(arch)) + if err != nil { + return nil, err + } + newConfig.Architectures = append(newConfig.Architectures, newArch) + } + } + + // Convert default action from string representation + newDefaultAction, err := seccomp.ConvertStringToAction(string(config.DefaultAction)) + if err != nil { + return nil, err + } + newConfig.DefaultAction = newDefaultAction + newConfig.DefaultErrnoRet = config.DefaultErrnoRet + + newConfig.ListenerPath = config.ListenerPath + newConfig.ListenerMetadata = config.ListenerMetadata + + // Loop through all syscall blocks and convert them to libcontainer format + for _, call := range config.Syscalls { + newAction, err := seccomp.ConvertStringToAction(string(call.Action)) + if err != nil { + return nil, err + } + + for _, name := range call.Names { + newCall := configs.Syscall{ + Name: name, + Action: newAction, + ErrnoRet: call.ErrnoRet, + Args: []*configs.Arg{}, + } + // Loop through all the arguments of the syscall and convert them + for _, arg := range call.Args { + + newOp, err := seccomp.ConvertStringToOperator(string(arg.Op)) + if err != nil { + return nil, err + } + + newArg := configs.Arg{ + Index: arg.Index, + Value: arg.Value, + ValueTwo: arg.ValueTwo, + Op: newOp, + } + + newCall.Args = append(newCall.Args, &newArg) + } + newConfig.Syscalls = append(newConfig.Syscalls, &newCall) + } + } + + return newConfig, nil +} + +func createHooks(rspec *specs.Spec, config *configs.Config) { + config.Hooks = configs.Hooks{} + if rspec.Hooks != nil { + for _, h := range rspec.Hooks.Prestart { + cmd := createCommandHook(h) + config.Hooks[configs.Prestart] = append(config.Hooks[configs.Prestart], 
configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.CreateRuntime { + cmd := createCommandHook(h) + config.Hooks[configs.CreateRuntime] = append(config.Hooks[configs.CreateRuntime], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.CreateContainer { + cmd := createCommandHook(h) + config.Hooks[configs.CreateContainer] = append(config.Hooks[configs.CreateContainer], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.StartContainer { + cmd := createCommandHook(h) + config.Hooks[configs.StartContainer] = append(config.Hooks[configs.StartContainer], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststart { + cmd := createCommandHook(h) + config.Hooks[configs.Poststart] = append(config.Hooks[configs.Poststart], configs.NewCommandHook(cmd)) + } + for _, h := range rspec.Hooks.Poststop { + cmd := createCommandHook(h) + config.Hooks[configs.Poststop] = append(config.Hooks[configs.Poststop], configs.NewCommandHook(cmd)) + } + } +} + +func createCommandHook(h specs.Hook) configs.Command { + cmd := configs.Command{ + Path: h.Path, + Args: h.Args, + Env: h.Env, + } + if h.Timeout != nil { + d := time.Duration(*h.Timeout) * time.Second + cmd.Timeout = &d + } + return cmd +} diff --git a/sysbox-runc/libcontainer/specconv/spec_linux_test.go b/sysbox-runc/libcontainer/specconv/spec_linux_test.go new file mode 100644 index 00000000..f3c543a5 --- /dev/null +++ b/sysbox-runc/libcontainer/specconv/spec_linux_test.go @@ -0,0 +1,748 @@ +// +build linux + +package specconv + +import ( + "os" + "strings" + "testing" + + dbus "github.com/godbus/dbus/v5" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" +) + +func TestCreateCommandHookTimeout(t *testing.T) { + timeout := 3600 + hook := specs.Hook{ + Path: "/some/hook/path", + 
Args: []string{"--some", "thing"}, + Env: []string{"SOME=value"}, + Timeout: &timeout, + } + command := createCommandHook(hook) + timeoutStr := command.Timeout.String() + if timeoutStr != "1h0m0s" { + t.Errorf("Expected the Timeout to be 1h0m0s, got: %s", timeoutStr) + } +} + +func TestCreateHooks(t *testing.T) { + rspec := &specs.Spec{ + Hooks: &specs.Hooks{ + Prestart: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + CreateRuntime: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + CreateContainer: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + StartContainer: []specs.Hook{ + { + Path: "/some/hook/path", + }, + { + Path: "/some/hook2/path", + Args: []string{"--some", "thing"}, + }, + }, + Poststart: []specs.Hook{ + { + Path: "/some/hook/path", + Args: []string{"--some", "thing"}, + Env: []string{"SOME=value"}, + }, + { + Path: "/some/hook2/path", + }, + { + Path: "/some/hook3/path", + }, + }, + Poststop: []specs.Hook{ + { + Path: "/some/hook/path", + Args: []string{"--some", "thing"}, + Env: []string{"SOME=value"}, + }, + { + Path: "/some/hook2/path", + }, + { + Path: "/some/hook3/path", + }, + { + Path: "/some/hook4/path", + Args: []string{"--some", "thing"}, + }, + }, + }, + } + conf := &configs.Config{} + createHooks(rspec, conf) + + prestart := conf.Hooks[configs.Prestart] + + if len(prestart) != 2 { + t.Error("Expected 2 Prestart hooks") + } + + createRuntime := conf.Hooks[configs.CreateRuntime] + + if len(createRuntime) != 2 { + t.Error("Expected 2 createRuntime hooks") + } + + createContainer := conf.Hooks[configs.CreateContainer] + + if len(createContainer) != 2 { + t.Error("Expected 2 createContainer hooks") + } + + startContainer := conf.Hooks[configs.StartContainer] + + if len(startContainer) != 2 { + 
t.Error("Expected 2 startContainer hooks") + } + + poststart := conf.Hooks[configs.Poststart] + + if len(poststart) != 3 { + t.Error("Expected 3 Poststart hooks") + } + + poststop := conf.Hooks[configs.Poststop] + + if len(poststop) != 4 { + t.Error("Expected 4 Poststop hooks") + } + +} +func TestSetupSeccomp(t *testing.T) { + conf := &specs.LinuxSeccomp{ + DefaultAction: "SCMP_ACT_ERRNO", + Architectures: []specs.Arch{specs.ArchX86_64, specs.ArchARM}, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{"clone"}, + Action: "SCMP_ACT_ALLOW", + Args: []specs.LinuxSeccompArg{ + { + Index: 0, + Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP, + ValueTwo: 0, + Op: "SCMP_CMP_MASKED_EQ", + }, + }, + }, + { + Names: []string{ + "select", + "semctl", + "semget", + "semop", + "semtimedop", + "send", + "sendfile", + }, + Action: "SCMP_ACT_ALLOW", + }, + }, + } + seccomp, err := SetupSeccomp(conf) + + if err != nil { + t.Errorf("Couldn't create Seccomp config: %v", err) + } + + if seccomp.DefaultAction != 2 { // SCMP_ACT_ERRNO + t.Error("Wrong conversion for DefaultAction") + } + + if len(seccomp.Architectures) != 2 { + t.Error("Wrong number of architectures") + } + + if seccomp.Architectures[0] != "amd64" || seccomp.Architectures[1] != "arm" { + t.Error("Expected architectures are not found") + } + + calls := seccomp.Syscalls + + callsLength := len(calls) + if callsLength != 8 { + t.Errorf("Expected 8 syscalls, got :%d", callsLength) + } + + for i, call := range calls { + if i == 0 { + expectedCloneSyscallArgs := configs.Arg{ + Index: 0, + Op: 7, // SCMP_CMP_MASKED_EQ + Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP, + ValueTwo: 0, + } + if expectedCloneSyscallArgs != *call.Args[0] { + t.Errorf("Wrong arguments conversion for the clone syscall under test") + } + } + if 
call.Action != 4 { + t.Error("Wrong conversion for the clone syscall action") + } + + } + +} + +func TestLinuxCgroupWithMemoryResource(t *testing.T) { + cgroupsPath := "/user/cgroups/path/id" + + spec := &specs.Spec{} + devices := []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + } + + limit := int64(100) + reservation := int64(50) + swap := int64(20) + kernel := int64(40) + kernelTCP := int64(45) + swappiness := uint64(1) + swappinessPtr := &swappiness + disableOOMKiller := true + resources := &specs.LinuxResources{ + Devices: devices, + Memory: &specs.LinuxMemory{ + Limit: &limit, + Reservation: &reservation, + Swap: &swap, + Kernel: &kernel, + KernelTCP: &kernelTCP, + Swappiness: swappinessPtr, + DisableOOMKiller: &disableOOMKiller, + }, + } + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + Resources: resources, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts, nil) + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + if cgroup.Path != cgroupsPath { + t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path) + } + if cgroup.Resources.Memory != limit { + t.Errorf("Expected to have %d as memory limit, got %d", limit, cgroup.Resources.Memory) + } + if cgroup.Resources.MemoryReservation != reservation { + t.Errorf("Expected to have %d as memory reservation, got %d", reservation, cgroup.Resources.MemoryReservation) + } + if cgroup.Resources.MemorySwap != swap { + t.Errorf("Expected to have %d as swap, got %d", swap, cgroup.Resources.MemorySwap) + } + if cgroup.Resources.MemorySwappiness != swappinessPtr { + t.Errorf("Expected to have %d as memory swappiness, got %d", swappinessPtr, cgroup.Resources.MemorySwappiness) + } + if cgroup.Resources.OomKillDisable != disableOOMKiller { + t.Errorf("The OOMKiller should be enabled") + } +} + +func TestLinuxCgroupSystemd(t *testing.T) { + cgroupsPath := 
"parent:scopeprefix:name" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + UseSystemdCgroup: true, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts, nil) + + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + expectedParent := "parent" + if cgroup.Parent != expectedParent { + t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent) + } + + expectedScopePrefix := "scopeprefix" + if cgroup.ScopePrefix != expectedScopePrefix { + t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix) + } + + expectedName := "name" + if cgroup.Name != expectedName { + t.Errorf("Expected to have %s as Name instead of %s", expectedName, cgroup.Name) + } +} + +func TestLinuxCgroupSystemdWithEmptyPath(t *testing.T) { + cgroupsPath := "" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: true, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts, nil) + + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + expectedParent := "system.slice" + if cgroup.Parent != expectedParent { + t.Errorf("Expected to have %s as Parent instead of %s", expectedParent, cgroup.Parent) + } + + expectedScopePrefix := "runc" + if cgroup.ScopePrefix != expectedScopePrefix { + t.Errorf("Expected to have %s as ScopePrefix instead of %s", expectedScopePrefix, cgroup.ScopePrefix) + } + + if cgroup.Name != opts.CgroupName { + t.Errorf("Expected to have %s as Name instead of %s", opts.CgroupName, cgroup.Name) + } +} + +func TestLinuxCgroupSystemdWithInvalidPath(t *testing.T) { + cgroupsPath := "/user/cgroups/path/id" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: true, + Spec: spec, + } + + _, 
err := CreateCgroupConfig(opts, nil) + if err == nil { + t.Error("Expected to produce an error if not using the correct format for cgroup paths belonging to systemd") + } +} +func TestLinuxCgroupsPathSpecified(t *testing.T) { + cgroupsPath := "/user/cgroups/path/id" + + spec := &specs.Spec{} + spec.Linux = &specs.Linux{ + CgroupsPath: cgroupsPath, + } + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts, nil) + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + if cgroup.Path != cgroupsPath { + t.Errorf("Wrong cgroupsPath, expected '%s' got '%s'", cgroupsPath, cgroup.Path) + } +} + +func TestLinuxCgroupsPathNotSpecified(t *testing.T) { + spec := &specs.Spec{} + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := CreateCgroupConfig(opts, nil) + if err != nil { + t.Errorf("Couldn't create Cgroup config: %v", err) + } + + if cgroup.Path != "" { + t.Errorf("Wrong cgroupsPath, expected it to be empty string, got '%s'", cgroup.Path) + } +} + +func TestSpecconvExampleValidate(t *testing.T) { + spec := Example() + spec.Root.Path = "/" + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + if config.NoNewPrivileges != spec.Process.NoNewPrivileges { + t.Errorf("specconv NoNewPrivileges mismatch. 
Expected %v got %v", + spec.Process.NoNewPrivileges, config.NoNewPrivileges) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid container config: %v", err) + } +} + +func TestDupNamespaces(t *testing.T) { + spec := &specs.Spec{ + Root: &specs.Root{ + Path: "rootfs", + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "pid", + Path: "/proc/1/ns/pid", + }, + }, + }, + } + + _, err := CreateLibcontainerConfig(&CreateOpts{ + Spec: spec, + }) + + if !strings.Contains(err.Error(), "malformed spec file: duplicated ns") { + t.Errorf("Duplicated namespaces should be forbidden") + } +} + +func TestNonZeroEUIDCompatibleSpecconvValidate(t *testing.T) { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + t.Skip("userns is unsupported") + } + + spec := Example() + spec.Root.Path = "/" + ToRootless(spec) + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + RootlessEUID: true, + RootlessCgroups: true, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid rootless container config: %v", err) + } +} + +func TestInitSystemdProps(t *testing.T) { + type inT struct { + name, value string + } + type expT struct { + isErr bool + name string + value interface{} + } + + testCases := []struct { + desc string + in inT + exp expT + }{ + { + in: inT{"org.systemd.property.TimeoutStopUSec", "uint64 123456789"}, + exp: expT{false, "TimeoutStopUSec", uint64(123456789)}, + }, + { + desc: "convert USec to Sec (default numeric type)", + in: inT{"org.systemd.property.TimeoutStopSec", "456"}, + exp: expT{false, "TimeoutStopUSec", uint64(456000000)}, + }, + { + desc: "convert USec to Sec (byte)", + in: 
inT{"org.systemd.property.TimeoutStopSec", "byte 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (int16)", + in: inT{"org.systemd.property.TimeoutStopSec", "int16 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (uint16)", + in: inT{"org.systemd.property.TimeoutStopSec", "uint16 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (int32)", + in: inT{"org.systemd.property.TimeoutStopSec", "int32 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (uint32)", + in: inT{"org.systemd.property.TimeoutStopSec", "uint32 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (int64)", + in: inT{"org.systemd.property.TimeoutStopSec", "int64 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (uint64)", + in: inT{"org.systemd.property.TimeoutStopSec", "uint64 234"}, + exp: expT{false, "TimeoutStopUSec", uint64(234000000)}, + }, + { + desc: "convert USec to Sec (float)", + in: inT{"org.systemd.property.TimeoutStopSec", "234.789"}, + exp: expT{false, "TimeoutStopUSec", uint64(234789000)}, + }, + { + desc: "convert USec to Sec (bool -- invalid value)", + in: inT{"org.systemd.property.TimeoutStopSec", "false"}, + exp: expT{true, "", ""}, + }, + { + desc: "convert USec to Sec (string -- invalid value)", + in: inT{"org.systemd.property.TimeoutStopSec", "'covfefe'"}, + exp: expT{true, "", ""}, + }, + { + in: inT{"org.systemd.property.CollectMode", "'inactive-or-failed'"}, + exp: expT{false, "CollectMode", "inactive-or-failed"}, + }, + { + desc: "unrelated property", + in: inT{"some.other.annotation", "0"}, + exp: expT{false, "", ""}, + }, + { + desc: "too short property name", + in: inT{"org.systemd.property.Xo", "1"}, + exp: expT{true, "", ""}, + }, + { + desc: "invalid 
character in property name", + in: inT{"org.systemd.property.Number1", "1"}, + exp: expT{true, "", ""}, + }, + { + desc: "invalid property value", + in: inT{"org.systemd.property.ValidName", "invalid-value"}, + exp: expT{true, "", ""}, + }, + } + + spec := &specs.Spec{} + + for _, tc := range testCases { + tc := tc + spec.Annotations = map[string]string{tc.in.name: tc.in.value} + + outMap, err := initSystemdProps(spec) + //t.Logf("input %+v, expected %+v, got err:%v out:%+v", tc.in, tc.exp, err, outMap) + + if tc.exp.isErr != (err != nil) { + t.Errorf("input %+v, expecting error: %v, got %v", tc.in, tc.exp.isErr, err) + } + expLen := 1 // expect a single item + if tc.exp.name == "" { + expLen = 0 // expect nothing + } + if len(outMap) != expLen { + t.Fatalf("input %+v, expected %d, got %d entries: %v", tc.in, expLen, len(outMap), outMap) + } + if expLen == 0 { + continue + } + + out := outMap[0] + if tc.exp.name != out.Name { + t.Errorf("input %+v, expecting name: %q, got %q", tc.in, tc.exp.name, out.Name) + } + expValue := dbus.MakeVariant(tc.exp.value).String() + if expValue != out.Value.String() { + t.Errorf("input %+v, expecting value: %s, got %s", tc.in, expValue, out.Value) + } + } +} + +func TestNullProcess(t *testing.T) { + spec := Example() + spec.Process = nil + + _, err := CreateLibcontainerConfig(&CreateOpts{ + Spec: spec, + }) + + if err != nil { + t.Errorf("Null process should be forbidden") + } +} + +func TestCreateDevices(t *testing.T) { + spec := Example() + + // dummy uid/gid for /dev/tty; will enable the test to check if createDevices() + // preferred the spec's device over the redundant default device + ttyUid := uint32(1000) + ttyGid := uint32(1000) + fm := os.FileMode(0666) + + spec.Linux = &specs.Linux{ + Devices: []specs.LinuxDevice{ + { + // This is purposely redundant with one of runc's default devices + Path: "/dev/tty", + Type: "c", + Major: 5, + Minor: 0, + FileMode: &fm, + UID: &ttyUid, + GID: &ttyGid, + }, + { + // This is purposely 
not redundant with one of runc's default devices + Path: "/dev/ram0", + Type: "b", + Major: 1, + Minor: 0, + }, + }, + } + + conf := &configs.Config{} + + defaultDevs, err := createDevices(spec, conf) + if err != nil { + t.Errorf("failed to create devices: %v", err) + } + + // Verify the returned default devices has the /dev/tty entry deduplicated + found := false + for _, d := range defaultDevs { + if d.Path == "/dev/tty" { + if found { + t.Errorf("createDevices failed: returned a duplicated device entry: %v", defaultDevs) + } + found = true + } + } + + // Verify that createDevices() placed all default devices in the config + for _, allowedDev := range AllowedDevices { + if allowedDev.Path == "" { + continue + } + + found := false + for _, configDev := range conf.Devices { + if configDev.Path == allowedDev.Path { + found = true + } + } + if !found { + configDevPaths := []string{} + for _, configDev := range conf.Devices { + configDevPaths = append(configDevPaths, configDev.Path) + } + t.Errorf("allowedDevice %s was not found in the config's devices: %v", allowedDev.Path, configDevPaths) + } + } + + // Verify that createDevices() deduplicated the /dev/tty entry in the config + for _, configDev := range conf.Devices { + if configDev.Path == "/dev/tty" { + wantDev := &devices.Device{ + Path: "/dev/tty", + FileMode: 0666, + Uid: 1000, + Gid: 1000, + Rule: devices.Rule{ + Type: devices.CharDevice, + Major: 5, + Minor: 0, + }, + } + + if *configDev != *wantDev { + t.Errorf("redundant dev was not deduplicated correctly: want %v, got %v", wantDev, configDev) + } + } + } + + // Verify that createDevices() added the entry for /dev/ram0 in the config + found = false + for _, configDev := range conf.Devices { + if configDev.Path == "/dev/ram0" { + found = true + break + } + } + if !found { + t.Errorf("device /dev/ram0 not found in config devices; got %v", conf.Devices) + } +} diff --git a/sysbox-runc/libcontainer/stacktrace/capture.go 
b/sysbox-runc/libcontainer/stacktrace/capture.go new file mode 100644 index 00000000..0bbe1495 --- /dev/null +++ b/sysbox-runc/libcontainer/stacktrace/capture.go @@ -0,0 +1,27 @@ +package stacktrace + +import "runtime" + +// Capture captures a stacktrace for the current calling go program +// +// skip is the number of frames to skip +func Capture(userSkip int) Stacktrace { + var ( + skip = userSkip + 1 // add one for our own function + frames []Frame + prevPc uintptr + ) + for i := skip; ; i++ { + pc, file, line, ok := runtime.Caller(i) + //detect if caller is repeated to avoid loop, gccgo + //currently runs into a loop without this check + if !ok || pc == prevPc { + break + } + frames = append(frames, NewFrame(pc, file, line)) + prevPc = pc + } + return Stacktrace{ + Frames: frames, + } +} diff --git a/sysbox-runc/libcontainer/stacktrace/capture_test.go b/sysbox-runc/libcontainer/stacktrace/capture_test.go new file mode 100644 index 00000000..b704629f --- /dev/null +++ b/sysbox-runc/libcontainer/stacktrace/capture_test.go @@ -0,0 +1,33 @@ +package stacktrace + +import ( + "strings" + "testing" +) + +func captureFunc() Stacktrace { + return Capture(0) +} + +func TestCaptureTestFunc(t *testing.T) { + stack := captureFunc() + + if len(stack.Frames) == 0 { + t.Fatal("expected stack frames to be returned") + } + + // the first frame is the caller + frame := stack.Frames[0] + if expected := "captureFunc"; frame.Function != expected { + t.Fatalf("expected function %q but received %q", expected, frame.Function) + } + + expected := "github.com/nestybox/sysbox-runc/libcontainer/stacktrace" + + if !strings.HasSuffix(frame.Package, expected) { + t.Fatalf("expected package %q but received %q", expected, frame.Package) + } + if expected := "capture_test.go"; frame.File != expected { + t.Fatalf("expected file %q but received %q", expected, frame.File) + } +} diff --git a/sysbox-runc/libcontainer/stacktrace/frame.go b/sysbox-runc/libcontainer/stacktrace/frame.go new file mode 
100644 index 00000000..0d590d9a --- /dev/null +++ b/sysbox-runc/libcontainer/stacktrace/frame.go @@ -0,0 +1,38 @@ +package stacktrace + +import ( + "path/filepath" + "runtime" + "strings" +) + +// NewFrame returns a new stack frame for the provided information +func NewFrame(pc uintptr, file string, line int) Frame { + fn := runtime.FuncForPC(pc) + if fn == nil { + return Frame{} + } + pack, name := parseFunctionName(fn.Name()) + return Frame{ + Line: line, + File: filepath.Base(file), + Package: pack, + Function: name, + } +} + +func parseFunctionName(name string) (string, string) { + i := strings.LastIndex(name, ".") + if i == -1 { + return "", name + } + return name[:i], name[i+1:] +} + +// Frame contains all the information for a stack frame within a go program +type Frame struct { + File string + Function string + Package string + Line int +} diff --git a/sysbox-runc/libcontainer/stacktrace/frame_test.go b/sysbox-runc/libcontainer/stacktrace/frame_test.go new file mode 100644 index 00000000..c6fc78e0 --- /dev/null +++ b/sysbox-runc/libcontainer/stacktrace/frame_test.go @@ -0,0 +1,20 @@ +package stacktrace + +import "testing" + +func TestParsePackageName(t *testing.T) { + var ( + name = "github.com/opencontainers/runc/libcontainer/stacktrace.captureFunc" + expectedPackage = "github.com/opencontainers/runc/libcontainer/stacktrace" + expectedFunction = "captureFunc" + ) + + pack, funcName := parseFunctionName(name) + if pack != expectedPackage { + t.Fatalf("expected package %q but received %q", expectedPackage, pack) + } + + if funcName != expectedFunction { + t.Fatalf("expected function %q but received %q", expectedFunction, funcName) + } +} diff --git a/sysbox-runc/libcontainer/stacktrace/stacktrace.go b/sysbox-runc/libcontainer/stacktrace/stacktrace.go new file mode 100644 index 00000000..5e8b58d2 --- /dev/null +++ b/sysbox-runc/libcontainer/stacktrace/stacktrace.go @@ -0,0 +1,5 @@ +package stacktrace + +type Stacktrace struct { + Frames []Frame +} diff --git 
a/sysbox-runc/libcontainer/standard_init_linux.go b/sysbox-runc/libcontainer/standard_init_linux.go new file mode 100644 index 00000000..0f3ebb7d --- /dev/null +++ b/sysbox-runc/libcontainer/standard_init_linux.go @@ -0,0 +1,347 @@ +package libcontainer + +import ( + "os" + "os/exec" + "runtime" + "strconv" + + "github.com/nestybox/sysbox-libs/mount" + "github.com/opencontainers/runc/libcontainer/apparmor" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/keys" + "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +type linuxStandardInit struct { + pipe *os.File + consoleSocket *os.File + parentPid int + fifoFd int + config *initConfig +} + +// sysbox-runc: info passed when the sys container's init process requests its parent runc +// to perform an operation on its behalf. +type opReqType int + +const ( + bind = iota + switchDockerDns + chown + mkdir + rootfsIDMap +) + +type opReq struct { + // If multiple opReqs of a given type are sent, the ones below are common to + // all and therefore only set in the first one (opReq[0]). Not all request + // types use all these fields necessarily (most use Rootfs but only + // bind-mount requests use the other ones). 
+ Op opReqType `json:"type"` + Rootfs string `json:"rootfs"` + FsuidMapFailOnErr bool `json:"fsuid_map_fail_on_err"` + + // bind + Mount configs.Mount `json:"mount"` + Label string `json:"label"` + + // switchDockerDns + OldDns string `json:"olddns"` + NewDns string `json:"newdns"` + + // chown & mkdir + Path string `json:"path"` + Uid int `json:"uid"` + Gid int `json:"gid"` + Mode os.FileMode `json:"mode"` +} + +func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) { + var newperms uint32 + + if l.config.Config.Namespaces.Contains(configs.NEWUSER) { + // With user ns we need 'other' search permissions. + newperms = 0x8 + } else { + // Without user ns we need 'UID' search permissions. + newperms = 0x80000 + } + + // Create a unique per session container name that we can join in setns; + // However, other containers can also join it. + return "_ses." + l.config.ContainerId, 0xffffffff, newperms +} + +func (l *linuxStandardInit) Init() error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if err := validateCwd(l.config.Config.Rootfs); err != nil { + return newSystemErrorWithCause(err, "validating cwd") + } + + if err := setupNetwork(l.config); err != nil { + return err + } + if err := setupRoute(l.config.Config); err != nil { + return err + } + + // initialises the labeling system + selinux.GetEnabled() + if err := prepareRootfs(l.pipe, l.config); err != nil { + return err + } + + if !l.config.Config.NoNewKeyring { + if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil { + return err + } + defer selinux.SetKeyLabel("") + ringname, keepperms, newperms := l.getSessionRingParams() + + // Do not inherit the parent's session keyring. + if sessKeyId, err := keys.JoinSessionKeyring(ringname); err != nil { + // If keyrings aren't supported then it is likely we are on an + // older kernel (or inside an LXC container). 
While we could bail, + // the security feature we are using here is best-effort (it only + // really provides marginal protection since VFS credentials are + // the only significant protection of keyrings). + // + // TODO(cyphar): Log this so people know what's going on, once we + // have proper logging in 'runc init'. + if errors.Cause(err) != unix.ENOSYS { + return errors.Wrap(err, "join session keyring") + } + } else { + // Make session keyring searcheable. If we've gotten this far we + // bail on any error -- we don't want to have a keyring with bad + // permissions. + if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil { + return errors.Wrap(err, "mod keyring permissions") + } + } + } + + // Set up the console. This has to be done *before* we finalize the rootfs, + // but *after* we've given the user the chance to set up all of the mounts + // they wanted. + if l.config.CreateConsole { + if err := setupConsole(l.consoleSocket, l.config, true); err != nil { + return err + } + if err := system.Setctty(); err != nil { + return errors.Wrap(err, "setctty") + } + } + + // Finish the rootfs setup. + if l.config.Config.Namespaces.Contains(configs.NEWNS) { + if err := finalizeRootfs(l.config.Config); err != nil { + return err + } + } + + if hostname := l.config.Config.Hostname; hostname != "" { + if err := unix.Sethostname([]byte(hostname)); err != nil { + return errors.Wrap(err, "sethostname") + } + } + + if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { + return errors.Wrap(err, "apply apparmor profile") + } + + // Notify rootfs readiness to parent so that sysbox-fs registration can be + // completed. + if err := syncParentRootfsReady(l.pipe); err != nil { + return errors.Wrap(err, "send immutable list to parent") + } + + // The instructions that follow and that precede the 'parentReady' signal + // notification, must all execute after the container has been properly + // registered with sysbox-fs. 
+ if l.config.Config.SwitchDockerDns { + if err := switchDockerDnsIP(l.config.Config, l.pipe); err != nil { + return errors.Wrap(err, "switching Docker DNS") + } + } + + // Config the sysctls + for key, value := range l.config.Config.Sysctl { + if err := writeSystemProperty(key, value); err != nil { + return errors.Wrapf(err, "write sysctl key %s", key) + } + } + + // Handle read-only paths + if len(l.config.Config.ReadonlyPaths) > 0 { + mounts, err := mount.GetMounts() + if err != nil { + return errors.Wrap(err, "getting mounts") + } + + for _, path := range l.config.Config.ReadonlyPaths { + if err := readonlyPath(path, mounts); err != nil { + return errors.Wrapf(err, "readonly path %s", path) + } + } + } + + // Handle masked paths + for _, path := range l.config.Config.MaskPaths { + if err := maskPath(path, l.config.Config.MountLabel); err != nil { + return errors.Wrapf(err, "mask path %s", path) + } + } + + pdeath, err := system.GetParentDeathSignal() + if err != nil { + return errors.Wrap(err, "get pdeath signal") + } + + if l.config.NoNewPrivileges { + if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil { + return errors.Wrap(err, "set nonewprivileges") + } + } + + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return errors.Wrap(err, "sync ready") + } + + if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil { + return errors.Wrap(err, "set process label") + } + defer selinux.SetExecLabel("") + + // Normally we enable seccomp just before exec'ing into the sys container's so as few + // syscalls take place after enabling seccomp. 
However, if the process does not have + // CAP_SYS_ADMIN (e.g., the process is non-root) and NoNewPrivileges is cleared, then + // we must enable seccomp here (before we drop the process caps in finalizeNamespace() + // below). Otherwise we get a permission denied error. + + seccompNotifDone := false + seccompFiltDone := false + + if !l.config.NoNewPrivileges && + (l.config.Capabilities != nil && !utils.StringSliceContains(l.config.Capabilities.Effective, "CAP_SYS_ADMIN")) || + (l.config.Config.Capabilities != nil && !utils.StringSliceContains(l.config.Config.Capabilities.Effective, "CAP_SYS_ADMIN")) { + + if l.config.Config.SeccompNotif != nil { + if err := setupSyscallTraps(l.config, l.pipe); err != nil { + return err + } + seccompNotifDone = true + } + + if l.config.Config.Seccomp != nil { + if _, err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "loading seccomp filtering rules") + } + seccompFiltDone = true + } + } + + // finalizeNamespace drops the caps, sets the correct user and working dir, and marks + // any leaked file descriptors for closing before executing the command inside the + // namespace + if err := finalizeNamespace(l.config); err != nil { + return err + } + + // finalizeNamespace can change user/group which clears the parent death + // signal, so we restore it here. + if err := pdeath.Restore(); err != nil { + return errors.Wrap(err, "restore pdeath signal") + } + + // Compare the parent from the initial start of the init process and make + // sure that it did not change. if the parent changes that means it died + // and we were reparented to something else so we should just kill ourself + // and not cause problems for someone else. + if unix.Getppid() != l.parentPid { + return unix.Kill(unix.Getpid(), unix.SIGKILL) + } + + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. 
+ name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + + // sysbox-runc: setup syscall trapping (must do this before closing the pipe) + if l.config.Config.SeccompNotif != nil && !seccompNotifDone { + if err := setupSyscallTraps(l.config, l.pipe); err != nil { + return err + } + } + + // Close the pipe to signal that we have completed our init. + l.pipe.Close() + + // Wait for the FIFO to be opened on the other side before exec-ing the + // user process. We open it through /proc/self/fd/$fd, because the fd that + // was given to us was an O_PATH fd to the fifo itself. Linux allows us to + // re-open an O_PATH fd through /proc. + fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0) + if err != nil { + return newSystemErrorWithCause(err, "open exec fifo") + } + if _, err := unix.Write(fd, []byte("0")); err != nil { + return newSystemErrorWithCause(err, "write 0 exec fifo") + } + + // Close the O_PATH fifofd fd before exec because the kernel resets + // dumpable in the wrong order. This has been fixed in newer kernels, but + // we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels. + // N.B. the core issue itself (passing dirfds to the host filesystem) has + // since been resolved. + // https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318 + unix.Close(l.fifoFd) + + // Load the seccomp syscall whitelist as close to execve as possible, so as few + // syscalls take place afterward (reducing the amount of syscalls that users need to + // enable in their seccomp profiles). 
+ if l.config.Config.Seccomp != nil && !seccompFiltDone { + if _, err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { + return newSystemErrorWithCause(err, "loading seccomp filtering rules") + } + } + + s := l.config.SpecState + s.Pid = unix.Getpid() + s.Status = specs.StateCreated + if err := l.config.Config.Hooks[configs.StartContainer].RunHooks(s); err != nil { + return err + } + + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal sysbox-runc fds as + // the execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath the + // Go runtime, we must not do any file operations after this point (otherwise + // the (*os.File) finaliser could close the wrong file). See CVE-2024-21626 + // for more information as to why this protection is necessary. 
+ if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } + + if err := unix.Exec(name, l.config.Args[0:], os.Environ()); err != nil { + return newSystemErrorWithCausef(err, "exec user process: name = %v, args = %v, environ = %v", name, l.config.Args[0:], os.Environ()) + } + return nil +} diff --git a/sysbox-runc/libcontainer/state_linux.go b/sysbox-runc/libcontainer/state_linux.go new file mode 100644 index 00000000..555e4b89 --- /dev/null +++ b/sysbox-runc/libcontainer/state_linux.go @@ -0,0 +1,248 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" + + "golang.org/x/sys/unix" +) + +func newStateTransitionError(from, to containerState) error { + return &stateTransitionError{ + From: from.status().String(), + To: to.status().String(), + } +} + +// stateTransitionError is returned when an invalid state transition happens from one +// state to another. 
+type stateTransitionError struct {
+	From string
+	To   string
+}
+
+func (s *stateTransitionError) Error() string {
+	return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
+}
+
+type containerState interface {
+	transition(containerState) error
+	destroy() error
+	status() Status
+}
+
+func destroy(c *linuxContainer) error {
+	if !c.config.Namespaces.Contains(configs.NEWPID) ||
+		c.config.Namespaces.PathOf(configs.NEWPID) != "" {
+		if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
+			logrus.Warn(err)
+		}
+	}
+	err := c.cgroupManager.Destroy()
+	if c.intelRdtManager != nil {
+		if ierr := c.intelRdtManager.Destroy(); err == nil {
+			err = ierr
+		}
+	}
+	if rerr := os.RemoveAll(c.root); err == nil {
+		err = rerr
+	}
+	c.initProcess = nil
+	if herr := runPoststopHooks(c); err == nil {
+		err = herr
+	}
+	c.state = &stoppedState{c: c}
+
+	return err
+}
+
+func runPoststopHooks(c *linuxContainer) error {
+	hooks := c.config.Hooks
+	if hooks == nil {
+		return nil
+	}
+
+	s, err := c.currentOCIState()
+	if err != nil {
+		return err
+	}
+	s.Status = specs.StateStopped
+
+	if err := hooks[configs.Poststop].RunHooks(s); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// stoppedState represents a container in a stopped/destroyed state.
+type stoppedState struct {
+	c *linuxContainer
+}
+
+func (b *stoppedState) status() Status {
+	return Stopped
+}
+
+func (b *stoppedState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *restoredState:
+		b.c.state = s
+		return nil
+	case *stoppedState:
+		return nil
+	}
+	return newStateTransitionError(b, s)
+}
+
+func (b *stoppedState) destroy() error {
+	return destroy(b.c)
+}
+
+// runningState represents a container that is currently running.
+type runningState struct {
+	c *linuxContainer
+}
+
+func (r *runningState) status() Status {
+	return Running
+}
+
+func (r *runningState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState:
+		if r.c.runType() == Running {
+			return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
+		}
+		r.c.state = s
+		return nil
+	case *pausedState:
+		r.c.state = s
+		return nil
+	case *runningState:
+		return nil
+	}
+	return newStateTransitionError(r, s)
+}
+
+func (r *runningState) destroy() error {
+	if r.c.runType() == Running {
+		return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
+	}
+	return destroy(r.c)
+}
+
+type createdState struct {
+	c *linuxContainer
+}
+
+func (i *createdState) status() Status {
+	return Created
+}
+
+func (i *createdState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *pausedState, *stoppedState:
+		i.c.state = s
+		return nil
+	case *createdState:
+		return nil
+	}
+	return newStateTransitionError(i, s)
+}
+
+func (i *createdState) destroy() error {
+	i.c.initProcess.signal(unix.SIGKILL)
+	return destroy(i.c)
+}
+
+// pausedState represents a container that is currently paused. It cannot be destroyed in a
+// paused state and must transition back to running first.
+type pausedState struct {
+	c *linuxContainer
+}
+
+func (p *pausedState) status() Status {
+	return Paused
+}
+
+func (p *pausedState) transition(s containerState) error {
+	switch s.(type) {
+	case *runningState, *stoppedState:
+		p.c.state = s
+		return nil
+	case *pausedState:
+		return nil
+	}
+	return newStateTransitionError(p, s)
+}
+
+func (p *pausedState) destroy() error {
+	t := p.c.runType()
+	if t != Running && t != Created {
+		if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
+			return err
+		}
+		return destroy(p.c)
+	}
+	return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
+}
+
+// restoredState is the same as the running state but also has associated checkpoint
+// information that may need to be destroyed when the container is stopped and destroy is called.
+type restoredState struct {
+	imageDir string
+	c        *linuxContainer
+}
+
+func (r *restoredState) status() Status {
+	return Running
+}
+
+func (r *restoredState) transition(s containerState) error {
+	switch s.(type) {
+	case *stoppedState, *runningState:
+		return nil
+	}
+	return newStateTransitionError(r, s)
+}
+
+func (r *restoredState) destroy() error {
+	if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
+		if !os.IsNotExist(err) {
+			return err
+		}
+	}
+	return destroy(r.c)
+}
+
+// loadedState is used whenever a container is restored, loaded, or setting additional
+// processes inside and it should not be destroyed when it is exiting.
+type loadedState struct { + c *linuxContainer + s Status +} + +func (n *loadedState) status() Status { + return n.s +} + +func (n *loadedState) transition(s containerState) error { + n.c.state = s + return nil +} + +func (n *loadedState) destroy() error { + if err := n.c.refreshState(); err != nil { + return err + } + return n.c.state.destroy() +} diff --git a/sysbox-runc/libcontainer/state_linux_test.go b/sysbox-runc/libcontainer/state_linux_test.go new file mode 100644 index 00000000..6ef516b7 --- /dev/null +++ b/sysbox-runc/libcontainer/state_linux_test.go @@ -0,0 +1,116 @@ +// +build linux + +package libcontainer + +import ( + "reflect" + "testing" +) + +var states = map[containerState]Status{ + &createdState{}: Created, + &runningState{}: Running, + &restoredState{}: Running, + &pausedState{}: Paused, + &stoppedState{}: Stopped, + &loadedState{s: Running}: Running, +} + +func TestStateStatus(t *testing.T) { + for s, status := range states { + if s.status() != status { + t.Fatalf("state returned %s but expected %s", s.status(), status) + } + } +} + +func isStateTransitionError(err error) bool { + _, ok := err.(*stateTransitionError) + return ok +} + +func testTransitions(t *testing.T, initialState containerState, valid []containerState) { + validMap := map[reflect.Type]interface{}{} + for _, validState := range valid { + validMap[reflect.TypeOf(validState)] = nil + t.Run(validState.status().String(), func(t *testing.T) { + if err := initialState.transition(validState); err != nil { + t.Fatal(err) + } + }) + } + for state := range states { + if _, ok := validMap[reflect.TypeOf(state)]; ok { + continue + } + t.Run(state.status().String(), func(t *testing.T) { + err := initialState.transition(state) + if err == nil { + t.Fatal("transition should fail") + } + if !isStateTransitionError(err) { + t.Fatal("expected stateTransitionError") + } + }) + } +} + +func TestStoppedStateTransition(t *testing.T) { + testTransitions( + t, + &stoppedState{c: &linuxContainer{}}, + 
[]containerState{ + &stoppedState{}, + &runningState{}, + &restoredState{}, + }, + ) +} + +func TestPausedStateTransition(t *testing.T) { + testTransitions( + t, + &pausedState{c: &linuxContainer{}}, + []containerState{ + &pausedState{}, + &runningState{}, + &stoppedState{}, + }, + ) +} + +func TestRestoredStateTransition(t *testing.T) { + testTransitions( + t, + &restoredState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &runningState{}, + }, + ) +} + +func TestRunningStateTransition(t *testing.T) { + testTransitions( + t, + &runningState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &pausedState{}, + &runningState{}, + }, + ) +} + +func TestCreatedStateTransition(t *testing.T) { + testTransitions( + t, + &createdState{c: &linuxContainer{}}, + []containerState{ + &stoppedState{}, + &pausedState{}, + &runningState{}, + &createdState{}, + }, + ) +} diff --git a/sysbox-runc/libcontainer/stats_linux.go b/sysbox-runc/libcontainer/stats_linux.go new file mode 100644 index 00000000..fff9dd37 --- /dev/null +++ b/sysbox-runc/libcontainer/stats_linux.go @@ -0,0 +1,13 @@ +package libcontainer + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runc/types" +) + +type Stats struct { + Interfaces []*types.NetworkInterface + CgroupStats *cgroups.Stats + IntelRdtStats *intelrdt.Stats +} diff --git a/sysbox-runc/libcontainer/sync.go b/sysbox-runc/libcontainer/sync.go new file mode 100644 index 00000000..26081fd9 --- /dev/null +++ b/sysbox-runc/libcontainer/sync.go @@ -0,0 +1,127 @@ +package libcontainer + +import ( + "encoding/json" + "errors" + "fmt" + "io" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +type syncType string + +// Constants that are used for synchronization between the parent and child +// during container setup. 
They come in pairs (with procError being a generic
+// response which is followed by a &genericError).
+//
+// [ child ] <-> [ parent ]
+//
+// reqOp -->
+// <-- sendOpInfo
+// [send(info)] --> [recv(info)]
+// <-- opDone
+//
+// procHooks --> [run hooks]
+// <-- procResume
+//
+// rootfsReady --> [complete container registration]
+// <-- rootfsReadyAck
+//
+// procReady --> [final setup]
+// <-- procRun
+//
+// procFd -->
+// <-- sendFd
+// [send(fd)] --> [recv(fd)]
+// <-- procFdDone
+//
+
+const (
+	procError  syncType = "procError"
+	procReady  syncType = "procReady"
+	procRun    syncType = "procRun"
+	procHooks  syncType = "procHooks"
+	procResume syncType = "procResume"
+
+	reqOp      syncType = "reqOp"
+	sendOpInfo syncType = "sendOpInfo"
+	opDone     syncType = "opDone"
+
+	procFd     syncType = "procFd"
+	sendFd     syncType = "sendFd"
+	procFdDone syncType = "procFdDone"
+
+	rootfsReady    syncType = "rootfsReady"
+	rootfsReadyAck syncType = "rootfsReadyAck"
+)
+
+type syncT struct {
+	Type syncType `json:"type"`
+}
+
+// writeSync is used to write to a synchronisation pipe. An error is returned
+// if there was a problem writing the payload.
+func writeSync(pipe io.Writer, sync syncType) error {
+	return utils.WriteJSON(pipe, syncT{sync})
+}
+
+// readSync is used to read from a synchronisation pipe. An error is returned
+// if we got a genericError, the pipe was closed, or we got an unexpected flag.
+func readSync(pipe io.Reader, expected syncType) error { + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return errors.New("parent closed synchronisation channel") + } + return fmt.Errorf("failed reading error from parent: %v", err) + } + + if procSync.Type == procError { + var ierr genericError + + if err := json.NewDecoder(pipe).Decode(&ierr); err != nil { + return fmt.Errorf("failed reading error from parent: %v", err) + } + + return &ierr + } + + if procSync.Type != expected { + return errors.New("invalid synchronisation flag from parent") + } + return nil +} + +// parseSync runs the given callback function on each syncT received from the +// child. It will return once io.EOF is returned from the given pipe. +func parseSync(pipe io.Reader, fn func(*syncT) error) error { + dec := json.NewDecoder(pipe) + for { + var sync syncT + if err := dec.Decode(&sync); err != nil { + if err == io.EOF { + break + } + return err + } + + // We handle this case outside fn for cleanliness reasons. + var ierr *genericError + if sync.Type == procError { + if err := dec.Decode(&ierr); err != nil && err != io.EOF { + return newSystemErrorWithCause(err, "decoding proc error from init") + } + if ierr != nil { + return ierr + } + // Programmer error. 
+ panic("No error following JSON procError payload.") + } + + if err := fn(&sync); err != nil { + return err + } + } + return nil +} diff --git a/sysbox-runc/libcontainer/system/linux.go b/sysbox-runc/libcontainer/system/linux.go new file mode 100644 index 00000000..49471960 --- /dev/null +++ b/sysbox-runc/libcontainer/system/linux.go @@ -0,0 +1,150 @@ +// +build linux + +package system + +import ( + "os" + "os/exec" + "sync" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/user" + "golang.org/x/sys/unix" +) + +type ParentDeathSignal int + +func (p ParentDeathSignal) Restore() error { + if p == 0 { + return nil + } + current, err := GetParentDeathSignal() + if err != nil { + return err + } + if p == current { + return nil + } + return p.Set() +} + +func (p ParentDeathSignal) Set() error { + return SetParentDeathSignal(uintptr(p)) +} + +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + + return unix.Exec(name, args, env) +} + +func Prlimit(pid, resource int, limit unix.Rlimit) error { + _, _, err := unix.RawSyscall6(unix.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0) + if err != 0 { + return err + } + return nil +} + +func SetParentDeathSignal(sig uintptr) error { + if err := unix.Prctl(unix.PR_SET_PDEATHSIG, sig, 0, 0, 0); err != nil { + return err + } + return nil +} + +func GetParentDeathSignal() (ParentDeathSignal, error) { + var sig int + if err := unix.Prctl(unix.PR_GET_PDEATHSIG, uintptr(unsafe.Pointer(&sig)), 0, 0, 0); err != nil { + return -1, err + } + return ParentDeathSignal(sig), nil +} + +func SetKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 1, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func ClearKeepCaps() error { + if err := unix.Prctl(unix.PR_SET_KEEPCAPS, 0, 0, 0, 0); err != nil { + return err + } + + return nil +} + +func Setctty() error { + if err 
:= unix.IoctlSetInt(0, unix.TIOCSCTTY, 0); err != nil { + return err + } + return nil +} + +var ( + inUserNS bool + nsOnce sync.Once +) + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +func RunningInUserNS() bool { + nsOnce.Do(func() { + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return + } + inUserNS = UIDMapInUserNS(uidmap) + }) + return inUserNS +} + +func UIDMapInUserNS(uidmap []user.IDMap) bool { + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. + */ + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { + return false + } + return true +} + +// GetParentNSeuid returns the euid within the parent user namespace +func GetParentNSeuid() int64 { + euid := int64(os.Geteuid()) + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return euid + } + for _, um := range uidmap { + if um.ID <= euid && euid <= um.ID+um.Count-1 { + return um.ParentID + euid - um.ID + } + } + return euid +} + +// SetSubreaper sets the value i as the subreaper setting for the calling process +func SetSubreaper(i int) error { + return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0) +} + +// GetSubreaper returns the subreaper setting for the calling process +func GetSubreaper() (int, error) { + var i uintptr + + if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil { + return -1, err + } + + return int(i), nil +} diff --git a/sysbox-runc/libcontainer/system/linux_test.go b/sysbox-runc/libcontainer/system/linux_test.go new file mode 100644 index 00000000..4d613d84 --- /dev/null +++ b/sysbox-runc/libcontainer/system/linux_test.go @@ -0,0 +1,45 
@@ +// +build linux + +package system + +import ( + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/user" +) + +func TestUIDMapInUserNS(t *testing.T) { + cases := []struct { + s string + expected bool + }{ + { + s: " 0 0 4294967295\n", + expected: false, + }, + { + s: " 0 0 1\n", + expected: true, + }, + { + s: " 0 1001 1\n 1 231072 65536\n", + expected: true, + }, + { + // file exist but empty (the initial state when userns is created. see man 7 user_namespaces) + s: "", + expected: true, + }, + } + for _, c := range cases { + uidmap, err := user.ParseIDMap(strings.NewReader(c.s)) + if err != nil { + t.Fatal(err) + } + actual := UIDMapInUserNS(uidmap) + if c.expected != actual { + t.Fatalf("expected %v, got %v for %q", c.expected, actual, c.s) + } + } +} diff --git a/sysbox-runc/libcontainer/system/proc.go b/sysbox-runc/libcontainer/system/proc.go new file mode 100644 index 00000000..b73cf70b --- /dev/null +++ b/sysbox-runc/libcontainer/system/proc.go @@ -0,0 +1,103 @@ +package system + +import ( + "fmt" + "io/ioutil" + "path/filepath" + "strconv" + "strings" +) + +// State is the status of a process. +type State rune + +const ( // Only values for Linux 3.14 and later are listed here + Dead State = 'X' + DiskSleep State = 'D' + Running State = 'R' + Sleeping State = 'S' + Stopped State = 'T' + TracingStop State = 't' + Zombie State = 'Z' +) + +// String forms of the state from proc(5)'s documentation for +// /proc/[pid]/status' "State" field. +func (s State) String() string { + switch s { + case Dead: + return "dead" + case DiskSleep: + return "disk sleep" + case Running: + return "running" + case Sleeping: + return "sleeping" + case Stopped: + return "stopped" + case TracingStop: + return "tracing stop" + case Zombie: + return "zombie" + default: + return fmt.Sprintf("unknown (%c)", s) + } +} + +// Stat_t represents the information from /proc/[pid]/stat, as +// described in proc(5) with names based on the /proc/[pid]/status +// fields. 
+type Stat_t struct { + // PID is the process ID. + PID uint + + // Name is the command run by the process. + Name string + + // State is the state of the process. + State State + + // StartTime is the number of clock ticks after system boot (since + // Linux 2.6). + StartTime uint64 +} + +// Stat returns a Stat_t instance for the specified process. +func Stat(pid int) (stat Stat_t, err error) { + bytes, err := ioutil.ReadFile(filepath.Join("/proc", strconv.Itoa(pid), "stat")) + if err != nil { + return stat, err + } + return parseStat(string(bytes)) +} + +func parseStat(data string) (stat Stat_t, err error) { + // From proc(5), field 2 could contain space and is inside `(` and `)`. + // The following is an example: + // 89653 (gunicorn: maste) S 89630 89653 89653 0 -1 4194560 29689 28896 0 3 146 32 76 19 20 0 1 0 2971844 52965376 3920 18446744073709551615 1 1 0 0 0 0 0 16781312 137447943 0 0 0 17 1 0 0 0 0 0 0 0 0 0 0 0 0 0 + i := strings.LastIndex(data, ")") + if i <= 2 || i >= len(data)-1 { + return stat, fmt.Errorf("invalid stat data: %q", data) + } + + parts := strings.SplitN(data[:i], "(", 2) + if len(parts) != 2 { + return stat, fmt.Errorf("invalid stat data: %q", data) + } + + stat.Name = parts[1] + _, err = fmt.Sscanf(parts[0], "%d", &stat.PID) + if err != nil { + return stat, err + } + + // parts indexes should be offset by 3 from the field number given + // proc(5), because parts is zero-indexed and we've removed fields + // one (PID) and two (Name) in the paren-split. 
+ parts = strings.Split(data[i+2:], " ") + var state int + fmt.Sscanf(parts[3-3], "%c", &state) + stat.State = State(state) + fmt.Sscanf(parts[22-3], "%d", &stat.StartTime) + return stat, nil +} diff --git a/sysbox-runc/libcontainer/system/proc_test.go b/sysbox-runc/libcontainer/system/proc_test.go new file mode 100644 index 00000000..7e1acc5b --- /dev/null +++ b/sysbox-runc/libcontainer/system/proc_test.go @@ -0,0 +1,45 @@ +package system + +import "testing" + +func TestParseStartTime(t *testing.T) { + data := map[string]Stat_t{ + "4902 (gunicorn: maste) S 4885 4902 4902 0 -1 4194560 29683 29929 61 83 78 16 96 17 20 0 1 0 9126532 52965376 1903 18446744073709551615 4194304 7461796 140733928751520 140733928698072 139816984959091 0 0 16781312 137447943 1 0 0 17 3 0 0 9 0 0 9559488 10071156 33050624 140733928758775 140733928758945 140733928758945 140733928759264 0": { + PID: 4902, + Name: "gunicorn: maste", + State: 'S', + StartTime: 9126532, + }, + "9534 (cat) R 9323 9534 9323 34828 9534 4194304 95 0 0 0 0 0 0 0 20 0 1 0 9214966 7626752 168 18446744073709551615 4194304 4240332 140732237651568 140732237650920 140570710391216 0 0 0 0 0 0 0 17 1 0 0 0 0 0 6340112 6341364 21553152 140732237653865 140732237653885 140732237653885 140732237656047 0": { + PID: 9534, + Name: "cat", + State: 'R', + StartTime: 9214966, + }, + + "24767 (irq/44-mei_me) S 2 0 0 0 -1 2129984 0 0 0 0 0 0 0 0 -51 0 1 0 8722075 0 0 18446744073709551615 0 0 0 0 0 0 0 2147483647 0 0 0 0 17 1 50 1 0 0 0 0 0 0 0 0 0 0 0": { + PID: 24767, + Name: "irq/44-mei_me", + State: 'S', + StartTime: 8722075, + }, + } + for line, expected := range data { + st, err := parseStat(line) + if err != nil { + t.Fatal(err) + } + if st.PID != expected.PID { + t.Fatalf("expected PID %q but received %q", expected.PID, st.PID) + } + if st.State != expected.State { + t.Fatalf("expected state %q but received %q", expected.State, st.State) + } + if st.Name != expected.Name { + t.Fatalf("expected name %q but received %q", 
expected.Name, st.Name) + } + if st.StartTime != expected.StartTime { + t.Fatalf("expected start time %q but received %q", expected.StartTime, st.StartTime) + } + } +} diff --git a/sysbox-runc/libcontainer/system/syscall_linux_32.go b/sysbox-runc/libcontainer/system/syscall_linux_32.go new file mode 100644 index 00000000..c5ca5d86 --- /dev/null +++ b/sysbox-runc/libcontainer/system/syscall_linux_32.go @@ -0,0 +1,26 @@ +// +build linux +// +build 386 arm + +package system + +import ( + "golang.org/x/sys/unix" +) + +// Setuid sets the uid of the calling thread to the specified uid. +func Setuid(uid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETUID32, uintptr(uid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} + +// Setgid sets the gid of the calling thread to the specified gid. +func Setgid(gid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETGID32, uintptr(gid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} diff --git a/sysbox-runc/libcontainer/system/syscall_linux_64.go b/sysbox-runc/libcontainer/system/syscall_linux_64.go new file mode 100644 index 00000000..e05e30ad --- /dev/null +++ b/sysbox-runc/libcontainer/system/syscall_linux_64.go @@ -0,0 +1,26 @@ +// +build linux +// +build arm64 amd64 mips mipsle mips64 mips64le ppc ppc64 ppc64le riscv64 s390x + +package system + +import ( + "golang.org/x/sys/unix" +) + +// Setuid sets the uid of the calling thread to the specified uid. +func Setuid(uid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETUID, uintptr(uid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} + +// Setgid sets the gid of the calling thread to the specified gid. 
+func Setgid(gid int) (err error) { + _, _, e1 := unix.RawSyscall(unix.SYS_SETGID, uintptr(gid), 0, 0) + if e1 != 0 { + err = e1 + } + return +} diff --git a/sysbox-runc/libcontainer/system/unsupported.go b/sysbox-runc/libcontainer/system/unsupported.go new file mode 100644 index 00000000..b94be74a --- /dev/null +++ b/sysbox-runc/libcontainer/system/unsupported.go @@ -0,0 +1,27 @@ +// +build !linux + +package system + +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/user" +) + +// RunningInUserNS is a stub for non-Linux systems +// Always returns false +func RunningInUserNS() bool { + return false +} + +// UIDMapInUserNS is a stub for non-Linux systems +// Always returns false +func UIDMapInUserNS(uidmap []user.IDMap) bool { + return false +} + +// GetParentNSeuid returns the euid within the parent user namespace +// Always returns os.Geteuid on non-linux +func GetParentNSeuid() int { + return os.Geteuid() +} diff --git a/sysbox-runc/libcontainer/system/xattrs_linux.go b/sysbox-runc/libcontainer/system/xattrs_linux.go new file mode 100644 index 00000000..a6823fc9 --- /dev/null +++ b/sysbox-runc/libcontainer/system/xattrs_linux.go @@ -0,0 +1,35 @@ +package system + +import "golang.org/x/sys/unix" + +// Returns a []byte slice if the xattr is set and nil otherwise +// Requires path and its attribute as arguments +func Lgetxattr(path string, attr string) ([]byte, error) { + var sz int + // Start with a 128 length byte array + dest := make([]byte, 128) + sz, errno := unix.Lgetxattr(path, attr, dest) + + switch { + case errno == unix.ENODATA: + return nil, errno + case errno == unix.ENOTSUP: + return nil, errno + case errno == unix.ERANGE: + // 128 byte array might just not be good enough, + // A dummy buffer is used to get the real size + // of the xattrs on disk + sz, errno = unix.Lgetxattr(path, attr, []byte{}) + if errno != nil { + return nil, errno + } + dest = make([]byte, sz) + sz, errno = unix.Lgetxattr(path, attr, dest) + if errno != nil { + 
return nil, errno + } + case errno != nil: + return nil, errno + } + return dest[:sz], nil +} diff --git a/sysbox-runc/libcontainer/user/MAINTAINERS b/sysbox-runc/libcontainer/user/MAINTAINERS new file mode 100644 index 00000000..edbe2006 --- /dev/null +++ b/sysbox-runc/libcontainer/user/MAINTAINERS @@ -0,0 +1,2 @@ +Tianon Gravi (@tianon) +Aleksa Sarai (@cyphar) diff --git a/sysbox-runc/libcontainer/user/lookup.go b/sysbox-runc/libcontainer/user/lookup.go new file mode 100644 index 00000000..6fd8dd0d --- /dev/null +++ b/sysbox-runc/libcontainer/user/lookup.go @@ -0,0 +1,41 @@ +package user + +import ( + "errors" +) + +var ( + // The current operating system does not provide the required data for user lookups. + ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") + // No matching entries found in file. + ErrNoPasswdEntries = errors.New("no matching entries in passwd file") + ErrNoGroupEntries = errors.New("no matching entries in group file") +) + +// LookupUser looks up a user by their username in /etc/passwd. If the user +// cannot be found (or there is no /etc/passwd file on the filesystem), then +// LookupUser returns an error. +func LookupUser(username string) (User, error) { + return lookupUser(username) +} + +// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot +// be found (or there is no /etc/passwd file on the filesystem), then LookupId +// returns an error. +func LookupUid(uid int) (User, error) { + return lookupUid(uid) +} + +// LookupGroup looks up a group by its name in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGroup +// returns an error. +func LookupGroup(groupname string) (Group, error) { + return lookupGroup(groupname) +} + +// LookupGid looks up a group by its group id in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGid +// returns an error. 
+func LookupGid(gid int) (Group, error) {
+	return lookupGid(gid)
+}
diff --git a/sysbox-runc/libcontainer/user/lookup_unix.go b/sysbox-runc/libcontainer/user/lookup_unix.go
new file mode 100644
index 00000000..92b5ae8d
--- /dev/null
+++ b/sysbox-runc/libcontainer/user/lookup_unix.go
@@ -0,0 +1,144 @@
+// +build darwin dragonfly freebsd linux netbsd openbsd solaris
+
+package user
+
+import (
+	"io"
+	"os"
+	"strconv"
+
+	"golang.org/x/sys/unix"
+)
+
+// Unix-specific path to the passwd and group formatted files.
+const (
+	unixPasswdPath = "/etc/passwd"
+	unixGroupPath  = "/etc/group"
+)
+
+func lookupUser(username string) (User, error) {
+	return lookupUserFunc(func(u User) bool {
+		return u.Name == username
+	})
+}
+
+func lookupUid(uid int) (User, error) {
+	return lookupUserFunc(func(u User) bool {
+		return u.Uid == uid
+	})
+}
+
+func lookupUserFunc(filter func(u User) bool) (User, error) {
+	// Get operating system-specific passwd reader-closer.
+	passwd, err := GetPasswd()
+	if err != nil {
+		return User{}, err
+	}
+	defer passwd.Close()
+
+	// Get the users.
+	users, err := ParsePasswdFilter(passwd, filter)
+	if err != nil {
+		return User{}, err
+	}
+
+	// No user entries found.
+	if len(users) == 0 {
+		return User{}, ErrNoPasswdEntries
+	}
+
+	// Assume the first entry is the "correct" one.
+	return users[0], nil
+}
+
+func lookupGroup(groupname string) (Group, error) {
+	return lookupGroupFunc(func(g Group) bool {
+		return g.Name == groupname
+	})
+}
+
+func lookupGid(gid int) (Group, error) {
+	return lookupGroupFunc(func(g Group) bool {
+		return g.Gid == gid
+	})
+}
+
+func lookupGroupFunc(filter func(g Group) bool) (Group, error) {
+	// Get operating system-specific group reader-closer.
+	group, err := GetGroup()
+	if err != nil {
+		return Group{}, err
+	}
+	defer group.Close()
+
+	// Get the groups.
+	groups, err := ParseGroupFilter(group, filter)
+	if err != nil {
+		return Group{}, err
+	}
+
+	// No group entries found.
+ if len(groups) == 0 { + return Group{}, ErrNoGroupEntries + } + + // Assume the first entry is the "correct" one. + return groups[0], nil +} + +func GetPasswdPath() (string, error) { + return unixPasswdPath, nil +} + +func GetPasswd() (io.ReadCloser, error) { + return os.Open(unixPasswdPath) +} + +func GetGroupPath() (string, error) { + return unixGroupPath, nil +} + +func GetGroup() (io.ReadCloser, error) { + return os.Open(unixGroupPath) +} + +// CurrentUser looks up the current user by their user id in /etc/passwd. If the +// user cannot be found (or there is no /etc/passwd file on the filesystem), +// then CurrentUser returns an error. +func CurrentUser() (User, error) { + return LookupUid(unix.Getuid()) +} + +// CurrentGroup looks up the current user's group by their primary group id's +// entry in /etc/passwd. If the group cannot be found (or there is no +// /etc/group file on the filesystem), then CurrentGroup returns an error. +func CurrentGroup() (Group, error) { + return LookupGid(unix.Getgid()) +} + +func currentUserSubIDs(fileName string) ([]SubID, error) { + u, err := CurrentUser() + if err != nil { + return nil, err + } + filter := func(entry SubID) bool { + return entry.Name == u.Name || entry.Name == strconv.Itoa(u.Uid) + } + return ParseSubIDFileFilter(fileName, filter) +} + +func CurrentUserSubUIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subuid") +} + +func CurrentUserSubGIDs() ([]SubID, error) { + return currentUserSubIDs("/etc/subgid") +} + +func CurrentProcessUIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/uid_map") +} + +func CurrentProcessGIDMap() ([]IDMap, error) { + return ParseIDMapFile("/proc/self/gid_map") +} diff --git a/sysbox-runc/libcontainer/user/lookup_windows.go b/sysbox-runc/libcontainer/user/lookup_windows.go new file mode 100644 index 00000000..f19333e6 --- /dev/null +++ b/sysbox-runc/libcontainer/user/lookup_windows.go @@ -0,0 +1,40 @@ +// +build windows + +package user + +import ( + "os/user" + 
"strconv" +) + +func lookupUser(username string) (User, error) { + u, err := user.Lookup(username) + if err != nil { + return User{}, err + } + return userFromOS(u) +} + +func lookupUid(uid int) (User, error) { + u, err := user.LookupId(strconv.Itoa(uid)) + if err != nil { + return User{}, err + } + return userFromOS(u) +} + +func lookupGroup(groupname string) (Group, error) { + g, err := user.LookupGroup(groupname) + if err != nil { + return Group{}, err + } + return groupFromOS(g) +} + +func lookupGid(gid int) (Group, error) { + g, err := user.LookupGroupId(strconv.Itoa(gid)) + if err != nil { + return Group{}, err + } + return groupFromOS(g) +} diff --git a/sysbox-runc/libcontainer/user/user.go b/sysbox-runc/libcontainer/user/user.go new file mode 100644 index 00000000..a533bf5e --- /dev/null +++ b/sysbox-runc/libcontainer/user/user.go @@ -0,0 +1,604 @@ +package user + +import ( + "bufio" + "fmt" + "io" + "os" + "os/user" + "strconv" + "strings" +) + +const ( + minId = 0 + maxId = 1<<31 - 1 //for 32-bit systems compatibility +) + +var ( + ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) +) + +type User struct { + Name string + Pass string + Uid int + Gid int + Gecos string + Home string + Shell string +} + +// userFromOS converts an os/user.(*User) to local User +// +// (This does not include Pass, Shell or Gecos) +func userFromOS(u *user.User) (User, error) { + newUser := User{ + Name: u.Username, + Home: u.HomeDir, + } + id, err := strconv.Atoi(u.Uid) + if err != nil { + return newUser, err + } + newUser.Uid = id + + id, err = strconv.Atoi(u.Gid) + if err != nil { + return newUser, err + } + newUser.Gid = id + return newUser, nil +} + +type Group struct { + Name string + Pass string + Gid int + List []string +} + +// groupFromOS converts an os/user.(*Group) to local Group +// +// (This does not include Pass or List) +func groupFromOS(g *user.Group) (Group, error) { + newGroup := Group{ + Name: g.Name, + } + + id, err := 
strconv.Atoi(g.Gid) + if err != nil { + return newGroup, err + } + newGroup.Gid = id + + return newGroup, nil +} + +// SubID represents an entry in /etc/sub{u,g}id +type SubID struct { + Name string + SubID int64 + Count int64 +} + +// IDMap represents an entry in /proc/PID/{u,g}id_map +type IDMap struct { + ID int64 + ParentID int64 + Count int64 +} + +func parseLine(line string, v ...interface{}) { + parseParts(strings.Split(line, ":"), v...) +} + +func parseParts(parts []string, v ...interface{}) { + if len(parts) == 0 { + return + } + + for i, p := range parts { + // Ignore cases where we don't have enough fields to populate the arguments. + // Some configuration files like to misbehave. + if len(v) <= i { + break + } + + // Use the type of the argument to figure out how to parse it, scanf() style. + // This is legit. + switch e := v[i].(type) { + case *string: + *e = p + case *int: + // "numbers", with conversion errors ignored because of some misbehaving configuration files. + *e, _ = strconv.Atoi(p) + case *int64: + *e, _ = strconv.ParseInt(p, 10, 64) + case *[]string: + // Comma-separated lists. + if p != "" { + *e = strings.Split(p, ",") + } else { + *e = []string{} + } + default: + // Someone goof'd when writing code using this function. Scream so they can hear us. + panic(fmt.Sprintf("parseLine only accepts {*string, *int, *int64, *[]string} as arguments! 
%#v is not a pointer!", e)) + } + } +} + +func ParsePasswdFile(path string) ([]User, error) { + passwd, err := os.Open(path) + if err != nil { + return nil, err + } + defer passwd.Close() + return ParsePasswd(passwd) +} + +func ParsePasswd(passwd io.Reader) ([]User, error) { + return ParsePasswdFilter(passwd, nil) +} + +func ParsePasswdFileFilter(path string, filter func(User) bool) ([]User, error) { + passwd, err := os.Open(path) + if err != nil { + return nil, err + } + defer passwd.Close() + return ParsePasswdFilter(passwd, filter) +} + +func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) { + if r == nil { + return nil, fmt.Errorf("nil source for passwd-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []User{} + ) + + for s.Scan() { + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 passwd + // name:password:UID:GID:GECOS:directory:shell + // Name:Pass:Uid:Gid:Gecos:Home:Shell + // root:x:0:0:root:/root:/bin/bash + // adm:x:3:4:adm:/var/adm:/bin/false + p := User{} + parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return out, nil +} + +func ParseGroupFile(path string) ([]Group, error) { + group, err := os.Open(path) + if err != nil { + return nil, err + } + + defer group.Close() + return ParseGroup(group) +} + +func ParseGroup(group io.Reader) ([]Group, error) { + return ParseGroupFilter(group, nil) +} + +func ParseGroupFileFilter(path string, filter func(Group) bool) ([]Group, error) { + group, err := os.Open(path) + if err != nil { + return nil, err + } + defer group.Close() + return ParseGroupFilter(group, filter) +} + +func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { + if r == nil { + return nil, fmt.Errorf("nil source for group-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = 
[]Group{} + ) + + for s.Scan() { + text := s.Text() + if text == "" { + continue + } + + // see: man 5 group + // group_name:password:GID:user_list + // Name:Pass:Gid:List + // root:x:0:root + // adm:x:4:root,adm,daemon + p := Group{} + parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return out, nil +} + +type ExecUser struct { + Uid int + Gid int + Sgids []int + Home string +} + +// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the +// given file paths and uses that data as the arguments to GetExecUser. If the +// files cannot be opened for any reason, the error is ignored and a nil +// io.Reader is passed instead. +func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath string) (*ExecUser, error) { + var passwd, group io.Reader + + if passwdFile, err := os.Open(passwdPath); err == nil { + passwd = passwdFile + defer passwdFile.Close() + } + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() + } + + return GetExecUser(userSpec, defaults, passwd, group) +} + +// GetExecUser parses a user specification string (using the passwd and group +// readers as sources for /etc/passwd and /etc/group data, respectively). In +// the case of blank fields or missing data from the sources, the values in +// defaults is used. +// +// GetExecUser will return an error if a user or group literal could not be +// found in any entry in passwd and group respectively. +// +// Examples of valid user specifications are: +// * "" +// * "user" +// * "uid" +// * "user:group" +// * "uid:gid +// * "user:gid" +// * "uid:group" +// +// It should be noted that if you specify a numeric user or group id, they will +// not be evaluated as usernames (only the metadata will be filled). 
So attempting +// to parse a user with user.Name = "1337" will produce the user with a UID of +// 1337. +func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) { + if defaults == nil { + defaults = new(ExecUser) + } + + // Copy over defaults. + user := &ExecUser{ + Uid: defaults.Uid, + Gid: defaults.Gid, + Sgids: defaults.Sgids, + Home: defaults.Home, + } + + // Sgids slice *cannot* be nil. + if user.Sgids == nil { + user.Sgids = []int{} + } + + // Allow for userArg to have either "user" syntax, or optionally "user:group" syntax + var userArg, groupArg string + parseLine(userSpec, &userArg, &groupArg) + + // Convert userArg and groupArg to be numeric, so we don't have to execute + // Atoi *twice* for each iteration over lines. + uidArg, uidErr := strconv.Atoi(userArg) + gidArg, gidErr := strconv.Atoi(groupArg) + + // Find the matching user. + users, err := ParsePasswdFilter(passwd, func(u User) bool { + if userArg == "" { + // Default to current state of the user. + return u.Uid == user.Uid + } + + if uidErr == nil { + // If the userArg is numeric, always treat it as a UID. + return uidArg == u.Uid + } + + return u.Name == userArg + }) + + // If we can't find the user, we have to bail. + if err != nil && passwd != nil { + if userArg == "" { + userArg = strconv.Itoa(user.Uid) + } + return nil, fmt.Errorf("unable to find user %s: %v", userArg, err) + } + + var matchedUserName string + if len(users) > 0 { + // First match wins, even if there's more than one matching entry. + matchedUserName = users[0].Name + user.Uid = users[0].Uid + user.Gid = users[0].Gid + user.Home = users[0].Home + } else if userArg != "" { + // If we can't find a user with the given username, the only other valid + // option is if it's a numeric username with no associated entry in passwd. + + if uidErr != nil { + // Not numeric. 
+ return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries) + } + user.Uid = uidArg + + // Must be inside valid uid range. + if user.Uid < minId || user.Uid > maxId { + return nil, ErrRange + } + + // Okay, so it's numeric. We can just roll with this. + } + + // On to the groups. If we matched a username, we need to do this because of + // the supplementary group IDs. + if groupArg != "" || matchedUserName != "" { + groups, err := ParseGroupFilter(group, func(g Group) bool { + // If the group argument isn't explicit, we'll just search for it. + if groupArg == "" { + // Check if user is a member of this group. + for _, u := range g.List { + if u == matchedUserName { + return true + } + } + return false + } + + if gidErr == nil { + // If the groupArg is numeric, always treat it as a GID. + return gidArg == g.Gid + } + + return g.Name == groupArg + }) + if err != nil && group != nil { + return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err) + } + + // Only start modifying user.Gid if it is in explicit form. + if groupArg != "" { + if len(groups) > 0 { + // First match wins, even if there's more than one matching entry. + user.Gid = groups[0].Gid + } else { + // If we can't find a group with the given name, the only other valid + // option is if it's a numeric group name with no associated entry in group. + + if gidErr != nil { + // Not numeric. + return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries) + } + user.Gid = gidArg + + // Must be inside valid gid range. + if user.Gid < minId || user.Gid > maxId { + return nil, ErrRange + } + + // Okay, so it's numeric. We can just roll with this. + } + } else if len(groups) > 0 { + // Supplementary group ids only make sense if in the implicit form. 
+ user.Sgids = make([]int, len(groups)) + for i, group := range groups { + user.Sgids[i] = group.Gid + } + } + } + + return user, nil +} + +// GetAdditionalGroups looks up a list of groups by name or group id +// against the given /etc/group formatted data. If a group name cannot +// be found, an error will be returned. If a group id cannot be found, +// or the given group data is nil, the id will be returned as-is +// provided it is in the legal range. +func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, error) { + var groups = []Group{} + if group != nil { + var err error + groups, err = ParseGroupFilter(group, func(g Group) bool { + for _, ag := range additionalGroups { + if g.Name == ag || strconv.Itoa(g.Gid) == ag { + return true + } + } + return false + }) + if err != nil { + return nil, fmt.Errorf("Unable to find additional groups %v: %v", additionalGroups, err) + } + } + + gidMap := make(map[int]struct{}) + for _, ag := range additionalGroups { + var found bool + for _, g := range groups { + // if we found a matched group either by name or gid, take the + // first matched as correct + if g.Name == ag || strconv.Itoa(g.Gid) == ag { + if _, ok := gidMap[g.Gid]; !ok { + gidMap[g.Gid] = struct{}{} + found = true + break + } + } + } + // we asked for a group but didn't find it. let's check to see + // if we wanted a numeric group + if !found { + gid, err := strconv.ParseInt(ag, 10, 64) + if err != nil { + return nil, fmt.Errorf("Unable to find group %s", ag) + } + // Ensure gid is inside gid range. + if gid < minId || gid > maxId { + return nil, ErrRange + } + gidMap[int(gid)] = struct{}{} + } + } + gids := []int{} + for gid := range gidMap { + gids = append(gids, gid) + } + return gids, nil +} + +// GetAdditionalGroupsPath is a wrapper around GetAdditionalGroups +// that opens the groupPath given and gives it as an argument to +// GetAdditionalGroups. 
+func GetAdditionalGroupsPath(additionalGroups []string, groupPath string) ([]int, error) { + var group io.Reader + + if groupFile, err := os.Open(groupPath); err == nil { + group = groupFile + defer groupFile.Close() + } + return GetAdditionalGroups(additionalGroups, group) +} + +func ParseSubIDFile(path string) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubID(subid) +} + +func ParseSubID(subid io.Reader) ([]SubID, error) { + return ParseSubIDFilter(subid, nil) +} + +func ParseSubIDFileFilter(path string, filter func(SubID) bool) ([]SubID, error) { + subid, err := os.Open(path) + if err != nil { + return nil, err + } + defer subid.Close() + return ParseSubIDFilter(subid, filter) +} + +func ParseSubIDFilter(r io.Reader, filter func(SubID) bool) ([]SubID, error) { + if r == nil { + return nil, fmt.Errorf("nil source for subid-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = []SubID{} + ) + + for s.Scan() { + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 5 subuid + p := SubID{} + parseLine(line, &p.Name, &p.SubID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return out, nil +} + +func ParseIDMapFile(path string) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMap(r) +} + +func ParseIDMap(r io.Reader) ([]IDMap, error) { + return ParseIDMapFilter(r, nil) +} + +func ParseIDMapFileFilter(path string, filter func(IDMap) bool) ([]IDMap, error) { + r, err := os.Open(path) + if err != nil { + return nil, err + } + defer r.Close() + return ParseIDMapFilter(r, filter) +} + +func ParseIDMapFilter(r io.Reader, filter func(IDMap) bool) ([]IDMap, error) { + if r == nil { + return nil, fmt.Errorf("nil source for idmap-formatted data") + } + + var ( + s = bufio.NewScanner(r) + out = 
[]IDMap{} + ) + + for s.Scan() { + line := strings.TrimSpace(s.Text()) + if line == "" { + continue + } + + // see: man 7 user_namespaces + p := IDMap{} + parseParts(strings.Fields(line), &p.ID, &p.ParentID, &p.Count) + + if filter == nil || filter(p) { + out = append(out, p) + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return out, nil +} diff --git a/sysbox-runc/libcontainer/user/user_test.go b/sysbox-runc/libcontainer/user/user_test.go new file mode 100644 index 00000000..23bf4667 --- /dev/null +++ b/sysbox-runc/libcontainer/user/user_test.go @@ -0,0 +1,502 @@ +package user + +import ( + "io" + "reflect" + "sort" + "strconv" + "strings" + "testing" +) + +func TestUserParseLine(t *testing.T) { + var ( + a, b string + c []string + d int + ) + + parseLine("", &a, &b) + if a != "" || b != "" { + t.Fatalf("a and b should be empty ('%v', '%v')", a, b) + } + + parseLine("a", &a, &b) + if a != "a" || b != "" { + t.Fatalf("a should be 'a' and b should be empty ('%v', '%v')", a, b) + } + + parseLine("bad boys:corny cows", &a, &b) + if a != "bad boys" || b != "corny cows" { + t.Fatalf("a should be 'bad boys' and b should be 'corny cows' ('%v', '%v')", a, b) + } + + parseLine("", &c) + if len(c) != 0 { + t.Fatalf("c should be empty (%#v)", c) + } + + parseLine("d,e,f:g:h:i,j,k", &c, &a, &b, &c) + if a != "g" || b != "h" || len(c) != 3 || c[0] != "i" || c[1] != "j" || c[2] != "k" { + t.Fatalf("a should be 'g', b should be 'h', and c should be ['i','j','k'] ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("::::::::::", &a, &b, &c) + if a != "" || b != "" || len(c) != 0 { + t.Fatalf("a, b, and c should all be empty ('%v', '%v', '%#v')", a, b, c) + } + + parseLine("not a number", &d) + if d != 0 { + t.Fatalf("d should be 0 (%v)", d) + } + + parseLine("b:12:c", &a, &d, &b) + if a != "b" || b != "c" || d != 12 { + t.Fatalf("a should be 'b' and b should be 'c', and d should be 12 ('%v', '%v', %v)", a, b, d) + } +} + +func TestUserParsePasswd(t *testing.T) { + 
users, err := ParsePasswdFilter(strings.NewReader(` +root:x:0:0:root:/root:/bin/bash +adm:x:3:4:adm:/var/adm:/bin/false +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(users) != 3 { + t.Fatalf("Expected 3 users, got %v", len(users)) + } + if users[0].Uid != 0 || users[0].Name != "root" { + t.Fatalf("Expected users[0] to be 0 - root, got %v - %v", users[0].Uid, users[0].Name) + } + if users[1].Uid != 3 || users[1].Name != "adm" { + t.Fatalf("Expected users[1] to be 3 - adm, got %v - %v", users[1].Uid, users[1].Name) + } +} + +func TestUserParseGroup(t *testing.T) { + groups, err := ParseGroupFilter(strings.NewReader(` +root:x:0:root +adm:x:4:root,adm,daemon +this is just some garbage data +`), nil) + if err != nil { + t.Fatalf("Unexpected error: %v", err) + } + if len(groups) != 3 { + t.Fatalf("Expected 3 groups, got %v", len(groups)) + } + if groups[0].Gid != 0 || groups[0].Name != "root" || len(groups[0].List) != 1 { + t.Fatalf("Expected groups[0] to be 0 - root - 1 member, got %v - %v - %v", groups[0].Gid, groups[0].Name, len(groups[0].List)) + } + if groups[1].Gid != 4 || groups[1].Name != "adm" || len(groups[1].List) != 3 { + t.Fatalf("Expected groups[1] to be 4 - adm - 3 members, got %v - %v - %v", groups[1].Gid, groups[1].Name, len(groups[1].List)) + } +} + +func TestValidGetExecUser(t *testing.T) { + const passwdContent = ` +root:x:0:0:root user:/root:/bin/bash +adm:x:42:43:adm:/var/adm:/bin/false +111:x:222:333::/var/garbage +odd:x:111:112::/home/odd::::: +this is just some garbage data +` + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +444:x:555:111 +odd:x:444: +this is just some garbage data +` + defaultExecUser := ExecUser{ + Uid: 8888, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + } + + tests := []struct { + ref string + expected ExecUser + }{ + { + ref: "root", + expected: ExecUser{ + Uid: 0, + Gid: 0, + Sgids: []int{0, 1234}, + Home: "/root", + }, + }, + 
{ + ref: "adm", + expected: ExecUser{ + Uid: 42, + Gid: 43, + Sgids: []int{1234}, + Home: "/var/adm", + }, + }, + { + ref: "root:adm", + expected: ExecUser{ + Uid: 0, + Gid: 43, + Sgids: defaultExecUser.Sgids, + Home: "/root", + }, + }, + { + ref: "adm:1234", + expected: ExecUser{ + Uid: 42, + Gid: 1234, + Sgids: defaultExecUser.Sgids, + Home: "/var/adm", + }, + }, + { + ref: "42:1234", + expected: ExecUser{ + Uid: 42, + Gid: 1234, + Sgids: defaultExecUser.Sgids, + Home: "/var/adm", + }, + }, + { + ref: "1337:1234", + expected: ExecUser{ + Uid: 1337, + Gid: 1234, + Sgids: defaultExecUser.Sgids, + Home: defaultExecUser.Home, + }, + }, + { + ref: "1337", + expected: ExecUser{ + Uid: 1337, + Gid: defaultExecUser.Gid, + Sgids: defaultExecUser.Sgids, + Home: defaultExecUser.Home, + }, + }, + { + ref: "", + expected: ExecUser{ + Uid: defaultExecUser.Uid, + Gid: defaultExecUser.Gid, + Sgids: defaultExecUser.Sgids, + Home: defaultExecUser.Home, + }, + }, + + // Regression tests for #695. + { + ref: "111", + expected: ExecUser{ + Uid: 111, + Gid: 112, + Sgids: defaultExecUser.Sgids, + Home: "/home/odd", + }, + }, + { + ref: "111:444", + expected: ExecUser{ + Uid: 111, + Gid: 444, + Sgids: defaultExecUser.Sgids, + Home: "/home/odd", + }, + }, + } + + for _, test := range tests { + passwd := strings.NewReader(passwdContent) + group := strings.NewReader(groupContent) + + execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group) + if err != nil { + t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error()) + t.Fail() + continue + } + + if !reflect.DeepEqual(test.expected, *execUser) { + t.Logf("ref: %v", test.ref) + t.Logf("got: %#v", execUser) + t.Logf("expected: %#v", test.expected) + t.Fail() + continue + } + } +} + +func TestInvalidGetExecUser(t *testing.T) { + const passwdContent = ` +root:x:0:0:root user:/root:/bin/bash +adm:x:42:43:adm:/var/adm:/bin/false +-42:x:12:13:broken:/very/broken +this is just some garbage data +` + const 
groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +this is just some garbage data +` + + tests := []string{ + // No such user/group. + "notuser", + "notuser:notgroup", + "root:notgroup", + "notuser:adm", + "8888:notgroup", + "notuser:8888", + + // Invalid user/group values. + "-1:0", + "0:-3", + "-5:-2", + "-42", + "-43", + } + + for _, test := range tests { + passwd := strings.NewReader(passwdContent) + group := strings.NewReader(groupContent) + + execUser, err := GetExecUser(test, nil, passwd, group) + if err == nil { + t.Logf("got unexpected success when parsing '%s': %#v", test, execUser) + t.Fail() + continue + } + } +} + +func TestGetExecUserNilSources(t *testing.T) { + const passwdContent = ` +root:x:0:0:root user:/root:/bin/bash +adm:x:42:43:adm:/var/adm:/bin/false +this is just some garbage data +` + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +this is just some garbage data +` + + defaultExecUser := ExecUser{ + Uid: 8888, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + } + + tests := []struct { + ref string + passwd, group bool + expected ExecUser + }{ + { + ref: "", + passwd: false, + group: false, + expected: ExecUser{ + Uid: 8888, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + }, + }, + { + ref: "root", + passwd: true, + group: false, + expected: ExecUser{ + Uid: 0, + Gid: 0, + Sgids: []int{8888}, + Home: "/root", + }, + }, + { + ref: "0", + passwd: false, + group: false, + expected: ExecUser{ + Uid: 0, + Gid: 8888, + Sgids: []int{8888}, + Home: "/8888", + }, + }, + { + ref: "0:0", + passwd: false, + group: false, + expected: ExecUser{ + Uid: 0, + Gid: 0, + Sgids: []int{8888}, + Home: "/8888", + }, + }, + } + + for _, test := range tests { + var passwd, group io.Reader + + if test.passwd { + passwd = strings.NewReader(passwdContent) + } + + if test.group { + group = strings.NewReader(groupContent) + } + + execUser, err := GetExecUser(test.ref, &defaultExecUser, passwd, group) + if err != nil { + 
t.Logf("got unexpected error when parsing '%s': %s", test.ref, err.Error()) + t.Fail() + continue + } + + if !reflect.DeepEqual(test.expected, *execUser) { + t.Logf("got: %#v", execUser) + t.Logf("expected: %#v", test.expected) + t.Fail() + continue + } + } +} + +func TestGetAdditionalGroups(t *testing.T) { + type foo struct { + groups []string + expected []int + hasError bool + } + + const groupContent = ` +root:x:0:root +adm:x:43: +grp:x:1234:root,adm +adm:x:4343:root,adm-duplicate +this is just some garbage data +` + tests := []foo{ + { + // empty group + groups: []string{}, + expected: []int{}, + }, + { + // single group + groups: []string{"adm"}, + expected: []int{43}, + }, + { + // multiple groups + groups: []string{"adm", "grp"}, + expected: []int{43, 1234}, + }, + { + // invalid group + groups: []string{"adm", "grp", "not-exist"}, + expected: nil, + hasError: true, + }, + { + // group with numeric id + groups: []string{"43"}, + expected: []int{43}, + }, + { + // group with unknown numeric id + groups: []string{"adm", "10001"}, + expected: []int{43, 10001}, + }, + { + // groups specified twice with numeric and name + groups: []string{"adm", "43"}, + expected: []int{43}, + }, + { + // groups with too small id + groups: []string{"-1"}, + expected: nil, + hasError: true, + }, + { + // groups with too large id + groups: []string{strconv.FormatInt(1<<31, 10)}, + expected: nil, + hasError: true, + }, + } + + for _, test := range tests { + group := strings.NewReader(groupContent) + + gids, err := GetAdditionalGroups(test.groups, group) + if test.hasError && err == nil { + t.Errorf("Parse(%#v) expects error but has none", test) + continue + } + if !test.hasError && err != nil { + t.Errorf("Parse(%#v) has error %v", test, err) + continue + } + sort.Ints(gids) + if !reflect.DeepEqual(gids, test.expected) { + t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups) + } + } +} + +func TestGetAdditionalGroupsNumeric(t *testing.T) { + tests := 
[]struct { + groups []string + expected []int + hasError bool + }{ + { + // numeric groups only + groups: []string{"1234", "5678"}, + expected: []int{1234, 5678}, + }, + { + // numeric and alphabetic + groups: []string{"1234", "fake"}, + expected: nil, + hasError: true, + }, + } + + for _, test := range tests { + gids, err := GetAdditionalGroups(test.groups, nil) + if test.hasError && err == nil { + t.Errorf("Parse(%#v) expects error but has none", test) + continue + } + if !test.hasError && err != nil { + t.Errorf("Parse(%#v) has error %v", test, err) + continue + } + sort.Ints(gids) + if !reflect.DeepEqual(gids, test.expected) { + t.Errorf("Gids(%v), expect %v from groups %v", gids, test.expected, test.groups) + } + } +} diff --git a/sysbox-runc/libcontainer/utils/cmsg.go b/sysbox-runc/libcontainer/utils/cmsg.go new file mode 100644 index 00000000..c8a9364d --- /dev/null +++ b/sysbox-runc/libcontainer/utils/cmsg.go @@ -0,0 +1,93 @@ +// +build linux + +package utils + +/* + * Copyright 2016, 2017 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// MaxSendfdLen is the maximum length of the name of a file descriptor being +// sent using SendFd. The name of the file handle returned by RecvFd will never +// be larger than this value. +const MaxNameLen = 4096 + +// oobSpace is the size of the oob slice required to store a single FD. 
Note +// that unix.UnixRights appears to make the assumption that fd is always int32, +// so sizeof(fd) = 4. +var oobSpace = unix.CmsgSpace(4) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + // For some reason, unix.Recvmsg uses the length rather than the capacity + // when passing the msg_controllen and other attributes to recvmsg. So we + // have to actually set the length. + name := make([]byte, MaxNameLen) + oob := make([]byte, oobSpace) + + sockfd := socket.Fd() + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + if err != nil { + return nil, err + } + + if n >= MaxNameLen || oobn != oobSpace { + return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + + // Truncate. + name = name[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, err + } + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + scm := scms[0] + + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return nil, err + } + if len(fds) != 1 { + return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) + } + fd := uintptr(fds[0]) + + return os.NewFile(fd, string(name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). 
+func SendFd(socket *os.File, name string, fd uintptr) error { + if len(name) >= MaxNameLen { + return fmt.Errorf("sendfd: filename too long: %s", name) + } + oob := unix.UnixRights(int(fd)) + return unix.Sendmsg(int(socket.Fd()), []byte(name), oob, nil, 0) +} diff --git a/sysbox-runc/libcontainer/utils/utils.go b/sysbox-runc/libcontainer/utils/utils.go new file mode 100644 index 00000000..04040645 --- /dev/null +++ b/sysbox-runc/libcontainer/utils/utils.go @@ -0,0 +1,154 @@ +package utils + +import ( + "encoding/binary" + "encoding/json" + "io" + "os" + "path/filepath" + "strings" + "unsafe" + + "golang.org/x/sys/unix" +) + +const ( + exitSignalOffset = 128 +) + +// NativeEndian is the native byte order of the host system. +var NativeEndian binary.ByteOrder + +func init() { + // Copied from . + i := uint32(1) + b := (*[4]byte)(unsafe.Pointer(&i)) + if b[0] == 1 { + NativeEndian = binary.LittleEndian + } else { + NativeEndian = binary.BigEndian + } +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status unix.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. 
This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// StripRoot returns the passed path, stripping the root path if it was +// (lexicially) inside it. Note that both passed paths will always be treated +// as absolute, and the returned path will also always be absolute. In +// addition, the paths are cleaned before stripping the root. +func StripRoot(root, path string) string { + // Make the paths clean and absolute. + root, path = CleanPath("/"+root), CleanPath("/"+path) + switch { + case path == root: + path = "/" + case root == "/": + // do nothing + case strings.HasPrefix(path, root+"/"): + path = strings.TrimPrefix(path, root+"/") + } + return CleanPath("/" + path) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. 
+func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} + +func GetIntSize() int { + return int(unsafe.Sizeof(1)) +} + +func StringSliceContains(s []string, val string) bool { + for _, n := range s { + if val == n { + return true + } + } + return false +} diff --git a/sysbox-runc/libcontainer/utils/utils_test.go b/sysbox-runc/libcontainer/utils/utils_test.go new file mode 100644 index 00000000..4d9c15b8 --- /dev/null +++ b/sysbox-runc/libcontainer/utils/utils_test.go @@ -0,0 +1,180 @@ +package utils + +import ( + "bytes" + "os" + "path/filepath" + "testing" + + "golang.org/x/sys/unix" +) + +var labelTest = []struct { + labels []string + query string + expectedValue string +}{ + {[]string{"bundle=/path/to/bundle"}, "bundle", "/path/to/bundle"}, + {[]string{"test=a", "test=b"}, "bundle", ""}, + {[]string{"bundle=a", "test=b", "bundle=c"}, "bundle", "a"}, + {[]string{"", "test=a", "bundle=b"}, "bundle", "b"}, + {[]string{"test", "bundle=a"}, "bundle", "a"}, + {[]string{"test=a", "bundle="}, "bundle", ""}, +} + +func TestSearchLabels(t *testing.T) { + for _, tt := range labelTest { + if v := SearchLabels(tt.labels, tt.query); v != tt.expectedValue { + t.Errorf("expected value '%s' for query '%s'; got '%s'", tt.expectedValue, tt.query, v) + } + } +} + +func 
// TestResolveRootfs checks that a relative rootfs path is resolved to an
// absolute path under the current working directory.
func TestResolveRootfs(t *testing.T) {
	dir := "rootfs"
	if err := os.Mkdir(dir, 0600); err != nil {
		t.Fatal(err)
	}
	defer os.Remove(dir)

	path, err := ResolveRootfs(dir)
	if err != nil {
		t.Fatal(err)
	}
	pwd, err := os.Getwd()
	if err != nil {
		t.Fatal(err)
	}
	if path != pwd+"/rootfs" {
		t.Errorf("expected rootfs to be abs and was %s", path)
	}
}

// TestResolveRootfsWithSymlink checks that a rootfs path that is a symlink is
// resolved to its target.
func TestResolveRootfsWithSymlink(t *testing.T) {
	dir := "rootfs"
	tmpDir, _ := filepath.EvalSymlinks(os.TempDir())
	if err := os.Symlink(tmpDir, dir); err != nil {
		t.Fatal(err)
	}
	defer os.Remove(dir)

	path, err := ResolveRootfs(dir)
	if err != nil {
		t.Fatal(err)
	}

	if path != tmpDir {
		// NOTE(review): the format arguments look swapped here — the "real
		// path" slot receives `path` and the "was" slot receives os.TempDir();
		// confirm intent before relying on this message.
		t.Errorf("expected rootfs to be the real path %s and was %s", path, os.TempDir())
	}
}

// TestResolveRootfsWithNonExistingDir checks that resolving a missing path fails.
func TestResolveRootfsWithNonExistingDir(t *testing.T) {
	_, err := ResolveRootfs("foo")
	if err == nil {
		t.Error("expected error to happen but received nil")
	}
}

// TestExitStatus checks ExitStatus for a normally-exited status word.
func TestExitStatus(t *testing.T) {
	status := unix.WaitStatus(0)
	ex := ExitStatus(status)
	if ex != 0 {
		t.Errorf("expected exit status to equal 0 and received %d", ex)
	}
}

// TestExitStatusSignaled checks ExitStatus for a signal-terminated status
// word (signal 2 => 128+2 = 130, shell convention).
func TestExitStatusSignaled(t *testing.T) {
	status := unix.WaitStatus(2)
	ex := ExitStatus(status)
	if ex != 130 {
		t.Errorf("expected exit status to equal 130 and received %d", ex)
	}
}

// TestWriteJSON checks that WriteJSON serializes a struct to compact JSON.
func TestWriteJSON(t *testing.T) {
	person := struct {
		Name string
		Age  int
	}{
		Name: "Alice",
		Age:  30,
	}

	var b bytes.Buffer
	err := WriteJSON(&b, person)
	if err != nil {
		t.Fatal(err)
	}

	expected := `{"Name":"Alice","Age":30}`
	if b.String() != expected {
		t.Errorf("expected to write %s but was %s", expected, b.String())
	}
}

// TestCleanPath exercises CleanPath on empty, relative, dot-dot, and
// trailing-slash inputs.
func TestCleanPath(t *testing.T) {
	path := CleanPath("")
	if path != "" {
		t.Errorf("expected to receive empty string and received %s", path)
	}

	path = CleanPath("rootfs")
	if path != "rootfs" {
		t.Errorf("expected to receive 'rootfs' and received %s", path)
	}

	path = CleanPath("../../../var")
	if path != "var" {
		t.Errorf("expected to receive 'var' and received %s", path)
	}

	path = CleanPath("/../../../var")
	if path != "/var" {
		t.Errorf("expected to receive '/var' and received %s", path)
	}

	path = CleanPath("/foo/bar/")
	if path != "/foo/bar" {
		t.Errorf("expected to receive '/foo/bar' and received %s", path)
	}

	path = CleanPath("/foo/bar/../")
	if path != "/foo" {
		t.Errorf("expected to receive '/foo' and received %s", path)
	}
}

// TestStripRoot is a table-driven test of StripRoot's lexical root-stripping.
func TestStripRoot(t *testing.T) {
	for _, test := range []struct {
		root, path, out string
	}{
		// Works with multiple components.
		{"/a/b", "/a/b/c", "/c"},
		{"/hello/world", "/hello/world/the/quick-brown/fox", "/the/quick-brown/fox"},
		// '/' must be a no-op.
		{"/", "/a/b/c", "/a/b/c"},
		// Must be the correct order.
		{"/a/b", "/a/c/b", "/a/c/b"},
		// Must be at start.
		{"/abc/def", "/foo/abc/def/bar", "/foo/abc/def/bar"},
		// Must be a lexical parent.
		{"/foo/bar", "/foo/barSAMECOMPONENT", "/foo/barSAMECOMPONENT"},
		// Must only strip the root once.
		{"/foo/bar", "/foo/bar/foo/bar/baz", "/foo/bar/baz"},
		// Deal with .. in a fairly sane way.
		{"/foo/bar", "/foo/bar/../baz", "/foo/baz"},
		{"/foo/bar", "../../../../../../foo/bar/baz", "/baz"},
		{"/foo/bar", "/../../../../../../foo/bar/baz", "/baz"},
		{"/foo/bar/../baz", "/foo/baz/bar", "/bar"},
		{"/foo/bar/../baz", "/foo/baz/../bar/../baz/./foo", "/foo"},
		// All paths are made absolute before stripping.
		{"foo/bar", "/foo/bar/baz/bee", "/baz/bee"},
		{"/foo/bar", "foo/bar/baz/beef", "/baz/beef"},
		{"foo/bar", "foo/bar/baz/beets", "/baz/beets"},
	} {
		got := StripRoot(test.root, test.path)
		if got != test.out {
			t.Errorf("stripRoot(%q, %q) -- got %q, expected %q", test.root, test.path, got, test.out)
		}
	}
}

//go:build !windows
// +build !windows

package utils

import (
	"fmt"
	"math"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
	"strings"
	"sync"
	_ "unsafe" // for go:linkname

	securejoin "github.com/cyphar/filepath-securejoin"
	"github.com/sirupsen/logrus"
	"golang.org/x/sys/unix"
)

// EnsureProcHandle returns whether or not the given file handle is on procfs.
func EnsureProcHandle(fh *os.File) error {
	var buf unix.Statfs_t
	if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
		return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
	}
	if buf.Type != unix.PROC_SUPER_MAGIC {
		return fmt.Errorf("%s is not on procfs", fh.Name())
	}
	return nil
}

// Cached result of the close_range(CLOSE_RANGE_CLOEXEC) support probe; the
// sync.Once guarantees the probe runs at most once per process.
var (
	haveCloseRangeCloexecBool bool
	haveCloseRangeCloexecOnce sync.Once
)

// haveCloseRangeCloexec probes (once) whether the kernel supports
// close_range(2) with the CLOSE_RANGE_CLOEXEC flag.
func haveCloseRangeCloexec() bool {
	haveCloseRangeCloexecOnce.Do(func() {
		// Make sure we're not closing a random file descriptor.
		tmpFd, err := unix.FcntlInt(0, unix.F_DUPFD_CLOEXEC, 0)
		if err != nil {
			return
		}
		defer unix.Close(tmpFd)

		err = unix.CloseRange(uint(tmpFd), uint(tmpFd), unix.CLOSE_RANGE_CLOEXEC)
		// Any error means we cannot use close_range(CLOSE_RANGE_CLOEXEC).
		// -ENOSYS and -EINVAL ultimately mean we don't have support, but any
		// other potential error would imply that even the most basic close
		// operation wouldn't work.
		haveCloseRangeCloexecBool = err == nil
	})
	return haveCloseRangeCloexecBool
}
// fdFunc is the callback invoked by fdRangeFrom for each open file descriptor.
type fdFunc func(fd int)

// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in
// the current process.
func fdRangeFrom(minFd int, fn fdFunc) error {
	procSelfFd, closer := ProcThreadSelf("fd")
	defer closer()

	fdDir, err := os.Open(procSelfFd)
	if err != nil {
		return err
	}
	defer fdDir.Close()

	if err := EnsureProcHandle(fdDir); err != nil {
		return err
	}

	fdList, err := fdDir.Readdirnames(-1)
	if err != nil {
		return err
	}
	for _, fdStr := range fdList {
		fd, err := strconv.Atoi(fdStr)
		// Ignore non-numeric file names.
		if err != nil {
			continue
		}
		// Ignore descriptors lower than our specified minimum.
		if fd < minFd {
			continue
		}
		// Ignore the file descriptor we used for readdir, as it will be closed
		// when we return.
		if uintptr(fd) == fdDir.Fd() {
			continue
		}
		// Run the closure.
		fn(fd)
	}
	return nil
}

// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or
// equal to minFd in the current process.
func CloseExecFrom(minFd int) error {
	// Use close_range(CLOSE_RANGE_CLOEXEC) if possible.
	if haveCloseRangeCloexec() {
		err := unix.CloseRange(uint(minFd), math.MaxUint, unix.CLOSE_RANGE_CLOEXEC)
		return os.NewSyscallError("close_range", err)
	}
	// Otherwise, fall back to the standard loop.
	return fdRangeFrom(minFd, unix.CloseOnExec)
}

//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor

// In order to make sure we do not close the internal epoll descriptors the Go
// runtime uses, we need to ensure that we skip descriptors that match
// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing,
// unfortunately there's no other way to be sure we're only keeping the file
// descriptors the Go runtime needs. Hopefully nothing blows up doing this...
func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive

// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the
// current process, except for those critical to Go's runtime (such as the
// netpoll management descriptors).
//
// NOTE: That this function is incredibly dangerous to use in most Go code, as
// closing file descriptors from underneath *os.File handles can lead to very
// bad behaviour (the closed file descriptor can be re-used and then any
// *os.File operations would apply to the wrong file). This function is only
// intended to be called from the last stage of runc init.
func UnsafeCloseFrom(minFd int) error {
	// We cannot use close_range(2) even if it is available, because we must
	// not close some file descriptors.
	return fdRangeFrom(minFd, func(fd int) {
		if runtime_IsPollDescriptor(uintptr(fd)) {
			// These are the Go runtimes internal netpoll file descriptors.
			// These file descriptors are operated on deep in the Go scheduler,
			// and closing those files from underneath Go can result in panics.
			// There is no issue with keeping them because they are not
			// executable and are not useful to an attacker anyway. Also we
			// don't have any choice.
			return
		}
		// There's nothing we can do about errors from close(2), and the
		// only likely error to be seen is EBADF which indicates the fd was
		// already closed (in which case, we got what we wanted).
		_ = unix.Close(fd)
	})
}

// NewSockPair returns a new SOCK_STREAM unix socket pair.
func NewSockPair(name string) (parent, child *os.File, err error) {
	fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
	if err != nil {
		return nil, nil, err
	}
	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}

// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {

	// Remove the root then forcefully resolve inside the root.
	unsafePath = StripRoot(root, unsafePath)
	path, err := securejoin.SecureJoin(root, unsafePath)
	if err != nil {
		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
	}

	procSelfFd, closer := ProcThreadSelf("fd/")
	defer closer()

	// Open the target path.
	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
	if err != nil {
		return fmt.Errorf("open o_path procfd: %w", err)
	}
	defer fh.Close()

	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))

	// Double-check the path is the one we expected.
	realpath, err := os.Readlink(procfd)
	if err != nil {
		return fmt.Errorf("procfd verification failed: %w", err)
	}

	// The realPath has the absolute path; if root is cwd, then we need the relative path
	if root == "." {
		rootAbs, err := os.Readlink("/proc/self/cwd")
		if err != nil {
			return err
		}
		if !strings.HasSuffix(rootAbs, "/") {
			rootAbs = rootAbs + "/"
		}
		realpath = strings.TrimPrefix(realpath, rootAbs)
	}

	// Verify the path string and /proc/self/fd match
	if realpath != path {
		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
	}

	return fn(procfd)
}

// ProcThreadSelfCloser must be invoked (exactly once) by callers of
// ProcThreadSelf when they are done with the returned path; it unlocks the
// goroutine from its OS thread.
type ProcThreadSelfCloser func()

// Cached result of the /proc/thread-self existence probe (kernel >= 3.17).
var (
	haveProcThreadSelf     bool
	haveProcThreadSelfOnce sync.Once
)

// ProcThreadSelf returns a string that is equivalent to
// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
// meaning that the passed string needs to be trusted. The caller _must_ call
// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
// *only once* after it has finished using the returned path string.
func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
	haveProcThreadSelfOnce.Do(func() {
		if _, err := os.Stat("/proc/thread-self/"); err == nil {
			haveProcThreadSelf = true
		} else {
			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/", err)
		}
	})

	// We need to lock our thread until the caller is done with the path string
	// because any non-atomic operation on the path (such as opening a file,
	// then reading it) could be interrupted by the Go runtime where the
	// underlying thread is swapped out and the original thread is killed,
	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
	// addition, the pre-3.17 fallback makes everything non-atomic because the
	// same thing could happen between unix.Gettid() and the path operations.
	//
	// In theory, we don't need to lock in the atomic user case when using
	// /proc/thread-self/, but it's better to be safe than sorry (and there are
	// only one or two truly atomic users of /proc/thread-self/).
	runtime.LockOSThread()

	threadSelf := "/proc/thread-self/"
	if !haveProcThreadSelf {
		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
		if _, err := os.Stat(threadSelf); err != nil {
			// Unfortunately, this code is called from rootfs_linux.go where we
			// are running inside the pid namespace of the container but /proc
			// is the host's procfs. Unfortunately there is no real way to get
			// the correct tid to use here (the kernel age means we cannot do
			// things like set up a private fsopen("proc") -- even scanning
			// NSpid in all of the tasks in /proc/self/task/*/status requires
			// Linux 4.1).
			//
			// So, we just have to assume that /proc/self is acceptable in this
			// one specific case.
			if os.Getpid() == 1 {
				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
			} else {
				// This should never happen, but the fallback should work in most cases...
				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
			}
			threadSelf = "/proc/self/"
		}
	}
	return threadSelf + subpath, runtime.UnlockOSThread
}

// ProcThreadSelfFd is small wrapper around ProcThreadSelf to make it easier to
// create a /proc/thread-self handle for given file descriptor.
//
// It is basically equivalent to ProcThreadSelf(fmt.Sprintf("fd/%d", fd)), but
// without using fmt.Sprintf to avoid unneeded overhead.
func ProcThreadSelfFd(fd uintptr) (string, ProcThreadSelfCloser) {
	return ProcThreadSelf("fd/" + strconv.FormatUint(uint64(fd), 10))
}

//
// Copyright 2019-2020 Nestybox, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+// + +// Exposes functions for sysbox-runc to interact with sysbox-fs + +package sysbox + +import ( + "fmt" + "time" + + "github.com/nestybox/sysbox-ipc/sysboxFsGrpc" + unixIpc "github.com/nestybox/sysbox-ipc/unix" + specs "github.com/opencontainers/runtime-spec/specs-go" +) + +// FsRegInfo contains info about a sys container registered with sysbox-fs +type FsRegInfo struct { + Hostname string + Pid int + Uid int + Gid int + IdSize int + ProcRoPaths []string + ProcMaskPaths []string +} + +type Fs struct { + Active bool + Id string // container-id + PreReg bool // indicates if the container was pre-registered with sysbox-fs + Reg bool // indicates if sys container was registered with sysbox-fs + Mountpoint string // sysbox-fs FUSE mountpoint +} + +func NewFs(id string, enable bool) *Fs { + return &Fs{ + Active: enable, + Id: id, + } +} + +func (fs *Fs) Enabled() bool { + return fs.Active +} + +func (fs *Fs) GetConfig() error { + + mp, err := sysboxFsGrpc.GetMountpoint() + if err != nil { + return fmt.Errorf("failed to get config from sysbox-fs: %v", err) + } + + fs.Mountpoint = mp + return nil +} + +// Pre-registers container with sysbox-fs. +func (fs *Fs) PreRegister(linuxNamespaces []specs.LinuxNamespace) error { + if fs.PreReg { + return fmt.Errorf("container %v already pre-registered", fs.Id) + } + + data := &sysboxFsGrpc.ContainerData{ + Id: fs.Id, + } + + // If the new container is entering an existing net-ns, pass the ns info to + // sysbox-fs; containers which share the same net-ns see a common view of + // the resources emulated by sysbox-fs (e.g., as in Kubernetes pods or + // "docker run --net=container: some-image"). 
+ for _, ns := range linuxNamespaces { + if ns.Type == specs.NetworkNamespace && ns.Path != "" { + data.Netns = ns.Path + } + } + + if err := sysboxFsGrpc.SendContainerPreRegistration(data); err != nil { + return fmt.Errorf("failed to pre-register with sysbox-fs: %v", err) + } + + fs.PreReg = true + + return nil +} + +// Registers container with sysbox-fs. +func (fs *Fs) Register(info *FsRegInfo) error { + + if !fs.PreReg { + return fmt.Errorf("container %v was not pre-registered", fs.Id) + } + + if fs.Reg { + return fmt.Errorf("container %v already registered", fs.Id) + } + + data := &sysboxFsGrpc.ContainerData{ + Id: fs.Id, + InitPid: int32(info.Pid), + Hostname: info.Hostname, + UidFirst: int32(info.Uid), + UidSize: int32(info.IdSize), + GidFirst: int32(info.Gid), + GidSize: int32(info.IdSize), + ProcRoPaths: info.ProcRoPaths, + ProcMaskPaths: info.ProcMaskPaths, + } + + if err := sysboxFsGrpc.SendContainerRegistration(data); err != nil { + return fmt.Errorf("failed to register with sysbox-fs: %v", err) + } + + fs.Reg = true + + return nil +} + +// Sends container creation time to sysbox-fs +func (fs *Fs) SendCreationTime(t time.Time) error { + if !fs.Reg { + return fmt.Errorf("must register container %v before", fs.Id) + } + data := &sysboxFsGrpc.ContainerData{ + Id: fs.Id, + Ctime: t, + } + if err := sysboxFsGrpc.SendContainerUpdate(data); err != nil { + return fmt.Errorf("failed to send creation time to sysbox-fs: %v", err) + } + return nil +} + +// Sends the seccomp-notification fd to sysbox-fs (tracer) to setup syscall +// trapping and waits for its response (ack). +func (fs *Fs) SendSeccompInit(pid int, id string, seccompFd int32) error { + + // TODO: Think about a better location for this one. 
+ const seccompTracerSockAddr = "/run/sysbox/sysfs-seccomp.sock" + + conn, err := unixIpc.Connect(seccompTracerSockAddr) + if err != nil { + return fmt.Errorf("Unable to establish connection with seccomp-tracer: %v\n", err) + } + + if err = unixIpc.SendSeccompInitMsg(conn, int32(pid), id, seccompFd); err != nil { + return fmt.Errorf("Unable to send message to seccomp-tracer: %v\n", err) + } + + if err = unixIpc.RecvSeccompInitAckMsg(conn); err != nil { + return fmt.Errorf("Unable to receive expected seccomp-notif-ack message: %v\n", err) + } + + return nil +} + +// Unregisters the container with sysbox-fs +func (fs *Fs) Unregister() error { + if fs.PreReg || fs.Reg { + data := &sysboxFsGrpc.ContainerData{ + Id: fs.Id, + } + if err := sysboxFsGrpc.SendContainerUnregistration(data); err != nil { + return fmt.Errorf("failed to unregister with sysbox-fs: %v", err) + } + fs.PreReg = false + fs.Reg = false + } + return nil +} diff --git a/sysbox-runc/libsysbox/sysbox/mgr.go b/sysbox-runc/libsysbox/sysbox/mgr.go new file mode 100644 index 00000000..4716780f --- /dev/null +++ b/sysbox-runc/libsysbox/sysbox/mgr.go @@ -0,0 +1,215 @@ +// +// Copyright 2019-2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
//

// Exposes functions for sysbox-runc to interact with sysbox-mgr

package sysbox

import (
	"fmt"
	"path/filepath"

	"github.com/nestybox/sysbox-ipc/sysboxMgrGrpc"
	ipcLib "github.com/nestybox/sysbox-ipc/sysboxMgrLib"
	sh "github.com/nestybox/sysbox-libs/idShiftUtils"
	"github.com/nestybox/sysbox-libs/shiftfs"
	"github.com/opencontainers/runc/libcontainer/configs"
	specs "github.com/opencontainers/runtime-spec/specs-go"
)

// Mgr tracks the registration state and sysbox-mgr-mandated configuration of
// one container.
type Mgr struct {
	Active       bool
	Id           string                  // container-id
	Config       *ipcLib.ContainerConfig // sysbox-mgr mandated container config
	clonedRootfs string                  // path to cloned rootfs (used when rootfs needs chown but it's on ovfs without metacopy=on)

}

// NewMgr returns a new Mgr handle for the given container id; enable
// indicates whether sysbox-mgr interaction is active for this container.
func NewMgr(id string, enable bool) *Mgr {
	return &Mgr{
		Active: enable,
		Id:     id,
		Config: &ipcLib.ContainerConfig{
			// Configs that default to true in sysbox-mgr; we set them here for
			// sysbox-runc unit tests where sysbox-mgr is not present.
			AliasDns:          true,
			AllowTrustedXattr: true,
			SyscontMode:       true,
		},
	}
}

// Enabled reports whether sysbox-mgr interaction is active.
func (mgr *Mgr) Enabled() bool {
	return mgr.Active
}

// Registers the container with sysbox-mgr. If successful, stores the
// sysbox configuration tokens for sysbox-runc in mgr.Config
func (mgr *Mgr) Register(spec *specs.Spec) error {
	var userns string
	var netns string

	rootfs, err := filepath.Abs(spec.Root.Path)
	if err != nil {
		return err
	}

	// If the container enters existing user/net namespaces, pass their paths
	// along so sysbox-mgr can account for the sharing.
	for _, ns := range spec.Linux.Namespaces {
		if ns.Type == specs.UserNamespace && ns.Path != "" {
			userns = ns.Path
		}
		if ns.Type == specs.NetworkNamespace && ns.Path != "" {
			netns = ns.Path
		}
	}

	regInfo := &ipcLib.RegistrationInfo{
		Id:          mgr.Id,
		Rootfs:      rootfs,
		Userns:      userns,
		Netns:       netns,
		UidMappings: spec.Linux.UIDMappings,
		GidMappings: spec.Linux.GIDMappings,
	}

	config, err := sysboxMgrGrpc.Register(regInfo)
	if err != nil {
		return fmt.Errorf("failed to register with sysbox-mgr: %v", err)
	}

	mgr.Config = config

	return nil
}

// Update sends updated namespace, ID-mapping, and rootfs uid-shift info for
// this container to sysbox-mgr.
func (mgr *Mgr) Update(userns, netns string,
	uidMappings, gidMappings []specs.LinuxIDMapping,
	rootfsUidShiftType sh.IDShiftType) error {

	updateInfo := &ipcLib.UpdateInfo{
		Id:                 mgr.Id,
		Userns:             userns,
		Netns:              netns,
		UidMappings:        uidMappings,
		GidMappings:        gidMappings,
		RootfsUidShiftType: rootfsUidShiftType,
	}

	if err := sysboxMgrGrpc.Update(updateInfo); err != nil {
		return fmt.Errorf("failed to update container info with sysbox-mgr: %v", err)
	}
	return nil
}

// Unregisters the container with sysbox-mgr.
func (mgr *Mgr) Unregister() error {
	if err := sysboxMgrGrpc.Unregister(mgr.Id); err != nil {
		return fmt.Errorf("failed to unregister with sysbox-mgr: %v", err)
	}
	return nil
}

// ReqSubid requests sysbox-mgr to allocate uid & gids for the container user-ns.
func (mgr *Mgr) ReqSubid(size uint32) (uint32, uint32, error) {
	uid, gid, err := sysboxMgrGrpc.SubidAlloc(mgr.Id, uint64(size))
	if err != nil {
		return 0, 0, fmt.Errorf("failed to request subid from sysbox-mgr: %v", err)
	}
	return uid, gid, nil
}

// PrepMounts sends a request to sysbox-mgr to prepare the given container mounts; all paths must be absolute.
func (mgr *Mgr) PrepMounts(uid, gid uint32, prepList []ipcLib.MountPrepInfo) error {
	if err := sysboxMgrGrpc.PrepMounts(mgr.Id, uid, gid, prepList); err != nil {
		return fmt.Errorf("failed to request mount source preps from sysbox-mgr: %v", err)
	}
	return nil
}

// ReqMounts sends a request to sysbox-mgr for container mounts; all paths must be absolute.
func (mgr *Mgr) ReqMounts(rootfsUidShiftType sh.IDShiftType, reqList []ipcLib.MountReqInfo) ([]specs.Mount, error) {
	mounts, err := sysboxMgrGrpc.ReqMounts(mgr.Id, rootfsUidShiftType, reqList)
	if err != nil {
		return nil, fmt.Errorf("failed to request mounts from sysbox-mgr: %v", err)
	}
	return mounts, nil
}

// ReqShiftfsMark sends a request to sysbox-mgr to mark shiftfs on the given dirs; all paths must be absolute.
func (mgr *Mgr) ReqShiftfsMark(mounts []shiftfs.MountPoint) ([]shiftfs.MountPoint, error) {
	resp, err := sysboxMgrGrpc.ReqShiftfsMark(mgr.Id, mounts)
	if err != nil {
		return nil, fmt.Errorf("failed to request shiftfs marking to sysbox-mgr: %v", err)
	}
	return resp, nil
}

// ReqFsState sends a request to sysbox-mgr for container's rootfs state.
func (mgr *Mgr) ReqFsState(rootfs string) ([]configs.FsEntry, error) {
	state, err := sysboxMgrGrpc.ReqFsState(mgr.Id, rootfs)
	if err != nil {
		return nil, fmt.Errorf("failed to request fsState from sysbox-mgr: %v", err)
	}

	return state, nil
}

// Pause notifies sysbox-mgr that the container is being paused.
func (mgr *Mgr) Pause() error {
	if err := sysboxMgrGrpc.Pause(mgr.Id); err != nil {
		return fmt.Errorf("failed to notify pause to sysbox-mgr: %v", err)
	}
	return nil
}

// Resume notifies sysbox-mgr that the container is being resumed.
func (mgr *Mgr) Resume() error {
	if err := sysboxMgrGrpc.Resume(mgr.Id); err != nil {
		return fmt.Errorf("failed to notify resume to sysbox-mgr: %v", err)
	}
	return nil
}

// CloneRootfs sends a request to sysbox-mgr to setup an alternate rootfs for the container.
// It returns the path to the new rootfs.
func (mgr *Mgr) CloneRootfs() (string, error) {

	newRootfs, err := sysboxMgrGrpc.ReqCloneRootfs(mgr.Id)
	if err != nil {
		return "", fmt.Errorf("failed to request rootfs cloning from sysbox-mgr: %v", err)
	}

	mgr.clonedRootfs = newRootfs
	return newRootfs, nil
}

// Sends a requests to sysbox-mgr to chown a cloned rootfs, using the
// given uid and gid offsets. Must call after CloneRootfs().
func (mgr *Mgr) ChownClonedRootfs(uidOffset, gidOffset int32) error {
	return sysboxMgrGrpc.ChownClonedRootfs(mgr.Id, uidOffset, gidOffset)
}

// Sends a requests to sysbox-mgr to revert the chown of a cloned rootfs.
// Must call after ChownClonedRootfs().
func (mgr *Mgr) RevertClonedRootfsChown() error {
	return sysboxMgrGrpc.RevertClonedRootfsChown(mgr.Id)
}

// IsRootfsCloned reports whether CloneRootfs() has set up an alternate rootfs.
func (mgr *Mgr) IsRootfsCloned() bool {
	return mgr.clonedRootfs != ""
}

// GetClonedRootfs returns the cloned rootfs path ("" if none).
func (mgr *Mgr) GetClonedRootfs() string {
	return mgr.clonedRootfs
}

//
// Copyright 2019-2020 Nestybox, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package sysbox

import (
	"fmt"
	"os"
	"strconv"
	"strings"
	"syscall"

	sh "github.com/nestybox/sysbox-libs/idShiftUtils"
	linuxUtils "github.com/nestybox/sysbox-libs/linuxUtils"
	libutils "github.com/nestybox/sysbox-libs/utils"
	specs "github.com/opencontainers/runtime-spec/specs-go"
	"github.com/urfave/cli"
)

// Holds sysbox-specific config
type Sysbox struct {
	Id                  string
	Mgr                 *Mgr
	Fs                  *Fs
	RootfsUidShiftType  sh.IDShiftType
	BindMntUidShiftType sh.IDShiftType
	RootfsCloned        bool
	SwitchDockerDns     bool
	OrigRootfs          string
	OrigMounts          []specs.Mount
	IDshiftIgnoreList   []string
}

// NewSysbox returns a Sysbox config for the given container id; withMgr and
// withFs enable interaction with sysbox-mgr and sysbox-fs respectively.
func NewSysbox(id string, withMgr, withFs bool) *Sysbox {

	sysMgr := NewMgr(id, withMgr)
	sysFs := NewFs(id, withFs)

	return &Sysbox{
		Id:  id,
		Mgr: sysMgr,
		Fs:  sysFs,
	}
}

// checkKernelVersion verifies the host kernel meets the minimum release
// required by sysbox for the given distro (Ubuntu has a lower minimum).
func checkKernelVersion(distro string) error {
	var (
		reqMaj, reqMin int
		major, minor   int
	)

	rel, err := linuxUtils.GetKernelRelease()
	if err != nil {
		return err
	}

	major, minor, err = linuxUtils.ParseKernelRelease(rel)
	if err != nil {
		return err
	}

	if distro == "ubuntu" {
		reqMaj = minKernelUbuntu.major
		reqMin = minKernelUbuntu.minor
	} else {
		reqMaj = minKernel.major
		reqMin = minKernel.minor
	}

	supported := false
	if major > reqMaj {
		supported = true
	} else if major == reqMaj {
		if minor >= reqMin {
			supported = true
		}
	}

	if !supported {
		s := []string{strconv.Itoa(reqMaj), strconv.Itoa(reqMin)}
		kver := strings.Join(s, ".")
		return fmt.Errorf("%s kernel release %v is not supported; need >= %v", distro, rel, kver)
	}

	return nil
}

// needUidShiftOnRootfs checks if uid/gid shifting is required on the container's rootfs.
func needUidShiftOnRootfs(spec *specs.Spec) (bool, error) {
	var hostUidMap, hostGidMap uint32

	// the uid map is assumed to be present
	for _, mapping := range spec.Linux.UIDMappings {
		if mapping.ContainerID == 0 {
			hostUidMap = mapping.HostID
			break
		}
	}

	// the gid map is assumed to be present
	for _, mapping := range spec.Linux.GIDMappings {
		if mapping.ContainerID == 0 {
			hostGidMap = mapping.HostID
			break
		}
	}

	// find the rootfs owner
	rootfs := spec.Root.Path

	fi, err := os.Stat(rootfs)
	if err != nil {
		return false, err
	}

	st, ok := fi.Sys().(*syscall.Stat_t)
	if !ok {
		return false, fmt.Errorf("failed to convert to syscall.Stat_t")
	}

	rootfsUid := st.Uid
	rootfsGid := st.Gid

	// Use shifting when the rootfs is owned by true root and the containers uid/gid root
	// mapping don't match the container's rootfs owner.
	if rootfsUid == 0 && rootfsGid == 0 &&
		hostUidMap != rootfsUid && hostGidMap != rootfsGid {
		return true, nil
	}

	return false, nil
}

// CheckUidShifting returns the type of UID shifting needed (if any) for the
// container. The first return value indicates the type of UID shifting to be
// used for the container's rootfs, while the second indicates the type of UID
// shifting for container bind-mounts.
func CheckUidShifting(sysMgr *Mgr, spec *specs.Spec) (sh.IDShiftType, sh.IDShiftType, error) {

	shiftfsOk := sysMgr.Config.ShiftfsOk
	shiftfsOnOvfsOk := sysMgr.Config.ShiftfsOnOverlayfsOk

	idMapMountOk := sysMgr.Config.IDMapMountOk
	ovfsOnIDMapMountOk := sysMgr.Config.OverlayfsOnIDMapMountOk

	rootfsShiftType := sysMgr.Config.RootfsUidShiftType

	if rootfsShiftType == sh.NoShift {

		useShiftfsOnRootfs := false
		useIDMapMountOnRootfs := false

		rootPathFs, err := libutils.GetFsName(spec.Root.Path)
		if err != nil {
			return sh.NoShift, sh.NoShift, err
		}

		if idMapMountOk {
			if rootPathFs == "overlayfs" && ovfsOnIDMapMountOk {
				useIDMapMountOnRootfs = true
			}
		}

		if shiftfsOk {
			if rootPathFs == "overlayfs" && shiftfsOnOvfsOk {
				useShiftfsOnRootfs = true
			}
		}

		needShiftOnRootfs, err := needUidShiftOnRootfs(spec)
		if err != nil {
			return sh.NoShift, sh.NoShift, fmt.Errorf("failed to check uid-shifting requirement on rootfs: %s", err)
		}

		// Check uid shifting type to be used for the container's rootfs.
		//
		// We do it via ID-mapping (preferably) or via shiftfs (if available on
		// the host) or by chown'ing the rootfs hierarchy. Chowning is the least
		// preferred and slowest approach, but won't disrupt anything on the host
		// since the container's rootfs is dedicated to the container (no other
		// entity in the system will use it while the container is running).
		if needShiftOnRootfs {
			if useIDMapMountOnRootfs {
				rootfsShiftType = sh.IDMappedMount
			} else if useShiftfsOnRootfs {
				rootfsShiftType = sh.Shiftfs
			} else {
				rootfsShiftType = sh.Chown
			}
		}
	}

	// Check uid shifting type to be used for the container's bind mounts.
	//
	// For bind mounts, we use ID-mapping or shiftfs, but never chown. Chowning
	// for bind mounts is not a good idea since we don't know what's being bind
	// mounted (e.g., the bind mount could be a user's home dir, a critical
	// system file, etc.).
	bindMountShiftType := sh.NoShift

	if idMapMountOk && shiftfsOk {
		bindMountShiftType = sh.IDMappedMountOrShiftfs
	} else if idMapMountOk {
		bindMountShiftType = sh.IDMappedMount
	} else if shiftfsOk {
		bindMountShiftType = sh.Shiftfs
	}

	return rootfsShiftType, bindMountShiftType, nil
}

// CheckHostConfig checks if the host is configured appropriately to run a
// container with sysbox
func CheckHostConfig(context *cli.Context, spec *specs.Spec) error {

	distro, err := linuxUtils.GetDistro()
	if err != nil {
		return err
	}

	if !context.GlobalBool("no-kernel-check") {
		if err := checkKernelVersion(distro); err != nil {
			return fmt.Errorf("kernel version check failed: %v", err)
		}
	}

	if err := checkUnprivilegedUserns(); err != nil {
		return err
	}

	return nil
}

//
// Copyright 2019-2020 Nestybox, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package sysbox

import (
	"fmt"
	"os"
	"unsafe"
)

// The min supported kernel release is chosen based on whether it contains all kernel
// fixes required to run Sysbox. Refer to the Sysbox distro compatibility doc.
+type kernelRelease struct{ major, minor int } + +var minKernel = kernelRelease{5, 5} // 5.5 +var minKernelUbuntu = kernelRelease{5, 0} // 5.0 + +func readFileInt(path string) (int, error) { + + f, err := os.Open(path) + if err != nil { + return -1, err + } + defer f.Close() + + var b []byte = make([]byte, unsafe.Sizeof(int(0))) + _, err = f.Read(b) + if err != nil { + return -1, err + } + + var val int + _, err = fmt.Sscanf(string(b), "%d", &val) + if err != nil { + return -1, err + } + + return val, nil +} + +// checks if the kernel is configured to allow unprivileged users to create +// namespaces. This is necessary for running containers inside a system +// container. +func checkUnprivilegedUserns() error { + + // In Debian-based distros, unprivileged userns creation is enabled via + // "/proc/sys/kernel/unprivileged_userns_clone". In Fedora (and related) + // distros this sysctl does not exist. Rather, unprivileged userns creation + // is enabled by setting a non-zero value in "/proc/sys/user/max_user_namespaces". + // Here we check both. + + path := "/proc/sys/kernel/unprivileged_userns_clone" + if _, err := os.Stat(path); err == nil { + + val, err := readFileInt(path) + if err != nil { + return err + } + + if val != 1 { + return fmt.Errorf("kernel is not configured to allow unprivileged users to create namespaces: %s: want 1, have %d", + path, val) + } + } + + path = "/proc/sys/user/max_user_namespaces" + + val, err := readFileInt(path) + if err != nil { + return err + } + + if val == 0 { + return fmt.Errorf("kernel is not configured to allow unprivileged users to create namespaces: %s: want >= 1, have %d", + path, val) + } + + return nil +} diff --git a/sysbox-runc/libsysbox/syscont/example.go b/sysbox-runc/libsysbox/syscont/example.go new file mode 100644 index 00000000..5e533c6f --- /dev/null +++ b/sysbox-runc/libsysbox/syscont/example.go @@ -0,0 +1,152 @@ +// +// Copyright 2019-2020 Nestybox, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +package syscont + +import ( + "github.com/opencontainers/runtime-spec/specs-go" +) + +// Example returns an example OCI spec file for a system container +func Example() (*specs.Spec, error) { + + sysboxCaps, err := getSysboxEffCaps() + if err != nil { + return nil, err + } + + return &specs.Spec{ + Version: specs.Version, + Root: &specs.Root{ + Path: "rootfs", + }, + Hostname: "syscont", + Process: &specs.Process{ + Terminal: true, + User: specs.User{ + UID: 0, + GID: 0, + }, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: sysboxCaps, + Permitted: sysboxCaps, + Inheritable: sysboxCaps, + Ambient: sysboxCaps, + Effective: sysboxCaps, + }, + Rlimits: []specs.POSIXRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: 
"shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime"}, + }, + }, + Linux: &specs.Linux{ + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + { + Type: "cgroup", + }, + }, + MaskedPaths: []string{ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi", + }, + ReadonlyPaths: []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + CgroupsPath: "", + }, + }, nil +} diff --git a/sysbox-runc/libsysbox/syscont/spec.go b/sysbox-runc/libsysbox/syscont/spec.go new file mode 100644 index 00000000..5bf28363 --- /dev/null +++ b/sysbox-runc/libsysbox/syscont/spec.go @@ -0,0 +1,1390 @@ +// +// Copyright 2019-2022 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +//go:build linux +// +build linux + +package syscont + +import ( + "bufio" + "fmt" + "os" + "path/filepath" + "regexp" + "strings" + + mapset "github.com/deckarep/golang-set/v2" + ipcLib "github.com/nestybox/sysbox-ipc/sysboxMgrLib" + "github.com/nestybox/sysbox-libs/capability" + sh "github.com/nestybox/sysbox-libs/idShiftUtils" + utils "github.com/nestybox/sysbox-libs/utils" + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + "golang.org/x/sys/unix" +) + +// Exported +const ( + IdRangeMin uint32 = 65536 +) + +// Internal +const ( + defaultUid uint32 = 231072 + defaultGid uint32 = 231072 +) + +var ( + SysboxFsDir string = "/var/lib/sysboxfs" +) + +// System container "must-have" mounts +var syscontMounts = []specs.Mount{ + specs.Mount{ + Destination: "/sys", + Source: "sysfs", + Type: "sysfs", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + specs.Mount{ + Destination: "/sys/fs/cgroup", + Source: "cgroup", + Type: "cgroup", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + specs.Mount{ + Destination: "/proc", + Source: "proc", + Type: "proc", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + specs.Mount{ + Destination: "/dev", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + //we don't yet support /dev/kmsg; create a dummy one + specs.Mount{ + Destination: "/dev/kmsg", + Source: "/dev/null", + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, +} + +// Container mounts virtualized by sysbox-fs +// +// TODO: in the future get these from sysbox-fs via grpc +var sysboxFsMounts = []specs.Mount{ + // + // procfs mounts + // + specs.Mount{ + Destination: "/proc/sys", + Source: filepath.Join(SysboxFsDir, "proc/sys"), + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + specs.Mount{ + Destination: "/proc/swaps", + Source: 
filepath.Join(SysboxFsDir, "proc/swaps"), + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + specs.Mount{ + Destination: "/proc/uptime", + Source: filepath.Join(SysboxFsDir, "proc/uptime"), + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + + // XXX: In the future sysbox-fs will also virtualize the following + + // specs.Mount{ + // Destination: "/proc/cpuinfo", + // Source: filepath.Join(SysboxFsDir, "proc/cpuinfo"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/cgroups", + // Source: filepath.Join(SysboxFsDir, "proc/cgroups"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/devices", + // Source: filepath.Join(SysboxFsDir, "proc/devices"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/diskstats", + // Source: filepath.Join(SysboxFsDir, "proc/diskstats"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/loadavg", + // Source: filepath.Join(SysboxFsDir, "proc/loadavg"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/meminfo", + // Source: filepath.Join(SysboxFsDir, "proc/meminfo"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/pagetypeinfo", + // Source: filepath.Join(SysboxFsDir, "proc/pagetypeinfo"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/partitions", + // Source: filepath.Join(SysboxFsDir, "proc/partitions"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + // specs.Mount{ + // Destination: "/proc/stat", + // Source: filepath.Join(SysboxFsDir, "proc/stat"), + // Type: "bind", + // Options: []string{"rbind", "rprivate"}, + // }, + + // 
+ // sysfs mounts + // + specs.Mount{ + Destination: "/sys/kernel", + Source: filepath.Join(SysboxFsDir, "sys/kernel"), + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + specs.Mount{ + Destination: "/sys/devices/virtual", + Source: filepath.Join(SysboxFsDir, "sys/devices/virtual"), + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + specs.Mount{ + Destination: "/sys/module/nf_conntrack/parameters", + Source: filepath.Join(SysboxFsDir, "sys/module/nf_conntrack/parameters"), + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, +} + +// sys container systemd mount requirements +var syscontSystemdMounts = []specs.Mount{ + specs.Mount{ + Destination: "/run", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + specs.Mount{ + Destination: "/run/lock", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "size=4m"}, + }, +} + +// sys container systemd env-vars requirements +var syscontSystemdEnvVars = []string{ + + // Allow systemd to identify the virtualization mode to operate on (container + // with user-namespace). See 'ConditionVirtualization' attribute here: + // https://www.freedesktop.org/software/systemd/man/systemd.unit.html + "container=private-users", +} + +// List of file-system paths within a sys container that must be read-write to satisfy +// system-level apps' requirements. This slide is useful in scenarios where the sys +// container is running as "read-only" as per the corresponding oci-spec attribute. +// In this mode, the sys container's rootfs is mounted read-only, which prevents RW +// operations to tmpfs paths like '/run' and '/tmp'. This slide helps us override the +// read-only attribute for these paths. 
+var syscontMustBeRwMounts = []specs.Mount{ + { + Destination: "/run", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/tmp", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, +} + +// syscontRwPaths list the paths within the sys container's rootfs +// that must have read-write permission +var syscontRwPaths = []string{ + "/proc", + "/proc/sys", +} + +// syscontExposedPaths list the paths within the sys container's rootfs +// that must not be masked +var syscontExposedPaths = []string{ + "/proc", + "/proc/sys", + + // Some apps need these to be exposed (or more accurately need them to not be masked + // via a bind-mount from /dev/null, as described in sysbox issue #511). It's not a + // security concern to expose these in sys containers, as they are either not accessible + // or don't provide meaningful info (due to the sys container's user-ns). + "/proc/kcore", + "/proc/kallsyms", + "/proc/kmsg", +} + +// syscontSystemdExposedPaths list the paths within the sys container's rootfs +// that must not be masked when the sys container runs systemd +var syscontSystemdExposedPaths = []string{ + "/run", + "/run/lock", + "/tmp", + "/sys/kernel", +} + +// syscontRwPaths list the paths within the sys container's rootfs +// that must have read-write permission +var syscontSystemdRwPaths = []string{ + "/run", + "/run/lock", + "/tmp", + "/sys/kernel", +} + +// cfgNamespaces checks that the namespace config has the minimum set +// of namespaces required and adds any missing namespaces to it +func cfgNamespaces(sysMgr *sysbox.Mgr, spec *specs.Spec) error { + + // user-ns and cgroup-ns are not required per the OCI spec, but we will add + // them to the system container spec. 
+ var allNs = []string{"pid", "ipc", "uts", "mount", "network", "user", "cgroup"} + var reqNs = []string{"pid", "ipc", "uts", "mount", "network"} + + allNsSet := mapset.NewSet[string]() + for _, ns := range allNs { + allNsSet.Add(ns) + } + + reqNsSet := mapset.NewSet[string]() + for _, ns := range reqNs { + reqNsSet.Add(ns) + } + + specNsSet := mapset.NewSet[string]() + for _, ns := range spec.Linux.Namespaces { + specNsSet.Add(string(ns.Type)) + } + + if !reqNsSet.IsSubset(specNsSet) { + return fmt.Errorf("sysbox containers can't share namespaces %v with the host (because they use the linux user-namespace for isolation)", reqNsSet.Difference(specNsSet).ToSlice()) + } + + addNsSet := allNsSet.Difference(specNsSet) + for ns := range addNsSet.Iter() { + str := fmt.Sprintf("%v", ns) + newns := specs.LinuxNamespace{ + Type: specs.LinuxNamespaceType(str), + Path: "", + } + spec.Linux.Namespaces = append(spec.Linux.Namespaces, newns) + logrus.Debugf("added namespace %s to spec", ns) + } + + // Check if we have a sysbox-mgr override for the container's user-ns + if sysMgr.Enabled() { + if sysMgr.Config.Userns != "" { + updatedNs := []specs.LinuxNamespace{} + + for _, ns := range spec.Linux.Namespaces { + if ns.Type == specs.UserNamespace { + ns.Path = sysMgr.Config.Userns + } + updatedNs = append(updatedNs, ns) + } + + spec.Linux.Namespaces = updatedNs + } + } + + return nil +} + +// allocIDMappings performs uid and gid allocation for the system container +func allocIDMappings(sysMgr *sysbox.Mgr, spec *specs.Spec) error { + var uid, gid uint32 + var err error + + if sysMgr.Enabled() { + uid, gid, err = sysMgr.ReqSubid(IdRangeMin) + if err != nil { + return fmt.Errorf("subid allocation failed: %v", err) + } + } else { + uid = defaultUid + gid = defaultGid + } + + uidMap := specs.LinuxIDMapping{ + ContainerID: 0, + HostID: uid, + Size: IdRangeMin, + } + + gidMap := specs.LinuxIDMapping{ + ContainerID: 0, + HostID: gid, + Size: IdRangeMin, + } + + spec.Linux.UIDMappings = 
append(spec.Linux.UIDMappings, uidMap) + spec.Linux.GIDMappings = append(spec.Linux.GIDMappings, gidMap) + + return nil +} + +// validateIDMappings checks if the spec's user namespace uid and gid mappings meet +// sysbox-runc requirements. +func validateIDMappings(spec *specs.Spec) error { + var err error + + if len(spec.Linux.UIDMappings) == 0 || len(spec.Linux.GIDMappings) == 0 { + return fmt.Errorf("detected missing user-ns UID and/or GID mappings") + } + + // Sysbox requires that the container uid & gid mappings map a continuous + // range of container IDs to host IDs. This is a requirement implicitly + // imposed by Sysbox's usage of shiftfs. The call to mergeIDmappings ensures + // this is the case and returns a single ID mapping range in case the + // container's spec gave us a continuous mapping in multiple continuous + // sub-ranges. + + spec.Linux.UIDMappings, err = mergeIDMappings(spec.Linux.UIDMappings) + if err != nil { + return err + } + + spec.Linux.GIDMappings, err = mergeIDMappings(spec.Linux.GIDMappings) + if err != nil { + return err + } + + uidMap := spec.Linux.UIDMappings[0] + gidMap := spec.Linux.GIDMappings[0] + + if uidMap.ContainerID != 0 || uidMap.Size < IdRangeMin { + return fmt.Errorf("uid mapping range must specify a container with at least %d uids starting at uid 0; found %v", + IdRangeMin, uidMap) + } + + if gidMap.ContainerID != 0 || gidMap.Size < IdRangeMin { + return fmt.Errorf("gid mapping range must specify a container with at least %d gids starting at gid 0; found %v", + IdRangeMin, gidMap) + } + + if uidMap.HostID != gidMap.HostID { + return fmt.Errorf("detecting non-matching uid & gid mappings; found uid = %v, gid = %d", + uidMap, gidMap) + } + + if uidMap.HostID == 0 { + return fmt.Errorf("detected user-ns uid mapping to host ID 0 (%v); this breaks container isolation", + uidMap) + } + + if gidMap.HostID == 0 { + return fmt.Errorf("detected user-ns gid mapping to host ID 0 (%v); this breaks container isolation", + uidMap) + } 
+ + return nil +} + +// cfgIDMappings checks if the uid/gid mappings are present and valid; if they are not +// present, it allocates them. +func cfgIDMappings(sysMgr *sysbox.Mgr, spec *specs.Spec) error { + + // Honor user-ns uid & gid mapping spec overrides from sysbox-mgr; this occur + // when a container shares the same userns and netns of another container (i.e., + // they must also share the mappings). + if sysMgr.Enabled() { + if len(sysMgr.Config.UidMappings) > 0 { + spec.Linux.UIDMappings = sysMgr.Config.UidMappings + } + if len(sysMgr.Config.GidMappings) > 0 { + spec.Linux.GIDMappings = sysMgr.Config.GidMappings + } + } + + // If no mappings are present, let's allocate some. + if len(spec.Linux.UIDMappings) == 0 && len(spec.Linux.GIDMappings) == 0 { + return allocIDMappings(sysMgr, spec) + } + + return validateIDMappings(spec) +} + +// cfgCapabilities sets the capabilities for the process in the system container +func cfgCapabilities(p *specs.Process) error { + caps := p.Capabilities + uid := p.User.UID + + noCaps := []string{} + + sysboxCaps, err := getSysboxEffCaps() + if err != nil { + return err + } + + if uid == 0 { + // init processes owned by root have all capabilities assigned to Sysbox itself (i.e., all root caps) + caps.Bounding = sysboxCaps + caps.Effective = sysboxCaps + caps.Inheritable = sysboxCaps + caps.Permitted = sysboxCaps + caps.Ambient = sysboxCaps + } else { + // init processes owned by others have all caps disabled and the bounding caps all + // set (just as in a regular host) + caps.Bounding = sysboxCaps + caps.Effective = noCaps + caps.Inheritable = noCaps + caps.Permitted = noCaps + caps.Ambient = noCaps + } + + return nil +} + +// getSysboxEffCaps returns the list of capabilities assigned to sysbox-runc itself. 
+func getSysboxEffCaps() ([]string, error) { + caps, err := capability.NewPid2(0) + if err != nil { + return nil, err + } + err = caps.Load() + if err != nil { + return nil, err + } + + allCapsStr := caps.StringCap(capability.EFFECTIVE, capability.OCI_STRING) + allCapsStr = strings.ReplaceAll(allCapsStr, ", ", ",") + return strings.Split(allCapsStr, ","), nil +} + +// cfgMaskedPaths adds or removes from the container's config any masked paths required by Sysbox. +func cfgMaskedPaths(spec *specs.Spec) { + if systemdInit(spec.Process) { + spec.Linux.MaskedPaths = utils.StringSliceRemove(spec.Linux.MaskedPaths, syscontSystemdExposedPaths) + } + spec.Linux.MaskedPaths = utils.StringSliceRemove(spec.Linux.MaskedPaths, syscontExposedPaths) +} + +// cfgReadonlyPaths removes from the container's config any read-only paths +// that must be read-write in the system container +func cfgReadonlyPaths(spec *specs.Spec) { + if systemdInit(spec.Process) { + spec.Linux.ReadonlyPaths = utils.StringSliceRemove(spec.Linux.ReadonlyPaths, syscontSystemdRwPaths) + } + spec.Linux.ReadonlyPaths = utils.StringSliceRemove(spec.Linux.ReadonlyPaths, syscontRwPaths) +} + +// cfgMounts configures the system container mounts +func cfgMounts(spec *specs.Spec, sysbox *sysbox.Sysbox) error { + + sysMgr := sysbox.Mgr + sysFs := sysbox.Fs + + // We will modify the container's mounts; remember the original ones + sysbox.OrigMounts = spec.Mounts + + if sysMgr.Config.SyscontMode { + cfgSyscontMounts(sysMgr, spec) + } + + if sysFs.Enabled() { + cfgSysboxFsMounts(spec, sysFs) + } + + if sysMgr.Enabled() { + if err := sysMgrSetupMounts(sysbox, spec); err != nil { + return err + } + } + + hasSystemd := false + if systemdInit(spec.Process) && sysMgr.Config.SyscontMode { + hasSystemd = true + cfgSystemdMounts(spec) + } + + sortMounts(spec, hasSystemd) + + return nil +} + +// cfgSyscontMounts adds mounts required by sys containers; if the spec +// has conflicting mounts, these are replaced with the required 
ones. +func cfgSyscontMounts(sysMgr *sysbox.Mgr, spec *specs.Spec) { + + // Disallow mounts under the container's /sys/fs/cgroup/* (i.e., Sysbox sets those up) + var cgroupMounts = []specs.Mount{ + specs.Mount{ + Destination: "/sys/fs/cgroup/", + }, + } + + spec.Mounts = utils.MountSliceRemove(spec.Mounts, cgroupMounts, func(m1, m2 specs.Mount) bool { + return strings.HasPrefix(m1.Destination, m2.Destination) + }) + + // Remove other conflicting mounts + spec.Mounts = utils.MountSliceRemove(spec.Mounts, syscontMounts, func(m1, m2 specs.Mount) bool { + return m1.Destination == m2.Destination + }) + + // Remove devices that conflict with sysbox mounts (e.g., /dev/kmsg) + specDevs := []specs.LinuxDevice{} + for _, dev := range spec.Linux.Devices { + m := specs.Mount{Destination: dev.Path} + if !utils.MountSliceContains(syscontMounts, m, func(m1, m2 specs.Mount) bool { + return m1.Destination == m2.Destination + }) { + specDevs = append(specDevs, dev) + } + } + spec.Linux.Devices = specDevs + + // In read-only scenarios, we want to adjust the syscont mounts to ensure that + // certain container mounts are always mounted as read-write. + if spec.Root.Readonly { + cfgSyscontMountsReadOnly(sysMgr, spec) + return + } + + // Add sysbox's default syscont mounts + spec.Mounts = append(spec.Mounts, syscontMounts...) +} + +// cfgSyscontMountsReadOnly adjusts the RW/RO character of syscont mounts in +// 'read-only' container scenarios. The purpose of this function is to ensure: +// - certain container mounts are always mounted as read-write (i.e., /run and /tmp) +// - /sys mounts are always mounted as read-only unless the sysbox-mgr is configured +// with the "relaxed-readonly" option. +func cfgSyscontMountsReadOnly(sysMgr *sysbox.Mgr, spec *specs.Spec) { + // We want to ensure that certain container mounts are always mounted as + // read-write (i.e., /run and /tmp) unless the user has explicitly marked + // them as read-only in the container spec. 
+ for _, m := range syscontMustBeRwMounts { + var found bool + for _, p := range spec.Mounts { + if p.Destination == m.Destination { + found = true + break + } + } + if !found { + spec.Mounts = append(spec.Mounts, m) + } + } + + var tmpMounts = []specs.Mount{} + + // If sysbox-mgr is configured with the "relaxed-readonly" option, we allow + // "/sys" syscont mountpoints to be read-write, otherwise we mark them as + // read-only. + if sysMgr.Config.RelaxedReadOnly { + roOpt := []string{"ro"} + for _, m := range syscontMounts { + if strings.HasPrefix(m.Destination, "/sys") { + m.Options = utils.StringSliceRemove(m.Options, roOpt) + m.Options = append(m.Options, "rw") + } + tmpMounts = append(tmpMounts, m) + } + } else { + rwOpt := []string{"rw"} + for _, m := range syscontMounts { + if strings.HasPrefix(m.Destination, "/sys") { + m.Options = utils.StringSliceRemove(m.Options, rwOpt) + m.Options = append(m.Options, "ro") + } + tmpMounts = append(tmpMounts, m) + } + } + spec.Mounts = append(spec.Mounts, tmpMounts...) +} + +// cfgSysboxFsMounts adds the sysbox-fs mounts to the container's config. +func cfgSysboxFsMounts(spec *specs.Spec, sysFs *sysbox.Fs) { + + spec.Mounts = utils.MountSliceRemove(spec.Mounts, sysboxFsMounts, func(m1, m2 specs.Mount) bool { + return m1.Destination == m2.Destination + }) + + // Adjust sysboxFsMounts path attending to container-id value. + cntrMountpoint := filepath.Join(sysFs.Mountpoint, sysFs.Id) + + for i := range sysboxFsMounts { + sysboxFsMounts[i].Source = + strings.Replace( + sysboxFsMounts[i].Source, + SysboxFsDir, + cntrMountpoint, + 1, + ) + } + + SysboxFsDir = sysFs.Mountpoint + + // If the spec indicates a read-only rootfs, the sysbox-fs mounts should also + // be read-only. However, we don't mark them read-only here explicitly, so + // that they are initially mounted read-write while setting up the container. 
+ // This is needed because the setup process may need to write to some of + // these mounts (e.g., writes to /proc/sys during networking setup). Instead, + // we add the mounts to the "readonly" paths list, so that they will be + // remounted to read-only after the container setup completes, right before + // starting the container's init process. + if spec.Root.Readonly { + for _, m := range sysboxFsMounts { + spec.Linux.ReadonlyPaths = append(spec.Linux.ReadonlyPaths, m.Destination) + } + } + + spec.Mounts = append(spec.Mounts, sysboxFsMounts...) +} + +// cfgSystemdMounts adds systemd related mounts to the spec +func cfgSystemdMounts(spec *specs.Spec) { + + // For sys containers with systemd inside, sysbox mounts tmpfs over certain directories + // of the container (this is a systemd requirement). However, if the container spec + // already has tmpfs mounts over any of these directories, we honor the spec mounts + // (i.e., these override the sysbox mount). + + spec.Mounts = utils.MountSliceRemove(spec.Mounts, syscontSystemdMounts, func(m1, m2 specs.Mount) bool { + return m1.Destination == m2.Destination && m1.Type != "tmpfs" + }) + + syscontSystemdMounts = utils.MountSliceRemove(syscontSystemdMounts, spec.Mounts, func(m1, m2 specs.Mount) bool { + return m1.Destination == m2.Destination && m2.Type == "tmpfs" + }) + + spec.Mounts = append(spec.Mounts, syscontSystemdMounts...) +} + +// Function parses any given 'file' looking for an 'attr' field. For the parsing +// operation to succeed, the say file is expected to conform to this layout: +// ": ". +// +// Examples: +// +// - Docker -> "data-root": "/var/lib/docker", +// - RKE2 -> "data-dir": "/var/lib/rancher/rke2", +func getFileAttrValue(file, attr string) (string, error) { + + f, err := os.Open(file) + if err != nil { + return "", err + } + defer f.Close() + + // Splits on newlines by default. + scanner := bufio.NewScanner(f) + + // Parse the 'attr' field of the passed file. 
+ for scanner.Scan() { + data := scanner.Text() + if strings.Contains(data, attr) { + + dataRootStr := strings.Split(data, ":") + if len(dataRootStr) == 2 { + dataRoot := dataRootStr[1] + dataRoot = strings.TrimSpace(dataRoot) + dataRoot = strings.Trim(dataRoot, "\",") + + if len(dataRoot) > 0 { + return dataRoot, nil + } + + break + } + } + } + + return "", nil +} + +// Obtains the docker data-root path utilized by the inner docker process to store its +// data. This is used to define the container mountpoint on which the host's docker +// volume (backing this resource) will be mounted on. +// +// Notice that even though this code habilitates the custom definition of the data-root's +// location, this will be only honored by Sysbox if this attribute is set prior to the +// container creation (i.e., at docker-image build time). +func getInnerDockerDataRootPath(spec *specs.Spec) (string, error) { + + var defaultDataRoot = "/var/lib/docker" + + rootPath, err := filepath.Abs(spec.Root.Path) + if err != nil { + return "", err + } + + dockerCfgFile := filepath.Join(rootPath, "/etc/docker/daemon.json") + + val, err := getFileAttrValue(dockerCfgFile, "data-root") + if err != nil || val == "" { + return defaultDataRoot, nil + } + + return val, nil +} + +// Obtains the data-dir path utilized by the inner rke or k3s server/agents to +// to store their data. This is used to define the container mountpoint on which +// the host's docker volume (backing this resource) will be mounted on. 
func getInnerK3sDataDirPath(spec *specs.Spec) (string, error) {

	var defaultDataDir = "/var/lib/rancher/k3s"

	rootPath, err := filepath.Abs(spec.Root.Path)
	if err != nil {
		return "", err
	}

	k3sCfgFile := filepath.Join(rootPath, "/etc/rancher/k3s/config.yaml")

	// Config file absent/unreadable or "data-dir" unset: fall back to the
	// k3s default location.
	val, err := getFileAttrValue(k3sCfgFile, "data-dir")
	if err != nil || val == "" {
		return defaultDataDir, nil
	}

	return val, nil
}

// Obtains the rke2 data-dir path utilized by the inner rke2 server and agent
// processes to store their data. This is used to define the container mountpoint
// on which the host's docker volume (backing this resource) will be mounted on.
func getInnerRke2DataDirPath(spec *specs.Spec) (string, error) {

	var defaultDataDir = "/var/lib/rancher/rke2"

	rootPath, err := filepath.Abs(spec.Root.Path)
	if err != nil {
		return "", err
	}

	rke2CfgFile := filepath.Join(rootPath, "/etc/rancher/rke2/config.yaml")

	// Config file absent/unreadable or "data-dir" unset: fall back to the
	// rke2 default location.
	val, err := getFileAttrValue(rke2CfgFile, "data-dir")
	if err != nil || val == "" {
		return defaultDataDir, nil
	}

	return val, nil
}

// getSpecialDirs returns the set of "special" container directories that Sysbox
// backs with host-side storage, keyed by their in-container path. The docker,
// k3s, and rke2 paths are resolved from the corresponding config files inside
// the container's rootfs (with defaults when not configured).
func getSpecialDirs(spec *specs.Spec) (map[string]ipcLib.MntKind, error) {

	innerDockerDataRoot, err := getInnerDockerDataRootPath(spec)
	if err != nil {
		return nil, err
	}

	innerK3sDataDir, err := getInnerK3sDataDirPath(spec)
	if err != nil {
		return nil, err
	}

	innerRke2DataDir, err := getInnerRke2DataDirPath(spec)
	if err != nil {
		return nil, err
	}

	// These directories in the sys container are bind-mounted from host dirs managed by sysbox-mgr
	specialDirMap := map[string]ipcLib.MntKind{
		innerDockerDataRoot: ipcLib.MntVarLibDocker,
		"/var/lib/kubelet":  ipcLib.MntVarLibKubelet,
		"/var/lib/k0s":      ipcLib.MntVarLibK0s,
		innerK3sDataDir:     ipcLib.MntVarLibRancherK3s,
		innerRke2DataDir:    ipcLib.MntVarLibRancherRke2,
		"/var/lib/buildkit": ipcLib.MntVarLibBuildkit,
		"/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs": ipcLib.MntVarLibContainerdOvfs,
	}

	return specialDirMap, nil
}

// sysMgrSetupMounts requests the sysbox-mgr to setup special container mounts
// (e.g., the implicit mounts that sysbox creates over the container's
// /var/lib/docker, /var/lib/kubelet, etc. in order to enable system software to
// run in the container seamlessly).
func sysMgrSetupMounts(sysbox *sysbox.Sysbox, spec *specs.Spec) error {

	rootfsUidShiftType := sysbox.RootfsUidShiftType
	mgr := sysbox.Mgr

	specialDirMap, err := getSpecialDirs(spec)
	if err != nil {
		return err
	}

	// If the spec has a host bind-mount over one of the special dirs, ask the
	// sysbox-mgr to prepare the mount source (e.g., chown files to match the
	// container host uid & gid).

	prepList := []ipcLib.MountPrepInfo{}

	// Iterate in reverse so that when multiple bind mounts target the same
	// special dir, the last one in the spec (the effective one) is the one
	// prepped; deleting from specialDirMap makes earlier duplicates skipped
	// and excludes the dir from the implicit-mount request below.
	for i := len(spec.Mounts) - 1; i >= 0; i-- {
		m := spec.Mounts[i]
		_, isSpecialDir := specialDirMap[m.Destination]

		if m.Type == "bind" && isSpecialDir {
			info := ipcLib.MountPrepInfo{
				Source:    m.Source,
				Exclusive: true,
			}

			prepList = append(prepList, info)
			delete(specialDirMap, m.Destination)
		}
	}

	// Host IDs onto which the container's root user/group are mapped.
	uid := spec.Linux.UIDMappings[0].HostID
	gid := spec.Linux.GIDMappings[0].HostID

	if len(prepList) > 0 {
		if err := mgr.PrepMounts(uid, gid, prepList); err != nil {
			return err
		}
	}

	// If we are not in sys container mode, skip setting up the implicit sysbox-mgr mounts.
	if !mgr.Config.SyscontMode {
		return nil
	}

	// Add the special dirs to the list of implicit mounts setup by Sysbox.
	// sysbox-mgr will setup host dirs to back the mounts; it will also send us
	// a list of any other implicit mounts it needs.
	reqList := []ipcLib.MountReqInfo{}
	for dest, kind := range specialDirMap {
		info := ipcLib.MountReqInfo{
			Kind: kind,
			Dest: dest,
		}
		reqList = append(reqList, info)
	}

	m, err := mgr.ReqMounts(rootfsUidShiftType, reqList)
	if err != nil {
		return err
	}

	// If any sysbox-mgr mounts conflict with any in the spec (i.e.,
	// same dest), prioritize the spec ones
	mounts := utils.MountSliceRemove(m, spec.Mounts, func(m1, m2 specs.Mount) bool {
		return m1.Destination == m2.Destination
	})

	// If the spec indicates a read-only rootfs, the sysbox-mgr mounts should
	// also be read-only. The exception to this rule is in scenarios where the
	// sysbox-mgr is in "relaxed-read-only" mode, in which case the mounts are
	// left as read-write.
	if spec.Root.Readonly && !sysbox.Mgr.Config.RelaxedReadOnly {
		tmpMounts := []specs.Mount{}
		rwOpt := []string{"rw"}
		for _, m := range mounts {
			m.Options = utils.StringSliceRemove(m.Options, rwOpt)
			m.Options = append(m.Options, "ro")
			tmpMounts = append(tmpMounts, m)
		}
		mounts = tmpMounts
	}

	spec.Mounts = append(spec.Mounts, mounts...)

	return nil
}

// checkSpec performs some basic checks on the system container's spec
func checkSpec(spec *specs.Spec) error {

	if spec.Root == nil || spec.Linux == nil {
		return fmt.Errorf("not a linux container spec")
	}

	// Ensure the container's network ns is not shared with the host
	for _, ns := range spec.Linux.Namespaces {
		if ns.Type == specs.NetworkNamespace && ns.Path != "" {
			var st1, st2 unix.Stat_t

			if err := unix.Stat("/proc/self/ns/net", &st1); err != nil {
				return fmt.Errorf("unable to stat sysbox's network namespace: %s", err)
			}
			if err := unix.Stat(ns.Path, &st2); err != nil {
				return fmt.Errorf("unable to stat %q: %s", ns.Path, err)
			}

			// Same device + inode means the ns path refers to the host's net ns.
			if (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino) {
				return fmt.Errorf("sysbox containers can't share a network namespace with the host (because they use the linux user-namespace for isolation)")
			}

			break
		}
	}

	return nil
}

// getSysboxEnvVarConfigs collects the SYSBOX_* env vars passed to the container.
func getSysboxEnvVarConfigs(p *specs.Process, sbox *sysbox.Sysbox) error {
	// Supported SYSBOX_* env vars and the parser ("bool" or "string") each uses.
	var knownEnvVars = map[string]string{
		"SYSBOX_IGNORE_SYSFS_CHOWN":  "bool",
		"SYSBOX_ALLOW_TRUSTED_XATTR": "bool",
		"SYSBOX_HONOR_CAPS":          "bool",
		"SYSBOX_SYSCONT_MODE":        "bool",
		"SYSBOX_SKIP_UID_SHIFT":      "string",
	}

	for _, ev := range p.Env {
		if !strings.HasPrefix(ev, "SYSBOX_") {
			continue
		}

		// NOTE(review): values containing '=' are rejected here since Split
		// yields >2 tokens; confirm this is intended.
		tokens := strings.Split(ev, "=")
		if len(tokens) != 2 {
			return fmt.Errorf("env var %s has incorrect format; expected VAR=VALUE.", ev)
		}

		evName := tokens[0]
		evVal := tokens[1]

		// If a SYSBOX_* env var is specified, it must be one of the supported ones.
		envVarType, ok := knownEnvVars[evName]
		if !ok {
			return fmt.Errorf("invalid env var %s; must be one of %v", evName, knownEnvVars)
		}

		switch envVarType {
		case "bool":
			if err := getSysboxBoolEnvVarConfigs(sbox, evName, evVal); err != nil {
				return err
			}
		case "string":
			if err := getSysboxStringEnvVarConfigs(sbox, evName, evVal); err != nil {
				return err
			}
		}
	}

	return nil
}

// getSysboxBoolEnvVarConfigs parses a boolean-valued SYSBOX_* env var (value
// must be literal "TRUE" or "FALSE") and applies it to the sysbox-mgr config.
func getSysboxBoolEnvVarConfigs(sbox *sysbox.Sysbox, evName string, evVal string) error {
	if evVal != "TRUE" && evVal != "FALSE" {
		return fmt.Errorf("env var %s has invalid value %s; expect [TRUE|FALSE].", evName, evVal)
	}

	switch evName {
	case "SYSBOX_IGNORE_SYSFS_CHOWN":
		sbox.Mgr.Config.IgnoreSysfsChown = (evVal == "TRUE")
	case "SYSBOX_ALLOW_TRUSTED_XATTR":
		sbox.Mgr.Config.AllowTrustedXattr = (evVal == "TRUE")
	case "SYSBOX_HONOR_CAPS":
		sbox.Mgr.Config.HonorCaps = (evVal == "TRUE")
	case "SYSBOX_SYSCONT_MODE":
		sbox.Mgr.Config.SyscontMode = (evVal == "TRUE")
	}

	return nil
}

// getSysboxStringEnvVarConfigs parses a string-valued SYSBOX_* env var (must be
// non-empty and contain no whitespace) and applies it to the sysbox config.
func getSysboxStringEnvVarConfigs(sbox *sysbox.Sysbox, evName string, evVal string) error {
	if evVal == "" {
		return fmt.Errorf("env var %s has empty value", evName)
	}
	if isWhitespacePresent := regexp.MustCompile(`\s`).MatchString(evVal); isWhitespacePresent {
		return fmt.Errorf("env var %s has invalid value %s; space characters are not allowed", evName, evVal)
	}

	var err error

	switch evName {
	case "SYSBOX_SKIP_UID_SHIFT":
		if sbox.IDshiftIgnoreList, err = getSysboxEnvVarIDshiftIgnoreConfig(evName, evVal); err != nil {
			return err
		}
	}

	return nil
}

// getSysboxEnvVarIDshiftIgnoreConfig parses the SYSBOX_SKIP_UID_SHIFT env-var and returns
// a list of paths for which id-shifting operations must be avoided.
// Example: "/var/lib/mutagen,/var/lib/docker".
+func getSysboxEnvVarIDshiftIgnoreConfig(evName, evVal string) ([]string, error) { + paths := strings.Split(evVal, ",") + + // Iterate through the paths and verify they are all absolute. + for i, p := range paths { + if !filepath.IsAbs(p) { + return nil, fmt.Errorf("env var %s has an invalid (not absolute) path: %s", evName, p) + } + paths[i] = p + } + + return paths, nil +} + +// removeSysboxEnvVarsForExec removes the SYSBOX_* env vars from the process spec. +// It only does this for env vars meant to be per-container (rather than per-process). +func removeSysboxEnvVarsForExec(p *specs.Process) { + env := []string{} + for _, envVar := range p.Env { + if !strings.HasPrefix(envVar, "SYSBOX_IGNORE_SYSFS_CHOWN=") && + !strings.HasPrefix(envVar, "SYSBOX_ALLOW_TRUSTED_XATTR=") && + !strings.HasPrefix(envVar, "SYSBOX_SYSCONT_MODE=") && + !strings.HasPrefix(envVar, "SYSBOX_SKIP_UID_SHIFT=") { + env = append(env, envVar) + } + } + + p.Env = env +} + +func cfgOomScoreAdj(spec *specs.Spec) { + + // For sys containers we don't allow -1000 for the OOM score value, as this + // is not supported from within a user-ns. + + if spec.Process.OOMScoreAdj != nil { + if *spec.Process.OOMScoreAdj < -999 { + *spec.Process.OOMScoreAdj = -999 + } + } +} + +// cfgSeccomp configures the system container's seccomp settings. 
func cfgSeccomp(seccomp *specs.LinuxSeccomp) error {

	if seccomp == nil {
		return nil
	}

	// Only touch the profile when it targets an architecture Sysbox handles;
	// otherwise leave it untouched.
	supportedArch := false
	for _, arch := range seccomp.Architectures {
		if arch == specs.ArchX86_64 || arch == specs.ArchAARCH64 || arch == specs.ArchARM {
			supportedArch = true
		}
	}
	if !supportedArch {
		return nil
	}

	// we don't yet support specs with default trap, trace, or log actions
	if seccomp.DefaultAction != specs.ActAllow &&
		seccomp.DefaultAction != specs.ActErrno &&
		seccomp.DefaultAction != specs.ActKill {
		return fmt.Errorf("spec seccomp default actions other than allow, errno, and kill are not supported")
	}

	// categorize syscalls per seccomp actions
	allowSet := mapset.NewSet[string]()
	disallowSet := mapset.NewSet[string]()

	for _, syscall := range seccomp.Syscalls {
		for _, name := range syscall.Names {
			switch syscall.Action {
			case specs.ActAllow:
				allowSet.Add(name)
			case specs.ActErrno:
				fallthrough
			case specs.ActKill:
				disallowSet.Add(name)
			}
		}
	}

	// convert sys container syscall whitelist to a set
	syscontAllowSet := mapset.NewSet[string]()
	for _, sc := range syscontSyscallWhitelist {
		syscontAllowSet.Add(sc)
	}

	// seccomp syscall list may be a whitelist or blacklist
	whitelist := (seccomp.DefaultAction == specs.ActErrno ||
		seccomp.DefaultAction == specs.ActKill)

	addSet := mapset.NewSet[string]()
	rmSet := mapset.NewSet[string]()

	if whitelist {
		// Whitelist profile: allow every syscall sysbox needs that the profile
		// doesn't already allow, and stop error/kill'ing any sysbox-needed one.
		addSet = syscontAllowSet.Difference(allowSet)
		rmSet = disallowSet.Intersect(syscontAllowSet)
	} else {
		// Note: no addSet here since we don't have a sysbox syscall blacklist
		// NOTE(review): Difference() removes from the blacklist the syscalls
		// sysbox does NOT need (keeping the sysbox-needed ones blacklisted);
		// Intersect() would seem more consistent with the whitelist branch —
		// confirm intent.
		rmSet = disallowSet.Difference(syscontAllowSet)
	}

	// Remove syscalls from the container's error/kill lists
	if rmSet.Cardinality() > 0 {
		var newSyscalls []specs.LinuxSyscall
		for _, sc := range seccomp.Syscalls {
			if sc.Action == specs.ActErrno || sc.Action == specs.ActKill {
				n := []string{}
				for _, scName := range sc.Names {
					if !rmSet.Contains(scName) {
						n = append(n, scName)
					}
				}
				sc.Names = n
				// Drop entries whose name list became empty.
				if len(sc.Names) > 0 {
					newSyscalls = append(newSyscalls, sc)
				}
			} else {
				newSyscalls = append(newSyscalls, sc)
			}
		}
		seccomp.Syscalls = newSyscalls
	}

	// Add syscalls to the container's allowed list
	if addSet.Cardinality() > 0 {
		for syscallName := range addSet.Iter() {
			str := fmt.Sprintf("%v", syscallName)
			sc := specs.LinuxSyscall{
				Names:  []string{str},
				Action: specs.ActAllow,
			}
			seccomp.Syscalls = append(seccomp.Syscalls, sc)
		}
	}

	if whitelist {
		// Remove argument restrictions on syscalls (except those for which we
		// allow such restrictions).
		for i, syscall := range seccomp.Syscalls {
			for _, name := range syscall.Names {
				// Note: one non-allowed name clears the args for the whole
				// entry, even if the entry lists other names.
				if !utils.StringSliceContains(syscontSyscallAllowRestrList, name) {
					seccomp.Syscalls[i].Args = nil
				}
			}
		}
	}

	return nil
}

// Configures which syscalls are trapped by Sysbox inside a system container
func cfgSyscontSyscallTraps(sysMgr *sysbox.Mgr) {

	// chown trapping is only added when sysfs chown errors are to be ignored.
	if sysMgr.Config.IgnoreSysfsChown {
		chownSyscalls := []string{
			"chown", "fchown", "fchownat",
		}
		syscallTrapList = append(syscallTrapList, chownSyscalls...)
	}

	// xattr trapping is only added when trusted.* xattrs are to be allowed.
	if sysMgr.Config.AllowTrustedXattr {
		xattrSyscalls := []string{
			"setxattr", "lsetxattr", "fsetxattr",
			"getxattr", "lgetxattr", "fgetxattr",
			"removexattr", "lremovexattr", "fremovexattr",
			"listxattr", "llistxattr", "flistxattr",
		}
		syscallTrapList = append(syscallTrapList, xattrSyscalls...)
	}
}

// Configures rootfs cloning (when required); returns true if rootfs was cloned.
func cfgRootfsCloning(spec *specs.Spec, sysbox *sysbox.Sysbox) (bool, error) {

	sysMgr := sysbox.Mgr

	// Remember the original rootfs path (before any cloning).
	sysbox.OrigRootfs = spec.Root.Path

	if !sysMgr.Enabled() || sysMgr.Config.NoRootfsCloning {
		return false, nil
	}

	cloneRootfs, err := rootfsCloningRequired(spec.Root.Path)
	if err != nil || !cloneRootfs {
		return false, err
	}

	newRootfs, err := sysMgr.CloneRootfs()
	if err != nil {
		return false, err
	}

	// Point the spec at the cloned rootfs.
	spec.Root.Path = newRootfs
	return true, nil
}

// cfgAppArmor sets up the apparmor config for sys containers
func cfgAppArmor(p *specs.Process) error {

	// The default docker profile is too restrictive for sys containers (e.g., preventing
	// mounts, write access to /proc/sys/*, etc). For now, we simply ignore any apparmor
	// profile in the container's config.
	//
	// TODO: In the near future, we should develop an apparmor profile for sys-containers,
	// and have sysbox-mgr load it to the kernel (if apparmor is enabled on the system)
	// and then configure the container to use that profile here.

	p.ApparmorProfile = ""
	return nil
}

// Configure environment variables required for systemd
func cfgSystemdEnv(p *specs.Process) {

	// First drop any spec env var whose name collides with one of the
	// sysbox-provided systemd env vars, then append the sysbox set.
	p.Env = utils.StringSliceRemoveMatch(p.Env, func(specEnvVar string) bool {
		name, _, err := utils.GetEnvVarInfo(specEnvVar)
		if err != nil {
			return false
		}
		for _, sysboxSysdEnvVar := range syscontSystemdEnvVars {
			sname, _, err := utils.GetEnvVarInfo(sysboxSysdEnvVar)
			if err == nil && name == sname {
				return true
			}
		}
		return false
	})

	p.Env = append(p.Env, syscontSystemdEnvVars...)
}

// systemdInit returns true if the sys container is running systemd
// NOTE(review): assumes p.Args is non-empty; an empty Args would panic here —
// confirm callers guarantee this.
func systemdInit(p *specs.Process) bool {
	return p.Args[0] == "/sbin/init"
}

// Configure the container's process spec for system containers
func ConvertProcessSpec(p *specs.Process, sbox *sysbox.Sysbox, isExec bool) error {

	sysMgr := sbox.Mgr

	if isExec {
		// Per-container SYSBOX_* env vars are stripped before parsing, so only
		// the per-process ones (e.g. SYSBOX_HONOR_CAPS) take effect on exec.
		removeSysboxEnvVarsForExec(p)
		if err := getSysboxEnvVarConfigs(p, sbox); err != nil {
			return err
		}
	}

	if sysMgr.Config.SyscontMode && !sysMgr.Config.HonorCaps {
		if err := cfgCapabilities(p); err != nil {
			return err
		}
	}

	if sysMgr.Config.SyscontMode {
		if err := cfgAppArmor(p); err != nil {
			return fmt.Errorf("failed to configure AppArmor profile: %v", err)
		}
	}

	if systemdInit(p) && sysMgr.Config.SyscontMode {
		cfgSystemdEnv(p)
	}

	return nil
}

// ConvertSpec converts the given container spec to a system container spec.
func ConvertSpec(context *cli.Context, spec *specs.Spec, sbox *sysbox.Sysbox) error {

	sysMgr := sbox.Mgr

	// Parse SYSBOX_* env vars first since they may alter the config used below.
	if err := getSysboxEnvVarConfigs(spec.Process, sbox); err != nil {
		return err
	}

	if err := checkSpec(spec); err != nil {
		return fmt.Errorf("invalid or unsupported container spec: %v", err)
	}

	if err := cfgNamespaces(sysMgr, spec); err != nil {
		return fmt.Errorf("invalid or unsupported container spec: %v", err)
	}

	if err := cfgIDMappings(sysMgr, spec); err != nil {
		return fmt.Errorf("invalid user/group ID config: %v", err)
	}

	// Must do this after cfgIDMappings()
	rootfsUidShiftType, bindMntUidShiftType, err := sysbox.CheckUidShifting(sysMgr, spec)
	if err != nil {
		return err
	}

	// Rootfs cloning is only needed when uid-shifting is done via chown.
	rootfsCloned := false
	if rootfsUidShiftType == sh.Chown {
		rootfsCloned, err = cfgRootfsCloning(spec, sbox)
		if err != nil {
			return err
		}
	}

	sbox.RootfsUidShiftType = rootfsUidShiftType
	sbox.BindMntUidShiftType = bindMntUidShiftType
	sbox.RootfsCloned = rootfsCloned

	if err := cfgMounts(spec, sbox); err != nil {
		return fmt.Errorf("invalid mount config: %v", err)
	}

	if sysMgr.Config.SyscontMode {
		cfgMaskedPaths(spec)
		cfgReadonlyPaths(spec)
	}

	cfgOomScoreAdj(spec)

	if err := ConvertProcessSpec(spec.Process, sbox, false); err != nil {
		return fmt.Errorf("failed to configure process spec: %v", err)
	}

	if sysMgr.Config.SyscontMode {
		if err := cfgSeccomp(spec.Linux.Seccomp); err != nil {
			return fmt.Errorf("failed to configure seccomp: %v", err)
		}
	}

	if sysMgr.Config.SyscontMode {
		cfgSyscontSyscallTraps(sysMgr)
	}

	return nil
}
diff --git a/sysbox-runc/libsysbox/syscont/spec_test.go b/sysbox-runc/libsysbox/syscont/spec_test.go
new file mode 100644
index 00000000..46c15030
--- /dev/null
+++ b/sysbox-runc/libsysbox/syscont/spec_test.go
@@ -0,0 +1,821 @@
+//
+// Copyright 2019-2020 Nestybox, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// + +package syscont + +import ( + "reflect" + "testing" + + ipcLib "github.com/nestybox/sysbox-ipc/sysboxMgrLib" + utils "github.com/nestybox/sysbox-libs/utils" + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func findSeccompSyscall(seccomp *specs.LinuxSeccomp, targetSyscalls []string) (allFound bool, notFound []string) { + if seccomp == nil { + return false, notFound + } + + for _, target := range targetSyscalls { + found := false + for _, syscall := range seccomp.Syscalls { + for _, name := range syscall.Names { + if name == target { + found = true + } + } + } + if !found { + notFound = append(notFound, target) + } + } + + allFound = (len(notFound) == 0) + return allFound, notFound +} + +// genSeccompWhitelist generates a seccomp whitelist from the given syscall slice +func genSeccompWhitelist(syscalls []string) []specs.LinuxSyscall { + specSyscalls := []specs.LinuxSyscall{} + for _, s := range syscalls { + newSpecSyscall := specs.LinuxSyscall{ + Names: []string{s}, + Action: specs.ActAllow, + } + specSyscalls = append(specSyscalls, newSpecSyscall) + } + return specSyscalls +} + +func TestCfgSeccomp(t *testing.T) { + var seccomp *specs.LinuxSeccomp + + // Test handling of nil seccomp + if err := cfgSeccomp(nil); err != nil { + t.Errorf("cfgSeccomp: returned error: %v", err) + } + + // Test handling of unsupported arch + seccomp = &specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Architectures: []specs.Arch{specs.ArchARM}, + Syscalls: []specs.LinuxSyscall{}, + } + if err := cfgSeccomp(seccomp); err != nil { + t.Errorf("cfgSeccomp: failed to handle unsupported arch: %v", err) + } + + // Test handling of empty syscall whitelist + seccomp = &specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Architectures: []specs.Arch{specs.ArchX86_64}, + Syscalls: []specs.LinuxSyscall{}, + } + if err := cfgSeccomp(seccomp); err != nil { + t.Errorf("cfgSeccomp: returned error: %v", err) + } + if ok, notFound 
:= findSeccompSyscall(seccomp, syscontSyscallWhitelist); !ok { + t.Errorf("cfgSeccomp: empty whitelist test failed: missing syscalls: %s", notFound) + } + + // Test handling of complete syscall whitelist + seccomp = &specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Architectures: []specs.Arch{specs.ArchX86_64}, + Syscalls: genSeccompWhitelist(syscontSyscallWhitelist), + } + if err := cfgSeccomp(seccomp); err != nil { + t.Errorf("cfgSeccomp: returned error: %v", err) + } + if ok, notFound := findSeccompSyscall(seccomp, syscontSyscallWhitelist); !ok { + t.Errorf("cfgSeccomp: full whitelist test failed: missing syscalls: %s", notFound) + } + + // Test handling of incomplete syscall whitelist + partialList := []string{"accept", "accept4", "access", "adjtimex"} + seccomp = &specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Architectures: []specs.Arch{specs.ArchX86_64}, + Syscalls: genSeccompWhitelist(partialList), + } + if err := cfgSeccomp(seccomp); err != nil { + t.Errorf("cfgSeccomp: returned error: %v", err) + } + if ok, notFound := findSeccompSyscall(seccomp, syscontSyscallWhitelist); !ok { + t.Errorf("cfgSeccomp: incomplete whitelist test failed: missing syscalls: %s", notFound) + } + + // Test handling of whitelist with multiple syscalls per LinuxSyscall entry + linuxSyscall := specs.LinuxSyscall{ + Names: syscontSyscallWhitelist, + Action: specs.ActAllow, + } + seccomp = &specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Architectures: []specs.Arch{specs.ArchX86_64}, + Syscalls: []specs.LinuxSyscall{linuxSyscall}, + } + if err := cfgSeccomp(seccomp); err != nil { + t.Errorf("cfgSeccomp: returned error: %v", err) + } + if ok, notFound := findSeccompSyscall(seccomp, syscontSyscallWhitelist); !ok { + t.Errorf("cfgSeccomp: multiple syscall per entry whitelist test failed: missing syscalls: %s", notFound) + } + + // Docker uses whitelists, so we skip the blacklist tests for now + // TODO: Test handling of empty blacklist + // TODO: Test handling of 
conflicting blacklist + // TODO: Test handling of non-conflicting blacklist +} + +// Test removal of seccomp syscall arg restrictions +func TestCfgSeccompArgRemoval(t *testing.T) { + + // The following resembles the way Docker programs seccomp syscall argument + // restrictions for the "personality" and "clone" syscalls. + + personalityArg := specs.LinuxSeccompArg{ + Index: 0, + Value: 131072, + Op: "SCMP_CMP_EQ", + } + + cloneArg := specs.LinuxSeccompArg{ + Index: 0, + Value: 2080505856, + Op: "SCMP_CMP_MASKED_EQ", + } + + seccomp := &specs.LinuxSeccomp{ + DefaultAction: specs.ActErrno, + Architectures: []specs.Arch{specs.ArchX86_64}, + Syscalls: []specs.LinuxSyscall{ + { + Names: []string{"personality"}, + Action: "SCMP_ACT_ALLOW", + Args: []specs.LinuxSeccompArg{personalityArg}, + }, + { + Names: []string{"clone"}, + Action: "SCMP_ACT_ALLOW", + Args: []specs.LinuxSeccompArg{cloneArg}, + }, + }, + } + + if err := cfgSeccomp(seccomp); err != nil { + t.Errorf("cfgSeccomp: returned error: %v", err) + } + + // Verify that arg restrictions for personality() where left untouched, while arg + // restrictions for clone() were removed. See syscontSyscallAllowRestrList. 
+ + if seccomp.Syscalls[0].Args[0] != personalityArg { + t.Errorf("cfgSeccompArgRemoval failed: personality() syscall args invalid: want %v, got %v", personalityArg, seccomp.Syscalls[0].Args[0]) + } + + if seccomp.Syscalls[1].Args != nil { + t.Errorf("cfgSeccompArgRemoval failed: clone() syscall args invalid: want nil, got %v", seccomp.Syscalls[1].Args) + } +} + +func TestCfgMaskedPaths(t *testing.T) { + spec := new(specs.Spec) + spec.Linux = new(specs.Linux) + spec.Linux.MaskedPaths = []string{"/proc", "/some/path", "/proc/sys", "/other/path"} + spec.Process = new(specs.Process) + spec.Process.Args = []string{"/bin/bash"} + + cfgMaskedPaths(spec) + + for _, mp := range spec.Linux.MaskedPaths { + for _, ep := range syscontExposedPaths { + if mp == ep { + t.Errorf("cfgMaskedPaths: failed to unmask path %s", ep) + } + } + } + + want := []string{"/some/path", "/other/path"} + if !utils.StringSliceEqual(spec.Linux.MaskedPaths, want) { + t.Errorf("cfgMaskedPaths: removed unexpected path; got %v, want %v", spec.Linux.MaskedPaths, want) + } +} + +func TestCfgReadonlyPaths(t *testing.T) { + spec := new(specs.Spec) + spec.Linux = new(specs.Linux) + spec.Linux.ReadonlyPaths = []string{"/proc", "/some/path", "/proc/sys", "/other/path"} + spec.Process = new(specs.Process) + spec.Process.Args = []string{"/bin/bash"} + + cfgReadonlyPaths(spec) + + for _, rop := range spec.Linux.ReadonlyPaths { + for _, rwp := range syscontRwPaths { + if rop == rwp { + t.Errorf("cfgReadonlyPaths: failed to remove read-only on path %s", rwp) + } + } + } + + want := []string{"/some/path", "/other/path"} + if !utils.StringSliceEqual(spec.Linux.ReadonlyPaths, want) { + t.Errorf("cfgReadonlyPaths: removed unexpected path; got %v, want %v", spec.Linux.ReadonlyPaths, want) + } +} + +func TestCfgSystemd(t *testing.T) { + + spec := new(specs.Spec) + spec.Process = new(specs.Process) + spec.Linux = new(specs.Linux) + + // Create a spec that has intentional conflicts with systemd resources + 
spec.Process.Args = []string{"/sbin/init"} + + spec.Mounts = []specs.Mount{ + specs.Mount{ + Source: "/somepath", + Destination: "/run", + Type: "bind", + Options: []string{"ro", "rprivate"}, + }, + specs.Mount{ + Source: "/otherpath", + Destination: "/run/lock", + Type: "bind", + Options: []string{"rw"}, + }, + specs.Mount{ + Source: "/somepath", + Destination: "/test", + Type: "bind", + Options: []string{"ro", "rprivate"}, + }, + } + + // This call should remove the conflicting info above + cfgSystemdMounts(spec) + + wantMounts := []specs.Mount{ + specs.Mount{ + Source: "/somepath", + Destination: "/test", + Type: "bind", + Options: []string{"ro", "rprivate"}, + }, + specs.Mount{ + Source: "tmpfs", + Destination: "/run", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + specs.Mount{ + Source: "tmpfs", + Destination: "/run/lock", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "size=4m"}, + }, + } + + if !utils.MountSliceEqual(spec.Mounts, wantMounts) { + t.Errorf("cfgSystemd() failed: spec.Mounts: want %v, got %v", wantMounts, spec.Mounts) + } +} + +func TestCfgSystemdOverride(t *testing.T) { + + spec := new(specs.Spec) + spec.Process = new(specs.Process) + spec.Linux = new(specs.Linux) + + // Create a spec that overrides the sysbox systemd mounts (spec tmpfs mounts override + // the sysbox tmpfs mounts for systemd). + spec.Process.Args = []string{"/sbin/init"} + + spec.Mounts = []specs.Mount{ + specs.Mount{ + Source: "/somepath", + Destination: "/run", + Type: "tmpfs", + Options: []string{"rw", "nosuid", "noexec", "size=128m"}, + }, + specs.Mount{ + Source: "/otherpath", + Destination: "/run/lock", + Type: "tmpfs", + Options: []string{"rw", "nosuid", "noexec", "size=8m"}, + }, + } + + wantMounts := spec.Mounts + + // This call should honor the spec mount overrides. 
+ cfgSystemdMounts(spec) + + if !utils.MountSliceEqual(spec.Mounts, wantMounts) { + t.Errorf("cfgSystemd() failed: spec.Mounts: want %v, got %v", wantMounts, spec.Mounts) + } +} + +func TestValidateIDMappings(t *testing.T) { + var err error + + spec := new(specs.Spec) + spec.Linux = new(specs.Linux) + + // Test empty user-ns ID mappings + spec.Linux.UIDMappings = []specs.LinuxIDMapping{} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{} + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to empty mappings, but it passed") + } + + // Test non-contiguous container ID mappings + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 1}, + {ContainerID: 2, HostID: 1000001, Size: 65535}, + } + + spec.Linux.GIDMappings = spec.Linux.UIDMappings + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to non-contiguous container ID mappings, but it passed") + } + + // Test non-contiguous host ID mappings + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 1}, + {ContainerID: 1, HostID: 1000002, Size: 65535}, + } + + spec.Linux.GIDMappings = spec.Linux.UIDMappings + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to non-contiguous host ID mappings, but it passed") + } + + // Test mappings with container ID range starting above 0 + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 1, HostID: 1000000, Size: 65536}, + } + + spec.Linux.GIDMappings = spec.Linux.UIDMappings + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to container ID range starting above 0, but it passed") + } + + // Test mappings with ID range below IdRangeMin + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: IdRangeMin - 1}, + } + + 
spec.Linux.GIDMappings = spec.Linux.UIDMappings + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to ID range size < %d, but it passed", IdRangeMin) + } + + // Test non-matching uid & gid mappings + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 65536}, + } + + spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 2000000, Size: 65536}, + } + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to non-matching uid & gid mappings, but it passed") + } + + // Test mapping to host UID 0 + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 0, Size: 65536}, + } + + spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 2000000, Size: 65536}, + } + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to uid mapping to host ID 0, but it passed") + } + + // Test mapping to host GID 0 + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 65536}, + } + + spec.Linux.GIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 0, Size: 65536}, + } + + err = validateIDMappings(spec) + if err == nil { + t.Errorf("validateIDMappings(): expected failure due to gid mapping to host ID 0, but it passed") + } + + // Test valid single entry mapping + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 65536}, + } + + spec.Linux.GIDMappings = spec.Linux.UIDMappings + + err = validateIDMappings(spec) + if err != nil { + t.Errorf("validateIDMappings(): expected pass but it failed; mapping = %v", spec.Linux.UIDMappings) + } + + // Test valid multi-entry mapping is accepted and merged into a single entry mapping + spec.Linux.UIDMappings = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 1}, + {ContainerID: 1, HostID: 
1000001, Size: 9}, + {ContainerID: 10, HostID: 1000010, Size: 65526}, + } + + spec.Linux.GIDMappings = spec.Linux.UIDMappings + origMapping := spec.Linux.UIDMappings + + err = validateIDMappings(spec) + if err != nil { + t.Errorf("validateIDMappings(): expected pass but it failed; mapping = %v", origMapping) + } + + want := []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 65536}, + } + + if !equalIDMappings(want, spec.Linux.UIDMappings) { + t.Errorf("validateIDMappings(): uid mappings are not correct; want %v, got %v", + want, spec.Linux.UIDMappings) + } + + if !equalIDMappings(want, spec.Linux.GIDMappings) { + t.Errorf("validateIDMappings(): gid mappings are not correct; want %v, got %v", + want, spec.Linux.GIDMappings) + } +} + +func Test_getSysboxEnvVarConfigs(t *testing.T) { + type args struct { + p *specs.Process + sbox *sysbox.Sysbox + } + tests := []struct { + name string + args args + wantErr bool + resSbox *sysbox.Sysbox + }{ + { + // Test-case 1: Unknown SYSBOX env-var. Expected error. + name: "unknown-sysbox-envvar", + args: args{p: &specs.Process{Env: []string{"SYSBOX_ENV=1"}}, sbox: &sysbox.Sysbox{}}, + wantErr: true, + }, + { + // Test-case 2: Invalid format for generic env-var. Error expected. + name: "invalid-format-generic-envvar", + args: args{p: &specs.Process{Env: []string{"SYSBOX_HONOR_CAPS"}}, sbox: &sysbox.Sysbox{}}, + wantErr: true, + }, + { + // Test-case 3: Invalid format for boolean env-var. Error expected. + name: "invalid-format-bool-envvar", + args: args{p: &specs.Process{Env: []string{"SYSBOX_HONOR_CAPS=1"}}, sbox: &sysbox.Sysbox{}}, + wantErr: true, + }, + { + // Test-case 4: Invalid format for string env-var. Error expected. + name: "invalid-format-string-envvar", + args: args{p: &specs.Process{Env: []string{"SYSBOX_SKIP_UID_SHIFT="}}, sbox: &sysbox.Sysbox{}}, + wantErr: true, + }, + { + // Test-case 5: Verify proper parsing of SYSBOX_SYSCONT_MODE. No error expected. 
+ name: "syscont-mode-envvar", + args: args{p: &specs.Process{Env: []string{"SYSBOX_SYSCONT_MODE=FALSE"}}, sbox: &sysbox.Sysbox{Mgr: &sysbox.Mgr{Config: &ipcLib.ContainerConfig{SyscontMode: false}}}}, + wantErr: false, + resSbox: &sysbox.Sysbox{Mgr: &sysbox.Mgr{Config: &ipcLib.ContainerConfig{SyscontMode: false}}}, + }, + { + // Test-case 6: Verify proper parsing of SYSBOX_SKIP_UID_SHIFT. No error expected. + name: "skip-uid-shift-envvar", + args: args{p: &specs.Process{Env: []string{"SYSBOX_SKIP_UID_SHIFT=/var/lib/1,/var/lib/2,/var/lib/3"}}, sbox: &sysbox.Sysbox{}}, + wantErr: false, + resSbox: &sysbox.Sysbox{IDshiftIgnoreList: []string{"/var/lib/1", "/var/lib/2", "/var/lib/3"}}, + }, + { + // Test-case 7: Verify identification of SYSBOX_SKIP_UID_SHIFT's invalid (relative) path. Error expected. + name: "skip-uid-shift-envvar-relative-path", + args: args{p: &specs.Process{Env: []string{"SYSBOX_SKIP_UID_SHIFT=/var/lib/1,var/lib/2"}}, sbox: &sysbox.Sysbox{}}, + wantErr: true, + }, + { + // Test-case 8: Verify identification of SYSBOX_SKIP_UID_SHIFT's invalid path (with spaces). Error expected. 
+ name: "skip-uid-shift-envvar-space-in-path", + args: args{p: &specs.Process{Env: []string{"SYSBOX_SKIP_UID_SHIFT=/var/lib/1, /var/lib/2"}}, sbox: &sysbox.Sysbox{}}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if err := getSysboxEnvVarConfigs(tt.args.p, tt.args.sbox); (err != nil) != tt.wantErr && tt.args.sbox != tt.resSbox { + t.Errorf("getSysboxEnvVarConfigs() error = %v, wantErr %v", err, tt.wantErr) + } + }) + } +} + +func Test_cfgSyscontMountsReadOnly(t *testing.T) { + type args struct { + sysMgr *sysbox.Mgr + spec *specs.Spec + expectedMounts []specs.Mount + } + + sysMgrDefault := &sysbox.Mgr{ + Config: &ipcLib.ContainerConfig{ + RelaxedReadOnly: false, + }, + } + sysMgrRelaxedRO := &sysbox.Mgr{ + Config: &ipcLib.ContainerConfig{ + RelaxedReadOnly: true, + }, + } + + // UT1: test with no overlapping mounts + mountsUT1 := []specs.Mount{} + expectedMountsUT1 := []specs.Mount{ + { + Destination: "/run", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/tmp", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/sys", + Source: "sysfs", + Type: "sysfs", + Options: []string{"noexec", "nosuid", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Source: "cgroup", + Type: "cgroup", + Options: []string{"noexec", "nosuid", "nodev", "ro"}, + }, + { + Destination: "/proc", + Source: "proc", + Type: "proc", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + { + Destination: "/dev", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/kmsg", + Source: "/dev/null", + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + } + + // UT2: test with overlapping ro mount (/run) + mountsUT2 := []specs.Mount{ + { + 
Destination: "/run", + Source: "/somepath", + Type: "bind", + Options: []string{"ro", "whatever"}, + }, + } + expectedMountsUT2 := []specs.Mount{ + { + Destination: "/run", + Source: "/somepath", + Type: "bind", + Options: []string{"ro", "whatever"}, + }, + { + Destination: "/tmp", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/sys", + Source: "sysfs", + Type: "sysfs", + Options: []string{"noexec", "nosuid", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Source: "cgroup", + Type: "cgroup", + Options: []string{"noexec", "nosuid", "nodev", "ro"}, + }, + { + Destination: "/proc", + Source: "proc", + Type: "proc", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + { + Destination: "/dev", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/kmsg", + Source: "/dev/null", + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + } + + // UT3: test with overlapping rw mount (/tmp) + mountsUT3 := []specs.Mount{ + { + Destination: "/tmp", + Source: "/somepath", + Type: "bind", + Options: []string{"rw", "whatever"}, + }, + } + expectedMountsUT3 := []specs.Mount{ + { + Destination: "/tmp", + Source: "/somepath", + Type: "bind", + Options: []string{"rw", "whatever"}, + }, + { + Destination: "/run", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/sys", + Source: "sysfs", + Type: "sysfs", + Options: []string{"noexec", "nosuid", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Source: "cgroup", + Type: "cgroup", + Options: []string{"noexec", "nosuid", "nodev", "ro"}, + }, + { + Destination: "/proc", + Source: "proc", + Type: "proc", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + { + Destination: "/dev", + Source: "tmpfs", + Type: "tmpfs", + 
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/kmsg", + Source: "/dev/null", + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + } + + // UT4: relaxed-read-only setup with ro mounts (/sys) + mountsUT4 := []specs.Mount{ + { + Destination: "/sys/fs/test", + Source: "blah", + Options: []string{"ro", "whatever"}, + }, + } + expectedMountsUT4 := []specs.Mount{ + { + Destination: "/sys/fs/test", + Source: "blah", + Options: []string{"ro", "whatever"}, + }, + { + Destination: "/run", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/tmp", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"rw", "rprivate", "noexec", "nosuid", "nodev", "mode=755", "size=64m"}, + }, + { + Destination: "/sys", + Source: "sysfs", + Type: "sysfs", + Options: []string{"noexec", "nosuid", "nodev", "rw"}, + }, + { + Destination: "/sys/fs/cgroup", + Source: "cgroup", + Type: "cgroup", + Options: []string{"noexec", "nosuid", "nodev", "rw"}, + }, + { + Destination: "/proc", + Source: "proc", + Type: "proc", + Options: []string{"noexec", "nosuid", "nodev"}, + }, + { + Destination: "/dev", + Source: "tmpfs", + Type: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/kmsg", + Source: "/dev/null", + Type: "bind", + Options: []string{"rbind", "rprivate"}, + }, + } + + tests := []struct { + name string + args args + }{ + // Test-cases definition + { + name: "No overlapping mounts", + args: args{sysMgrDefault, &specs.Spec{Root: &specs.Root{Readonly: true}, Mounts: mountsUT1}, expectedMountsUT1}, + }, + { + name: "Overlapping ro mount (/run)", + args: args{sysMgrDefault, &specs.Spec{Root: &specs.Root{Readonly: true}, Mounts: mountsUT2}, expectedMountsUT2}, + }, + { + name: "Overlapping rw mount (/tmp)", + args: args{sysMgrDefault, &specs.Spec{Root: &specs.Root{Readonly: 
true}, Mounts: mountsUT3}, expectedMountsUT3}, + }, + { + name: "Relaxed-read-only setup with non-overlapping ro mounts (/sys/test/1)", + args: args{sysMgrRelaxedRO, &specs.Spec{Root: &specs.Root{Readonly: true}, Mounts: mountsUT4}, expectedMountsUT4}, + }, + } + + // Test-cases execution + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + cfgSyscontMountsReadOnly(tt.args.sysMgr, tt.args.spec) + }) + + if !reflect.DeepEqual(tt.args.spec.Mounts, tt.args.expectedMounts) { + t.Errorf("cfgSyscontMountsReadOnly failed: unexpected mounts; got %v, want %v", tt.args.spec.Mounts, tt.args.expectedMounts) + } + } +} diff --git a/sysbox-runc/libsysbox/syscont/syscalls.go b/sysbox-runc/libsysbox/syscont/syscalls.go new file mode 100644 index 00000000..27382590 --- /dev/null +++ b/sysbox-runc/libsysbox/syscont/syscalls.go @@ -0,0 +1,412 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// syscontSyscallWhitelist is the list of syscalls allowed inside a system
// container. It contains the syscalls that Docker's default seccomp profile
// allows, plus a set of syscalls that Docker blocks by default but Sysbox
// must allow for system-container workloads (mount management, keyring,
// hostname configuration, and namespace creation for nested containers).
var syscontSyscallWhitelist = []string{

	// Allowed by Docker's default seccomp profile.
	"accept",
	"accept4",
	"access",
	"adjtimex",
	"alarm",
	"bind",
	"brk",
	"capget",
	"capset",
	"chdir",
	"chmod",
	"chown",
	"chown32",
	"clock_getres",
	"clock_gettime",
	"clock_nanosleep",
	"close",
	"connect",
	"copy_file_range",
	"creat",
	"dup",
	"dup2",
	"dup3",
	"epoll_create",
	"epoll_create1",
	"epoll_ctl",
	"epoll_ctl_old",
	"epoll_pwait",
	"epoll_wait",
	"epoll_wait_old",
	"eventfd",
	"eventfd2",
	"execve",
	"execveat",
	"exit",
	"exit_group",
	"faccessat",
	"faccessat2",
	"fadvise64",
	"fadvise64_64",
	"fallocate",
	"fanotify_mark",
	"fchdir",
	"fchmod",
	"fchmodat",
	"fchown",
	"fchown32",
	"fchownat",
	"fcntl",
	"fcntl64",
	"fdatasync",
	"fgetxattr",
	"flistxattr",
	"flock",
	"fork",
	"fremovexattr",
	"fsetxattr",
	"fstat",
	"fstat64",
	"fstatat64",
	"fstatfs",
	"fstatfs64",
	"fsync",
	"ftruncate",
	"ftruncate64",
	"futex",
	"futimesat",
	"getcpu",
	"getcwd",
	"getdents",
	"getdents64",
	"getegid",
	"getegid32",
	"geteuid",
	"geteuid32",
	"getgid",
	"getgid32",
	"getgroups",
	"getgroups32",
	"getitimer",
	"getpeername",
	"getpgid",
	"getpgrp",
	"getpid",
	"getppid",
	"getpriority",
	"getrandom",
	"getresgid",
	"getresgid32",
	"getresuid",
	"getresuid32",
	"getrlimit",
	"get_robust_list",
	"getrusage",
	"getsid",
	"getsockname",
	"getsockopt",
	"get_thread_area",
	"gettid",
	"gettimeofday",
	"getuid",
	"getuid32",
	"getxattr",
	"inotify_add_watch",
	"inotify_init",
	"inotify_init1",
	"inotify_rm_watch",
	"io_cancel",
	"ioctl",
	"io_destroy",
	"io_getevents",
	"ioprio_get",
	"ioprio_set",
	"io_setup",
	"io_submit",
	"ipc",
	"kill",
	"lchown",
	"lchown32",
	"lgetxattr",
	"link",
	"linkat",
	"listen",
	"listxattr",
	"llistxattr",
	"_llseek",
	"lremovexattr",
	"lseek",
	"lsetxattr",
	"lstat",
	"lstat64",
	"madvise",
	"memfd_create",
	"mincore",
	"mkdir",
	"mkdirat",
	"mknod",
	"mknodat",
	"mlock",
	"mlock2",
	"mlockall",
	"mmap",
	"mmap2",
	"mprotect",
	"mq_getsetattr",
	"mq_notify",
	"mq_open",
	"mq_timedreceive",
	"mq_timedsend",
	"mq_unlink",
	"mremap",
	"msgctl",
	"msgget",
	"msgrcv",
	"msgsnd",
	"msync",
	"munlock",
	"munlockall",
	"munmap",
	"nanosleep",
	"newfstatat",
	"_newselect",
	"open",
	"openat",
	"openat2",
	"pause",
	"pipe",
	"pipe2",
	"poll",
	"ppoll",
	"prctl",
	"pread64",
	"preadv",
	"preadv2",
	"prlimit64",
	"pselect6",
	"pwrite64",
	"pwritev",
	"pwritev2",
	"read",
	"readahead",
	"readlink",
	"readlinkat",
	"readv",
	"recv",
	"recvfrom",
	"recvmmsg",
	"recvmsg",
	"remap_file_pages",
	"removexattr",
	"rename",
	"renameat",
	"renameat2",
	"restart_syscall",
	"rmdir",
	"rt_sigaction",
	"rt_sigpending",
	"rt_sigprocmask",
	"rt_sigqueueinfo",
	"rt_sigreturn",
	"rt_sigsuspend",
	"rt_sigtimedwait",
	"rt_tgsigqueueinfo",
	"sched_getaffinity",
	"sched_getattr",
	"sched_getparam",
	"sched_get_priority_max",
	"sched_get_priority_min",
	"sched_getscheduler",
	"sched_rr_get_interval",
	"sched_setaffinity",
	"sched_setattr",
	"sched_setparam",
	"sched_setscheduler",
	"sched_yield",
	"seccomp",
	"select",
	"semctl",
	"semget",
	"semop",
	"semtimedop",
	"send",
	"sendfile",
	"sendfile64",
	"sendmmsg",
	"sendmsg",
	"sendto",
	"setfsgid",
	"setfsgid32",
	"setfsuid",
	"setfsuid32",
	"setgid",
	"setgid32",
	"setgroups",
	"setgroups32",
	"setitimer",
	"setpgid",
	"setpriority",
	"setregid",
	"setregid32",
	"setresgid",
	"setresgid32",
	"setresuid",
	"setresuid32",
	"setreuid",
	"setreuid32",
	"setrlimit",
	"set_robust_list",
	"setsid",
	"setsockopt",
	"set_thread_area",
	"set_tid_address",
	"setuid",
	"setuid32",
	"setxattr",
	"shmat",
	"shmctl",
	"shmdt",
	"shmget",
	"shutdown",
	"sigaltstack",
	"signalfd",
	"signalfd4",
	"sigreturn",
	"socket",
	"socketcall",
	"socketpair",
	"splice",
	"stat",
	"stat64",
	"statfs",
	"statfs64",
	"statx",
	"symlink",
	"symlinkat",
	"sync",
	"sync_file_range",
	"syncfs",
	"sysinfo",
	"tee",
	"tgkill",
	"time",
	"timer_create",
	"timer_delete",
	"timerfd_create",
	"timerfd_gettime",
	"timerfd_settime",
	"timer_getoverrun",
	"timer_gettime",
	"timer_settime",
	"times",
	"tkill",
	"truncate",
	"truncate64",
	"ugetrlimit",
	"umask",
	"uname",
	"unlink",
	"unlinkat",
	"utime",
	"utimensat",
	"utimes",
	"vfork",
	"vmsplice",
	"wait4",
	"waitid",
	"waitpid",
	"write",
	"writev",

	"personality",
	"arch_prctl",
	"modify_ldt",
	"clone",
	"chroot",

	// Blocked by Docker's default profile; sysbox-runc allows them.
	"mount",
	"umount",
	"umount2",
	"add_key",
	"request_key",
	"keyctl",
	"pivot_root",
	"gethostname",
	"sethostname",
	"close_range",

	// Allow namespace creation inside the system container (for nested containers).
	"setns",
	"unshare",
}

// syscontSyscallAllowRestrList lists the syscalls whose arguments may be
// restricted (via seccomp) while still being allowed.
var syscontSyscallAllowRestrList = []string{
	"personality",
	"socket",
}
For example, when syscontMode=false, mount and +// umount2 are typically blocked by containers managers (e.g., Docker) so they +// won't be trapped; but when syscontMode=true, then the syscontSyscallWhitelist +// above will unblock them and therefore they will be trapped by Sysbox. +var syscallTrapList = []string{ + "mount", + "umount2", +} + +// AddSyscallTraps modifies the given libcontainer config to add seccomp-notification +// actions (for syscall trapping) +func AddSyscallTraps(config *configs.Config) error { + + if config.SeccompNotif != nil { + return fmt.Errorf("conflicting seccomp notification config found.") + } + + if len(syscallTrapList) > 0 { + list := []*configs.Syscall{} + for _, call := range syscallTrapList { + s := &configs.Syscall{ + Name: call, + Action: configs.Notify, + } + list = append(list, s) + } + + config.SeccompNotif = &configs.Seccomp{ + DefaultAction: configs.Allow, + Architectures: []string{"amd64", "x86", "arm64", "arm"}, + Syscalls: list, + } + } + + return nil +} diff --git a/sysbox-runc/libsysbox/syscont/utils.go b/sysbox-runc/libsysbox/syscont/utils.go new file mode 100644 index 00000000..85f24f38 --- /dev/null +++ b/sysbox-runc/libsysbox/syscont/utils.go @@ -0,0 +1,194 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +package syscont + +import ( + "fmt" + "sort" + "strings" + + "github.com/nestybox/sysbox-libs/mount" + "github.com/opencontainers/runtime-spec/specs-go" +) + +// sortMounts sorts the sys container mounts in the given spec. +func sortMounts(spec *specs.Spec, hasSystemd bool) { + + // The OCI spec requires the runtime to honor the ordering on + // mounts in the spec. However, we deviate a bit and always order + // the mounts in the orderList below. + + // First, sort by destination prefix + orderList := map[string]int{ + "/sys": 1, + "/proc": 2, + "/dev": 3, + } + + if hasSystemd { + orderList["/run"] = 4 + } + + sort.SliceStable(spec.Mounts, func(i, j int) bool { + + // for mounts in the sort list, sort them by destination path + d1 := spec.Mounts[i].Destination + d2 := spec.Mounts[j].Destination + + d1Prefix := "" + for prefix := range orderList { + if strings.HasPrefix(d1, prefix) { + d1Prefix = prefix + break + } + } + + d2Prefix := "" + for prefix := range orderList { + if strings.HasPrefix(d2, prefix) { + d2Prefix = prefix + break + } + } + + if d1Prefix != "" && d2Prefix != "" { + if d1Prefix != d2Prefix { + return orderList[d1Prefix] < orderList[d2Prefix] + } else { + return d1 < d2 + } + } else if d1Prefix != "" && d2Prefix == "" { + return true + } else if d1Prefix == "" && d2Prefix != "" { + return false + } + + // for mounts not in the sort list, leave their ordering untouched + return false + }) + + // Now, place all the bind mounts at the end of the mount list (this improves performance + // as it allows us to process the bind mounts in bulk (see rootfs_linux.go)) + sort.SliceStable(spec.Mounts, func(i, j int) bool { + + t1 := spec.Mounts[i].Type + t2 := spec.Mounts[j].Type + + if t1 == "bind" && t2 == "bind" { + + // Among bind mounts, sort them such that a mount that + // depends on another one come after that other one. 
+ if strings.HasPrefix(spec.Mounts[j].Destination, spec.Mounts[i].Destination) { + return true + } + + return false + } + + if t2 == "bind" { + return true + } + + return false + }) + +} + +// sortIDMappings sorts the given ID mappings by container ID (in increasing +// order). If byHostID is true, then the mappings are sorted by host ID instead +// (in increasing order). +func sortIDMappings(idMappings []specs.LinuxIDMapping, byHostID bool) { + + if byHostID { + sort.Slice(idMappings, func(i, j int) bool { + return idMappings[i].HostID < idMappings[j].HostID + }) + } else { + sort.Slice(idMappings, func(i, j int) bool { + return idMappings[i].ContainerID < idMappings[j].ContainerID + }) + } +} + +// mergeIDMappings coallesces the given user-ns ID mappings into a single continuous range. +// If this can't be done (because either the container IDs or host IDs are non-contiguous, +// an error is returned). +func mergeIDMappings(idMappings []specs.LinuxIDMapping) ([]specs.LinuxIDMapping, error) { + + idMappingLen := len(idMappings) + + if idMappingLen < 2 { + return idMappings, nil + } + + sortIDMappings(idMappings, false) + + mergedMapping := specs.LinuxIDMapping{ + ContainerID: idMappings[0].ContainerID, + HostID: idMappings[0].HostID, + Size: idMappings[0].Size, + } + + for i := 1; i < idMappingLen; i++ { + curr := idMappings[i] + prev := idMappings[i-1] + + if curr.ContainerID != (prev.ContainerID + prev.Size) { + return nil, fmt.Errorf("container ID mappings are non-contiguous: %+v", idMappings) + } + if curr.HostID != (prev.HostID + prev.Size) { + return nil, fmt.Errorf("host ID mappings are non-contiguous: %+v", idMappings) + } + + mergedMapping.Size += curr.Size + } + + return []specs.LinuxIDMapping{mergedMapping}, nil +} + +func rootfsCloningRequired(rootfs string) (bool, error) { + + // If the rootfs is on an overlayfs mount, then chown can be very slow unless + // the overlay was mounted with "metacopy=on" (in the order of many seconds + // because it 
triggers a "copy-up" of every file). If metacopy is disabled + // then we need a solution. Note that Docker does not set metacopy=on because + // it breaks container snapshots via "docker commit" or "docker build". + // + // A simple solution would be to add "metacopy=on" to the existing overlayfs + // mount on the rootfs via a remount. However, this is not supported by + // overlayfs. We could unmount and then remount, but the unmount may break + // the container manager that set up the mount. We tried, it did not work + // (Docker/containerd did not like it). + // + // The solution we came up with is to ask the sysbox-mgr to clone the rootfs + // at a separate location, using two stacked overlayfs mounts, one with + // metacopy=on to enable fast chown, the other without it to ensure container + // snapshots work properly. Once the rootfs is cloned, we then setup the + // container using this cloned rootfs. + + mounts, err := mount.GetMounts() + if err != nil { + return false, err + } + + mi, err := mount.GetMountAt(rootfs, mounts) + if err == nil && mi.Fstype == "overlay" && !strings.Contains(mi.Opts, "metacopy=on") { + return true, nil + } + + return false, nil +} diff --git a/sysbox-runc/libsysbox/syscont/utils_test.go b/sysbox-runc/libsysbox/syscont/utils_test.go new file mode 100644 index 00000000..a6a324ff --- /dev/null +++ b/sysbox-runc/libsysbox/syscont/utils_test.go @@ -0,0 +1,173 @@ +// +// Copyright 2019-2020 Nestybox, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +package syscont + +import ( + "testing" + + utils "github.com/nestybox/sysbox-libs/utils" + "github.com/opencontainers/runtime-spec/specs-go" +) + +func equalIDMappings(a, b []specs.LinuxIDMapping) bool { + if len(a) != len(b) { + return false + } + + for i := 0; i < len(a); i++ { + if a[i] != b[i] { + return false + } + } + + return true +} + +func TestMergeIDMappings(t *testing.T) { + + // test merging of continuous ID mappings + have := []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 1}, + {ContainerID: 1, HostID: 1000001, Size: 2}, + {ContainerID: 3, HostID: 1000003, Size: 65533}, + } + + want := []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 65536}, + } + + got, err := mergeIDMappings(have) + + if err != nil { + t.Errorf("mergeIDMappings(%v) failed with error: %s", have, err) + } else if !equalIDMappings(want, got) { + t.Errorf("mergeIDMappings(%v) failed: got %v, want %v", have, got, want) + } + + // test that merging on non-continuous host ID mappings fails + have = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 1}, + {ContainerID: 1, HostID: 1000002, Size: 65535}, + } + + got, err = mergeIDMappings(have) + + if err == nil { + t.Errorf("mergeIDMappings(%v) passed; expected to fail", have) + } + + // test that merging on non-continuous container ID mappings fails + have = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 1}, + {ContainerID: 2, HostID: 1000001, Size: 65535}, + } + + got, err = mergeIDMappings(have) + + if err == nil { + t.Errorf("mergeIDMappings(%v) passed; expected to fail", have) + } + + // test single mapping + have = []specs.LinuxIDMapping{ + {ContainerID: 0, HostID: 1000000, Size: 65536}, + } + + want = have + got, err = mergeIDMappings(have) + + if err != nil { + t.Errorf("mergeIDMappings(%v) failed with error: %s", have, err) + } else if 
!equalIDMappings(want, got) { + t.Errorf("mergeIDMappings(%v) failed: got %v, want %v", have, got, want) + } + + // test empty mapping + have = []specs.LinuxIDMapping{} + want = have + got, err = mergeIDMappings(have) + + if err != nil { + t.Errorf("mergeIDMappings(%v) failed with error: %s", have, err) + } else if !equalIDMappings(want, got) { + t.Errorf("mergeIDMappings(%v) failed: got %v, want %v", have, got, want) + } +} + +func TestSortMounts(t *testing.T) { + spec := new(specs.Spec) + + spec.Mounts = []specs.Mount{ + {Destination: "/dev", Type: "tmpfs"}, + {Destination: "/proc/swaps", Type: "bind"}, + {Destination: "/proc", Type: "proc"}, + {Destination: "/var/lib/docker/overlay2", Type: "bind"}, + {Destination: "/var/lib/docker", Type: "bind"}, + {Destination: "/var/lib/docker/overlay2/diff", Type: "bind"}, + {Destination: "/tmp/run", Type: "tmpfs"}, + {Destination: "/sys/fs/cgroup", Type: "cgroup"}, + {Destination: "/sys", Type: "sysfs"}, + {Destination: "/tmp/run2", Type: "tmpfs"}, + } + + wantMounts := []specs.Mount{ + {Destination: "/sys", Type: "sysfs"}, + {Destination: "/sys/fs/cgroup", Type: "cgroup"}, + {Destination: "/proc", Type: "proc"}, + {Destination: "/dev", Type: "tmpfs"}, + {Destination: "/tmp/run", Type: "tmpfs"}, + {Destination: "/tmp/run2", Type: "tmpfs"}, + + // bind mounts should be grouped at the end; bind mounts + // dependent on others must be placed after those others. 
+ + {Destination: "/proc/swaps", Type: "bind"}, + {Destination: "/var/lib/docker", Type: "bind"}, + {Destination: "/var/lib/docker/overlay2", Type: "bind"}, + {Destination: "/var/lib/docker/overlay2/diff", Type: "bind"}, + } + + sortMounts(spec, false) + + if !utils.MountSliceEqual(spec.Mounts, wantMounts) { + t.Errorf("sortMounts() failed: got %v, want %v", spec.Mounts, wantMounts) + } +} + +func TestSortMountsSystemd(t *testing.T) { + spec := new(specs.Spec) + + spec.Mounts = []specs.Mount{ + {Destination: "/run/secrets/serviceaccount/token", Type: "bind"}, + {Destination: "/run/lock", Type: "tmpfs"}, + {Destination: "/var/run/secrets/serviceaccount/token", Type: "bind"}, + {Destination: "/run", Type: "tmpfs"}, + } + + wantMounts := []specs.Mount{ + {Destination: "/run", Type: "tmpfs"}, + {Destination: "/run/lock", Type: "tmpfs"}, + {Destination: "/run/secrets/serviceaccount/token", Type: "bind"}, + {Destination: "/var/run/secrets/serviceaccount/token", Type: "bind"}, + } + + sortMounts(spec, true) + + if !utils.MountSliceEqual(spec.Mounts, wantMounts) { + t.Errorf("sortMounts() failed: got %v, want %v", spec.Mounts, wantMounts) + } +} diff --git a/sysbox-runc/list.go b/sysbox-runc/list.go new file mode 100644 index 00000000..8365c919 --- /dev/null +++ b/sysbox-runc/list.go @@ -0,0 +1,177 @@ +//go:build linux +// +build linux + +package main + +import ( + "errors" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "syscall" + "text/tabwriter" + "time" + + "encoding/json" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/user" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/urfave/cli" +) + +const formatOptions = `table or json` + +// containerState represents the platform agnostic pieces relating to a +// running container's status and state +type containerState struct { + // Version is the OCI version for the container + Version string `json:"ociVersion"` + // ID is the container ID + ID string 
`json:"id"` + // InitProcessPid is the init process id in the parent namespace + InitProcessPid int `json:"pid"` + // Status is the current status of the container, running, paused, ... + Status string `json:"status"` + // Bundle is the path on the filesystem to the bundle + Bundle string `json:"bundle"` + // Rootfs is a path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + // Created is the unix timestamp for the creation time of the container in UTC + Created time.Time `json:"created"` + // Annotations is the user defined annotations added to the config. + Annotations map[string]string `json:"annotations,omitempty"` + // The owner of the state directory (the owner of the container). + Owner string `json:"owner"` +} + +var listCommand = cli.Command{ + Name: "list", + Usage: "lists containers started by sysbox-runc with the given root", + ArgsUsage: ` + +Where the given root is specified via the global option "--root" +(default: "/run/runc"). + +EXAMPLE 1: +To list containers created via the default "--root": + # sysbox-runc list + +EXAMPLE 2: +To list containers created using a non-default value for "--root": + # sysbox-runc --root value list`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "format, f", + Value: "table", + Usage: `select one of: ` + formatOptions, + }, + cli.BoolFlag{ + Name: "quiet, q", + Usage: "display only container IDs", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 0, exactArgs); err != nil { + return err + } + s, err := getContainers(context) + if err != nil { + return err + } + + if context.Bool("quiet") { + for _, item := range s { + fmt.Println(item.ID) + } + return nil + } + + switch context.String("format") { + case "table": + w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) + fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") + for _, item := range s { + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", + item.ID, + item.InitProcessPid, + 
item.Status, + item.Bundle, + item.Created.Format(time.RFC3339Nano), + item.Owner) + } + if err := w.Flush(); err != nil { + return err + } + case "json": + if err := json.NewEncoder(os.Stdout).Encode(s); err != nil { + return err + } + default: + return errors.New("invalid format option") + } + return nil + }, +} + +func getContainers(context *cli.Context) ([]containerState, error) { + factory, err := loadFactory(context, nil) + if err != nil { + return nil, err + } + root := context.GlobalString("root") + absRoot, err := filepath.Abs(root) + if err != nil { + return nil, err + } + list, err := ioutil.ReadDir(absRoot) + if err != nil { + fatal(err) + } + + var s []containerState + for _, item := range list { + if item.IsDir() { + // This cast is safe on Linux. + stat := item.Sys().(*syscall.Stat_t) + owner, err := user.LookupUid(int(stat.Uid)) + if err != nil { + owner.Name = fmt.Sprintf("#%d", stat.Uid) + } + + container, err := factory.Load(item.Name()) + if err != nil { + fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err) + continue + } + containerStatus, err := container.Status() + if err != nil { + fmt.Fprintf(os.Stderr, "status for %s: %v\n", item.Name(), err) + continue + } + state, err := container.State() + if err != nil { + fmt.Fprintf(os.Stderr, "state for %s: %v\n", item.Name(), err) + continue + } + pid := state.BaseState.InitProcessPid + if containerStatus == libcontainer.Stopped { + pid = 0 + } + bundle, annotations := utils.Annotations(state.Config.Labels) + s = append(s, containerState{ + Version: state.BaseState.Config.Version, + ID: state.BaseState.ID, + InitProcessPid: pid, + Status: containerStatus.String(), + Bundle: bundle, + Rootfs: state.BaseState.Config.Rootfs, + Created: state.BaseState.Created, + Annotations: annotations, + Owner: owner.Name, + }) + } + } + return s, nil +} diff --git a/sysbox-runc/main.go b/sysbox-runc/main.go new file mode 100644 index 00000000..69b65953 --- /dev/null +++ b/sysbox-runc/main.go @@ -0,0 
+1,185 @@ +package main + +import ( + "fmt" + "io" + "os" + + "github.com/opencontainers/runc/libcontainer/logs" + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +// Globals to be populated at build time during Makefile processing. +var ( + edition string // Sysbox Edition: CE or EE + version string // extracted from VERSION file + commitId string // latest sysbox-runc's git commit-id + builtAt string // build time + builtBy string // build owner +) + +const ( + specConfig = "config.json" + usage = `sysbox-runc + +Nestybox's system container runtime. + +Info: https://github.com/nestybox/sysbox +` +) + +func main() { + app := cli.NewApp() + app.Name = "sysbox-runc" + app.Usage = usage + app.Version = version + + // show-version specialization. + cli.VersionPrinter = func(c *cli.Context) { + fmt.Printf("sysbox-runc\n"+ + "\tedition: \t%s\n"+ + "\tversion: \t%s\n"+ + "\tcommit: \t%s\n"+ + "\tbuilt at: \t%s\n"+ + "\tbuilt by: \t%s\n"+ + "\toci-specs: \t%s\n", + edition, c.App.Version, commitId, builtAt, builtBy, specs.Version) + } + + xdgRuntimeDir := "" + root := "/run/sysbox-runc" + if shouldHonorXDGRuntimeDir() { + if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" { + root = runtimeDir + "/sysbox-runc" + xdgRuntimeDir = root + } + } + + app.Flags = []cli.Flag{ + cli.BoolFlag{ + Name: "debug", + Usage: "enable debug output for logging", + }, + cli.StringFlag{ + Name: "log", + Value: "", + Usage: "set the log file path where internal debug information is written", + }, + cli.StringFlag{ + Name: "log-format", + Value: "text", + Usage: "set the format used by logs ('text' (default), or 'json')", + }, + cli.StringFlag{ + Name: "root", + Value: root, + Usage: "root directory for storage of container state (this should be located in tmpfs)", + }, + cli.BoolFlag{ + Name: "no-sysbox-fs", + Usage: "do not interact with sysbox-fs; meant for testing and debugging.", + }, + cli.BoolFlag{ + Name: 
"no-sysbox-mgr", + Usage: "do not interact with sysbox-mgr; meant for testing and debugging.", + }, + cli.BoolFlag{ + Name: "no-kernel-check", + Usage: "do not check kernel compatibility; meant for testing and debugging.", + }, + cli.BoolFlag{ + Name: "cpu-profiling", + Usage: "enable cpu-profiling data collection; profile data is stored in the cwd of the process invoking sysbox-runc. Ignore the 'cannot set cpu profile rate' message (it's expected).", + Hidden: true, + }, + cli.BoolFlag{ + Name: "memory-profiling", + Usage: "enable memory-profiling data collection; profile data is stored in the cwd of the process invoking sysbox-runc.", + Hidden: true, + }, + cli.BoolFlag{ + Name: "systemd-cgroup", + Usage: "enable systemd cgroup support, expects cgroupsPath to be of form \"slice:prefix:name\" for e.g. \"system.slice:runc:434234\"", + }, + } + + app.Commands = []cli.Command{ + createCommand, + deleteCommand, + eventsCommand, + execCommand, + initCommand, + killCommand, + listCommand, + pauseCommand, + psCommand, + resumeCommand, + runCommand, + specCommand, + startCommand, + stateCommand, + updateCommand, + } + + app.Before = func(context *cli.Context) error { + if !context.IsSet("root") && xdgRuntimeDir != "" { + // According to the XDG specification, we need to set anything in + // XDG_RUNTIME_DIR to have a sticky bit if we don't want it to get + // auto-pruned. + if err := os.MkdirAll(root, 0700); err != nil { + fmt.Fprintln(os.Stderr, "the path in $XDG_RUNTIME_DIR must be writable by the user") + fatal(err) + } + if err := os.Chmod(root, 0700|os.ModeSticky); err != nil { + fmt.Fprintln(os.Stderr, "you should check permission of the path in $XDG_RUNTIME_DIR") + fatal(err) + } + } + if err := reviseRootDir(context); err != nil { + return err + } + return logs.ConfigureLogging(createLogConfig(context)) + } + + // If the command returns an error, cli takes upon itself to print + // the error on cli.ErrWriter and exit.
+ // Use our own writer here to ensure the log gets sent to the right location. + cli.ErrWriter = &FatalWriter{cli.ErrWriter} + if err := app.Run(os.Args); err != nil { + fatal(err) + } +} + +type FatalWriter struct { + cliErrWriter io.Writer +} + +func (f *FatalWriter) Write(p []byte) (n int, err error) { + logrus.Error(string(p)) + if !logrusToStderr() { + return f.cliErrWriter.Write(p) + } + return len(p), nil +} + +func createLogConfig(context *cli.Context) logs.Config { + logFilePath := context.GlobalString("log") + logPipeFd := "" + if logFilePath == "" { + logPipeFd = "2" + } + config := logs.Config{ + LogPipeFd: logPipeFd, + LogLevel: logrus.InfoLevel, + LogFilePath: logFilePath, + LogFormat: context.GlobalString("log-format"), + } + if context.GlobalBool("debug") { + config.LogLevel = logrus.DebugLevel + } + + return config +} diff --git a/sysbox-runc/man/README.md b/sysbox-runc/man/README.md new file mode 100644 index 00000000..1d7a54fa --- /dev/null +++ b/sysbox-runc/man/README.md @@ -0,0 +1,11 @@ +runc man pages +==================== + +This directory contains man pages for runc in markdown format. + +To generate man pages from it, use this command + + ./md2man-all.sh + +You will see man pages generated under the man8 directory. + diff --git a/sysbox-runc/man/md2man-all.sh b/sysbox-runc/man/md2man-all.sh new file mode 100755 index 00000000..eaee58ee --- /dev/null +++ b/sysbox-runc/man/md2man-all.sh @@ -0,0 +1,27 @@ +#!/bin/bash +set -e + +# get into this script's directory +cd "$(dirname "$(readlink -f "$BASH_SOURCE")")" + +[ "$1" = '-q' ] || { + set -x + pwd +} + +if ! type go-md2man; then + echo "To install man pages, please install 'go-md2man'." 
+ exit 0 +fi + +for FILE in *.md; do + base="$(basename "$FILE")" + name="${base%.md}" + num="${name##*.}" + if [ -z "$num" -o "$name" = "$num" ]; then + # skip files that aren't of the format xxxx.N.md (like README.md) + continue + fi + mkdir -p "./man${num}" + go-md2man -in "$FILE" -out "./man${num}/${name}" +done diff --git a/sysbox-runc/man/runc-checkpoint.8.md b/sysbox-runc/man/runc-checkpoint.8.md new file mode 100644 index 00000000..08e6b1fa --- /dev/null +++ b/sysbox-runc/man/runc-checkpoint.8.md @@ -0,0 +1,30 @@ +% runc-checkpoint "8" + +# NAME + runc checkpoint - checkpoint a running container + +# SYNOPSIS + runc checkpoint [command options] `` + +Where "``" is the name for the instance of the container to be +checkpointed. + +# DESCRIPTION + The checkpoint command saves the state of the container instance. + +# OPTIONS + --image-path value path for saving criu image files + --work-path value path for saving work files and logs + --parent-path value path for previous criu image files in pre-dump + --leave-running leave the process running after checkpointing + --tcp-established allow open tcp connections + --ext-unix-sk allow external unix sockets + --shell-job allow shell jobs + --lazy-pages use userfaultfd to lazily restore memory pages + --status-fd value criu writes \0 to this FD once lazy-pages is ready + --page-server value ADDRESS:PORT of the page server + --file-locks handle file locks, for safety + --pre-dump dump container's memory information only, leave the container running after this + --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict' + --empty-ns value create a namespace, but don't restore its properties + --auto-dedup enable auto deduplication of memory images diff --git a/sysbox-runc/man/runc-create.8.md b/sysbox-runc/man/runc-create.8.md new file mode 100644 index 00000000..99c0a2c0 --- /dev/null +++ b/sysbox-runc/man/runc-create.8.md @@ -0,0 +1,29 @@ +% runc-create "8" + +# NAME + runc create - create a 
container + +# SYNOPSIS + runc create [command options] `` + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. + +# DESCRIPTION + The create command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "config.json" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation. + +# OPTIONS + --bundle value, -b value path to the root of the bundle directory, defaults to the current directory + --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal + --pid-file value specify the file to write the process id to + --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk + --no-new-keyring do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key + --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) diff --git a/sysbox-runc/man/runc-delete.8.md b/sysbox-runc/man/runc-delete.8.md new file mode 100644 index 00000000..84922a0c --- /dev/null +++ b/sysbox-runc/man/runc-delete.8.md @@ -0,0 +1,19 @@ +% runc-delete "8" + +# NAME + runc delete - delete any resources held by the container often used with detached container + +# SYNOPSIS + runc delete [command options] `` + +Where "``" is the name for the instance of the container. 
+ +# OPTIONS + --force, -f Forcibly deletes the container if it is still running (uses SIGKILL) + +# EXAMPLE +For example, if the container id is "ubuntu01" and runc list currently shows the +status of "ubuntu01" as "stopped" the following will delete resources held for +"ubuntu01" removing "ubuntu01" from the runc list of containers: + + # runc delete ubuntu01 diff --git a/sysbox-runc/man/runc-events.8.md b/sysbox-runc/man/runc-events.8.md new file mode 100644 index 00000000..d998a38e --- /dev/null +++ b/sysbox-runc/man/runc-events.8.md @@ -0,0 +1,17 @@ +% runc-events "8" + +# NAME + runc events - display container events such as OOM notifications, cpu, memory, and IO usage statistics + +# SYNOPSIS + runc events [command options] `` + +Where "``" is the name for the instance of the container. + +# DESCRIPTION + The events command displays information about the container. By default the +information is displayed once every 5 seconds. + +# OPTIONS + --interval value set the stats collection interval (default: 5s) + --stats display the container's stats then exit diff --git a/sysbox-runc/man/runc-exec.8.md b/sysbox-runc/man/runc-exec.8.md new file mode 100644 index 00000000..dbaaefec --- /dev/null +++ b/sysbox-runc/man/runc-exec.8.md @@ -0,0 +1,33 @@ +% runc-exec "8" + +# NAME + runc exec - execute new process inside the container + +# SYNOPSIS + runc exec [command options] `` -- `` [args...] + +Where "``" is the name for the instance of the container and +"``" is the command to be executed in the container. 
+ +# EXAMPLE +For example, if the container is configured to run the linux ps command the +following will output a list of processes running in the container: + + # runc exec <container-id> ps + +# OPTIONS + --console value specify the pty slave path for use with the container + --cwd value current working directory in the container + --env value, -e value set environment variables + --tty, -t allocate a pseudo-TTY + --user value, -u value UID (format: <uid>[:<gid>]) + --additional-gids value, -g value additional gids + --process value, -p value path to the process.json + --detach, -d detach from the container's process + --pid-file value specify the file to write the process id to + --process-label value set the asm process label for the process commonly used with selinux + --apparmor value set the apparmor profile for the process + --no-new-privs set the no new privileges value for the process + --cap value, -c value add a capability to the bounding set for the process + --no-subreaper disable the use of the subreaper used to reap reparented processes + --preserve-fds value pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) diff --git a/sysbox-runc/man/runc-kill.8.md b/sysbox-runc/man/runc-kill.8.md new file mode 100644 index 00000000..1ea579a7 --- /dev/null +++ b/sysbox-runc/man/runc-kill.8.md @@ -0,0 +1,20 @@ +% runc-kill "8" + +# NAME + runc kill - kill sends the specified signal (default: SIGTERM) to the container's init process + +# SYNOPSIS + runc kill [command options] `<container-id>` `<signal>` + +Where "`<container-id>`" is the name for the instance of the container and +"`<signal>`" is the signal to be sent to the init process.
+ +# OPTIONS + --all, -a send the specified signal to all processes inside the container + +# EXAMPLE + +For example, if the container id is "ubuntu01" the following will send a "KILL" +signal to the init process of the "ubuntu01" container: + + # runc kill ubuntu01 KILL diff --git a/sysbox-runc/man/runc-list.8.md b/sysbox-runc/man/runc-list.8.md new file mode 100644 index 00000000..46cd5d05 --- /dev/null +++ b/sysbox-runc/man/runc-list.8.md @@ -0,0 +1,21 @@ +% runc-list "8" + +# NAME + runc list - lists containers started by runc with the given root + +# SYNOPSIS + runc list [command options] + +# EXAMPLE +Where the given root is specified via the global option "--root" +(default: "/run/runc"). + +To list containers created via the default "--root": + # runc list + +To list containers created using a non-default value for "--root": + # runc --root value list + +# OPTIONS + --format value, -f value select one of: table or json (default: "table") + --quiet, -q display only container IDs diff --git a/sysbox-runc/man/runc-pause.8.md b/sysbox-runc/man/runc-pause.8.md new file mode 100644 index 00000000..965f7dad --- /dev/null +++ b/sysbox-runc/man/runc-pause.8.md @@ -0,0 +1,14 @@ +% runc-pause "8" + +# NAME + runc pause - pause suspends all processes inside the container + +# SYNOPSIS + runc pause `` + +Where "``" is the name for the instance of the container to be +paused. + +# DESCRIPTION + The pause command suspends all processes in the instance of the container. +Use runc list to identify instances of containers and their current status. 
diff --git a/sysbox-runc/man/runc-ps.8.md b/sysbox-runc/man/runc-ps.8.md new file mode 100644 index 00000000..1fad4674 --- /dev/null +++ b/sysbox-runc/man/runc-ps.8.md @@ -0,0 +1,15 @@ +% runc-ps "8" + +# NAME + runc ps - ps displays the processes running inside a container + +# SYNOPSIS + runc ps [command options] `` [ps options] + +# OPTIONS + --format value, -f value select one of: table(default) or json + +The default format is table. The following will output the processes of a container +in json format: + + # runc ps -f json diff --git a/sysbox-runc/man/runc-restore.8.md b/sysbox-runc/man/runc-restore.8.md new file mode 100644 index 00000000..e475bd57 --- /dev/null +++ b/sysbox-runc/man/runc-restore.8.md @@ -0,0 +1,28 @@ +% runc-restore "8" + +# NAME + runc restore - restore a container from a previous checkpoint + +# SYNOPSIS + runc restore [command options] `` + +Where "``" is the name for the instance of the container to be +restored. + +# DESCRIPTION + Restores the saved state of the container instance that was previously saved +using the runc checkpoint command. + +# OPTIONS + --image-path value path to criu image files for restoring + --work-path value path for saving work files and logs + --tcp-established allow open tcp connections + --ext-unix-sk allow external unix sockets + --shell-job allow shell jobs + --file-locks handle file locks, for safety + --manage-cgroups-mode value cgroups mode: 'soft' (default), 'full' and 'strict' + --bundle value, -b value path to the root of the bundle directory + --detach, -d detach from the container's process + --pid-file value specify the file to write the process id to + --no-subreaper disable the use of the subreaper used to reap reparented processes + --no-pivot do not use pivot root to jail process inside rootfs. 
This should be used whenever the rootfs is on top of a ramdisk diff --git a/sysbox-runc/man/runc-resume.8.md b/sysbox-runc/man/runc-resume.8.md new file mode 100644 index 00000000..25d342f9 --- /dev/null +++ b/sysbox-runc/man/runc-resume.8.md @@ -0,0 +1,14 @@ +% runc-resume "8" + +# NAME + runc resume - resumes all processes that have been previously paused + +# SYNOPSIS + runc resume `` + +Where "``" is the name for the instance of the container to be +resumed. + +# DESCRIPTION + The resume command resumes all processes in the instance of the container. +Use runc list to identify instances of containers and their current status. diff --git a/sysbox-runc/man/runc-run.8.md b/sysbox-runc/man/runc-run.8.md new file mode 100644 index 00000000..c39a112a --- /dev/null +++ b/sysbox-runc/man/runc-run.8.md @@ -0,0 +1,32 @@ +% runc-run "8" + +# NAME + runc run - create and run a container + +# SYNOPSIS + runc run [command options] `` + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. + +# DESCRIPTION + The run command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "config.json" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. See +"runc spec --help" for more explanation. + +# OPTIONS + --bundle value, -b value path to the root of the bundle directory, defaults to the current directory + --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal + --detach, -d detach from the container's process + --keep keep the container's state directory and cgroup. 
This can be helpful if a user wants to check the state (e.g., of cgroup controllers) after the container has exited. If this option is used, a manual **runc delete** is needed afterwards to clean the exited container's artifacts. + --pid-file value specify the file to write the process id to + --no-subreaper disable the use of the subreaper used to reap reparented processes + --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk + --no-new-keyring do not create a new session keyring for the container. This will cause the container to inherit the calling processes session key + --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) diff --git a/sysbox-runc/man/runc-spec.8.md b/sysbox-runc/man/runc-spec.8.md new file mode 100644 index 00000000..6a181cde --- /dev/null +++ b/sysbox-runc/man/runc-spec.8.md @@ -0,0 +1,56 @@ +% runc-spec "8" + +# NAME + runc spec - create a new specification file + +# SYNOPSIS + runc spec [command options] [arguments...] + +# DESCRIPTION + The spec command creates the new specification file named "config.json" for +the bundle. + +The spec generated is just a starter file. Editing of the spec is required to +achieve desired results. For example, the newly generated spec includes an args +parameter that is initially set to call the "sh" command when the container is +started. Calling "sh" may work for an ubuntu container or busybox, but will not +work for containers that do not include the "sh" program. + +# EXAMPLE + To run docker's hello-world container one needs to set the args parameter +in the spec to call hello. This can be done using the sed command or a text +editor. 
The following commands create a bundle for hello-world, change the +default args parameter in the spec from "sh" to "/hello", then run the hello +command in a new hello-world container named container1: + + mkdir hello + cd hello + docker pull hello-world + docker export $(docker create hello-world) > hello-world.tar + mkdir rootfs + tar -C rootfs -xf hello-world.tar + runc spec + sed -i 's;"sh";"/hello";' config.json + runc start container1 + +In the start command above, "container1" is the name for the instance of the +container that you are starting. The name you provide for the container instance +must be unique on your host. + +An alternative for generating a customized spec config is to use "oci-runtime-tool", the +sub-command "oci-runtime-tool generate" has lots of options that can be used to do any +customizations as you want, see [runtime-tools](https://github.com/opencontainers/runtime-tools) +to get more information. + +When starting a container through runc, runc needs root privilege. If not +already running as root, you can use sudo to give runc root privilege. For +example: "sudo runc start container1" will give runc root privilege to start the +container on your host. + +Alternatively, you can start a rootless container, which has the ability to run without root privileges. +For this to work, the specification file needs to be adjusted accordingly. +You can pass the parameter **--rootless** to this command to generate a proper rootless spec file. 
+ +# OPTIONS + --bundle value, -b value path to the root of the bundle directory + --rootless generate a configuration for a rootless container diff --git a/sysbox-runc/man/runc-start.8.md b/sysbox-runc/man/runc-start.8.md new file mode 100644 index 00000000..e4bbacc3 --- /dev/null +++ b/sysbox-runc/man/runc-start.8.md @@ -0,0 +1,14 @@ +% runc-start "8" + +# NAME + runc start - start executes the user defined process in a created container + +# SYNOPSIS + runc start `` + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. + +# DESCRIPTION + The start command executes the user defined process in a created container. diff --git a/sysbox-runc/man/runc-state.8.md b/sysbox-runc/man/runc-state.8.md new file mode 100644 index 00000000..768f79f8 --- /dev/null +++ b/sysbox-runc/man/runc-state.8.md @@ -0,0 +1,13 @@ +% runc-state "8" + +# NAME + runc state - output the state of a container + +# SYNOPSIS + runc state `` + +Where "``" is your name for the instance of the container. + +# DESCRIPTION + The state command outputs current state information for the +instance of a container. 
diff --git a/sysbox-runc/man/runc-update.8.md b/sysbox-runc/man/runc-update.8.md new file mode 100644 index 00000000..5c02e451 --- /dev/null +++ b/sysbox-runc/man/runc-update.8.md @@ -0,0 +1,53 @@ +% runc-update "8" + +# NAME + runc update - update container resource constraints + +# SYNOPSIS + runc update [command options] `` + +# DESCRIPTION + The data can be read from a file or the standard input, the +accepted format is as follow (unchanged values can be omitted): + + { + "memory": { + "limit": 0, + "reservation": 0, + "swap": 0, + "kernel": 0, + "kernelTCP": 0 + }, + "cpu": { + "shares": 0, + "quota": 0, + "period": 0, + "realtimeRuntime": 0, + "realtimePeriod": 0, + "cpus": "", + "mems": "" + }, + "blockIO": { + "blkioWeight": 0 + } + } + +Note: if data is to be read from a file or the standard input, all +other options are ignored. + +# OPTIONS + --resources value, -r value path to the file containing the resources to update or '-' to read from the standard input + --blkio-weight value Specifies per cgroup weight, range is from 10 to 1000 (default: 0) + --cpu-period value CPU CFS period to be used for hardcapping (in usecs). 0 to use system default + --cpu-quota value CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period + --cpu-rt-period value CPU realtime period to be used for hardcapping (in usecs). 0 to use system default + --cpu-rt-runtime value CPU realtime hardcap limit (in usecs). Allowed cpu time in a given period + --cpu-share value CPU shares (relative weight vs. 
other containers) + --cpuset-cpus value CPU(s) to use + --cpuset-mems value Memory node(s) to use + --memory value Memory limit (in bytes) + --memory-reservation value Memory reservation or soft_limit (in bytes) + --memory-swap value Total memory usage (memory + swap); set '-1' to enable unlimited swap + --pids-limit value Maximum number of pids allowed in the container (default: 0) + --l3-cache-schema The string of Intel RDT/CAT L3 cache schema + --mem-bw-schema The string of Intel RDT/MBA memory bandwidth schema diff --git a/sysbox-runc/man/runc.8.md b/sysbox-runc/man/runc.8.md new file mode 100644 index 00000000..49df5254 --- /dev/null +++ b/sysbox-runc/man/runc.8.md @@ -0,0 +1,61 @@ +% runc "8" + +# NAME + runc - Open Container Initiative runtime + +# SYNOPSIS + runc [global options] command [command options] [arguments...] + +# DESCRIPTION +runc is a command line client for running applications packaged according to +the Open Container Initiative (OCI) format and is a compliant implementation of the +Open Container Initiative specification. + +runc integrates well with existing process supervisors to provide a production +container runtime environment for applications. It can be used with your +existing process monitoring tools and the container will be spawned as a +direct child of the process supervisor. + +Containers are configured using bundles. A bundle for a container is a directory +that includes a specification file named "config.json" and a root filesystem. +The root filesystem contains the contents of the container. + +To start a new instance of a container: + + # runc start [ -b bundle ] + +Where "``" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host. Providing the bundle directory using "-b" is optional. The default +value for "bundle" is the current directory. 
+ +# COMMANDS + checkpoint checkpoint a running container + create create a container + delete delete any resources held by the container often used with detached containers + events display container events such as OOM notifications, cpu, memory, IO and network stats + exec execute new process inside the container + init initialize the namespaces and launch the process (do not call it outside of runc) + kill kill sends the specified signal (default: SIGTERM) to the container's init process + list lists containers started by runc with the given root + pause pause suspends all processes inside the container + ps displays the processes running inside a container + restore restore a container from a previous checkpoint + resume resumes all processes that have been previously paused + run create and run a container + spec create a new specification file + start executes the user defined process in a created container + state output the state of a container + update update container resource constraints + help, h Shows a list of commands or help for one command + +# GLOBAL OPTIONS + --debug enable debug output for logging + --log value set the log file path where internal debug information is written (default: "/dev/null") + --log-format value set the format used by logs ('text' (default), or 'json') (default: "text") + --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/runc" or $XDG_RUNTIME_DIR/runc for rootless containers) + --criu value path to the criu binary used for checkpoint and restore (default: "criu") + --systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. 
"system.slice:runc:434234" + --rootless value enable rootless mode ('true', 'false', or 'auto') (default: "auto") + --help, -h show help + --version, -v print the version diff --git a/sysbox-runc/notify_socket.go b/sysbox-runc/notify_socket.go new file mode 100644 index 00000000..bb1dad77 --- /dev/null +++ b/sysbox-runc/notify_socket.go @@ -0,0 +1,172 @@ +// +build linux + +package main + +import ( + "bytes" + "net" + "os" + "path" + "path/filepath" + "strconv" + "time" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +type notifySocket struct { + socket *net.UnixConn + host string + socketPath string +} + +func newNotifySocket(context *cli.Context, notifySocketHost string, id string) *notifySocket { + if notifySocketHost == "" { + return nil + } + + root := filepath.Join(context.GlobalString("root"), id) + socketPath := filepath.Join(root, "notify", "notify.sock") + + notifySocket := ¬ifySocket{ + socket: nil, + host: notifySocketHost, + socketPath: socketPath, + } + + return notifySocket +} + +func (s *notifySocket) Close() error { + return s.socket.Close() +} + +// If systemd is supporting sd_notify protocol, this function will add support +// for sd_notify protocol from within the container. 
+func (s *notifySocket) setupSpec(context *cli.Context, spec *specs.Spec) error { + pathInContainer := filepath.Join("/run/notify", path.Base(s.socketPath)) + mount := specs.Mount{ + Destination: path.Dir(pathInContainer), + Source: path.Dir(s.socketPath), + Options: []string{"bind", "nosuid", "noexec", "nodev", "ro"}, + } + spec.Mounts = append(spec.Mounts, mount) + spec.Process.Env = append(spec.Process.Env, "NOTIFY_SOCKET="+pathInContainer) + return nil +} + +func (s *notifySocket) bindSocket() error { + addr := net.UnixAddr{ + Name: s.socketPath, + Net: "unixgram", + } + + socket, err := net.ListenUnixgram("unixgram", &addr) + if err != nil { + return err + } + + err = os.Chmod(s.socketPath, 0777) + if err != nil { + socket.Close() + return err + } + + s.socket = socket + return nil +} + +func (s *notifySocket) setupSocketDirectory() error { + return os.Mkdir(path.Dir(s.socketPath), 0755) +} + +func notifySocketStart(context *cli.Context, notifySocketHost, id string) (*notifySocket, error) { + notifySocket := newNotifySocket(context, notifySocketHost, id) + if notifySocket == nil { + return nil, nil + } + + if err := notifySocket.bindSocket(); err != nil { + return nil, err + } + return notifySocket, nil +} + +func (n *notifySocket) waitForContainer(container libcontainer.Container) error { + s, err := container.State() + if err != nil { + return err + } + return n.run(s.InitProcessPid) +} + +func (n *notifySocket) run(pid1 int) error { + if n.socket == nil { + return nil + } + notifySocketHostAddr := net.UnixAddr{Name: n.host, Net: "unixgram"} + client, err := net.DialUnix("unixgram", nil, ¬ifySocketHostAddr) + if err != nil { + return err + } + + ticker := time.NewTicker(time.Millisecond * 100) + defer ticker.Stop() + + fileChan := make(chan []byte) + go func() { + for { + buf := make([]byte, 4096) + r, err := n.socket.Read(buf) + if err != nil { + return + } + got := buf[0:r] + // systemd-ready sends a single datagram with the state string as payload, + // 
so we don't need to worry about partial messages. + for _, line := range bytes.Split(got, []byte{'\n'}) { + if bytes.HasPrefix(got, []byte("READY=")) { + fileChan <- line + return + } + } + + } + }() + + for { + select { + case <-ticker.C: + _, err := os.Stat(filepath.Join("/proc", strconv.Itoa(pid1))) + if err != nil { + return nil + } + case b := <-fileChan: + var out bytes.Buffer + _, err = out.Write(b) + if err != nil { + return err + } + + _, err = out.Write([]byte{'\n'}) + if err != nil { + return err + } + + _, err = client.Write(out.Bytes()) + if err != nil { + return err + } + + // now we can inform systemd to use pid1 as the pid to monitor + newPid := "MAINPID=" + strconv.Itoa(pid1) + _, err := client.Write([]byte(newPid + "\n")) + if err != nil { + return err + } + return nil + } + } +} diff --git a/sysbox-runc/pause.go b/sysbox-runc/pause.go new file mode 100644 index 00000000..a21f5cb1 --- /dev/null +++ b/sysbox-runc/pause.go @@ -0,0 +1,66 @@ +// +build linux + +package main + +import ( + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var pauseCommand = cli.Command{ + Name: "pause", + Usage: "pause suspends all processes inside the container", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +paused. `, + Description: `The pause command suspends all processes in the instance of the container. 
+ +Use sysbox-runc list to identify instances of containers and their current status.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return err + } + if rootlessCg { + logrus.Warnf("runc pause may fail if you don't have the full access to cgroups") + } + container, err := getContainer(context) + if err != nil { + return err + } + return container.Pause() + }, +} + +var resumeCommand = cli.Command{ + Name: "resume", + Usage: "resumes all processes that have been previously paused", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +resumed.`, + Description: `The resume command resumes all processes in the instance of the container. + +Use sysbox-runc list to identify instances of containers and their current status.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return err + } + if rootlessCg { + logrus.Warn("runc resume may fail if you don't have the full access to cgroups") + } + container, err := getContainer(context) + if err != nil { + return err + } + return container.Resume() + }, +} diff --git a/sysbox-runc/profile.go b/sysbox-runc/profile.go new file mode 100644 index 00000000..6bc102fd --- /dev/null +++ b/sysbox-runc/profile.go @@ -0,0 +1,61 @@ +package main + +import ( + "fmt" + "runtime" + + "github.com/pkg/profile" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +// Run cpu / memory profiling collection. +func runProfiler(ctx *cli.Context) (interface{ Stop() }, error) { + + var prof interface{ Stop() } + + cpuProfOn := ctx.GlobalBool("cpu-profiling") + memProfOn := ctx.GlobalBool("memory-profiling") + + // Typical (i.e., non-profiling) case. 
+ if !cpuProfOn && !memProfOn { + return nil, nil + } + + // Cpu and Memory profiling options seem to be mutually exclusive in pprof. + if cpuProfOn && memProfOn { + return nil, fmt.Errorf("Unsupported parameter combination: cpu and memory profiling") + } + + if cpuProfOn { + + // set the profiler's sampling rate at twice the usual to get a + // more accurate result (sysbox-runc executes quickly). + // + // Note: this may result in the following error message when + // running sysbox-runc with profiling enabled: "runtime: cannot + // set cpu profile rate until previous profile has finished." + // We can ignore it; it occurs because profile.Start() invokes + // pprof.go which calls SetCPUProfileRate() again. Since we have + // already set the value, the one from pprof will be ignored. + runtime.SetCPUProfileRate(200) + + prof = profile.Start( + profile.Quiet, + profile.CPUProfile, + profile.ProfilePath("."), + ) + logrus.Info("Initiated cpu-profiling data collection.") + } + + if memProfOn { + prof = profile.Start( + profile.Quiet, + profile.MemProfile, + profile.ProfilePath("."), + ) + logrus.Info("Initiated memory-profiling data collection.") + } + + return prof, nil +} diff --git a/sysbox-runc/ps.go b/sysbox-runc/ps.go new file mode 100644 index 00000000..ffe00f37 --- /dev/null +++ b/sysbox-runc/ps.go @@ -0,0 +1,114 @@ +// +build linux + +package main + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "os/exec" + "strconv" + "strings" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var psCommand = cli.Command{ + Name: "ps", + Usage: "ps displays the processes running inside a container", + ArgsUsage: ` [ps options]`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "format, f", + Value: "table", + Usage: `select one of: ` + formatOptions, + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, minArgs); err != nil { + return err + } + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { 
+ return err + } + if rootlessCg { + logrus.Warn("runc ps may fail if you don't have the full access to cgroups") + } + + container, err := getContainer(context) + if err != nil { + return err + } + + pids, err := container.Processes() + if err != nil { + return err + } + + switch context.String("format") { + case "table": + case "json": + return json.NewEncoder(os.Stdout).Encode(pids) + default: + return errors.New("invalid format option") + } + + // [1:] is to remove command name, ex: + // context.Args(): [container_id ps_arg1 ps_arg2 ...] + // psArgs: [ps_arg1 ps_arg2 ...] + // + psArgs := context.Args()[1:] + if len(psArgs) == 0 { + psArgs = []string{"-ef"} + } + + cmd := exec.Command("ps", psArgs...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("%s: %s", err, output) + } + + lines := strings.Split(string(output), "\n") + pidIndex, err := getPidIndex(lines[0]) + if err != nil { + return err + } + + fmt.Println(lines[0]) + for _, line := range lines[1:] { + if len(line) == 0 { + continue + } + fields := strings.Fields(line) + p, err := strconv.Atoi(fields[pidIndex]) + if err != nil { + return fmt.Errorf("unexpected pid '%s': %s", fields[pidIndex], err) + } + + for _, pid := range pids { + if pid == p { + fmt.Println(line) + break + } + } + } + return nil + }, + SkipArgReorder: true, +} + +func getPidIndex(title string) (int, error) { + titles := strings.Fields(title) + + pidIndex := -1 + for i, name := range titles { + if name == "PID" { + return i, nil + } + } + + return pidIndex, errors.New("couldn't find PID field in ps output") +} diff --git a/sysbox-runc/restore.go b/sysbox-runc/restore.go new file mode 100644 index 00000000..02beca1a --- /dev/null +++ b/sysbox-runc/restore.go @@ -0,0 +1,185 @@ +//go:build linux +// +build linux + +package main + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libsysbox/sysbox" + 
"github.com/opencontainers/runc/libsysbox/syscont" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +var restoreCommand = cli.Command{ + Name: "restore", + Usage: "restore a system container from a previous checkpoint", + ArgsUsage: ` + +Where "" is the name for the instance of the container to be +restored.`, + Description: `Restores the saved state of the container instance that was previously saved +using the sysbox-runc checkpoint command.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "console-socket", + Value: "", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.StringFlag{ + Name: "image-path", + Value: "", + Usage: "path to criu image files for restoring", + }, + cli.StringFlag{ + Name: "work-path", + Value: "", + Usage: "path for saving work files and logs", + }, + cli.BoolFlag{ + Name: "tcp-established", + Usage: "allow open tcp connections", + }, + cli.BoolFlag{ + Name: "ext-unix-sk", + Usage: "allow external unix sockets", + }, + cli.BoolFlag{ + Name: "shell-job", + Usage: "allow shell jobs", + }, + cli.BoolFlag{ + Name: "file-locks", + Usage: "handle file locks, for safety", + }, + cli.StringFlag{ + Name: "manage-cgroups-mode", + Value: "", + Usage: "cgroups mode: 'soft' (default), 'full' and 'strict'", + }, + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: "path to the root of the bundle directory", + }, + cli.BoolFlag{ + Name: "detach,d", + Usage: "detach from the container's process", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.BoolFlag{ + Name: "no-subreaper", + Usage: "disable the use of the subreaper used to reap reparented processes", + }, + cli.BoolFlag{ + Name: "no-pivot", + Usage: "do not use pivot root to jail process inside rootfs. 
This should be used whenever the rootfs is on top of a ramdisk", + }, + cli.StringSliceFlag{ + Name: "empty-ns", + Usage: "create a namespace, but don't restore its properties", + }, + cli.BoolFlag{ + Name: "auto-dedup", + Usage: "enable auto deduplication of memory images", + }, + cli.BoolFlag{ + Name: "lazy-pages", + Usage: "use userfaultfd to lazily restore memory pages", + }, + }, + Action: func(context *cli.Context) error { + var ( + err error + spec *specs.Spec + status int + ) + + if err = checkArgs(context, 1, exactArgs); err != nil { + return err + } + // XXX: Currently this is untested with rootless containers. + if os.Geteuid() != 0 || system.RunningInUserNS() { + logrus.Warn("sysbox-runc restore is untested") + } + + spec, err = setupSpec(context) + if err != nil { + return err + } + + id := context.Args().First() + + withMgr := !context.GlobalBool("no-sysbox-mgr") + withFs := !context.GlobalBool("no-sysbox-fs") + + sysbox := sysbox.NewSysbox(id, withMgr, withFs) + + // register with sysMgr (registration with sysFs occurs later (within libcontainer)) + if sysbox.Mgr.Enabled() { + if err = sysbox.Mgr.Register(spec); err != nil { + return err + } + defer func() { + if err != nil { + sysbox.Mgr.Unregister() + } + }() + } + + // Get sysbox-fs related configs + if sysbox.Fs.Enabled() { + if err = sysbox.Fs.GetConfig(); err != nil { + return err + } + } + + if err = syscont.ConvertSpec(context, spec, sysbox); err != nil { + return fmt.Errorf("error in the container spec: %v", err) + } + + options := criuOptions(context) + if err = setEmptyNsMask(context, options); err != nil { + return err + } + status, err = startContainer(context, spec, CT_ACT_RESTORE, options, sysbox) + if err != nil { + sysbox.Fs.Unregister() + return err + } + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. 
+ os.Exit(status) + return nil + }, +} + +func criuOptions(context *cli.Context) *libcontainer.CriuOpts { + imagePath := getCheckpointImagePath(context) + if err := os.MkdirAll(imagePath, 0755); err != nil { + fatal(err) + } + return &libcontainer.CriuOpts{ + ImagesDirectory: imagePath, + WorkDirectory: context.String("work-path"), + ParentImage: context.String("parent-path"), + LeaveRunning: context.Bool("leave-running"), + TcpEstablished: context.Bool("tcp-established"), + ExternalUnixConnections: context.Bool("ext-unix-sk"), + ShellJob: context.Bool("shell-job"), + FileLocks: context.Bool("file-locks"), + PreDump: context.Bool("pre-dump"), + AutoDedup: context.Bool("auto-dedup"), + LazyPages: context.Bool("lazy-pages"), + StatusFd: context.Int("status-fd"), + } +} diff --git a/sysbox-runc/rlimit_linux.go b/sysbox-runc/rlimit_linux.go new file mode 100644 index 00000000..c9462951 --- /dev/null +++ b/sysbox-runc/rlimit_linux.go @@ -0,0 +1,31 @@ +package main + +import "fmt" +import "golang.org/x/sys/unix" + +var rlimitMap = map[string]int{ + "RLIMIT_CPU": unix.RLIMIT_CPU, + "RLIMIT_FSIZE": unix.RLIMIT_FSIZE, + "RLIMIT_DATA": unix.RLIMIT_DATA, + "RLIMIT_STACK": unix.RLIMIT_STACK, + "RLIMIT_CORE": unix.RLIMIT_CORE, + "RLIMIT_RSS": unix.RLIMIT_RSS, + "RLIMIT_NPROC": unix.RLIMIT_NPROC, + "RLIMIT_NOFILE": unix.RLIMIT_NOFILE, + "RLIMIT_MEMLOCK": unix.RLIMIT_MEMLOCK, + "RLIMIT_AS": unix.RLIMIT_AS, + "RLIMIT_LOCKS": unix.RLIMIT_LOCKS, + "RLIMIT_SIGPENDING": unix.RLIMIT_SIGPENDING, + "RLIMIT_MSGQUEUE": unix.RLIMIT_MSGQUEUE, + "RLIMIT_NICE": unix.RLIMIT_NICE, + "RLIMIT_RTPRIO": unix.RLIMIT_RTPRIO, + "RLIMIT_RTTIME": unix.RLIMIT_RTTIME, +} + +func strToRlimit(key string) (int, error) { + rl, ok := rlimitMap[key] + if !ok { + return 0, fmt.Errorf("wrong rlimit value: %s", key) + } + return rl, nil +} diff --git a/sysbox-runc/rootless_linux.go b/sysbox-runc/rootless_linux.go new file mode 100644 index 00000000..348f7c4f --- /dev/null +++ b/sysbox-runc/rootless_linux.go @@ -0,0 
+1,73 @@ +// +build linux + +package main + +import ( + "os" + + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +func shouldUseRootlessCgroupManager(context *cli.Context) (bool, error) { + if context != nil { + b, err := parseBoolOrAuto(context.GlobalString("rootless")) + if err != nil { + return false, err + } + // nil b stands for "auto detect" + if b != nil { + return *b, nil + } + } + if os.Geteuid() != 0 { + return true, nil + } + if !system.RunningInUserNS() { + // euid == 0 , in the initial ns (i.e. the real root) + return false, nil + } + // euid = 0, in a userns. + // + // [systemd driver] + // We can call DetectUID() to parse the OwnerUID value from `busctl --user --no-pager status` result. + // The value corresponds to sd_bus_creds_get_owner_uid(3). + // If the value is 0, we have rootful systemd inside userns, so we do not need the rootless cgroup manager. + // + // On error, we assume we are root. An error may happen during shelling out to `busctl` CLI, + // mostly when $DBUS_SESSION_BUS_ADDRESS is unset. + if context.GlobalBool("systemd-cgroup") { + ownerUID, err := systemd.DetectUID() + if err != nil { + logrus.WithError(err).Debug("failed to get the OwnerUID value, assuming the value to be 0") + ownerUID = 0 + } + return ownerUID != 0, nil + } + // [cgroupfs driver] + // As we are unaware of cgroups path, we can't determine whether we have the full + // access to the cgroups path. + // Either way, we can safely decide to use the rootless cgroups manager. + return true, nil +} + +func shouldHonorXDGRuntimeDir() bool { + if os.Getenv("XDG_RUNTIME_DIR") == "" { + return false + } + if os.Geteuid() != 0 { + return true + } + if !system.RunningInUserNS() { + // euid == 0 , in the initial ns (i.e. the real root) + // in this case, we should use /run/runc and ignore + // $XDG_RUNTIME_DIR (e.g. 
/run/user/0) for backward + // compatibility. + return false + } + // euid = 0, in a userns. + u, ok := os.LookupEnv("USER") + return !ok || u != "root" +} diff --git a/sysbox-runc/run.go b/sysbox-runc/run.go new file mode 100644 index 00000000..6956a893 --- /dev/null +++ b/sysbox-runc/run.go @@ -0,0 +1,170 @@ +//go:build linux +// +build linux + +package main + +import ( + "fmt" + "os" + + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/opencontainers/runc/libsysbox/syscont" + specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +// default action is to start a container +var runCommand = cli.Command{ + Name: "run", + Usage: "create and run a system container", + ArgsUsage: ` + +Where "" is your name for the instance of the container that you +are starting. The name you provide for the container instance must be unique on +your host.`, + Description: `The run command creates an instance of a container for a bundle. The bundle +is a directory with a specification file named "` + specConfig + `" and a root +filesystem. + +The specification file includes an args parameter. The args parameter is used +to specify command(s) that get run when the container is started. To change the +command(s) that get executed on start, edit the args parameter of the spec. 
See +"runc spec --help" for more explanation.`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: `path to the root of the bundle directory, defaults to the current directory`, + }, + cli.StringFlag{ + Name: "console-socket", + Value: "", + Usage: "path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal", + }, + cli.BoolFlag{ + Name: "detach, d", + Usage: "detach from the container's process", + }, + cli.BoolFlag{ + Name: "keep", + Usage: "do not delete the container after it exits", + }, + cli.StringFlag{ + Name: "pid-file", + Value: "", + Usage: "specify the file to write the process id to", + }, + cli.BoolFlag{ + Name: "no-subreaper", + Usage: "disable the use of the subreaper used to reap reparented processes", + }, + cli.BoolFlag{ + Name: "no-pivot", + Usage: "do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk", + }, + cli.BoolFlag{ + Name: "no-new-keyring", + Usage: "do not create a new session keyring for the container. 
This will cause the container to inherit the calling processes session key", + }, + cli.IntFlag{ + Name: "preserve-fds", + Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", + }, + }, + Action: func(context *cli.Context) error { + var ( + err error + spec *specs.Spec + status int + profiler interface{ Stop() } + ) + + // Enable profiler if requested to do so + profiler, err = runProfiler(context) + if err != nil { + return err + } + + defer func() { + if profiler != nil { + logrus.Info("Stopping profiler ...") + profiler.Stop() + } + }() + + if err = checkArgs(context, 1, exactArgs); err != nil { + return err + } + if err = revisePidFile(context); err != nil { + return err + } + + spec, err = setupSpec(context) + if err != nil { + return err + } + + if err = sysbox.CheckHostConfig(context, spec); err != nil { + return err + } + + id := context.Args().First() + + withMgr := !context.GlobalBool("no-sysbox-mgr") + withFs := !context.GlobalBool("no-sysbox-fs") + + sysbox := sysbox.NewSysbox(id, withMgr, withFs) + + // register with sysbox-mgr + if sysbox.Mgr.Enabled() { + if err = sysbox.Mgr.Register(spec); err != nil { + return err + } + defer func() { + if err != nil { + sysbox.Mgr.Unregister() + } + }() + } + + // Get sysbox-fs related configs + if sysbox.Fs.Enabled() { + if err = sysbox.Fs.GetConfig(); err != nil { + return err + } + } + + if err = syscont.ConvertSpec(context, spec, sysbox); err != nil { + return fmt.Errorf("error in the container spec: %v", err) + } + + // pre-register with sysFs + if sysbox.Fs.Enabled() { + if err = sysbox.Fs.PreRegister(spec.Linux.Namespaces); err != nil { + return err + } + defer func() { + if err != nil { + sysbox.Fs.Unregister() + } + }() + } + + status, err = startContainer(context, spec, CT_ACT_RUN, nil, sysbox) + if err == nil { + + // note: defer func() to stop profiler won't execute on os.Exit(); must explicitly stop it. 
+ if profiler != nil { + logrus.Info("Stopping profiler ...") + profiler.Stop() + } + + // exit with the container's exit status so any external supervisor is + // notified of the exit with the correct exit status. + os.Exit(status) + } + + return err + }, +} diff --git a/sysbox-runc/script/.validate b/sysbox-runc/script/.validate new file mode 100644 index 00000000..170d6747 --- /dev/null +++ b/sysbox-runc/script/.validate @@ -0,0 +1,33 @@ +#!/bin/bash + +if [ -z "$VALIDATE_UPSTREAM" ]; then + # this is kind of an expensive check, so let's not do this twice if we + # are running more than one validate bundlescript + + VALIDATE_REPO='https://github.com/opencontainers/runc.git' + VALIDATE_BRANCH='master' + + if [ "$TRAVIS" = 'true' -a "$TRAVIS_PULL_REQUEST" != 'false' ]; then + VALIDATE_REPO="https://github.com/${TRAVIS_REPO_SLUG}.git" + VALIDATE_BRANCH="${TRAVIS_BRANCH}" + fi + + VALIDATE_HEAD="$(git rev-parse --verify HEAD)" + + git fetch -q "$VALIDATE_REPO" "refs/heads/$VALIDATE_BRANCH" + VALIDATE_UPSTREAM="$(git rev-parse --verify FETCH_HEAD)" + + VALIDATE_COMMIT_LOG="$VALIDATE_UPSTREAM..$VALIDATE_HEAD" + VALIDATE_COMMIT_DIFF="$VALIDATE_UPSTREAM...$VALIDATE_HEAD" + + validate_diff() { + if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then + git diff "$VALIDATE_COMMIT_DIFF" "$@" + fi + } + validate_log() { + if [ "$VALIDATE_UPSTREAM" != "$VALIDATE_HEAD" ]; then + git log "$VALIDATE_COMMIT_LOG" "$@" + fi + } +fi diff --git a/sysbox-runc/script/check-config.sh b/sysbox-runc/script/check-config.sh new file mode 100755 index 00000000..bf13cd1b --- /dev/null +++ b/sysbox-runc/script/check-config.sh @@ -0,0 +1,274 @@ +#!/usr/bin/env bash +set -e + +# bits of this were adapted from check_config.sh in docker +# see also https://github.com/docker/docker/blob/master/contrib/check-config.sh + +possibleConfigs=( + '/proc/config.gz' + "/boot/config-$(uname -r)" + "/usr/src/linux-$(uname -r)/.config" + '/usr/src/linux/.config' +) +possibleConfigFiles=( + 'config.gz' + 
"config-$(uname -r)" + '.config' +) + +if ! command -v zgrep &>/dev/null; then + zgrep() { + zcat "$2" | grep "$1" + } +fi + +kernelVersion="$(uname -r)" +kernelMajor="${kernelVersion%%.*}" +kernelMinor="${kernelVersion#$kernelMajor.}" +kernelMinor="${kernelMinor%%.*}" + +is_set() { + zgrep "CONFIG_$1=[y|m]" "$CONFIG" >/dev/null +} +is_set_in_kernel() { + zgrep "CONFIG_$1=y" "$CONFIG" >/dev/null +} +is_set_as_module() { + zgrep "CONFIG_$1=m" "$CONFIG" >/dev/null +} + +color() { + local codes=() + if [ "$1" = 'bold' ]; then + codes=("${codes[@]}" '1') + shift + fi + if [ "$#" -gt 0 ]; then + local code + case "$1" in + # see https://en.wikipedia.org/wiki/ANSI_escape_code#Colors + black) code=30 ;; + red) code=31 ;; + green) code=32 ;; + yellow) code=33 ;; + blue) code=34 ;; + magenta) code=35 ;; + cyan) code=36 ;; + white) code=37 ;; + esac + if [ "$code" ]; then + codes=("${codes[@]}" "$code") + fi + fi + local IFS=';' + echo -en '\033['"${codes[*]}"'m' +} +wrap_color() { + text="$1" + shift + color "$@" + echo -n "$text" + color reset + echo +} + +wrap_good() { + echo "$(wrap_color "$1" white): $(wrap_color "$2" green)" +} +wrap_bad() { + echo "$(wrap_color "$1" bold): $(wrap_color "$2" bold red)" +} +wrap_warning() { + wrap_color >&2 "$*" red +} + +check_flag() { + if is_set_in_kernel "$1"; then + wrap_good "CONFIG_$1" 'enabled' + elif is_set_as_module "$1"; then + wrap_good "CONFIG_$1" 'enabled (as module)' + else + wrap_bad "CONFIG_$1" 'missing' + fi +} + +check_flags() { + for flag in "$@"; do + echo "- $(check_flag "$flag")" + done +} + +check_distro_userns() { + source /etc/os-release 2>/dev/null || /bin/true + if [[ "${ID}" =~ ^(centos|rhel)$ && "${VERSION_ID}" =~ ^7 ]]; then + # this is a CentOS7 or RHEL7 system + grep -q "user_namespace.enable=1" /proc/cmdline || { + # no user namespace support enabled + wrap_bad " (RHEL7/CentOS7" "User namespaces disabled; add 'user_namespace.enable=1' to boot command line)" + } + fi +} + +is_config() { + local 
config="$1" + + # Todo: more check + [[ -f "$config" ]] && return 0 + return 1 +} + +search_config() { + local target_dir="$1" + [[ "$target_dir" ]] || target_dir=("${possibleConfigs[@]}") + + local tryConfig + for tryConfig in "${target_dir[@]}"; do + is_config "$tryConfig" && { + CONFIG="$tryConfig" + return + } + [[ -d "$tryConfig" ]] && { + for tryFile in "${possibleConfigFiles[@]}"; do + is_config "$tryConfig/$tryFile" && { + CONFIG="$tryConfig/$tryFile" + return + } + done + } + done + + wrap_warning "error: cannot find kernel config" + wrap_warning " try running this script again, specifying the kernel config:" + wrap_warning " CONFIG=/path/to/kernel/.config $0 or $0 /path/to/kernel/.config" + exit 1 +} + +CONFIG="$1" + +is_config "$CONFIG" || { + if [[ ! "$CONFIG" ]]; then + wrap_color "info: no config specified, searching for kernel config ..." white + search_config + elif [[ -d "$CONFIG" ]]; then + wrap_color "info: input is a directory, searching for kernel config in this directory..." white + search_config "$CONFIG" + else + wrap_warning "warning: $CONFIG seems not a kernel config, searching other paths for kernel config ..." + search_config + fi +} + +wrap_color "info: reading kernel config from $CONFIG ..." 
white +echo + +echo 'Generally Necessary:' + +echo -n '- ' +if [ "$(stat -f -c %t /sys/fs/cgroup 2>/dev/null)" = "63677270" ]; then + echo "$(wrap_good 'cgroup hierarchy' 'cgroupv2')" +else + cgroupSubsystemDir="$(awk '/[, ](cpu|cpuacct|cpuset|devices|freezer|memory)[, ]/ && $3 == "cgroup" { print $2 }' /proc/mounts | head -n1)" + cgroupDir="$(dirname "$cgroupSubsystemDir")" + if [ -d "$cgroupDir/cpu" -o -d "$cgroupDir/cpuacct" -o -d "$cgroupDir/cpuset" -o -d "$cgroupDir/devices" -o -d "$cgroupDir/freezer" -o -d "$cgroupDir/memory" ]; then + echo "$(wrap_good 'cgroup hierarchy' 'properly mounted') [$cgroupDir]" + else + if [ "$cgroupSubsystemDir" ]; then + echo "$(wrap_bad 'cgroup hierarchy' 'single mountpoint!') [$cgroupSubsystemDir]" + else + echo "$(wrap_bad 'cgroup hierarchy' 'nonexistent??')" + fi + echo " $(wrap_color '(see https://github.com/tianon/cgroupfs-mount)' yellow)" + fi +fi + +if [ "$(cat /sys/module/apparmor/parameters/enabled 2>/dev/null)" = 'Y' ]; then + echo -n '- ' + if command -v apparmor_parser &>/dev/null; then + echo "$(wrap_good 'apparmor' 'enabled and tools installed')" + else + echo "$(wrap_bad 'apparmor' 'enabled, but apparmor_parser missing')" + echo -n ' ' + if command -v apt-get &>/dev/null; then + echo "$(wrap_color '(use "apt-get install apparmor" to fix this)')" + elif command -v yum &>/dev/null; then + echo "$(wrap_color '(your best bet is "yum install apparmor-parser")')" + else + echo "$(wrap_color '(look for an "apparmor" package for your distribution)')" + fi + fi +fi + +flags=( + NAMESPACES {NET,PID,IPC,UTS}_NS + CGROUPS CGROUP_CPUACCT CGROUP_DEVICE CGROUP_FREEZER CGROUP_SCHED CPUSETS MEMCG + KEYS + VETH BRIDGE BRIDGE_NETFILTER + IP_NF_FILTER IP_NF_TARGET_MASQUERADE + NETFILTER_XT_MATCH_{ADDRTYPE,CONNTRACK,IPVS} + IP_NF_NAT NF_NAT + + # required for bind-mounting /dev/mqueue into containers + POSIX_MQUEUE +) +check_flags "${flags[@]}" + +if [ "$kernelMajor" -lt 5 ] || [ "$kernelMajor" -eq 5 -a "$kernelMinor" -le 1 ]; then + 
check_flags NF_NAT_IPV4 +fi + +if [ "$kernelMajor" -lt 5 ] || [ "$kernelMajor" -eq 5 -a "$kernelMinor" -le 2 ]; then + check_flags NF_NAT_NEEDED +fi + +echo + +echo 'Optional Features:' +{ + check_flags USER_NS + check_distro_userns + + check_flags SECCOMP + check_flags CGROUP_PIDS + + check_flags MEMCG_SWAP + + if [ "$kernelMajor" -lt 5 ] || [ "$kernelMajor" -eq 5 -a "$kernelMinor" -le 8 ]; then + check_flags MEMCG_SWAP_ENABLED + if is_set MEMCG_SWAP && ! is_set MEMCG_SWAP_ENABLED; then + echo " $(wrap_color '(note that cgroup swap accounting is not enabled in your kernel config, you can enable it by setting boot option "swapaccount=1")' bold black)" + fi + fi +} + +if [ "$kernelMajor" -lt 4 ] || [ "$kernelMajor" -eq 4 -a "$kernelMinor" -le 5 ]; then + check_flags MEMCG_KMEM +fi + +if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 18 ]; then + check_flags RESOURCE_COUNTERS +fi + +if [ "$kernelMajor" -lt 3 ] || [ "$kernelMajor" -eq 3 -a "$kernelMinor" -le 13 ]; then + netprio=NETPRIO_CGROUP +else + netprio=CGROUP_NET_PRIO +fi + +if [ "$kernelMajor" -lt 5 ]; then + check_flags IOSCHED_CFQ CFQ_GROUP_IOSCHED +fi + +flags=( + BLK_CGROUP BLK_DEV_THROTTLING + CGROUP_PERF + CGROUP_HUGETLB + NET_CLS_CGROUP $netprio + CFS_BANDWIDTH FAIR_GROUP_SCHED RT_GROUP_SCHED + IP_NF_TARGET_REDIRECT + IP_VS + IP_VS_NFCT + IP_VS_PROTO_TCP + IP_VS_PROTO_UDP + IP_VS_RR +) +check_flags "${flags[@]}" diff --git a/sysbox-runc/script/release.sh b/sysbox-runc/script/release.sh new file mode 100755 index 00000000..c3e77703 --- /dev/null +++ b/sysbox-runc/script/release.sh @@ -0,0 +1,132 @@ +#!/bin/bash +# Copyright (C) 2017 SUSE LLC. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +## ---> +# Project-specific options and functions. In *theory* you shouldn't need to +# touch anything else in this script in order to use this elsewhere. +project="runc" +root="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")" + +# This function takes an output path as an argument, where the built +# (preferably static) binary should be placed. +function build_project() { + builddir="$(dirname "$1")" + + make -C "$root" COMMIT_NO= static + mv "$root/$project" "$1" +} + +# End of the easy-to-configure portion. +## <--- + +# Print usage information. +function usage() { + echo "usage: release.sh [-S ] [-c ] [-r ] [-v ]" >&2 + exit 1 +} + +# Log something to stderr. +function log() { + echo "[*] $*" >&2 +} + +# Log something to stderr and then exit with 0. +function bail() { + log "$@" + exit 0 +} + +# Conduct a sanity-check to make sure that GPG provided with the given +# arguments can sign something. Inability to sign things is not a fatal error. +function gpg_cansign() { + gpg "$@" --clear-sign /dev/null +} + +# When creating releases we need to build static binaries, an archive of the +# current commit, and generate detached signatures for both. +keyid="" +commit="HEAD" +version="" +releasedir="" +hashcmd="" +while getopts "S:c:r:v:h:" opt; do + case "$opt" in + S) + keyid="$OPTARG" + ;; + c) + commit="$OPTARG" + ;; + r) + releasedir="$OPTARG" + ;; + v) + version="$OPTARG" + ;; + h) + hashcmd="$OPTARG" + ;; + \:) + echo "Missing argument: -$OPTARG" >&2 + usage + ;; + \?) 
+ echo "Invalid option: -$OPTARG" >&2 + usage + ;; + esac +done + +version="${version:-$(<"$root/VERSION")}" +releasedir="${releasedir:-release/$version}" +hashcmd="${hashcmd:-sha256sum}" +goarch="$(go env GOARCH || echo "amd64")" + +log "creating $project release in '$releasedir'" +log " version: $version" +log " commit: $commit" +log " key: ${keyid:-DEFAULT}" +log " hash: $hashcmd" + +# Make explicit what we're doing. +set -x + +# Make the release directory. +rm -rf "$releasedir" && mkdir -p "$releasedir" + +# Build project. +build_project "$releasedir/$project.$goarch" + +# Generate new archive. +git archive --format=tar --prefix="$project-$version/" "$commit" | xz >"$releasedir/$project.tar.xz" + +# Generate sha256 checksums for both. +( + cd "$releasedir" + "$hashcmd" "$project".{"$goarch",tar.xz} >"$project.$hashcmd" +) + +# Set up the gpgflags. +[[ "$keyid" ]] && export gpgflags="--default-key $keyid" +gpg_cansign $gpgflags || bail "Could not find suitable GPG key, skipping signing step." + +# Sign everything. +gpg $gpgflags --detach-sign --armor "$releasedir/$project.$goarch" +gpg $gpgflags --detach-sign --armor "$releasedir/$project.tar.xz" +gpg $gpgflags --clear-sign --armor \ + --output "$releasedir/$project.$hashcmd"{.tmp,} && + mv "$releasedir/$project.$hashcmd"{.tmp,} diff --git a/sysbox-runc/script/validate-c b/sysbox-runc/script/validate-c new file mode 100755 index 00000000..c5333a8f --- /dev/null +++ b/sysbox-runc/script/validate-c @@ -0,0 +1,42 @@ +#!/bin/bash + +source "$(dirname "$BASH_SOURCE")/.validate" + +IFS=$'\n' +files=($(validate_diff --diff-filter=ACMR --name-only -- '*.c' | grep -v '^vendor/' || true)) +unset IFS + +# indent(1): "You must use the ‘-T’ option to tell indent the name of all the typenames in your program that are defined by typedef." +INDENT="indent -linux -l120 -T size_t -T jmp_buf" +if [ -z "$(indent --version 2>&1 | grep GNU)" ]; then + echo "Skipping C indentation checks, as GNU indent is not installed." 
+ exit 0 +fi + +badFiles=() +for f in "${files[@]}"; do + orig=$(mktemp) + formatted=$(mktemp) + # we use "git show" here to validate that what's committed is formatted + git show "$VALIDATE_HEAD:$f" >${orig} + ${INDENT} ${orig} -o ${formatted} + if [ "$(diff -u ${orig} ${formatted})" ]; then + badFiles+=("$f") + fi + rm -f ${orig} ${formatted} +done + +if [ ${#badFiles[@]} -eq 0 ]; then + echo 'Congratulations! All C source files are properly formatted.' +else + { + echo "These files are not properly formatted:" + for f in "${badFiles[@]}"; do + echo " - $f" + done + echo + echo "Please reformat the above files using \"${INDENT}\" and commit the result." + echo + } >&2 + false +fi diff --git a/sysbox-runc/script/validate-gofmt b/sysbox-runc/script/validate-gofmt new file mode 100755 index 00000000..8337ed2d --- /dev/null +++ b/sysbox-runc/script/validate-gofmt @@ -0,0 +1,30 @@ +#!/bin/bash + +source "$(dirname "$BASH_SOURCE")/.validate" + +IFS=$'\n' +files=($(validate_diff --diff-filter=ACMR --name-only -- '*.go' | grep -v '^vendor/' || true)) +unset IFS + +badFiles=() +for f in "${files[@]}"; do + # we use "git show" here to validate that what's committed is formatted + if [ "$(git show "$VALIDATE_HEAD:$f" | gofmt -s -l)" ]; then + badFiles+=("$f") + fi +done + +if [ ${#badFiles[@]} -eq 0 ]; then + echo 'Congratulations! All Go source files are properly formatted.' +else + { + echo "These files are not properly gofmt'd:" + for f in "${badFiles[@]}"; do + echo " - $f" + done + echo + echo 'Please reformat the above files using "gofmt -s -w" and commit the result.' 
+ echo + } >&2 + false +fi diff --git a/sysbox-runc/signals.go b/sysbox-runc/signals.go new file mode 100644 index 00000000..5682989c --- /dev/null +++ b/sysbox-runc/signals.go @@ -0,0 +1,136 @@ +// +build linux + +package main + +import ( + "os" + "os/signal" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const signalBufferSize = 2048 + +// newSignalHandler returns a signal handler for processing SIGCHLD and SIGWINCH signals +// while still forwarding all other signals to the process. +// If notifySocket is present, use it to read systemd notifications from the container and +// forward them to notifySocketHost. +func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler { + if enableSubreaper { + // set us as the subreaper before registering the signal handler for the container + if err := system.SetSubreaper(1); err != nil { + logrus.Warn(err) + } + } + // ensure that we have a large buffer size so that we do not miss any signals + // in case we are not processing them fast enough. + s := make(chan os.Signal, signalBufferSize) + // handle all signals for the process. + signal.Notify(s) + return &signalHandler{ + signals: s, + notifySocket: notifySocket, + } +} + +// exit models a process exit status with the pid and +// exit status. +type exit struct { + pid int + status int +} + +type signalHandler struct { + signals chan os.Signal + notifySocket *notifySocket +} + +// forward handles the main signal event loop forwarding, resizing, or reaping depending +// on the signal received. +func (h *signalHandler) forward(process *libcontainer.Process, tty *tty, detach bool) (int, error) { + // make sure we know the pid of our main process so that we can return + // after it dies. 
+ if detach && h.notifySocket == nil { + return 0, nil + } + + pid1, err := process.Pid() + if err != nil { + return -1, err + } + + if h.notifySocket != nil { + if detach { + h.notifySocket.run(pid1) + return 0, nil + } + h.notifySocket.run(os.Getpid()) + go h.notifySocket.run(0) + } + + // Perform the initial tty resize. Always ignore errors resizing because + // stdout might have disappeared (due to races with when SIGHUP is sent). + _ = tty.resize() + // Handle and forward signals. + for s := range h.signals { + switch s { + case unix.SIGWINCH: + // Ignore errors resizing, as above. + _ = tty.resize() + case unix.SIGCHLD: + exits, err := h.reap() + if err != nil { + logrus.Error(err) + } + for _, e := range exits { + logrus.WithFields(logrus.Fields{ + "pid": e.pid, + "status": e.status, + }).Debug("process exited") + if e.pid == pid1 { + // call Wait() on the process even though we already have the exit + // status because we must ensure that any of the go specific process + // fun such as flushing pipes are complete before we return. + process.Wait() + return e.status, nil + } + } + default: + logrus.Debugf("sending signal to process %s", s) + if err := unix.Kill(pid1, s.(unix.Signal)); err != nil { + logrus.Error(err) + } + } + } + return -1, nil +} + +// reap runs wait4 in a loop until we have finished processing any existing exits +// then returns all exits to the main event loop for further processing. 
+func (h *signalHandler) reap() (exits []exit, err error) { + var ( + ws unix.WaitStatus + rus unix.Rusage + ) + for { + pid, err := unix.Wait4(-1, &ws, unix.WNOHANG, &rus) + if err != nil { + if err == unix.ECHILD { + return exits, nil + } + return nil, err + } + if pid <= 0 { + return exits, nil + } + exits = append(exits, exit{ + pid: pid, + status: utils.ExitStatus(ws), + }) + } +} diff --git a/sysbox-runc/spec.go b/sysbox-runc/spec.go new file mode 100644 index 00000000..4f8648b6 --- /dev/null +++ b/sysbox-runc/spec.go @@ -0,0 +1,181 @@ +// +build linux + +package main + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libsysbox/syscont" + + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +var specCommand = cli.Command{ + Name: "spec", + Usage: "create a new system container specification file", + Description: `The spec command creates the new system container specification file +named "` + specConfig + `" for the bundle. + +The spec generated is just a starter file. Editing of the spec is required to +achieve desired results. + +ID mapping configuration: + +Nestybox system containers always use the Linux user namespace and thus require user +and group ID mappings. + +The "--id-map" option allows configuration of these mappings for the generated spec. +It's normally not required, unless the user wants to control the user and group IDs +mappings of the container. + +If the "--id-map" option is omitted, the generated spec will not include the +user and group ID mappings. In this case sysbox-runc will automatically +allocate them when the container is created. The allocation is done in such as +way as to provide each sys container an exclusive range of uid(gid)s on the +host, as a means to improve isolation. This feature requires that the +container's root filesystem be owned by "root:root". 
+ +If the "--id-map" option is given, the generated spec will include them and +sysbox-runc will honor them when creating the container. They are expected +to match the container's root filesystem ownership. Note that the size of the +range is required be >= ` + strconv.FormatUint(uint64(syscont.IdRangeMin), 10) + ` (for compatibility with Linux distros +that use ID 65534 as "nobody"). +`, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "bundle, b", + Value: "", + Usage: "path to the sys container's bundle directory", + }, + cli.StringFlag{ + Name: "id-map, m", + Value: "", + Usage: `"uid gid size" ID mappings (see description above)`, + }, + }, + Action: func(context *cli.Context) error { + var uid, gid, size uint32 + + idMap := context.String("id-map") + if idMap != "" { + if err := parseIDMap(idMap, &uid, &gid, &size); err != nil { + return err + } + } + + spec, err := syscont.Example() + if err != nil { + return err + } + + if idMap != "" { + spec.Linux.Namespaces = append(spec.Linux.Namespaces, specs.LinuxNamespace{Type: "user"}) + + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: uid, + ContainerID: 0, + Size: size, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: gid, + ContainerID: 0, + Size: size, + }} + } + + bundle := context.String("bundle") + if bundle != "" { + if err := os.Chdir(bundle); err != nil { + return err + } + } + + checkNoFile := func(name string) error { + _, err := os.Stat(name) + if err == nil { + return fmt.Errorf("File %s exists. 
Remove it first", name) + } + if !os.IsNotExist(err) { + return err + } + return nil + } + + if err := checkNoFile(specConfig); err != nil { + return err + } + + data, err := json.MarshalIndent(spec, "", "\t") + if err != nil { + return err + } + return ioutil.WriteFile(specConfig, data, 0666) + }, +} + +// parseIDMap parses the id-map flag and returns the uid, gid, and size +func parseIDMap(idMap string, uid, gid, size *uint32) error { + var num [3]uint64 + var err error + + fields := strings.Fields(idMap) + if len(fields) != 3 { + return fmt.Errorf("id-map must be of the form \"uid gid size\"; got %v", idMap) + } + + for i, f := range fields { + num[i], err = strconv.ParseUint(f, 10, 32) + if err != nil { + return err + } + } + + *uid = uint32(num[0]) + *gid = uint32(num[1]) + *size = uint32(num[2]) + + if *uid > math.MaxUint32 || *gid > math.MaxUint32 || *size < syscont.IdRangeMin { + return fmt.Errorf("invalid id-map \"%v\": uid and gid must be <= %v, size must be >= %v", + idMap, uint32(math.MaxUint32), syscont.IdRangeMin) + } + + return nil +} + +// loadSpec loads the specification from the provided path +func loadSpec(cPath string) (spec *specs.Spec, err error) { + cf, err := os.Open(cPath) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("JSON specification file %s not found", cPath) + } + return nil, err + } + defer cf.Close() + + if err = json.NewDecoder(cf).Decode(&spec); err != nil { + return nil, err + } + + return spec, validateProcessSpec(spec.Process) +} + +func createLibContainerRlimit(rlimit specs.POSIXRlimit) (configs.Rlimit, error) { + rl, err := strToRlimit(rlimit.Type) + if err != nil { + return configs.Rlimit{}, err + } + return configs.Rlimit{ + Type: rl, + Hard: rlimit.Hard, + Soft: rlimit.Soft, + }, nil +} diff --git a/sysbox-runc/start.go b/sysbox-runc/start.go new file mode 100644 index 00000000..9bc2780c --- /dev/null +++ b/sysbox-runc/start.go @@ -0,0 +1,54 @@ +package main + +import ( + "errors" + "fmt" + "os" + + 
"github.com/opencontainers/runc/libcontainer" + "github.com/urfave/cli" +) + +var startCommand = cli.Command{ + Name: "start", + Usage: "executes the user defined process in a created system container", + ArgsUsage: ` + +Where "" is your name for the instance of the system container that you +are starting. The name you provide for the container instance must be unique on +your host.`, + Description: `The start command executes the user defined process in a created container.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + status, err := container.Status() + if err != nil { + return err + } + switch status { + case libcontainer.Created: + notifySocket, err := notifySocketStart(context, os.Getenv("NOTIFY_SOCKET"), container.ID()) + if err != nil { + return err + } + if err := container.Exec(); err != nil { + return err + } + if notifySocket != nil { + return notifySocket.waitForContainer(container) + } + return nil + case libcontainer.Stopped: + return errors.New("cannot start a container that has stopped") + case libcontainer.Running: + return errors.New("cannot start an already running container") + default: + return fmt.Errorf("cannot start a container in the %s state\n", status) + } + }, +} diff --git a/sysbox-runc/state.go b/sysbox-runc/state.go new file mode 100644 index 00000000..718813c3 --- /dev/null +++ b/sysbox-runc/state.go @@ -0,0 +1,60 @@ +// +build linux + +package main + +import ( + "encoding/json" + "os" + + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/urfave/cli" +) + +var stateCommand = cli.Command{ + Name: "state", + Usage: "output the state of a container", + ArgsUsage: ` + +Where "" is your name for the instance of the container.`, + Description: `The state command outputs current state information for the +instance of a 
container.`, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + containerStatus, err := container.Status() + if err != nil { + return err + } + state, err := container.State() + if err != nil { + return err + } + pid := state.BaseState.InitProcessPid + if containerStatus == libcontainer.Stopped { + pid = 0 + } + bundle, annotations := utils.Annotations(state.Config.Labels) + cs := containerState{ + Version: state.BaseState.Config.Version, + ID: state.BaseState.ID, + InitProcessPid: pid, + Status: containerStatus.String(), + Bundle: bundle, + Rootfs: state.BaseState.Config.Rootfs, + Created: state.BaseState.Created, + Annotations: annotations, + } + data, err := json.MarshalIndent(cs, "", " ") + if err != nil { + return err + } + os.Stdout.Write(data) + return nil + }, +} diff --git a/sysbox-runc/tests/integration/README.md b/sysbox-runc/tests/integration/README.md new file mode 100644 index 00000000..d31fbf80 --- /dev/null +++ b/sysbox-runc/tests/integration/README.md @@ -0,0 +1,85 @@ +# sysbox-runc Integration Tests + +Integration tests provide end-to-end testing of sysbox-runc. + +Note that integration tests do **not** replace unit tests. + +As a rule of thumb, code should be tested thoroughly with unit tests. +Integration tests on the other hand are meant to test a specific feature end +to end. + +Integration tests are written in *bash* using the +[bats (Bash Automated Testing System)](https://github.com/bats-core/bats-core) +framework. 
+ +## Running integration tests + +The easiest way to run integration tests is with Docker: +``` +$ make integration +``` +Alternatively, you can run integration tests directly on your host through make: +``` +$ sudo make localintegration +``` +Or you can just run them directly using bats +``` +$ sudo bats tests/integration +``` +To run a single test bucket: +``` +$ make integration TESTPATH="/checkpoint.bats" +``` + + +To run them on your host, you need to set up a development environment plus +[bats (Bash Automated Testing System)](https://github.com/bats-core/bats-core#installing-bats-from-source). + +For example: +``` +$ cd ~/go/src/github.com +$ git clone https://github.com/bats-core/bats-core.git +$ cd bats-core +$ ./install.sh /usr/local +``` + +> **Note**: There are known issues running the integration tests using +> **devicemapper** as a storage driver, make sure that your docker daemon +> is using **aufs** if you want to successfully run the integration tests. + +## Writing integration tests + +[helper functions] +(https://github.com/opencontainers/runc/blob/master/tests/integration/helpers.bash) +are provided in order to facilitate writing tests. + +```sh +#!/usr/bin/env bats + +# This will load the helpers. +load helpers + +# setup is called at the beginning of every test. +function setup() { + # see functions teardown_hello and setup_hello in helpers.bash, used to + # create a pristine environment for running your tests + teardown_hello + setup_hello +} + +# teardown is called at the end of every test. +function teardown() { + teardown_hello +} + +@test "this is a simple test" { + runc run containerid + # "The runc macro" automatically populates $status, $output and $lines. + # Please refer to bats documentation to find out more. 
+ [ "$status" -eq 0 ] + + # check expected output + [[ "${output}" == *"Hello"* ]] +} + +``` diff --git a/sysbox-runc/tests/integration/cgroups.bats b/sysbox-runc/tests/integration/cgroups.bats new file mode 100644 index 00000000..584fff70 --- /dev/null +++ b/sysbox-runc/tests/integration/cgroups.bats @@ -0,0 +1,223 @@ +#!/usr/bin/env bats + +load helpers + +function teardown() { + teardown_running_container test_cgroups_kmem + teardown_running_container test_cgroups_permissions + teardown_running_container test_cgroups_group + teardown_running_container test_cgroups_unified + teardown_busybox +} + +function setup() { + teardown + setup_busybox +} + +@test "runc create (no limits + no cgrouppath + no permission) succeeds" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 0 ] +} + +@test "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" { + requires rootless + requires rootless_no_cgroup + # systemd controls the permission, so error does not happen + requires no_systemd + + set_cgroups_path "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 1 ] + [[ ${lines[0]} == *"permission denied"* ]] +} + +@test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" { + requires rootless + requires rootless_no_cgroup + # systemd controls the permission, so error does not happen + requires no_systemd + + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 1 ] + [[ ${lines[0]} == *"rootless needs no limits + no cgrouppath when no permission is granted for cgroups"* ]] || [[ ${lines[0]} == *"cannot set pids limit: container could not join or create cgroup"* ]] +} + +@test "runc create (limits + cgrouppath + permission on the cgroup dir) succeeds" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + 
set_cgroups_path "$BUSYBOX_BUNDLE" + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 0 ] + if [ "$CGROUP_UNIFIED" != "no" ]; then + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + if [ "$(id -u)" = "0" ]; then + check_cgroup_value "cgroup.controllers" "$(cat /sys/fs/cgroup/machine.slice/cgroup.controllers)" + else + # shellcheck disable=SC2046 + check_cgroup_value "cgroup.controllers" "$(cat /sys/fs/cgroup/user.slice/user-$(id -u).slice/cgroup.controllers)" + fi + else + check_cgroup_value "cgroup.controllers" "$(cat /sys/fs/cgroup/cgroup.controllers)" + fi + fi +} + +@test "runc exec (limits + cgrouppath + permission on the cgroup dir) succeeds" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path "$BUSYBOX_BUNDLE" + set_resources_limit "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_permissions + [ "$status" -eq 0 ] + + runc exec test_cgroups_permissions echo "cgroups_exec" + [ "$status" -eq 0 ] + [[ ${lines[0]} == *"cgroups_exec"* ]] +} + +@test "runc exec (cgroup v2 + init process in non-root cgroup) succeeds" { + requires root cgroups_v2 + + set_cgroups_path "$BUSYBOX_BUNDLE" + set_cgroup_mount_writable "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_group + [ "$status" -eq 0 ] + + runc exec test_cgroups_group cat /sys/fs/cgroup/cgroup.controllers + [ "$status" -eq 0 ] + [[ ${lines[0]} == *"memory"* ]] + + runc exec test_cgroups_group cat /proc/self/cgroup + [ "$status" -eq 0 ] + [[ ${lines[0]} == "0::/" ]] + + runc exec test_cgroups_group mkdir /sys/fs/cgroup/foo + [ "$status" -eq 0 ] + + runc exec test_cgroups_group sh -c "echo 1 > /sys/fs/cgroup/foo/cgroup.procs" + [ "$status" -eq 0 ] + + # the init process is now in "/foo", but an exec process can still join "/" + # because we haven't enabled any domain controller. 
+ runc exec test_cgroups_group cat /proc/self/cgroup + [ "$status" -eq 0 ] + [[ ${lines[0]} == "0::/" ]] + + # turn on a domain controller (memory) + runc exec test_cgroups_group sh -euxc 'echo $$ > /sys/fs/cgroup/foo/cgroup.procs; echo +memory > /sys/fs/cgroup/cgroup.subtree_control' + [ "$status" -eq 0 ] + + # an exec process can no longer join "/" after turning on a domain controller. + # falls back to "/foo". + runc exec test_cgroups_group cat /proc/self/cgroup + [ "$status" -eq 0 ] + [[ ${lines[0]} == "0::/foo" ]] + + # teardown: remove "/foo" + # shellcheck disable=SC2016 + runc exec test_cgroups_group sh -uxc 'echo -memory > /sys/fs/cgroup/cgroup.subtree_control; for f in $(cat /sys/fs/cgroup/foo/cgroup.procs); do echo $f > /sys/fs/cgroup/cgroup.procs; done; rmdir /sys/fs/cgroup/foo' + runc exec test_cgroups_group test ! -d /sys/fs/cgroup/foo + [ "$status" -eq 0 ] + # +} + +@test "runc run (cgroup v1 + unified resources should fail)" { + requires root cgroups_v1 + + set_cgroups_path "$BUSYBOX_BUNDLE" + set_resources_limit "$BUSYBOX_BUNDLE" + update_config '.linux.resources.unified |= {"memory.min": "131072"}' "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -ne 0 ] + [[ "$output" == *'invalid configuration'* ]] +} + +@test "runc run (cgroup v2 resources.unified only)" { + requires root cgroups_v2 + + set_cgroups_path "$BUSYBOX_BUNDLE" + update_config ' .linux.resources.unified |= { + "memory.min": "131072", + "memory.low": "524288", + "memory.high": "5242880", + "memory.max": "10485760", + "memory.swap.max": "20971520", + "pids.max": "99", + "cpu.max": "10000 100000", + "cpu.weight": "42" + }' "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -eq 0 ] + + runc exec test_cgroups_unified sh -c 'cd /sys/fs/cgroup && grep . 
*.min *.max *.low *.high' + [ "$status" -eq 0 ] + echo "$output" + + echo "$output" | grep -q '^memory.min:131072$' + echo "$output" | grep -q '^memory.low:524288$' + echo "$output" | grep -q '^memory.high:5242880$' + echo "$output" | grep -q '^memory.max:10485760$' + echo "$output" | grep -q '^memory.swap.max:20971520$' + echo "$output" | grep -q '^pids.max:99$' + echo "$output" | grep -q '^cpu.max:10000 100000$' + + check_systemd_value "MemoryMin" 131072 + check_systemd_value "MemoryLow" 524288 + check_systemd_value "MemoryHigh" 5242880 + check_systemd_value "MemoryMax" 10485760 + check_systemd_value "MemorySwapMax" 20971520 + check_systemd_value "TasksMax" 99 + check_cpu_quota 10000 100000 "100ms" + check_cpu_weight 42 +} + +@test "runc run (cgroup v2 resources.unified override)" { + requires root cgroups_v2 + + set_cgroups_path "$BUSYBOX_BUNDLE" + # CPU shares of 3333 corresponds to CPU weight of 128. + update_config ' .linux.resources.memory |= {"limit": 33554432} + | .linux.resources.memorySwap |= {"limit": 33554432} + | .linux.resources.cpu |= { + "shares": 3333, + "quota": 40000, + "period": 100000 + } + | .linux.resources.unified |= { + "memory.min": "131072", + "memory.max": "10485760", + "pids.max": "42", + "cpu.max": "5000 50000", + "cpu.weight": "42" + }' "$BUSYBOX_BUNDLE" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroups_unified + [ "$status" -eq 0 ] + + runc exec test_cgroups_unified cat /sys/fs/cgroup/memory.min + [ "$status" -eq 0 ] + [ "$output" = '131072' ] + + runc exec test_cgroups_unified cat /sys/fs/cgroup/memory.max + [ "$status" -eq 0 ] + [ "$output" = '10485760' ] + + runc exec test_cgroups_unified cat /sys/fs/cgroup/pids.max + [ "$status" -eq 0 ] + [ "$output" = '42' ] + check_systemd_value "TasksMax" 42 + + check_cpu_quota 5000 50000 "100ms" + + check_cpu_weight 42 +} diff --git a/sysbox-runc/tests/integration/checkpoint.bats b/sysbox-runc/tests/integration/checkpoint.bats new file mode 100644 index 00000000..3332cf2c --- 
/dev/null +++ b/sysbox-runc/tests/integration/checkpoint.bats @@ -0,0 +1,329 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + # XXX: currently criu require root containers. + requires criu root + + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox + teardown_running_container test_busybox_restore +} + +function setup_pipes() { + # The changes to 'terminal' are needed for running in detached mode + # shellcheck disable=SC2016 + update_config ' (.. | select(.terminal? != null)) .terminal |= false + | (.. | select(.[]? == "sh")) += ["-c", "for i in `seq 10`; do read xxx || continue; echo ponG $xxx; done"]' + + # Create three sets of pipes for __runc run. + # for stderr + exec {pipe}<> <(:) + exec {err_r}/proc/self/fd/$pipe + exec {pipe}>&- + # for stdout + exec {pipe}<> <(:) + exec {out_r}/proc/self/fd/$pipe + exec {pipe}>&- + # for stdin + exec {pipe}<> <(:) + exec {in_r}/proc/self/fd/$pipe + exec {pipe}>&- +} + +function check_pipes() { + echo Ping >&${in_w} + exec {in_w}>&- + exec {out_w}>&- + output=$(cat <&${out_r}) + [[ "${output}" == *"ponG Ping"* ]] +} + +# Usage: runc_run_with_pipes container-name +function runc_run_with_pipes() { + # Start a container to be checkpointed, with stdin/stdout redirected + # so that check_pipes can be used to check it's working fine. + # We have to redirect stderr as well because otherwise it is + # redirected to a bats log file, which is not accessible to CRIU + # (i.e. outside of container) so checkpointing will fail. + ret=0 + __runc run -d "$1" <&${in_r} >&${out_w} 2>&${err_w} || ret=$? + if [ "$ret" -ne 0 ]; then + echo "runc run -d $1 (status: $ret):" + exec {err_w}>&- + cat <&${err_r} + fail "runc run failed" + fi + + testcontainer "$1" running +} + +# Usage: runc_restore_with_pipes work-dir container-name [optional-arguments ...] 
+function runc_restore_with_pipes() { + workdir="$1" + shift + name="$1" + shift + + ret=0 + __runc --criu "$CRIU" restore -d --work-path "$workdir" --image-path ./image-dir "$@" "$name" <&${in_r} >&${out_w} 2>&${err_w} || ret=$? + if [ "$ret" -ne 0 ]; then + echo "__runc restore $name failed (status: $ret)" + exec {err_w}>&- + cat <&${err_r} + echo "CRIU restore log errors (if any):" + grep -B 5 Error "$workdir"/restore.log || true + fail "runc restore failed" + fi + + testcontainer "$name" running + + runc exec --cwd /bin "$name" echo ok + [ "$status" -eq 0 ] + [[ ${output} == "ok" ]] +} + +function simple_cr() { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + for _ in $(seq 2); do + # checkpoint the running container + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed + + # restore from checkpoint + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running + done +} + +@test "checkpoint and restore " { + skip "sysbox-runc unsupported feature" + + simple_cr +} + +@test "checkpoint and restore (cgroupns)" { + skip "sysbox-runc unsupported feature" + + # cgroupv2 already enables cgroupns so this case was tested above already + requires cgroups_v1 cgroupns + + # enable CGROUPNS + update_config '.linux.namespaces += [{"type": "cgroup"}]' + + simple_cr +} + +@test "checkpoint --pre-dump and restore" { + skip "sysbox-runc unsupported feature" + + setup_pipes + runc_run_with_pipes test_busybox + + #test checkpoint pre-dump + mkdir parent-dir + runc --criu "$CRIU" checkpoint --pre-dump --image-path ./parent-dir test_busybox + 
[ "$status" -eq 0 ] + + # busybox should still be running + testcontainer test_busybox running + + # checkpoint the running container + mkdir image-dir + mkdir work-dir + runc --criu "$CRIU" checkpoint --parent-path ./parent-dir --work-path ./work-dir --image-path ./image-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed + + runc_restore_with_pipes ./work-dir test_busybox + check_pipes +} + +@test "checkpoint --lazy-pages and restore" { + skip "sysbox-runc unsupported feature" + + # check if lazy-pages is supported + if ! "${CRIU}" check --feature uffd-noncoop; then + skip "this criu does not support lazy migration" + fi + + setup_pipes + runc_run_with_pipes test_busybox + + # checkpoint the running container + mkdir image-dir + mkdir work-dir + + # For lazy migration we need to know when CRIU is ready to serve + # the memory pages via TCP. + exec {pipe}<> <(:) + # shellcheck disable=SC2094 + exec {lazy_r}/proc/self/fd/$pipe + exec {pipe}>&- + + # TCP port for lazy migration + port=27277 + + __runc --criu "$CRIU" checkpoint --lazy-pages --page-server 0.0.0.0:${port} --status-fd ${lazy_w} --work-path ./work-dir --image-path ./image-dir test_busybox & + cpt_pid=$! + + # wait for lazy page server to be ready + out=$(timeout 2 dd if=/proc/self/fd/${lazy_r} bs=1 count=1 2>/dev/null | od) + exec {lazy_w}>&- + # shellcheck disable=SC2116,SC2086 + out=$(echo $out) # rm newlines + # show errors if there are any before we fail + grep -B5 Error ./work-dir/dump.log || true + # expecting \0 which od prints as + [ "$out" = "0000000 000000 0000001" ] + + # Check if inventory.img was written + [ -e image-dir/inventory.img ] + + # Start CRIU in lazy-daemon mode + ${CRIU} lazy-pages --page-server --address 127.0.0.1 --port ${port} -D image-dir & + lp_pid=$! + + # Restore lazily from checkpoint. 
+ # The restored container needs a different name as the checkpointed + # container is not yet destroyed. It is only destroyed at that point + # in time when the last page is lazily transferred to the destination. + # Killing the CRIU on the checkpoint side will let the container + # continue to run if the migration failed at some point. + runc_restore_with_pipes ./image-dir test_busybox_restore --lazy-pages + + wait $cpt_pid + + wait $lp_pid + + check_pipes +} + +@test "checkpoint and restore in external network namespace" { + skip "sysbox-runc unsupported feature" + + # check if external_net_ns is supported; only with criu 3.10++ + if ! "${CRIU}" check --feature external_net_ns; then + # this criu does not support external_net_ns; skip the test + skip "this criu does not support external network namespaces" + fi + + # create a temporary name for the test network namespace + tmp=$(mktemp) + rm -f "$tmp" + ns_name=$(basename "$tmp") + # create network namespace + ip netns add "$ns_name" + ns_path=$(ip netns add "$ns_name" 2>&1 | sed -e 's/.*"\(.*\)".*/\1/') + # shellcheck disable=SC2012 + ns_inode=$(ls -iL "$ns_path" | awk '{ print $1 }') + + # tell runc which network namespace to use + update_config '(.. | select(.type? 
== "network")) .path |= "'"$ns_path"'"' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + for _ in $(seq 2); do + # checkpoint the running container; this automatically tells CRIU to + # handle the network namespace defined in config.json as an external + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed + + # restore from checkpoint; this should restore the container into the existing network namespace + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + + # busybox should be back up and running + testcontainer test_busybox running + + # container should be running in same network namespace as before + pid=$(__runc state test_busybox | jq '.pid') + ns_inode_new=$(readlink /proc/"$pid"/ns/net | sed -e 's/.*\[\(.*\)\]/\1/') + echo "old network namespace inode $ns_inode" + echo "new network namespace inode $ns_inode_new" + [ "$ns_inode" -eq "$ns_inode_new" ] + done + ip netns del "$ns_name" +} + +@test "checkpoint and restore with container specific CRIU config" { + skip "sysbox-runc unsupported feature" + + tmp=$(mktemp /tmp/runc-criu-XXXXXX.conf) + # This is the file we write to /etc/criu/default.conf + tmplog1=$(mktemp /tmp/runc-criu-log-XXXXXX.log) + unlink "$tmplog1" + tmplog1=$(basename "$tmplog1") + # That is the actual configuration file to be used + tmplog2=$(mktemp /tmp/runc-criu-log-XXXXXX.log) + unlink "$tmplog2" + tmplog2=$(basename "$tmplog2") + # This adds the annotation 'org.criu.config' to set a container + # specific CRIU config file. 
+ update_config '.annotations += {"org.criu.config": "'"$tmp"'"}' + + # Tell CRIU to use another configuration file + mkdir -p /etc/criu + echo "log-file=$tmplog1" >/etc/criu/default.conf + # Make sure the RPC defined configuration file overwrites the previous + echo "log-file=$tmplog2" >"$tmp" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # checkpoint the running container + runc --criu "$CRIU" checkpoint --work-path ./work-dir test_busybox + grep -B 5 Error ./work-dir/dump.log || true + [ "$status" -eq 0 ] + ! test -f ./work-dir/"$tmplog1" + test -f ./work-dir/"$tmplog2" + + # after checkpoint busybox is no longer running + testcontainer test_busybox checkpointed + + test -f ./work-dir/"$tmplog2" && unlink ./work-dir/"$tmplog2" + # restore from checkpoint + runc --criu "$CRIU" restore -d --work-path ./work-dir --console-socket "$CONSOLE_SOCKET" test_busybox + grep -B 5 Error ./work-dir/restore.log || true + [ "$status" -eq 0 ] + ! 
test -f ./work-dir/"$tmplog1" + test -f ./work-dir/"$tmplog2" + + # busybox should be back up and running + testcontainer test_busybox running + unlink "$tmp" + test -f ./work-dir/"$tmplog2" && unlink ./work-dir/"$tmplog2" +} diff --git a/sysbox-runc/tests/integration/create.bats b/sysbox-runc/tests/integration/create.bats new file mode 100644 index 00000000..cc357bbd --- /dev/null +++ b/sysbox-runc/tests/integration/create.bats @@ -0,0 +1,83 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc create" { + runc create --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} + +@test "runc create exec" { + runc create --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + runc exec test_busybox true + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} + +@test "runc create --pid-file" { + runc create --pid-file pid.txt --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # check pid.txt was generated + [ -e pid.txt ] + + [[ $(cat pid.txt) == $(__runc state test_busybox | jq '.pid') ]] + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} + +@test "runc create --pid-file with new CWD" { + # create pid_file directory as the CWD + mkdir pid_file + cd pid_file + + runc create --pid-file pid.txt -b "$BUSYBOX_BUNDLE" --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # check pid.txt was generated + [ -e pid.txt ] + + [[ $(cat pid.txt) == $(__runc 
state test_busybox | jq '.pid') ]] + + # start the command + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running +} diff --git a/sysbox-runc/tests/integration/cwd.bats b/sysbox-runc/tests/integration/cwd.bats new file mode 100644 index 00000000..66fd5a39 --- /dev/null +++ b/sysbox-runc/tests/integration/cwd.bats @@ -0,0 +1,70 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +# Test case for https://github.com/opencontainers/runc/pull/2086 +@test "runc exec --user with no access to cwd" { + requires root + + # sysbox-runc: containers always user the user-ns. If uid-shifting is not + # used, the rootfs ownership must be within the range of host uids assigned + # to the container. + local uid + if [ -z "$SHIFT_ROOTFS_UIDS" ]; then + uid=$((UID_MAP + 42)) + else + uid=42 + fi + + chown $uid rootfs/root + chmod 700 rootfs/root + + update_config ' .process.cwd = "/root" + | .process.user.uid = 42 + | .process.user.gid = 42 + | .process.args |= ["sleep", "1h"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --user 0 test_busybox true + [ "$status" -eq 0 ] +} + +# Verify a cwd owned by the container user can be chdir'd to, +# even if runc doesn't have the privilege to do so. +@test "runc create sets up user before chdir to cwd" { + requires rootless rootless_idmap + + # Some setup for this test (AUX_DIR and AUX_UID) is done + # by rootless.sh. Check that setup is done... + if [[ ! -d "$AUX_DIR" || -z "$AUX_UID" ]]; then + skip "bad/unset AUX_DIR/AUX_UID" + fi + # ... and is correct, i.e. the current user + # does not have permission to access AUX_DIR. 
+ if ls -l "$AUX_DIR" 2>/dev/null; then + skip "bad AUX_DIR permissions" + fi + + update_config ' .mounts += [{ + source: "'"$AUX_DIR"'", + destination: "'"$AUX_DIR"'", + options: ["bind"] + }] + | .process.user.uid = '"$AUX_UID"' + | .process.cwd = "'"$AUX_DIR"'" + | .process.args |= ["ls", "'"$AUX_DIR"'"]' + + runc run test_busybox + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/debug.bats b/sysbox-runc/tests/integration/debug.bats new file mode 100644 index 00000000..a92b9013 --- /dev/null +++ b/sysbox-runc/tests/integration/debug.bats @@ -0,0 +1,78 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_hello + setup_hello +} + +function teardown() { + teardown_hello +} + +@test "global --debug" { + # run hello-world + runc --debug run test_hello + echo "${output}" + [ "$status" -eq 0 ] + + # check expected debug output was sent to stderr + [[ "${output}" == *"level=debug"* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} + +@test "global --debug to --log" { + # run hello-world + runc --log log.out --debug run test_hello + [ "$status" -eq 0 ] + + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] + + # check log.out was generated + [ -e log.out ] + + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *"level=debug"* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} + +@test "global --debug to --log --log-format 'text'" { + # run hello-world + runc --log log.out --log-format "text" --debug run test_hello + [ "$status" -eq 0 ] + + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] + + # check log.out was generated + [ -e log.out ] + + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *"level=debug"* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in 
init()"* ]] +} + +@test "global --debug to --log --log-format 'json'" { + # run hello-world + runc --log log.out --log-format "json" --debug run test_hello + [ "$status" -eq 0 ] + + # check output does not include debug info + [[ "${output}" != *"level=debug"* ]] + + # check log.out was generated + [ -e log.out ] + + # check expected debug output was sent to log.out + output=$(cat log.out) + [[ "${output}" == *'"level":"debug"'* ]] + [[ "${output}" == *"nsexec started"* ]] + [[ "${output}" == *"child process in init()"* ]] +} diff --git a/sysbox-runc/tests/integration/delete.bats b/sysbox-runc/tests/integration/delete.bats new file mode 100644 index 00000000..ad4d85cd --- /dev/null +++ b/sysbox-runc/tests/integration/delete.bats @@ -0,0 +1,155 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox + teardown_running_container testbusyboxdelete +} + +@test "runc delete" { + runc run -d --console-socket "$CONSOLE_SOCKET" testbusyboxdelete + [ "$status" -eq 0 ] + + testcontainer testbusyboxdelete running + + runc kill testbusyboxdelete KILL + [ "$status" -eq 0 ] + retry 10 1 eval "__runc state testbusyboxdelete | grep -q 'stopped'" + + runc delete testbusyboxdelete + [ "$status" -eq 0 ] + + runc state testbusyboxdelete + [ "$status" -ne 0 ] + + output=$(find /sys/fs/cgroup -wholename '*testbusyboxdelete*' -type d) + [ "$output" = "" ] || fail "cgroup not cleaned up correctly: $output" +} + +@test "runc delete --force" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # force delete test_busybox + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] +} + +@test "runc delete --force ignore not exist" { + runc delete --force notexists + [ "$status" -eq 0 ] +} + +@test "runc delete --force in cgroupv1 with subcgroups" { + requires 
cgroups_v1 root cgroupns + set_cgroups_path "$BUSYBOX_BUNDLE" + set_cgroup_mount_writable "$BUSYBOX_BUNDLE" + + # sysbox-runc: the container sample spec comes with cgroup ns enabled + # update_config '.linux.namespaces += [{"type": "cgroup"}]' + + # sysbox-runc: this avoids the "__runc exec -d" below + update_config '.process.args = ["sleep", "1d"]' + + local subsystems="memory freezer" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # Skip as it triggers issue #707 + #__runc exec -d test_busybox sleep 1d + + # find the pid of sleep + pid=$(__runc exec test_busybox ps -a | grep 1d | awk '{print $1}') + [[ ${pid} =~ [0-9]+ ]] + + # create a sub-cgroup + cat < tasks + cat tasks +done +EOF + [ "$status" -eq 0 ] + [[ "$output" =~ [0-9]+ ]] + + for s in ${subsystems}; do + name=CGROUP_${s^^} + eval path=\$"${name}"/syscont-cgroup-root/foo + # shellcheck disable=SC2154 + [ -d "${path}" ] || fail "test failed to create memory sub-cgroup ($path not found)" + done + + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] + + output=$(find /sys/fs/cgroup -wholename '*testbusyboxdelete*' -type d) + [ "$output" = "" ] || fail "cgroup not cleaned up correctly: $output" +} + +@test "runc delete --force in cgroupv2 with subcgroups" { + requires cgroups_v2 root + set_cgroups_path "$BUSYBOX_BUNDLE" + set_cgroup_mount_writable "$BUSYBOX_BUNDLE" + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # create a sub process + __runc exec -d test_busybox sleep 1d + + # find the pid of sleep + pid=$(__runc exec test_busybox ps -a | grep 1d | awk '{print $1}') + [[ ${pid} =~ [0-9]+ ]] + + # create subcgroups + cat <nest.sh + set -e -u -x + cd /sys/fs/cgroup + echo +pids > cgroup.subtree_control + mkdir foo + cd foo + echo threaded > cgroup.type + echo ${pid} > 
cgroup.threads + cat cgroup.threads +EOF + runc exec test_busybox sh events.log) & + # 2. Waits for an event that includes test_busybox then kills the + # test_busybox container which causes the event logger to exit. + ( + retry 10 "$retry_every" eval "grep -q 'test_busybox' events.log" + teardown_running_container test_busybox + ) & + wait # for both subshells to finish + + [ -e events.log ] + + output=$(head -1 events.log) + [[ "$output" == [\{]"\"type\""[:]"\"stats\""[,]"\"id\""[:]"\"test_busybox\""[,]* ]] + [[ "$output" == *"data"* ]] +} + +@test "events --interval default" { + test_events +} + +@test "events --interval 1s" { + test_events 1s 1 +} + +@test "events --interval 100ms" { + test_events 100ms 0.1 +} + +@test "events oom" { + + # XXX: DEBUG + skip "XXX: hangs on ubuntu-impish host; needs debug" + + # XXX: currently cgroups require root containers. + requires root cgroups_swap + init_cgroup_paths + + # we need the container to hit OOM, so disable swap + update_config '(.. | select(.resources? 
!= null)) .resources.memory |= {"limit": 33554432, "swap": 33554432}' "${BUSYBOX_BUNDLE}" + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # spawn two sub processes (shells) + # the first sub process is an event logger that sends stats events to events.log + # the second sub process exec a memory hog process to cause a oom condition + # and waits for an oom event + (__runc events test_busybox >events.log) & + ( + retry 10 1 eval "grep -q 'test_busybox' events.log" + # shellcheck disable=SC2016 + __runc exec -d test_busybox sh -c 'test=$(dd if=/dev/urandom ibs=5120k)' + retry 10 1 eval "grep -q 'oom' events.log" + __runc delete -f test_busybox + ) & + wait # wait for the above sub shells to finish + + grep -q '{"type":"oom","id":"test_busybox"}' events.log +} diff --git a/sysbox-runc/tests/integration/exec.bats b/sysbox-runc/tests/integration/exec.bats new file mode 100644 index 00000000..3f428d30 --- /dev/null +++ b/sysbox-runc/tests/integration/exec.bats @@ -0,0 +1,138 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc exec" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] +} + +@test "runc exec --pid-file" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --pid-file pid.txt test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] + + # check pid.txt was generated + [ -e pid.txt ] + + output=$(cat pid.txt) + [[ "$output" =~ [0-9]+ ]] + [[ "$output" != $(__runc state test_busybox | jq '.pid') ]] +} + +@test "runc exec --pid-file 
with new CWD" { + # create pid_file directory as the CWD + mkdir pid_file + cd pid_file + + # run busybox detached + runc run -d -b "$BUSYBOX_BUNDLE" --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --pid-file pid.txt test_busybox echo Hello from exec + [ "$status" -eq 0 ] + echo text echoed = "'""${output}""'" + [[ "${output}" == *"Hello from exec"* ]] + + # check pid.txt was generated + [ -e pid.txt ] + + output=$(cat pid.txt) + [[ "$output" =~ [0-9]+ ]] + [[ "$output" != $(__runc state test_busybox | jq '.pid') ]] +} + +@test "runc exec ls -la" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox ls -la + [ "$status" -eq 0 ] + [[ ${lines[0]} == *"total"* ]] + [[ ${lines[1]} == *"."* ]] + [[ ${lines[2]} == *".."* ]] +} + +@test "runc exec ls -la with --cwd" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --cwd /bin test_busybox pwd + [ "$status" -eq 0 ] + [[ ${output} == "/bin"* ]] +} + +@test "runc exec --env" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --env RUNC_EXEC_TEST=true test_busybox env + [ "$status" -eq 0 ] + + [[ ${output} == *"RUNC_EXEC_TEST=true"* ]] +} + +@test "runc exec --user" { + # --user can't work in rootless containers that don't have idmap. 
+ [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec --user 1000:1000 test_busybox id + [ "$status" -eq 0 ] + + [[ "${output}" == "uid=1000 gid=1000"* ]] +} + +@test "runc exec --additional-gids" { + requires root + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + wait_for_container 15 1 test_busybox + + runc exec --user 1000:1000 --additional-gids 100 --additional-gids 65534 test_busybox id -G + [ "$status" -eq 0 ] + + [[ ${output} == "1000 100 65534" ]] +} + +@test "runc exec --preserve-fds" { + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + echo hello >preserve-fds.test + # fd 3 is used by bats, so we use 4 + exec 4&2 + echo "$output" >&2 +} + +# Raw wrapper for runc. +function __runc() { + "$RUNC" ${RUNC_USE_SYSTEMD+--systemd-cgroup} ${RUNC_FLAGS} --root "$ROOT" "$@" +} + +# Wrapper for runc spec, which takes only one argument (the bundle path). +function runc_spec() { + ! [[ "$#" > 1 ]] + + local args=() + local bundle="" + + if [ "$ROOTLESS" -ne 0 ]; then + args+=("--rootless") + fi + if [ "$#" -ne 0 ]; then + bundle="$1" + args+=("--bundle" "$bundle") + fi + + # sysbox-runc: sys container spec takes id mappings + $RUNC spec "${args[@]}" --id-map "$UID_MAP $GID_MAP $ID_MAP_SIZE" + + # Always add additional mappings if we have idmaps. + if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"idmap"* ]]; then + runc_rootless_idmap "$bundle" + fi + + # Ensure config.json contains linux.resources + if [[ "$ROOTLESS" -ne 0 ]] && [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then + runc_rootless_cgroup "$bundle" + fi +} + +# Helper function to reformat config.json file. Input uses jq syntax. 
+function update_config() { + bundle="${2:-.}" + jq "$1" "$bundle/config.json" | awk 'BEGIN{RS="";getline<"-";print>ARGV[1]}' "$bundle/config.json" +} + +# Shortcut to add additional uids and gids, based on the values set as part of +# a rootless configuration. +function runc_rootless_idmap() { + bundle="${1:-.}" + update_config ' .mounts |= map((select(.type == "devpts") | .options += ["gid=5"]) // .) + | .linux.uidMappings += [{"hostID": '"$ROOTLESS_UIDMAP_START"', "containerID": 1000, "size": '"$ROOTLESS_UIDMAP_LENGTH"'}] + | .linux.gidMappings += [{"hostID": '"$ROOTLESS_GIDMAP_START"', "containerID": 100, "size": 1}] + | .linux.gidMappings += [{"hostID": '"$(($ROOTLESS_GIDMAP_START + 10))"', "containerID": 1, "size": 20}] + | .linux.gidMappings += [{"hostID": '"$(($ROOTLESS_GIDMAP_START + 100))"', "containerID": 1000, "size": '"$(($ROOTLESS_GIDMAP_LENGTH - 1000))"'}]' $bundle +} + +# Shortcut to add empty resources as part of a rootless configuration. +function runc_rootless_cgroup() { + bundle="${1:-.}" + update_config '.linux.resources += {"memory":{},"cpu":{},"blockio":{},"pids":{}}' $bundle +} + +# Returns systemd version as a number (-1 if systemd is not enabled/supported). 
+function systemd_version() { + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + systemctl --version | awk '/^systemd / {print $2; exit}' + return + fi + + echo "-1" +} + +function init_cgroup_paths() { + # init once + test -n "$CGROUP_UNIFIED" && return + + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + SD_UNIT_NAME="runc-cgroups-integration-test.scope" + if [ $(id -u) = "0" ]; then + REL_CGROUPS_PATH="/machine.slice/$SD_UNIT_NAME" + OCI_CGROUPS_PATH="machine.slice:runc-cgroups:integration-test" + else + REL_CGROUPS_PATH="/user.slice/user-$(id -u).slice/user@$(id -u).service/machine.slice/$SD_UNIT_NAME" + # OCI path doesn't contain "/user.slice/user-$(id -u).slice/user@$(id -u).service/" prefix + OCI_CGROUPS_PATH="machine.slice:runc-cgroups:integration-test" + fi + else + REL_CGROUPS_PATH="/runc-cgroups-integration-test/test-cgroup/" + OCI_CGROUPS_PATH=$REL_CGROUPS_PATH + fi + + if stat -f -c %t /sys/fs/cgroup | grep -qFw 63677270; then + CGROUP_UNIFIED=yes + # "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + # - devices (since kernel 4.15) we must assume to be supported because + # it's quite hard to test. + # - freezer (since kernel 5.2) we can auto-detect by looking for the + # "cgroup.freeze" file a *non-root* cgroup. + CGROUP_SUBSYSTEMS=$( + cat /sys/fs/cgroup/cgroup.controllers + echo devices + ) + CGROUP_BASE_PATH=/sys/fs/cgroup + CGROUP_PATH=${CGROUP_BASE_PATH}${REL_CGROUPS_PATH} + + # Find any cgroup.freeze files... 
+ if [ -n "$(find "$CGROUP_BASE_PATH" -type f -name "cgroup.freeze" -print -quit)" ]; then + CGROUP_SUBSYSTEMS+=" freezer" + fi + else + CGROUP_UNIFIED=no + CGROUP_SUBSYSTEMS=$(awk '!/^#/ {print $1}' /proc/cgroups) + for g in ${CGROUP_SUBSYSTEMS}; do + base_path=$(gawk '$(NF-2) == "cgroup" && $NF ~ /\<'${g}'\>/ { print $5; exit }' /proc/self/mountinfo) + test -z "$base_path" && continue + eval CGROUP_${g^^}_BASE_PATH="${base_path}" + eval CGROUP_${g^^}="${base_path}${REL_CGROUPS_PATH}" + done + fi +} + +# Helper function to set cgroupsPath to the value of $OCI_CGROUPS_PATH +function set_cgroups_path() { + bundle="${1:-.}" + init_cgroup_paths + update_config '.linux.cgroupsPath |= "'"${OCI_CGROUPS_PATH}"'"' $bundle +} + +# Helper to check a value in cgroups. +function check_cgroup_value() { + source=$1 + expected=$2 + + if [ "x$CGROUP_UNIFIED" = "xyes" ]; then + cgroup=$CGROUP_PATH + else + ctrl=${source%%.*} + eval cgroup=\$CGROUP_${ctrl^^} + fi + + current=$(cat $cgroup/$source) + echo $cgroup/$source + echo "current" $current "!?" "$expected" + [ "$current" = "$expected" ] +} + +# Helper to check a value in systemd. +function check_systemd_value() { + [ -z "${RUNC_USE_SYSTEMD}" ] && return + local source=$1 + [ "$source" = "unsupported" ] && return + local expected="$2" + local expected2="$3" + local user="" + [ $(id -u) != "0" ] && user="--user" + + current=$(systemctl show $user --property $source $SD_UNIT_NAME | awk -F= '{print $2}') + echo "systemd $source: current $current !? 
$expected $expected2" + [ "$current" = "$expected" ] || [ -n "$expected2" -a "$current" = "$expected2" ] +} + +function check_cpu_quota() { + local quota=$1 + local period=$2 + local sd_quota=$3 + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + if [ "$quota" = "-1" ]; then + quota="max" + fi + check_cgroup_value "cpu.max" "$quota $period" + else + check_cgroup_value "cpu.cfs_quota_us" $quota + check_cgroup_value "cpu.cfs_period_us" "$period" + fi + # systemd values are the same for v1 and v2 + check_systemd_value "CPUQuotaPerSecUSec" "$sd_quota" + + # CPUQuotaPeriodUSec requires systemd >= v242 + [ "$(systemd_version)" -lt 242 ] && return + + local sd_period=$((period / 1000))ms + [ "$sd_period" = "1000ms" ] && sd_period="1s" + local sd_infinity="" + # 100ms is the default value, and if not set, shown as infinity + [ "$sd_period" = "100ms" ] && sd_infinity="infinity" + check_systemd_value "CPUQuotaPeriodUSec" $sd_period $sd_infinity +} + +# Works for cgroup v1 and v2, accepts v1 shares as an argument. +function check_cpu_shares() { + local shares=$1 + + if [ "$CGROUP_UNIFIED" = "yes" ]; then + local weight=$((1 + ((shares - 2) * 9999) / 262142)) + check_cpu_weight "$weight" + else + check_cgroup_value "cpu.shares" "$shares" + check_systemd_value "CPUShares" "$shares" + fi +} + +# Works only for cgroup v2, accept v2 weight. +function check_cpu_weight() { + local weight=$1 + + check_cgroup_value "cpu.weight" $weight + check_systemd_value "CPUWeight" $weight +} + +# Helper function to set a resources limit +function set_resources_limit() { + bundle="${1:-.}" + update_config '.linux.resources.pids.limit |= 100' $bundle +} + +# Helper function to make /sys/fs/cgroup writable +function set_cgroup_mount_writable() { + bundle="${1:-.}" + update_config '.mounts |= map((select(.type == "cgroup") | .options -= ["ro"]) // .)' \ + $bundle +} + +# Fails the current test, providing the error given. 
+function fail() { + echo "$@" >&2 + exit 1 +} + +# Allows a test to specify what things it requires. If the environment can't +# support it, the test is skipped with a message. +function requires() { + for var in "$@"; do + local skip_me + case $var in + criu) + if [ ! -e "$CRIU" ]; then + skip_me=1 + fi + ;; + root) + if [ "$ROOTLESS" -ne 0 ]; then + skip_me=1 + fi + ;; + rootless) + if [ "$ROOTLESS" -eq 0 ]; then + skip_me=1 + fi + ;; + rootless_idmap) + if [[ "$ROOTLESS_FEATURES" != *"idmap"* ]]; then + skip_me=1 + fi + ;; + rootless_cgroup) + if [[ "$ROOTLESS_FEATURES" != *"cgroup"* ]]; then + skip_me=1 + fi + ;; + rootless_no_cgroup) + if [[ "$ROOTLESS_FEATURES" == *"cgroup"* ]]; then + skip_me=1 + fi + ;; + rootless_no_features) + if [ "$ROOTLESS_FEATURES" != "" ]; then + skip_me=1 + fi + ;; + cgroups_freezer) + init_cgroup_paths + if [[ "$CGROUP_SUBSYSTEMS" != *"freezer"* ]]; then + skip_me=1 + fi + ;; + cgroups_rt) + init_cgroup_paths + if [ ! -e "${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" ]; then + skip_me=1 + fi + ;; + cgroups_swap) + init_cgroup_paths + if [ $CGROUP_UNIFIED = "no" -a ! -e "${CGROUP_MEMORY_BASE_PATH}/memory.memsw.limit_in_bytes" ]; then + skip_me=1 + fi + ;; + cgroupns) + if [ ! -e "/proc/self/ns/cgroup" ]; then + skip_me=1 + fi + ;; + cgroups_v1) + init_cgroup_paths + if [ "$CGROUP_UNIFIED" != "no" ]; then + skip_me=1 + fi + ;; + cgroups_v2) + init_cgroup_paths + if [ "$CGROUP_UNIFIED" != "yes" ]; then + skip_me=1 + fi + ;; + smp) + local cpu_count=$(grep -c '^processor' /proc/cpuinfo) + if [ "$cpu_count" -lt 2 ]; then + skip_me=1 + fi + ;; + systemd) + if [ -z "${RUNC_USE_SYSTEMD}" ]; then + skip_me=1 + fi + ;; + no_systemd) + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + skip_me=1 + fi + ;; + *) + fail "BUG: Invalid requires $var." + ;; + esac + if [ -n "$skip_me" ]; then + skip "test requires $var" + fi + done +} + +# Retry a command $1 times until it succeeds. Wait $2 seconds between retries. 
+function retry() { + local attempts=$1 + shift + local delay=$1 + shift + local i + + for ((i = 0; i < attempts; i++)); do + run "$@" + if [[ "$status" -eq 0 ]]; then + return 0 + fi + sleep $delay + done + + echo "Command \"$@\" failed $attempts times. Output: $output" + false +} + +# retry until the given container has state +function wait_for_container() { + local attempts=$1 + local delay=$2 + local cid=$3 + # optionally wait for a specific status + local wait_for_status="${4:-}" + local i + + for ((i = 0; i < attempts; i++)); do + runc state $cid + if [[ "$status" -eq 0 ]]; then + if [[ "${output}" == *"${wait_for_status}"* ]]; then + return 0 + fi + fi + sleep $delay + done + + echo "runc state failed to return state $statecheck $attempts times. Output: $output" + false +} + +# retry until the given container has state +function wait_for_container_inroot() { + local attempts=$1 + local delay=$2 + local cid=$3 + # optionally wait for a specific status + local wait_for_status="${4:-}" + local i + + for ((i = 0; i < attempts; i++)); do + ROOT=$4 runc state $cid + if [[ "$status" -eq 0 ]]; then + if [[ "${output}" == *"${wait_for_status}"* ]]; then + return 0 + fi + fi + sleep $delay + done + + echo "runc state failed to return state $statecheck $attempts times. Output: $output" + false +} + +function testcontainer() { + # test state of container + runc state $1 + if [ $2 == "checkpointed" ]; then + [ "$status" -eq 1 ] + return + fi + [ "$status" -eq 0 ] + [[ "${output}" == *"$2"* ]] +} + +function setup_recvtty() { + # We need to start recvtty in the background, so we double fork in the shell. + ("$RECVTTY" --pid-file "$WORK_DIR/recvtty.pid" --mode null "$CONSOLE_SOCKET" &) & +} + +function teardown_recvtty() { + # When we kill recvtty, the container will also be killed. + if [ -f "$WORK_DIR/recvtty.pid" ]; then + kill -9 $(cat "$WORK_DIR/recvtty.pid") + fi + + # Clean up the files that might be left over. 
+ rm -f "$WORK_DIR/recvtty.pid" + rm -f "$CONSOLE_SOCKET" +} + +function setup_busybox() { + setup_recvtty + mkdir -p "$BUSYBOX_BUNDLE"/rootfs + + if [ -e "/testdata/busybox.tar" ]; then + BUSYBOX_IMAGE="/testdata/busybox.tar" + fi + if [ ! -e $BUSYBOX_IMAGE ]; then + curl -o $BUSYBOX_IMAGE -sSL $(get_busybox) + fi + tar --exclude './dev/*' -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE" + + # sysbox-runc: set bundle ownership to match system + # container's uid/gid map, except if using uid-shifting + if [ -z "$SHIFT_ROOTFS_UIDS" ]; then + chown -R "$UID_MAP":"$GID_MAP" "$BUSYBOX_BUNDLE" + fi + + cd "$BUSYBOX_BUNDLE" + + runc_spec +} + +function setup_hello() { + setup_recvtty + + mkdir -p "$HELLO_BUNDLE"/rootfs + tar --exclude './dev/*' -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" + + # sysbox-runc: set bundle ownership to match system + # container's uid/gid map, except if using uid-shifting + if [ -z "$SHIFT_ROOTFS_UIDS" ]; then + chown -R "$UID_MAP":"$GID_MAP" "$HELLO_BUNDLE" + fi + + cd "$HELLO_BUNDLE" + runc_spec + update_config '(.. | select(.? == "sh")) |= "/hello"' +} + +function setup_debian() { + # skopeo and umoci are not installed on the travis runner + if [ -n "${RUNC_USE_SYSTEMD}" ]; then + return + fi + + setup_recvtty + mkdir -p "$DEBIAN_BUNDLE" + + if [ ! -d "$DEBIAN_ROOTFS/rootfs" ]; then + get_and_extract_debian "$DEBIAN_BUNDLE" + fi + + # Use the cached version + if [ ! 
-d "$DEBIAN_BUNDLE/rootfs" ]; then + cp -r "$DEBIAN_ROOTFS"/* "$DEBIAN_BUNDLE/" + fi + + cd "$DEBIAN_BUNDLE" +} + +function teardown_running_container() { + __runc delete -f "$1" +} + +function teardown_running_container_inroot() { + ROOT="$2" __runc delete -f "$1" +} + +function teardown_busybox() { + cd "$INTEGRATION_ROOT" + teardown_recvtty + teardown_running_container test_busybox + rm -f -r "$BUSYBOX_BUNDLE" +} + +function teardown_hello() { + cd "$INTEGRATION_ROOT" + teardown_recvtty + teardown_running_container test_hello + rm -f -r "$HELLO_BUNDLE" +} + +function teardown_debian() { + cd "$INTEGRATION_ROOT" + teardown_recvtty + teardown_running_container test_debian + rm -f -r "$DEBIAN_BUNDLE" +} diff --git a/sysbox-runc/tests/integration/hooks.bats b/sysbox-runc/tests/integration/hooks.bats new file mode 100644 index 00000000..31ec2a16 --- /dev/null +++ b/sysbox-runc/tests/integration/hooks.bats @@ -0,0 +1,64 @@ +#!/usr/bin/env bats + +load helpers + +# CR = CreateRuntime +# CC = CreataContainer +HOOKLIBCR=librunc-hooks-create-runtime.so +HOOKLIBCC=librunc-hooks-create-container.so +LIBPATH="$DEBIAN_BUNDLE/rootfs/lib/" + +function setup() { + umount "$LIBPATH"/$HOOKLIBCR.1.0.0 &>/dev/null || true + umount "$LIBPATH"/$HOOKLIBCC.1.0.0 &>/dev/null || true + + requires root no_systemd + + teardown_debian + setup_debian +} + +function teardown() { + umount "$LIBPATH"/$HOOKLIBCR.1.0.0 &>/dev/null || true + umount "$LIBPATH"/$HOOKLIBCC.1.0.0 &>/dev/null || true + + rm -f $HOOKLIBCR.1.0.0 $HOOKLIBCC.1.0.0 + teardown_debian +} + +@test "runc run (hooks library tests)" { + skip "unsupported" + + # setup some dummy libs + gcc -shared -Wl,-soname,librunc-hooks-create-runtime.so.1 -o "$HOOKLIBCR.1.0.0" + gcc -shared -Wl,-soname,librunc-hooks-create-container.so.1 -o "$HOOKLIBCC.1.0.0" + + current_pwd="$(pwd)" + + # To mount $HOOKLIBCR we need to do that in the container namespace + create_runtime_hook=$( + cat <<-EOF + pid=\$(cat - | jq -r '.pid') + touch 
"$LIBPATH/$HOOKLIBCR.1.0.0" + nsenter -m \$ns -t \$pid mount --bind "$current_pwd/$HOOKLIBCR.1.0.0" "$LIBPATH/$HOOKLIBCR.1.0.0" + EOF + ) + + create_container_hook="touch ./lib/$HOOKLIBCC.1.0.0 && mount --bind $current_pwd/$HOOKLIBCC.1.0.0 ./lib/$HOOKLIBCC.1.0.0" + + CONFIG=$(jq --arg create_runtime_hook "$create_runtime_hook" --arg create_container_hook "$create_container_hook" ' + .hooks |= . + {"createRuntime": [{"path": "/bin/sh", "args": ["/bin/sh", "-c", $create_runtime_hook]}]} | + .hooks |= . + {"createContainer": [{"path": "/bin/sh", "args": ["/bin/sh", "-c", $create_container_hook]}]} | + .hooks |= . + {"startContainer": [{"path": "/bin/sh", "args": ["/bin/sh", "-c", "ldconfig"]}]} | + .process.args = ["/bin/sh", "-c", "ldconfig -p | grep librunc"]' "$DEBIAN_BUNDLE"/config.json) + echo "${CONFIG}" >config.json + + runc run test_debian + [ "$status" -eq 0 ] + + echo "Checking create-runtime library" + echo "$output" | grep $HOOKLIBCR + + echo "Checking create-container library" + echo "$output" | grep $HOOKLIBCC +} diff --git a/sysbox-runc/tests/integration/kill.bats b/sysbox-runc/tests/integration/kill.bats new file mode 100644 index 00000000..4233fd96 --- /dev/null +++ b/sysbox-runc/tests/integration/kill.bats @@ -0,0 +1,33 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "kill detached busybox" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + # we should ensure kill work after the container stopped + runc kill -a test_busybox 0 + [ "$status" -eq 0 ] + + runc delete test_busybox + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/list.bats b/sysbox-runc/tests/integration/list.bats new file mode 
100644 index 00000000..4a6c4f5c --- /dev/null +++ b/sysbox-runc/tests/integration/list.bats @@ -0,0 +1,56 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_running_container_inroot test_box1 "$HELLO_BUNDLE" + teardown_running_container_inroot test_box2 "$HELLO_BUNDLE" + teardown_running_container_inroot test_box3 "$HELLO_BUNDLE" + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_running_container_inroot test_box1 "$HELLO_BUNDLE" + teardown_running_container_inroot test_box2 "$HELLO_BUNDLE" + teardown_running_container_inroot test_box3 "$HELLO_BUNDLE" + teardown_busybox +} + +@test "list" { + # run a few busyboxes detached + ROOT=$HELLO_BUNDLE runc run -d --console-socket "$CONSOLE_SOCKET" test_box1 + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc run -d --console-socket "$CONSOLE_SOCKET" test_box2 + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc run -d --console-socket "$CONSOLE_SOCKET" test_box3 + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc list + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] + [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + + ROOT=$HELLO_BUNDLE runc list -q + [ "$status" -eq 0 ] + [[ "${lines[0]}" == "test_box1" ]] + [[ "${lines[1]}" == "test_box2" ]] + [[ "${lines[2]}" == "test_box3" ]] + + ROOT=$HELLO_BUNDLE runc list --format table + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ ID\ +PID\ +STATUS\ +BUNDLE\ +CREATED+ ]] + [[ "${lines[1]}" == *"test_box1"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[2]}" == *"test_box2"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + [[ "${lines[3]}" == *"test_box3"*[0-9]*"running"*$BUSYBOX_BUNDLE*[0-9]* ]] + + ROOT=$HELLO_BUNDLE runc list --format json + [ "$status" -eq 0 ] + [[ "${lines[0]}" == 
[\[][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box1\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] + [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box2\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}]* ]] + [[ "${lines[0]}" == *[,][\{]"\"ociVersion\""[:]"\""*[0-9][\.]*[0-9][\.]*[0-9]*"\""[,]"\"id\""[:]"\"test_box3\""[,]"\"pid\""[:]*[0-9][,]"\"status\""[:]*"\"running\""[,]"\"bundle\""[:]*$BUSYBOX_BUNDLE*[,]"\"rootfs\""[:]"\""*"\""[,]"\"created\""[:]*[0-9]*[\}][\]] ]] +} diff --git a/sysbox-runc/tests/integration/mask.bats b/sysbox-runc/tests/integration/mask.bats new file mode 100644 index 00000000..d56eb9d6 --- /dev/null +++ b/sysbox-runc/tests/integration/mask.bats @@ -0,0 +1,83 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox + + # Create fake rootfs. + mkdir rootfs/testdir + echo "Forbidden information!" >rootfs/testfile + + # sysbox-runc + if [ -z "$SHIFT_ROOTFS_UIDS" ]; then + chown "$UID_MAP":"$GID_MAP" rootfs/testdir + chown "$UID_MAP":"$GID_MAP" rootfs/testfile + fi + + # add extra masked paths + update_config '(.. | select(.maskedPaths? 
!= null)) .maskedPaths += ["/testdir", "/testfile"]' +} + +function teardown() { + teardown_busybox +} + +@test "mask paths [file]" { + + skip "NEEDS FIX" + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox cat /testfile + [ "$status" -eq 0 ] + [[ "${output}" == "" ]] + + runc exec test_busybox rm -f /testfile + [ "$status" -eq 1 ] + [[ "${output}" == *"Device or resource busy"* ]] + + # TODO: this operation passes in sys containers, but probably should + # fail; we don't want to allow unmasking of a masked path. + + runc exec test_busybox umount /testfile + [ "$status" -eq 1 ] + [[ "${output}" == *"Device or resource busy"* ]] +} + +@test "mask paths [directory]" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox ls /testdir + [ "$status" -eq 0 ] + [[ "${output}" == "" ]] + + runc exec test_busybox touch /testdir/foo + [ "$status" -eq 1 ] + [[ "${output}" == *"Read-only file system"* ]] + + runc exec test_busybox rm -rf /testdir + [ "$status" -eq 1 ] + [[ "${output}" == *"Device or resource busy"* ]] +} + +# sysbox-runc: this test is expected to fail until sysbox can intercept +# the mount syscall to prevent umounting of mounts for masked paths +# @test "mask path umounting" { +# run busybox detached +# runc run -d --console-socket $CONSOLE_SOCKET test_busybox +# [ "$status" -eq 0 ] +# +# runc exec test_busybox umount /testfile +# [ "$status" -eq 1 ] +# [[ "${output}" == *"Operation not permitted"* ]] +# +# runc exec test_busybox umount /testdir +# [ "$status" -eq 1 ] +# [[ "${output}" == *"Operation not permitted"* ]] +# } diff --git a/sysbox-runc/tests/integration/mounts.bats b/sysbox-runc/tests/integration/mounts.bats new file mode 100644 index 00000000..530d3b99 --- /dev/null +++ b/sysbox-runc/tests/integration/mounts.bats @@ -0,0 +1,213 @@ +#!/usr/bin/env bats + +load helpers + +function
setup_busybox_tmpfs() { + + mkdir -p /tmp/busyboxtest/rootfs + mount -t tmpfs tmpfs /tmp/busyboxtest/rootfs + + tar --exclude './dev/*' -C /tmp/busyboxtest/rootfs -xf "$BUSYBOX_IMAGE" + + # sysbox-runc: set bundle ownership to match system + # container's uid(gid) map, except if using uid-shifting + if [ -z "$SHIFT_ROOTFS_UIDS" ]; then + chown -R "$UID_MAP":"$GID_MAP" /tmp/busyboxtest + fi + + cd /tmp/busyboxtest + runc_spec +} + +function cleanup_busybox_tmpfs() { + cd + teardown_running_container "$1" + + run sh -c 'findmnt -o TARGET | grep /tmp/busyboxtest/rootfs' + if [ "$status" -eq 0 ]; then + umount /tmp/busyboxtest/rootfs + fi + + rm -rf /tmp/busyboxtest +} + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run [bind mount]" { + mkdir -p /mnt/test-dir + touch /mnt/test-dir/test-file + + update_config ' .mounts |= . + [{ + source: "/mnt/test-dir", + destination: "/mnt/test-dir", + options: ["bind"] + }] + | .process.args = ["ls", "/mnt/test-dir/"]' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ 'test-file' ]] + + rm -rf /mnt/test-dir +} + +@test "runc run [ro tmpfs mount]" { + update_config ' .mounts += [{ + source: "tmpfs", + destination: "/mnt", + type: "tmpfs", + options: ["ro", "nodev", "nosuid", "mode=755"] + }] + | .process.args |= ["grep", "^tmpfs /mnt", "/proc/mounts"]' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" == *'ro,'* ]] +} + +@test "runc runc [bind mount above rootfs]" { + + # test: bind mount source path is above but not directly above rootfs + run mkdir bindSrc + [ "$status" -eq 0 ] + + run touch bindSrc/test-file + [ "$status" -eq 0 ] + + update_config ' .mounts |= . 
+ [{ + source: "bindSrc", + destination: "/tmp/bind", + options: ["bind"] + }] + | .process.args = ["ls", "/tmp/bind/"]' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ 'test-file' ]] +} + +@test "runc run [bind mount directly above rootfs]" { + + # Bind mounting a dir located directly above container's rootfs into the + # container leads to shiftfs-on-shiftfs, and this is not allowed by + # shiftfs. To solve this, the sysbox-mgr marks shiftfs mounts by creating + # mark points under /var/lib/sysbox, which prevents the shiftfs-on-shiftfs + # scenario. + # + # Thus, this test requires the sysbox-mgr, so we can't run it (since sysbox-mgr + # is not present in sysbox-runc integration tests). + # + # Though sysbox-runc has a mock shiftfs mark code in setupShiftfsMarkLocal() + # (see container_linux.go), this code does not prevent the shiftfs-on-shiftfs + # scenario so the test would fail. We can re-enable this test if and when + # the mock shiftfs mark code handles the shiftfs-on-shiftfs scenario. + + if [ -n "$SHIFT_ROOTFS_UIDS" ]; then + skip "Requires sysbox-mgr; skip" + fi + + update_config ' .mounts |= . + [{ + source: ".", + destination: "/tmp/bind", + options: ["bind"] + }] + | .process.args = ["ls", "/tmp/bind/"]' + + runc run test_busybox + + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ config.json ]] +} + +@test "runc run [bind mount below the rootfs]" { + + update_config ' .mounts |= . 
+ [{ + source: "rootfs/root", + destination: "/tmp/bind", + options: ["bind"] + }] + | .process.args = ["/bin/sh"]' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox touch /root/test-file.txt + [ "$status" -eq 0 ] + + runc exec test_busybox ls /root + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ test-file.txt ]] + + runc exec test_busybox ls /tmp/bind + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ test-file.txt ]] + + runc exec test_busybox rm /tmp/bind/test-file.txt + [ "$status" -eq 0 ] + + runc exec test_busybox ls /root + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ '' ]] +} + +@test "runc run [rootfs on tmpfs]" { + setup_busybox_tmpfs + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc kill test_busybox + [ "$status" -eq 0 ] + + cleanup_busybox_tmpfs test_busybox +} + +@test "runc run [bind mount on tmpfs]" { + mkdir -p /tmp/busyboxtest/test-dir + mount -t tmpfs tmpfs /tmp/busyboxtest/test-dir + touch /tmp/busyboxtest/test-dir/test-file + + update_config ' .mounts |= . 
+ [{ + source: "/tmp/busyboxtest/test-dir", + destination: "/tmp/bind", + options: ["bind"] + }] + | .process.args = ["ls", "/tmp/bind"]' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "${lines[0]}" =~ 'test-file' ]] + + umount /tmp/busyboxtest/test-dir + [ "$status" -eq 0 ] + + rm -rf /tmp/busyboxtest +} + +@test "runc run [tmpfs mount with absolute symlink]" { + # in container, /conf -> /real/conf + mkdir -p rootfs/real/conf + + if [ -z "$SHIFT_ROOTFS_UIDS" ]; then + chown -R "$UID_MAP":"$GID_MAP" rootfs/real/conf + fi + + ln -s /real/conf rootfs/conf + + update_config ' .mounts += [{ + type: "tmpfs", + source: "tmpfs", + destination: "/conf/stack", + options: ["ro", "nodev", "nosuid"] + }] + | .process.args |= ["true"]' + runc run test_busybox + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/multi-arch.bash b/sysbox-runc/tests/integration/multi-arch.bash new file mode 100644 index 00000000..1dd751bb --- /dev/null +++ b/sysbox-runc/tests/integration/multi-arch.bash @@ -0,0 +1,44 @@ +#!/bin/bash +get_busybox() { + case $(go env GOARCH) in + arm64) + echo 'https://github.com/docker-library/busybox/raw/dist-arm64v8/stable/glibc/busybox.tar.xz' + ;; + *) + echo 'https://github.com/docker-library/busybox/raw/dist-amd64/stable/glibc/busybox.tar.xz' + ;; + esac +} + +get_hello() { + case $(go env GOARCH) in + arm64) + echo 'hello-world-aarch64.tar' + ;; + *) + echo 'hello-world.tar' + ;; + esac +} + +get_and_extract_debian() { + tmp=$(mktemp -d) + cd "$tmp" + + debian="debian:3.11.6" + + case $(go env GOARCH) in + arm64) + skopeo copy docker://arm64v8/debian:buster "oci:$debian" + ;; + *) + skopeo copy docker://amd64/debian:buster "oci:$debian" + ;; + esac + + args="$([ -z "${ROOTLESS_TESTPATH+x}" ] && echo "--rootless")" + umoci unpack $args --image "$debian" "$1" + + cd - + rm -rf "$tmp" +} diff --git a/sysbox-runc/tests/integration/no_pivot.bats b/sysbox-runc/tests/integration/no_pivot.bats new file mode 100644 index 00000000..844a0ca5 --- 
/dev/null +++ b/sysbox-runc/tests/integration/no_pivot.bats @@ -0,0 +1,22 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run --no-pivot must not expose bare /proc" { + requires root + + update_config '.process.args |= ["unshare", "-mrpf", "sh", "-euxc", "mount -t proc none /proc && echo h > /proc/sysrq-trigger"]' + + runc run --no-pivot test_no_pivot + [ "$status" -eq 1 ] + [[ "$output" == *"mount: permission denied"* ]] +} diff --git a/sysbox-runc/tests/integration/pause.bats b/sysbox-runc/tests/integration/pause.bats new file mode 100644 index 00000000..1d6e2084 --- /dev/null +++ b/sysbox-runc/tests/integration/pause.bats @@ -0,0 +1,78 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc pause and resume" { + if [[ "$ROOTLESS" -ne 0 ]]; then + requires rootless_cgroup + set_cgroups_path "$BUSYBOX_BUNDLE" + fi + requires cgroups_freezer + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # pause busybox + runc pause test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is paused + testcontainer test_busybox paused + + # resume busybox + runc resume test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is back to running + testcontainer test_busybox running +} + +@test "runc pause and resume with nonexist container" { + if [[ "$ROOTLESS" -ne 0 ]]; then + requires rootless_cgroup + set_cgroups_path "$BUSYBOX_BUNDLE" + fi + requires cgroups_freezer + + # run test_busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # pause test_busybox and nonexistent container + runc pause test_busybox + [ "$status" -eq 0 ] + runc pause nonexistent + [ 
"$status" -ne 0 ] + + # test state of test_busybox is paused + testcontainer test_busybox paused + + # resume test_busybox and nonexistent container + runc resume test_busybox + [ "$status" -eq 0 ] + runc resume nonexistent + [ "$status" -ne 0 ] + + # test state of test_busybox is back to running + testcontainer test_busybox running + + # delete test_busybox + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] +} diff --git a/sysbox-runc/tests/integration/ps.bats b/sysbox-runc/tests/integration/ps.bats new file mode 100644 index 00000000..29cdcfac --- /dev/null +++ b/sysbox-runc/tests/integration/ps.bats @@ -0,0 +1,92 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "ps" { + # ps is not supported, it requires cgroups + requires root + + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]] + [[ "${lines[1]}" == *"$UID_MAP"*[0-9]* ]] +} + +@test "ps -f json" { + # ps is not supported, it requires cgroups + requires root + + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps -f json test_busybox + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ [0-9]+ ]] +} + +@test "ps -e" { + + # Note: in the OCI runc, this test uses "ps -e -x"; but the "-x" flag + # causes no processes to be listed because the process doing ps does + # no have the same UID as the process inside the sys container (due + # to sysbox's user-namespace usage). 
+ + # ps is not supported, it requires cgroups + requires root + + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps test_busybox -e + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ \ +PID\ +TTY\ +TIME\ +CMD+ ]] + [[ "${lines[1]}" =~ [0-9]+ ]] +} + +@test "ps after the container stopped" { + # ps requires cgroups + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + set_cgroups_path "$BUSYBOX_BUNDLE" + + # start busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc ps test_busybox + [ "$status" -eq 0 ] + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + runc ps test_busybox + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/root.bats b/sysbox-runc/tests/integration/root.bats new file mode 100644 index 00000000..f3cd3bab --- /dev/null +++ b/sysbox-runc/tests/integration/root.bats @@ -0,0 +1,50 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_running_container_inroot test_dotbox "$HELLO_BUNDLE" + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_running_container_inroot test_dotbox "$HELLO_BUNDLE" + teardown_busybox +} + +@test "global --root" { + # run busybox detached using $HELLO_BUNDLE for state + ROOT=$HELLO_BUNDLE runc run -d --console-socket "$CONSOLE_SOCKET" test_dotbox + [ "$status" -eq 0 ] + + # run busybox detached in default root + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc state test_busybox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] + + ROOT=$HELLO_BUNDLE runc state test_dotbox + [ "$status" -eq 0 ] + [[ "${output}" == *"running"* ]] + + ROOT=$HELLO_BUNDLE runc state test_busybox + [ "$status" -ne 0 ] + + runc state test_dotbox + 
[ "$status" -ne 0 ] + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + runc delete test_busybox + [ "$status" -eq 0 ] + + ROOT=$HELLO_BUNDLE runc kill test_dotbox KILL + [ "$status" -eq 0 ] + retry 10 1 eval "ROOT='$HELLO_BUNDLE' __runc state test_dotbox | grep -q 'stopped'" + ROOT=$HELLO_BUNDLE runc delete test_dotbox + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/run.bats b/sysbox-runc/tests/integration/run.bats new file mode 100644 index 00000000..63be89d2 --- /dev/null +++ b/sysbox-runc/tests/integration/run.bats @@ -0,0 +1,59 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_hello +} + +function teardown() { + teardown_bundle +} + +@test "runc run" { + runc run test_hello + [ "$status" -eq 0 ] + + runc state test_hello + [ "$status" -ne 0 ] +} + +@test "runc run --keep" { + runc run --keep test_run_keep + [ "$status" -eq 0 ] + + testcontainer test_run_keep stopped + + runc state test_run_keep + [ "$status" -eq 0 ] + + runc delete test_run_keep + + runc state test_run_keep + [ "$status" -ne 0 ] +} + +@test "runc run --keep (check cgroup exists)" { + # for systemd driver, the unit's cgroup path will be auto removed if container's all processes exited + requires no_systemd + + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + set_cgroups_path + + runc run --keep test_run_keep + [ "$status" -eq 0 ] + + testcontainer test_run_keep stopped + + runc state test_run_keep + [ "$status" -eq 0 ] + + # check that cgroup exists + check_cgroup_value "pids.max" "max" + + runc delete test_run_keep + + runc state test_run_keep + [ "$status" -ne 0 ] +} diff --git a/sysbox-runc/tests/integration/spec.bats b/sysbox-runc/tests/integration/spec.bats new file mode 100644 index 00000000..6edeff89 --- /dev/null +++ b/sysbox-runc/tests/integration/spec.bats @@ -0,0 +1,48 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + setup_hello + + # sysbox-runc: bundle 
must have same uid/gid as that passed to + # "runc spec" (see runc_spec()) + chown -R "$UID_MAP":"$GID_MAP" "$HELLO_BUNDLE" +} + +function teardown() { + teardown_hello +} + +@test "spec generation cwd" { + runc run test_hello + [ "$status" -eq 0 ] +} + +@test "spec generation --bundle" { + runc run --bundle "$HELLO_BUNDLE" test_hello + [ "$status" -eq 0 ] +} + +@test "spec validator" { + + requires rootless_no_features + + SPEC_VERSION=$(awk '$1 == "github.com/opencontainers/runtime-spec" {print $2}' "$BATS_TEST_DIRNAME"/../../go.mod) + # Will look like this when not pinned to specific tag: "v0.0.0-20190207185410-29686dbc5559", otherwise "v1.0.0" + SPEC_COMMIT=$(cut -d "-" -f 3 <<<"$SPEC_VERSION") + SPEC_REF=$([[ -z "$SPEC_COMMIT" ]] && echo "$SPEC_VERSION" || echo "$SPEC_COMMIT") + + git clone https://github.com/opencontainers/runtime-spec.git + (cd runtime-spec && git reset --hard "$SPEC_REF") + SCHEMA='runtime-spec/schema/config-schema.json' + [ -e "$SCHEMA" ] + + runc spec "$UID_MAP" "$GID_MAP" "$ID_MAP_SIZE" + [ -e config.json ] + + GO111MODULE=auto go get github.com/xeipuuv/gojsonschema + GO111MODULE=auto go build runtime-spec/schema/validate.go + + ./validate "$SCHEMA" config.json +} diff --git a/sysbox-runc/tests/integration/start.bats b/sysbox-runc/tests/integration/start.bats new file mode 100644 index 00000000..6a6e6168 --- /dev/null +++ b/sysbox-runc/tests/integration/start.bats @@ -0,0 +1,31 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc start" { + runc create --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox created + + # start container test_busybox + runc start test_busybox + [ "$status" -eq 0 ] + + testcontainer test_busybox running + + # delete test_busybox + runc delete --force test_busybox + + runc state test_busybox + [ "$status" -ne 0 ] +} diff --git 
a/sysbox-runc/tests/integration/start_detached.bats b/sysbox-runc/tests/integration/start_detached.bats new file mode 100644 index 00000000..2d67c6f7 --- /dev/null +++ b/sysbox-runc/tests/integration/start_detached.bats @@ -0,0 +1,70 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run detached" { + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running +} + +@test "runc run detached ({u,g}id != 0)" { + # cannot start containers as another user in rootless setup without idmap + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + update_config ' (.. | select(.uid? == 0)) .uid |= 1000 + | (.. | select(.gid? == 0)) .gid |= 100' + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running +} + +@test "runc run detached --pid-file" { + # run busybox detached + runc run --pid-file pid.txt -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # check pid.txt was generated + [ -e pid.txt ] + + [[ "$(cat pid.txt)" == $(__runc state test_busybox | jq '.pid') ]] +} + +@test "runc run detached --pid-file with new CWD" { + # create pid_file directory as the CWD + mkdir pid_file + cd pid_file + + # run busybox detached + runc run --pid-file pid.txt -d -b "$BUSYBOX_BUNDLE" --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # check pid.txt was generated + [ -e pid.txt ] + + [[ "$(cat pid.txt)" == $(__runc state test_busybox | jq '.pid') ]] +} diff --git a/sysbox-runc/tests/integration/start_hello.bats 
b/sysbox-runc/tests/integration/start_hello.bats new file mode 100644 index 00000000..b30138bd --- /dev/null +++ b/sysbox-runc/tests/integration/start_hello.bats @@ -0,0 +1,62 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_hello + setup_hello +} + +function teardown() { + teardown_hello +} + +@test "runc run" { + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + + # check expected output + [[ "${output}" == *"Hello"* ]] +} + +@test "runc run ({u,g}id != 0)" { + # cannot start containers as another user in rootless setup without idmap + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_idmap + + # replace "uid": 0 with "uid": 1000 + # and do a similar thing for gid. + update_config ' (.. | select(.uid? == 0)) .uid |= 1000 + | (.. | select(.gid? == 0)) .gid |= 100' + + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + + # check expected output + [[ "${output}" == *"Hello"* ]] +} + +@test "runc run with rootfs set to ." { + cp config.json rootfs/. + rm config.json + cd rootfs + update_config '(.. | select(. 
== "rootfs")) |= "."' + + # run hello-world + runc run test_hello + [ "$status" -eq 0 ] + [[ "${output}" == *"Hello"* ]] +} + +@test "runc run --pid-file" { + # run hello-world + runc run --pid-file pid.txt test_hello + [ "$status" -eq 0 ] + [[ "${output}" == *"Hello"* ]] + + # check pid.txt was generated + [ -e pid.txt ] + + [[ "$(cat pid.txt)" =~ [0-9]+ ]] +} diff --git a/sysbox-runc/tests/integration/state.bats b/sysbox-runc/tests/integration/state.bats new file mode 100644 index 00000000..d9e4b552 --- /dev/null +++ b/sysbox-runc/tests/integration/state.bats @@ -0,0 +1,66 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "state (kill + delete)" { + runc state test_busybox + [ "$status" -ne 0 ] + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + + # wait for busybox to be in the destroyed state + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + # delete test_busybox + runc delete test_busybox + [ "$status" -eq 0 ] + + runc state test_busybox + [ "$status" -ne 0 ] +} + +@test "state (pause + resume)" { + # XXX: pause and resume require cgroups. 
+ requires root + + runc state test_busybox + [ "$status" -ne 0 ] + + # run busybox detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check state + testcontainer test_busybox running + + # pause busybox + runc pause test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is paused + testcontainer test_busybox paused + + # resume busybox + runc resume test_busybox + [ "$status" -eq 0 ] + + # test state of busybox is back to running + testcontainer test_busybox running +} diff --git a/sysbox-runc/tests/integration/syscont-caps.bats b/sysbox-runc/tests/integration/syscont-caps.bats new file mode 100644 index 00000000..665cb682 --- /dev/null +++ b/sysbox-runc/tests/integration/syscont-caps.bats @@ -0,0 +1,100 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +# A sys container root process has full caps (regardless of the container spec) +@test "syscont: root process caps" { + + sed -i "/\"CAP_SYS_ADMIN\",/d" "${BUSYBOX_BUNDLE}/config.json" + sed -i "/\"CAP_NET_ADMIN\",/d" "${BUSYBOX_BUNDLE}/config.json" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # Ensure init is a root process in this container + runc exec test_busybox grep Uid /proc/1/status + [ "$status" -eq 0 ] + + for i in $(seq 2 5); do + id=$(echo "$output" | awk -v var="$i" '{print $var}') + [ "$id" -eq "0" ] + done + + # Ensure init has all caps + for capType in CapInh CapPrm CapEff CapBnd CapAmb; do + runc exec test_busybox grep "$capType" /proc/1/status + [ "$status" -eq 0 ] + [[ "${output}" == *"0000003fffffffff"* ]] + done +} + +# A sys container root process has all caps when entered via exec +@test "syscont: exec root process caps" { + + sed -i "/\"CAP_SYS_ADMIN\",/d" "${BUSYBOX_BUNDLE}/config.json" + sed -i "/\"CAP_NET_ADMIN\",/d" "${BUSYBOX_BUNDLE}/config.json" + + runc run -d --console-socket 
"$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + for capType in CapInh CapPrm CapEff CapBnd CapAmb; do + runc exec test_busybox grep "$capType" /proc/self/status + [ "$status" -eq 0 ] + [[ "${output}" == *"0000003fffffffff"* ]] + done +} + +# A sys container non-root init process caps are all cleared, except CapBnd +@test "syscont: init non-root process caps" { + + sed -i "s/\"uid\": 0/\"uid\": 1000/" "${BUSYBOX_BUNDLE}/config.json" + sed -i "s/\"gid\": 0/\"gid\": 1000/" "${BUSYBOX_BUNDLE}/config.json" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox grep CapBnd /proc/1/status + [ "$status" -eq 0 ] + [[ "${output}" == *"0000003fffffffff"* ]] + + for capType in CapInh CapPrm CapEff CapAmb; do + runc exec test_busybox grep "$capType" /proc/1/status + [ "$status" -eq 0 ] + [[ "${output}" == *"0000000000000000"* ]] + done +} + +# A sys container non-root init process caps are all cleared when entered via exec (except CapBnd) +@test "syscont: exec non-root process caps" { + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + for capType in CapInh CapPrm CapEff CapAmb; do + runc exec --user 1000:1000 test_busybox grep "$capType" /proc/self/status + [ "$status" -eq 0 ] + [[ "${output}" == *"0000000000000000"* ]] + done + + runc exec --user 1000:1000 test_busybox grep CapBnd /proc/self/status + [ "$status" -eq 0 ] + [[ "${output}" == *"0000003fffffffff"* ]] +} + +# TODO: Verify sysbox-runc exec caps are set correctly when giving exec a process.json + +# TODO: Verify that sysbox-runc exec cap override works +# - create spec without any caps and run sys container +# - exec into sys container as user 0 with --cap=CAP_SYS_ADMIN; verify root has all caps +# - exec into sys container as user 1000 with --cap=CAP_SYS_ADMIN; verify root has CAP_SYS_ADMIN only + +# TODO: Verify specs without capabilities object are handled correctly diff --git 
a/sysbox-runc/tests/integration/syscont-cgroup.bats b/sysbox-runc/tests/integration/syscont-cgroup.bats new file mode 100644 index 00000000..2d9379bc --- /dev/null +++ b/sysbox-runc/tests/integration/syscont-cgroup.bats @@ -0,0 +1,79 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +# Verify the cgroup mounts inside the sys container +@test "syscont: cgroup mounts" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # verify /sys/fs/cgroup has root:root ownership + # + # (dev note: single quotes in a single-quote delimited script is '\'' ; use + # 'echo' instead of 'sh -c' to see shell interpretation) + + runc exec test_busybox sh -c 'ls -l /sys/fs/cgroup/ | grep -v rdma | grep -v misc | awk '\''{print $3}'\'' | tr '\''\n'\'' '\'' '\'' ' + [ "$status" -eq 0 ] + + for i in ${lines[0]}; do + [ "$i" == "root" ] + done + + runc exec test_busybox sh -c 'ls -l /sys/fs/cgroup/ | grep -v rdma | grep -v misc | awk '\''{print $4}'\'' | tr '\''\n'\'' '\'' '\'' ' + [ "$status" -eq 0 ] + + for i in ${lines[0]}; do + [ "$i" == "root" ] + done + + # verify sys container cgroup root in /proc/$$/cgroup is "/" + runc exec test_busybox sh -c 'cat /proc/1/cgroup | cut -d":" -f3 | tr '\''\n'\'' '\'' '\'' ' + [ "$status" -eq 0 ] + + for i in ${lines[0]}; do + [ "$i" == "/" ] + done + + # verify sys container cgroup root in /proc/$$/mountinfo + runc exec test_busybox sh -c 'cat /proc/1/mountinfo | grep "/sys/fs/cgroup/" | cut -d" " -f4 | tr '\''\n'\'' '\'' '\'' ' + [ "$status" -eq 0 ] + + for i in ${lines[0]}; do + [ "$i" == "/" ] + done + + # verify cgroup is mounted read-write + runc exec test_busybox sh -c 'cat /proc/1/mountinfo | grep "cgroup" | cut -d" " -f6 | tr '\''\n'\'' '\'' '\'' ' + [ "$status" -eq 0 ] + + for i in ${lines[0]}; do + [[ "$i" =~ "rw," ]] + done +} + +# Verify that sys container root can create cgroups +@test "syscont: cgroup 
create" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + cgList=$(runc exec test_busybox ls /sys/fs/cgroup) + for cg in $cgList; do + runc exec test_busybox mkdir /sys/fs/cgroup/$cg/subCgroup + [ "$status" -eq 0 ] + + runc exec test_busybox ls /sys/fs/cgroup/$cg/subCgroup/cgroup.procs + [ "$status" -eq 0 ] + + runc exec test_busybox rmdir /sys/fs/cgroup/$cg/subCgroup + [ "$status" -eq 0 ] + done +} diff --git a/sysbox-runc/tests/integration/syscont-ns.bats b/sysbox-runc/tests/integration/syscont-ns.bats new file mode 100644 index 00000000..4328714b --- /dev/null +++ b/sysbox-runc/tests/integration/syscont-ns.bats @@ -0,0 +1,41 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "syscont: uses all namespaces" { + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # For each ns, check that the sys container's init process is in a + # different namespace than the test script. 
+ + for nsType in cgroup ipc mnt net pid user uts; do + syscont_ns=$(runc exec test_busybox ls -l /proc/1/ns | grep -i "$nsType" | cut -d":" -f3) + [ "$status" -eq 0 ] + test_ns=$(ls -l /proc/self/ns | grep -i "$nsType" | cut -d":" -f3) + [ "$status" -eq 0 ] + [ "$syscont_ns" != "$test_ns" ] + done +} + +@test "syscont: unshare" { + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # check that unshare(2) works inside a system container + runc exec test_busybox sh -c "unshare -i -m -n -p -u -f --mount-proc echo 1 > /dev/null" + [ "$status" -eq 0 ] + + # TODO: test that nsenter(2) also works +} diff --git a/sysbox-runc/tests/integration/syscont-oom.bats b/sysbox-runc/tests/integration/syscont-oom.bats new file mode 100644 index 00000000..45bc4d2d --- /dev/null +++ b/sysbox-runc/tests/integration/syscont-oom.bats @@ -0,0 +1,128 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "syscont: default oom_score_adj" { + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # verify default setting + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "0" ]] + + # verify min setting + runc exec test_busybox sh -c "echo -999 > /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "-999" ]] + + # verify max setting + runc exec test_busybox sh -c "echo 999 > /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "999" ]] + + # verify -1000 (unkillable) not allowed + runc exec test_busybox sh -c "echo -1000 > /proc/1/oom_score_adj" + [ "$status" -eq 1 ] + +} + +@test "syscont: custom oom_score_adj" { + + CONFIG=$(jq '.process.oomScoreAdj = 100' config.json) + echo "${CONFIG}" 
>config.json + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # verify default setting + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "100" ]] + + # verify min setting + runc exec test_busybox sh -c "echo -999 > /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "-999" ]] + + # verify max setting + runc exec test_busybox sh -c "echo 999 > /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "999" ]] + + # verify -1000 (unkillable) not allowed + runc exec test_busybox sh -c "echo -1000 > /proc/1/oom_score_adj" + [ "$status" -eq 1 ] + +} + +@test "syscont: oom_score_adj inherit" { + + # adjust test's OOM score + echo 200 >/proc/self/oom_score_adj + + # sys container should inherit test's OOM score + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "200" ]] + + # verify min setting + runc exec test_busybox sh -c "echo -999 > /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "-999" ]] + + # verify max setting + runc exec test_busybox sh -c "echo 999 > /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "cat /proc/1/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "999" ]] + + # verify -1000 (unkillable) not allowed + runc exec test_busybox sh -c "echo -1000 > /proc/1/oom_score_adj" + [ "$status" -eq 1 ] + +} + +@test "syscont: exec oom_score_adj" { + + CONFIG=$(jq '.process.oomScoreAdj = 300' config.json) + echo "${CONFIG}" >config.json + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + # exec 
process inherits container's configure oom score adjustment + runc exec test_busybox sh -c "cat /proc/self/oom_score_adj" + [ "$status" -eq 0 ] + [[ "$output" == "300" ]] +} diff --git a/sysbox-runc/tests/integration/syscont-syscalls.bats b/sysbox-runc/tests/integration/syscont-syscalls.bats new file mode 100644 index 00000000..aa328687 --- /dev/null +++ b/sysbox-runc/tests/integration/syscont-syscalls.bats @@ -0,0 +1,33 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "syscont: syscall: mount and umount" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "mkdir /root/test" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "mount --bind /root/test /root/test" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c 'mount | grep "test"' + [ "$status" -eq 0 ] + [[ "${output}" =~ "/root/test" ]] + + runc exec test_busybox sh -c "umount /root/test" + [ "$status" -eq 0 ] + + runc exec test_busybox sh -c "rmdir /root/test" + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/syscont-uid.bats b/sysbox-runc/tests/integration/syscont-uid.bats new file mode 100644 index 00000000..414524a3 --- /dev/null +++ b/sysbox-runc/tests/integration/syscont-uid.bats @@ -0,0 +1,28 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "syscont uid/gid mappings" { + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + + runc exec test_busybox cat /proc/1/uid_map + [ "$status" -eq 0 ] + + uid_int=$(echo "${lines[0]}" | awk '{print $1}') + uid_ext=$(echo "${lines[0]}" | awk '{print $2}') + uid_size=$(echo "${lines[0]}" | awk '{print $3}') + + [[ "$uid_int" == "0" ]] + [[ "$uid_ext" == "$UID_MAP" ]] + [[ "$uid_size" == "$ID_MAP_SIZE" ]] +} diff --git 
a/sysbox-runc/tests/integration/testdata/hello-world-aarch64.tar b/sysbox-runc/tests/integration/testdata/hello-world-aarch64.tar new file mode 100644 index 00000000..186c8aef Binary files /dev/null and b/sysbox-runc/tests/integration/testdata/hello-world-aarch64.tar differ diff --git a/sysbox-runc/tests/integration/testdata/hello-world.tar b/sysbox-runc/tests/integration/testdata/hello-world.tar new file mode 100644 index 00000000..aec830e2 Binary files /dev/null and b/sysbox-runc/tests/integration/testdata/hello-world.tar differ diff --git a/sysbox-runc/tests/integration/tty.bats b/sysbox-runc/tests/integration/tty.bats new file mode 100644 index 00000000..baea69a5 --- /dev/null +++ b/sysbox-runc/tests/integration/tty.bats @@ -0,0 +1,268 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + teardown_busybox + setup_busybox +} + +function teardown() { + teardown_busybox +} + +@test "runc run [stdin not a tty]" { + # stty size fails without a tty + update_config '(.. | select(.[]? == "sh")) += ["-c", "stty size"]' + # note that stdout/stderr are already redirected by bats' run + runc run test_busybox /tmp/tty-info" + ], + "cwd": "/" +} +EOF + ) + + # run the exec + runc exec -t --pid-file pid.txt -d --console-socket "$CONSOLE_SOCKET" -p <(echo "$tty_info_with_consize_size") test_busybox + [ "$status" -eq 0 ] + + # check the pid was generated + [ -e pid.txt ] + + # wait for the process to finish + timeout 5 tail --pid="$(head -n 1 pid.txt)" -f /dev/null + + tty_info=$( + cat <= 244 (Fedora >= 32, Ubuntu >= 20.04). 
+ for f in memory pids cpuset; do + if grep -qwv $f "$file"; then + skip "$f is not enabled in $file" + fi + done + fi + init_cgroup_paths + + # run a few busyboxes detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # Set a few variables to make the code below work for both v1 and v2 + case $CGROUP_UNIFIED in + no) + MEM_LIMIT="memory.limit_in_bytes" + SD_MEM_LIMIT="MemoryLimit" + MEM_RESERVE="memory.soft_limit_in_bytes" + SD_MEM_RESERVE="unsupported" + MEM_SWAP="memory.memsw.limit_in_bytes" + SD_MEM_SWAP="unsupported" + SYSTEM_MEM=$(cat "${CGROUP_MEMORY_BASE_PATH}/${MEM_LIMIT}") + HAVE_SWAP="no" + if [ -f "${CGROUP_MEMORY_BASE_PATH}/${MEM_SWAP}" ]; then + HAVE_SWAP="yes" + fi + ;; + yes) + MEM_LIMIT="memory.max" + SD_MEM_LIMIT="MemoryMax" + MEM_RESERVE="memory.low" + SD_MEM_RESERVE="MemoryLow" + MEM_SWAP="memory.swap.max" + SD_MEM_SWAP="MemorySwapMax" + SYSTEM_MEM="max" + HAVE_SWAP="yes" + ;; + esac + SD_UNLIMITED="infinity" + SD_VERSION=$(systemctl --version | awk '{print $2; exit}') + if [ "$SD_VERSION" -lt 227 ]; then + SD_UNLIMITED="18446744073709551615" + fi + + # check that initial values were properly set + check_cgroup_value "cpuset.cpus" 0 + if [[ "$CGROUP_UNIFIED" = "yes" ]] && ! grep -qw memory "$CGROUP_PATH/cgroup.controllers"; then + # This happen on containerized environment because "echo +memory > /sys/fs/cgroup/cgroup.subtree_control" fails with EINVAL + skip "memory controller not available" + fi + check_cgroup_value $MEM_LIMIT 38551552 + check_systemd_value $SD_MEM_LIMIT 38551552 + + check_cgroup_value $MEM_RESERVE 25165824 + check_systemd_value $SD_MEM_RESERVE 25165824 + + check_cgroup_value "pids.max" 20 + check_systemd_value "TasksMax" 20 + + # update cpuset if supported (i.e. we're running on a multicore cpu) + # + # NOTE: with sysbox, we always create a parent and a child cgroup; the + # first is owned by the host, the later is delegated to the sys + # container. 
Because of this, cpuset updates on the parent fail with + # "Devicer or resource busy" because they can only be done when there are + # no child cgroups. Thus, we check for failure here. + + cpu_count=$(grep -c '^processor' /proc/cpuinfo) + if [ "$cpu_count" -gt 1 ]; then + runc update test_update --cpuset-cpus "1" + [ "$status" -eq 1 ] + fi + + # update memory limit + runc update test_update --memory 67108864 + [ "$status" -eq 0 ] + check_cgroup_value $MEM_LIMIT 67108864 + check_systemd_value $SD_MEM_LIMIT 67108864 + + runc update test_update --memory 50M + [ "$status" -eq 0 ] + check_cgroup_value $MEM_LIMIT 52428800 + check_systemd_value $SD_MEM_LIMIT 52428800 + + # update memory soft limit + runc update test_update --memory-reservation 38551552 + [ "$status" -eq 0 ] + check_cgroup_value "$MEM_RESERVE" 38551552 + check_systemd_value "$SD_MEM_RESERVE" 38551552 + + # Run swap memory tests if swap is available + if [ "$HAVE_SWAP" = "yes" ]; then + # try to remove memory swap limit + runc update test_update --memory-swap -1 + [ "$status" -eq 0 ] + check_cgroup_value "$MEM_SWAP" $SYSTEM_MEM + check_systemd_value "$SD_MEM_SWAP" $SD_UNLIMITED + + # update memory swap + if [ "$CGROUP_UNIFIED" = "yes" ]; then + # for cgroupv2, memory and swap can only be set together + runc update test_update --memory 52428800 --memory-swap 96468992 + [ "$status" -eq 0 ] + # for cgroupv2, swap is a separate limit (it does not include mem) + check_cgroup_value "$MEM_SWAP" $((96468992 - 52428800)) + check_systemd_value "$SD_MEM_SWAP" $((96468992 - 52428800)) + else + runc update test_update --memory-swap 96468992 + [ "$status" -eq 0 ] + check_cgroup_value "$MEM_SWAP" 96468992 + check_systemd_value "$SD_MEM_SWAP" 96468992 + fi + fi + + # try to remove memory limit + runc update test_update --memory -1 + [ "$status" -eq 0 ] + + # check memory limit is gone + check_cgroup_value $MEM_LIMIT $SYSTEM_MEM + check_systemd_value $SD_MEM_LIMIT $SD_UNLIMITED + + # check swap memory limited is gone + if 
[ "$HAVE_SWAP" = "yes" ]; then + check_cgroup_value $MEM_SWAP $SYSTEM_MEM + check_systemd_value "$SD_MEM_SWAP" $SD_UNLIMITED + fi + + # update pids limit + runc update test_update --pids-limit 10 + [ "$status" -eq 0 ] + check_cgroup_value "pids.max" 10 + check_systemd_value "TasksMax" 10 + + # unlimited + runc update test_update --pids-limit -1 + [ "$status" -eq 0 ] + check_cgroup_value "pids.max" max + check_systemd_value "TasksMax" $SD_UNLIMITED + + # Revert to the test initial value via json on stdin + runc update -r - test_update <"$BATS_TMPDIR"/runc-cgroups-integration-test.json +{ + "memory": { + "limit": 38551552, + "reservation": 25165824 + }, + "cpu": { + "shares": 100, + "quota": 500000, + "period": 1000000, + "cpus": "0" + }, + "pids": { + "limit": 20 + } +} +EOF + + runc update -r "$BATS_TMPDIR"/runc-cgroups-integration-test.json test_update + [ "$status" -eq 0 ] + check_cgroup_value "cpuset.cpus" 0 + + check_cgroup_value $MEM_LIMIT 38551552 + check_systemd_value $SD_MEM_LIMIT 38551552 + + check_cgroup_value $MEM_RESERVE 25165824 + check_systemd_value $SD_MEM_RESERVE 25165824 + + check_cgroup_value "pids.max" 20 + check_systemd_value "TasksMax" 20 +} + +@test "update cgroup cpu limits" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + # run a few busyboxes detached + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_cpu_quota 500000 1000000 "500ms" + check_cpu_shares 100 + + # update cpu period + runc update test_update --cpu-period 900000 + [ "$status" -eq 0 ] + check_cpu_quota 500000 900000 "560ms" + + # update cpu quota + runc update test_update --cpu-quota 600000 + [ "$status" -eq 0 ] + check_cpu_quota 600000 900000 "670ms" + + # remove cpu quota + runc update test_update --cpu-quota -1 + [ "$status" -eq 0 ] + check_cpu_quota -1 900000 "infinity" + + # update cpu-shares + runc update test_update --cpu-share 200 + [ "$status" -eq 0 ] + check_cpu_shares 
200 + + # Revert to the test initial value via json on stding + runc update -r - test_update <"$BATS_TMPDIR"/runc-cgroups-integration-test.json +{ + "cpu": { + "shares": 100, + "quota": 500000, + "period": 1000000 + } +} +EOF + [ "$status" -eq 0 ] + + runc update -r "$BATS_TMPDIR"/runc-cgroups-integration-test.json test_update + [ "$status" -eq 0 ] + check_cpu_quota 500000 1000000 "500ms" + check_cpu_shares 100 +} + +@test "set cpu period with no quota" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= { "period": 1000000 }' "${BUSYBOX_BUNDLE}" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + check_cpu_quota -1 1000000 "infinity" +} + +@test "set cpu quota with no period" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= { "quota": 5000 }' "${BUSYBOX_BUNDLE}" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + check_cpu_quota 5000 100000 "50ms" +} + +@test "update cpu period with no previous period/quota set" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= {}' "${BUSYBOX_BUNDLE}" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # update the period alone, no old values were set + runc update --cpu-period 50000 test_update + [ "$status" -eq 0 ] + check_cpu_quota -1 50000 "infinity" +} + +@test "update cpu quota with no previous period/quota set" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + + update_config '.linux.resources.cpu |= {}' "${BUSYBOX_BUNDLE}" + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # update the quota alone, no old values were set + runc update --cpu-quota 30000 test_update + [ "$status" -eq 0 ] + check_cpu_quota 30000 100000 "300ms" +} + +@test "update cgroup v2 resources via unified map" { + [[ "$ROOTLESS" -ne 0 ]] && requires 
rootless_cgroup + requires cgroups_v2 + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_cpu_quota 500000 1000000 "500ms" + # initial cpu shares of 100 corresponds to weight of 4 + check_cpu_weight 4 + check_systemd_value "TasksMax" 20 + + runc update -r - test_update <= v244 + if [ "$(systemd_version)" -lt 244 ]; then + # a hack to skip checks, see check_systemd_value() + AllowedCPUs='unsupported' + AllowedMemoryNodes='unsupported' + fi + + update_config ' .linux.resources.CPU |= { + "Cpus": "0", + "Mems": "0" + }' "${BUSYBOX_BUNDLE}" + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_systemd_value "$AllowedCPUs" 0 + check_systemd_value "$AllowedMemoryNodes" 0 + + # Note: the cpuset update must be a superset of the current cpuset, because + # of sysbox-runc's cgroup delegation. + runc update -r - test_update <= v244 + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_v2 smp + + update_config ' .linux.resources.unified |= { + "cpuset.cpus": "0", + "cpuset.mems": "0" + }' "${BUSYBOX_BUNDLE}" + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # check that initial values were properly set + check_systemd_value "AllowedCPUs" 0 + check_systemd_value "AllowedMemoryNodes" 0 + + runc update -r - test_update <"$target_period" + target_runtime="${target}/cpu.rt_runtime_us" + echo "Writing ${root_runtime} to ${target_runtime}" + echo "$root_runtime" >"$target_runtime" + done + + # run a detached busybox + runc run -d --console-socket "$CONSOLE_SOCKET" test_update_rt + [ "$status" -eq 0 ] + + runc update -r - test_update_rt </dev/null; done"]' + + # Set up a temporary console socket and recvtty so we can get the stdio. 
+ TMP_RECVTTY_DIR="$(mktemp -d "$BATS_TMPDIR/runc-tmp-recvtty.XXXXXX")" + TMP_RECVTTY_PID="$TMP_RECVTTY_DIR/recvtty.pid" + TMP_CONSOLE_SOCKET="$TMP_RECVTTY_DIR/console.sock" + CONTAINER_OUTPUT="$TMP_RECVTTY_DIR/output" + ("$RECVTTY" --no-stdin --pid-file "$TMP_RECVTTY_PID" \ + --mode single "$TMP_CONSOLE_SOCKET" &>"$CONTAINER_OUTPUT") & + retry 10 0.1 [ -e "$TMP_CONSOLE_SOCKET" ] + + # Run the container in the background. + runc run -d --console-socket "$TMP_CONSOLE_SOCKET" test_update + cat "$CONTAINER_OUTPUT" + [ "$status" -eq 0 ] + + # Trigger an update. This update doesn't actually change the device rules, + # but it will trigger the devices cgroup code to reapply the current rules. + # We trigger the update a few times to make sure we hit the race. + for _ in {1..12}; do + # TODO: Update "runc update" so we can change the device rules. + runc update --pids-limit 30 test_update + [ "$status" -eq 0 ] + done + + # Kill recvtty. + kill -9 "$(<"$TMP_RECVTTY_PID")" + + # There should've been no output from the container. + cat "$CONTAINER_OUTPUT" + [ -z "$(<"$CONTAINER_OUTPUT")" ] +} + +@test "update paused container" { + [[ "$ROOTLESS" -ne 0 ]] && requires rootless_cgroup + requires cgroups_freezer + + # Run the container in the background. + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + # Pause the container. + runc pause test_update + [ "$status" -eq 0 ] + + # Trigger an unrelated update. + runc update --pids-limit 30 test_update + [ "$status" -eq 0 ] + + # The container should still be paused. + testcontainer test_update paused + + # Resume the container. 
+ runc resume test_update + [ "$status" -eq 0 ] +} diff --git a/sysbox-runc/tests/integration/version.bats b/sysbox-runc/tests/integration/version.bats new file mode 100644 index 00000000..bd6eecee --- /dev/null +++ b/sysbox-runc/tests/integration/version.bats @@ -0,0 +1,13 @@ +#!/usr/bin/env bats + +load helpers + +@test "runc version" { + skip "fails when git change is pending" + + runc -v + [ "$status" -eq 0 ] + [[ ${lines[0]} =~ runc\ version\ [0-9]+\.[0-9]+\.[0-9]+ ]] + [[ ${lines[1]} =~ commit:+ ]] + [[ ${lines[2]} =~ spec:\ [0-9]+\.[0-9]+\.[0-9]+ ]] +} diff --git a/sysbox-runc/tests/rootless.sh b/sysbox-runc/tests/rootless.sh new file mode 100755 index 00000000..929fec2c --- /dev/null +++ b/sysbox-runc/tests/rootless.sh @@ -0,0 +1,174 @@ +#!/bin/bash +# Copyright (C) 2017 SUSE LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# rootless.sh -- Runner for rootless container tests. The purpose of this +# script is to allow for the addition (and testing) of "opportunistic" features +# to rootless containers while still testing the base features. In order to add +# a new feature, please match the existing style. Add an entry to $ALL_FEATURES, +# and add an enable_* and disable_* hook. 
+ +ALL_FEATURES=("idmap" "cgroup") +# cgroup is managed by systemd when RUNC_USE_SYSTEMD is set +if [[ -n "${RUNC_USE_SYSTEMD}" ]]; then + ALL_FEATURES=("idmap") +fi +ROOT="$(readlink -f "$(dirname "${BASH_SOURCE}")/..")" + +# FEATURE: Opportunistic new{uid,gid}map support, allowing a rootless container +# to be set up with the usage of helper setuid binaries. + +function enable_idmap() { + export ROOTLESS_UIDMAP_START=100000 ROOTLESS_UIDMAP_LENGTH=65536 + export ROOTLESS_GIDMAP_START=200000 ROOTLESS_GIDMAP_LENGTH=65536 + + # Set up sub{uid,gid} mappings. + [ -e /etc/subuid.tmp ] && mv /etc/subuid{.tmp,} + ( + grep -v '^rootless' /etc/subuid + echo "rootless:$ROOTLESS_UIDMAP_START:$ROOTLESS_UIDMAP_LENGTH" + ) >/etc/subuid.tmp + mv /etc/subuid{.tmp,} + [ -e /etc/subgid.tmp ] && mv /etc/subgid{.tmp,} + ( + grep -v '^rootless' /etc/subgid + echo "rootless:$ROOTLESS_GIDMAP_START:$ROOTLESS_GIDMAP_LENGTH" + ) >/etc/subgid.tmp + mv /etc/subgid{.tmp,} + + # Reactivate new{uid,gid}map helpers if applicable. + [ -e /usr/bin/unused-newuidmap ] && mv /usr/bin/{unused-,}newuidmap + [ -e /usr/bin/unused-newgidmap ] && mv /usr/bin/{unused-,}newgidmap + + # Create a directory owned by $AUX_UID inside container, to be used + # by a test case in cwd.bats. This setup can't be done by the test itself, + # as it needs root for chown. + set -e + export AUX_UID=1024 + AUX_DIR="$(mktemp -d)" + # 1000 is linux.uidMappings.containerID value, + # as set by runc_rootless_idmap + chown "$((ROOTLESS_UIDMAP_START - 1000 + AUX_UID))" "$AUX_DIR" + export AUX_DIR + set +e +} + +function disable_idmap() { + export ROOTLESS_UIDMAP_START ROOTLESS_UIDMAP_LENGTH + export ROOTLESS_GIDMAP_START ROOTLESS_GIDMAP_LENGTH + + # Deactivate sub{uid,gid} mappings. + [ -e /etc/subuid ] && mv /etc/subuid{,.tmp} + [ -e /etc/subgid ] && mv /etc/subgid{,.tmp} + + # Deactivate new{uid,gid}map helpers. setuid is preserved with mv(1). 
+ [ -e /usr/bin/newuidmap ] && mv /usr/bin/{,unused-}newuidmap + [ -e /usr/bin/newgidmap ] && mv /usr/bin/{,unused-}newgidmap +} + +function cleanup() { + if [ -n "$AUX_DIR" ]; then + rmdir "$AUX_DIR" + unset AUX_DIX + fi +} + +# FEATURE: Opportunistic cgroups support, allowing a rootless container to set +# resource limits on condition that cgroupsPath is set to a path the +# rootless user has permissions on. + +# List of cgroups. We handle name= cgroups as well as combined +# (comma-separated) cgroups and correctly split and/or strip them. +ALL_CGROUPS=($(cat /proc/self/cgroup | cut -d: -f2 | sed -E '{s/^name=//;s/,/\n/;/^$/D}')) +CGROUP_MOUNT="/sys/fs/cgroup" +CGROUP_PATH="/runc-cgroups-integration-test" + +function enable_cgroup() { + # Set up cgroups for use in rootless containers. + for cg in "${ALL_CGROUPS[@]}"; do + mkdir -p "$CGROUP_MOUNT/$cg$CGROUP_PATH" + # We only need to allow write access to {cgroup.procs,tasks} and the + # directory. Rather than changing the owner entirely, we just change + # the group and then allow write access to the group (in order to + # further limit the possible DAC permissions that runc could use). + chown root:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks} + chmod g+rwx "$CGROUP_MOUNT/$cg$CGROUP_PATH/"{,cgroup.procs,tasks} + # Due to cpuset's semantics we need to give extra permissions to allow + # for runc to set up the hierarchy. XXX: This really shouldn't be + # necessary, and might actually be a bug in our impl of cgroup + # handling. + [[ "$cg" == "cpuset" ]] && chown rootless:rootless "$CGROUP_MOUNT/$cg$CGROUP_PATH/cpuset."{cpus,mems} + done + # cgroup v2 + if [[ -e "$CGROUP_MOUNT/cgroup.controllers" ]]; then + # Enable controllers. Some controller (e.g. memory) may fail on containerized environment. + set -x + for f in $(cat "$CGROUP_MOUNT/cgroup.controllers"); do echo +$f >"$CGROUP_MOUNT/cgroup.subtree_control"; done + set +x + # Create the cgroup. 
+ mkdir -p "$CGROUP_MOUNT/$CGROUP_PATH" + # chown/chmod dir + cgroup.subtree_control + cgroup.procs + parent's cgroup.procs. + # See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html#delegation-containment + chown root:rootless "$CGROUP_MOUNT/$CGROUP_PATH" "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.subtree_control" "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.procs" "$CGROUP_MOUNT/cgroup.procs" + chmod g+rwx "$CGROUP_MOUNT/$CGROUP_PATH" + chmod g+rw "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.subtree_control" "$CGROUP_MOUNT/$CGROUP_PATH/cgroup.procs" "$CGROUP_MOUNT/cgroup.procs" + fi +} + +function disable_cgroup() { + # Remove cgroups used in rootless containers. + for cg in "${ALL_CGROUPS[@]}"; do + [ -d "$CGROUP_MOUNT/$cg$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$cg$CGROUP_PATH" + done + # cgroup v2 + [ -d "$CGROUP_MOUNT/$CGROUP_PATH" ] && rmdir "$CGROUP_MOUNT/$CGROUP_PATH" +} + +# Create a powerset of $ALL_FEATURES (the set of all subsets of $ALL_FEATURES). +# We test all of the possible combinations (as long as we don't add too many +# feature knobs this shouldn't take too long -- but the number of tested +# combinations is O(2^n)). +function powerset() { + eval printf '%s' $(printf '{,%s+}' "$@"): +} +features_powerset="$(powerset "${ALL_FEATURES[@]}")" + +# Iterate over the powerset of all features. +IFS=: +for enabled_features in $features_powerset; do + idx="$(($idx + 1))" + echo "[$(printf '%.2d' "$idx")] run rootless tests ... (${enabled_features%%+})" + + unset IFS + for feature in "${ALL_FEATURES[@]}"; do + hook_func="disable_$feature" + grep -E "(^|\+)$feature(\+|$)" <<<$enabled_features &>/dev/null && hook_func="enable_$feature" + "$hook_func" + done + + # Run the test suite! + set -e + echo path: $PATH + export ROOTLESS_FEATURES="$enabled_features" + if [[ -n "${RUNC_USE_SYSTEMD}" ]]; then + # We use `ssh rootless@localhost` instead of `sudo -u rootless` for creating systemd user session. 
+ # Alternatively we could use `machinectl shell`, but it is known not to work well on SELinux-enabled hosts as of April 2020: + # https://bugzilla.redhat.com/show_bug.cgi?id=1788616 + ssh -t -t -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i $HOME/rootless.key rootless@localhost -- PATH="$PATH" RUNC_USE_SYSTEMD="$RUNC_USE_SYSTEMD" bats -t "$ROOT/tests/integration$ROOTLESS_TESTPATH" + else + sudo -HE -u rootless PATH="$PATH" $(which bats) -t "$ROOT/tests/integration$ROOTLESS_TESTPATH" + fi + set +e + cleanup +done diff --git a/sysbox-runc/tty.go b/sysbox-runc/tty.go new file mode 100644 index 00000000..844d7f84 --- /dev/null +++ b/sysbox-runc/tty.go @@ -0,0 +1,200 @@ +// +build linux + +package main + +import ( + "fmt" + "io" + "os" + "os/signal" + "sync" + + "github.com/containerd/console" + "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/pkg/errors" +) + +type tty struct { + epoller *console.Epoller + console *console.EpollConsole + hostConsole console.Console + closers []io.Closer + postStart []io.Closer + wg sync.WaitGroup + consoleC chan error +} + +func (t *tty) copyIO(w io.Writer, r io.ReadCloser) { + defer t.wg.Done() + io.Copy(w, r) + r.Close() +} + +// setup pipes for the process so that advanced features like c/r are able to easily checkpoint +// and restore the process's IO without depending on a host specific path or device +func setupProcessPipes(p *libcontainer.Process, rootuid, rootgid int) (*tty, error) { + i, err := p.InitializeIO(rootuid, rootgid) + if err != nil { + return nil, err + } + t := &tty{ + closers: []io.Closer{ + i.Stdin, + i.Stdout, + i.Stderr, + }, + } + // add the process's io to the post start closers if they support close + for _, cc := range []interface{}{ + p.Stdin, + p.Stdout, + p.Stderr, + } { + if c, ok := cc.(io.Closer); ok { + t.postStart = append(t.postStart, c) + } + } + go func() { + io.Copy(i.Stdin, os.Stdin) + i.Stdin.Close() + }() + 
t.wg.Add(2) + go t.copyIO(os.Stdout, i.Stdout) + go t.copyIO(os.Stderr, i.Stderr) + return t, nil +} + +func inheritStdio(process *libcontainer.Process) error { + process.Stdin = os.Stdin + process.Stdout = os.Stdout + process.Stderr = os.Stderr + return nil +} + +func (t *tty) initHostConsole() error { + // Usually all three (stdin, stdout, and stderr) streams are open to + // the terminal, but they might be redirected, so try them all. + for _, s := range []*os.File{os.Stderr, os.Stdout, os.Stdin} { + c, err := console.ConsoleFromFile(s) + switch err { + case nil: + t.hostConsole = c + return nil + case console.ErrNotAConsole: + continue + default: + // should not happen + return errors.Wrap(err, "unable to get console") + } + } + // If all streams are redirected, but we still have a controlling + // terminal, it can be obtained by opening /dev/tty. + tty, err := os.Open("/dev/tty") + if err != nil { + return err + } + c, err := console.ConsoleFromFile(tty) + if err != nil { + return errors.Wrap(err, "unable to get console") + } + + t.hostConsole = c + return nil +} + +func (t *tty) recvtty(process *libcontainer.Process, socket *os.File) (Err error) { + f, err := utils.RecvFd(socket) + if err != nil { + return err + } + cons, err := console.ConsoleFromFile(f) + if err != nil { + return err + } + err = console.ClearONLCR(cons.Fd()) + if err != nil { + return err + } + epoller, err := console.NewEpoller() + if err != nil { + return err + } + epollConsole, err := epoller.Add(cons) + if err != nil { + return err + } + defer func() { + if Err != nil { + epollConsole.Close() + } + }() + go epoller.Wait() + go io.Copy(epollConsole, os.Stdin) + t.wg.Add(1) + go t.copyIO(os.Stdout, epollConsole) + + // Set raw mode for the controlling terminal. 
+ if err := t.hostConsole.SetRaw(); err != nil { + return fmt.Errorf("failed to set the terminal from the stdin: %v", err) + } + go handleInterrupt(t.hostConsole) + + t.epoller = epoller + t.console = epollConsole + t.closers = []io.Closer{epollConsole} + return nil +} + +func handleInterrupt(c console.Console) { + sigchan := make(chan os.Signal, 1) + signal.Notify(sigchan, os.Interrupt) + <-sigchan + c.Reset() + os.Exit(0) +} + +func (t *tty) waitConsole() error { + if t.consoleC != nil { + return <-t.consoleC + } + return nil +} + +// ClosePostStart closes any fds that are provided to the container and dup2'd +// so that we no longer have copy in our process. +func (t *tty) ClosePostStart() error { + for _, c := range t.postStart { + c.Close() + } + return nil +} + +// Close closes all open fds for the tty and/or restores the original +// stdin state to what it was prior to the container execution +func (t *tty) Close() error { + // ensure that our side of the fds are always closed + for _, c := range t.postStart { + c.Close() + } + // the process is gone at this point, shutting down the console if we have + // one and wait for all IO to be finished + if t.console != nil && t.epoller != nil { + t.console.Shutdown(t.epoller.CloseConsole) + } + t.wg.Wait() + for _, c := range t.closers { + c.Close() + } + if t.hostConsole != nil { + t.hostConsole.Reset() + } + return nil +} + +func (t *tty) resize() error { + if t.console == nil || t.hostConsole == nil { + return nil + } + return t.console.ResizeFrom(t.hostConsole) +} diff --git a/sysbox-runc/types/events.go b/sysbox-runc/types/events.go new file mode 100644 index 00000000..81bde829 --- /dev/null +++ b/sysbox-runc/types/events.go @@ -0,0 +1,155 @@ +package types + +import "github.com/opencontainers/runc/libcontainer/intelrdt" + +// Event struct for encoding the event data to json. 
+type Event struct { + Type string `json:"type"` + ID string `json:"id"` + Data interface{} `json:"data,omitempty"` +} + +// stats is the runc specific stats structure for stability when encoding and decoding stats. +type Stats struct { + CPU Cpu `json:"cpu"` + CPUSet CPUSet `json:"cpuset"` + Memory Memory `json:"memory"` + Pids Pids `json:"pids"` + Blkio Blkio `json:"blkio"` + Hugetlb map[string]Hugetlb `json:"hugetlb"` + IntelRdt IntelRdt `json:"intel_rdt"` + NetworkInterfaces []*NetworkInterface `json:"network_interfaces"` +} + +type Hugetlb struct { + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +type BlkioEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type Blkio struct { + IoServiceBytesRecursive []BlkioEntry `json:"ioServiceBytesRecursive,omitempty"` + IoServicedRecursive []BlkioEntry `json:"ioServicedRecursive,omitempty"` + IoQueuedRecursive []BlkioEntry `json:"ioQueueRecursive,omitempty"` + IoServiceTimeRecursive []BlkioEntry `json:"ioServiceTimeRecursive,omitempty"` + IoWaitTimeRecursive []BlkioEntry `json:"ioWaitTimeRecursive,omitempty"` + IoMergedRecursive []BlkioEntry `json:"ioMergedRecursive,omitempty"` + IoTimeRecursive []BlkioEntry `json:"ioTimeRecursive,omitempty"` + SectorsRecursive []BlkioEntry `json:"sectorsRecursive,omitempty"` +} + +type Pids struct { + Current uint64 `json:"current,omitempty"` + Limit uint64 `json:"limit,omitempty"` +} + +type Throttling struct { + Periods uint64 `json:"periods,omitempty"` + ThrottledPeriods uint64 `json:"throttledPeriods,omitempty"` + ThrottledTime uint64 `json:"throttledTime,omitempty"` +} + +type CpuUsage struct { + // Units: nanoseconds. 
+ Total uint64 `json:"total,omitempty"` + Percpu []uint64 `json:"percpu,omitempty"` + PercpuKernel []uint64 `json:"percpu_kernel,omitempty"` + PercpuUser []uint64 `json:"percpu_user,omitempty"` + Kernel uint64 `json:"kernel"` + User uint64 `json:"user"` +} + +type Cpu struct { + Usage CpuUsage `json:"usage,omitempty"` + Throttling Throttling `json:"throttling,omitempty"` +} + +type CPUSet struct { + CPUs []uint16 `json:"cpus,omitempty"` + CPUExclusive uint64 `json:"cpu_exclusive"` + Mems []uint16 `json:"mems,omitempty"` + MemHardwall uint64 `json:"mem_hardwall"` + MemExclusive uint64 `json:"mem_exclusive"` + MemoryMigrate uint64 `json:"memory_migrate"` + MemorySpreadPage uint64 `json:"memory_spread_page"` + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + MemoryPressure uint64 `json:"memory_pressure"` + SchedLoadBalance uint64 `json:"sched_load_balance"` + SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + +type MemoryEntry struct { + Limit uint64 `json:"limit"` + Usage uint64 `json:"usage,omitempty"` + Max uint64 `json:"max,omitempty"` + Failcnt uint64 `json:"failcnt"` +} + +type Memory struct { + Cache uint64 `json:"cache,omitempty"` + Usage MemoryEntry `json:"usage,omitempty"` + Swap MemoryEntry `json:"swap,omitempty"` + Kernel MemoryEntry `json:"kernel,omitempty"` + KernelTCP MemoryEntry `json:"kernelTCP,omitempty"` + Raw map[string]uint64 `json:"raw,omitempty"` +} + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type MemBwInfo struct { + BandwidthGran uint64 `json:"bandwidth_gran,omitempty"` + DelayLinear uint64 `json:"delay_linear,omitempty"` + MinBandwidth uint64 `json:"min_bandwidth,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type IntelRdt struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache 
schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The read-only memory bandwidth information + MemBwInfo *MemBwInfo `json:"mem_bw_info,omitempty"` + + // The read-only memory bandwidth schema in root + MemBwSchemaRoot string `json:"mem_bw_schema_root,omitempty"` + + // The memory bandwidth schema in 'container_id' group + MemBwSchema string `json:"mem_bw_schema,omitempty"` + + // The memory bandwidth monitoring statistics from NUMA nodes in 'container_id' group + MBMStats *[]intelrdt.MBMNumaNodeStats `json:"mbm_stats,omitempty"` + + // The cache monitoring technology statistics from NUMA nodes in 'container_id' group + CMTStats *[]intelrdt.CMTNumaNodeStats `json:"cmt_stats,omitempty"` +} + +type NetworkInterface struct { + // Name is the name of the network interface. + Name string + + RxBytes uint64 + RxPackets uint64 + RxErrors uint64 + RxDropped uint64 + TxBytes uint64 + TxPackets uint64 + TxErrors uint64 + TxDropped uint64 +} diff --git a/sysbox-runc/update.go b/sysbox-runc/update.go new file mode 100644 index 00000000..127a6e71 --- /dev/null +++ b/sysbox-runc/update.go @@ -0,0 +1,334 @@ +// +build linux + +package main + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/sirupsen/logrus" + + "github.com/docker/go-units" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/urfave/cli" +) + +func i64Ptr(i int64) *int64 { return &i } +func u64Ptr(i uint64) *uint64 { return &i } +func u16Ptr(i uint16) *uint16 { return &i } + +var updateCommand = cli.Command{ + Name: "update", + Usage: "update container resource constraints", + ArgsUsage: ``, + Flags: []cli.Flag{ + cli.StringFlag{ + Name: "resources, r", + 
Value: "", + Usage: `path to the file containing the resources to update or '-' to read from the standard input + +The accepted format is as follow (unchanged values can be omitted): + +{ + "memory": { + "limit": 0, + "reservation": 0, + "swap": 0 + }, + "cpu": { + "shares": 0, + "quota": 0, + "period": 0, + "realtimeRuntime": 0, + "realtimePeriod": 0, + "cpus": "", + "mems": "" + }, + "blockIO": { + "weight": 0 + } +} + +Note: if data is to be read from a file or the standard input, all +other options are ignored. +`, + }, + + cli.IntFlag{ + Name: "blkio-weight", + Usage: "Specifies per cgroup weight, range is from 10 to 1000", + }, + cli.StringFlag{ + Name: "cpu-period", + Usage: "CPU CFS period to be used for hardcapping (in usecs). 0 to use system default", + }, + cli.StringFlag{ + Name: "cpu-quota", + Usage: "CPU CFS hardcap limit (in usecs). Allowed cpu time in a given period", + }, + cli.StringFlag{ + Name: "cpu-share", + Usage: "CPU shares (relative weight vs. other containers)", + }, + cli.StringFlag{ + Name: "cpu-rt-period", + Usage: "CPU realtime period to be used for hardcapping (in usecs). 0 to use system default", + }, + cli.StringFlag{ + Name: "cpu-rt-runtime", + Usage: "CPU realtime hardcap limit (in usecs). 
Allowed cpu time in a given period", + }, + cli.StringFlag{ + Name: "cpuset-cpus", + Usage: "CPU(s) to use", + }, + cli.StringFlag{ + Name: "cpuset-mems", + Usage: "Memory node(s) to use", + }, + cli.StringFlag{ + Name: "kernel-memory", + Usage: "(obsoleted; do not use)", + }, + cli.StringFlag{ + Name: "kernel-memory-tcp", + Usage: "(obsoleted; do not use)", + }, + cli.StringFlag{ + Name: "memory", + Usage: "Memory limit (in bytes)", + }, + cli.StringFlag{ + Name: "memory-reservation", + Usage: "Memory reservation or soft_limit (in bytes)", + }, + cli.StringFlag{ + Name: "memory-swap", + Usage: "Total memory usage (memory + swap); set '-1' to enable unlimited swap", + }, + cli.IntFlag{ + Name: "pids-limit", + Usage: "Maximum number of pids allowed in the container", + }, + cli.StringFlag{ + Name: "l3-cache-schema", + Usage: "The string of Intel RDT/CAT L3 cache schema", + }, + cli.StringFlag{ + Name: "mem-bw-schema", + Usage: "The string of Intel RDT/MBA memory bandwidth schema", + }, + }, + Action: func(context *cli.Context) error { + if err := checkArgs(context, 1, exactArgs); err != nil { + return err + } + container, err := getContainer(context) + if err != nil { + return err + } + + r := specs.LinuxResources{ + Memory: &specs.LinuxMemory{ + Limit: i64Ptr(0), + Reservation: i64Ptr(0), + Swap: i64Ptr(0), + Kernel: i64Ptr(0), + KernelTCP: i64Ptr(0), + }, + CPU: &specs.LinuxCPU{ + Shares: u64Ptr(0), + Quota: i64Ptr(0), + Period: u64Ptr(0), + RealtimeRuntime: i64Ptr(0), + RealtimePeriod: u64Ptr(0), + Cpus: "", + Mems: "", + }, + BlockIO: &specs.LinuxBlockIO{ + Weight: u16Ptr(0), + }, + Pids: &specs.LinuxPids{ + Limit: 0, + }, + } + + config := container.Config() + + if in := context.String("resources"); in != "" { + var ( + f *os.File + err error + ) + switch in { + case "-": + f = os.Stdin + default: + f, err = os.Open(in) + if err != nil { + return err + } + } + err = json.NewDecoder(f).Decode(&r) + if err != nil { + return err + } + } else { + if val := 
context.Int("blkio-weight"); val != 0 { + r.BlockIO.Weight = u16Ptr(uint16(val)) + } + if val := context.String("cpuset-cpus"); val != "" { + r.CPU.Cpus = val + } + if val := context.String("cpuset-mems"); val != "" { + r.CPU.Mems = val + } + + for _, pair := range []struct { + opt string + dest *uint64 + }{ + + {"cpu-period", r.CPU.Period}, + {"cpu-rt-period", r.CPU.RealtimePeriod}, + {"cpu-share", r.CPU.Shares}, + } { + if val := context.String(pair.opt); val != "" { + var err error + *pair.dest, err = strconv.ParseUint(val, 10, 64) + if err != nil { + return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + } + } + } + for _, pair := range []struct { + opt string + dest *int64 + }{ + + {"cpu-quota", r.CPU.Quota}, + {"cpu-rt-runtime", r.CPU.RealtimeRuntime}, + } { + if val := context.String(pair.opt); val != "" { + var err error + *pair.dest, err = strconv.ParseInt(val, 10, 64) + if err != nil { + return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + } + } + } + for _, pair := range []struct { + opt string + dest *int64 + }{ + {"memory", r.Memory.Limit}, + {"memory-swap", r.Memory.Swap}, + {"kernel-memory", r.Memory.Kernel}, + {"kernel-memory-tcp", r.Memory.KernelTCP}, + {"memory-reservation", r.Memory.Reservation}, + } { + if val := context.String(pair.opt); val != "" { + var v int64 + + if val != "-1" { + v, err = units.RAMInBytes(val) + if err != nil { + return fmt.Errorf("invalid value for %s: %s", pair.opt, err) + } + } else { + v = -1 + } + *pair.dest = v + } + } + + r.Pids.Limit = int64(context.Int("pids-limit")) + } + + if *r.Memory.Kernel != 0 || *r.Memory.KernelTCP != 0 { + logrus.Warn("Kernel memory settings are ignored and will be removed") + } + + // Update the values + config.Cgroups.Resources.BlkioWeight = *r.BlockIO.Weight + + // Seting CPU quota and period independently does not make much sense, + // but historically runc allowed it and this needs to be supported + // to not break compatibility. 
+ // + // For systemd cgroup drivers to set CPU quota/period correctly, + // it needs to know both values. For fs2 cgroup driver to be compatible + // with the fs driver, it also needs to know both values. + // + // Here in update, previously set values are available from config. + // If only one of {quota,period} is set and the other is not, leave + // the unset parameter at the old value (don't overwrite config). + p, q := *r.CPU.Period, *r.CPU.Quota + if (p == 0 && q == 0) || (p != 0 && q != 0) { + // both values are either set or unset (0) + config.Cgroups.Resources.CpuPeriod = p + config.Cgroups.Resources.CpuQuota = q + } else { + // one is set and the other is not + if p != 0 { + // set new period, leave quota at old value + config.Cgroups.Resources.CpuPeriod = p + } else if q != 0 { + // set new quota, leave period at old value + config.Cgroups.Resources.CpuQuota = q + } + } + + config.Cgroups.Resources.CpuShares = *r.CPU.Shares + //CpuWeight is used for cgroupv2 and should be converted + config.Cgroups.Resources.CpuWeight = cgroups.ConvertCPUSharesToCgroupV2Value(*r.CPU.Shares) + config.Cgroups.Resources.CpuRtPeriod = *r.CPU.RealtimePeriod + config.Cgroups.Resources.CpuRtRuntime = *r.CPU.RealtimeRuntime + config.Cgroups.Resources.CpusetCpus = r.CPU.Cpus + config.Cgroups.Resources.CpusetMems = r.CPU.Mems + config.Cgroups.Resources.Memory = *r.Memory.Limit + config.Cgroups.Resources.MemoryReservation = *r.Memory.Reservation + config.Cgroups.Resources.MemorySwap = *r.Memory.Swap + config.Cgroups.Resources.PidsLimit = r.Pids.Limit + config.Cgroups.Resources.Unified = r.Unified + + // Update Intel RDT + l3CacheSchema := context.String("l3-cache-schema") + memBwSchema := context.String("mem-bw-schema") + if l3CacheSchema != "" && !intelrdt.IsCATEnabled() { + return errors.New("Intel RDT/CAT: l3 cache schema is not enabled") + } + + if memBwSchema != "" && !intelrdt.IsMBAEnabled() { + return errors.New("Intel RDT/MBA: memory bandwidth schema is not enabled") + } + 
+ if l3CacheSchema != "" || memBwSchema != "" { + // If intelRdt is not specified in original configuration, we just don't + // Apply() to create intelRdt group or attach tasks for this container. + // In update command, we could re-enable through IntelRdtManager.Apply() + // and then update intelrdt constraint. + if config.IntelRdt == nil { + state, err := container.State() + if err != nil { + return err + } + config.IntelRdt = &configs.IntelRdt{} + intelRdtManager := intelrdt.NewManager(&config, container.ID(), state.IntelRdtPath) + if err := intelRdtManager.Apply(state.InitProcessPid); err != nil { + return err + } + } + config.IntelRdt.L3CacheSchema = l3CacheSchema + config.IntelRdt.MemBwSchema = memBwSchema + } + + return container.Set(config) + }, +} diff --git a/sysbox-runc/utils.go b/sysbox-runc/utils.go new file mode 100644 index 00000000..82a9c8ea --- /dev/null +++ b/sysbox-runc/utils.go @@ -0,0 +1,118 @@ +package main + +import ( + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" + + "github.com/sirupsen/logrus" + "github.com/urfave/cli" +) + +const ( + exactArgs = iota + minArgs + maxArgs +) + +func checkArgs(context *cli.Context, expected, checkType int) error { + var err error + cmdName := context.Command.Name + switch checkType { + case exactArgs: + if context.NArg() != expected { + err = fmt.Errorf("%s: %q requires exactly %d argument(s)", os.Args[0], cmdName, expected) + } + case minArgs: + if context.NArg() < expected { + err = fmt.Errorf("%s: %q requires a minimum of %d argument(s)", os.Args[0], cmdName, expected) + } + case maxArgs: + if context.NArg() > expected { + err = fmt.Errorf("%s: %q requires a maximum of %d argument(s)", os.Args[0], cmdName, expected) + } + } + + if err != nil { + fmt.Printf("Incorrect Usage.\n\n") + cli.ShowCommandHelp(context, cmdName) + return err + } + return nil +} + +func logrusToStderr() bool { + l, ok := logrus.StandardLogger().Out.(*os.File) + return 
ok && l.Fd() == os.Stderr.Fd() +} + +// fatal prints the error's details if it is a libcontainer specific error type +// then exits the program with an exit status of 1. +func fatal(err error) { + // make sure the error is written to the logger + logrus.Error(err) + if !logrusToStderr() { + fmt.Fprintln(os.Stderr, err) + } + + os.Exit(1) +} + +// setupSpec performs initial setup based on the cli.Context for the container +func setupSpec(context *cli.Context) (*specs.Spec, error) { + bundle := context.String("bundle") + if bundle != "" { + if err := os.Chdir(bundle); err != nil { + return nil, err + } + } + spec, err := loadSpec(specConfig) + if err != nil { + return nil, err + } + + return spec, nil +} + +func revisePidFile(context *cli.Context) error { + pidFile := context.String("pid-file") + if pidFile == "" { + return nil + } + + // convert pid-file to an absolute path so we can write to the right + // file after chdir to bundle + pidFile, err := filepath.Abs(pidFile) + if err != nil { + return err + } + return context.Set("pid-file", pidFile) +} + +// reviseRootDir convert the root to absolute path +func reviseRootDir(context *cli.Context) error { + root := context.GlobalString("root") + if root == "" { + return nil + } + + root, err := filepath.Abs(root) + if err != nil { + return err + } + + return context.GlobalSet("root", root) +} + +// parseBoolOrAuto returns (nil, nil) if s is empty or "auto" +func parseBoolOrAuto(s string) (*bool, error) { + if s == "" || strings.ToLower(s) == "auto" { + return nil, nil + } + b, err := strconv.ParseBool(s) + return &b, err +} diff --git a/sysbox-runc/utils_linux.go b/sysbox-runc/utils_linux.go new file mode 100644 index 00000000..724858b8 --- /dev/null +++ b/sysbox-runc/utils_linux.go @@ -0,0 +1,584 @@ +//go:build linux +// +build linux + +package main + +import ( + "fmt" + "net" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + "github.com/nestybox/sysbox-libs/dockerUtils" + 
"github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/opencontainers/runc/libsysbox/sysbox" + "github.com/opencontainers/runc/libsysbox/syscont" + "github.com/opencontainers/runtime-spec/specs-go" + selinux "github.com/opencontainers/selinux/go-selinux" + + "github.com/coreos/go-systemd/v22/activation" + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "github.com/urfave/cli" + "golang.org/x/sys/unix" +) + +var errEmptyID = errors.New("container id cannot be empty") + +// loadFactory returns the configured factory instance for execing containers. +func loadFactory(context *cli.Context, sysbox *sysbox.Sysbox) (libcontainer.Factory, error) { + root := context.GlobalString("root") + abs, err := filepath.Abs(root) + if err != nil { + return nil, err + } + + // We default to cgroupfs, and can only use systemd if the system is a + // systemd box. + cgroupManager := libcontainer.Cgroupfs + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return nil, err + } + if rootlessCg { + cgroupManager = libcontainer.RootlessCgroupfs + } + if context.GlobalBool("systemd-cgroup") { + if !systemd.IsRunningSystemd() { + return nil, errors.New("systemd cgroup flag passed, but systemd support for managing cgroups is not available") + } + cgroupManager = libcontainer.SystemdCgroups + if rootlessCg { + cgroupManager = libcontainer.RootlessSystemdCgroups + } + } + + intelRdtManager := libcontainer.IntelRdtFs + + // We resolve the paths for {newuidmap,newgidmap} from the context of runc, + // to avoid doing a path lookup in the nsexec context. TODO: The binary + // names are not currently configurable. 
+ newuidmap, err := exec.LookPath("newuidmap") + if err != nil { + newuidmap = "" + } + newgidmap, err := exec.LookPath("newgidmap") + if err != nil { + newgidmap = "" + } + + return libcontainer.New(abs, cgroupManager, intelRdtManager, + libcontainer.CriuPath(context.GlobalString("criu")), + libcontainer.NewuidmapPath(newuidmap), + libcontainer.NewgidmapPath(newgidmap), + libcontainer.Sysbox(sysbox)) +} + +// getContainer returns the specified container instance by loading it from state +// with the default factory. +func getContainer(context *cli.Context) (libcontainer.Container, error) { + id := context.Args().First() + if id == "" { + return nil, errEmptyID + } + factory, err := loadFactory(context, nil) + if err != nil { + return nil, err + } + return factory.Load(id) +} + +func getDefaultImagePath(context *cli.Context) string { + cwd, err := os.Getwd() + if err != nil { + panic(err) + } + return filepath.Join(cwd, "checkpoint") +} + +// newProcess returns a new libcontainer Process with the arguments from the +// spec and stdio from the current process. +func newProcess(p specs.Process, init bool, logLevel string) (*libcontainer.Process, error) { + lp := &libcontainer.Process{ + Args: p.Args, + Env: p.Env, + // TODO: fix libcontainer's API to better support uid/gid in a typesafe way. 
+ User: fmt.Sprintf("%d:%d", p.User.UID, p.User.GID), + Cwd: p.Cwd, + Label: p.SelinuxLabel, + NoNewPrivileges: &p.NoNewPrivileges, + AppArmorProfile: p.ApparmorProfile, + Init: init, + LogLevel: logLevel, + } + + if p.ConsoleSize != nil { + lp.ConsoleWidth = uint16(p.ConsoleSize.Width) + lp.ConsoleHeight = uint16(p.ConsoleSize.Height) + } + + if p.Capabilities != nil { + lp.Capabilities = &configs.Capabilities{} + lp.Capabilities.Bounding = p.Capabilities.Bounding + lp.Capabilities.Effective = p.Capabilities.Effective + lp.Capabilities.Inheritable = p.Capabilities.Inheritable + lp.Capabilities.Permitted = p.Capabilities.Permitted + lp.Capabilities.Ambient = p.Capabilities.Ambient + } + for _, gid := range p.User.AdditionalGids { + lp.AdditionalGroups = append(lp.AdditionalGroups, strconv.FormatUint(uint64(gid), 10)) + } + for _, rlimit := range p.Rlimits { + rl, err := createLibContainerRlimit(rlimit) + if err != nil { + return nil, err + } + lp.Rlimits = append(lp.Rlimits, rl) + } + return lp, nil +} + +func destroy(container libcontainer.Container) { + if err := container.Destroy(); err != nil { + logrus.Error(err) + } +} + +// setupIO modifies the given process config according to the options. 
+func setupIO(process *libcontainer.Process, rootuid, rootgid int, createTTY, detach bool, sockpath string) (*tty, error) { + if createTTY { + process.Stdin = nil + process.Stdout = nil + process.Stderr = nil + t := &tty{} + if !detach { + if err := t.initHostConsole(); err != nil { + return nil, err + } + parent, child, err := utils.NewSockPair("console") + if err != nil { + return nil, err + } + process.ConsoleSocket = child + t.postStart = append(t.postStart, parent, child) + t.consoleC = make(chan error, 1) + go func() { + t.consoleC <- t.recvtty(process, parent) + }() + } else { + // the caller of runc will handle receiving the console master + conn, err := net.Dial("unix", sockpath) + if err != nil { + return nil, err + } + uc, ok := conn.(*net.UnixConn) + if !ok { + return nil, errors.New("casting to UnixConn failed") + } + t.postStart = append(t.postStart, uc) + socket, err := uc.File() + if err != nil { + return nil, err + } + t.postStart = append(t.postStart, socket) + process.ConsoleSocket = socket + } + return t, nil + } + // when runc will detach the caller provides the stdio to runc via runc's 0,1,2 + // and the container's process inherits runc's stdio. + if detach { + + // sysbox-runc: in detach mode, ensure the ownership of stdio matches the + // container init process uid(gid). This is necessary because sysbox-runc + // allocates the container's uid(gid) when using uid-shifting, and that + // uid may not have permission to access stdio. + // + // However, this is not ideal, as we are changing the ownership of files + // which we don't really own. If that file is a pipe or FIFO (as when + // Docker/containerd launch the container), then this is probably fine + // since we are changing the ownership of the side of the pipe/FIFO that + // was assigned to the container's init process. 
But if that file is a + // regular file (e.g., when sysbox-runc is invoked from a shell and the + // output is redirected to a regular file (as done by bats in the test + // suite), then this may not be kosher (see Sysbox issue #707 for an + // example). + + link, err := os.Readlink(fmt.Sprintf("/proc/self/fd/%d", os.Stdin.Fd())) + if err != nil { + return nil, err + } + + if !strings.HasPrefix(link, "/dev") { + if err := unix.Fchown(int(os.Stdin.Fd()), rootuid, rootgid); err != nil { + return nil, fmt.Errorf("failed to chown stdin") + } + } + + link, err = os.Readlink(fmt.Sprintf("/proc/self/fd/%d", os.Stdout.Fd())) + if err != nil { + return nil, err + } + + if !strings.HasPrefix(link, "/dev") { + if err := unix.Fchown(int(os.Stdout.Fd()), rootuid, rootgid); err != nil { + return nil, fmt.Errorf("failed to chown stdout") + } + } + + link, err = os.Readlink(fmt.Sprintf("/proc/self/fd/%d", os.Stderr.Fd())) + if err != nil { + return nil, err + } + + if !strings.HasPrefix(link, "/dev") { + if err := unix.Fchown(int(os.Stderr.Fd()), rootuid, rootgid); err != nil { + return nil, fmt.Errorf("failed to chown stderr") + } + } + + if err := inheritStdio(process); err != nil { + return nil, err + } + + return &tty{}, nil + } + return setupProcessPipes(process, rootuid, rootgid) +} + +// createPidFile creates a file with the processes pid inside it atomically +// it creates a temp file with the paths filename + '.' 
infront of it +// then renames the file +func createPidFile(path string, process *libcontainer.Process) error { + pid, err := process.Pid() + if err != nil { + return err + } + var ( + tmpDir = filepath.Dir(path) + tmpName = filepath.Join(tmpDir, "."+filepath.Base(path)) + ) + f, err := os.OpenFile(tmpName, os.O_RDWR|os.O_CREATE|os.O_EXCL|os.O_SYNC, 0666) + if err != nil { + return err + } + _, err = f.WriteString(strconv.Itoa(pid)) + f.Close() + if err != nil { + return err + } + return os.Rename(tmpName, path) +} + +// sysMgrGetFsState inquires sysbox-mgr for special files/dirs to +// to be added to container's rootfs. +func sysMgrGetFsState(mgr *sysbox.Mgr, config *configs.Config) error { + + state, err := mgr.ReqFsState(config.Rootfs) + if err != nil { + return err + } + + config.FsState = state + + return nil +} + +func createContainer(context *cli.Context, + id string, + spec *specs.Spec, + sysbox *sysbox.Sysbox) (libcontainer.Container, error) { + + sysMgr := sysbox.Mgr + sysFs := sysbox.Fs + + rootlessCg, err := shouldUseRootlessCgroupManager(context) + if err != nil { + return nil, err + } + + config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{ + CgroupName: id, + UseSystemdCgroup: context.GlobalBool("systemd-cgroup"), + NoPivotRoot: context.Bool("no-pivot"), + NoNewKeyring: context.Bool("no-new-keyring"), + Spec: spec, + RootlessEUID: os.Geteuid() != 0, + RootlessCgroups: rootlessCg, + RootfsUidShiftType: sysbox.RootfsUidShiftType, + BindMntUidShiftType: sysbox.BindMntUidShiftType, + SwitchDockerDns: sysbox.SwitchDockerDns, + RootfsCloned: sysbox.RootfsCloned, + FsuidMapFailOnErr: sysMgr.Config.FsuidMapFailOnErr, + IDshiftIgnoreList: sysbox.IDshiftIgnoreList, + }) + if err != nil { + return nil, err + } + + // sysbox-runc: For container's proper operation, collect from sysbox-mgr + // fsState to be added to container's rootfs. 
+ if sysMgr.Enabled() { + if err := sysMgrGetFsState(sysMgr, config); err != nil { + return nil, err + } + } + + // sysbox-runc: setup sys container syscall trapping + if sysFs.Enabled() { + if err := syscont.AddSyscallTraps(config); err != nil { + return nil, err + } + } + + factory, err := loadFactory(context, sysbox) + if err != nil { + return nil, err + } + return factory.Create(id, config) +} + +type runner struct { + init bool + enableSubreaper bool + shouldDestroy bool + detach bool + listenFDs []*os.File + preserveFDs int + pidFile string + consoleSocket string + container libcontainer.Container + action CtAct + notifySocket *notifySocket + criuOpts *libcontainer.CriuOpts + logLevel string +} + +func (r *runner) run(config *specs.Process) (int, error) { + var err error + defer func() { + if err != nil { + r.destroy() + } + }() + if err = r.checkTerminal(config); err != nil { + return -1, err + } + process, err := newProcess(*config, r.init, r.logLevel) + if err != nil { + return -1, err + } + if len(r.listenFDs) > 0 { + process.Env = append(process.Env, "LISTEN_FDS="+strconv.Itoa(len(r.listenFDs)), "LISTEN_PID=1") + process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...) + } + baseFd := 3 + len(process.ExtraFiles) + for i := baseFd; i < baseFd+r.preserveFDs; i++ { + _, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i)) + if err != nil { + return -1, errors.Wrapf(err, "please check that preserved-fd %d (of %d) is present", i-baseFd, r.preserveFDs) + } + process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i))) + } + rootuid, err := r.container.Config().HostRootUID() + if err != nil { + return -1, err + } + rootgid, err := r.container.Config().HostRootGID() + if err != nil { + return -1, err + } + var ( + detach = r.detach || (r.action == CT_ACT_CREATE) + ) + + // Setting up IO is a two stage process. 
We need to modify process to deal + // with detaching containers, and then we get a tty after the container has + // started. + handler := newSignalHandler(r.enableSubreaper, r.notifySocket) + tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket) + if err != nil { + return -1, err + } + defer tty.Close() + + switch r.action { + case CT_ACT_CREATE: + err = r.container.Start(process) + case CT_ACT_RESTORE: + err = r.container.Restore(process, r.criuOpts) + case CT_ACT_RUN: + err = r.container.Run(process) + default: + panic("Unknown action") + } + if err != nil { + return -1, err + } + if err = tty.waitConsole(); err != nil { + r.terminate(process) + return -1, err + } + if err = tty.ClosePostStart(); err != nil { + r.terminate(process) + return -1, err + } + if r.pidFile != "" { + if err = createPidFile(r.pidFile, process); err != nil { + r.terminate(process) + return -1, err + } + } + status, err := handler.forward(process, tty, detach) + if err != nil { + r.terminate(process) + } + if detach { + return 0, nil + } + if err == nil { + r.destroy() + } + return status, err +} + +func (r *runner) destroy() { + if r.shouldDestroy { + destroy(r.container) + } +} + +func (r *runner) terminate(p *libcontainer.Process) { + _ = p.Signal(unix.SIGKILL) + _, _ = p.Wait() +} + +func (r *runner) checkTerminal(config *specs.Process) error { + detach := r.detach || (r.action == CT_ACT_CREATE) + // Check command-line for sanity. 
+ if detach && config.Terminal && r.consoleSocket == "" { + return errors.New("cannot allocate tty if runc will detach without setting console socket") + } + if (!detach || !config.Terminal) && r.consoleSocket != "" { + return errors.New("cannot use console socket if runc will not detach or allocate tty") + } + return nil +} + +func validateProcessSpec(spec *specs.Process) error { + if spec == nil { + return errors.New("process property must not be empty") + } + if spec.Cwd == "" { + return errors.New("Cwd property must not be empty") + } + if !filepath.IsAbs(spec.Cwd) { + return errors.New("Cwd must be an absolute path") + } + if len(spec.Args) == 0 { + return errors.New("args must not be empty") + } + if spec.SelinuxLabel != "" && !selinux.GetEnabled() { + return errors.New("selinux label is specified in config, but selinux is disabled or not supported") + } + + // If the spec does not contain a capabilities object, create one + // so that it can be manipulated later on + if spec.Capabilities == nil { + spec.Capabilities = new(specs.LinuxCapabilities) + } + + return nil +} + +type CtAct uint8 + +const ( + CT_ACT_CREATE CtAct = iota + 1 + CT_ACT_RUN + CT_ACT_RESTORE +) + +func startContainer(context *cli.Context, + spec *specs.Spec, + action CtAct, + criuOpts *libcontainer.CriuOpts, + sysbox *sysbox.Sysbox) (int, error) { + + id := context.Args().First() + if id == "" { + return -1, errEmptyID + } + + sysMgr := sysbox.Mgr + + switchDockerDns := false + if sysMgr.Enabled() && sysMgr.Config.AliasDns { + var err error + switchDockerDns, err = dockerUtils.ContainerIsDocker(id, spec.Root.Path) + if err != nil { + return -1, err + } + } + + sysbox.SwitchDockerDns = switchDockerDns + + notifySocket := newNotifySocket(context, os.Getenv("NOTIFY_SOCKET"), id) + if notifySocket != nil { + if err := notifySocket.setupSpec(context, spec); err != nil { + return -1, err + } + } + + container, err := createContainer(context, id, spec, sysbox) + if err != nil { + return -1, err + 
} + + if notifySocket != nil { + if err := notifySocket.setupSocketDirectory(); err != nil { + return -1, err + } + if action == CT_ACT_RUN { + if err := notifySocket.bindSocket(); err != nil { + return -1, err + } + } + } + + // Support on-demand socket activation by passing file descriptors into the container init process. + listenFDs := []*os.File{} + if os.Getenv("LISTEN_FDS") != "" { + listenFDs = activation.Files(false) + } + + logLevel := "info" + if context.GlobalBool("debug") { + logLevel = "debug" + } + + r := &runner{ + enableSubreaper: !context.Bool("no-subreaper"), + shouldDestroy: !context.Bool("keep"), + container: container, + listenFDs: listenFDs, + notifySocket: notifySocket, + consoleSocket: context.String("console-socket"), + detach: context.Bool("detach"), + pidFile: context.String("pid-file"), + preserveFDs: context.Int("preserve-fds"), + action: action, + criuOpts: criuOpts, + init: true, + logLevel: logLevel, + } + + return r.run(spec.Process) +}