diff --git a/go.mod b/go.mod index 7d0139c9..2225c018 100644 --- a/go.mod +++ b/go.mod @@ -124,11 +124,11 @@ require ( github.com/moby/sys/mountinfo v0.6.2 // indirect github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 // indirect github.com/morikuni/aec v1.0.0 // indirect - github.com/mrunalp/fileutils v0.5.0 // indirect + github.com/mrunalp/fileutils v0.5.1 // indirect github.com/oklog/run v1.1.0 // indirect github.com/opencontainers/go-digest v1.0.0 // indirect github.com/opencontainers/image-spec v1.1.0-rc2.0.20221005185240-3a7f492d3f1b // indirect - github.com/opencontainers/runc v1.1.5 // indirect + github.com/opencontainers/runc v1.1.12 // indirect github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 // indirect github.com/opencontainers/selinux v1.10.1 // indirect github.com/pierrec/lz4 v2.6.1+incompatible // indirect diff --git a/go.sum b/go.sum index 5f52a591..bfafdbb9 100644 --- a/go.sum +++ b/go.sum @@ -162,7 +162,6 @@ github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775/go.mod h1:7cR51M8ViRLI github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX2Qs= github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= -github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= @@ -300,7 +299,6 @@ github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ github.com/creack/pty v1.1.18 h1:n56/Zwd5o6whRC5PMGretI4IdRLlmBXYNjScPaBgsbY= github.com/creack/pty v1.1.18/go.mod h1:MOBLtS5ELjhRRrroQr9kyvTxUAFNvYEK993ew/Vr4O4= github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= -github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/cyphar/filepath-securejoin v0.2.4 h1:Ugdm7cg7i6ZK6x3xDF1oEu1nfkyfH53EtKeQYTC3kyg= github.com/cyphar/filepath-securejoin v0.2.4/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ= @@ -416,7 +414,6 @@ github.com/godbus/dbus v5.0.1+incompatible h1:fsDsnr/6MFSIm3kl6JJpq5pH+vO/rA5jUu github.com/godbus/dbus v5.0.1+incompatible/go.mod h1:/YcGZj5zSblfDWMMoOzV4fas9FZnQYTkDnsGvmh2Grw= github.com/godbus/dbus/v5 v5.0.3/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/godbus/dbus/v5 v5.0.6/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk= github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= github.com/gogo/googleapis v1.2.0/go.mod h1:Njal3psf3qN6dwBtQfUmBZh2ybovJ0tlu3o/AC7HYjU= @@ -784,7 +781,6 @@ github.com/moby/sys/mount v0.3.3 h1:fX1SVkXFJ47XWDoeFW4Sq7PdQJnV2QIDZAqjNqgEjUs= github.com/moby/sys/mount v0.3.3/go.mod h1:PBaEorSNTLG5t/+4EgukEQVlAvVEc6ZjTySwKdqp5K0= github.com/moby/sys/mountinfo v0.4.0/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= github.com/moby/sys/mountinfo v0.4.1/go.mod h1:rEr8tzG/lsIZHBtN/JjGG+LMYx9eXgW2JI+6q0qou+A= -github.com/moby/sys/mountinfo v0.5.0/go.mod h1:3bMD3Rg+zkqx8MRYPi7Pyb0Ie97QEBmdxbhnCLlSvSU= github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vygl78= github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI= github.com/moby/sys/symlink v0.1.0/go.mod h1:GGDODQmbFOjFsXvfLVn3+ZRxkch54RkSiGqsZeMYowQ= @@ -797,8 +793,9 @@ github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lN github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/morikuni/aec v1.0.0 h1:nP9CBfwrvYnBRgY6qfDQkygYDmYwOilePFkwzv4dU8A= github.com/morikuni/aec v1.0.0/go.mod h1:BbKIizmSmc5MMPqRYbxO4ZU0S0+P200+tUnFx7PXmsc= -github.com/mrunalp/fileutils v0.5.0 h1:NKzVxiH7eSk+OQ4M+ZYW1K6h27RUV3MI6NUTsHhU6Z4= github.com/mrunalp/fileutils v0.5.0/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= +github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q= +github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= @@ -847,8 +844,8 @@ github.com/opencontainers/runc v1.0.0-rc8.0.20190926000215-3e425f80a8c9/go.mod h github.com/opencontainers/runc v1.0.0-rc9/go.mod h1:qT5XzbpPznkRYVz/mWwUaVBUv2rmF59PVA73FjuZG0U= github.com/opencontainers/runc v1.0.0-rc93/go.mod h1:3NOsor4w32B2tC0Zbl8Knk4Wg84SM2ImC1fxBuqJ/H0= github.com/opencontainers/runc v1.0.2/go.mod h1:aTaHFFwQXuA71CiyxOdFFIorAoemI04suvGRQFzWTD0= -github.com/opencontainers/runc v1.1.5 h1:L44KXEpKmfWDcS02aeGm8QNTFXTo2D+8MYGDIJ/GDEs= -github.com/opencontainers/runc v1.1.5/go.mod h1:1J5XiS+vdZ3wCyZybsuxXZWGrgSr8fFJHLXuG2PsnNg= +github.com/opencontainers/runc v1.1.12 h1:BOIssBaW1La0/qbNZHXOOa71dZfZEQOzW7dqQf3phss= +github.com/opencontainers/runc v1.1.12/go.mod h1:S+lQwSfncpBha7XTy/5lBwWgm5+y5Ma/O44Ekby9FK8= github.com/opencontainers/runtime-spec v0.1.2-0.20190507144316-5b71a03e2700/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.0.1/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.0.2-0.20190207185410-29686dbc5559/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= @@ -860,7 +857,6 @@ github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mo github.com/opencontainers/selinux v1.6.0/go.mod h1:VVGKuOLlE7v4PJyT6h7mNWvq1rzqiriPsEqVhc+svHE= github.com/opencontainers/selinux v1.8.0/go.mod h1:RScLhm78qiWa2gbVCcGkC7tCGdgk3ogry1nUQF8Evvo= github.com/opencontainers/selinux v1.8.2/go.mod h1:MUIHuUEvKB1wtJjQdOyYRgOnLD2xAPP8dBsCoU0KuF8= -github.com/opencontainers/selinux v1.10.0/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= github.com/opencontainers/selinux v1.10.1 h1:09LIPVRP3uuZGQvgR+SgMSNBd1Eb3vlRbGqQpoHsF8w= github.com/opencontainers/selinux v1.10.1/go.mod h1:2i0OySw99QjzBBQByd1Gr9gSjvuho1lHsJxIJ3gGbJI= github.com/opentracing/opentracing-go v1.0.2/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= @@ -942,7 +938,6 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= -github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/seccomp/libseccomp-golang v0.10.0 h1:aA4bp+/Zzi0BnWZ2F1wgNBs5gTpm+na2rWM6M9YjLpY= github.com/seccomp/libseccomp-golang v0.10.0/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo= @@ -1283,10 +1278,7 @@ golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210616094352-59db8d763f22/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20210906170528-6f6e22806c34/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20210927094055-39ccf1dd6fa6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211025201205-69cdffdb9359/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20211116061358-0a5406a5449c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220503163025-988cb79eb6c6/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= diff --git a/vendor/github.com/mrunalp/fileutils/fileutils.go b/vendor/github.com/mrunalp/fileutils/fileutils.go index 7421e620..81851c81 100644 --- a/vendor/github.com/mrunalp/fileutils/fileutils.go +++ b/vendor/github.com/mrunalp/fileutils/fileutils.go @@ -125,6 +125,7 @@ func CopyDirectory(source string, dest string) error { if err != nil { return nil } + destPath := filepath.Join(dest, relPath) if info.IsDir() { // Skip the source directory. @@ -138,18 +139,20 @@ func CopyDirectory(source string, dest string) error { uid := int(st.Uid) gid := int(st.Gid) - if err := os.Mkdir(filepath.Join(dest, relPath), info.Mode()); err != nil { + if err := os.Mkdir(destPath, info.Mode()); err != nil { return err } - - if err := os.Lchown(filepath.Join(dest, relPath), uid, gid); err != nil { + if err := os.Lchown(destPath, uid, gid); err != nil { + return err + } + if err := os.Chmod(destPath, info.Mode()); err != nil { return err } } return nil } - return CopyFile(path, filepath.Join(dest, relPath)) + return CopyFile(path, destPath) }) } diff --git a/vendor/github.com/mrunalp/fileutils/idtools.go b/vendor/github.com/mrunalp/fileutils/idtools.go index bad6539d..0ae2dfb2 100644 --- a/vendor/github.com/mrunalp/fileutils/idtools.go +++ b/vendor/github.com/mrunalp/fileutils/idtools.go @@ -49,6 +49,9 @@ func MkdirAllNewAs(path string, mode os.FileMode, ownerUID, ownerGID int) error if err := os.Chown(pathComponent, ownerUID, ownerGID); err != nil { return err } + if err := os.Chmod(pathComponent, mode); err != nil { + return err + } } return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go index 104c74a8..35b00aaf 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go @@ -93,7 +93,7 @@ var ( ) // Loosely based on the BPF_F_REPLACE support check in -// . +// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. // // TODO: move this logic to cilium/ebpf func haveBpfProgReplace() bool { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go index 0cdaf747..f6e1b73b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go @@ -10,6 +10,7 @@ import ( "strings" "sync" + "github.com/opencontainers/runc/libcontainer/utils" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -76,16 +77,16 @@ var ( // TestMode is set to true by unit tests that need "fake" cgroupfs. TestMode bool - cgroupFd int = -1 - prepOnce sync.Once - prepErr error - resolveFlags uint64 + cgroupRootHandle *os.File + prepOnce sync.Once + prepErr error + resolveFlags uint64 ) func prepareOpenat2() error { prepOnce.Do(func() { fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ - Flags: unix.O_DIRECTORY | unix.O_PATH, + Flags: unix.O_DIRECTORY | unix.O_PATH | unix.O_CLOEXEC, }) if err != nil { prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} @@ -96,15 +97,16 @@ func prepareOpenat2() error { } return } + file := os.NewFile(uintptr(fd), cgroupfsDir) + var st unix.Statfs_t - if err = unix.Fstatfs(fd, &st); err != nil { + if err := unix.Fstatfs(int(file.Fd()), &st); err != nil { prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} logrus.Warnf("falling back to securejoin: %s", prepErr) return } - cgroupFd = fd - + cgroupRootHandle = file resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS if st.Type == unix.CGROUP2_SUPER_MAGIC { // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks @@ -122,7 +124,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { flags |= os.O_TRUNC | os.O_CREATE mode = 0o600 } - path := path.Join(dir, file) + path := path.Join(dir, utils.CleanPath(file)) if prepareOpenat2() != nil { return openFallback(path, flags, mode) } @@ -131,7 +133,7 @@ func openFile(dir, file string, flags int) (*os.File, error) { return openFallback(path, flags, mode) } - fd, err := unix.Openat2(cgroupFd, relPath, + fd, err := unix.Openat2(int(cgroupRootHandle.Fd()), relPath, &unix.OpenHow{ Resolve: resolveFlags, Flags: uint64(flags) | unix.O_CLOEXEC, @@ -139,20 +141,20 @@ func openFile(dir, file string, flags int) (*os.File, error) { }) if err != nil { err = &os.PathError{Op: "openat2", Path: path, Err: err} - // Check if cgroupFd is still opened to cgroupfsDir + // Check if cgroupRootHandle is still opened to cgroupfsDir // (happens when this package is incorrectly used // across the chroot/pivot_root/mntns boundary, or // when /sys/fs/cgroup is remounted). // // TODO: if such usage will ever be common, amend this - // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) + // to reopen cgroupRootHandle and retry openat2. + fdStr := strconv.Itoa(int(cgroupRootHandle.Fd())) fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) if fdDest != cgroupfsDir { - // Wrap the error so it is clear that cgroupFd + // Wrap the error so it is clear that cgroupRootHandle // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) + err = fmt.Errorf("cgroupRootHandle %d unexpectedly opened to %s != %s: %w", + cgroupRootHandle.Fd(), fdDest, cgroupfsDir, err) } return nil, err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go index fb4fcc7f..9e2f0ec0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go @@ -28,6 +28,7 @@ var subsystems = []subsystem{ &FreezerGroup{}, &RdmaGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, + &NameGroup{GroupName: "misc", Join: true}, } var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go index 8ddd6fdd..50f8f30c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -1,6 +1,8 @@ package fs import ( + "errors" + "os" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -19,8 +21,23 @@ func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { } func (s *HugetlbGroup) Set(path string, r *configs.Resources) error { + const suffix = ".limit_in_bytes" + skipRsvd := false + for _, hugetlb := range r.HugetlbLimit { - if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + prefix := "hugetlb." + hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(path, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(path, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } return err } } @@ -32,24 +49,29 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { if !cgroups.PathExists(path) { return nil } + rsvd := ".rsvd" hugetlbStats := cgroups.HugetlbStats{} for _, pageSize := range cgroups.HugePageSizes() { - usage := "hugetlb." + pageSize + ".usage_in_bytes" - value, err := fscommon.GetCgroupParamUint(path, usage) + again: + prefix := "hugetlb." + pageSize + rsvd + + value, err := fscommon.GetCgroupParamUint(path, prefix+".usage_in_bytes") if err != nil { + if rsvd != "" && errors.Is(err, os.ErrNotExist) { + rsvd = "" + goto again + } return err } hugetlbStats.Usage = value - maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" - value, err = fscommon.GetCgroupParamUint(path, maxUsage) + value, err = fscommon.GetCgroupParamUint(path, prefix+".max_usage_in_bytes") if err != nil { return err } hugetlbStats.MaxUsage = value - failcnt := "hugetlb." + pageSize + ".failcnt" - value, err = fscommon.GetCgroupParamUint(path, failcnt) + value, err = fscommon.GetCgroupParamUint(path, prefix+".failcnt") if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index b7c75f94..783566d6 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { return err } stats.MemoryStats.SwapUsage = swapUsage + stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{ + Usage: swapUsage.Usage - memoryUsage.Usage, + Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, + } kernelUsage, err := getMemoryData(path, "kmem") if err != nil { return err @@ -234,6 +238,12 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) { memoryData.Failcnt = value value, err = fscommon.GetCgroupParamUint(path, limit) if err != nil { + if name == "kmem" && os.IsNotExist(err) { + // Ignore ENOENT as kmem.limit_in_bytes has + // been removed in newer kernels. + return memoryData, nil + } + return cgroups.MemoryData{}, err } memoryData.Limit = value diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go index 1092331b..2cb970a3 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go @@ -83,6 +83,7 @@ func tryDefaultCgroupRoot() string { if err != nil { return "" } + defer dir.Close() names, err := dir.Readdirnames(1) if err != nil { return "" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go index c92a7e64..2ce2631e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go @@ -1,6 +1,8 @@ package fs2 import ( + "errors" + "os" "strconv" "github.com/opencontainers/runc/libcontainer/cgroups" @@ -16,8 +18,22 @@ func setHugeTlb(dirPath string, r *configs.Resources) error { if !isHugeTlbSet(r) { return nil } + const suffix = ".max" + skipRsvd := false for _, hugetlb := range r.HugetlbLimit { - if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + prefix := "hugetlb." + hugetlb.Pagesize + val := strconv.FormatUint(hugetlb.Limit, 10) + if err := cgroups.WriteFile(dirPath, prefix+suffix, val); err != nil { + return err + } + if skipRsvd { + continue + } + if err := cgroups.WriteFile(dirPath, prefix+".rsvd"+suffix, val); err != nil { + if errors.Is(err, os.ErrNotExist) { + skipRsvd = true + continue + } return err } } @@ -27,15 +43,21 @@ func setHugeTlb(dirPath string, r *configs.Resources) error { func statHugeTlb(dirPath string, stats *cgroups.Stats) error { hugetlbStats := cgroups.HugetlbStats{} + rsvd := ".rsvd" for _, pagesize := range cgroups.HugePageSizes() { - value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") + again: + prefix := "hugetlb." + pagesize + rsvd + value, err := fscommon.GetCgroupParamUint(dirPath, prefix+".current") if err != nil { + if rsvd != "" && errors.Is(err, os.ErrNotExist) { + rsvd = "" + goto again + } return err } hugetlbStats.Usage = value - fileName := "hugetlb." + pagesize + ".events" - value, err = fscommon.GetValueByKey(dirPath, fileName, "max") + value, err = fscommon.GetValueByKey(dirPath, prefix+".events", "max") if err != nil { return err } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go index adbc4b23..01fe7d8e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go @@ -100,17 +100,20 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { memoryUsage, err := getMemoryDataV2(dirPath, "") if err != nil { if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { - // The root cgroup does not have memory.{current,max} - // so emulate those using data from /proc/meminfo. - return statsFromMeminfo(stats) + // The root cgroup does not have memory.{current,max,peak} + // so emulate those using data from /proc/meminfo and + // /sys/fs/cgroup/memory.stat + return rootStatsFromMeminfo(stats) } return err } stats.MemoryStats.Usage = memoryUsage - swapUsage, err := getMemoryDataV2(dirPath, "swap") + swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") if err != nil { return err } + stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage + swapUsage := swapOnlyUsage // As cgroup v1 reports SwapUsage values as mem+swap combined, // while in cgroup v2 swap values do not include memory, // report combined mem+swap for v1 compatibility. @@ -118,6 +121,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { if swapUsage.Limit != math.MaxUint64 { swapUsage.Limit += memoryUsage.Limit } + // The `MaxUsage` of mem+swap cannot simply combine mem with + // swap. So set it to 0 for v1 compatibility. + swapUsage.MaxUsage = 0 stats.MemoryStats.SwapUsage = swapUsage return nil @@ -132,6 +138,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } usage := moduleName + ".current" limit := moduleName + ".max" + maxUsage := moduleName + ".peak" value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { @@ -151,10 +158,18 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } memoryData.Limit = value + // `memory.peak` since kernel 5.19 + // `memory.swap.peak` since kernel 6.5 + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil && !os.IsNotExist(err) { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + return memoryData, nil } -func statsFromMeminfo(stats *cgroups.Stats) error { +func rootStatsFromMeminfo(stats *cgroups.Stats) error { const file = "/proc/meminfo" f, err := os.Open(file) if err != nil { @@ -166,14 +181,10 @@ func statsFromMeminfo(stats *cgroups.Stats) error { var ( swap_free uint64 swap_total uint64 - main_total uint64 - main_free uint64 ) mem := map[string]*uint64{ "SwapFree": &swap_free, "SwapTotal": &swap_total, - "MemTotal": &main_total, - "MemFree": &main_free, } found := 0 @@ -206,11 +217,18 @@ func statsFromMeminfo(stats *cgroups.Stats) error { return &parseError{Path: "", File: file, Err: err} } + // cgroup v1 `usage_in_bytes` reports memory usage as the sum of + // - rss (NR_ANON_MAPPED) + // - cache (NR_FILE_PAGES) + // cgroup v1 reports SwapUsage values as mem+swap combined + // cgroup v2 reports rss and cache as anon and file. + // sum `anon` + `file` to report the same value as `usage_in_bytes` in v1. + // sum swap usage as combined mem+swap usage for consistency as well. + stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"] + stats.MemoryStats.Usage.Limit = math.MaxUint64 stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 - - stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024 - stats.MemoryStats.Usage.Limit = math.MaxUint64 + stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage return nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index 40a81dd5..0d8371b0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -78,6 +78,8 @@ type MemoryStats struct { Usage MemoryData `json:"usage,omitempty"` // usage of memory + swap SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of swap only + SwapOnlyUsage MemoryData `json:"swap_only_usage,omitempty"` // usage of kernel memory KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go index b6bfb080..c5b476e2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go @@ -177,7 +177,7 @@ func allowAllDevices() []systemdDbus.Property { // generateDeviceProperties takes the configured device rules and generates a // corresponding set of systemd properties to configure the devices correctly. -func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, error) { +func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) { if r.SkipDevices { return nil, nil } @@ -238,9 +238,10 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err // trickery to convert things: // // * Concrete rules with non-wildcard major/minor numbers have to use - // /dev/{block,char} paths. This is slightly odd because it means - // that we cannot add whitelist rules for devices that don't exist, - // but there's not too much we can do about that. + // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses + // stat(2) on such paths to look up device properties, meaning we + // cannot add whitelist rules for devices that don't exist. Since v240, + // device properties are parsed from the path string. // // However, path globbing is not support for path-based rules so we // need to handle wildcards in some other manner. @@ -288,21 +289,14 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err case devices.CharDevice: entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) } - // systemd will issue a warning if the path we give here doesn't exist. - // Since all of this logic is best-effort anyway (we manually set these - // rules separately to systemd) we can safely skip entries that don't - // have a corresponding path. - if _, err := os.Stat(entry.Path); err != nil { - // Also check /sys/dev so that we don't depend on /dev/{block,char} - // being populated. (/dev/{block,char} is populated by udev, which - // isn't strictly required for systemd). Ironically, this happens most - // easily when starting containerd within a runc created container - // itself. - - // We don't bother with securejoin here because we create entry.Path - // right above here, so we know it's safe. - if _, err := os.Stat("/sys" + entry.Path); err != nil { - logrus.Warnf("skipping device %s for systemd: %s", entry.Path, err) + if sdVer < 240 { + // Old systemd versions use stat(2) on path to find out device major:minor + // numbers and type. If the path doesn't exist, it will not add the rule, + // emitting a warning instead. + // Since all of this logic is best-effort anyway (we manually set these + // rules separately to systemd) we can safely skip entries that don't + // have a corresponding path. + if _, err := os.Stat(entry.Path); err != nil { continue } } @@ -353,32 +347,55 @@ func isUnitExists(err error) bool { return isDbusError(err, "org.freedesktop.systemd1.UnitExists") } -func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error { +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { statusChan := make(chan string, 1) + retry := true + +retry: err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) return err }) - if err == nil { - timeout := time.NewTimer(30 * time.Second) - defer timeout.Stop() - - select { - case s := <-statusChan: - close(statusChan) - // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit - if s != "done" { - resetFailedUnit(cm, unitName) - return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + if err != nil { + if !isUnitExists(err) { + return err + } + if ignoreExist { + // TODO: remove this hack. + // This is kubelet making sure a slice exists (see + // https://github.com/opencontainers/runc/pull/1124). + return nil + } + if retry { + // In case a unit with the same name exists, this may + // be a leftover failed unit. Reset it, so systemd can + // remove it, and retry once. + err = resetFailedUnit(cm, unitName) + if err != nil { + logrus.Warnf("unable to reset failed unit: %v", err) } - case <-timeout.C: - resetFailedUnit(cm, unitName) - return errors.New("Timeout waiting for systemd to create " + unitName) + retry = false + goto retry } - } else if !isUnitExists(err) { return err } + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + _ = resetFailedUnit(cm, unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + } + case <-timeout.C: + _ = resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) + } + return nil } @@ -403,16 +420,17 @@ func stopUnit(cm *dbusConnManager, unitName string) error { return errors.New("Timed out while waiting for systemd to remove " + unitName) } } + + // In case of a failed unit, let systemd remove it. + _ = resetFailedUnit(cm, unitName) + return nil } -func resetFailedUnit(cm *dbusConnManager, name string) { - err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { +func resetFailedUnit(cm *dbusConnManager, name string) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { return c.ResetFailedUnitContext(context.TODO(), name) }) - if err != nil { - logrus.Warnf("unable to reset failed unit: %v", err) - } } func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go index 83d10dd7..dd474cf1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go @@ -51,5 +51,10 @@ func RangeToBits(str string) ([]byte, error) { // do not allow empty values return nil, errors.New("empty value") } + + // fit cpuset parsing order in systemd + for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { + ret[l], ret[r] = ret[r], ret[l] + } return ret, nil } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go index a74a05a5..a574552d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go @@ -71,12 +71,13 @@ var legacySubsystems = []subsystem{ &fs.NetClsGroup{}, &fs.NameGroup{GroupName: "name=systemd"}, &fs.RdmaGroup{}, + &fs.NameGroup{GroupName: "misc"}, } func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { var properties []systemdDbus.Property - deviceProperties, err := generateDeviceProperties(r) + deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm)) if err != nil { return nil, err } @@ -206,7 +207,7 @@ func (m *legacyManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) - if err := startUnit(m.dbus, unitName, properties); err != nil { + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return err } @@ -273,14 +274,7 @@ func getSubsystemPath(slice, unit, subsystem string) (string, error) { return "", err } - initPath, err := cgroups.GetInitCgroup(subsystem) - if err != nil { - return "", err - } - // if pid 1 is systemd 226 or later, it will be in init.scope, not the root - initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope") - - return filepath.Join(mountpoint, initPath, slice, unit), nil + return filepath.Join(mountpoint, slice, unit), nil } func (m *legacyManager) Freeze(state configs.FreezerState) error { @@ -423,6 +417,15 @@ func (m *legacyManager) Set(r *configs.Resources) error { if err := m.doFreeze(configs.Frozen); err != nil { // If freezer cgroup isn't supported, we just warn about it. logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + // skip update the cgroup while frozen failed. #3803 + if !errors.Is(err, errSubsystemDoesNotExist) { + if needsThaw { + if thawErr := m.doFreeze(configs.Thawed); thawErr != nil { + logrus.Infof("thaw container after doFreeze failed: %v", thawErr) + } + } + return err + } } } setErr := setUnitProperties(m.dbus, unitName, properties...) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go index de0cb974..919e5632 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go @@ -182,7 +182,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst // aren't the end of the world, but it is a bit concerning. However // it's unclear if systemd removes all eBPF programs attached when // doing SetUnitProperties... - deviceProperties, err := generateDeviceProperties(r) + deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm)) if err != nil { return nil, err } @@ -284,7 +284,7 @@ func (m *unifiedManager) Apply(pid int) error { properties = append(properties, c.SystemdProps...) - if err := startUnit(m.dbus, unitName, properties); err != nil { + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index b32af4ee..fc4ae44a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -162,8 +162,10 @@ func readProcsFile(dir string) ([]int, error) { // ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup // or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. -// "cpu": "/user.slice/user-1000.slice" -// "pids": "/user.slice/user-1000.slice" +// +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// // etc. // // Note that for cgroup v2 unified hierarchy, there are no per-controller diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index c1b4a004..6ebf5ec7 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -21,9 +21,9 @@ type Rlimit struct { // IDMap represents UID/GID Mappings for User Namespaces. type IDMap struct { - ContainerID int `json:"container_id"` - HostID int `json:"host_id"` - Size int `json:"size"` + ContainerID int64 `json:"container_id"` + HostID int64 `json:"host_id"` + Size int64 `json:"size"` } // Seccomp represents syscall restrictions diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go index 8c02848b..51fe9407 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go @@ -1,6 +1,10 @@ package configs -import "errors" +import ( + "errors" + "fmt" + "math" +) var ( errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") @@ -16,11 +20,18 @@ func (c Config) HostUID(containerId int) (int, error) { if c.UidMappings == nil { return -1, errNoUIDMap } - id, found := c.hostIDFromMapping(containerId, c.UidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.UidMappings) if !found { return -1, errNoUserMap } - return id, nil + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setuid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for uid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) + } + return int(id), nil } // Return unchanged id. return containerId, nil @@ -39,11 +50,18 @@ func (c Config) HostGID(containerId int) (int, error) { if c.GidMappings == nil { return -1, errNoGIDMap } - id, found := c.hostIDFromMapping(containerId, c.GidMappings) + id, found := c.hostIDFromMapping(int64(containerId), c.GidMappings) if !found { return -1, errNoGroupMap } - return id, nil + // If we are a 32-bit binary running on a 64-bit system, it's possible + // the mapped user is too large to store in an int, which means we + // cannot do the mapping. We can't just return an int64, because + // os.Setgid() takes an int. + if id > math.MaxInt { + return -1, fmt.Errorf("mapping for gid %d (host id %d) is larger than native integer size (%d)", containerId, id, math.MaxInt) + } + return int(id), nil } // Return unchanged id. return containerId, nil @@ -57,7 +75,7 @@ func (c Config) HostRootGID() (int, error) { // Utility function that gets a host ID for a container ID from user namespace map // if that ID is present in the map. -func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { +func (c Config) hostIDFromMapping(containerID int64, uMap []IDMap) (int64, bool) { for _, m := range uMap { if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { hostID := m.HostID + (containerID - m.ContainerID) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go index 9a6e5eb3..37c38336 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/rootless.go @@ -28,25 +28,18 @@ func (v *ConfigValidator) rootlessEUID(config *configs.Config) error { return nil } -func hasIDMapping(id int, mappings []configs.IDMap) bool { - for _, m := range mappings { - if id >= m.ContainerID && id < m.ContainerID+m.Size { - return true - } - } - return false -} - func rootlessEUIDMappings(config *configs.Config) error { if !config.Namespaces.Contains(configs.NEWUSER) { return errors.New("rootless container requires user namespaces") } - - if len(config.UidMappings) == 0 { - return errors.New("rootless containers requires at least one UID mapping") - } - if len(config.GidMappings) == 0 { - return errors.New("rootless containers requires at least one GID mapping") + // We only require mappings if we are not joining another userns. + if path := config.Namespaces.PathOf(configs.NEWUSER); path == "" { + if len(config.UidMappings) == 0 { + return errors.New("rootless containers requires at least one UID mapping") + } + if len(config.GidMappings) == 0 { + return errors.New("rootless containers requires at least one GID mapping") + } } return nil } @@ -70,8 +63,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(uid, config.UidMappings) { - return errors.New("cannot specify uid= mount options for unmapped uid in rootless containers") + if _, err := config.HostUID(uid); err != nil { + return fmt.Errorf("cannot specify uid=%d mount option for rootless container: %w", uid, err) } } @@ -82,8 +75,8 @@ func rootlessEUIDMount(config *configs.Config) error { // Ignore unknown mount options. continue } - if !hasIDMapping(gid, config.GidMappings) { - return errors.New("cannot specify gid= mount options for unmapped gid in rootless containers") + if _, err := config.HostGID(gid); err != nil { + return fmt.Errorf("cannot specify gid=%d mount option for rootless container: %w", gid, err) } } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go index 627621a5..ece70a45 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/validate/validator.go @@ -109,11 +109,19 @@ func (v *ConfigValidator) security(config *configs.Config) error { func (v *ConfigValidator) usernamespace(config *configs.Config) error { if config.Namespaces.Contains(configs.NEWUSER) { if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { - return errors.New("USER namespaces aren't enabled in the kernel") + return errors.New("user namespaces aren't enabled in the kernel") } + hasPath := config.Namespaces.PathOf(configs.NEWUSER) != "" + hasMappings := config.UidMappings != nil || config.GidMappings != nil + if !hasPath && !hasMappings { + return errors.New("user namespaces enabled, but no namespace path to join nor mappings to apply specified") + } + // The hasPath && hasMappings validation case is handled in specconv -- + // we cache the mappings in Config during specconv in the hasPath case, + // so we cannot do that validation here. } else { if config.UidMappings != nil || config.GidMappings != nil { - return errors.New("User namespace mappings specified, but USER namespace isn't enabled in the config") + return errors.New("user namespace mappings specified, but user namespace isn't enabled in the config") } } return nil @@ -131,9 +139,8 @@ func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error { // convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format. // The '/' separator is also accepted in place of a '.'. // Convert the sysctl variables to dots separator format for validation. -// More info: -// https://man7.org/linux/man-pages/man8/sysctl.8.html -// https://man7.org/linux/man-pages/man5/sysctl.d.5.html +// More info: sysctl(8), sysctl.d(5). +// // For example: // Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter" // will return the converted value "net.ipv4.conf.eno2/100.rp_filter" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go index 9df830d8..40b332f9 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -40,7 +40,7 @@ type linuxContainer struct { root string config *configs.Config cgroupManager cgroups.Manager - intelRdtManager intelrdt.Manager + intelRdtManager *intelrdt.Manager initPath string initArgs []string initProcess parentProcess @@ -146,19 +146,21 @@ func (c *linuxContainer) OCIState() (*specs.State, error) { return c.currentOCIState() } -func (c *linuxContainer) Processes() ([]int, error) { - var pids []int - status, err := c.currentStatus() - if err != nil { - return pids, err +// ignoreCgroupError filters out cgroup-related errors that can be ignored, +// because the container is stopped and its cgroup is gone. +func (c *linuxContainer) ignoreCgroupError(err error) error { + if err == nil { + return nil } - // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited - if status == Stopped && !c.cgroupManager.Exists() { - return pids, nil + if errors.Is(err, os.ErrNotExist) && c.runType() == Stopped && !c.cgroupManager.Exists() { + return nil } + return err +} - pids, err = c.cgroupManager.GetAllPids() - if err != nil { +func (c *linuxContainer) Processes() ([]int, error) { + pids, err := c.cgroupManager.GetAllPids() + if err = c.ignoreCgroupError(err); err != nil { return nil, fmt.Errorf("unable to get all container pids: %w", err) } return pids, nil @@ -351,6 +353,15 @@ func (c *linuxContainer) start(process *Process) (retErr error) { }() } + // Before starting "runc init", mark all non-stdio open files as O_CLOEXEC + // to make sure we don't leak any files into "runc init". Any files to be + // passed to "runc init" through ExtraFiles will get dup2'd by the Go + // runtime and thus their O_CLOEXEC flag will be cleared. This is some + // additional protection against attacks like CVE-2024-21626, by making + // sure we never leak files to "runc init" we didn't intend to. + if err := utils.CloseExecFrom(3); err != nil { + return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err) + } if err := parent.start(); err != nil { return fmt.Errorf("unable to start container process: %w", err) } @@ -382,11 +393,12 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error { return err } if all { - // for systemd cgroup, the unit's cgroup path will be auto removed if container's all processes exited if status == Stopped && !c.cgroupManager.Exists() { + // Avoid calling signalAllProcesses which may print + // a warning trying to freeze a non-existing cgroup. return nil } - return signalAllProcesses(c.cgroupManager, s) + return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, s)) } // to avoid a PID reuse attack if status == Running || status == Created || status == Paused { @@ -926,7 +938,7 @@ func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool { } func criuNsToKey(t configs.NamespaceType) string { - return "extRoot" + strings.Title(configs.NsName(t)) + "NS" + return "extRoot" + strings.Title(configs.NsName(t)) + "NS" //nolint:staticcheck // SA1019: strings.Title is deprecated } func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.CriuOpts, t configs.NamespaceType) error { @@ -2265,7 +2277,7 @@ func ignoreTerminateErrors(err error) error { func requiresRootOrMappingTool(c *configs.Config) bool { gidMap := []configs.IDMap{ - {ContainerID: 0, HostID: os.Getegid(), Size: 1}, + {ContainerID: 0, HostID: int64(os.Getegid()), Size: 1}, } return !reflect.DeepEqual(c.GidMappings, gidMap) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/eaccess_go119.go b/vendor/github.com/opencontainers/runc/libcontainer/eaccess_go119.go new file mode 100644 index 00000000..cc1e2079 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/eaccess_go119.go @@ -0,0 +1,17 @@ +//go:build !go1.20 +// +build !go1.20 + +package libcontainer + +import "golang.org/x/sys/unix" + +func eaccess(path string) error { + // This check is similar to access(2) with X_OK except for + // setuid/setgid binaries where it checks against the effective + // (rather than real) uid and gid. It is not needed in go 1.20 + // and beyond and will be removed later. + + // Relies on code added in https://go-review.googlesource.com/c/sys/+/468877 + // and older CLs linked from there. + return unix.Faccessat(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/eaccess_stub.go b/vendor/github.com/opencontainers/runc/libcontainer/eaccess_stub.go new file mode 100644 index 00000000..7c049fd7 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/eaccess_stub.go @@ -0,0 +1,10 @@ +//go:build go1.20 + +package libcontainer + +func eaccess(path string) error { + // Not needed in Go 1.20+ as the functionality is already in there + // (added by https://go.dev/cl/416115, https://go.dev/cl/414824, + // and fixed in Go 1.20.2 by https://go.dev/cl/469956). + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go index e6c71ac3..546d0eb0 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -48,20 +48,6 @@ func InitArgs(args ...string) func(*LinuxFactory) error { } } -// IntelRdtfs is an options func to configure a LinuxFactory to return -// containers that use the Intel RDT "resource control" filesystem to -// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth). -func IntelRdtFs(l *LinuxFactory) error { - if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() { - l.NewIntelRdtManager = nil - } else { - l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { - return intelrdt.NewManager(config, id, path) - } - } - return nil -} - // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. func TmpfsRoot(l *LinuxFactory) error { mounted, err := mountinfo.Mounted(l.Root) @@ -136,9 +122,6 @@ type LinuxFactory struct { // Validator provides validation to container configurations. Validator validate.Validator - - // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. - NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager } func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { @@ -179,6 +162,12 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err) } if len(pids) != 0 { + if config.Cgroups.Systemd { + // systemd cgroup driver can't add a pid to an + // existing systemd unit and will return an + // error anyway, so let's error out early. + return nil, fmt.Errorf("container's cgroup is not empty: %d process(es) found", len(pids)) + } // TODO: return an error. logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids)) logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132") @@ -202,18 +191,16 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err return nil, err } c := &linuxContainer{ - id: id, - root: containerRoot, - config: config, - initPath: l.InitPath, - initArgs: l.InitArgs, - criuPath: l.CriuPath, - newuidmapPath: l.NewuidmapPath, - newgidmapPath: l.NewgidmapPath, - cgroupManager: cm, - } - if l.NewIntelRdtManager != nil { - c.intelRdtManager = l.NewIntelRdtManager(config, id, "") + id: id, + root: containerRoot, + config: config, + initPath: l.InitPath, + initArgs: l.InitArgs, + criuPath: l.CriuPath, + newuidmapPath: l.NewuidmapPath, + newgidmapPath: l.NewgidmapPath, + cgroupManager: cm, + intelRdtManager: intelrdt.NewManager(config, id, ""), } c.state = &stoppedState{c: c} return c, nil @@ -255,12 +242,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) { newuidmapPath: l.NewuidmapPath, newgidmapPath: l.NewgidmapPath, cgroupManager: cm, + intelRdtManager: intelrdt.NewManager(&state.Config, id, state.IntelRdtPath), root: containerRoot, created: state.Created, } - if l.NewIntelRdtManager != nil { - c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) - } c.state = &loadedState{c: c} if err := c.refreshState(); err != nil { return nil, err @@ -338,10 +323,9 @@ func (l *LinuxFactory) StartInitialization() (err error) { defer func() { if e := recover(); e != nil { - if e, ok := e.(error); ok { - err = fmt.Errorf("panic from initialization: %w, %s", e, debug.Stack()) + if ee, ok := e.(error); ok { + err = fmt.Errorf("panic from initialization: %w, %s", ee, debug.Stack()) } else { - //nolint:errorlint // here e is not of error type err = fmt.Errorf("panic from initialization: %v, %s", e, debug.Stack()) } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go index 2e4c5935..d9f18139 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -8,6 +8,7 @@ import ( "io" "net" "os" + "path/filepath" "strings" "unsafe" @@ -116,17 +117,17 @@ func populateProcessEnvironment(env []string) error { for _, pair := range env { p := strings.SplitN(pair, "=", 2) if len(p) < 2 { - return fmt.Errorf("invalid environment variable: %q", pair) + return errors.New("invalid environment variable: missing '='") } name, val := p[0], p[1] if name == "" { - return fmt.Errorf("environment variable name can't be empty: %q", pair) + return errors.New("invalid environment variable: name cannot be empty") } if strings.IndexByte(name, 0) >= 0 { - return fmt.Errorf("environment variable name can't contain null(\\x00): %q", pair) + return fmt.Errorf("invalid environment variable %q: name contains nul byte (\\x00)", name) } if strings.IndexByte(val, 0) >= 0 { - return fmt.Errorf("environment variable value can't contain null(\\x00): %q", pair) + return fmt.Errorf("invalid environment variable %q: value contains nul byte (\\x00)", name) } if err := os.Setenv(name, val); err != nil { return err @@ -135,6 +136,32 @@ func populateProcessEnvironment(env []string) error { return nil } +// verifyCwd ensures that the current directory is actually inside the mount +// namespace root of the current process. +func verifyCwd() error { + // getcwd(2) on Linux detects if cwd is outside of the rootfs of the + // current mount namespace root, and in that case prefixes "(unreachable)" + // to the returned string. glibc's getcwd(3) and Go's Getwd() both detect + // when this happens and return ENOENT rather than returning a non-absolute + // path. In both cases we can therefore easily detect if we have an invalid + // cwd by checking the return value of getcwd(3). See getcwd(3) for more + // details, and CVE-2024-21626 for the security issue that motivated this + // check. + // + // We have to use unix.Getwd() here because os.Getwd() has a workaround for + // $PWD which involves doing stat(.), which can fail if the current + // directory is inaccessible to the container process. + if wd, err := unix.Getwd(); errors.Is(err, unix.ENOENT) { + return errors.New("current working directory is outside of container mount namespace root -- possible container breakout detected") + } else if err != nil { + return fmt.Errorf("failed to verify if current working directory is safe: %w", err) + } else if !filepath.IsAbs(wd) { + // We shouldn't ever hit this, but check just in case. + return fmt.Errorf("current working directory is not absolute -- possible container breakout detected: cwd is %q", wd) + } + return nil +} + // finalizeNamespace drops the caps, sets the correct user // and working dir, and closes any leaked file descriptors // before executing the command inside the namespace @@ -193,6 +220,10 @@ func finalizeNamespace(config *initConfig) error { return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err) } } + // Make sure our final working directory is inside the container. + if err := verifyCwd(); err != nil { + return err + } if err := system.ClearKeepCaps(); err != nil { return fmt.Errorf("unable to clear keep caps: %w", err) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go index 8b6bf3ef..0f5c457b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/intelrdt/intelrdt.go @@ -1,11 +1,9 @@ package intelrdt import ( - "bufio" "bytes" "errors" "fmt" - "io" "os" "path/filepath" "strconv" @@ -13,6 +11,8 @@ import ( "sync" "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" "github.com/opencontainers/runc/libcontainer/configs" ) @@ -145,34 +145,31 @@ import ( * } */ -type Manager interface { - // Applies Intel RDT configuration to the process with the specified pid - Apply(pid int) error - - // Returns statistics for Intel RDT - GetStats() (*Stats, error) - - // Destroys the Intel RDT container-specific 'container_id' group - Destroy() error - - // Returns Intel RDT path to save in a state file and to be able to - // restore the object later - GetPath() string - - // Set Intel RDT "resource control" filesystem as configured. - Set(container *configs.Config) error -} - -// This implements interface Manager -type intelRdtManager struct { +type Manager struct { mu sync.Mutex config *configs.Config id string path string } -func NewManager(config *configs.Config, id string, path string) Manager { - return &intelRdtManager{ +// NewManager returns a new instance of Manager, or nil if the Intel RDT +// functionality is not specified in the config, available from hardware or +// enabled in the kernel. +func NewManager(config *configs.Config, id string, path string) *Manager { + if config.IntelRdt == nil { + return nil + } + if _, err := Root(); err != nil { + // Intel RDT is not available. + return nil + } + return newManager(config, id, path) +} + +// newManager is the same as NewManager, except it does not check if the feature +// is actually available. Used by unit tests that mock intelrdt paths. +func newManager(config *configs.Config, id string, path string) *Manager { + return &Manager{ config: config, id: id, path: path, @@ -188,71 +185,52 @@ var ( catEnabled bool // The flag to indicate if Intel RDT/MBA is enabled mbaEnabled bool - // The flag to indicate if Intel RDT/MBA Software Controller is enabled - mbaScEnabled bool // For Intel RDT initialization initOnce sync.Once - errNotFound = errors.New("Intel RDT resctrl mount point not found") + errNotFound = errors.New("Intel RDT not available") ) // Check if Intel RDT sub-features are enabled in featuresInit() func featuresInit() { initOnce.Do(func() { - // 1. Check if hardware and kernel support Intel RDT sub-features - flagsSet, err := parseCpuInfoFile("/proc/cpuinfo") - if err != nil { - return - } - - // 2. Check if Intel RDT "resource control" filesystem is available. + // 1. Check if Intel RDT "resource control" filesystem is available. // The user guarantees to mount the filesystem. root, err := Root() if err != nil { return } - // 3. Double check if Intel RDT sub-features are available in - // "resource control" filesystem. Intel RDT sub-features can be + // 2. Check if Intel RDT sub-features are available in "resource + // control" filesystem. Intel RDT sub-features can be // selectively disabled or enabled by kernel command line // (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel - if flagsSet.CAT { - if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil { - catEnabled = true - } + if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil { + catEnabled = true } - if mbaScEnabled { - // We confirm MBA Software Controller is enabled in step 2, - // MBA should be enabled because MBA Software Controller - // depends on MBA + if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil { mbaEnabled = true - } else if flagsSet.MBA { - if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil { - mbaEnabled = true - } } - if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT { - if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil { - return - } - enabledMonFeatures, err = getMonFeatures(root) - if err != nil { - return - } - if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes { - mbmEnabled = true - } - if enabledMonFeatures.llcOccupancy { - cmtEnabled = true - } + if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil { + return + } + enabledMonFeatures, err = getMonFeatures(root) + if err != nil { + return + } + if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes { + mbmEnabled = true + } + if enabledMonFeatures.llcOccupancy { + cmtEnabled = true } }) } -// Return the mount point path of Intel RDT "resource control" filesysem -func findIntelRdtMountpointDir(f io.Reader) (string, error) { - mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) { +// findIntelRdtMountpointDir returns the mount point of the Intel RDT "resource control" filesystem. +func findIntelRdtMountpointDir() (string, error) { + mi, err := mountinfo.GetMounts(func(m *mountinfo.Info) (bool, bool) { // similar to mountinfo.FSTypeFilter but stops after the first match if m.FSType == "resctrl" { return false, true // don't skip, stop @@ -266,97 +244,45 @@ func findIntelRdtMountpointDir(f io.Reader) (string, error) { return "", errNotFound } - // Check if MBA Software Controller is enabled through mount option "-o mba_MBps" - if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") { - mbaScEnabled = true - } - return mi[0].Mountpoint, nil } // For Root() use only. var ( - intelRdtRoot string - rootMu sync.Mutex + intelRdtRoot string + intelRdtRootErr error + rootOnce sync.Once ) +// The kernel creates this (empty) directory if resctrl is supported by the +// hardware and kernel. The user is responsible for mounting the resctrl +// filesystem, and they could mount it somewhere else if they wanted to. +const defaultResctrlMountpoint = "/sys/fs/resctrl" + // Root returns the Intel RDT "resource control" filesystem mount point. func Root() (string, error) { - rootMu.Lock() - defer rootMu.Unlock() - - if intelRdtRoot != "" { - return intelRdtRoot, nil - } - - f, err := os.Open("/proc/self/mountinfo") - if err != nil { - return "", err - } - root, err := findIntelRdtMountpointDir(f) - f.Close() - if err != nil { - return "", err - } - - if _, err := os.Stat(root); err != nil { - return "", err - } - - intelRdtRoot = root - return intelRdtRoot, nil -} - -type cpuInfoFlags struct { - CAT bool // Cache Allocation Technology - MBA bool // Memory Bandwidth Allocation - - // Memory Bandwidth Monitoring related. - MBMTotal bool - MBMLocal bool - - CMT bool // Cache Monitoring Technology -} - -func parseCpuInfoFile(path string) (cpuInfoFlags, error) { - infoFlags := cpuInfoFlags{} - - f, err := os.Open(path) - if err != nil { - return infoFlags, err - } - defer f.Close() - - s := bufio.NewScanner(f) - for s.Scan() { - line := s.Text() - - // Search "cat_l3" and "mba" flags in first "flags" line - if strings.HasPrefix(line, "flags") { - flags := strings.Split(line, " ") - // "cat_l3" flag for CAT and "mba" flag for MBA - for _, flag := range flags { - switch flag { - case "cat_l3": - infoFlags.CAT = true - case "mba": - infoFlags.MBA = true - case "cqm_mbm_total": - infoFlags.MBMTotal = true - case "cqm_mbm_local": - infoFlags.MBMLocal = true - case "cqm_occup_llc": - infoFlags.CMT = true - } + rootOnce.Do(func() { + // Does this system support resctrl? + var statfs unix.Statfs_t + if err := unix.Statfs(defaultResctrlMountpoint, &statfs); err != nil { + if errors.Is(err, unix.ENOENT) { + err = errNotFound } - return infoFlags, nil + intelRdtRootErr = err + return } - } - if err := s.Err(); err != nil { - return infoFlags, err - } - return infoFlags, nil + // Has the resctrl fs been mounted to the default mount point? + if statfs.Type == unix.RDTGROUP_SUPER_MAGIC { + intelRdtRoot = defaultResctrlMountpoint + return + } + + // The resctrl fs could have been mounted somewhere nonstandard. + intelRdtRoot, intelRdtRootErr = findIntelRdtMountpointDir() + }) + + return intelRdtRoot, intelRdtRootErr } // Gets a single uint64 value from the specified file. @@ -502,14 +428,8 @@ func IsMBAEnabled() bool { return mbaEnabled } -// Check if Intel RDT/MBA Software Controller is enabled -func IsMBAScEnabled() bool { - featuresInit() - return mbaScEnabled -} - // Get the path of the clos group in "resource control" filesystem that the container belongs to -func (m *intelRdtManager) getIntelRdtPath() (string, error) { +func (m *Manager) getIntelRdtPath() (string, error) { rootPath, err := Root() if err != nil { return "", err @@ -524,7 +444,7 @@ func (m *intelRdtManager) getIntelRdtPath() (string, error) { } // Applies Intel RDT configuration to the process with the specified pid -func (m *intelRdtManager) Apply(pid int) (err error) { +func (m *Manager) Apply(pid int) (err error) { // If intelRdt is not specified in config, we do nothing if m.config.IntelRdt == nil { return nil @@ -559,11 +479,11 @@ func (m *intelRdtManager) Apply(pid int) (err error) { } // Destroys the Intel RDT container-specific 'container_id' group -func (m *intelRdtManager) Destroy() error { +func (m *Manager) Destroy() error { // Don't remove resctrl group if closid has been explicitly specified. The // group is likely externally managed, i.e. by some other entity than us. // There are probably other containers/tasks sharing the same group. - if m.config.IntelRdt == nil || m.config.IntelRdt.ClosID == "" { + if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID == "" { m.mu.Lock() defer m.mu.Unlock() if err := os.RemoveAll(m.GetPath()); err != nil { @@ -576,7 +496,7 @@ func (m *intelRdtManager) Destroy() error { // Returns Intel RDT path to save in a state file and to be able to // restore the object later -func (m *intelRdtManager) GetPath() string { +func (m *Manager) GetPath() string { if m.path == "" { m.path, _ = m.getIntelRdtPath() } @@ -584,7 +504,7 @@ func (m *intelRdtManager) GetPath() string { } // Returns statistics for Intel RDT -func (m *intelRdtManager) GetStats() (*Stats, error) { +func (m *Manager) GetStats() (*Stats, error) { // If intelRdt is not specified in config if m.config.IntelRdt == nil { return nil, nil @@ -670,7 +590,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) { } // Set Intel RDT "resource control" filesystem as configured. -func (m *intelRdtManager) Set(container *configs.Config) error { +func (m *Manager) Set(container *configs.Config) error { // About L3 cache schema: // It has allocation bitmasks/values for L3 cache on each socket, // which contains L3 cache id and capacity bitmask (CBM). diff --git a/vendor/github.com/opencontainers/runc/libcontainer/mount_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/mount_linux.go index 5f49de96..948b6c0b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/mount_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/mount_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "io/fs" "strconv" "golang.org/x/sys/unix" @@ -81,3 +82,20 @@ func unmount(target string, flags int) error { } return nil } + +// syscallMode returns the syscall-specific mode bits from Go's portable mode bits. +// Copy from https://cs.opensource.google/go/go/+/refs/tags/go1.20.7:src/os/file_posix.go;l=61-75 +func syscallMode(i fs.FileMode) (o uint32) { + o |= uint32(i.Perm()) + if i&fs.ModeSetuid != 0 { + o |= unix.S_ISUID + } + if i&fs.ModeSetgid != 0 { + o |= unix.S_ISGID + } + if i&fs.ModeSticky != 0 { + o |= unix.S_ISVTX + } + // No mapping for Go's ModeTemporary (plan9 only). + return +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c index 4268ebda..d1b2d4c5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c @@ -151,7 +151,7 @@ static int is_self_cloned(void) * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for * this, because you cannot write to a sealed memfd no matter what (so * sharing it isn't a bad thing -- and an admin could bind-mount a sealed - * memfd to /usr/bin/runc to allow re-use). + * memfd to /usr/bin/runc to allow reuse). */ ret = fcntl(fd, F_GET_SEALS); if (ret >= 0) { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go index 446649af..0d9ceb9c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -299,7 +299,7 @@ type initProcess struct { logFilePair filePair config *initConfig manager cgroups.Manager - intelRdtManager intelrdt.Manager + intelRdtManager *intelrdt.Manager container *linuxContainer fds []string process *Process diff --git a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index c3f88fc7..c701d6a2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -459,11 +459,16 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { } return label.SetFileLabel(dest, mountLabel) case "tmpfs": - stat, err := os.Stat(dest) - if err != nil { + if stat, err := os.Stat(dest); err != nil { if err := os.MkdirAll(dest, 0o755); err != nil { return err } + } else { + dt := fmt.Sprintf("mode=%04o", syscallMode(stat.Mode())) + if m.Data != "" { + dt = dt + "," + m.Data + } + m.Data = dt } if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP { @@ -472,16 +477,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { err = mountPropagate(m, rootfs, mountLabel, nil) } - if err != nil { - return err - } - - if stat != nil { - if err = os.Chmod(dest, stat.Mode()); err != nil { - return err - } - } - return nil + return err case "bind": if err := prepareBindMount(m, rootfs, mountFd); err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go index 98e08e8f..2b15576a 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/config.go @@ -66,6 +66,7 @@ var archs = map[string]string{ "SCMP_ARCH_PPC": "ppc", "SCMP_ARCH_PPC64": "ppc64", "SCMP_ARCH_PPC64LE": "ppc64le", + "SCMP_ARCH_RISCV64": "riscv64", "SCMP_ARCH_S390": "s390", "SCMP_ARCH_S390X": "s390x", } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go index 7d4ec6a4..efe6dca5 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/seccomp/patchbpf/enosys_linux.go @@ -48,6 +48,13 @@ const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG; #endif const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER; +#ifndef AUDIT_ARCH_RISCV64 +#ifndef EM_RISCV +#define EM_RISCV 243 +#endif +#define AUDIT_ARCH_RISCV64 (EM_RISCV|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#endif + // We use the AUDIT_ARCH_* values because those are the ones used by the kernel // and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we // use so we get libseccomp's fallback definitions of AUDIT_ARCH_*. @@ -67,13 +74,14 @@ const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64; const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE; const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390; const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X; +const uint32_t C_AUDIT_ARCH_RISCV64 = AUDIT_ARCH_RISCV64; */ import "C" var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS) // This syscall is used for multiplexing "large" syscalls on s390(x). Unknown -// syscalls will end up with this syscall number, so we need to explcitly +// syscalls will end up with this syscall number, so we need to explicitly // return -ENOSYS for this syscall on those architectures. const s390xMultiplexSyscall libseccomp.ScmpSyscall = 0 @@ -202,6 +210,8 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) { return nativeArch(C.C_AUDIT_ARCH_S390), nil case libseccomp.ArchS390X: return nativeArch(C.C_AUDIT_ARCH_S390X), nil + case libseccomp.ArchRISCV64: + return nativeArch(C.C_AUDIT_ARCH_RISCV64), nil default: return invalidArch, fmt.Errorf("unknown architecture: %v", arch) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go index 09ab552b..d1bb1227 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/setns_init_linux.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "os" + "os/exec" "strconv" "github.com/opencontainers/selinux/go-selinux" @@ -14,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) // linuxSetnsInit performs the container's initialization for running a new process @@ -82,6 +84,21 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + + // Check for the arg before waiting to make sure it exists and it is + // returned as a create time error. + name, err := exec.LookPath(l.config.Args[0]) + if err != nil { + return err + } + // exec.LookPath in Go < 1.20 might return no error for an executable + // residing on a file system mounted with noexec flag, so perform this + // extra check now while we can still return a proper error. + // TODO: remove this once go < 1.20 is not supported. + if err := eaccess(name); err != nil { + return &os.PathError{Op: "eaccess", Path: name, Err: err} + } + // Set seccomp as close to execve as possible, so as few syscalls take // place afterward (reducing the amount of syscalls that users need to // enable in their seccomp profiles). @@ -101,5 +118,23 @@ func (l *linuxSetnsInit) Init() error { return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err} } - return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ()) + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } + return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go index c7ca4c8a..7dbfb869 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/specconv/spec_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/seccomp" + "github.com/opencontainers/runc/libcontainer/userns" libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" @@ -176,18 +177,19 @@ func KnownMountOptions() []string { // AllowedDevices is the set of devices which are automatically included for // all containers. // -// XXX (cyphar) -// This behaviour is at the very least "questionable" (if not outright -// wrong) according to the runtime-spec. +// # XXX (cyphar) // -// Yes, we have to include certain devices other than the ones the user -// specifies, but several devices listed here are not part of the spec -// (including "mknod for any device"?!). In addition, these rules are -// appended to the user-provided set which means that users *cannot disable -// this behaviour*. +// This behaviour is at the very least "questionable" (if not outright +// wrong) according to the runtime-spec. // -// ... unfortunately I'm too scared to change this now because who knows how -// many people depend on this (incorrect and arguably insecure) behaviour. +// Yes, we have to include certain devices other than the ones the user +// specifies, but several devices listed here are not part of the spec +// (including "mknod for any device"?!). In addition, these rules are +// appended to the user-provided set which means that users *cannot disable +// this behaviour*. +// +// ... unfortunately I'm too scared to change this now because who knows how +// many people depend on this (incorrect and arguably insecure) behaviour. var AllowedDevices = []*devices.Device{ // allow mknod for any device { @@ -925,9 +927,9 @@ next: func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { create := func(m specs.LinuxIDMapping) configs.IDMap { return configs.IDMap{ - HostID: int(m.HostID), - ContainerID: int(m.ContainerID), - Size: int(m.Size), + HostID: int64(m.HostID), + ContainerID: int64(m.ContainerID), + Size: int64(m.Size), } } if spec.Linux != nil { @@ -938,6 +940,40 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { config.GidMappings = append(config.GidMappings, create(m)) } } + if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" { + // Cache the current userns mappings in our configuration, so that we + // can calculate uid and gid mappings within runc. These mappings are + // never used for configuring the container if the path is set. + uidMap, gidMap, err := userns.GetUserNamespaceMappings(path) + if err != nil { + return fmt.Errorf("failed to cache mappings for userns: %w", err) + } + // We cannot allow uid or gid mappings to be set if we are also asked + // to join a userns. + if config.UidMappings != nil || config.GidMappings != nil { + // FIXME: It turns out that containerd and CRIO pass both a userns + // path and the mappings of the namespace in the same config.json. + // Such a configuration is technically not valid, but we used to + // require mappings be specified, and thus users worked around our + // bug -- so we can't regress it at the moment. But we also don't + // want to produce broken behaviour if the mapping doesn't match + // the userns. So (for now) we output a warning if the actual + // userns mappings match the configuration, otherwise we return an + // error. + if !userns.IsSameMapping(uidMap, config.UidMappings) || + !userns.IsSameMapping(gidMap, config.GidMappings) { + return errors.New("user namespaces enabled, but both namespace path and non-matching mapping specified -- you may only provide one") + } + logrus.Warnf("config.json has both a userns path to join and a matching userns mapping specified -- you may only provide one. Future versions of runc may return an error with this configuration, please report a bug on if you see this warning and cannot update your configuration.") + } + + config.UidMappings = uidMap + config.GidMappings = gidMap + logrus.WithFields(logrus.Fields{ + "uid_map": uidMap, + "gid_map": gidMap, + }).Debugf("config uses path-based userns configuration -- current uid and gid mappings cached") + } rootUID, err := config.HostRootUID() if err != nil { return err diff --git a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index 081d1503..d1d94352 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -17,6 +17,7 @@ import ( "github.com/opencontainers/runc/libcontainer/keys" "github.com/opencontainers/runc/libcontainer/seccomp" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type linuxStandardInit struct { @@ -198,11 +199,12 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } - // exec.LookPath might return no error for an executable residing on a - // file system mounted with noexec flag, so perform this extra check - // now while we can still return a proper error. - if err := system.Eaccess(name); err != nil { - return &os.PathError{Op: "exec", Path: name, Err: err} + // exec.LookPath in Go < 1.20 might return no error for an executable + // residing on a file system mounted with noexec flag, so perform this + // extra check now while we can still return a proper error. + // TODO: remove this once go < 1.20 is not supported. + if err := eaccess(name); err != nil { + return &os.PathError{Op: "eaccess", Path: name, Err: err} } // Set seccomp as close to execve as possible, so as few syscalls take @@ -257,5 +259,23 @@ func (l *linuxStandardInit) Init() error { return err } + // Close all file descriptors we are not passing to the container. This is + // necessary because the execve target could use internal runc fds as the + // execve path, potentially giving access to binary files from the host + // (which can then be opened by container processes, leading to container + // escapes). Note that because this operation will close any open file + // descriptors that are referenced by (*os.File) handles from underneath + // the Go runtime, we must not do any file operations after this point + // (otherwise the (*os.File) finaliser could close the wrong file). See + // CVE-2024-21626 for more information as to why this protection is + // necessary. + // + // This is not needed for runc-dmz, because the extra execve(2) step means + // that all O_CLOEXEC file descriptors have already been closed and thus + // the second execve(2) from runc-dmz cannot access internal file + // descriptors from runc. + if err := utils.UnsafeCloseFrom(l.config.PassedFilesCount + 3); err != nil { + return err + } return system.Exec(name, l.config.Args[0:], os.Environ()) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/sync.go b/vendor/github.com/opencontainers/runc/libcontainer/sync.go index c9a23ef3..25dc2863 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/sync.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/sync.go @@ -15,16 +15,16 @@ type syncType string // during container setup. They come in pairs (with procError being a generic // response which is followed by an &initError). // -// [ child ] <-> [ parent ] +// [ child ] <-> [ parent ] // -// procHooks --> [run hooks] -// <-- procResume +// procHooks --> [run hooks] +// <-- procResume // -// procReady --> [final setup] -// <-- procRun +// procReady --> [final setup] +// <-- procRun // -// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] -// <-- procSeccompDone +// procSeccomp --> [pick up seccomp fd with pidfd_getfd()] +// <-- procSeccompDone const ( procError syncType = "procError" procReady syncType = "procReady" diff --git a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go index 039059a4..e1d6eb18 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -31,25 +31,6 @@ func (p ParentDeathSignal) Set() error { return SetParentDeathSignal(uintptr(p)) } -// Eaccess is similar to unix.Access except for setuid/setgid binaries -// it checks against the effective (rather than real) uid and gid. -func Eaccess(path string) error { - err := unix.Faccessat2(unix.AT_FDCWD, path, unix.X_OK, unix.AT_EACCESS) - if err != unix.ENOSYS && err != unix.EPERM { //nolint:errorlint // unix errors are bare - return err - } - - // Faccessat2() not available; check if we are a set[ug]id binary. - if os.Getuid() == os.Geteuid() && os.Getgid() == os.Getegid() { - // For a non-set[ug]id binary, use access(2). - return unix.Access(path, unix.X_OK) - } - - // For a setuid/setgid binary, there is no fallback way - // so assume we can execute the binary. - return nil -} - func Execv(cmd string, args []string, env []string) error { name, err := exec.LookPath(cmd) if err != nil { diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 2473c5ea..984466d1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -201,7 +201,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) { if err != nil { // We should return no error if EOF is reached // without a match. - if err == io.EOF { //nolint:errorlint // comparison with io.EOF is legit, https://github.com/polyfloyd/go-errorlint/pull/12 + if err == io.EOF { err = nil } return out, err @@ -280,13 +280,13 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath // found in any entry in passwd and group respectively. // // Examples of valid user specifications are: -// * "" -// * "user" -// * "uid" -// * "user:group" -// * "uid:gid -// * "user:gid" -// * "uid:group" +// - "" +// - "user" +// - "uid" +// - "user:group" +// - "uid:gid +// - "user:gid" +// - "uid:group" // // It should be noted that if you specify a numeric user or group id, they will // not be evaluated as usernames (only the metadata will be filled). So attempting diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c new file mode 100644 index 00000000..84f2c618 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps.c @@ -0,0 +1,79 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include + +/* + * All of the code here is run inside an aync-signal-safe context, so we need + * to be careful to not call any functions that could cause issues. In theory, + * since we are a Go program, there are fewer restrictions in practice, it's + * better to be safe than sorry. + * + * The only exception is exit, which we need to call to make sure we don't + * return into runc. + */ + +void bail(int pipefd, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + vdprintf(pipefd, fmt, args); + va_end(args); + + exit(1); +} + +int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd) +{ + char buffer[4096] = { 0 }; + + pid_t child = fork(); + if (child != 0) + return child; + /* in child */ + + /* Join the target userns. */ + int nsfd = open(userns_path, O_RDONLY); + if (nsfd < 0) + bail(errfd, "open userns path %s failed: %m", userns_path); + + int err = setns(nsfd, CLONE_NEWUSER); + if (err < 0) + bail(errfd, "setns %s failed: %m", userns_path); + + close(nsfd); + + /* Pipe the requested file contents. */ + int fd = open(path, O_RDONLY); + if (fd < 0) + bail(errfd, "open %s in userns %s failed: %m", path, userns_path); + + int nread, ntotal = 0; + while ((nread = read(fd, buffer, sizeof(buffer))) != 0) { + if (nread < 0) + bail(errfd, "read bytes from %s failed (after %d total bytes read): %m", path, ntotal); + ntotal += nread; + + int nwritten = 0; + while (nwritten < nread) { + int n = write(outfd, buffer, nread - nwritten); + if (n < 0) + bail(errfd, "write %d bytes from %s failed (after %d bytes written): %m", + nread - nwritten, path, nwritten); + nwritten += n; + } + if (nread != nwritten) + bail(errfd, "mismatch for bytes read and written: %d read != %d written", nread, nwritten); + } + + close(fd); + close(outfd); + close(errfd); + + /* We must exit here, otherwise we would return into a forked runc. */ + exit(0); +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go new file mode 100644 index 00000000..7a8c2b02 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_maps_linux.go @@ -0,0 +1,186 @@ +//go:build linux + +package userns + +import ( + "bufio" + "bytes" + "fmt" + "io" + "os" + "unsafe" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" +) + +/* +#include +extern int spawn_userns_cat(char *userns_path, char *path, int outfd, int errfd); +*/ +import "C" + +func parseIdmapData(data []byte) (ms []configs.IDMap, err error) { + scanner := bufio.NewScanner(bytes.NewReader(data)) + for scanner.Scan() { + var m configs.IDMap + line := scanner.Text() + if _, err := fmt.Sscanf(line, "%d %d %d", &m.ContainerID, &m.HostID, &m.Size); err != nil { + return nil, fmt.Errorf("parsing id map failed: invalid format in line %q: %w", line, err) + } + ms = append(ms, m) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("parsing id map failed: %w", err) + } + return ms, nil +} + +// Do something equivalent to nsenter --user= cat , but more +// efficiently. Returns the contents of the requested file from within the user +// namespace. +func spawnUserNamespaceCat(nsPath string, path string) ([]byte, error) { + rdr, wtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create pipe for userns spawn failed: %w", err) + } + defer rdr.Close() + defer wtr.Close() + + errRdr, errWtr, err := os.Pipe() + if err != nil { + return nil, fmt.Errorf("create error pipe for userns spawn failed: %w", err) + } + defer errRdr.Close() + defer errWtr.Close() + + cNsPath := C.CString(nsPath) + defer C.free(unsafe.Pointer(cNsPath)) + cPath := C.CString(path) + defer C.free(unsafe.Pointer(cPath)) + + childPid := C.spawn_userns_cat(cNsPath, cPath, C.int(wtr.Fd()), C.int(errWtr.Fd())) + + if childPid < 0 { + return nil, fmt.Errorf("failed to spawn fork for userns") + } else if childPid == 0 { + // this should never happen + panic("runc executing inside fork child -- unsafe state!") + } + + // We are in the parent -- close the write end of the pipe before reading. + wtr.Close() + output, err := io.ReadAll(rdr) + rdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn failed: %w", err) + } + + // Ditto for the error pipe. + errWtr.Close() + errOutput, err := io.ReadAll(errRdr) + errRdr.Close() + if err != nil { + return nil, fmt.Errorf("reading from userns spawn error pipe failed: %w", err) + } + errOutput = bytes.TrimSpace(errOutput) + + // Clean up the child. + child, err := os.FindProcess(int(childPid)) + if err != nil { + return nil, fmt.Errorf("could not find userns spawn process: %w", err) + } + state, err := child.Wait() + if err != nil { + return nil, fmt.Errorf("failed to wait for userns spawn process: %w", err) + } + if !state.Success() { + errStr := string(errOutput) + if errStr == "" { + errStr = fmt.Sprintf("unknown error (status code %d)", state.ExitCode()) + } + return nil, fmt.Errorf("userns spawn: %s", errStr) + } else if len(errOutput) > 0 { + // We can just ignore weird output in the error pipe if the process + // didn't bail(), but for completeness output for debugging. + logrus.Debugf("userns spawn succeeded but unexpected error message found: %s", string(errOutput)) + } + // The subprocess succeeded, return whatever it wrote to the pipe. + return output, nil +} + +func GetUserNamespaceMappings(nsPath string) (uidMap, gidMap []configs.IDMap, err error) { + var ( + pid int + extra rune + tryFastPath bool + ) + + // nsPath is usually of the form /proc//ns/user, which means that we + // already have a pid that is part of the user namespace and thus we can + // just use the pid to read from /proc//*id_map. + // + // Note that Sscanf doesn't consume the whole input, so we check for any + // trailing data with %c. That way, we can be sure the pattern matched + // /proc/$pid/ns/user _exactly_ iff n === 1. + if n, _ := fmt.Sscanf(nsPath, "/proc/%d/ns/user%c", &pid, &extra); n == 1 { + tryFastPath = pid > 0 + } + + for _, mapType := range []struct { + name string + idMap *[]configs.IDMap + }{ + {"uid_map", &uidMap}, + {"gid_map", &gidMap}, + } { + var mapData []byte + + if tryFastPath { + path := fmt.Sprintf("/proc/%d/%s", pid, mapType.name) + data, err := os.ReadFile(path) + if err != nil { + // Do not error out here -- we need to try the slow path if the + // fast path failed. + logrus.Debugf("failed to use fast path to read %s from userns %s (error: %s), falling back to slow userns-join path", mapType.name, nsPath, err) + } else { + mapData = data + } + } else { + logrus.Debugf("cannot use fast path to read %s from userns %s, falling back to slow userns-join path", mapType.name, nsPath) + } + + if mapData == nil { + // We have to actually join the namespace if we cannot take the + // fast path. The path is resolved with respect to the child + // process, so just use /proc/self. + data, err := spawnUserNamespaceCat(nsPath, "/proc/self/"+mapType.name) + if err != nil { + return nil, nil, err + } + mapData = data + } + idMap, err := parseIdmapData(mapData) + if err != nil { + return nil, nil, fmt.Errorf("failed to parse %s of userns %s: %w", mapType.name, nsPath, err) + } + *mapType.idMap = idMap + } + + return uidMap, gidMap, nil +} + +// IsSameMapping returns whether or not the two id mappings are the same. Note +// that if the order of the mappings is different, or a mapping has been split, +// the mappings will be considered different. +func IsSameMapping(a, b []configs.IDMap) bool { + if len(a) != len(b) { + return false + } + for idx := range a { + if a[idx] != b[idx] { + return false + } + } + return true +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go index 220d0b43..bf3237a2 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -7,6 +7,7 @@ import ( "fmt" "os" "strconv" + _ "unsafe" // for go:linkname "golang.org/x/sys/unix" ) @@ -23,9 +24,11 @@ func EnsureProcHandle(fh *os.File) error { return nil } -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { +type fdFunc func(fd int) + +// fdRangeFrom calls the passed fdFunc for each file descriptor that is open in +// the current process. +func fdRangeFrom(minFd int, fn fdFunc) error { fdDir, err := os.Open("/proc/self/fd") if err != nil { return err @@ -50,15 +53,60 @@ func CloseExecFrom(minFd int) error { if fd < minFd { continue } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) + // Ignore the file descriptor we used for readdir, as it will be closed + // when we return. + if uintptr(fd) == fdDir.Fd() { + continue + } + // Run the closure. + fn(fd) } return nil } +// CloseExecFrom sets the O_CLOEXEC flag on all file descriptors greater or +// equal to minFd in the current process. +func CloseExecFrom(minFd int) error { + return fdRangeFrom(minFd, unix.CloseOnExec) +} + +//go:linkname runtime_IsPollDescriptor internal/poll.IsPollDescriptor + +// In order to make sure we do not close the internal epoll descriptors the Go +// runtime uses, we need to ensure that we skip descriptors that match +// "internal/poll".IsPollDescriptor. Yes, this is a Go runtime internal thing, +// unfortunately there's no other way to be sure we're only keeping the file +// descriptors the Go runtime needs. Hopefully nothing blows up doing this... +func runtime_IsPollDescriptor(fd uintptr) bool //nolint:revive + +// UnsafeCloseFrom closes all file descriptors greater or equal to minFd in the +// current process, except for those critical to Go's runtime (such as the +// netpoll management descriptors). +// +// NOTE: That this function is incredibly dangerous to use in most Go code, as +// closing file descriptors from underneath *os.File handles can lead to very +// bad behaviour (the closed file descriptor can be re-used and then any +// *os.File operations would apply to the wrong file). This function is only +// intended to be called from the last stage of runc init. +func UnsafeCloseFrom(minFd int) error { + // We must not close some file descriptors. + return fdRangeFrom(minFd, func(fd int) { + if runtime_IsPollDescriptor(uintptr(fd)) { + // These are the Go runtimes internal netpoll file descriptors. + // These file descriptors are operated on deep in the Go scheduler, + // and closing those files from underneath Go can result in panics. + // There is no issue with keeping them because they are not + // executable and are not useful to an attacker anyway. Also we + // don't have any choice. + return + } + // There's nothing we can do about errors from close(2), and the + // only likely error to be seen is EBADF which indicates the fd was + // already closed (in which case, we got what we wanted). + _ = unix.Close(fd) + }) +} + // NewSockPair returns a new unix socket pair func NewSockPair(name string) (parent *os.File, child *os.File, err error) { fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) diff --git a/vendor/modules.txt b/vendor/modules.txt index 6bd31da7..e96b1f2e 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -557,7 +557,7 @@ github.com/moby/term/windows # github.com/morikuni/aec v1.0.0 ## explicit github.com/morikuni/aec -# github.com/mrunalp/fileutils v0.5.0 +# github.com/mrunalp/fileutils v0.5.1 ## explicit; go 1.13 github.com/mrunalp/fileutils # github.com/oklog/run v1.1.0 @@ -570,8 +570,8 @@ github.com/opencontainers/go-digest ## explicit; go 1.17 github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 -# github.com/opencontainers/runc v1.1.5 -## explicit; go 1.16 +# github.com/opencontainers/runc v1.1.12 +## explicit; go 1.17 github.com/opencontainers/runc/libcontainer github.com/opencontainers/runc/libcontainer/apparmor github.com/opencontainers/runc/libcontainer/capabilities