/*
   Copyright The containerd Authors.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
*/
|
|
|
|
package cgroup2
|
|
|
|
import (
|
|
"bufio"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
"unsafe"
|
|
|
|
"github.com/containerd/cgroups/v3/cgroup2/stats"
|
|
|
|
"github.com/godbus/dbus/v5"
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
const (
	// cgroupProcs is the file name "cgroup.procs".
	cgroupProcs = "cgroup.procs"
	// cgroupThreads is the file name "cgroup.threads".
	cgroupThreads = "cgroup.threads"
	// defaultDirPerm is the permission mode rwxr-xr-x; presumably applied to
	// directories created by this package (usage is outside this section).
	defaultDirPerm = 0o755
)
|
|
|
|
// defaultFilePerm is a variable rather than a constant so that the test
// framework can change the file mode of every file created while the tests
// are running. The difference between the tests and real-world use is that
// files such as "cgroup.procs" already exist when writing to a real cgroup
// filesystem, but do not exist beforehand when running in the tests.
// The test code sets this to a non-zero value; its default is zero.
var defaultFilePerm os.FileMode
|
|
|
|
// remove deletes the cgroup directory at path, retrying when the removal
// fails. Up to five attempts are made; before every attempt after the first
// it sleeps, starting at 10ms and doubling each time. Any error returned by
// os.RemoveAll triggers a retry. If all attempts fail, the last error is
// returned wrapped with the path.
func remove(path string) error {
	const attempts = 5

	var (
		lastErr error
		backoff = 10 * time.Millisecond
	)
	for attempt := 0; attempt < attempts; attempt++ {
		if attempt > 0 {
			time.Sleep(backoff)
			backoff *= 2
		}
		lastErr = os.RemoveAll(path)
		if lastErr == nil {
			return nil
		}
	}
	return fmt.Errorf("cgroups: unable to remove path %q: %w", path, lastErr)
}
|
|
|
|
// parseCgroupProcsFile reads the file at path (for example
// /sys/fs/cgroup/$GROUPPATH/cgroup.procs) and returns the unsigned decimal
// number found on each non-empty line, in file order. It returns an error if
// the file cannot be opened, a line is not a valid unsigned integer, or
// scanning fails; no partial result is returned in that case.
func parseCgroupProcsFile(path string) ([]uint64, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	var pids []uint64
	scanner := bufio.NewScanner(file)
	for scanner.Scan() {
		line := scanner.Text()
		if line == "" {
			// Blank lines carry no ID.
			continue
		}
		pid, parseErr := strconv.ParseUint(line, 10, 0)
		if parseErr != nil {
			return nil, parseErr
		}
		pids = append(pids, pid)
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return nil, scanErr
	}
	return pids, nil
}
|
|
|
|
func parseKV(raw string) (string, uint64, error) {
|
|
parts := strings.Fields(raw)
|
|
if len(parts) != 2 {
|
|
return "", 0, ErrInvalidFormat
|
|
}
|
|
v, err := parseUint(parts[1], 10, 64)
|
|
return parts[0], v, err
|
|
}
|
|
|
|
// parseUint parses s as an unsigned integer in the given base and bit size,
// like strconv.ParseUint, except that negative numbers are reported as 0
// with a nil error instead of failing.
//
// Two negative cases are recognised:
//  1. s is a negative value that fits in a signed integer of bitSize bits.
//  2. s is a negative value too small for bitSize bits; strconv.ParseInt
//     then reports a range error together with a negative (minimum) value.
//
// Any other failure returns 0 and the original strconv.ParseUint error.
func parseUint(s string, base, bitSize int) (uint64, error) {
	v, err := strconv.ParseUint(s, base, bitSize)
	if err == nil {
		return v, nil
	}

	intValue, intErr := strconv.ParseInt(s, base, bitSize)
	switch {
	case intErr == nil && intValue < 0:
		return 0, nil
	case errors.Is(intErr, strconv.ErrRange) && intValue < 0:
		// errors.Is replaces an unchecked type assertion on intErr, so an
		// unexpected error type can never cause a panic here.
		return 0, nil
	}
	return 0, err
}
|
|
|
|
// parseCgroupFile parses /proc/PID/cgroup file and return string
|
|
func parseCgroupFile(path string) (string, error) {
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
defer f.Close()
|
|
return parseCgroupFromReader(f)
|
|
}
|
|
|
|
// parseCgroupFromReader scans r line by line, splitting each line on ":"
// into at most three fields. It returns the third field of the first line
// whose first field is "0" and whose second field is empty, e.g.
// "0::/user.slice/user-1001.slice/session-1.scope".
//
// A line with fewer than three fields is reported as an error immediately.
// If no line matches, the scanner error (if any) or a "cgroup path not
// found" error is returned.
func parseCgroupFromReader(r io.Reader) (string, error) {
	scanner := bufio.NewScanner(r)
	for scanner.Scan() {
		line := scanner.Text()
		fields := strings.SplitN(line, ":", 3)
		if len(fields) < 3 {
			return "", fmt.Errorf("invalid cgroup entry: %q", line)
		}
		first, second, cgroupPath := fields[0], fields[1], fields[2]
		if first != "0" || second != "" {
			continue
		}
		return cgroupPath, nil
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return "", scanErr
	}
	return "", fmt.Errorf("cgroup path not found")
}
|
|
|
|
// ToResources converts the oci LinuxResources struct into a
|
|
// v2 Resources type for use with this package.
|
|
//
|
|
// converting cgroups configuration from v1 to v2
|
|
// ref: https://github.com/containers/crun/blob/master/crun.1.md#cgroup-v2
|
|
func ToResources(spec *specs.LinuxResources) *Resources {
|
|
var resources Resources
|
|
if cpu := spec.CPU; cpu != nil {
|
|
resources.CPU = &CPU{
|
|
Cpus: cpu.Cpus,
|
|
Mems: cpu.Mems,
|
|
}
|
|
if shares := cpu.Shares; shares != nil {
|
|
convertedWeight := 1 + ((*shares-2)*9999)/262142
|
|
resources.CPU.Weight = &convertedWeight
|
|
}
|
|
if period := cpu.Period; period != nil {
|
|
resources.CPU.Max = NewCPUMax(cpu.Quota, period)
|
|
}
|
|
}
|
|
if mem := spec.Memory; mem != nil {
|
|
resources.Memory = &Memory{}
|
|
if swap := mem.Swap; swap != nil {
|
|
resources.Memory.Swap = swap
|
|
}
|
|
if l := mem.Limit; l != nil {
|
|
resources.Memory.Max = l
|
|
}
|
|
if l := mem.Reservation; l != nil {
|
|
resources.Memory.Low = l
|
|
}
|
|
}
|
|
if hugetlbs := spec.HugepageLimits; hugetlbs != nil {
|
|
hugeTlbUsage := HugeTlb{}
|
|
for _, hugetlb := range hugetlbs {
|
|
hugeTlbUsage = append(hugeTlbUsage, HugeTlbEntry{
|
|
HugePageSize: hugetlb.Pagesize,
|
|
Limit: hugetlb.Limit,
|
|
})
|
|
}
|
|
resources.HugeTlb = &hugeTlbUsage
|
|
}
|
|
if pids := spec.Pids; pids != nil {
|
|
resources.Pids = &Pids{
|
|
Max: pids.Limit,
|
|
}
|
|
}
|
|
if i := spec.BlockIO; i != nil {
|
|
resources.IO = &IO{}
|
|
if i.Weight != nil {
|
|
resources.IO.BFQ.Weight = 1 + (*i.Weight-10)*9999/990
|
|
}
|
|
for t, devices := range map[IOType][]specs.LinuxThrottleDevice{
|
|
ReadBPS: i.ThrottleReadBpsDevice,
|
|
WriteBPS: i.ThrottleWriteBpsDevice,
|
|
ReadIOPS: i.ThrottleReadIOPSDevice,
|
|
WriteIOPS: i.ThrottleWriteIOPSDevice,
|
|
} {
|
|
for _, d := range devices {
|
|
resources.IO.Max = append(resources.IO.Max, Entry{
|
|
Type: t,
|
|
Major: d.Major,
|
|
Minor: d.Minor,
|
|
Rate: d.Rate,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
if i := spec.Rdma; i != nil {
|
|
resources.RDMA = &RDMA{}
|
|
for device, value := range spec.Rdma {
|
|
if device != "" && (value.HcaHandles != nil && value.HcaObjects != nil) {
|
|
resources.RDMA.Limit = append(resources.RDMA.Limit, RDMAEntry{
|
|
Device: device,
|
|
HcaHandles: *value.HcaHandles,
|
|
HcaObjects: *value.HcaObjects,
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
return &resources
|
|
}
|
|
|
|
// Gets uint64 parsed content of single value cgroup stat file
|
|
func getStatFileContentUint64(filePath string) uint64 {
|
|
f, err := os.Open(filePath)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
defer f.Close()
|
|
|
|
// We expect an unsigned 64 bit integer, or a "max" string
|
|
// in some cases.
|
|
buf := make([]byte, 32)
|
|
n, err := f.Read(buf)
|
|
if err != nil {
|
|
return 0
|
|
}
|
|
|
|
trimmed := strings.TrimSpace(string(buf[:n]))
|
|
if trimmed == "max" {
|
|
return math.MaxUint64
|
|
}
|
|
|
|
res, err := parseUint(trimmed, 10, 64)
|
|
if err != nil {
|
|
logrus.Errorf("unable to parse %q as a uint from Cgroup file %q", trimmed, filePath)
|
|
return res
|
|
}
|
|
|
|
return res
|
|
}
|
|
|
|
func readIoStats(path string) []*stats.IOEntry {
|
|
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
|
|
var usage []*stats.IOEntry
|
|
fpath := filepath.Join(path, "io.stat")
|
|
currentData, err := os.ReadFile(fpath)
|
|
if err != nil {
|
|
return usage
|
|
}
|
|
entries := strings.Split(string(currentData), "\n")
|
|
|
|
for _, entry := range entries {
|
|
parts := strings.Split(entry, " ")
|
|
if len(parts) < 2 {
|
|
continue
|
|
}
|
|
majmin := strings.Split(parts[0], ":")
|
|
if len(majmin) != 2 {
|
|
continue
|
|
}
|
|
major, err := strconv.ParseUint(majmin[0], 10, 0)
|
|
if err != nil {
|
|
return usage
|
|
}
|
|
minor, err := strconv.ParseUint(majmin[1], 10, 0)
|
|
if err != nil {
|
|
return usage
|
|
}
|
|
parts = parts[1:]
|
|
ioEntry := stats.IOEntry{
|
|
Major: major,
|
|
Minor: minor,
|
|
}
|
|
for _, s := range parts {
|
|
keyPairValue := strings.Split(s, "=")
|
|
if len(keyPairValue) != 2 {
|
|
continue
|
|
}
|
|
v, err := strconv.ParseUint(keyPairValue[1], 10, 0)
|
|
if err != nil {
|
|
continue
|
|
}
|
|
switch keyPairValue[0] {
|
|
case "rbytes":
|
|
ioEntry.Rbytes = v
|
|
case "wbytes":
|
|
ioEntry.Wbytes = v
|
|
case "rios":
|
|
ioEntry.Rios = v
|
|
case "wios":
|
|
ioEntry.Wios = v
|
|
}
|
|
}
|
|
usage = append(usage, &ioEntry)
|
|
}
|
|
return usage
|
|
}
|
|
|
|
func rdmaStats(filepath string) []*stats.RdmaEntry {
|
|
currentData, err := os.ReadFile(filepath)
|
|
if err != nil {
|
|
return []*stats.RdmaEntry{}
|
|
}
|
|
return toRdmaEntry(strings.Split(string(currentData), "\n"))
|
|
}
|
|
|
|
func parseRdmaKV(raw string, entry *stats.RdmaEntry) {
|
|
var value uint64
|
|
var err error
|
|
|
|
parts := strings.Split(raw, "=")
|
|
switch len(parts) {
|
|
case 2:
|
|
if parts[1] == "max" {
|
|
value = math.MaxUint32
|
|
} else {
|
|
value, err = parseUint(parts[1], 10, 32)
|
|
if err != nil {
|
|
return
|
|
}
|
|
}
|
|
if parts[0] == "hca_handle" {
|
|
entry.HcaHandles = uint32(value)
|
|
} else if parts[0] == "hca_object" {
|
|
entry.HcaObjects = uint32(value)
|
|
}
|
|
}
|
|
}
|
|
|
|
func toRdmaEntry(strEntries []string) []*stats.RdmaEntry {
|
|
var rdmaEntries []*stats.RdmaEntry
|
|
for i := range strEntries {
|
|
parts := strings.Fields(strEntries[i])
|
|
switch len(parts) {
|
|
case 3:
|
|
entry := new(stats.RdmaEntry)
|
|
entry.Device = parts[0]
|
|
parseRdmaKV(parts[1], entry)
|
|
parseRdmaKV(parts[2], entry)
|
|
|
|
rdmaEntries = append(rdmaEntries, entry)
|
|
default:
|
|
continue
|
|
}
|
|
}
|
|
return rdmaEntries
|
|
}
|
|
|
|
// isUnitExists returns true if the error is that a systemd unit already exists.
|
|
func isUnitExists(err error) bool {
|
|
if err != nil {
|
|
if dbusError, ok := err.(dbus.Error); ok {
|
|
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// systemdUnitFromPath returns the final element of path, i.e. everything
// after the last path separator. A path ending in a separator yields "".
func systemdUnitFromPath(path string) string {
	_, name := filepath.Split(path)
	return name
}
|
|
|
|
func readHugeTlbStats(path string) []*stats.HugeTlbStat {
|
|
hpSizes := hugePageSizes()
|
|
usage := make([]*stats.HugeTlbStat, len(hpSizes))
|
|
for idx, pagesize := range hpSizes {
|
|
usage[idx] = &stats.HugeTlbStat{
|
|
Max: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".max")),
|
|
Current: getStatFileContentUint64(filepath.Join(path, "hugetlb."+pagesize+".current")),
|
|
Pagesize: pagesize,
|
|
}
|
|
}
|
|
return usage
|
|
}
|
|
|
|
// hPageSizes holds the huge page sizes returned by hugePageSizes. It is
// assigned inside initHPSOnce.Do and stays nil if discovery fails early.
var hPageSizes []string

// initHPSOnce ensures the huge page size discovery in hugePageSizes runs at
// most once.
var initHPSOnce sync.Once
|
|
|
|
// The following idea and implementation is taken pretty much line for line from
|
|
// runc. Because the hugetlb files are well known, and the only variable thrown in
|
|
// the mix is what huge page sizes you have on your host, this lends itself well
|
|
// to doing the work to find the files present once, and then re-using this. This
|
|
// saves a os.Readdirnames(0) call to search for hugeltb files on every `manager.Stat`
|
|
// call.
|
|
// https://github.com/opencontainers/runc/blob/3a2c0c2565644d8a7e0f1dd594a060b21fa96cf1/libcontainer/cgroups/utils.go#L301
|
|
func hugePageSizes() []string {
|
|
initHPSOnce.Do(func() {
|
|
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
|
if err != nil {
|
|
return
|
|
}
|
|
files, err := dir.Readdirnames(0)
|
|
dir.Close()
|
|
if err != nil {
|
|
return
|
|
}
|
|
|
|
hPageSizes, err = getHugePageSizeFromFilenames(files)
|
|
if err != nil {
|
|
logrus.Warnf("hugePageSizes: %s", err)
|
|
}
|
|
})
|
|
|
|
return hPageSizes
|
|
}
|
|
|
|
// getHugePageSizeFromFilenames turns directory entry names such as
// "hugepages-1048576kB" into human-readable sizes such as "1GB".
//
// Names without the "hugepages-" prefix are silently ignored. Names whose
// remainder does not end in "kB", or whose numeric part does not parse, are
// skipped and reported through the returned error; only the first such
// problem is kept. The sizes that were parsed are returned even when the
// error is non-nil.
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
	const (
		namePrefix = "hugepages-"
		unitSuffix = "kB"
	)

	var firstWarn error
	// recordWarn keeps only the first problem encountered.
	recordWarn := func(err error) {
		if firstWarn == nil {
			firstWarn = err
		}
	}

	sizes := make([]string, 0, len(fileNames))
	for _, name := range fileNames {
		// example: hugepages-1048576kB
		if !strings.HasPrefix(name, namePrefix) {
			// Unexpected file name: no prefix found, ignore it.
			continue
		}
		rest := name[len(namePrefix):]

		// In all known versions of Linux up to 6.3 the suffix is always
		// "kB". Anything else produces an error, but processing continues.
		if !strings.HasSuffix(rest, unitSuffix) {
			recordWarn(errors.New(name + `: invalid suffix (expected "kB")`))
			continue
		}
		digits := rest[:len(rest)-len(unitSuffix)]

		kb, err := strconv.Atoi(digits)
		if err != nil {
			recordWarn(fmt.Errorf("%s: %w", name, err))
			continue
		}

		// Modeled after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
		// but in our case the size is in KB already.
		var label string
		switch {
		case kb >= (1 << 20):
			label = strconv.Itoa(kb>>20) + "GB"
		case kb >= (1 << 10):
			label = strconv.Itoa(kb>>10) + "MB"
		default:
			label = digits + "KB"
		}
		sizes = append(sizes, label)
	}

	return sizes, firstWarn
}
|
|
|
|
func getSubreaper() (int, error) {
|
|
var i uintptr
|
|
if err := unix.Prctl(unix.PR_GET_CHILD_SUBREAPER, uintptr(unsafe.Pointer(&i)), 0, 0, 0); err != nil {
|
|
return -1, err
|
|
}
|
|
return int(i), nil
|
|
}
|