diff --git a/NOTICE b/NOTICE index d08942acbc5cbed3bcfb8f00be75b9ecf0ff19fc..12f55adef0298e914da02af177f8d92d464594af 100644 --- a/NOTICE +++ b/NOTICE @@ -6498,6 +6498,32 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +LICENSE - github.com/cilium/ebpf +MIT License + +Copyright (c) 2017 Nathan Sweet +Copyright (c) 2018, 2019 Cloudflare +Copyright (c) 2019 Authors of Cilium + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ LICENSE - github.com/client9/reopen The MIT License (MIT) diff --git a/go.mod b/go.mod index 82a81c2d0eda1be419f970f81ac29487c70e6949..09924a1f4542cc919cea11ef88f0169e6a878083 100644 --- a/go.mod +++ b/go.mod @@ -94,6 +94,7 @@ require ( github.com/beorn7/perks v1.0.1 // indirect github.com/census-instrumentation/opencensus-proto v0.4.1 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect + github.com/cilium/ebpf v0.9.1 // indirect github.com/client9/reopen v1.0.0 // indirect github.com/cloudflare/circl v1.3.3 // indirect github.com/coreos/go-systemd/v22 v22.5.0 // indirect diff --git a/go.sum b/go.sum index 10b4d19f0390dd9249f5460d9001f2f1df00277b..482abdfee397444a73dbb12f831df9f2c63b5c80 100644 --- a/go.sum +++ b/go.sum @@ -682,6 +682,8 @@ github.com/cilium/ebpf v0.2.0/go.mod h1:To2CFviqOWL/M0gIMsvSMlqe7em/l1ALkX1PyjrX github.com/cilium/ebpf v0.4.0/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.6.2/go.mod h1:4tRaxcgiL706VnOzHOdBlY8IEAIdxINsQBcU4xJJXRs= github.com/cilium/ebpf v0.7.0/go.mod h1:/oI2+1shJiTGAMgl6/RgJr36Eo1jzrRcAWbcXO2usCA= +github.com/cilium/ebpf v0.9.1 h1:64sn2K3UKw8NbP/blsixRpF3nXuyhz/VjRlRzvlBRu4= +github.com/cilium/ebpf v0.9.1/go.mod h1:+OhNOIXx/Fnu1IE8bJz2dzOA+VSfyTfdNUVdlQnxUFY= github.com/circonus-labs/circonus-gometrics v2.3.1+incompatible/go.mod h1:nmEj6Dob7S7YxXgwXpfOuvO54S+tGdZdw9fuRZt25Ag= github.com/circonus-labs/circonusllhist v0.1.3/go.mod h1:kMXHVDlOchFAehlya5ePtbp5jckzBHf4XRpQvBOLI+I= github.com/clbanning/mxj v1.8.4/go.mod h1:BVjHeAH+rl9rs6f+QIpeRl0tfu10SXn1pUSa5PVGJng= @@ -942,6 +944,7 @@ github.com/fortytw2/leaktest v1.3.0/go.mod h1:jDsjWgpAGjm2CA7WthBh/CdZYEPF31XHqu github.com/franela/goblin v0.0.0-20210519012713-85d372ac71e2/go.mod h1:VzmDKDJVZI3aJmnRI9VjAn9nJ8qPPsN1fqzr9dqInIo= github.com/franela/goreq 
v0.0.0-20171204163338-bcd34c9993f8/go.mod h1:ZhphrRTfi2rbfLwlschooIH4+wKKDR4Pdxhh+TRoA20= github.com/frankban/quicktest v1.11.3/go.mod h1:wRf/ReqHper53s+kmmSZizM8NamnL3IM0I9ntUbOk+k= +github.com/frankban/quicktest v1.14.3 h1:FJKSZTDHjyhriyC81FLQ0LY93eSai0ZyR/ZIkd3ZUKE= github.com/frankban/quicktest v1.14.3/go.mod h1:mgiwOwqx65TmIk1wJ6Q7wvnVMocbUorkibMOrVTHZps= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= diff --git a/internal/cgroups/cgroups.go b/internal/cgroups/cgroups.go index 5d44ba70dcf10031ed5cc3021be0dd99780b5ed3..0c6927502e96f27c602dc68094eeb77701d5df38 100644 --- a/internal/cgroups/cgroups.go +++ b/internal/cgroups/cgroups.go @@ -2,11 +2,9 @@ package cgroups import ( "os/exec" - "path/filepath" "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" - "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" ) @@ -44,7 +42,7 @@ type Manager interface { // NewManager returns the appropriate Cgroups manager func NewManager(cfg cgroups.Config, pid int) Manager { if cfg.Repositories.Count > 0 { - return newV1Manager(cfg, pid) + return newCgroupManager(cfg, pid) } return &NoopManager{} @@ -52,23 +50,5 @@ func NewManager(cfg cgroups.Config, pid int) Manager { // PruneOldCgroups prunes old cgroups for both the memory and cpu subsystems func PruneOldCgroups(cfg cgroups.Config, logger log.FieldLogger) { - if cfg.HierarchyRoot == "" { - return - } - - if err := config.PruneOldGitalyProcessDirectories( - logger, - filepath.Join(cfg.Mountpoint, "memory", - cfg.HierarchyRoot), - ); err != nil { - logger.WithError(err).Error("failed to clean up memory cgroups") - } - - if err := config.PruneOldGitalyProcessDirectories( - logger, - filepath.Join(cfg.Mountpoint, "cpu", - cfg.HierarchyRoot), - ); err != nil { - logger.WithError(err).Error("failed to clean up cpu cgroups") - } + pruneOldCgroups(cfg, logger) } diff --git a/internal/cgroups/cgroups_linux_test.go b/internal/cgroups/cgroups_linux_test.go index e52eecb5eb2a63299be2fac3cbf1a424bc9c3e9d..8ed551d2d8bef6f95cf891708124666d7eb1575b 100644 --- a/internal/cgroups/cgroups_linux_test.go +++ b/internal/cgroups/cgroups_linux_test.go @@ -1,17 +1,12 @@ +//go:build linux + package cgroups import ( - "fmt" - "io/fs" - "os" - "os/exec" - "path/filepath" "testing" - "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" - "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" ) @@ -20,174 +15,5 @@ func TestMain(m *testing.M) { } func TestNewManager(t *testing.T) { - cfg := cgroups.Config{Repositories: cgroups.Repositories{Count: 10}} - - require.IsType(t, &CGroupV1Manager{}, &CGroupV1Manager{cfg: cfg}) require.IsType(t, &NoopManager{}, NewManager(cgroups.Config{}, 1)) } - -func TestPruneOldCgroups(t *testing.T) { - t.Parallel() - - testCases := []struct { - desc string - cfg cgroups.Config - expectedPruned bool - // setup returns a pid - setup func(*testing.T, cgroups.Config) int - }{ - { - desc: "process belongs to another user", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - pid 
:= 1 - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - return pid - }, - expectedPruned: true, - }, - { - desc: "no hierarchy root", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - pid := 1 - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - return 1 - }, - expectedPruned: false, - }, - { - desc: "pid of finished process", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - cmd := exec.Command("ls") - require.NoError(t, cmd.Run()) - pid := cmd.Process.Pid - - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - memoryRoot := filepath.Join( - cfg.Mountpoint, - "memory", - cfg.HierarchyRoot, - "memory.limit_in_bytes", - ) - require.NoError(t, os.WriteFile(memoryRoot, []byte{}, fs.ModeAppend)) - - return pid - }, - expectedPruned: true, - }, - { - desc: "pid of running process", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - pid := os.Getpid() - - cgroupManager := NewManager(cfg, pid) - require.NoError(t, cgroupManager.Setup()) - - return pid - }, - expectedPruned: false, - }, - { - desc: "gitaly-0 directory is deleted", - cfg: cgroups.Config{ - Mountpoint: testhelper.TempDir(t), - HierarchyRoot: "gitaly", - Repositories: cgroups.Repositories{ - Count: 10, - MemoryBytes: 10 * 1024 * 1024, - CPUShares: 1024, - }, - }, - setup: func(t *testing.T, cfg cgroups.Config) int { - cgroupManager := NewManager(cfg, 0) - require.NoError(t, cgroupManager.Setup()) - - return 0 - }, - expectedPruned: true, - }, - } - - for _, tc := range testCases { - t.Run(tc.desc, func(t *testing.T) { - memoryRoot := filepath.Join( - tc.cfg.Mountpoint, - "memory", - tc.cfg.HierarchyRoot, - ) - cpuRoot := filepath.Join( - tc.cfg.Mountpoint, - "cpu", - tc.cfg.HierarchyRoot, - ) - - require.NoError(t, os.MkdirAll(cpuRoot, perm.PublicDir)) - require.NoError(t, os.MkdirAll(memoryRoot, perm.PublicDir)) - - pid := tc.setup(t, tc.cfg) - - logger, hook := test.NewNullLogger() - PruneOldCgroups(tc.cfg, logger) - - // create cgroups directories with a different pid - oldGitalyProcessMemoryDir := filepath.Join( - memoryRoot, - fmt.Sprintf("gitaly-%d", pid), - ) - oldGitalyProcesssCPUDir := filepath.Join( - cpuRoot, - fmt.Sprintf("gitaly-%d", pid), - ) - - if tc.expectedPruned { - require.NoDirExists(t, oldGitalyProcessMemoryDir) - require.NoDirExists(t, oldGitalyProcesssCPUDir) - } else { - require.DirExists(t, oldGitalyProcessMemoryDir) - require.DirExists(t, oldGitalyProcesssCPUDir) - require.Len(t, hook.Entries, 0) - } - }) - } -} diff --git a/internal/cgroups/manager.go b/internal/cgroups/manager.go new file mode 100644 index 0000000000000000000000000000000000000000..44513839485059bf856c4c98bb720406b97b8290 --- /dev/null +++ b/internal/cgroups/manager.go @@ -0,0 +1,17 @@ +//go:build !linux + +package cgroups + +import ( + log "github.com/sirupsen/logrus" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" 
+)
+
+// newCgroupManager returns a no-op manager on non-Linux systems.
+func newCgroupManager(cfg cgroups.Config, pid int) Manager {
+	return &NoopManager{}
+}
+
+// pruneOldCgroups is a no-op on non-Linux systems.
+func pruneOldCgroups(cfg cgroups.Config, logger log.FieldLogger) {}
diff --git a/internal/cgroups/manager_linux.go b/internal/cgroups/manager_linux.go
new file mode 100644
index 0000000000000000000000000000000000000000..7b8c4a34d985d3660f6b5d7a38f3ab06e8683bd6
--- /dev/null
+++ b/internal/cgroups/manager_linux.go
@@ -0,0 +1,177 @@
+//go:build linux
+
+package cgroups
+
+import (
+	"fmt"
+	"hash/crc32"
+	"os/exec"
+	"strings"
+
+	cgrps "github.com/containerd/cgroups/v3"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/prometheus/client_golang/prometheus"
+	log "github.com/sirupsen/logrus"
+	cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups"
+)
+
+// cfs_period_us hardcoded to be 100ms.
+const cfsPeriodUs uint64 = 100000
+
+type cgroupHandler interface {
+	setupParent(reposResources *specs.LinuxResources) error
+	setupRepository(reposResources *specs.LinuxResources) error
+	addToCgroup(pid int, cgroupPath string) error
+	collect(ch chan<- prometheus.Metric)
+	cleanup() error
+	currentProcessCgroup() string
+	repoPath(groupID int) string
+}
+
+// CGroupManager is a manager class that implements specific methods related to cgroups
+type CGroupManager struct {
+	cfg cgroupscfg.Config
+	pid int
+
+	handler cgroupHandler
+}
+
+func newCgroupManager(cfg cgroupscfg.Config, pid int) *CGroupManager {
+	return newCgroupManagerWithMode(cfg, pid, cgrps.Mode())
+}
+
+func newCgroupManagerWithMode(cfg cgroupscfg.Config, pid int, mode cgrps.CGMode) *CGroupManager {
+	var handler cgroupHandler
+	switch mode {
+	case cgrps.Legacy, cgrps.Hybrid:
+		handler = newV1Handler(cfg, pid)
+	case cgrps.Unified:
+		handler = newV2Handler(cfg, pid)
+		log.Warn("Gitaly's cgroups-v2 support is experimental; use it at your own risk")
+	default:
+		log.Fatal("unknown cgroup version")
+	}
+
+	return &CGroupManager{
+		cfg:     cfg,
+		pid:     pid,
+		handler: handler,
+	}
+}
+
+// Setup parent cgroups and repository sub cgroups
+func (cgm *CGroupManager) Setup() error {
+	if err := cgm.handler.setupParent(cgm.configParentResources()); err != nil {
+		return err
+	}
+	if err := cgm.handler.setupRepository(cgm.configRepositoryResources()); err != nil {
+		return err
+	}
+	return nil
+}
+
+// AddCommand adds a Cmd to a cgroup
+func (cgm *CGroupManager) AddCommand(cmd *exec.Cmd, opts ...AddCommandOption) (string, error) {
+	var cfg addCommandCfg
+	for _, opt := range opts {
+		opt(&cfg)
+	}
+
+	key := cfg.cgroupKey
+	if key == "" {
+		key = strings.Join(cmd.Args, "/")
+	}
+
+	checksum := crc32.ChecksumIEEE(
+		[]byte(key),
+	)
+
+	if cmd.Process == nil {
+		return "", fmt.Errorf("cannot add command that has not yet been started")
+	}
+
+	groupID := uint(checksum) % cgm.cfg.Repositories.Count
+	cgroupPath := cgm.handler.repoPath(int(groupID))
+
+	return cgroupPath, cgm.handler.addToCgroup(cmd.Process.Pid, cgroupPath)
+}
+
+// Cleanup cleans up cgroups created in Setup.
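+// It delegates to the version-specific handler, which removes the cgroup
+// hierarchy that Setup created for this process.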
+func (cgm *CGroupManager) Cleanup() error { + return cgm.handler.cleanup() +} + +// Describe is used to generate description information for each CGroupManager prometheus metric +func (cgm *CGroupManager) Describe(ch chan<- *prometheus.Desc) { + prometheus.DescribeByCollect(cgm, ch) +} + +// Collect is used to collect the current values of all CGroupManager prometheus metrics +func (cgm *CGroupManager) Collect(ch chan<- prometheus.Metric) { + cgm.handler.collect(ch) +} + +func (cgm *CGroupManager) currentProcessCgroup() string { + return cgm.handler.currentProcessCgroup() +} + +func (cgm *CGroupManager) configParentResources() *specs.LinuxResources { + cfsPeriodUs := cfsPeriodUs + var parentResources specs.LinuxResources + // Leave them `nil` so it takes kernel default unless cfg value above `0`. + parentResources.CPU = &specs.LinuxCPU{} + + if cgm.cfg.CPUShares > 0 { + parentResources.CPU.Shares = &cgm.cfg.CPUShares + } + + if cgm.cfg.CPUQuotaUs > 0 { + parentResources.CPU.Quota = &cgm.cfg.CPUQuotaUs + parentResources.CPU.Period = &cfsPeriodUs + } + + if cgm.cfg.MemoryBytes > 0 { + parentResources.Memory = &specs.LinuxMemory{Limit: &cgm.cfg.MemoryBytes} + } + return &parentResources +} + +func (cgm *CGroupManager) configRepositoryResources() *specs.LinuxResources { + cfsPeriodUs := cfsPeriodUs + var reposResources specs.LinuxResources + // Leave them `nil` so it takes kernel default unless cfg value above `0`. + reposResources.CPU = &specs.LinuxCPU{} + + if cgm.cfg.Repositories.CPUShares > 0 { + reposResources.CPU.Shares = &cgm.cfg.Repositories.CPUShares + } + + if cgm.cfg.Repositories.CPUQuotaUs > 0 { + reposResources.CPU.Quota = &cgm.cfg.Repositories.CPUQuotaUs + reposResources.CPU.Period = &cfsPeriodUs + } + + if cgm.cfg.Repositories.MemoryBytes > 0 { + reposResources.Memory = &specs.LinuxMemory{Limit: &cgm.cfg.Repositories.MemoryBytes} + } + return &reposResources +} + +func pruneOldCgroups(cfg cgroupscfg.Config, logger log.FieldLogger) { + pruneOldCgroupsWithMode(cfg, logger, cgrps.Mode()) +} + +func pruneOldCgroupsWithMode(cfg cgroupscfg.Config, logger log.FieldLogger, mode cgrps.CGMode) { + if cfg.HierarchyRoot == "" { + return + } + + switch mode { + case cgrps.Legacy, cgrps.Hybrid: + pruneOldCgroupsV1(cfg, logger) + case cgrps.Unified: + pruneOldCgroupsV2(cfg, logger) + default: + log.Fatalf("unknown cgroup version") + } +} diff --git a/internal/cgroups/metrics.go b/internal/cgroups/metrics.go new file mode 100644 index 0000000000000000000000000000000000000000..a8ffa618f2eb7b7f02b810079409ac6e1df1bebf --- /dev/null +++ b/internal/cgroups/metrics.go @@ -0,0 +1,87 @@ +package cgroups + +import "github.com/prometheus/client_golang/prometheus" + +type cgroupsMetrics struct { + memoryReclaimAttemptsTotal *prometheus.GaugeVec + cpuUsage *prometheus.GaugeVec + cpuCFSPeriods *prometheus.Desc + cpuCFSThrottledPeriods *prometheus.Desc + cpuCFSThrottledTime *prometheus.Desc + procs *prometheus.GaugeVec +} + +func newV1CgroupsMetrics() *cgroupsMetrics { + return &cgroupsMetrics{ + memoryReclaimAttemptsTotal: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_memory_reclaim_attempts_total", + Help: "Number of memory usage hits limits", + }, + []string{"path"}, + ), + cpuUsage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_cpu_usage_total", + Help: "CPU Usage of Cgroup", + }, + []string{"path", "type"}, + ), + cpuCFSPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_periods_total", + "Number of elapsed enforcement period intervals", + 
[]string{"path"}, nil, + ), + cpuCFSThrottledPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_periods_total", + "Number of throttled period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledTime: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_seconds_total", + "Total time duration the Cgroup has been throttled", + []string{"path"}, nil, + ), + procs: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_procs_total", + Help: "Total number of procs", + }, + []string{"path", "subsystem"}, + ), + } +} + +func newV2CgroupsMetrics() *cgroupsMetrics { + return &cgroupsMetrics{ + cpuUsage: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_cpu_usage_total", + Help: "CPU Usage of Cgroup", + }, + []string{"path", "type"}, + ), + cpuCFSPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_periods_total", + "Number of elapsed enforcement period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledPeriods: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_periods_total", + "Number of throttled period intervals", + []string{"path"}, nil, + ), + cpuCFSThrottledTime: prometheus.NewDesc( + "gitaly_cgroup_cpu_cfs_throttled_seconds_total", + "Total time duration the Cgroup has been throttled", + []string{"path"}, nil, + ), + procs: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Name: "gitaly_cgroup_procs_total", + Help: "Total number of procs", + }, + []string{"path", "subsystem"}, + ), + } +} diff --git a/internal/cgroups/mock_linux_test.go b/internal/cgroups/mock_linux_test.go index 2cf735149d6a5b895d66b6146ae8bb1b9194f236..135dca76fe4497633661b59e82a8dbed03201420 100644 --- a/internal/cgroups/mock_linux_test.go +++ b/internal/cgroups/mock_linux_test.go @@ -1,3 +1,5 @@ +//go:build linux + /* Adapted from https://github.com/containerd/cgroups/blob/f1d9380fd3c028194db9582825512fdf3f39ab2a/mock_test.go @@ -25,8 +27,11 @@ import ( "strconv" "testing" + cgrps "github.com/containerd/cgroups/v3" "github.com/containerd/cgroups/v3/cgroup1" + "github.com/sirupsen/logrus" "github.com/stretchr/testify/require" + cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" ) @@ -54,13 +59,9 @@ func newMock(t *testing.T) *mockCgroup { } } -func (m *mockCgroup) hierarchy() ([]cgroup1.Subsystem, error) { - return m.subsystems, nil -} - func (m *mockCgroup) setupMockCgroupFiles( t *testing.T, - manager *CGroupV1Manager, + manager *CGroupManager, memFailCount int, ) { for _, s := range m.subsystems { @@ -117,3 +118,71 @@ throttled_time 1000000` } } } + +func (m *mockCgroup) newCgroupManager(cfg cgroupscfg.Config, pid int) *CGroupManager { + return newCgroupManagerWithMode(cfg, pid, cgrps.Legacy) +} + +func (m *mockCgroup) pruneOldCgroups(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + pruneOldCgroupsWithMode(cfg, logger, cgrps.Legacy) +} + +type mockCgroupV2 struct { + root string +} + +func newMockV2(t *testing.T) *mockCgroupV2 { + t.Helper() + + return &mockCgroupV2{ + root: testhelper.TempDir(t), + } +} + +func (m *mockCgroupV2) setupMockCgroupFiles( + t *testing.T, + manager *CGroupManager, +) { + cgroupPath := filepath.Join(m.root, manager.currentProcessCgroup()) + require.NoError(t, os.MkdirAll(cgroupPath, perm.SharedDir)) + + contentByFilename := map[string]string{ + "cgroup.procs": "", + "cgroup.subtree_control": "cpu cpuset memory", + "cgroup.controllers": "cpu cpuset memory", + "cpu.max": "max 
100000", + "cpu.weight": "10", + "memory.max": "max", + "cpu.stat": `nr_periods 10 + nr_throttled 20 + throttled_usec 1000000`, + } + + for filename, content := range contentByFilename { + controlFilePath := filepath.Join(m.root, manager.cfg.HierarchyRoot, filename) + require.NoError(t, os.WriteFile(controlFilePath, []byte(content), perm.SharedFile)) + } + + for filename, content := range contentByFilename { + controlFilePath := filepath.Join(cgroupPath, filename) + require.NoError(t, os.WriteFile(controlFilePath, []byte(content), perm.SharedFile)) + } + + for shard := uint(0); shard < manager.cfg.Repositories.Count; shard++ { + shardPath := filepath.Join(cgroupPath, fmt.Sprintf("repos-%d", shard)) + require.NoError(t, os.MkdirAll(shardPath, perm.SharedDir)) + + for filename, content := range contentByFilename { + shardControlFilePath := filepath.Join(shardPath, filename) + require.NoError(t, os.WriteFile(shardControlFilePath, []byte(content), perm.SharedFile)) + } + } +} + +func (m *mockCgroupV2) newCgroupManager(cfg cgroupscfg.Config, pid int) *CGroupManager { + return newCgroupManagerWithMode(cfg, pid, cgrps.Unified) +} + +func (m *mockCgroupV2) pruneOldCgroups(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + pruneOldCgroupsWithMode(cfg, logger, cgrps.Unified) +} diff --git a/internal/cgroups/v1.go b/internal/cgroups/v1.go deleted file mode 100644 index 8935bcdc54a1172c43cd5bb0d3f59c946d491283..0000000000000000000000000000000000000000 --- a/internal/cgroups/v1.go +++ /dev/null @@ -1,12 +0,0 @@ -//go:build !linux - -package cgroups - -import ( - "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" -) - -// For systems other than Linux, we return a noop manager if cgroups was enabled. -func newV1Manager(cfg cgroups.Config, pid int) *NoopManager { - return &NoopManager{} -} diff --git a/internal/cgroups/v1_linux.go b/internal/cgroups/v1_linux.go index 09bf236199a0700ef0b9af98e295597de9c58b76..22e9ab841750cd81c4870bc3e1b98695f72b93ff 100644 --- a/internal/cgroups/v1_linux.go +++ b/internal/cgroups/v1_linux.go @@ -1,9 +1,9 @@ +//go:build linux + package cgroups import ( "fmt" - "hash/crc32" - "os/exec" "path/filepath" "strings" "time" @@ -11,167 +11,59 @@ import ( "github.com/containerd/cgroups/v3/cgroup1" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" "gitlab.com/gitlab-org/gitaly/v16/internal/log" ) -// cfs_period_us hardcoded to be 100ms. 
-const cfsPeriodUs uint64 = 100000 +type cgroupV1Handler struct { + cfg cgroupscfg.Config + hierarchy func() ([]cgroup1.Subsystem, error) -// CGroupV1Manager is the manager for cgroups v1 -type CGroupV1Manager struct { - cfg cgroupscfg.Config - hierarchy func() ([]cgroup1.Subsystem, error) - memoryReclaimAttemptsTotal *prometheus.GaugeVec - cpuUsage *prometheus.GaugeVec - cpuCFSPeriods *prometheus.Desc - cpuCFSThrottledPeriods *prometheus.Desc - cpuCFSThrottledTime *prometheus.Desc - procs *prometheus.GaugeVec - pid int + *cgroupsMetrics + pid int } -func newV1Manager(cfg cgroupscfg.Config, pid int) *CGroupV1Manager { - return &CGroupV1Manager{ +func newV1Handler(cfg cgroupscfg.Config, pid int) *cgroupV1Handler { + return &cgroupV1Handler{ cfg: cfg, pid: pid, hierarchy: func() ([]cgroup1.Subsystem, error) { return defaultSubsystems(cfg.Mountpoint) }, - memoryReclaimAttemptsTotal: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gitaly_cgroup_memory_reclaim_attempts_total", - Help: "Number of memory usage hits limits", - }, - []string{"path"}, - ), - cpuUsage: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gitaly_cgroup_cpu_usage_total", - Help: "CPU Usage of Cgroup", - }, - []string{"path", "type"}, - ), - cpuCFSPeriods: prometheus.NewDesc( - "gitaly_cgroup_cpu_cfs_periods_total", - "Number of elapsed enforcement period intervals", - []string{"path"}, nil, - ), - cpuCFSThrottledPeriods: prometheus.NewDesc( - "gitaly_cgroup_cpu_cfs_throttled_periods_total", - "Number of throttled period intervals", - []string{"path"}, nil, - ), - cpuCFSThrottledTime: prometheus.NewDesc( - "gitaly_cgroup_cpu_cfs_throttled_seconds_total", - "Total time duration the Cgroup has been throttled", - []string{"path"}, nil, - ), - procs: prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Name: "gitaly_cgroup_procs_total", - Help: "Total number of procs", - }, - []string{"path", "subsystem"}, - ), + cgroupsMetrics: newV1CgroupsMetrics(), } } -//nolint:revive // This is unintentionally missing documentation. -func (cg *CGroupV1Manager) Setup() error { - cfsPeriodUs := cfsPeriodUs - - var parentResources specs.LinuxResources - // Leave them `nil` so it takes kernel default unless cfg value above `0`. - parentResources.CPU = &specs.LinuxCPU{} - - if cg.cfg.CPUShares > 0 { - parentResources.CPU.Shares = &cg.cfg.CPUShares - } - - if cg.cfg.CPUQuotaUs > 0 { - parentResources.CPU.Quota = &cg.cfg.CPUQuotaUs - parentResources.CPU.Period = &cfsPeriodUs - } - - if cg.cfg.MemoryBytes > 0 { - parentResources.Memory = &specs.LinuxMemory{Limit: &cg.cfg.MemoryBytes} - } - +func (cvh *cgroupV1Handler) setupParent(parentResources *specs.LinuxResources) error { if _, err := cgroup1.New( - cgroup1.StaticPath(cg.currentProcessCgroup()), - &parentResources, - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.StaticPath(cvh.currentProcessCgroup()), + parentResources, + cgroup1.WithHiearchy(cvh.hierarchy), ); err != nil { return fmt.Errorf("failed creating parent cgroup: %w", err) } + return nil +} - var reposResources specs.LinuxResources - // Leave them `nil` so it takes kernel default unless cfg value above `0`. 
- reposResources.CPU = &specs.LinuxCPU{} - - if cg.cfg.Repositories.CPUShares > 0 { - reposResources.CPU.Shares = &cg.cfg.Repositories.CPUShares - } - - if cg.cfg.Repositories.CPUQuotaUs > 0 { - reposResources.CPU.Quota = &cg.cfg.Repositories.CPUQuotaUs - reposResources.CPU.Period = &cfsPeriodUs - } - - if cg.cfg.Repositories.MemoryBytes > 0 { - reposResources.Memory = &specs.LinuxMemory{Limit: &cg.cfg.Repositories.MemoryBytes} - } - - for i := 0; i < int(cg.cfg.Repositories.Count); i++ { +func (cvh *cgroupV1Handler) setupRepository(reposResources *specs.LinuxResources) error { + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { if _, err := cgroup1.New( - cgroup1.StaticPath(cg.repoPath(i)), - &reposResources, - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.StaticPath(cvh.repoPath(i)), + reposResources, + cgroup1.WithHiearchy(cvh.hierarchy), ); err != nil { return fmt.Errorf("failed creating repository cgroup: %w", err) } } - return nil } -// AddCommand adds the given command to one of the CGroup's buckets. The bucket used for the command -// is determined by hashing the repository storage and path. No error is returned if the command has already -// exited. -func (cg *CGroupV1Manager) AddCommand( - cmd *exec.Cmd, - opts ...AddCommandOption, -) (string, error) { - var cfg addCommandCfg - for _, opt := range opts { - opt(&cfg) - } - - key := cfg.cgroupKey - if key == "" { - key = strings.Join(cmd.Args, "/") - } - - checksum := crc32.ChecksumIEEE( - []byte(key), - ) - - if cmd.Process == nil { - return "", fmt.Errorf("cannot add command that has not yet been started") - } - - groupID := uint(checksum) % cg.cfg.Repositories.Count - cgroupPath := cg.repoPath(int(groupID)) - - return cgroupPath, cg.addToCgroup(cmd.Process.Pid, cgroupPath) -} - -func (cg *CGroupV1Manager) addToCgroup(pid int, cgroupPath string) error { +func (cvh *cgroupV1Handler) addToCgroup(pid int, cgroupPath string) error { control, err := cgroup1.Load( cgroup1.StaticPath(cgroupPath), - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.WithHiearchy(cvh.hierarchy), ) if err != nil { return fmt.Errorf("failed loading %s cgroup: %w", cgroupPath, err) @@ -189,18 +81,17 @@ func (cg *CGroupV1Manager) addToCgroup(pid int, cgroupPath string) error { return nil } -// Collect collects metrics from the cgroups controller -func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { - if !cg.cfg.MetricsEnabled { +func (cvh *cgroupV1Handler) collect(ch chan<- prometheus.Metric) { + if !cvh.cfg.MetricsEnabled { return } - for i := 0; i < int(cg.cfg.Repositories.Count); i++ { - repoPath := cg.repoPath(i) + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { + repoPath := cvh.repoPath(i) logger := log.Default().WithField("cgroup_path", repoPath) control, err := cgroup1.Load( cgroup1.StaticPath(repoPath), - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.WithHiearchy(cvh.hierarchy), ) if err != nil { logger.WithError(err).Warn("unable to load cgroup controller") @@ -210,41 +101,41 @@ func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { if metrics, err := control.Stat(); err != nil { logger.WithError(err).Warn("unable to get cgroup stats") } else { - memoryMetric := cg.memoryReclaimAttemptsTotal.WithLabelValues(repoPath) + memoryMetric := cvh.memoryReclaimAttemptsTotal.WithLabelValues(repoPath) memoryMetric.Set(float64(metrics.Memory.Usage.Failcnt)) ch <- memoryMetric - cpuUserMetric := cg.cpuUsage.WithLabelValues(repoPath, "user") + cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") 
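+ // Usage counters are cumulative values as reported by containerd's cgroup1 stats; they are exported unscaled.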
cpuUserMetric.Set(float64(metrics.CPU.Usage.User)) ch <- cpuUserMetric ch <- prometheus.MustNewConstMetric( - cg.cpuCFSPeriods, + cvh.cpuCFSPeriods, prometheus.CounterValue, float64(metrics.CPU.Throttling.Periods), repoPath, ) ch <- prometheus.MustNewConstMetric( - cg.cpuCFSThrottledPeriods, + cvh.cpuCFSThrottledPeriods, prometheus.CounterValue, float64(metrics.CPU.Throttling.ThrottledPeriods), repoPath, ) ch <- prometheus.MustNewConstMetric( - cg.cpuCFSThrottledTime, + cvh.cpuCFSThrottledTime, prometheus.CounterValue, float64(metrics.CPU.Throttling.ThrottledTime)/float64(time.Second), repoPath, ) - cpuKernelMetric := cg.cpuUsage.WithLabelValues(repoPath, "kernel") + cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") cpuKernelMetric.Set(float64(metrics.CPU.Usage.Kernel)) ch <- cpuKernelMetric } - if subsystems, err := cg.hierarchy(); err != nil { + if subsystems, err := cvh.hierarchy(); err != nil { logger.WithError(err).Warn("unable to get cgroup hierarchy") } else { for _, subsystem := range subsystems { @@ -256,7 +147,7 @@ func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { continue } - procsMetric := cg.procs.WithLabelValues(repoPath, string(subsystem.Name())) + procsMetric := cvh.procs.WithLabelValues(repoPath, string(subsystem.Name())) procsMetric.Set(float64(len(processes))) ch <- procsMetric } @@ -264,18 +155,12 @@ func (cg *CGroupV1Manager) Collect(ch chan<- prometheus.Metric) { } } -// Describe describes the cgroup metrics that Collect provides -func (cg *CGroupV1Manager) Describe(ch chan<- *prometheus.Desc) { - prometheus.DescribeByCollect(cg, ch) -} - -//nolint:revive // This is unintentionally missing documentation. -func (cg *CGroupV1Manager) Cleanup() error { - processCgroupPath := cg.currentProcessCgroup() +func (cvh *cgroupV1Handler) cleanup() error { + processCgroupPath := cvh.currentProcessCgroup() control, err := cgroup1.Load( cgroup1.StaticPath(processCgroupPath), - cgroup1.WithHiearchy(cg.hierarchy), + cgroup1.WithHiearchy(cvh.hierarchy), ) if err != nil { return fmt.Errorf("failed loading cgroup %s: %w", processCgroupPath, err) @@ -288,12 +173,12 @@ func (cg *CGroupV1Manager) Cleanup() error { return nil } -func (cg *CGroupV1Manager) repoPath(groupID int) string { - return filepath.Join(cg.currentProcessCgroup(), fmt.Sprintf("repos-%d", groupID)) +func (cvh *cgroupV1Handler) repoPath(groupID int) string { + return filepath.Join(cvh.currentProcessCgroup(), fmt.Sprintf("repos-%d", groupID)) } -func (cg *CGroupV1Manager) currentProcessCgroup() string { - return config.GetGitalyProcessTempDir(cg.cfg.HierarchyRoot, cg.pid) +func (cvh *cgroupV1Handler) currentProcessCgroup() string { + return config.GetGitalyProcessTempDir(cvh.cfg.HierarchyRoot, cvh.pid) } func defaultSubsystems(root string) ([]cgroup1.Subsystem, error) { @@ -304,3 +189,21 @@ func defaultSubsystems(root string) ([]cgroup1.Subsystem, error) { return subsystems, nil } + +func pruneOldCgroupsV1(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + if err := config.PruneOldGitalyProcessDirectories( + logger, + filepath.Join(cfg.Mountpoint, "memory", + cfg.HierarchyRoot), + ); err != nil { + logger.WithError(err).Error("failed to clean up memory cgroups") + } + + if err := config.PruneOldGitalyProcessDirectories( + logger, + filepath.Join(cfg.Mountpoint, "cpu", + cfg.HierarchyRoot), + ); err != nil { + logger.WithError(err).Error("failed to clean up cpu cgroups") + } +} diff --git a/internal/cgroups/v1_linux_test.go b/internal/cgroups/v1_linux_test.go index 
a364d79655ebe9694aa93472a69f9fb5add2da66..a68ebed4d077cbf5b3a6b2f014ba496db22dde39 100644 --- a/internal/cgroups/v1_linux_test.go +++ b/internal/cgroups/v1_linux_test.go @@ -1,9 +1,11 @@ +//go:build linux + package cgroups import ( - "bytes" "fmt" "hash/crc32" + "io/fs" "os" "os/exec" "path/filepath" @@ -11,7 +13,9 @@ import ( "strings" "testing" + cgrps "github.com/containerd/cgroups/v3" "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/sirupsen/logrus/hooks/test" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" @@ -31,6 +35,15 @@ func defaultCgroupsConfig() cgroups.Config { } } +func TestNewManagerV1(t *testing.T) { + cfg := cgroups.Config{Repositories: cgroups.Repositories{Count: 10}} + + manager := newCgroupManagerWithMode(cfg, 1, cgrps.Legacy) + require.IsType(t, &cgroupV1Handler{}, manager.handler) + manager = newCgroupManagerWithMode(cfg, 1, cgrps.Hybrid) + require.IsType(t, &cgroupV1Handler{}, manager.handler) +} + func TestSetup_ParentCgroups(t *testing.T) { tests := []struct { name string @@ -84,12 +97,9 @@ func TestSetup_ParentCgroups(t *testing.T) { mock := newMock(t) pid := 1 tt.cfg.HierarchyRoot = "gitaly" + tt.cfg.Mountpoint = mock.root - v1Manager := &CGroupV1Manager{ - cfg: tt.cfg, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager := mock.newCgroupManager(tt.cfg, pid) require.NoError(t, v1Manager.Setup()) memoryLimitPath := filepath.Join( @@ -167,12 +177,10 @@ func TestSetup_RepoCgroups(t *testing.T) { cfg := defaultCgroupsConfig() cfg.Repositories = tt.cfg cfg.Repositories.Count = 3 + cfg.HierarchyRoot = "gitaly" + cfg.Mountpoint = mock.root - v1Manager := &CGroupV1Manager{ - cfg: cfg, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager := mock.newCgroupManager(cfg, pid) require.NoError(t, v1Manager.Setup()) @@ -208,24 +216,18 @@ func TestAddCommand(t *testing.T) { config.Repositories.Count = 10 config.Repositories.MemoryBytes = 1024 config.Repositories.CPUShares = 16 + config.HierarchyRoot = "gitaly" + config.Mountpoint = mock.root pid := 1 - v1Manager1 := &CGroupV1Manager{ - cfg: config, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager1 := mock.newCgroupManager(config, pid) require.NoError(t, v1Manager1.Setup()) ctx := testhelper.Context(t) cmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") require.NoError(t, cmd2.Run()) - v1Manager2 := &CGroupV1Manager{ - cfg: config, - hierarchy: mock.hierarchy, - pid: pid, - } + v1Manager2 := mock.newCgroupManager(config, pid) t.Run("without overridden key", func(t *testing.T) { _, err := v1Manager2.AddCommand(cmd2) @@ -270,11 +272,11 @@ func TestCleanup(t *testing.T) { mock := newMock(t) pid := 1 - v1Manager := &CGroupV1Manager{ - cfg: defaultCgroupsConfig(), - hierarchy: mock.hierarchy, - pid: pid, - } + cfg := defaultCgroupsConfig() + cfg.Mountpoint = mock.root + + v1Manager := mock.newCgroupManager(cfg, pid) + require.NoError(t, v1Manager.Setup()) require.NoError(t, v1Manager.Cleanup()) @@ -288,48 +290,17 @@ func TestCleanup(t *testing.T) { } func TestMetrics(t *testing.T) { - t.Parallel() - - mock := newMock(t) - - config := defaultCgroupsConfig() - config.Repositories.Count = 1 - config.Repositories.MemoryBytes = 1048576 - config.Repositories.CPUShares = 16 - - v1Manager1 := newV1Manager(config, 1) - v1Manager1.hierarchy = mock.hierarchy - - mock.setupMockCgroupFiles(t, v1Manager1, 2) - - require.NoError(t, v1Manager1.Setup()) - - ctx := testhelper.Context(t) - - cmd := exec.CommandContext(ctx, 
"ls", "-hal", ".") - require.NoError(t, cmd.Start()) - _, err := v1Manager1.AddCommand(cmd) - require.NoError(t, err) - - gitCmd1 := exec.CommandContext(ctx, "ls", "-hal", ".") - require.NoError(t, gitCmd1.Start()) - _, err = v1Manager1.AddCommand(gitCmd1) - require.NoError(t, err) - - gitCmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") - require.NoError(t, gitCmd2.Start()) - _, err = v1Manager1.AddCommand(gitCmd2) - require.NoError(t, err) - defer func() { - require.NoError(t, gitCmd2.Wait()) - }() - - require.NoError(t, cmd.Wait()) - require.NoError(t, gitCmd1.Wait()) - - repoCgroupPath := filepath.Join(v1Manager1.currentProcessCgroup(), "repos-0") - - expected := strings.NewReader(strings.ReplaceAll(`# HELP gitaly_cgroup_cpu_usage_total CPU Usage of Cgroup + tests := []struct { + name string + metricsEnabled bool + pid int + expect string + }{ + { + name: "metrics enabled: true", + metricsEnabled: true, + pid: 1, + expect: `# HELP gitaly_cgroup_cpu_usage_total CPU Usage of Cgroup # TYPE gitaly_cgroup_cpu_usage_total gauge gitaly_cgroup_cpu_usage_total{path="%s",type="kernel"} 0 gitaly_cgroup_cpu_usage_total{path="%s",type="user"} 0 @@ -349,20 +320,223 @@ gitaly_cgroup_cpu_cfs_throttled_periods_total{path="%s"} 20 # HELP gitaly_cgroup_cpu_cfs_throttled_seconds_total Total time duration the Cgroup has been throttled # TYPE gitaly_cgroup_cpu_cfs_throttled_seconds_total counter gitaly_cgroup_cpu_cfs_throttled_seconds_total{path="%s"} 0.001 -`, "%s", repoCgroupPath)) +`, + }, + { + name: "metrics enabled: false", + metricsEnabled: false, + pid: 2, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + mock := newMock(t) + + config := defaultCgroupsConfig() + config.Repositories.Count = 1 + config.Repositories.MemoryBytes = 1048576 + config.Repositories.CPUShares = 16 + config.Mountpoint = mock.root + config.MetricsEnabled = tt.metricsEnabled + + v1Manager1 := mock.newCgroupManager(config, tt.pid) + + mock.setupMockCgroupFiles(t, v1Manager1, 2) + require.NoError(t, v1Manager1.Setup()) + + ctx := testhelper.Context(t) + + cmd := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, cmd.Start()) + _, err := v1Manager1.AddCommand(cmd) + require.NoError(t, err) + + gitCmd1 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd1.Start()) + _, err = v1Manager1.AddCommand(gitCmd1) + require.NoError(t, err) + + gitCmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd2.Start()) + _, err = v1Manager1.AddCommand(gitCmd2) + require.NoError(t, err) + defer func() { + require.NoError(t, gitCmd2.Wait()) + }() + + require.NoError(t, cmd.Wait()) + require.NoError(t, gitCmd1.Wait()) - for _, metricsEnabled := range []bool{true, false} { - t.Run(fmt.Sprintf("metrics enabled: %v", metricsEnabled), func(t *testing.T) { - v1Manager1.cfg.MetricsEnabled = metricsEnabled + repoCgroupPath := filepath.Join(v1Manager1.currentProcessCgroup(), "repos-0") + + expected := strings.NewReader(strings.ReplaceAll(tt.expect, "%s", repoCgroupPath)) + assert.NoError(t, testutil.CollectAndCompare(v1Manager1, expected)) + }) + } +} + +func TestPruneOldCgroups(t *testing.T) { + t.Parallel() + + testCases := []struct { + desc string + cfg cgroups.Config + expectedPruned bool + // setup returns a pid + setup func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int + }{ + { + desc: "process belongs to another user", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + 
MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + pid := 1 + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: true, + }, + { + desc: "no hierarchy root", + cfg: cgroups.Config{ + HierarchyRoot: "", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + pid := 1 + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + return 1 + }, + expectedPruned: false, + }, + { + desc: "pid of finished process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + cmd := exec.Command("ls") + require.NoError(t, cmd.Run()) + pid := cmd.Process.Pid + + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + + memoryRoot := filepath.Join( + cfg.Mountpoint, + "memory", + cfg.HierarchyRoot, + "memory.limit_in_bytes", + ) + require.NoError(t, os.WriteFile(memoryRoot, []byte{}, fs.ModeAppend)) + + return pid + }, + expectedPruned: true, + }, + { + desc: "pid of running process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + pid := os.Getpid() + + cgroupManager := mock.newCgroupManager(cfg, pid) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: false, + }, + { + desc: "gitaly-0 directory is deleted", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroup) int { + cgroupManager := mock.newCgroupManager(cfg, 0) + require.NoError(t, cgroupManager.Setup()) + + return 0 + }, + expectedPruned: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + mock := newMock(t) + tc.cfg.Mountpoint = mock.root + + memoryRoot := filepath.Join( + tc.cfg.Mountpoint, + "memory", + tc.cfg.HierarchyRoot, + ) + cpuRoot := filepath.Join( + tc.cfg.Mountpoint, + "cpu", + tc.cfg.HierarchyRoot, + ) + + require.NoError(t, os.MkdirAll(cpuRoot, perm.PublicDir)) + require.NoError(t, os.MkdirAll(memoryRoot, perm.PublicDir)) + + pid := tc.setup(t, tc.cfg, mock) + + logger, hook := test.NewNullLogger() + + mock.pruneOldCgroups(tc.cfg, logger) + + // create cgroups directories with a different pid + oldGitalyProcessMemoryDir := filepath.Join( + memoryRoot, + fmt.Sprintf("gitaly-%d", pid), + ) + oldGitalyProcesssCPUDir := filepath.Join( + cpuRoot, + fmt.Sprintf("gitaly-%d", pid), + ) - if metricsEnabled { - assert.NoError(t, testutil.CollectAndCompare( - v1Manager1, - expected)) + if tc.expectedPruned { + require.NoDirExists(t, oldGitalyProcessMemoryDir) + require.NoDirExists(t, oldGitalyProcesssCPUDir) } else { - assert.NoError(t, testutil.CollectAndCompare( - v1Manager1, - bytes.NewBufferString(""))) + require.DirExists(t, oldGitalyProcessMemoryDir) + require.DirExists(t, oldGitalyProcesssCPUDir) + require.Len(t, hook.Entries, 0) } }) } diff --git 
a/internal/cgroups/v2_linux.go b/internal/cgroups/v2_linux.go new file mode 100644 index 0000000000000000000000000000000000000000..a2f81f60b58f08dc934a6fead26bcbd5f448be75 --- /dev/null +++ b/internal/cgroups/v2_linux.go @@ -0,0 +1,175 @@ +//go:build linux + +package cgroups + +import ( + "errors" + "fmt" + "io/fs" + "path/filepath" + "strings" + "time" + + "github.com/containerd/cgroups/v3/cgroup2" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/prometheus/client_golang/prometheus" + "github.com/sirupsen/logrus" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" + cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/log" +) + +type cgroupV2Handler struct { + cfg cgroupscfg.Config + + *cgroupsMetrics + pid int +} + +func newV2Handler(cfg cgroupscfg.Config, pid int) *cgroupV2Handler { + return &cgroupV2Handler{ + cfg: cfg, + pid: pid, + cgroupsMetrics: newV2CgroupsMetrics(), + } +} + +func (cvh *cgroupV2Handler) setupParent(parentResources *specs.LinuxResources) error { + if _, err := cgroup2.NewManager(cvh.cfg.Mountpoint, "/"+cvh.currentProcessCgroup(), cgroup2.ToResources(parentResources)); err != nil { + return fmt.Errorf("failed creating parent cgroup: %w", err) + } + + return nil +} + +func (cvh *cgroupV2Handler) setupRepository(reposResources *specs.LinuxResources) error { + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { + if _, err := cgroup2.NewManager( + cvh.cfg.Mountpoint, + "/"+cvh.repoPath(i), + cgroup2.ToResources(reposResources), + ); err != nil { + return fmt.Errorf("failed creating repository cgroup: %w", err) + } + } + return nil +} + +func (cvh *cgroupV2Handler) addToCgroup(pid int, cgroupPath string) error { + control, err := cgroup2.Load("/"+cgroupPath, cgroup2.WithMountpoint(cvh.cfg.Mountpoint)) + if err != nil { + return fmt.Errorf("failed loading %s cgroup: %w", cgroupPath, err) + } + + if err := control.AddProc(uint64(pid)); err != nil { + // Command could finish so quickly before we can add it to a cgroup, so + // we don't consider it an error. 
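+ // The match is on the message text because the wrapped error is not guaranteed to unwrap to syscall.ESRCH.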
+ if strings.Contains(err.Error(), "no such process") { + return nil + } + return fmt.Errorf("failed adding process to cgroup: %w", err) + } + + return nil +} + +func (cvh *cgroupV2Handler) collect(ch chan<- prometheus.Metric) { + if !cvh.cfg.MetricsEnabled { + return + } + + for i := 0; i < int(cvh.cfg.Repositories.Count); i++ { + repoPath := cvh.repoPath(i) + logger := log.Default().WithField("cgroup_path", repoPath) + control, err := cgroup2.Load("/"+repoPath, cgroup2.WithMountpoint(cvh.cfg.Mountpoint)) + if err != nil { + logger.WithError(err).Warn("unable to load cgroup controller") + return + } + + if metrics, err := control.Stat(); err != nil { + logger.WithError(err).Warn("unable to get cgroup stats") + } else { + cpuUserMetric := cvh.cpuUsage.WithLabelValues(repoPath, "user") + cpuUserMetric.Set(float64(metrics.CPU.UserUsec)) + ch <- cpuUserMetric + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSPeriods, + prometheus.CounterValue, + float64(metrics.CPU.NrPeriods), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledPeriods, + prometheus.CounterValue, + float64(metrics.CPU.NrThrottled), + repoPath, + ) + + ch <- prometheus.MustNewConstMetric( + cvh.cpuCFSThrottledTime, + prometheus.CounterValue, + float64(metrics.CPU.ThrottledUsec)/float64(time.Second), + repoPath, + ) + + cpuKernelMetric := cvh.cpuUsage.WithLabelValues(repoPath, "kernel") + cpuKernelMetric.Set(float64(metrics.CPU.SystemUsec)) + ch <- cpuKernelMetric + } + + if subsystems, err := control.Controllers(); err != nil { + logger.WithError(err).Warn("unable to get cgroup hierarchy") + } else { + processes, err := control.Procs(true) + if err != nil { + logger.WithError(err). + Warn("unable to get process list") + continue + } + + for _, subsystem := range subsystems { + procsMetric := cvh.procs.WithLabelValues(repoPath, subsystem) + procsMetric.Set(float64(len(processes))) + ch <- procsMetric + } + } + } +} + +func (cvh *cgroupV2Handler) cleanup() error { + processCgroupPath := cvh.currentProcessCgroup() + + control, err := cgroup2.Load("/"+processCgroupPath, cgroup2.WithMountpoint(cvh.cfg.Mountpoint)) + if err != nil { + return fmt.Errorf("failed loading cgroup %s: %w", processCgroupPath, err) + } + + if err := control.Delete(); err != nil { + return fmt.Errorf("failed cleaning up cgroup %s: %w", processCgroupPath, err) + } + + return nil +} + +func (cvh *cgroupV2Handler) repoPath(groupID int) string { + return filepath.Join(cvh.currentProcessCgroup(), fmt.Sprintf("repos-%d", groupID)) +} + +func (cvh *cgroupV2Handler) currentProcessCgroup() string { + return config.GetGitalyProcessTempDir(cvh.cfg.HierarchyRoot, cvh.pid) +} + +func pruneOldCgroupsV2(cfg cgroupscfg.Config, logger logrus.FieldLogger) { + if err := config.PruneOldGitalyProcessDirectories( + logger, + filepath.Join(cfg.Mountpoint, cfg.HierarchyRoot), + ); err != nil { + var pathError *fs.PathError + if !errors.As(err, &pathError) { + logger.WithError(err).Error("failed to clean up cpu cgroups") + } + } +} diff --git a/internal/cgroups/v2_linux_test.go b/internal/cgroups/v2_linux_test.go new file mode 100644 index 0000000000000000000000000000000000000000..834a148cd60ac056b950abe84c1f278448f6f60c --- /dev/null +++ b/internal/cgroups/v2_linux_test.go @@ -0,0 +1,546 @@ +//go:build linux + +package cgroups + +import ( + "fmt" + "hash/crc32" + "io/fs" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "testing" + + cgrps "github.com/containerd/cgroups/v3" + "github.com/prometheus/client_golang/prometheus/testutil" + 
"github.com/sirupsen/logrus/hooks/test" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups" + "gitlab.com/gitlab-org/gitaly/v16/internal/helper/perm" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" +) + +func defaultCgroupsV2Config() cgroups.Config { + return cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 3, + MemoryBytes: 1024000, + CPUShares: 256, + CPUQuotaUs: 2000, + }, + } +} + +func TestNewManagerV2(t *testing.T) { + cfg := cgroups.Config{Repositories: cgroups.Repositories{Count: 10}} + + manager := newCgroupManagerWithMode(cfg, 1, cgrps.Unified) + require.IsType(t, &cgroupV2Handler{}, manager.handler) +} + +func TestSetup_ParentCgroupsV2(t *testing.T) { + tests := []struct { + name string + cfg cgroups.Config + wantMemoryBytes int + wantCPUWeight int + wantCPUMax string + }{ + { + name: "all config specified", + cfg: cgroups.Config{ + MemoryBytes: 102400, + CPUShares: 256, + CPUQuotaUs: 2000, + }, + wantMemoryBytes: 102400, + wantCPUWeight: 256, + wantCPUMax: "2000 100000", + }, + { + name: "only memory limit set", + cfg: cgroups.Config{ + MemoryBytes: 102400, + }, + wantMemoryBytes: 102400, + }, + { + name: "only cpu shares set", + cfg: cgroups.Config{ + CPUShares: 512, + }, + wantCPUWeight: 512, + }, + { + name: "only cpu quota set", + cfg: cgroups.Config{ + CPUQuotaUs: 2000, + }, + wantCPUMax: "2000 100000", + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + mock := newMockV2(t) + + pid := 1 + tt.cfg.HierarchyRoot = "gitaly" + tt.cfg.Mountpoint = mock.root + + v2Manager := mock.newCgroupManager(tt.cfg, pid) + mock.setupMockCgroupFiles(t, v2Manager) + + require.NoError(t, v2Manager.Setup()) + + memoryMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), "memory.max", + ) + requireCgroupWithInt(t, memoryMaxPath, tt.wantMemoryBytes) + + cpuWeightPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), "cpu.weight", + ) + requireCgroupWithInt(t, cpuWeightPath, calculateWantCPUWeight(tt.wantCPUWeight)) + + cpuMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), "cpu.max", + ) + requireCgroupWithString(t, cpuMaxPath, tt.wantCPUMax) + }) + } +} + +func TestSetup_RepoCgroupsV2(t *testing.T) { + tests := []struct { + name string + cfg cgroups.Repositories + wantMemoryBytes int + wantCPUWeight int + wantCPUMax string + }{ + { + name: "all config specified", + cfg: defaultCgroupsV2Config().Repositories, + wantMemoryBytes: 1024000, + wantCPUWeight: 256, + wantCPUMax: "2000 100000", + }, + { + name: "only memory limit set", + cfg: cgroups.Repositories{ + Count: 3, + MemoryBytes: 1024000, + }, + wantMemoryBytes: 1024000, + }, + { + name: "only cpu shares set", + cfg: cgroups.Repositories{ + Count: 3, + CPUShares: 512, + }, + wantCPUWeight: 512, + }, + { + name: "only cpu quota set", + cfg: cgroups.Repositories{ + Count: 3, + CPUQuotaUs: 1000, + }, + wantCPUMax: "1000 100000", + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + + mock := newMockV2(t) + + pid := 1 + + cfg := defaultCgroupsV2Config() + cfg.Mountpoint = mock.root + cfg.Repositories = tt.cfg + + v2Manager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, v2Manager) + require.NoError(t, v2Manager.Setup()) + + for i := 0; i < 3; i++ { + memoryMaxPath := filepath.Join( + mock.root, "gitaly", 
fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i), "memory.max", + ) + requireCgroupWithInt(t, memoryMaxPath, tt.wantMemoryBytes) + + cpuWeightPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i), "cpu.weight", + ) + requireCgroupWithInt(t, cpuWeightPath, calculateWantCPUWeight(tt.wantCPUWeight)) + + cpuMaxPath := filepath.Join( + mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i), "cpu.max", + ) + requireCgroupWithString(t, cpuMaxPath, tt.wantCPUMax) + } + }) + } +} + +func TestAddCommandV2(t *testing.T) { + mock := newMockV2(t) + + config := defaultCgroupsV2Config() + config.Repositories.Count = 10 + config.Repositories.MemoryBytes = 1024 + config.Repositories.CPUShares = 16 + config.Mountpoint = mock.root + + pid := 1 + + v2Manager1 := mock.newCgroupManager(config, pid) + mock.setupMockCgroupFiles(t, v2Manager1) + + require.NoError(t, v2Manager1.Setup()) + ctx := testhelper.Context(t) + + cmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, cmd2.Run()) + + v2Manager2 := mock.newCgroupManager(config, pid) + + t.Run("without overridden key", func(t *testing.T) { + _, err := v2Manager2.AddCommand(cmd2) + require.NoError(t, err) + + checksum := crc32.ChecksumIEEE([]byte(strings.Join(cmd2.Args, "/"))) + groupID := uint(checksum) % config.Repositories.Count + + path := filepath.Join(mock.root, "gitaly", + fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", groupID), "cgroup.procs") + content := readCgroupFile(t, path) + + cmdPid, err := strconv.Atoi(string(content)) + require.NoError(t, err) + + require.Equal(t, cmd2.Process.Pid, cmdPid) + }) + + t.Run("with overridden key", func(t *testing.T) { + _, err := v2Manager2.AddCommand(cmd2, WithCgroupKey("foobar")) + require.NoError(t, err) + + checksum := crc32.ChecksumIEEE([]byte("foobar")) + groupID := uint(checksum) % config.Repositories.Count + + path := filepath.Join(mock.root, "gitaly", + fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", groupID), "cgroup.procs") + content := readCgroupFile(t, path) + + cmdPid, err := strconv.Atoi(string(content)) + require.NoError(t, err) + + require.Equal(t, cmd2.Process.Pid, cmdPid) + }) +} + +func TestCleanupV2(t *testing.T) { + mock := newMockV2(t) + + pid := 1 + cfg := defaultCgroupsV2Config() + cfg.Mountpoint = mock.root + + v2Manager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, v2Manager) + + require.NoError(t, v2Manager.Setup()) + require.NoError(t, v2Manager.Cleanup()) + + for i := 0; i < 3; i++ { + require.NoDirExists(t, filepath.Join(mock.root, "gitaly", fmt.Sprintf("gitaly-%d", pid), fmt.Sprintf("repos-%d", i))) + } +} + +func TestMetricsV2(t *testing.T) { + tests := []struct { + name string + metricsEnabled bool + pid int + expect string + }{ + { + name: "metrics enabled: true", + metricsEnabled: true, + pid: 1, + expect: `# HELP gitaly_cgroup_cpu_cfs_periods_total Number of elapsed enforcement period intervals +# TYPE gitaly_cgroup_cpu_cfs_periods_total counter +gitaly_cgroup_cpu_cfs_periods_total{path="%s"} 10 +# HELP gitaly_cgroup_cpu_cfs_throttled_periods_total Number of throttled period intervals +# TYPE gitaly_cgroup_cpu_cfs_throttled_periods_total counter +gitaly_cgroup_cpu_cfs_throttled_periods_total{path="%s"} 20 +# HELP gitaly_cgroup_cpu_cfs_throttled_seconds_total Total time duration the Cgroup has been throttled +# TYPE gitaly_cgroup_cpu_cfs_throttled_seconds_total counter +gitaly_cgroup_cpu_cfs_throttled_seconds_total{path="%s"} 0.001 +# HELP 
gitaly_cgroup_cpu_usage_total CPU Usage of Cgroup +# TYPE gitaly_cgroup_cpu_usage_total gauge +gitaly_cgroup_cpu_usage_total{path="%s",type="kernel"} 0 +gitaly_cgroup_cpu_usage_total{path="%s",type="user"} 0 +# HELP gitaly_cgroup_procs_total Total number of procs +# TYPE gitaly_cgroup_procs_total gauge +gitaly_cgroup_procs_total{path="%s",subsystem="cpu"} 1 +gitaly_cgroup_procs_total{path="%s",subsystem="cpuset"} 1 +gitaly_cgroup_procs_total{path="%s",subsystem="memory"} 1 +`, + }, + { + name: "metrics enabled: false", + metricsEnabled: false, + pid: 2, + }, + } + + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + mock := newMockV2(t) + + config := defaultCgroupsV2Config() + config.Repositories.Count = 1 + config.Repositories.MemoryBytes = 1048576 + config.Repositories.CPUShares = 16 + config.Mountpoint = mock.root + config.MetricsEnabled = tt.metricsEnabled + + v2Manager1 := mock.newCgroupManager(config, tt.pid) + + mock.setupMockCgroupFiles(t, v2Manager1) + require.NoError(t, v2Manager1.Setup()) + + ctx := testhelper.Context(t) + + cmd := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, cmd.Start()) + _, err := v2Manager1.AddCommand(cmd) + require.NoError(t, err) + + gitCmd1 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd1.Start()) + _, err = v2Manager1.AddCommand(gitCmd1) + require.NoError(t, err) + + gitCmd2 := exec.CommandContext(ctx, "ls", "-hal", ".") + require.NoError(t, gitCmd2.Start()) + _, err = v2Manager1.AddCommand(gitCmd2) + require.NoError(t, err) + defer func() { + require.NoError(t, gitCmd2.Wait()) + }() + + require.NoError(t, cmd.Wait()) + require.NoError(t, gitCmd1.Wait()) + + repoCgroupPath := filepath.Join(v2Manager1.currentProcessCgroup(), "repos-0") + + expected := strings.NewReader(strings.ReplaceAll(tt.expect, "%s", repoCgroupPath)) + + assert.NoError(t, testutil.CollectAndCompare(v2Manager1, expected)) + }) + } +} + +func TestPruneOldCgroupsV2(t *testing.T) { + t.Parallel() + + testCases := []struct { + desc string + cfg cgroups.Config + expectedPruned bool + // setup returns a pid + setup func(*testing.T, cgroups.Config, *mockCgroupV2) int + }{ + { + desc: "process belongs to another user", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + pid := 1 + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: true, + }, + { + desc: "no hierarchy root", + cfg: cgroups.Config{ + HierarchyRoot: "", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + pid := 1 + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + return 1 + }, + expectedPruned: false, + }, + { + desc: "pid of finished process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + cmd := exec.Command("ls") + require.NoError(t, cmd.Run()) + pid := cmd.Process.Pid + + cgroupManager := mock.newCgroupManager(cfg, 
pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + memoryFile := filepath.Join( + cfg.Mountpoint, + cfg.HierarchyRoot, + "memory.limit_in_bytes", + ) + require.NoError(t, os.WriteFile(memoryFile, []byte{}, fs.ModeAppend)) + + return pid + }, + expectedPruned: true, + }, + { + desc: "pid of running process", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + pid := os.Getpid() + + cgroupManager := mock.newCgroupManager(cfg, pid) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + return pid + }, + expectedPruned: false, + }, + { + desc: "gitaly-0 directory is deleted", + cfg: cgroups.Config{ + HierarchyRoot: "gitaly", + Repositories: cgroups.Repositories{ + Count: 10, + MemoryBytes: 10 * 1024 * 1024, + CPUShares: 1024, + }, + }, + setup: func(t *testing.T, cfg cgroups.Config, mock *mockCgroupV2) int { + cgroupManager := mock.newCgroupManager(cfg, 0) + mock.setupMockCgroupFiles(t, cgroupManager) + require.NoError(t, cgroupManager.Setup()) + + return 0 + }, + expectedPruned: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.desc, func(t *testing.T) { + mock := newMockV2(t) + tc.cfg.Mountpoint = mock.root + + root := filepath.Join( + tc.cfg.Mountpoint, + tc.cfg.HierarchyRoot, + ) + require.NoError(t, os.MkdirAll(root, perm.PublicDir)) + + pid := tc.setup(t, tc.cfg, mock) + + logger, _ := test.NewNullLogger() + mock.pruneOldCgroups(tc.cfg, logger) + + // create cgroups directories with a different pid + oldGitalyProcessDir := filepath.Join( + root, + fmt.Sprintf("gitaly-%d", pid), + ) + + if tc.expectedPruned { + require.NoDirExists(t, oldGitalyProcessDir) + } else { + require.DirExists(t, oldGitalyProcessDir) + } + }) + } +} + +func calculateWantCPUWeight(wantCPUWeight int) int { + if wantCPUWeight == 0 { + return 0 + } + return 1 + ((wantCPUWeight-2)*9999)/262142 +} + +func requireCgroupWithString(t *testing.T, cgroupFile string, want string) { + t.Helper() + + if want == "" { + return + } + require.Equal(t, + string(readCgroupFile(t, cgroupFile)), + want, + ) +} + +func requireCgroupWithInt(t *testing.T, cgroupFile string, want int) { + t.Helper() + + if want <= 0 { + return + } + + require.Equal(t, + string(readCgroupFile(t, cgroupFile)), + strconv.Itoa(want), + ) +}
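
Usage sketch (illustrative, not part of the patch): with this refactor the public entry point is unchanged. Callers still go through cgroups.NewManager, which now selects the v1 or v2 handler internally from cgrps.Mode(), and they interact only with the Manager interface. The sketch below assumes Manager exposes Setup, AddCommand, and Cleanup as exercised by the tests above, and uses hypothetical config values; Mountpoint and HierarchyRoot must match the host's cgroup setup.

	package main

	import (
		"os"
		"os/exec"

		log "github.com/sirupsen/logrus"

		"gitlab.com/gitlab-org/gitaly/v16/internal/cgroups"
		cgroupscfg "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config/cgroups"
	)

	func main() {
		// Hypothetical values; adjust Mountpoint/HierarchyRoot to the host.
		cfg := cgroupscfg.Config{
			Mountpoint:    "/sys/fs/cgroup",
			HierarchyRoot: "gitaly",
			Repositories: cgroupscfg.Repositories{
				Count:       10,
				MemoryBytes: 1024 * 1024 * 1024,
				CPUShares:   1024,
			},
		}

		// NewManager picks the v1 or v2 handler internally via cgrps.Mode();
		// on non-Linux systems it falls back to the no-op manager.
		mgr := cgroups.NewManager(cfg, os.Getpid())
		if err := mgr.Setup(); err != nil {
			log.WithError(err).Fatal("cgroup setup")
		}
		defer func() {
			if err := mgr.Cleanup(); err != nil {
				log.WithError(err).Error("cgroup cleanup")
			}
		}()

		cmd := exec.Command("git", "--version")
		if err := cmd.Start(); err != nil {
			log.WithError(err).Fatal("start command")
		}
		// AddCommand buckets the process by CRC32 of its argv (or of an
		// override supplied with WithCgroupKey) into one of Count cgroups.
		if path, err := mgr.AddCommand(cmd); err != nil {
			log.WithError(err).Error("add command to cgroup")
		} else {
			log.WithField("cgroup_path", path).Info("command added to cgroup")
		}
		if err := cmd.Wait(); err != nil {
			log.WithError(err).Error("wait for command")
		}
	}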