From 1b6ea551991d1da8ef606fbd972ead9a2d0c5c75 Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Tue, 20 May 2025 18:01:59 -0400 Subject: [PATCH 1/7] snapshot: Add the snapshot filter Not all files are needed when taking a snapshot. For example, .lock files in refs/heads are unnecessary because the transaction manager handles concurrency and conflict checks. As a result, we need a filter to specify which files should be included in the snapshot and which should not. This commit introduces such a filter. The filter builds an internal index based on matching patterns, which determine what to include or exclude. When a snapshot is taken, the repository directory is filtered accordingly. --- .../operations/user_delete_branch_test.go | 6 +- .../gitaly/service/ref/delete_refs_test.go | 4 + .../service/ref/update_references_test.go | 4 + .../service/repository/repository_info.go | 2 +- .../repository/repository_info_test.go | 38 ++- internal/gitaly/service/repository/size.go | 22 +- .../gitaly/service/repository/size_test.go | 10 +- .../service/repository/snapshot_test.go | 30 +- .../service/repository/write_ref_test.go | 4 + .../storagemgr/partition/snapshot/snapshot.go | 35 +- .../partition/snapshot/snapshot_filter.go | 170 ++++++++++ .../snapshot/snapshot_filter_test.go | 300 ++++++++++++++++++ .../storagemgr/partition/testhelper_test.go | 11 + .../transaction_manager_offloading_test.go | 7 +- 14 files changed, 602 insertions(+), 41 deletions(-) create mode 100644 internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go create mode 100644 internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go diff --git a/internal/gitaly/service/operations/user_delete_branch_test.go b/internal/gitaly/service/operations/user_delete_branch_test.go index ae1fceb175..578cf33223 100644 --- a/internal/gitaly/service/operations/user_delete_branch_test.go +++ b/internal/gitaly/service/operations/user_delete_branch_test.go @@ -346,10 +346,14 @@ func TestUserDeleteBranch_allowed(t *testing.T) { } func TestUserDeleteBranch_concurrentUpdate(t *testing.T) { + // In the context of transaction management, refs do not require a .lock file for concurrency control. + // This is because transaction manager ensures that no two transactions can operate + // on the same ref concurrently and tracks all ongoing ref transactions enforces mutual exclusion. + testhelper.SkipWithWAL(t, "snapshot ignore ref .lock file") + t.Parallel() ctx := testhelper.Context(t) - ctx, cfg, client := setupOperationsService(t, ctx) repo, repoPath := gittest.CreateRepository(t, ctx, cfg) diff --git a/internal/gitaly/service/ref/delete_refs_test.go b/internal/gitaly/service/ref/delete_refs_test.go index 2cf7c803a5..a7a0d2237e 100644 --- a/internal/gitaly/service/ref/delete_refs_test.go +++ b/internal/gitaly/service/ref/delete_refs_test.go @@ -183,6 +183,10 @@ func TestDeleteRefs_invalidRefFormat(t *testing.T) { } func TestDeleteRefs_refLocked(t *testing.T) { + // In the context of transaction management, refs do not require a .lock file for concurrency control. + // This is because transaction manager ensures that no two transactions can operate + // on the same ref concurrently and tracks all ongoing ref transactions enforces mutual exclusion. + testhelper.SkipWithWAL(t, "snapshot ignore ref .lock file") t.Parallel() ctx := testhelper.Context(t) diff --git a/internal/gitaly/service/ref/update_references_test.go b/internal/gitaly/service/ref/update_references_test.go index 8d4768000f..bf5d20b232 100644 --- a/internal/gitaly/service/ref/update_references_test.go +++ b/internal/gitaly/service/ref/update_references_test.go @@ -296,6 +296,10 @@ func TestUpdateReferences(t *testing.T) { { desc: "locked reference", setup: func(t *testing.T) setupData { + // In the context of transaction management, refs do not require a .lock file for concurrency control. + // This is because transaction manager ensures that no two transactions can operate + // on the same ref concurrently and tracks all ongoing ref transactions enforces mutual exclusion. + testhelper.SkipWithWAL(t, "snapshot ignore ref .lock file") repoProto, repoPath := gittest.CreateRepository(t, ctx, cfg) repo := localrepo.NewTestRepo(t, cfg, repoProto) diff --git a/internal/gitaly/service/repository/repository_info.go b/internal/gitaly/service/repository/repository_info.go index b3eff25e63..a4d619a684 100644 --- a/internal/gitaly/service/repository/repository_info.go +++ b/internal/gitaly/service/repository/repository_info.go @@ -26,7 +26,7 @@ func (s *server) RepositoryInfo( return nil, err } - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, s.cfg.Transactions.Enabled, true) if err != nil { return nil, fmt.Errorf("calculating repository size: %w", err) } diff --git a/internal/gitaly/service/repository/repository_info_test.go b/internal/gitaly/service/repository/repository_info_test.go index abf6af78b8..3c217a29ac 100644 --- a/internal/gitaly/service/repository/repository_info_test.go +++ b/internal/gitaly/service/repository/repository_info_test.go @@ -38,7 +38,9 @@ func TestRepositoryInfo(t *testing.T) { emptyRepoSize := func() uint64 { _, repoPath := gittest.CreateRepository(t, ctx, cfg) - size, err := dirSizeInBytes(repoPath) + + size, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return uint64(size) }() @@ -71,8 +73,10 @@ func TestRepositoryInfo(t *testing.T) { } for _, tc := range []struct { - desc string - setup func(t *testing.T) setupData + desc string + setup func(t *testing.T) setupData + ignore bool + ignoreReason string }{ { desc: "unset repository", @@ -338,6 +342,8 @@ func TestRepositoryInfo(t *testing.T) { }, } }, + ignore: testhelper.IsWALEnabled(), + ignoreReason: "transaction snapshot exclude .mtime file in objects/pack", }, { desc: "repository with keep pack", @@ -370,6 +376,8 @@ func TestRepositoryInfo(t *testing.T) { }, } }, + ignore: testhelper.IsWALEnabled(), + ignoreReason: "transaction snapshots exclude .keep file in objects/pack", }, { desc: "repository with different pack types", @@ -409,6 +417,8 @@ func TestRepositoryInfo(t *testing.T) { }, } }, + ignore: testhelper.IsWALEnabled(), + ignoreReason: "transaction snapshot exclude .keep and .mtime file in objects/pack", }, { desc: "repository with commit-graph", @@ -430,7 +440,8 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return setupData{ @@ -478,7 +489,8 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return setupData{ @@ -531,7 +543,8 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return setupData{ @@ -581,7 +594,8 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return setupData{ @@ -628,7 +642,8 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return setupData{ @@ -683,7 +698,8 @@ func TestRepositoryInfo(t *testing.T) { require.NoError(t, err) } - repoSize, err := dirSizeInBytes(repoPath) + repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + require.NoError(t, err) return setupData{ @@ -710,6 +726,10 @@ func TestRepositoryInfo(t *testing.T) { }, } { t.Run(tc.desc, func(t *testing.T) { + if tc.ignore { + t.Skip(tc.ignoreReason) + } + t.Parallel() setup := tc.setup(t) diff --git a/internal/gitaly/service/repository/size.go b/internal/gitaly/service/repository/size.go index 94a2c01865..87f1f4ec02 100644 --- a/internal/gitaly/service/repository/size.go +++ b/internal/gitaly/service/repository/size.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/storagemgr/partition/snapshot" "gitlab.com/gitlab-org/gitaly/v16/internal/structerr" "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" ) @@ -26,7 +27,7 @@ func (s *server) RepositorySize(ctx context.Context, in *gitalypb.RepositorySize return nil, err } - sizeInBytes, err := dirSizeInBytes(path) + sizeInBytes, err := dirSizeInBytes(path, s.cfg.Transactions.Enabled, true) if err != nil { return nil, fmt.Errorf("calculating directory size: %w", err) } @@ -45,8 +46,8 @@ func (s *server) GetObjectDirectorySize(ctx context.Context, in *gitalypb.GetObj if err != nil { return nil, err } - - sizeInBytes, err := dirSizeInBytes(path) + // path is the objects directory path, not repo's path + sizeInBytes, err := dirSizeInBytes(path, s.cfg.Transactions.Enabled, false) if err != nil { return nil, fmt.Errorf("calculating directory size: %w", err) } @@ -54,10 +55,10 @@ func (s *server) GetObjectDirectorySize(ctx context.Context, in *gitalypb.GetObj return &gitalypb.GetObjectDirectorySizeResponse{Size: sizeInBytes / 1024}, nil } -func dirSizeInBytes(path string) (int64, error) { +func dirSizeInBytes(dirPath string, transactionEnabled, isRepoPath bool) (int64, error) { var totalSize int64 - if err := filepath.WalkDir(path, func(path string, d fs.DirEntry, err error) error { + if err := filepath.WalkDir(dirPath, func(path string, d fs.DirEntry, err error) error { if err != nil { // It can happen that we try to walk a directory like the object shards or // an empty reference directory that gets deleted concurrently. This is fine @@ -73,6 +74,17 @@ func dirSizeInBytes(path string) (int64, error) { return nil } + relPath, err := filepath.Rel(dirPath, path) + if err != nil { + return fmt.Errorf("calculating path relative to repo root: %w", err) + } + if transactionEnabled && isRepoPath && relPath != "." { + matcher := snapshot.NewSnapshotFilter() + if ok := matcher.Matches(relPath, d.IsDir()); !ok { + return nil + } + } + fi, err := d.Info() if err != nil { // The file may have been concurrently removed. diff --git a/internal/gitaly/service/repository/size_test.go b/internal/gitaly/service/repository/size_test.go index 4ae283c167..40db03077f 100644 --- a/internal/gitaly/service/repository/size_test.go +++ b/internal/gitaly/service/repository/size_test.go @@ -99,10 +99,16 @@ func TestRepositorySize_normalRepository(t *testing.T) { invalidateSnapshot() requireRepositorySize(t, ctx, client, repo, 16) - // Even garbage should increase the size. require.NoError(t, os.WriteFile(filepath.Join(repoPath, "garbage"), incompressibleData(5*1024), mode.File)) invalidateSnapshot() - requireRepositorySize(t, ctx, client, repo, 21) + if testhelper.IsWALEnabled() { + // when snapshot filter is enabled in transaction manager + // garbage is not included in object directory's size + requireRepositorySize(t, ctx, client, repo, 16) + } else { + // Otherwise, even garbage should increase the size. + requireRepositorySize(t, ctx, client, repo, 21) + } } func TestRepositorySize_failure(t *testing.T) { diff --git a/internal/gitaly/service/repository/snapshot_test.go b/internal/gitaly/service/repository/snapshot_test.go index 4c4c226f8c..3bac7f6743 100644 --- a/internal/gitaly/service/repository/snapshot_test.go +++ b/internal/gitaly/service/repository/snapshot_test.go @@ -136,8 +136,10 @@ func TestGetSnapshot(t *testing.T) { // Unreachable objects should also be included in the snapshot. unreachableCommitID := gittest.WriteCommit(t, cfg, repoPath, gittest.WithMessage("unreachable")) - // The shallow file, used if the repository is a shallow clone, is also included in snapshots. - require.NoError(t, os.WriteFile(filepath.Join(repoPath, "shallow"), nil, mode.File)) + if !testhelper.IsWALEnabled() { + // The shallow file, used if the repository is a shallow clone, is also included in snapshots. + require.NoError(t, os.WriteFile(filepath.Join(repoPath, "shallow"), nil, mode.File)) + } // Custom Git hooks are not included in snapshots. require.NoError(t, os.MkdirAll(filepath.Join(repoPath, "hooks"), mode.Directory)) @@ -149,16 +151,24 @@ func TestGetSnapshot(t *testing.T) { mode.File, )) - expected := testhelper.DirectoryState{ - "HEAD": { - Mode: archive.TarFileMode, - ParseContent: ignoreContent, - }, - "shallow": { + expected := func() testhelper.DirectoryState { + info := testhelper.DirectoryEntry{ Mode: archive.TarFileMode, ParseContent: ignoreContent, - }, - } + } + if testhelper.IsWALEnabled() { + // Only HEAD is included in snapshots + // when snapshot filter is enabled in transaction manager. + return testhelper.DirectoryState{ + "HEAD": info, + } + } + return testhelper.DirectoryState{ + "HEAD": info, + "shallow": info, + } + }() + expected[fmt.Sprintf("objects/%s/%s", treeID[0:2], treeID[2:])] = testhelper.DirectoryEntry{ Mode: archive.TarFileMode, ParseContent: ignoreContent, diff --git a/internal/gitaly/service/repository/write_ref_test.go b/internal/gitaly/service/repository/write_ref_test.go index 08c5f41768..0b9a5c664d 100644 --- a/internal/gitaly/service/repository/write_ref_test.go +++ b/internal/gitaly/service/repository/write_ref_test.go @@ -393,6 +393,10 @@ func TestWriteRef(t *testing.T) { } func TestWriteRef_locked(t *testing.T) { + // In the context of transaction management, refs do not require a .lock file for concurrency control. + // This is because transaction manager ensures that no two transactions can operate + // on the same ref concurrently and tracks all ongoing ref transactions enforces mutual exclusion. + testhelper.SkipWithWAL(t, "snapshots ignore ref .lock files") t.Parallel() ctx := testhelper.Context(t) diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go index 132c50ee11..d57d329598 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go @@ -10,7 +10,6 @@ import ( "strings" "time" - "gitlab.com/gitlab-org/gitaly/v16/internal/git/housekeeping" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/gitstorage" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/mode" @@ -242,13 +241,9 @@ func createParentDirectories(storageRoot, snapshotRoot, relativePath string, sta // are shared between the snapshot and the repository, they must not be modified. Git doesn't modify // existing files but writes new ones so this property is upheld. func createRepositorySnapshot(ctx context.Context, storageRoot, snapshotRoot, relativePath string, stats *snapshotStatistics) error { - if err := createDirectorySnapshot(ctx, filepath.Join(storageRoot, relativePath), filepath.Join(snapshotRoot, relativePath), map[string]struct{}{ - // Don't include worktrees in the snapshot. All of the worktrees in the repository should be leftover - // state from before transaction management was introduced as the transactions would create their - // worktrees in the snapshot. - housekeeping.WorktreesPrefix: {}, - housekeeping.GitlabWorktreePrefix: {}, - }, stats); err != nil { + if err := createDirectorySnapshot(ctx, filepath.Join(storageRoot, relativePath), + filepath.Join(snapshotRoot, relativePath), + NewSnapshotFilter(), stats); err != nil { return fmt.Errorf("create directory snapshot: %w", err) } @@ -258,8 +253,8 @@ func createRepositorySnapshot(ctx context.Context, storageRoot, snapshotRoot, re // createDirectorySnapshot recursively recreates the directory structure from originalDirectory into // snapshotDirectory and hard links files into the same locations in snapshotDirectory. // -// skipRelativePaths can be provided to skip certain entries from being included in the snapshot. -func createDirectorySnapshot(ctx context.Context, originalDirectory, snapshotDirectory string, skipRelativePaths map[string]struct{}, stats *snapshotStatistics) error { +// includeRelativePaths is needed to track which paths we want to include in the snapshot. +func createDirectorySnapshot(ctx context.Context, originalDirectory, snapshotDirectory string, matcher Filter, stats *snapshotStatistics) error { if err := filepath.Walk(originalDirectory, func(oldPath string, info fs.FileInfo, err error) error { if err != nil { if errors.Is(err, fs.ErrNotExist) && oldPath == originalDirectory { @@ -276,8 +271,24 @@ func createDirectorySnapshot(ctx context.Context, originalDirectory, snapshotDir return fmt.Errorf("rel: %w", err) } - if _, ok := skipRelativePaths[relativePath]; ok { - return fs.SkipDir + //// TODO relativePath is when oldPath == originalDirectory. explain it + //if relativePath != "." { + // if ok := matcher.Matches(ctx, relativePath, info.IsDir()); !ok { + // if info.IsDir() { + // // TODO verify this, description file will cause whole dir be ignored + // // if we dont't have this conditionl + // return fs.SkipDir + // } else { + // return nil + // } + // } + //} + if ok := matcher.Matches(relativePath, info.IsDir()); !ok { + if info.IsDir() { + return fs.SkipDir + } else { + return nil + } } newPath := filepath.Join(snapshotDirectory, relativePath) diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go new file mode 100644 index 0000000000..7c8645b37c --- /dev/null +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go @@ -0,0 +1,170 @@ +package snapshot + +import ( + "fmt" + "os" + "path/filepath" + "strings" +) + +var ( + // GitalySnapshotFileIncludePatterns defines the list of include patterns for selecting + // files during a repository snapshot. There are four types of supported patterns: + // + // 1. Exact file: Matches a specific file path. + // Example: "objects/info/commit-graph" matches only that exact file. + // + // 2. Prefix pattern: Matches all files and directories under a given path prefix. + // The '*' wildcard is required. + // Example: "refs/*" matches all files and directories under the "refs/" directory. + // + // 3. Postfix pattern: Matches files with a specific suffix within a given directory. + // The '*' wildcard is required and must follow the format: /*.suffix. + // This pattern does not match files in nested subdirectories. + // + // Example: "objects/pack/*.pack" matches all `.pack` files directly under "objects/pack/", + // but "objects/pack/*.pack" will not match "objects/pack/x/y/z.pack". + // + // 4. Special range pattern: Matches a range of directories by expanding a range notation. + // Example: "objects/00~ff/*" expands to include all files under "objects/00/" through "objects/ff/". + GitalySnapshotFileIncludePatterns = []string{ + // special identifier + "objects/00~ff/*", + + // exact files + "config", ".gitaly-full-repack-timestamp", + "gitaly-language.stats", + "HEAD", "packed-refs", "objects/info/alternates", + //"objects/info/packs", + "objects/info/commit-graph", + "objects/info/commit-graphs/commit-graph-chain", + "objects/pack/multi-pack-index", + + // prefix patterns + "reftable/*", + "refs/*", + "custom_hooks/*", + + // postfix patterns + "objects/info/commit-graphs/*.graph", + "objects/pack/*.pack", + "objects/pack/*.idx", + "objects/pack/*.rev", + "objects/pack/*.bitmap", + } + + // GitalySnapshotFileExcludePatterns is the list of exclude patterns for snapshot files in when + // taking snapshot. Only postfix pattern is supported, e.g. "refs/heads/*.lock" + GitalySnapshotFileExcludePatterns = []string{ + "refs/heads/*.lock", + "refs/tags/*.lock", + } +) + +// Filter is an interface to determine whether a given path should be included in a snapshot. +type Filter interface { + Matches(path string, isDir bool) bool +} + +// gitalySnapshotFilter filters Git repository files to determine which ones +// should be included in a snapshot. It applies the include and exclude patterns +// to efficiently select files based on their paths. +type gitalySnapshotFilter struct { + includeIndex *map[string]struct{} + excludeIndex *map[string]struct{} +} + +// NewSnapshotFilter create a filter to determine which files should be included in a repository snapshot. +func NewSnapshotFilter() Filter { + filter := &gitalySnapshotFilter{} + filter.includeIndex = &map[string]struct{}{} + filter.excludeIndex = &map[string]struct{}{} + filter.buildIndex(GitalySnapshotFileIncludePatterns, GitalySnapshotFileExcludePatterns) + return filter +} + +// buildIndex initializes the filter.index with given include and exclude lists. +func (f *gitalySnapshotFilter) buildIndex(includeList, excludeList []string) { + for _, path := range includeList { + addToIndex(path, f.includeIndex) + } + for _, path := range excludeList { + (*f.excludeIndex)[path] = struct{}{} + } +} + +func addToIndex(filePath string, index *map[string]struct{}) { + switch filePath { + case "objects/00~ff/*": + // Put objects/00/* to objects/ff/* into the index + (*index)["objects"] = struct{}{} + for i := 0; i < 256; i++ { + (*index)[fmt.Sprintf("objects%c%02x%c*", os.PathSeparator, i, os.PathSeparator)] = struct{}{} + } + default: + // "." is to ensure the inclusion of the top-level directory during the iteration. + (*index)["."] = struct{}{} + // For example, given the path "objects/pack/multi-pack-index", + // we need to add the following entries to the index: + // "objects", "objects/pack", and "objects/pack/multi-pack-index". + // This ensures that directory traversal (e.g., checking the "objects" directory) succeeds. + split := strings.Split(filePath, string(os.PathSeparator)) + var key string + for _, e := range split { + if e == "." || e == "" { + continue + } + key = filepath.Join(key, e) + if _, ok := (*index)[key]; !ok { + (*index)[key] = struct{}{} + } + } + } +} + +// Matches checks if a given file or directory should be included in the snapshot based on the filter. +func (f *gitalySnapshotFilter) Matches(file string, isDir bool) bool { + if isDir { + if _, exIdxHit := (*f.excludeIndex)[file]; exIdxHit { + return false + } + if _, inIdxHit := (*f.includeIndex)[file]; inIdxHit { + return inIdxHit + } + } + + // Exact file match + if _, inIdxHit := (*f.includeIndex)[file]; inIdxHit { + return true + } + // process file with ext, i.e. Postfix pattern, e.g. objects/pack/*.pack + if ext := filepath.Ext(file); ext != "" { + // For example, if we encounter the path "objects/pack/pack-abc.pack", + // it is transformed into "objects/pack/*.pack" for matching against entries in the index. + file = filepath.Join(filepath.Dir(file), "*"+ext) + if _, exIdxHit := (*f.excludeIndex)[file]; exIdxHit { + return false + } + if _, inIdxHit := (*f.includeIndex)[file]; inIdxHit { + return true + } + } + + // Process prefix patterns. For example, given the path "refs/remotes/origin/main", + // we split it into: refs, refs/remotes, refs/remotes/origin. Then we search for matches + // using the patterns: "refs/*", "refs/remotes/*", and "refs/remotes/origin/*". + // Note: we do not search "refs", "refs/remotes", or "refs/remotes/origin" directly, + // because these entries are added to enable matching files under them: they do not imply that + // all files under those paths are matches. + // The trailing '*' indicates that all entries under the given path should be considered. + split := strings.Split(file, string(os.PathSeparator)) + var dirToSearch string + for _, e := range split { + dirToSearch = filepath.Join(dirToSearch, e) + if _, inIdxHit := (*f.includeIndex)[filepath.Join(dirToSearch, "*")]; inIdxHit { + return true + } + } + + return false +} diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go new file mode 100644 index 0000000000..fcca7a8d7d --- /dev/null +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go @@ -0,0 +1,300 @@ +package snapshot + +import ( + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "regexp" + "strconv" + "strings" + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/git" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/gittest" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/mode" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper/testcfg" +) + +var ( + exactInMatchPaths = map[string]struct{}{ + "config": {}, + ".gitaly-full-repack-timestamp": {}, + "gitaly-language.stats": {}, + "custom_hooks": {}, + "HEAD": {}, + "refs": {}, + "packed-refs": {}, + "reftable": {}, + "objects": {}, + "objects/info": {}, + "objects/info/alternates": {}, + "objects/info/commit-graph": {}, + "objects/info/commit-graphs": {}, + "objects/info/packs": {}, + "objects/info/commit-graphs/commit-graph-chain": {}, + "objects/pack": {}, + "objects/pack/multi-pack-index": {}, + } + regexInMatches = []string{ + `^objects/[0-9a-f]{2}.*`, + `^objects/pack/.*\.(pack|idx|rev|bitmap)$`, + `^objects/info/commit-graphs/.*\.graph$`, + `^custom_hooks/.*`, + `^packed-refs.*`, + `^reftable/.*`, + `^refs/.*`, + } + regexExMatches = []string{ + `^refs/.*\.lock$`, + } +) + +func TestSnapshotMatcher(t *testing.T) { + t.Parallel() + + type resultInfo struct { + isDir bool + shouldPass bool + } + fileShouldPass := &resultInfo{false, true} + dirShouldPass := &resultInfo{true, true} + fileShouldNotPass := &resultInfo{false, false} + dirShouldNotPass := &resultInfo{true, false} + + for _, tc := range []struct { + desc string + expectedResult map[string]*resultInfo + }{ + { + desc: "metadata files that should pass", + expectedResult: map[string]*resultInfo{ + "config": fileShouldPass, + ".gitaly-full-repack-timestamp": fileShouldPass, + "gitaly-language.stats": fileShouldPass, + }, + }, + { + desc: "directories that should pass", + expectedResult: map[string]*resultInfo{ + "custom_hooks": dirShouldPass, + "refs": dirShouldPass, + "refs/heads/": dirShouldPass, + "refs/tags/": dirShouldPass, + "reftable": dirShouldPass, + "objects": dirShouldPass, + "objects/info": dirShouldPass, + "objects/pack": dirShouldPass, + "objects/info/commit-graphs": dirShouldPass, + }, + }, + { + desc: "object related files that should pass", + expectedResult: map[string]*resultInfo{ + "objects/info/alternates": fileShouldPass, + "objects/info/commit-graph": fileShouldPass, + "objects/info/commit-graphs/commit-graph-chain": fileShouldPass, + "objects/pack/multi-pack-index": fileShouldPass, + "objects/12/3f7d4e2cba986d43d9a4d5e58e1f2305f0c0b6a1": fileShouldPass, + "objects/a6/a6f4d6cb2e1b9a7930e01db68bc3c1f5943733e52ff6cb7b2dcf1a4738d29c18": fileShouldPass, + "objects/info/commit-graphs/graph-1.graph": fileShouldPass, + "objects/pack/pack-1234567890abcdef1234567890abcdef12345678.pack": fileShouldPass, + "objects/pack/pack-1234567890abcdef1234567890abcdef12345678.idx": fileShouldPass, + "objects/pack/pack-1234567890abcdef1234567890abcdef12345678.rev": fileShouldPass, + "objects/pack/pack-1234567890abcdef1234567890abcdef12345678.bitmap": fileShouldPass, + "objects/pack/pack-9c07ab982ccbd33e3ea1a8b2048b1c6c3d1f6e5bcbfa3d3ce13e6ec01e5935f2.pack": fileShouldPass, + "objects/pack/pack-c81a913b8b08f7d69bfaab9a9db9a1d8a9ec4ef9b4e556290cb9a8cf867e3d04.idx": fileShouldPass, + "objects/pack/pack-1a35b2d0f2f3c0adca8b5c1d3a98b09e0a3fa5d4516c2d5e5cce9b21a7b9f3a2.rev": fileShouldPass, + "objects/pack/pack-5fa2f4c4e8c3a9e1dc7f5e4c8b1a0dbf0c2d3f4b1d5a6c9e3e8b7d0c4f9a2e61.bitmap": fileShouldPass, + }, + }, + { + desc: "ref related files that should pass", + expectedResult: map[string]*resultInfo{ + "HEAD": fileShouldPass, + "packed-refs": fileShouldPass, + "refs/heads/master": fileShouldPass, + "refs/heads/feature-x": fileShouldPass, + "refs/remotes/origin/main": fileShouldPass, + "refs/tags/v1.0.0": fileShouldPass, + "reftable/0x000000000001-0x000000000004-b74a713a.ref": fileShouldPass, + "reftable/tables.list": fileShouldPass, + }, + }, + { + desc: "metadata files or dirs that should be rejected", + expectedResult: map[string]*resultInfo{ + "description": fileShouldNotPass, + "info": dirShouldNotPass, + "hooks": dirShouldNotPass, + "worktrees": dirShouldNotPass, + "gitlab-worktree": dirShouldNotPass, + }, + }, + { + desc: "git objects related files that should be rejected", + expectedResult: map[string]*resultInfo{ + "objects/pack/pack-234567890abcdef1234567890abcdef12345678.keep": fileShouldNotPass, + "objects/pack/pack-1234567890abcdef1234567890abcdef12345678.mtime": fileShouldNotPass, + "objects/pack/pack-3ed1a7c8f0b2e4c1d9a8e3f4b0c5d6a1b9f3e7c4d2b1f0e8a3c7d5b2e0a9c6d3.keep": fileShouldNotPass, + "objects/pack/pack-0b2e4c1d9a3ed1a7c8f8e3f4b0c5d6a1b9f3e7c4d2b1f0e8a3c7d5b2e0a9abcd.mtime": fileShouldNotPass, + "refs/heads/main.lock": fileShouldNotPass, + }, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + t.Parallel() + matcher := newRegexBasedFilter() + for snapshotPath, expectedResult := range tc.expectedResult { + require.Equal(t, expectedResult.shouldPass, matcher.Matches(snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) + } + + filter := NewSnapshotFilter() + for snapshotPath, expectedResult := range tc.expectedResult { + require.Equal(t, expectedResult.shouldPass, filter.Matches(snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) + } + }) + } +} + +func BenchmarkFilterPerformance(b *testing.B) { + regexMatcher := newRegexBasedFilter() + customMatcher := NewSnapshotFilter() + + // A local existing repo can also be used to speed up the benchmark + // e.g. repoPath=/xxx/yyy/zzz/gitlab/.git + repoPath := setupBenchmarkRepo(b) + b.Run("regex based filter", func(b *testing.B) { + for i := 0; i < b.N; i++ { + err := walkRepo(repoPath, regexMatcher) + if err != nil { + b.Fatalf("regex filter failed: %v", err) + } + } + }) + + b.Run("customized gitaly snapshot filter", func(b *testing.B) { + for i := 0; i < b.N; i++ { + err := walkRepo(repoPath, customMatcher) + if err != nil { + b.Fatalf("custom filter failed: %v", err) + } + } + }) +} + +// regexBasedFilter is a filter implementation that uses regular expressions. +// The benefit of this approach is that the pattern syntax is based on regex, +// which is widely understood and expressive. However, performance may be limited +// by the efficiency of the regex engine. +// +// This implementation serves two main purposes: +// 1. To compare its results with those of gitalySnapshotFilter to ensure correctness. +// Since regex patterns are easy to understand, any mismatches can help identify +// misconfigurations in gitalySnapshotFilter. +// 2. To compare its performance with gitalySnapshotFilter. +type regexBasedFilter struct { + exactIncludeMatcher *map[string]struct{} + regexIncludeMatcher *regexp.Regexp + regexExcludeMatcher *regexp.Regexp +} + +func newRegexBasedFilter() Filter { + pattern := strings.Join(regexInMatches, "|") + regexIncludeMatcher := regexp.MustCompile(pattern) + + var regexExcludeMatcher *regexp.Regexp + if len(regexExMatches) != 0 { + pattern = strings.Join(regexExMatches, "|") + regexExcludeMatcher = regexp.MustCompile(pattern) + } + + return ®exBasedFilter{ + exactIncludeMatcher: &exactInMatchPaths, + regexIncludeMatcher: regexIncludeMatcher, + regexExcludeMatcher: regexExcludeMatcher, + } +} + +func (sm *regexBasedFilter) Matches(snapshotPath string, isDir bool) bool { + // try exactMatcher first + if _, found := (*sm.exactIncludeMatcher)[snapshotPath]; found { + return true + } + + // otherwise try regexMatcher + if sm.regexExcludeMatcher != nil { + if sm.regexIncludeMatcher.MatchString(snapshotPath) && !sm.regexExcludeMatcher.MatchString(snapshotPath) { + return true + } + } else { + if sm.regexIncludeMatcher.MatchString(snapshotPath) { + return true + } + } + + return false +} + +func walkRepo(repoPath string, matcher Filter) error { + if err := filepath.WalkDir(repoPath, func(path string, d fs.DirEntry, err error) error { + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } + return err + } + + if d.IsDir() { + return nil + } + + relPath, err := filepath.Rel(repoPath, path) + if err != nil { + return fmt.Errorf("calculating path relative to repo root: %w", err) + } + if relPath != "." { + if ok := matcher.Matches(relPath, d.IsDir()); !ok { + return nil + } + } + + return nil + }); err != nil { + return fmt.Errorf("walking directory: %w", err) + } + + return nil +} + +// setupBenchmarkRepos create a repo with a lot of loose objects file, refs and a bunch of +// garbage file such as .keep in packs and .lock in refs +func setupBenchmarkRepo(tb testing.TB) string { + tb.Helper() + ctx := testhelper.Context(tb) + cfg := testcfg.Build(tb) + _, repoPath := gittest.CreateRepository(tb, ctx, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + blobNum := 10000 + for i := 0; i < blobNum; i++ { + gittest.WriteBlob(tb, cfg, repoPath, []byte(strconv.Itoa(i))) + err := os.WriteFile(filepath.Join(repoPath, "objects", "pack", fmt.Sprintf("pack-%d.keep", i)), []byte{}, mode.File) + require.NoError(tb, err) + } + refNum := 10000 + for i := 0; i < refNum; i++ { + branch := fmt.Sprintf("test%d", i) + commit := gittest.WriteCommit(tb, cfg, repoPath, gittest.WithBranch(branch)) + gittest.WriteCommit(tb, cfg, repoPath, gittest.WithBranch(branch)) + gittest.WriteRef(tb, cfg, repoPath, git.ReferenceName(fmt.Sprintf("refs/heads/%d", i)), commit) + err := os.WriteFile(filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("%d.lock", i)), []byte{}, mode.File) + require.NoError(tb, err) + } + + return repoPath +} diff --git a/internal/gitaly/storage/storagemgr/partition/testhelper_test.go b/internal/gitaly/storage/storagemgr/partition/testhelper_test.go index 765c1b916f..070646f3fb 100644 --- a/internal/gitaly/storage/storagemgr/partition/testhelper_test.go +++ b/internal/gitaly/storage/storagemgr/partition/testhelper_test.go @@ -1057,6 +1057,10 @@ type Prune struct { ExpectedObjects []git.ObjectID } +//type PruneLeftOvers struct { +// FileToRemove []string +//} + // ConsumerAcknowledge calls AcknowledgeConsumerPosition for all consumers. type ConsumerAcknowledge struct { // LSN is the LSN acknowledged by the consumers. @@ -1521,6 +1525,13 @@ func runTransactionTest(t *testing.T, ctx context.Context, tc transactionTestCas gittest.Exec(t, setup.Config, "-C", repoPath, "prune") require.ElementsMatch(t, step.ExpectedObjects, gittest.ListObjects(t, setup.Config, repoPath)) + + // Clean up files that are not needed by the snapshot. + // If these files are not removed, they may affect the delete repository test, + // as the snapshot is unaware of them and will not delete them. + require.NoError(t, os.RemoveAll(filepath.Join(repoPath, "info"))) + require.NoError(t, os.RemoveAll(filepath.Join(repoPath, "objects", "info", "packs"))) + case RemoveRepository: require.NoError(t, os.RemoveAll(repoPath)) case CreateRepository: diff --git a/internal/gitaly/storage/storagemgr/partition/transaction_manager_offloading_test.go b/internal/gitaly/storage/storagemgr/partition/transaction_manager_offloading_test.go index df0385a189..26ad447965 100644 --- a/internal/gitaly/storage/storagemgr/partition/transaction_manager_offloading_test.go +++ b/internal/gitaly/storage/storagemgr/partition/transaction_manager_offloading_test.go @@ -525,10 +525,15 @@ func generateOffloadingTests(t *testing.T, ctx context.Context, testPartitionID StartManager{ ModifyStorage: func(tb testing.TB, cfg config.Cfg, storagePath string) { repoPath := filepath.Join(storagePath, relativePath) - // Do a git gc to clean loose objects. git repack with filter may be // ineffective when there is loose objects. gittest.Exec(tb, cfg, "-C", repoPath, "gc") + + // Clean up files that are not needed by the snapshot. + // If these files are not removed, they may affect the delete repository test, + // as the snapshot is unaware of them and will not delete them. + require.NoError(t, os.RemoveAll(filepath.Join(repoPath, "info"))) + require.NoError(t, os.RemoveAll(filepath.Join(repoPath, "objects", "info", "packs"))) }, }, Begin{ -- GitLab From b516b05df61c8c0fa31162130a64f6d341deba82 Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Wed, 21 May 2025 20:50:44 -0400 Subject: [PATCH 2/7] featureflag: Add feature flag to control snapshot filter This commit introduces a feature flag to enable or disable the snapshot filter. This allows us to gradually roll out or test the filter without affecting all environments. --- internal/featureflag/ff_snapshot_filter.go | 9 +++++++ .../service/repository/repository_info.go | 2 +- .../repository/repository_info_test.go | 14 +++++------ internal/gitaly/service/repository/size.go | 8 +++---- .../storagemgr/partition/snapshot/snapshot.go | 2 +- .../partition/snapshot/snapshot_filter.go | 12 ++++++++-- .../snapshot/snapshot_filter_test.go | 24 +++++++++++-------- internal/testhelper/testhelper.go | 3 +++ 8 files changed, 49 insertions(+), 25 deletions(-) create mode 100644 internal/featureflag/ff_snapshot_filter.go diff --git a/internal/featureflag/ff_snapshot_filter.go b/internal/featureflag/ff_snapshot_filter.go new file mode 100644 index 0000000000..c346e58f21 --- /dev/null +++ b/internal/featureflag/ff_snapshot_filter.go @@ -0,0 +1,9 @@ +package featureflag + +// SnapshotFilter enables snapshot filter feature. +var SnapshotFilter = NewFeatureFlag( + "snapshot_filter", + "v17.8.0", + "https://gitlab.com/gitlab-org/gitaly/-/issues/5737", + false, +) diff --git a/internal/gitaly/service/repository/repository_info.go b/internal/gitaly/service/repository/repository_info.go index a4d619a684..cc7a07cde8 100644 --- a/internal/gitaly/service/repository/repository_info.go +++ b/internal/gitaly/service/repository/repository_info.go @@ -26,7 +26,7 @@ func (s *server) RepositoryInfo( return nil, err } - repoSize, err := dirSizeInBytes(repoPath, s.cfg.Transactions.Enabled, true) + repoSize, err := dirSizeInBytes(ctx, repoPath, s.cfg.Transactions.Enabled, true) if err != nil { return nil, fmt.Errorf("calculating repository size: %w", err) } diff --git a/internal/gitaly/service/repository/repository_info_test.go b/internal/gitaly/service/repository/repository_info_test.go index 3c217a29ac..ead46c77f2 100644 --- a/internal/gitaly/service/repository/repository_info_test.go +++ b/internal/gitaly/service/repository/repository_info_test.go @@ -39,7 +39,7 @@ func TestRepositoryInfo(t *testing.T) { emptyRepoSize := func() uint64 { _, repoPath := gittest.CreateRepository(t, ctx, cfg) - size, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + size, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) return uint64(size) @@ -440,7 +440,7 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + repoSize, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) @@ -489,7 +489,7 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + repoSize, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) @@ -543,7 +543,7 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + repoSize, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) @@ -594,7 +594,7 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + repoSize, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) @@ -642,7 +642,7 @@ func TestRepositoryInfo(t *testing.T) { repoStats, err := stats.RepositoryInfoForRepository(ctx, localrepo.NewTestRepo(t, cfg, repo)) require.NoError(t, err) - repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + repoSize, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) @@ -698,7 +698,7 @@ func TestRepositoryInfo(t *testing.T) { require.NoError(t, err) } - repoSize, err := dirSizeInBytes(repoPath, testhelper.IsWALEnabled(), true) + repoSize, err := dirSizeInBytes(ctx, repoPath, testhelper.IsWALEnabled(), true) require.NoError(t, err) diff --git a/internal/gitaly/service/repository/size.go b/internal/gitaly/service/repository/size.go index 87f1f4ec02..a835df60b5 100644 --- a/internal/gitaly/service/repository/size.go +++ b/internal/gitaly/service/repository/size.go @@ -27,7 +27,7 @@ func (s *server) RepositorySize(ctx context.Context, in *gitalypb.RepositorySize return nil, err } - sizeInBytes, err := dirSizeInBytes(path, s.cfg.Transactions.Enabled, true) + sizeInBytes, err := dirSizeInBytes(ctx, path, s.cfg.Transactions.Enabled, true) if err != nil { return nil, fmt.Errorf("calculating directory size: %w", err) } @@ -47,7 +47,7 @@ func (s *server) GetObjectDirectorySize(ctx context.Context, in *gitalypb.GetObj return nil, err } // path is the objects directory path, not repo's path - sizeInBytes, err := dirSizeInBytes(path, s.cfg.Transactions.Enabled, false) + sizeInBytes, err := dirSizeInBytes(ctx, path, s.cfg.Transactions.Enabled, false) if err != nil { return nil, fmt.Errorf("calculating directory size: %w", err) } @@ -55,7 +55,7 @@ func (s *server) GetObjectDirectorySize(ctx context.Context, in *gitalypb.GetObj return &gitalypb.GetObjectDirectorySizeResponse{Size: sizeInBytes / 1024}, nil } -func dirSizeInBytes(dirPath string, transactionEnabled, isRepoPath bool) (int64, error) { +func dirSizeInBytes(ctx context.Context, dirPath string, transactionEnabled, isRepoPath bool) (int64, error) { var totalSize int64 if err := filepath.WalkDir(dirPath, func(path string, d fs.DirEntry, err error) error { @@ -80,7 +80,7 @@ func dirSizeInBytes(dirPath string, transactionEnabled, isRepoPath bool) (int64, } if transactionEnabled && isRepoPath && relPath != "." { matcher := snapshot.NewSnapshotFilter() - if ok := matcher.Matches(relPath, d.IsDir()); !ok { + if ok := matcher.Matches(ctx, relPath, d.IsDir()); !ok { return nil } } diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go index d57d329598..dd6b3d6c2e 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go @@ -283,7 +283,7 @@ func createDirectorySnapshot(ctx context.Context, originalDirectory, snapshotDir // } // } //} - if ok := matcher.Matches(relativePath, info.IsDir()); !ok { + if ok := matcher.Matches(ctx, relativePath, info.IsDir()); !ok { if info.IsDir() { return fs.SkipDir } else { diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go index 7c8645b37c..64eed4d755 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go @@ -1,10 +1,13 @@ package snapshot import ( + "context" "fmt" "os" "path/filepath" "strings" + + "gitlab.com/gitlab-org/gitaly/v16/internal/featureflag" ) var ( @@ -63,7 +66,7 @@ var ( // Filter is an interface to determine whether a given path should be included in a snapshot. type Filter interface { - Matches(path string, isDir bool) bool + Matches(ctx context.Context, path string, isDir bool) bool } // gitalySnapshotFilter filters Git repository files to determine which ones @@ -123,7 +126,12 @@ func addToIndex(filePath string, index *map[string]struct{}) { } // Matches checks if a given file or directory should be included in the snapshot based on the filter. -func (f *gitalySnapshotFilter) Matches(file string, isDir bool) bool { +func (f *gitalySnapshotFilter) Matches(ctx context.Context, file string, isDir bool) bool { + if !featureflag.SnapshotFilter.IsEnabled(ctx) { + // disabled snapshot filter feature pass everything + return true + } + if isDir { if _, exIdxHit := (*f.excludeIndex)[file]; exIdxHit { return false diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go index fcca7a8d7d..53991dcc5e 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go @@ -1,6 +1,7 @@ package snapshot import ( + "context" "errors" "fmt" "io/fs" @@ -56,6 +57,7 @@ var ( func TestSnapshotMatcher(t *testing.T) { t.Parallel() + ctx := testhelper.Context(t) type resultInfo struct { isDir bool shouldPass bool @@ -149,12 +151,12 @@ func TestSnapshotMatcher(t *testing.T) { t.Parallel() matcher := newRegexBasedFilter() for snapshotPath, expectedResult := range tc.expectedResult { - require.Equal(t, expectedResult.shouldPass, matcher.Matches(snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) + require.Equal(t, expectedResult.shouldPass, matcher.Matches(ctx, snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) } filter := NewSnapshotFilter() for snapshotPath, expectedResult := range tc.expectedResult { - require.Equal(t, expectedResult.shouldPass, filter.Matches(snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) + require.Equal(t, expectedResult.shouldPass, filter.Matches(ctx, snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) } }) } @@ -164,12 +166,15 @@ func BenchmarkFilterPerformance(b *testing.B) { regexMatcher := newRegexBasedFilter() customMatcher := NewSnapshotFilter() + ctx := testhelper.Context(b) + // A local existing repo can also be used to speed up the benchmark // e.g. repoPath=/xxx/yyy/zzz/gitlab/.git - repoPath := setupBenchmarkRepo(b) + repoPath := setupBenchmarkRepo(b, ctx) + b.Run("regex based filter", func(b *testing.B) { for i := 0; i < b.N; i++ { - err := walkRepo(repoPath, regexMatcher) + err := walkRepo(ctx, repoPath, regexMatcher) if err != nil { b.Fatalf("regex filter failed: %v", err) } @@ -178,7 +183,7 @@ func BenchmarkFilterPerformance(b *testing.B) { b.Run("customized gitaly snapshot filter", func(b *testing.B) { for i := 0; i < b.N; i++ { - err := walkRepo(repoPath, customMatcher) + err := walkRepo(ctx, repoPath, customMatcher) if err != nil { b.Fatalf("custom filter failed: %v", err) } @@ -219,7 +224,7 @@ func newRegexBasedFilter() Filter { } } -func (sm *regexBasedFilter) Matches(snapshotPath string, isDir bool) bool { +func (sm *regexBasedFilter) Matches(ctx context.Context, snapshotPath string, isDir bool) bool { // try exactMatcher first if _, found := (*sm.exactIncludeMatcher)[snapshotPath]; found { return true @@ -239,7 +244,7 @@ func (sm *regexBasedFilter) Matches(snapshotPath string, isDir bool) bool { return false } -func walkRepo(repoPath string, matcher Filter) error { +func walkRepo(ctx context.Context, repoPath string, matcher Filter) error { if err := filepath.WalkDir(repoPath, func(path string, d fs.DirEntry, err error) error { if err != nil { if errors.Is(err, os.ErrNotExist) { @@ -257,7 +262,7 @@ func walkRepo(repoPath string, matcher Filter) error { return fmt.Errorf("calculating path relative to repo root: %w", err) } if relPath != "." { - if ok := matcher.Matches(relPath, d.IsDir()); !ok { + if ok := matcher.Matches(ctx, relPath, d.IsDir()); !ok { return nil } } @@ -272,9 +277,8 @@ func walkRepo(repoPath string, matcher Filter) error { // setupBenchmarkRepos create a repo with a lot of loose objects file, refs and a bunch of // garbage file such as .keep in packs and .lock in refs -func setupBenchmarkRepo(tb testing.TB) string { +func setupBenchmarkRepo(tb testing.TB, ctx context.Context) string { tb.Helper() - ctx := testhelper.Context(tb) cfg := testcfg.Build(tb) _, repoPath := gittest.CreateRepository(tb, ctx, cfg, gittest.CreateRepositoryConfig{ SkipCreationViaService: true, diff --git a/internal/testhelper/testhelper.go b/internal/testhelper/testhelper.go index 454a9ae389..1ef8bc64c7 100644 --- a/internal/testhelper/testhelper.go +++ b/internal/testhelper/testhelper.go @@ -321,6 +321,9 @@ func ContextWithoutCancel(opts ...ContextOpt) context.Context { newRepoReftableEnabled := env.GetString("GIT_DEFAULT_REF_FORMAT", "files") ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.NewRepoReftableBackend, newRepoReftableEnabled == "reftable") + // Enable snapshot filter + ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.SnapshotFilter, true) + for _, opt := range opts { ctx = opt(ctx) } -- GitLab From 445641f6971d6f861035572c2fd8dfc7d645fff8 Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Wed, 21 May 2025 20:46:00 -0400 Subject: [PATCH 3/7] migration: Add the leftover file migration This commit introduces a migration that removes files not needed by Gitaly. These files are identified using the snapshot filter. --- internal/cli/gitaly/serve.go | 4 +- .../migration/leftover_file_migration.go | 106 ++++++++ .../migration/leftover_file_migration_test.go | 230 ++++++++++++++++++ 3 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go create mode 100644 internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go diff --git a/internal/cli/gitaly/serve.go b/internal/cli/gitaly/serve.go index 60a72ebae1..00e7b507fc 100644 --- a/internal/cli/gitaly/serve.go +++ b/internal/cli/gitaly/serve.go @@ -372,7 +372,9 @@ func run(appCtx *cli.Command, cfg config.Cfg, logger log.Logger) error { reftableMigratorMetrics := reftable.NewMetrics() prometheus.MustRegister(housekeepingMetrics, storageMetrics, partitionMetrics, migrationMetrics, raftMetrics, reftableMigratorMetrics) - migrations := []migration.Migration{} + migrations := []migration.Migration{ + migration.NewLeftoverFileMigration(localrepoFactory, migration.DeleteDirectly), + } var txMiddleware server.TransactionMiddleware var node storage.Node diff --git a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go new file mode 100644 index 0000000000..e484504672 --- /dev/null +++ b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go @@ -0,0 +1,106 @@ +package migration + +import ( + "context" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + + "gitlab.com/gitlab-org/gitaly/v16/internal/git/localrepo" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/storagemgr/partition/snapshot" + "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" +) + +// leftoverProcessingFn is a function we use to remove leftover file +type leftoverProcessingFn func(storage.FS, string) error + +// NewLeftoverFileMigration returns a migration which deletes left-over files in the repository. +func NewLeftoverFileMigration(localRepoFactory localrepo.Factory, fn leftoverProcessingFn) Migration { + return Migration{ + ID: 2, + Name: "leftover files removal", + IsDisabled: func(ctx context.Context) bool { + return false + }, + Fn: func(ctx context.Context, tx storage.Transaction, storageName string, relativePath string) error { + originalRepo := &gitalypb.Repository{ + StorageName: storageName, + RelativePath: relativePath, + } + originalRepo = tx.OriginalRepository(originalRepo) + scopedFactory, err := localRepoFactory.ScopeByStorage(ctx, storageName) + if err != nil { + return fmt.Errorf("creating storage scoped factory: %w", err) + } + repo := scopedFactory.Build(originalRepo.GetRelativePath()) + repoPath, err := repo.Path(ctx) + if err != nil { + return fmt.Errorf("original repo path: %w", err) + } + + snapshotFilter := snapshot.NewSnapshotFilter() + if err := filepath.WalkDir(repoPath, func(path string, d fs.DirEntry, err error) error { + if err != nil { + // It can happen that we try to walk a directory like the object shards or + // an empty reference directory that gets deleted concurrently. This is fine + // and expected to happen, so let's ignore any such errors. + if errors.Is(err, os.ErrNotExist) { + return nil + } + + return err + } + + if d.IsDir() { + return nil + } + + relPath, err := filepath.Rel(repoPath, path) + if err != nil { + return fmt.Errorf("calculating path relative to repo root: %w", err) + } + if relPath != "." { + if ok := snapshotFilter.Matches(ctx, relPath, d.IsDir()); !ok { + if err := fn(tx.FS(), relPath); err != nil { + return fmt.Errorf("processing lefover file: %w", err) + } + } + } + + return nil + }); err != nil { + return fmt.Errorf("walking directory: %w", err) + } + + return nil + }, + } +} + +// DeleteDirectly just delete the file and error in case of failure. +func DeleteDirectly(fs storage.FS, file string) error { + if err := os.RemoveAll(file); err != nil { + return fmt.Errorf("removing file: %w", err) + } + return nil +} + +func MoveToGarbageFolder(fs storage.FS, file string) error { + targetPath := filepath.Join("transaction_garbage", file) + err := storage.MkdirAll(fs, filepath.Dir(targetPath)) + if err != nil { + return fmt.Errorf("make dir: %w", err) + } + err = storage.Link(fs, file, targetPath) + if err != nil { + return fmt.Errorf("move file to transaction_garbage: %w", err) + } + err = storage.RecordDirectoryRemoval(fs, fs.Root(), file) + if err != nil { + return fmt.Errorf("remove file: %w", err) + } + return nil +} diff --git a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go new file mode 100644 index 0000000000..b8b4ba9c48 --- /dev/null +++ b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go @@ -0,0 +1,230 @@ +package migration + +import ( + "context" + "errors" + "fmt" + "io/fs" + "os" + "path/filepath" + "strconv" + "testing" + + "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/git" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/catfile" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/gitcmd" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/gittest" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/housekeeping" + "gitlab.com/gitlab-org/gitaly/v16/internal/git/localrepo" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/config" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/keyvalue" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/keyvalue/databasemgr" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/mode" + nodeimpl "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/node" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/raftmgr" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/storagemgr" + "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/storagemgr/partition" + "gitlab.com/gitlab-org/gitaly/v16/internal/helper" + "gitlab.com/gitlab-org/gitaly/v16/internal/log" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper" + "gitlab.com/gitlab-org/gitaly/v16/internal/testhelper/testcfg" + "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" +) + +func TestNewLeftoverFileMigration(t *testing.T) { + // setup node + logger := testhelper.SharedLogger(t) + cfg := testcfg.Build(t) + ctx := testhelper.Context(t) + cmdFactory := gittest.NewCommandFactory(t, cfg) + catfileCache := catfile.NewCache(cfg) + t.Cleanup(catfileCache.Stop) + localRepoFactory := localrepo.NewFactory(logger, config.NewLocator(cfg), cmdFactory, catfileCache) + //leftoverFileMigration := NewLeftoverFileMigration(localRepoFactory, DeleteDirectly) + leftoverFileMigration := NewLeftoverFileMigration(localRepoFactory, MoveToGarbageFolder) + node := setupNodeForTransaction(t, ctx, cfg, logger, cmdFactory, localRepoFactory, []Migration{leftoverFileMigration}) + t.Cleanup(node.Close) + + // setup repo + setup := setupRepoForLeftoverMigration(t, ctx, cfg) + + // manually create transaction + originalRepo := &gitalypb.Repository{ + StorageName: setup.repo.GetStorageName(), + RelativePath: setup.repo.GetRelativePath(), + } + if tx := storage.ExtractTransaction(ctx); tx != nil { + originalRepo = tx.OriginalRepository(originalRepo) + } + storageHandle, err := node.GetStorage(setup.repo.GetStorageName()) + require.NoError(t, err) + + tx, err := storageHandle.Begin(ctx, storage.TransactionOptions{ + ReadOnly: false, + RelativePath: originalRepo.GetRelativePath(), + }) + require.NoError(t, err) + err = tx.Commit(ctx) + require.NoError(t, err) + + applyWAL(t, ctx, setup.repo, node) + + // check all files the left are what we expect them to be + acttualReftableFileCount := 0 + expectedReftableFileCount := 0 + if testhelper.IsReftableEnabled() { + expectedReftableFileCount = 2 + } + err = filepath.WalkDir(setup.repoPath, func(path string, d fs.DirEntry, err error) error { + if err != nil { + if errors.Is(err, os.ErrNotExist) { + return nil + } + return err + } + + if d.IsDir() { + return nil + } + if !testhelper.IsReftableEnabled() { + require.Contains(t, setup.filesToStay, path) + return nil + } + + if filepath.Dir(path) == filepath.Join(setup.repoPath, "reftable") && filepath.Ext(path) == ".ref" { + acttualReftableFileCount++ + } else { + require.Contains(t, setup.filesToStay, path) + } + + return nil + }) + require.NoError(t, err) + require.Equal(t, expectedReftableFileCount, acttualReftableFileCount) +} + +type leftoverMigrationSetup struct { + repoPath string + filesToStay []string + repo *gitalypb.Repository +} + +func setupRepoForLeftoverMigration(t *testing.T, ctx context.Context, cfg config.Cfg) leftoverMigrationSetup { + t.Helper() + + repo, repoPath := gittest.CreateRepository(t, ctx, cfg, gittest.CreateRepositoryConfig{ + SkipCreationViaService: true, + }) + + blobNum := 3 + refNum := 4 + filesToStay := make([]string, 0) + treeEntries := make([]gittest.TreeEntry, 0) + + err := os.WriteFile(filepath.Join(repoPath, "description"), []byte{}, mode.File) + require.NoError(t, err) + + for i := 0; i < blobNum; i++ { + blob := gittest.WriteBlob(t, cfg, repoPath, []byte(strconv.Itoa(i))) + filesToStay = append(filesToStay, filepath.Join(repoPath, "objects", blob.String()[0:2], blob.String()[2:])) + treeEntries = append(treeEntries, gittest.TreeEntry{Path: fmt.Sprintf("file%d", i), Mode: "100644", OID: blob}) + + err := os.WriteFile(filepath.Join(repoPath, "objects", "pack", fmt.Sprintf("pack-%d.keep", i)), []byte{}, mode.File) + require.NoError(t, err) + } + + tree := gittest.WriteTree(t, cfg, repoPath, treeEntries) + filesToStay = append(filesToStay, filepath.Join(repoPath, "objects", tree.String()[0:2], tree.String()[2:])) + + for i := 0; i < refNum; i++ { + branch := fmt.Sprintf("test%d", i) + commit := gittest.WriteCommit(t, cfg, repoPath, gittest.WithTree(tree), gittest.WithBranch(branch)) + gittest.WriteRef(t, cfg, repoPath, git.ReferenceName(fmt.Sprintf("refs/heads/%d", i)), commit) + + filesToStay = append(filesToStay, filepath.Join(repoPath, "objects", commit.String()[0:2], commit.String()[2:])) + + if !testhelper.IsReftableEnabled() { + filesToStay = append(filesToStay, filepath.Join(repoPath, "refs", "heads", branch)) + filesToStay = append(filesToStay, filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("%d", i))) + + err := os.WriteFile(filepath.Join(repoPath, "refs", "heads", fmt.Sprintf("%d.lock", i)), []byte{}, mode.File) + require.NoError(t, err) + } + + } + if testhelper.IsReftableEnabled() { + filesToStay = append(filesToStay, filepath.Join(repoPath, "refs", "heads")) + filesToStay = append(filesToStay, filepath.Join(repoPath, "reftable", "tables.list")) + } + + filesToStay = append(filesToStay, filepath.Join(repoPath, "config"), filepath.Join(repoPath, "HEAD")) + + return leftoverMigrationSetup{ + repoPath: repoPath, + filesToStay: filesToStay, + repo: repo, + } +} + +func setupNodeForTransaction(t *testing.T, ctx context.Context, cfg config.Cfg, + logger log.Logger, cmdFactory gitcmd.CommandFactory, + localRepoFactory localrepo.Factory, migrations []Migration, +) *nodeimpl.Manager { + dbMgr, err := databasemgr.NewDBManager( + ctx, + cfg.Storages, + keyvalue.NewBadgerStore, + helper.NewNullTickerFactory(), + logger, + ) + require.NoError(t, err) + t.Cleanup(dbMgr.Close) + + raftNode, err := raftmgr.NewNode(cfg, logger, dbMgr, nil) + require.NoError(t, err) + + raftFactory := raftmgr.DefaultFactoryWithNode(cfg.Raft, raftNode) + + partitionFactoryOptions := []partition.FactoryOption{ + partition.WithCmdFactory(cmdFactory), + partition.WithRepoFactory(localRepoFactory), + partition.WithMetrics(partition.NewMetrics(housekeeping.NewMetrics(cfg.Prometheus))), + partition.WithRaftConfig(cfg.Raft), + partition.WithRaftFactory(raftFactory), + } + + node, err := nodeimpl.NewManager( + cfg.Storages, + storagemgr.NewFactory( + logger, + dbMgr, + NewFactory( + partition.NewFactory(partitionFactoryOptions...), + NewMetrics(), + migrations, + ), + config.DefaultMaxInactivePartitions, + storagemgr.NewMetrics(cfg.Prometheus), + ), + ) + require.NoError(t, err) + return node +} + +func applyWAL(t *testing.T, ctx context.Context, repo *gitalypb.Repository, node storage.Node) { + nodeStorage, err := node.GetStorage(repo.GetStorageName()) + require.NoError(t, err) + + // Start a transaction to ensure the WAL is fully applied. This test is still + // accessing the repository directly in the storage. + tx, err := nodeStorage.Begin(ctx, storage.TransactionOptions{ + ReadOnly: true, + RelativePath: repo.GetRelativePath(), + }) + require.NoError(t, err) + defer func() { + require.NoError(t, tx.Rollback(ctx)) + }() +} -- GitLab From 181677797b02d245581960f565e0e537e8de3c9b Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Fri, 23 May 2025 10:41:30 -0400 Subject: [PATCH 4/7] featureflag: Add feature flag for leftover file migration This commit introduces a feature flag to control the execution of the leftover file migration. When turned off, the migration is disabled. --- internal/featureflag/ff_leftover_migration.go | 9 +++++++++ .../partition/migration/leftover_file_migration.go | 3 ++- internal/testhelper/testhelper.go | 2 ++ 3 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 internal/featureflag/ff_leftover_migration.go diff --git a/internal/featureflag/ff_leftover_migration.go b/internal/featureflag/ff_leftover_migration.go new file mode 100644 index 0000000000..0df51a4a2b --- /dev/null +++ b/internal/featureflag/ff_leftover_migration.go @@ -0,0 +1,9 @@ +package featureflag + +// LeftoverMigration enables the feature to delete garbage files in repositories that Gitaly doesn't use. +var LeftoverMigration = NewFeatureFlag( + "leftover_migration", + "v17.8.0", + "https://gitlab.com/gitlab-org/gitaly/-/issues/5737", + false, +) diff --git a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go index e484504672..acae649856 100644 --- a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go +++ b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" + "gitlab.com/gitlab-org/gitaly/v16/internal/featureflag" "gitlab.com/gitlab-org/gitaly/v16/internal/git/localrepo" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/storagemgr/partition/snapshot" @@ -23,7 +24,7 @@ func NewLeftoverFileMigration(localRepoFactory localrepo.Factory, fn leftoverPro ID: 2, Name: "leftover files removal", IsDisabled: func(ctx context.Context) bool { - return false + return featureflag.LeftoverMigration.IsDisabled(ctx) }, Fn: func(ctx context.Context, tx storage.Transaction, storageName string, relativePath string) error { originalRepo := &gitalypb.Repository{ diff --git a/internal/testhelper/testhelper.go b/internal/testhelper/testhelper.go index 1ef8bc64c7..a0d6fa3a1b 100644 --- a/internal/testhelper/testhelper.go +++ b/internal/testhelper/testhelper.go @@ -323,6 +323,8 @@ func ContextWithoutCancel(opts ...ContextOpt) context.Context { // Enable snapshot filter ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.SnapshotFilter, true) + // Enable leftover file migration + ctx = featureflag.ContextWithFeatureFlag(ctx, featureflag.LeftoverMigration, true) for _, opt := range opts { ctx = opt(ctx) -- GitLab From f48771bd33c0c520edffda4add6c67bac3ed1fbe Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Mon, 26 May 2025 21:07:32 -0400 Subject: [PATCH 5/7] fixup! snapshot: Add the snapshot filter --- internal/gitaly/service/repository/size.go | 5 +-- .../storagemgr/partition/snapshot/snapshot.go | 5 +-- .../partition/snapshot/snapshot_filter.go | 13 ++++---- .../snapshot/snapshot_filter_test.go | 32 +++++++++++-------- 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/internal/gitaly/service/repository/size.go b/internal/gitaly/service/repository/size.go index a835df60b5..1a341d88c8 100644 --- a/internal/gitaly/service/repository/size.go +++ b/internal/gitaly/service/repository/size.go @@ -8,6 +8,7 @@ import ( "os" "path/filepath" + "gitlab.com/gitlab-org/gitaly/v16/internal/featureflag" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/storagemgr/partition/snapshot" "gitlab.com/gitlab-org/gitaly/v16/internal/structerr" "gitlab.com/gitlab-org/gitaly/v16/proto/go/gitalypb" @@ -79,8 +80,8 @@ func dirSizeInBytes(ctx context.Context, dirPath string, transactionEnabled, isR return fmt.Errorf("calculating path relative to repo root: %w", err) } if transactionEnabled && isRepoPath && relPath != "." { - matcher := snapshot.NewSnapshotFilter() - if ok := matcher.Matches(ctx, relPath, d.IsDir()); !ok { + matcher := snapshot.NewSnapshotFilter(featureflag.SnapshotFilter.IsDisabled(ctx)) + if ok := matcher.Matches(relPath, d.IsDir()); !ok { return nil } } diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go index dd6b3d6c2e..c8d22c6787 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot.go @@ -10,6 +10,7 @@ import ( "strings" "time" + "gitlab.com/gitlab-org/gitaly/v16/internal/featureflag" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/gitstorage" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/mode" @@ -243,7 +244,7 @@ func createParentDirectories(storageRoot, snapshotRoot, relativePath string, sta func createRepositorySnapshot(ctx context.Context, storageRoot, snapshotRoot, relativePath string, stats *snapshotStatistics) error { if err := createDirectorySnapshot(ctx, filepath.Join(storageRoot, relativePath), filepath.Join(snapshotRoot, relativePath), - NewSnapshotFilter(), stats); err != nil { + NewSnapshotFilter(featureflag.SnapshotFilter.IsDisabled(ctx)), stats); err != nil { return fmt.Errorf("create directory snapshot: %w", err) } @@ -283,7 +284,7 @@ func createDirectorySnapshot(ctx context.Context, originalDirectory, snapshotDir // } // } //} - if ok := matcher.Matches(ctx, relativePath, info.IsDir()); !ok { + if ok := matcher.Matches(relativePath, info.IsDir()); !ok { if info.IsDir() { return fs.SkipDir } else { diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go index 64eed4d755..df51226571 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter.go @@ -1,13 +1,10 @@ package snapshot import ( - "context" "fmt" "os" "path/filepath" "strings" - - "gitlab.com/gitlab-org/gitaly/v16/internal/featureflag" ) var ( @@ -66,20 +63,22 @@ var ( // Filter is an interface to determine whether a given path should be included in a snapshot. type Filter interface { - Matches(ctx context.Context, path string, isDir bool) bool + Matches(path string, isDir bool) bool } // gitalySnapshotFilter filters Git repository files to determine which ones // should be included in a snapshot. It applies the include and exclude patterns // to efficiently select files based on their paths. type gitalySnapshotFilter struct { + disabled bool includeIndex *map[string]struct{} excludeIndex *map[string]struct{} } // NewSnapshotFilter create a filter to determine which files should be included in a repository snapshot. -func NewSnapshotFilter() Filter { +func NewSnapshotFilter(disabled bool) Filter { filter := &gitalySnapshotFilter{} + filter.disabled = disabled filter.includeIndex = &map[string]struct{}{} filter.excludeIndex = &map[string]struct{}{} filter.buildIndex(GitalySnapshotFileIncludePatterns, GitalySnapshotFileExcludePatterns) @@ -126,8 +125,8 @@ func addToIndex(filePath string, index *map[string]struct{}) { } // Matches checks if a given file or directory should be included in the snapshot based on the filter. -func (f *gitalySnapshotFilter) Matches(ctx context.Context, file string, isDir bool) bool { - if !featureflag.SnapshotFilter.IsEnabled(ctx) { +func (f *gitalySnapshotFilter) Matches(file string, isDir bool) bool { + if f.disabled { // disabled snapshot filter feature pass everything return true } diff --git a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go index 53991dcc5e..1a75fe82d8 100644 --- a/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go +++ b/internal/gitaly/storage/storagemgr/partition/snapshot/snapshot_filter_test.go @@ -13,6 +13,7 @@ import ( "testing" "github.com/stretchr/testify/require" + "gitlab.com/gitlab-org/gitaly/v16/internal/featureflag" "gitlab.com/gitlab-org/gitaly/v16/internal/git" "gitlab.com/gitlab-org/gitaly/v16/internal/git/gittest" "gitlab.com/gitlab-org/gitaly/v16/internal/gitaly/storage/mode" @@ -149,24 +150,23 @@ func TestSnapshotMatcher(t *testing.T) { } { t.Run(tc.desc, func(t *testing.T) { t.Parallel() - matcher := newRegexBasedFilter() + matcher := newRegexBasedFilter(featureflag.SnapshotFilter.IsDisabled(ctx)) for snapshotPath, expectedResult := range tc.expectedResult { - require.Equal(t, expectedResult.shouldPass, matcher.Matches(ctx, snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) + require.Equal(t, expectedResult.shouldPass, matcher.Matches(snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) } - filter := NewSnapshotFilter() + filter := NewSnapshotFilter(featureflag.SnapshotFilter.IsDisabled(ctx)) for snapshotPath, expectedResult := range tc.expectedResult { - require.Equal(t, expectedResult.shouldPass, filter.Matches(ctx, snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) + require.Equal(t, expectedResult.shouldPass, filter.Matches(snapshotPath, expectedResult.isDir), "Mismatch for path %s", snapshotPath) } }) } } func BenchmarkFilterPerformance(b *testing.B) { - regexMatcher := newRegexBasedFilter() - customMatcher := NewSnapshotFilter() - ctx := testhelper.Context(b) + regexMatcher := newRegexBasedFilter(featureflag.SnapshotFilter.IsDisabled(ctx)) + customMatcher := NewSnapshotFilter(featureflag.SnapshotFilter.IsDisabled(ctx)) // A local existing repo can also be used to speed up the benchmark // e.g. repoPath=/xxx/yyy/zzz/gitlab/.git @@ -174,7 +174,7 @@ func BenchmarkFilterPerformance(b *testing.B) { b.Run("regex based filter", func(b *testing.B) { for i := 0; i < b.N; i++ { - err := walkRepo(ctx, repoPath, regexMatcher) + err := walkRepo(repoPath, regexMatcher) if err != nil { b.Fatalf("regex filter failed: %v", err) } @@ -183,7 +183,7 @@ func BenchmarkFilterPerformance(b *testing.B) { b.Run("customized gitaly snapshot filter", func(b *testing.B) { for i := 0; i < b.N; i++ { - err := walkRepo(ctx, repoPath, customMatcher) + err := walkRepo(repoPath, customMatcher) if err != nil { b.Fatalf("custom filter failed: %v", err) } @@ -202,12 +202,13 @@ func BenchmarkFilterPerformance(b *testing.B) { // misconfigurations in gitalySnapshotFilter. // 2. To compare its performance with gitalySnapshotFilter. type regexBasedFilter struct { + disabled bool exactIncludeMatcher *map[string]struct{} regexIncludeMatcher *regexp.Regexp regexExcludeMatcher *regexp.Regexp } -func newRegexBasedFilter() Filter { +func newRegexBasedFilter(disabled bool) Filter { pattern := strings.Join(regexInMatches, "|") regexIncludeMatcher := regexp.MustCompile(pattern) @@ -218,14 +219,19 @@ func newRegexBasedFilter() Filter { } return ®exBasedFilter{ + disabled: disabled, exactIncludeMatcher: &exactInMatchPaths, regexIncludeMatcher: regexIncludeMatcher, regexExcludeMatcher: regexExcludeMatcher, } } -func (sm *regexBasedFilter) Matches(ctx context.Context, snapshotPath string, isDir bool) bool { +func (sm *regexBasedFilter) Matches(snapshotPath string, isDir bool) bool { // try exactMatcher first + if sm.disabled { + return true + } + if _, found := (*sm.exactIncludeMatcher)[snapshotPath]; found { return true } @@ -244,7 +250,7 @@ func (sm *regexBasedFilter) Matches(ctx context.Context, snapshotPath string, is return false } -func walkRepo(ctx context.Context, repoPath string, matcher Filter) error { +func walkRepo(repoPath string, matcher Filter) error { if err := filepath.WalkDir(repoPath, func(path string, d fs.DirEntry, err error) error { if err != nil { if errors.Is(err, os.ErrNotExist) { @@ -262,7 +268,7 @@ func walkRepo(ctx context.Context, repoPath string, matcher Filter) error { return fmt.Errorf("calculating path relative to repo root: %w", err) } if relPath != "." { - if ok := matcher.Matches(ctx, relPath, d.IsDir()); !ok { + if ok := matcher.Matches(relPath, d.IsDir()); !ok { return nil } } -- GitLab From 8f32100ec1abf9f94419415fb4cde2156e02f510 Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Mon, 26 May 2025 21:07:46 -0400 Subject: [PATCH 6/7] fixup! featureflag: Add feature flag to control snapshot filter --- internal/featureflag/ff_snapshot_filter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/featureflag/ff_snapshot_filter.go b/internal/featureflag/ff_snapshot_filter.go index c346e58f21..49da87721e 100644 --- a/internal/featureflag/ff_snapshot_filter.go +++ b/internal/featureflag/ff_snapshot_filter.go @@ -3,7 +3,7 @@ package featureflag // SnapshotFilter enables snapshot filter feature. var SnapshotFilter = NewFeatureFlag( "snapshot_filter", - "v17.8.0", + "v18.1.0", "https://gitlab.com/gitlab-org/gitaly/-/issues/5737", false, ) -- GitLab From 7ffcbe311dfcf6c32f41e94f5a5524352895b6de Mon Sep 17 00:00:00 2001 From: Eric Ju Date: Mon, 26 May 2025 21:08:08 -0400 Subject: [PATCH 7/7] fixup! migration: Add the leftover file migration --- .../partition/migration/leftover_file_migration.go | 9 +++++++-- .../partition/migration/leftover_file_migration_test.go | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go index acae649856..8fade36596 100644 --- a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go +++ b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration.go @@ -27,6 +27,10 @@ func NewLeftoverFileMigration(localRepoFactory localrepo.Factory, fn leftoverPro return featureflag.LeftoverMigration.IsDisabled(ctx) }, Fn: func(ctx context.Context, tx storage.Transaction, storageName string, relativePath string) error { + if featureflag.SnapshotFilter.IsEnabled(ctx) { + return fmt.Errorf("leftover migration should disable snapshot_filter") + } + originalRepo := &gitalypb.Repository{ StorageName: storageName, RelativePath: relativePath, @@ -42,7 +46,7 @@ func NewLeftoverFileMigration(localRepoFactory localrepo.Factory, fn leftoverPro return fmt.Errorf("original repo path: %w", err) } - snapshotFilter := snapshot.NewSnapshotFilter() + snapshotFilter := snapshot.NewSnapshotFilter(false) if err := filepath.WalkDir(repoPath, func(path string, d fs.DirEntry, err error) error { if err != nil { // It can happen that we try to walk a directory like the object shards or @@ -64,7 +68,7 @@ func NewLeftoverFileMigration(localRepoFactory localrepo.Factory, fn leftoverPro return fmt.Errorf("calculating path relative to repo root: %w", err) } if relPath != "." { - if ok := snapshotFilter.Matches(ctx, relPath, d.IsDir()); !ok { + if ok := snapshotFilter.Matches(relPath, d.IsDir()); !ok { if err := fn(tx.FS(), relPath); err != nil { return fmt.Errorf("processing lefover file: %w", err) } @@ -89,6 +93,7 @@ func DeleteDirectly(fs storage.FS, file string) error { return nil } +// MoveToGarbageFolder moves the file to the transaction_garbage folder. func MoveToGarbageFolder(fs storage.FS, file string) error { targetPath := filepath.Join("transaction_garbage", file) err := storage.MkdirAll(fs, filepath.Dir(targetPath)) diff --git a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go index b8b4ba9c48..bd152a9ab0 100644 --- a/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go +++ b/internal/gitaly/storage/storagemgr/partition/migration/leftover_file_migration_test.go @@ -42,7 +42,7 @@ func TestNewLeftoverFileMigration(t *testing.T) { catfileCache := catfile.NewCache(cfg) t.Cleanup(catfileCache.Stop) localRepoFactory := localrepo.NewFactory(logger, config.NewLocator(cfg), cmdFactory, catfileCache) - //leftoverFileMigration := NewLeftoverFileMigration(localRepoFactory, DeleteDirectly) + // leftoverFileMigration := NewLeftoverFileMigration(localRepoFactory, DeleteDirectly) leftoverFileMigration := NewLeftoverFileMigration(localRepoFactory, MoveToGarbageFolder) node := setupNodeForTransaction(t, ctx, cfg, logger, cmdFactory, localRepoFactory, []Migration{leftoverFileMigration}) t.Cleanup(node.Close) -- GitLab