From 84c89aa6dc7e6b7f25f52dc8426ccae2ee806dab Mon Sep 17 00:00:00 2001 From: Mustafa Bayar Date: Tue, 27 May 2025 21:02:31 +0200 Subject: [PATCH] housekeeping: Prune empty directories Housekeeping with transactions removes loose reference directories as part of repacking. However, it only removes the directories that had loose references in them during the reference packing run. Empty directories that didn't contain any loose references to pack are not removed. This leads to empty directories from other processes never being pruned. With this change we will walk the entire ref folder and delete any remaining empty directories. Walk works in post order to delete in cascading effect. If a parent directory becomes empty due to child being removed, parent will also get removed. --- .../transaction_manager_housekeeping.go | 83 ++++++++++++++----- .../transaction_manager_housekeeping_test.go | 50 +++++++++++ 2 files changed, 113 insertions(+), 20 deletions(-) diff --git a/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping.go b/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping.go index d9ee19b46f..af0c3750ab 100644 --- a/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping.go +++ b/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping.go @@ -694,6 +694,11 @@ func (mgr *TransactionManager) verifyPackRefsReftable(transaction *Transaction) // command runs and the time this transaction is logged. Thus, we need to verify if the transaction conflicts with the // current state of the repository. // +// After packing references, the function uses a post-order directory traversal to identify and remove +// empty directories. Directories are considered empty if they contain no entries or if all entries are +// already scheduled for deletion. This ensures proper cleanup of nested empty directories that may +// result from reference packing. +// // There are three cases when a reference is modified: // - Reference creation: this is the easiest case. The new reference exists as a loose reference on disk and shadows the // one in the packed-ref. @@ -755,45 +760,83 @@ func (mgr *TransactionManager) verifyPackRefsFiles(ctx context.Context, transact // path that became empty as a result of removing the references. As we're operating on the real // repository here, we can't actually perform deletions in it. We instead keep track of the // files and directories we've deleted in-memory to ensure we only delete empty directories. + // First pass: Mark references for deletion but don't make decisions about directories yet deletedPaths := make(map[string]struct{}, len(packRefs.PrunedRefs)) if err := prunedRefs.WalkPostOrder(func(path string, isDir bool) error { relativePath := filepath.Join(transaction.relativePath, path) + + // Don't touch protected directories if _, ok := directoriesToKeep[relativePath]; ok { return nil } - if isDir { - // If this is a directory, we need to ensure it is actually empty before removing - // it. Check if we find any directory entries we haven't yet deleted. - entries, err := os.ReadDir(mgr.getAbsolutePath(relativePath)) - if err != nil { - return fmt.Errorf("read dir: %w", err) - } + if !isDir { + deletedPaths[relativePath] = struct{}{} + transaction.walEntry.RemoveDirectoryEntry(relativePath) + return nil + } - for _, entry := range entries { - if _, ok := deletedPaths[filepath.Join(relativePath, entry.Name())]; ok { - // This path was already deleted. Don't consider it to exist. - continue - } + return nil + }); err != nil { + return nil, fmt.Errorf("walk post order for references: %w", err) + } - // This directory was not empty because someone concurrently wrote - // a reference into it. Keep it in place. - directoriesToKeep[relativePath] = struct{}{} - return nil - } + // Second pass: Process directories separately with a true post-order traversal + // This ensures all files are processed first, then directories are processed bottom-up + if err := mgr.forEachEmptyDir(filepath.Join(transaction.relativePath, "refs"), deletedPaths, func(path string) error { + // Skip if it is a protected directory or already deleted + if _, ok := deletedPaths[path]; ok { + return nil + } + + if _, ok := directoriesToKeep[path]; ok { + return nil } - deletedPaths[relativePath] = struct{}{} - transaction.walEntry.RemoveDirectoryEntry(relativePath) + deletedPaths[path] = struct{}{} + transaction.walEntry.RemoveDirectoryEntry(path) return nil }); err != nil { - return nil, fmt.Errorf("walk post order: %w", err) + return nil, fmt.Errorf("for each empty dir: %w", err) } return &gitalypb.LogEntry_Housekeeping_PackRefs{}, nil } +// forEachEmptyDir marks directories for deletion recursively in a post-order traversal, +// ensuring that parent directories are only processed after all their children have been processed. +func (mgr *TransactionManager) forEachEmptyDir(path string, deletedPaths map[string]struct{}, fn func(string) error) error { + // We need to check the actual storage path to make sure + // someone did not concurrently wrote a reference into it. + entries, err := os.ReadDir(mgr.getAbsolutePath(path)) + if err != nil { + return fmt.Errorf("read dir: %w", err) + } + + var hasFiles bool + for _, entry := range entries { + if entry.IsDir() { + childPath := filepath.Join(path, entry.Name()) + if err := mgr.forEachEmptyDir(childPath, deletedPaths, fn); err != nil { + return err + } + } else { + if _, ok := deletedPaths[filepath.Join(path, entry.Name())]; ok { + // This path was already deleted. Don't consider it to exist. + continue + } + hasFiles = true + } + } + + if hasFiles { + return nil + } + + return fn(path) +} + // prepareOffloading implements the core offloading logic within the transaction manager. // It must be called inside a snapshot repository and performs these steps: // diff --git a/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping_test.go b/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping_test.go index 1df0c36898..fbd0b4efb8 100644 --- a/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping_test.go +++ b/internal/gitaly/storage/storagemgr/partition/transaction_manager_housekeeping_test.go @@ -1384,6 +1384,56 @@ func generateHousekeepingPackRefsTests(t *testing.T, ctx context.Context, testPa }, }, }, + { + desc: "empty directories without references are pruned during pack-refs", + skip: func(t *testing.T) { + testhelper.SkipWithReftable(t, "we don't deal with directories for reftable") + }, + steps: steps{ + StartManager{ + ModifyStorage: func(tb testing.TB, cfg config.Cfg, storagePath string) { + repoPath := filepath.Join(storagePath, setup.RelativePath) + + // Create nested empty directories + nestedDirPath1 := filepath.Join(repoPath, "refs", "heads", "first-branch", "empty-dir", "another-empty-dir") + require.NoError(tb, os.MkdirAll(nestedDirPath1, mode.Directory)) + nestedDirPath2 := filepath.Join(repoPath, "refs", "heads", "second-branch", "empty-dir") + require.NoError(tb, os.MkdirAll(nestedDirPath2, mode.Directory)) + + // Add a reference that would be packed + gittest.WriteRef(tb, cfg, repoPath, "refs/heads/first-branch/someRef", setup.Commits.First.OID) + }, + }, + Begin{ + TransactionID: 1, + RelativePaths: []string{setup.RelativePath}, + }, + RunPackRefs{ + TransactionID: 1, + }, + Commit{ + TransactionID: 1, + }, + }, + expectedState: StateAssertion{ + Database: DatabaseState{ + string(keyAppliedLSN): storage.LSN(1).ToProto(), + }, + Repositories: RepositoryStates{ + relativePath: { + DefaultBranch: "refs/heads/main", + References: &ReferencesState{ + FilesBackend: &FilesBackendState{ + PackedReferences: map[git.ReferenceName]git.ObjectID{ + "refs/heads/first-branch/someRef": setup.Commits.First.OID, + }, + LooseReferences: map[git.ReferenceName]git.ObjectID{}, + }, + }, + }, + }, + }, + }, { desc: "housekeeping fails in read-only transaction", customSetup: customSetup, -- GitLab