From 831e02340b9de46c9ea0a1bbce3894f390f5a45e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:49 +0100 Subject: [PATCH 01/25] path: move `enter_repo()` into "setup.c" The function `enter_repo()` is used to enter a repository at a given path. As such it sits way closer to setting up a repository than it does with handling paths, but regardless of that it's located in "path.c" instead of in "setup.c". Move the function into "setup.c". Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/receive-pack.c | 2 +- builtin/upload-archive.c | 2 +- builtin/upload-pack.c | 2 +- http-backend.c | 1 + path.c | 100 --------------------------------------- path.h | 15 ------ setup.c | 81 +++++++++++++++++++++++++++++++ setup.h | 38 +++++++++++++++ 8 files changed, 123 insertions(+), 118 deletions(-) diff --git a/builtin/receive-pack.c b/builtin/receive-pack.c index c9288a9c7e..79a0fd4756 100644 --- a/builtin/receive-pack.c +++ b/builtin/receive-pack.c @@ -34,7 +34,6 @@ #include "object-file.h" #include "object-name.h" #include "odb.h" -#include "path.h" #include "protocol.h" #include "commit-reach.h" #include "server-info.h" @@ -42,6 +41,7 @@ #include "trace2.h" #include "worktree.h" #include "shallow.h" +#include "setup.h" #include "parse-options.h" static const char * const receive_pack_usage[] = { diff --git a/builtin/upload-archive.c b/builtin/upload-archive.c index 97d7c9522f..25312bb2a5 100644 --- a/builtin/upload-archive.c +++ b/builtin/upload-archive.c @@ -4,8 +4,8 @@ #define USE_THE_REPOSITORY_VARIABLE #include "builtin.h" #include "archive.h" -#include "path.h" #include "pkt-line.h" +#include "setup.h" #include "sideband.h" #include "run-command.h" #include "strvec.h" diff --git a/builtin/upload-pack.c b/builtin/upload-pack.c index c2bbc035ab..30498fafea 100644 --- a/builtin/upload-pack.c +++ b/builtin/upload-pack.c @@ -5,11 +5,11 @@ #include "gettext.h" #include "pkt-line.h" #include "parse-options.h" -#include "path.h" #include "protocol.h" #include "replace-object.h" #include "upload-pack.h" #include "serve.h" +#include "setup.h" #include "commit.h" #include "environment.h" diff --git a/http-backend.c b/http-backend.c index 52f0483dd3..e9d1ef92bd 100644 --- a/http-backend.c +++ b/http-backend.c @@ -16,6 +16,7 @@ #include "run-command.h" #include "string-list.h" #include "url.h" +#include "setup.h" #include "strvec.h" #include "packfile.h" #include "odb.h" diff --git a/path.c b/path.c index 7f56eaf993..d726537622 100644 --- a/path.c +++ b/path.c @@ -738,106 +738,6 @@ char *interpolate_path(const char *path, int real_home) return NULL; } -/* - * First, one directory to try is determined by the following algorithm. - * - * (0) If "strict" is given, the path is used as given and no DWIM is - * done. Otherwise: - * (1) "~/path" to mean path under the running user's home directory; - * (2) "~user/path" to mean path under named user's home directory; - * (3) "relative/path" to mean cwd relative directory; or - * (4) "/absolute/path" to mean absolute directory. - * - * Unless "strict" is given, we check "%s/.git", "%s", "%s.git/.git", "%s.git" - * in this order. We select the first one that is a valid git repository, and - * chdir() to it. If none match, or we fail to chdir, we return NULL. - * - * If all goes well, we return the directory we used to chdir() (but - * before ~user is expanded), avoiding getcwd() resolving symbolic - * links. User relative paths are also returned as they are given, - * except DWIM suffixing. - */ -const char *enter_repo(const char *path, unsigned flags) -{ - static struct strbuf validated_path = STRBUF_INIT; - static struct strbuf used_path = STRBUF_INIT; - - if (!path) - return NULL; - - if (!(flags & ENTER_REPO_STRICT)) { - static const char *suffix[] = { - "/.git", "", ".git/.git", ".git", NULL, - }; - const char *gitfile; - int len = strlen(path); - int i; - while ((1 < len) && (path[len-1] == '/')) - len--; - - /* - * We can handle arbitrary-sized buffers, but this remains as a - * sanity check on untrusted input. - */ - if (PATH_MAX <= len) - return NULL; - - strbuf_reset(&used_path); - strbuf_reset(&validated_path); - strbuf_add(&used_path, path, len); - strbuf_add(&validated_path, path, len); - - if (used_path.buf[0] == '~') { - char *newpath = interpolate_path(used_path.buf, 0); - if (!newpath) - return NULL; - strbuf_attach(&used_path, newpath, strlen(newpath), - strlen(newpath)); - } - for (i = 0; suffix[i]; i++) { - struct stat st; - size_t baselen = used_path.len; - strbuf_addstr(&used_path, suffix[i]); - if (!stat(used_path.buf, &st) && - (S_ISREG(st.st_mode) || - (S_ISDIR(st.st_mode) && is_git_directory(used_path.buf)))) { - strbuf_addstr(&validated_path, suffix[i]); - break; - } - strbuf_setlen(&used_path, baselen); - } - if (!suffix[i]) - return NULL; - gitfile = read_gitfile(used_path.buf); - if (!(flags & ENTER_REPO_ANY_OWNER_OK)) - die_upon_dubious_ownership(gitfile, NULL, used_path.buf); - if (gitfile) { - strbuf_reset(&used_path); - strbuf_addstr(&used_path, gitfile); - } - if (chdir(used_path.buf)) - return NULL; - path = validated_path.buf; - } - else { - const char *gitfile = read_gitfile(path); - if (!(flags & ENTER_REPO_ANY_OWNER_OK)) - die_upon_dubious_ownership(gitfile, NULL, path); - if (gitfile) - path = gitfile; - if (chdir(path)) - return NULL; - } - - if (is_git_directory(".")) { - set_git_dir(".", 0); - check_repository_format(NULL); - return path; - } - - return NULL; -} - int calc_shared_perm(struct repository *repo, int mode) { diff --git a/path.h b/path.h index e67348f253..0ec95a0b07 100644 --- a/path.h +++ b/path.h @@ -146,21 +146,6 @@ int adjust_shared_perm(struct repository *repo, const char *path); char *interpolate_path(const char *path, int real_home); -/* The bits are as follows: - * - * - ENTER_REPO_STRICT: callers that require exact paths (as opposed - * to allowing known suffixes like ".git", ".git/.git" to be - * omitted) can set this bit. - * - * - ENTER_REPO_ANY_OWNER_OK: callers that are willing to run without - * ownership check can set this bit. - */ -enum { - ENTER_REPO_STRICT = (1<<0), - ENTER_REPO_ANY_OWNER_OK = (1<<1), -}; - -const char *enter_repo(const char *path, unsigned flags); const char *remove_leading_path(const char *in, const char *prefix); const char *relative_path(const char *in, const char *prefix, struct strbuf *sb); int normalize_path_copy_len(char *dst, const char *src, int *prefix_len); diff --git a/setup.c b/setup.c index 7086741e6c..98c6fd8ee4 100644 --- a/setup.c +++ b/setup.c @@ -1703,6 +1703,87 @@ void set_git_dir(const char *path, int make_realpath) strbuf_release(&realpath); } +const char *enter_repo(const char *path, unsigned flags) +{ + static struct strbuf validated_path = STRBUF_INIT; + static struct strbuf used_path = STRBUF_INIT; + + if (!path) + return NULL; + + if (!(flags & ENTER_REPO_STRICT)) { + static const char *suffix[] = { + "/.git", "", ".git/.git", ".git", NULL, + }; + const char *gitfile; + int len = strlen(path); + int i; + while ((1 < len) && (path[len-1] == '/')) + len--; + + /* + * We can handle arbitrary-sized buffers, but this remains as a + * sanity check on untrusted input. + */ + if (PATH_MAX <= len) + return NULL; + + strbuf_reset(&used_path); + strbuf_reset(&validated_path); + strbuf_add(&used_path, path, len); + strbuf_add(&validated_path, path, len); + + if (used_path.buf[0] == '~') { + char *newpath = interpolate_path(used_path.buf, 0); + if (!newpath) + return NULL; + strbuf_attach(&used_path, newpath, strlen(newpath), + strlen(newpath)); + } + for (i = 0; suffix[i]; i++) { + struct stat st; + size_t baselen = used_path.len; + strbuf_addstr(&used_path, suffix[i]); + if (!stat(used_path.buf, &st) && + (S_ISREG(st.st_mode) || + (S_ISDIR(st.st_mode) && is_git_directory(used_path.buf)))) { + strbuf_addstr(&validated_path, suffix[i]); + break; + } + strbuf_setlen(&used_path, baselen); + } + if (!suffix[i]) + return NULL; + gitfile = read_gitfile(used_path.buf); + if (!(flags & ENTER_REPO_ANY_OWNER_OK)) + die_upon_dubious_ownership(gitfile, NULL, used_path.buf); + if (gitfile) { + strbuf_reset(&used_path); + strbuf_addstr(&used_path, gitfile); + } + if (chdir(used_path.buf)) + return NULL; + path = validated_path.buf; + } + else { + const char *gitfile = read_gitfile(path); + if (!(flags & ENTER_REPO_ANY_OWNER_OK)) + die_upon_dubious_ownership(gitfile, NULL, path); + if (gitfile) + path = gitfile; + if (chdir(path)) + return NULL; + } + + if (is_git_directory(".")) { + set_git_dir(".", 0); + check_repository_format(NULL); + return path; + } + + return NULL; +} + static int git_work_tree_initialized; /* diff --git a/setup.h b/setup.h index 8522fa8575..bfea199bcd 100644 --- a/setup.h +++ b/setup.h @@ -97,6 +97,44 @@ static inline int discover_git_directory(struct strbuf *commondir, void set_git_dir(const char *path, int make_realpath); void set_git_work_tree(const char *tree); +/* Flags that can be passed to `enter_repo()`. */ +enum { + /* + * Callers that require exact paths (as opposed to allowing known + * suffixes like ".git", ".git/.git" to be omitted) can set this bit. + */ + ENTER_REPO_STRICT = (1<<0), + + /* + * Callers that are willing to run without ownership check can set this + * bit. + */ + ENTER_REPO_ANY_OWNER_OK = (1<<1), +}; + +/* + * Discover and enter a repository. + * + * First, one directory to try is determined by the following algorithm. + * + * (0) If "strict" is given, the path is used as given and no DWIM is + * done. Otherwise: + * (1) "~/path" to mean path under the running user's home directory; + * (2) "~user/path" to mean path under named user's home directory; + * (3) "relative/path" to mean cwd relative directory; or + * (4) "/absolute/path" to mean absolute directory. + * + * Unless "strict" is given, we check "%s/.git", "%s", "%s.git/.git", "%s.git" + * in this order. We select the first one that is a valid git repository, and + * chdir() to it. If none match, or we fail to chdir, we return NULL. + * + * If all goes well, we return the directory we used to chdir() (but + * before ~user is expanded), avoiding getcwd() resolving symbolic + * links. User relative paths are also returned as they are given, + * except DWIM suffixing. + */ +const char *enter_repo(const char *path, unsigned flags); + const char *setup_git_directory_gently(int *); const char *setup_git_directory(void); char *prefix_path(const char *prefix, int len, const char *path); -- GitLab From 7c188a9e45405ff911b81a5dd9029f4e91fb338e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:50 +0100 Subject: [PATCH 02/25] setup: convert `set_git_dir()` to have file scope We don't have any external callers of `set_git_dir()` anymore now that `enter_repo()` has been moved into "setup.c". Remove the declaration and mark the function as static. Note that this change requires us to move the implementation around so that we can avoid adding any new forward declarations. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- setup.c | 80 ++++++++++++++++++++++++++++----------------------------- setup.h | 1 - 2 files changed, 40 insertions(+), 41 deletions(-) diff --git a/setup.c b/setup.c index 98c6fd8ee4..8bf52df716 100644 --- a/setup.c +++ b/setup.c @@ -1002,6 +1002,46 @@ const char *read_gitfile_gently(const char *path, int *return_error_code) return error_code ? NULL : path; } +static void set_git_dir_1(const char *path) +{ + xsetenv(GIT_DIR_ENVIRONMENT, path, 1); + setup_git_env(path); +} + +static void update_relative_gitdir(const char *name UNUSED, + const char *old_cwd, + const char *new_cwd, + void *data UNUSED) +{ + char *path = reparent_relative_path(old_cwd, new_cwd, + repo_get_git_dir(the_repository)); + struct tmp_objdir *tmp_objdir = tmp_objdir_unapply_primary_odb(); + + trace_printf_key(&trace_setup_key, + "setup: move $GIT_DIR to '%s'", + path); + set_git_dir_1(path); + if (tmp_objdir) + tmp_objdir_reapply_primary_odb(tmp_objdir, old_cwd, new_cwd); + free(path); +} + +static void set_git_dir(const char *path, int make_realpath) +{ + struct strbuf realpath = STRBUF_INIT; + + if (make_realpath) { + strbuf_realpath(&realpath, path, 1); + path = realpath.buf; + } + + set_git_dir_1(path); + if (!is_absolute_path(path)) + chdir_notify_register(NULL, update_relative_gitdir, NULL); + + strbuf_release(&realpath); +} + static const char *setup_explicit_git_dir(const char *gitdirenv, struct strbuf *cwd, struct repository_format *repo_fmt, @@ -1663,46 +1703,6 @@ void setup_git_env(const char *git_dir) fetch_if_missing = 0; } -static void set_git_dir_1(const char *path) -{ - xsetenv(GIT_DIR_ENVIRONMENT, path, 1); - setup_git_env(path); -} - -static void update_relative_gitdir(const char *name UNUSED, - const char *old_cwd, - const char *new_cwd, - void *data UNUSED) -{ - char *path = reparent_relative_path(old_cwd, new_cwd, - repo_get_git_dir(the_repository)); - struct tmp_objdir *tmp_objdir = tmp_objdir_unapply_primary_odb(); - - trace_printf_key(&trace_setup_key, - "setup: move $GIT_DIR to '%s'", - path); - set_git_dir_1(path); - if (tmp_objdir) - tmp_objdir_reapply_primary_odb(tmp_objdir, old_cwd, new_cwd); - free(path); -} - -void set_git_dir(const char *path, int make_realpath) -{ - struct strbuf realpath = STRBUF_INIT; - - if (make_realpath) { - strbuf_realpath(&realpath, path, 1); - path = realpath.buf; - } - - set_git_dir_1(path); - if (!is_absolute_path(path)) - chdir_notify_register(NULL, update_relative_gitdir, NULL); - - strbuf_release(&realpath); -} - const char *enter_repo(const char *path, unsigned flags) { static struct strbuf validated_path = STRBUF_INIT; diff --git a/setup.h b/setup.h index bfea199bcd..d55dcc6608 100644 --- a/setup.h +++ b/setup.h @@ -94,7 +94,6 @@ static inline int discover_git_directory(struct strbuf *commondir, return 0; } -void set_git_dir(const char *path, int make_realpath); void set_git_work_tree(const char *tree); /* Flags that can be passed to `enter_repo()`. */ -- GitLab From 9aaba579932781c74f67d6cecddaad59f0daaaef Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:51 +0100 Subject: [PATCH 03/25] odb: adopt logic to close object databases The logic to close an object database is currently contained in the packfile subsystem. That choice is somewhat relatable, as most of the logic really is to close resources associated with the packfile store itself. But we also end up handling object sources and commit graphs, which certainly is not related to packfiles. Move the function into the object database subsystem and rename it to `odb_close()`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/clone.c | 2 +- builtin/gc.c | 2 +- builtin/repack.c | 2 +- midx-write.c | 2 +- odb.c | 18 +++++++++++++++++- odb.h | 7 +++++++ packfile.c | 15 --------------- packfile.h | 1 - run-command.c | 2 +- scalar.c | 2 +- 10 files changed, 30 insertions(+), 23 deletions(-) diff --git a/builtin/clone.c b/builtin/clone.c index c990f398ef..b19b302b06 100644 --- a/builtin/clone.c +++ b/builtin/clone.c @@ -1617,7 +1617,7 @@ int cmd_clone(int argc, transport_disconnect(transport); if (option_dissociate) { - close_object_store(the_repository->objects); + odb_close(the_repository->objects); dissociate_from_references(); } diff --git a/builtin/gc.c b/builtin/gc.c index d212cbb9b8..961fa343c4 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -1048,7 +1048,7 @@ int cmd_gc(int argc, report_garbage = report_pack_garbage; odb_reprepare(the_repository->objects); if (pack_garbage.nr > 0) { - close_object_store(the_repository->objects); + odb_close(the_repository->objects); clean_pack_garbage(); } diff --git a/builtin/repack.c b/builtin/repack.c index cfdb4c0920..d9012141f6 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -488,7 +488,7 @@ int cmd_repack(int argc, string_list_sort(&names); - close_object_store(repo->objects); + odb_close(repo->objects); /* * Ok we have prepared all new packfiles. diff --git a/midx-write.c b/midx-write.c index c73010df6d..60497586fd 100644 --- a/midx-write.c +++ b/midx-write.c @@ -1459,7 +1459,7 @@ static int write_midx_internal(struct odb_source *source, } if (ctx.m || ctx.base_midx) - close_object_store(ctx.repo->objects); + odb_close(ctx.repo->objects); if (commit_lock_file(&lk) < 0) die_errno(_("could not write multi-pack-index")); diff --git a/odb.c b/odb.c index 3ec21ef24e..bcefa5cede 100644 --- a/odb.c +++ b/odb.c @@ -9,6 +9,7 @@ #include "khash.h" #include "lockfile.h" #include "loose.h" +#include "midx.h" #include "object-file-convert.h" #include "object-file.h" #include "odb.h" @@ -1044,6 +1045,21 @@ struct object_database *odb_new(struct repository *repo) return o; } +void odb_close(struct object_database *o) +{ + struct odb_source *source; + + packfile_store_close(o->packfiles); + + for (source = o->sources; source; source = source->next) { + if (source->midx) + close_midx(source->midx); + source->midx = NULL; + } + + close_commit_graph(o); +} + static void odb_free_sources(struct object_database *o) { while (o->sources) { @@ -1076,7 +1092,7 @@ void odb_clear(struct object_database *o) free((char *) o->cached_objects[i].value.buf); FREE_AND_NULL(o->cached_objects); - close_object_store(o); + odb_close(o); packfile_store_free(o->packfiles); o->packfiles = NULL; diff --git a/odb.h b/odb.h index 9bb28008b1..71b4897c82 100644 --- a/odb.h +++ b/odb.h @@ -169,6 +169,13 @@ struct object_database { struct object_database *odb_new(struct repository *repo); void odb_clear(struct object_database *o); +/* + * Close the object database and all of its sources so that any held resources + * will be released. The database can still be used after closing it, in which + * case these resources may be reallocated. + */ +void odb_close(struct object_database *o); + /* * Clear caches, reload alternates and then reload object sources so that new * objects may become accessible. diff --git a/packfile.c b/packfile.c index 40f733dd23..af71eaf7e3 100644 --- a/packfile.c +++ b/packfile.c @@ -359,21 +359,6 @@ void close_pack(struct packed_git *p) oidset_clear(&p->bad_objects); } -void close_object_store(struct object_database *o) -{ - struct odb_source *source; - - packfile_store_close(o->packfiles); - - for (source = o->sources; source; source = source->next) { - if (source->midx) - close_midx(source->midx); - source->midx = NULL; - } - - close_commit_graph(o); -} - void unlink_pack_path(const char *pack_name, int force_delete) { static const char *exts[] = {".idx", ".pack", ".rev", ".keep", ".bitmap", ".promisor", ".mtimes"}; diff --git a/packfile.h b/packfile.h index 58fcc88e20..d9226a072a 100644 --- a/packfile.h +++ b/packfile.h @@ -279,7 +279,6 @@ struct object_database; unsigned char *use_pack(struct packed_git *, struct pack_window **, off_t, unsigned long *); void close_pack_windows(struct packed_git *); void close_pack(struct packed_git *); -void close_object_store(struct object_database *o); void unuse_pack(struct pack_window **); void clear_delta_base_cache(void); struct packed_git *add_packed_git(struct repository *r, const char *path, diff --git a/run-command.c b/run-command.c index ed9575bd6a..e3e02475cc 100644 --- a/run-command.c +++ b/run-command.c @@ -743,7 +743,7 @@ int start_command(struct child_process *cmd) fflush(NULL); if (cmd->close_object_store) - close_object_store(the_repository->objects); + odb_close(the_repository->objects); #ifndef GIT_WINDOWS_NATIVE { diff --git a/scalar.c b/scalar.c index f754311627..2aeb191cc8 100644 --- a/scalar.c +++ b/scalar.c @@ -931,7 +931,7 @@ static int cmd_delete(int argc, const char **argv) if (dir_inside_of(cwd, enlistment.buf) >= 0) res = error(_("refusing to delete current working directory")); else { - close_object_store(the_repository->objects); + odb_close(the_repository->objects); res = delete_enlistment(&enlistment); } strbuf_release(&enlistment); -- GitLab From f8bdf3127ab7df8a8f3039f41889b35eefe029a3 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:52 +0100 Subject: [PATCH 04/25] odb: refactor `odb_clear()` to `odb_free()` The function `odb_clear()` releases all resources allocated to an object database and ensures that all fields become zero'd out. Despite its naming though it doesn't really clear the object database so that it becomes ready for reuse afterwards again -- the caller would first have to reinitialize it, and that contradicts the terminology of "clearing" as we have defined it in our coding guidelines. There isn't really only a reason to have "clearing" semantics, either. There's only a single caller of `odb_clear()`, and that caller also ends up freeing the object database structure itself. Refactor the function to have "freeing" semantics instead, so that the structure itself is also freed, which allows us to drop some useless boilerplate to zero out the structure's members. This refactoring reveals that we're trying to close the commit graph multiple times: once directly via `free_commit_graph()`, and once via `odb_close()`. Drop the former call. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 19 ++++++++----------- odb.h | 4 +++- repository.c | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/odb.c b/odb.c index bcefa5cede..29cf6496c5 100644 --- a/odb.c +++ b/odb.c @@ -1073,30 +1073,27 @@ static void odb_free_sources(struct object_database *o) o->source_by_path = NULL; } -void odb_clear(struct object_database *o) +void odb_free(struct object_database *o) { - FREE_AND_NULL(o->alternate_db); + if (!o) + return; + + free(o->alternate_db); oidmap_clear(&o->replace_map, 1); pthread_mutex_destroy(&o->replace_mutex); - free_commit_graph(o->commit_graph); - o->commit_graph = NULL; - o->commit_graph_attempted = 0; - odb_free_sources(o); - o->sources_tail = NULL; - o->loaded_alternates = 0; for (size_t i = 0; i < o->cached_object_nr; i++) free((char *) o->cached_objects[i].value.buf); - FREE_AND_NULL(o->cached_objects); + free(o->cached_objects); odb_close(o); packfile_store_free(o->packfiles); - o->packfiles = NULL; - string_list_clear(&o->submodule_source_paths, 0); + + free(o); } void odb_reprepare(struct object_database *o) diff --git a/odb.h b/odb.h index 71b4897c82..77b313b784 100644 --- a/odb.h +++ b/odb.h @@ -167,7 +167,9 @@ struct object_database { }; struct object_database *odb_new(struct repository *repo); -void odb_clear(struct object_database *o); + +/* Free the object database and release all resources. */ +void odb_free(struct object_database *o); /* * Close the object database and all of its sources so that any held resources diff --git a/repository.c b/repository.c index 6aaa7ba008..3c8b3813b0 100644 --- a/repository.c +++ b/repository.c @@ -382,8 +382,8 @@ void repo_clear(struct repository *repo) FREE_AND_NULL(repo->worktree); FREE_AND_NULL(repo->submodule_prefix); - odb_clear(repo->objects); - FREE_AND_NULL(repo->objects); + odb_free(repo->objects); + repo->objects = NULL; parsed_object_pool_clear(repo->parsed_objects); FREE_AND_NULL(repo->parsed_objects); -- GitLab From b67b2d9fb7ccfaa72446d76abc8c36849d2e0685 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:53 +0100 Subject: [PATCH 05/25] odb: move logic to disable ref updates into repo Our object database sources have a field `disable_ref_updates`. This field can obviously be set to disable reference updates, but it is somewhat curious that this logic is hosted by the object database. The reason for this is that it was primarily added to keep us from accidentally updating references while an ODB transaction is ongoing. Any objects part of the transaction have not yet been committed to disk, so new references that point to them might get corrupted in case we never end up committing the transaction. As such, whenever we create a new transaction we set up a new temporary ODB source and mark it as disabling reference updates. This has one (and only one?) upside: once we have committed the transaction, the temporary source will be dropped and thus we clean up the disabled reference updates automatically. But other than that, it's somewhat misdesigned: - We can have multiple ODB sources, but only the currently active source inhibits reference updates. - We're mixing concerns of the refdb with the ODB. Arguably, the decision of whether we can update references or not should be handled by the refdb. But that wouldn't be a great fit either, as there can be one refdb per worktree. So we'd again have the same problem that a "global" intent becomes localized to a specific instance. Instead, move the setting into the repository. While at it, convert it into a boolean. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 3 ++- odb.h | 7 ------- refs.c | 2 +- repository.c | 2 +- repository.h | 9 ++++++++- setup.c | 2 +- 6 files changed, 13 insertions(+), 12 deletions(-) diff --git a/odb.c b/odb.c index 29cf6496c5..ccc6e999e7 100644 --- a/odb.c +++ b/odb.c @@ -360,7 +360,7 @@ struct odb_source *odb_set_temporary_primary_source(struct object_database *odb, * Disable ref updates while a temporary odb is active, since * the objects in the database may roll back. */ - source->disable_ref_updates = 1; + odb->repo->disable_ref_updates = true; source->will_destroy = will_destroy; source->next = odb->sources; odb->sources = source; @@ -387,6 +387,7 @@ void odb_restore_primary_source(struct object_database *odb, if (cur_source->next != restore_source) BUG("we expect the old primary object store to be the first alternate"); + odb->repo->disable_ref_updates = false; odb->sources = restore_source; odb_source_free(cur_source); } diff --git a/odb.h b/odb.h index 77b313b784..99c4d48972 100644 --- a/odb.h +++ b/odb.h @@ -66,13 +66,6 @@ struct odb_source { */ bool local; - /* - * This is a temporary object store created by the tmp_objdir - * facility. Disable ref updates since the objects in the store - * might be discarded on rollback. - */ - int disable_ref_updates; - /* * This object store is ephemeral, so there is no need to fsync. */ diff --git a/refs.c b/refs.c index 965381367e..6c7283d9eb 100644 --- a/refs.c +++ b/refs.c @@ -2491,7 +2491,7 @@ int ref_transaction_prepare(struct ref_transaction *transaction, break; } - if (refs->repo->objects->sources->disable_ref_updates) { + if (refs->repo->disable_ref_updates) { strbuf_addstr(err, _("ref updates forbidden inside quarantine environment")); return -1; diff --git a/repository.c b/repository.c index 3c8b3813b0..455c2d279f 100644 --- a/repository.c +++ b/repository.c @@ -179,7 +179,7 @@ void repo_set_gitdir(struct repository *repo, repo->objects->sources->path = objects_path; } - repo->objects->sources->disable_ref_updates = o->disable_ref_updates; + repo->disable_ref_updates = o->disable_ref_updates; free(repo->objects->alternate_db); repo->objects->alternate_db = xstrdup_or_null(o->alternate_db); diff --git a/repository.h b/repository.h index 5808a5d610..614649413b 100644 --- a/repository.h +++ b/repository.h @@ -71,6 +71,13 @@ struct repository { */ struct ref_store *refs_private; + /* + * Disable ref updates. This is especially used in contexts where + * transactions may still be rolled back so that we don't start to + * reference objects that may vanish. + */ + bool disable_ref_updates; + /* * A strmap of ref_stores, stored by submodule name, accessible via * `repo_get_submodule_ref_store()`. @@ -187,7 +194,7 @@ struct set_gitdir_args { const char *graft_file; const char *index_file; const char *alternate_db; - int disable_ref_updates; + bool disable_ref_updates; }; void repo_set_gitdir(struct repository *repo, const char *root, diff --git a/setup.c b/setup.c index 8bf52df716..a752e9fc84 100644 --- a/setup.c +++ b/setup.c @@ -1682,7 +1682,7 @@ void setup_git_env(const char *git_dir) args.index_file = getenv_safe(&to_free, INDEX_ENVIRONMENT); args.alternate_db = getenv_safe(&to_free, ALTERNATE_DB_ENVIRONMENT); if (getenv(GIT_QUARANTINE_ENVIRONMENT)) { - args.disable_ref_updates = 1; + args.disable_ref_updates = true; } repo_set_gitdir(the_repository, git_dir, &args); -- GitLab From 5d795b34dcbf46039c3dda028bb8df8d75a5a9d0 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:54 +0100 Subject: [PATCH 06/25] oidset: introduce `oidset_equal()` Introduce a new function that allows the caller to verify whether two oidsets contain the exact same object IDs. Note that this change requires us to change `oidset_iter_init()` to accept a `const struct oidset`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- oidset.c | 16 ++++++++++++++++ oidset.h | 9 +++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/oidset.c b/oidset.c index 8d36aef8dc..c8ff0b385c 100644 --- a/oidset.c +++ b/oidset.c @@ -16,6 +16,22 @@ int oidset_contains(const struct oidset *set, const struct object_id *oid) return pos != kh_end(&set->set); } +bool oidset_equal(const struct oidset *a, const struct oidset *b) +{ + struct oidset_iter iter; + struct object_id *a_oid; + + if (oidset_size(a) != oidset_size(b)) + return false; + + oidset_iter_init(a, &iter); + while ((a_oid = oidset_iter_next(&iter))) + if (!oidset_contains(b, a_oid)) + return false; + + return true; +} + int oidset_insert(struct oidset *set, const struct object_id *oid) { int added; diff --git a/oidset.h b/oidset.h index 0106b6f278..e0f1a6ff4f 100644 --- a/oidset.h +++ b/oidset.h @@ -38,6 +38,11 @@ void oidset_init(struct oidset *set, size_t initial_size); */ int oidset_contains(const struct oidset *set, const struct object_id *oid); +/** + * Returns true iff `a` and `b` contain the exact same OIDs. + */ +bool oidset_equal(const struct oidset *a, const struct oidset *b); + /** * Insert the oid into the set; a copy is made, so "oid" does not need * to persist after this function is called. @@ -94,11 +99,11 @@ void oidset_parse_file_carefully(struct oidset *set, const char *path, oidset_parse_tweak_fn fn, void *cbdata); struct oidset_iter { - kh_oid_set_t *set; + const kh_oid_set_t *set; khiter_t iter; }; -static inline void oidset_iter_init(struct oidset *set, +static inline void oidset_iter_init(const struct oidset *set, struct oidset_iter *iter) { iter->set = &set->set; -- GitLab From 8dc22e87f000a092b62b4fb08e2542433f1ae192 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:55 +0100 Subject: [PATCH 07/25] builtin/index-pack: fix deferred fsck outside repos When asked to perform object consistency checks via the `--fsck-objects` flag we verify that each object part of the pack is valid. In general, this check can even be performed outside of a Git repository: we don't need an initialized object database as we simply read the object from the packfile directly. But there's one exception: a subset of the object checks may be deferred to a later point in time. For now, this only concerns ".gitmodules" and ".gitattributes" files: whenever we see a tree referencing these files we queue them for a deferred check. This is done because we need to do some extra checks for those files to ensure that they are well-formed, and these checks need to be done regardless of whether the corresponding blobs are part of the packfile or not. This works inside a repository, but unfortunately the logic leads to a segfault when running outside of one. This is because we eventually call `odb_read_object()`, which will crash because the object database has not been initialized. There's multiple options here: - We could in theory create a purely in-memory database with only a packfile store that contains the single packfile. We don't really have the infrastructure for this yet though, and it would end up being quite hacky. - We could refuse to perform consistency checks outside of a repository. But most of the checks work alright, so this would be a regression. - We can skip the finalizing consistency checks when running outside of a repository. This is not as invasive as skipping all checks, but it's not great to randomly skip a subset of tests, either. None of these options really feel perfect. The first one would be the obvious choice if easily possible. There's another option though: instead of skipping the final object checks, we can die if there are any queued object checks. With this change we now die exactly if and only if we would have previously segfaulted. Like this we ensure that objects that _may_ fail the consistency checks won't be silently skipped, and at the same time we give users a much better error message. Refactor the code accordingly and add a test that would have triggered the segfault. Note that we also move down the logic to add the packfile to the store. There is no point doing this any earlier than right before we execute `fsck_finish()`, and it ensures that the logic to set up and perform the consistency check is self-contained. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/index-pack.c | 21 ++++++++++++++++++--- fsck.c | 6 ++++++ fsck.h | 7 +++++++ t/t5302-pack-index.sh | 16 ++++++++++++++++ 4 files changed, 47 insertions(+), 3 deletions(-) diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 2b78ba7fe4..699fe678cd 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -1640,7 +1640,7 @@ static void final(const char *final_pack_name, const char *curr_pack_name, rename_tmp_packfile(&final_index_name, curr_index_name, &index_name, hash, "idx", 1); - if (do_fsck_object) + if (do_fsck_object && startup_info->have_repository) packfile_store_load_pack(the_repository->objects->packfiles, final_index_name, 0); @@ -2110,8 +2110,23 @@ int cmd_index_pack(int argc, else close(input_fd); - if (do_fsck_object && fsck_finish(&fsck_options)) - die(_("fsck error in pack objects")); + if (do_fsck_object) { + /* + * We cannot perform queued consistency checks when running + * outside of a repository because those require us to read + * from the object database, which is uninitialized. + * + * TODO: we may eventually set up an in-memory object database, + * which would allow us to perform these queued checks. + */ + if (!startup_info->have_repository && + fsck_has_queued_checks(&fsck_options)) + die(_("cannot perform queued object checks outside " + "of a repository")); + + if (fsck_finish(&fsck_options)) + die(_("fsck error in pack objects")); + } free(opts.anomaly); free(objects); diff --git a/fsck.c b/fsck.c index 341e100d24..8e1565fe6d 100644 --- a/fsck.c +++ b/fsck.c @@ -1350,6 +1350,12 @@ int fsck_finish(struct fsck_options *options) return ret; } +bool fsck_has_queued_checks(struct fsck_options *options) +{ + return !oidset_equal(&options->gitmodules_found, &options->gitmodules_done) || + !oidset_equal(&options->gitattributes_found, &options->gitattributes_done); +} + void fsck_options_clear(struct fsck_options *options) { free(options->msg_type); diff --git a/fsck.h b/fsck.h index cb6ef32f4f..336917c045 100644 --- a/fsck.h +++ b/fsck.h @@ -248,6 +248,13 @@ int fsck_tag_standalone(const struct object_id *oid, const char *buffer, */ int fsck_finish(struct fsck_options *options); +/* + * Check whether there are any checks that have been queued up and that still + * need to be run. Returns `false` iff `fsck_finish()` wouldn't perform any + * actions, `true` otherwise. + */ +bool fsck_has_queued_checks(struct fsck_options *options); + /* * Clear the fsck_options struct, freeing any allocated memory. */ diff --git a/t/t5302-pack-index.sh b/t/t5302-pack-index.sh index 413c99274c..9697448cb2 100755 --- a/t/t5302-pack-index.sh +++ b/t/t5302-pack-index.sh @@ -293,4 +293,20 @@ test_expect_success 'too-large packs report the breach' ' grep "maximum allowed size (20 bytes)" err ' +# git-index-pack(1) uses the default hash algorithm outside of the repository, +# and it has no way to tell it otherwise. So we can only run this test with the +# default hash algorithm, as it would otherwise fail to parse the tree. +test_expect_success DEFAULT_HASH_ALGORITHM 'index-pack --fsck-objects outside of a repo' ' + test_when_finished "rm -rf repo" && + git init repo && + ( + cd repo && + printf "100644 blob $(test_oid 001)\t.gitattributes\n" >tree && + git mktree --missing tree-oid && + git pack-objects err && + test_grep "cannot perform queued object checks outside of a repository" err + ) +' + test_done -- GitLab From eea83c010cf431325a05f06cc7c64029538c8495 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:56 +0100 Subject: [PATCH 08/25] t/helper: stop setting up `the_repository` repeatedly The "repository" test helper sets up `the_repository` twice. In fact though, we don't even have to set it up even once: all we need is to set up its hash algorithm, because we still depend on some subsystems that aren't free of `the_repository`. Refactor the code accordingly. This prepares for a subsequent change, where setting up the repository repeatedly will lead to a `BUG()`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- t/helper/test-repository.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/t/helper/test-repository.c b/t/helper/test-repository.c index 63c37de33d..9ba94cdffa 100644 --- a/t/helper/test-repository.c +++ b/t/helper/test-repository.c @@ -17,10 +17,6 @@ static void test_parse_commit_in_graph(const char *gitdir, const char *worktree, struct commit *c; struct commit_list *parent; - setup_git_env(gitdir); - - repo_clear(the_repository); - if (repo_init(&r, gitdir, worktree)) die("Couldn't init repo"); @@ -47,10 +43,6 @@ static void test_get_commit_tree_in_graph(const char *gitdir, struct commit *c; struct tree *tree; - setup_git_env(gitdir); - - repo_clear(the_repository); - if (repo_init(&r, gitdir, worktree)) die("Couldn't init repo"); @@ -75,24 +67,20 @@ static void test_get_commit_tree_in_graph(const char *gitdir, int cmd__repository(int argc, const char **argv) { - int nongit_ok = 0; - - setup_git_directory_gently(&nongit_ok); - if (argc < 2) die("must have at least 2 arguments"); if (!strcmp(argv[1], "parse_commit_in_graph")) { struct object_id oid; if (argc < 5) die("not enough arguments"); - if (parse_oid_hex(argv[4], &oid, &argv[4])) + if (parse_oid_hex_any(argv[4], &oid, &argv[4]) == GIT_HASH_UNKNOWN) die("cannot parse oid '%s'", argv[4]); test_parse_commit_in_graph(argv[2], argv[3], &oid); } else if (!strcmp(argv[1], "get_commit_tree_in_graph")) { struct object_id oid; if (argc < 5) die("not enough arguments"); - if (parse_oid_hex(argv[4], &oid, &argv[4])) + if (parse_oid_hex_any(argv[4], &oid, &argv[4]) == GIT_HASH_UNKNOWN) die("cannot parse oid '%s'", argv[4]); test_get_commit_tree_in_graph(argv[2], argv[3], &oid); } else { -- GitLab From c257bd59165e2f55dfa2c97b0ca1e39131513654 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:57 +0100 Subject: [PATCH 09/25] http-push: stop setting up `the_repository` for each reference When pushing references via HTTP we call `repo_init_revisions()` in a loop for each reference that we're about to push. As third argument we pass the result of `setup_git_directory()`, which causes us to reinitialize the repository every single time. This is an obvious waste of compute, as the repository that we're working in will never change across any of the initializations. The only reason that we do this is to retrieve the directory of the repository. Furthermore, this is about to create issues in a subsequent commit, where reinitializing the repository will cause a `BUG()`. Address this by storing the Git directory in a variable instead so that we don't have to call the function repeatedly. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- http-push.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/http-push.c b/http-push.c index a1c01e3b9b..a48ca23799 100644 --- a/http-push.c +++ b/http-push.c @@ -1725,6 +1725,7 @@ int cmd_main(int argc, const char **argv) int i; int new_refs; struct ref *ref, *local_refs = NULL; + const char *gitdir; CALLOC_ARRAY(repo, 1); @@ -1787,7 +1788,7 @@ int cmd_main(int argc, const char **argv) if (delete_branch && rs.nr != 1) die("You must specify only one branch name when deleting a remote branch"); - setup_git_directory(); + gitdir = setup_git_directory(); memset(remote_dir_exists, -1, 256); @@ -1941,7 +1942,7 @@ int cmd_main(int argc, const char **argv) if (!push_all && !is_null_oid(&ref->old_oid)) strvec_pushf(&commit_argv, "^%s", oid_to_hex(&ref->old_oid)); - repo_init_revisions(the_repository, &revs, setup_git_directory()); + repo_init_revisions(the_repository, &revs, gitdir); setup_revisions_from_strvec(&commit_argv, &revs, NULL); revs.edge_hint = 0; /* just in case */ -- GitLab From 35d9fc65edc0a5df9f714d02afaa2c942fb28570 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:58 +0100 Subject: [PATCH 10/25] odb: handle initialization of sources in `odb_new()` The logic to set up a new object database is currently distributed across two functions in "repository.c": - In `initialize_repository()` we initialize an empty object database. This object database is not fully initialized and doesn't have any sources attached to it. - The primary object database source is then created in `repo_set_gitdir()`. Ideally though, the logic should be entirely self-contained so that we can iterate more readily on how exactly the sources themselves get set up. Refactor `odb_new()` to handle both allocation and setup of the object database. This ensures that the object database is always initialized and ready for use, and it allows us to change how the sources get set up eventually. Note that `repo_set_gitdir()` still reaches into the sources when the function gets called with an already-initialized object database. This will be fixed in the next commit. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 14 +++++++++++++- odb.h | 15 ++++++++++++++- repository.c | 20 ++++++++------------ 3 files changed, 35 insertions(+), 14 deletions(-) diff --git a/odb.c b/odb.c index ccc6e999e7..88b40c81c0 100644 --- a/odb.c +++ b/odb.c @@ -1034,15 +1034,27 @@ int odb_write_object_stream(struct object_database *odb, return odb_source_loose_write_stream(odb->sources, stream, len, oid); } -struct object_database *odb_new(struct repository *repo) +struct object_database *odb_new(struct repository *repo, + const char *primary_source, + const char *secondary_sources) { struct object_database *o = xmalloc(sizeof(*o)); + char *to_free = NULL; memset(o, 0, sizeof(*o)); o->repo = repo; o->packfiles = packfile_store_new(o); pthread_mutex_init(&o->replace_mutex, NULL); string_list_init_dup(&o->submodule_source_paths); + + if (!primary_source) + primary_source = to_free = xstrfmt("%s/objects", repo->commondir); + o->sources = odb_source_new(o, primary_source, true); + o->sources_tail = &o->sources->next; + o->alternate_db = xstrdup_or_null(secondary_sources); + + free(to_free); + return o; } diff --git a/odb.h b/odb.h index 99c4d48972..41b3c03027 100644 --- a/odb.h +++ b/odb.h @@ -159,7 +159,20 @@ struct object_database { struct string_list submodule_source_paths; }; -struct object_database *odb_new(struct repository *repo); +/* + * Create a new object database for the given repository. + * + * If the primary source parameter is set it will override the usual primary + * object directory derived from the repository's common directory. The + * alternate sources are expected to be a PATH_SEP-separated list of secondary + * sources. Note that these alternate sources will be added in addition to, not + * instead of, the alternates identified by the primary source. + * + * Returns the newly created object database. + */ +struct object_database *odb_new(struct repository *repo, + const char *primary_source, + const char *alternate_sources); /* Free the object database and release all resources. */ void odb_free(struct object_database *o); diff --git a/repository.c b/repository.c index 455c2d279f..5975c8f341 100644 --- a/repository.c +++ b/repository.c @@ -52,7 +52,6 @@ static void set_default_hash_algo(struct repository *repo) void initialize_repository(struct repository *repo) { - repo->objects = odb_new(repo); repo->remote_state = remote_state_new(); repo->parsed_objects = parsed_object_pool_new(repo); ALLOC_ARRAY(repo->index, 1); @@ -160,29 +159,26 @@ void repo_set_gitdir(struct repository *repo, * until after xstrdup(root). Then we can free it. */ char *old_gitdir = repo->gitdir; - char *objects_path = NULL; repo->gitdir = xstrdup(gitfile ? gitfile : root); free(old_gitdir); repo_set_commondir(repo, o->commondir); - expand_base_dir(&objects_path, o->object_dir, - repo->commondir, "objects"); - - if (!repo->objects->sources) { - repo->objects->sources = odb_source_new(repo->objects, - objects_path, true); - repo->objects->sources_tail = &repo->objects->sources->next; - free(objects_path); + + if (!repo->objects) { + repo->objects = odb_new(repo, o->object_dir, o->alternate_db); } else { + char *objects_path = NULL; + expand_base_dir(&objects_path, o->object_dir, + repo->commondir, "objects"); free(repo->objects->sources->path); repo->objects->sources->path = objects_path; + free(repo->objects->alternate_db); + repo->objects->alternate_db = xstrdup_or_null(o->alternate_db); } repo->disable_ref_updates = o->disable_ref_updates; - free(repo->objects->alternate_db); - repo->objects->alternate_db = xstrdup_or_null(o->alternate_db); expand_base_dir(&repo->graft_file, o->graft_file, repo->commondir, "info/grafts"); expand_base_dir(&repo->index_file, o->index_file, -- GitLab From 2574c617362a0c67d15fa01e01cdbd0f6bcdbc93 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:50:59 +0100 Subject: [PATCH 11/25] chdir-notify: add function to unregister listeners While we (obviously) have a way to register new listeners that get called whenever we chdir(3p), we don't have an equivalent that can be used to unregister such a listener again. Add one, as it will be required in a subsequent commit. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- chdir-notify.c | 18 ++++++++++++++++++ chdir-notify.h | 2 ++ 2 files changed, 20 insertions(+) diff --git a/chdir-notify.c b/chdir-notify.c index 0d7bc04607..f8bfe3cbef 100644 --- a/chdir-notify.c +++ b/chdir-notify.c @@ -25,6 +25,24 @@ void chdir_notify_register(const char *name, list_add_tail(&e->list, &chdir_notify_entries); } +void chdir_notify_unregister(const char *name, chdir_notify_callback cb, + void *data) +{ + struct list_head *pos, *p; + + list_for_each_safe(pos, p, &chdir_notify_entries) { + struct chdir_notify_entry *e = + list_entry(pos, struct chdir_notify_entry, list); + + if (e->cb != cb || e->data != data || !e->name != !name || + (e->name && strcmp(e->name, name))) + continue; + + list_del(pos); + free(e); + } +} + static void reparent_cb(const char *name, const char *old_cwd, const char *new_cwd, diff --git a/chdir-notify.h b/chdir-notify.h index 366e4c1ee9..81eb69d846 100644 --- a/chdir-notify.h +++ b/chdir-notify.h @@ -41,6 +41,8 @@ typedef void (*chdir_notify_callback)(const char *name, const char *new_cwd, void *data); void chdir_notify_register(const char *name, chdir_notify_callback cb, void *data); +void chdir_notify_unregister(const char *name, chdir_notify_callback cb, + void *data); void chdir_notify_reparent(const char *name, char **path); /* -- GitLab From 2816b748e5c300afda559a09426f8342c235c29d Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:51:00 +0100 Subject: [PATCH 12/25] odb: handle changing a repository's commondir The function `repo_set_gitdir()` is called in two situations: - To initialize the repository with its discovered location. As part of this we also set up the new object database. - To update the repository's discovered location in case the process changes its working directory so that we update relative paths. This means we also have to update any relative paths that are potentially used in the object database. In the context of the object database we ideally wouldn't ever have to worry about the second case: if all paths used by our object database sources were absolute, then we wouldn't have to update them. But unfortunately, the paths aren't only used to locate files owned by the given source, but we also use them for reporting purposes. One such example is `repo_get_object_directory()`, where we cannot just change semantics to always return absolute paths, as that is likely to break tooling out there. One solution to this would be to have both a "display path" and an "internal path". This would allow us to use internal paths for all internal matters, but continue to use the potentially-relative display paths so that we don't break compatibility. But converting the codebase to honor this split is quite a messy endeavour, and it wouldn't even help us with the goal to get rid of the need to update the display path on chdir(3p). Another solution would be to rework "setup.c" so that we never have to update paths in the first place. In that case, we'd only initialize the repository once we have figured out final locations for all directories. This would be a significant simplification of that subsystem indeed, but the current logic is so messy that it would take significant investments to get there. Meanwhile though, while object sources may still use relative paths, the best thing we can do is to handle the reparenting of the object source paths in the object database itself. This can be done by registering one callback for each object database so that we get notified whenever the current working directory changes, and we then perform the reparenting ourselves. Ideally, this wouldn't even happen on the object database level, but instead handled by each object database source. But we don't yet have proper pluggable object database sources, so this will need to be handled at a later point in time. The logic itself is rather simple: - We register the callback when creating the object database. - We unregister the callback when releasing it again. - We split up `set_git_dir_1()` so that it becomes possible to skip recreating the object database. This is required because the function is called both when the current working directory changes, but also when we set up the repository. Calling this function without skipping creation of the ODB will result in a bug in case it's already created. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 37 +++++++++++++++++++++-- odb.h | 4 --- repository.c | 13 ++------ repository.h | 1 + setup.c | 84 ++++++++++++++++++++++++++++------------------------ 5 files changed, 83 insertions(+), 56 deletions(-) diff --git a/odb.c b/odb.c index 88b40c81c0..70665fb7f4 100644 --- a/odb.c +++ b/odb.c @@ -1,5 +1,6 @@ #include "git-compat-util.h" #include "abspath.h" +#include "chdir-notify.h" #include "commit-graph.h" #include "config.h" #include "dir.h" @@ -142,9 +143,9 @@ static void read_info_alternates(struct object_database *odb, const char *relative_base, int depth); -struct odb_source *odb_source_new(struct object_database *odb, - const char *path, - bool local) +static struct odb_source *odb_source_new(struct object_database *odb, + const char *path, + bool local) { struct odb_source *source; @@ -1034,6 +1035,32 @@ int odb_write_object_stream(struct object_database *odb, return odb_source_loose_write_stream(odb->sources, stream, len, oid); } +static void odb_update_commondir(const char *name UNUSED, + const char *old_cwd, + const char *new_cwd, + void *cb_data) +{ + struct object_database *odb = cb_data; + struct odb_source *source; + + /* + * In theory, we only have to do this for the primary object source, as + * alternates' paths are always resolved to an absolute path. + */ + for (source = odb->sources; source; source = source->next) { + char *path; + + if (is_absolute_path(source->path)) + continue; + + path = reparent_relative_path(old_cwd, new_cwd, + source->path); + + free(source->path); + source->path = path; + } +} + struct object_database *odb_new(struct repository *repo, const char *primary_source, const char *secondary_sources) @@ -1055,6 +1082,8 @@ struct object_database *odb_new(struct repository *repo, free(to_free); + chdir_notify_register(NULL, odb_update_commondir, o); + return o; } @@ -1106,6 +1135,8 @@ void odb_free(struct object_database *o) packfile_store_free(o->packfiles); string_list_clear(&o->submodule_source_paths, 0); + chdir_notify_unregister(NULL, odb_update_commondir, o); + free(o); } diff --git a/odb.h b/odb.h index 41b3c03027..014cd9585a 100644 --- a/odb.h +++ b/odb.h @@ -78,10 +78,6 @@ struct odb_source { char *path; }; -struct odb_source *odb_source_new(struct object_database *odb, - const char *path, - bool local); - struct packed_git; struct packfile_store; struct cached_object_entry; diff --git a/repository.c b/repository.c index 5975c8f341..863f24411b 100644 --- a/repository.c +++ b/repository.c @@ -165,17 +165,10 @@ void repo_set_gitdir(struct repository *repo, repo_set_commondir(repo, o->commondir); - if (!repo->objects) { + if (!repo->objects) repo->objects = odb_new(repo, o->object_dir, o->alternate_db); - } else { - char *objects_path = NULL; - expand_base_dir(&objects_path, o->object_dir, - repo->commondir, "objects"); - free(repo->objects->sources->path); - repo->objects->sources->path = objects_path; - free(repo->objects->alternate_db); - repo->objects->alternate_db = xstrdup_or_null(o->alternate_db); - } + else if (!o->skip_initializing_odb) + BUG("cannot reinitialize an already-initialized object directory"); repo->disable_ref_updates = o->disable_ref_updates; diff --git a/repository.h b/repository.h index 614649413b..6063c4b846 100644 --- a/repository.h +++ b/repository.h @@ -195,6 +195,7 @@ struct set_gitdir_args { const char *index_file; const char *alternate_db; bool disable_ref_updates; + bool skip_initializing_odb; }; void repo_set_gitdir(struct repository *repo, const char *root, diff --git a/setup.c b/setup.c index a752e9fc84..a625f9fbc8 100644 --- a/setup.c +++ b/setup.c @@ -1002,10 +1002,51 @@ const char *read_gitfile_gently(const char *path, int *return_error_code) return error_code ? NULL : path; } -static void set_git_dir_1(const char *path) +static void setup_git_env_internal(const char *git_dir, + bool skip_initializing_odb) +{ + char *git_replace_ref_base; + const char *shallow_file; + const char *replace_ref_base; + struct set_gitdir_args args = { NULL }; + struct strvec to_free = STRVEC_INIT; + + args.commondir = getenv_safe(&to_free, GIT_COMMON_DIR_ENVIRONMENT); + args.object_dir = getenv_safe(&to_free, DB_ENVIRONMENT); + args.graft_file = getenv_safe(&to_free, GRAFT_ENVIRONMENT); + args.index_file = getenv_safe(&to_free, INDEX_ENVIRONMENT); + args.alternate_db = getenv_safe(&to_free, ALTERNATE_DB_ENVIRONMENT); + if (getenv(GIT_QUARANTINE_ENVIRONMENT)) + args.disable_ref_updates = true; + args.skip_initializing_odb = skip_initializing_odb; + + repo_set_gitdir(the_repository, git_dir, &args); + strvec_clear(&to_free); + + if (getenv(NO_REPLACE_OBJECTS_ENVIRONMENT)) + disable_replace_refs(); + replace_ref_base = getenv(GIT_REPLACE_REF_BASE_ENVIRONMENT); + git_replace_ref_base = xstrdup(replace_ref_base ? replace_ref_base + : "refs/replace/"); + update_ref_namespace(NAMESPACE_REPLACE, git_replace_ref_base); + + shallow_file = getenv(GIT_SHALLOW_FILE_ENVIRONMENT); + if (shallow_file) + set_alternate_shallow_file(the_repository, shallow_file, 0); + + if (git_env_bool(NO_LAZY_FETCH_ENVIRONMENT, 0)) + fetch_if_missing = 0; +} + +void setup_git_env(const char *git_dir) +{ + setup_git_env_internal(git_dir, false); +} + +static void set_git_dir_1(const char *path, bool skip_initializing_odb) { xsetenv(GIT_DIR_ENVIRONMENT, path, 1); - setup_git_env(path); + setup_git_env_internal(path, skip_initializing_odb); } static void update_relative_gitdir(const char *name UNUSED, @@ -1020,7 +1061,7 @@ static void update_relative_gitdir(const char *name UNUSED, trace_printf_key(&trace_setup_key, "setup: move $GIT_DIR to '%s'", path); - set_git_dir_1(path); + set_git_dir_1(path, true); if (tmp_objdir) tmp_objdir_reapply_primary_odb(tmp_objdir, old_cwd, new_cwd); free(path); @@ -1035,7 +1076,7 @@ static void set_git_dir(const char *path, int make_realpath) path = realpath.buf; } - set_git_dir_1(path); + set_git_dir_1(path, false); if (!is_absolute_path(path)) chdir_notify_register(NULL, update_relative_gitdir, NULL); @@ -1668,41 +1709,6 @@ enum discovery_result discover_git_directory_reason(struct strbuf *commondir, return result; } -void setup_git_env(const char *git_dir) -{ - char *git_replace_ref_base; - const char *shallow_file; - const char *replace_ref_base; - struct set_gitdir_args args = { NULL }; - struct strvec to_free = STRVEC_INIT; - - args.commondir = getenv_safe(&to_free, GIT_COMMON_DIR_ENVIRONMENT); - args.object_dir = getenv_safe(&to_free, DB_ENVIRONMENT); - args.graft_file = getenv_safe(&to_free, GRAFT_ENVIRONMENT); - args.index_file = getenv_safe(&to_free, INDEX_ENVIRONMENT); - args.alternate_db = getenv_safe(&to_free, ALTERNATE_DB_ENVIRONMENT); - if (getenv(GIT_QUARANTINE_ENVIRONMENT)) { - args.disable_ref_updates = true; - } - - repo_set_gitdir(the_repository, git_dir, &args); - strvec_clear(&to_free); - - if (getenv(NO_REPLACE_OBJECTS_ENVIRONMENT)) - disable_replace_refs(); - replace_ref_base = getenv(GIT_REPLACE_REF_BASE_ENVIRONMENT); - git_replace_ref_base = xstrdup(replace_ref_base ? replace_ref_base - : "refs/replace/"); - update_ref_namespace(NAMESPACE_REPLACE, git_replace_ref_base); - - shallow_file = getenv(GIT_SHALLOW_FILE_ENVIRONMENT); - if (shallow_file) - set_alternate_shallow_file(the_repository, shallow_file, 0); - - if (git_env_bool(NO_LAZY_FETCH_ENVIRONMENT, 0)) - fetch_if_missing = 0; -} - const char *enter_repo(const char *path, unsigned flags) { static struct strbuf validated_path = STRBUF_INIT; -- GitLab From ac65c70663b092e823b0d3de1c1cfdee0a4fbc8e Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 19 Nov 2025 08:51:01 +0100 Subject: [PATCH 13/25] odb: handle recreation of quarantine directories In the preceding commit we have moved the logic that reparents object database sources on chdir(3p) from "setup.c" into "odb.c". Let's also do the same for any temporary quarantine directories so that the complete reparenting logic is self-contained in "odb.c". Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 7 +++++++ setup.c | 5 ----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/odb.c b/odb.c index 70665fb7f4..dc8f292f3d 100644 --- a/odb.c +++ b/odb.c @@ -24,6 +24,7 @@ #include "strbuf.h" #include "strvec.h" #include "submodule.h" +#include "tmp-objdir.h" #include "trace2.h" #include "write-or-die.h" @@ -1041,8 +1042,11 @@ static void odb_update_commondir(const char *name UNUSED, void *cb_data) { struct object_database *odb = cb_data; + struct tmp_objdir *tmp_objdir; struct odb_source *source; + tmp_objdir = tmp_objdir_unapply_primary_odb(); + /* * In theory, we only have to do this for the primary object source, as * alternates' paths are always resolved to an absolute path. @@ -1059,6 +1063,9 @@ static void odb_update_commondir(const char *name UNUSED, free(source->path); source->path = path; } + + if (tmp_objdir) + tmp_objdir_reapply_primary_odb(tmp_objdir, old_cwd, new_cwd); } struct object_database *odb_new(struct repository *repo, diff --git a/setup.c b/setup.c index a625f9fbc8..ae66188af3 100644 --- a/setup.c +++ b/setup.c @@ -22,7 +22,6 @@ #include "chdir-notify.h" #include "path.h" #include "quote.h" -#include "tmp-objdir.h" #include "trace.h" #include "trace2.h" #include "worktree.h" @@ -1056,14 +1055,10 @@ static void update_relative_gitdir(const char *name UNUSED, { char *path = reparent_relative_path(old_cwd, new_cwd, repo_get_git_dir(the_repository)); - struct tmp_objdir *tmp_objdir = tmp_objdir_unapply_primary_odb(); - trace_printf_key(&trace_setup_key, "setup: move $GIT_DIR to '%s'", path); set_git_dir_1(path, true); - if (tmp_objdir) - tmp_objdir_reapply_primary_odb(tmp_objdir, old_cwd, new_cwd); free(path); } -- GitLab From b59aab1761468e0fd448a85a90cb1a35ebd140cf Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 4 Dec 2025 09:10:17 +0100 Subject: [PATCH 14/25] builtin/maintenance: use "geometric" strategy by default Hi, this patch series is a follow-up to Git 2.52, where I introduced a new "geometric" strategy for repository maintenance. The main selling point of this strategy is improved heuristics for repacking objects: whereas git-gc(1) regularly performs all-into-one repacks, the geometric strategy only does that in case a significant number of objects have been added to the repository. This is a significant improvement, especially for large repositories. This patch series thus proposes to make the "geometric" strategy the default strategy for our housekeeping to finally start sunsetting git-gc(1). Both GitHub and GitLab have been running with geometric repacking in production systems for a long time already, which makes me reasonably sure that it works as expected. That being said, I discovered some bugs with this series: - Geometric repacking does not work for repositories that have promisor remotes. This bug is fixed in the first patch though. Other than that, most of the patches adapt our tests to fix races and implicit reliance on the default maintenance strategy. - We miscompute the auto-condition for writing commit graphs. - We have a memory leak caused by us not properly closing submodule repositories in git-grep(1). The final patch starts to use the "geometric" strategy unconditionally. I was initially pondering whether we maybe want to guard it with "feature.experimental" or maybe even `WITH_BREAKING_CHANGES`. For now I decided against that, but I'm happy to revisit that decision. The patch series is built on top of f0ef5b6d9b with ps/object-source-management at ac65c70663 (odb: handle recreation of quarantine directories, 2025-11-19) merged into it. Thanks! Patrick To: git@vger.kernel.org Cc: Taylor Blau Cc: Derrick Stolee --- b4-submit-tracking --- # This section is used internally by b4 prep for tracking purposes. { "series": { "revision": 1, "change-id": "20251204-b4-pks-maintenance-default-geometric-strategy-bb4083796238", "prefixes": [] } } -- GitLab From 30afd5f5516136aa706242adc9c9ea169fa0bd17 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 13:16:13 +0100 Subject: [PATCH 15/25] builtin/repack: fix geometric repacks with promisor remotes When repacking a repository with promisor remotes git-repack(1) knows to pass "--exclude-promisor-objects" to git-pack-objects(1). This option ensures that the new pack will not contain any promised object that do not yet exist locally. This command line option is incompatible with "--stdin-packs": the latter option enables the rev-walk-based machinery to figure out which objects to add to the pack, whereas the former tells git-pack-objects(1) to merge all packs passed via stdin into one large pack. As we do not know to filter those packs via the passed-in revisions it is clear that at the current point in time nothing sensible comes out of combining these two options. But there is one case where git-repack(1) decides to pass both options: when performing a geometric repack we always pass "--stdin-packs" to identify the packs that should be merged. So if one performs a geometric repack in a partial clone we'll end up with both options, and that causes the repack to fail. Fix this issue by never passing "--exclude-promisor-objects" when we have a geometric split factor. We don't need the option anyway when doing a geometric repack as we will only ever pack loose objects or merge multiple packs. And neither of those cases can yield a promisor object. Signed-off-by: Patrick Steinhardt --- builtin/repack.c | 5 +++-- t/t7703-repack-geometric.sh | 26 ++++++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/builtin/repack.c b/builtin/repack.c index d9012141f6..4621eed3e6 100644 --- a/builtin/repack.c +++ b/builtin/repack.c @@ -294,9 +294,10 @@ int cmd_repack(int argc, strvec_push(&cmd.args, "--all"); strvec_push(&cmd.args, "--reflog"); strvec_push(&cmd.args, "--indexed-objects"); + + if (repo_has_promisor_remote(repo)) + strvec_push(&cmd.args, "--exclude-promisor-objects"); } - if (repo_has_promisor_remote(repo)) - strvec_push(&cmd.args, "--exclude-promisor-objects"); if (!write_midx) { if (write_bitmaps > 0) strvec_push(&cmd.args, "--write-bitmap-index"); diff --git a/t/t7703-repack-geometric.sh b/t/t7703-repack-geometric.sh index 9fc1626fbf..6d2c712bff 100755 --- a/t/t7703-repack-geometric.sh +++ b/t/t7703-repack-geometric.sh @@ -445,4 +445,30 @@ test_expect_success '--geometric -l disables writing bitmaps with non-local pack test_path_is_file member/.git/objects/pack/multi-pack-index-*.bitmap ' +test_expect_success '--geometric works with promisor packs' ' + test_when_finished "rm -fr remote local" && + + git init remote && + test_commit -C remote first file first && + test_commit -C remote second file second && + git -C remote config set uploadpack.allowfilter 1 && + git -C remote config set uploadpack.allowanysha1inwant 1 && + git -C remote repack -Ad && + + git clone --filter=blob:none file://"$(pwd)"/remote local && + git -C local rev-list --objects --missing=print HEAD >missing-objects && + test_grep "^?" missing-objects && + + # Assert that promisor packs are left alone and that we still manage to + # create new geometric packs. + ls local/.git/objects/pack/*.promisor >promisors-before && + ls local/.git/objects/pack/*.pack >packs-before && + test_commit -C local change && + git -C local repack --geometric=2 && + ls local/.git/objects/pack/*.promisor >promisors-after && + ls local/.git/objects/pack/*.pack >packs-after && + ! cmp packs-before packs-after && + test_cmp promisors-before promisors-after +' + test_done -- GitLab From 28fed7931765064eb5454d2f1b8eafc1f89086c6 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 4 Dec 2025 17:38:18 +0100 Subject: [PATCH 16/25] builtin/gc: fix condition for whether to write commit graphs When performing auto-maintenance we check whether commit graphs need to be generated by counting the number of commits that are reachable by any reference, but not covered by a commit graph. This search is performed by iterating through all references and then doing a depth-first search until we have found enough commits that are not present in the commit graph. This logic has a memory leak though: Direct leak of 16 byte(s) in 1 object(s) allocated from: #0 0x55555562e433 in malloc (git+0xda433) #1 0x555555964322 in do_xmalloc ../wrapper.c:55:8 #2 0x5555559642e6 in xmalloc ../wrapper.c:76:9 #3 0x55555579bf29 in commit_list_append ../commit.c:1872:35 #4 0x55555569f160 in dfs_on_ref ../builtin/gc.c:1165:4 #5 0x5555558c33fd in do_for_each_ref_iterator ../refs/iterator.c:431:12 #6 0x5555558af520 in do_for_each_ref ../refs.c:1828:9 #7 0x5555558ac317 in refs_for_each_ref ../refs.c:1833:9 #8 0x55555569e207 in should_write_commit_graph ../builtin/gc.c:1188:11 #9 0x55555569c915 in maintenance_is_needed ../builtin/gc.c:3492:8 #10 0x55555569b76a in cmd_maintenance ../builtin/gc.c:3542:9 #11 0x55555575166a in run_builtin ../git.c:506:11 #12 0x5555557502f0 in handle_builtin ../git.c:779:9 #13 0x555555751127 in run_argv ../git.c:862:4 #14 0x55555575007b in cmd_main ../git.c:984:19 #15 0x5555557523aa in main ../common-main.c:9:11 #16 0x7ffff7a2a4d7 in __libc_start_call_main (/nix/store/xx7cm72qy2c0643cm1ipngd87aqwkcdp-glibc-2.40-66/lib/libc.so.6+0x2a4d7) (BuildId: cddea92d6cba8333be952b5a02fd47d61054c5ab) #17 0x7ffff7a2a59a in __libc_start_main@GLIBC_2.2.5 (/nix/store/xx7cm72qy2c0643cm1ipngd87aqwkcdp-glibc-2.40-66/lib/libc.so.6+0x2a59a) (BuildId: cddea92d6cba8333be952b5a02fd47d61054c5ab) #18 0x5555555f0934 in _start (git+0x9c934) The root cause of this memory leak is our use of `commit_list_append()`. This function expects as parameters the item to append and the _tail_ of the list to append. This tail will then be overwritten with the new tail of the list so that it can be used in subsequent calls. But we call it with `commit_list_append(parent->item, &stack)`, so we end up losing everything but the new item. This issue only surfaces when counting merge commits. Next to being a memory leak, it also shows that we're in fact miscounting as we only respect children of the last parent. All previous parents are discarded, so their children will be disregarded unless they are hit via another reference. While crafting a test case for the issue I was puzzled that I couldn't establish the proper border at which the auto-condition would be fulfilled. As it turns out, there's another bug: if an object is at the tip of any reference we don't mark it as seen. Consequently, if it is reachable via any other reference, we'd count that object twice. Fix both of these bugs so that we properly count objects without leaking any memory. --- builtin/gc.c | 8 +++++--- t/t7900-maintenance.sh | 26 ++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/builtin/gc.c b/builtin/gc.c index 92c6e7b954..17ff68cbd9 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -1130,8 +1130,10 @@ static int dfs_on_ref(const struct reference *ref, void *cb_data) return 0; commit = lookup_commit(the_repository, maybe_peeled); - if (!commit) + if (!commit || commit->object.flags & SEEN) return 0; + commit->object.flags |= SEEN; + if (repo_parse_commit(the_repository, commit) || commit_graph_position(commit) != COMMIT_NOT_FROM_GRAPH) return 0; @@ -1141,7 +1143,7 @@ static int dfs_on_ref(const struct reference *ref, void *cb_data) if (data->num_not_in_graph >= data->limit) return 1; - commit_list_append(commit, &stack); + commit_list_insert(commit, &stack); while (!result && stack) { struct commit_list *parent; @@ -1162,7 +1164,7 @@ static int dfs_on_ref(const struct reference *ref, void *cb_data) break; } - commit_list_append(parent->item, &stack); + commit_list_insert(parent->item, &stack); } } diff --git a/t/t7900-maintenance.sh b/t/t7900-maintenance.sh index 6b36f52df7..6f3117304f 100755 --- a/t/t7900-maintenance.sh +++ b/t/t7900-maintenance.sh @@ -206,6 +206,32 @@ test_expect_success 'commit-graph auto condition' ' test_subcommand $COMMIT_GRAPH_WRITE err && test_grep "is not a valid task" err -- GitLab From 6ec34e26547b3e707f74414f11454115a57027b5 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Thu, 4 Dec 2025 18:24:17 +0100 Subject: [PATCH 17/25] odb: properly close sources before freeing them In the next commit we are about to move the packfile store into the ODB source so that we have one store per source. This will lead to a memory leak in the following commit when reading data from a submodule via git-grep(1): Direct leak of 192 byte(s) in 1 object(s) allocated from: #0 0x55555562e726 in calloc (git+0xda726) #1 0x555555964734 in xcalloc ../wrapper.c:154:8 #2 0x555555835136 in load_multi_pack_index_one ../midx.c:135:2 #3 0x555555834fd6 in load_multi_pack_index ../midx.c:382:6 #4 0x5555558365b6 in prepare_multi_pack_index_one ../midx.c:716:17 #5 0x55555586c605 in packfile_store_prepare ../packfile.c:1103:3 #6 0x55555586c90c in packfile_store_reprepare ../packfile.c:1118:2 #7 0x5555558546b3 in odb_reprepare ../odb.c:1106:2 #8 0x5555558539e4 in do_oid_object_info_extended ../odb.c:715:4 #9 0x5555558533d1 in odb_read_object_info_extended ../odb.c:862:8 #10 0x5555558540bd in odb_read_object ../odb.c:920:6 #11 0x55555580a330 in grep_source_load_oid ../grep.c:1934:12 #12 0x55555580a13a in grep_source_load ../grep.c:1986:10 #13 0x555555809103 in grep_source_is_binary ../grep.c:2014:7 #14 0x555555807574 in grep_source_1 ../grep.c:1625:8 #15 0x555555807322 in grep_source ../grep.c:1837:10 #16 0x5555556a5c58 in run ../builtin/grep.c:208:10 #17 0x55555562bb42 in void* ThreadStartFunc(void*) lsan_interceptors.cpp.o #18 0x7ffff7a9a979 in start_thread (/nix/store/xx7cm72qy2c0643cm1ipngd87aqwkcdp-glibc-2.40-66/lib/libc.so.6+0x9a979) (BuildId: cddea92d6cba8333be952b5a02fd47d61054c5ab) #19 0x7ffff7b22d2b in __GI___clone3 (/nix/store/xx7cm72qy2c0643cm1ipngd87aqwkcdp-glibc-2.40-66/lib/libc.so.6+0x122d2b) (BuildId: cddea92d6cba8333be952b5a02fd47d61054c5ab) The root caues of this leak is the way we set up and release the submodule: 1. We use `repo_submodule_init()` to initialize a new repository. This repository is stored in `repos_to_free`. 2. We now read data from the submodule repository. 3. We then call `repo_clear()` on the submodule repositories. 4. `repo_clear()` calls `odb_free()`. 5. `odb_free()` calls `odb_free_sources()` followed by `odb_close()`. The issue here is the 5th step: we call `odb_free_sources()` _before_ we call `odb_close()`. But `odb_free_sources()` already frees all sources, so the logic that closes them in `odb_close()` now becomes a no-op. As a consequence, we never explicitly close sources at all. This isn't a problem at the current point in time: the loose sources don't have any state to release, and the packfile store is not yet part of the sources. But once the packfile store is owned by the source we won't close it anymore, and this causes us to leak the packfile windows. Fix the upcoming leak by closing the store before we free the sources. Signed-off-by: Patrick Steinhardt --- odb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/odb.c b/odb.c index dc8f292f3d..8e67afe185 100644 --- a/odb.c +++ b/odb.c @@ -1132,13 +1132,13 @@ void odb_free(struct object_database *o) oidmap_clear(&o->replace_map, 1); pthread_mutex_destroy(&o->replace_mutex); + odb_close(o); odb_free_sources(o); for (size_t i = 0; i < o->cached_object_nr; i++) free((char *) o->cached_objects[i].value.buf); free(o->cached_objects); - odb_close(o); packfile_store_free(o->packfiles); string_list_clear(&o->submodule_source_paths, 0); -- GitLab From f9cedf1b81dc75fdc216e71f02c725bb87db9f22 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 13:15:37 +0100 Subject: [PATCH 18/25] t: fix races caused by background maintenance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Many Git commands spawn git-maintenance(1) to optimize the repository in the background. By default, performing the maintenance is for most of the part asynchronous: we fork the executable and then continue with the rest of our business logic. This is working as expected for our users, but this behaviour is somewhat problematic for our test suite as this is inherently racy. We have many tests that verify the on-disk state of repositories, and those tests may easily race with our background maintenance. In a similar fashion, we may end up with processes that "leak" out of a current test case. Until now this tends to not be much of a problem. Our maintenance uses git-gc(1) by default, which knows to bail out in case there aren't either too many packfiles or too many loose objects. So even if other data structures would need to be optimized, we won't do so unless the object database also needs optimizations. This is about to change though, as a subsequent commit will switch to the "geometric" maintenance strategy as a default. The consequence is that we will run required optimizations even if the object database is well-optimized. And this uncovers races between our test suite and background maintenance all over the place. Disabling maintenance outright in our test suite is not really an option, as it would result in significantly divergence from the "real world" and reduce our test coverage. But we've got an alternative up our sleeves: we can ensure that garbage collection runs synchronously by overriding the "maintenance.autoDetach" configuration. Of course that also diverges from the real world, as we now stop testing that background maintenance interacts in a benign way with normal Git commands. But on the other hand this ensures that the maintenance itself does not for example lead to data loss in a more reproducible way. Another concern is that this would make execution of the test suite much slower. But a quick benchmark on my machine demonstrates that this does not seem to be the case: Benchmark 1: meson test (revision = HEAD~) Time (mean ± σ): 131.182 s ± 1.293 s [User: 853.737 s, System: 1160.479 s] Range (min … max): 130.001 s … 132.563 s 3 runs Benchmark 2: meson test (revision = HEAD) Time (mean ± σ): 129.554 s ± 0.507 s [User: 849.040 s, System: 1152.664 s] Range (min … max): 129.000 s … 129.994 s 3 runs Summary meson test (revision = HEAD) ran 1.01 ± 0.01 times faster than meson test (revision = HEAD~) Funny enough, it even seems as if this speeds up test execution ever so slightly, but that may just as well be noise. Introduce a new `GIT_TEST_MAINT_AUTO_DETACH` environment variable that allows us to override the auto-detach behaviour and set that varibale in our tests. Signed-off-by: Patrick Steinhardt --- run-command.c | 2 +- t/t5616-partial-clone.sh | 6 +++--- t/t7900-maintenance.sh | 1 + t/test-lib.sh | 4 ++++ 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/run-command.c b/run-command.c index e3e02475cc..438a290d30 100644 --- a/run-command.c +++ b/run-command.c @@ -1828,7 +1828,7 @@ int prepare_auto_maintenance(int quiet, struct child_process *maint) */ if (repo_config_get_bool(the_repository, "maintenance.autodetach", &auto_detach) && repo_config_get_bool(the_repository, "gc.autodetach", &auto_detach)) - auto_detach = 1; + auto_detach = git_env_bool("GIT_TEST_MAINT_AUTO_DETACH", true); maint->git_cmd = 1; maint->close_object_store = 1; diff --git a/t/t5616-partial-clone.sh b/t/t5616-partial-clone.sh index 1e354e057f..d62760eb92 100755 --- a/t/t5616-partial-clone.sh +++ b/t/t5616-partial-clone.sh @@ -229,7 +229,7 @@ test_expect_success 'fetch --refetch triggers repacking' ' GIT_TRACE2_EVENT="$PWD/trace1.event" \ git -C pc1 fetch --refetch origin && - test_subcommand git maintenance run --auto --no-quiet --detach Date: Wed, 3 Dec 2025 13:53:28 +0100 Subject: [PATCH 19/25] t: disable maintenance where we verify object database structure We have a couple of tests that explicitly verify the structure of the object database. Naturally, this structure is dependent on whether or not we run repository maintenance: if it decides to optimize the object database the expected structure is likely to not materialize. Explicitly disable auto-maintenance in such tests so that we are not dependent on decisions made by our maintenance. Signed-off-by: Patrick Steinhardt --- t/t0081-find-pack.sh | 1 + t/t5316-pack-delta-depth.sh | 1 + t/t5319-multi-pack-index.sh | 2 ++ t/t5326-multi-pack-bitmaps.sh | 3 ++- t/t5327-multi-pack-bitmaps-rev.sh | 3 ++- t/t5331-pack-objects-stdin.sh | 2 ++ t/t5332-multi-pack-reuse.sh | 1 + t/t5334-incremental-multi-pack-index.sh | 1 + t/t5500-fetch-pack.sh | 3 ++- t/t5616-partial-clone.sh | 1 + t/t7700-repack.sh | 3 +++ 11 files changed, 18 insertions(+), 3 deletions(-) diff --git a/t/t0081-find-pack.sh b/t/t0081-find-pack.sh index 5a628bf735..26f017422d 100755 --- a/t/t0081-find-pack.sh +++ b/t/t0081-find-pack.sh @@ -68,6 +68,7 @@ test_expect_success 'add more packfiles' ' ' test_expect_success 'add more commits (as loose objects)' ' + test_config maintenance.auto false && test_commit six && test_commit seven && diff --git a/t/t5316-pack-delta-depth.sh b/t/t5316-pack-delta-depth.sh index 03dfb7a61e..8a067a45cb 100755 --- a/t/t5316-pack-delta-depth.sh +++ b/t/t5316-pack-delta-depth.sh @@ -48,6 +48,7 @@ test_description='pack-objects breaks long cross-pack delta chains' # repeatedly-modified file to generate the delta chain). test_expect_success 'create series of packs' ' + test_config maintenance.auto false && test-tool genrandom foo 4096 >content && prev= && for i in $(test_seq 1 10) diff --git a/t/t5319-multi-pack-index.sh b/t/t5319-multi-pack-index.sh index 93f319a4b2..9505dd7def 100755 --- a/t/t5319-multi-pack-index.sh +++ b/t/t5319-multi-pack-index.sh @@ -1254,11 +1254,13 @@ test_expect_success 'bitmapped packs are stored via the BTMP chunk' ' ( cd repo && + git config set maintenance.auto false && for i in 1 2 3 4 5 do test_commit "$i" && git repack -d || return 1 done && + git config unset maintenance.auto && find $objdir/pack -type f -name "*.idx" | xargs -n 1 basename | sort >packs && diff --git a/t/t5326-multi-pack-bitmaps.sh b/t/t5326-multi-pack-bitmaps.sh index 892aeb09e4..62bd973d92 100755 --- a/t/t5326-multi-pack-bitmaps.sh +++ b/t/t5326-multi-pack-bitmaps.sh @@ -93,7 +93,8 @@ test_midx_bitmap_cases () { test_expect_success 'setup test_repository' ' rm -rf * .git && git init && - git config pack.writeBitmapLookupTable '"$writeLookupTable"' + git config pack.writeBitmapLookupTable '"$writeLookupTable"' && + git config maintenance.auto false ' midx_bitmap_core diff --git a/t/t5327-multi-pack-bitmaps-rev.sh b/t/t5327-multi-pack-bitmaps-rev.sh index 9cac03a94b..cfa12de2a8 100755 --- a/t/t5327-multi-pack-bitmaps-rev.sh +++ b/t/t5327-multi-pack-bitmaps-rev.sh @@ -30,7 +30,8 @@ test_midx_bitmap_rev () { test_expect_success 'setup bitmap config' ' rm -rf * .git && git init && - git config pack.writeBitmapLookupTable '"$writeLookupTable"' + git config pack.writeBitmapLookupTable '"$writeLookupTable"' && + git config maintenance.auto false ' midx_bitmap_core rev diff --git a/t/t5331-pack-objects-stdin.sh b/t/t5331-pack-objects-stdin.sh index 4a8df5a389..8a6bf3d98f 100755 --- a/t/t5331-pack-objects-stdin.sh +++ b/t/t5331-pack-objects-stdin.sh @@ -14,6 +14,7 @@ packed_objects () { test_expect_success 'setup for --stdin-packs tests' ' git init stdin-packs && + git -C stdin-packs config set maintenance.auto false && ( cd stdin-packs && @@ -255,6 +256,7 @@ test_expect_success '--stdin-packs=follow walks into unknown packs' ' git init repo && ( cd repo && + git config set maintenance.auto false && for c in A B C D do diff --git a/t/t5332-multi-pack-reuse.sh b/t/t5332-multi-pack-reuse.sh index 395d09444c..881ce668e1 100755 --- a/t/t5332-multi-pack-reuse.sh +++ b/t/t5332-multi-pack-reuse.sh @@ -59,6 +59,7 @@ test_pack_objects_reused () { test_expect_success 'preferred pack is reused for single-pack reuse' ' test_config pack.allowPackReuse single && + git config set maintenance.auto false && for i in A B do diff --git a/t/t5334-incremental-multi-pack-index.sh b/t/t5334-incremental-multi-pack-index.sh index d30d7253d6..99c7d44d8e 100755 --- a/t/t5334-incremental-multi-pack-index.sh +++ b/t/t5334-incremental-multi-pack-index.sh @@ -15,6 +15,7 @@ midx_chain=$midxdir/multi-pack-index-chain test_expect_success 'convert non-incremental MIDX to incremental' ' test_commit base && + git config set maintenance.auto false && git repack -ad && git multi-pack-index write && diff --git a/t/t5500-fetch-pack.sh b/t/t5500-fetch-pack.sh index 2677cd5faa..0ddbd7aa96 100755 --- a/t/t5500-fetch-pack.sh +++ b/t/t5500-fetch-pack.sh @@ -154,7 +154,8 @@ test_expect_success 'clone shallow depth 1 with fsck' ' ' test_expect_success 'clone shallow' ' - git clone --no-single-branch --depth 2 "file://$(pwd)/." shallow + git clone --no-single-branch --depth 2 "file://$(pwd)/." shallow && + git -C shallow config set maintenance.auto false ' test_expect_success 'clone shallow depth count' ' diff --git a/t/t5616-partial-clone.sh b/t/t5616-partial-clone.sh index d62760eb92..1c2805acca 100755 --- a/t/t5616-partial-clone.sh +++ b/t/t5616-partial-clone.sh @@ -585,6 +585,7 @@ test_expect_success 'verify fetch downloads only one pack when updating refs' ' git clone --filter=blob:none "file://$(pwd)/srv.bare" pack-test && ls pack-test/.git/objects/pack/*pack >pack-list && test_line_count = 2 pack-list && + test_config -C pack-test maintenance.auto false && for i in A B C do test_commit -C src $i && diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh index 73b78bdd88..acc2589f21 100755 --- a/t/t7700-repack.sh +++ b/t/t7700-repack.sh @@ -217,6 +217,7 @@ test_expect_success 'repack --keep-pack' ' cd keep-pack && # avoid producing different packs due to delta/base choices git config pack.window 0 && + git config maintenance.auto false && P1=$(commit_and_pack 1) && P2=$(commit_and_pack 2) && P3=$(commit_and_pack 3) && @@ -260,6 +261,7 @@ test_expect_success 'repacking fails when missing .pack actually means missing o # Avoid producing different packs due to delta/base choices git config pack.window 0 && + git config maintenance.auto false && P1=$(commit_and_pack 1) && P2=$(commit_and_pack 2) && P3=$(commit_and_pack 3) && @@ -534,6 +536,7 @@ test_expect_success 'setup for --write-midx tests' ' ( cd midx && git config core.multiPackIndex true && + git config maintenance.auto false && test_commit base ) -- GitLab From 194c18d86bdadc84ce7915953286d68b53734c5b Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 13:57:07 +0100 Subject: [PATCH 20/25] t34xx: don't collect reflogs where it matters We have a couple of tests in the t34xx range that rely on reflogs. This never really used to be a problem, but in a subsequent commit we will change the default maintenance strategy from "gc" to "geometric", and this will cause us to drop all reflogs in these tests. This may seem surprising and like a bug at first, but it's actually not. The main difference between these two strategies is that the "gc" strategy will skip all maintenance in case the object database is in a well-optimized state. The "geometric" strategy has separate subtasks though, and the conditions for each of these tasks is evaluated on a case by case basis. This means that even if the object database is in good shape, we may still decide to expire reflogs. So why is that a problem? The issue is that Git's test suite hardcodes the committer and author dates to a date in 2005. Interestingly though, these hardcoded dates not only impact the commits, but also the reflog entries. The consequence is that all newly written reflog entries are immediately considered stale as our reflog expiration threshold is in the range of weeks, only. It follows that executing `git reflog expire` will thus immediately purge all reflog entries. This hasn't been a problem in our test suite by pure chance, as the repository shapes simply didn't cause us to perform actual garbage collection. But with the upcoming "geometric" strategy we _will_ start to execute `git reflog expire`, thus surfacing this issue. Prepare for this by explicitly disabling reflog expiration in tests impacted by this upcoming change. Signed-off-by: Patrick Steinhardt --- t/t3404-rebase-interactive.sh | 2 ++ t/t3406-rebase-message.sh | 3 +++ t/t3431-rebase-fork-point.sh | 2 ++ t/t3432-rebase-fast-forward.sh | 2 ++ 4 files changed, 9 insertions(+) diff --git a/t/t3404-rebase-interactive.sh b/t/t3404-rebase-interactive.sh index e778dd8ae4..5e4623f7f1 100755 --- a/t/t3404-rebase-interactive.sh +++ b/t/t3404-rebase-interactive.sh @@ -31,6 +31,8 @@ Initial setup: . "$TEST_DIRECTORY"/lib-rebase.sh test_expect_success 'setup' ' + git config set gc.reflogExpire never && + git config set gc.reflogExpireUnreachable never && git switch -C primary && test_commit A file1 && test_commit B file1 && diff --git a/t/t3406-rebase-message.sh b/t/t3406-rebase-message.sh index a1d7fa7f7c..f89209c8d9 100755 --- a/t/t3406-rebase-message.sh +++ b/t/t3406-rebase-message.sh @@ -8,6 +8,9 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME . ./test-lib.sh test_expect_success 'setup' ' + git config set gc.reflogExpire never && + git config set gc.reflogExpireUnreachable never && + test_commit O fileO && test_commit X fileX && git branch fast-forward && diff --git a/t/t3431-rebase-fork-point.sh b/t/t3431-rebase-fork-point.sh index be09fc78c1..3a3c3a70a5 100755 --- a/t/t3431-rebase-fork-point.sh +++ b/t/t3431-rebase-fork-point.sh @@ -17,6 +17,8 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME # C was formerly part of main but main was rewound to remove C # test_expect_success setup ' + git config set gc.reflogExpire never && + git config set gc.reflogExpireUnreachable never && test_commit A && test_commit B && test_commit C && diff --git a/t/t3432-rebase-fast-forward.sh b/t/t3432-rebase-fast-forward.sh index 5086e14c02..6e8de6c7aa 100755 --- a/t/t3432-rebase-fast-forward.sh +++ b/t/t3432-rebase-fast-forward.sh @@ -11,6 +11,8 @@ export GIT_TEST_DEFAULT_INITIAL_BRANCH_NAME . ./test-lib.sh test_expect_success setup ' + git config set gc.reflogExpire never && + git config set gc.reflogExpireUnreachable never && test_commit A && test_commit B && test_commit C && -- GitLab From fcc3e6e8183c7d42d66181c0a8f7293f1c015181 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 14:25:41 +0100 Subject: [PATCH 21/25] t5400: explicitly use "gc" strategy In t5400 we verify that git-receive-pack(1) runs automated repository maintenance in the remote repository. The check is performed indirectly by observing an effect that git-gc(1) would have, namely to prune a temporary object from the object database. In a subsequent commit we're about to switch to the "geometric" strategy by default though, and here we stop observing that effect. Adapt the test to explicitly use the "gc" strategy to prepare for that upcoming change. Signed-off-by: Patrick Steinhardt --- t/t5400-send-pack.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/t/t5400-send-pack.sh b/t/t5400-send-pack.sh index 83b42ff073..b32a0a6aa7 100755 --- a/t/t5400-send-pack.sh +++ b/t/t5400-send-pack.sh @@ -187,6 +187,7 @@ test_expect_success 'receive-pack runs auto-gc in remote repo' ' cd child && git config gc.autopacklimit 1 && git config gc.autodetach false && + git config maintenance.strategy gc && git branch test_auto_gc && # And create a file that follows the temporary object naming # convention for the auto-gc to remove -- GitLab From da94ef5d1adba9c6670763769628666aab3f84b6 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 14:56:11 +0100 Subject: [PATCH 22/25] t5510: explicitly use "gc" strategy One of the tests in t5510 wants to verify that auto-gc does not lock up when fetching into a repository. Adapt it to explicitly pick the "gc" strategy for auto-maintenance. Signed-off-by: Patrick Steinhardt --- t/t5510-fetch.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/t/t5510-fetch.sh b/t/t5510-fetch.sh index b7059cccaa..e59307449f 100755 --- a/t/t5510-fetch.sh +++ b/t/t5510-fetch.sh @@ -1321,6 +1321,7 @@ test_expect_success 'fetching with auto-gc does not lock up' ' git config fetch.unpackLimit 1 && git config gc.autoPackLimit 1 && git config gc.autoDetach false && + git config maintenance.strategy gc && GIT_ASK_YESNO="$TRASH_DIRECTORY/askyesno" git fetch --verbose >fetch.out 2>&1 && test_grep "Auto packing the repository" fetch.out && ! grep "Should I try again" fetch.out -- GitLab From 2df32d32a6cd94cdc6660bd253e144e2e6f17750 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 15:28:29 +0100 Subject: [PATCH 23/25] t6500: explicitly pick "gc" strategy The test in t6500 explicitly wants to exercise git-gc(1) and is thus highly specific to the actual on-disk state of the repository and specifically of the object database. An upcoming change modifies the default maintenance strategy to be the "geometric" strategy though, which breaks a couple of assumptions. One fix would arguably be to disable auto-maintenance altogether, as we do want to explicitly verify git-gc(1) anyway. But as the whole test suite is about git-gc(1) in the first place it feels more sensible to configure the default maintenance strategy to be "gc". Signed-off-by: Patrick Steinhardt --- t/t6500-gc.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/t/t6500-gc.sh b/t/t6500-gc.sh index bef472cb8d..ea9aaad470 100755 --- a/t/t6500-gc.sh +++ b/t/t6500-gc.sh @@ -11,6 +11,7 @@ test_expect_success 'setup' ' # behavior, make sure we always pack everything to one pack by # default git config gc.bigPackThreshold 2g && + git config set --global maintenance.strategy gc && test_oid_init ' -- GitLab From edf302601ab91bb4d71553ceb2cd4f82a350dc57 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 15:36:39 +0100 Subject: [PATCH 24/25] t7900: prepare for switch of the default strategy The t7900 test suite is exercising git-maintenance(1) and is thus of course heavily reliant on the exact maintenance strategy. This reliance comes in two flavors: - One test explicitly wants to verify that git-gc(1) is run as part of `git maintenance run`. This test is adapted by explicitly picking the "gc" strategy. - The other tests assume a specific shape of the object database, which is dependent on whether or not we run auto-maintenance before we come to the actual subject under test. These tests are adapted by disabling auto-maintenance. With these changes t7900 passes with both "gc" and "geometric" default strategies. Signed-off-by: Patrick Steinhardt --- t/t7900-maintenance.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/t/t7900-maintenance.sh b/t/t7900-maintenance.sh index a94537ef6e..5726f6c836 100755 --- a/t/t7900-maintenance.sh +++ b/t/t7900-maintenance.sh @@ -43,7 +43,8 @@ test_expect_success 'help text' ' test_grep "usage: git maintenance" err ' -test_expect_success 'run [--auto|--quiet]' ' +test_expect_success 'run [--auto|--quiet] with gc strategy' ' + test_config maintenance.strategy gc && GIT_TRACE2_EVENT="$(pwd)/run-no-auto.txt" \ git maintenance run 2>/dev/null && GIT_TRACE2_EVENT="$(pwd)/run-auto.txt" \ @@ -498,6 +499,7 @@ test_expect_success 'maintenance.incremental-repack.auto' ' ( cd incremental-repack-true && git config core.multiPackIndex true && + git config maintenance.auto false && run_incremental_repack_and_verify ) ' @@ -508,6 +510,7 @@ test_expect_success 'maintenance.incremental-repack.auto (when config is unset)' ( cd incremental-repack-unset && test_unconfig core.multiPackIndex && + git config maintenance.auto false && run_incremental_repack_and_verify ) ' @@ -618,6 +621,7 @@ test_expect_success 'geometric repacking with --auto' ' git init repo && ( cd repo && + git config set maintenance.auto false && # An empty repository does not need repacking, except when # explicitly told to do it. -- GitLab From 00345a768e90c31bc8750b11555eee10fe0887c6 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 3 Dec 2025 13:16:51 +0100 Subject: [PATCH 25/25] builtin/maintenance: use "geometric" strategy by default The git-gc(1) command has been introduced in the early days of Git in 30f610b7b0 (Create 'git gc' to perform common maintenance operations., 2006-12-27) as the main repository maintenance utility. And while the tool has of course evolved since then to cover new parts, the basic strategy it uses has never really changed much. It is safe to say that since 2006 the Git ecosystem has changed quite a bit. Repositories tend to be much larger nowadays than they have been almost 20 years ago, and large parts of the industry went crazy for monorepos (for various wildly different definitions of "monorepo"). So the maintenance strategy we used back then may not be the best fit nowadays anymore. Arguably, most of the maintenance tasks that git-gc(1) does are still perfectly fine today: repacking references, expiring various data structures and things like tend to not cause huge problems. But the big exception is the way we repack objects. git-gc(1) by default uses a split strategy: it performs incremental repacks by default, and then whenever we have too many packs we perform a large all-into-one repack. This all-into-one repack is what is causing problems nowadays, as it is an operation that is quite expensive. While it is wasteful in small- and medium-sized repositories, in large repos it may even be prohibitively expensive. We have eventually introduced git-maintenance(1) that was slated as a replacement for git-gc(1). In contrast to git-gc(1), it was much more flexible as it is structured around configurable tasks and strategies. And while it knows about the "incremental" strategy that we may use for scheduled maintenance when configured via Scalar, its default still is to use git-gc(1) in the background. The "incremental" strategy isn't really a full replacement for git-gc(1) though, as it doesn't know to expire unused data structures. In Git 2.52 we have thus introduced a new "geometric" strategy that is a proper replacement for the old git-gc(1). In contrast to the incremental/all-into-one split used by git-gc(1), the new "geometric" strategy maintains a geometric progression of packfiles, which significantly reduces the number of all-into-one repacks that we have to perform in large repositories. It is thus a much better fit for large repositories than git-gc(1). Note that the "geometric" strategy isn't perfect though: while we perform way less all-into-one repacks compared to git-gc(1), we still have to perform them eventually. But for the largest repositories out there this may not be an option, as client machines might not be powerful enough to perform such a repack in the first place. These cases would thus still be covered by Scalar's "incremental" strategy. Switch the default strategy away from "gc" to "geometric", but retain the "incremental" strategy configured by Scalar. Signed-off-by: Patrick Steinhardt --- builtin/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builtin/gc.c b/builtin/gc.c index 17ff68cbd9..c0243ed99e 100644 --- a/builtin/gc.c +++ b/builtin/gc.c @@ -1980,7 +1980,7 @@ static void initialize_task_config(struct maintenance_run_opts *opts, strategy = none_strategy; type = MAINTENANCE_TYPE_SCHEDULED; } else { - strategy = gc_strategy; + strategy = geometric_strategy; type = MAINTENANCE_TYPE_MANUAL; } -- GitLab