From f82e430b4e46120e0ef67959e0ef9d8ab9282c56 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:41:56 +0100 Subject: [PATCH 01/33] odb: fix subtle logic to check whether an alternate is usable When adding an alternate to the object database we first check whether or not the path is usable. A path is usable if: - It actually exists. - We don't have it in our object sources yet. While the former check is trivial enough, the latter part is somewhat subtle and prone for bugs. This is because the function doesn't only check whether or not the given path is usable. But if it _is_ usable, we also store that path in the map of object sources immediately. The tricky part here is that the path that gets stored in the map is _not_ copied. Instead, we rely on the fact that subsequent code uses `strbuf_detach()` to store the exact same allocated memory in the created object source. Consequently, the memory is owned by the source but _also_ stored in the map. This subtlety is easy to miss, so if one decides to refactor this code one can easily end up breaking this mechanism. Make the relationship more explicit by not storing the path as part of `alt_odb_usable()`. Instead, store the path after we have created the source so that we can use the source's path pointer directly. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/odb.c b/odb.c index 00a6e71568..57d85ed950 100644 --- a/odb.c +++ b/odb.c @@ -86,17 +86,16 @@ int odb_mkstemp(struct object_database *odb, /* * Return non-zero iff the path is usable as an alternate object database. */ -static int alt_odb_usable(struct object_database *o, - struct strbuf *path, - const char *normalized_objdir, khiter_t *pos) +static int alt_odb_usable(struct object_database *o, const char *path, + const char *normalized_objdir) { int r; /* Detect cases where alternate disappeared */ - if (!is_directory(path->buf)) { + if (!is_directory(path)) { error(_("object directory %s does not exist; " "check .git/objects/info/alternates"), - path->buf); + path); return 0; } @@ -113,11 +112,14 @@ static int alt_odb_usable(struct object_database *o, assert(r == 1); /* never used */ kh_value(o->source_by_path, p) = o->sources; } - if (fspatheq(path->buf, normalized_objdir)) + + if (fspatheq(path, normalized_objdir)) + return 0; + + if (kh_get_odb_path_map(o->source_by_path, path) < kh_end(o->source_by_path)) return 0; - *pos = kh_put_odb_path_map(o->source_by_path, path->buf, &r); - /* r: 0 = exists, 1 = never used, 2 = deleted */ - return r == 0 ? 0 : 1; + + return 1; } /* @@ -148,6 +150,7 @@ static struct odb_source *link_alt_odb_entry(struct object_database *odb, struct strbuf pathbuf = STRBUF_INIT; struct strbuf tmp = STRBUF_INIT; khiter_t pos; + int ret; if (!is_absolute_path(dir) && relative_base) { strbuf_realpath(&pathbuf, relative_base, 1); @@ -172,20 +175,21 @@ static struct odb_source *link_alt_odb_entry(struct object_database *odb, strbuf_reset(&tmp); strbuf_realpath(&tmp, odb->sources->path, 1); - if (!alt_odb_usable(odb, &pathbuf, tmp.buf, &pos)) + if (!alt_odb_usable(odb, pathbuf.buf, tmp.buf)) goto error; CALLOC_ARRAY(alternate, 1); alternate->odb = odb; alternate->local = false; - /* pathbuf.buf is already in r->objects->source_by_path */ alternate->path = strbuf_detach(&pathbuf, NULL); /* add the alternate entry */ *odb->sources_tail = alternate; odb->sources_tail = &(alternate->next); - alternate->next = NULL; - assert(odb->source_by_path); + + pos = kh_put_odb_path_map(odb->source_by_path, alternate->path, &ret); + if (!ret) + BUG("source must not yet exist"); kh_value(odb->source_by_path, pos) = alternate; /* recursively add alternates */ -- GitLab From 0820a4b120f310d87ac8817ade63896a901c9267 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:41:57 +0100 Subject: [PATCH 02/33] odb: introduce `odb_source_new()` We have three different locations where we create a new ODB source. Deduplicate the logic via a new `odb_source_new()` function. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 23 ++++++++++++++++------- odb.h | 4 ++++ repository.c | 14 +++++++++----- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/odb.c b/odb.c index 57d85ed950..d2d4c514ae 100644 --- a/odb.c +++ b/odb.c @@ -141,6 +141,20 @@ static void read_info_alternates(struct object_database *odb, const char *relative_base, int depth); +struct odb_source *odb_source_new(struct object_database *odb, + const char *path, + bool local) +{ + struct odb_source *source; + + CALLOC_ARRAY(source, 1); + source->odb = odb; + source->local = local; + source->path = xstrdup(path); + + return source; +} + static struct odb_source *link_alt_odb_entry(struct object_database *odb, const char *dir, const char *relative_base, @@ -178,10 +192,7 @@ static struct odb_source *link_alt_odb_entry(struct object_database *odb, if (!alt_odb_usable(odb, pathbuf.buf, tmp.buf)) goto error; - CALLOC_ARRAY(alternate, 1); - alternate->odb = odb; - alternate->local = false; - alternate->path = strbuf_detach(&pathbuf, NULL); + alternate = odb_source_new(odb, pathbuf.buf, false); /* add the alternate entry */ *odb->sources_tail = alternate; @@ -341,9 +352,7 @@ struct odb_source *odb_set_temporary_primary_source(struct object_database *odb, * Make a new primary odb and link the old primary ODB in as an * alternate */ - source = xcalloc(1, sizeof(*source)); - source->odb = odb; - source->path = xstrdup(dir); + source = odb_source_new(odb, dir, false); /* * Disable ref updates while a temporary odb is active, since diff --git a/odb.h b/odb.h index e6602dd90c..2bec895d13 100644 --- a/odb.h +++ b/odb.h @@ -89,6 +89,10 @@ struct odb_source { char *path; }; +struct odb_source *odb_source_new(struct object_database *odb, + const char *path, + bool local); + struct packed_git; struct packfile_store; struct cached_object_entry; diff --git a/repository.c b/repository.c index 6faf5c7398..6aaa7ba008 100644 --- a/repository.c +++ b/repository.c @@ -160,20 +160,24 @@ void repo_set_gitdir(struct repository *repo, * until after xstrdup(root). Then we can free it. */ char *old_gitdir = repo->gitdir; + char *objects_path = NULL; repo->gitdir = xstrdup(gitfile ? gitfile : root); free(old_gitdir); repo_set_commondir(repo, o->commondir); + expand_base_dir(&objects_path, o->object_dir, + repo->commondir, "objects"); if (!repo->objects->sources) { - CALLOC_ARRAY(repo->objects->sources, 1); - repo->objects->sources->odb = repo->objects; - repo->objects->sources->local = true; + repo->objects->sources = odb_source_new(repo->objects, + objects_path, true); repo->objects->sources_tail = &repo->objects->sources->next; + free(objects_path); + } else { + free(repo->objects->sources->path); + repo->objects->sources->path = objects_path; } - expand_base_dir(&repo->objects->sources->path, o->object_dir, - repo->commondir, "objects"); repo->objects->sources->disable_ref_updates = o->disable_ref_updates; -- GitLab From c2da110411919387fef0f869181fb510d06295d8 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:41:58 +0100 Subject: [PATCH 03/33] odb: adjust naming to free object sources The functions `free_object_directory()` and `free_object_directories()` are responsible for freeing a single object source or all object sources connected to an object database, respectively. The associated structure has been renamed from `struct object_directory` to `struct odb_source` in a1e2581a1e (object-store: rename `object_directory` to `odb_source`, 2025-07-01) though, so the names are somewhat stale nowadays. Rename them to mention the new struct name instead. Furthermore, while at it, adapt them to our modern naming schema where we first have the subject followed by a verb. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- odb.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/odb.c b/odb.c index d2d4c514ae..77490d7fdb 100644 --- a/odb.c +++ b/odb.c @@ -365,7 +365,7 @@ struct odb_source *odb_set_temporary_primary_source(struct object_database *odb, return source->next; } -static void free_object_directory(struct odb_source *source) +static void odb_source_free(struct odb_source *source) { free(source->path); odb_clear_loose_cache(source); @@ -387,7 +387,7 @@ void odb_restore_primary_source(struct object_database *odb, BUG("we expect the old primary object store to be the first alternate"); odb->sources = restore_source; - free_object_directory(cur_source); + odb_source_free(cur_source); } char *compute_alternate_path(const char *path, struct strbuf *err) @@ -1015,13 +1015,13 @@ struct object_database *odb_new(struct repository *repo) return o; } -static void free_object_directories(struct object_database *o) +static void odb_free_sources(struct object_database *o) { while (o->sources) { struct odb_source *next; next = o->sources->next; - free_object_directory(o->sources); + odb_source_free(o->sources); o->sources = next; } kh_destroy_odb_path_map(o->source_by_path); @@ -1039,7 +1039,7 @@ void odb_clear(struct object_database *o) o->commit_graph = NULL; o->commit_graph_attempted = 0; - free_object_directories(o); + odb_free_sources(o); o->sources_tail = NULL; o->loaded_alternates = 0; -- GitLab From 0cc12dedef2885dba8cf2635697767d394baf91f Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:41:59 +0100 Subject: [PATCH 04/33] object-file: move `fetch_if_missing` The `fetch_if_missing` global variable is declared in "object-file.h" but defined in "odb.c". The variable relates to the whole object database instead of only loose objects, so move the declaration into "odb.h" accordingly. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.h | 8 -------- odb.h | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/object-file.h b/object-file.h index 3fd48dcafb..097e9764be 100644 --- a/object-file.h +++ b/object-file.h @@ -7,14 +7,6 @@ struct index_state; -/* - * Set this to 0 to prevent odb_read_object_info_extended() from fetching missing - * blobs. This has a difference only if extensions.partialClone is set. - * - * Its default value is 1. - */ -extern int fetch_if_missing; - enum { INDEX_WRITE_OBJECT = (1 << 0), INDEX_FORMAT_CHECK = (1 << 1), diff --git a/odb.h b/odb.h index 2bec895d13..2346ffeca8 100644 --- a/odb.h +++ b/odb.h @@ -14,6 +14,14 @@ struct strbuf; struct repository; struct multi_pack_index; +/* + * Set this to 0 to prevent odb_read_object_info_extended() from fetching missing + * blobs. This has a difference only if extensions.partialClone is set. + * + * Its default value is 1. + */ +extern int fetch_if_missing; + /* * Compute the exact path an alternate is at and returns it. In case of * error NULL is returned and the human readable error is added to `err` -- GitLab From ece43d9dc70b1717484ee78b66aef4f9390c2b2b Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:00 +0100 Subject: [PATCH 05/33] object-file: introduce `struct odb_source_loose` Currently, all state that relates to loose objects is held directly by the `struct odb_source`. Introduce a new `struct odb_source_loose` to hold the state instead so that it is entirely self-contained. This structure will eventually morph into the backend for accessing loose objects. As such, this is part of the refactorings to introduce pluggable object databases. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.c | 13 +++++++++++++ object-file.h | 7 +++++++ odb.c | 2 ++ odb.h | 3 +++ 4 files changed, 25 insertions(+) diff --git a/object-file.c b/object-file.c index 4675c8ed6b..cd6aa561fa 100644 --- a/object-file.c +++ b/object-file.c @@ -1995,3 +1995,16 @@ void object_file_transaction_commit(struct odb_transaction *transaction) transaction->odb->transaction = NULL; free(transaction); } + +struct odb_source_loose *odb_source_loose_new(struct odb_source *source) +{ + struct odb_source_loose *loose; + CALLOC_ARRAY(loose, 1); + loose->source = source; + return loose; +} + +void odb_source_loose_free(struct odb_source_loose *loose) +{ + free(loose); +} diff --git a/object-file.h b/object-file.h index 097e9764be..695a7e8e7c 100644 --- a/object-file.h +++ b/object-file.h @@ -18,6 +18,13 @@ int index_path(struct index_state *istate, struct object_id *oid, const char *pa struct odb_source; +struct odb_source_loose { + struct odb_source *source; +}; + +struct odb_source_loose *odb_source_loose_new(struct odb_source *source); +void odb_source_loose_free(struct odb_source_loose *loose); + /* * Populate and return the loose object cache array corresponding to the * given object ID. diff --git a/odb.c b/odb.c index 77490d7fdb..2d06ab0bb8 100644 --- a/odb.c +++ b/odb.c @@ -151,6 +151,7 @@ struct odb_source *odb_source_new(struct object_database *odb, source->odb = odb; source->local = local; source->path = xstrdup(path); + source->loose = odb_source_loose_new(source); return source; } @@ -368,6 +369,7 @@ struct odb_source *odb_set_temporary_primary_source(struct object_database *odb, static void odb_source_free(struct odb_source *source) { free(source->path); + odb_source_loose_free(source->loose); odb_clear_loose_cache(source); loose_object_map_clear(&source->loose_map); free(source); diff --git a/odb.h b/odb.h index 2346ffeca8..49b398beda 100644 --- a/odb.h +++ b/odb.h @@ -48,6 +48,9 @@ struct odb_source { /* Object database that owns this object source. */ struct object_database *odb; + /* Private state for loose objects. */ + struct odb_source_loose *loose; + /* * Used to store the results of readdir(3) calls when we are OK * sacrificing accuracy due to races for speed. That includes -- GitLab From 90a93f9dea88532623ef7422dbc21d8dc70a58dd Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:01 +0100 Subject: [PATCH 06/33] object-file: move loose object cache into loose source Our loose objects use a cache that (optionally) stores all objects for each of the opened sharding directories. This cache is located in the `struct odb_source`, but now that we have `struct odb_source_loose` it makes sense to move it into the latter structure so that all state that relates to loose objects is entirely self-contained. Do so. While at it, rename corresponding functions to have a prefix that relates to `struct odb_source_loose`. Note that despite this prefix, the functions still accept a `struct odb_source` as input. This is done intentionally: once we introduce pluggable object databases, we will continue to accept this struct but then do a cast inside these functions to `struct odb_source_loose`. This design is similar to how we do it for our ref backends. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- loose.c | 9 +++++---- object-file.c | 35 +++++++++++++++++++---------------- object-file.h | 16 ++++++++++++++-- object-name.c | 2 +- odb.c | 1 - odb.h | 12 ------------ 6 files changed, 39 insertions(+), 36 deletions(-) diff --git a/loose.c b/loose.c index e8ea6e7e24..8cc7573ff2 100644 --- a/loose.c +++ b/loose.c @@ -1,6 +1,7 @@ #include "git-compat-util.h" #include "hash.h" #include "path.h" +#include "object-file.h" #include "odb.h" #include "hex.h" #include "repository.h" @@ -54,7 +55,7 @@ static int insert_loose_map(struct odb_source *source, inserted |= insert_oid_pair(map->to_compat, oid, compat_oid); inserted |= insert_oid_pair(map->to_storage, compat_oid, oid); if (inserted) - oidtree_insert(source->loose_objects_cache, compat_oid); + oidtree_insert(source->loose->cache, compat_oid); return inserted; } @@ -66,9 +67,9 @@ static int load_one_loose_object_map(struct repository *repo, struct odb_source if (!source->loose_map) loose_object_map_init(&source->loose_map); - if (!source->loose_objects_cache) { - ALLOC_ARRAY(source->loose_objects_cache, 1); - oidtree_init(source->loose_objects_cache); + if (!source->loose->cache) { + ALLOC_ARRAY(source->loose->cache, 1); + oidtree_init(source->loose->cache); } insert_loose_map(source, repo->hash_algo->empty_tree, repo->compat_hash_algo->empty_tree); diff --git a/object-file.c b/object-file.c index cd6aa561fa..fef00d6d3d 100644 --- a/object-file.c +++ b/object-file.c @@ -223,7 +223,7 @@ static int quick_has_loose(struct repository *r, odb_prepare_alternates(r->objects); for (source = r->objects->sources; source; source = source->next) { - if (oidtree_contains(odb_loose_cache(source, oid), oid)) + if (oidtree_contains(odb_source_loose_cache(source, oid), oid)) return 1; } return 0; @@ -1802,44 +1802,44 @@ static int append_loose_object(const struct object_id *oid, return 0; } -struct oidtree *odb_loose_cache(struct odb_source *source, - const struct object_id *oid) +struct oidtree *odb_source_loose_cache(struct odb_source *source, + const struct object_id *oid) { int subdir_nr = oid->hash[0]; struct strbuf buf = STRBUF_INIT; - size_t word_bits = bitsizeof(source->loose_objects_subdir_seen[0]); + size_t word_bits = bitsizeof(source->loose->subdir_seen[0]); size_t word_index = subdir_nr / word_bits; size_t mask = (size_t)1u << (subdir_nr % word_bits); uint32_t *bitmap; if (subdir_nr < 0 || - (size_t) subdir_nr >= bitsizeof(source->loose_objects_subdir_seen)) + (size_t) subdir_nr >= bitsizeof(source->loose->subdir_seen)) BUG("subdir_nr out of range"); - bitmap = &source->loose_objects_subdir_seen[word_index]; + bitmap = &source->loose->subdir_seen[word_index]; if (*bitmap & mask) - return source->loose_objects_cache; - if (!source->loose_objects_cache) { - ALLOC_ARRAY(source->loose_objects_cache, 1); - oidtree_init(source->loose_objects_cache); + return source->loose->cache; + if (!source->loose->cache) { + ALLOC_ARRAY(source->loose->cache, 1); + oidtree_init(source->loose->cache); } strbuf_addstr(&buf, source->path); for_each_file_in_obj_subdir(subdir_nr, &buf, source->odb->repo->hash_algo, append_loose_object, NULL, NULL, - source->loose_objects_cache); + source->loose->cache); *bitmap |= mask; strbuf_release(&buf); - return source->loose_objects_cache; + return source->loose->cache; } void odb_clear_loose_cache(struct odb_source *source) { - oidtree_clear(source->loose_objects_cache); - FREE_AND_NULL(source->loose_objects_cache); - memset(&source->loose_objects_subdir_seen, 0, - sizeof(source->loose_objects_subdir_seen)); + oidtree_clear(source->loose->cache); + FREE_AND_NULL(source->loose->cache); + memset(&source->loose->subdir_seen, 0, + sizeof(source->loose->subdir_seen)); } static int check_stream_oid(git_zstream *stream, @@ -2006,5 +2006,8 @@ struct odb_source_loose *odb_source_loose_new(struct odb_source *source) void odb_source_loose_free(struct odb_source_loose *loose) { + if (!loose) + return; + odb_clear_loose_cache(loose->source); free(loose); } diff --git a/object-file.h b/object-file.h index 695a7e8e7c..90da69cf5f 100644 --- a/object-file.h +++ b/object-file.h @@ -20,6 +20,18 @@ struct odb_source; struct odb_source_loose { struct odb_source *source; + + /* + * Used to store the results of readdir(3) calls when we are OK + * sacrificing accuracy due to races for speed. That includes + * object existence with OBJECT_INFO_QUICK, as well as + * our search for unique abbreviated hashes. Don't use it for tasks + * requiring greater accuracy! + * + * Be sure to call odb_load_loose_cache() before using. + */ + uint32_t subdir_seen[8]; /* 256 bits */ + struct oidtree *cache; }; struct odb_source_loose *odb_source_loose_new(struct odb_source *source); @@ -29,8 +41,8 @@ void odb_source_loose_free(struct odb_source_loose *loose); * Populate and return the loose object cache array corresponding to the * given object ID. */ -struct oidtree *odb_loose_cache(struct odb_source *source, - const struct object_id *oid); +struct oidtree *odb_source_loose_cache(struct odb_source *source, + const struct object_id *oid); /* Empty the loose object cache for the specified object directory. */ void odb_clear_loose_cache(struct odb_source *source); diff --git a/object-name.c b/object-name.c index 766c757042..8ce0ef7c23 100644 --- a/object-name.c +++ b/object-name.c @@ -116,7 +116,7 @@ static void find_short_object_filename(struct disambiguate_state *ds) struct odb_source *source; for (source = ds->repo->objects->sources; source && !ds->ambiguous; source = source->next) - oidtree_each(odb_loose_cache(source, &ds->bin_pfx), + oidtree_each(odb_source_loose_cache(source, &ds->bin_pfx), &ds->bin_pfx, ds->len, match_prefix, ds); } diff --git a/odb.c b/odb.c index 2d06ab0bb8..87d84688c6 100644 --- a/odb.c +++ b/odb.c @@ -370,7 +370,6 @@ static void odb_source_free(struct odb_source *source) { free(source->path); odb_source_loose_free(source->loose); - odb_clear_loose_cache(source); loose_object_map_clear(&source->loose_map); free(source); } diff --git a/odb.h b/odb.h index 49b398beda..77104396af 100644 --- a/odb.h +++ b/odb.h @@ -51,18 +51,6 @@ struct odb_source { /* Private state for loose objects. */ struct odb_source_loose *loose; - /* - * Used to store the results of readdir(3) calls when we are OK - * sacrificing accuracy due to races for speed. That includes - * object existence with OBJECT_INFO_QUICK, as well as - * our search for unique abbreviated hashes. Don't use it for tasks - * requiring greater accuracy! - * - * Be sure to call odb_load_loose_cache() before using. - */ - uint32_t loose_objects_subdir_seen[8]; /* 256 bits */ - struct oidtree *loose_objects_cache; - /* Map between object IDs for loose objects. */ struct loose_object_map *loose_map; -- GitLab From be659c97eae3b68e38b71f0a67067dede23903b5 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:02 +0100 Subject: [PATCH 07/33] object-file: hide internals when we need to reprepare loose sources There are two different situations where we have to clear the cache of loose objects: - When freeing the loose object source itself to avoid memory leaks. - When repreparing the loose object source so that any potentially- stale data is getting evicted from the cache. The former is already handled by `odb_source_loose_free()`. But the latter case is still done manually by in `odb_reprepare()`, so we are leaking internals into that code. Introduce a new `odb_source_loose_reprepare()` function as an equivalent to `packfile_store_prepare()` to hide these implementation details. Furthermore, while at it, rename the function `odb_clear_loose_cache()` to `odb_source_loose_clear()`. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.c | 17 +++++++++++------ object-file.h | 6 +++--- odb.c | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/object-file.c b/object-file.c index fef00d6d3d..20daa629a1 100644 --- a/object-file.c +++ b/object-file.c @@ -1834,12 +1834,17 @@ struct oidtree *odb_source_loose_cache(struct odb_source *source, return source->loose->cache; } -void odb_clear_loose_cache(struct odb_source *source) +static void odb_source_loose_clear_cache(struct odb_source_loose *loose) { - oidtree_clear(source->loose->cache); - FREE_AND_NULL(source->loose->cache); - memset(&source->loose->subdir_seen, 0, - sizeof(source->loose->subdir_seen)); + oidtree_clear(loose->cache); + FREE_AND_NULL(loose->cache); + memset(&loose->subdir_seen, 0, + sizeof(loose->subdir_seen)); +} + +void odb_source_loose_reprepare(struct odb_source *source) +{ + odb_source_loose_clear_cache(source->loose); } static int check_stream_oid(git_zstream *stream, @@ -2008,6 +2013,6 @@ void odb_source_loose_free(struct odb_source_loose *loose) { if (!loose) return; - odb_clear_loose_cache(loose->source); + odb_source_loose_clear_cache(loose); free(loose); } diff --git a/object-file.h b/object-file.h index 90da69cf5f..bec855e8e5 100644 --- a/object-file.h +++ b/object-file.h @@ -37,6 +37,9 @@ struct odb_source_loose { struct odb_source_loose *odb_source_loose_new(struct odb_source *source); void odb_source_loose_free(struct odb_source_loose *loose); +/* Reprepare the loose source by emptying the loose object cache. */ +void odb_source_loose_reprepare(struct odb_source *source); + /* * Populate and return the loose object cache array corresponding to the * given object ID. @@ -44,9 +47,6 @@ void odb_source_loose_free(struct odb_source_loose *loose); struct oidtree *odb_source_loose_cache(struct odb_source *source, const struct object_id *oid); -/* Empty the loose object cache for the specified object directory. */ -void odb_clear_loose_cache(struct odb_source *source); - /* * Put in `buf` the name of the file in the local object database that * would be used to store a loose object with the specified oid. diff --git a/odb.c b/odb.c index 87d84688c6..b3e8d4a49c 100644 --- a/odb.c +++ b/odb.c @@ -1071,7 +1071,7 @@ void odb_reprepare(struct object_database *o) odb_prepare_alternates(o); for (source = o->sources; source; source = source->next) - odb_clear_loose_cache(source); + odb_source_loose_reprepare(source); o->approximate_object_count_valid = 0; -- GitLab From 376016ec71c3a6c883f2ca77a3f1c0245fd60dc2 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:03 +0100 Subject: [PATCH 08/33] object-file: move loose object map into loose source The loose object map is used to map from the repository's canonical object hash to the compatibility hash. As the name indicates, this map is only used for loose objects, and as such it is tied to a specific loose object source. Same as with preceding commits, move this map into the loose object source accordingly. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- loose.c | 10 +++++----- object-file.c | 1 + object-file.h | 3 +++ odb.c | 1 - odb.h | 3 --- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/loose.c b/loose.c index 8cc7573ff2..56cf64b648 100644 --- a/loose.c +++ b/loose.c @@ -49,7 +49,7 @@ static int insert_loose_map(struct odb_source *source, const struct object_id *oid, const struct object_id *compat_oid) { - struct loose_object_map *map = source->loose_map; + struct loose_object_map *map = source->loose->map; int inserted = 0; inserted |= insert_oid_pair(map->to_compat, oid, compat_oid); @@ -65,8 +65,8 @@ static int load_one_loose_object_map(struct repository *repo, struct odb_source struct strbuf buf = STRBUF_INIT, path = STRBUF_INIT; FILE *fp; - if (!source->loose_map) - loose_object_map_init(&source->loose_map); + if (!source->loose->map) + loose_object_map_init(&source->loose->map); if (!source->loose->cache) { ALLOC_ARRAY(source->loose->cache, 1); oidtree_init(source->loose->cache); @@ -125,7 +125,7 @@ int repo_read_loose_object_map(struct repository *repo) int repo_write_loose_object_map(struct repository *repo) { - kh_oid_map_t *map = repo->objects->sources->loose_map->to_compat; + kh_oid_map_t *map = repo->objects->sources->loose->map->to_compat; struct lock_file lock; int fd; khiter_t iter; @@ -231,7 +231,7 @@ int repo_loose_object_map_oid(struct repository *repo, khiter_t pos; for (source = repo->objects->sources; source; source = source->next) { - struct loose_object_map *loose_map = source->loose_map; + struct loose_object_map *loose_map = source->loose->map; if (!loose_map) continue; map = (to == repo->compat_hash_algo) ? diff --git a/object-file.c b/object-file.c index 20daa629a1..ccc67713fa 100644 --- a/object-file.c +++ b/object-file.c @@ -2014,5 +2014,6 @@ void odb_source_loose_free(struct odb_source_loose *loose) if (!loose) return; odb_source_loose_clear_cache(loose); + loose_object_map_clear(&loose->map); free(loose); } diff --git a/object-file.h b/object-file.h index bec855e8e5..f8a96a45f5 100644 --- a/object-file.h +++ b/object-file.h @@ -32,6 +32,9 @@ struct odb_source_loose { */ uint32_t subdir_seen[8]; /* 256 bits */ struct oidtree *cache; + + /* Map between object IDs for loose objects. */ + struct loose_object_map *map; }; struct odb_source_loose *odb_source_loose_new(struct odb_source *source); diff --git a/odb.c b/odb.c index b3e8d4a49c..d1df9609e2 100644 --- a/odb.c +++ b/odb.c @@ -370,7 +370,6 @@ static void odb_source_free(struct odb_source *source) { free(source->path); odb_source_loose_free(source->loose); - loose_object_map_clear(&source->loose_map); free(source); } diff --git a/odb.h b/odb.h index 77104396af..f9a3137a34 100644 --- a/odb.h +++ b/odb.h @@ -51,9 +51,6 @@ struct odb_source { /* Private state for loose objects. */ struct odb_source_loose *loose; - /* Map between object IDs for loose objects. */ - struct loose_object_map *loose_map; - /* * private data * -- GitLab From ff7ad5cb3936514ec0be32531ff6274b53dbe091 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:04 +0100 Subject: [PATCH 09/33] object-file: read objects via the loose object source When reading an object via `loose_object_info()` or `map_loose_object()` we hand in the whole repository. We then iterate through each of the object sources to figure out whether that source has the object in question. This logic is reversing responsibility though: a specific backend should only care about one specific source, where the object sources themselves are then managed by the object database. Refactor the code accordingly by passing an object source to both of these functions instead. The different sources are then handled by either `do_oid_object_info_extended()`, which sits on the object database level, and by `open_istream_loose()`. The latter function arguably is still at the wrong level, but this will be cleaned up at a later point in time. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.c | 68 +++++++++++++++++++-------------------------------- object-file.h | 15 ++++++------ odb.c | 9 +++++-- streaming.c | 11 ++++++++- 4 files changed, 50 insertions(+), 53 deletions(-) diff --git a/object-file.c b/object-file.c index ccc67713fa..6d6e9a5a2a 100644 --- a/object-file.c +++ b/object-file.c @@ -167,25 +167,22 @@ int stream_object_signature(struct repository *r, const struct object_id *oid) } /* - * Find "oid" as a loose object in the local repository or in an alternate. + * Find "oid" as a loose object in given source. * Returns 0 on success, negative on failure. * * The "path" out-parameter will give the path of the object we found (if any). * Note that it may point to static storage and is only valid until another * call to stat_loose_object(). */ -static int stat_loose_object(struct repository *r, const struct object_id *oid, +static int stat_loose_object(struct odb_source_loose *loose, + const struct object_id *oid, struct stat *st, const char **path) { - struct odb_source *source; static struct strbuf buf = STRBUF_INIT; - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - *path = odb_loose_path(source, &buf, oid); - if (!lstat(*path, st)) - return 0; - } + *path = odb_loose_path(loose->source, &buf, oid); + if (!lstat(*path, st)) + return 0; return -1; } @@ -194,39 +191,24 @@ static int stat_loose_object(struct repository *r, const struct object_id *oid, * Like stat_loose_object(), but actually open the object and return the * descriptor. See the caveats on the "path" parameter above. */ -static int open_loose_object(struct repository *r, +static int open_loose_object(struct odb_source_loose *loose, const struct object_id *oid, const char **path) { - int fd; - struct odb_source *source; - int most_interesting_errno = ENOENT; static struct strbuf buf = STRBUF_INIT; + int fd; - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - *path = odb_loose_path(source, &buf, oid); - fd = git_open(*path); - if (fd >= 0) - return fd; + *path = odb_loose_path(loose->source, &buf, oid); + fd = git_open(*path); + if (fd >= 0) + return fd; - if (most_interesting_errno == ENOENT) - most_interesting_errno = errno; - } - errno = most_interesting_errno; return -1; } -static int quick_has_loose(struct repository *r, +static int quick_has_loose(struct odb_source_loose *loose, const struct object_id *oid) { - struct odb_source *source; - - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - if (oidtree_contains(odb_source_loose_cache(source, oid), oid)) - return 1; - } - return 0; + return !!oidtree_contains(odb_source_loose_cache(loose->source, oid), oid); } /* @@ -252,12 +234,12 @@ static void *map_fd(int fd, const char *path, unsigned long *size) return map; } -void *map_loose_object(struct repository *r, - const struct object_id *oid, - unsigned long *size) +void *odb_source_loose_map_object(struct odb_source *source, + const struct object_id *oid, + unsigned long *size) { const char *p; - int fd = open_loose_object(r, oid, &p); + int fd = open_loose_object(source->loose, oid, &p); if (fd < 0) return NULL; @@ -407,9 +389,9 @@ int parse_loose_header(const char *hdr, struct object_info *oi) return 0; } -int loose_object_info(struct repository *r, - const struct object_id *oid, - struct object_info *oi, int flags) +int odb_source_loose_read_object_info(struct odb_source *source, + const struct object_id *oid, + struct object_info *oi, int flags) { int status = 0; int fd; @@ -422,7 +404,7 @@ int loose_object_info(struct repository *r, enum object_type type_scratch; if (oi->delta_base_oid) - oidclr(oi->delta_base_oid, r->hash_algo); + oidclr(oi->delta_base_oid, source->odb->repo->hash_algo); /* * If we don't care about type or size, then we don't @@ -435,15 +417,15 @@ int loose_object_info(struct repository *r, if (!oi->typep && !oi->sizep && !oi->contentp) { struct stat st; if (!oi->disk_sizep && (flags & OBJECT_INFO_QUICK)) - return quick_has_loose(r, oid) ? 0 : -1; - if (stat_loose_object(r, oid, &st, &path) < 0) + return quick_has_loose(source->loose, oid) ? 0 : -1; + if (stat_loose_object(source->loose, oid, &st, &path) < 0) return -1; if (oi->disk_sizep) *oi->disk_sizep = st.st_size; return 0; } - fd = open_loose_object(r, oid, &path); + fd = open_loose_object(source->loose, oid, &path); if (fd < 0) { if (errno != ENOENT) error_errno(_("unable to open loose object %s"), oid_to_hex(oid)); diff --git a/object-file.h b/object-file.h index f8a96a45f5..ca13d3d64e 100644 --- a/object-file.h +++ b/object-file.h @@ -43,6 +43,14 @@ void odb_source_loose_free(struct odb_source_loose *loose); /* Reprepare the loose source by emptying the loose object cache. */ void odb_source_loose_reprepare(struct odb_source *source); +int odb_source_loose_read_object_info(struct odb_source *source, + const struct object_id *oid, + struct object_info *oi, int flags); + +void *odb_source_loose_map_object(struct odb_source *source, + const struct object_id *oid, + unsigned long *size); + /* * Populate and return the loose object cache array corresponding to the * given object ID. @@ -66,9 +74,6 @@ const char *odb_loose_path(struct odb_source *source, int has_loose_object(struct odb_source *source, const struct object_id *oid); -void *map_loose_object(struct repository *r, const struct object_id *oid, - unsigned long *size); - /* * Iterate over the files in the loose-object parts of the object * directory "path", triggering the following callbacks: @@ -196,10 +201,6 @@ int check_object_signature(struct repository *r, const struct object_id *oid, */ int stream_object_signature(struct repository *r, const struct object_id *oid); -int loose_object_info(struct repository *r, - const struct object_id *oid, - struct object_info *oi, int flags); - enum finalize_object_file_flags { FOF_SKIP_COLLISION_CHECK = 1, }; diff --git a/odb.c b/odb.c index d1df9609e2..4c0b4fdcd5 100644 --- a/odb.c +++ b/odb.c @@ -697,13 +697,18 @@ static int do_oid_object_info_extended(struct object_database *odb, return 0; } + odb_prepare_alternates(odb); + while (1) { + struct odb_source *source; + if (find_pack_entry(odb->repo, real, &e)) break; /* Most likely it's a loose object. */ - if (!loose_object_info(odb->repo, real, oi, flags)) - return 0; + for (source = odb->sources; source; source = source->next) + if (!odb_source_loose_read_object_info(source, real, oi, flags)) + return 0; /* Not a loose object; someone else may have just packed it. */ if (!(flags & OBJECT_INFO_QUICK)) { diff --git a/streaming.c b/streaming.c index 4b13827668..00ad649ae3 100644 --- a/streaming.c +++ b/streaming.c @@ -230,12 +230,21 @@ static int open_istream_loose(struct git_istream *st, struct repository *r, enum object_type *type) { struct object_info oi = OBJECT_INFO_INIT; + struct odb_source *source; + oi.sizep = &st->size; oi.typep = type; - st->u.loose.mapped = map_loose_object(r, oid, &st->u.loose.mapsize); + odb_prepare_alternates(r->objects); + for (source = r->objects->sources; source; source = source->next) { + st->u.loose.mapped = odb_source_loose_map_object(source, oid, + &st->u.loose.mapsize); + if (st->u.loose.mapped) + break; + } if (!st->u.loose.mapped) return -1; + switch (unpack_loose_header(&st->z, st->u.loose.mapped, st->u.loose.mapsize, st->u.loose.hdr, sizeof(st->u.loose.hdr))) { -- GitLab From 05130c6c9eed9ff7450e9067d7215032eb914c10 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:05 +0100 Subject: [PATCH 10/33] object-file: rename `has_loose_object()` Rename `has_loose_object()` to `odb_source_loose_has_object()` so that it becomes clear that this is tied to a specific loose object source. This matches our modern naming schema for functions. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/pack-objects.c | 4 ++-- object-file.c | 6 +++--- object-file.h | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index b5454e5df1..69e80b1443 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -1716,7 +1716,7 @@ static int want_object_in_pack_mtime(const struct object_id *oid, */ struct odb_source *source = the_repository->objects->sources->next; for (; source; source = source->next) - if (has_loose_object(source, oid)) + if (odb_source_loose_has_object(source, oid)) return 0; } @@ -3978,7 +3978,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type int found = 0; for (; !found && source; source = source->next) - if (has_loose_object(source, oid)) + if (odb_source_loose_has_object(source, oid)) found = 1; /* diff --git a/object-file.c b/object-file.c index 6d6e9a5a2a..79e7ab8d2e 100644 --- a/object-file.c +++ b/object-file.c @@ -99,8 +99,8 @@ static int check_and_freshen_source(struct odb_source *source, return check_and_freshen_file(path.buf, freshen); } -int has_loose_object(struct odb_source *source, - const struct object_id *oid) +int odb_source_loose_has_object(struct odb_source *source, + const struct object_id *oid) { return check_and_freshen_source(source, oid, 0); } @@ -1161,7 +1161,7 @@ int force_object_loose(struct odb_source *source, int ret; for (struct odb_source *s = source->odb->sources; s; s = s->next) - if (has_loose_object(s, oid)) + if (odb_source_loose_has_object(s, oid)) return 0; oi.typep = &type; diff --git a/object-file.h b/object-file.h index ca13d3d64e..065a44bb8a 100644 --- a/object-file.h +++ b/object-file.h @@ -51,6 +51,14 @@ void *odb_source_loose_map_object(struct odb_source *source, const struct object_id *oid, unsigned long *size); +/* + * Return true iff an object database source has a loose object + * with the specified name. This function does not respect replace + * references. + */ +int odb_source_loose_has_object(struct odb_source *source, + const struct object_id *oid); + /* * Populate and return the loose object cache array corresponding to the * given object ID. @@ -66,14 +74,6 @@ const char *odb_loose_path(struct odb_source *source, struct strbuf *buf, const struct object_id *oid); -/* - * Return true iff an object database source has a loose object - * with the specified name. This function does not respect replace - * references. - */ -int has_loose_object(struct odb_source *source, - const struct object_id *oid); - /* * Iterate over the files in the loose-object parts of the object * directory "path", triggering the following callbacks: -- GitLab From f2bd88a308a2754e727cb462e03102307cdfe004 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:06 +0100 Subject: [PATCH 11/33] object-file: refactor freshening of objects When writing an object that already exists in our object database we skip the write and instead only update mtimes of the object, either in its packed or loose object format. This logic is wholly contained in "object-file.c", but that file is really only concerned with loose objects. So it does not really make sense that it also contains the logic to freshen a packed object. Introduce a new `odb_freshen_object()` function that sits on the object database level and two functions `packfile_store_freshen_object()` and `odb_source_loose_freshen_object()`. Like this, the format-specific functions can be part of their respective subsystems, while the backend agnostic function to freshen an object sits at the object database layer. Note that this change also moves the logic that iterates through object sources from the object source layer into the object database layer. This change is intentional: object sources should ideally only have to worry about themselves, and coordination of different sources should be handled on the object database level. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.c | 33 +++++---------------------------- object-file.h | 3 +++ odb.c | 16 ++++++++++++++++ odb.h | 3 +++ packfile.c | 16 ++++++++++++++++ packfile.h | 3 +++ 6 files changed, 46 insertions(+), 28 deletions(-) diff --git a/object-file.c b/object-file.c index 79e7ab8d2e..893c32adcd 100644 --- a/object-file.c +++ b/object-file.c @@ -968,30 +968,10 @@ static int write_loose_object(struct odb_source *source, FOF_SKIP_COLLISION_CHECK); } -static int freshen_loose_object(struct object_database *odb, - const struct object_id *oid) +int odb_source_loose_freshen_object(struct odb_source *source, + const struct object_id *oid) { - odb_prepare_alternates(odb); - for (struct odb_source *source = odb->sources; source; source = source->next) - if (check_and_freshen_source(source, oid, 1)) - return 1; - return 0; -} - -static int freshen_packed_object(struct object_database *odb, - const struct object_id *oid) -{ - struct pack_entry e; - if (!find_pack_entry(odb->repo, oid, &e)) - return 0; - if (e.p->is_cruft) - return 0; - if (e.p->freshened) - return 1; - if (!freshen_file(e.p->pack_name)) - return 0; - e.p->freshened = 1; - return 1; + return !!check_and_freshen_source(source, oid, 1); } int stream_loose_object(struct odb_source *source, @@ -1073,12 +1053,10 @@ int stream_loose_object(struct odb_source *source, die(_("deflateEnd on stream object failed (%d)"), ret); close_loose_object(source, fd, tmp_file.buf); - if (freshen_packed_object(source->odb, oid) || - freshen_loose_object(source->odb, oid)) { + if (odb_freshen_object(source->odb, oid)) { unlink_or_warn(tmp_file.buf); goto cleanup; } - odb_loose_path(source, &filename, oid); /* We finally know the object path, and create the missing dir. */ @@ -1137,8 +1115,7 @@ int write_object_file(struct odb_source *source, * it out into .git/objects/??/?{38} file. */ write_object_file_prepare(algo, buf, len, type, oid, hdr, &hdrlen); - if (freshen_packed_object(source->odb, oid) || - freshen_loose_object(source->odb, oid)) + if (odb_freshen_object(source->odb, oid)) return 0; if (write_loose_object(source, oid, hdr, hdrlen, buf, len, 0, flags)) return -1; diff --git a/object-file.h b/object-file.h index 065a44bb8a..ee5b24cec6 100644 --- a/object-file.h +++ b/object-file.h @@ -59,6 +59,9 @@ void *odb_source_loose_map_object(struct odb_source *source, int odb_source_loose_has_object(struct odb_source *source, const struct object_id *oid); +int odb_source_loose_freshen_object(struct odb_source *source, + const struct object_id *oid); + /* * Populate and return the loose object cache array corresponding to the * given object ID. diff --git a/odb.c b/odb.c index 4c0b4fdcd5..17734bdaff 100644 --- a/odb.c +++ b/odb.c @@ -987,6 +987,22 @@ int odb_has_object(struct object_database *odb, const struct object_id *oid, return odb_read_object_info_extended(odb, oid, NULL, object_info_flags) >= 0; } +int odb_freshen_object(struct object_database *odb, + const struct object_id *oid) +{ + struct odb_source *source; + + if (packfile_store_freshen_object(odb->packfiles, oid)) + return 1; + + odb_prepare_alternates(odb); + for (source = odb->sources; source; source = source->next) + if (odb_source_loose_freshen_object(source, oid)) + return 1; + + return 0; +} + void odb_assert_oid_type(struct object_database *odb, const struct object_id *oid, enum object_type expect) { diff --git a/odb.h b/odb.h index f9a3137a34..2653247e0c 100644 --- a/odb.h +++ b/odb.h @@ -396,6 +396,9 @@ int odb_has_object(struct object_database *odb, const struct object_id *oid, unsigned flags); +int odb_freshen_object(struct object_database *odb, + const struct object_id *oid); + void odb_assert_oid_type(struct object_database *odb, const struct object_id *oid, enum object_type expect); diff --git a/packfile.c b/packfile.c index 1ae2b2fe1e..40f733dd23 100644 --- a/packfile.c +++ b/packfile.c @@ -819,6 +819,22 @@ struct packed_git *packfile_store_load_pack(struct packfile_store *store, return p; } +int packfile_store_freshen_object(struct packfile_store *store, + const struct object_id *oid) +{ + struct pack_entry e; + if (!find_pack_entry(store->odb->repo, oid, &e)) + return 0; + if (e.p->is_cruft) + return 0; + if (e.p->freshened) + return 1; + if (utime(e.p->pack_name, NULL)) + return 0; + e.p->freshened = 1; + return 1; +} + void (*report_garbage)(unsigned seen_bits, const char *path); static void report_helper(const struct string_list *list, diff --git a/packfile.h b/packfile.h index c9d0b93446..58fcc88e20 100644 --- a/packfile.h +++ b/packfile.h @@ -163,6 +163,9 @@ struct list_head *packfile_store_get_packs_mru(struct packfile_store *store); struct packed_git *packfile_store_load_pack(struct packfile_store *store, const char *idx_path, int local); +int packfile_store_freshen_object(struct packfile_store *store, + const struct object_id *oid); + struct pack_window { struct pack_window *next; unsigned char *base; -- GitLab From bfb1b2b4ac5cfa99f7d2503b404d282714d84bdf Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:07 +0100 Subject: [PATCH 12/33] object-file: rename `write_object_file()` Rename `write_object_file()` to `odb_source_loose_write_object()` so that it becomes clear that this is tied to a specific loose object source. This matches our modern naming schema for functions. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- object-file.c | 8 ++++---- object-file.h | 10 +++++----- odb.c | 3 ++- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/object-file.c b/object-file.c index 893c32adcd..fdc644a427 100644 --- a/object-file.c +++ b/object-file.c @@ -1084,10 +1084,10 @@ int stream_loose_object(struct odb_source *source, return err; } -int write_object_file(struct odb_source *source, - const void *buf, unsigned long len, - enum object_type type, struct object_id *oid, - struct object_id *compat_oid_in, unsigned flags) +int odb_source_loose_write_object(struct odb_source *source, + const void *buf, unsigned long len, + enum object_type type, struct object_id *oid, + struct object_id *compat_oid_in, unsigned flags) { const struct git_hash_algo *algo = source->odb->repo->hash_algo; const struct git_hash_algo *compat = source->odb->repo->compat_hash_algo; diff --git a/object-file.h b/object-file.h index ee5b24cec6..36a60e15c4 100644 --- a/object-file.h +++ b/object-file.h @@ -62,6 +62,11 @@ int odb_source_loose_has_object(struct odb_source *source, int odb_source_loose_freshen_object(struct odb_source *source, const struct object_id *oid); +int odb_source_loose_write_object(struct odb_source *source, + const void *buf, unsigned long len, + enum object_type type, struct object_id *oid, + struct object_id *compat_oid_in, unsigned flags); + /* * Populate and return the loose object cache array corresponding to the * given object ID. @@ -168,11 +173,6 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, struct object_info; int parse_loose_header(const char *hdr, struct object_info *oi); -int write_object_file(struct odb_source *source, - const void *buf, unsigned long len, - enum object_type type, struct object_id *oid, - struct object_id *compat_oid_in, unsigned flags); - struct input_stream { const void *(*read)(struct input_stream *, unsigned long *len); void *data; diff --git a/odb.c b/odb.c index 17734bdaff..da44f1d63b 100644 --- a/odb.c +++ b/odb.c @@ -1021,7 +1021,8 @@ int odb_write_object_ext(struct object_database *odb, struct object_id *compat_oid, unsigned flags) { - return write_object_file(odb->sources, buf, len, type, oid, compat_oid, flags); + return odb_source_loose_write_object(odb->sources, buf, len, type, + oid, compat_oid, flags); } struct object_database *odb_new(struct repository *repo) -- GitLab From 3e5e360888316ed1a44da69bf134bb6ec70aee1b Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 08:42:08 +0100 Subject: [PATCH 13/33] object-file: refactor writing objects via a stream We have two different ways to write an object into the database: - We either provide the full buffer and write the object all at once. - Or we provide an input stream that has a `read()` function so that we can chunk the object. The latter is especially used for large objects, where it may be too expensive to hold the complete object in memory all at once. While we already have `odb_write_object()` at the ODB-layer, we don't have an equivalent for streaming an object. Introduce a new function `odb_write_object_stream()` to address this gap so that callers don't have to be aware of the inner workings of how to stream an object to disk with a specific object source. Rename `stream_loose_object()` to `odb_source_loose_write_stream()` to clarify its scope. This matches our modern best practices around how to name functions. Signed-off-by: Patrick Steinhardt Signed-off-by: Junio C Hamano --- builtin/unpack-objects.c | 7 +++---- object-file.c | 6 +++--- object-file.h | 14 ++++---------- odb.c | 7 +++++++ odb.h | 10 ++++++++++ 5 files changed, 27 insertions(+), 17 deletions(-) diff --git a/builtin/unpack-objects.c b/builtin/unpack-objects.c index ef79e43715..6fc64e9e4b 100644 --- a/builtin/unpack-objects.c +++ b/builtin/unpack-objects.c @@ -363,7 +363,7 @@ struct input_zstream_data { int status; }; -static const void *feed_input_zstream(struct input_stream *in_stream, +static const void *feed_input_zstream(struct odb_write_stream *in_stream, unsigned long *readlen) { struct input_zstream_data *data = in_stream->data; @@ -393,7 +393,7 @@ static void stream_blob(unsigned long size, unsigned nr) { git_zstream zstream = { 0 }; struct input_zstream_data data = { 0 }; - struct input_stream in_stream = { + struct odb_write_stream in_stream = { .read = feed_input_zstream, .data = &data, }; @@ -402,8 +402,7 @@ static void stream_blob(unsigned long size, unsigned nr) data.zstream = &zstream; git_inflate_init(&zstream); - if (stream_loose_object(the_repository->objects->sources, - &in_stream, size, &info->oid)) + if (odb_write_object_stream(the_repository->objects, &in_stream, size, &info->oid)) die(_("failed to write object in stream")); if (data.status != Z_STREAM_END) diff --git a/object-file.c b/object-file.c index fdc644a427..811c569ed3 100644 --- a/object-file.c +++ b/object-file.c @@ -974,9 +974,9 @@ int odb_source_loose_freshen_object(struct odb_source *source, return !!check_and_freshen_source(source, oid, 1); } -int stream_loose_object(struct odb_source *source, - struct input_stream *in_stream, size_t len, - struct object_id *oid) +int odb_source_loose_write_stream(struct odb_source *source, + struct odb_write_stream *in_stream, size_t len, + struct object_id *oid) { const struct git_hash_algo *compat = source->odb->repo->compat_hash_algo; struct object_id compat_oid; diff --git a/object-file.h b/object-file.h index 36a60e15c4..eeffa67bbd 100644 --- a/object-file.h +++ b/object-file.h @@ -67,6 +67,10 @@ int odb_source_loose_write_object(struct odb_source *source, enum object_type type, struct object_id *oid, struct object_id *compat_oid_in, unsigned flags); +int odb_source_loose_write_stream(struct odb_source *source, + struct odb_write_stream *stream, size_t len, + struct object_id *oid); + /* * Populate and return the loose object cache array corresponding to the * given object ID. @@ -173,16 +177,6 @@ enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, struct object_info; int parse_loose_header(const char *hdr, struct object_info *oi); -struct input_stream { - const void *(*read)(struct input_stream *, unsigned long *len); - void *data; - int is_finished; -}; - -int stream_loose_object(struct odb_source *source, - struct input_stream *in_stream, size_t len, - struct object_id *oid); - int force_object_loose(struct odb_source *source, const struct object_id *oid, time_t mtime); diff --git a/odb.c b/odb.c index da44f1d63b..3ec21ef24e 100644 --- a/odb.c +++ b/odb.c @@ -1025,6 +1025,13 @@ int odb_write_object_ext(struct object_database *odb, oid, compat_oid, flags); } +int odb_write_object_stream(struct object_database *odb, + struct odb_write_stream *stream, size_t len, + struct object_id *oid) +{ + return odb_source_loose_write_stream(odb->sources, stream, len, oid); +} + struct object_database *odb_new(struct repository *repo) { struct object_database *o = xmalloc(sizeof(*o)); diff --git a/odb.h b/odb.h index 2653247e0c..9bb28008b1 100644 --- a/odb.h +++ b/odb.h @@ -492,4 +492,14 @@ static inline int odb_write_object(struct object_database *odb, return odb_write_object_ext(odb, buf, len, type, oid, NULL, 0); } +struct odb_write_stream { + const void *(*read)(struct odb_write_stream *, unsigned long *len); + void *data; + int is_finished; +}; + +int odb_write_object_stream(struct object_database *odb, + struct odb_write_stream *stream, size_t len, + struct object_id *oid); + #endif /* ODB_H */ -- GitLab From 5512947276aa9c87754e91a820e96caee2e9dbb4 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 7 Nov 2025 08:55:08 +0100 Subject: [PATCH 14/33] Refactor object read streams to work via object sources Hi, the `git_istream` data structure can be used to read objects from the object database in a streaming fashion. This is used for example to read large files that one doesn't want to load into memory in full. In the current architecture, all the logic to handle these streams is fully self-contained in "streaming.c". It contains the logic to set up streams for loose, packed, in-memory and filtered objects. This doesn't really play all that well with pluggable object databases, as it should be the responsibility of the object database source itself to handle the logic. This patch series thus revamps our object read streams: instead of being entirely contained in "streaming.c", the format-specific streams are now created by the ODB sources. This allows each source itself to decide whether and, if so, how to make objects streamable. This overall requires quite a bit of refactoring, but I think that the end result is an easier-to-understand infrastructure that is an improvement even without pluggable object databases. This series is built on top of v2.52.0 with ps/object-source-loose at 3e5e360888 (object-file: refactor writing objects via a stream, 2025-11-03) merged into it. Changes in v3: - Clarify why we want to get rid of the `open()` callback. - Explain change in semantics now that we iterate through sources first to create the read stream. - Fix "opaque" comment applying to the correct structure. - Rename `odb_read_object_stream()` to `odb_read_stream_open()`. - Link to v2: https://lore.kernel.org/r/20251121-b4-pks-odb-read-stream-v2-0-ca8534963150@pks.im Changes in v2: - Some commit message improvements. - Drop the `type` and `size` out pointers in `odb_read_object_stream()` in an additional commit. - Improve a "hidden" variable declaration by moving it onto its own line. - Link to v1: https://lore.kernel.org/r/20251119-b4-pks-odb-read-stream-v1-0-adacf03c2ccf@pks.im Thanks! Patrick To: git@vger.kernel.org Cc: Karthik Nayak Cc: Justin Tobler Cc: Junio C Hamano --- b4-submit-tracking --- # This section is used internally by b4 prep for tracking purposes. { "series": { "revision": 3, "change-id": "20251107-b4-pks-odb-read-stream-7ea7f0e0a8f4", "prefixes": [], "history": { "v1": [ "20251119-b4-pks-odb-read-stream-v1-0-adacf03c2ccf@pks.im" ], "v2": [ "20251121-b4-pks-odb-read-stream-v2-0-ca8534963150@pks.im" ] } } } -- GitLab From 1fed5169f85103733de61f56c2d24cc2ec970190 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Wed, 22 Oct 2025 12:36:59 +0200 Subject: [PATCH 15/33] streaming: rename `git_istream` into `odb_read_stream` In the following patches we are about to make the `git_istream` more generic so that it becomes fully controlled by the specific object source that wants to create it. As part of these refactorings we'll fully move the structure into the object database subsystem. Prepare for this change by renaming the structure from `git_istream` to `odb_read_stream`. This mirrors the `odb_write_stream` structure that we already have. Signed-off-by: Patrick Steinhardt --- archive-tar.c | 2 +- archive-zip.c | 2 +- builtin/index-pack.c | 2 +- builtin/pack-objects.c | 4 +-- object-file.c | 2 +- streaming.c | 62 +++++++++++++++++++++--------------------- streaming.h | 12 ++++---- 7 files changed, 43 insertions(+), 43 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 73b63ddc41..dc1eda09e0 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -129,7 +129,7 @@ static void write_trailer(void) */ static int stream_blocked(struct repository *r, const struct object_id *oid) { - struct git_istream *st; + struct odb_read_stream *st; enum object_type type; unsigned long sz; char buf[BLOCKSIZE]; diff --git a/archive-zip.c b/archive-zip.c index bea5bdd43d..40a9c93ff9 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -309,7 +309,7 @@ static int write_zip_entry(struct archiver_args *args, enum zip_method method; unsigned char *out; void *deflated = NULL; - struct git_istream *stream = NULL; + struct odb_read_stream *stream = NULL; unsigned long flags = 0; int is_binary = -1; const char *path_without_prefix = path + args->baselen; diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 2b78ba7fe4..5f90f12f92 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -762,7 +762,7 @@ static void find_ref_delta_children(const struct object_id *oid, struct compare_data { struct object_entry *entry; - struct git_istream *st; + struct odb_read_stream *st; unsigned char *buf; unsigned long buf_size; }; diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 69e80b1443..c693d948e1 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -404,7 +404,7 @@ static unsigned long do_compress(void **pptr, unsigned long size) return stream.total_out; } -static unsigned long write_large_blob_data(struct git_istream *st, struct hashfile *f, +static unsigned long write_large_blob_data(struct odb_read_stream *st, struct hashfile *f, const struct object_id *oid) { git_zstream stream; @@ -513,7 +513,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent unsigned hdrlen; enum object_type type; void *buf; - struct git_istream *st = NULL; + struct odb_read_stream *st = NULL; const unsigned hashsz = the_hash_algo->rawsz; if (!usable_delta) { diff --git a/object-file.c b/object-file.c index 811c569ed3..b62b21a452 100644 --- a/object-file.c +++ b/object-file.c @@ -134,7 +134,7 @@ int stream_object_signature(struct repository *r, const struct object_id *oid) struct object_id real_oid; unsigned long size; enum object_type obj_type; - struct git_istream *st; + struct odb_read_stream *st; struct git_hash_ctx c; char hdr[MAX_HEADER_LEN]; int hdrlen; diff --git a/streaming.c b/streaming.c index 00ad649ae3..1fb4b7c1c0 100644 --- a/streaming.c +++ b/streaming.c @@ -14,17 +14,17 @@ #include "replace-object.h" #include "packfile.h" -typedef int (*open_istream_fn)(struct git_istream *, +typedef int (*open_istream_fn)(struct odb_read_stream *, struct repository *, const struct object_id *, enum object_type *); -typedef int (*close_istream_fn)(struct git_istream *); -typedef ssize_t (*read_istream_fn)(struct git_istream *, char *, size_t); +typedef int (*close_istream_fn)(struct odb_read_stream *); +typedef ssize_t (*read_istream_fn)(struct odb_read_stream *, char *, size_t); #define FILTER_BUFFER (1024*16) struct filtered_istream { - struct git_istream *upstream; + struct odb_read_stream *upstream; struct stream_filter *filter; char ibuf[FILTER_BUFFER]; char obuf[FILTER_BUFFER]; @@ -33,7 +33,7 @@ struct filtered_istream { int input_finished; }; -struct git_istream { +struct odb_read_stream { open_istream_fn open; close_istream_fn close; read_istream_fn read; @@ -71,7 +71,7 @@ struct git_istream { * *****************************************************************/ -static void close_deflated_stream(struct git_istream *st) +static void close_deflated_stream(struct odb_read_stream *st) { if (st->z_state == z_used) git_inflate_end(&st->z); @@ -84,13 +84,13 @@ static void close_deflated_stream(struct git_istream *st) * *****************************************************************/ -static int close_istream_filtered(struct git_istream *st) +static int close_istream_filtered(struct odb_read_stream *st) { free_stream_filter(st->u.filtered.filter); return close_istream(st->u.filtered.upstream); } -static ssize_t read_istream_filtered(struct git_istream *st, char *buf, +static ssize_t read_istream_filtered(struct odb_read_stream *st, char *buf, size_t sz) { struct filtered_istream *fs = &(st->u.filtered); @@ -150,10 +150,10 @@ static ssize_t read_istream_filtered(struct git_istream *st, char *buf, return filled; } -static struct git_istream *attach_stream_filter(struct git_istream *st, - struct stream_filter *filter) +static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, + struct stream_filter *filter) { - struct git_istream *ifs = xmalloc(sizeof(*ifs)); + struct odb_read_stream *ifs = xmalloc(sizeof(*ifs)); struct filtered_istream *fs = &(ifs->u.filtered); ifs->close = close_istream_filtered; @@ -173,7 +173,7 @@ static struct git_istream *attach_stream_filter(struct git_istream *st, * *****************************************************************/ -static ssize_t read_istream_loose(struct git_istream *st, char *buf, size_t sz) +static ssize_t read_istream_loose(struct odb_read_stream *st, char *buf, size_t sz) { size_t total_read = 0; @@ -218,14 +218,14 @@ static ssize_t read_istream_loose(struct git_istream *st, char *buf, size_t sz) return total_read; } -static int close_istream_loose(struct git_istream *st) +static int close_istream_loose(struct odb_read_stream *st) { close_deflated_stream(st); munmap(st->u.loose.mapped, st->u.loose.mapsize); return 0; } -static int open_istream_loose(struct git_istream *st, struct repository *r, +static int open_istream_loose(struct odb_read_stream *st, struct repository *r, const struct object_id *oid, enum object_type *type) { @@ -277,7 +277,7 @@ static int open_istream_loose(struct git_istream *st, struct repository *r, * *****************************************************************/ -static ssize_t read_istream_pack_non_delta(struct git_istream *st, char *buf, +static ssize_t read_istream_pack_non_delta(struct odb_read_stream *st, char *buf, size_t sz) { size_t total_read = 0; @@ -336,13 +336,13 @@ static ssize_t read_istream_pack_non_delta(struct git_istream *st, char *buf, return total_read; } -static int close_istream_pack_non_delta(struct git_istream *st) +static int close_istream_pack_non_delta(struct odb_read_stream *st) { close_deflated_stream(st); return 0; } -static int open_istream_pack_non_delta(struct git_istream *st, +static int open_istream_pack_non_delta(struct odb_read_stream *st, struct repository *r UNUSED, const struct object_id *oid UNUSED, enum object_type *type UNUSED) @@ -380,13 +380,13 @@ static int open_istream_pack_non_delta(struct git_istream *st, * *****************************************************************/ -static int close_istream_incore(struct git_istream *st) +static int close_istream_incore(struct odb_read_stream *st) { free(st->u.incore.buf); return 0; } -static ssize_t read_istream_incore(struct git_istream *st, char *buf, size_t sz) +static ssize_t read_istream_incore(struct odb_read_stream *st, char *buf, size_t sz) { size_t read_size = sz; size_t remainder = st->size - st->u.incore.read_ptr; @@ -400,7 +400,7 @@ static ssize_t read_istream_incore(struct git_istream *st, char *buf, size_t sz) return read_size; } -static int open_istream_incore(struct git_istream *st, struct repository *r, +static int open_istream_incore(struct odb_read_stream *st, struct repository *r, const struct object_id *oid, enum object_type *type) { struct object_info oi = OBJECT_INFO_INIT; @@ -420,7 +420,7 @@ static int open_istream_incore(struct git_istream *st, struct repository *r, * static helpers variables and functions for users of streaming interface *****************************************************************************/ -static int istream_source(struct git_istream *st, +static int istream_source(struct odb_read_stream *st, struct repository *r, const struct object_id *oid, enum object_type *type) @@ -458,25 +458,25 @@ static int istream_source(struct git_istream *st, * Users of streaming interface ****************************************************************/ -int close_istream(struct git_istream *st) +int close_istream(struct odb_read_stream *st) { int r = st->close(st); free(st); return r; } -ssize_t read_istream(struct git_istream *st, void *buf, size_t sz) +ssize_t read_istream(struct odb_read_stream *st, void *buf, size_t sz) { return st->read(st, buf, sz); } -struct git_istream *open_istream(struct repository *r, - const struct object_id *oid, - enum object_type *type, - unsigned long *size, - struct stream_filter *filter) +struct odb_read_stream *open_istream(struct repository *r, + const struct object_id *oid, + enum object_type *type, + unsigned long *size, + struct stream_filter *filter) { - struct git_istream *st = xmalloc(sizeof(*st)); + struct odb_read_stream *st = xmalloc(sizeof(*st)); const struct object_id *real = lookup_replace_object(r, oid); int ret = istream_source(st, r, real, type); @@ -493,7 +493,7 @@ struct git_istream *open_istream(struct repository *r, } if (filter) { /* Add "&& !is_null_stream_filter(filter)" for performance */ - struct git_istream *nst = attach_stream_filter(st, filter); + struct odb_read_stream *nst = attach_stream_filter(st, filter); if (!nst) { close_istream(st); return NULL; @@ -508,7 +508,7 @@ struct git_istream *open_istream(struct repository *r, int stream_blob_to_fd(int fd, const struct object_id *oid, struct stream_filter *filter, int can_seek) { - struct git_istream *st; + struct odb_read_stream *st; enum object_type type; unsigned long sz; ssize_t kept = 0; diff --git a/streaming.h b/streaming.h index bd27f59e57..f5ff5d7ac9 100644 --- a/streaming.h +++ b/streaming.h @@ -7,14 +7,14 @@ #include "object.h" /* opaque */ -struct git_istream; +struct odb_read_stream; struct stream_filter; -struct git_istream *open_istream(struct repository *, const struct object_id *, - enum object_type *, unsigned long *, - struct stream_filter *); -int close_istream(struct git_istream *); -ssize_t read_istream(struct git_istream *, void *, size_t); +struct odb_read_stream *open_istream(struct repository *, const struct object_id *, + enum object_type *, unsigned long *, + struct stream_filter *); +int close_istream(struct odb_read_stream *); +ssize_t read_istream(struct odb_read_stream *, void *, size_t); int stream_blob_to_fd(int fd, const struct object_id *, struct stream_filter *, int can_seek); -- GitLab From 30f6a0ec466b8c90a39565f51cd7f1864c512591 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 09:47:58 +0200 Subject: [PATCH 16/33] streaming: drop the `open()` callback function When creating a read stream we first populate the structure with the open callback function and then subsequently call the function. This layout is somewhat weird though: - The structure needs to be allocated and partially populated with the open function before we can properly initialize it. - We only ever call the `open()` callback function right after having populated the `struct odb_read_stream::open` member, and it's never called thereafter again. So it is somewhat pointless to store the callback in the first place. Especially the first point creates a problem for us. In subsequent commits we'll want to fully move construction of the read source into the respective object sources. E.g., the loose object source will be the one that is responsible for creating the structure. But this creates a problem: if we first need to create the structure so that we can call the source-specific callback we cannot fully handle creation of the structure in the source itself. We could of course work around that and have the loose object source create the structure and populate its `open()` callback, only. But this doesn't really buy us anything due to the second bullet point above. Instead, drop the callback entirely and refactor `istream_source()` so that we open the streams immediately. This unblocks a subsequent step, where we'll also start to allocate the structure in the source-specific logic. Signed-off-by: Patrick Steinhardt --- streaming.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/streaming.c b/streaming.c index 1fb4b7c1c0..1bb3f393b8 100644 --- a/streaming.c +++ b/streaming.c @@ -14,10 +14,6 @@ #include "replace-object.h" #include "packfile.h" -typedef int (*open_istream_fn)(struct odb_read_stream *, - struct repository *, - const struct object_id *, - enum object_type *); typedef int (*close_istream_fn)(struct odb_read_stream *); typedef ssize_t (*read_istream_fn)(struct odb_read_stream *, char *, size_t); @@ -34,7 +30,6 @@ struct filtered_istream { }; struct odb_read_stream { - open_istream_fn open; close_istream_fn close; read_istream_fn read; @@ -437,21 +432,25 @@ static int istream_source(struct odb_read_stream *st, switch (oi.whence) { case OI_LOOSE: - st->open = open_istream_loose; + if (open_istream_loose(st, r, oid, type) < 0) + break; return 0; case OI_PACKED: - if (!oi.u.packed.is_delta && - repo_settings_get_big_file_threshold(the_repository) < size) { - st->u.in_pack.pack = oi.u.packed.pack; - st->u.in_pack.pos = oi.u.packed.offset; - st->open = open_istream_pack_non_delta; - return 0; - } - /* fallthru */ - default: - st->open = open_istream_incore; + if (oi.u.packed.is_delta || + repo_settings_get_big_file_threshold(the_repository) >= size) + break; + + st->u.in_pack.pack = oi.u.packed.pack; + st->u.in_pack.pos = oi.u.packed.offset; + if (open_istream_pack_non_delta(st, r, oid, type) < 0) + break; + return 0; + default: + break; } + + return open_istream_incore(st, r, oid, type); } /**************************************************************** @@ -485,12 +484,6 @@ struct odb_read_stream *open_istream(struct repository *r, return NULL; } - if (st->open(st, r, real, type)) { - if (open_istream_incore(st, r, real, type)) { - free(st); - return NULL; - } - } if (filter) { /* Add "&& !is_null_stream_filter(filter)" for performance */ struct odb_read_stream *nst = attach_stream_filter(st, filter); -- GitLab From 3055ce427e60d79fe6053c706d36e4509cbda355 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 23 Nov 2025 15:54:02 +0100 Subject: [PATCH 17/33] streaming: propagate final object type via the stream When opening the read stream for a specific object the caller is also expected to pass in a pointer to the object type. This type is passed down via multiple levels and will eventually be populated with the type of the looked-up object. The way we propagate down the pointer though is somewhat non-obvious. While `istream_source()` still expects the pointer and looks it up via `odb_read_object_info_extended()`, we also pass it down even further into the format-specific callbacks that perform another lookup. This is quite confusing overall. Refactor the code so that the responsibility to populate the object type rests solely with the format-specific callbacks. This will allow us to drop the call to `odb_read_object_info_extended()` in `istream_source()` entirely in a subsequent patch. Furthermore, instead of propagating the type via an in-pointer, we now propagate the type via a new field in the object stream. It already has a `size` field, so it's only natural to have a second field that contains the object type. Signed-off-by: Patrick Steinhardt --- streaming.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/streaming.c b/streaming.c index 1bb3f393b8..665624ddc0 100644 --- a/streaming.c +++ b/streaming.c @@ -33,6 +33,7 @@ struct odb_read_stream { close_istream_fn close; read_istream_fn read; + enum object_type type; unsigned long size; /* inflated size of full object */ git_zstream z; enum { z_unused, z_used, z_done, z_error } z_state; @@ -159,6 +160,7 @@ static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, fs->o_end = fs->o_ptr = 0; fs->input_finished = 0; ifs->size = -1; /* unknown */ + ifs->type = st->type; return ifs; } @@ -221,14 +223,13 @@ static int close_istream_loose(struct odb_read_stream *st) } static int open_istream_loose(struct odb_read_stream *st, struct repository *r, - const struct object_id *oid, - enum object_type *type) + const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; struct odb_source *source; oi.sizep = &st->size; - oi.typep = type; + oi.typep = &st->type; odb_prepare_alternates(r->objects); for (source = r->objects->sources; source; source = source->next) { @@ -249,7 +250,7 @@ static int open_istream_loose(struct odb_read_stream *st, struct repository *r, case ULHR_TOO_LONG: goto error; } - if (parse_loose_header(st->u.loose.hdr, &oi) < 0 || *type < 0) + if (parse_loose_header(st->u.loose.hdr, &oi) < 0 || st->type < 0) goto error; st->u.loose.hdr_used = strlen(st->u.loose.hdr) + 1; @@ -339,8 +340,7 @@ static int close_istream_pack_non_delta(struct odb_read_stream *st) static int open_istream_pack_non_delta(struct odb_read_stream *st, struct repository *r UNUSED, - const struct object_id *oid UNUSED, - enum object_type *type UNUSED) + const struct object_id *oid UNUSED) { struct pack_window *window; enum object_type in_pack_type; @@ -361,6 +361,7 @@ static int open_istream_pack_non_delta(struct odb_read_stream *st, case OBJ_TAG: break; } + st->type = in_pack_type; st->z_state = z_unused; st->close = close_istream_pack_non_delta; st->read = read_istream_pack_non_delta; @@ -396,7 +397,7 @@ static ssize_t read_istream_incore(struct odb_read_stream *st, char *buf, size_t } static int open_istream_incore(struct odb_read_stream *st, struct repository *r, - const struct object_id *oid, enum object_type *type) + const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; @@ -404,7 +405,7 @@ static int open_istream_incore(struct odb_read_stream *st, struct repository *r, st->close = close_istream_incore; st->read = read_istream_incore; - oi.typep = type; + oi.typep = &st->type; oi.sizep = &st->size; oi.contentp = (void **)&st->u.incore.buf; return odb_read_object_info_extended(r->objects, oid, &oi, @@ -417,14 +418,12 @@ static int open_istream_incore(struct odb_read_stream *st, struct repository *r, static int istream_source(struct odb_read_stream *st, struct repository *r, - const struct object_id *oid, - enum object_type *type) + const struct object_id *oid) { unsigned long size; int status; struct object_info oi = OBJECT_INFO_INIT; - oi.typep = type; oi.sizep = &size; status = odb_read_object_info_extended(r->objects, oid, &oi, 0); if (status < 0) @@ -432,7 +431,7 @@ static int istream_source(struct odb_read_stream *st, switch (oi.whence) { case OI_LOOSE: - if (open_istream_loose(st, r, oid, type) < 0) + if (open_istream_loose(st, r, oid) < 0) break; return 0; case OI_PACKED: @@ -442,7 +441,7 @@ static int istream_source(struct odb_read_stream *st, st->u.in_pack.pack = oi.u.packed.pack; st->u.in_pack.pos = oi.u.packed.offset; - if (open_istream_pack_non_delta(st, r, oid, type) < 0) + if (open_istream_pack_non_delta(st, r, oid) < 0) break; return 0; @@ -450,7 +449,7 @@ static int istream_source(struct odb_read_stream *st, break; } - return open_istream_incore(st, r, oid, type); + return open_istream_incore(st, r, oid); } /**************************************************************** @@ -477,7 +476,7 @@ struct odb_read_stream *open_istream(struct repository *r, { struct odb_read_stream *st = xmalloc(sizeof(*st)); const struct object_id *real = lookup_replace_object(r, oid); - int ret = istream_source(st, r, real, type); + int ret = istream_source(st, r, real); if (ret) { free(st); @@ -495,6 +494,7 @@ struct odb_read_stream *open_istream(struct repository *r, } *size = st->size; + *type = st->type; return st; } -- GitLab From f56cdc206943a0e68da33cfe50ac61ffd9419f93 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 09:48:58 +0200 Subject: [PATCH 18/33] streaming: explicitly pass packfile info when streaming a packed object When streaming a packed object we first populate the stream with information about the pack that contains the object before calling `open_istream_pack_non_delta()`. This is done because we have already looked up both the pack and the object's offset, so it would be a waste of time to look up this information again. But the way this is done makes for a somewhat awkward calling interface, as the caller now needs to be aware of how exactly the function itself behaves. Refactor the code so that we instead explicitly pass the packfile info into `open_istream_pack_non_delta()`. This makes the calling convention explicit, but more importantly this allows us to refactor the function so that it becomes its responsibility to allocate the stream itself in a subsequent patch. Signed-off-by: Patrick Steinhardt --- streaming.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/streaming.c b/streaming.c index 665624ddc0..bf277daadd 100644 --- a/streaming.c +++ b/streaming.c @@ -340,16 +340,18 @@ static int close_istream_pack_non_delta(struct odb_read_stream *st) static int open_istream_pack_non_delta(struct odb_read_stream *st, struct repository *r UNUSED, - const struct object_id *oid UNUSED) + const struct object_id *oid UNUSED, + struct packed_git *pack, + off_t offset) { struct pack_window *window; enum object_type in_pack_type; window = NULL; - in_pack_type = unpack_object_header(st->u.in_pack.pack, + in_pack_type = unpack_object_header(pack, &window, - &st->u.in_pack.pos, + &offset, &st->size); unuse_pack(&window); switch (in_pack_type) { @@ -365,6 +367,8 @@ static int open_istream_pack_non_delta(struct odb_read_stream *st, st->z_state = z_unused; st->close = close_istream_pack_non_delta; st->read = read_istream_pack_non_delta; + st->u.in_pack.pack = pack; + st->u.in_pack.pos = offset; return 0; } @@ -436,14 +440,10 @@ static int istream_source(struct odb_read_stream *st, return 0; case OI_PACKED: if (oi.u.packed.is_delta || - repo_settings_get_big_file_threshold(the_repository) >= size) + repo_settings_get_big_file_threshold(the_repository) >= size || + open_istream_pack_non_delta(st, r, oid, oi.u.packed.pack, + oi.u.packed.offset) < 0) break; - - st->u.in_pack.pack = oi.u.packed.pack; - st->u.in_pack.pos = oi.u.packed.offset; - if (open_istream_pack_non_delta(st, r, oid) < 0) - break; - return 0; default: break; -- GitLab From bccc32d635c1af2a3bcc6dd25b9088484c2a3231 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 23 Nov 2025 15:55:05 +0100 Subject: [PATCH 19/33] streaming: allocate stream inside the backend-specific logic When creating a new stream we first allocate it and then call into backend-specific logic to populate the stream. This design requires that the stream itself contains a `union` with backend-specific members that then ultimately get populated by the backend-specific logic. This works, but it's awkward in the context of pluggable object databases. Each backend will need its own member in that union, and as the structure itself is completely opaque (it's only defined in "streaming.c") it also has the consequence that we must have the logic that is specific to backends in "streaming.c". Ideally though, the infrastructure would be reversed: we have a generic `struct odb_read_stream` and some helper functions in "streaming.c", whereas the backend-specific logic sits in the backend's subsystem itself. This can be realized by using a design that is similar to how we handle reference databases: instead of having a union of members, we instead have backend-specific structures with a `struct odb_read_stream base` as its first member. The backends would thus hand out the pointer to the base, but internally they know to cast back to the backend-specific type. This means though that we need to allocate different structures depending on the backend. To prepare for this, move allocation of the structure into the backend-specific functions that open a new stream. Subsequent commits will then create those new backend-specific structs. Signed-off-by: Patrick Steinhardt --- streaming.c | 103 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 65 insertions(+), 38 deletions(-) diff --git a/streaming.c b/streaming.c index bf277daadd..a2c2d88738 100644 --- a/streaming.c +++ b/streaming.c @@ -222,27 +222,34 @@ static int close_istream_loose(struct odb_read_stream *st) return 0; } -static int open_istream_loose(struct odb_read_stream *st, struct repository *r, +static int open_istream_loose(struct odb_read_stream **out, + struct repository *r, const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; + struct odb_read_stream *st; struct odb_source *source; - - oi.sizep = &st->size; - oi.typep = &st->type; + unsigned long mapsize; + void *mapped; odb_prepare_alternates(r->objects); for (source = r->objects->sources; source; source = source->next) { - st->u.loose.mapped = odb_source_loose_map_object(source, oid, - &st->u.loose.mapsize); - if (st->u.loose.mapped) + mapped = odb_source_loose_map_object(source, oid, &mapsize); + if (mapped) break; } - if (!st->u.loose.mapped) + if (!mapped) return -1; - switch (unpack_loose_header(&st->z, st->u.loose.mapped, - st->u.loose.mapsize, st->u.loose.hdr, + /* + * Note: we must allocate this structure early even though we may still + * fail. This is because we need to initialize the zlib stream, and it + * is not possible to copy the stream around after the fact because it + * has self-referencing pointers. + */ + CALLOC_ARRAY(st, 1); + + switch (unpack_loose_header(&st->z, mapped, mapsize, st->u.loose.hdr, sizeof(st->u.loose.hdr))) { case ULHR_OK: break; @@ -250,19 +257,28 @@ static int open_istream_loose(struct odb_read_stream *st, struct repository *r, case ULHR_TOO_LONG: goto error; } + + oi.sizep = &st->size; + oi.typep = &st->type; + if (parse_loose_header(st->u.loose.hdr, &oi) < 0 || st->type < 0) goto error; + st->u.loose.mapped = mapped; + st->u.loose.mapsize = mapsize; st->u.loose.hdr_used = strlen(st->u.loose.hdr) + 1; st->u.loose.hdr_avail = st->z.total_out; st->z_state = z_used; st->close = close_istream_loose; st->read = read_istream_loose; + *out = st; + return 0; error: git_inflate_end(&st->z); munmap(st->u.loose.mapped, st->u.loose.mapsize); + free(st); return -1; } @@ -338,12 +354,16 @@ static int close_istream_pack_non_delta(struct odb_read_stream *st) return 0; } -static int open_istream_pack_non_delta(struct odb_read_stream *st, +static int open_istream_pack_non_delta(struct odb_read_stream **out, struct repository *r UNUSED, const struct object_id *oid UNUSED, struct packed_git *pack, off_t offset) { + struct odb_read_stream stream = { + .close = close_istream_pack_non_delta, + .read = read_istream_pack_non_delta, + }; struct pack_window *window; enum object_type in_pack_type; @@ -352,7 +372,7 @@ static int open_istream_pack_non_delta(struct odb_read_stream *st, in_pack_type = unpack_object_header(pack, &window, &offset, - &st->size); + &stream.size); unuse_pack(&window); switch (in_pack_type) { default: @@ -363,12 +383,13 @@ static int open_istream_pack_non_delta(struct odb_read_stream *st, case OBJ_TAG: break; } - st->type = in_pack_type; - st->z_state = z_unused; - st->close = close_istream_pack_non_delta; - st->read = read_istream_pack_non_delta; - st->u.in_pack.pack = pack; - st->u.in_pack.pos = offset; + stream.type = in_pack_type; + stream.z_state = z_unused; + stream.u.in_pack.pack = pack; + stream.u.in_pack.pos = offset; + + CALLOC_ARRAY(*out, 1); + **out = stream; return 0; } @@ -400,27 +421,35 @@ static ssize_t read_istream_incore(struct odb_read_stream *st, char *buf, size_t return read_size; } -static int open_istream_incore(struct odb_read_stream *st, struct repository *r, +static int open_istream_incore(struct odb_read_stream **out, + struct repository *r, const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; - - st->u.incore.read_ptr = 0; - st->close = close_istream_incore; - st->read = read_istream_incore; - - oi.typep = &st->type; - oi.sizep = &st->size; - oi.contentp = (void **)&st->u.incore.buf; - return odb_read_object_info_extended(r->objects, oid, &oi, - OBJECT_INFO_DIE_IF_CORRUPT); + struct odb_read_stream stream = { + .close = close_istream_incore, + .read = read_istream_incore, + }; + int ret; + + oi.typep = &stream.type; + oi.sizep = &stream.size; + oi.contentp = (void **)&stream.u.incore.buf; + ret = odb_read_object_info_extended(r->objects, oid, &oi, + OBJECT_INFO_DIE_IF_CORRUPT); + if (ret) + return ret; + + CALLOC_ARRAY(*out, 1); + **out = stream; + return 0; } /***************************************************************************** * static helpers variables and functions for users of streaming interface *****************************************************************************/ -static int istream_source(struct odb_read_stream *st, +static int istream_source(struct odb_read_stream **out, struct repository *r, const struct object_id *oid) { @@ -435,13 +464,13 @@ static int istream_source(struct odb_read_stream *st, switch (oi.whence) { case OI_LOOSE: - if (open_istream_loose(st, r, oid) < 0) + if (open_istream_loose(out, r, oid) < 0) break; return 0; case OI_PACKED: if (oi.u.packed.is_delta || repo_settings_get_big_file_threshold(the_repository) >= size || - open_istream_pack_non_delta(st, r, oid, oi.u.packed.pack, + open_istream_pack_non_delta(out, r, oid, oi.u.packed.pack, oi.u.packed.offset) < 0) break; return 0; @@ -449,7 +478,7 @@ static int istream_source(struct odb_read_stream *st, break; } - return open_istream_incore(st, r, oid); + return open_istream_incore(out, r, oid); } /**************************************************************** @@ -474,14 +503,12 @@ struct odb_read_stream *open_istream(struct repository *r, unsigned long *size, struct stream_filter *filter) { - struct odb_read_stream *st = xmalloc(sizeof(*st)); + struct odb_read_stream *st; const struct object_id *real = lookup_replace_object(r, oid); - int ret = istream_source(st, r, real); + int ret = istream_source(&st, r, real); - if (ret) { - free(st); + if (ret) return NULL; - } if (filter) { /* Add "&& !is_null_stream_filter(filter)" for performance */ -- GitLab From 5823ff1f0803d615d9bba76e619e8be23ec06805 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 09:50:48 +0200 Subject: [PATCH 20/33] streaming: create structure for in-core object streams As explained in a preceding commit, we want to get rid of the union of stream-type specific data in `struct odb_read_stream`. Create a new structure for in-core object streams to move towards this design. Signed-off-by: Patrick Steinhardt --- streaming.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/streaming.c b/streaming.c index a2c2d88738..35307d7229 100644 --- a/streaming.c +++ b/streaming.c @@ -39,11 +39,6 @@ struct odb_read_stream { enum { z_unused, z_used, z_done, z_error } z_state; union { - struct { - char *buf; /* from odb_read_object_info_extended() */ - unsigned long read_ptr; - } incore; - struct { void *mapped; unsigned long mapsize; @@ -401,22 +396,30 @@ static int open_istream_pack_non_delta(struct odb_read_stream **out, * *****************************************************************/ -static int close_istream_incore(struct odb_read_stream *st) +struct odb_incore_read_stream { + struct odb_read_stream base; + char *buf; /* from odb_read_object_info_extended() */ + unsigned long read_ptr; +}; + +static int close_istream_incore(struct odb_read_stream *_st) { - free(st->u.incore.buf); + struct odb_incore_read_stream *st = (struct odb_incore_read_stream *)_st; + free(st->buf); return 0; } -static ssize_t read_istream_incore(struct odb_read_stream *st, char *buf, size_t sz) +static ssize_t read_istream_incore(struct odb_read_stream *_st, char *buf, size_t sz) { + struct odb_incore_read_stream *st = (struct odb_incore_read_stream *)_st; size_t read_size = sz; - size_t remainder = st->size - st->u.incore.read_ptr; + size_t remainder = st->base.size - st->read_ptr; if (remainder <= read_size) read_size = remainder; if (read_size) { - memcpy(buf, st->u.incore.buf + st->u.incore.read_ptr, read_size); - st->u.incore.read_ptr += read_size; + memcpy(buf, st->buf + st->read_ptr, read_size); + st->read_ptr += read_size; } return read_size; } @@ -426,22 +429,25 @@ static int open_istream_incore(struct odb_read_stream **out, const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; - struct odb_read_stream stream = { - .close = close_istream_incore, - .read = read_istream_incore, + struct odb_incore_read_stream stream = { + .base.close = close_istream_incore, + .base.read = read_istream_incore, }; + struct odb_incore_read_stream *st; int ret; - oi.typep = &stream.type; - oi.sizep = &stream.size; - oi.contentp = (void **)&stream.u.incore.buf; + oi.typep = &stream.base.type; + oi.sizep = &stream.base.size; + oi.contentp = (void **)&stream.buf; ret = odb_read_object_info_extended(r->objects, oid, &oi, OBJECT_INFO_DIE_IF_CORRUPT); if (ret) return ret; - CALLOC_ARRAY(*out, 1); - **out = stream; + CALLOC_ARRAY(st, 1); + *st = stream; + *out = &st->base; + return 0; } -- GitLab From 95784e3d4a46fe4a0b8cb0ad685c8686d0ef5a82 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 09:50:48 +0200 Subject: [PATCH 21/33] streaming: create structure for loose object streams As explained in a preceding commit, we want to get rid of the union of stream-type specific data in `struct odb_read_stream`. Create a new structure for loose object streams to move towards this design. Signed-off-by: Patrick Steinhardt --- streaming.c | 85 +++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/streaming.c b/streaming.c index 35307d7229..ac7b3026f5 100644 --- a/streaming.c +++ b/streaming.c @@ -39,14 +39,6 @@ struct odb_read_stream { enum { z_unused, z_used, z_done, z_error } z_state; union { - struct { - void *mapped; - unsigned long mapsize; - char hdr[32]; - int hdr_avail; - int hdr_used; - } loose; - struct { struct packed_git *pack; off_t pos; @@ -165,11 +157,21 @@ static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, * *****************************************************************/ -static ssize_t read_istream_loose(struct odb_read_stream *st, char *buf, size_t sz) +struct odb_loose_read_stream { + struct odb_read_stream base; + void *mapped; + unsigned long mapsize; + char hdr[32]; + int hdr_avail; + int hdr_used; +}; + +static ssize_t read_istream_loose(struct odb_read_stream *_st, char *buf, size_t sz) { + struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; size_t total_read = 0; - switch (st->z_state) { + switch (st->base.z_state) { case z_done: return 0; case z_error: @@ -178,42 +180,43 @@ static ssize_t read_istream_loose(struct odb_read_stream *st, char *buf, size_t break; } - if (st->u.loose.hdr_used < st->u.loose.hdr_avail) { - size_t to_copy = st->u.loose.hdr_avail - st->u.loose.hdr_used; + if (st->hdr_used < st->hdr_avail) { + size_t to_copy = st->hdr_avail - st->hdr_used; if (sz < to_copy) to_copy = sz; - memcpy(buf, st->u.loose.hdr + st->u.loose.hdr_used, to_copy); - st->u.loose.hdr_used += to_copy; + memcpy(buf, st->hdr + st->hdr_used, to_copy); + st->hdr_used += to_copy; total_read += to_copy; } while (total_read < sz) { int status; - st->z.next_out = (unsigned char *)buf + total_read; - st->z.avail_out = sz - total_read; - status = git_inflate(&st->z, Z_FINISH); + st->base.z.next_out = (unsigned char *)buf + total_read; + st->base.z.avail_out = sz - total_read; + status = git_inflate(&st->base.z, Z_FINISH); - total_read = st->z.next_out - (unsigned char *)buf; + total_read = st->base.z.next_out - (unsigned char *)buf; if (status == Z_STREAM_END) { - git_inflate_end(&st->z); - st->z_state = z_done; + git_inflate_end(&st->base.z); + st->base.z_state = z_done; break; } if (status != Z_OK && (status != Z_BUF_ERROR || total_read < sz)) { - git_inflate_end(&st->z); - st->z_state = z_error; + git_inflate_end(&st->base.z); + st->base.z_state = z_error; return -1; } } return total_read; } -static int close_istream_loose(struct odb_read_stream *st) +static int close_istream_loose(struct odb_read_stream *_st) { - close_deflated_stream(st); - munmap(st->u.loose.mapped, st->u.loose.mapsize); + struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; + close_deflated_stream(&st->base); + munmap(st->mapped, st->mapsize); return 0; } @@ -222,7 +225,7 @@ static int open_istream_loose(struct odb_read_stream **out, const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; - struct odb_read_stream *st; + struct odb_loose_read_stream *st; struct odb_source *source; unsigned long mapsize; void *mapped; @@ -244,8 +247,8 @@ static int open_istream_loose(struct odb_read_stream **out, */ CALLOC_ARRAY(st, 1); - switch (unpack_loose_header(&st->z, mapped, mapsize, st->u.loose.hdr, - sizeof(st->u.loose.hdr))) { + switch (unpack_loose_header(&st->base.z, mapped, mapsize, st->hdr, + sizeof(st->hdr))) { case ULHR_OK: break; case ULHR_BAD: @@ -253,26 +256,26 @@ static int open_istream_loose(struct odb_read_stream **out, goto error; } - oi.sizep = &st->size; - oi.typep = &st->type; + oi.sizep = &st->base.size; + oi.typep = &st->base.type; - if (parse_loose_header(st->u.loose.hdr, &oi) < 0 || st->type < 0) + if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0) goto error; - st->u.loose.mapped = mapped; - st->u.loose.mapsize = mapsize; - st->u.loose.hdr_used = strlen(st->u.loose.hdr) + 1; - st->u.loose.hdr_avail = st->z.total_out; - st->z_state = z_used; - st->close = close_istream_loose; - st->read = read_istream_loose; + st->mapped = mapped; + st->mapsize = mapsize; + st->hdr_used = strlen(st->hdr) + 1; + st->hdr_avail = st->base.z.total_out; + st->base.z_state = z_used; + st->base.close = close_istream_loose; + st->base.read = read_istream_loose; - *out = st; + *out = &st->base; return 0; error: - git_inflate_end(&st->z); - munmap(st->u.loose.mapped, st->u.loose.mapsize); + git_inflate_end(&st->base.z); + munmap(st->mapped, st->mapsize); free(st); return -1; } -- GitLab From 543acbe2944227349ed3981df135c48bc3ddd720 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 09:50:48 +0200 Subject: [PATCH 22/33] streaming: create structure for packed object streams As explained in a preceding commit, we want to get rid of the union of stream-type specific data in `struct odb_read_stream`. Create a new structure for packed object streams to move towards this design. Signed-off-by: Patrick Steinhardt --- streaming.c | 75 ++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/streaming.c b/streaming.c index ac7b3026f5..788f04e83e 100644 --- a/streaming.c +++ b/streaming.c @@ -39,11 +39,6 @@ struct odb_read_stream { enum { z_unused, z_used, z_done, z_error } z_state; union { - struct { - struct packed_git *pack; - off_t pos; - } in_pack; - struct filtered_istream filtered; } u; }; @@ -287,16 +282,23 @@ static int open_istream_loose(struct odb_read_stream **out, * *****************************************************************/ -static ssize_t read_istream_pack_non_delta(struct odb_read_stream *st, char *buf, +struct odb_packed_read_stream { + struct odb_read_stream base; + struct packed_git *pack; + off_t pos; +}; + +static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *buf, size_t sz) { + struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; size_t total_read = 0; - switch (st->z_state) { + switch (st->base.z_state) { case z_unused: - memset(&st->z, 0, sizeof(st->z)); - git_inflate_init(&st->z); - st->z_state = z_used; + memset(&st->base.z, 0, sizeof(st->base.z)); + git_inflate_init(&st->base.z); + st->base.z_state = z_used; break; case z_done: return 0; @@ -311,21 +313,21 @@ static ssize_t read_istream_pack_non_delta(struct odb_read_stream *st, char *buf struct pack_window *window = NULL; unsigned char *mapped; - mapped = use_pack(st->u.in_pack.pack, &window, - st->u.in_pack.pos, &st->z.avail_in); + mapped = use_pack(st->pack, &window, + st->pos, &st->base.z.avail_in); - st->z.next_out = (unsigned char *)buf + total_read; - st->z.avail_out = sz - total_read; - st->z.next_in = mapped; - status = git_inflate(&st->z, Z_FINISH); + st->base.z.next_out = (unsigned char *)buf + total_read; + st->base.z.avail_out = sz - total_read; + st->base.z.next_in = mapped; + status = git_inflate(&st->base.z, Z_FINISH); - st->u.in_pack.pos += st->z.next_in - mapped; - total_read = st->z.next_out - (unsigned char *)buf; + st->pos += st->base.z.next_in - mapped; + total_read = st->base.z.next_out - (unsigned char *)buf; unuse_pack(&window); if (status == Z_STREAM_END) { - git_inflate_end(&st->z); - st->z_state = z_done; + git_inflate_end(&st->base.z); + st->base.z_state = z_done; break; } @@ -338,17 +340,18 @@ static ssize_t read_istream_pack_non_delta(struct odb_read_stream *st, char *buf * or truncated), then use_pack() catches that and will die(). */ if (status != Z_OK && status != Z_BUF_ERROR) { - git_inflate_end(&st->z); - st->z_state = z_error; + git_inflate_end(&st->base.z); + st->base.z_state = z_error; return -1; } } return total_read; } -static int close_istream_pack_non_delta(struct odb_read_stream *st) +static int close_istream_pack_non_delta(struct odb_read_stream *_st) { - close_deflated_stream(st); + struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; + close_deflated_stream(&st->base); return 0; } @@ -358,19 +361,17 @@ static int open_istream_pack_non_delta(struct odb_read_stream **out, struct packed_git *pack, off_t offset) { - struct odb_read_stream stream = { - .close = close_istream_pack_non_delta, - .read = read_istream_pack_non_delta, - }; + struct odb_packed_read_stream *stream; struct pack_window *window; enum object_type in_pack_type; + size_t size; window = NULL; in_pack_type = unpack_object_header(pack, &window, &offset, - &stream.size); + &size); unuse_pack(&window); switch (in_pack_type) { default: @@ -381,13 +382,17 @@ static int open_istream_pack_non_delta(struct odb_read_stream **out, case OBJ_TAG: break; } - stream.type = in_pack_type; - stream.z_state = z_unused; - stream.u.in_pack.pack = pack; - stream.u.in_pack.pos = offset; - CALLOC_ARRAY(*out, 1); - **out = stream; + CALLOC_ARRAY(stream, 1); + stream->base.close = close_istream_pack_non_delta; + stream->base.read = read_istream_pack_non_delta; + stream->base.type = in_pack_type; + stream->base.size = size; + stream->base.z_state = z_unused; + stream->pack = pack; + stream->pos = offset; + + *out = &stream->base; return 0; } -- GitLab From cfc5c44247ca693dbfb3a1fb5288a1f39c11f468 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 09:50:48 +0200 Subject: [PATCH 23/33] streaming: create structure for filtered object streams As explained in a preceding commit, we want to get rid of the union of stream-type specific data in `struct odb_read_stream`. Create a new structure for filtered object streams to move towards this design. Signed-off-by: Patrick Steinhardt --- streaming.c | 54 +++++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/streaming.c b/streaming.c index 788f04e83e..199cca5abb 100644 --- a/streaming.c +++ b/streaming.c @@ -19,16 +19,6 @@ typedef ssize_t (*read_istream_fn)(struct odb_read_stream *, char *, size_t); #define FILTER_BUFFER (1024*16) -struct filtered_istream { - struct odb_read_stream *upstream; - struct stream_filter *filter; - char ibuf[FILTER_BUFFER]; - char obuf[FILTER_BUFFER]; - int i_end, i_ptr; - int o_end, o_ptr; - int input_finished; -}; - struct odb_read_stream { close_istream_fn close; read_istream_fn read; @@ -37,10 +27,6 @@ struct odb_read_stream { unsigned long size; /* inflated size of full object */ git_zstream z; enum { z_unused, z_used, z_done, z_error } z_state; - - union { - struct filtered_istream filtered; - } u; }; /***************************************************************** @@ -62,16 +48,28 @@ static void close_deflated_stream(struct odb_read_stream *st) * *****************************************************************/ -static int close_istream_filtered(struct odb_read_stream *st) +struct odb_filtered_read_stream { + struct odb_read_stream base; + struct odb_read_stream *upstream; + struct stream_filter *filter; + char ibuf[FILTER_BUFFER]; + char obuf[FILTER_BUFFER]; + int i_end, i_ptr; + int o_end, o_ptr; + int input_finished; +}; + +static int close_istream_filtered(struct odb_read_stream *_fs) { - free_stream_filter(st->u.filtered.filter); - return close_istream(st->u.filtered.upstream); + struct odb_filtered_read_stream *fs = (struct odb_filtered_read_stream *)_fs; + free_stream_filter(fs->filter); + return close_istream(fs->upstream); } -static ssize_t read_istream_filtered(struct odb_read_stream *st, char *buf, +static ssize_t read_istream_filtered(struct odb_read_stream *_fs, char *buf, size_t sz) { - struct filtered_istream *fs = &(st->u.filtered); + struct odb_filtered_read_stream *fs = (struct odb_filtered_read_stream *)_fs; size_t filled = 0; while (sz) { @@ -131,19 +129,17 @@ static ssize_t read_istream_filtered(struct odb_read_stream *st, char *buf, static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, struct stream_filter *filter) { - struct odb_read_stream *ifs = xmalloc(sizeof(*ifs)); - struct filtered_istream *fs = &(ifs->u.filtered); + struct odb_filtered_read_stream *fs; - ifs->close = close_istream_filtered; - ifs->read = read_istream_filtered; + CALLOC_ARRAY(fs, 1); + fs->base.close = close_istream_filtered; + fs->base.read = read_istream_filtered; fs->upstream = st; fs->filter = filter; - fs->i_end = fs->i_ptr = 0; - fs->o_end = fs->o_ptr = 0; - fs->input_finished = 0; - ifs->size = -1; /* unknown */ - ifs->type = st->type; - return ifs; + fs->base.size = -1; /* unknown */ + fs->base.type = st->type; + + return &fs->base; } /***************************************************************** -- GitLab From 0dbd09b136aa3282172dfd69860c0f545f79c5e1 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 10:20:36 +0200 Subject: [PATCH 24/33] streaming: move zlib stream into backends While all backend-specific data is now contained in a backend-specific structure, we still share the zlib stream across the loose and packed objects. Refactor the code and move it into the specific structures so that we fully detangle the different backends from one another. Signed-off-by: Patrick Steinhardt --- streaming.c | 104 ++++++++++++++++++++++++++-------------------------- 1 file changed, 52 insertions(+), 52 deletions(-) diff --git a/streaming.c b/streaming.c index 199cca5abb..46fddaf2ca 100644 --- a/streaming.c +++ b/streaming.c @@ -25,23 +25,8 @@ struct odb_read_stream { enum object_type type; unsigned long size; /* inflated size of full object */ - git_zstream z; - enum { z_unused, z_used, z_done, z_error } z_state; }; -/***************************************************************** - * - * Common helpers - * - *****************************************************************/ - -static void close_deflated_stream(struct odb_read_stream *st) -{ - if (st->z_state == z_used) - git_inflate_end(&st->z); -} - - /***************************************************************** * * Filtered stream @@ -150,6 +135,12 @@ static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, struct odb_loose_read_stream { struct odb_read_stream base; + git_zstream z; + enum { + ODB_LOOSE_READ_STREAM_INUSE, + ODB_LOOSE_READ_STREAM_DONE, + ODB_LOOSE_READ_STREAM_ERROR, + } z_state; void *mapped; unsigned long mapsize; char hdr[32]; @@ -162,10 +153,10 @@ static ssize_t read_istream_loose(struct odb_read_stream *_st, char *buf, size_t struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; size_t total_read = 0; - switch (st->base.z_state) { - case z_done: + switch (st->z_state) { + case ODB_LOOSE_READ_STREAM_DONE: return 0; - case z_error: + case ODB_LOOSE_READ_STREAM_ERROR: return -1; default: break; @@ -183,20 +174,20 @@ static ssize_t read_istream_loose(struct odb_read_stream *_st, char *buf, size_t while (total_read < sz) { int status; - st->base.z.next_out = (unsigned char *)buf + total_read; - st->base.z.avail_out = sz - total_read; - status = git_inflate(&st->base.z, Z_FINISH); + st->z.next_out = (unsigned char *)buf + total_read; + st->z.avail_out = sz - total_read; + status = git_inflate(&st->z, Z_FINISH); - total_read = st->base.z.next_out - (unsigned char *)buf; + total_read = st->z.next_out - (unsigned char *)buf; if (status == Z_STREAM_END) { - git_inflate_end(&st->base.z); - st->base.z_state = z_done; + git_inflate_end(&st->z); + st->z_state = ODB_LOOSE_READ_STREAM_DONE; break; } if (status != Z_OK && (status != Z_BUF_ERROR || total_read < sz)) { - git_inflate_end(&st->base.z); - st->base.z_state = z_error; + git_inflate_end(&st->z); + st->z_state = ODB_LOOSE_READ_STREAM_ERROR; return -1; } } @@ -206,7 +197,8 @@ static ssize_t read_istream_loose(struct odb_read_stream *_st, char *buf, size_t static int close_istream_loose(struct odb_read_stream *_st) { struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; - close_deflated_stream(&st->base); + if (st->z_state == ODB_LOOSE_READ_STREAM_INUSE) + git_inflate_end(&st->z); munmap(st->mapped, st->mapsize); return 0; } @@ -238,7 +230,7 @@ static int open_istream_loose(struct odb_read_stream **out, */ CALLOC_ARRAY(st, 1); - switch (unpack_loose_header(&st->base.z, mapped, mapsize, st->hdr, + switch (unpack_loose_header(&st->z, mapped, mapsize, st->hdr, sizeof(st->hdr))) { case ULHR_OK: break; @@ -256,8 +248,8 @@ static int open_istream_loose(struct odb_read_stream **out, st->mapped = mapped; st->mapsize = mapsize; st->hdr_used = strlen(st->hdr) + 1; - st->hdr_avail = st->base.z.total_out; - st->base.z_state = z_used; + st->hdr_avail = st->z.total_out; + st->z_state = ODB_LOOSE_READ_STREAM_INUSE; st->base.close = close_istream_loose; st->base.read = read_istream_loose; @@ -265,7 +257,7 @@ static int open_istream_loose(struct odb_read_stream **out, return 0; error: - git_inflate_end(&st->base.z); + git_inflate_end(&st->z); munmap(st->mapped, st->mapsize); free(st); return -1; @@ -281,6 +273,13 @@ static int open_istream_loose(struct odb_read_stream **out, struct odb_packed_read_stream { struct odb_read_stream base; struct packed_git *pack; + git_zstream z; + enum { + ODB_PACKED_READ_STREAM_UNINITIALIZED, + ODB_PACKED_READ_STREAM_INUSE, + ODB_PACKED_READ_STREAM_DONE, + ODB_PACKED_READ_STREAM_ERROR, + } z_state; off_t pos; }; @@ -290,17 +289,17 @@ static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *bu struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; size_t total_read = 0; - switch (st->base.z_state) { - case z_unused: - memset(&st->base.z, 0, sizeof(st->base.z)); - git_inflate_init(&st->base.z); - st->base.z_state = z_used; + switch (st->z_state) { + case ODB_PACKED_READ_STREAM_UNINITIALIZED: + memset(&st->z, 0, sizeof(st->z)); + git_inflate_init(&st->z); + st->z_state = ODB_PACKED_READ_STREAM_INUSE; break; - case z_done: + case ODB_PACKED_READ_STREAM_DONE: return 0; - case z_error: + case ODB_PACKED_READ_STREAM_ERROR: return -1; - case z_used: + case ODB_PACKED_READ_STREAM_INUSE: break; } @@ -310,20 +309,20 @@ static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *bu unsigned char *mapped; mapped = use_pack(st->pack, &window, - st->pos, &st->base.z.avail_in); + st->pos, &st->z.avail_in); - st->base.z.next_out = (unsigned char *)buf + total_read; - st->base.z.avail_out = sz - total_read; - st->base.z.next_in = mapped; - status = git_inflate(&st->base.z, Z_FINISH); + st->z.next_out = (unsigned char *)buf + total_read; + st->z.avail_out = sz - total_read; + st->z.next_in = mapped; + status = git_inflate(&st->z, Z_FINISH); - st->pos += st->base.z.next_in - mapped; - total_read = st->base.z.next_out - (unsigned char *)buf; + st->pos += st->z.next_in - mapped; + total_read = st->z.next_out - (unsigned char *)buf; unuse_pack(&window); if (status == Z_STREAM_END) { - git_inflate_end(&st->base.z); - st->base.z_state = z_done; + git_inflate_end(&st->z); + st->z_state = ODB_PACKED_READ_STREAM_DONE; break; } @@ -336,8 +335,8 @@ static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *bu * or truncated), then use_pack() catches that and will die(). */ if (status != Z_OK && status != Z_BUF_ERROR) { - git_inflate_end(&st->base.z); - st->base.z_state = z_error; + git_inflate_end(&st->z); + st->z_state = ODB_PACKED_READ_STREAM_ERROR; return -1; } } @@ -347,7 +346,8 @@ static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *bu static int close_istream_pack_non_delta(struct odb_read_stream *_st) { struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; - close_deflated_stream(&st->base); + if (st->z_state == ODB_PACKED_READ_STREAM_INUSE) + git_inflate_end(&st->z); return 0; } @@ -384,7 +384,7 @@ static int open_istream_pack_non_delta(struct odb_read_stream **out, stream->base.read = read_istream_pack_non_delta; stream->base.type = in_pack_type; stream->base.size = size; - stream->base.z_state = z_unused; + stream->z_state = ODB_PACKED_READ_STREAM_UNINITIALIZED; stream->pack = pack; stream->pos = offset; -- GitLab From 4354899ab9df7727863593f48b668b98bb27de4b Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 19 Oct 2025 15:44:20 +0200 Subject: [PATCH 25/33] packfile: introduce function to read object info from a store Extract the logic to read object info for a packed object from `do_oid_object_into_extended()` into a standalone function that operates on the packfile store. This function will be used in a subsequent commit. Note that this change allows us to make `find_pack_entry()` an internal implementation detail. As a consequence though we have to move around `packfile_store_freshen_object()` so that it is defined after that function. Signed-off-by: Patrick Steinhardt --- odb.c | 29 +++------------------- packfile.c | 71 +++++++++++++++++++++++++++++++++++++++++------------- packfile.h | 12 ++++++++- 3 files changed, 69 insertions(+), 43 deletions(-) diff --git a/odb.c b/odb.c index 3ec21ef24e..f4cbee4b04 100644 --- a/odb.c +++ b/odb.c @@ -666,8 +666,6 @@ static int do_oid_object_info_extended(struct object_database *odb, { static struct object_info blank_oi = OBJECT_INFO_INIT; const struct cached_object *co; - struct pack_entry e; - int rtype; const struct object_id *real = oid; int already_retried = 0; @@ -702,8 +700,8 @@ static int do_oid_object_info_extended(struct object_database *odb, while (1) { struct odb_source *source; - if (find_pack_entry(odb->repo, real, &e)) - break; + if (!packfile_store_read_object_info(odb->packfiles, real, oi, flags)) + return 0; /* Most likely it's a loose object. */ for (source = odb->sources; source; source = source->next) @@ -713,8 +711,8 @@ static int do_oid_object_info_extended(struct object_database *odb, /* Not a loose object; someone else may have just packed it. */ if (!(flags & OBJECT_INFO_QUICK)) { odb_reprepare(odb->repo->objects); - if (find_pack_entry(odb->repo, real, &e)) - break; + if (!packfile_store_read_object_info(odb->packfiles, real, oi, flags)) + return 0; } /* @@ -747,25 +745,6 @@ static int do_oid_object_info_extended(struct object_database *odb, } return -1; } - - if (oi == &blank_oi) - /* - * We know that the caller doesn't actually need the - * information below, so return early. - */ - return 0; - rtype = packed_object_info(odb->repo, e.p, e.offset, oi); - if (rtype < 0) { - mark_bad_packed_object(e.p, real); - return do_oid_object_info_extended(odb, real, oi, 0); - } else if (oi->whence == OI_PACKED) { - oi->u.packed.offset = e.offset; - oi->u.packed.pack = e.p; - oi->u.packed.is_delta = (rtype == OBJ_REF_DELTA || - rtype == OBJ_OFS_DELTA); - } - - return 0; } static int oid_object_info_convert(struct repository *r, diff --git a/packfile.c b/packfile.c index 40f733dd23..b4bc40d895 100644 --- a/packfile.c +++ b/packfile.c @@ -819,22 +819,6 @@ struct packed_git *packfile_store_load_pack(struct packfile_store *store, return p; } -int packfile_store_freshen_object(struct packfile_store *store, - const struct object_id *oid) -{ - struct pack_entry e; - if (!find_pack_entry(store->odb->repo, oid, &e)) - return 0; - if (e.p->is_cruft) - return 0; - if (e.p->freshened) - return 1; - if (utime(e.p->pack_name, NULL)) - return 0; - e.p->freshened = 1; - return 1; -} - void (*report_garbage)(unsigned seen_bits, const char *path); static void report_helper(const struct string_list *list, @@ -2064,7 +2048,9 @@ static int fill_pack_entry(const struct object_id *oid, return 1; } -int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e) +static int find_pack_entry(struct repository *r, + const struct object_id *oid, + struct pack_entry *e) { struct list_head *pos; @@ -2087,6 +2073,57 @@ int find_pack_entry(struct repository *r, const struct object_id *oid, struct pa return 0; } +int packfile_store_freshen_object(struct packfile_store *store, + const struct object_id *oid) +{ + struct pack_entry e; + if (!find_pack_entry(store->odb->repo, oid, &e)) + return 0; + if (e.p->is_cruft) + return 0; + if (e.p->freshened) + return 1; + if (utime(e.p->pack_name, NULL)) + return 0; + e.p->freshened = 1; + return 1; +} + +int packfile_store_read_object_info(struct packfile_store *store, + const struct object_id *oid, + struct object_info *oi, + unsigned flags UNUSED) +{ + static struct object_info blank_oi = OBJECT_INFO_INIT; + struct pack_entry e; + int rtype; + + if (!find_pack_entry(store->odb->repo, oid, &e)) + return 1; + + /* + * We know that the caller doesn't actually need the + * information below, so return early. + */ + if (oi == &blank_oi) + return 0; + + rtype = packed_object_info(store->odb->repo, e.p, e.offset, oi); + if (rtype < 0) { + mark_bad_packed_object(e.p, oid); + return -1; + } + + if (oi->whence == OI_PACKED) { + oi->u.packed.offset = e.offset; + oi->u.packed.pack = e.p; + oi->u.packed.is_delta = (rtype == OBJ_REF_DELTA || + rtype == OBJ_OFS_DELTA); + } + + return 0; +} + static void maybe_invalidate_kept_pack_cache(struct repository *r, unsigned flags) { diff --git a/packfile.h b/packfile.h index 58fcc88e20..0a98bddd81 100644 --- a/packfile.h +++ b/packfile.h @@ -144,6 +144,17 @@ void packfile_store_add_pack(struct packfile_store *store, #define repo_for_each_pack(repo, p) \ for (p = packfile_store_get_packs(repo->objects->packfiles); p; p = p->next) +/* + * Try to read the object identified by its ID from the object store and + * populate the object info with its data. Returns 1 in case the object was + * not found, 0 if it was and read successfully, and a negative error code in + * case the object was corrupted. + */ +int packfile_store_read_object_info(struct packfile_store *store, + const struct object_id *oid, + struct object_info *oi, + unsigned flags); + /* * Get all packs managed by the given store, including packfiles that are * referenced by multi-pack indices. @@ -357,7 +368,6 @@ const struct packed_git *has_packed_and_bad(struct repository *, const struct ob * Iff a pack file in the given repository contains the object named by sha1, * return true and store its location to e. */ -int find_pack_entry(struct repository *r, const struct object_id *oid, struct pack_entry *e); int find_kept_pack_entry(struct repository *r, const struct object_id *oid, unsigned flags, struct pack_entry *e); int has_object_pack(struct repository *r, const struct object_id *oid); -- GitLab From bff8328a2246353b301bac0f0351061d4abc6f95 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 09:56:01 +0100 Subject: [PATCH 26/33] streaming: rely on object sources to create object stream When creating an object stream we first look up the object info and, if it's present, we call into the respective backend that contains the object to create a new stream for it. This has the consequence that, for loose object source, we basically iterate through the object sources twice: we first discover that the file exists as a loose object in the first place by iterating through all sources. And, once we have discovered it, we again walk through all sources to try and map the object. The same issue will eventually also surface once the packfile store becomes per-object-source. Furthermore, it feels rather pointless to first look up the object only to then try and read it. Refactor the logic to be centered around sources instead. Instead of first reading the object, we immediately ask the source to create the object stream for us. If the object exists we get stream, otherwise we'll try the next source. Like this we only have to iterate through sources once. But even more importantly, this change also helps us to make the whole logic pluggable. The object read stream subsystem does not need to be aware of the different source backends anymore, but eventually it'll only have to call the source's callback function. Note that at the current point in time we aren't fully there yet: - The packfile store still sits on the object database level and is thus agnostic of the sources. - We still have to call into both the packfile store and the loose object source. But both of these issues will soon be addressed. This refactoring results in a slight change to semantics: previously, it was `odb_read_object_info_extended()` that picked the source for us, and it would have favored packed (non-deltified) objects over loose objects. And while we still favor packed over loose objects for a single source with the new logic, we'll now favor a loose object from an earlier source over a packed object from a later source. Ultimately this shouldn't matter though: the stream doesn't indicate to the caller which source it is from and whether it was created from a packed or loose object, so such details are opaque to the caller. And other than that we should be able to assume that two objects with the same object ID should refer to the same content, so the streamed data would be the same, too. Signed-off-by: Patrick Steinhardt --- streaming.c | 65 ++++++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/streaming.c b/streaming.c index 46fddaf2ca..f0f7d31956 100644 --- a/streaming.c +++ b/streaming.c @@ -204,21 +204,15 @@ static int close_istream_loose(struct odb_read_stream *_st) } static int open_istream_loose(struct odb_read_stream **out, - struct repository *r, + struct odb_source *source, const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; struct odb_loose_read_stream *st; - struct odb_source *source; unsigned long mapsize; void *mapped; - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) { - mapped = odb_source_loose_map_object(source, oid, &mapsize); - if (mapped) - break; - } + mapped = odb_source_loose_map_object(source, oid, &mapsize); if (!mapped) return -1; @@ -352,21 +346,25 @@ static int close_istream_pack_non_delta(struct odb_read_stream *_st) } static int open_istream_pack_non_delta(struct odb_read_stream **out, - struct repository *r UNUSED, - const struct object_id *oid UNUSED, - struct packed_git *pack, - off_t offset) + struct object_database *odb, + const struct object_id *oid) { struct odb_packed_read_stream *stream; - struct pack_window *window; + struct pack_window *window = NULL; + struct object_info oi = OBJECT_INFO_INIT; enum object_type in_pack_type; - size_t size; + unsigned long size; - window = NULL; + oi.sizep = &size; + + if (packfile_store_read_object_info(odb->packfiles, oid, &oi, 0) || + oi.u.packed.is_delta || + repo_settings_get_big_file_threshold(the_repository) >= size) + return -1; - in_pack_type = unpack_object_header(pack, + in_pack_type = unpack_object_header(oi.u.packed.pack, &window, - &offset, + &oi.u.packed.offset, &size); unuse_pack(&window); switch (in_pack_type) { @@ -385,8 +383,8 @@ static int open_istream_pack_non_delta(struct odb_read_stream **out, stream->base.type = in_pack_type; stream->base.size = size; stream->z_state = ODB_PACKED_READ_STREAM_UNINITIALIZED; - stream->pack = pack; - stream->pos = offset; + stream->pack = oi.u.packed.pack; + stream->pos = oi.u.packed.offset; *out = &stream->base; @@ -463,30 +461,15 @@ static int istream_source(struct odb_read_stream **out, struct repository *r, const struct object_id *oid) { - unsigned long size; - int status; - struct object_info oi = OBJECT_INFO_INIT; - - oi.sizep = &size; - status = odb_read_object_info_extended(r->objects, oid, &oi, 0); - if (status < 0) - return status; + struct odb_source *source; - switch (oi.whence) { - case OI_LOOSE: - if (open_istream_loose(out, r, oid) < 0) - break; - return 0; - case OI_PACKED: - if (oi.u.packed.is_delta || - repo_settings_get_big_file_threshold(the_repository) >= size || - open_istream_pack_non_delta(out, r, oid, oi.u.packed.pack, - oi.u.packed.offset) < 0) - break; + if (!open_istream_pack_non_delta(out, r->objects, oid)) return 0; - default: - break; - } + + odb_prepare_alternates(r->objects); + for (source = r->objects->sources; source; source = source->next) + if (!open_istream_loose(out, source, oid)) + return 0; return open_istream_incore(out, r, oid); } -- GitLab From e3fe839602bb9ea963d8ae90521dfea9c0435064 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 10:54:37 +0200 Subject: [PATCH 27/33] streaming: get rid of `the_repository` Subsequent commits will move the backend-specific logic of object streaming into their respective subsystems. These subsystems have gotten rid of `the_repository` already, but we still use it in two locations in the streaming subsystem. Prepare for the move by fixing those two cases. Converting the logic in `open_istream_pack_non_delta()` is trivial as we already got the object database as input. But for `stream_blob_to_fd()` we have to add a new parameter to make it accessible. So, as we already have to adjust all callers anyway, rename the function to `odb_stream_blob_to_fd()` to indicate it's part of the object subsystem. Signed-off-by: Patrick Steinhardt --- builtin/cat-file.c | 2 +- builtin/fsck.c | 3 ++- builtin/log.c | 4 ++-- entry.c | 2 +- parallel-checkout.c | 3 ++- streaming.c | 13 +++++++------ streaming.h | 18 +++++++++++++++++- 7 files changed, 32 insertions(+), 13 deletions(-) diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 983ecec837..120d626d66 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -95,7 +95,7 @@ static int filter_object(const char *path, unsigned mode, static int stream_blob(const struct object_id *oid) { - if (stream_blob_to_fd(1, oid, NULL, 0)) + if (odb_stream_blob_to_fd(the_repository->objects, 1, oid, NULL, 0)) die("unable to stream %s to stdout", oid_to_hex(oid)); return 0; } diff --git a/builtin/fsck.c b/builtin/fsck.c index b1a650c673..1a348d43c2 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -340,7 +340,8 @@ static void check_unreachable_object(struct object *obj) } f = xfopen(filename, "w"); if (obj->type == OBJ_BLOB) { - if (stream_blob_to_fd(fileno(f), &obj->oid, NULL, 1)) + if (odb_stream_blob_to_fd(the_repository->objects, fileno(f), + &obj->oid, NULL, 1)) die_errno(_("could not write '%s'"), filename); } else fprintf(f, "%s\n", describe_object(&obj->oid)); diff --git a/builtin/log.c b/builtin/log.c index c8319b8af3..e7b83a6e00 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -584,7 +584,7 @@ static int show_blob_object(const struct object_id *oid, struct rev_info *rev, c fflush(rev->diffopt.file); if (!rev->diffopt.flags.textconv_set_via_cmdline || !rev->diffopt.flags.allow_textconv) - return stream_blob_to_fd(1, oid, NULL, 0); + return odb_stream_blob_to_fd(the_repository->objects, 1, oid, NULL, 0); if (get_oid_with_context(the_repository, obj_name, GET_OID_RECORD_PATH, @@ -594,7 +594,7 @@ static int show_blob_object(const struct object_id *oid, struct rev_info *rev, c !textconv_object(the_repository, obj_context.path, obj_context.mode, &oidc, 1, &buf, &size)) { object_context_release(&obj_context); - return stream_blob_to_fd(1, oid, NULL, 0); + return odb_stream_blob_to_fd(the_repository->objects, 1, oid, NULL, 0); } if (!buf) diff --git a/entry.c b/entry.c index cae02eb503..38dfe670f7 100644 --- a/entry.c +++ b/entry.c @@ -139,7 +139,7 @@ static int streaming_write_entry(const struct cache_entry *ce, char *path, if (fd < 0) return -1; - result |= stream_blob_to_fd(fd, &ce->oid, filter, 1); + result |= odb_stream_blob_to_fd(the_repository->objects, fd, &ce->oid, filter, 1); *fstat_done = fstat_checkout_output(fd, state, statbuf); result |= close(fd); diff --git a/parallel-checkout.c b/parallel-checkout.c index fba6aa65a6..1cb6701b92 100644 --- a/parallel-checkout.c +++ b/parallel-checkout.c @@ -281,7 +281,8 @@ static int write_pc_item_to_fd(struct parallel_checkout_item *pc_item, int fd, filter = get_stream_filter_ca(&pc_item->ca, &pc_item->ce->oid); if (filter) { - if (stream_blob_to_fd(fd, &pc_item->ce->oid, filter, 1)) { + if (odb_stream_blob_to_fd(the_repository->objects, fd, + &pc_item->ce->oid, filter, 1)) { /* On error, reset fd to try writing without streaming */ if (reset_fd(fd, path)) return -1; diff --git a/streaming.c b/streaming.c index f0f7d31956..807a6e03a8 100644 --- a/streaming.c +++ b/streaming.c @@ -2,8 +2,6 @@ * Copyright (c) 2011, Google Inc. */ -#define USE_THE_REPOSITORY_VARIABLE - #include "git-compat-util.h" #include "convert.h" #include "environment.h" @@ -359,7 +357,7 @@ static int open_istream_pack_non_delta(struct odb_read_stream **out, if (packfile_store_read_object_info(odb->packfiles, oid, &oi, 0) || oi.u.packed.is_delta || - repo_settings_get_big_file_threshold(the_repository) >= size) + repo_settings_get_big_file_threshold(odb->repo) >= size) return -1; in_pack_type = unpack_object_header(oi.u.packed.pack, @@ -518,8 +516,11 @@ struct odb_read_stream *open_istream(struct repository *r, return st; } -int stream_blob_to_fd(int fd, const struct object_id *oid, struct stream_filter *filter, - int can_seek) +int odb_stream_blob_to_fd(struct object_database *odb, + int fd, + const struct object_id *oid, + struct stream_filter *filter, + int can_seek) { struct odb_read_stream *st; enum object_type type; @@ -527,7 +528,7 @@ int stream_blob_to_fd(int fd, const struct object_id *oid, struct stream_filter ssize_t kept = 0; int result = -1; - st = open_istream(the_repository, oid, &type, &sz, filter); + st = open_istream(odb->repo, oid, &type, &sz, filter); if (!st) { if (filter) free_stream_filter(filter); diff --git a/streaming.h b/streaming.h index f5ff5d7ac9..148f6b3069 100644 --- a/streaming.h +++ b/streaming.h @@ -6,6 +6,7 @@ #include "object.h" +struct object_database; /* opaque */ struct odb_read_stream; struct stream_filter; @@ -16,6 +17,21 @@ struct odb_read_stream *open_istream(struct repository *, const struct object_id int close_istream(struct odb_read_stream *); ssize_t read_istream(struct odb_read_stream *, void *, size_t); -int stream_blob_to_fd(int fd, const struct object_id *, struct stream_filter *, int can_seek); +/* + * Look up the object by its ID and write the full contents to the file + * descriptor. The object must be a blob, or the function will fail. When + * provided, the filter is used to transform the blob contents. + * + * `can_seek` should be set to 1 in case the given file descriptor can be + * seek(3p)'d on. This is used to support files with holes in case a + * significant portion of the blob contains NUL bytes. + * + * Returns a negative error code on failure, 0 on success. + */ +int odb_stream_blob_to_fd(struct object_database *odb, + int fd, + const struct object_id *oid, + struct stream_filter *filter, + int can_seek); #endif /* STREAMING_H */ -- GitLab From cd079b6d47ae2990f18a5341e54d0f924017aab5 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 10:55:16 +0200 Subject: [PATCH 28/33] streaming: make the `odb_read_stream` definition public Subsequent commits will move the backend-specific logic of setting up an object read stream into the specific subsystems. As the backends are now the ones that are responsible for allocating the stream they'll need to have the stream definition available to them. Make the stream definition public to prepare for this. Signed-off-by: Patrick Steinhardt --- streaming.c | 11 ----------- streaming.h | 15 ++++++++++++++- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/streaming.c b/streaming.c index 807a6e03a8..0635b7c12e 100644 --- a/streaming.c +++ b/streaming.c @@ -12,19 +12,8 @@ #include "replace-object.h" #include "packfile.h" -typedef int (*close_istream_fn)(struct odb_read_stream *); -typedef ssize_t (*read_istream_fn)(struct odb_read_stream *, char *, size_t); - #define FILTER_BUFFER (1024*16) -struct odb_read_stream { - close_istream_fn close; - read_istream_fn read; - - enum object_type type; - unsigned long size; /* inflated size of full object */ -}; - /***************************************************************** * * Filtered stream diff --git a/streaming.h b/streaming.h index 148f6b3069..acfdef1598 100644 --- a/streaming.h +++ b/streaming.h @@ -7,10 +7,23 @@ #include "object.h" struct object_database; -/* opaque */ struct odb_read_stream; struct stream_filter; +typedef int (*odb_read_stream_close_fn)(struct odb_read_stream *); +typedef ssize_t (*odb_read_stream_read_fn)(struct odb_read_stream *, char *, size_t); + +/* + * A stream that can be used to read an object from the object database without + * loading all of it into memory. + */ +struct odb_read_stream { + odb_read_stream_close_fn close; + odb_read_stream_read_fn read; + enum object_type type; + unsigned long size; /* inflated size of full object */ +}; + struct odb_read_stream *open_istream(struct repository *, const struct object_id *, enum object_type *, unsigned long *, struct stream_filter *); -- GitLab From e2c2d3857a9311e84e5a5fa38baa5d72c15fa71c Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Mon, 3 Nov 2025 10:05:07 +0100 Subject: [PATCH 29/33] streaming: move logic to read loose objects streams into backend Move the logic to read loose object streams into the respective subsystem. This allows us to make a couple of function declarations private. Signed-off-by: Patrick Steinhardt --- object-file.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++--- object-file.h | 42 ++----------- streaming.c | 133 +--------------------------------------- 3 files changed, 164 insertions(+), 178 deletions(-) diff --git a/object-file.c b/object-file.c index b62b21a452..8c67847fea 100644 --- a/object-file.c +++ b/object-file.c @@ -234,9 +234,9 @@ static void *map_fd(int fd, const char *path, unsigned long *size) return map; } -void *odb_source_loose_map_object(struct odb_source *source, - const struct object_id *oid, - unsigned long *size) +static void *odb_source_loose_map_object(struct odb_source *source, + const struct object_id *oid, + unsigned long *size) { const char *p; int fd = open_loose_object(source->loose, oid, &p); @@ -246,11 +246,29 @@ void *odb_source_loose_map_object(struct odb_source *source, return map_fd(fd, p, size); } -enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, - unsigned char *map, - unsigned long mapsize, - void *buffer, - unsigned long bufsiz) +enum unpack_loose_header_result { + ULHR_OK, + ULHR_BAD, + ULHR_TOO_LONG, +}; + +/** + * unpack_loose_header() initializes the data stream needed to unpack + * a loose object header. + * + * Returns: + * + * - ULHR_OK on success + * - ULHR_BAD on error + * - ULHR_TOO_LONG if the header was too long + * + * It will only parse up to MAX_HEADER_LEN bytes. + */ +static enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, + unsigned char *map, + unsigned long mapsize, + void *buffer, + unsigned long bufsiz) { int status; @@ -329,11 +347,18 @@ static void *unpack_loose_rest(git_zstream *stream, } /* + * parse_loose_header() parses the starting " \0" of an + * object. If it doesn't follow that format -1 is returned. To check + * the validity of the populate the "typep" in the "struct + * object_info". It will be OBJ_BAD if the object type is unknown. The + * parsed can be retrieved via "oi->sizep", and from there + * passed to unpack_loose_rest(). + * * We used to just use "sscanf()", but that's actually way * too permissive for what we want to check. So do an anal * object header parse by hand. */ -int parse_loose_header(const char *hdr, struct object_info *oi) +static int parse_loose_header(const char *hdr, struct object_info *oi) { const char *type_buf = hdr; size_t size; @@ -1976,3 +2001,127 @@ void odb_source_loose_free(struct odb_source_loose *loose) loose_object_map_clear(&loose->map); free(loose); } + +struct odb_loose_read_stream { + struct odb_read_stream base; + git_zstream z; + enum { + ODB_LOOSE_READ_STREAM_INUSE, + ODB_LOOSE_READ_STREAM_DONE, + ODB_LOOSE_READ_STREAM_ERROR, + } z_state; + void *mapped; + unsigned long mapsize; + char hdr[32]; + int hdr_avail; + int hdr_used; +}; + +static ssize_t read_istream_loose(struct odb_read_stream *_st, char *buf, size_t sz) +{ + struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; + size_t total_read = 0; + + switch (st->z_state) { + case ODB_LOOSE_READ_STREAM_DONE: + return 0; + case ODB_LOOSE_READ_STREAM_ERROR: + return -1; + default: + break; + } + + if (st->hdr_used < st->hdr_avail) { + size_t to_copy = st->hdr_avail - st->hdr_used; + if (sz < to_copy) + to_copy = sz; + memcpy(buf, st->hdr + st->hdr_used, to_copy); + st->hdr_used += to_copy; + total_read += to_copy; + } + + while (total_read < sz) { + int status; + + st->z.next_out = (unsigned char *)buf + total_read; + st->z.avail_out = sz - total_read; + status = git_inflate(&st->z, Z_FINISH); + + total_read = st->z.next_out - (unsigned char *)buf; + + if (status == Z_STREAM_END) { + git_inflate_end(&st->z); + st->z_state = ODB_LOOSE_READ_STREAM_DONE; + break; + } + if (status != Z_OK && (status != Z_BUF_ERROR || total_read < sz)) { + git_inflate_end(&st->z); + st->z_state = ODB_LOOSE_READ_STREAM_ERROR; + return -1; + } + } + return total_read; +} + +static int close_istream_loose(struct odb_read_stream *_st) +{ + struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; + if (st->z_state == ODB_LOOSE_READ_STREAM_INUSE) + git_inflate_end(&st->z); + munmap(st->mapped, st->mapsize); + return 0; +} + +int odb_source_loose_read_object_stream(struct odb_read_stream **out, + struct odb_source *source, + const struct object_id *oid) +{ + struct object_info oi = OBJECT_INFO_INIT; + struct odb_loose_read_stream *st; + unsigned long mapsize; + void *mapped; + + mapped = odb_source_loose_map_object(source, oid, &mapsize); + if (!mapped) + return -1; + + /* + * Note: we must allocate this structure early even though we may still + * fail. This is because we need to initialize the zlib stream, and it + * is not possible to copy the stream around after the fact because it + * has self-referencing pointers. + */ + CALLOC_ARRAY(st, 1); + + switch (unpack_loose_header(&st->z, mapped, mapsize, st->hdr, + sizeof(st->hdr))) { + case ULHR_OK: + break; + case ULHR_BAD: + case ULHR_TOO_LONG: + goto error; + } + + oi.sizep = &st->base.size; + oi.typep = &st->base.type; + + if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0) + goto error; + + st->mapped = mapped; + st->mapsize = mapsize; + st->hdr_used = strlen(st->hdr) + 1; + st->hdr_avail = st->z.total_out; + st->z_state = ODB_LOOSE_READ_STREAM_INUSE; + st->base.close = close_istream_loose; + st->base.read = read_istream_loose; + + *out = &st->base; + + return 0; +error: + git_inflate_end(&st->z); + munmap(st->mapped, st->mapsize); + free(st); + return -1; +} diff --git a/object-file.h b/object-file.h index eeffa67bbd..1229d5f675 100644 --- a/object-file.h +++ b/object-file.h @@ -16,6 +16,8 @@ enum { int index_fd(struct index_state *istate, struct object_id *oid, int fd, struct stat *st, enum object_type type, const char *path, unsigned flags); int index_path(struct index_state *istate, struct object_id *oid, const char *path, struct stat *st, unsigned flags); +struct object_info; +struct odb_read_stream; struct odb_source; struct odb_source_loose { @@ -47,9 +49,9 @@ int odb_source_loose_read_object_info(struct odb_source *source, const struct object_id *oid, struct object_info *oi, int flags); -void *odb_source_loose_map_object(struct odb_source *source, - const struct object_id *oid, - unsigned long *size); +int odb_source_loose_read_object_stream(struct odb_read_stream **out, + struct odb_source *source, + const struct object_id *oid); /* * Return true iff an object database source has a loose object @@ -143,40 +145,6 @@ int for_each_loose_object(struct object_database *odb, int format_object_header(char *str, size_t size, enum object_type type, size_t objsize); -/** - * unpack_loose_header() initializes the data stream needed to unpack - * a loose object header. - * - * Returns: - * - * - ULHR_OK on success - * - ULHR_BAD on error - * - ULHR_TOO_LONG if the header was too long - * - * It will only parse up to MAX_HEADER_LEN bytes. - */ -enum unpack_loose_header_result { - ULHR_OK, - ULHR_BAD, - ULHR_TOO_LONG, -}; -enum unpack_loose_header_result unpack_loose_header(git_zstream *stream, - unsigned char *map, - unsigned long mapsize, - void *buffer, - unsigned long bufsiz); - -/** - * parse_loose_header() parses the starting " \0" of an - * object. If it doesn't follow that format -1 is returned. To check - * the validity of the populate the "typep" in the "struct - * object_info". It will be OBJ_BAD if the object type is unknown. The - * parsed can be retrieved via "oi->sizep", and from there - * passed to unpack_loose_rest(). - */ -struct object_info; -int parse_loose_header(const char *hdr, struct object_info *oi); - int force_object_loose(struct odb_source *source, const struct object_id *oid, time_t mtime); diff --git a/streaming.c b/streaming.c index 0635b7c12e..d5acc1c396 100644 --- a/streaming.c +++ b/streaming.c @@ -114,137 +114,6 @@ static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, return &fs->base; } -/***************************************************************** - * - * Loose object stream - * - *****************************************************************/ - -struct odb_loose_read_stream { - struct odb_read_stream base; - git_zstream z; - enum { - ODB_LOOSE_READ_STREAM_INUSE, - ODB_LOOSE_READ_STREAM_DONE, - ODB_LOOSE_READ_STREAM_ERROR, - } z_state; - void *mapped; - unsigned long mapsize; - char hdr[32]; - int hdr_avail; - int hdr_used; -}; - -static ssize_t read_istream_loose(struct odb_read_stream *_st, char *buf, size_t sz) -{ - struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; - size_t total_read = 0; - - switch (st->z_state) { - case ODB_LOOSE_READ_STREAM_DONE: - return 0; - case ODB_LOOSE_READ_STREAM_ERROR: - return -1; - default: - break; - } - - if (st->hdr_used < st->hdr_avail) { - size_t to_copy = st->hdr_avail - st->hdr_used; - if (sz < to_copy) - to_copy = sz; - memcpy(buf, st->hdr + st->hdr_used, to_copy); - st->hdr_used += to_copy; - total_read += to_copy; - } - - while (total_read < sz) { - int status; - - st->z.next_out = (unsigned char *)buf + total_read; - st->z.avail_out = sz - total_read; - status = git_inflate(&st->z, Z_FINISH); - - total_read = st->z.next_out - (unsigned char *)buf; - - if (status == Z_STREAM_END) { - git_inflate_end(&st->z); - st->z_state = ODB_LOOSE_READ_STREAM_DONE; - break; - } - if (status != Z_OK && (status != Z_BUF_ERROR || total_read < sz)) { - git_inflate_end(&st->z); - st->z_state = ODB_LOOSE_READ_STREAM_ERROR; - return -1; - } - } - return total_read; -} - -static int close_istream_loose(struct odb_read_stream *_st) -{ - struct odb_loose_read_stream *st = (struct odb_loose_read_stream *)_st; - if (st->z_state == ODB_LOOSE_READ_STREAM_INUSE) - git_inflate_end(&st->z); - munmap(st->mapped, st->mapsize); - return 0; -} - -static int open_istream_loose(struct odb_read_stream **out, - struct odb_source *source, - const struct object_id *oid) -{ - struct object_info oi = OBJECT_INFO_INIT; - struct odb_loose_read_stream *st; - unsigned long mapsize; - void *mapped; - - mapped = odb_source_loose_map_object(source, oid, &mapsize); - if (!mapped) - return -1; - - /* - * Note: we must allocate this structure early even though we may still - * fail. This is because we need to initialize the zlib stream, and it - * is not possible to copy the stream around after the fact because it - * has self-referencing pointers. - */ - CALLOC_ARRAY(st, 1); - - switch (unpack_loose_header(&st->z, mapped, mapsize, st->hdr, - sizeof(st->hdr))) { - case ULHR_OK: - break; - case ULHR_BAD: - case ULHR_TOO_LONG: - goto error; - } - - oi.sizep = &st->base.size; - oi.typep = &st->base.type; - - if (parse_loose_header(st->hdr, &oi) < 0 || st->base.type < 0) - goto error; - - st->mapped = mapped; - st->mapsize = mapsize; - st->hdr_used = strlen(st->hdr) + 1; - st->hdr_avail = st->z.total_out; - st->z_state = ODB_LOOSE_READ_STREAM_INUSE; - st->base.close = close_istream_loose; - st->base.read = read_istream_loose; - - *out = &st->base; - - return 0; -error: - git_inflate_end(&st->z); - munmap(st->mapped, st->mapsize); - free(st); - return -1; -} - - /***************************************************************** * * Non-delta packed object stream @@ -455,7 +324,7 @@ static int istream_source(struct odb_read_stream **out, odb_prepare_alternates(r->objects); for (source = r->objects->sources; source; source = source->next) - if (!open_istream_loose(out, source, oid)) + if (!odb_source_loose_read_object_stream(out, source, oid)) return 0; return open_istream_incore(out, r, oid); -- GitLab From 6de4fc122e7ff0bc9f42482f3551cc714047ed57 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 12:16:27 +0200 Subject: [PATCH 30/33] streaming: move logic to read packed objects streams into backend Move the logic to read packed object streams into the respective subsystem. Signed-off-by: Patrick Steinhardt --- packfile.c | 128 +++++++++++++++++++++++++++++++++++++++++++++++++ packfile.h | 5 ++ streaming.c | 136 +--------------------------------------------------- 3 files changed, 134 insertions(+), 135 deletions(-) diff --git a/packfile.c b/packfile.c index b4bc40d895..ad56ce0b90 100644 --- a/packfile.c +++ b/packfile.c @@ -20,6 +20,7 @@ #include "tree.h" #include "object-file.h" #include "odb.h" +#include "streaming.h" #include "midx.h" #include "commit-graph.h" #include "pack-revindex.h" @@ -2406,3 +2407,130 @@ void packfile_store_close(struct packfile_store *store) close_pack(p); } } + +struct odb_packed_read_stream { + struct odb_read_stream base; + struct packed_git *pack; + git_zstream z; + enum { + ODB_PACKED_READ_STREAM_UNINITIALIZED, + ODB_PACKED_READ_STREAM_INUSE, + ODB_PACKED_READ_STREAM_DONE, + ODB_PACKED_READ_STREAM_ERROR, + } z_state; + off_t pos; +}; + +static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *buf, + size_t sz) +{ + struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; + size_t total_read = 0; + + switch (st->z_state) { + case ODB_PACKED_READ_STREAM_UNINITIALIZED: + memset(&st->z, 0, sizeof(st->z)); + git_inflate_init(&st->z); + st->z_state = ODB_PACKED_READ_STREAM_INUSE; + break; + case ODB_PACKED_READ_STREAM_DONE: + return 0; + case ODB_PACKED_READ_STREAM_ERROR: + return -1; + case ODB_PACKED_READ_STREAM_INUSE: + break; + } + + while (total_read < sz) { + int status; + struct pack_window *window = NULL; + unsigned char *mapped; + + mapped = use_pack(st->pack, &window, + st->pos, &st->z.avail_in); + + st->z.next_out = (unsigned char *)buf + total_read; + st->z.avail_out = sz - total_read; + st->z.next_in = mapped; + status = git_inflate(&st->z, Z_FINISH); + + st->pos += st->z.next_in - mapped; + total_read = st->z.next_out - (unsigned char *)buf; + unuse_pack(&window); + + if (status == Z_STREAM_END) { + git_inflate_end(&st->z); + st->z_state = ODB_PACKED_READ_STREAM_DONE; + break; + } + + /* + * Unlike the loose object case, we do not have to worry here + * about running out of input bytes and spinning infinitely. If + * we get Z_BUF_ERROR due to too few input bytes, then we'll + * replenish them in the next use_pack() call when we loop. If + * we truly hit the end of the pack (i.e., because it's corrupt + * or truncated), then use_pack() catches that and will die(). + */ + if (status != Z_OK && status != Z_BUF_ERROR) { + git_inflate_end(&st->z); + st->z_state = ODB_PACKED_READ_STREAM_ERROR; + return -1; + } + } + return total_read; +} + +static int close_istream_pack_non_delta(struct odb_read_stream *_st) +{ + struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; + if (st->z_state == ODB_PACKED_READ_STREAM_INUSE) + git_inflate_end(&st->z); + return 0; +} + +int packfile_store_read_object_stream(struct odb_read_stream **out, + struct packfile_store *store, + const struct object_id *oid) +{ + struct odb_packed_read_stream *stream; + struct pack_window *window = NULL; + struct object_info oi = OBJECT_INFO_INIT; + enum object_type in_pack_type; + unsigned long size; + + oi.sizep = &size; + + if (packfile_store_read_object_info(store, oid, &oi, 0) || + oi.u.packed.is_delta || + repo_settings_get_big_file_threshold(store->odb->repo) >= size) + return -1; + + in_pack_type = unpack_object_header(oi.u.packed.pack, + &window, + &oi.u.packed.offset, + &size); + unuse_pack(&window); + switch (in_pack_type) { + default: + return -1; /* we do not do deltas for now */ + case OBJ_COMMIT: + case OBJ_TREE: + case OBJ_BLOB: + case OBJ_TAG: + break; + } + + CALLOC_ARRAY(stream, 1); + stream->base.close = close_istream_pack_non_delta; + stream->base.read = read_istream_pack_non_delta; + stream->base.type = in_pack_type; + stream->base.size = size; + stream->z_state = ODB_PACKED_READ_STREAM_UNINITIALIZED; + stream->pack = oi.u.packed.pack; + stream->pos = oi.u.packed.offset; + + *out = &stream->base; + + return 0; +} diff --git a/packfile.h b/packfile.h index 0a98bddd81..3fcc5ae6e0 100644 --- a/packfile.h +++ b/packfile.h @@ -8,6 +8,7 @@ /* in odb.h */ struct object_info; +struct odb_read_stream; struct packed_git { struct hashmap_entry packmap_ent; @@ -144,6 +145,10 @@ void packfile_store_add_pack(struct packfile_store *store, #define repo_for_each_pack(repo, p) \ for (p = packfile_store_get_packs(repo->objects->packfiles); p; p = p->next) +int packfile_store_read_object_stream(struct odb_read_stream **out, + struct packfile_store *store, + const struct object_id *oid); + /* * Try to read the object identified by its ID from the object store and * populate the object info with its data. Returns 1 in case the object was diff --git a/streaming.c b/streaming.c index d5acc1c396..3140728a70 100644 --- a/streaming.c +++ b/streaming.c @@ -114,140 +114,6 @@ static struct odb_read_stream *attach_stream_filter(struct odb_read_stream *st, return &fs->base; } -/***************************************************************** - * - * Non-delta packed object stream - * - *****************************************************************/ - -struct odb_packed_read_stream { - struct odb_read_stream base; - struct packed_git *pack; - git_zstream z; - enum { - ODB_PACKED_READ_STREAM_UNINITIALIZED, - ODB_PACKED_READ_STREAM_INUSE, - ODB_PACKED_READ_STREAM_DONE, - ODB_PACKED_READ_STREAM_ERROR, - } z_state; - off_t pos; -}; - -static ssize_t read_istream_pack_non_delta(struct odb_read_stream *_st, char *buf, - size_t sz) -{ - struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; - size_t total_read = 0; - - switch (st->z_state) { - case ODB_PACKED_READ_STREAM_UNINITIALIZED: - memset(&st->z, 0, sizeof(st->z)); - git_inflate_init(&st->z); - st->z_state = ODB_PACKED_READ_STREAM_INUSE; - break; - case ODB_PACKED_READ_STREAM_DONE: - return 0; - case ODB_PACKED_READ_STREAM_ERROR: - return -1; - case ODB_PACKED_READ_STREAM_INUSE: - break; - } - - while (total_read < sz) { - int status; - struct pack_window *window = NULL; - unsigned char *mapped; - - mapped = use_pack(st->pack, &window, - st->pos, &st->z.avail_in); - - st->z.next_out = (unsigned char *)buf + total_read; - st->z.avail_out = sz - total_read; - st->z.next_in = mapped; - status = git_inflate(&st->z, Z_FINISH); - - st->pos += st->z.next_in - mapped; - total_read = st->z.next_out - (unsigned char *)buf; - unuse_pack(&window); - - if (status == Z_STREAM_END) { - git_inflate_end(&st->z); - st->z_state = ODB_PACKED_READ_STREAM_DONE; - break; - } - - /* - * Unlike the loose object case, we do not have to worry here - * about running out of input bytes and spinning infinitely. If - * we get Z_BUF_ERROR due to too few input bytes, then we'll - * replenish them in the next use_pack() call when we loop. If - * we truly hit the end of the pack (i.e., because it's corrupt - * or truncated), then use_pack() catches that and will die(). - */ - if (status != Z_OK && status != Z_BUF_ERROR) { - git_inflate_end(&st->z); - st->z_state = ODB_PACKED_READ_STREAM_ERROR; - return -1; - } - } - return total_read; -} - -static int close_istream_pack_non_delta(struct odb_read_stream *_st) -{ - struct odb_packed_read_stream *st = (struct odb_packed_read_stream *)_st; - if (st->z_state == ODB_PACKED_READ_STREAM_INUSE) - git_inflate_end(&st->z); - return 0; -} - -static int open_istream_pack_non_delta(struct odb_read_stream **out, - struct object_database *odb, - const struct object_id *oid) -{ - struct odb_packed_read_stream *stream; - struct pack_window *window = NULL; - struct object_info oi = OBJECT_INFO_INIT; - enum object_type in_pack_type; - unsigned long size; - - oi.sizep = &size; - - if (packfile_store_read_object_info(odb->packfiles, oid, &oi, 0) || - oi.u.packed.is_delta || - repo_settings_get_big_file_threshold(odb->repo) >= size) - return -1; - - in_pack_type = unpack_object_header(oi.u.packed.pack, - &window, - &oi.u.packed.offset, - &size); - unuse_pack(&window); - switch (in_pack_type) { - default: - return -1; /* we do not do deltas for now */ - case OBJ_COMMIT: - case OBJ_TREE: - case OBJ_BLOB: - case OBJ_TAG: - break; - } - - CALLOC_ARRAY(stream, 1); - stream->base.close = close_istream_pack_non_delta; - stream->base.read = read_istream_pack_non_delta; - stream->base.type = in_pack_type; - stream->base.size = size; - stream->z_state = ODB_PACKED_READ_STREAM_UNINITIALIZED; - stream->pack = oi.u.packed.pack; - stream->pos = oi.u.packed.offset; - - *out = &stream->base; - - return 0; -} - - /***************************************************************** * * In-core stream @@ -319,7 +185,7 @@ static int istream_source(struct odb_read_stream **out, { struct odb_source *source; - if (!open_istream_pack_non_delta(out, r->objects, oid)) + if (!packfile_store_read_object_stream(out, r->objects->packfiles, oid)) return 0; odb_prepare_alternates(r->objects); -- GitLab From 7addfec8d86d5396d290e43bd148dd74a44bf83a Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 23 Nov 2025 15:56:11 +0100 Subject: [PATCH 31/33] streaming: refactor interface to be object-database-centric Refactor the streaming interface to be centered around object databases instead of centered around the repository. Rename the functions accordingly. Signed-off-by: Patrick Steinhardt --- archive-tar.c | 6 +++--- archive-zip.c | 12 ++++++------ builtin/index-pack.c | 8 ++++---- builtin/pack-objects.c | 14 +++++++------- object-file.c | 8 ++++---- streaming.c | 44 +++++++++++++++++++++--------------------- streaming.h | 30 +++++++++++++++++++++++----- 7 files changed, 71 insertions(+), 51 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index dc1eda09e0..4d87b28504 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -135,16 +135,16 @@ static int stream_blocked(struct repository *r, const struct object_id *oid) char buf[BLOCKSIZE]; ssize_t readlen; - st = open_istream(r, oid, &type, &sz, NULL); + st = odb_read_stream_open(r->objects, oid, &type, &sz, NULL); if (!st) return error(_("cannot stream blob %s"), oid_to_hex(oid)); for (;;) { - readlen = read_istream(st, buf, sizeof(buf)); + readlen = odb_read_stream_read(st, buf, sizeof(buf)); if (readlen <= 0) break; do_write_blocked(buf, readlen); } - close_istream(st); + odb_read_stream_close(st); if (!readlen) finish_record(); return readlen; diff --git a/archive-zip.c b/archive-zip.c index 40a9c93ff9..c44684aebc 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -348,8 +348,8 @@ static int write_zip_entry(struct archiver_args *args, if (!buffer) { enum object_type type; - stream = open_istream(args->repo, oid, &type, &size, - NULL); + stream = odb_read_stream_open(args->repo->objects, oid, + &type, &size, NULL); if (!stream) return error(_("cannot stream blob %s"), oid_to_hex(oid)); @@ -429,7 +429,7 @@ static int write_zip_entry(struct archiver_args *args, ssize_t readlen; for (;;) { - readlen = read_istream(stream, buf, sizeof(buf)); + readlen = odb_read_stream_read(stream, buf, sizeof(buf)); if (readlen <= 0) break; crc = crc32(crc, buf, readlen); @@ -439,7 +439,7 @@ static int write_zip_entry(struct archiver_args *args, buf, readlen); write_or_die(1, buf, readlen); } - close_istream(stream); + odb_read_stream_close(stream); if (readlen) return readlen; @@ -462,7 +462,7 @@ static int write_zip_entry(struct archiver_args *args, zstream.avail_out = sizeof(compressed); for (;;) { - readlen = read_istream(stream, buf, sizeof(buf)); + readlen = odb_read_stream_read(stream, buf, sizeof(buf)); if (readlen <= 0) break; crc = crc32(crc, buf, readlen); @@ -486,7 +486,7 @@ static int write_zip_entry(struct archiver_args *args, } } - close_istream(stream); + odb_read_stream_close(stream); if (readlen) return readlen; diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 5f90f12f92..fb76ef0f4c 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -779,7 +779,7 @@ static int compare_objects(const unsigned char *buf, unsigned long size, } while (size) { - ssize_t len = read_istream(data->st, data->buf, size); + ssize_t len = odb_read_stream_read(data->st, data->buf, size); if (len == 0) die(_("SHA1 COLLISION FOUND WITH %s !"), oid_to_hex(&data->entry->idx.oid)); @@ -807,15 +807,15 @@ static int check_collison(struct object_entry *entry) memset(&data, 0, sizeof(data)); data.entry = entry; - data.st = open_istream(the_repository, &entry->idx.oid, &type, &size, - NULL); + data.st = odb_read_stream_open(the_repository->objects, &entry->idx.oid, + &type, &size, NULL); if (!data.st) return -1; if (size != entry->size || type != entry->type) die(_("SHA1 COLLISION FOUND WITH %s !"), oid_to_hex(&entry->idx.oid)); unpack_data(entry, compare_objects, &data); - close_istream(data.st); + odb_read_stream_close(data.st); free(data.buf); return 0; } diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index c693d948e1..1353c2384c 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -417,7 +417,7 @@ static unsigned long write_large_blob_data(struct odb_read_stream *st, struct ha for (;;) { ssize_t readlen; int zret = Z_OK; - readlen = read_istream(st, ibuf, sizeof(ibuf)); + readlen = odb_read_stream_read(st, ibuf, sizeof(ibuf)); if (readlen == -1) die(_("unable to read %s"), oid_to_hex(oid)); @@ -520,8 +520,8 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent if (oe_type(entry) == OBJ_BLOB && oe_size_greater_than(&to_pack, entry, repo_settings_get_big_file_threshold(the_repository)) && - (st = open_istream(the_repository, &entry->idx.oid, &type, - &size, NULL)) != NULL) + (st = odb_read_stream_open(the_repository->objects, &entry->idx.oid, + &type, &size, NULL)) != NULL) buf = NULL; else { buf = odb_read_object(the_repository->objects, @@ -577,7 +577,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent dheader[--pos] = 128 | (--ofs & 127); if (limit && hdrlen + sizeof(dheader) - pos + datalen + hashsz >= limit) { if (st) - close_istream(st); + odb_read_stream_close(st); free(buf); return 0; } @@ -591,7 +591,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent */ if (limit && hdrlen + hashsz + datalen + hashsz >= limit) { if (st) - close_istream(st); + odb_read_stream_close(st); free(buf); return 0; } @@ -601,7 +601,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent } else { if (limit && hdrlen + datalen + hashsz >= limit) { if (st) - close_istream(st); + odb_read_stream_close(st); free(buf); return 0; } @@ -609,7 +609,7 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent } if (st) { datalen = write_large_blob_data(st, f, &entry->idx.oid); - close_istream(st); + odb_read_stream_close(st); } else { hashwrite(f, buf, datalen); free(buf); diff --git a/object-file.c b/object-file.c index 8c67847fea..9ba40a848c 100644 --- a/object-file.c +++ b/object-file.c @@ -139,7 +139,7 @@ int stream_object_signature(struct repository *r, const struct object_id *oid) char hdr[MAX_HEADER_LEN]; int hdrlen; - st = open_istream(r, oid, &obj_type, &size, NULL); + st = odb_read_stream_open(r->objects, oid, &obj_type, &size, NULL); if (!st) return -1; @@ -151,10 +151,10 @@ int stream_object_signature(struct repository *r, const struct object_id *oid) git_hash_update(&c, hdr, hdrlen); for (;;) { char buf[1024 * 16]; - ssize_t readlen = read_istream(st, buf, sizeof(buf)); + ssize_t readlen = odb_read_stream_read(st, buf, sizeof(buf)); if (readlen < 0) { - close_istream(st); + odb_read_stream_close(st); return -1; } if (!readlen) @@ -162,7 +162,7 @@ int stream_object_signature(struct repository *r, const struct object_id *oid) git_hash_update(&c, buf, readlen); } git_hash_final_oid(&real_oid, &c); - close_istream(st); + odb_read_stream_close(st); return !oideq(oid, &real_oid) ? -1 : 0; } diff --git a/streaming.c b/streaming.c index 3140728a70..06993a751c 100644 --- a/streaming.c +++ b/streaming.c @@ -35,7 +35,7 @@ static int close_istream_filtered(struct odb_read_stream *_fs) { struct odb_filtered_read_stream *fs = (struct odb_filtered_read_stream *)_fs; free_stream_filter(fs->filter); - return close_istream(fs->upstream); + return odb_read_stream_close(fs->upstream); } static ssize_t read_istream_filtered(struct odb_read_stream *_fs, char *buf, @@ -87,7 +87,7 @@ static ssize_t read_istream_filtered(struct odb_read_stream *_fs, char *buf, /* refill the input from the upstream */ if (!fs->input_finished) { - fs->i_end = read_istream(fs->upstream, fs->ibuf, FILTER_BUFFER); + fs->i_end = odb_read_stream_read(fs->upstream, fs->ibuf, FILTER_BUFFER); if (fs->i_end < 0) return -1; if (fs->i_end) @@ -149,7 +149,7 @@ static ssize_t read_istream_incore(struct odb_read_stream *_st, char *buf, size_ } static int open_istream_incore(struct odb_read_stream **out, - struct repository *r, + struct object_database *odb, const struct object_id *oid) { struct object_info oi = OBJECT_INFO_INIT; @@ -163,7 +163,7 @@ static int open_istream_incore(struct odb_read_stream **out, oi.typep = &stream.base.type; oi.sizep = &stream.base.size; oi.contentp = (void **)&stream.buf; - ret = odb_read_object_info_extended(r->objects, oid, &oi, + ret = odb_read_object_info_extended(odb, oid, &oi, OBJECT_INFO_DIE_IF_CORRUPT); if (ret) return ret; @@ -180,47 +180,47 @@ static int open_istream_incore(struct odb_read_stream **out, *****************************************************************************/ static int istream_source(struct odb_read_stream **out, - struct repository *r, + struct object_database *odb, const struct object_id *oid) { struct odb_source *source; - if (!packfile_store_read_object_stream(out, r->objects->packfiles, oid)) + if (!packfile_store_read_object_stream(out, odb->packfiles, oid)) return 0; - odb_prepare_alternates(r->objects); - for (source = r->objects->sources; source; source = source->next) + odb_prepare_alternates(odb); + for (source = odb->sources; source; source = source->next) if (!odb_source_loose_read_object_stream(out, source, oid)) return 0; - return open_istream_incore(out, r, oid); + return open_istream_incore(out, odb, oid); } /**************************************************************** * Users of streaming interface ****************************************************************/ -int close_istream(struct odb_read_stream *st) +int odb_read_stream_close(struct odb_read_stream *st) { int r = st->close(st); free(st); return r; } -ssize_t read_istream(struct odb_read_stream *st, void *buf, size_t sz) +ssize_t odb_read_stream_read(struct odb_read_stream *st, void *buf, size_t sz) { return st->read(st, buf, sz); } -struct odb_read_stream *open_istream(struct repository *r, - const struct object_id *oid, - enum object_type *type, - unsigned long *size, - struct stream_filter *filter) +struct odb_read_stream *odb_read_stream_open(struct object_database *odb, + const struct object_id *oid, + enum object_type *type, + unsigned long *size, + struct stream_filter *filter) { struct odb_read_stream *st; - const struct object_id *real = lookup_replace_object(r, oid); - int ret = istream_source(&st, r, real); + const struct object_id *real = lookup_replace_object(odb->repo, oid); + int ret = istream_source(&st, odb, real); if (ret) return NULL; @@ -229,7 +229,7 @@ struct odb_read_stream *open_istream(struct repository *r, /* Add "&& !is_null_stream_filter(filter)" for performance */ struct odb_read_stream *nst = attach_stream_filter(st, filter); if (!nst) { - close_istream(st); + odb_read_stream_close(st); return NULL; } st = nst; @@ -252,7 +252,7 @@ int odb_stream_blob_to_fd(struct object_database *odb, ssize_t kept = 0; int result = -1; - st = open_istream(odb->repo, oid, &type, &sz, filter); + st = odb_read_stream_open(odb, oid, &type, &sz, filter); if (!st) { if (filter) free_stream_filter(filter); @@ -263,7 +263,7 @@ int odb_stream_blob_to_fd(struct object_database *odb, for (;;) { char buf[1024 * 16]; ssize_t wrote, holeto; - ssize_t readlen = read_istream(st, buf, sizeof(buf)); + ssize_t readlen = odb_read_stream_read(st, buf, sizeof(buf)); if (readlen < 0) goto close_and_exit; @@ -294,6 +294,6 @@ int odb_stream_blob_to_fd(struct object_database *odb, result = 0; close_and_exit: - close_istream(st); + odb_read_stream_close(st); return result; } diff --git a/streaming.h b/streaming.h index acfdef1598..7cb55213b7 100644 --- a/streaming.h +++ b/streaming.h @@ -24,11 +24,31 @@ struct odb_read_stream { unsigned long size; /* inflated size of full object */ }; -struct odb_read_stream *open_istream(struct repository *, const struct object_id *, - enum object_type *, unsigned long *, - struct stream_filter *); -int close_istream(struct odb_read_stream *); -ssize_t read_istream(struct odb_read_stream *, void *, size_t); +/* + * Create a new object stream for the given object database. Populates the type + * and size pointers with the object's info. An optional filter can be used to + * transform the object's content. + * + * Returns the stream on success, a `NULL` pointer otherwise. + */ +struct odb_read_stream *odb_read_stream_open(struct object_database *odb, + const struct object_id *oid, + enum object_type *type, + unsigned long *size, + struct stream_filter *filter); + +/* + * Close the given read stream and release all resources associated with it. + * Returns 0 on success, a negative error code otherwise. + */ +int odb_read_stream_close(struct odb_read_stream *stream); + +/* + * Read data from the stream into the buffer. Returns 0 on EOF and the number + * of bytes read on success. Returns a negative error code in case reading from + * the stream fails. + */ +ssize_t odb_read_stream_read(struct odb_read_stream *stream, void *buf, size_t len); /* * Look up the object by its ID and write the full contents to the file -- GitLab From 776018bbcb90eecfa105c672d57b486ee751d201 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Fri, 24 Oct 2025 11:06:05 +0200 Subject: [PATCH 32/33] streaming: move into object database subsystem The "streaming" terminology is somewhat generic, so it may not be immediately obvious that "streaming.{c,h}" is specific to the object database. Rectify this by moving it into the "odb/" directory so that it can be immediately attributed to the object subsystem. Signed-off-by: Patrick Steinhardt --- Makefile | 2 +- archive-tar.c | 2 +- archive-zip.c | 2 +- builtin/cat-file.c | 2 +- builtin/fsck.c | 2 +- builtin/index-pack.c | 2 +- builtin/log.c | 2 +- builtin/pack-objects.c | 2 +- entry.c | 2 +- meson.build | 2 +- object-file.c | 2 +- streaming.c => odb/streaming.c | 2 +- streaming.h => odb/streaming.h | 0 packfile.c | 2 +- parallel-checkout.c | 2 +- 15 files changed, 14 insertions(+), 14 deletions(-) rename streaming.c => odb/streaming.c (99%) rename streaming.h => odb/streaming.h (100%) diff --git a/Makefile b/Makefile index 7e0f77e298..6d8dcc4622 100644 --- a/Makefile +++ b/Makefile @@ -1201,6 +1201,7 @@ LIB_OBJS += object-file.o LIB_OBJS += object-name.o LIB_OBJS += object.o LIB_OBJS += odb.o +LIB_OBJS += odb/streaming.o LIB_OBJS += oid-array.o LIB_OBJS += oidmap.o LIB_OBJS += oidset.o @@ -1294,7 +1295,6 @@ LIB_OBJS += split-index.o LIB_OBJS += stable-qsort.o LIB_OBJS += statinfo.o LIB_OBJS += strbuf.o -LIB_OBJS += streaming.o LIB_OBJS += string-list.o LIB_OBJS += strmap.o LIB_OBJS += strvec.o diff --git a/archive-tar.c b/archive-tar.c index 4d87b28504..494b9f0667 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -12,8 +12,8 @@ #include "tar.h" #include "archive.h" #include "odb.h" +#include "odb/streaming.h" #include "strbuf.h" -#include "streaming.h" #include "run-command.h" #include "write-or-die.h" diff --git a/archive-zip.c b/archive-zip.c index c44684aebc..a0bdc2fe3b 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -10,9 +10,9 @@ #include "gettext.h" #include "git-zlib.h" #include "hex.h" -#include "streaming.h" #include "utf8.h" #include "odb.h" +#include "odb/streaming.h" #include "strbuf.h" #include "userdiff.h" #include "write-or-die.h" diff --git a/builtin/cat-file.c b/builtin/cat-file.c index 120d626d66..505ddaa12f 100644 --- a/builtin/cat-file.c +++ b/builtin/cat-file.c @@ -18,13 +18,13 @@ #include "list-objects-filter-options.h" #include "parse-options.h" #include "userdiff.h" -#include "streaming.h" #include "oid-array.h" #include "packfile.h" #include "pack-bitmap.h" #include "object-file.h" #include "object-name.h" #include "odb.h" +#include "odb/streaming.h" #include "replace-object.h" #include "promisor-remote.h" #include "mailmap.h" diff --git a/builtin/fsck.c b/builtin/fsck.c index 1a348d43c2..c7d2eea287 100644 --- a/builtin/fsck.c +++ b/builtin/fsck.c @@ -13,11 +13,11 @@ #include "fsck.h" #include "parse-options.h" #include "progress.h" -#include "streaming.h" #include "packfile.h" #include "object-file.h" #include "object-name.h" #include "odb.h" +#include "odb/streaming.h" #include "path.h" #include "read-cache-ll.h" #include "replace-object.h" diff --git a/builtin/index-pack.c b/builtin/index-pack.c index fb76ef0f4c..581023495f 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -16,12 +16,12 @@ #include "progress.h" #include "fsck.h" #include "strbuf.h" -#include "streaming.h" #include "thread-utils.h" #include "packfile.h" #include "pack-revindex.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" #include "oid-array.h" #include "oidset.h" #include "path.h" diff --git a/builtin/log.c b/builtin/log.c index e7b83a6e00..d4cf9c59c8 100644 --- a/builtin/log.c +++ b/builtin/log.c @@ -16,6 +16,7 @@ #include "refs.h" #include "object-name.h" #include "odb.h" +#include "odb/streaming.h" #include "pager.h" #include "color.h" #include "commit.h" @@ -35,7 +36,6 @@ #include "parse-options.h" #include "line-log.h" #include "branch.h" -#include "streaming.h" #include "version.h" #include "mailmap.h" #include "progress.h" diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index 1353c2384c..f109e26786 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -22,7 +22,6 @@ #include "pack-objects.h" #include "progress.h" #include "refs.h" -#include "streaming.h" #include "thread-utils.h" #include "pack-bitmap.h" #include "delta-islands.h" @@ -33,6 +32,7 @@ #include "packfile.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" #include "replace-object.h" #include "dir.h" #include "midx.h" diff --git a/entry.c b/entry.c index 38dfe670f7..7817aee362 100644 --- a/entry.c +++ b/entry.c @@ -2,13 +2,13 @@ #include "git-compat-util.h" #include "odb.h" +#include "odb/streaming.h" #include "dir.h" #include "environment.h" #include "gettext.h" #include "hex.h" #include "name-hash.h" #include "sparse-index.h" -#include "streaming.h" #include "submodule.h" #include "symlinks.h" #include "progress.h" diff --git a/meson.build b/meson.build index 1f95a06edb..fc82929b37 100644 --- a/meson.build +++ b/meson.build @@ -397,6 +397,7 @@ libgit_sources = [ 'object-name.c', 'object.c', 'odb.c', + 'odb/streaming.c', 'oid-array.c', 'oidmap.c', 'oidset.c', @@ -490,7 +491,6 @@ libgit_sources = [ 'stable-qsort.c', 'statinfo.c', 'strbuf.c', - 'streaming.c', 'string-list.c', 'strmap.c', 'strvec.c', diff --git a/object-file.c b/object-file.c index 9ba40a848c..9601fdb12d 100644 --- a/object-file.c +++ b/object-file.c @@ -20,13 +20,13 @@ #include "object-file-convert.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" #include "oidtree.h" #include "pack.h" #include "packfile.h" #include "path.h" #include "read-cache-ll.h" #include "setup.h" -#include "streaming.h" #include "tempfile.h" #include "tmp-objdir.h" diff --git a/streaming.c b/odb/streaming.c similarity index 99% rename from streaming.c rename to odb/streaming.c index 06993a751c..7ef58adaa2 100644 --- a/streaming.c +++ b/odb/streaming.c @@ -5,10 +5,10 @@ #include "git-compat-util.h" #include "convert.h" #include "environment.h" -#include "streaming.h" #include "repository.h" #include "object-file.h" #include "odb.h" +#include "odb/streaming.h" #include "replace-object.h" #include "packfile.h" diff --git a/streaming.h b/odb/streaming.h similarity index 100% rename from streaming.h rename to odb/streaming.h diff --git a/packfile.c b/packfile.c index ad56ce0b90..7a16aaa90d 100644 --- a/packfile.c +++ b/packfile.c @@ -20,7 +20,7 @@ #include "tree.h" #include "object-file.h" #include "odb.h" -#include "streaming.h" +#include "odb/streaming.h" #include "midx.h" #include "commit-graph.h" #include "pack-revindex.h" diff --git a/parallel-checkout.c b/parallel-checkout.c index 1cb6701b92..0bf4bd6d4a 100644 --- a/parallel-checkout.c +++ b/parallel-checkout.c @@ -13,7 +13,7 @@ #include "read-cache-ll.h" #include "run-command.h" #include "sigchain.h" -#include "streaming.h" +#include "odb/streaming.h" #include "symlinks.h" #include "thread-utils.h" #include "trace2.h" -- GitLab From 6ad4caa5e71efea7e2c49e17a4f3a3fb95754e98 Mon Sep 17 00:00:00 2001 From: Patrick Steinhardt Date: Sun, 23 Nov 2025 15:47:49 +0100 Subject: [PATCH 33/33] streaming: drop redundant type and size pointers In the preceding commits we have turned `struct odb_read_stream` into a publicly visible structure. Furthermore, this structure now contains the type and size of the object that we are about to stream. Consequently, the out-pointers that we used before to propagate the type and size of the streamed object are now somewhat redundant with the data contained in the structure itself. Drop these out-pointers and adapt callers accordingly. Signed-off-by: Patrick Steinhardt --- archive-tar.c | 4 +--- archive-zip.c | 5 ++--- builtin/index-pack.c | 7 ++----- builtin/pack-objects.c | 6 ++++-- object-file.c | 6 ++---- odb/streaming.c | 10 ++-------- odb/streaming.h | 7 ++----- 7 files changed, 15 insertions(+), 30 deletions(-) diff --git a/archive-tar.c b/archive-tar.c index 494b9f0667..0fc70d13a8 100644 --- a/archive-tar.c +++ b/archive-tar.c @@ -130,12 +130,10 @@ static void write_trailer(void) static int stream_blocked(struct repository *r, const struct object_id *oid) { struct odb_read_stream *st; - enum object_type type; - unsigned long sz; char buf[BLOCKSIZE]; ssize_t readlen; - st = odb_read_stream_open(r->objects, oid, &type, &sz, NULL); + st = odb_read_stream_open(r->objects, oid, NULL); if (!st) return error(_("cannot stream blob %s"), oid_to_hex(oid)); for (;;) { diff --git a/archive-zip.c b/archive-zip.c index a0bdc2fe3b..97ea8d60d6 100644 --- a/archive-zip.c +++ b/archive-zip.c @@ -347,12 +347,11 @@ static int write_zip_entry(struct archiver_args *args, method = ZIP_METHOD_DEFLATE; if (!buffer) { - enum object_type type; - stream = odb_read_stream_open(args->repo->objects, oid, - &type, &size, NULL); + stream = odb_read_stream_open(args->repo->objects, oid, NULL); if (!stream) return error(_("cannot stream blob %s"), oid_to_hex(oid)); + size = stream->size; flags |= ZIP_STREAM; out = NULL; } else { diff --git a/builtin/index-pack.c b/builtin/index-pack.c index 581023495f..b01cb77f4a 100644 --- a/builtin/index-pack.c +++ b/builtin/index-pack.c @@ -798,8 +798,6 @@ static int compare_objects(const unsigned char *buf, unsigned long size, static int check_collison(struct object_entry *entry) { struct compare_data data; - enum object_type type; - unsigned long size; if (entry->size <= repo_settings_get_big_file_threshold(the_repository) || entry->type != OBJ_BLOB) @@ -807,11 +805,10 @@ static int check_collison(struct object_entry *entry) memset(&data, 0, sizeof(data)); data.entry = entry; - data.st = odb_read_stream_open(the_repository->objects, &entry->idx.oid, - &type, &size, NULL); + data.st = odb_read_stream_open(the_repository->objects, &entry->idx.oid, NULL); if (!data.st) return -1; - if (size != entry->size || type != entry->type) + if (data.st->size != entry->size || data.st->type != entry->type) die(_("SHA1 COLLISION FOUND WITH %s !"), oid_to_hex(&entry->idx.oid)); unpack_data(entry, compare_objects, &data); diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c index f109e26786..0d1d6995bf 100644 --- a/builtin/pack-objects.c +++ b/builtin/pack-objects.c @@ -521,9 +521,11 @@ static unsigned long write_no_reuse_object(struct hashfile *f, struct object_ent oe_size_greater_than(&to_pack, entry, repo_settings_get_big_file_threshold(the_repository)) && (st = odb_read_stream_open(the_repository->objects, &entry->idx.oid, - &type, &size, NULL)) != NULL) + NULL)) != NULL) { buf = NULL; - else { + type = st->type; + size = st->size; + } else { buf = odb_read_object(the_repository->objects, &entry->idx.oid, &type, &size); diff --git a/object-file.c b/object-file.c index 9601fdb12d..12177a7dd7 100644 --- a/object-file.c +++ b/object-file.c @@ -132,19 +132,17 @@ int check_object_signature(struct repository *r, const struct object_id *oid, int stream_object_signature(struct repository *r, const struct object_id *oid) { struct object_id real_oid; - unsigned long size; - enum object_type obj_type; struct odb_read_stream *st; struct git_hash_ctx c; char hdr[MAX_HEADER_LEN]; int hdrlen; - st = odb_read_stream_open(r->objects, oid, &obj_type, &size, NULL); + st = odb_read_stream_open(r->objects, oid, NULL); if (!st) return -1; /* Generate the header */ - hdrlen = format_object_header(hdr, sizeof(hdr), obj_type, size); + hdrlen = format_object_header(hdr, sizeof(hdr), st->type, st->size); /* Sha1.. */ r->hash_algo->init_fn(&c); diff --git a/odb/streaming.c b/odb/streaming.c index 7ef58adaa2..745cd486fb 100644 --- a/odb/streaming.c +++ b/odb/streaming.c @@ -214,8 +214,6 @@ ssize_t odb_read_stream_read(struct odb_read_stream *st, void *buf, size_t sz) struct odb_read_stream *odb_read_stream_open(struct object_database *odb, const struct object_id *oid, - enum object_type *type, - unsigned long *size, struct stream_filter *filter) { struct odb_read_stream *st; @@ -235,8 +233,6 @@ struct odb_read_stream *odb_read_stream_open(struct object_database *odb, st = nst; } - *size = st->size; - *type = st->type; return st; } @@ -247,18 +243,16 @@ int odb_stream_blob_to_fd(struct object_database *odb, int can_seek) { struct odb_read_stream *st; - enum object_type type; - unsigned long sz; ssize_t kept = 0; int result = -1; - st = odb_read_stream_open(odb, oid, &type, &sz, filter); + st = odb_read_stream_open(odb, oid, filter); if (!st) { if (filter) free_stream_filter(filter); return result; } - if (type != OBJ_BLOB) + if (st->type != OBJ_BLOB) goto close_and_exit; for (;;) { char buf[1024 * 16]; diff --git a/odb/streaming.h b/odb/streaming.h index 7cb55213b7..c7861f7e13 100644 --- a/odb/streaming.h +++ b/odb/streaming.h @@ -25,16 +25,13 @@ struct odb_read_stream { }; /* - * Create a new object stream for the given object database. Populates the type - * and size pointers with the object's info. An optional filter can be used to - * transform the object's content. + * Create a new object stream for the given object database. An optional filter + * can be used to transform the object's content. * * Returns the stream on success, a `NULL` pointer otherwise. */ struct odb_read_stream *odb_read_stream_open(struct object_database *odb, const struct object_id *oid, - enum object_type *type, - unsigned long *size, struct stream_filter *filter); /* -- GitLab