From e38d1b59177926d30d9279e6a5a681ad89b39e1d Mon Sep 17 00:00:00 2001 From: Derrick Stolee Date: Thu, 29 Aug 2024 12:27:28 -0400 Subject: [PATCH] path-walk: introduce an object walk by path TODO Signed-off-by: Derrick Stolee --- Makefile | 1 + path-walk.c | 237 ++++++++++++++++++++++++++++++++++++++++++++++++++++ path-walk.h | 43 ++++++++++ 3 files changed, 281 insertions(+) create mode 100644 path-walk.c create mode 100644 path-walk.h diff --git a/Makefile b/Makefile index deb175a0408782..e83f6de9a2c413 100644 --- a/Makefile +++ b/Makefile @@ -1090,6 +1090,7 @@ LIB_OBJS += parse-options.o LIB_OBJS += patch-delta.o LIB_OBJS += patch-ids.o LIB_OBJS += path.o +LIB_OBJS += path-walk.o LIB_OBJS += pathspec.o LIB_OBJS += pkt-line.o LIB_OBJS += preload-index.o diff --git a/path-walk.c b/path-walk.c new file mode 100644 index 00000000000000..9b77860d152a56 --- /dev/null +++ b/path-walk.c @@ -0,0 +1,237 @@ +/* + * path-walk.c: implementation for path-based walks of the object graph. + */ +#include "git-compat-util.h" +#include "path-walk.h" +#include "blob.h" +#include "commit.h" +#include "dir.h" +#include "hashmap.h" +#include "hex.h" +#include "object.h" +#include "oid-array.h" +#include "revision.h" +#include "string-list.h" +#include "strmap.h" +#include "trace2.h" +#include "tree.h" +#include "tree-walk.h" + +struct type_and_oid_list +{ + enum object_type type; + struct oid_array oids; +}; + +#define TYPE_AND_OID_LIST_INIT { \ + .type = OBJ_NONE, \ + .oids = OID_ARRAY_INIT \ +} + +struct path_walk_context { + /** + * Repeats of data in 'struct path_walk_info' for + * access with fewer characters. + */ + struct repository *repo; + struct rev_info *revs; + struct path_walk_info *info; + + /** + * Map a path to a 'struct type_and_oid_list' + * containing the objects discovered at that + * path. + */ + struct strmap paths_to_lists; + + /** + * Store the current list of paths in a stack, to + * facilitate depth-first-search without recursion. + */ + struct string_list path_stack; +}; + +static int add_children(struct path_walk_context *ctx, + const char *base_path, + struct object_id *oid) +{ + struct tree_desc desc; + struct name_entry entry; + struct strbuf path = STRBUF_INIT; + size_t base_len; + struct tree *tree = lookup_tree(ctx->repo, oid); + + if (!tree) { + error(_("failed to walk children of tree %s: not found"), + oid_to_hex(oid)); + return -1; + } + + strbuf_addstr(&path, base_path); + base_len = path.len; + + parse_tree(tree); + init_tree_desc(&desc, &tree->object.oid, tree->buffer, tree->size); + while (tree_entry(&desc, &entry)) { + struct type_and_oid_list *list; + struct object *o; + /* Not actually true, but we will ignore submodules later. */ + enum object_type type = S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB; + + /* Skip submodules. */ + if (S_ISGITLINK(entry.mode)) + continue; + + if (type == OBJ_TREE) { + struct tree *child = lookup_tree(ctx->repo, &entry.oid); + o = child ? &child->object : NULL; + } else if (type == OBJ_BLOB) { + struct blob *child = lookup_blob(ctx->repo, &entry.oid); + o = child ? &child->object : NULL; + } + + if (!o) /* report error?*/ + continue; + + /* + * Pass uninteresting flag, if necessary. This must be done + * before checking the SEEN flag, in case this object was added + * from an interesting object first. + */ + if (tree->object.flags & UNINTERESTING) + o->flags |= UNINTERESTING; + + /* Skip this object if already seen. */ + if (o->flags & SEEN) + continue; + o->flags |= SEEN; + + strbuf_setlen(&path, base_len); + strbuf_add(&path, entry.path, entry.pathlen); + + /* + * Trees will end with "/" for concatenation and distinction + * from blobs at the same path. + */ + if (type == OBJ_TREE) + strbuf_addch(&path, '/'); + + if (!(list = strmap_get(&ctx->paths_to_lists, path.buf))) { + CALLOC_ARRAY(list, 1); + list->type = type; + strmap_put(&ctx->paths_to_lists, path.buf, list); + string_list_append(&ctx->path_stack, path.buf); + } + oid_array_append(&list->oids, &entry.oid); + } + + free_tree_buffer(tree); + strbuf_release(&path); + return 0; +} + +/* + * For each path in paths_to_explore, walk the trees another level + * and add any found blobs to the batch (but only if they don't + * exist and haven't been added yet). + */ +static int walk_path(struct path_walk_context *ctx, + const char *path) +{ + struct type_and_oid_list *list; + int ret = 0; + + list = strmap_get(&ctx->paths_to_lists, path); + + /* Evaluate function pointer on this data. */ + ret = ctx->info->path_fn(path, &list->oids, list->type, + ctx->info->path_fn_data); + + /* Expand data for children. */ + if (list->type == OBJ_TREE) { + for (size_t i = 0; i < list->oids.nr; i++) { + ret |= add_children(ctx, + path, + &list->oids.oid[i]); + } + } + + oid_array_clear(&list->oids); + strmap_remove(&ctx->paths_to_lists, path, 1); + return ret; +} + +static void clear_strmap(struct strmap *map) +{ + struct hashmap_iter iter; + struct strmap_entry *e; + + hashmap_for_each_entry(&map->map, &iter, e, ent) { + struct type_and_oid_list *list = e->value; + oid_array_clear(&list->oids); + } + strmap_clear(map, 1); + strmap_init(map); +} + +/** + * Given the configuration of 'info', walk the commits based on 'info->revs' and + * call 'info->path_fn' on each discovered path. + * + * Returns nonzero on an error. + */ +int walk_objects_by_path(struct path_walk_info *info) +{ + int ret = 0; + size_t commits_nr = 0, paths_nr = 0; + struct commit *c; + struct type_and_oid_list *list; + struct path_walk_context ctx = { + .repo = info->revs->repo, + .revs = info->revs, + .info = info, + .path_stack = STRING_LIST_INIT_DUP, + .paths_to_lists = STRMAP_INIT + }; + + trace2_region_enter("path-walk", "commit-walk", info->revs->repo); + + /* Insert a single list for the root tree into the paths. */ + CALLOC_ARRAY(list, 1); + list->type = OBJ_TREE; + strmap_put(&ctx.paths_to_lists, "", list); + + if (prepare_revision_walk(info->revs)) + die(_("failed to setup revision walk")); + + while ((c = get_revision(info->revs))) + { + struct object_id *oid = get_commit_tree_oid(c); + struct tree *t = lookup_tree(info->revs->repo, oid); + + if (t && (c->object.flags & UNINTERESTING)) + t->object.flags |= UNINTERESTING; + + oid_array_append(&list->oids, oid); + } + + trace2_data_intmax("path-walk", ctx.repo, "commits", commits_nr); + trace2_region_leave("path-walk", "commit-walk", info->revs->repo); + + string_list_append(&ctx.path_stack, ""); + + while (!ret && ctx.path_stack.nr) { + char *path = ctx.path_stack.items[ctx.path_stack.nr - 1].string; + ctx.path_stack.nr--; + paths_nr++; + + ret = walk_path(&ctx, path); + + free(path); + } + trace2_data_intmax("path-walk", ctx.repo, "paths", paths_nr); + + clear_strmap(&ctx.paths_to_lists); + string_list_clear(&ctx.path_stack, 0); + return ret; +} diff --git a/path-walk.h b/path-walk.h new file mode 100644 index 00000000000000..c9e94a98bc8f6b --- /dev/null +++ b/path-walk.h @@ -0,0 +1,43 @@ +/* + * path-walk.h : Methods and structures for walking the object graph in batches + * by the paths that can reach those objects. + */ +#include "object.h" /* Required for 'enum object_type'. */ + +struct rev_info; +struct oid_array; + +/** + * The type of a function pointer for the method that is called on a list of + * objects reachable at a given path. + */ +typedef int (*path_fn)(const char *path, + struct oid_array *oids, + enum object_type type, + void *data); + +struct path_walk_info { + /** + * revs provides the definitions for the commit walk, including + * which commits are UNINTERESTING or not. + */ + struct rev_info *revs; + + /** + * The caller wishes to execute custom logic on objects reachable at a + * given path. Every reachable object will be visited exactly once, and + * the first path to see an object wins. This may not be a stable choice. + */ + path_fn path_fn; + void *path_fn_data; +}; + +#define PATH_WALK_INFO_INIT { 0 } + +/** + * Given the configuration of 'info', walk the commits based on 'info->revs' and + * call 'info->path_fn' on each discovered path. + * + * Returns nonzero on an error. + */ +int walk_objects_by_path(struct path_walk_info *info);