Thread (19 messages) 19 messages, 5 authors, 2022-06-22

Re: [PATCH v3 2/3] dir: cache git_dir's realpath

From: Elijah Newren <hidden>
Date: 2022-05-24 14:32:58

On Fri, May 20, 2022 at 12:29 PM Goss Geppert [off-list ref] wrote:
quoted hunk ↗ jump to hunk
When traversing the directory tree, the realpath of the `git_dir` is
required each time a nested repository is encountered to determine how
the nested repository should be treated.  To prevent excessive resource
usage in pathological cases (e.g. many nested repositories, a long path,
symlinks, etc.), cache the realpath after the first usage until the
traversal is complete.
---
 dir.c | 63 ++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 20 deletions(-)
diff --git a/dir.c b/dir.c
index e45e117875..45b89560fc 100644
--- a/dir.c
+++ b/dir.c
@@ -47,10 +47,22 @@ struct cached_dir {
        struct untracked_cache_dir *ucd;
 };

+/*
+ * Support data structure for the recursing into the directory tree.
+ */
+struct traversal_cache {
+       /*
+        * The realpath for the `git_dir` may be required in the recursion and is
+        * cached for repeated use.
+        */
+       char *real_gitdir;
+};
+
 static enum path_treatment read_directory_recursive(struct dir_struct *dir,
        struct index_state *istate, const char *path, int len,
        struct untracked_cache_dir *untracked,
-       int check_only, int stop_at_first_file, const struct pathspec *pathspec);
+       int check_only, int stop_at_first_file, const struct pathspec *pathspec,
+       struct traversal_cache *cache);
 static int resolve_dtype(int dtype, struct index_state *istate,
                         const char *path, int len);
 struct dirent *readdir_skip_dot_and_dotdot(DIR *dirp)
@@ -1852,7 +1864,8 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
        struct index_state *istate,
        struct untracked_cache_dir *untracked,
        const char *dirname, int len, int baselen, int excluded,
-       const struct pathspec *pathspec)
+       const struct pathspec *pathspec,
+       struct traversal_cache *cache)
 {
        /*
         * WARNING: From this function, you can return path_recurse or you
@@ -1906,13 +1919,13 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
                nested_repo = is_nonbare_repository_dir(&sb);

                if (nested_repo) {
-                       char *real_dirname, *real_gitdir;
+                       char *real_dirname;
                        strbuf_addstr(&sb, ".git");
                        real_dirname = real_pathdup(sb.buf, 1);
-                       real_gitdir = real_pathdup(the_repository->gitdir, 1);
+                       if (!cache->real_gitdir)
+                               cache->real_gitdir = real_pathdup(the_repository->gitdir, 1);

-                       nested_repo = !!strcmp(real_dirname, real_gitdir);
-                       free(real_gitdir);
+                       nested_repo = !!strcmp(real_dirname, cache->real_gitdir);
                        free(real_dirname);
                }
                strbuf_release(&sb);
@@ -1944,7 +1957,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
                                return path_excluded;

                        if (read_directory_recursive(dir, istate, dirname, len,
-                                                    untracked, 1, 1, pathspec) == path_excluded)
+                                                    untracked, 1, 1, pathspec, cache) == path_excluded)
                                return path_excluded;

                        return path_none;
@@ -2045,7 +2058,7 @@ static enum path_treatment treat_directory(struct dir_struct *dir,
        untracked = lookup_untracked(dir->untracked, untracked,
                                     dirname + baselen, len - baselen);
        state = read_directory_recursive(dir, istate, dirname, len, untracked,
-                                        check_only, stop_early, pathspec);
+                                        check_only, stop_early, pathspec, cache);

        /* There are a variety of reasons we may need to fixup the state... */
        if (state == path_excluded) {
@@ -2242,7 +2255,8 @@ static enum path_treatment treat_path_fast(struct dir_struct *dir,
                                           struct index_state *istate,
                                           struct strbuf *path,
                                           int baselen,
-                                          const struct pathspec *pathspec)
+                                          const struct pathspec *pathspec,
+                                          struct traversal_cache *cache)
 {
        /*
         * WARNING: From this function, you can return path_recurse or you
@@ -2264,7 +2278,7 @@ static enum path_treatment treat_path_fast(struct dir_struct *dir,
                 * with check_only set.
                 */
                return read_directory_recursive(dir, istate, path->buf, path->len,
-                                               cdir->ucd, 1, 0, pathspec);
+                                               cdir->ucd, 1, 0, pathspec, cache);
        /*
         * We get path_recurse in the first run when
         * directory_exists_in_index() returns index_nonexistent. We
@@ -2280,13 +2294,14 @@ static enum path_treatment treat_path(struct dir_struct *dir,
                                      struct index_state *istate,
                                      struct strbuf *path,
                                      int baselen,
-                                     const struct pathspec *pathspec)
+                                     const struct pathspec *pathspec,
+                                     struct traversal_cache *cache)
 {
        int has_path_in_index, dtype, excluded;

        if (!cdir->d_name)
                return treat_path_fast(dir, cdir, istate, path,
-                                      baselen, pathspec);
+                                      baselen, pathspec, cache);
        if (is_dot_or_dotdot(cdir->d_name) || !fspathcmp(cdir->d_name, ".git"))
                return path_none;
        strbuf_setlen(path, baselen);
@@ -2348,7 +2363,7 @@ static enum path_treatment treat_path(struct dir_struct *dir,
                strbuf_addch(path, '/');
                return treat_directory(dir, istate, untracked,
                                       path->buf, path->len,
-                                      baselen, excluded, pathspec);
+                                      baselen, excluded, pathspec, cache);
        case DT_REG:
        case DT_LNK:
                if (pathspec &&
@@ -2549,7 +2564,8 @@ static void add_path_to_appropriate_result_list(struct dir_struct *dir,
 static enum path_treatment read_directory_recursive(struct dir_struct *dir,
        struct index_state *istate, const char *base, int baselen,
        struct untracked_cache_dir *untracked, int check_only,
-       int stop_at_first_file, const struct pathspec *pathspec)
+       int stop_at_first_file, const struct pathspec *pathspec,
+       struct traversal_cache *cache)
 {
        /*
         * WARNING: Do NOT recurse unless path_recurse is returned from
@@ -2572,7 +2588,7 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir,
        while (!read_cached_dir(&cdir)) {
                /* check how the file or directory should be treated */
                state = treat_path(dir, untracked, &cdir, istate, &path,
-                                  baselen, pathspec);
+                                  baselen, pathspec, cache);
                dir->visited_paths++;

                if (state > dir_state)
@@ -2587,7 +2603,7 @@ static enum path_treatment read_directory_recursive(struct dir_struct *dir,
                        subdir_state =
                                read_directory_recursive(dir, istate, path.buf,
                                                         path.len, ud,
-                                                        check_only, stop_at_first_file, pathspec);
+                                                        check_only, stop_at_first_file, pathspec, cache);
                        if (subdir_state > dir_state)
                                dir_state = subdir_state;
@@ -2661,7 +2677,8 @@ int check_dir_entry_contains(const struct dir_entry *out, const struct dir_entry
 static int treat_leading_path(struct dir_struct *dir,
                              struct index_state *istate,
                              const char *path, int len,
-                             const struct pathspec *pathspec)
+                             const struct pathspec *pathspec,
+                             struct traversal_cache *cache)
 {
        struct strbuf sb = STRBUF_INIT;
        struct strbuf subdir = STRBUF_INIT;
@@ -2713,7 +2730,8 @@ static int treat_leading_path(struct dir_struct *dir,
                strbuf_reset(&subdir);
                strbuf_add(&subdir, path+prevlen, baselen-prevlen);
                cdir.d_name = subdir.buf;
-               state = treat_path(dir, NULL, &cdir, istate, &sb, prevlen, pathspec);
+
+               state = treat_path(dir, NULL, &cdir, istate, &sb, prevlen, pathspec, cache);

                if (state != path_recurse)
                        break; /* do not recurse into it */
@@ -2986,6 +3004,9 @@ int read_directory(struct dir_struct *dir, struct index_state *istate,
                   const char *path, int len, const struct pathspec *pathspec)
 {
        struct untracked_cache_dir *untracked;
+       struct traversal_cache cache = {
+               .real_gitdir = NULL,
+       };

        trace2_region_enter("dir", "read_directory", istate->repo);
        dir->visited_paths = 0;
@@ -3003,8 +3024,10 @@ int read_directory(struct dir_struct *dir, struct index_state *istate,
                 * e.g. prep_exclude()
                 */
                dir->untracked = NULL;
-       if (!len || treat_leading_path(dir, istate, path, len, pathspec))
-               read_directory_recursive(dir, istate, path, len, untracked, 0, 0, pathspec);
+       if (!len || treat_leading_path(dir, istate, path, len, pathspec, &cache)) {
+               read_directory_recursive(dir, istate, path, len, untracked, 0, 0, pathspec, &cache);
+       }
+       FREE_AND_NULL(cache.real_gitdir);
        QSORT(dir->entries, dir->nr, cmp_dir_entry);
        QSORT(dir->ignored, dir->ignored_nr, cmp_dir_entry);

--
2.36.0
This makes sense as a way to avoid the call to real_pathdup() if
that's what we want, but I feel it's a micro-optimization that isn't
worth this code churn.  I'd rather drop this patch from the series and
just use the real_pathdup() you had before (see
https://lore.kernel.org/git/CABPp-BGXRzYCvyM38dEUvQ125+VtRu++7L9UiRz98u+1=Lov7A@mail.gmail.com/ (local)).
Let's see what Junio says, but I'm hoping he agrees this patch isn't
needed.
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help