From: Stefan Sperling Subject: Re: faster history traversal for 'got blame' To: Martin Pieuchot Cc: gameoftrees@openbsd.org Date: Tue, 7 Jan 2020 18:57:03 +0100 On Tue, Jan 07, 2020 at 01:02:13PM +0100, Martin Pieuchot wrote: > got_packidx_init_hdr() is also taking a lot of CPU time. Is it because > of the SHA operations? Is git using the sames? Can they be improved? The answer is that the pack index caching code is broken. This function was called way too often: 749 times for 14 pack files. The diff below fixes this bug and reduces this function to 2.2%. Updated profile graph attached. However, this patch only has a very small impact on performance. > Another expensive function is got_object_id_cmp() which compares SHA > hashes. It is called often via RB-tree insert. Is a RB-tree necessary? > In this use case you have ~20K insertions and ~40k search that both > result in ~3.4M comparisons. Would another data structure result in > fewer comparisons? It was a TAILQ initially, which was slow. Then it was changed to 256 lists indexed by the first byte of the hash. Both were much slower than RB-tree. I don't think we have a better data structure in queue.h or tree.h. The basic problem is searching through a set of byte strings (20 bytes each). Perhaps there is something else we could use but it might be difficult to find something that performs much better than a tree. > And maybe it is possible to traverse commits via a different linking. > Isn't a parent/child linking available? Could that be used in some cases? In data on disk the links point one way only. I built the graph with both links initially but I didn't find a need for traversal in the other direction so that code was removed. diff refs/heads/master refs/heads/packidx-cache blob - c6a59fb3b391d5b454b14cefecfaac891dd8e32e blob + 888f9dd8b23cf061c2e0cca1e727db4c9f4a4f1b --- include/got_error.h +++ include/got_error.h @@ -129,6 +129,7 @@ #define GOT_ERR_REF_NAME_MINUS 113 #define GOT_ERR_GITCONFIG_SYNTAX 114 #define GOT_ERR_REBASE_OUT_OF_DATE 115 +#define GOT_ERR_CACHE_DUP_ENTRY 116 static const struct got_error { int code; @@ -265,6 +266,7 @@ static const struct got_error { { GOT_ERR_GITCONFIG_SYNTAX, "gitconfig syntax error" }, { GOT_ERR_REBASE_OUT_OF_DATE, "work tree must be updated before it " "can be used to rebase a branch" }, + { GOT_ERR_CACHE_DUP_ENTRY, "duplicate cache entry" }, }; /* blob - 93f1734b814bde4b84f85106f95c3646947c45d8 blob + fe655627724bb2f95ad79457b9a0e7ce0e64140e --- lib/got_lib_repository.h +++ lib/got_lib_repository.h @@ -67,8 +67,6 @@ const struct got_error*got_repo_cache_tag(struct got_r struct got_object_id *, struct got_tag_object *); struct got_tag_object *got_repo_get_cached_tag(struct got_repository *, struct got_object_id *); -const struct got_error *got_repo_cache_packidx(struct got_repository *, - struct got_packidx *); const struct got_error *got_repo_search_packidx(struct got_packidx **, int *, struct got_repository *, struct got_object_id *); const struct got_error *got_repo_cache_pack(struct got_pack **, blob - 4f98167915dd8661414656d1b92c2624248f1519 blob + 499c7c7c8d19381b481479ba9b78bab2fc685319 --- lib/object.c +++ lib/object.c @@ -307,8 +307,6 @@ open_packed_object(struct got_object **obj, struct got err = read_packed_object_privsep(obj, repo, pack, packidx, idx, id); if (err) goto done; - - err = got_repo_cache_pack(NULL, repo, path_packfile, packidx); done: free(path_packfile); return err; blob - 63d77fece6bad83738a170bef8131abd5a01eeda blob + 358389209a5812144cf90fd8c59a48aee2b1c150 --- lib/repository.c +++ lib/repository.c @@ -784,8 +784,9 @@ done: return err; } -const struct got_error * -got_repo_cache_packidx(struct got_repository *repo, struct got_packidx *packidx) +static const struct got_error * +cache_packidx(struct got_repository *repo, struct got_packidx *packidx, + const char *path_packidx) { const struct got_error *err = NULL; int i; @@ -793,6 +794,10 @@ got_repo_cache_packidx(struct got_repository *repo, st for (i = 0; i < nitems(repo->packidx_cache); i++) { if (repo->packidx_cache[i] == NULL) break; + if (strcmp(repo->packidx_cache[i]->path_packidx, + path_packidx) == 0) { + return got_error(GOT_ERR_CACHE_DUP_ENTRY); + } } if (i == nitems(repo->packidx_cache)) { err = got_packidx_close(repo->packidx_cache[i - 1]); @@ -845,7 +850,12 @@ got_repo_search_packidx(struct got_packidx **packidx, break; *idx = got_packidx_get_object_idx(repo->packidx_cache[i], id); if (*idx != -1) { + struct got_packidx *p; + /* Move matched cache entry to the front. */ + p = repo->packidx_cache[0]; *packidx = repo->packidx_cache[i]; + repo->packidx_cache[0] = *packidx; + repo->packidx_cache[i] = p; return NULL; } } @@ -865,6 +875,8 @@ got_repo_search_packidx(struct got_packidx **packidx, } while ((dent = readdir(packdir)) != NULL) { + int is_cached = 0; + if (!is_packidx_filename(dent->d_name, dent->d_namlen)) continue; @@ -874,7 +886,27 @@ got_repo_search_packidx(struct got_packidx **packidx, goto done; } + for (i = 0; i < nitems(repo->packidx_cache); i++) { + if (repo->packidx_cache[i] == NULL) + break; + if (strcmp(repo->packidx_cache[i]->path_packidx, + path_packidx) == 0) { + is_cached = 1; + break; + } + } + if (is_cached) { + free(path_packidx); + continue; /* already searched */ + } + err = got_packidx_open(packidx, path_packidx, 0); + if (err) { + free(path_packidx); + goto done; + } + + err = cache_packidx(repo, *packidx, path_packidx); free(path_packidx); if (err) goto done; @@ -882,14 +914,8 @@ got_repo_search_packidx(struct got_packidx **packidx, *idx = got_packidx_get_object_idx(*packidx, id); if (*idx != -1) { err = NULL; /* found the object */ - err = got_repo_cache_packidx(repo, *packidx); goto done; } - - err = got_packidx_close(*packidx); - *packidx = NULL; - if (err) - goto done; } err = got_error_no_obj(id); @@ -959,7 +985,7 @@ got_repo_cache_pack(struct got_pack **packp, struct go if (pack->path_packfile == NULL) break; if (strcmp(pack->path_packfile, path_packfile) == 0) - return NULL; + return got_error(GOT_ERR_CACHE_DUP_ENTRY); } if (i == nitems(repo->packs) - 1) {