Download raw body.
faster history traversal for 'got blame'
On Tue, Jan 07, 2020 at 01:02:13PM +0100, Martin Pieuchot wrote:
> got_packidx_init_hdr() is also taking a lot of CPU time. Is it because
> of the SHA operations? Is git using the sames? Can they be improved?
The answer is that the pack index caching code is broken.
This function was called way too often: 749 times for 14 pack files.
The diff below fixes this bug and reduces this function to 2.2%.
Updated profile graph attached.
However, this patch only has a very small impact on performance.
> Another expensive function is got_object_id_cmp() which compares SHA
> hashes. It is called often via RB-tree insert. Is a RB-tree necessary?
> In this use case you have ~20K insertions and ~40k search that both
> result in ~3.4M comparisons. Would another data structure result in
> fewer comparisons?
It was a TAILQ initially, which was slow. Then it was changed to 256 lists
indexed by the first byte of the hash. Both were much slower than RB-tree.
I don't think we have a better data structure in queue.h or tree.h.
The basic problem is searching through a set of byte strings (20 bytes each).
Perhaps there is something else we could use but it might be difficult to
find something that performs much better than a tree.
> And maybe it is possible to traverse commits via a different linking.
> Isn't a parent/child linking available? Could that be used in some cases?
In data on disk the links point one way only. I built the graph with
both links initially but I didn't find a need for traversal in the
other direction so that code was removed.
diff refs/heads/master refs/heads/packidx-cache
blob - c6a59fb3b391d5b454b14cefecfaac891dd8e32e
blob + 888f9dd8b23cf061c2e0cca1e727db4c9f4a4f1b
--- include/got_error.h
+++ include/got_error.h
@@ -129,6 +129,7 @@
#define GOT_ERR_REF_NAME_MINUS 113
#define GOT_ERR_GITCONFIG_SYNTAX 114
#define GOT_ERR_REBASE_OUT_OF_DATE 115
+#define GOT_ERR_CACHE_DUP_ENTRY 116
static const struct got_error {
int code;
@@ -265,6 +266,7 @@ static const struct got_error {
{ GOT_ERR_GITCONFIG_SYNTAX, "gitconfig syntax error" },
{ GOT_ERR_REBASE_OUT_OF_DATE, "work tree must be updated before it "
"can be used to rebase a branch" },
+ { GOT_ERR_CACHE_DUP_ENTRY, "duplicate cache entry" },
};
/*
blob - 93f1734b814bde4b84f85106f95c3646947c45d8
blob + fe655627724bb2f95ad79457b9a0e7ce0e64140e
--- lib/got_lib_repository.h
+++ lib/got_lib_repository.h
@@ -67,8 +67,6 @@ const struct got_error*got_repo_cache_tag(struct got_r
struct got_object_id *, struct got_tag_object *);
struct got_tag_object *got_repo_get_cached_tag(struct got_repository *,
struct got_object_id *);
-const struct got_error *got_repo_cache_packidx(struct got_repository *,
- struct got_packidx *);
const struct got_error *got_repo_search_packidx(struct got_packidx **, int *,
struct got_repository *, struct got_object_id *);
const struct got_error *got_repo_cache_pack(struct got_pack **,
blob - 4f98167915dd8661414656d1b92c2624248f1519
blob + 499c7c7c8d19381b481479ba9b78bab2fc685319
--- lib/object.c
+++ lib/object.c
@@ -307,8 +307,6 @@ open_packed_object(struct got_object **obj, struct got
err = read_packed_object_privsep(obj, repo, pack, packidx, idx, id);
if (err)
goto done;
-
- err = got_repo_cache_pack(NULL, repo, path_packfile, packidx);
done:
free(path_packfile);
return err;
blob - 63d77fece6bad83738a170bef8131abd5a01eeda
blob + 358389209a5812144cf90fd8c59a48aee2b1c150
--- lib/repository.c
+++ lib/repository.c
@@ -784,8 +784,9 @@ done:
return err;
}
-const struct got_error *
-got_repo_cache_packidx(struct got_repository *repo, struct got_packidx *packidx)
+static const struct got_error *
+cache_packidx(struct got_repository *repo, struct got_packidx *packidx,
+ const char *path_packidx)
{
const struct got_error *err = NULL;
int i;
@@ -793,6 +794,10 @@ got_repo_cache_packidx(struct got_repository *repo, st
for (i = 0; i < nitems(repo->packidx_cache); i++) {
if (repo->packidx_cache[i] == NULL)
break;
+ if (strcmp(repo->packidx_cache[i]->path_packidx,
+ path_packidx) == 0) {
+ return got_error(GOT_ERR_CACHE_DUP_ENTRY);
+ }
}
if (i == nitems(repo->packidx_cache)) {
err = got_packidx_close(repo->packidx_cache[i - 1]);
@@ -845,7 +850,12 @@ got_repo_search_packidx(struct got_packidx **packidx,
break;
*idx = got_packidx_get_object_idx(repo->packidx_cache[i], id);
if (*idx != -1) {
+ struct got_packidx *p;
+ /* Move matched cache entry to the front. */
+ p = repo->packidx_cache[0];
*packidx = repo->packidx_cache[i];
+ repo->packidx_cache[0] = *packidx;
+ repo->packidx_cache[i] = p;
return NULL;
}
}
@@ -865,6 +875,8 @@ got_repo_search_packidx(struct got_packidx **packidx,
}
while ((dent = readdir(packdir)) != NULL) {
+ int is_cached = 0;
+
if (!is_packidx_filename(dent->d_name, dent->d_namlen))
continue;
@@ -874,7 +886,27 @@ got_repo_search_packidx(struct got_packidx **packidx,
goto done;
}
+ for (i = 0; i < nitems(repo->packidx_cache); i++) {
+ if (repo->packidx_cache[i] == NULL)
+ break;
+ if (strcmp(repo->packidx_cache[i]->path_packidx,
+ path_packidx) == 0) {
+ is_cached = 1;
+ break;
+ }
+ }
+ if (is_cached) {
+ free(path_packidx);
+ continue; /* already searched */
+ }
+
err = got_packidx_open(packidx, path_packidx, 0);
+ if (err) {
+ free(path_packidx);
+ goto done;
+ }
+
+ err = cache_packidx(repo, *packidx, path_packidx);
free(path_packidx);
if (err)
goto done;
@@ -882,14 +914,8 @@ got_repo_search_packidx(struct got_packidx **packidx,
*idx = got_packidx_get_object_idx(*packidx, id);
if (*idx != -1) {
err = NULL; /* found the object */
- err = got_repo_cache_packidx(repo, *packidx);
goto done;
}
-
- err = got_packidx_close(*packidx);
- *packidx = NULL;
- if (err)
- goto done;
}
err = got_error_no_obj(id);
@@ -959,7 +985,7 @@ got_repo_cache_pack(struct got_pack **packp, struct go
if (pack->path_packfile == NULL)
break;
if (strcmp(pack->path_packfile, path_packfile) == 0)
- return NULL;
+ return got_error(GOT_ERR_CACHE_DUP_ENTRY);
}
if (i == nitems(repo->packs) - 1) {
faster history traversal for 'got blame'