"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: faster history traversal for 'got blame'
To:
Martin Pieuchot <mpi@openbsd.org>
Cc:
gameoftrees@openbsd.org
Date:
Tue, 7 Jan 2020 18:57:03 +0100

Download raw body.

Thread
On Tue, Jan 07, 2020 at 01:02:13PM +0100, Martin Pieuchot wrote:
> got_packidx_init_hdr() is also taking a lot of CPU time.  Is it because
> of the SHA operations?  Is git using the sames?  Can they be improved?

The answer is that the pack index caching code is broken.
This function was called way too often: 749 times for 14 pack files.
The diff below fixes this bug and reduces this function to 2.2%.
Updated profile graph attached.
However, this patch only has a very small impact on performance.

> Another expensive function is got_object_id_cmp() which compares SHA
> hashes.  It is called often via RB-tree insert.  Is a RB-tree necessary?
> In this use case you have ~20K insertions and ~40k search that both
> result in ~3.4M comparisons.  Would another data structure result in
> fewer comparisons?

It was a TAILQ initially, which was slow. Then it was changed to 256 lists
indexed by the first byte of the hash. Both were much slower than RB-tree.
I don't think we have a better data structure in queue.h or tree.h.

The basic problem is searching through a set of byte strings (20 bytes each).
Perhaps there is something else we could use but it might be difficult to
find something that performs much better than a tree.

> And maybe it is possible to traverse commits via a different linking.
> Isn't a parent/child linking available?  Could that be used in some cases?

In data on disk the links point one way only. I built the graph with
both links initially but I didn't find a need for traversal in the
other direction so that code was removed.


diff refs/heads/master refs/heads/packidx-cache
blob - c6a59fb3b391d5b454b14cefecfaac891dd8e32e
blob + 888f9dd8b23cf061c2e0cca1e727db4c9f4a4f1b
--- include/got_error.h
+++ include/got_error.h
@@ -129,6 +129,7 @@
 #define GOT_ERR_REF_NAME_MINUS	113
 #define GOT_ERR_GITCONFIG_SYNTAX 114
 #define GOT_ERR_REBASE_OUT_OF_DATE 115
+#define GOT_ERR_CACHE_DUP_ENTRY	116
 
 static const struct got_error {
 	int code;
@@ -265,6 +266,7 @@ static const struct got_error {
 	{ GOT_ERR_GITCONFIG_SYNTAX, "gitconfig syntax error" },
 	{ GOT_ERR_REBASE_OUT_OF_DATE, "work tree must be updated before it "
 	    "can be used to rebase a branch" },
+	{ GOT_ERR_CACHE_DUP_ENTRY, "duplicate cache entry" },
 };
 
 /*
blob - 93f1734b814bde4b84f85106f95c3646947c45d8
blob + fe655627724bb2f95ad79457b9a0e7ce0e64140e
--- lib/got_lib_repository.h
+++ lib/got_lib_repository.h
@@ -67,8 +67,6 @@ const struct got_error*got_repo_cache_tag(struct got_r
     struct got_object_id *, struct got_tag_object *);
 struct got_tag_object *got_repo_get_cached_tag(struct got_repository *,
     struct got_object_id *);
-const struct got_error *got_repo_cache_packidx(struct got_repository *,
-    struct got_packidx *);
 const struct got_error *got_repo_search_packidx(struct got_packidx **, int *,
     struct got_repository *, struct got_object_id *);
 const struct got_error *got_repo_cache_pack(struct got_pack **,
blob - 4f98167915dd8661414656d1b92c2624248f1519
blob + 499c7c7c8d19381b481479ba9b78bab2fc685319
--- lib/object.c
+++ lib/object.c
@@ -307,8 +307,6 @@ open_packed_object(struct got_object **obj, struct got
 	err = read_packed_object_privsep(obj, repo, pack, packidx, idx, id);
 	if (err)
 		goto done;
-
-	err = got_repo_cache_pack(NULL, repo, path_packfile, packidx);
 done:
 	free(path_packfile);
 	return err;
blob - 63d77fece6bad83738a170bef8131abd5a01eeda
blob + 358389209a5812144cf90fd8c59a48aee2b1c150
--- lib/repository.c
+++ lib/repository.c
@@ -784,8 +784,9 @@ done:
 	return err;
 }
 
-const struct got_error *
-got_repo_cache_packidx(struct got_repository *repo, struct got_packidx *packidx)
+static const struct got_error *
+cache_packidx(struct got_repository *repo, struct got_packidx *packidx,
+    const char *path_packidx)
 {
 	const struct got_error *err = NULL;
 	int i;
@@ -793,6 +794,10 @@ got_repo_cache_packidx(struct got_repository *repo, st
 	for (i = 0; i < nitems(repo->packidx_cache); i++) {
 		if (repo->packidx_cache[i] == NULL)
 			break;
+		if (strcmp(repo->packidx_cache[i]->path_packidx,
+		    path_packidx) == 0) {
+			return got_error(GOT_ERR_CACHE_DUP_ENTRY);
+		}
 	}
 	if (i == nitems(repo->packidx_cache)) {
 		err = got_packidx_close(repo->packidx_cache[i - 1]);
@@ -845,7 +850,12 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 			break;
 		*idx = got_packidx_get_object_idx(repo->packidx_cache[i], id);
 		if (*idx != -1) {
+			struct got_packidx *p;
+			/* Move matched cache entry to the front. */
+			p = repo->packidx_cache[0];
 			*packidx = repo->packidx_cache[i];
+			repo->packidx_cache[0] = *packidx;
+			repo->packidx_cache[i] = p;
 			return NULL;
 		}
 	}
@@ -865,6 +875,8 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 	}
 
 	while ((dent = readdir(packdir)) != NULL) {
+		int is_cached = 0;
+
 		if (!is_packidx_filename(dent->d_name, dent->d_namlen))
 			continue;
 
@@ -874,7 +886,27 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 			goto done;
 		}
 
+		for (i = 0; i < nitems(repo->packidx_cache); i++) {
+			if (repo->packidx_cache[i] == NULL)
+				break;
+			if (strcmp(repo->packidx_cache[i]->path_packidx,
+			    path_packidx) == 0) {
+				is_cached = 1;
+				break;
+			}
+		}
+		if (is_cached) {
+			free(path_packidx);
+			continue; /* already searched */
+		}
+
 		err = got_packidx_open(packidx, path_packidx, 0);
+		if (err) {
+			free(path_packidx);
+			goto done;
+		}
+
+		err = cache_packidx(repo, *packidx, path_packidx);
 		free(path_packidx);
 		if (err)
 			goto done;
@@ -882,14 +914,8 @@ got_repo_search_packidx(struct got_packidx **packidx, 
 		*idx = got_packidx_get_object_idx(*packidx, id);
 		if (*idx != -1) {
 			err = NULL; /* found the object */
-			err = got_repo_cache_packidx(repo, *packidx);
 			goto done;
 		}
-
-		err = got_packidx_close(*packidx);
-		*packidx = NULL;
-		if (err)
-			goto done;
 	}
 
 	err = got_error_no_obj(id);
@@ -959,7 +985,7 @@ got_repo_cache_pack(struct got_pack **packp, struct go
 		if (pack->path_packfile == NULL)
 			break;
 		if (strcmp(pack->path_packfile, path_packfile) == 0)
-			return NULL;
+			return got_error(GOT_ERR_CACHE_DUP_ENTRY);
 	}
 
 	if (i == nitems(repo->packs) - 1) {