"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
optimize tree_path_changed()
To:
gameoftrees@openbsd.org
Date:
Sun, 23 Apr 2023 23:45:45 +0200

Download raw body.

Thread
This patch avoids having to parse and sort all entries of two trees
in order to figure out whether a particular entry has changed.

Instead, we now decode entries one-by-one until we find an entry that
matches the path being logged. We also compare entries in the on-disk
sorting order rather than got_path_cmp() order. As long as the order
is stable (which it is supposed to be on disk) this works as avoids
having to sort thousands of entries over and over when traversing a
large repository like freebsd-ports.git. Other parts of Got still rely
on got_path_cmp() order and are not affected by this change.

ok?

-----------------------------------------------
 when finding changed paths iterate tree entries in on-disk order for speed
 
diff 91db220264b6d9be6d44223dd76ae8eb9bea3641 c61b19b32b39e2c85a64bca8cc9ddad22ed10219
commit - 91db220264b6d9be6d44223dd76ae8eb9bea3641
commit + c61b19b32b39e2c85a64bca8cc9ddad22ed10219
blob - ff0be54ad39998cf5dffd4bd99a222298cd15e06
blob + 746e561381b85c24e2b02e2298cff0dbbf3b9912
--- lib/got_lib_object_parse.h
+++ lib/got_lib_object_parse.h
@@ -32,6 +32,8 @@ const struct got_error *got_object_parse_tree(struct g
 	mode_t mode; /* Mode parsed from tree buffer. */
 	uint8_t *id; /* Points to ID in parsed tree buffer. */
 };
+const struct got_error *got_object_parse_tree_entry(
+    struct got_parsed_tree_entry *, size_t *, char *, size_t);
 const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **,
     size_t *, size_t *, uint8_t *, size_t);
 const struct got_error *got_object_read_tree(struct got_parsed_tree_entry **,
blob - 28536b3cc7243ce070909cda1331a2b7750713e0
blob + 97321f872de069983725c100f1ac2f900bfbe39e
--- lib/object_parse.c
+++ lib/object_parse.c
@@ -761,9 +761,9 @@ static const struct got_error *
 	free(tree);
 }
 
-static const struct got_error *
-parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf,
-    size_t maxlen)
+const struct got_error *
+got_object_parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen,
+    char *buf, size_t maxlen)
 {
 	char *p, *space;
 
@@ -835,7 +835,7 @@ got_object_parse_tree(struct got_parsed_tree_entry **e
 		}
 
 		pte = &(*entries)[*nentries];
-		err = parse_tree_entry(pte, &elen, buf, remain);
+		err = got_object_parse_tree_entry(pte, &elen, buf, remain);
 		if (err)
 			goto done;
 		buf += elen;
blob - 1ff3707be9bef771b3d4084997853ada44c722ac
blob + 47432bc05913abd4ec23ec1b1aed9d66279e7d3c
--- libexec/got-read-pack/got-read-pack.c
+++ libexec/got-read-pack/got-read-pack.c
@@ -185,20 +185,21 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry 
 }
 
 static const struct got_error *
-open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, size_t *nentries,
-    size_t *nentries_alloc, struct got_pack *pack, struct got_packidx *packidx,
-    int obj_idx, struct got_object_id *id, struct got_object_cache *objcache)
+open_tree(uint8_t **buf, size_t *len,
+    struct got_pack *pack, struct got_packidx *packidx, int obj_idx,
+    struct got_object_id *id, struct got_object_cache *objcache)
 {
 	const struct got_error *err = NULL;
 	struct got_object *obj = NULL;
-	size_t len;
+	int cached = 0;
 
 	*buf = NULL;
-	*nentries = 0;
+	*len = 0;
 
 	obj = got_object_cache_get(objcache, id);
 	if (obj) {
 		obj->refcnt++;
+		cached = 1;
 	} else {
 		err = open_object(&obj, pack, packidx, obj_idx, id,
 		    objcache);
@@ -206,14 +207,12 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry 
 			return err;
 	}
 
-	err = got_packfile_extract_object_to_mem(buf, &len, obj, pack);
+	err = got_packfile_extract_object_to_mem(buf, len, obj, pack);
 	if (err)
 		goto done;
 
-	obj->size = len;
-
-	err = got_object_parse_tree(entries, nentries, nentries_alloc,
-	    *buf, len);
+	if (!cached)
+		obj->size = *len;
 done:
 	got_object_close(obj);
 	if (err) {
@@ -232,6 +231,7 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
 	const struct got_error *err = NULL;
 	struct got_imsg_packed_object iobj;
 	uint8_t *buf = NULL;
+	size_t len = 0;
 	struct got_object_id id;
 	size_t datalen;
 
@@ -241,20 +241,24 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
 	memcpy(&iobj, imsg->data, sizeof(iobj));
 	memcpy(&id, &iobj.id, sizeof(id));
 
-	err = open_tree(&buf, entries, nentries, nentries_alloc,
-	    pack, packidx, iobj.idx, &id, objcache);
+	err = open_tree(&buf, &len, pack, packidx, iobj.idx, &id, objcache);
 	if (err)
 		return err;
 
+	err = got_object_parse_tree(entries, nentries, nentries_alloc,
+	    buf, len);
+	if (err)
+		goto done;
+
 	err = got_privsep_send_tree(ibuf, *entries, *nentries);
-	free(buf);
 	if (err) {
 		if (err->code == GOT_ERR_PRIVSEP_PIPE)
 			err = NULL;
 		else
 			got_privsep_send_error(ibuf, err);
 	}
-
+done:
+	free(buf);
 	return err;
 }
 
@@ -432,41 +436,23 @@ static struct got_parsed_tree_entry *
 	return err;
 }
 
-static struct got_parsed_tree_entry *
-find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries,
-    const char *name, size_t len)
-{
-	struct got_parsed_tree_entry *pte;
-	int cmp, i;
-
-	/* Note that tree entries are sorted in strncmp() order. */
-	for (i = 0; i < nentries; i++) {
-		pte = &entries[i];
-		cmp = strncmp(pte->name, name, len);
-		if (cmp < 0)
-			continue;
-		if (cmp > 0)
-			break;
-		if (pte->name[len] == '\0')
-			return pte;
-	}
-	return NULL;
-}
-
 static const struct got_error *
-tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2,
-    struct got_parsed_tree_entry **entries1, size_t *nentries1,
-    size_t *nentries_alloc1,
-    struct got_parsed_tree_entry **entries2, size_t *nentries2,
-    size_t *nentries_alloc2,
-    const char *path, struct got_pack *pack, struct got_packidx *packidx,
+tree_path_changed(int *changed, uint8_t **buf1, size_t *len1,
+    uint8_t **buf2, size_t *len2, const char *path,
+    struct got_pack *pack, struct got_packidx *packidx,
     struct imsgbuf *ibuf, struct got_object_cache *objcache)
 {
 	const struct got_error *err = NULL;
-	struct got_parsed_tree_entry *pte1 = NULL, *pte2 = NULL;
+	struct got_parsed_tree_entry pte1, pte2;
 	const char *seg, *s;
 	size_t seglen;
+	size_t remain1 = *len1, remain2 = *len2, elen;
+	uint8_t *next_entry1 = *buf1;
+	uint8_t *next_entry2 = *buf2;
 
+	memset(&pte1, 0, sizeof(pte1));
+	memset(&pte2, 0, sizeof(pte2));
+
 	*changed = 0;
 
 	/* We not do support comparing the root path. */
@@ -486,24 +472,67 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
 				continue;
 		}
 
-		pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen);
-		if (pte1 == NULL) {
+		/*
+		 * As an optimization we compare entries in on-disk order
+		 * rather than in got_path_cmp() order. We only need to
+		 * find out if any entries differ. Parsing all entries and
+		 * sorting them slows us down significantly when tree objects
+		 * have thousands of entries. We can assume that on-disk entry
+		 * ordering is stable, as per got_object_tree_create() and
+		 * sort_tree_entries_the_way_git_likes_it(). Other orderings
+		 * are incompatible with Git and would yield false positives
+		 * here, too.
+		 */
+		while (remain1 > 0) {
+			err = got_object_parse_tree_entry(&pte1, &elen,
+			    next_entry1, remain1);
+			if (err)
+				return err;
+			next_entry1 += elen;
+			remain1 -= elen;
+			if (strncmp(pte1.name, seg, seglen) != 0 ||
+			    pte1.name[seglen] != '\0') {
+				memset(&pte1, 0, sizeof(pte1));
+				continue;
+			} else
+				break;
+		}
+		if (pte1.name == NULL) {
 			err = got_error(GOT_ERR_NO_OBJ);
 			break;
 		}
 
-		pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen);
-		if (pte2 == NULL) {
+		if (remain2 == 0) {
 			*changed = 1;
 			break;
 		}
 
-		if (pte1->mode != pte2->mode) {
+		while (remain2 > 0) {
+			err = got_object_parse_tree_entry(&pte2, &elen,
+			    next_entry2, remain2);
+			if (err)
+				return err;
+			next_entry2 += elen;
+			remain2 -= elen;
+			if (strncmp(pte2.name, seg, seglen) != 0 ||
+			    pte2.name[seglen] != '\0') {
+				memset(&pte2, 0, sizeof(pte2));
+				continue;
+			} else
+				break;
+		}
+
+		if (pte2.name == NULL) {
 			*changed = 1;
 			break;
 		}
 
-		if (memcmp(pte1->id, pte2->id, SHA1_DIGEST_LENGTH) == 0) {
+		if (pte1.mode != pte2.mode) {
+			*changed = 1;
+			break;
+		}
+
+		if (memcmp(pte1.id, pte2.id, SHA1_DIGEST_LENGTH) == 0) {
 			*changed = 0;
 			break;
 		}
@@ -520,37 +549,37 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
 			struct got_object_id id1, id2;
 			int idx;
 
-			memcpy(id1.sha1, pte1->id, SHA1_DIGEST_LENGTH);
+			memcpy(id1.sha1, pte1.id, SHA1_DIGEST_LENGTH);
 			idx = got_packidx_get_object_idx(packidx, &id1);
 			if (idx == -1) {
 				err = got_error_no_obj(&id1);
 				break;
 			}
-			*nentries1 = 0;
 			free(*buf1);
 			*buf1 = NULL;
-			err = open_tree(buf1, entries1, nentries1,
-			    nentries_alloc1, pack, packidx, idx, &id1,
+			err = open_tree(buf1, len1, pack, packidx, idx, &id1,
 			    objcache);
-			pte1 = NULL;
+			memset(&pte1, 0, sizeof(pte1));
 			if (err)
 				break;
+			next_entry1 = *buf1;
+			remain1 = *len1;
 
-			memcpy(id2.sha1, pte2->id, SHA1_DIGEST_LENGTH);
+			memcpy(id2.sha1, pte2.id, SHA1_DIGEST_LENGTH);
 			idx = got_packidx_get_object_idx(packidx, &id2);
 			if (idx == -1) {
 				err = got_error_no_obj(&id2);
 				break;
 			}
-			*nentries2 = 0;
 			free(*buf2);
 			*buf2 = NULL;
-			err = open_tree(buf2, entries2, nentries2,
-			    nentries_alloc2, pack, packidx, idx, &id2,
+			err = open_tree(buf2, len2, pack, packidx, idx, &id2,
 			    objcache);
-			pte2 = NULL;
+			memset(&pte2, 0, sizeof(pte2));
 			if (err)
 				break;
+			next_entry2 = *buf2;
+			remain2 = *len2;
 		}
 	}
 
@@ -606,9 +635,6 @@ commit_traversal_request(struct imsg *imsg, struct ims
 	struct got_imsg_commit_traversal_request ctreq;
 	struct got_object_qid *pid;
 	struct got_commit_object *commit = NULL, *pcommit = NULL;
-	struct got_parsed_tree_entry *entries = NULL, *pentries = NULL;
-	size_t nentries = 0, nentries_alloc = 0;
-	size_t pnentries = 0, pnentries_alloc = 0;
 	struct got_object_id id;
 	size_t datalen;
 	char *path = NULL;
@@ -707,6 +733,7 @@ commit_traversal_request(struct imsg *imsg, struct ims
 		} else {
 			int pidx;
 			uint8_t *buf = NULL, *pbuf = NULL;
+			size_t len = 0, plen = 0;
 
 			idx = got_packidx_get_object_idx(packidx,
 			    commit->tree_id);
@@ -717,27 +744,23 @@ commit_traversal_request(struct imsg *imsg, struct ims
 			if (pidx == -1)
 				break;
 
-			err = open_tree(&buf, &entries, &nentries,
-			    &nentries_alloc, pack, packidx, idx,
+			err = open_tree(&buf, &len, pack, packidx, idx,
 			    commit->tree_id, objcache);
 			if (err)
 				goto done;
-			err = open_tree(&pbuf, &pentries, &pnentries,
-			    &pnentries_alloc, pack, packidx, pidx,
+
+			err = open_tree(&pbuf, &plen, pack, packidx, pidx,
 			    pcommit->tree_id, objcache);
 			if (err) {
 				free(buf);
 				goto done;
 			}
 
-			err = tree_path_changed(&changed, &buf, &pbuf,
-			    &entries, &nentries, &nentries_alloc,
-			    &pentries, &pnentries, &pnentries_alloc,
-			    path, pack, packidx, ibuf, objcache);
+			err = tree_path_changed(&changed, &buf, &len,
+			    &pbuf, &plen, path, pack, packidx, ibuf,
+			    objcache);
 
-			nentries = 0;
 			free(buf);
-			pnentries = 0;
 			free(pbuf);
 			if (err) {
 				if (err->code != GOT_ERR_NO_OBJ)
@@ -774,8 +797,6 @@ done:
 		got_object_commit_close(commit);
 	if (pcommit)
 		got_object_commit_close(pcommit);
-	free(entries);
-	free(pentries);
 	if (err) {
 		if (err->code == GOT_ERR_PRIVSEP_PIPE)
 			err = NULL;
@@ -1185,6 +1206,7 @@ enumerate_tree(int *have_all_entries, struct imsgbuf *
 	struct got_object_id_queue ids;
 	struct got_object_qid *qid;
 	uint8_t *buf = NULL;
+	size_t len = 0;
 	struct got_parsed_tree_entry *entries = NULL;
 	size_t nentries = 0, nentries_alloc = 0, i;
 	struct enumerated_tree *tree;
@@ -1225,13 +1247,18 @@ enumerate_tree(int *have_all_entries, struct imsgbuf *
 			break;
 		}
 
-		err = open_tree(&buf, &entries, &nentries, &nentries_alloc,
-		    pack, packidx, idx, &qid->id, objcache);
+		err = open_tree(&buf, &len, pack, packidx, idx, &qid->id,
+		    objcache);
 		if (err) {
 			if (err->code != GOT_ERR_NO_OBJ)
 				goto done;
 		}
 
+		err = got_object_parse_tree(&entries, &nentries,
+		    &nentries_alloc, buf, len);
+		if (err)
+			goto done;
+
 		err = got_object_idset_add(idset, &qid->id, NULL);
 		if (err)
 			goto done;