"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: speed up tree-entry parsing
To:
gameoftrees@openbsd.org
Date:
Wed, 18 May 2022 13:13:10 +0200

Download raw body.

Thread
On Wed, May 18, 2022 at 09:58:28AM +0200, Stefan Sperling wrote:
> Slightly teaked diff: It is better to grow the array when we are about
> to add an entry which would overflow the array, rather than growing the
> array after adding a new entry. Avoids growing the array unnecessarily
> in case the number of entries equals the number pre-allocated slots.

And this new version avoids use of a pathlist entirely.
The previous version still used a pathlist to check for duplicate
tree entries.

Instead, we can do this check after mergesort() by iterating over
the array. This avoids some malloc/free and is slightly faster.

I have another diff which builds on top of this coming soon.

diff refs/heads/main refs/heads/parse-tree-array
blob - adbb8bb3e8521199cb1e1ee0ae5ad5dce3f3503d
blob + 79b6e83d9872c4ee03472131af53beb3186ab6be
--- lib/got_lib_object_parse.h
+++ lib/got_lib_object_parse.h
@@ -23,18 +23,14 @@ struct got_tree_entry *got_alloc_tree_entry_partial(vo
 const struct got_error *got_object_parse_commit(struct got_commit_object **,
     char *, size_t);
 
-/*
- * In a pathlist returned by got_object_parse_tree(), a pointer to the entry's
- * name is stored in the pathlist element's path, while the pathlist element's
- * data pointer points to a struct got_parsed_tree_entry.
- */
 struct got_parsed_tree_entry {
+	const char *name; /* Points to name in parsed buffer */
+	size_t namelen; /* strlen(name) */
 	mode_t mode; /* Mode parsed from tree buffer. */
 	uint8_t *id; /* Points to ID in parsed tree buffer. */
 };
-const struct got_error *got_object_parse_tree(struct got_pathlist_head *, int *,
-    uint8_t *, size_t);
-void got_object_parsed_tree_entries_free(struct got_pathlist_head *);
+const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **,
+    int *, uint8_t *, size_t);
 
 const struct got_error *got_object_parse_tag(struct got_tag_object **,
     uint8_t *, size_t);
blob - b310a5120a14c05adbe049177d4af040d61f1ab9
blob + c6937820f54e6d66c1cbf81056a6b916516c9c5d
--- lib/got_lib_privsep.h
+++ lib/got_lib_privsep.h
@@ -662,8 +662,9 @@ const struct got_error *got_privsep_recv_commit(struct
     struct imsgbuf *);
 const struct got_error *got_privsep_recv_tree(struct got_tree_object **,
     struct imsgbuf *);
+struct got_parsed_tree_entry;
 const struct got_error *got_privsep_send_tree(struct imsgbuf *,
-    struct got_pathlist_head *, int);
+    struct got_parsed_tree_entry *, int);
 const struct got_error *got_privsep_send_blob(struct imsgbuf *, size_t, size_t,
     const uint8_t *);
 const struct got_error *got_privsep_recv_blob(uint8_t **, size_t *, size_t *,
blob - 511053239b0a892dbb0de04e93ccd9ed8e69e7b8
blob + 15e34b7d7b360b66d71d7201ab317ddee510e1c9
--- lib/object_parse.c
+++ lib/object_parse.c
@@ -669,89 +669,90 @@ got_object_tree_close(struct got_tree_object *tree)
 }
 
 static const struct got_error *
-parse_tree_entry(struct got_parsed_tree_entry **pte, const char **name,
-    size_t *elen, char *buf,
+parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf,
     size_t maxlen)
 {
 	char *p, *space;
-	const struct got_error *err = NULL;
 
-	*name = NULL;
 	*elen = 0;
 
-	*pte = malloc(sizeof(**pte));
-	if (*pte == NULL)
-		return got_error_from_errno("malloc");
-
 	*elen = strnlen(buf, maxlen) + 1;
-	if (*elen > maxlen) {
-		free(*pte);
-		*pte = NULL;
+	if (*elen > maxlen)
 		return got_error(GOT_ERR_BAD_OBJ_DATA);
-	}
 
 	space = memchr(buf, ' ', *elen);
-	if (space == NULL || space <= buf) {
-		err = got_error(GOT_ERR_BAD_OBJ_DATA);
-		free(*pte);
-		*pte = NULL;
-		return err;
-	}
-	(*pte)->mode = 0;
+	if (space == NULL || space <= buf)
+		return got_error(GOT_ERR_BAD_OBJ_DATA);
+
+	pte->mode = 0;
 	p = buf;
 	while (p < space) {
-		if (*p < '0' && *p > '7') {
-			err = got_error(GOT_ERR_BAD_OBJ_DATA);
-			goto done;
-		}
-		(*pte)->mode <<= 3;
-		(*pte)->mode |= *p - '0';
+		if (*p < '0' && *p > '7')
+			return got_error(GOT_ERR_BAD_OBJ_DATA);
+		pte->mode <<= 3;
+		pte->mode |= *p - '0';
 		p++;
 	}
 
-	if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) {
-		err = got_error(GOT_ERR_BAD_OBJ_DATA);
-		goto done;
-	}
-	*name = space + 1;
+	if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH)
+		return got_error(GOT_ERR_BAD_OBJ_DATA);
+
+	pte->name = space + 1;
+	pte->namelen = strlen(pte->name);
 	buf += *elen;
-	(*pte)->id = buf;
+	pte->id = buf;
 	*elen += SHA1_DIGEST_LENGTH;
-done:
-	if (err) {
-		free(*pte);
-		*pte = NULL;
-	}
-	return err;
+	return NULL;
 }
 
+static int
+pte_cmp(const void *pa, const void *pb)
+{
+	const struct got_parsed_tree_entry *a, *b;
+
+	a = (const struct got_parsed_tree_entry *)pa;
+	b = (const struct got_parsed_tree_entry *)pb;
+
+	return got_path_cmp(a->name, b->name, a->namelen, b->namelen);
+}
+
 const struct got_error *
-got_object_parse_tree(struct got_pathlist_head *entries, int *nentries,
+got_object_parse_tree(struct got_parsed_tree_entry **entries, int *nentries,
     uint8_t *buf, size_t len)
 {
 	const struct got_error *err = NULL;
-	size_t remain = len;
+	size_t remain = len, totalloc;
+	const size_t nalloc = 16;
+	struct got_parsed_tree_entry *pte;
+	int i;
 
 	*nentries = 0;
 	if (remain == 0)
 		return NULL; /* tree is empty */
 
+	*entries = calloc(nalloc, sizeof(**entries));
+	if (*entries == NULL)
+		return got_error_from_errno("calloc");
+	totalloc = nalloc;
+
 	while (remain > 0) {
-		struct got_parsed_tree_entry *pte;
-		struct got_pathlist_entry *new = NULL;
-		const char *name;
 		size_t elen;
 
-		err = parse_tree_entry(&pte, &name, &elen, buf, remain);
-		if (err)
-			goto done;
-		err = got_pathlist_insert(&new, entries, name, pte);
-		if (err)
-			goto done;
-		if (new == NULL) {
-			err = got_error(GOT_ERR_TREE_DUP_ENTRY);
-			goto done;
+		if (*nentries >= totalloc) {
+			pte = recallocarray(*entries, totalloc,
+			    totalloc + nalloc, sizeof(**entries));
+			if (pte == NULL) {
+				err = got_error_from_errno("recallocarray");
+				goto done;
+			}
+			*entries = pte;
+			totalloc += nalloc;
 		}
+
+		pte = &(*entries)[*nentries];
+		err = parse_tree_entry(pte, &elen, buf, remain);
+		if (err)
+			goto done;
 		buf += elen;
 		remain -= elen;
 		(*nentries)++;
@@ -761,27 +762,30 @@ got_object_parse_tree(struct got_pathlist_head *entrie
 		err = got_error(GOT_ERR_BAD_OBJ_DATA);
 		goto done;
 	}
+
+	if (*nentries > 1) {
+		mergesort(*entries, *nentries, sizeof(**entries), pte_cmp);
+
+		for (i = 0; i < *nentries - 1; i++) {
+			struct got_parsed_tree_entry *prev = &(*entries)[i];
+			pte = &(*entries)[i + 1];
+			if (got_path_cmp(prev->name, pte->name,
+			    prev->namelen, pte->namelen) == 0) {
+				err = got_error(GOT_ERR_TREE_DUP_ENTRY);
+				break;
+			}
+		}
+	}
 done:
 	if (err) {
-		got_object_parsed_tree_entries_free(entries);
+		free(*entries);
+		*entries = NULL;
 		*nentries = 0;
 	}
 	return err;
 }
 
 void
-got_object_parsed_tree_entries_free(struct got_pathlist_head *entries)
-{
-	struct got_pathlist_entry *pe;
-
-	TAILQ_FOREACH(pe, entries, entry) {
-		struct got_parsed_tree_entry *pte = pe->data;
-		free(pte);
-	}
-	got_pathlist_free(entries);
-}
-
-void
 got_object_tag_close(struct got_tag_object *tag)
 {
 	if (tag->refcnt > 0) {
blob - e7450fd7c2dce123856dba8a632d60b237b54e91
blob + 7b7f894dda9fc261842dfd019c4e9c2f444508a0
--- lib/privsep.c
+++ lib/privsep.c
@@ -1490,14 +1490,13 @@ got_privsep_recv_commit(struct got_commit_object **com
 }
 
 const struct got_error *
-got_privsep_send_tree(struct imsgbuf *ibuf, struct got_pathlist_head *entries,
-    int nentries)
+got_privsep_send_tree(struct imsgbuf *ibuf,
+    struct got_parsed_tree_entry *entries, int nentries)
 {
 	const struct got_error *err = NULL;
 	struct got_imsg_tree_object itree;
-	struct got_pathlist_entry *pe;
 	size_t totlen;
-	int nimsg; /* number of imsg queued in ibuf */
+	int i, nimsg; /* number of imsg queued in ibuf */
 
 	itree.nentries = nentries;
 	if (imsg_compose(ibuf, GOT_IMSG_TREE, 0, 0, -1, &itree, sizeof(itree))
@@ -1506,12 +1505,10 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got
 
 	totlen = sizeof(itree);
 	nimsg = 1;
-	TAILQ_FOREACH(pe, entries, entry) {
-		const char *name = pe->path;
-		struct got_parsed_tree_entry *pte = pe->data;
+	for (i = 0; i < nentries; i++) {
+		struct got_parsed_tree_entry *pte = &entries[i];
 		struct ibuf *wbuf;
-		size_t namelen = strlen(name);
-		size_t len = sizeof(struct got_imsg_tree_entry) + namelen;
+		size_t len = sizeof(struct got_imsg_tree_entry) + pte->namelen;
 
 		if (len > MAX_IMSGSIZE)
 			return got_error(GOT_ERR_NO_SPACE);
@@ -1540,7 +1537,7 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got
 			return err;
 		}
 
-		if (imsg_add(wbuf, name, namelen) == -1) {
+		if (imsg_add(wbuf, pte->name, pte->namelen) == -1) {
 			err = got_error_from_errno("imsg_add TREE_ENTRY");
 			ibuf_free(wbuf);
 			return err;
blob - 20a67c9701816c2474c4d9e546a0c6f4aeb8d0ae
blob + b6d3c895f92db84cc93320a4f869170172782abd
--- libexec/got-read-pack/got-read-pack.c
+++ libexec/got-read-pack/got-read-pack.c
@@ -177,7 +177,7 @@ done:
 }
 
 static const struct got_error *
-open_tree(uint8_t **buf, struct got_pathlist_head *entries, int *nentries,
+open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, int *nentries,
     struct got_pack *pack, struct got_packidx *packidx, int obj_idx,
     struct got_object_id *id, struct got_object_cache *objcache)
 {
@@ -220,14 +220,12 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
 {
 	const struct got_error *err = NULL;
 	struct got_imsg_packed_object iobj;
-	struct got_pathlist_head entries;
+	struct got_parsed_tree_entry *entries = NULL;
 	int nentries = 0;
 	uint8_t *buf = NULL;
 	struct got_object_id id;
 	size_t datalen;
 
-	TAILQ_INIT(&entries);
-
 	datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
 	if (datalen != sizeof(iobj))
 		return got_error(GOT_ERR_PRIVSEP_LEN);
@@ -239,8 +237,8 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
 	if (err)
 		return err;
 
-	err = got_privsep_send_tree(ibuf, &entries, nentries);
-	got_object_parsed_tree_entries_free(&entries);
+	err = got_privsep_send_tree(ibuf, entries, nentries);
+	free(entries);
 	free(buf);
 	if (err) {
 		if (err->code == GOT_ERR_PRIVSEP_PIPE)
@@ -427,28 +425,30 @@ done:
 }
 
 static struct got_parsed_tree_entry *
-find_entry_by_name(struct got_pathlist_head *entries, int nentries,
+find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries,
     const char *name, size_t len)
 {
-	struct got_pathlist_entry *pe;
+	struct got_parsed_tree_entry *pte;
+	int cmp, i;
 
 	/* Note that tree entries are sorted in strncmp() order. */
-	TAILQ_FOREACH(pe, entries, entry) {
-		int cmp = strncmp(pe->path, name, len);
+	for (i = 0; i < nentries; i++) {
+		pte = &entries[i];
+		cmp = strncmp(pte->name, name, len);
 		if (cmp < 0)
 			continue;
 		if (cmp > 0)
 			break;
-		if (pe->path[len] == '\0')
-			return (struct got_parsed_tree_entry *)pe->data;
+		if (pte->name[len] == '\0')
+			return pte;
 	}
 	return NULL;
 }
 
 static const struct got_error *
 tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2,
-    struct got_pathlist_head *entries1, int *nentries1,
-    struct got_pathlist_head *entries2, int *nentries2,
+    struct got_parsed_tree_entry **entries1, int *nentries1,
+    struct got_parsed_tree_entry **entries2, int *nentries2,
     const char *path, struct got_pack *pack, struct got_packidx *packidx,
     struct imsgbuf *ibuf, struct got_object_cache *objcache)
 {
@@ -476,13 +476,13 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
 				continue;
 		}
 
-		pte1 = find_entry_by_name(entries1, *nentries1, seg, seglen);
+		pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen);
 		if (pte1 == NULL) {
 			err = got_error(GOT_ERR_NO_OBJ);
 			break;
 		}
 
-		pte2 = find_entry_by_name(entries2, *nentries2, seg, seglen);
+		pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen);
 		if (pte2 == NULL) {
 			*changed = 1;
 			break;
@@ -516,7 +516,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
 				err = got_error_no_obj(&id1);
 				break;
 			}
-			got_object_parsed_tree_entries_free(entries1);
+			free(*entries1);
 			*nentries1 = 0;
 			free(*buf1);
 			*buf1 = NULL;
@@ -532,7 +532,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
 				err = got_error_no_obj(&id2);
 				break;
 			}
-			got_object_parsed_tree_entries_free(entries2);
+			free(*entries2);
 			*nentries2 = 0;
 			free(*buf2);
 			*buf2 = NULL;
@@ -602,7 +602,7 @@ commit_traversal_request(struct imsg *imsg, struct ims
 	struct got_imsg_packed_object iobj;
 	struct got_object_qid *pid;
 	struct got_commit_object *commit = NULL, *pcommit = NULL;
-	struct got_pathlist_head entries, pentries;
+	struct got_parsed_tree_entry *entries = NULL, *pentries = NULL;
 	int nentries = 0, pnentries = 0;
 	struct got_object_id id;
 	size_t datalen, path_len;
@@ -611,9 +611,6 @@ commit_traversal_request(struct imsg *imsg, struct ims
 	int changed = 0, ncommits = 0, nallocated = 0;
 	struct got_object_id *commit_ids = NULL;
 
-	TAILQ_INIT(&entries);
-	TAILQ_INIT(&pentries);
-
 	datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
 	if (datalen < sizeof(iobj))
 		return got_error(GOT_ERR_PRIVSEP_LEN);
@@ -731,10 +728,12 @@ commit_traversal_request(struct imsg *imsg, struct ims
 			    &entries, &nentries, &pentries, &pnentries, path,
 			    pack, packidx, ibuf, objcache);
 
-			got_object_parsed_tree_entries_free(&entries);
+			free(entries);
+			entries = NULL;
 			nentries = 0;
 			free(buf);
-			got_object_parsed_tree_entries_free(&pentries);
+			free(pentries);
+			pentries = NULL;
 			pnentries = 0;
 			free(pbuf);
 			if (err) {
@@ -771,10 +770,8 @@ done:
 		got_object_commit_close(commit);
 	if (pcommit)
 		got_object_commit_close(pcommit);
-	if (nentries != 0)
-		got_object_parsed_tree_entries_free(&entries);
-	if (pnentries != 0)
-		got_object_parsed_tree_entries_free(&pentries);
+	free(entries);
+	free(pentries);
 	if (err) {
 		if (err->code == GOT_ERR_PRIVSEP_PIPE)
 			err = NULL;
blob - 95e34c068c96ce1a4a00a1ea034531e243919dfd
blob + 225f928a01e58925e0d8bcb3e1ffb1a2cc09d631
--- libexec/got-read-tree/got-read-tree.c
+++ libexec/got-read-tree/got-read-tree.c
@@ -50,7 +50,7 @@ catch_sigint(int signo)
 }
 
 static const struct got_error *
-read_tree_object(struct got_pathlist_head *entries, int *nentries,
+read_tree_object(struct got_parsed_tree_entry **entries, int *nentries,
     uint8_t **p, FILE *f, struct got_object_id *expected_id)
 {
 	const struct got_error *err = NULL;
@@ -119,13 +119,11 @@ main(int argc, char *argv[])
 	for (;;) {
 		struct imsg imsg;
 		FILE *f = NULL;
-		struct got_pathlist_head entries;
+		struct got_parsed_tree_entry *entries = NULL;
 		int nentries = 0;
 		uint8_t *buf = NULL;
 		struct got_object_id expected_id;
 
-		TAILQ_INIT(&entries);
-
 		if (sigint_received) {
 			err = got_error(GOT_ERR_CANCELLED);
 			break;
@@ -170,9 +168,9 @@ main(int argc, char *argv[])
 		if (err)
 			goto done;
 
-		err = got_privsep_send_tree(&ibuf, &entries, nentries);
+		err = got_privsep_send_tree(&ibuf, entries, nentries);
 done:
-		got_object_parsed_tree_entries_free(&entries);
+		free(entries);
 		free(buf);
 		if (f) {
 			if (fclose(f) == EOF && err == NULL)