"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@openbsd.org>
Subject:
Re: speed up tree-entry parsing
To:
Stefan Sperling <stsp@stsp.name>
Cc:
gameoftrees@openbsd.org
Date:
Wed, 18 May 2022 17:12:47 +0200

Download raw body.

Thread
Stefan Sperling <stsp@stsp.name> wrote:
> On Wed, May 18, 2022 at 09:58:28AM +0200, Stefan Sperling wrote:
> > Slightly teaked diff: It is better to grow the array when we are about
> > to add an entry which would overflow the array, rather than growing the
> > array after adding a new entry. Avoids growing the array unnecessarily
> > in case the number of entries equals the number pre-allocated slots.

agreed

> And this new version avoids use of a pathlist entirely.
> The previous version still used a pathlist to check for duplicate
> tree entries.
>
> Instead, we can do this check after mergesort() by iterating over
> the array. This avoids some malloc/free and is slightly faster.
> 
> I have another diff which builds on top of this coming soon.

I haven't done any real runtime testing but it reads fine and regress is
passing.

I have two questions below but the diff reads ok to me as-is :)

> diff refs/heads/main refs/heads/parse-tree-array
> blob - adbb8bb3e8521199cb1e1ee0ae5ad5dce3f3503d
> blob + 79b6e83d9872c4ee03472131af53beb3186ab6be
> --- lib/got_lib_object_parse.h
> +++ lib/got_lib_object_parse.h
> @@ -23,18 +23,14 @@ struct got_tree_entry *got_alloc_tree_entry_partial(vo
>  const struct got_error *got_object_parse_commit(struct got_commit_object **,
>      char *, size_t);
>  
> -/*
> - * In a pathlist returned by got_object_parse_tree(), a pointer to the entry's
> - * name is stored in the pathlist element's path, while the pathlist element's
> - * data pointer points to a struct got_parsed_tree_entry.
> - */
>  struct got_parsed_tree_entry {
> +	const char *name; /* Points to name in parsed buffer */
> +	size_t namelen; /* strlen(name) */
>  	mode_t mode; /* Mode parsed from tree buffer. */
>  	uint8_t *id; /* Points to ID in parsed tree buffer. */
>  };
> -const struct got_error *got_object_parse_tree(struct got_pathlist_head *, int *,
> -    uint8_t *, size_t);
> -void got_object_parsed_tree_entries_free(struct got_pathlist_head *);
> +const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **,
> +    int *, uint8_t *, size_t);
>  
>  const struct got_error *got_object_parse_tag(struct got_tag_object **,
>      uint8_t *, size_t);
> blob - b310a5120a14c05adbe049177d4af040d61f1ab9
> blob + c6937820f54e6d66c1cbf81056a6b916516c9c5d
> --- lib/got_lib_privsep.h
> +++ lib/got_lib_privsep.h
> @@ -662,8 +662,9 @@ const struct got_error *got_privsep_recv_commit(struct
>      struct imsgbuf *);
>  const struct got_error *got_privsep_recv_tree(struct got_tree_object **,
>      struct imsgbuf *);
> +struct got_parsed_tree_entry;
>  const struct got_error *got_privsep_send_tree(struct imsgbuf *,
> -    struct got_pathlist_head *, int);
> +    struct got_parsed_tree_entry *, int);
>  const struct got_error *got_privsep_send_blob(struct imsgbuf *, size_t, size_t,
>      const uint8_t *);
>  const struct got_error *got_privsep_recv_blob(uint8_t **, size_t *, size_t *,
> blob - 511053239b0a892dbb0de04e93ccd9ed8e69e7b8
> blob + 15e34b7d7b360b66d71d7201ab317ddee510e1c9
> --- lib/object_parse.c
> +++ lib/object_parse.c
> @@ -669,89 +669,90 @@ got_object_tree_close(struct got_tree_object *tree)
>  }
>  
>  static const struct got_error *
> -parse_tree_entry(struct got_parsed_tree_entry **pte, const char **name,
> -    size_t *elen, char *buf,
> +parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf,
>      size_t maxlen)
>  {
>  	char *p, *space;
> -	const struct got_error *err = NULL;
>  
> -	*name = NULL;
>  	*elen = 0;
>  
> -	*pte = malloc(sizeof(**pte));
> -	if (*pte == NULL)
> -		return got_error_from_errno("malloc");
> -
>  	*elen = strnlen(buf, maxlen) + 1;
> -	if (*elen > maxlen) {
> -		free(*pte);
> -		*pte = NULL;
> +	if (*elen > maxlen)
>  		return got_error(GOT_ERR_BAD_OBJ_DATA);
> -	}
>  
>  	space = memchr(buf, ' ', *elen);
> -	if (space == NULL || space <= buf) {
> -		err = got_error(GOT_ERR_BAD_OBJ_DATA);
> -		free(*pte);
> -		*pte = NULL;
> -		return err;
> -	}
> -	(*pte)->mode = 0;

this was the same in the previous code too, but why check for space <=
buf?

> +	if (space == NULL || space <= buf)

`space' is either NULL or somewhere between [buf, buf+(*elen)), the <=
is intended just to make sure buf doesn't start with a space right?

> +		return got_error(GOT_ERR_BAD_OBJ_DATA);
> +
> +	pte->mode = 0;
>  	p = buf;
>  	while (p < space) {
> -		if (*p < '0' && *p > '7') {
> -			err = got_error(GOT_ERR_BAD_OBJ_DATA);
> -			goto done;
> -		}
> -		(*pte)->mode <<= 3;
> -		(*pte)->mode |= *p - '0';
> +		if (*p < '0' && *p > '7')
> +			return got_error(GOT_ERR_BAD_OBJ_DATA);
> +		pte->mode <<= 3;
> +		pte->mode |= *p - '0';
>  		p++;
>  	}
>  
> -	if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) {
> -		err = got_error(GOT_ERR_BAD_OBJ_DATA);
> -		goto done;
> -	}
> -	*name = space + 1;
> +	if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH)
> +		return got_error(GOT_ERR_BAD_OBJ_DATA);
> +
> +	pte->name = space + 1;
> +	pte->namelen = strlen(pte->name);
>  	buf += *elen;
> -	(*pte)->id = buf;
> +	pte->id = buf;
>  	*elen += SHA1_DIGEST_LENGTH;
> -done:
> -	if (err) {
> -		free(*pte);
> -		*pte = NULL;
> -	}
> -	return err;
> +	return NULL;
>  }
>  
> +static int
> +pte_cmp(const void *pa, const void *pb)
> +{
> +	const struct got_parsed_tree_entry *a, *b;
> +
> +	a = (const struct got_parsed_tree_entry *)pa;
> +	b = (const struct got_parsed_tree_entry *)pb;

are these cast needed?  It's just a matter of style, I'm just asking to
be sure so future patches will be coherent with the rest of the code.  I
have written some code in got_patch IIRC which uses the implicit cast
from void *

> +	return got_path_cmp(a->name, b->name, a->namelen, b->namelen);
> +}
> +
>  const struct got_error *
> -got_object_parse_tree(struct got_pathlist_head *entries, int *nentries,
> +got_object_parse_tree(struct got_parsed_tree_entry **entries, int *nentries,
>      uint8_t *buf, size_t len)
>  {
>  	const struct got_error *err = NULL;
> -	size_t remain = len;
> +	size_t remain = len, totalloc;
> +	const size_t nalloc = 16;
> +	struct got_parsed_tree_entry *pte;
> +	int i;
>  
>  	*nentries = 0;
>  	if (remain == 0)
>  		return NULL; /* tree is empty */
>  
> +	*entries = calloc(nalloc, sizeof(**entries));
> +	if (*entries == NULL)
> +		return got_error_from_errno("calloc");
> +	totalloc = nalloc;
> +
>  	while (remain > 0) {
> -		struct got_parsed_tree_entry *pte;
> -		struct got_pathlist_entry *new = NULL;
> -		const char *name;
>  		size_t elen;
>  
> -		err = parse_tree_entry(&pte, &name, &elen, buf, remain);
> -		if (err)
> -			goto done;
> -		err = got_pathlist_insert(&new, entries, name, pte);
> -		if (err)
> -			goto done;
> -		if (new == NULL) {
> -			err = got_error(GOT_ERR_TREE_DUP_ENTRY);
> -			goto done;
> +		if (*nentries >= totalloc) {
> +			pte = recallocarray(*entries, totalloc,
> +			    totalloc + nalloc, sizeof(**entries));
> +			if (pte == NULL) {
> +				err = got_error_from_errno("recallocarray");
> +				goto done;
> +			}
> +			*entries = pte;
> +			totalloc += nalloc;
>  		}
> +
> +		pte = &(*entries)[*nentries];
> +		err = parse_tree_entry(pte, &elen, buf, remain);
> +		if (err)
> +			goto done;
>  		buf += elen;
>  		remain -= elen;
>  		(*nentries)++;
> @@ -761,27 +762,30 @@ got_object_parse_tree(struct got_pathlist_head *entrie
>  		err = got_error(GOT_ERR_BAD_OBJ_DATA);
>  		goto done;
>  	}
> +
> +	if (*nentries > 1) {
> +		mergesort(*entries, *nentries, sizeof(**entries), pte_cmp);
> +
> +		for (i = 0; i < *nentries - 1; i++) {
> +			struct got_parsed_tree_entry *prev = &(*entries)[i];
> +			pte = &(*entries)[i + 1];
> +			if (got_path_cmp(prev->name, pte->name,
> +			    prev->namelen, pte->namelen) == 0) {
> +				err = got_error(GOT_ERR_TREE_DUP_ENTRY);
> +				break;
> +			}
> +		}
> +	}
>  done:
>  	if (err) {
> -		got_object_parsed_tree_entries_free(entries);
> +		free(*entries);
> +		*entries = NULL;
>  		*nentries = 0;
>  	}
>  	return err;
>  }
>  
>  void
> -got_object_parsed_tree_entries_free(struct got_pathlist_head *entries)
> -{
> -	struct got_pathlist_entry *pe;
> -
> -	TAILQ_FOREACH(pe, entries, entry) {
> -		struct got_parsed_tree_entry *pte = pe->data;
> -		free(pte);
> -	}
> -	got_pathlist_free(entries);
> -}
> -
> -void
>  got_object_tag_close(struct got_tag_object *tag)
>  {
>  	if (tag->refcnt > 0) {
> blob - e7450fd7c2dce123856dba8a632d60b237b54e91
> blob + 7b7f894dda9fc261842dfd019c4e9c2f444508a0
> --- lib/privsep.c
> +++ lib/privsep.c
> @@ -1490,14 +1490,13 @@ got_privsep_recv_commit(struct got_commit_object **com
>  }
>  
>  const struct got_error *
> -got_privsep_send_tree(struct imsgbuf *ibuf, struct got_pathlist_head *entries,
> -    int nentries)
> +got_privsep_send_tree(struct imsgbuf *ibuf,
> +    struct got_parsed_tree_entry *entries, int nentries)
>  {
>  	const struct got_error *err = NULL;
>  	struct got_imsg_tree_object itree;
> -	struct got_pathlist_entry *pe;
>  	size_t totlen;
> -	int nimsg; /* number of imsg queued in ibuf */
> +	int i, nimsg; /* number of imsg queued in ibuf */
>  
>  	itree.nentries = nentries;
>  	if (imsg_compose(ibuf, GOT_IMSG_TREE, 0, 0, -1, &itree, sizeof(itree))
> @@ -1506,12 +1505,10 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got
>  
>  	totlen = sizeof(itree);
>  	nimsg = 1;
> -	TAILQ_FOREACH(pe, entries, entry) {
> -		const char *name = pe->path;
> -		struct got_parsed_tree_entry *pte = pe->data;
> +	for (i = 0; i < nentries; i++) {
> +		struct got_parsed_tree_entry *pte = &entries[i];
>  		struct ibuf *wbuf;
> -		size_t namelen = strlen(name);
> -		size_t len = sizeof(struct got_imsg_tree_entry) + namelen;
> +		size_t len = sizeof(struct got_imsg_tree_entry) + pte->namelen;
>  
>  		if (len > MAX_IMSGSIZE)
>  			return got_error(GOT_ERR_NO_SPACE);
> @@ -1540,7 +1537,7 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got
>  			return err;
>  		}
>  
> -		if (imsg_add(wbuf, name, namelen) == -1) {
> +		if (imsg_add(wbuf, pte->name, pte->namelen) == -1) {
>  			err = got_error_from_errno("imsg_add TREE_ENTRY");
>  			ibuf_free(wbuf);
>  			return err;
> blob - 20a67c9701816c2474c4d9e546a0c6f4aeb8d0ae
> blob + b6d3c895f92db84cc93320a4f869170172782abd
> --- libexec/got-read-pack/got-read-pack.c
> +++ libexec/got-read-pack/got-read-pack.c
> @@ -177,7 +177,7 @@ done:
>  }
>  
>  static const struct got_error *
> -open_tree(uint8_t **buf, struct got_pathlist_head *entries, int *nentries,
> +open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, int *nentries,
>      struct got_pack *pack, struct got_packidx *packidx, int obj_idx,
>      struct got_object_id *id, struct got_object_cache *objcache)
>  {
> @@ -220,14 +220,12 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
>  {
>  	const struct got_error *err = NULL;
>  	struct got_imsg_packed_object iobj;
> -	struct got_pathlist_head entries;
> +	struct got_parsed_tree_entry *entries = NULL;
>  	int nentries = 0;
>  	uint8_t *buf = NULL;
>  	struct got_object_id id;
>  	size_t datalen;
>  
> -	TAILQ_INIT(&entries);
> -
>  	datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
>  	if (datalen != sizeof(iobj))
>  		return got_error(GOT_ERR_PRIVSEP_LEN);
> @@ -239,8 +237,8 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
>  	if (err)
>  		return err;
>  
> -	err = got_privsep_send_tree(ibuf, &entries, nentries);
> -	got_object_parsed_tree_entries_free(&entries);
> +	err = got_privsep_send_tree(ibuf, entries, nentries);
> +	free(entries);
>  	free(buf);
>  	if (err) {
>  		if (err->code == GOT_ERR_PRIVSEP_PIPE)
> @@ -427,28 +425,30 @@ done:
>  }
>  
>  static struct got_parsed_tree_entry *
> -find_entry_by_name(struct got_pathlist_head *entries, int nentries,
> +find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries,
>      const char *name, size_t len)
>  {
> -	struct got_pathlist_entry *pe;
> +	struct got_parsed_tree_entry *pte;
> +	int cmp, i;
>  
>  	/* Note that tree entries are sorted in strncmp() order. */
> -	TAILQ_FOREACH(pe, entries, entry) {
> -		int cmp = strncmp(pe->path, name, len);
> +	for (i = 0; i < nentries; i++) {
> +		pte = &entries[i];
> +		cmp = strncmp(pte->name, name, len);
>  		if (cmp < 0)
>  			continue;
>  		if (cmp > 0)
>  			break;
> -		if (pe->path[len] == '\0')
> -			return (struct got_parsed_tree_entry *)pe->data;
> +		if (pte->name[len] == '\0')
> +			return pte;
>  	}
>  	return NULL;
>  }
>  
>  static const struct got_error *
>  tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2,
> -    struct got_pathlist_head *entries1, int *nentries1,
> -    struct got_pathlist_head *entries2, int *nentries2,
> +    struct got_parsed_tree_entry **entries1, int *nentries1,
> +    struct got_parsed_tree_entry **entries2, int *nentries2,
>      const char *path, struct got_pack *pack, struct got_packidx *packidx,
>      struct imsgbuf *ibuf, struct got_object_cache *objcache)
>  {
> @@ -476,13 +476,13 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
>  				continue;
>  		}
>  
> -		pte1 = find_entry_by_name(entries1, *nentries1, seg, seglen);
> +		pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen);
>  		if (pte1 == NULL) {
>  			err = got_error(GOT_ERR_NO_OBJ);
>  			break;
>  		}
>  
> -		pte2 = find_entry_by_name(entries2, *nentries2, seg, seglen);
> +		pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen);
>  		if (pte2 == NULL) {
>  			*changed = 1;
>  			break;
> @@ -516,7 +516,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
>  				err = got_error_no_obj(&id1);
>  				break;
>  			}
> -			got_object_parsed_tree_entries_free(entries1);
> +			free(*entries1);
>  			*nentries1 = 0;
>  			free(*buf1);
>  			*buf1 = NULL;
> @@ -532,7 +532,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
>  				err = got_error_no_obj(&id2);
>  				break;
>  			}
> -			got_object_parsed_tree_entries_free(entries2);
> +			free(*entries2);
>  			*nentries2 = 0;
>  			free(*buf2);
>  			*buf2 = NULL;
> @@ -602,7 +602,7 @@ commit_traversal_request(struct imsg *imsg, struct ims
>  	struct got_imsg_packed_object iobj;
>  	struct got_object_qid *pid;
>  	struct got_commit_object *commit = NULL, *pcommit = NULL;
> -	struct got_pathlist_head entries, pentries;
> +	struct got_parsed_tree_entry *entries = NULL, *pentries = NULL;
>  	int nentries = 0, pnentries = 0;
>  	struct got_object_id id;
>  	size_t datalen, path_len;
> @@ -611,9 +611,6 @@ commit_traversal_request(struct imsg *imsg, struct ims
>  	int changed = 0, ncommits = 0, nallocated = 0;
>  	struct got_object_id *commit_ids = NULL;
>  
> -	TAILQ_INIT(&entries);
> -	TAILQ_INIT(&pentries);
> -
>  	datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
>  	if (datalen < sizeof(iobj))
>  		return got_error(GOT_ERR_PRIVSEP_LEN);
> @@ -731,10 +728,12 @@ commit_traversal_request(struct imsg *imsg, struct ims
>  			    &entries, &nentries, &pentries, &pnentries, path,
>  			    pack, packidx, ibuf, objcache);
>  
> -			got_object_parsed_tree_entries_free(&entries);
> +			free(entries);
> +			entries = NULL;
>  			nentries = 0;
>  			free(buf);
> -			got_object_parsed_tree_entries_free(&pentries);
> +			free(pentries);
> +			pentries = NULL;
>  			pnentries = 0;
>  			free(pbuf);
>  			if (err) {
> @@ -771,10 +770,8 @@ done:
>  		got_object_commit_close(commit);
>  	if (pcommit)
>  		got_object_commit_close(pcommit);
> -	if (nentries != 0)
> -		got_object_parsed_tree_entries_free(&entries);
> -	if (pnentries != 0)
> -		got_object_parsed_tree_entries_free(&pentries);
> +	free(entries);
> +	free(pentries);
>  	if (err) {
>  		if (err->code == GOT_ERR_PRIVSEP_PIPE)
>  			err = NULL;
> blob - 95e34c068c96ce1a4a00a1ea034531e243919dfd
> blob + 225f928a01e58925e0d8bcb3e1ffb1a2cc09d631
> --- libexec/got-read-tree/got-read-tree.c
> +++ libexec/got-read-tree/got-read-tree.c
> @@ -50,7 +50,7 @@ catch_sigint(int signo)
>  }
>  
>  static const struct got_error *
> -read_tree_object(struct got_pathlist_head *entries, int *nentries,
> +read_tree_object(struct got_parsed_tree_entry **entries, int *nentries,
>      uint8_t **p, FILE *f, struct got_object_id *expected_id)
>  {
>  	const struct got_error *err = NULL;
> @@ -119,13 +119,11 @@ main(int argc, char *argv[])
>  	for (;;) {
>  		struct imsg imsg;
>  		FILE *f = NULL;
> -		struct got_pathlist_head entries;
> +		struct got_parsed_tree_entry *entries = NULL;
>  		int nentries = 0;
>  		uint8_t *buf = NULL;
>  		struct got_object_id expected_id;
>  
> -		TAILQ_INIT(&entries);
> -
>  		if (sigint_received) {
>  			err = got_error(GOT_ERR_CANCELLED);
>  			break;
> @@ -170,9 +168,9 @@ main(int argc, char *argv[])
>  		if (err)
>  			goto done;
>  
> -		err = got_privsep_send_tree(&ibuf, &entries, nentries);
> +		err = got_privsep_send_tree(&ibuf, entries, nentries);
>  done:
> -		got_object_parsed_tree_entries_free(&entries);
> +		free(entries);
>  		free(buf);
>  		if (f) {
>  			if (fclose(f) == EOF && err == NULL)