"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Mark Jamsek <mark@jamsek.com>
Subject:
Re: optimize tree_path_changed()
To:
gameoftrees@openbsd.org
Date:
Mon, 24 Apr 2023 21:50:00 +1000

Download raw body.

Thread
On 23-04-23 11:45PM, Stefan Sperling wrote:
> This patch avoids having to parse and sort all entries of two trees
> in order to figure out whether a particular entry has changed.
> 
> Instead, we now decode entries one-by-one until we find an entry that
> matches the path being logged. We also compare entries in the on-disk
> sorting order rather than got_path_cmp() order. As long as the order
> is stable (which it is supposed to be on disk) this works as avoids
> having to sort thousands of entries over and over when traversing a
> large repository like freebsd-ports.git. Other parts of Got still rely
> on got_path_cmp() order and are not affected by this change.
> 
> ok?

ok

> -----------------------------------------------
>  when finding changed paths iterate tree entries in on-disk order for speed
>  
> diff 91db220264b6d9be6d44223dd76ae8eb9bea3641 c61b19b32b39e2c85a64bca8cc9ddad22ed10219
> commit - 91db220264b6d9be6d44223dd76ae8eb9bea3641
> commit + c61b19b32b39e2c85a64bca8cc9ddad22ed10219
> blob - ff0be54ad39998cf5dffd4bd99a222298cd15e06
> blob + 746e561381b85c24e2b02e2298cff0dbbf3b9912
> --- lib/got_lib_object_parse.h
> +++ lib/got_lib_object_parse.h
> @@ -32,6 +32,8 @@ const struct got_error *got_object_parse_tree(struct g
>  	mode_t mode; /* Mode parsed from tree buffer. */
>  	uint8_t *id; /* Points to ID in parsed tree buffer. */
>  };
> +const struct got_error *got_object_parse_tree_entry(
> +    struct got_parsed_tree_entry *, size_t *, char *, size_t);
>  const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **,
>      size_t *, size_t *, uint8_t *, size_t);
>  const struct got_error *got_object_read_tree(struct got_parsed_tree_entry **,
> blob - 28536b3cc7243ce070909cda1331a2b7750713e0
> blob + 97321f872de069983725c100f1ac2f900bfbe39e
> --- lib/object_parse.c
> +++ lib/object_parse.c
> @@ -761,9 +761,9 @@ static const struct got_error *
>  	free(tree);
>  }
>  
> -static const struct got_error *
> -parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf,
> -    size_t maxlen)
> +const struct got_error *
> +got_object_parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen,
> +    char *buf, size_t maxlen)
>  {
>  	char *p, *space;
>  
> @@ -835,7 +835,7 @@ got_object_parse_tree(struct got_parsed_tree_entry **e
>  		}
>  
>  		pte = &(*entries)[*nentries];
> -		err = parse_tree_entry(pte, &elen, buf, remain);
> +		err = got_object_parse_tree_entry(pte, &elen, buf, remain);
>  		if (err)
>  			goto done;
>  		buf += elen;
> blob - 1ff3707be9bef771b3d4084997853ada44c722ac
> blob + 47432bc05913abd4ec23ec1b1aed9d66279e7d3c
> --- libexec/got-read-pack/got-read-pack.c
> +++ libexec/got-read-pack/got-read-pack.c
> @@ -185,20 +185,21 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry 
>  }
>  
>  static const struct got_error *
> -open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, size_t *nentries,
> -    size_t *nentries_alloc, struct got_pack *pack, struct got_packidx *packidx,
> -    int obj_idx, struct got_object_id *id, struct got_object_cache *objcache)
> +open_tree(uint8_t **buf, size_t *len,
> +    struct got_pack *pack, struct got_packidx *packidx, int obj_idx,
> +    struct got_object_id *id, struct got_object_cache *objcache)
>  {
>  	const struct got_error *err = NULL;
>  	struct got_object *obj = NULL;
> -	size_t len;
> +	int cached = 0;
>  
>  	*buf = NULL;
> -	*nentries = 0;
> +	*len = 0;
>  
>  	obj = got_object_cache_get(objcache, id);
>  	if (obj) {
>  		obj->refcnt++;
> +		cached = 1;
>  	} else {
>  		err = open_object(&obj, pack, packidx, obj_idx, id,
>  		    objcache);
> @@ -206,14 +207,12 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry 
>  			return err;
>  	}
>  
> -	err = got_packfile_extract_object_to_mem(buf, &len, obj, pack);
> +	err = got_packfile_extract_object_to_mem(buf, len, obj, pack);
>  	if (err)
>  		goto done;
>  
> -	obj->size = len;
> -
> -	err = got_object_parse_tree(entries, nentries, nentries_alloc,
> -	    *buf, len);
> +	if (!cached)
> +		obj->size = *len;
>  done:
>  	got_object_close(obj);
>  	if (err) {
> @@ -232,6 +231,7 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
>  	const struct got_error *err = NULL;
>  	struct got_imsg_packed_object iobj;
>  	uint8_t *buf = NULL;
> +	size_t len = 0;
>  	struct got_object_id id;
>  	size_t datalen;
>  
> @@ -241,20 +241,24 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, 
>  	memcpy(&iobj, imsg->data, sizeof(iobj));
>  	memcpy(&id, &iobj.id, sizeof(id));
>  
> -	err = open_tree(&buf, entries, nentries, nentries_alloc,
> -	    pack, packidx, iobj.idx, &id, objcache);
> +	err = open_tree(&buf, &len, pack, packidx, iobj.idx, &id, objcache);
>  	if (err)
>  		return err;
>  
> +	err = got_object_parse_tree(entries, nentries, nentries_alloc,
> +	    buf, len);
> +	if (err)
> +		goto done;
> +
>  	err = got_privsep_send_tree(ibuf, *entries, *nentries);
> -	free(buf);
>  	if (err) {
>  		if (err->code == GOT_ERR_PRIVSEP_PIPE)
>  			err = NULL;
>  		else
>  			got_privsep_send_error(ibuf, err);
>  	}
> -
> +done:
> +	free(buf);
>  	return err;
>  }
>  
> @@ -432,41 +436,23 @@ static struct got_parsed_tree_entry *
>  	return err;
>  }
>  
> -static struct got_parsed_tree_entry *
> -find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries,
> -    const char *name, size_t len)
> -{
> -	struct got_parsed_tree_entry *pte;
> -	int cmp, i;
> -
> -	/* Note that tree entries are sorted in strncmp() order. */
> -	for (i = 0; i < nentries; i++) {
> -		pte = &entries[i];
> -		cmp = strncmp(pte->name, name, len);
> -		if (cmp < 0)
> -			continue;
> -		if (cmp > 0)
> -			break;
> -		if (pte->name[len] == '\0')
> -			return pte;
> -	}
> -	return NULL;
> -}
> -
>  static const struct got_error *
> -tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2,
> -    struct got_parsed_tree_entry **entries1, size_t *nentries1,
> -    size_t *nentries_alloc1,
> -    struct got_parsed_tree_entry **entries2, size_t *nentries2,
> -    size_t *nentries_alloc2,
> -    const char *path, struct got_pack *pack, struct got_packidx *packidx,
> +tree_path_changed(int *changed, uint8_t **buf1, size_t *len1,
> +    uint8_t **buf2, size_t *len2, const char *path,
> +    struct got_pack *pack, struct got_packidx *packidx,
>      struct imsgbuf *ibuf, struct got_object_cache *objcache)
>  {
>  	const struct got_error *err = NULL;
> -	struct got_parsed_tree_entry *pte1 = NULL, *pte2 = NULL;
> +	struct got_parsed_tree_entry pte1, pte2;
>  	const char *seg, *s;
>  	size_t seglen;
> +	size_t remain1 = *len1, remain2 = *len2, elen;
> +	uint8_t *next_entry1 = *buf1;
> +	uint8_t *next_entry2 = *buf2;
>  
> +	memset(&pte1, 0, sizeof(pte1));
> +	memset(&pte2, 0, sizeof(pte2));
> +
>  	*changed = 0;
>  
>  	/* We not do support comparing the root path. */
> @@ -486,24 +472,67 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
>  				continue;
>  		}
>  
> -		pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen);
> -		if (pte1 == NULL) {
> +		/*
> +		 * As an optimization we compare entries in on-disk order
> +		 * rather than in got_path_cmp() order. We only need to
> +		 * find out if any entries differ. Parsing all entries and
> +		 * sorting them slows us down significantly when tree objects
> +		 * have thousands of entries. We can assume that on-disk entry
> +		 * ordering is stable, as per got_object_tree_create() and
> +		 * sort_tree_entries_the_way_git_likes_it(). Other orderings
> +		 * are incompatible with Git and would yield false positives
> +		 * here, too.
> +		 */
> +		while (remain1 > 0) {
> +			err = got_object_parse_tree_entry(&pte1, &elen,
> +			    next_entry1, remain1);
> +			if (err)
> +				return err;
> +			next_entry1 += elen;
> +			remain1 -= elen;
> +			if (strncmp(pte1.name, seg, seglen) != 0 ||
> +			    pte1.name[seglen] != '\0') {
> +				memset(&pte1, 0, sizeof(pte1));
> +				continue;
> +			} else
> +				break;
> +		}
> +		if (pte1.name == NULL) {
>  			err = got_error(GOT_ERR_NO_OBJ);
>  			break;
>  		}
>  
> -		pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen);
> -		if (pte2 == NULL) {
> +		if (remain2 == 0) {
>  			*changed = 1;
>  			break;
>  		}
>  
> -		if (pte1->mode != pte2->mode) {
> +		while (remain2 > 0) {
> +			err = got_object_parse_tree_entry(&pte2, &elen,
> +			    next_entry2, remain2);
> +			if (err)
> +				return err;
> +			next_entry2 += elen;
> +			remain2 -= elen;
> +			if (strncmp(pte2.name, seg, seglen) != 0 ||
> +			    pte2.name[seglen] != '\0') {
> +				memset(&pte2, 0, sizeof(pte2));
> +				continue;
> +			} else
> +				break;
> +		}
> +
> +		if (pte2.name == NULL) {
>  			*changed = 1;
>  			break;
>  		}
>  
> -		if (memcmp(pte1->id, pte2->id, SHA1_DIGEST_LENGTH) == 0) {
> +		if (pte1.mode != pte2.mode) {
> +			*changed = 1;
> +			break;
> +		}
> +
> +		if (memcmp(pte1.id, pte2.id, SHA1_DIGEST_LENGTH) == 0) {
>  			*changed = 0;
>  			break;
>  		}
> @@ -520,37 +549,37 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
>  			struct got_object_id id1, id2;
>  			int idx;
>  
> -			memcpy(id1.sha1, pte1->id, SHA1_DIGEST_LENGTH);
> +			memcpy(id1.sha1, pte1.id, SHA1_DIGEST_LENGTH);
>  			idx = got_packidx_get_object_idx(packidx, &id1);
>  			if (idx == -1) {
>  				err = got_error_no_obj(&id1);
>  				break;
>  			}
> -			*nentries1 = 0;
>  			free(*buf1);
>  			*buf1 = NULL;
> -			err = open_tree(buf1, entries1, nentries1,
> -			    nentries_alloc1, pack, packidx, idx, &id1,
> +			err = open_tree(buf1, len1, pack, packidx, idx, &id1,
>  			    objcache);
> -			pte1 = NULL;
> +			memset(&pte1, 0, sizeof(pte1));
>  			if (err)
>  				break;
> +			next_entry1 = *buf1;
> +			remain1 = *len1;
>  
> -			memcpy(id2.sha1, pte2->id, SHA1_DIGEST_LENGTH);
> +			memcpy(id2.sha1, pte2.id, SHA1_DIGEST_LENGTH);
>  			idx = got_packidx_get_object_idx(packidx, &id2);
>  			if (idx == -1) {
>  				err = got_error_no_obj(&id2);
>  				break;
>  			}
> -			*nentries2 = 0;
>  			free(*buf2);
>  			*buf2 = NULL;
> -			err = open_tree(buf2, entries2, nentries2,
> -			    nentries_alloc2, pack, packidx, idx, &id2,
> +			err = open_tree(buf2, len2, pack, packidx, idx, &id2,
>  			    objcache);
> -			pte2 = NULL;
> +			memset(&pte2, 0, sizeof(pte2));
>  			if (err)
>  				break;
> +			next_entry2 = *buf2;
> +			remain2 = *len2;
>  		}
>  	}
>  
> @@ -606,9 +635,6 @@ commit_traversal_request(struct imsg *imsg, struct ims
>  	struct got_imsg_commit_traversal_request ctreq;
>  	struct got_object_qid *pid;
>  	struct got_commit_object *commit = NULL, *pcommit = NULL;
> -	struct got_parsed_tree_entry *entries = NULL, *pentries = NULL;
> -	size_t nentries = 0, nentries_alloc = 0;
> -	size_t pnentries = 0, pnentries_alloc = 0;
>  	struct got_object_id id;
>  	size_t datalen;
>  	char *path = NULL;
> @@ -707,6 +733,7 @@ commit_traversal_request(struct imsg *imsg, struct ims
>  		} else {
>  			int pidx;
>  			uint8_t *buf = NULL, *pbuf = NULL;
> +			size_t len = 0, plen = 0;
>  
>  			idx = got_packidx_get_object_idx(packidx,
>  			    commit->tree_id);
> @@ -717,27 +744,23 @@ commit_traversal_request(struct imsg *imsg, struct ims
>  			if (pidx == -1)
>  				break;
>  
> -			err = open_tree(&buf, &entries, &nentries,
> -			    &nentries_alloc, pack, packidx, idx,
> +			err = open_tree(&buf, &len, pack, packidx, idx,
>  			    commit->tree_id, objcache);
>  			if (err)
>  				goto done;
> -			err = open_tree(&pbuf, &pentries, &pnentries,
> -			    &pnentries_alloc, pack, packidx, pidx,
> +
> +			err = open_tree(&pbuf, &plen, pack, packidx, pidx,
>  			    pcommit->tree_id, objcache);
>  			if (err) {
>  				free(buf);
>  				goto done;
>  			}
>  
> -			err = tree_path_changed(&changed, &buf, &pbuf,
> -			    &entries, &nentries, &nentries_alloc,
> -			    &pentries, &pnentries, &pnentries_alloc,
> -			    path, pack, packidx, ibuf, objcache);
> +			err = tree_path_changed(&changed, &buf, &len,
> +			    &pbuf, &plen, path, pack, packidx, ibuf,
> +			    objcache);
>  
> -			nentries = 0;
>  			free(buf);
> -			pnentries = 0;
>  			free(pbuf);
>  			if (err) {
>  				if (err->code != GOT_ERR_NO_OBJ)
> @@ -774,8 +797,6 @@ done:
>  		got_object_commit_close(commit);
>  	if (pcommit)
>  		got_object_commit_close(pcommit);
> -	free(entries);
> -	free(pentries);
>  	if (err) {
>  		if (err->code == GOT_ERR_PRIVSEP_PIPE)
>  			err = NULL;
> @@ -1185,6 +1206,7 @@ enumerate_tree(int *have_all_entries, struct imsgbuf *
>  	struct got_object_id_queue ids;
>  	struct got_object_qid *qid;
>  	uint8_t *buf = NULL;
> +	size_t len = 0;
>  	struct got_parsed_tree_entry *entries = NULL;
>  	size_t nentries = 0, nentries_alloc = 0, i;
>  	struct enumerated_tree *tree;
> @@ -1225,13 +1247,18 @@ enumerate_tree(int *have_all_entries, struct imsgbuf *
>  			break;
>  		}
>  
> -		err = open_tree(&buf, &entries, &nentries, &nentries_alloc,
> -		    pack, packidx, idx, &qid->id, objcache);
> +		err = open_tree(&buf, &len, pack, packidx, idx, &qid->id,
> +		    objcache);
>  		if (err) {
>  			if (err->code != GOT_ERR_NO_OBJ)
>  				goto done;
>  		}
>  
> +		err = got_object_parse_tree(&entries, &nentries,
> +		    &nentries_alloc, buf, len);
> +		if (err)
> +			goto done;
> +
>  		err = got_object_idset_add(idset, &qid->id, NULL);
>  		if (err)
>  			goto done;
> 

-- 
Mark Jamsek <fnc.bsdbox.org|got.bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80  E6E2 2930 DC66 86EE CF68