Download raw body.
optimize tree_path_changed()
On 23-04-23 11:45PM, Stefan Sperling wrote:
> This patch avoids having to parse and sort all entries of two trees
> in order to figure out whether a particular entry has changed.
>
> Instead, we now decode entries one-by-one until we find an entry that
> matches the path being logged. We also compare entries in the on-disk
> sorting order rather than got_path_cmp() order. As long as the order
> is stable (which it is supposed to be on disk) this works as avoids
> having to sort thousands of entries over and over when traversing a
> large repository like freebsd-ports.git. Other parts of Got still rely
> on got_path_cmp() order and are not affected by this change.
>
> ok?
ok
> -----------------------------------------------
> when finding changed paths iterate tree entries in on-disk order for speed
>
> diff 91db220264b6d9be6d44223dd76ae8eb9bea3641 c61b19b32b39e2c85a64bca8cc9ddad22ed10219
> commit - 91db220264b6d9be6d44223dd76ae8eb9bea3641
> commit + c61b19b32b39e2c85a64bca8cc9ddad22ed10219
> blob - ff0be54ad39998cf5dffd4bd99a222298cd15e06
> blob + 746e561381b85c24e2b02e2298cff0dbbf3b9912
> --- lib/got_lib_object_parse.h
> +++ lib/got_lib_object_parse.h
> @@ -32,6 +32,8 @@ const struct got_error *got_object_parse_tree(struct g
> mode_t mode; /* Mode parsed from tree buffer. */
> uint8_t *id; /* Points to ID in parsed tree buffer. */
> };
> +const struct got_error *got_object_parse_tree_entry(
> + struct got_parsed_tree_entry *, size_t *, char *, size_t);
> const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **,
> size_t *, size_t *, uint8_t *, size_t);
> const struct got_error *got_object_read_tree(struct got_parsed_tree_entry **,
> blob - 28536b3cc7243ce070909cda1331a2b7750713e0
> blob + 97321f872de069983725c100f1ac2f900bfbe39e
> --- lib/object_parse.c
> +++ lib/object_parse.c
> @@ -761,9 +761,9 @@ static const struct got_error *
> free(tree);
> }
>
> -static const struct got_error *
> -parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf,
> - size_t maxlen)
> +const struct got_error *
> +got_object_parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen,
> + char *buf, size_t maxlen)
> {
> char *p, *space;
>
> @@ -835,7 +835,7 @@ got_object_parse_tree(struct got_parsed_tree_entry **e
> }
>
> pte = &(*entries)[*nentries];
> - err = parse_tree_entry(pte, &elen, buf, remain);
> + err = got_object_parse_tree_entry(pte, &elen, buf, remain);
> if (err)
> goto done;
> buf += elen;
> blob - 1ff3707be9bef771b3d4084997853ada44c722ac
> blob + 47432bc05913abd4ec23ec1b1aed9d66279e7d3c
> --- libexec/got-read-pack/got-read-pack.c
> +++ libexec/got-read-pack/got-read-pack.c
> @@ -185,20 +185,21 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry
> }
>
> static const struct got_error *
> -open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, size_t *nentries,
> - size_t *nentries_alloc, struct got_pack *pack, struct got_packidx *packidx,
> - int obj_idx, struct got_object_id *id, struct got_object_cache *objcache)
> +open_tree(uint8_t **buf, size_t *len,
> + struct got_pack *pack, struct got_packidx *packidx, int obj_idx,
> + struct got_object_id *id, struct got_object_cache *objcache)
> {
> const struct got_error *err = NULL;
> struct got_object *obj = NULL;
> - size_t len;
> + int cached = 0;
>
> *buf = NULL;
> - *nentries = 0;
> + *len = 0;
>
> obj = got_object_cache_get(objcache, id);
> if (obj) {
> obj->refcnt++;
> + cached = 1;
> } else {
> err = open_object(&obj, pack, packidx, obj_idx, id,
> objcache);
> @@ -206,14 +207,12 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry
> return err;
> }
>
> - err = got_packfile_extract_object_to_mem(buf, &len, obj, pack);
> + err = got_packfile_extract_object_to_mem(buf, len, obj, pack);
> if (err)
> goto done;
>
> - obj->size = len;
> -
> - err = got_object_parse_tree(entries, nentries, nentries_alloc,
> - *buf, len);
> + if (!cached)
> + obj->size = *len;
> done:
> got_object_close(obj);
> if (err) {
> @@ -232,6 +231,7 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf,
> const struct got_error *err = NULL;
> struct got_imsg_packed_object iobj;
> uint8_t *buf = NULL;
> + size_t len = 0;
> struct got_object_id id;
> size_t datalen;
>
> @@ -241,20 +241,24 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf,
> memcpy(&iobj, imsg->data, sizeof(iobj));
> memcpy(&id, &iobj.id, sizeof(id));
>
> - err = open_tree(&buf, entries, nentries, nentries_alloc,
> - pack, packidx, iobj.idx, &id, objcache);
> + err = open_tree(&buf, &len, pack, packidx, iobj.idx, &id, objcache);
> if (err)
> return err;
>
> + err = got_object_parse_tree(entries, nentries, nentries_alloc,
> + buf, len);
> + if (err)
> + goto done;
> +
> err = got_privsep_send_tree(ibuf, *entries, *nentries);
> - free(buf);
> if (err) {
> if (err->code == GOT_ERR_PRIVSEP_PIPE)
> err = NULL;
> else
> got_privsep_send_error(ibuf, err);
> }
> -
> +done:
> + free(buf);
> return err;
> }
>
> @@ -432,41 +436,23 @@ static struct got_parsed_tree_entry *
> return err;
> }
>
> -static struct got_parsed_tree_entry *
> -find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries,
> - const char *name, size_t len)
> -{
> - struct got_parsed_tree_entry *pte;
> - int cmp, i;
> -
> - /* Note that tree entries are sorted in strncmp() order. */
> - for (i = 0; i < nentries; i++) {
> - pte = &entries[i];
> - cmp = strncmp(pte->name, name, len);
> - if (cmp < 0)
> - continue;
> - if (cmp > 0)
> - break;
> - if (pte->name[len] == '\0')
> - return pte;
> - }
> - return NULL;
> -}
> -
> static const struct got_error *
> -tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2,
> - struct got_parsed_tree_entry **entries1, size_t *nentries1,
> - size_t *nentries_alloc1,
> - struct got_parsed_tree_entry **entries2, size_t *nentries2,
> - size_t *nentries_alloc2,
> - const char *path, struct got_pack *pack, struct got_packidx *packidx,
> +tree_path_changed(int *changed, uint8_t **buf1, size_t *len1,
> + uint8_t **buf2, size_t *len2, const char *path,
> + struct got_pack *pack, struct got_packidx *packidx,
> struct imsgbuf *ibuf, struct got_object_cache *objcache)
> {
> const struct got_error *err = NULL;
> - struct got_parsed_tree_entry *pte1 = NULL, *pte2 = NULL;
> + struct got_parsed_tree_entry pte1, pte2;
> const char *seg, *s;
> size_t seglen;
> + size_t remain1 = *len1, remain2 = *len2, elen;
> + uint8_t *next_entry1 = *buf1;
> + uint8_t *next_entry2 = *buf2;
>
> + memset(&pte1, 0, sizeof(pte1));
> + memset(&pte2, 0, sizeof(pte2));
> +
> *changed = 0;
>
> /* We not do support comparing the root path. */
> @@ -486,24 +472,67 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
> continue;
> }
>
> - pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen);
> - if (pte1 == NULL) {
> + /*
> + * As an optimization we compare entries in on-disk order
> + * rather than in got_path_cmp() order. We only need to
> + * find out if any entries differ. Parsing all entries and
> + * sorting them slows us down significantly when tree objects
> + * have thousands of entries. We can assume that on-disk entry
> + * ordering is stable, as per got_object_tree_create() and
> + * sort_tree_entries_the_way_git_likes_it(). Other orderings
> + * are incompatible with Git and would yield false positives
> + * here, too.
> + */
> + while (remain1 > 0) {
> + err = got_object_parse_tree_entry(&pte1, &elen,
> + next_entry1, remain1);
> + if (err)
> + return err;
> + next_entry1 += elen;
> + remain1 -= elen;
> + if (strncmp(pte1.name, seg, seglen) != 0 ||
> + pte1.name[seglen] != '\0') {
> + memset(&pte1, 0, sizeof(pte1));
> + continue;
> + } else
> + break;
> + }
> + if (pte1.name == NULL) {
> err = got_error(GOT_ERR_NO_OBJ);
> break;
> }
>
> - pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen);
> - if (pte2 == NULL) {
> + if (remain2 == 0) {
> *changed = 1;
> break;
> }
>
> - if (pte1->mode != pte2->mode) {
> + while (remain2 > 0) {
> + err = got_object_parse_tree_entry(&pte2, &elen,
> + next_entry2, remain2);
> + if (err)
> + return err;
> + next_entry2 += elen;
> + remain2 -= elen;
> + if (strncmp(pte2.name, seg, seglen) != 0 ||
> + pte2.name[seglen] != '\0') {
> + memset(&pte2, 0, sizeof(pte2));
> + continue;
> + } else
> + break;
> + }
> +
> + if (pte2.name == NULL) {
> *changed = 1;
> break;
> }
>
> - if (memcmp(pte1->id, pte2->id, SHA1_DIGEST_LENGTH) == 0) {
> + if (pte1.mode != pte2.mode) {
> + *changed = 1;
> + break;
> + }
> +
> + if (memcmp(pte1.id, pte2.id, SHA1_DIGEST_LENGTH) == 0) {
> *changed = 0;
> break;
> }
> @@ -520,37 +549,37 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
> struct got_object_id id1, id2;
> int idx;
>
> - memcpy(id1.sha1, pte1->id, SHA1_DIGEST_LENGTH);
> + memcpy(id1.sha1, pte1.id, SHA1_DIGEST_LENGTH);
> idx = got_packidx_get_object_idx(packidx, &id1);
> if (idx == -1) {
> err = got_error_no_obj(&id1);
> break;
> }
> - *nentries1 = 0;
> free(*buf1);
> *buf1 = NULL;
> - err = open_tree(buf1, entries1, nentries1,
> - nentries_alloc1, pack, packidx, idx, &id1,
> + err = open_tree(buf1, len1, pack, packidx, idx, &id1,
> objcache);
> - pte1 = NULL;
> + memset(&pte1, 0, sizeof(pte1));
> if (err)
> break;
> + next_entry1 = *buf1;
> + remain1 = *len1;
>
> - memcpy(id2.sha1, pte2->id, SHA1_DIGEST_LENGTH);
> + memcpy(id2.sha1, pte2.id, SHA1_DIGEST_LENGTH);
> idx = got_packidx_get_object_idx(packidx, &id2);
> if (idx == -1) {
> err = got_error_no_obj(&id2);
> break;
> }
> - *nentries2 = 0;
> free(*buf2);
> *buf2 = NULL;
> - err = open_tree(buf2, entries2, nentries2,
> - nentries_alloc2, pack, packidx, idx, &id2,
> + err = open_tree(buf2, len2, pack, packidx, idx, &id2,
> objcache);
> - pte2 = NULL;
> + memset(&pte2, 0, sizeof(pte2));
> if (err)
> break;
> + next_entry2 = *buf2;
> + remain2 = *len2;
> }
> }
>
> @@ -606,9 +635,6 @@ commit_traversal_request(struct imsg *imsg, struct ims
> struct got_imsg_commit_traversal_request ctreq;
> struct got_object_qid *pid;
> struct got_commit_object *commit = NULL, *pcommit = NULL;
> - struct got_parsed_tree_entry *entries = NULL, *pentries = NULL;
> - size_t nentries = 0, nentries_alloc = 0;
> - size_t pnentries = 0, pnentries_alloc = 0;
> struct got_object_id id;
> size_t datalen;
> char *path = NULL;
> @@ -707,6 +733,7 @@ commit_traversal_request(struct imsg *imsg, struct ims
> } else {
> int pidx;
> uint8_t *buf = NULL, *pbuf = NULL;
> + size_t len = 0, plen = 0;
>
> idx = got_packidx_get_object_idx(packidx,
> commit->tree_id);
> @@ -717,27 +744,23 @@ commit_traversal_request(struct imsg *imsg, struct ims
> if (pidx == -1)
> break;
>
> - err = open_tree(&buf, &entries, &nentries,
> - &nentries_alloc, pack, packidx, idx,
> + err = open_tree(&buf, &len, pack, packidx, idx,
> commit->tree_id, objcache);
> if (err)
> goto done;
> - err = open_tree(&pbuf, &pentries, &pnentries,
> - &pnentries_alloc, pack, packidx, pidx,
> +
> + err = open_tree(&pbuf, &plen, pack, packidx, pidx,
> pcommit->tree_id, objcache);
> if (err) {
> free(buf);
> goto done;
> }
>
> - err = tree_path_changed(&changed, &buf, &pbuf,
> - &entries, &nentries, &nentries_alloc,
> - &pentries, &pnentries, &pnentries_alloc,
> - path, pack, packidx, ibuf, objcache);
> + err = tree_path_changed(&changed, &buf, &len,
> + &pbuf, &plen, path, pack, packidx, ibuf,
> + objcache);
>
> - nentries = 0;
> free(buf);
> - pnentries = 0;
> free(pbuf);
> if (err) {
> if (err->code != GOT_ERR_NO_OBJ)
> @@ -774,8 +797,6 @@ done:
> got_object_commit_close(commit);
> if (pcommit)
> got_object_commit_close(pcommit);
> - free(entries);
> - free(pentries);
> if (err) {
> if (err->code == GOT_ERR_PRIVSEP_PIPE)
> err = NULL;
> @@ -1185,6 +1206,7 @@ enumerate_tree(int *have_all_entries, struct imsgbuf *
> struct got_object_id_queue ids;
> struct got_object_qid *qid;
> uint8_t *buf = NULL;
> + size_t len = 0;
> struct got_parsed_tree_entry *entries = NULL;
> size_t nentries = 0, nentries_alloc = 0, i;
> struct enumerated_tree *tree;
> @@ -1225,13 +1247,18 @@ enumerate_tree(int *have_all_entries, struct imsgbuf *
> break;
> }
>
> - err = open_tree(&buf, &entries, &nentries, &nentries_alloc,
> - pack, packidx, idx, &qid->id, objcache);
> + err = open_tree(&buf, &len, pack, packidx, idx, &qid->id,
> + objcache);
> if (err) {
> if (err->code != GOT_ERR_NO_OBJ)
> goto done;
> }
>
> + err = got_object_parse_tree(&entries, &nentries,
> + &nentries_alloc, buf, len);
> + if (err)
> + goto done;
> +
> err = got_object_idset_add(idset, &qid->id, NULL);
> if (err)
> goto done;
>
--
Mark Jamsek <fnc.bsdbox.org|got.bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68
optimize tree_path_changed()