From: Mark Jamsek Subject: Re: optimize tree_path_changed() To: gameoftrees@openbsd.org Date: Mon, 24 Apr 2023 21:50:00 +1000 On 23-04-23 11:45PM, Stefan Sperling wrote: > This patch avoids having to parse and sort all entries of two trees > in order to figure out whether a particular entry has changed. > > Instead, we now decode entries one-by-one until we find an entry that > matches the path being logged. We also compare entries in the on-disk > sorting order rather than got_path_cmp() order. As long as the order > is stable (which it is supposed to be on disk) this works as avoids > having to sort thousands of entries over and over when traversing a > large repository like freebsd-ports.git. Other parts of Got still rely > on got_path_cmp() order and are not affected by this change. > > ok? ok > ----------------------------------------------- > when finding changed paths iterate tree entries in on-disk order for speed > > diff 91db220264b6d9be6d44223dd76ae8eb9bea3641 c61b19b32b39e2c85a64bca8cc9ddad22ed10219 > commit - 91db220264b6d9be6d44223dd76ae8eb9bea3641 > commit + c61b19b32b39e2c85a64bca8cc9ddad22ed10219 > blob - ff0be54ad39998cf5dffd4bd99a222298cd15e06 > blob + 746e561381b85c24e2b02e2298cff0dbbf3b9912 > --- lib/got_lib_object_parse.h > +++ lib/got_lib_object_parse.h > @@ -32,6 +32,8 @@ const struct got_error *got_object_parse_tree(struct g > mode_t mode; /* Mode parsed from tree buffer. */ > uint8_t *id; /* Points to ID in parsed tree buffer. */ > }; > +const struct got_error *got_object_parse_tree_entry( > + struct got_parsed_tree_entry *, size_t *, char *, size_t); > const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **, > size_t *, size_t *, uint8_t *, size_t); > const struct got_error *got_object_read_tree(struct got_parsed_tree_entry **, > blob - 28536b3cc7243ce070909cda1331a2b7750713e0 > blob + 97321f872de069983725c100f1ac2f900bfbe39e > --- lib/object_parse.c > +++ lib/object_parse.c > @@ -761,9 +761,9 @@ static const struct got_error * > free(tree); > } > > -static const struct got_error * > -parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf, > - size_t maxlen) > +const struct got_error * > +got_object_parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, > + char *buf, size_t maxlen) > { > char *p, *space; > > @@ -835,7 +835,7 @@ got_object_parse_tree(struct got_parsed_tree_entry **e > } > > pte = &(*entries)[*nentries]; > - err = parse_tree_entry(pte, &elen, buf, remain); > + err = got_object_parse_tree_entry(pte, &elen, buf, remain); > if (err) > goto done; > buf += elen; > blob - 1ff3707be9bef771b3d4084997853ada44c722ac > blob + 47432bc05913abd4ec23ec1b1aed9d66279e7d3c > --- libexec/got-read-pack/got-read-pack.c > +++ libexec/got-read-pack/got-read-pack.c > @@ -185,20 +185,21 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry > } > > static const struct got_error * > -open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, size_t *nentries, > - size_t *nentries_alloc, struct got_pack *pack, struct got_packidx *packidx, > - int obj_idx, struct got_object_id *id, struct got_object_cache *objcache) > +open_tree(uint8_t **buf, size_t *len, > + struct got_pack *pack, struct got_packidx *packidx, int obj_idx, > + struct got_object_id *id, struct got_object_cache *objcache) > { > const struct got_error *err = NULL; > struct got_object *obj = NULL; > - size_t len; > + int cached = 0; > > *buf = NULL; > - *nentries = 0; > + *len = 0; > > obj = got_object_cache_get(objcache, id); > if (obj) { > obj->refcnt++; > + cached = 1; > } else { > err = open_object(&obj, pack, packidx, obj_idx, id, > objcache); > @@ -206,14 +207,12 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry > return err; > } > > - err = got_packfile_extract_object_to_mem(buf, &len, obj, pack); > + err = got_packfile_extract_object_to_mem(buf, len, obj, pack); > if (err) > goto done; > > - obj->size = len; > - > - err = got_object_parse_tree(entries, nentries, nentries_alloc, > - *buf, len); > + if (!cached) > + obj->size = *len; > done: > got_object_close(obj); > if (err) { > @@ -232,6 +231,7 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, > const struct got_error *err = NULL; > struct got_imsg_packed_object iobj; > uint8_t *buf = NULL; > + size_t len = 0; > struct got_object_id id; > size_t datalen; > > @@ -241,20 +241,24 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, > memcpy(&iobj, imsg->data, sizeof(iobj)); > memcpy(&id, &iobj.id, sizeof(id)); > > - err = open_tree(&buf, entries, nentries, nentries_alloc, > - pack, packidx, iobj.idx, &id, objcache); > + err = open_tree(&buf, &len, pack, packidx, iobj.idx, &id, objcache); > if (err) > return err; > > + err = got_object_parse_tree(entries, nentries, nentries_alloc, > + buf, len); > + if (err) > + goto done; > + > err = got_privsep_send_tree(ibuf, *entries, *nentries); > - free(buf); > if (err) { > if (err->code == GOT_ERR_PRIVSEP_PIPE) > err = NULL; > else > got_privsep_send_error(ibuf, err); > } > - > +done: > + free(buf); > return err; > } > > @@ -432,41 +436,23 @@ static struct got_parsed_tree_entry * > return err; > } > > -static struct got_parsed_tree_entry * > -find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries, > - const char *name, size_t len) > -{ > - struct got_parsed_tree_entry *pte; > - int cmp, i; > - > - /* Note that tree entries are sorted in strncmp() order. */ > - for (i = 0; i < nentries; i++) { > - pte = &entries[i]; > - cmp = strncmp(pte->name, name, len); > - if (cmp < 0) > - continue; > - if (cmp > 0) > - break; > - if (pte->name[len] == '\0') > - return pte; > - } > - return NULL; > -} > - > static const struct got_error * > -tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2, > - struct got_parsed_tree_entry **entries1, size_t *nentries1, > - size_t *nentries_alloc1, > - struct got_parsed_tree_entry **entries2, size_t *nentries2, > - size_t *nentries_alloc2, > - const char *path, struct got_pack *pack, struct got_packidx *packidx, > +tree_path_changed(int *changed, uint8_t **buf1, size_t *len1, > + uint8_t **buf2, size_t *len2, const char *path, > + struct got_pack *pack, struct got_packidx *packidx, > struct imsgbuf *ibuf, struct got_object_cache *objcache) > { > const struct got_error *err = NULL; > - struct got_parsed_tree_entry *pte1 = NULL, *pte2 = NULL; > + struct got_parsed_tree_entry pte1, pte2; > const char *seg, *s; > size_t seglen; > + size_t remain1 = *len1, remain2 = *len2, elen; > + uint8_t *next_entry1 = *buf1; > + uint8_t *next_entry2 = *buf2; > > + memset(&pte1, 0, sizeof(pte1)); > + memset(&pte2, 0, sizeof(pte2)); > + > *changed = 0; > > /* We not do support comparing the root path. */ > @@ -486,24 +472,67 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ > continue; > } > > - pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen); > - if (pte1 == NULL) { > + /* > + * As an optimization we compare entries in on-disk order > + * rather than in got_path_cmp() order. We only need to > + * find out if any entries differ. Parsing all entries and > + * sorting them slows us down significantly when tree objects > + * have thousands of entries. We can assume that on-disk entry > + * ordering is stable, as per got_object_tree_create() and > + * sort_tree_entries_the_way_git_likes_it(). Other orderings > + * are incompatible with Git and would yield false positives > + * here, too. > + */ > + while (remain1 > 0) { > + err = got_object_parse_tree_entry(&pte1, &elen, > + next_entry1, remain1); > + if (err) > + return err; > + next_entry1 += elen; > + remain1 -= elen; > + if (strncmp(pte1.name, seg, seglen) != 0 || > + pte1.name[seglen] != '\0') { > + memset(&pte1, 0, sizeof(pte1)); > + continue; > + } else > + break; > + } > + if (pte1.name == NULL) { > err = got_error(GOT_ERR_NO_OBJ); > break; > } > > - pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen); > - if (pte2 == NULL) { > + if (remain2 == 0) { > *changed = 1; > break; > } > > - if (pte1->mode != pte2->mode) { > + while (remain2 > 0) { > + err = got_object_parse_tree_entry(&pte2, &elen, > + next_entry2, remain2); > + if (err) > + return err; > + next_entry2 += elen; > + remain2 -= elen; > + if (strncmp(pte2.name, seg, seglen) != 0 || > + pte2.name[seglen] != '\0') { > + memset(&pte2, 0, sizeof(pte2)); > + continue; > + } else > + break; > + } > + > + if (pte2.name == NULL) { > *changed = 1; > break; > } > > - if (memcmp(pte1->id, pte2->id, SHA1_DIGEST_LENGTH) == 0) { > + if (pte1.mode != pte2.mode) { > + *changed = 1; > + break; > + } > + > + if (memcmp(pte1.id, pte2.id, SHA1_DIGEST_LENGTH) == 0) { > *changed = 0; > break; > } > @@ -520,37 +549,37 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ > struct got_object_id id1, id2; > int idx; > > - memcpy(id1.sha1, pte1->id, SHA1_DIGEST_LENGTH); > + memcpy(id1.sha1, pte1.id, SHA1_DIGEST_LENGTH); > idx = got_packidx_get_object_idx(packidx, &id1); > if (idx == -1) { > err = got_error_no_obj(&id1); > break; > } > - *nentries1 = 0; > free(*buf1); > *buf1 = NULL; > - err = open_tree(buf1, entries1, nentries1, > - nentries_alloc1, pack, packidx, idx, &id1, > + err = open_tree(buf1, len1, pack, packidx, idx, &id1, > objcache); > - pte1 = NULL; > + memset(&pte1, 0, sizeof(pte1)); > if (err) > break; > + next_entry1 = *buf1; > + remain1 = *len1; > > - memcpy(id2.sha1, pte2->id, SHA1_DIGEST_LENGTH); > + memcpy(id2.sha1, pte2.id, SHA1_DIGEST_LENGTH); > idx = got_packidx_get_object_idx(packidx, &id2); > if (idx == -1) { > err = got_error_no_obj(&id2); > break; > } > - *nentries2 = 0; > free(*buf2); > *buf2 = NULL; > - err = open_tree(buf2, entries2, nentries2, > - nentries_alloc2, pack, packidx, idx, &id2, > + err = open_tree(buf2, len2, pack, packidx, idx, &id2, > objcache); > - pte2 = NULL; > + memset(&pte2, 0, sizeof(pte2)); > if (err) > break; > + next_entry2 = *buf2; > + remain2 = *len2; > } > } > > @@ -606,9 +635,6 @@ commit_traversal_request(struct imsg *imsg, struct ims > struct got_imsg_commit_traversal_request ctreq; > struct got_object_qid *pid; > struct got_commit_object *commit = NULL, *pcommit = NULL; > - struct got_parsed_tree_entry *entries = NULL, *pentries = NULL; > - size_t nentries = 0, nentries_alloc = 0; > - size_t pnentries = 0, pnentries_alloc = 0; > struct got_object_id id; > size_t datalen; > char *path = NULL; > @@ -707,6 +733,7 @@ commit_traversal_request(struct imsg *imsg, struct ims > } else { > int pidx; > uint8_t *buf = NULL, *pbuf = NULL; > + size_t len = 0, plen = 0; > > idx = got_packidx_get_object_idx(packidx, > commit->tree_id); > @@ -717,27 +744,23 @@ commit_traversal_request(struct imsg *imsg, struct ims > if (pidx == -1) > break; > > - err = open_tree(&buf, &entries, &nentries, > - &nentries_alloc, pack, packidx, idx, > + err = open_tree(&buf, &len, pack, packidx, idx, > commit->tree_id, objcache); > if (err) > goto done; > - err = open_tree(&pbuf, &pentries, &pnentries, > - &pnentries_alloc, pack, packidx, pidx, > + > + err = open_tree(&pbuf, &plen, pack, packidx, pidx, > pcommit->tree_id, objcache); > if (err) { > free(buf); > goto done; > } > > - err = tree_path_changed(&changed, &buf, &pbuf, > - &entries, &nentries, &nentries_alloc, > - &pentries, &pnentries, &pnentries_alloc, > - path, pack, packidx, ibuf, objcache); > + err = tree_path_changed(&changed, &buf, &len, > + &pbuf, &plen, path, pack, packidx, ibuf, > + objcache); > > - nentries = 0; > free(buf); > - pnentries = 0; > free(pbuf); > if (err) { > if (err->code != GOT_ERR_NO_OBJ) > @@ -774,8 +797,6 @@ done: > got_object_commit_close(commit); > if (pcommit) > got_object_commit_close(pcommit); > - free(entries); > - free(pentries); > if (err) { > if (err->code == GOT_ERR_PRIVSEP_PIPE) > err = NULL; > @@ -1185,6 +1206,7 @@ enumerate_tree(int *have_all_entries, struct imsgbuf * > struct got_object_id_queue ids; > struct got_object_qid *qid; > uint8_t *buf = NULL; > + size_t len = 0; > struct got_parsed_tree_entry *entries = NULL; > size_t nentries = 0, nentries_alloc = 0, i; > struct enumerated_tree *tree; > @@ -1225,13 +1247,18 @@ enumerate_tree(int *have_all_entries, struct imsgbuf * > break; > } > > - err = open_tree(&buf, &entries, &nentries, &nentries_alloc, > - pack, packidx, idx, &qid->id, objcache); > + err = open_tree(&buf, &len, pack, packidx, idx, &qid->id, > + objcache); > if (err) { > if (err->code != GOT_ERR_NO_OBJ) > goto done; > } > > + err = got_object_parse_tree(&entries, &nentries, > + &nentries_alloc, buf, len); > + if (err) > + goto done; > + > err = got_object_idset_add(idset, &qid->id, NULL); > if (err) > goto done; > -- Mark Jamsek GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68