From: Stefan Sperling Subject: optimize tree_path_changed() To: gameoftrees@openbsd.org Date: Sun, 23 Apr 2023 23:45:45 +0200 This patch avoids having to parse and sort all entries of two trees in order to figure out whether a particular entry has changed. Instead, we now decode entries one-by-one until we find an entry that matches the path being logged. We also compare entries in the on-disk sorting order rather than got_path_cmp() order. As long as the order is stable (which it is supposed to be on disk) this works as avoids having to sort thousands of entries over and over when traversing a large repository like freebsd-ports.git. Other parts of Got still rely on got_path_cmp() order and are not affected by this change. ok? ----------------------------------------------- when finding changed paths iterate tree entries in on-disk order for speed diff 91db220264b6d9be6d44223dd76ae8eb9bea3641 c61b19b32b39e2c85a64bca8cc9ddad22ed10219 commit - 91db220264b6d9be6d44223dd76ae8eb9bea3641 commit + c61b19b32b39e2c85a64bca8cc9ddad22ed10219 blob - ff0be54ad39998cf5dffd4bd99a222298cd15e06 blob + 746e561381b85c24e2b02e2298cff0dbbf3b9912 --- lib/got_lib_object_parse.h +++ lib/got_lib_object_parse.h @@ -32,6 +32,8 @@ const struct got_error *got_object_parse_tree(struct g mode_t mode; /* Mode parsed from tree buffer. */ uint8_t *id; /* Points to ID in parsed tree buffer. */ }; +const struct got_error *got_object_parse_tree_entry( + struct got_parsed_tree_entry *, size_t *, char *, size_t); const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **, size_t *, size_t *, uint8_t *, size_t); const struct got_error *got_object_read_tree(struct got_parsed_tree_entry **, blob - 28536b3cc7243ce070909cda1331a2b7750713e0 blob + 97321f872de069983725c100f1ac2f900bfbe39e --- lib/object_parse.c +++ lib/object_parse.c @@ -761,9 +761,9 @@ static const struct got_error * free(tree); } -static const struct got_error * -parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf, - size_t maxlen) +const struct got_error * +got_object_parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, + char *buf, size_t maxlen) { char *p, *space; @@ -835,7 +835,7 @@ got_object_parse_tree(struct got_parsed_tree_entry **e } pte = &(*entries)[*nentries]; - err = parse_tree_entry(pte, &elen, buf, remain); + err = got_object_parse_tree_entry(pte, &elen, buf, remain); if (err) goto done; buf += elen; blob - 1ff3707be9bef771b3d4084997853ada44c722ac blob + 47432bc05913abd4ec23ec1b1aed9d66279e7d3c --- libexec/got-read-pack/got-read-pack.c +++ libexec/got-read-pack/got-read-pack.c @@ -185,20 +185,21 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry } static const struct got_error * -open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, size_t *nentries, - size_t *nentries_alloc, struct got_pack *pack, struct got_packidx *packidx, - int obj_idx, struct got_object_id *id, struct got_object_cache *objcache) +open_tree(uint8_t **buf, size_t *len, + struct got_pack *pack, struct got_packidx *packidx, int obj_idx, + struct got_object_id *id, struct got_object_cache *objcache) { const struct got_error *err = NULL; struct got_object *obj = NULL; - size_t len; + int cached = 0; *buf = NULL; - *nentries = 0; + *len = 0; obj = got_object_cache_get(objcache, id); if (obj) { obj->refcnt++; + cached = 1; } else { err = open_object(&obj, pack, packidx, obj_idx, id, objcache); @@ -206,14 +207,12 @@ open_tree(uint8_t **buf, struct got_parsed_tree_entry return err; } - err = got_packfile_extract_object_to_mem(buf, &len, obj, pack); + err = got_packfile_extract_object_to_mem(buf, len, obj, pack); if (err) goto done; - obj->size = len; - - err = got_object_parse_tree(entries, nentries, nentries_alloc, - *buf, len); + if (!cached) + obj->size = *len; done: got_object_close(obj); if (err) { @@ -232,6 +231,7 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, const struct got_error *err = NULL; struct got_imsg_packed_object iobj; uint8_t *buf = NULL; + size_t len = 0; struct got_object_id id; size_t datalen; @@ -241,20 +241,24 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, memcpy(&iobj, imsg->data, sizeof(iobj)); memcpy(&id, &iobj.id, sizeof(id)); - err = open_tree(&buf, entries, nentries, nentries_alloc, - pack, packidx, iobj.idx, &id, objcache); + err = open_tree(&buf, &len, pack, packidx, iobj.idx, &id, objcache); if (err) return err; + err = got_object_parse_tree(entries, nentries, nentries_alloc, + buf, len); + if (err) + goto done; + err = got_privsep_send_tree(ibuf, *entries, *nentries); - free(buf); if (err) { if (err->code == GOT_ERR_PRIVSEP_PIPE) err = NULL; else got_privsep_send_error(ibuf, err); } - +done: + free(buf); return err; } @@ -432,41 +436,23 @@ static struct got_parsed_tree_entry * return err; } -static struct got_parsed_tree_entry * -find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries, - const char *name, size_t len) -{ - struct got_parsed_tree_entry *pte; - int cmp, i; - - /* Note that tree entries are sorted in strncmp() order. */ - for (i = 0; i < nentries; i++) { - pte = &entries[i]; - cmp = strncmp(pte->name, name, len); - if (cmp < 0) - continue; - if (cmp > 0) - break; - if (pte->name[len] == '\0') - return pte; - } - return NULL; -} - static const struct got_error * -tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2, - struct got_parsed_tree_entry **entries1, size_t *nentries1, - size_t *nentries_alloc1, - struct got_parsed_tree_entry **entries2, size_t *nentries2, - size_t *nentries_alloc2, - const char *path, struct got_pack *pack, struct got_packidx *packidx, +tree_path_changed(int *changed, uint8_t **buf1, size_t *len1, + uint8_t **buf2, size_t *len2, const char *path, + struct got_pack *pack, struct got_packidx *packidx, struct imsgbuf *ibuf, struct got_object_cache *objcache) { const struct got_error *err = NULL; - struct got_parsed_tree_entry *pte1 = NULL, *pte2 = NULL; + struct got_parsed_tree_entry pte1, pte2; const char *seg, *s; size_t seglen; + size_t remain1 = *len1, remain2 = *len2, elen; + uint8_t *next_entry1 = *buf1; + uint8_t *next_entry2 = *buf2; + memset(&pte1, 0, sizeof(pte1)); + memset(&pte2, 0, sizeof(pte2)); + *changed = 0; /* We not do support comparing the root path. */ @@ -486,24 +472,67 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ continue; } - pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen); - if (pte1 == NULL) { + /* + * As an optimization we compare entries in on-disk order + * rather than in got_path_cmp() order. We only need to + * find out if any entries differ. Parsing all entries and + * sorting them slows us down significantly when tree objects + * have thousands of entries. We can assume that on-disk entry + * ordering is stable, as per got_object_tree_create() and + * sort_tree_entries_the_way_git_likes_it(). Other orderings + * are incompatible with Git and would yield false positives + * here, too. + */ + while (remain1 > 0) { + err = got_object_parse_tree_entry(&pte1, &elen, + next_entry1, remain1); + if (err) + return err; + next_entry1 += elen; + remain1 -= elen; + if (strncmp(pte1.name, seg, seglen) != 0 || + pte1.name[seglen] != '\0') { + memset(&pte1, 0, sizeof(pte1)); + continue; + } else + break; + } + if (pte1.name == NULL) { err = got_error(GOT_ERR_NO_OBJ); break; } - pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen); - if (pte2 == NULL) { + if (remain2 == 0) { *changed = 1; break; } - if (pte1->mode != pte2->mode) { + while (remain2 > 0) { + err = got_object_parse_tree_entry(&pte2, &elen, + next_entry2, remain2); + if (err) + return err; + next_entry2 += elen; + remain2 -= elen; + if (strncmp(pte2.name, seg, seglen) != 0 || + pte2.name[seglen] != '\0') { + memset(&pte2, 0, sizeof(pte2)); + continue; + } else + break; + } + + if (pte2.name == NULL) { *changed = 1; break; } - if (memcmp(pte1->id, pte2->id, SHA1_DIGEST_LENGTH) == 0) { + if (pte1.mode != pte2.mode) { + *changed = 1; + break; + } + + if (memcmp(pte1.id, pte2.id, SHA1_DIGEST_LENGTH) == 0) { *changed = 0; break; } @@ -520,37 +549,37 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ struct got_object_id id1, id2; int idx; - memcpy(id1.sha1, pte1->id, SHA1_DIGEST_LENGTH); + memcpy(id1.sha1, pte1.id, SHA1_DIGEST_LENGTH); idx = got_packidx_get_object_idx(packidx, &id1); if (idx == -1) { err = got_error_no_obj(&id1); break; } - *nentries1 = 0; free(*buf1); *buf1 = NULL; - err = open_tree(buf1, entries1, nentries1, - nentries_alloc1, pack, packidx, idx, &id1, + err = open_tree(buf1, len1, pack, packidx, idx, &id1, objcache); - pte1 = NULL; + memset(&pte1, 0, sizeof(pte1)); if (err) break; + next_entry1 = *buf1; + remain1 = *len1; - memcpy(id2.sha1, pte2->id, SHA1_DIGEST_LENGTH); + memcpy(id2.sha1, pte2.id, SHA1_DIGEST_LENGTH); idx = got_packidx_get_object_idx(packidx, &id2); if (idx == -1) { err = got_error_no_obj(&id2); break; } - *nentries2 = 0; free(*buf2); *buf2 = NULL; - err = open_tree(buf2, entries2, nentries2, - nentries_alloc2, pack, packidx, idx, &id2, + err = open_tree(buf2, len2, pack, packidx, idx, &id2, objcache); - pte2 = NULL; + memset(&pte2, 0, sizeof(pte2)); if (err) break; + next_entry2 = *buf2; + remain2 = *len2; } } @@ -606,9 +635,6 @@ commit_traversal_request(struct imsg *imsg, struct ims struct got_imsg_commit_traversal_request ctreq; struct got_object_qid *pid; struct got_commit_object *commit = NULL, *pcommit = NULL; - struct got_parsed_tree_entry *entries = NULL, *pentries = NULL; - size_t nentries = 0, nentries_alloc = 0; - size_t pnentries = 0, pnentries_alloc = 0; struct got_object_id id; size_t datalen; char *path = NULL; @@ -707,6 +733,7 @@ commit_traversal_request(struct imsg *imsg, struct ims } else { int pidx; uint8_t *buf = NULL, *pbuf = NULL; + size_t len = 0, plen = 0; idx = got_packidx_get_object_idx(packidx, commit->tree_id); @@ -717,27 +744,23 @@ commit_traversal_request(struct imsg *imsg, struct ims if (pidx == -1) break; - err = open_tree(&buf, &entries, &nentries, - &nentries_alloc, pack, packidx, idx, + err = open_tree(&buf, &len, pack, packidx, idx, commit->tree_id, objcache); if (err) goto done; - err = open_tree(&pbuf, &pentries, &pnentries, - &pnentries_alloc, pack, packidx, pidx, + + err = open_tree(&pbuf, &plen, pack, packidx, pidx, pcommit->tree_id, objcache); if (err) { free(buf); goto done; } - err = tree_path_changed(&changed, &buf, &pbuf, - &entries, &nentries, &nentries_alloc, - &pentries, &pnentries, &pnentries_alloc, - path, pack, packidx, ibuf, objcache); + err = tree_path_changed(&changed, &buf, &len, + &pbuf, &plen, path, pack, packidx, ibuf, + objcache); - nentries = 0; free(buf); - pnentries = 0; free(pbuf); if (err) { if (err->code != GOT_ERR_NO_OBJ) @@ -774,8 +797,6 @@ done: got_object_commit_close(commit); if (pcommit) got_object_commit_close(pcommit); - free(entries); - free(pentries); if (err) { if (err->code == GOT_ERR_PRIVSEP_PIPE) err = NULL; @@ -1185,6 +1206,7 @@ enumerate_tree(int *have_all_entries, struct imsgbuf * struct got_object_id_queue ids; struct got_object_qid *qid; uint8_t *buf = NULL; + size_t len = 0; struct got_parsed_tree_entry *entries = NULL; size_t nentries = 0, nentries_alloc = 0, i; struct enumerated_tree *tree; @@ -1225,13 +1247,18 @@ enumerate_tree(int *have_all_entries, struct imsgbuf * break; } - err = open_tree(&buf, &entries, &nentries, &nentries_alloc, - pack, packidx, idx, &qid->id, objcache); + err = open_tree(&buf, &len, pack, packidx, idx, &qid->id, + objcache); if (err) { if (err->code != GOT_ERR_NO_OBJ) goto done; } + err = got_object_parse_tree(&entries, &nentries, + &nentries_alloc, buf, len); + if (err) + goto done; + err = got_object_idset_add(idset, &qid->id, NULL); if (err) goto done;