From: Stefan Sperling Subject: Re: speed up tree-entry parsing To: gameoftrees@openbsd.org Date: Wed, 18 May 2022 13:13:10 +0200 On Wed, May 18, 2022 at 09:58:28AM +0200, Stefan Sperling wrote: > Slightly teaked diff: It is better to grow the array when we are about > to add an entry which would overflow the array, rather than growing the > array after adding a new entry. Avoids growing the array unnecessarily > in case the number of entries equals the number pre-allocated slots. And this new version avoids use of a pathlist entirely. The previous version still used a pathlist to check for duplicate tree entries. Instead, we can do this check after mergesort() by iterating over the array. This avoids some malloc/free and is slightly faster. I have another diff which builds on top of this coming soon. diff refs/heads/main refs/heads/parse-tree-array blob - adbb8bb3e8521199cb1e1ee0ae5ad5dce3f3503d blob + 79b6e83d9872c4ee03472131af53beb3186ab6be --- lib/got_lib_object_parse.h +++ lib/got_lib_object_parse.h @@ -23,18 +23,14 @@ struct got_tree_entry *got_alloc_tree_entry_partial(vo const struct got_error *got_object_parse_commit(struct got_commit_object **, char *, size_t); -/* - * In a pathlist returned by got_object_parse_tree(), a pointer to the entry's - * name is stored in the pathlist element's path, while the pathlist element's - * data pointer points to a struct got_parsed_tree_entry. - */ struct got_parsed_tree_entry { + const char *name; /* Points to name in parsed buffer */ + size_t namelen; /* strlen(name) */ mode_t mode; /* Mode parsed from tree buffer. */ uint8_t *id; /* Points to ID in parsed tree buffer. */ }; -const struct got_error *got_object_parse_tree(struct got_pathlist_head *, int *, - uint8_t *, size_t); -void got_object_parsed_tree_entries_free(struct got_pathlist_head *); +const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **, + int *, uint8_t *, size_t); const struct got_error *got_object_parse_tag(struct got_tag_object **, uint8_t *, size_t); blob - b310a5120a14c05adbe049177d4af040d61f1ab9 blob + c6937820f54e6d66c1cbf81056a6b916516c9c5d --- lib/got_lib_privsep.h +++ lib/got_lib_privsep.h @@ -662,8 +662,9 @@ const struct got_error *got_privsep_recv_commit(struct struct imsgbuf *); const struct got_error *got_privsep_recv_tree(struct got_tree_object **, struct imsgbuf *); +struct got_parsed_tree_entry; const struct got_error *got_privsep_send_tree(struct imsgbuf *, - struct got_pathlist_head *, int); + struct got_parsed_tree_entry *, int); const struct got_error *got_privsep_send_blob(struct imsgbuf *, size_t, size_t, const uint8_t *); const struct got_error *got_privsep_recv_blob(uint8_t **, size_t *, size_t *, blob - 511053239b0a892dbb0de04e93ccd9ed8e69e7b8 blob + 15e34b7d7b360b66d71d7201ab317ddee510e1c9 --- lib/object_parse.c +++ lib/object_parse.c @@ -669,89 +669,90 @@ got_object_tree_close(struct got_tree_object *tree) } static const struct got_error * -parse_tree_entry(struct got_parsed_tree_entry **pte, const char **name, - size_t *elen, char *buf, +parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf, size_t maxlen) { char *p, *space; - const struct got_error *err = NULL; - *name = NULL; *elen = 0; - *pte = malloc(sizeof(**pte)); - if (*pte == NULL) - return got_error_from_errno("malloc"); - *elen = strnlen(buf, maxlen) + 1; - if (*elen > maxlen) { - free(*pte); - *pte = NULL; + if (*elen > maxlen) return got_error(GOT_ERR_BAD_OBJ_DATA); - } space = memchr(buf, ' ', *elen); - if (space == NULL || space <= buf) { - err = got_error(GOT_ERR_BAD_OBJ_DATA); - free(*pte); - *pte = NULL; - return err; - } - (*pte)->mode = 0; + if (space == NULL || space <= buf) + return got_error(GOT_ERR_BAD_OBJ_DATA); + + pte->mode = 0; p = buf; while (p < space) { - if (*p < '0' && *p > '7') { - err = got_error(GOT_ERR_BAD_OBJ_DATA); - goto done; - } - (*pte)->mode <<= 3; - (*pte)->mode |= *p - '0'; + if (*p < '0' && *p > '7') + return got_error(GOT_ERR_BAD_OBJ_DATA); + pte->mode <<= 3; + pte->mode |= *p - '0'; p++; } - if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) { - err = got_error(GOT_ERR_BAD_OBJ_DATA); - goto done; - } - *name = space + 1; + if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) + return got_error(GOT_ERR_BAD_OBJ_DATA); + + pte->name = space + 1; + pte->namelen = strlen(pte->name); buf += *elen; - (*pte)->id = buf; + pte->id = buf; *elen += SHA1_DIGEST_LENGTH; -done: - if (err) { - free(*pte); - *pte = NULL; - } - return err; + return NULL; } +static int +pte_cmp(const void *pa, const void *pb) +{ + const struct got_parsed_tree_entry *a, *b; + + a = (const struct got_parsed_tree_entry *)pa; + b = (const struct got_parsed_tree_entry *)pb; + + return got_path_cmp(a->name, b->name, a->namelen, b->namelen); +} + const struct got_error * -got_object_parse_tree(struct got_pathlist_head *entries, int *nentries, +got_object_parse_tree(struct got_parsed_tree_entry **entries, int *nentries, uint8_t *buf, size_t len) { const struct got_error *err = NULL; - size_t remain = len; + size_t remain = len, totalloc; + const size_t nalloc = 16; + struct got_parsed_tree_entry *pte; + int i; *nentries = 0; if (remain == 0) return NULL; /* tree is empty */ + *entries = calloc(nalloc, sizeof(**entries)); + if (*entries == NULL) + return got_error_from_errno("calloc"); + totalloc = nalloc; + while (remain > 0) { - struct got_parsed_tree_entry *pte; - struct got_pathlist_entry *new = NULL; - const char *name; size_t elen; - err = parse_tree_entry(&pte, &name, &elen, buf, remain); - if (err) - goto done; - err = got_pathlist_insert(&new, entries, name, pte); - if (err) - goto done; - if (new == NULL) { - err = got_error(GOT_ERR_TREE_DUP_ENTRY); - goto done; + if (*nentries >= totalloc) { + pte = recallocarray(*entries, totalloc, + totalloc + nalloc, sizeof(**entries)); + if (pte == NULL) { + err = got_error_from_errno("recallocarray"); + goto done; + } + *entries = pte; + totalloc += nalloc; } + + pte = &(*entries)[*nentries]; + err = parse_tree_entry(pte, &elen, buf, remain); + if (err) + goto done; buf += elen; remain -= elen; (*nentries)++; @@ -761,27 +762,30 @@ got_object_parse_tree(struct got_pathlist_head *entrie err = got_error(GOT_ERR_BAD_OBJ_DATA); goto done; } + + if (*nentries > 1) { + mergesort(*entries, *nentries, sizeof(**entries), pte_cmp); + + for (i = 0; i < *nentries - 1; i++) { + struct got_parsed_tree_entry *prev = &(*entries)[i]; + pte = &(*entries)[i + 1]; + if (got_path_cmp(prev->name, pte->name, + prev->namelen, pte->namelen) == 0) { + err = got_error(GOT_ERR_TREE_DUP_ENTRY); + break; + } + } + } done: if (err) { - got_object_parsed_tree_entries_free(entries); + free(*entries); + *entries = NULL; *nentries = 0; } return err; } void -got_object_parsed_tree_entries_free(struct got_pathlist_head *entries) -{ - struct got_pathlist_entry *pe; - - TAILQ_FOREACH(pe, entries, entry) { - struct got_parsed_tree_entry *pte = pe->data; - free(pte); - } - got_pathlist_free(entries); -} - -void got_object_tag_close(struct got_tag_object *tag) { if (tag->refcnt > 0) { blob - e7450fd7c2dce123856dba8a632d60b237b54e91 blob + 7b7f894dda9fc261842dfd019c4e9c2f444508a0 --- lib/privsep.c +++ lib/privsep.c @@ -1490,14 +1490,13 @@ got_privsep_recv_commit(struct got_commit_object **com } const struct got_error * -got_privsep_send_tree(struct imsgbuf *ibuf, struct got_pathlist_head *entries, - int nentries) +got_privsep_send_tree(struct imsgbuf *ibuf, + struct got_parsed_tree_entry *entries, int nentries) { const struct got_error *err = NULL; struct got_imsg_tree_object itree; - struct got_pathlist_entry *pe; size_t totlen; - int nimsg; /* number of imsg queued in ibuf */ + int i, nimsg; /* number of imsg queued in ibuf */ itree.nentries = nentries; if (imsg_compose(ibuf, GOT_IMSG_TREE, 0, 0, -1, &itree, sizeof(itree)) @@ -1506,12 +1505,10 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got totlen = sizeof(itree); nimsg = 1; - TAILQ_FOREACH(pe, entries, entry) { - const char *name = pe->path; - struct got_parsed_tree_entry *pte = pe->data; + for (i = 0; i < nentries; i++) { + struct got_parsed_tree_entry *pte = &entries[i]; struct ibuf *wbuf; - size_t namelen = strlen(name); - size_t len = sizeof(struct got_imsg_tree_entry) + namelen; + size_t len = sizeof(struct got_imsg_tree_entry) + pte->namelen; if (len > MAX_IMSGSIZE) return got_error(GOT_ERR_NO_SPACE); @@ -1540,7 +1537,7 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got return err; } - if (imsg_add(wbuf, name, namelen) == -1) { + if (imsg_add(wbuf, pte->name, pte->namelen) == -1) { err = got_error_from_errno("imsg_add TREE_ENTRY"); ibuf_free(wbuf); return err; blob - 20a67c9701816c2474c4d9e546a0c6f4aeb8d0ae blob + b6d3c895f92db84cc93320a4f869170172782abd --- libexec/got-read-pack/got-read-pack.c +++ libexec/got-read-pack/got-read-pack.c @@ -177,7 +177,7 @@ done: } static const struct got_error * -open_tree(uint8_t **buf, struct got_pathlist_head *entries, int *nentries, +open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, int *nentries, struct got_pack *pack, struct got_packidx *packidx, int obj_idx, struct got_object_id *id, struct got_object_cache *objcache) { @@ -220,14 +220,12 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, { const struct got_error *err = NULL; struct got_imsg_packed_object iobj; - struct got_pathlist_head entries; + struct got_parsed_tree_entry *entries = NULL; int nentries = 0; uint8_t *buf = NULL; struct got_object_id id; size_t datalen; - TAILQ_INIT(&entries); - datalen = imsg->hdr.len - IMSG_HEADER_SIZE; if (datalen != sizeof(iobj)) return got_error(GOT_ERR_PRIVSEP_LEN); @@ -239,8 +237,8 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, if (err) return err; - err = got_privsep_send_tree(ibuf, &entries, nentries); - got_object_parsed_tree_entries_free(&entries); + err = got_privsep_send_tree(ibuf, entries, nentries); + free(entries); free(buf); if (err) { if (err->code == GOT_ERR_PRIVSEP_PIPE) @@ -427,28 +425,30 @@ done: } static struct got_parsed_tree_entry * -find_entry_by_name(struct got_pathlist_head *entries, int nentries, +find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries, const char *name, size_t len) { - struct got_pathlist_entry *pe; + struct got_parsed_tree_entry *pte; + int cmp, i; /* Note that tree entries are sorted in strncmp() order. */ - TAILQ_FOREACH(pe, entries, entry) { - int cmp = strncmp(pe->path, name, len); + for (i = 0; i < nentries; i++) { + pte = &entries[i]; + cmp = strncmp(pte->name, name, len); if (cmp < 0) continue; if (cmp > 0) break; - if (pe->path[len] == '\0') - return (struct got_parsed_tree_entry *)pe->data; + if (pte->name[len] == '\0') + return pte; } return NULL; } static const struct got_error * tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2, - struct got_pathlist_head *entries1, int *nentries1, - struct got_pathlist_head *entries2, int *nentries2, + struct got_parsed_tree_entry **entries1, int *nentries1, + struct got_parsed_tree_entry **entries2, int *nentries2, const char *path, struct got_pack *pack, struct got_packidx *packidx, struct imsgbuf *ibuf, struct got_object_cache *objcache) { @@ -476,13 +476,13 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ continue; } - pte1 = find_entry_by_name(entries1, *nentries1, seg, seglen); + pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen); if (pte1 == NULL) { err = got_error(GOT_ERR_NO_OBJ); break; } - pte2 = find_entry_by_name(entries2, *nentries2, seg, seglen); + pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen); if (pte2 == NULL) { *changed = 1; break; @@ -516,7 +516,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ err = got_error_no_obj(&id1); break; } - got_object_parsed_tree_entries_free(entries1); + free(*entries1); *nentries1 = 0; free(*buf1); *buf1 = NULL; @@ -532,7 +532,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ err = got_error_no_obj(&id2); break; } - got_object_parsed_tree_entries_free(entries2); + free(*entries2); *nentries2 = 0; free(*buf2); *buf2 = NULL; @@ -602,7 +602,7 @@ commit_traversal_request(struct imsg *imsg, struct ims struct got_imsg_packed_object iobj; struct got_object_qid *pid; struct got_commit_object *commit = NULL, *pcommit = NULL; - struct got_pathlist_head entries, pentries; + struct got_parsed_tree_entry *entries = NULL, *pentries = NULL; int nentries = 0, pnentries = 0; struct got_object_id id; size_t datalen, path_len; @@ -611,9 +611,6 @@ commit_traversal_request(struct imsg *imsg, struct ims int changed = 0, ncommits = 0, nallocated = 0; struct got_object_id *commit_ids = NULL; - TAILQ_INIT(&entries); - TAILQ_INIT(&pentries); - datalen = imsg->hdr.len - IMSG_HEADER_SIZE; if (datalen < sizeof(iobj)) return got_error(GOT_ERR_PRIVSEP_LEN); @@ -731,10 +728,12 @@ commit_traversal_request(struct imsg *imsg, struct ims &entries, &nentries, &pentries, &pnentries, path, pack, packidx, ibuf, objcache); - got_object_parsed_tree_entries_free(&entries); + free(entries); + entries = NULL; nentries = 0; free(buf); - got_object_parsed_tree_entries_free(&pentries); + free(pentries); + pentries = NULL; pnentries = 0; free(pbuf); if (err) { @@ -771,10 +770,8 @@ done: got_object_commit_close(commit); if (pcommit) got_object_commit_close(pcommit); - if (nentries != 0) - got_object_parsed_tree_entries_free(&entries); - if (pnentries != 0) - got_object_parsed_tree_entries_free(&pentries); + free(entries); + free(pentries); if (err) { if (err->code == GOT_ERR_PRIVSEP_PIPE) err = NULL; blob - 95e34c068c96ce1a4a00a1ea034531e243919dfd blob + 225f928a01e58925e0d8bcb3e1ffb1a2cc09d631 --- libexec/got-read-tree/got-read-tree.c +++ libexec/got-read-tree/got-read-tree.c @@ -50,7 +50,7 @@ catch_sigint(int signo) } static const struct got_error * -read_tree_object(struct got_pathlist_head *entries, int *nentries, +read_tree_object(struct got_parsed_tree_entry **entries, int *nentries, uint8_t **p, FILE *f, struct got_object_id *expected_id) { const struct got_error *err = NULL; @@ -119,13 +119,11 @@ main(int argc, char *argv[]) for (;;) { struct imsg imsg; FILE *f = NULL; - struct got_pathlist_head entries; + struct got_parsed_tree_entry *entries = NULL; int nentries = 0; uint8_t *buf = NULL; struct got_object_id expected_id; - TAILQ_INIT(&entries); - if (sigint_received) { err = got_error(GOT_ERR_CANCELLED); break; @@ -170,9 +168,9 @@ main(int argc, char *argv[]) if (err) goto done; - err = got_privsep_send_tree(&ibuf, &entries, nentries); + err = got_privsep_send_tree(&ibuf, entries, nentries); done: - got_object_parsed_tree_entries_free(&entries); + free(entries); free(buf); if (f) { if (fclose(f) == EOF && err == NULL)