From: Omar Polo Subject: Re: speed up tree-entry parsing To: Stefan Sperling Cc: gameoftrees@openbsd.org Date: Wed, 18 May 2022 17:12:47 +0200 Stefan Sperling wrote: > On Wed, May 18, 2022 at 09:58:28AM +0200, Stefan Sperling wrote: > > Slightly teaked diff: It is better to grow the array when we are about > > to add an entry which would overflow the array, rather than growing the > > array after adding a new entry. Avoids growing the array unnecessarily > > in case the number of entries equals the number pre-allocated slots. agreed > And this new version avoids use of a pathlist entirely. > The previous version still used a pathlist to check for duplicate > tree entries. > > Instead, we can do this check after mergesort() by iterating over > the array. This avoids some malloc/free and is slightly faster. > > I have another diff which builds on top of this coming soon. I haven't done any real runtime testing but it reads fine and regress is passing. I have two questions below but the diff reads ok to me as-is :) > diff refs/heads/main refs/heads/parse-tree-array > blob - adbb8bb3e8521199cb1e1ee0ae5ad5dce3f3503d > blob + 79b6e83d9872c4ee03472131af53beb3186ab6be > --- lib/got_lib_object_parse.h > +++ lib/got_lib_object_parse.h > @@ -23,18 +23,14 @@ struct got_tree_entry *got_alloc_tree_entry_partial(vo > const struct got_error *got_object_parse_commit(struct got_commit_object **, > char *, size_t); > > -/* > - * In a pathlist returned by got_object_parse_tree(), a pointer to the entry's > - * name is stored in the pathlist element's path, while the pathlist element's > - * data pointer points to a struct got_parsed_tree_entry. > - */ > struct got_parsed_tree_entry { > + const char *name; /* Points to name in parsed buffer */ > + size_t namelen; /* strlen(name) */ > mode_t mode; /* Mode parsed from tree buffer. */ > uint8_t *id; /* Points to ID in parsed tree buffer. */ > }; > -const struct got_error *got_object_parse_tree(struct got_pathlist_head *, int *, > - uint8_t *, size_t); > -void got_object_parsed_tree_entries_free(struct got_pathlist_head *); > +const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **, > + int *, uint8_t *, size_t); > > const struct got_error *got_object_parse_tag(struct got_tag_object **, > uint8_t *, size_t); > blob - b310a5120a14c05adbe049177d4af040d61f1ab9 > blob + c6937820f54e6d66c1cbf81056a6b916516c9c5d > --- lib/got_lib_privsep.h > +++ lib/got_lib_privsep.h > @@ -662,8 +662,9 @@ const struct got_error *got_privsep_recv_commit(struct > struct imsgbuf *); > const struct got_error *got_privsep_recv_tree(struct got_tree_object **, > struct imsgbuf *); > +struct got_parsed_tree_entry; > const struct got_error *got_privsep_send_tree(struct imsgbuf *, > - struct got_pathlist_head *, int); > + struct got_parsed_tree_entry *, int); > const struct got_error *got_privsep_send_blob(struct imsgbuf *, size_t, size_t, > const uint8_t *); > const struct got_error *got_privsep_recv_blob(uint8_t **, size_t *, size_t *, > blob - 511053239b0a892dbb0de04e93ccd9ed8e69e7b8 > blob + 15e34b7d7b360b66d71d7201ab317ddee510e1c9 > --- lib/object_parse.c > +++ lib/object_parse.c > @@ -669,89 +669,90 @@ got_object_tree_close(struct got_tree_object *tree) > } > > static const struct got_error * > -parse_tree_entry(struct got_parsed_tree_entry **pte, const char **name, > - size_t *elen, char *buf, > +parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf, > size_t maxlen) > { > char *p, *space; > - const struct got_error *err = NULL; > > - *name = NULL; > *elen = 0; > > - *pte = malloc(sizeof(**pte)); > - if (*pte == NULL) > - return got_error_from_errno("malloc"); > - > *elen = strnlen(buf, maxlen) + 1; > - if (*elen > maxlen) { > - free(*pte); > - *pte = NULL; > + if (*elen > maxlen) > return got_error(GOT_ERR_BAD_OBJ_DATA); > - } > > space = memchr(buf, ' ', *elen); > - if (space == NULL || space <= buf) { > - err = got_error(GOT_ERR_BAD_OBJ_DATA); > - free(*pte); > - *pte = NULL; > - return err; > - } > - (*pte)->mode = 0; this was the same in the previous code too, but why check for space <= buf? > + if (space == NULL || space <= buf) `space' is either NULL or somewhere between [buf, buf+(*elen)), the <= is intended just to make sure buf doesn't start with a space right? > + return got_error(GOT_ERR_BAD_OBJ_DATA); > + > + pte->mode = 0; > p = buf; > while (p < space) { > - if (*p < '0' && *p > '7') { > - err = got_error(GOT_ERR_BAD_OBJ_DATA); > - goto done; > - } > - (*pte)->mode <<= 3; > - (*pte)->mode |= *p - '0'; > + if (*p < '0' && *p > '7') > + return got_error(GOT_ERR_BAD_OBJ_DATA); > + pte->mode <<= 3; > + pte->mode |= *p - '0'; > p++; > } > > - if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) { > - err = got_error(GOT_ERR_BAD_OBJ_DATA); > - goto done; > - } > - *name = space + 1; > + if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) > + return got_error(GOT_ERR_BAD_OBJ_DATA); > + > + pte->name = space + 1; > + pte->namelen = strlen(pte->name); > buf += *elen; > - (*pte)->id = buf; > + pte->id = buf; > *elen += SHA1_DIGEST_LENGTH; > -done: > - if (err) { > - free(*pte); > - *pte = NULL; > - } > - return err; > + return NULL; > } > > +static int > +pte_cmp(const void *pa, const void *pb) > +{ > + const struct got_parsed_tree_entry *a, *b; > + > + a = (const struct got_parsed_tree_entry *)pa; > + b = (const struct got_parsed_tree_entry *)pb; are these cast needed? It's just a matter of style, I'm just asking to be sure so future patches will be coherent with the rest of the code. I have written some code in got_patch IIRC which uses the implicit cast from void * > + return got_path_cmp(a->name, b->name, a->namelen, b->namelen); > +} > + > const struct got_error * > -got_object_parse_tree(struct got_pathlist_head *entries, int *nentries, > +got_object_parse_tree(struct got_parsed_tree_entry **entries, int *nentries, > uint8_t *buf, size_t len) > { > const struct got_error *err = NULL; > - size_t remain = len; > + size_t remain = len, totalloc; > + const size_t nalloc = 16; > + struct got_parsed_tree_entry *pte; > + int i; > > *nentries = 0; > if (remain == 0) > return NULL; /* tree is empty */ > > + *entries = calloc(nalloc, sizeof(**entries)); > + if (*entries == NULL) > + return got_error_from_errno("calloc"); > + totalloc = nalloc; > + > while (remain > 0) { > - struct got_parsed_tree_entry *pte; > - struct got_pathlist_entry *new = NULL; > - const char *name; > size_t elen; > > - err = parse_tree_entry(&pte, &name, &elen, buf, remain); > - if (err) > - goto done; > - err = got_pathlist_insert(&new, entries, name, pte); > - if (err) > - goto done; > - if (new == NULL) { > - err = got_error(GOT_ERR_TREE_DUP_ENTRY); > - goto done; > + if (*nentries >= totalloc) { > + pte = recallocarray(*entries, totalloc, > + totalloc + nalloc, sizeof(**entries)); > + if (pte == NULL) { > + err = got_error_from_errno("recallocarray"); > + goto done; > + } > + *entries = pte; > + totalloc += nalloc; > } > + > + pte = &(*entries)[*nentries]; > + err = parse_tree_entry(pte, &elen, buf, remain); > + if (err) > + goto done; > buf += elen; > remain -= elen; > (*nentries)++; > @@ -761,27 +762,30 @@ got_object_parse_tree(struct got_pathlist_head *entrie > err = got_error(GOT_ERR_BAD_OBJ_DATA); > goto done; > } > + > + if (*nentries > 1) { > + mergesort(*entries, *nentries, sizeof(**entries), pte_cmp); > + > + for (i = 0; i < *nentries - 1; i++) { > + struct got_parsed_tree_entry *prev = &(*entries)[i]; > + pte = &(*entries)[i + 1]; > + if (got_path_cmp(prev->name, pte->name, > + prev->namelen, pte->namelen) == 0) { > + err = got_error(GOT_ERR_TREE_DUP_ENTRY); > + break; > + } > + } > + } > done: > if (err) { > - got_object_parsed_tree_entries_free(entries); > + free(*entries); > + *entries = NULL; > *nentries = 0; > } > return err; > } > > void > -got_object_parsed_tree_entries_free(struct got_pathlist_head *entries) > -{ > - struct got_pathlist_entry *pe; > - > - TAILQ_FOREACH(pe, entries, entry) { > - struct got_parsed_tree_entry *pte = pe->data; > - free(pte); > - } > - got_pathlist_free(entries); > -} > - > -void > got_object_tag_close(struct got_tag_object *tag) > { > if (tag->refcnt > 0) { > blob - e7450fd7c2dce123856dba8a632d60b237b54e91 > blob + 7b7f894dda9fc261842dfd019c4e9c2f444508a0 > --- lib/privsep.c > +++ lib/privsep.c > @@ -1490,14 +1490,13 @@ got_privsep_recv_commit(struct got_commit_object **com > } > > const struct got_error * > -got_privsep_send_tree(struct imsgbuf *ibuf, struct got_pathlist_head *entries, > - int nentries) > +got_privsep_send_tree(struct imsgbuf *ibuf, > + struct got_parsed_tree_entry *entries, int nentries) > { > const struct got_error *err = NULL; > struct got_imsg_tree_object itree; > - struct got_pathlist_entry *pe; > size_t totlen; > - int nimsg; /* number of imsg queued in ibuf */ > + int i, nimsg; /* number of imsg queued in ibuf */ > > itree.nentries = nentries; > if (imsg_compose(ibuf, GOT_IMSG_TREE, 0, 0, -1, &itree, sizeof(itree)) > @@ -1506,12 +1505,10 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got > > totlen = sizeof(itree); > nimsg = 1; > - TAILQ_FOREACH(pe, entries, entry) { > - const char *name = pe->path; > - struct got_parsed_tree_entry *pte = pe->data; > + for (i = 0; i < nentries; i++) { > + struct got_parsed_tree_entry *pte = &entries[i]; > struct ibuf *wbuf; > - size_t namelen = strlen(name); > - size_t len = sizeof(struct got_imsg_tree_entry) + namelen; > + size_t len = sizeof(struct got_imsg_tree_entry) + pte->namelen; > > if (len > MAX_IMSGSIZE) > return got_error(GOT_ERR_NO_SPACE); > @@ -1540,7 +1537,7 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got > return err; > } > > - if (imsg_add(wbuf, name, namelen) == -1) { > + if (imsg_add(wbuf, pte->name, pte->namelen) == -1) { > err = got_error_from_errno("imsg_add TREE_ENTRY"); > ibuf_free(wbuf); > return err; > blob - 20a67c9701816c2474c4d9e546a0c6f4aeb8d0ae > blob + b6d3c895f92db84cc93320a4f869170172782abd > --- libexec/got-read-pack/got-read-pack.c > +++ libexec/got-read-pack/got-read-pack.c > @@ -177,7 +177,7 @@ done: > } > > static const struct got_error * > -open_tree(uint8_t **buf, struct got_pathlist_head *entries, int *nentries, > +open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, int *nentries, > struct got_pack *pack, struct got_packidx *packidx, int obj_idx, > struct got_object_id *id, struct got_object_cache *objcache) > { > @@ -220,14 +220,12 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, > { > const struct got_error *err = NULL; > struct got_imsg_packed_object iobj; > - struct got_pathlist_head entries; > + struct got_parsed_tree_entry *entries = NULL; > int nentries = 0; > uint8_t *buf = NULL; > struct got_object_id id; > size_t datalen; > > - TAILQ_INIT(&entries); > - > datalen = imsg->hdr.len - IMSG_HEADER_SIZE; > if (datalen != sizeof(iobj)) > return got_error(GOT_ERR_PRIVSEP_LEN); > @@ -239,8 +237,8 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf, > if (err) > return err; > > - err = got_privsep_send_tree(ibuf, &entries, nentries); > - got_object_parsed_tree_entries_free(&entries); > + err = got_privsep_send_tree(ibuf, entries, nentries); > + free(entries); > free(buf); > if (err) { > if (err->code == GOT_ERR_PRIVSEP_PIPE) > @@ -427,28 +425,30 @@ done: > } > > static struct got_parsed_tree_entry * > -find_entry_by_name(struct got_pathlist_head *entries, int nentries, > +find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries, > const char *name, size_t len) > { > - struct got_pathlist_entry *pe; > + struct got_parsed_tree_entry *pte; > + int cmp, i; > > /* Note that tree entries are sorted in strncmp() order. */ > - TAILQ_FOREACH(pe, entries, entry) { > - int cmp = strncmp(pe->path, name, len); > + for (i = 0; i < nentries; i++) { > + pte = &entries[i]; > + cmp = strncmp(pte->name, name, len); > if (cmp < 0) > continue; > if (cmp > 0) > break; > - if (pe->path[len] == '\0') > - return (struct got_parsed_tree_entry *)pe->data; > + if (pte->name[len] == '\0') > + return pte; > } > return NULL; > } > > static const struct got_error * > tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2, > - struct got_pathlist_head *entries1, int *nentries1, > - struct got_pathlist_head *entries2, int *nentries2, > + struct got_parsed_tree_entry **entries1, int *nentries1, > + struct got_parsed_tree_entry **entries2, int *nentries2, > const char *path, struct got_pack *pack, struct got_packidx *packidx, > struct imsgbuf *ibuf, struct got_object_cache *objcache) > { > @@ -476,13 +476,13 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ > continue; > } > > - pte1 = find_entry_by_name(entries1, *nentries1, seg, seglen); > + pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen); > if (pte1 == NULL) { > err = got_error(GOT_ERR_NO_OBJ); > break; > } > > - pte2 = find_entry_by_name(entries2, *nentries2, seg, seglen); > + pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen); > if (pte2 == NULL) { > *changed = 1; > break; > @@ -516,7 +516,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ > err = got_error_no_obj(&id1); > break; > } > - got_object_parsed_tree_entries_free(entries1); > + free(*entries1); > *nentries1 = 0; > free(*buf1); > *buf1 = NULL; > @@ -532,7 +532,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_ > err = got_error_no_obj(&id2); > break; > } > - got_object_parsed_tree_entries_free(entries2); > + free(*entries2); > *nentries2 = 0; > free(*buf2); > *buf2 = NULL; > @@ -602,7 +602,7 @@ commit_traversal_request(struct imsg *imsg, struct ims > struct got_imsg_packed_object iobj; > struct got_object_qid *pid; > struct got_commit_object *commit = NULL, *pcommit = NULL; > - struct got_pathlist_head entries, pentries; > + struct got_parsed_tree_entry *entries = NULL, *pentries = NULL; > int nentries = 0, pnentries = 0; > struct got_object_id id; > size_t datalen, path_len; > @@ -611,9 +611,6 @@ commit_traversal_request(struct imsg *imsg, struct ims > int changed = 0, ncommits = 0, nallocated = 0; > struct got_object_id *commit_ids = NULL; > > - TAILQ_INIT(&entries); > - TAILQ_INIT(&pentries); > - > datalen = imsg->hdr.len - IMSG_HEADER_SIZE; > if (datalen < sizeof(iobj)) > return got_error(GOT_ERR_PRIVSEP_LEN); > @@ -731,10 +728,12 @@ commit_traversal_request(struct imsg *imsg, struct ims > &entries, &nentries, &pentries, &pnentries, path, > pack, packidx, ibuf, objcache); > > - got_object_parsed_tree_entries_free(&entries); > + free(entries); > + entries = NULL; > nentries = 0; > free(buf); > - got_object_parsed_tree_entries_free(&pentries); > + free(pentries); > + pentries = NULL; > pnentries = 0; > free(pbuf); > if (err) { > @@ -771,10 +770,8 @@ done: > got_object_commit_close(commit); > if (pcommit) > got_object_commit_close(pcommit); > - if (nentries != 0) > - got_object_parsed_tree_entries_free(&entries); > - if (pnentries != 0) > - got_object_parsed_tree_entries_free(&pentries); > + free(entries); > + free(pentries); > if (err) { > if (err->code == GOT_ERR_PRIVSEP_PIPE) > err = NULL; > blob - 95e34c068c96ce1a4a00a1ea034531e243919dfd > blob + 225f928a01e58925e0d8bcb3e1ffb1a2cc09d631 > --- libexec/got-read-tree/got-read-tree.c > +++ libexec/got-read-tree/got-read-tree.c > @@ -50,7 +50,7 @@ catch_sigint(int signo) > } > > static const struct got_error * > -read_tree_object(struct got_pathlist_head *entries, int *nentries, > +read_tree_object(struct got_parsed_tree_entry **entries, int *nentries, > uint8_t **p, FILE *f, struct got_object_id *expected_id) > { > const struct got_error *err = NULL; > @@ -119,13 +119,11 @@ main(int argc, char *argv[]) > for (;;) { > struct imsg imsg; > FILE *f = NULL; > - struct got_pathlist_head entries; > + struct got_parsed_tree_entry *entries = NULL; > int nentries = 0; > uint8_t *buf = NULL; > struct got_object_id expected_id; > > - TAILQ_INIT(&entries); > - > if (sigint_received) { > err = got_error(GOT_ERR_CANCELLED); > break; > @@ -170,9 +168,9 @@ main(int argc, char *argv[]) > if (err) > goto done; > > - err = got_privsep_send_tree(&ibuf, &entries, nentries); > + err = got_privsep_send_tree(&ibuf, entries, nentries); > done: > - got_object_parsed_tree_entries_free(&entries); > + free(entries); > free(buf); > if (f) { > if (fclose(f) == EOF && err == NULL)