Download raw body.
speed up tree-entry parsing
Stefan Sperling <stsp@stsp.name> wrote:
> On Wed, May 18, 2022 at 09:58:28AM +0200, Stefan Sperling wrote:
> > Slightly teaked diff: It is better to grow the array when we are about
> > to add an entry which would overflow the array, rather than growing the
> > array after adding a new entry. Avoids growing the array unnecessarily
> > in case the number of entries equals the number pre-allocated slots.
agreed
> And this new version avoids use of a pathlist entirely.
> The previous version still used a pathlist to check for duplicate
> tree entries.
>
> Instead, we can do this check after mergesort() by iterating over
> the array. This avoids some malloc/free and is slightly faster.
>
> I have another diff which builds on top of this coming soon.
I haven't done any real runtime testing but it reads fine and regress is
passing.
I have two questions below but the diff reads ok to me as-is :)
> diff refs/heads/main refs/heads/parse-tree-array
> blob - adbb8bb3e8521199cb1e1ee0ae5ad5dce3f3503d
> blob + 79b6e83d9872c4ee03472131af53beb3186ab6be
> --- lib/got_lib_object_parse.h
> +++ lib/got_lib_object_parse.h
> @@ -23,18 +23,14 @@ struct got_tree_entry *got_alloc_tree_entry_partial(vo
> const struct got_error *got_object_parse_commit(struct got_commit_object **,
> char *, size_t);
>
> -/*
> - * In a pathlist returned by got_object_parse_tree(), a pointer to the entry's
> - * name is stored in the pathlist element's path, while the pathlist element's
> - * data pointer points to a struct got_parsed_tree_entry.
> - */
> struct got_parsed_tree_entry {
> + const char *name; /* Points to name in parsed buffer */
> + size_t namelen; /* strlen(name) */
> mode_t mode; /* Mode parsed from tree buffer. */
> uint8_t *id; /* Points to ID in parsed tree buffer. */
> };
> -const struct got_error *got_object_parse_tree(struct got_pathlist_head *, int *,
> - uint8_t *, size_t);
> -void got_object_parsed_tree_entries_free(struct got_pathlist_head *);
> +const struct got_error *got_object_parse_tree(struct got_parsed_tree_entry **,
> + int *, uint8_t *, size_t);
>
> const struct got_error *got_object_parse_tag(struct got_tag_object **,
> uint8_t *, size_t);
> blob - b310a5120a14c05adbe049177d4af040d61f1ab9
> blob + c6937820f54e6d66c1cbf81056a6b916516c9c5d
> --- lib/got_lib_privsep.h
> +++ lib/got_lib_privsep.h
> @@ -662,8 +662,9 @@ const struct got_error *got_privsep_recv_commit(struct
> struct imsgbuf *);
> const struct got_error *got_privsep_recv_tree(struct got_tree_object **,
> struct imsgbuf *);
> +struct got_parsed_tree_entry;
> const struct got_error *got_privsep_send_tree(struct imsgbuf *,
> - struct got_pathlist_head *, int);
> + struct got_parsed_tree_entry *, int);
> const struct got_error *got_privsep_send_blob(struct imsgbuf *, size_t, size_t,
> const uint8_t *);
> const struct got_error *got_privsep_recv_blob(uint8_t **, size_t *, size_t *,
> blob - 511053239b0a892dbb0de04e93ccd9ed8e69e7b8
> blob + 15e34b7d7b360b66d71d7201ab317ddee510e1c9
> --- lib/object_parse.c
> +++ lib/object_parse.c
> @@ -669,89 +669,90 @@ got_object_tree_close(struct got_tree_object *tree)
> }
>
> static const struct got_error *
> -parse_tree_entry(struct got_parsed_tree_entry **pte, const char **name,
> - size_t *elen, char *buf,
> +parse_tree_entry(struct got_parsed_tree_entry *pte, size_t *elen, char *buf,
> size_t maxlen)
> {
> char *p, *space;
> - const struct got_error *err = NULL;
>
> - *name = NULL;
> *elen = 0;
>
> - *pte = malloc(sizeof(**pte));
> - if (*pte == NULL)
> - return got_error_from_errno("malloc");
> -
> *elen = strnlen(buf, maxlen) + 1;
> - if (*elen > maxlen) {
> - free(*pte);
> - *pte = NULL;
> + if (*elen > maxlen)
> return got_error(GOT_ERR_BAD_OBJ_DATA);
> - }
>
> space = memchr(buf, ' ', *elen);
> - if (space == NULL || space <= buf) {
> - err = got_error(GOT_ERR_BAD_OBJ_DATA);
> - free(*pte);
> - *pte = NULL;
> - return err;
> - }
> - (*pte)->mode = 0;
this was the same in the previous code too, but why check for space <=
buf?
> + if (space == NULL || space <= buf)
`space' is either NULL or somewhere between [buf, buf+(*elen)), the <=
is intended just to make sure buf doesn't start with a space right?
> + return got_error(GOT_ERR_BAD_OBJ_DATA);
> +
> + pte->mode = 0;
> p = buf;
> while (p < space) {
> - if (*p < '0' && *p > '7') {
> - err = got_error(GOT_ERR_BAD_OBJ_DATA);
> - goto done;
> - }
> - (*pte)->mode <<= 3;
> - (*pte)->mode |= *p - '0';
> + if (*p < '0' && *p > '7')
> + return got_error(GOT_ERR_BAD_OBJ_DATA);
> + pte->mode <<= 3;
> + pte->mode |= *p - '0';
> p++;
> }
>
> - if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH) {
> - err = got_error(GOT_ERR_BAD_OBJ_DATA);
> - goto done;
> - }
> - *name = space + 1;
> + if (*elen > maxlen || maxlen - *elen < SHA1_DIGEST_LENGTH)
> + return got_error(GOT_ERR_BAD_OBJ_DATA);
> +
> + pte->name = space + 1;
> + pte->namelen = strlen(pte->name);
> buf += *elen;
> - (*pte)->id = buf;
> + pte->id = buf;
> *elen += SHA1_DIGEST_LENGTH;
> -done:
> - if (err) {
> - free(*pte);
> - *pte = NULL;
> - }
> - return err;
> + return NULL;
> }
>
> +static int
> +pte_cmp(const void *pa, const void *pb)
> +{
> + const struct got_parsed_tree_entry *a, *b;
> +
> + a = (const struct got_parsed_tree_entry *)pa;
> + b = (const struct got_parsed_tree_entry *)pb;
are these cast needed? It's just a matter of style, I'm just asking to
be sure so future patches will be coherent with the rest of the code. I
have written some code in got_patch IIRC which uses the implicit cast
from void *
> + return got_path_cmp(a->name, b->name, a->namelen, b->namelen);
> +}
> +
> const struct got_error *
> -got_object_parse_tree(struct got_pathlist_head *entries, int *nentries,
> +got_object_parse_tree(struct got_parsed_tree_entry **entries, int *nentries,
> uint8_t *buf, size_t len)
> {
> const struct got_error *err = NULL;
> - size_t remain = len;
> + size_t remain = len, totalloc;
> + const size_t nalloc = 16;
> + struct got_parsed_tree_entry *pte;
> + int i;
>
> *nentries = 0;
> if (remain == 0)
> return NULL; /* tree is empty */
>
> + *entries = calloc(nalloc, sizeof(**entries));
> + if (*entries == NULL)
> + return got_error_from_errno("calloc");
> + totalloc = nalloc;
> +
> while (remain > 0) {
> - struct got_parsed_tree_entry *pte;
> - struct got_pathlist_entry *new = NULL;
> - const char *name;
> size_t elen;
>
> - err = parse_tree_entry(&pte, &name, &elen, buf, remain);
> - if (err)
> - goto done;
> - err = got_pathlist_insert(&new, entries, name, pte);
> - if (err)
> - goto done;
> - if (new == NULL) {
> - err = got_error(GOT_ERR_TREE_DUP_ENTRY);
> - goto done;
> + if (*nentries >= totalloc) {
> + pte = recallocarray(*entries, totalloc,
> + totalloc + nalloc, sizeof(**entries));
> + if (pte == NULL) {
> + err = got_error_from_errno("recallocarray");
> + goto done;
> + }
> + *entries = pte;
> + totalloc += nalloc;
> }
> +
> + pte = &(*entries)[*nentries];
> + err = parse_tree_entry(pte, &elen, buf, remain);
> + if (err)
> + goto done;
> buf += elen;
> remain -= elen;
> (*nentries)++;
> @@ -761,27 +762,30 @@ got_object_parse_tree(struct got_pathlist_head *entrie
> err = got_error(GOT_ERR_BAD_OBJ_DATA);
> goto done;
> }
> +
> + if (*nentries > 1) {
> + mergesort(*entries, *nentries, sizeof(**entries), pte_cmp);
> +
> + for (i = 0; i < *nentries - 1; i++) {
> + struct got_parsed_tree_entry *prev = &(*entries)[i];
> + pte = &(*entries)[i + 1];
> + if (got_path_cmp(prev->name, pte->name,
> + prev->namelen, pte->namelen) == 0) {
> + err = got_error(GOT_ERR_TREE_DUP_ENTRY);
> + break;
> + }
> + }
> + }
> done:
> if (err) {
> - got_object_parsed_tree_entries_free(entries);
> + free(*entries);
> + *entries = NULL;
> *nentries = 0;
> }
> return err;
> }
>
> void
> -got_object_parsed_tree_entries_free(struct got_pathlist_head *entries)
> -{
> - struct got_pathlist_entry *pe;
> -
> - TAILQ_FOREACH(pe, entries, entry) {
> - struct got_parsed_tree_entry *pte = pe->data;
> - free(pte);
> - }
> - got_pathlist_free(entries);
> -}
> -
> -void
> got_object_tag_close(struct got_tag_object *tag)
> {
> if (tag->refcnt > 0) {
> blob - e7450fd7c2dce123856dba8a632d60b237b54e91
> blob + 7b7f894dda9fc261842dfd019c4e9c2f444508a0
> --- lib/privsep.c
> +++ lib/privsep.c
> @@ -1490,14 +1490,13 @@ got_privsep_recv_commit(struct got_commit_object **com
> }
>
> const struct got_error *
> -got_privsep_send_tree(struct imsgbuf *ibuf, struct got_pathlist_head *entries,
> - int nentries)
> +got_privsep_send_tree(struct imsgbuf *ibuf,
> + struct got_parsed_tree_entry *entries, int nentries)
> {
> const struct got_error *err = NULL;
> struct got_imsg_tree_object itree;
> - struct got_pathlist_entry *pe;
> size_t totlen;
> - int nimsg; /* number of imsg queued in ibuf */
> + int i, nimsg; /* number of imsg queued in ibuf */
>
> itree.nentries = nentries;
> if (imsg_compose(ibuf, GOT_IMSG_TREE, 0, 0, -1, &itree, sizeof(itree))
> @@ -1506,12 +1505,10 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got
>
> totlen = sizeof(itree);
> nimsg = 1;
> - TAILQ_FOREACH(pe, entries, entry) {
> - const char *name = pe->path;
> - struct got_parsed_tree_entry *pte = pe->data;
> + for (i = 0; i < nentries; i++) {
> + struct got_parsed_tree_entry *pte = &entries[i];
> struct ibuf *wbuf;
> - size_t namelen = strlen(name);
> - size_t len = sizeof(struct got_imsg_tree_entry) + namelen;
> + size_t len = sizeof(struct got_imsg_tree_entry) + pte->namelen;
>
> if (len > MAX_IMSGSIZE)
> return got_error(GOT_ERR_NO_SPACE);
> @@ -1540,7 +1537,7 @@ got_privsep_send_tree(struct imsgbuf *ibuf, struct got
> return err;
> }
>
> - if (imsg_add(wbuf, name, namelen) == -1) {
> + if (imsg_add(wbuf, pte->name, pte->namelen) == -1) {
> err = got_error_from_errno("imsg_add TREE_ENTRY");
> ibuf_free(wbuf);
> return err;
> blob - 20a67c9701816c2474c4d9e546a0c6f4aeb8d0ae
> blob + b6d3c895f92db84cc93320a4f869170172782abd
> --- libexec/got-read-pack/got-read-pack.c
> +++ libexec/got-read-pack/got-read-pack.c
> @@ -177,7 +177,7 @@ done:
> }
>
> static const struct got_error *
> -open_tree(uint8_t **buf, struct got_pathlist_head *entries, int *nentries,
> +open_tree(uint8_t **buf, struct got_parsed_tree_entry **entries, int *nentries,
> struct got_pack *pack, struct got_packidx *packidx, int obj_idx,
> struct got_object_id *id, struct got_object_cache *objcache)
> {
> @@ -220,14 +220,12 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf,
> {
> const struct got_error *err = NULL;
> struct got_imsg_packed_object iobj;
> - struct got_pathlist_head entries;
> + struct got_parsed_tree_entry *entries = NULL;
> int nentries = 0;
> uint8_t *buf = NULL;
> struct got_object_id id;
> size_t datalen;
>
> - TAILQ_INIT(&entries);
> -
> datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
> if (datalen != sizeof(iobj))
> return got_error(GOT_ERR_PRIVSEP_LEN);
> @@ -239,8 +237,8 @@ tree_request(struct imsg *imsg, struct imsgbuf *ibuf,
> if (err)
> return err;
>
> - err = got_privsep_send_tree(ibuf, &entries, nentries);
> - got_object_parsed_tree_entries_free(&entries);
> + err = got_privsep_send_tree(ibuf, entries, nentries);
> + free(entries);
> free(buf);
> if (err) {
> if (err->code == GOT_ERR_PRIVSEP_PIPE)
> @@ -427,28 +425,30 @@ done:
> }
>
> static struct got_parsed_tree_entry *
> -find_entry_by_name(struct got_pathlist_head *entries, int nentries,
> +find_entry_by_name(struct got_parsed_tree_entry *entries, int nentries,
> const char *name, size_t len)
> {
> - struct got_pathlist_entry *pe;
> + struct got_parsed_tree_entry *pte;
> + int cmp, i;
>
> /* Note that tree entries are sorted in strncmp() order. */
> - TAILQ_FOREACH(pe, entries, entry) {
> - int cmp = strncmp(pe->path, name, len);
> + for (i = 0; i < nentries; i++) {
> + pte = &entries[i];
> + cmp = strncmp(pte->name, name, len);
> if (cmp < 0)
> continue;
> if (cmp > 0)
> break;
> - if (pe->path[len] == '\0')
> - return (struct got_parsed_tree_entry *)pe->data;
> + if (pte->name[len] == '\0')
> + return pte;
> }
> return NULL;
> }
>
> static const struct got_error *
> tree_path_changed(int *changed, uint8_t **buf1, uint8_t **buf2,
> - struct got_pathlist_head *entries1, int *nentries1,
> - struct got_pathlist_head *entries2, int *nentries2,
> + struct got_parsed_tree_entry **entries1, int *nentries1,
> + struct got_parsed_tree_entry **entries2, int *nentries2,
> const char *path, struct got_pack *pack, struct got_packidx *packidx,
> struct imsgbuf *ibuf, struct got_object_cache *objcache)
> {
> @@ -476,13 +476,13 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
> continue;
> }
>
> - pte1 = find_entry_by_name(entries1, *nentries1, seg, seglen);
> + pte1 = find_entry_by_name(*entries1, *nentries1, seg, seglen);
> if (pte1 == NULL) {
> err = got_error(GOT_ERR_NO_OBJ);
> break;
> }
>
> - pte2 = find_entry_by_name(entries2, *nentries2, seg, seglen);
> + pte2 = find_entry_by_name(*entries2, *nentries2, seg, seglen);
> if (pte2 == NULL) {
> *changed = 1;
> break;
> @@ -516,7 +516,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
> err = got_error_no_obj(&id1);
> break;
> }
> - got_object_parsed_tree_entries_free(entries1);
> + free(*entries1);
> *nentries1 = 0;
> free(*buf1);
> *buf1 = NULL;
> @@ -532,7 +532,7 @@ tree_path_changed(int *changed, uint8_t **buf1, uint8_
> err = got_error_no_obj(&id2);
> break;
> }
> - got_object_parsed_tree_entries_free(entries2);
> + free(*entries2);
> *nentries2 = 0;
> free(*buf2);
> *buf2 = NULL;
> @@ -602,7 +602,7 @@ commit_traversal_request(struct imsg *imsg, struct ims
> struct got_imsg_packed_object iobj;
> struct got_object_qid *pid;
> struct got_commit_object *commit = NULL, *pcommit = NULL;
> - struct got_pathlist_head entries, pentries;
> + struct got_parsed_tree_entry *entries = NULL, *pentries = NULL;
> int nentries = 0, pnentries = 0;
> struct got_object_id id;
> size_t datalen, path_len;
> @@ -611,9 +611,6 @@ commit_traversal_request(struct imsg *imsg, struct ims
> int changed = 0, ncommits = 0, nallocated = 0;
> struct got_object_id *commit_ids = NULL;
>
> - TAILQ_INIT(&entries);
> - TAILQ_INIT(&pentries);
> -
> datalen = imsg->hdr.len - IMSG_HEADER_SIZE;
> if (datalen < sizeof(iobj))
> return got_error(GOT_ERR_PRIVSEP_LEN);
> @@ -731,10 +728,12 @@ commit_traversal_request(struct imsg *imsg, struct ims
> &entries, &nentries, &pentries, &pnentries, path,
> pack, packidx, ibuf, objcache);
>
> - got_object_parsed_tree_entries_free(&entries);
> + free(entries);
> + entries = NULL;
> nentries = 0;
> free(buf);
> - got_object_parsed_tree_entries_free(&pentries);
> + free(pentries);
> + pentries = NULL;
> pnentries = 0;
> free(pbuf);
> if (err) {
> @@ -771,10 +770,8 @@ done:
> got_object_commit_close(commit);
> if (pcommit)
> got_object_commit_close(pcommit);
> - if (nentries != 0)
> - got_object_parsed_tree_entries_free(&entries);
> - if (pnentries != 0)
> - got_object_parsed_tree_entries_free(&pentries);
> + free(entries);
> + free(pentries);
> if (err) {
> if (err->code == GOT_ERR_PRIVSEP_PIPE)
> err = NULL;
> blob - 95e34c068c96ce1a4a00a1ea034531e243919dfd
> blob + 225f928a01e58925e0d8bcb3e1ffb1a2cc09d631
> --- libexec/got-read-tree/got-read-tree.c
> +++ libexec/got-read-tree/got-read-tree.c
> @@ -50,7 +50,7 @@ catch_sigint(int signo)
> }
>
> static const struct got_error *
> -read_tree_object(struct got_pathlist_head *entries, int *nentries,
> +read_tree_object(struct got_parsed_tree_entry **entries, int *nentries,
> uint8_t **p, FILE *f, struct got_object_id *expected_id)
> {
> const struct got_error *err = NULL;
> @@ -119,13 +119,11 @@ main(int argc, char *argv[])
> for (;;) {
> struct imsg imsg;
> FILE *f = NULL;
> - struct got_pathlist_head entries;
> + struct got_parsed_tree_entry *entries = NULL;
> int nentries = 0;
> uint8_t *buf = NULL;
> struct got_object_id expected_id;
>
> - TAILQ_INIT(&entries);
> -
> if (sigint_received) {
> err = got_error(GOT_ERR_CANCELLED);
> break;
> @@ -170,9 +168,9 @@ main(int argc, char *argv[])
> if (err)
> goto done;
>
> - err = got_privsep_send_tree(&ibuf, &entries, nentries);
> + err = got_privsep_send_tree(&ibuf, entries, nentries);
> done:
> - got_object_parsed_tree_entries_free(&entries);
> + free(entries);
> free(buf);
> if (f) {
> if (fclose(f) == EOF && err == NULL)
speed up tree-entry parsing