From: Mark Jamsek Subject: optimise reading the file index To: gameoftrees@openbsd.org Date: Thu, 27 Jul 2023 18:04:21 +1000 This diff is borne from a discussion with stsp on irc, and attempts to make reading the file index faster, which should improve many operations. The main change is that we memory map the file index to avoid lots of small reads that can be costly in large repos, for example, with many files. And we also reduce multiple small allocations down to one allocation for the entire padded path length when building each index entry. If memory mapping the file fails, however, we fall back to the file io status quo. Regress is still happy, but I haven't properly profiled this yet. Some trivial measurements show a marked improvement, and it appears to be perceptibly faster; however, my machine is already quite fast even with 'got status' in src.git so I'm looking for wider testing to see if this is indeed worth it. In theory it should be, but I'd rather have some good measurements. ----------------------------------------------- commit 268225b94f834976b5c424d7f119b0ec49c096c7 (index) from: Mark Jamsek date: Thu Jul 27 07:59:22 2023 UTC optimise reading the file index Memory map the file index to avoid lots of small reads, and reduce multiple pathname allocations to one for the entire padded path length when building index entries. M lib/fileindex.c | 226+ 23- M lib/got_lib_fileindex.h | 2+ 1- M lib/worktree.c | 25+ 4- M lib/worktree_cvg.c | 25+ 4- 4 files changed, 278 insertions(+), 32 deletions(-) diff c2ba7aa6808a2583895c024f5c85fff03948494e 268225b94f834976b5c424d7f119b0ec49c096c7 commit - c2ba7aa6808a2583895c024f5c85fff03948494e commit + 268225b94f834976b5c424d7f119b0ec49c096c7 blob - e989c6b7cd01f55aa6173b8bbf04441802b11fe6 blob + 0d993cfe6294f00a78e2125d31d3bdd5798aa2dc --- lib/fileindex.c +++ lib/fileindex.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -537,6 +538,187 @@ read_fileindex_val64(uint64_t *val, struct got_hash *c } static const struct got_error * +mread_fileindex_val64(uint64_t *val, struct got_hash *ctx, const uint8_t *p, + size_t psz, size_t *offset) +{ + if (psz < *offset + sizeof(*val)) + return got_error(GOT_ERR_FILEIDX_BAD); + + memcpy(val, p + *offset, sizeof(*val)); + got_hash_update(ctx, val, sizeof(*val)); + *val = be64toh(*val); + *offset += sizeof(*val); + + return NULL; +} + +static const struct got_error * +mread_fileindex_val32(uint32_t *val, struct got_hash *ctx, const uint8_t *p, + size_t psz, size_t *offset) +{ + if (psz < *offset + sizeof(*val)) + return got_error(GOT_ERR_FILEIDX_BAD); + + memcpy(val, p + *offset, sizeof(*val)); + got_hash_update(ctx, val, sizeof(*val)); + *val = be32toh(*val); + *offset += sizeof(*val); + + return NULL; +} + +static const struct got_error * +mread_fileindex_val16(uint16_t *val, struct got_hash *ctx, const uint8_t *p, + size_t psz, size_t *offset) +{ + if (psz < *offset + sizeof(*val)) + return got_error(GOT_ERR_FILEIDX_BAD); + + memcpy(val, p + *offset, sizeof(*val)); + got_hash_update(ctx, val, sizeof(*val)); + *val = be16toh(*val); + *offset += sizeof(*val); + + return NULL; +} + + +static const struct got_error * +mread_fileindex_sha1bin(uint8_t *val, struct got_hash *ctx, const uint8_t *p, + size_t psz, size_t *offset) +{ + if (psz < *offset + SHA1_DIGEST_LENGTH) + return got_error(GOT_ERR_FILEIDX_BAD); + + memcpy(val, p + *offset, SHA1_DIGEST_LENGTH); + got_hash_update(ctx, val, SHA1_DIGEST_LENGTH); + *offset += SHA1_DIGEST_LENGTH; + + return NULL; +} + +static const struct got_error * +mread_fileindex_path(char **path, struct got_hash *ctx, const uint8_t *map, + size_t mapsz, size_t *offset) +{ + const uint8_t *p, *nul; + size_t len, pad, pathlen; + + if (mapsz < *offset) + return got_error(GOT_ERR_FILEIDX_BAD); + + p = map + *offset; + + nul = memchr(p, '\0', mapsz); + len = nul - p; + pad = 8 - len % 8; + + pathlen = len + pad; + + if (mapsz < *offset + pathlen) + return got_error(GOT_ERR_FILEIDX_BAD); + + *path = malloc(pathlen); + if (*path == NULL) + return got_error_from_errno("malloc"); + + memcpy(*path, p, pathlen); + got_hash_update(ctx, *path, pathlen); + + *offset += pathlen; + + return NULL; +} + +static const struct got_error * +mread_fileindex_entry(struct got_fileindex_entry **iep, struct got_hash *ctx, + const uint8_t *map, size_t mapsz, uint32_t version, size_t *offset) +{ + const struct got_error *err; + struct got_fileindex_entry *ie; + const uint8_t *p; + size_t n = 0; + + *iep = NULL; + + if (mapsz < *offset) + return got_error(GOT_ERR_FILEIDX_BAD); + + p = map + *offset; + + ie = calloc(1, sizeof(*ie)); + if (ie == NULL) + return got_error_from_errno("calloc"); + + err = mread_fileindex_val64(&ie->ctime_sec, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + err = mread_fileindex_val64(&ie->ctime_nsec, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + err = mread_fileindex_val64(&ie->mtime_sec, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + err = mread_fileindex_val64(&ie->mtime_nsec, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + + err = mread_fileindex_val32(&ie->uid, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + err = mread_fileindex_val32(&ie->gid, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + err = mread_fileindex_val32(&ie->size, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + + err = mread_fileindex_val16(&ie->mode, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + + err = mread_fileindex_sha1bin(ie->blob_sha1, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + err = mread_fileindex_sha1bin(ie->commit_sha1, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + + err = mread_fileindex_val32(&ie->flags, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + + err = mread_fileindex_path(&ie->path, ctx, p, mapsz, &n); + if (err != NULL) + goto done; + + if (version >= 2) { + uint32_t stage; + + stage = got_fileindex_entry_stage_get(ie); + if (stage == GOT_FILEIDX_STAGE_MODIFY || + stage == GOT_FILEIDX_STAGE_ADD) { + err = mread_fileindex_sha1bin(ie->staged_blob_sha1, + ctx, p, mapsz, &n); + if (err != NULL) + goto done; + } + } else { + /* GOT_FILE_INDEX_VERSION 1 does not support staging. */ + ie->flags &= ~GOT_FILEIDX_F_STAGE; + } + + *offset += n; + +done: + if (err != NULL) + got_fileindex_entry_free(ie); + else + *iep = ie; + return err; +} + +static const struct got_error * read_fileindex_val64(uint64_t *val, struct got_hash *ctx, FILE *infile) { size_t n; @@ -702,7 +884,8 @@ got_fileindex_read(struct got_fileindex *fileindex, FI } const struct got_error * -got_fileindex_read(struct got_fileindex *fileindex, FILE *infile) +got_fileindex_read(struct got_fileindex *fileindex, FILE *infile, + const uint8_t *map, size_t mapsz) { const struct got_error *err = NULL; struct got_fileindex_hdr hdr; @@ -710,29 +893,38 @@ got_fileindex_read(struct got_fileindex *fileindex, FI struct got_fileindex_entry *ie; uint8_t sha1_expected[SHA1_DIGEST_LENGTH]; uint8_t sha1[SHA1_DIGEST_LENGTH]; - size_t n; + size_t n, offset; int i; got_hash_init(&ctx, GOT_HASH_SHA1); - n = fread(&hdr.signature, 1, sizeof(hdr.signature), infile); - if (n != sizeof(hdr.signature)) { - if (n == 0) /* EOF */ - return NULL; - return got_ferror(infile, GOT_ERR_FILEIDX_BAD); + if (map != NULL) { + offset = offsetof(struct got_fileindex_hdr, sha1); + if (mapsz < offset) + return got_error(GOT_ERR_FILEIDX_BAD); + + /* copy hdr up to and including nentries */ + memcpy(&hdr, map, offset); + } else { + n = fread(&hdr.signature, 1, sizeof(hdr.signature), infile); + if (n != sizeof(hdr.signature)) { + if (n == 0) /* EOF */ + return NULL; + return got_ferror(infile, GOT_ERR_FILEIDX_BAD); + } + n = fread(&hdr.version, 1, sizeof(hdr.version), infile); + if (n != sizeof(hdr.version)) { + if (n == 0) /* EOF */ + return NULL; + return got_ferror(infile, GOT_ERR_FILEIDX_BAD); + } + n = fread(&hdr.nentries, 1, sizeof(hdr.nentries), infile); + if (n != sizeof(hdr.nentries)) { + if (n == 0) /* EOF */ + return NULL; + return got_ferror(infile, GOT_ERR_FILEIDX_BAD); + } } - n = fread(&hdr.version, 1, sizeof(hdr.version), infile); - if (n != sizeof(hdr.version)) { - if (n == 0) /* EOF */ - return NULL; - return got_ferror(infile, GOT_ERR_FILEIDX_BAD); - } - n = fread(&hdr.nentries, 1, sizeof(hdr.nentries), infile); - if (n != sizeof(hdr.nentries)) { - if (n == 0) /* EOF */ - return NULL; - return got_ferror(infile, GOT_ERR_FILEIDX_BAD); - } got_hash_update(&ctx, &hdr.signature, sizeof(hdr.signature)); got_hash_update(&ctx, &hdr.version, sizeof(hdr.version)); @@ -748,7 +940,12 @@ got_fileindex_read(struct got_fileindex *fileindex, FI return got_error(GOT_ERR_FILEIDX_VER); for (i = 0; i < hdr.nentries; i++) { - err = read_fileindex_entry(&ie, &ctx, infile, hdr.version); + if (map != NULL) + err = mread_fileindex_entry(&ie, &ctx, map, + mapsz, hdr.version, &offset); + else + err = read_fileindex_entry(&ie, &ctx, infile, + hdr.version); if (err) return err; err = add_entry(fileindex, ie); @@ -758,9 +955,15 @@ got_fileindex_read(struct got_fileindex *fileindex, FI } } - n = fread(sha1_expected, 1, sizeof(sha1_expected), infile); - if (n != sizeof(sha1_expected)) - return got_ferror(infile, GOT_ERR_FILEIDX_BAD); + if (map != NULL) { + if (mapsz < offset + sizeof(sha1_expected)) + return got_error(GOT_ERR_FILEIDX_BAD); + memcpy(sha1_expected, map + offset, sizeof(sha1_expected)); + } else { + n = fread(sha1_expected, 1, sizeof(sha1_expected), infile); + if (n != sizeof(sha1_expected)) + return got_ferror(infile, GOT_ERR_FILEIDX_BAD); + } got_hash_final(&ctx, sha1); if (memcmp(sha1, sha1_expected, SHA1_DIGEST_LENGTH) != 0) return got_error(GOT_ERR_FILEIDX_CSUM); blob - 7045733009e3c274bef6f25b589a001bff3de260 blob + c2e4498c52366b3ce1d284297d23ff4006158aed --- lib/got_lib_fileindex.h +++ lib/got_lib_fileindex.h @@ -123,7 +123,8 @@ const struct got_error *got_fileindex_read(struct got_ struct got_fileindex_entry *); struct got_fileindex_entry *got_fileindex_entry_get(struct got_fileindex *, const char *, size_t); -const struct got_error *got_fileindex_read(struct got_fileindex *, FILE *); +const struct got_error *got_fileindex_read(struct got_fileindex *, FILE *, + const uint8_t *, size_t); typedef const struct got_error *(*got_fileindex_cb)(void *, struct got_fileindex_entry *); const struct got_error *got_fileindex_for_each_entry_safe( blob - 2cdafab5a89da7e31a55bc887af42b14226637d8 blob + f101b3ce7d8eb7d7214bfe274d32a7256d86fd33 --- lib/worktree.c +++ lib/worktree.c @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -2508,6 +2509,8 @@ open_fileindex(struct got_fileindex **fileindex, char { const struct got_error *err = NULL; FILE *index = NULL; + struct stat st; + uint8_t *map = NULL; *fileindex_path = NULL; *fileindex = got_fileindex_alloc(); @@ -2522,12 +2525,30 @@ open_fileindex(struct got_fileindex **fileindex, char if (index == NULL) { if (errno != ENOENT) err = got_error_from_errno2("fopen", *fileindex_path); - } else { - err = got_fileindex_read(*fileindex, index); - if (fclose(index) == EOF && err == NULL) - err = got_error_from_errno("fclose"); + goto done; } + + if (fstat(fileno(index), &st) == -1) { + err = got_error_from_errno2("fstat", *fileindex_path); + goto done; + } + + map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, + fileno(index), 0); + if (map == MAP_FAILED) { + /* fallback to file io */; + map = NULL; + } + + err = got_fileindex_read(*fileindex, index, map, st.st_size); + done: + if (index != NULL && fclose(index) == EOF && err == NULL) + err = got_error_from_errno("fclose"); + if (map != NULL) { + if (munmap(map, st.st_size) == -1 && err == NULL) + err = got_error_from_errno("munmap"); + } if (err) { free(*fileindex_path); *fileindex_path = NULL; blob - f488cd9dcd98b68c647973cb84a54e18daefced4 blob + 0f85630c6b83c2a66ca8e5f67fa7cba5a1e8e453 --- lib/worktree_cvg.c +++ lib/worktree_cvg.c @@ -15,6 +15,7 @@ */ #include +#include #include #include @@ -567,6 +568,8 @@ open_fileindex(struct got_fileindex **fileindex, char { const struct got_error *err = NULL; FILE *index = NULL; + struct stat st; + uint8_t *map = NULL; *fileindex_path = NULL; *fileindex = got_fileindex_alloc(); @@ -581,12 +584,30 @@ open_fileindex(struct got_fileindex **fileindex, char if (index == NULL) { if (errno != ENOENT) err = got_error_from_errno2("fopen", *fileindex_path); - } else { - err = got_fileindex_read(*fileindex, index); - if (fclose(index) == EOF && err == NULL) - err = got_error_from_errno("fclose"); + goto done; } + + if (fstat(fileno(index), &st) == -1) { + err = got_error_from_errno2("fstat", *fileindex_path); + goto done; + } + + map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, + fileno(index), 0); + if (map == MAP_FAILED) { + /* fallback to file io */; + map = NULL; + } + + err = got_fileindex_read(*fileindex, index, map, st.st_size); + done: + if (index != NULL && fclose(index) == EOF && err == NULL) + err = got_error_from_errno("fclose"); + if (map != NULL) { + if (munmap(map, st.st_size) == -1 && err == NULL) + err = got_error_from_errno("munmap"); + } if (err) { free(*fileindex_path); *fileindex_path = NULL; -- Mark Jamsek GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68