"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Mark Jamsek <mark@jamsek.com>
Subject:
optimise reading the file index
To:
gameoftrees@openbsd.org
Date:
Thu, 27 Jul 2023 18:04:21 +1000

Download raw body.

Thread
This diff is borne from a discussion with stsp on irc, and attempts to
make reading the file index faster, which should improve many operations.

The main change is that we memory map the file index to avoid lots of
small reads that can be costly in large repos, for example, with many
files. And we also reduce multiple small allocations down to one
allocation for the entire padded path length when building each index
entry. If memory mapping the file fails, however, we fall back to the
file io status quo.

Regress is still happy, but I haven't properly profiled this yet. Some
trivial measurements show a marked improvement, and it appears to be
perceptibly faster; however, my machine is already quite fast even with
'got status' in src.git so I'm looking for wider testing to see if this
is indeed worth it. In theory it should be, but I'd rather have some
good measurements.


-----------------------------------------------
commit 268225b94f834976b5c424d7f119b0ec49c096c7 (index)
from: Mark Jamsek <mark@jamsek.dev>
date: Thu Jul 27 07:59:22 2023 UTC
 
 optimise reading the file index
 
 Memory map the file index to avoid lots of small reads, and reduce multiple
 pathname allocations to one for the entire padded path length when building
 index entries.
 
 M  lib/fileindex.c          |  226+  23-
 M  lib/got_lib_fileindex.h  |    2+   1-
 M  lib/worktree.c           |   25+   4-
 M  lib/worktree_cvg.c       |   25+   4-

4 files changed, 278 insertions(+), 32 deletions(-)

diff c2ba7aa6808a2583895c024f5c85fff03948494e 268225b94f834976b5c424d7f119b0ec49c096c7
commit - c2ba7aa6808a2583895c024f5c85fff03948494e
commit + 268225b94f834976b5c424d7f119b0ec49c096c7
blob - e989c6b7cd01f55aa6173b8bbf04441802b11fe6
blob + 0d993cfe6294f00a78e2125d31d3bdd5798aa2dc
--- lib/fileindex.c
+++ lib/fileindex.c
@@ -21,6 +21,7 @@
 #include <errno.h>
 #include <dirent.h>
 #include <fcntl.h>
+#include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -537,6 +538,187 @@ read_fileindex_val64(uint64_t *val, struct got_hash *c
 }
 
 static const struct got_error *
+mread_fileindex_val64(uint64_t *val, struct got_hash *ctx, const uint8_t *p,
+    size_t psz, size_t *offset)
+{
+	if (psz < *offset + sizeof(*val))
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	memcpy(val, p + *offset, sizeof(*val));
+	got_hash_update(ctx, val, sizeof(*val));
+	*val = be64toh(*val);
+	*offset += sizeof(*val);
+
+	return NULL;
+}
+
+static const struct got_error *
+mread_fileindex_val32(uint32_t *val, struct got_hash *ctx, const uint8_t *p,
+    size_t psz, size_t *offset)
+{
+	if (psz < *offset + sizeof(*val))
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	memcpy(val, p + *offset, sizeof(*val));
+	got_hash_update(ctx, val, sizeof(*val));
+	*val = be32toh(*val);
+	*offset += sizeof(*val);
+
+	return NULL;
+}
+
+static const struct got_error *
+mread_fileindex_val16(uint16_t *val, struct got_hash *ctx, const uint8_t *p,
+    size_t psz, size_t *offset)
+{
+	if (psz < *offset + sizeof(*val))
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	memcpy(val, p + *offset, sizeof(*val));
+	got_hash_update(ctx, val, sizeof(*val));
+	*val = be16toh(*val);
+	*offset += sizeof(*val);
+
+	return NULL;
+}
+
+
+static const struct got_error *
+mread_fileindex_sha1bin(uint8_t *val, struct got_hash *ctx, const uint8_t *p,
+    size_t psz, size_t *offset)
+{
+	if (psz < *offset + SHA1_DIGEST_LENGTH)
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	memcpy(val, p + *offset, SHA1_DIGEST_LENGTH);
+	got_hash_update(ctx, val, SHA1_DIGEST_LENGTH);
+	*offset += SHA1_DIGEST_LENGTH;
+
+	return NULL;
+}
+
+static const struct got_error *
+mread_fileindex_path(char **path, struct got_hash *ctx, const uint8_t *map,
+    size_t mapsz, size_t *offset)
+{
+	const uint8_t	*p, *nul;
+	size_t		 len, pad, pathlen;
+
+	if (mapsz < *offset)
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	p = map + *offset;
+
+	nul = memchr(p, '\0', mapsz);
+	len = nul - p;
+	pad = 8 - len % 8;
+
+	pathlen = len + pad;
+
+	if (mapsz < *offset + pathlen)
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	*path = malloc(pathlen);
+	if (*path == NULL)
+		return got_error_from_errno("malloc");
+
+	memcpy(*path, p, pathlen);
+	got_hash_update(ctx, *path, pathlen);
+
+	*offset += pathlen;
+
+	return NULL;
+}
+
+static const struct got_error *
+mread_fileindex_entry(struct got_fileindex_entry **iep, struct got_hash *ctx,
+    const uint8_t *map, size_t mapsz, uint32_t version, size_t *offset)
+{
+	const struct got_error		*err;
+	struct got_fileindex_entry	*ie;
+	const uint8_t			*p;
+	size_t				 n = 0;
+
+	*iep = NULL;
+
+	if (mapsz < *offset)
+		return got_error(GOT_ERR_FILEIDX_BAD);
+
+	p = map + *offset;
+
+	ie = calloc(1, sizeof(*ie));
+	if (ie == NULL)
+		return got_error_from_errno("calloc");
+
+	err = mread_fileindex_val64(&ie->ctime_sec, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+	err = mread_fileindex_val64(&ie->ctime_nsec, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+	err = mread_fileindex_val64(&ie->mtime_sec, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+	err = mread_fileindex_val64(&ie->mtime_nsec, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+
+	err = mread_fileindex_val32(&ie->uid, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+	err = mread_fileindex_val32(&ie->gid, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+	err = mread_fileindex_val32(&ie->size, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+
+	err = mread_fileindex_val16(&ie->mode, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+
+	err = mread_fileindex_sha1bin(ie->blob_sha1, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+	err = mread_fileindex_sha1bin(ie->commit_sha1, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+
+	err = mread_fileindex_val32(&ie->flags, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+
+	err = mread_fileindex_path(&ie->path, ctx, p, mapsz, &n);
+	if (err != NULL)
+		goto done;
+
+	if (version >= 2) {
+		uint32_t stage;
+
+		stage = got_fileindex_entry_stage_get(ie);
+		if (stage == GOT_FILEIDX_STAGE_MODIFY ||
+		    stage == GOT_FILEIDX_STAGE_ADD) {
+			err = mread_fileindex_sha1bin(ie->staged_blob_sha1,
+			    ctx, p, mapsz, &n);
+			if (err != NULL)
+				goto done;
+		}
+	} else {
+		/* GOT_FILE_INDEX_VERSION 1 does not support staging. */
+		ie->flags &= ~GOT_FILEIDX_F_STAGE;
+	}
+
+	*offset += n;
+
+done:
+	if (err != NULL)
+		got_fileindex_entry_free(ie);
+	else
+		*iep = ie;
+	return err;
+}
+
+static const struct got_error *
 read_fileindex_val64(uint64_t *val, struct got_hash *ctx, FILE *infile)
 {
 	size_t n;
@@ -702,7 +884,8 @@ got_fileindex_read(struct got_fileindex *fileindex, FI
 }
 
 const struct got_error *
-got_fileindex_read(struct got_fileindex *fileindex, FILE *infile)
+got_fileindex_read(struct got_fileindex *fileindex, FILE *infile,
+    const uint8_t *map, size_t mapsz)
 {
 	const struct got_error *err = NULL;
 	struct got_fileindex_hdr hdr;
@@ -710,29 +893,38 @@ got_fileindex_read(struct got_fileindex *fileindex, FI
 	struct got_fileindex_entry *ie;
 	uint8_t sha1_expected[SHA1_DIGEST_LENGTH];
 	uint8_t sha1[SHA1_DIGEST_LENGTH];
-	size_t n;
+	size_t n, offset;
 	int i;
 
 	got_hash_init(&ctx, GOT_HASH_SHA1);
 
-	n = fread(&hdr.signature, 1, sizeof(hdr.signature), infile);
-	if (n != sizeof(hdr.signature)) {
-		if (n == 0) /* EOF */
-			return NULL;
-		return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
+	if (map != NULL) {
+		offset = offsetof(struct got_fileindex_hdr, sha1);
+		if (mapsz < offset)
+			return got_error(GOT_ERR_FILEIDX_BAD);
+
+		/* copy hdr up to and including nentries */
+		memcpy(&hdr, map, offset);
+	} else {
+		n = fread(&hdr.signature, 1, sizeof(hdr.signature), infile);
+		if (n != sizeof(hdr.signature)) {
+			if (n == 0) /* EOF */
+				return NULL;
+			return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
+		}
+		n = fread(&hdr.version, 1, sizeof(hdr.version), infile);
+		if (n != sizeof(hdr.version)) {
+			if (n == 0) /* EOF */
+				return NULL;
+			return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
+		}
+		n = fread(&hdr.nentries, 1, sizeof(hdr.nentries), infile);
+		if (n != sizeof(hdr.nentries)) {
+			if (n == 0) /* EOF */
+				return NULL;
+			return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
+		}
 	}
-	n = fread(&hdr.version, 1, sizeof(hdr.version), infile);
-	if (n != sizeof(hdr.version)) {
-		if (n == 0) /* EOF */
-			return NULL;
-		return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
-	}
-	n = fread(&hdr.nentries, 1, sizeof(hdr.nentries), infile);
-	if (n != sizeof(hdr.nentries)) {
-		if (n == 0) /* EOF */
-			return NULL;
-		return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
-	}
 
 	got_hash_update(&ctx, &hdr.signature, sizeof(hdr.signature));
 	got_hash_update(&ctx, &hdr.version, sizeof(hdr.version));
@@ -748,7 +940,12 @@ got_fileindex_read(struct got_fileindex *fileindex, FI
 		return got_error(GOT_ERR_FILEIDX_VER);
 
 	for (i = 0; i < hdr.nentries; i++) {
-		err = read_fileindex_entry(&ie, &ctx, infile, hdr.version);
+		if (map != NULL)
+			err = mread_fileindex_entry(&ie, &ctx, map,
+			    mapsz, hdr.version, &offset);
+		else
+			err = read_fileindex_entry(&ie, &ctx, infile,
+			    hdr.version);
 		if (err)
 			return err;
 		err = add_entry(fileindex, ie);
@@ -758,9 +955,15 @@ got_fileindex_read(struct got_fileindex *fileindex, FI
 		}
 	}
 
-	n = fread(sha1_expected, 1, sizeof(sha1_expected), infile);
-	if (n != sizeof(sha1_expected))
-		return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
+	if (map != NULL) {
+		if (mapsz < offset + sizeof(sha1_expected))
+			return got_error(GOT_ERR_FILEIDX_BAD);
+		memcpy(sha1_expected, map + offset, sizeof(sha1_expected));
+	} else {
+		n = fread(sha1_expected, 1, sizeof(sha1_expected), infile);
+		if (n != sizeof(sha1_expected))
+			return got_ferror(infile, GOT_ERR_FILEIDX_BAD);
+	}
 	got_hash_final(&ctx, sha1);
 	if (memcmp(sha1, sha1_expected, SHA1_DIGEST_LENGTH) != 0)
 		return got_error(GOT_ERR_FILEIDX_CSUM);
blob - 7045733009e3c274bef6f25b589a001bff3de260
blob + c2e4498c52366b3ce1d284297d23ff4006158aed
--- lib/got_lib_fileindex.h
+++ lib/got_lib_fileindex.h
@@ -123,7 +123,8 @@ const struct got_error *got_fileindex_read(struct got_
     struct got_fileindex_entry *);
 struct got_fileindex_entry *got_fileindex_entry_get(struct got_fileindex *,
     const char *, size_t);
-const struct got_error *got_fileindex_read(struct got_fileindex *, FILE *);
+const struct got_error *got_fileindex_read(struct got_fileindex *, FILE *,
+    const uint8_t *, size_t);
 typedef const struct got_error *(*got_fileindex_cb)(void *,
     struct got_fileindex_entry *);
 const struct got_error *got_fileindex_for_each_entry_safe(
blob - 2cdafab5a89da7e31a55bc887af42b14226637d8
blob + f101b3ce7d8eb7d7214bfe274d32a7256d86fd33
--- lib/worktree.c
+++ lib/worktree.c
@@ -15,6 +15,7 @@
  */
 
 #include <sys/stat.h>
+#include <sys/mman.h>
 #include <sys/queue.h>
 #include <sys/tree.h>
 
@@ -2508,6 +2509,8 @@ open_fileindex(struct got_fileindex **fileindex, char 
 {
 	const struct got_error *err = NULL;
 	FILE *index = NULL;
+	struct stat st;
+	uint8_t *map = NULL;
 
 	*fileindex_path = NULL;
 	*fileindex = got_fileindex_alloc();
@@ -2522,12 +2525,30 @@ open_fileindex(struct got_fileindex **fileindex, char 
 	if (index == NULL) {
 		if (errno != ENOENT)
 			err = got_error_from_errno2("fopen", *fileindex_path);
-	} else {
-		err = got_fileindex_read(*fileindex, index);
-		if (fclose(index) == EOF && err == NULL)
-			err = got_error_from_errno("fclose");
+		goto done;
 	}
+
+	if (fstat(fileno(index), &st) == -1) {
+		err = got_error_from_errno2("fstat", *fileindex_path);
+		goto done;
+	}
+
+	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE,
+	    fileno(index), 0);
+	if (map == MAP_FAILED) {
+		/* fallback to file io */;
+		map = NULL;
+	}
+
+	err = got_fileindex_read(*fileindex, index, map, st.st_size);
+
 done:
+	if (index != NULL && fclose(index) == EOF && err == NULL)
+		err = got_error_from_errno("fclose");
+	if (map != NULL) {
+		if (munmap(map, st.st_size) == -1 && err == NULL)
+			err = got_error_from_errno("munmap");
+	}
 	if (err) {
 		free(*fileindex_path);
 		*fileindex_path = NULL;
blob - f488cd9dcd98b68c647973cb84a54e18daefced4
blob + 0f85630c6b83c2a66ca8e5f67fa7cba5a1e8e453
--- lib/worktree_cvg.c
+++ lib/worktree_cvg.c
@@ -15,6 +15,7 @@
  */
 
 #include <sys/stat.h>
+#include <sys/mman.h>
 #include <sys/queue.h>
 #include <sys/tree.h>
 
@@ -567,6 +568,8 @@ open_fileindex(struct got_fileindex **fileindex, char 
 {
 	const struct got_error *err = NULL;
 	FILE *index = NULL;
+	struct stat st;
+	uint8_t *map = NULL;
 
 	*fileindex_path = NULL;
 	*fileindex = got_fileindex_alloc();
@@ -581,12 +584,30 @@ open_fileindex(struct got_fileindex **fileindex, char 
 	if (index == NULL) {
 		if (errno != ENOENT)
 			err = got_error_from_errno2("fopen", *fileindex_path);
-	} else {
-		err = got_fileindex_read(*fileindex, index);
-		if (fclose(index) == EOF && err == NULL)
-			err = got_error_from_errno("fclose");
+		goto done;
 	}
+
+	if (fstat(fileno(index), &st) == -1) {
+		err = got_error_from_errno2("fstat", *fileindex_path);
+		goto done;
+	}
+
+	map = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE,
+	    fileno(index), 0);
+	if (map == MAP_FAILED) {
+		/* fallback to file io */;
+		map = NULL;
+	}
+
+	err = got_fileindex_read(*fileindex, index, map, st.st_size);
+
 done:
+	if (index != NULL && fclose(index) == EOF && err == NULL)
+		err = got_error_from_errno("fclose");
+	if (map != NULL) {
+		if (munmap(map, st.st_size) == -1 && err == NULL)
+			err = got_error_from_errno("munmap");
+	}
 	if (err) {
 		free(*fileindex_path);
 		*fileindex_path = NULL;


-- 
Mark Jamsek <https://bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80  E6E2 2930 DC66 86EE CF68