From: "Omar Polo" <op@omarpolo.com>
Subject: attempt at speeding up the deltification
To: gameoftrees@openbsd.org
Date: Sun, 25 Feb 2024 02:30:28 +0100

The deltification is quite slow not only because it is objectively a
heavy task, but also because we try to use as little memory as possible.
got_deltify() is used in pack_create() which in turns is used in gotd
too, so a moderate amount of memory used is a good thing.

The downside is that we misuse the hash table and spend a considerable
amount of time during the insert and resize the table.  When the table
is filled we enter in a loop where we increase it by 256 slots, rehash
everything, have a lot of collisions since the table is 90%+ full, rinse
and repeat.

Here's another idea: instead of using the table as container for the
data, use an hash table of indexes of an array.  We can save quite some
memory this way, since int32 indexes gives us plenty of space and
reduces the size needed by the hash table by 20 bytes per slot (the
struct got_delta_block is 24 bytes on amd64.)  The blocks themselves can
be allocated on a vector which doesn't degrade the performances when
filled.

With a smaller hash table, we can pay the memory price for a better
resize scheme, which in turns gives us better performances.

Another advantage of using the hash table as an index, is that the
resize is quick: we can just delete the table and re-insert the all the
elements.

I'm trading a considerable amount of time with some more memory usage.
For example, for a 512k of /dev/random, we would now use 136k insted of
58k of memory.  For 100m of /dev/random it is 33M and 9M respectively.
I'm also using a generous scheme where each resize doubles the size of
the table though.

To get an idea of the improvements, I've extracted the code for the
deltification as a standalone executable.  For example, to
got_deltify_init() the alpine iso (207M)

suzaku% m && time ./obj/d /home/op/Downloads/alpine-standard-3.19.1-x86_64.iso
array: 2250886/2251136 99% -- ht: 761991/2097152 36%
total memory: 62415872
    0m01.81s real     0m01.20s user     0m00.62s system

or for a 1.0G mp4 video:

suzaku% m && time ./obj/d /home/op/Downloads/[...].mp4
array: 9967559/9967744 99% -- ht: 4011730/8388608 47%
total memory: 272780288
    0m09.67s real     0m06.12s user     0m03.49s system

The "total memory" is computed by considering the total memory needed by
the array and by the table (so around 62M and 272M respectively.)  IMHO
the time are more than decent now.  I have no idea how much it'll take
with the current implementation, but given the trend it exibits (~3
minutes for 100M of /dev/random), "too much" is a good estimate ;-)

(these times are all with got_deltify_init(), not _mem() which could be
slightly more faster.)

Not sure if diff below is acceptable, and even if it is, the constants
probably have to be tweaked.  I'm open to suggestions.


Thanks,

Omar Polo


diff /home/op/w/got
commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c
path + /home/op/w/got
blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31
file + lib/deltify.c
--- lib/deltify.c
+++ lib/deltify.c
@@ -87,6 +87,11 @@ static const uint32_t geartab[256] = {
     0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
 };
 
+static const struct got_error *addblk(struct got_delta_table *, FILE *, off_t,
+    off_t, off_t, uint32_t);
+static const struct got_error *addblk_mem(struct got_delta_table *,
+    uint8_t *, off_t, off_t, off_t, uint32_t);
+
 static uint32_t
 hashblk(const unsigned char *p, off_t n, uint32_t seed)
 {
@@ -94,10 +99,52 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed
 }
 
 static const struct got_error *
+resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data,
+    off_t len)
+{
+	struct got_delta_block *b;
+	size_t newsize;
+	int i;
+
+	if (dt->nblocks == dt->nalloc) {
+		newsize = dt->nalloc + 256;
+		b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks));
+		if (b == NULL)
+			return got_error_from_errno("reallocarray");
+		dt->blocks = b;
+		dt->nalloc = newsize;
+	}
+
+	if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) {
+		newsize = dt->size * 2;
+		free(dt->offs);
+		dt->offs = calloc(newsize, sizeof(*dt->offs));
+		if (dt->offs == NULL)
+			return got_error_from_errno("calloc");
+		dt->size = newsize;
+		dt->len = 0;
+
+		/* rehash the table -- cannot fail */
+		for (i = 0; i < dt->nblocks; ++i) {
+			b = &dt->blocks[i];
+			if (f)
+				addblk(dt, f, offset0, b->len, b->offset,
+				    b->hash);
+			else
+				addblk_mem(dt, data, offset0, b->len,
+				    b->offset, b->hash);
+		}
+	}
+
+	return NULL;
+}
+
+static const struct got_error *
 addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len,
     off_t offset, uint32_t h)
 {
-	const struct got_error *err = NULL;
+	const struct got_error *err;
+	struct got_delta_block *block;
 	int i;
 	uint8_t buf[GOT_DELTIFY_MAXCHUNK];
 	uint8_t buf2[GOT_DELTIFY_MAXCHUNK];
@@ -106,69 +153,57 @@ addblk(struct got_delta_table *dt, FILE *f, off_t file
 	if (len == 0)
 		return NULL;
 
-	i = h % dt->nalloc;
-	while (dt->blocks[i].len != 0) {
+	err = resizeht(dt, f, file_offset0, NULL, 0);
+	if (err)
+		return err;
+
+	for (i = h % dt->size; /* forever */; i = (i + 1) % dt->size) {
 		/*
+		 * Zero signals that this slot is empty; the real
+		 * offset is one less the value.
+		 */
+		if (dt->offs[i] == 0)
+			break;
+
+		block = &dt->blocks[dt->offs[i] - 1];
+
+		/*
 		 * Avoid adding duplicate blocks.
 		 * NB: A matching hash is insufficient for detecting equality.
 		 * The hash can only detect inequality.
 		 */
-		if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
-			if (r == 0) {
-				if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
-					return got_error_from_errno("fseeko");
-				r = fread(buf, 1, len, f);
-				if (r != len) {
-					if (!ferror(f))
-						return NULL;
-					return got_ferror(f, GOT_ERR_IO);
-				}
-			}
-			if (fseeko(f, file_offset0 + dt->blocks[i].offset,
-			    SEEK_SET) == -1)
+		if (block->len != len || block->hash != h)
+			continue;
+
+		if (r == 0) {
+			if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
 				return got_error_from_errno("fseeko");
-			if (fread(buf2, 1, len, f) != len)
+			r = fread(buf, 1, len, f);
+			if (r != len) {
+				if (!ferror(f))
+					return NULL;
 				return got_ferror(f, GOT_ERR_IO);
-			if (memcmp(buf, buf2, len) == 0)
-				return NULL;
+			}
 		}
 
-		i = (i + 1) % dt->nalloc;
+		if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1)
+			return got_error_from_errno("fseeko");
+		if (fread(buf2, 1, len, f) != len)
+			return got_ferror(f, GOT_ERR_IO);
+		if (memcmp(buf, buf2, len) == 0)
+			return NULL;
 	}
-	assert(dt->blocks[i].len == 0);
-	dt->blocks[i].len = len;
-	dt->blocks[i].offset = offset;
-	dt->blocks[i].hash = h;
+
+	dt->offs[i] = dt->nblocks + 1;
+	block = &dt->blocks[dt->nblocks];
+	block->len = len;
+	block->offset = offset;
+	block->hash = h;
+
 	dt->nblocks++;
-	if (dt->nalloc < dt->nblocks + 256) {
-		struct got_delta_block *db;
-		size_t old_size = dt->nalloc;
-		db = dt->blocks;
-		dt->blocks = calloc(dt->nalloc + 256,
-		    sizeof(struct got_delta_block));
-		if (dt->blocks == NULL) {
-			err = got_error_from_errno("calloc");
-			dt->blocks = db;
-			return err;
-		}
-		dt->nalloc += 256;
-		/*
-		 * Recompute all block positions. Hash-based indices of blocks
-		 * in the array depend on the allocated length of the array.
-		 */
-		dt->nblocks = 0;
-		for (i = 0; i < old_size; i++) {
-			if (db[i].len == 0)
-				continue;
-			err = addblk(dt, f, file_offset0, db[i].len,
-			    db[i].offset, db[i].hash);
-			if (err)
-				break;
-		}
-		free(db);
-	}
+	dt->len++;
 
-	return err;
+	return NULL;
 }
 
 static const struct got_error *
@@ -177,62 +212,50 @@ addblk_mem(struct got_delta_table *dt, uint8_t *data, 
 {
 	const struct got_error *err = NULL;
 	int i;
+	struct got_delta_block *block;
 	uint8_t *block1;
 	uint8_t *block2;
 
 	if (len == 0)
 		return NULL;
 
-	i = h % dt->nalloc;
-	while (dt->blocks[i].len != 0) {
+	err = resizeht(dt, NULL, file_offset0, data, len);
+	if (err)
+		return err;
+
+	for (i = h % dt->size; /* forever */; i = (i + 1) % dt->size) {
 		/*
+		 * Zero signals that this slot is empty; the real
+		 * offset is one less the value.
+		 */
+		if (dt->offs[i] == 0)
+			break;
+
+		block = &dt->blocks[dt->offs[i] - 1];
+
+		/*
 		 * Avoid adding duplicate blocks.
 		 * NB: A matching hash is insufficient for detecting equality.
 		 * The hash can only detect inequality.
 		 */
-		if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
-			block1 = data + file_offset0 + dt->blocks[i].offset;
+		if (len == block->len && h == block->hash) {
+			block1 = data + file_offset0 + block->offset;
 			block2 = data + file_offset0 + offset;
 			if (memcmp(block1, block2, len) == 0)
 				return NULL;
 		}
-
-		i = (i + 1) % dt->nalloc;
 	}
-	assert(dt->blocks[i].len == 0);
-	dt->blocks[i].len = len;
-	dt->blocks[i].offset = offset;
-	dt->blocks[i].hash = h;
+
+	dt->offs[i] = dt->nblocks + 1;
+	block = &dt->blocks[dt->nblocks];
+	block->len = len;
+	block->offset = offset;
+	block->hash = h;
+
 	dt->nblocks++;
-	if (dt->nalloc < dt->nblocks + 256) {
-		struct got_delta_block *db;
-		size_t old_size = dt->nalloc;
-		db = dt->blocks;
-		dt->blocks = calloc(dt->nalloc + 256,
-		    sizeof(struct got_delta_block));
-		if (dt->blocks == NULL) {
-			err = got_error_from_errno("calloc");
-			dt->blocks = db;
-			return err;
-		}
-		dt->nalloc += 256;
-		/*
-		 * Recompute all block positions. Hash-based indices of blocks
-		 * in the array depend on the allocated length of the array.
-		 */
-		dt->nblocks = 0;
-		for (i = 0; i < old_size; i++) {
-			if (db[i].len == 0)
-				continue;
-			err = addblk_mem(dt, data, file_offset0, db[i].len,
-			    db[i].offset, db[i].hash);
-			if (err)
-				break;
-		}
-		free(db);
-	}
+	dt->len++;
 
-	return err;
+	return NULL;
 }
 
 static const struct got_error *
@@ -243,24 +266,24 @@ lookupblk(struct got_delta_block **block, struct got_d
 	int i;
 	uint32_t h;
 	uint8_t buf[GOT_DELTIFY_MAXCHUNK];
+	struct got_delta_block *b;
 	size_t r;
 
 	*block = NULL;
 
 	h = hashblk(p, len, seed);
-	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
-	     i = (i + 1) % dt->nalloc) {
-		if (dt->blocks[i].hash != h ||
-		    dt->blocks[i].len != len)
+	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+		b = &dt->blocks[dt->offs[i] - 1];
+		if (b->hash != h || b->len != len)
 			continue;
-		if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset,
+		if (fseeko(basefile, basefile_offset0 + b->offset,
 		    SEEK_SET) == -1)
 			return got_error_from_errno("fseeko");
 		r = fread(buf, 1, len, basefile);
 		if (r != len)
 			return got_ferror(basefile, GOT_ERR_IO);
 		if (memcmp(p, buf, len) == 0) {
-			*block = &dt->blocks[i];
+			*block = b;
 			break;
 		}
 	}
@@ -274,19 +297,19 @@ lookupblk_mem(struct got_delta_block **block, struct g
 {
 	int i;
 	uint32_t h;
-	uint8_t *b;
+	uint8_t *d;
+	struct got_delta_block *b;
 
 	*block = NULL;
 
 	h = hashblk(p, len, seed);
-	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
-	     i = (i + 1) % dt->nalloc) {
-		if (dt->blocks[i].hash != h ||
-		    dt->blocks[i].len != len)
+	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+		b = &dt->blocks[dt->offs[i] - 1];
+		if (b->hash != h || b->len != len)
 			continue;
-		b = basedata + basefile_offset0 + dt->blocks[i].offset;
-		if (memcmp(p, b, len) == 0) {
-			*block = &dt->blocks[i];
+		d = basedata + basefile_offset0 + b->offset;
+		if (memcmp(p, d, len) == 0) {
+			*block = b;
 			break;
 		}
 	}
@@ -370,6 +393,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
 		goto done;
 	}
 
+	(*dt)->size = 128;
+	(*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
+	if ((*dt)->offs == NULL) {
+		err = got_error_from_errno("calloc");
+		goto done;
+	}
+
 	if (fseeko(f, fileoffset, SEEK_SET) == -1)
 		return got_error_from_errno("fseeko");
 
@@ -420,6 +450,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
 		goto done;
 	}
 
+	(*dt)->size = 128;
+	(*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
+	if ((*dt)->offs == NULL) {
+		err = got_error_from_errno("calloc");
+		goto done;
+	}
+
 	while (fileoffset < filesize) {
 		off_t blocklen;
 		err = nextblk_mem(&blocklen, data, fileoffset, filesize);
blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795
file + lib/got_lib_deltify.h
--- lib/got_lib_deltify.h
+++ lib/got_lib_deltify.h
@@ -25,6 +25,11 @@ struct got_delta_table {
 	struct got_delta_block	*blocks;
 	int			nblocks;
 	int			nalloc;
+
+	/* index for blocks */
+	uint32_t		*offs;
+	int			 len;
+	int			 size;
 };
 
 struct got_delta_instruction {