From: Omar Polo <op@omarpolo.com>
Subject: Re: attempt at speeding up the deltification
To: Omar Polo <op@omarpolo.com>
Cc: Stefan Sperling <stsp@stsp.name>, gameoftrees@openbsd.org
Date: Mon, 26 Feb 2024 11:51:26 +0100

On 2024/02/25 12:02:33 +0100, Omar Polo <op@omarpolo.com> wrote:
> Stefan Sperling <stsp@stsp.name> wrote:
> > On Sun, Feb 25, 2024 at 02:30:28AM +0100, Omar Polo wrote:
> > > To get an idea of the improvements, I've extracted the code for the
> > > deltification as a standalone executable.  For example, to
> > > got_deltify_init() the alpine iso (207M)
> > > 
> > > suzaku% m && time ./obj/d /home/op/Downloads/alpine-standard-3.19.1-x86_64.iso
> > > array: 2250886/2251136 99% -- ht: 761991/2097152 36%
> > > total memory: 62415872
> > >     0m01.81s real     0m01.20s user     0m00.62s system
> > > 
> > > or for a 1.0G mp4 video:
> > > 
> > > suzaku% m && time ./obj/d /home/op/Downloads/[...].mp4
> > > array: 9967559/9967744 99% -- ht: 4011730/8388608 47%
> > > total memory: 272780288
> > >     0m09.67s real     0m06.12s user     0m03.49s system
> > > 
> > > The "total memory" is computed by considering the total memory needed by
> > > the array and by the table (so around 62M and 272M respectively.)  IMHO
> > > the time are more than decent now.  I have no idea how much it'll take
> > > with the current implementation, but given the trend it exibits (~3
> > > minutes for 100M of /dev/random), "too much" is a good estimate ;-)
> > 
> > Could you still run a test on the same set of files using the
> > old code please, for comparison?
> > 
> > And show how well the new code performs on 100MB of /dev/random?
> > 
> > If this change brings us from an order of minutes down to an order
> > of less than 10 seconds, that's very impressive.
> > 
> > However, comparing deltification of /dev/random to deltification
> > of structured files could be an apple vs. oranges comparison.
> > There is little chance of finding common blocks within /dev/random.
> 
> Yeah, I did my testing all with the same set of files from /dev/random,
> but when composing the email I thought of using some more realistic
> files I had around.
> 
> I've published the code I used for these experiments at:
> <https://codeberg.org/op/deltify>.
> 
> By the way, gathering the numbers for before/after I noticed a bug in
> the rehashing of the hash table that was causing my diff to use more
> memory than needed.  I've fixed the insert in my repo, will cook an
> updated diff for got later.

Here's the updated diff for got.

I did some benchmarking on 'gotadmin pack -a' with hyperfine, on the
repo that once prompted me to bump the resize step from 64 to 256.  In
practice there I can some small improvements (around 0.5s), which I'm
not sure warrants the added code.

I also did some profiling on a dummy repository with some files off
/dev/random in there -- 1m, 10m, 20m and 50m -- and the performance gain
is much noticeable, in the x10 range, but I'm not sure it's a good idea
to optimize for an edge cases.

diffstat /home/op/w/got
 M  lib/deltify.c          |  150+  133-
 M  lib/got_lib_deltify.h  |    8+    0-

2 files changed, 158 insertions(+), 133 deletions(-)

diff /home/op/w/got
commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c
path + /home/op/w/got
blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31
file + lib/deltify.c
--- lib/deltify.c
+++ lib/deltify.c
@@ -87,6 +87,9 @@ static const uint32_t geartab[256] = {
     0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
 };
 
+static const struct got_error *addblk(struct got_delta_table *, FILE *,
+    uint8_t *, off_t, off_t, off_t, uint32_t);
+
 static uint32_t
 hashblk(const unsigned char *p, off_t n, uint32_t seed)
 {
@@ -94,145 +97,150 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed
 }
 
 static const struct got_error *
-addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len,
-    off_t offset, uint32_t h)
+lookup(int *n, struct got_delta_table *dt, FILE *f, off_t file_offset0,
+    off_t len, off_t offset, uint32_t h)
 {
-	const struct got_error *err = NULL;
-	int i;
+	struct got_delta_block *block;
 	uint8_t buf[GOT_DELTIFY_MAXCHUNK];
 	uint8_t buf2[GOT_DELTIFY_MAXCHUNK];
 	size_t r = 0;
+	int i;
 
-	if (len == 0)
-		return NULL;
+	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+		block = &dt->blocks[dt->offs[i] - 1];
+		if (block->len != len || block->hash != h)
+			continue;
 
-	i = h % dt->nalloc;
-	while (dt->blocks[i].len != 0) {
-		/*
-		 * Avoid adding duplicate blocks.
-		 * NB: A matching hash is insufficient for detecting equality.
-		 * The hash can only detect inequality.
-		 */
-		if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
-			if (r == 0) {
-				if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
-					return got_error_from_errno("fseeko");
-				r = fread(buf, 1, len, f);
-				if (r != len) {
-					if (!ferror(f))
-						return NULL;
-					return got_ferror(f, GOT_ERR_IO);
-				}
-			}
-			if (fseeko(f, file_offset0 + dt->blocks[i].offset,
-			    SEEK_SET) == -1)
+		if (r == 0) {
+			if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
 				return got_error_from_errno("fseeko");
-			if (fread(buf2, 1, len, f) != len)
+			r = fread(buf, 1, len, f);
+			if (r != len) {
+				if (!ferror(f))
+					break;		/* why? */
 				return got_ferror(f, GOT_ERR_IO);
-			if (memcmp(buf, buf2, len) == 0)
-				return NULL;
+			}
 		}
 
-		i = (i + 1) % dt->nalloc;
+		if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1)
+			return got_error_from_errno("fseeko");
+		if (fread(buf2, 1, len, f) != len)
+			return got_ferror(f, GOT_ERR_IO);
+		if (memcmp(buf, buf2, len) == 0)
+			break;
 	}
-	assert(dt->blocks[i].len == 0);
-	dt->blocks[i].len = len;
-	dt->blocks[i].offset = offset;
-	dt->blocks[i].hash = h;
-	dt->nblocks++;
-	if (dt->nalloc < dt->nblocks + 256) {
-		struct got_delta_block *db;
-		size_t old_size = dt->nalloc;
-		db = dt->blocks;
-		dt->blocks = calloc(dt->nalloc + 256,
-		    sizeof(struct got_delta_block));
-		if (dt->blocks == NULL) {
-			err = got_error_from_errno("calloc");
-			dt->blocks = db;
-			return err;
-		}
-		dt->nalloc += 256;
-		/*
-		 * Recompute all block positions. Hash-based indices of blocks
-		 * in the array depend on the allocated length of the array.
-		 */
-		dt->nblocks = 0;
-		for (i = 0; i < old_size; i++) {
-			if (db[i].len == 0)
-				continue;
-			err = addblk(dt, f, file_offset0, db[i].len,
-			    db[i].offset, db[i].hash);
-			if (err)
+
+	*n = i;
+	return NULL;
+}
+
+static const struct got_error *
+lookup_mem(int *n, struct got_delta_table *dt, uint8_t *basedata,
+    off_t basefile_offset0, uint8_t *p, off_t len, uint32_t h)
+{
+	struct got_delta_block *block;
+	uint8_t *d;
+	int i;
+
+	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+		block = &dt->blocks[dt->offs[i] - 1];
+		if (len == block->len && h == block->hash) {
+			d = basedata + basefile_offset0 + block->offset;
+			if (memcmp(d, p, len) == 0)
 				break;
 		}
-		free(db);
 	}
 
-	return err;
+	*n = i;
+	return NULL;
 }
 
 static const struct got_error *
-addblk_mem(struct got_delta_table *dt, uint8_t *data, off_t file_offset0,
+resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data,
+    off_t len)
+{
+	const struct got_error *err;
+	struct got_delta_block *b;
+	size_t newsize, oldsize;
+	int i, n;
+
+	if (dt->nblocks == dt->nalloc) {
+		newsize = dt->nalloc + 256;
+		b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks));
+		if (b == NULL)
+			return got_error_from_errno("reallocarray");
+		dt->blocks = b;
+		dt->nalloc = newsize;
+	}
+
+	if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) {
+		oldsize = dt->size;
+		dt->size *= 2;
+		free(dt->offs);
+		dt->offs = calloc(dt->size, sizeof(*dt->offs));
+		if (dt->offs == NULL) {
+			dt->size = oldsize;
+			return got_error_from_errno("calloc");
+		}
+
+		for (i = 0; i < dt->nblocks; ++i) {
+			b = &dt->blocks[i];
+			if (f)
+				err = lookup(&n, dt, f, offset0, b->len,
+				    b->offset, b->hash);
+			else
+				err = lookup_mem(&n, dt, data, offset0,
+				    data + b->offset, b->len, b->hash);
+			if (err) {
+				free(dt->offs);
+				dt->offs = NULL;
+				dt->size = oldsize;
+				return err;
+			}
+			assert(dt->offs[n] == 0);
+			dt->offs[n] = i + 1;
+		}
+	}
+
+	return NULL;
+}
+
+static const struct got_error *
+addblk(struct got_delta_table *dt, FILE *f, uint8_t *data, off_t file_offset0,
     off_t len, off_t offset, uint32_t h)
 {
-	const struct got_error *err = NULL;
+	const struct got_error *err;
+	struct got_delta_block *block;
 	int i;
-	uint8_t *block1;
-	uint8_t *block2;
 
 	if (len == 0)
 		return NULL;
 
-	i = h % dt->nalloc;
-	while (dt->blocks[i].len != 0) {
-		/*
-		 * Avoid adding duplicate blocks.
-		 * NB: A matching hash is insufficient for detecting equality.
-		 * The hash can only detect inequality.
-		 */
-		if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
-			block1 = data + file_offset0 + dt->blocks[i].offset;
-			block2 = data + file_offset0 + offset;
-			if (memcmp(block1, block2, len) == 0)
-				return NULL;
-		}
+	err = resizeht(dt, f, file_offset0, data, len);
+	if (err)
+		return err;
 
-		i = (i + 1) % dt->nalloc;
-	}
-	assert(dt->blocks[i].len == 0);
-	dt->blocks[i].len = len;
-	dt->blocks[i].offset = offset;
-	dt->blocks[i].hash = h;
+	if (f)
+		err = lookup(&i, dt, f, file_offset0, len, offset, h);
+	else
+		err = lookup_mem(&i, dt, data, file_offset0, data + offset,
+		    len, h);
+	if (err)
+		return err;
+
+	if (dt->offs[i] != 0)
+		return NULL;	/* found */
+
+	dt->offs[i] = dt->nblocks + 1;
+	block = &dt->blocks[dt->nblocks];
+	block->len = len;
+	block->offset = offset;
+	block->hash = h;
+
 	dt->nblocks++;
-	if (dt->nalloc < dt->nblocks + 256) {
-		struct got_delta_block *db;
-		size_t old_size = dt->nalloc;
-		db = dt->blocks;
-		dt->blocks = calloc(dt->nalloc + 256,
-		    sizeof(struct got_delta_block));
-		if (dt->blocks == NULL) {
-			err = got_error_from_errno("calloc");
-			dt->blocks = db;
-			return err;
-		}
-		dt->nalloc += 256;
-		/*
-		 * Recompute all block positions. Hash-based indices of blocks
-		 * in the array depend on the allocated length of the array.
-		 */
-		dt->nblocks = 0;
-		for (i = 0; i < old_size; i++) {
-			if (db[i].len == 0)
-				continue;
-			err = addblk_mem(dt, data, file_offset0, db[i].len,
-			    db[i].offset, db[i].hash);
-			if (err)
-				break;
-		}
-		free(db);
-	}
+	dt->len++;
 
-	return err;
+	return NULL;
 }
 
 static const struct got_error *
@@ -243,24 +251,25 @@ lookupblk(struct got_delta_block **block, struct got_d
 	int i;
 	uint32_t h;
 	uint8_t buf[GOT_DELTIFY_MAXCHUNK];
+	struct got_delta_block *b;
 	size_t r;
 
 	*block = NULL;
 
 	h = hashblk(p, len, seed);
-	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
-	     i = (i + 1) % dt->nalloc) {
-		if (dt->blocks[i].hash != h ||
-		    dt->blocks[i].len != len)
+	
+	for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+		b = &dt->blocks[dt->offs[i] - 1];
+		if (b->hash != h || b->len != len)
 			continue;
-		if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset,
+		if (fseeko(basefile, basefile_offset0 + b->offset,
 		    SEEK_SET) == -1)
 			return got_error_from_errno("fseeko");
 		r = fread(buf, 1, len, basefile);
 		if (r != len)
 			return got_ferror(basefile, GOT_ERR_IO);
 		if (memcmp(p, buf, len) == 0) {
-			*block = &dt->blocks[i];
+			*block = b;
 			break;
 		}
 	}
@@ -272,24 +281,17 @@ lookupblk_mem(struct got_delta_block **block, struct g
     unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata,
     off_t basefile_offset0)
 {
-	int i;
+	const struct got_error *err;
+	int n;
 	uint32_t h;
-	uint8_t *b;
 
 	*block = NULL;
-
 	h = hashblk(p, len, seed);
-	for (i = h % dt->nalloc; dt->blocks[i].len != 0;
-	     i = (i + 1) % dt->nalloc) {
-		if (dt->blocks[i].hash != h ||
-		    dt->blocks[i].len != len)
-			continue;
-		b = basedata + basefile_offset0 + dt->blocks[i].offset;
-		if (memcmp(p, b, len) == 0) {
-			*block = &dt->blocks[i];
-			break;
-		}
-	}
+	err = lookup_mem(&n, dt, basedata, basefile_offset0, p, len, h);
+	if (err)
+		return err;
+	if (dt->offs[n] != 0)
+		*block = &dt->blocks[dt->offs[n] - 1];
 	return NULL;
 }
 
@@ -370,6 +372,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
 		goto done;
 	}
 
+	(*dt)->size = 128;
+	(*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
+	if ((*dt)->offs == NULL) {
+		err = got_error_from_errno("calloc");
+		goto done;
+	}
+
 	if (fseeko(f, fileoffset, SEEK_SET) == -1)
 		return got_error_from_errno("fseeko");
 
@@ -382,7 +391,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
 		if (blocklen == 0)
 			break;
 		h = hashblk(buf, blocklen, seed);
-		err = addblk(*dt, f, offset0, blocklen,
+		err = addblk(*dt, f, NULL, offset0, blocklen,
 		    fileoffset - offset0, h);
 		if (err)
 			goto done;
@@ -420,6 +429,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
 		goto done;
 	}
 
+	(*dt)->size = 128;
+	(*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
+	if ((*dt)->offs == NULL) {
+		err = got_error_from_errno("calloc");
+		goto done;
+	}
+
 	while (fileoffset < filesize) {
 		off_t blocklen;
 		err = nextblk_mem(&blocklen, data, fileoffset, filesize);
@@ -428,7 +444,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
 		if (blocklen == 0)
 			break;
 		h = hashblk(data + fileoffset, blocklen, seed);
-		err = addblk_mem(*dt, data, offset0, blocklen,
+		err = addblk(*dt, NULL, data, offset0, blocklen,
 		    fileoffset - offset0, h);
 		if (err)
 			goto done;
@@ -450,6 +466,7 @@ got_deltify_free(struct got_delta_table *dt)
 	if (dt == NULL)
 		return;
 	free(dt->blocks);
+	free(dt->offs);
 	free(dt);
 }
 
blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795
file + lib/got_lib_deltify.h
--- lib/got_lib_deltify.h
+++ lib/got_lib_deltify.h
@@ -25,6 +25,14 @@ struct got_delta_table {
 	struct got_delta_block	*blocks;
 	int			nblocks;
 	int			nalloc;
+
+	/*
+	 * Index for blocks.  offs[n] is zero when the slot is free,
+	 * otherwise it points to blocks[offs[n] - 1].
+	 */
+	uint32_t		*offs;
+	int			 len;
+	int			 size;
 };
 
 struct got_delta_instruction {