Download raw body.
attempt at speeding up the deltification
On 2024/02/25 12:02:33 +0100, Omar Polo <op@omarpolo.com> wrote:
> Stefan Sperling <stsp@stsp.name> wrote:
> > On Sun, Feb 25, 2024 at 02:30:28AM +0100, Omar Polo wrote:
> > > To get an idea of the improvements, I've extracted the code for the
> > > deltification as a standalone executable. For example, to
> > > got_deltify_init() the alpine iso (207M)
> > >
> > > suzaku% m && time ./obj/d /home/op/Downloads/alpine-standard-3.19.1-x86_64.iso
> > > array: 2250886/2251136 99% -- ht: 761991/2097152 36%
> > > total memory: 62415872
> > > 0m01.81s real 0m01.20s user 0m00.62s system
> > >
> > > or for a 1.0G mp4 video:
> > >
> > > suzaku% m && time ./obj/d /home/op/Downloads/[...].mp4
> > > array: 9967559/9967744 99% -- ht: 4011730/8388608 47%
> > > total memory: 272780288
> > > 0m09.67s real 0m06.12s user 0m03.49s system
> > >
> > > The "total memory" is computed by considering the total memory needed by
> > > the array and by the table (so around 62M and 272M respectively.) IMHO
> > > the time are more than decent now. I have no idea how much it'll take
> > > with the current implementation, but given the trend it exibits (~3
> > > minutes for 100M of /dev/random), "too much" is a good estimate ;-)
> >
> > Could you still run a test on the same set of files using the
> > old code please, for comparison?
> >
> > And show how well the new code performs on 100MB of /dev/random?
> >
> > If this change brings us from an order of minutes down to an order
> > of less than 10 seconds, that's very impressive.
> >
> > However, comparing deltification of /dev/random to deltification
> > of structured files could be an apple vs. oranges comparison.
> > There is little chance of finding common blocks within /dev/random.
>
> Yeah, I did my testing all with the same set of files from /dev/random,
> but when composing the email I thought of using some more realistic
> files I had around.
>
> I've published the code I used for these experiments at:
> <https://codeberg.org/op/deltify>.
>
> By the way, gathering the numbers for before/after I noticed a bug in
> the rehashing of the hash table that was causing my diff to use more
> memory than needed. I've fixed the insert in my repo, will cook an
> updated diff for got later.
Here's the updated diff for got.
I did some benchmarking on 'gotadmin pack -a' with hyperfine, on the
repo that once prompted me to bump the resize step from 64 to 256. In
practice there I can some small improvements (around 0.5s), which I'm
not sure warrants the added code.
I also did some profiling on a dummy repository with some files off
/dev/random in there -- 1m, 10m, 20m and 50m -- and the performance gain
is much noticeable, in the x10 range, but I'm not sure it's a good idea
to optimize for an edge cases.
diffstat /home/op/w/got
M lib/deltify.c | 150+ 133-
M lib/got_lib_deltify.h | 8+ 0-
2 files changed, 158 insertions(+), 133 deletions(-)
diff /home/op/w/got
commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c
path + /home/op/w/got
blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31
file + lib/deltify.c
--- lib/deltify.c
+++ lib/deltify.c
@@ -87,6 +87,9 @@ static const uint32_t geartab[256] = {
0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
};
+static const struct got_error *addblk(struct got_delta_table *, FILE *,
+ uint8_t *, off_t, off_t, off_t, uint32_t);
+
static uint32_t
hashblk(const unsigned char *p, off_t n, uint32_t seed)
{
@@ -94,145 +97,150 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed
}
static const struct got_error *
-addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len,
- off_t offset, uint32_t h)
+lookup(int *n, struct got_delta_table *dt, FILE *f, off_t file_offset0,
+ off_t len, off_t offset, uint32_t h)
{
- const struct got_error *err = NULL;
- int i;
+ struct got_delta_block *block;
uint8_t buf[GOT_DELTIFY_MAXCHUNK];
uint8_t buf2[GOT_DELTIFY_MAXCHUNK];
size_t r = 0;
+ int i;
- if (len == 0)
- return NULL;
+ for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+ block = &dt->blocks[dt->offs[i] - 1];
+ if (block->len != len || block->hash != h)
+ continue;
- i = h % dt->nalloc;
- while (dt->blocks[i].len != 0) {
- /*
- * Avoid adding duplicate blocks.
- * NB: A matching hash is insufficient for detecting equality.
- * The hash can only detect inequality.
- */
- if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
- if (r == 0) {
- if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
- return got_error_from_errno("fseeko");
- r = fread(buf, 1, len, f);
- if (r != len) {
- if (!ferror(f))
- return NULL;
- return got_ferror(f, GOT_ERR_IO);
- }
- }
- if (fseeko(f, file_offset0 + dt->blocks[i].offset,
- SEEK_SET) == -1)
+ if (r == 0) {
+ if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
return got_error_from_errno("fseeko");
- if (fread(buf2, 1, len, f) != len)
+ r = fread(buf, 1, len, f);
+ if (r != len) {
+ if (!ferror(f))
+ break; /* why? */
return got_ferror(f, GOT_ERR_IO);
- if (memcmp(buf, buf2, len) == 0)
- return NULL;
+ }
}
- i = (i + 1) % dt->nalloc;
+ if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1)
+ return got_error_from_errno("fseeko");
+ if (fread(buf2, 1, len, f) != len)
+ return got_ferror(f, GOT_ERR_IO);
+ if (memcmp(buf, buf2, len) == 0)
+ break;
}
- assert(dt->blocks[i].len == 0);
- dt->blocks[i].len = len;
- dt->blocks[i].offset = offset;
- dt->blocks[i].hash = h;
- dt->nblocks++;
- if (dt->nalloc < dt->nblocks + 256) {
- struct got_delta_block *db;
- size_t old_size = dt->nalloc;
- db = dt->blocks;
- dt->blocks = calloc(dt->nalloc + 256,
- sizeof(struct got_delta_block));
- if (dt->blocks == NULL) {
- err = got_error_from_errno("calloc");
- dt->blocks = db;
- return err;
- }
- dt->nalloc += 256;
- /*
- * Recompute all block positions. Hash-based indices of blocks
- * in the array depend on the allocated length of the array.
- */
- dt->nblocks = 0;
- for (i = 0; i < old_size; i++) {
- if (db[i].len == 0)
- continue;
- err = addblk(dt, f, file_offset0, db[i].len,
- db[i].offset, db[i].hash);
- if (err)
+
+ *n = i;
+ return NULL;
+}
+
+static const struct got_error *
+lookup_mem(int *n, struct got_delta_table *dt, uint8_t *basedata,
+ off_t basefile_offset0, uint8_t *p, off_t len, uint32_t h)
+{
+ struct got_delta_block *block;
+ uint8_t *d;
+ int i;
+
+ for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+ block = &dt->blocks[dt->offs[i] - 1];
+ if (len == block->len && h == block->hash) {
+ d = basedata + basefile_offset0 + block->offset;
+ if (memcmp(d, p, len) == 0)
break;
}
- free(db);
}
- return err;
+ *n = i;
+ return NULL;
}
static const struct got_error *
-addblk_mem(struct got_delta_table *dt, uint8_t *data, off_t file_offset0,
+resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data,
+ off_t len)
+{
+ const struct got_error *err;
+ struct got_delta_block *b;
+ size_t newsize, oldsize;
+ int i, n;
+
+ if (dt->nblocks == dt->nalloc) {
+ newsize = dt->nalloc + 256;
+ b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks));
+ if (b == NULL)
+ return got_error_from_errno("reallocarray");
+ dt->blocks = b;
+ dt->nalloc = newsize;
+ }
+
+ if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) {
+ oldsize = dt->size;
+ dt->size *= 2;
+ free(dt->offs);
+ dt->offs = calloc(dt->size, sizeof(*dt->offs));
+ if (dt->offs == NULL) {
+ dt->size = oldsize;
+ return got_error_from_errno("calloc");
+ }
+
+ for (i = 0; i < dt->nblocks; ++i) {
+ b = &dt->blocks[i];
+ if (f)
+ err = lookup(&n, dt, f, offset0, b->len,
+ b->offset, b->hash);
+ else
+ err = lookup_mem(&n, dt, data, offset0,
+ data + b->offset, b->len, b->hash);
+ if (err) {
+ free(dt->offs);
+ dt->offs = NULL;
+ dt->size = oldsize;
+ return err;
+ }
+ assert(dt->offs[n] == 0);
+ dt->offs[n] = i + 1;
+ }
+ }
+
+ return NULL;
+}
+
+static const struct got_error *
+addblk(struct got_delta_table *dt, FILE *f, uint8_t *data, off_t file_offset0,
off_t len, off_t offset, uint32_t h)
{
- const struct got_error *err = NULL;
+ const struct got_error *err;
+ struct got_delta_block *block;
int i;
- uint8_t *block1;
- uint8_t *block2;
if (len == 0)
return NULL;
- i = h % dt->nalloc;
- while (dt->blocks[i].len != 0) {
- /*
- * Avoid adding duplicate blocks.
- * NB: A matching hash is insufficient for detecting equality.
- * The hash can only detect inequality.
- */
- if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
- block1 = data + file_offset0 + dt->blocks[i].offset;
- block2 = data + file_offset0 + offset;
- if (memcmp(block1, block2, len) == 0)
- return NULL;
- }
+ err = resizeht(dt, f, file_offset0, data, len);
+ if (err)
+ return err;
- i = (i + 1) % dt->nalloc;
- }
- assert(dt->blocks[i].len == 0);
- dt->blocks[i].len = len;
- dt->blocks[i].offset = offset;
- dt->blocks[i].hash = h;
+ if (f)
+ err = lookup(&i, dt, f, file_offset0, len, offset, h);
+ else
+ err = lookup_mem(&i, dt, data, file_offset0, data + offset,
+ len, h);
+ if (err)
+ return err;
+
+ if (dt->offs[i] != 0)
+ return NULL; /* found */
+
+ dt->offs[i] = dt->nblocks + 1;
+ block = &dt->blocks[dt->nblocks];
+ block->len = len;
+ block->offset = offset;
+ block->hash = h;
+
dt->nblocks++;
- if (dt->nalloc < dt->nblocks + 256) {
- struct got_delta_block *db;
- size_t old_size = dt->nalloc;
- db = dt->blocks;
- dt->blocks = calloc(dt->nalloc + 256,
- sizeof(struct got_delta_block));
- if (dt->blocks == NULL) {
- err = got_error_from_errno("calloc");
- dt->blocks = db;
- return err;
- }
- dt->nalloc += 256;
- /*
- * Recompute all block positions. Hash-based indices of blocks
- * in the array depend on the allocated length of the array.
- */
- dt->nblocks = 0;
- for (i = 0; i < old_size; i++) {
- if (db[i].len == 0)
- continue;
- err = addblk_mem(dt, data, file_offset0, db[i].len,
- db[i].offset, db[i].hash);
- if (err)
- break;
- }
- free(db);
- }
+ dt->len++;
- return err;
+ return NULL;
}
static const struct got_error *
@@ -243,24 +251,25 @@ lookupblk(struct got_delta_block **block, struct got_d
int i;
uint32_t h;
uint8_t buf[GOT_DELTIFY_MAXCHUNK];
+ struct got_delta_block *b;
size_t r;
*block = NULL;
h = hashblk(p, len, seed);
- for (i = h % dt->nalloc; dt->blocks[i].len != 0;
- i = (i + 1) % dt->nalloc) {
- if (dt->blocks[i].hash != h ||
- dt->blocks[i].len != len)
+
+ for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
+ b = &dt->blocks[dt->offs[i] - 1];
+ if (b->hash != h || b->len != len)
continue;
- if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset,
+ if (fseeko(basefile, basefile_offset0 + b->offset,
SEEK_SET) == -1)
return got_error_from_errno("fseeko");
r = fread(buf, 1, len, basefile);
if (r != len)
return got_ferror(basefile, GOT_ERR_IO);
if (memcmp(p, buf, len) == 0) {
- *block = &dt->blocks[i];
+ *block = b;
break;
}
}
@@ -272,24 +281,17 @@ lookupblk_mem(struct got_delta_block **block, struct g
unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata,
off_t basefile_offset0)
{
- int i;
+ const struct got_error *err;
+ int n;
uint32_t h;
- uint8_t *b;
*block = NULL;
-
h = hashblk(p, len, seed);
- for (i = h % dt->nalloc; dt->blocks[i].len != 0;
- i = (i + 1) % dt->nalloc) {
- if (dt->blocks[i].hash != h ||
- dt->blocks[i].len != len)
- continue;
- b = basedata + basefile_offset0 + dt->blocks[i].offset;
- if (memcmp(p, b, len) == 0) {
- *block = &dt->blocks[i];
- break;
- }
- }
+ err = lookup_mem(&n, dt, basedata, basefile_offset0, p, len, h);
+ if (err)
+ return err;
+ if (dt->offs[n] != 0)
+ *block = &dt->blocks[dt->offs[n] - 1];
return NULL;
}
@@ -370,6 +372,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
goto done;
}
+ (*dt)->size = 128;
+ (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
+ if ((*dt)->offs == NULL) {
+ err = got_error_from_errno("calloc");
+ goto done;
+ }
+
if (fseeko(f, fileoffset, SEEK_SET) == -1)
return got_error_from_errno("fseeko");
@@ -382,7 +391,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
if (blocklen == 0)
break;
h = hashblk(buf, blocklen, seed);
- err = addblk(*dt, f, offset0, blocklen,
+ err = addblk(*dt, f, NULL, offset0, blocklen,
fileoffset - offset0, h);
if (err)
goto done;
@@ -420,6 +429,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
goto done;
}
+ (*dt)->size = 128;
+ (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
+ if ((*dt)->offs == NULL) {
+ err = got_error_from_errno("calloc");
+ goto done;
+ }
+
while (fileoffset < filesize) {
off_t blocklen;
err = nextblk_mem(&blocklen, data, fileoffset, filesize);
@@ -428,7 +444,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
if (blocklen == 0)
break;
h = hashblk(data + fileoffset, blocklen, seed);
- err = addblk_mem(*dt, data, offset0, blocklen,
+ err = addblk(*dt, NULL, data, offset0, blocklen,
fileoffset - offset0, h);
if (err)
goto done;
@@ -450,6 +466,7 @@ got_deltify_free(struct got_delta_table *dt)
if (dt == NULL)
return;
free(dt->blocks);
+ free(dt->offs);
free(dt);
}
blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795
file + lib/got_lib_deltify.h
--- lib/got_lib_deltify.h
+++ lib/got_lib_deltify.h
@@ -25,6 +25,14 @@ struct got_delta_table {
struct got_delta_block *blocks;
int nblocks;
int nalloc;
+
+ /*
+ * Index for blocks. offs[n] is zero when the slot is free,
+ * otherwise it points to blocks[offs[n] - 1].
+ */
+ uint32_t *offs;
+ int len;
+ int size;
};
struct got_delta_instruction {
attempt at speeding up the deltification