attempt at speeding up the deltification

On 2024/02/25 12:02:33 +0100, Omar Polo <op@omarpolo.com> wrote: > Stefan Sperling <stsp@stsp.name> wrote: > > On Sun, Feb 25, 2024 at 02:30:28AM +0100, Omar Polo wrote: > > > To get an idea of the improvements, I've extracted the code for the > > > deltification as a standalone executable. For example, to > > > got_deltify_init() the alpine iso (207M) > > > > > > suzaku% m && time ./obj/d /home/op/Downloads/alpine-standard-3.19.1-x86_64.iso > > > array: 2250886/2251136 99% -- ht: 761991/2097152 36% > > > total memory: 62415872 > > > 0m01.81s real 0m01.20s user 0m00.62s system > > > > > > or for a 1.0G mp4 video: > > > > > > suzaku% m && time ./obj/d /home/op/Downloads/[...].mp4 > > > array: 9967559/9967744 99% -- ht: 4011730/8388608 47% > > > total memory: 272780288 > > > 0m09.67s real 0m06.12s user 0m03.49s system > > > > > > The "total memory" is computed by considering the total memory needed by > > > the array and by the table (so around 62M and 272M respectively.) IMHO > > > the time are more than decent now. I have no idea how much it'll take > > > with the current implementation, but given the trend it exibits (~3 > > > minutes for 100M of /dev/random), "too much" is a good estimate ;-) > > > > Could you still run a test on the same set of files using the > > old code please, for comparison? > > > > And show how well the new code performs on 100MB of /dev/random? > > > > If this change brings us from an order of minutes down to an order > > of less than 10 seconds, that's very impressive. > > > > However, comparing deltification of /dev/random to deltification > > of structured files could be an apple vs. oranges comparison. > > There is little chance of finding common blocks within /dev/random. > > Yeah, I did my testing all with the same set of files from /dev/random, > but when composing the email I thought of using some more realistic > files I had around. > > I've published the code I used for these experiments at: > <https://codeberg.org/op/deltify>. > > By the way, gathering the numbers for before/after I noticed a bug in > the rehashing of the hash table that was causing my diff to use more > memory than needed. I've fixed the insert in my repo, will cook an > updated diff for got later. Here's the updated diff for got. I did some benchmarking on 'gotadmin pack -a' with hyperfine, on the repo that once prompted me to bump the resize step from 64 to 256. In practice there I can some small improvements (around 0.5s), which I'm not sure warrants the added code. I also did some profiling on a dummy repository with some files off /dev/random in there -- 1m, 10m, 20m and 50m -- and the performance gain is much noticeable, in the x10 range, but I'm not sure it's a good idea to optimize for an edge cases. diffstat /home/op/w/got M lib/deltify.c | 150+ 133- M lib/got_lib_deltify.h | 8+ 0- 2 files changed, 158 insertions(+), 133 deletions(-) diff /home/op/w/got commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c path + /home/op/w/got blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31 file + lib/deltify.c --- lib/deltify.c +++ lib/deltify.c @@ -87,6 +87,9 @@ static const uint32_t geartab[256] = { 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba, }; +static const struct got_error *addblk(struct got_delta_table *, FILE *, + uint8_t *, off_t, off_t, off_t, uint32_t); + static uint32_t hashblk(const unsigned char *p, off_t n, uint32_t seed) { @@ -94,145 +97,150 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed } static const struct got_error * -addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len, - off_t offset, uint32_t h) +lookup(int *n, struct got_delta_table *dt, FILE *f, off_t file_offset0, + off_t len, off_t offset, uint32_t h) { - const struct got_error *err = NULL; - int i; + struct got_delta_block *block; uint8_t buf[GOT_DELTIFY_MAXCHUNK]; uint8_t buf2[GOT_DELTIFY_MAXCHUNK]; size_t r = 0; + int i; - if (len == 0) - return NULL; + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + block = &dt->blocks[dt->offs[i] - 1]; + if (block->len != len || block->hash != h) + continue; - i = h % dt->nalloc; - while (dt->blocks[i].len != 0) { - /* - * Avoid adding duplicate blocks. - * NB: A matching hash is insufficient for detecting equality. - * The hash can only detect inequality. - */ - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) { - if (r == 0) { - if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1) - return got_error_from_errno("fseeko"); - r = fread(buf, 1, len, f); - if (r != len) { - if (!ferror(f)) - return NULL; - return got_ferror(f, GOT_ERR_IO); - } - } - if (fseeko(f, file_offset0 + dt->blocks[i].offset, - SEEK_SET) == -1) + if (r == 0) { + if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); - if (fread(buf2, 1, len, f) != len) + r = fread(buf, 1, len, f); + if (r != len) { + if (!ferror(f)) + break; /* why? */ return got_ferror(f, GOT_ERR_IO); - if (memcmp(buf, buf2, len) == 0) - return NULL; + } } - i = (i + 1) % dt->nalloc; + if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1) + return got_error_from_errno("fseeko"); + if (fread(buf2, 1, len, f) != len) + return got_ferror(f, GOT_ERR_IO); + if (memcmp(buf, buf2, len) == 0) + break; } - assert(dt->blocks[i].len == 0); - dt->blocks[i].len = len; - dt->blocks[i].offset = offset; - dt->blocks[i].hash = h; - dt->nblocks++; - if (dt->nalloc < dt->nblocks + 256) { - struct got_delta_block *db; - size_t old_size = dt->nalloc; - db = dt->blocks; - dt->blocks = calloc(dt->nalloc + 256, - sizeof(struct got_delta_block)); - if (dt->blocks == NULL) { - err = got_error_from_errno("calloc"); - dt->blocks = db; - return err; - } - dt->nalloc += 256; - /* - * Recompute all block positions. Hash-based indices of blocks - * in the array depend on the allocated length of the array. - */ - dt->nblocks = 0; - for (i = 0; i < old_size; i++) { - if (db[i].len == 0) - continue; - err = addblk(dt, f, file_offset0, db[i].len, - db[i].offset, db[i].hash); - if (err) + + *n = i; + return NULL; +} + +static const struct got_error * +lookup_mem(int *n, struct got_delta_table *dt, uint8_t *basedata, + off_t basefile_offset0, uint8_t *p, off_t len, uint32_t h) +{ + struct got_delta_block *block; + uint8_t *d; + int i; + + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + block = &dt->blocks[dt->offs[i] - 1]; + if (len == block->len && h == block->hash) { + d = basedata + basefile_offset0 + block->offset; + if (memcmp(d, p, len) == 0) break; } - free(db); } - return err; + *n = i; + return NULL; } static const struct got_error * -addblk_mem(struct got_delta_table *dt, uint8_t *data, off_t file_offset0, +resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data, + off_t len) +{ + const struct got_error *err; + struct got_delta_block *b; + size_t newsize, oldsize; + int i, n; + + if (dt->nblocks == dt->nalloc) { + newsize = dt->nalloc + 256; + b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks)); + if (b == NULL) + return got_error_from_errno("reallocarray"); + dt->blocks = b; + dt->nalloc = newsize; + } + + if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) { + oldsize = dt->size; + dt->size *= 2; + free(dt->offs); + dt->offs = calloc(dt->size, sizeof(*dt->offs)); + if (dt->offs == NULL) { + dt->size = oldsize; + return got_error_from_errno("calloc"); + } + + for (i = 0; i < dt->nblocks; ++i) { + b = &dt->blocks[i]; + if (f) + err = lookup(&n, dt, f, offset0, b->len, + b->offset, b->hash); + else + err = lookup_mem(&n, dt, data, offset0, + data + b->offset, b->len, b->hash); + if (err) { + free(dt->offs); + dt->offs = NULL; + dt->size = oldsize; + return err; + } + assert(dt->offs[n] == 0); + dt->offs[n] = i + 1; + } + } + + return NULL; +} + +static const struct got_error * +addblk(struct got_delta_table *dt, FILE *f, uint8_t *data, off_t file_offset0, off_t len, off_t offset, uint32_t h) { - const struct got_error *err = NULL; + const struct got_error *err; + struct got_delta_block *block; int i; - uint8_t *block1; - uint8_t *block2; if (len == 0) return NULL; - i = h % dt->nalloc; - while (dt->blocks[i].len != 0) { - /* - * Avoid adding duplicate blocks. - * NB: A matching hash is insufficient for detecting equality. - * The hash can only detect inequality. - */ - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) { - block1 = data + file_offset0 + dt->blocks[i].offset; - block2 = data + file_offset0 + offset; - if (memcmp(block1, block2, len) == 0) - return NULL; - } + err = resizeht(dt, f, file_offset0, data, len); + if (err) + return err; - i = (i + 1) % dt->nalloc; - } - assert(dt->blocks[i].len == 0); - dt->blocks[i].len = len; - dt->blocks[i].offset = offset; - dt->blocks[i].hash = h; + if (f) + err = lookup(&i, dt, f, file_offset0, len, offset, h); + else + err = lookup_mem(&i, dt, data, file_offset0, data + offset, + len, h); + if (err) + return err; + + if (dt->offs[i] != 0) + return NULL; /* found */ + + dt->offs[i] = dt->nblocks + 1; + block = &dt->blocks[dt->nblocks]; + block->len = len; + block->offset = offset; + block->hash = h; + dt->nblocks++; - if (dt->nalloc < dt->nblocks + 256) { - struct got_delta_block *db; - size_t old_size = dt->nalloc; - db = dt->blocks; - dt->blocks = calloc(dt->nalloc + 256, - sizeof(struct got_delta_block)); - if (dt->blocks == NULL) { - err = got_error_from_errno("calloc"); - dt->blocks = db; - return err; - } - dt->nalloc += 256; - /* - * Recompute all block positions. Hash-based indices of blocks - * in the array depend on the allocated length of the array. - */ - dt->nblocks = 0; - for (i = 0; i < old_size; i++) { - if (db[i].len == 0) - continue; - err = addblk_mem(dt, data, file_offset0, db[i].len, - db[i].offset, db[i].hash); - if (err) - break; - } - free(db); - } + dt->len++; - return err; + return NULL; } static const struct got_error * @@ -243,24 +251,25 @@ lookupblk(struct got_delta_block **block, struct got_d int i; uint32_t h; uint8_t buf[GOT_DELTIFY_MAXCHUNK]; + struct got_delta_block *b; size_t r; *block = NULL; h = hashblk(p, len, seed); - for (i = h % dt->nalloc; dt->blocks[i].len != 0; - i = (i + 1) % dt->nalloc) { - if (dt->blocks[i].hash != h || - dt->blocks[i].len != len) + + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + b = &dt->blocks[dt->offs[i] - 1]; + if (b->hash != h || b->len != len) continue; - if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset, + if (fseeko(basefile, basefile_offset0 + b->offset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); r = fread(buf, 1, len, basefile); if (r != len) return got_ferror(basefile, GOT_ERR_IO); if (memcmp(p, buf, len) == 0) { - *block = &dt->blocks[i]; + *block = b; break; } } @@ -272,24 +281,17 @@ lookupblk_mem(struct got_delta_block **block, struct g unsigned char *p, off_t len, uint32_t seed, uint8_t *basedata, off_t basefile_offset0) { - int i; + const struct got_error *err; + int n; uint32_t h; - uint8_t *b; *block = NULL; - h = hashblk(p, len, seed); - for (i = h % dt->nalloc; dt->blocks[i].len != 0; - i = (i + 1) % dt->nalloc) { - if (dt->blocks[i].hash != h || - dt->blocks[i].len != len) - continue; - b = basedata + basefile_offset0 + dt->blocks[i].offset; - if (memcmp(p, b, len) == 0) { - *block = &dt->blocks[i]; - break; - } - } + err = lookup_mem(&n, dt, basedata, basefile_offset0, p, len, h); + if (err) + return err; + if (dt->offs[n] != 0) + *block = &dt->blocks[dt->offs[n] - 1]; return NULL; } @@ -370,6 +372,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, goto done; } + (*dt)->size = 128; + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs)); + if ((*dt)->offs == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + if (fseeko(f, fileoffset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); @@ -382,7 +391,7 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, if (blocklen == 0) break; h = hashblk(buf, blocklen, seed); - err = addblk(*dt, f, offset0, blocklen, + err = addblk(*dt, f, NULL, offset0, blocklen, fileoffset - offset0, h); if (err) goto done; @@ -420,6 +429,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint goto done; } + (*dt)->size = 128; + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs)); + if ((*dt)->offs == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + while (fileoffset < filesize) { off_t blocklen; err = nextblk_mem(&blocklen, data, fileoffset, filesize); @@ -428,7 +444,7 @@ got_deltify_init_mem(struct got_delta_table **dt, uint if (blocklen == 0) break; h = hashblk(data + fileoffset, blocklen, seed); - err = addblk_mem(*dt, data, offset0, blocklen, + err = addblk(*dt, NULL, data, offset0, blocklen, fileoffset - offset0, h); if (err) goto done; @@ -450,6 +466,7 @@ got_deltify_free(struct got_delta_table *dt) if (dt == NULL) return; free(dt->blocks); + free(dt->offs); free(dt); } blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795 file + lib/got_lib_deltify.h --- lib/got_lib_deltify.h +++ lib/got_lib_deltify.h @@ -25,6 +25,14 @@ struct got_delta_table { struct got_delta_block *blocks; int nblocks; int nalloc; + + /* + * Index for blocks. offs[n] is zero when the slot is free, + * otherwise it points to blocks[offs[n] - 1]. + */ + uint32_t *offs; + int len; + int size; }; struct got_delta_instruction {

2024-02-25 10:09 Stefan Sperling:
attempt at speeding up the deltification
- 2024-02-25 11:02 Omar Polo:
  attempt at speeding up the deltification
- - 2024-02-26 10:51 Omar Polo:
    attempt at speeding up the deltification
  - - 2024-02-26 11:28 Stefan Sperling:
      attempt at speeding up the deltification
    - - 2024-02-26 12:09 Omar Polo:
        attempt at speeding up the deltification