From: "Omar Polo" Subject: attempt at speeding up the deltification To: gameoftrees@openbsd.org Date: Sun, 25 Feb 2024 02:30:28 +0100 The deltification is quite slow not only because it is objectively a heavy task, but also because we try to use as little memory as possible. got_deltify() is used in pack_create() which in turns is used in gotd too, so a moderate amount of memory used is a good thing. The downside is that we misuse the hash table and spend a considerable amount of time during the insert and resize the table. When the table is filled we enter in a loop where we increase it by 256 slots, rehash everything, have a lot of collisions since the table is 90%+ full, rinse and repeat. Here's another idea: instead of using the table as container for the data, use an hash table of indexes of an array. We can save quite some memory this way, since int32 indexes gives us plenty of space and reduces the size needed by the hash table by 20 bytes per slot (the struct got_delta_block is 24 bytes on amd64.) The blocks themselves can be allocated on a vector which doesn't degrade the performances when filled. With a smaller hash table, we can pay the memory price for a better resize scheme, which in turns gives us better performances. Another advantage of using the hash table as an index, is that the resize is quick: we can just delete the table and re-insert the all the elements. I'm trading a considerable amount of time with some more memory usage. For example, for a 512k of /dev/random, we would now use 136k insted of 58k of memory. For 100m of /dev/random it is 33M and 9M respectively. I'm also using a generous scheme where each resize doubles the size of the table though. To get an idea of the improvements, I've extracted the code for the deltification as a standalone executable. For example, to got_deltify_init() the alpine iso (207M) suzaku% m && time ./obj/d /home/op/Downloads/alpine-standard-3.19.1-x86_64.iso array: 2250886/2251136 99% -- ht: 761991/2097152 36% total memory: 62415872 0m01.81s real 0m01.20s user 0m00.62s system or for a 1.0G mp4 video: suzaku% m && time ./obj/d /home/op/Downloads/[...].mp4 array: 9967559/9967744 99% -- ht: 4011730/8388608 47% total memory: 272780288 0m09.67s real 0m06.12s user 0m03.49s system The "total memory" is computed by considering the total memory needed by the array and by the table (so around 62M and 272M respectively.) IMHO the time are more than decent now. I have no idea how much it'll take with the current implementation, but given the trend it exibits (~3 minutes for 100M of /dev/random), "too much" is a good estimate ;-) (these times are all with got_deltify_init(), not _mem() which could be slightly more faster.) Not sure if diff below is acceptable, and even if it is, the constants probably have to be tweaked. I'm open to suggestions. Thanks, Omar Polo diff /home/op/w/got commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c path + /home/op/w/got blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31 file + lib/deltify.c --- lib/deltify.c +++ lib/deltify.c @@ -87,6 +87,11 @@ static const uint32_t geartab[256] = { 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba, }; +static const struct got_error *addblk(struct got_delta_table *, FILE *, off_t, + off_t, off_t, uint32_t); +static const struct got_error *addblk_mem(struct got_delta_table *, + uint8_t *, off_t, off_t, off_t, uint32_t); + static uint32_t hashblk(const unsigned char *p, off_t n, uint32_t seed) { @@ -94,10 +99,52 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed } static const struct got_error * +resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data, + off_t len) +{ + struct got_delta_block *b; + size_t newsize; + int i; + + if (dt->nblocks == dt->nalloc) { + newsize = dt->nalloc + 256; + b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks)); + if (b == NULL) + return got_error_from_errno("reallocarray"); + dt->blocks = b; + dt->nalloc = newsize; + } + + if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) { + newsize = dt->size * 2; + free(dt->offs); + dt->offs = calloc(newsize, sizeof(*dt->offs)); + if (dt->offs == NULL) + return got_error_from_errno("calloc"); + dt->size = newsize; + dt->len = 0; + + /* rehash the table -- cannot fail */ + for (i = 0; i < dt->nblocks; ++i) { + b = &dt->blocks[i]; + if (f) + addblk(dt, f, offset0, b->len, b->offset, + b->hash); + else + addblk_mem(dt, data, offset0, b->len, + b->offset, b->hash); + } + } + + return NULL; +} + +static const struct got_error * addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len, off_t offset, uint32_t h) { - const struct got_error *err = NULL; + const struct got_error *err; + struct got_delta_block *block; int i; uint8_t buf[GOT_DELTIFY_MAXCHUNK]; uint8_t buf2[GOT_DELTIFY_MAXCHUNK]; @@ -106,69 +153,57 @@ addblk(struct got_delta_table *dt, FILE *f, off_t file if (len == 0) return NULL; - i = h % dt->nalloc; - while (dt->blocks[i].len != 0) { + err = resizeht(dt, f, file_offset0, NULL, 0); + if (err) + return err; + + for (i = h % dt->size; /* forever */; i = (i + 1) % dt->size) { /* + * Zero signals that this slot is empty; the real + * offset is one less the value. + */ + if (dt->offs[i] == 0) + break; + + block = &dt->blocks[dt->offs[i] - 1]; + + /* * Avoid adding duplicate blocks. * NB: A matching hash is insufficient for detecting equality. * The hash can only detect inequality. */ - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) { - if (r == 0) { - if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1) - return got_error_from_errno("fseeko"); - r = fread(buf, 1, len, f); - if (r != len) { - if (!ferror(f)) - return NULL; - return got_ferror(f, GOT_ERR_IO); - } - } - if (fseeko(f, file_offset0 + dt->blocks[i].offset, - SEEK_SET) == -1) + if (block->len != len || block->hash != h) + continue; + + if (r == 0) { + if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); - if (fread(buf2, 1, len, f) != len) + r = fread(buf, 1, len, f); + if (r != len) { + if (!ferror(f)) + return NULL; return got_ferror(f, GOT_ERR_IO); - if (memcmp(buf, buf2, len) == 0) - return NULL; + } } - i = (i + 1) % dt->nalloc; + if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1) + return got_error_from_errno("fseeko"); + if (fread(buf2, 1, len, f) != len) + return got_ferror(f, GOT_ERR_IO); + if (memcmp(buf, buf2, len) == 0) + return NULL; } - assert(dt->blocks[i].len == 0); - dt->blocks[i].len = len; - dt->blocks[i].offset = offset; - dt->blocks[i].hash = h; + + dt->offs[i] = dt->nblocks + 1; + block = &dt->blocks[dt->nblocks]; + block->len = len; + block->offset = offset; + block->hash = h; + dt->nblocks++; - if (dt->nalloc < dt->nblocks + 256) { - struct got_delta_block *db; - size_t old_size = dt->nalloc; - db = dt->blocks; - dt->blocks = calloc(dt->nalloc + 256, - sizeof(struct got_delta_block)); - if (dt->blocks == NULL) { - err = got_error_from_errno("calloc"); - dt->blocks = db; - return err; - } - dt->nalloc += 256; - /* - * Recompute all block positions. Hash-based indices of blocks - * in the array depend on the allocated length of the array. - */ - dt->nblocks = 0; - for (i = 0; i < old_size; i++) { - if (db[i].len == 0) - continue; - err = addblk(dt, f, file_offset0, db[i].len, - db[i].offset, db[i].hash); - if (err) - break; - } - free(db); - } + dt->len++; - return err; + return NULL; } static const struct got_error * @@ -177,62 +212,50 @@ addblk_mem(struct got_delta_table *dt, uint8_t *data, { const struct got_error *err = NULL; int i; + struct got_delta_block *block; uint8_t *block1; uint8_t *block2; if (len == 0) return NULL; - i = h % dt->nalloc; - while (dt->blocks[i].len != 0) { + err = resizeht(dt, NULL, file_offset0, data, len); + if (err) + return err; + + for (i = h % dt->size; /* forever */; i = (i + 1) % dt->size) { /* + * Zero signals that this slot is empty; the real + * offset is one less the value. + */ + if (dt->offs[i] == 0) + break; + + block = &dt->blocks[dt->offs[i] - 1]; + + /* * Avoid adding duplicate blocks. * NB: A matching hash is insufficient for detecting equality. * The hash can only detect inequality. */ - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) { - block1 = data + file_offset0 + dt->blocks[i].offset; + if (len == block->len && h == block->hash) { + block1 = data + file_offset0 + block->offset; block2 = data + file_offset0 + offset; if (memcmp(block1, block2, len) == 0) return NULL; } - - i = (i + 1) % dt->nalloc; } - assert(dt->blocks[i].len == 0); - dt->blocks[i].len = len; - dt->blocks[i].offset = offset; - dt->blocks[i].hash = h; + + dt->offs[i] = dt->nblocks + 1; + block = &dt->blocks[dt->nblocks]; + block->len = len; + block->offset = offset; + block->hash = h; + dt->nblocks++; - if (dt->nalloc < dt->nblocks + 256) { - struct got_delta_block *db; - size_t old_size = dt->nalloc; - db = dt->blocks; - dt->blocks = calloc(dt->nalloc + 256, - sizeof(struct got_delta_block)); - if (dt->blocks == NULL) { - err = got_error_from_errno("calloc"); - dt->blocks = db; - return err; - } - dt->nalloc += 256; - /* - * Recompute all block positions. Hash-based indices of blocks - * in the array depend on the allocated length of the array. - */ - dt->nblocks = 0; - for (i = 0; i < old_size; i++) { - if (db[i].len == 0) - continue; - err = addblk_mem(dt, data, file_offset0, db[i].len, - db[i].offset, db[i].hash); - if (err) - break; - } - free(db); - } + dt->len++; - return err; + return NULL; } static const struct got_error * @@ -243,24 +266,24 @@ lookupblk(struct got_delta_block **block, struct got_d int i; uint32_t h; uint8_t buf[GOT_DELTIFY_MAXCHUNK]; + struct got_delta_block *b; size_t r; *block = NULL; h = hashblk(p, len, seed); - for (i = h % dt->nalloc; dt->blocks[i].len != 0; - i = (i + 1) % dt->nalloc) { - if (dt->blocks[i].hash != h || - dt->blocks[i].len != len) + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + b = &dt->blocks[dt->offs[i] - 1]; + if (b->hash != h || b->len != len) continue; - if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset, + if (fseeko(basefile, basefile_offset0 + b->offset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); r = fread(buf, 1, len, basefile); if (r != len) return got_ferror(basefile, GOT_ERR_IO); if (memcmp(p, buf, len) == 0) { - *block = &dt->blocks[i]; + *block = b; break; } } @@ -274,19 +297,19 @@ lookupblk_mem(struct got_delta_block **block, struct g { int i; uint32_t h; - uint8_t *b; + uint8_t *d; + struct got_delta_block *b; *block = NULL; h = hashblk(p, len, seed); - for (i = h % dt->nalloc; dt->blocks[i].len != 0; - i = (i + 1) % dt->nalloc) { - if (dt->blocks[i].hash != h || - dt->blocks[i].len != len) + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) { + b = &dt->blocks[dt->offs[i] - 1]; + if (b->hash != h || b->len != len) continue; - b = basedata + basefile_offset0 + dt->blocks[i].offset; - if (memcmp(p, b, len) == 0) { - *block = &dt->blocks[i]; + d = basedata + basefile_offset0 + b->offset; + if (memcmp(p, d, len) == 0) { + *block = b; break; } } @@ -370,6 +393,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f, goto done; } + (*dt)->size = 128; + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs)); + if ((*dt)->offs == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + if (fseeko(f, fileoffset, SEEK_SET) == -1) return got_error_from_errno("fseeko"); @@ -420,6 +450,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint goto done; } + (*dt)->size = 128; + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs)); + if ((*dt)->offs == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + while (fileoffset < filesize) { off_t blocklen; err = nextblk_mem(&blocklen, data, fileoffset, filesize); blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795 file + lib/got_lib_deltify.h --- lib/got_lib_deltify.h +++ lib/got_lib_deltify.h @@ -25,6 +25,11 @@ struct got_delta_table { struct got_delta_block *blocks; int nblocks; int nalloc; + + /* index for blocks */ + uint32_t *offs; + int len; + int size; }; struct got_delta_instruction {