Download raw body.
attempt at speeding up the deltification
On 2024/02/25 02:30:28 +0100, "Omar Polo" <op@omarpolo.com> wrote:
> The deltification is quite slow not only because it is objectively a
> heavy task, but also because we try to use as little memory as possible.
> got_deltify() is used in pack_create() which in turns is used in gotd
> too, so a moderate amount of memory used is a good thing.
>
> The downside is that we misuse the hash table and spend a considerable
> amount of time during the insert and resize the table. When the table
> is filled we enter in a loop where we increase it by 256 slots, rehash
> everything, have a lot of collisions since the table is 90%+ full, rinse
> and repeat.
>
> Here's another idea: instead of using the table as container for the
> data, use an hash table of indexes of an array. We can save quite some
> memory this way, since int32 indexes gives us plenty of space and
> reduces the size needed by the hash table by 20 bytes per slot (the
> struct got_delta_block is 24 bytes on amd64.) The blocks themselves can
> be allocated on a vector which doesn't degrade the performances when
> filled.
This was phrased poorly. This actually increases the memory needed:
instead of using a table that's in practice almost always filled, we
used an array (which is similarly filled) plus an hash table.
What I was trying to say it's that by using a hash table that requires
less memory per slot we can use a more generous resizing scheme whithout
using a lot of more memory.
and p.s. the comment in resizeht is wrong, addblk could still fail
during the rehashing since it needs to seek and read the file.
> With a smaller hash table, we can pay the memory price for a better
> resize scheme, which in turns gives us better performances.
>
> Another advantage of using the hash table as an index, is that the
> resize is quick: we can just delete the table and re-insert the all the
> elements.
>
> I'm trading a considerable amount of time with some more memory usage.
> For example, for a 512k of /dev/random, we would now use 136k insted of
> 58k of memory. For 100m of /dev/random it is 33M and 9M respectively.
> I'm also using a generous scheme where each resize doubles the size of
> the table though.
>
> To get an idea of the improvements, I've extracted the code for the
> deltification as a standalone executable. For example, to
> got_deltify_init() the alpine iso (207M)
>
> suzaku% m && time ./obj/d /home/op/Downloads/alpine-standard-3.19.1-x86_64.iso
> array: 2250886/2251136 99% -- ht: 761991/2097152 36%
> total memory: 62415872
> 0m01.81s real 0m01.20s user 0m00.62s system
>
> or for a 1.0G mp4 video:
>
> suzaku% m && time ./obj/d /home/op/Downloads/[...].mp4
> array: 9967559/9967744 99% -- ht: 4011730/8388608 47%
> total memory: 272780288
> 0m09.67s real 0m06.12s user 0m03.49s system
>
> The "total memory" is computed by considering the total memory needed by
> the array and by the table (so around 62M and 272M respectively.) IMHO
> the time are more than decent now. I have no idea how much it'll take
> with the current implementation, but given the trend it exibits (~3
> minutes for 100M of /dev/random), "too much" is a good estimate ;-)
>
> (these times are all with got_deltify_init(), not _mem() which could be
> slightly more faster.)
>
> Not sure if diff below is acceptable, and even if it is, the constants
> probably have to be tweaked. I'm open to suggestions.
>
>
> Thanks,
>
> Omar Polo
>
>
> diff /home/op/w/got
> commit - 7a86002db34d49472a7d75c1802ee99c2120ef3c
> path + /home/op/w/got
> blob - 99b5d418b8cec83d130d52be4e5022d0310f8c31
> file + lib/deltify.c
> --- lib/deltify.c
> +++ lib/deltify.c
> @@ -87,6 +87,11 @@ static const uint32_t geartab[256] = {
> 0xf1f6e72c, 0x5551128a, 0x83af87e2, 0x6f0342ba,
> };
>
> +static const struct got_error *addblk(struct got_delta_table *, FILE *, off_t,
> + off_t, off_t, uint32_t);
> +static const struct got_error *addblk_mem(struct got_delta_table *,
> + uint8_t *, off_t, off_t, off_t, uint32_t);
> +
> static uint32_t
> hashblk(const unsigned char *p, off_t n, uint32_t seed)
> {
> @@ -94,10 +99,52 @@ hashblk(const unsigned char *p, off_t n, uint32_t seed
> }
>
> static const struct got_error *
> +resizeht(struct got_delta_table *dt, FILE *f, off_t offset0, uint8_t *data,
> + off_t len)
> +{
> + struct got_delta_block *b;
> + size_t newsize;
> + int i;
> +
> + if (dt->nblocks == dt->nalloc) {
> + newsize = dt->nalloc + 256;
> + b = reallocarray(dt->blocks, newsize, sizeof(*dt->blocks));
> + if (b == NULL)
> + return got_error_from_errno("reallocarray");
> + dt->blocks = b;
> + dt->nalloc = newsize;
> + }
> +
> + if (dt->blocks == NULL || dt->len * 100 / dt->size > 70) {
> + newsize = dt->size * 2;
> + free(dt->offs);
> + dt->offs = calloc(newsize, sizeof(*dt->offs));
> + if (dt->offs == NULL)
> + return got_error_from_errno("calloc");
> + dt->size = newsize;
> + dt->len = 0;
> +
> + /* rehash the table -- cannot fail */
> + for (i = 0; i < dt->nblocks; ++i) {
> + b = &dt->blocks[i];
> + if (f)
> + addblk(dt, f, offset0, b->len, b->offset,
> + b->hash);
> + else
> + addblk_mem(dt, data, offset0, b->len,
> + b->offset, b->hash);
> + }
> + }
> +
> + return NULL;
> +}
> +
> +static const struct got_error *
> addblk(struct got_delta_table *dt, FILE *f, off_t file_offset0, off_t len,
> off_t offset, uint32_t h)
> {
> - const struct got_error *err = NULL;
> + const struct got_error *err;
> + struct got_delta_block *block;
> int i;
> uint8_t buf[GOT_DELTIFY_MAXCHUNK];
> uint8_t buf2[GOT_DELTIFY_MAXCHUNK];
> @@ -106,69 +153,57 @@ addblk(struct got_delta_table *dt, FILE *f, off_t file
> if (len == 0)
> return NULL;
>
> - i = h % dt->nalloc;
> - while (dt->blocks[i].len != 0) {
> + err = resizeht(dt, f, file_offset0, NULL, 0);
> + if (err)
> + return err;
> +
> + for (i = h % dt->size; /* forever */; i = (i + 1) % dt->size) {
> /*
> + * Zero signals that this slot is empty; the real
> + * offset is one less the value.
> + */
> + if (dt->offs[i] == 0)
> + break;
> +
> + block = &dt->blocks[dt->offs[i] - 1];
> +
> + /*
> * Avoid adding duplicate blocks.
> * NB: A matching hash is insufficient for detecting equality.
> * The hash can only detect inequality.
> */
> - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
> - if (r == 0) {
> - if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
> - return got_error_from_errno("fseeko");
> - r = fread(buf, 1, len, f);
> - if (r != len) {
> - if (!ferror(f))
> - return NULL;
> - return got_ferror(f, GOT_ERR_IO);
> - }
> - }
> - if (fseeko(f, file_offset0 + dt->blocks[i].offset,
> - SEEK_SET) == -1)
> + if (block->len != len || block->hash != h)
> + continue;
> +
> + if (r == 0) {
> + if (fseeko(f, file_offset0 + offset, SEEK_SET) == -1)
> return got_error_from_errno("fseeko");
> - if (fread(buf2, 1, len, f) != len)
> + r = fread(buf, 1, len, f);
> + if (r != len) {
> + if (!ferror(f))
> + return NULL;
> return got_ferror(f, GOT_ERR_IO);
> - if (memcmp(buf, buf2, len) == 0)
> - return NULL;
> + }
> }
>
> - i = (i + 1) % dt->nalloc;
> + if (fseeko(f, file_offset0 + block->offset, SEEK_SET) == -1)
> + return got_error_from_errno("fseeko");
> + if (fread(buf2, 1, len, f) != len)
> + return got_ferror(f, GOT_ERR_IO);
> + if (memcmp(buf, buf2, len) == 0)
> + return NULL;
> }
> - assert(dt->blocks[i].len == 0);
> - dt->blocks[i].len = len;
> - dt->blocks[i].offset = offset;
> - dt->blocks[i].hash = h;
> +
> + dt->offs[i] = dt->nblocks + 1;
> + block = &dt->blocks[dt->nblocks];
> + block->len = len;
> + block->offset = offset;
> + block->hash = h;
> +
> dt->nblocks++;
> - if (dt->nalloc < dt->nblocks + 256) {
> - struct got_delta_block *db;
> - size_t old_size = dt->nalloc;
> - db = dt->blocks;
> - dt->blocks = calloc(dt->nalloc + 256,
> - sizeof(struct got_delta_block));
> - if (dt->blocks == NULL) {
> - err = got_error_from_errno("calloc");
> - dt->blocks = db;
> - return err;
> - }
> - dt->nalloc += 256;
> - /*
> - * Recompute all block positions. Hash-based indices of blocks
> - * in the array depend on the allocated length of the array.
> - */
> - dt->nblocks = 0;
> - for (i = 0; i < old_size; i++) {
> - if (db[i].len == 0)
> - continue;
> - err = addblk(dt, f, file_offset0, db[i].len,
> - db[i].offset, db[i].hash);
> - if (err)
> - break;
> - }
> - free(db);
> - }
> + dt->len++;
>
> - return err;
> + return NULL;
> }
>
> static const struct got_error *
> @@ -177,62 +212,50 @@ addblk_mem(struct got_delta_table *dt, uint8_t *data,
> {
> const struct got_error *err = NULL;
> int i;
> + struct got_delta_block *block;
> uint8_t *block1;
> uint8_t *block2;
>
> if (len == 0)
> return NULL;
>
> - i = h % dt->nalloc;
> - while (dt->blocks[i].len != 0) {
> + err = resizeht(dt, NULL, file_offset0, data, len);
> + if (err)
> + return err;
> +
> + for (i = h % dt->size; /* forever */; i = (i + 1) % dt->size) {
> /*
> + * Zero signals that this slot is empty; the real
> + * offset is one less the value.
> + */
> + if (dt->offs[i] == 0)
> + break;
> +
> + block = &dt->blocks[dt->offs[i] - 1];
> +
> + /*
> * Avoid adding duplicate blocks.
> * NB: A matching hash is insufficient for detecting equality.
> * The hash can only detect inequality.
> */
> - if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
> - block1 = data + file_offset0 + dt->blocks[i].offset;
> + if (len == block->len && h == block->hash) {
> + block1 = data + file_offset0 + block->offset;
> block2 = data + file_offset0 + offset;
> if (memcmp(block1, block2, len) == 0)
> return NULL;
> }
> -
> - i = (i + 1) % dt->nalloc;
> }
> - assert(dt->blocks[i].len == 0);
> - dt->blocks[i].len = len;
> - dt->blocks[i].offset = offset;
> - dt->blocks[i].hash = h;
> +
> + dt->offs[i] = dt->nblocks + 1;
> + block = &dt->blocks[dt->nblocks];
> + block->len = len;
> + block->offset = offset;
> + block->hash = h;
> +
> dt->nblocks++;
> - if (dt->nalloc < dt->nblocks + 256) {
> - struct got_delta_block *db;
> - size_t old_size = dt->nalloc;
> - db = dt->blocks;
> - dt->blocks = calloc(dt->nalloc + 256,
> - sizeof(struct got_delta_block));
> - if (dt->blocks == NULL) {
> - err = got_error_from_errno("calloc");
> - dt->blocks = db;
> - return err;
> - }
> - dt->nalloc += 256;
> - /*
> - * Recompute all block positions. Hash-based indices of blocks
> - * in the array depend on the allocated length of the array.
> - */
> - dt->nblocks = 0;
> - for (i = 0; i < old_size; i++) {
> - if (db[i].len == 0)
> - continue;
> - err = addblk_mem(dt, data, file_offset0, db[i].len,
> - db[i].offset, db[i].hash);
> - if (err)
> - break;
> - }
> - free(db);
> - }
> + dt->len++;
>
> - return err;
> + return NULL;
> }
>
> static const struct got_error *
> @@ -243,24 +266,24 @@ lookupblk(struct got_delta_block **block, struct got_d
> int i;
> uint32_t h;
> uint8_t buf[GOT_DELTIFY_MAXCHUNK];
> + struct got_delta_block *b;
> size_t r;
>
> *block = NULL;
>
> h = hashblk(p, len, seed);
> - for (i = h % dt->nalloc; dt->blocks[i].len != 0;
> - i = (i + 1) % dt->nalloc) {
> - if (dt->blocks[i].hash != h ||
> - dt->blocks[i].len != len)
> + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
> + b = &dt->blocks[dt->offs[i] - 1];
> + if (b->hash != h || b->len != len)
> continue;
> - if (fseeko(basefile, basefile_offset0 + dt->blocks[i].offset,
> + if (fseeko(basefile, basefile_offset0 + b->offset,
> SEEK_SET) == -1)
> return got_error_from_errno("fseeko");
> r = fread(buf, 1, len, basefile);
> if (r != len)
> return got_ferror(basefile, GOT_ERR_IO);
> if (memcmp(p, buf, len) == 0) {
> - *block = &dt->blocks[i];
> + *block = b;
> break;
> }
> }
> @@ -274,19 +297,19 @@ lookupblk_mem(struct got_delta_block **block, struct g
> {
> int i;
> uint32_t h;
> - uint8_t *b;
> + uint8_t *d;
> + struct got_delta_block *b;
>
> *block = NULL;
>
> h = hashblk(p, len, seed);
> - for (i = h % dt->nalloc; dt->blocks[i].len != 0;
> - i = (i + 1) % dt->nalloc) {
> - if (dt->blocks[i].hash != h ||
> - dt->blocks[i].len != len)
> + for (i = h % dt->size; dt->offs[i] != 0; i = (i + 1) % dt->size) {
> + b = &dt->blocks[dt->offs[i] - 1];
> + if (b->hash != h || b->len != len)
> continue;
> - b = basedata + basefile_offset0 + dt->blocks[i].offset;
> - if (memcmp(p, b, len) == 0) {
> - *block = &dt->blocks[i];
> + d = basedata + basefile_offset0 + b->offset;
> + if (memcmp(p, d, len) == 0) {
> + *block = b;
> break;
> }
> }
> @@ -370,6 +393,13 @@ got_deltify_init(struct got_delta_table **dt, FILE *f,
> goto done;
> }
>
> + (*dt)->size = 128;
> + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
> + if ((*dt)->offs == NULL) {
> + err = got_error_from_errno("calloc");
> + goto done;
> + }
> +
> if (fseeko(f, fileoffset, SEEK_SET) == -1)
> return got_error_from_errno("fseeko");
>
> @@ -420,6 +450,13 @@ got_deltify_init_mem(struct got_delta_table **dt, uint
> goto done;
> }
>
> + (*dt)->size = 128;
> + (*dt)->offs = calloc((*dt)->size, sizeof(*(*dt)->offs));
> + if ((*dt)->offs == NULL) {
> + err = got_error_from_errno("calloc");
> + goto done;
> + }
> +
> while (fileoffset < filesize) {
> off_t blocklen;
> err = nextblk_mem(&blocklen, data, fileoffset, filesize);
> blob - 23d51d665edd5927c4b9ca18e2ebebf3473b1795
> file + lib/got_lib_deltify.h
> --- lib/got_lib_deltify.h
> +++ lib/got_lib_deltify.h
> @@ -25,6 +25,11 @@ struct got_delta_table {
> struct got_delta_block *blocks;
> int nblocks;
> int nalloc;
> +
> + /* index for blocks */
> + uint32_t *offs;
> + int len;
> + int size;
> };
>
> struct got_delta_instruction {
attempt at speeding up the deltification