Download raw body.
deltify addblk() fixes
On Wed, Jun 09, 2021 at 05:15:35PM +0200, Christian Weisgerber wrote:
> Stefan Sperling:
>
> > Fix deltify addblk() list size confusion: We only have 'old_size' number
> > of blocks to rehash after growing the array.
>
> That fix is correct.
>
> > And only extend the array slightly when we outgrow it, instead of doubling
> > the array's size each time
>
> Isn't the underlying idea that the array is somewhat sparsely
> populated to that h % dt->nalloc is likely to hit an empty slot on
> insert and the right slot on lookup?
>
> Anyway, this doesn't look right:
>
> > dt->nblocks++;
> > - if (dt->nalloc < 2 * dt->nblocks) {
> > + if (dt->nalloc < dt->nblocks + 64) {
> > struct got_delta_block *db;
> > - nalloc = dt->nalloc * 2;
> > + size_t old_size = dt->nalloc;
> > db = dt->blocks;
> > - dt->blocks = calloc(nalloc, sizeof(struct got_delta_block));
> > + dt->blocks = calloc(dt->nblocks + 64, sizeof(struct got_delta_block));
> > if (dt->blocks == NULL) {
> > err = got_error_from_errno("calloc");
> > dt->blocks = db;
> > return err;
> > }
> > - dt->nalloc = nalloc;
> > + dt->nalloc = dt->nblocks + 64;
>
> Assume that dt->nalloc is 100. Then as soon as dt->nblock hits 37,
> the if triggers (100 < 101). We then allocate 101 blocks...
> Therefore, each addblk() call now extends the array by 1 block.
>
> I think you meant
> calloc(dt->nalloc + 64, ...)
> dt->nalloc += 64
>
>
> While we're here, I'm also puzzled by this part:
>
> * Avoid adding duplicate blocks.
> * NB: A matching hash is insufficient for detecting equality.
> * The hash can only detect inequality, so only check 'len'.
> */
> if (len == dt->blocks[i].len) {
> ...
> if (memcmp(buf, buf2, len) == 0)
> return NULL;
> }
>
> If the hash can only detect inequality, shouldn't we still check it
>
> if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
>
> to skip expensive compares?
Thank you! Here is a new diff with these problems fixed.
Your fixes don't make a huge difference but they do cause 'malloc'
and 'fseek' calls reported by gprof to drop down by about 2% on a
trivial test repository.
diff 282f42e5d1095015379b49280429e558b3bbc4fe /home/stsp/src/got
blob - 0b023fb4330314246e6a110ad5aade625f8b9a0f
file + lib/deltify.c
--- lib/deltify.c
+++ lib/deltify.c
@@ -99,7 +99,7 @@ static const struct got_error *
addblk(struct got_delta_table *dt, FILE *f, off_t len, off_t offset, uint64_t h)
{
const struct got_error *err = NULL;
- int i, nalloc;
+ int i;
uint8_t buf[GOT_DELTIFY_MAXCHUNK];
size_t r = 0;
@@ -121,9 +121,9 @@ addblk(struct got_delta_table *dt, FILE *f, off_t len,
/*
* Avoid adding duplicate blocks.
* NB: A matching hash is insufficient for detecting equality.
- * The hash can only detect inequality, so only check 'len'.
+ * The hash can only detect inequality.
*/
- if (len == dt->blocks[i].len) {
+ if (len == dt->blocks[i].len && h == dt->blocks[i].hash) {
uint8_t buf2[GOT_DELTIFY_MAXCHUNK];
if (fseeko(f, dt->blocks[i].offset, SEEK_SET) == -1)
return got_error_from_errno("fseeko");
@@ -140,23 +140,24 @@ addblk(struct got_delta_table *dt, FILE *f, off_t len,
dt->blocks[i].offset = offset;
dt->blocks[i].hash = h;
dt->nblocks++;
- if (dt->nalloc < 2 * dt->nblocks) {
+ if (dt->nalloc < dt->nblocks + 64) {
struct got_delta_block *db;
- nalloc = dt->nalloc * 2;
+ size_t old_size = dt->nalloc;
db = dt->blocks;
- dt->blocks = calloc(nalloc, sizeof(struct got_delta_block));
+ dt->blocks = calloc(dt->nalloc + 64,
+ sizeof(struct got_delta_block));
if (dt->blocks == NULL) {
err = got_error_from_errno("calloc");
dt->blocks = db;
return err;
}
- dt->nalloc = nalloc;
+ dt->nalloc += 64;
/*
* Recompute all block positions. Hash-based indices of blocks
* in the array depend on the allocated length of the array.
*/
dt->nblocks = 0;
- for (i = 0; i < nalloc; i++) {
+ for (i = 0; i < old_size; i++) {
if (db[i].len == 0)
continue;
err = addblk(dt, f, db[i].len, db[i].offset,
deltify addblk() fixes