Download raw body.
process deltas in compressed form
Stefan Sperling <stsp@stsp.name> wrote: > During packing we currently decompress reused deltas while reading them > from the pack file, and store the decompressed data either in memory or > in the delta cache file. > The same applies to newly computed deltas; we write them to the delta > cache file in uncompressed form, and compress the delta data while > copying it into the generated pack file. > > This approach works, but it is wasteful. > A 4GB /tmp paritition will run out of space during 'gotadmin pack -a' > in a copy of the OpenBSD src repo because the delta cache file grows > too large. i'm happy about hte change, but i remember 'gotadmin pack -a' working on a 4GB /tmp partition: % df -h | fgrep /tmp /dev/sd1d 3.9G 6.8M 3.7G 0% /tmp % /usr/local/bin/gotadmin -V gotadmin 0.68 % time /usr/local/bin/gotadmin pack -a 219498 commits colored; 2088729 objects found; 1010361 trees scanned packing 5 references; 2088729 objects; deltify: 100%; writing pack: 1.2G 100% Wrote fe8e346aa0f23dc77371a3ad76f5bcea7d5ea071.pack 1.2G packed; indexing 100%; resolving deltas 100% Indexed fe8e346aa0f23dc77371a3ad76f5bcea7d5ea071.pack 23m04.15s real 14m22.31s user 7m40.23s system > With the patch below, we store deltas in compressed form. > Reused deltas will be copied as-is from their pack file, so we won't > waste time compressing them again. We still decompress such deltas > once order to verify that decompression succeeds. This is a sanity check > also performed by Git. It is intended to protect against silent bitrot. > So I decided to do the same. We would need to decompress a delta anyway > at least in part because we need to read two size values stored at the > beginning of a delta stream. > > With this patch my system no longer runs out of space in /tmp when > repacking the OpenBSD src repo. > > I am adding more wrapper functions around libz. I am not happy about that > as we already have a lot of glue code on top of libz. But the existing > abstractions do not support incremental writes to a file with compression, > they only support one-shot compression of an existing file or memory buffer. > It would be good to tidy this up a bit and shrink the amount of abstractions > in use here. But that can be done later. > > ok? it reads fine, ok op@; small nitpics inline below. slightly related: while reading I've stubled upon csum_output where we're potentially truncating a size_t into an 'unsigned int' when calling crc32. It seems unlikely (maybe even impossible) to end up calling csum_output with a `len' greater than UINT_MAX, but what about using crc32_z? diff 6b7665acf3ac9dd7d0c30372df5a4fa09b1b47fa /tmp/got blob - 3c97a77f77b3da0548ab67dbcb2685723456ee39 file + lib/deflate.c --- lib/deflate.c +++ lib/deflate.c @@ -83,7 +83,7 @@ static void csum_output(struct got_deflate_checksum *csum, const uint8_t *buf, size_t len) { if (csum->output_crc) - *csum->output_crc = crc32(*csum->output_crc, buf, len); + *csum->output_crc = crc32_z(*csum->output_crc, buf, len); if (csum->output_sha1) SHA1Update(csum->output_sha1, buf, len); > diff cf8f868e7c97644d885e9cc2a06debbe9eac72b0 da308c0ea675766ef4047c919ba7eb6d104d842e > blob - 3c97a77f77b3da0548ab67dbcb2685723456ee39 > blob + 8be992eef33b9ee118e5dc75d039f224e5acb657 > --- lib/deflate.c > +++ lib/deflate.c > @@ -136,9 +136,9 @@ got_deflate_read(struct got_deflate_buf *zb, FILE *f, > return NULL; > } > > -const struct got_error * > -got_deflate_read_mmap(struct got_deflate_buf *zb, uint8_t *map, size_t offset, > - size_t len, size_t *outlenp, size_t *consumed) > +static const struct got_error * > +deflate_read_mmap(struct got_deflate_buf *zb, uint8_t *map, size_t offset, > + size_t len, size_t *outlenp, size_t *consumed, int flush_on_eof) > { > z_stream *z = &zb->z; > size_t last_total_out = z->total_out; > @@ -159,7 +159,8 @@ got_deflate_read_mmap(struct got_deflate_buf *zb, uint > z->avail_in = len - *consumed; > if (z->avail_in == 0) { > /* EOF */ > - ret = deflate(z, Z_FINISH); > + if (flush_on_eof) > + ret = deflate(z, Z_FINISH); > break; > } > } > @@ -179,6 +180,53 @@ got_deflate_read_mmap(struct got_deflate_buf *zb, uint > return NULL; > } > > +const struct got_error * > +got_deflate_read_mmap(struct got_deflate_buf *zb, uint8_t *map, size_t offset, > + size_t len, size_t *outlenp, size_t *consumed) > +{ > + return deflate_read_mmap(zb, map, offset, len, outlenp, consumed, 1); > +} > + > +const struct got_error * > +got_deflate_flush(struct got_deflate_buf *zb, FILE *outfile, > + struct got_deflate_checksum *csum, off_t *outlenp) > +{ > + int ret; > + size_t n; > + z_stream *z = &zb->z; > + nit: can drop the braces here > + if (z->avail_in != 0) { > + return got_error_msg(GOT_ERR_COMPRESSION, > + "cannot flush zb with pending input data"); > + } > + > + do { > + size_t avail, last_total_out = zb->z.total_out; > + > + z->next_out = zb->outbuf; > + z->avail_out = zb->outlen; > + > + ret = deflate(z, Z_FINISH); > + if (ret != Z_STREAM_END && ret != Z_OK) > + return got_error(GOT_ERR_COMPRESSION); > + > + avail = z->total_out - last_total_out; > + if (avail > 0) { > + n = fwrite(zb->outbuf, avail, 1, outfile); > + if (n != 1) > + return got_ferror(outfile, GOT_ERR_IO); > + if (csum) > + csum_output(csum, zb->outbuf, avail); > + if (outlenp) > + *outlenp += avail; > + } > + } while (ret != Z_STREAM_END); > + > + zb->flags &= ~GOT_DEFLATE_F_HAVE_MORE; > + return NULL; nit: empty line > + > +} > + > void > got_deflate_end(struct got_deflate_buf *zb) > { > @@ -263,3 +311,97 @@ done: > got_deflate_end(&zb); > return err; > } > + > +const struct got_error * > +got_deflate_append_to_file_mmap(struct got_deflate_buf *zb, off_t *outlen, > + uint8_t *map, size_t offset, size_t len, FILE *outfile, > + struct got_deflate_checksum *csum) > +{ > + const struct got_error *err; > + size_t avail, consumed; > + > + do { > + err = deflate_read_mmap(zb, map, offset, len, &avail, > + &consumed, 0); > + if (err) > + break; > + offset += consumed; > + len -= consumed; > + if (avail > 0) { > + size_t n; > + n = fwrite(zb->outbuf, avail, 1, outfile); > + if (n != 1) { > + err = got_ferror(outfile, GOT_ERR_IO); > + break; > + } > + if (csum) > + csum_output(csum, zb->outbuf, avail); > + if (outlen) > + *outlen += avail; > + } > + } while ((zb->flags & GOT_DEFLATE_F_HAVE_MORE) && len > 0); > + > + return err; > +} > + > +const struct got_error * > +got_deflate_to_mem_mmap(uint8_t **outbuf, size_t *outlen, > + size_t *consumed_total, struct got_deflate_checksum *csum, uint8_t *map, > + size_t offset, size_t len) > +{ > + const struct got_error *err; > + size_t avail, consumed; > + struct got_deflate_buf zb; > + void *newbuf; nit: i'd go with a size_t for nbuf > + int nbuf = 1; > + > + if (outbuf) { > + *outbuf = malloc(GOT_DEFLATE_BUFSIZE); > + if (*outbuf == NULL) > + return got_error_from_errno("malloc"); > + err = got_deflate_init(&zb, *outbuf, GOT_DEFLATE_BUFSIZE); > + if (err) { > + free(*outbuf); > + *outbuf = NULL; > + return err; > + } > + } else { > + err = got_deflate_init(&zb, NULL, GOT_DEFLATE_BUFSIZE); > + if (err) > + return err; > + } > + > + *outlen = 0; > + if (consumed_total) > + *consumed_total = 0; > + do { > + err = got_deflate_read_mmap(&zb, map, offset, len, &avail, > + &consumed); > + if (err) > + goto done; > + offset += consumed; > + if (consumed_total) > + *consumed_total += consumed; > + len -= consumed; > + if (avail > 0 && csum) > + csum_output(csum, zb.outbuf, avail); > + *outlen += avail; > + if ((zb.flags & GOT_DEFLATE_F_HAVE_MORE) && outbuf != NULL) { > + newbuf = reallocarray(*outbuf, ++nbuf, > + GOT_DEFLATE_BUFSIZE); > + if (newbuf == NULL) { > + err = got_error_from_errno("reallocarray"); > + free(*outbuf); > + *outbuf = NULL; > + *outlen = 0; > + goto done; > + } > + *outbuf = newbuf; > + zb.outbuf = newbuf + *outlen; > + zb.outlen = (nbuf * GOT_DEFLATE_BUFSIZE) - *outlen; > + } > + } while (zb.flags & GOT_DEFLATE_F_HAVE_MORE); > +done: > + got_deflate_end(&zb); > + return err; > +} > blob - 1c429af85a27229451e05798e9511d0a8f474968 > blob + 09a8755cf062db2ffd1f7e2c26deef470dbba250 > --- lib/got_lib_deflate.h > +++ lib/got_lib_deflate.h > @@ -39,8 +39,17 @@ const struct got_error *got_deflate_init(struct got_de > size_t); > const struct got_error *got_deflate_read(struct got_deflate_buf *, FILE *, > off_t, size_t *, off_t *); > +const struct got_error *got_deflate_read_mmap(struct got_deflate_buf *, > + uint8_t *, size_t, size_t, size_t *, size_t *); > void got_deflate_end(struct got_deflate_buf *); > const struct got_error *got_deflate_to_file(off_t *, FILE *, off_t, FILE *, > struct got_deflate_checksum *); > const struct got_error *got_deflate_to_file_mmap(off_t *, uint8_t *, > size_t, size_t, FILE *, struct got_deflate_checksum *); > +const struct got_error *got_deflate_flush(struct got_deflate_buf *, FILE *, > + struct got_deflate_checksum *, off_t *); > +const struct got_error *got_deflate_append_to_file_mmap( > + struct got_deflate_buf *, off_t *, uint8_t *, size_t, size_t, FILE *, > + struct got_deflate_checksum *); > +const struct got_error *got_deflate_to_mem_mmap(uint8_t **, size_t *, size_t *, > + struct got_deflate_checksum *, uint8_t *, size_t, size_t); > blob - 6af8d574c7b345c52d3e0c19759bf3ae6bd62b20 > blob + 4bbe44dda07c97ba6f0ef878da0caadbe6de9741 > --- lib/got_lib_object.h > +++ lib/got_lib_object.h > @@ -104,7 +104,7 @@ const struct got_error *got_object_open_from_packfile( > struct got_object_id *, struct got_pack *, struct got_packidx *, int, > struct got_repository *); > const struct got_error *got_object_read_raw_delta(uint64_t *, uint64_t *, > - off_t *, off_t *, off_t *, struct got_object_id **, int, > + off_t *, off_t *, off_t *, off_t *, struct got_object_id **, int, > struct got_packidx *, int, struct got_object_id *, struct got_repository *); > const struct got_error *got_object_read_header_privsep(struct got_object **, > struct got_object_id *, struct got_repository *, int); > blob - e8fb373e287ee80486d50ed07964d9d39924308d > blob + 6a3d3981c9afd96d48ef7746b2d0b1d78793a7ca > --- lib/got_lib_pack.h > +++ lib/got_lib_pack.h > @@ -212,7 +212,7 @@ const struct got_error *got_packfile_extract_object(st > const struct got_error *got_packfile_extract_object_to_mem(uint8_t **, size_t *, > struct got_object *, struct got_pack *); > const struct got_error *got_packfile_extract_raw_delta(uint8_t **, size_t *, > - off_t *, off_t *, struct got_object_id *, uint64_t *, uint64_t *, > + size_t *, off_t *, off_t *, struct got_object_id *, uint64_t *, uint64_t *, > struct got_pack *, struct got_packidx *, int); > struct got_pack *got_repo_get_cached_pack(struct got_repository *, > const char *); > blob - 110fe049d86c1a33fb3b33e4fe74ffa8a3dbbfa8 > blob + e57f4dd3f8f4d207324b69c89c54442ae78cd5bb > --- lib/got_lib_privsep.h > +++ lib/got_lib_privsep.h > @@ -284,6 +284,7 @@ struct got_imsg_raw_delta { > uint64_t base_size; > uint64_t result_size; > off_t delta_size; > + off_t delta_compressed_size; > off_t delta_offset; > off_t delta_out_offset; > > @@ -662,8 +663,9 @@ const struct got_error *got_privsep_send_raw_delta_req > struct got_object_id *); > const struct got_error *got_privsep_send_raw_delta_outfd(struct imsgbuf *, int); > const struct got_error *got_privsep_send_raw_delta(struct imsgbuf *, uint64_t, > - uint64_t, off_t, off_t, off_t, struct got_object_id *); > + uint64_t, off_t, off_t, off_t, off_t, struct got_object_id *); > const struct got_error *got_privsep_recv_raw_delta(uint64_t *, uint64_t *, > - off_t *, off_t *, off_t *, struct got_object_id **, struct imsgbuf *); > + off_t *, off_t *, off_t *, off_t *, struct got_object_id **, > + struct imsgbuf *); > > void got_privsep_exec_child(int[2], const char *, const char *); > blob - b87e6eecb828ef98889452c3dd9b205e5eaf3c33 > blob + 4e5facc7f5e3c665aa540bb9caf3299f68626c2d > --- lib/object.c > +++ lib/object.c > @@ -388,8 +388,8 @@ got_object_open_from_packfile(struct got_object **obj, > > const struct got_error * > got_object_read_raw_delta(uint64_t *base_size, uint64_t *result_size, > - off_t *delta_size, off_t *delta_offset, off_t *delta_out_offset, > - struct got_object_id **base_id, int delta_cache_fd, > + off_t *delta_size, off_t *delta_compressed_size, off_t *delta_offset, > + off_t *delta_out_offset, struct got_object_id **base_id, int delta_cache_fd, > struct got_packidx *packidx, int obj_idx, struct got_object_id *id, > struct got_repository *repo) > { > @@ -400,6 +400,7 @@ got_object_read_raw_delta(uint64_t *base_size, uint64_ > *base_size = 0; > *result_size = 0; > *delta_size = 0; > + *delta_compressed_size = 0; > *delta_offset = 0; > *delta_out_offset = 0; > > @@ -439,7 +440,8 @@ got_object_read_raw_delta(uint64_t *base_size, uint64_ > return err; > > return got_privsep_recv_raw_delta(base_size, result_size, delta_size, > - delta_offset, delta_out_offset, base_id, pack->privsep_child->ibuf); > + delta_compressed_size, delta_offset, delta_out_offset, base_id, > + pack->privsep_child->ibuf); > } > > static const struct got_error * > blob - d875046e25b7f0b4172baa4dbd064445a73f18c4 > blob + e901a95d83890c031e00dc5f8a9ee560e51ace41 > --- lib/pack.c > +++ lib/pack.c > @@ -902,23 +902,33 @@ got_pack_parse_offset_delta(off_t *base_offset, size_t > > static const struct got_error * > read_delta_data(uint8_t **delta_buf, size_t *delta_len, > - size_t delta_data_offset, struct got_pack *pack) > + size_t *delta_compressed_len, size_t delta_data_offset, > + struct got_pack *pack) > { > const struct got_error *err = NULL; > + size_t consumed = 0; > > if (pack->map) { > if (delta_data_offset >= pack->filesize) > return got_error(GOT_ERR_PACK_OFFSET); > err = got_inflate_to_mem_mmap(delta_buf, delta_len, > - NULL, NULL, pack->map, delta_data_offset, > + &consumed, NULL, pack->map, delta_data_offset, > pack->filesize - delta_data_offset); > + if (err) > + return err; > } else { > if (lseek(pack->fd, delta_data_offset, SEEK_SET) == -1) > return got_error_from_errno("lseek"); > - err = got_inflate_to_mem_fd(delta_buf, delta_len, NULL, > - NULL, 0, pack->fd); > + err = got_inflate_to_mem_fd(delta_buf, delta_len, > + &consumed, NULL, 0, pack->fd); > + if (err) > + return err; > } > - return err; > + > + if (delta_compressed_len) > + *delta_compressed_len = consumed; > + > + return NULL; > } > > static const struct got_error * > @@ -1200,7 +1210,7 @@ got_pack_get_delta_chain_max_size(uint64_t *max_size, > if (delta_buf == NULL) { > cached = 0; > err = read_delta_data(&delta_buf, &delta_len, > - delta->data_offset, pack); > + NULL, delta->data_offset, pack); > if (err) > return err; > err = got_delta_cache_add(pack->delta_cache, > @@ -1336,7 +1346,7 @@ got_pack_dump_delta_chain_to_file(size_t *result_size, > pack->delta_cache, delta->data_offset); > if (delta_buf == NULL) { > cached = 0; > - err = read_delta_data(&delta_buf, &delta_len, > + err = read_delta_data(&delta_buf, &delta_len, NULL, > delta->data_offset, pack); > if (err) > goto done; > @@ -1482,7 +1492,7 @@ got_pack_dump_delta_chain_to_mem(uint8_t **outbuf, siz > pack->delta_cache, delta->data_offset); > if (delta_buf == NULL) { > cached = 0; > - err = read_delta_data(&delta_buf, &delta_len, > + err = read_delta_data(&delta_buf, &delta_len, NULL, > delta->data_offset, pack); > if (err) > goto done; > @@ -1601,20 +1611,76 @@ got_packfile_extract_object_to_mem(uint8_t **buf, size > return err; > } > > +static const struct got_error * > +read_raw_delta_data(uint8_t **delta_buf, size_t *delta_len, > + size_t *delta_len_compressed, uint64_t *base_size, uint64_t *result_size, > + off_t delta_data_offset, struct got_pack *pack, struct got_packidx *packidx) > +{ > + const struct got_error *err = NULL; > + > + /* Validate decompression and obtain the decompressed size. */ > + err = read_delta_data(delta_buf, delta_len, delta_len_compressed, > + delta_data_offset, pack); > + if (err) > + return err; > + > + /* Read delta base/result sizes from head of delta stream. */ > + err = got_delta_get_sizes(base_size, result_size, > + *delta_buf, *delta_len); > + if (err) > + goto done; > + > + /* Discard decompressed delta and read it again in compressed form. */ > + free(*delta_buf); > + *delta_buf = malloc(*delta_len_compressed); > + if (*delta_buf == NULL) > + return got_error_from_errno("malloc"); goto done? probably not important, I expect callers to check the error before looking at the returned data, but the other error conditions don't return directly.
process deltas in compressed form