"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
compress delta data from delta_cache directly into pack file
To:
gameoftrees@openbsd.org
Date:
Tue, 18 Jan 2022 12:27:06 +0100

Download raw body.

Thread
While creating pack files in genpack() we currently copy each delta
from the large "delta cache" file into a separate temporary file,
and then copy and compress this temporary file into the pack file.

This patch removes the need for an intermediate temporary file by
allowing got_deflate_to_file() to read just a part of its source file,
instead of always compressing its entire source file. We can now pass
delta data from the delta_cache file directly into this function.

This change has some no-op ripple effects into other code which
calls into deflate.c, such as the commit logic. This code now needs
to calculate the size of the object file it creates, which is cheap.

While here, also fix the type of the output length parameters of
got_deflate_to_file() and got_deflate_to_file_mmap(): File sizes should
be traded in off_t, not size_t (size_t is used for memory buffer sizes).

Tests are passing, and git fsck reports no issues on commits or pack
files created with this new code.

ok?
 
diff 6b447a6decf6cde0663a251e1d400761580d7568 c3418ddcdf460457c19c4d288a9a0029450d50c6
blob - 52c3d2577c7b0308e62c7ff466152e61a38f3c50
blob + edb6ba2b65cdfd7ffdfd37b841e66e0d617db5e3
--- lib/deflate.c
+++ lib/deflate.c
@@ -90,7 +90,8 @@ csum_output(struct got_deflate_checksum *csum, const u
 }
 
 const struct got_error *
-got_deflate_read(struct got_deflate_buf *zb, FILE *f, size_t *outlenp)
+got_deflate_read(struct got_deflate_buf *zb, FILE *f, off_t len,
+    size_t *outlenp, off_t *consumed)
 {
 	size_t last_total_out = zb->z.total_out;
 	z_stream *z = &zb->z;
@@ -100,9 +101,13 @@ got_deflate_read(struct got_deflate_buf *zb, FILE *f, 
 	z->avail_out = zb->outlen;
 
 	*outlenp = 0;
+	*consumed = 0;
 	do {
+		size_t last_total_in = z->total_in;
 		if (z->avail_in == 0) {
-			size_t n = fread(zb->inbuf, 1, zb->inlen, f);
+			size_t n = 0;
+			if (len > 0)
+				n = fread(zb->inbuf, 1, MIN(zb->inlen, len), f);
 			if (n == 0) {
 				if (ferror(f))
 					return got_ferror(f, GOT_ERR_IO);
@@ -114,6 +119,7 @@ got_deflate_read(struct got_deflate_buf *zb, FILE *f, 
 			z->avail_in = n;
 		}
 		ret = deflate(z, Z_NO_FLUSH);
+		*consumed += z->total_in - last_total_in;
 	} while (ret == Z_OK && z->avail_out > 0);
 
 	if (ret == Z_OK) {
@@ -178,11 +184,12 @@ got_deflate_end(struct got_deflate_buf *zb)
 }
 
 const struct got_error *
-got_deflate_to_file(size_t *outlen, FILE *infile, FILE *outfile,
-    struct got_deflate_checksum *csum)
+got_deflate_to_file(off_t *outlen, FILE *infile, off_t len,
+    FILE *outfile, struct got_deflate_checksum *csum)
 {
 	const struct got_error *err;
 	size_t avail;
+	off_t consumed;
 	struct got_deflate_buf zb;
 
 	err = got_deflate_init(&zb, NULL, GOT_DEFLATE_BUFSIZE);
@@ -192,9 +199,10 @@ got_deflate_to_file(size_t *outlen, FILE *infile, FILE
 	*outlen = 0;
 
 	do {
-		err = got_deflate_read(&zb, infile, &avail);
+		err = got_deflate_read(&zb, infile, len, &avail, &consumed);
 		if (err)
 			goto done;
+		len -= consumed;
 		if (avail > 0) {
 			size_t n;
 			n = fwrite(zb.outbuf, avail, 1, outfile);
@@ -214,7 +222,7 @@ done:
 }
 
 const struct got_error *
-got_deflate_to_file_mmap(size_t *outlen, uint8_t *map, size_t offset,
+got_deflate_to_file_mmap(off_t *outlen, uint8_t *map, size_t offset,
     size_t len, FILE *outfile, struct got_deflate_checksum *csum)
 {
 	const struct got_error *err;
blob - c0b4fe271d48124cc97b135feff4b79f951c6720
blob + 1c429af85a27229451e05798e9511d0a8f474968
--- lib/got_lib_deflate.h
+++ lib/got_lib_deflate.h
@@ -38,9 +38,9 @@ struct got_deflate_buf {
 const struct got_error *got_deflate_init(struct got_deflate_buf *, uint8_t *,
     size_t);
 const struct got_error *got_deflate_read(struct got_deflate_buf *, FILE *,
-    size_t *);
+    off_t, size_t *, off_t *);
 void got_deflate_end(struct got_deflate_buf *);
-const struct got_error *got_deflate_to_file(size_t *, FILE *, FILE *,
+const struct got_error *got_deflate_to_file(off_t *, FILE *, off_t, FILE *,
     struct got_deflate_checksum *);
-const struct got_error *got_deflate_to_file_mmap(size_t *, uint8_t *,
+const struct got_error *got_deflate_to_file_mmap(off_t *, uint8_t *,
     size_t, size_t, FILE *, struct got_deflate_checksum *);
blob - 67676602467844816045891f0532476d0e14c799
blob + 1d6840ec3d3bc9318e74417e89f1c1056a322dd0
--- lib/got_lib_object_create.h
+++ lib/got_lib_object_create.h
@@ -15,7 +15,7 @@
  */
 
 const struct got_error *got_object_blob_file_create(struct got_object_id **,
-    FILE **, const char *);
+    FILE **, off_t *, const char *);
 const struct got_error *got_object_blob_create(struct got_object_id **,
     const char *, struct got_repository *);
 
blob - e5e4c44c8a0327211b0094775682f2bed8ea42a8
blob + 84740e1bdc8bd31fb2394900a4795c7b36dc6234
--- lib/object_create.c
+++ lib/object_create.c
@@ -49,13 +49,13 @@
 
 static const struct got_error *
 create_object_file(struct got_object_id *id, FILE *content,
-    struct got_repository *repo)
+    off_t content_len, struct got_repository *repo)
 {
 	const struct got_error *err = NULL, *unlock_err = NULL;
 	char *objpath = NULL, *tmppath = NULL;
 	FILE *tmpfile = NULL;
 	struct got_lockfile *lf = NULL;
-	size_t tmplen = 0;
+	off_t tmplen = 0;
 
 	err = got_object_get_path(&objpath, id, repo);
 	if (err)
@@ -83,7 +83,7 @@ create_object_file(struct got_object_id *id, FILE *con
 		goto done;
 	}
 
-	err = got_deflate_to_file(&tmplen, content, tmpfile, NULL);
+	err = got_deflate_to_file(&tmplen, content, content_len, tmpfile, NULL);
 	if (err)
 		goto done;
 
@@ -113,7 +113,7 @@ done:
 
 const struct got_error *
 got_object_blob_file_create(struct got_object_id **id, FILE **blobfile,
-    const char *ondisk_path)
+    off_t *blobsize, const char *ondisk_path)
 {
 	const struct got_error *err = NULL;
 	char *header = NULL;
@@ -124,6 +124,7 @@ got_object_blob_file_create(struct got_object_id **id,
 
 	*id = NULL;
 	*blobfile = NULL;
+	*blobsize = 0;
 
 	SHA1Init(&sha1_ctx);
 
@@ -160,6 +161,7 @@ got_object_blob_file_create(struct got_object_id **id,
 		err = got_ferror(*blobfile, GOT_ERR_IO);
 		goto done;
 	}
+	*blobsize += headerlen;
 	for (;;) {
 		char buf[PATH_MAX * 8];
 		ssize_t inlen;
@@ -185,6 +187,7 @@ got_object_blob_file_create(struct got_object_id **id,
 			err = got_ferror(*blobfile, GOT_ERR_IO);
 			goto done;
 		}
+		*blobsize += n;
 		if (S_ISLNK(sb.st_mode))
 			break;
 	}
@@ -222,12 +225,14 @@ got_object_blob_create(struct got_object_id **id, cons
 {
 	const struct got_error *err = NULL;
 	FILE *blobfile = NULL;
+	off_t blobsize;
 
-	err = got_object_blob_file_create(id, &blobfile, ondisk_path);
+	err = got_object_blob_file_create(id, &blobfile, &blobsize,
+	    ondisk_path);
 	if (err)
 		return err;
 
-	err = create_object_file(*id, blobfile, repo);
+	err = create_object_file(*id, blobfile, blobsize, repo);
 	if (fclose(blobfile) == EOF && err == NULL)
 		err = got_error_from_errno("fclose");
 	if (err) {
@@ -299,6 +304,7 @@ got_object_tree_create(struct got_object_id **id,
 	char *header = NULL;
 	size_t headerlen, len = 0, n;
 	FILE *treefile = NULL;
+	off_t treesize = 0;
 	struct got_pathlist_entry *pe;
 	struct got_tree_entry **sorted_entries;
 	struct got_tree_entry *te;
@@ -345,6 +351,7 @@ got_object_tree_create(struct got_object_id **id,
 		err = got_ferror(treefile, GOT_ERR_IO);
 		goto done;
 	}
+	treesize += headerlen;
 
 	for (i = 0; i < nentries; i++) {
 		te = sorted_entries[i];
@@ -358,6 +365,7 @@ got_object_tree_create(struct got_object_id **id,
 			goto done;
 		}
 		SHA1Update(&sha1_ctx, modebuf, len);
+		treesize += len;
 
 		len = strlen(te->name) + 1; /* must include NUL */
 		n = fwrite(te->name, 1, len, treefile);
@@ -366,6 +374,7 @@ got_object_tree_create(struct got_object_id **id,
 			goto done;
 		}
 		SHA1Update(&sha1_ctx, te->name, len);
+		treesize += len;
 
 		len = SHA1_DIGEST_LENGTH;
 		n = fwrite(te->id.sha1, 1, len, treefile);
@@ -374,6 +383,7 @@ got_object_tree_create(struct got_object_id **id,
 			goto done;
 		}
 		SHA1Update(&sha1_ctx, te->id.sha1, len);
+		treesize += len;
 	}
 
 	*id = malloc(sizeof(**id));
@@ -389,7 +399,7 @@ got_object_tree_create(struct got_object_id **id,
 	}
 	rewind(treefile);
 
-	err = create_object_file(*id, treefile, repo);
+	err = create_object_file(*id, treefile, treesize, repo);
 done:
 	free(header);
 	free(sorted_entries);
@@ -416,6 +426,7 @@ got_object_commit_create(struct got_object_id **id,
 	char *id_str = NULL;
 	size_t headerlen, len = 0, n;
 	FILE *commitfile = NULL;
+	off_t commitsize = 0;
 	struct got_object_qid *qid;
 	char *msg0, *msg;
 
@@ -471,6 +482,7 @@ got_object_commit_create(struct got_object_id **id,
 		err = got_ferror(commitfile, GOT_ERR_IO);
 		goto done;
 	}
+	commitsize += headerlen;
 
 	err = got_object_id_str(&id_str, tree_id);
 	if (err)
@@ -567,7 +579,7 @@ got_object_commit_create(struct got_object_id **id,
 	}
 	rewind(commitfile);
 
-	err = create_object_file(*id, commitfile, repo);
+	err = create_object_file(*id, commitfile, commitsize, repo);
 done:
 	free(id_str);
 	free(msg0);
@@ -596,6 +608,7 @@ got_object_tag_create(struct got_object_id **id,
 	char *id_str = NULL, *obj_str = NULL, *type_str = NULL;
 	size_t headerlen, len = 0, n;
 	FILE *tagfile = NULL;
+	off_t tagsize = 0;
 	char *msg0 = NULL, *msg;
 	const char *obj_type_str;
 	int obj_type;
@@ -681,6 +694,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += headerlen;
 	len = strlen(obj_str);
 	SHA1Update(&sha1_ctx, obj_str, len);
 	n = fwrite(obj_str, 1, len, tagfile);
@@ -688,6 +702,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += len;
 	len = strlen(type_str);
 	SHA1Update(&sha1_ctx, type_str, len);
 	n = fwrite(type_str, 1, len, tagfile);
@@ -695,6 +710,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += len;
 
 	len = strlen(tag_str);
 	SHA1Update(&sha1_ctx, tag_str, len);
@@ -703,6 +719,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += len;
 
 	len = strlen(tagger_str);
 	SHA1Update(&sha1_ctx, tagger_str, len);
@@ -711,6 +728,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += len;
 
 	SHA1Update(&sha1_ctx, "\n", 1);
 	n = fwrite("\n", 1, 1, tagfile);
@@ -718,6 +736,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += n;
 
 	len = strlen(msg);
 	SHA1Update(&sha1_ctx, msg, len);
@@ -726,6 +745,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += len;
 
 	SHA1Update(&sha1_ctx, "\n", 1);
 	n = fwrite("\n", 1, 1, tagfile);
@@ -733,6 +753,7 @@ got_object_tag_create(struct got_object_id **id,
 		err = got_ferror(tagfile, GOT_ERR_IO);
 		goto done;
 	}
+	tagsize += n;
 
 	*id = malloc(sizeof(**id));
 	if (*id == NULL) {
@@ -747,7 +768,7 @@ got_object_tag_create(struct got_object_id **id,
 	}
 	rewind(tagfile);
 
-	err = create_object_file(*id, tagfile, repo);
+	err = create_object_file(*id, tagfile, tagsize, repo);
 done:
 	free(msg0);
 	free(header);
blob - 8a3d8a8fd7f7eb29a2707c38e554ff6a37b63685
blob + f4c9f4cd1e5f2d7ec2bb640a2745e9ef11b8f29a
--- lib/pack_create.c
+++ lib/pack_create.c
@@ -1385,11 +1385,10 @@ genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delt
 	SHA1_CTX ctx;
 	struct got_pack_meta *m;
 	struct got_raw_object *raw = NULL;
-	FILE *delta_file = NULL;
 	char buf[32];
-	size_t outlen, n;
+	size_t n;
 	struct got_deflate_checksum csum;
-	off_t packfile_size = 0;
+	off_t outlen, packfile_size = 0;
 	int outfd = -1;
 
 	SHA1Init(&ctx);
@@ -1440,7 +1439,7 @@ genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delt
 					goto done;
 				}
 				err = got_deflate_to_file(&outlen, raw->f,
-				    packfile, &csum);
+				    raw->size, packfile, &csum);
 				if (err)
 					goto done;
 			}
@@ -1460,57 +1459,17 @@ genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delt
 			free(m->delta_buf);
 			m->delta_buf = NULL;
 		} else {
-			off_t remain;
-			if (delta_file == NULL) {
-				delta_file = got_opentemp();
-				if (delta_file == NULL) {
-					err = got_error_from_errno(
-					    "got_opentemp");
-					goto done;
-				}
-			}
-			if (ftruncate(fileno(delta_file), 0L) == -1) {
-				err = got_error_from_errno("ftruncate");
-				goto done;
-			}
-			if (fseeko(delta_file, 0L, SEEK_SET) == -1) {
-				err = got_error_from_errno("fseeko");
-				goto done;
-			}
 			if (fseeko(delta_cache, m->delta_offset, SEEK_SET)
 			    == -1) {
 				err = got_error_from_errno("fseeko");
 				goto done;
 			}
-			remain = m->delta_len;
-			while (remain > 0) {
-				char delta_buf[8192];
-				size_t r, w, n;
-				n = MIN(remain, sizeof(delta_buf));
-				r = fread(delta_buf, 1, n, delta_cache);
-				if (r != n) {
-					err = got_ferror(delta_cache,
-					    GOT_ERR_IO);
-					goto done;
-				}
-				w = fwrite(delta_buf, 1, n, delta_file);
-				if (w != n) {
-					err = got_ferror(delta_file,
-					    GOT_ERR_IO);
-					goto done;
-				}
-				remain -= n;
-			}
 			err = deltahdr(&packfile_size, &ctx, packfile,
 			    m, use_offset_deltas);
 			if (err)
 				goto done;
-			if (fseeko(delta_file, 0L, SEEK_SET) == -1) {
-				err = got_error_from_errno("fseeko");
-				goto done;
-			}
-			err = got_deflate_to_file(&outlen, delta_file,
-			    packfile, &csum);
+			err = got_deflate_to_file(&outlen, delta_cache,
+			    m->delta_len, packfile, &csum);
 			if (err)
 				goto done;
 			packfile_size += outlen;
@@ -1529,8 +1488,6 @@ genpack(uint8_t *pack_sha1, FILE *packfile, FILE *delt
 			goto done;
 	}
 done:
-	if (delta_file && fclose(delta_file) == EOF && err == NULL)
-		err = got_error_from_errno("fclose");
 	if (raw)
 		got_object_raw_close(raw);
 	if (outfd != -1 && close(outfd) == -1 && err == NULL)
blob - dba36bee48eede3aca73746d5501b4565bbddf64
blob + 0f78ee740bd078fd66df140aa19f2527ef430e0f
--- lib/worktree.c
+++ lib/worktree.c
@@ -2884,11 +2884,13 @@ merge_file_cb(void *arg, struct got_blob_object *blob1
 		case GOT_STATUS_ADD: {
 			struct got_object_id *id;
 			FILE *blob1_f;
+			off_t blob1_size;
 			/*
 			 * Delete the added file only if its content already
 			 * exists in the repository.
 			 */
-			err = got_object_blob_file_create(&id, &blob1_f, path1);
+			err = got_object_blob_file_create(&id, &blob1_f,
+			    &blob1_size, path1);
 			if (err)
 				goto done;
 			if (got_object_id_cmp(id, id1) == 0) {