"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
Re: opaquify sorted-ids (second try)
To:
gameoftrees@openbsd.org
Date:
Fri, 12 Jul 2024 17:22:50 +0200

Download raw body.

Thread
Omar Polo <op@omarpolo.com> wrote:
> It's very very similar to the previous diff, maybe just slightly
> prettier in a couple of nits but otherwise a pretty ugly diff.
> packfiles (v2) in sha256 repos have larger digests and we have to handle
> both sha1 and sha256.

Forgot a couple of things in the diff, here's a better version.

commit dd96e029a8f0911725a77e589c7b86b2969bc651 (main)
from: Omar Polo <op@omarpolo.com>
date: Fri Jul 12 15:20:20 2024 UTC
 
 opaquify sorted_ids
 
diff 243943f63dac33bd84f18ae5c81cff8f79d4b29f dd96e029a8f0911725a77e589c7b86b2969bc651
commit - 243943f63dac33bd84f18ae5c81cff8f79d4b29f
commit + dd96e029a8f0911725a77e589c7b86b2969bc651
blob - acdbcf213622afbf5553af4a03676be3862379b8
blob + d29cab547a3e6b7153c70dab10764138ee97b990
--- lib/got_lib_pack.h
+++ lib/got_lib_pack.h
@@ -68,10 +68,6 @@ struct got_packidx_trailer {
 	u_int8_t	packidx_sha1[SHA1_DIGEST_LENGTH];
 } __attribute__((__packed__));
 
-struct got_packidx_object_id {
-	u_int8_t sha1[SHA1_DIGEST_LENGTH];
-} __attribute__((__packed__));
-
 /* Ignore pack index version 1 which is no longer written by Git. */
 #define GOT_PACKIDX_VERSION 2
 
@@ -91,8 +87,8 @@ struct got_packidx_v2_hdr {
 	uint32_t	*fanout_table;	/* values are big endian */
 #define GOT_PACKIDX_V2_FANOUT_TABLE_ITEMS (0xff + 1)
 
-	/* Sorted SHA1 checksums for each object in the pack file. */
-	struct got_packidx_object_id *sorted_ids;
+	/* Sorted checksums for each object in the pack file. */
+	uint8_t *sorted_ids;
 
 	/* CRC32 of the packed representation of each object. */
 	uint32_t	*crc32;
blob - 5aabc43dd66acd9a5cb6ffe80627cc59ca257fbf
blob + aba3538bb59276a61594f385c2b5c61f44fde700
--- lib/pack.c
+++ lib/pack.c
@@ -188,8 +188,7 @@ got_packidx_init_hdr(struct got_packidx *p, int verify
 		goto done;
 	}
 	if (p->map)
-		h->sorted_ids =
-		    (struct got_packidx_object_id *)((uint8_t*)(p->map + offset));
+		h->sorted_ids = p->map + offset;
 	else {
 		h->sorted_ids = malloc(len_ids);
 		if (h->sorted_ids == NULL) {
@@ -495,17 +494,18 @@ got_packidx_get_object_idx(struct got_packidx *packidx
 	u_int8_t id0 = id->sha1[0];
 	uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
 	int left = 0, right = totobj - 1;
+	size_t digest_len = got_hash_digest_length(packidx->algo);
 
 	if (id0 > 0)
 		left = be32toh(packidx->hdr.fanout_table[id0 - 1]);
 
 	while (left <= right) {
-		struct got_packidx_object_id *oid;
+		uint8_t *oid;
 		int i, cmp;
 
 		i = ((left + right) / 2);
-		oid = &packidx->hdr.sorted_ids[i];
-		cmp = memcmp(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH);
+		oid = packidx->hdr.sorted_ids + i * digest_len;
+		cmp = memcmp(id->sha1, oid, digest_len);
 		if (cmp == 0)
 			return i;
 		else if (cmp > 0)
@@ -655,13 +655,14 @@ got_packidx_get_object_id(struct got_object_id *id,
     struct got_packidx *packidx, int idx)
 {
 	uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
-	struct got_packidx_object_id *oid;
+	uint8_t *oid;
+	size_t digest_len = got_hash_digest_length(packidx->algo);
 
 	if (idx < 0 || idx >= totobj)
 		return got_error(GOT_ERR_NO_OBJ);
 
-	oid = &packidx->hdr.sorted_ids[idx];
-	memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH);
+	oid = packidx->hdr.sorted_ids + idx * digest_len;
+	memcpy(id->sha1, oid, digest_len);
 	return NULL;
 }
 
@@ -674,8 +675,9 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
 	uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
 	char hex[3];
 	size_t prefix_len = strlen(id_str_prefix);
-	struct got_packidx_object_id *oid;
+	uint8_t *oid;
 	uint32_t i = 0;
+	size_t digest_len = got_hash_digest_length(packidx->algo);
 
 	if (prefix_len < 2)
 		return got_error_path(id_str_prefix, GOT_ERR_BAD_OBJ_ID_STR);
@@ -688,18 +690,18 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
 
 	if (id0 > 0)
 		i = be32toh(packidx->hdr.fanout_table[id0 - 1]);
-	oid = &packidx->hdr.sorted_ids[i];
-	while (i < totobj && oid->sha1[0] == id0) {
+	oid = packidx->hdr.sorted_ids + i * digest_len;
+	while (i < totobj && oid[0] == id0) {
 		char id_str[SHA1_DIGEST_STRING_LENGTH];
 		struct got_object_qid *qid;
 		int cmp;
 
-		if (!got_sha1_digest_to_str(oid->sha1, id_str, sizeof(id_str)))
+		if (!got_sha1_digest_to_str(oid, id_str, sizeof(id_str)))
 			return got_error(GOT_ERR_NO_SPACE);
 
 		cmp = strncmp(id_str, id_str_prefix, prefix_len);
 		if (cmp < 0) {
-			oid = &packidx->hdr.sorted_ids[++i];
+			oid = packidx->hdr.sorted_ids + (++i) * digest_len;
 			continue;
 		} else if (cmp > 0)
 			break;
@@ -707,10 +709,10 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
 		err = got_object_qid_alloc_partial(&qid);
 		if (err)
 			return err;
-		memcpy(qid->id.sha1, oid->sha1, SHA1_DIGEST_LENGTH);
+		memcpy(qid->id.sha1, oid, digest_len);
 		STAILQ_INSERT_TAIL(matched_ids, qid, entry);
 
-		oid = &packidx->hdr.sorted_ids[++i];
+		oid = packidx->hdr.sorted_ids + (++i) * digest_len;
 	}
 
 	return NULL;
blob - 2a64424df931938a52b2dbc73af82948e7aec145
blob + bdf8d2fa044c36ef8e599957fb912fc761b444fb
--- lib/pack_index.c
+++ lib/pack_index.c
@@ -445,17 +445,18 @@ find_object_idx(struct got_packidx *packidx, uint8_t *
 	uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
 	int left = 0, right = nindexed - 1;
 	int cmp = 0, i = 0;
+	size_t digest_len = got_hash_digest_length(packidx->algo);
 
 	if (id0 > 0)
 		left = be32toh(packidx->hdr.fanout_table[id0 - 1]);
 
 	while (left <= right) {
-		struct got_packidx_object_id *oid;
+		uint8_t *oid;
 
 		i = ((left + right) / 2);
-		oid = &packidx->hdr.sorted_ids[i];
+		oid = packidx->hdr.sorted_ids + i * digest_len;
 
-		cmp = memcmp(sha1, oid->sha1, SHA1_DIGEST_LENGTH);
+		cmp = memcmp(sha1, oid, digest_len);
 		if (cmp == 0)
 			return -1; /* object already indexed */
 		else if (cmp > 0)
@@ -472,12 +473,13 @@ static void
 print_packidx(struct got_packidx *packidx)
 {
 	uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
+	size_t digest_len = got_hash_digest_length(packidx->algo);
 	int i;
 
 	fprintf(stderr, "object IDs:\n");
 	for (i = 0; i < nindexed; i++) {
 		char hex[SHA1_DIGEST_STRING_LENGTH];
-		got_sha1_digest_to_str(packidx->hdr.sorted_ids[i].sha1,
+		got_sha1_digest_to_str(packidx->hdr.sorted_ids + i * digest_len,
 		    hex, sizeof(hex));
 		fprintf(stderr, "%s\n", hex);
 	}
@@ -507,9 +509,11 @@ add_indexed_object(struct got_packidx *packidx, uint32
     struct got_indexed_object *obj)
 {
 	int i;
+	uint8_t *oid;
+	size_t digest_len = got_hash_digest_length(packidx->algo);
 
-	memcpy(packidx->hdr.sorted_ids[idx].sha1, obj->id.sha1,
-	    SHA1_DIGEST_LENGTH);
+	oid = packidx->hdr.sorted_ids + idx * digest_len;
+	memcpy(oid, obj->id.sha1, digest_len);
 	packidx->hdr.crc32[idx] = htobe32(obj->crc);
 	if (obj->off < GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX)
 		packidx->hdr.offsets[idx] = htobe32(obj->off);
@@ -565,14 +569,16 @@ update_packidx(struct got_packidx *packidx, uint32_t n
 {
 	int idx;
 	uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
+	size_t digest_len = got_hash_digest_length(packidx->algo);
+	uint8_t *from, *to;
 
 	idx = find_object_idx(packidx, obj->id.sha1);
 	if (idx == -1)
 		return; /* object already indexed */
 
-	memmove(&packidx->hdr.sorted_ids[idx + 1],
-	    &packidx->hdr.sorted_ids[idx],
-	    sizeof(struct got_packidx_object_id) * (nindexed - idx));
+	from = packidx->hdr.sorted_ids + idx * digest_len;
+	to = from + digest_len;
+	memmove(to, from, digest_len * (nindexed - idx));
 	memmove(&packidx->hdr.offsets[idx + 1], &packidx->hdr.offsets[idx],
 	    sizeof(uint32_t) * (nindexed - idx));
 
@@ -674,7 +680,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE 
 		goto done;
 	}
 	packidx.hdr.sorted_ids = calloc(nobj,
-	    sizeof(struct got_packidx_object_id));
+	    got_hash_digest_length(pack->algo));
 	if (packidx.hdr.sorted_ids == NULL) {
 		err = got_error_from_errno("calloc");
 		goto done;
blob - adbe1b7e6e30b6571e3c7c1324bf23fbe0132c21
blob + 37bc0f5633b6a62f7d4a4d4b044b9301b502b0bd
--- lib/repository.c
+++ lib/repository.c
@@ -1236,8 +1236,10 @@ add_packidx_bloom_filter(struct got_repository *repo,
 {
 	int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
 	struct got_packidx_bloom_filter *bf;
-	size_t len;
+	size_t len, digest_len;
 
+	digest_len = got_hash_digest_length(repo->algo);
+
 	/*
 	 * Don't use bloom filters for very large pack index files.
 	 * Large pack files will contain a relatively large fraction
@@ -1274,9 +1276,8 @@ add_packidx_bloom_filter(struct got_repository *repo,
 	/* Minimum size supported by our bloom filter is 1000 entries. */
 	bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1);
 	for (i = 0; i < nobjects; i++) {
-		struct got_packidx_object_id *id;
-		id = &packidx->hdr.sorted_ids[i];
-		bloom_add(bf->bloom, id->sha1, sizeof(id->sha1));
+		uint8_t *id = packidx->hdr.sorted_ids + i * digest_len;
+		bloom_add(bf->bloom, id, digest_len);
 	}
 
 	RB_INSERT(got_packidx_bloom_filter_tree,
blob - cf40a58dbaf2381f06e5e93fe1cdf138e1a72a9c
blob + 3e6484604a0fd85e52307e35003a601798197c14
--- lib/repository_admin.c
+++ lib/repository_admin.c
@@ -532,6 +532,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
 	struct got_packidx *packidx = NULL;
 	struct got_pack *pack = NULL;
 	uint32_t nobj, i;
+	size_t digest_len = got_hash_digest_length(repo->algo);
 
 	err = got_object_id_str(&id_str, pack_hash);
 	if (err)
@@ -559,7 +560,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
 
 	nobj = be32toh(packidx->hdr.fanout_table[0xff]);
 	for (i = 0; i < nobj; i++) {
-		struct got_packidx_object_id *oid;
+		uint8_t *oid;
 		struct got_object_id id, base_id;
 		off_t offset, base_offset = 0;
 		uint8_t type;
@@ -571,8 +572,8 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
 			if (err)
 				break;
 		}
-		oid = &packidx->hdr.sorted_ids[i];
-		memcpy(id.sha1, oid->sha1, SHA1_DIGEST_LENGTH);
+		oid = packidx->hdr.sorted_ids + i * digest_len;
+		memcpy(id.sha1, oid, digest_len);
 
 		offset = got_packidx_get_object_offset(packidx, i);
 		if (offset == -1) {
@@ -1270,9 +1271,10 @@ pack_is_redundant(int *redundant, struct got_repositor
 {
 	const struct got_error *err;
 	struct got_packidx *packidx;
-	struct got_packidx_object_id *pid;
+	uint8_t *pid;
 	struct got_object_id id;
 	size_t i, nobjects;
+	size_t digest_len = got_hash_digest_length(repo->algo);
 
 	*redundant = 1;
 
@@ -1282,10 +1284,10 @@ pack_is_redundant(int *redundant, struct got_repositor
 
 	nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
 	for (i = 0; i < nobjects; ++i) {
-		pid = &packidx->hdr.sorted_ids[i];
+		pid = packidx->hdr.sorted_ids + i * digest_len;
 
 		memset(&id, 0, sizeof(id));
-		memcpy(&id.sha1, pid->sha1, sizeof(id.sha1));
+		memcpy(&id.sha1, pid, digest_len);
 
 		if (got_object_idset_contains(idset, &id))
 			continue;