From: Omar Polo <op@omarpolo.com>
Subject: Re: towards sha256 pack files: opaquify sorted_ids
To: Omar Polo <op@omarpolo.com>
Cc: Stefan Sperling <stsp@stsp.name>, gameoftrees@openbsd.org
Date: Fri, 31 Mar 2023 16:14:17 +0200

ping :)

On 2023/03/08 15:38:40 +0100, Omar Polo <op@omarpolo.com> wrote:
> On 2023/03/08 14:44:42 +0100, Stefan Sperling <stsp@stsp.name> wrote:
> > On Wed, Mar 01, 2023 at 07:40:24PM +0100, Omar Polo wrote:
> > > Hello,
> > > 
> > > This is a first step to make handle sha256 object ids in pack files.
> > > It is also the messiest and the one I'm less sure about.
> > > 
> > > The `sorted_ids' field is an "array" of 20-bytes SHA1 digests
> > > currently, but in sha256 repos it is made of 40-bytes SHA256 digests.
> > > 
> > > I'm trying to make it "opaque" and providing a macro for lower-level
> > > code to access it and retiring the struct got_packidx_object_id that
> > > is now completely unused.  I'm still not completely sold on the way
> > > I've handled it, but it's the best idea I could come up with.
> > 
> > Why is GOT_PACKIDX_GET_OBJECT a macro while the other helpers you've
> > added are inline functions?
> > I don't see a reason why GOT_PACKIDX_GET_OBJECT could not be an inline
> > function as well, or even just a regular function, which would return
> > a void * the caller can cast if needed, or even a struct got_object_id *.
> > This would add type checking of the arguments and also make things easier
> > to debug in gdb.
> 
> mostly to avoid polluting other files.  making it static inline would
> require to include got_hash.h in a few files that technically don't
> neede it.  A poor reason actually...
> 
> > Since this macro/function returns an object ID wouldn't a better name be
> > GOT_PACKIDX_GET_OBJECT_ID/got_packidx_get_object_id?
> 
> because got_packidx_get_object_id already exists ^^"
> 
> The idea was to replace the packidx->hdr.sorted_ids[N] with a
> similarly cheap macro call since we have a few places where we loop or
> bisect over the sorted_ids and the cost of the bounds and error check
> of got_packidx_get_object_id could be heard.
> 
> Actually, I fallen in a cosmetic trap, here's an updated diff with the
> macro expanded and it's not bad...
> 
> (regress is still running but i don't expect failures from this, it's
> essentially the same thing as the previous diff.)
> 
> diff refs/heads/main refs/heads/w
> commit - cee3836880940ba0d1203e0682a43a680132bc27
> commit + bacd69bfabab060f21f9c0f5405a25c8649584e4
> blob - 1405a67ac335af59f8e1c2f04ac566ecd07436e7
> blob + 5c5eda14f6f68eb22642b1b51f17e933a403026c
> --- lib/got_lib_hash.h
> +++ lib/got_lib_hash.h
> @@ -28,6 +28,32 @@ struct got_hash {
>  int got_parse_object_id(struct got_object_id *, const char *,
>      enum got_hash_algorithm);
>  
> +static inline int
> +got_hash_digest_length(enum got_hash_algorithm algo)
> +{
> +	switch (algo) {
> +	case GOT_HASH_SHA1:
> +		return SHA1_DIGEST_LENGTH;
> +	case GOT_HASH_SHA256:
> +		return SHA256_DIGEST_LENGTH;
> +	default:
> +		return 0;
> +	}
> +}
> +
> +static inline int
> +got_hash_digest_string_length(enum got_hash_algorithm algo)
> +{
> +	switch (algo) {
> +	case GOT_HASH_SHA1:
> +		return SHA1_DIGEST_STRING_LENGTH;
> +	case GOT_HASH_SHA256:
> +		return SHA256_DIGEST_STRING_LENGTH;
> +	default:
> +		return 0;
> +	}
> +}
> +
>  struct got_hash {
>  	SHA1_CTX		 sha1_ctx;
>  	SHA2_CTX		 sha256_ctx;
> blob - 188c28e9e67bdc37146176e911d6226b223c40d2
> blob + 624957e3cdf40254ef2a42563ef7c7b07d90e7b6
> --- lib/got_lib_pack.h
> +++ lib/got_lib_pack.h
> @@ -67,10 +67,6 @@ struct got_packidx_object_id {
>  	u_int8_t	packidx_sha1[SHA1_DIGEST_LENGTH];
>  } __attribute__((__packed__));
>  
> -struct got_packidx_object_id {
> -	u_int8_t sha1[SHA1_DIGEST_LENGTH];
> -} __attribute__((__packed__));
> -
>  /* Ignore pack index version 1 which is no longer written by Git. */
>  #define GOT_PACKIDX_VERSION 2
>  
> @@ -90,8 +86,11 @@ struct got_packidx_v2_hdr {
>  	uint32_t	*fanout_table;	/* values are big endian */
>  #define GOT_PACKIDX_V2_FANOUT_TABLE_ITEMS (0xff + 1)
>  
> -	/* Sorted SHA1 checksums for each object in the pack file. */
> -	struct got_packidx_object_id *sorted_ids;
> +	/*
> +	 * Sorted hash digest for each object in the pack file.
> +	 * Exact size depends on the repository object format.
> +	 */
> +	void *sorted_ids;
>  
>  	/* CRC32 of the packed representation of each object. */
>  	uint32_t	*crc32;
> blob - af03219f49ee499a7c4d1a39e802bc0773762f06
> blob + 034297c396e60c3125397d5036bfe4e252fcfdf9
> --- lib/pack.c
> +++ lib/pack.c
> @@ -182,14 +182,13 @@ got_packidx_init_hdr(struct got_packidx *p, int verify
>  	remain -= len_fanout;
>  
>  	nobj = be32toh(h->fanout_table[0xff]);
> -	len_ids = nobj * sizeof(*h->sorted_ids);
> +	len_ids = nobj * got_hash_digest_length(algo);
>  	if (len_ids <= nobj || len_ids > remain) {
>  		err = got_error(GOT_ERR_BAD_PACKIDX);
>  		goto done;
>  	}
>  	if (p->map)
> -		h->sorted_ids =
> -		    (struct got_packidx_object_id *)((uint8_t*)(p->map + offset));
> +		h->sorted_ids = p->map + offset;
>  	else {
>  		h->sorted_ids = malloc(len_ids);
>  		if (h->sorted_ids == NULL) {
> @@ -493,17 +492,19 @@ got_packidx_get_object_idx(struct got_packidx *packidx
>  	u_int8_t id0 = id->sha1[0];
>  	uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
>  	int left = 0, right = totobj - 1;
> +	size_t idlen;
>  
>  	if (id0 > 0)
>  		left = be32toh(packidx->hdr.fanout_table[id0 - 1]);
>  
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
>  	while (left <= right) {
> -		struct got_packidx_object_id *oid;
> +		void *oid;
>  		int i, cmp;
>  
>  		i = ((left + right) / 2);
> -		oid = &packidx->hdr.sorted_ids[i];
> -		cmp = memcmp(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> +		oid = packidx->hdr.sorted_ids + i * idlen;
> +		cmp = memcmp(id->sha1, oid, idlen);
>  		if (cmp == 0)
>  			return i;
>  		else if (cmp > 0)
> @@ -653,13 +654,15 @@ got_packidx_get_object_id(struct got_object_id *id,
>      struct got_packidx *packidx, int idx)
>  {
>  	uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
> -	struct got_packidx_object_id *oid;
> +	void *oid;
> +	size_t idlen;
>  
>  	if (idx < 0 || idx >= totobj)
>  		return got_error(GOT_ERR_NO_OBJ);
>  
> -	oid = &packidx->hdr.sorted_ids[idx];
> -	memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
> +	oid = packidx->hdr.sorted_ids + idx * idlen;
> +	memcpy(id->sha1, oid, idlen);
>  	return NULL;
>  }
>  
> @@ -671,8 +674,8 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
>  	u_int8_t id0;
>  	uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
>  	char hex[3];
> -	size_t prefix_len = strlen(id_str_prefix);
> -	struct got_packidx_object_id *oid;
> +	size_t idlen, prefix_len = strlen(id_str_prefix);
> +	uint8_t *oid;
>  	uint32_t i = 0;
>  
>  	STAILQ_INIT(matched_ids);
> @@ -688,18 +691,21 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
>  
>  	if (id0 > 0)
>  		i = be32toh(packidx->hdr.fanout_table[id0 - 1]);
> -	oid = &packidx->hdr.sorted_ids[i];
> -	while (i < totobj && oid->sha1[0] == id0) {
> +
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
> +	oid = packidx->hdr.sorted_ids + i * idlen;
> +	while (i < totobj && oid[0] == id0) {
>  		char id_str[SHA1_DIGEST_STRING_LENGTH];
>  		struct got_object_qid *qid;
>  		int cmp;
>  
> -		if (!got_sha1_digest_to_str(oid->sha1, id_str, sizeof(id_str)))
> +		if (!got_sha1_digest_to_str(oid, id_str, sizeof(id_str)))
>  		        return got_error(GOT_ERR_NO_SPACE);
>  
>  		cmp = strncmp(id_str, id_str_prefix, prefix_len);
>  		if (cmp < 0) {
> -			oid = &packidx->hdr.sorted_ids[++i];
> +			++i;
> +			oid = packidx->hdr.sorted_ids + i * idlen;
>  			continue;
>  		} else if (cmp > 0)
>  			break;
> @@ -707,10 +713,11 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
>  		err = got_object_qid_alloc_partial(&qid);
>  		if (err)
>  			break;
> -		memcpy(qid->id.sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> +		memcpy(qid->id.sha1, oid, idlen);
>  		STAILQ_INSERT_TAIL(matched_ids, qid, entry);
>  
> -		oid = &packidx->hdr.sorted_ids[++i];
> +		++i;
> +		oid = packidx->hdr.sorted_ids + i * idlen;
>  	}
>  
>  	if (err)
> blob - 25889d5c1fc15e6c52fd8f251cc2c9c692d0cbcf
> blob + bbbf6e2645aa162075e265229ade29a55f60d739
> --- lib/pack_index.c
> +++ lib/pack_index.c
> @@ -474,17 +474,18 @@ find_object_idx(struct got_packidx *packidx, uint8_t *
>  	uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
>  	int left = 0, right = nindexed - 1;
>  	int cmp = 0, i = 0;
> +	void *oid;
> +	size_t idlen;
>  
>  	if (id0 > 0)
>  		left = be32toh(packidx->hdr.fanout_table[id0 - 1]);
>  
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
>  	while (left <= right) {
> -		struct got_packidx_object_id *oid;
> -
>  		i = ((left + right) / 2);
> -		oid = &packidx->hdr.sorted_ids[i];
> +		oid = packidx->hdr.sorted_ids + i * idlen;
>  
> -		cmp = memcmp(sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> +		cmp = memcmp(sha1, oid, idlen);
>  		if (cmp == 0)
>  			return -1; /* object already indexed */
>  		else if (cmp > 0)
> @@ -506,8 +507,11 @@ print_packidx(struct got_packidx *packidx)
>  	fprintf(stderr, "object IDs:\n");
>  	for (i = 0; i < nindexed; i++) {
>  		char hex[SHA1_DIGEST_STRING_LENGTH];
> -		got_sha1_digest_to_str(packidx->hdr.sorted_ids[i].sha1,
> -		    hex, sizeof(hex));
> +		void *oid;
> +
> +		oid = packidx->hdr.sorted_ids +
> +		    i * got_hash_digest_length(GOT_HASH_SHA1);
> +		got_sha1_digest_to_str(oid, hex, sizeof(hex));
>  		fprintf(stderr, "%s\n", hex);
>  	}
>  	fprintf(stderr, "\n");
> @@ -535,10 +539,13 @@ add_indexed_object(struct got_packidx *packidx, uint32
>  add_indexed_object(struct got_packidx *packidx, uint32_t idx,
>      struct got_indexed_object *obj)
>  {
> +	void *oid;
>  	int i;
> +	size_t idlen;
>  
> -	memcpy(packidx->hdr.sorted_ids[idx].sha1, obj->id.sha1,
> -	    SHA1_DIGEST_LENGTH);
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
> +	oid = packidx->hdr.sorted_ids + idx * idlen;
> +	memcpy(oid, obj->id.sha1, idlen);
>  	packidx->hdr.crc32[idx] = htobe32(obj->crc);
>  	if (obj->off < GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX)
>  		packidx->hdr.offsets[idx] = htobe32(obj->off);
> @@ -593,15 +600,17 @@ update_packidx(struct got_packidx *packidx, uint32_t n
>      struct got_indexed_object *obj)
>  {
>  	int idx;
> +	size_t idlen;
>  	uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
>  
>  	idx = find_object_idx(packidx, obj->id.sha1);
>  	if (idx == -1)
>  		return; /* object already indexed */
>  
> -	memmove(&packidx->hdr.sorted_ids[idx + 1],
> -	    &packidx->hdr.sorted_ids[idx],
> -	    sizeof(struct got_packidx_object_id) * (nindexed - idx));
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
> +	memmove(packidx->hdr.sorted_ids + (idx + 1) * idlen,
> +	    packidx->hdr.sorted_ids + idx * idlen,
> +	    idlen * (nindexed - idx));
>  	memmove(&packidx->hdr.offsets[idx + 1], &packidx->hdr.offsets[idx],
>  	    sizeof(uint32_t) * (nindexed - idx));
>  
> @@ -703,7 +712,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE 
>  		goto done;
>  	}
>  	packidx.hdr.sorted_ids = calloc(nobj,
> -	    sizeof(struct got_packidx_object_id));
> +	    got_hash_digest_length(GOT_HASH_SHA1));
>  	if (packidx.hdr.sorted_ids == NULL) {
>  		err = got_error_from_errno("calloc");
>  		goto done;
> @@ -946,7 +955,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE 
>  	if (err)
>  		goto done;
>  	err = got_pack_hwrite(idxfd, packidx.hdr.sorted_ids,
> -	    nobj * SHA1_DIGEST_LENGTH, &ctx);
> +	    nobj * got_hash_digest_length(GOT_HASH_SHA1), &ctx);
>  	if (err)
>  		goto done;
>  	err = got_pack_hwrite(idxfd, packidx.hdr.crc32,
> blob - 6a757d85bc1ec107e2faa5b5d8a2f03388bef3d7
> blob + ba606b8d4113d9346332b7b8d98847343cf07967
> --- lib/repository.c
> +++ lib/repository.c
> @@ -1118,7 +1118,7 @@ add_packidx_bloom_filter(struct got_repository *repo,
>  {
>  	int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]);
>  	struct got_packidx_bloom_filter *bf;
> -	size_t len;
> +	size_t len, idlen;
>  
>  	/*
>  	 * Don't use bloom filters for very large pack index files.
> @@ -1153,12 +1153,14 @@ add_packidx_bloom_filter(struct got_repository *repo,
>  	}
>  	bf->path_len = len;
>  
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
> +
>  	/* Minimum size supported by our bloom filter is 1000 entries. */
>  	bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1);
>  	for (i = 0; i < nobjects; i++) {
> -		struct got_packidx_object_id *id;
> -		id = &packidx->hdr.sorted_ids[i];
> -		bloom_add(bf->bloom, id->sha1, sizeof(id->sha1));
> +		void *oid;
> +		oid = packidx->hdr.sorted_ids + i * idlen;
> +		bloom_add(bf->bloom, oid, idlen);
>  	}
>  
>  	RB_INSERT(got_packidx_bloom_filter_tree,
> blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3
> blob + 4a78540b33f385a58dde782dbe6c4b166b67a8d0
> --- lib/repository_admin.c
> +++ lib/repository_admin.c
> @@ -532,6 +532,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
>  	struct got_packidx *packidx = NULL;
>  	struct got_pack *pack = NULL;
>  	uint32_t nobj, i;
> +	size_t idlen;
>  
>  	err = got_object_id_str(&id_str, pack_hash);
>  	if (err)
> @@ -556,9 +557,10 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
>  	if (err)
>  		goto done;
>  
> +	idlen = got_hash_digest_length(GOT_HASH_SHA1);
>  	nobj = be32toh(packidx->hdr.fanout_table[0xff]);
>  	for (i = 0; i < nobj; i++) {
> -		struct got_packidx_object_id *oid;
> +		void *oid;
>  		struct got_object_id id, base_id;
>  		off_t offset, base_offset = 0;
>  		uint8_t type;
> @@ -570,8 +572,9 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
>  			if (err)
>  				break;
>  		}
> -		oid = &packidx->hdr.sorted_ids[i];
> -		memcpy(id.sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> +		oid = packidx->hdr.sorted_ids + i * idlen;
> +		memset(&id, 0, sizeof(id));
> +		memcpy(id.sha1, oid, idlen);
>  
>  		offset = got_packidx_get_object_offset(packidx, i);
>  		if (offset == -1) {