Download raw body.
towards sha256 pack files: opaquify sorted_ids
On 2023/03/01 19:40:24 +0100, Omar Polo <op@omarpolo.com> wrote:
> Hello,
>
> This is a first step to make handle sha256 object ids in pack files.
> It is also the messiest and the one I'm less sure about.
>
> The `sorted_ids' field is an "array" of 20-bytes SHA1 digests
> currently, but in sha256 repos it is made of 40-bytes SHA256 digests.
>
> I'm trying to make it "opaque" and providing a macro for lower-level
> code to access it and retiring the struct got_packidx_object_id that
> is now completely unused. I'm still not completely sold on the way
> I've handled it, but it's the best idea I could come up with.
>
> I'm also introducing some helpers: got_hash_digest_length and
> got_hash_digest_string_length that I'm starting to use but will come
> in handy way more in the future.
>
> As always, all these GOT_HASH_SHA1 will be "bubbled up" soon and stop
> being hardcoded.
ping :)
> diff refs/heads/main refs/heads/w
> commit - 3fa763e5f0869e218fa64750c375a84b706c6f92
> commit + 429d1d8c1a59c44d846147442d4e6385d2536d61
> blob - 1405a67ac335af59f8e1c2f04ac566ecd07436e7
> blob + 5c5eda14f6f68eb22642b1b51f17e933a403026c
> --- lib/got_lib_hash.h
> +++ lib/got_lib_hash.h
> @@ -28,6 +28,32 @@ struct got_hash {
> int got_parse_object_id(struct got_object_id *, const char *,
> enum got_hash_algorithm);
>
> +static inline int
> +got_hash_digest_length(enum got_hash_algorithm algo)
> +{
> + switch (algo) {
> + case GOT_HASH_SHA1:
> + return SHA1_DIGEST_LENGTH;
> + case GOT_HASH_SHA256:
> + return SHA256_DIGEST_LENGTH;
> + default:
> + return 0;
> + }
> +}
> +
> +static inline int
> +got_hash_digest_string_length(enum got_hash_algorithm algo)
> +{
> + switch (algo) {
> + case GOT_HASH_SHA1:
> + return SHA1_DIGEST_STRING_LENGTH;
> + case GOT_HASH_SHA256:
> + return SHA256_DIGEST_STRING_LENGTH;
> + default:
> + return 0;
> + }
> +}
> +
> struct got_hash {
> SHA1_CTX sha1_ctx;
> SHA2_CTX sha256_ctx;
> blob - 188c28e9e67bdc37146176e911d6226b223c40d2
> blob + e1279d24139787a085f780a37632bf4b34e151be
> --- lib/got_lib_pack.h
> +++ lib/got_lib_pack.h
> @@ -67,10 +67,6 @@ struct got_packidx_object_id {
> u_int8_t packidx_sha1[SHA1_DIGEST_LENGTH];
> } __attribute__((__packed__));
>
> -struct got_packidx_object_id {
> - u_int8_t sha1[SHA1_DIGEST_LENGTH];
> -} __attribute__((__packed__));
> -
> /* Ignore pack index version 1 which is no longer written by Git. */
> #define GOT_PACKIDX_VERSION 2
>
> @@ -90,8 +86,11 @@ struct got_packidx_v2_hdr {
> uint32_t *fanout_table; /* values are big endian */
> #define GOT_PACKIDX_V2_FANOUT_TABLE_ITEMS (0xff + 1)
>
> - /* Sorted SHA1 checksums for each object in the pack file. */
> - struct got_packidx_object_id *sorted_ids;
> + /*
> + * Sorted hash digest for each object in the pack file.
> + * Exact size depends on the repository object format.
> + */
> + void *sorted_ids;
>
> /* CRC32 of the packed representation of each object. */
> uint32_t *crc32;
> @@ -129,6 +128,9 @@ struct got_packfile_hdr {
> struct got_pack_large_offset_index *sorted_large_offsets;
> };
>
> +#define GOT_PACKIDX_GET_OBJECT(packidx, n) \
> + ((packidx)->hdr.sorted_ids + (n) * got_hash_digest_length(GOT_HASH_SHA1))
> +
> struct got_packfile_hdr {
> uint32_t signature;
> #define GOT_PACKFILE_SIGNATURE 0x5041434b /* 'P' 'A' 'C' 'K' */
> blob - af03219f49ee499a7c4d1a39e802bc0773762f06
> blob + 65be0482c78ef9765651a657fb61215530a56b4e
> --- lib/pack.c
> +++ lib/pack.c
> @@ -182,14 +182,13 @@ got_packidx_init_hdr(struct got_packidx *p, int verify
> remain -= len_fanout;
>
> nobj = be32toh(h->fanout_table[0xff]);
> - len_ids = nobj * sizeof(*h->sorted_ids);
> + len_ids = nobj * got_hash_digest_length(algo);
> if (len_ids <= nobj || len_ids > remain) {
> err = got_error(GOT_ERR_BAD_PACKIDX);
> goto done;
> }
> if (p->map)
> - h->sorted_ids =
> - (struct got_packidx_object_id *)((uint8_t*)(p->map + offset));
> + h->sorted_ids = p->map + offset;
> else {
> h->sorted_ids = malloc(len_ids);
> if (h->sorted_ids == NULL) {
> @@ -498,12 +497,13 @@ got_packidx_get_object_idx(struct got_packidx *packidx
> left = be32toh(packidx->hdr.fanout_table[id0 - 1]);
>
> while (left <= right) {
> - struct got_packidx_object_id *oid;
> + void *oid;
> int i, cmp;
>
> i = ((left + right) / 2);
> - oid = &packidx->hdr.sorted_ids[i];
> - cmp = memcmp(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, i);
> + cmp = memcmp(id->sha1, oid,
> + got_hash_digest_length(GOT_HASH_SHA1));
> if (cmp == 0)
> return i;
> else if (cmp > 0)
> @@ -653,13 +653,15 @@ got_packidx_get_object_id(struct got_object_id *id,
> struct got_packidx *packidx, int idx)
> {
> uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
> - struct got_packidx_object_id *oid;
> + void *oid;
> + size_t idlen;
>
> if (idx < 0 || idx >= totobj)
> return got_error(GOT_ERR_NO_OBJ);
>
> - oid = &packidx->hdr.sorted_ids[idx];
> - memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> + idlen = got_hash_digest_length(GOT_HASH_SHA1);
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, idx);
> + memcpy(id->sha1, oid, idlen);
> return NULL;
> }
>
> @@ -672,7 +674,7 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
> uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]);
> char hex[3];
> size_t prefix_len = strlen(id_str_prefix);
> - struct got_packidx_object_id *oid;
> + uint8_t *oid;
> uint32_t i = 0;
>
> STAILQ_INIT(matched_ids);
> @@ -688,18 +690,18 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
>
> if (id0 > 0)
> i = be32toh(packidx->hdr.fanout_table[id0 - 1]);
> - oid = &packidx->hdr.sorted_ids[i];
> - while (i < totobj && oid->sha1[0] == id0) {
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, i);
> + while (i < totobj && oid[0] == id0) {
> char id_str[SHA1_DIGEST_STRING_LENGTH];
> struct got_object_qid *qid;
> int cmp;
>
> - if (!got_sha1_digest_to_str(oid->sha1, id_str, sizeof(id_str)))
> + if (!got_sha1_digest_to_str(oid, id_str, sizeof(id_str)))
> return got_error(GOT_ERR_NO_SPACE);
>
> cmp = strncmp(id_str, id_str_prefix, prefix_len);
> if (cmp < 0) {
> - oid = &packidx->hdr.sorted_ids[++i];
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, ++i);
> continue;
> } else if (cmp > 0)
> break;
> @@ -707,10 +709,11 @@ got_packidx_match_id_str_prefix(struct got_object_id_q
> err = got_object_qid_alloc_partial(&qid);
> if (err)
> break;
> - memcpy(qid->id.sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> + memcpy(qid->id.sha1, oid,
> + got_hash_digest_length(GOT_HASH_SHA1));
> STAILQ_INSERT_TAIL(matched_ids, qid, entry);
>
> - oid = &packidx->hdr.sorted_ids[++i];
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, ++i);
> }
>
> if (err)
> blob - 25889d5c1fc15e6c52fd8f251cc2c9c692d0cbcf
> blob + a6918ffc21e8236686568168477e68ee4b901965
> --- lib/pack_index.c
> +++ lib/pack_index.c
> @@ -474,17 +474,16 @@ find_object_idx(struct got_packidx *packidx, uint8_t *
> uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
> int left = 0, right = nindexed - 1;
> int cmp = 0, i = 0;
> + void *oid;
>
> if (id0 > 0)
> left = be32toh(packidx->hdr.fanout_table[id0 - 1]);
>
> while (left <= right) {
> - struct got_packidx_object_id *oid;
> -
> i = ((left + right) / 2);
> - oid = &packidx->hdr.sorted_ids[i];
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, i);
>
> - cmp = memcmp(sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> + cmp = memcmp(sha1, oid, got_hash_digest_length(GOT_HASH_SHA1));
> if (cmp == 0)
> return -1; /* object already indexed */
> else if (cmp > 0)
> @@ -506,8 +505,10 @@ print_packidx(struct got_packidx *packidx)
> fprintf(stderr, "object IDs:\n");
> for (i = 0; i < nindexed; i++) {
> char hex[SHA1_DIGEST_STRING_LENGTH];
> - got_sha1_digest_to_str(packidx->hdr.sorted_ids[i].sha1,
> - hex, sizeof(hex));
> + void *oid;
> +
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, i);
> + got_sha1_digest_to_str(oid, hex, sizeof(hex));
> fprintf(stderr, "%s\n", hex);
> }
> fprintf(stderr, "\n");
> @@ -535,10 +536,11 @@ add_indexed_object(struct got_packidx *packidx, uint32
> add_indexed_object(struct got_packidx *packidx, uint32_t idx,
> struct got_indexed_object *obj)
> {
> + void *oid;
> int i;
>
> - memcpy(packidx->hdr.sorted_ids[idx].sha1, obj->id.sha1,
> - SHA1_DIGEST_LENGTH);
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, idx);
> + memcpy(oid, obj->id.sha1, got_hash_digest_length(GOT_HASH_SHA1));
> packidx->hdr.crc32[idx] = htobe32(obj->crc);
> if (obj->off < GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX)
> packidx->hdr.offsets[idx] = htobe32(obj->off);
> @@ -593,15 +595,17 @@ update_packidx(struct got_packidx *packidx, uint32_t n
> struct got_indexed_object *obj)
> {
> int idx;
> + size_t idlen;
> uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]);
>
> idx = find_object_idx(packidx, obj->id.sha1);
> if (idx == -1)
> return; /* object already indexed */
>
> - memmove(&packidx->hdr.sorted_ids[idx + 1],
> - &packidx->hdr.sorted_ids[idx],
> - sizeof(struct got_packidx_object_id) * (nindexed - idx));
> + idlen = got_hash_digest_length(GOT_HASH_SHA1);
> + memmove(packidx->hdr.sorted_ids + (idx + 1) * idlen,
> + packidx->hdr.sorted_ids + idx * idlen,
> + idlen * (nindexed - idx));
> memmove(&packidx->hdr.offsets[idx + 1], &packidx->hdr.offsets[idx],
> sizeof(uint32_t) * (nindexed - idx));
>
> @@ -703,7 +707,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE
> goto done;
> }
> packidx.hdr.sorted_ids = calloc(nobj,
> - sizeof(struct got_packidx_object_id));
> + got_hash_digest_length(GOT_HASH_SHA1));
> if (packidx.hdr.sorted_ids == NULL) {
> err = got_error_from_errno("calloc");
> goto done;
> @@ -946,7 +950,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE
> if (err)
> goto done;
> err = got_pack_hwrite(idxfd, packidx.hdr.sorted_ids,
> - nobj * SHA1_DIGEST_LENGTH, &ctx);
> + nobj * got_hash_digest_length(GOT_HASH_SHA1), &ctx);
> if (err)
> goto done;
> err = got_pack_hwrite(idxfd, packidx.hdr.crc32,
> blob - 6a757d85bc1ec107e2faa5b5d8a2f03388bef3d7
> blob + 49925f8b0ef27e9fbec7746b9864f16867c3a53c
> --- lib/repository.c
> +++ lib/repository.c
> @@ -1156,9 +1156,10 @@ add_packidx_bloom_filter(struct got_repository *repo,
> /* Minimum size supported by our bloom filter is 1000 entries. */
> bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1);
> for (i = 0; i < nobjects; i++) {
> - struct got_packidx_object_id *id;
> - id = &packidx->hdr.sorted_ids[i];
> - bloom_add(bf->bloom, id->sha1, sizeof(id->sha1));
> + void *oid;
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, i);
> + bloom_add(bf->bloom, oid,
> + got_hash_digest_length(GOT_HASH_SHA1));
> }
>
> RB_INSERT(got_packidx_bloom_filter_tree,
> blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3
> blob + bb16d00bb2cbcef62005ced69f00144a9f994a96
> --- lib/repository_admin.c
> +++ lib/repository_admin.c
> @@ -558,7 +558,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
>
> nobj = be32toh(packidx->hdr.fanout_table[0xff]);
> for (i = 0; i < nobj; i++) {
> - struct got_packidx_object_id *oid;
> + void *oid;
> struct got_object_id id, base_id;
> off_t offset, base_offset = 0;
> uint8_t type;
> @@ -570,8 +570,9 @@ got_repo_list_pack(FILE *packfile, struct got_object_i
> if (err)
> break;
> }
> - oid = &packidx->hdr.sorted_ids[i];
> - memcpy(id.sha1, oid->sha1, SHA1_DIGEST_LENGTH);
> + oid = GOT_PACKIDX_GET_OBJECT(packidx, i);
> + memset(&id, 0, sizeof(id));
> + memcpy(id.sha1, oid, got_hash_digest_length(GOT_HASH_SHA1));
>
> offset = got_packidx_get_object_offset(packidx, i);
> if (offset == -1) {
towards sha256 pack files: opaquify sorted_ids