towards sha256 pack files: opaquify sorted

On 2023/03/01 19:40:24 +0100, Omar Polo <op@omarpolo.com> wrote: > Hello, > > This is a first step to make handle sha256 object ids in pack files. > It is also the messiest and the one I'm less sure about. > > The `sorted_ids' field is an "array" of 20-bytes SHA1 digests > currently, but in sha256 repos it is made of 40-bytes SHA256 digests. > > I'm trying to make it "opaque" and providing a macro for lower-level > code to access it and retiring the struct got_packidx_object_id that > is now completely unused. I'm still not completely sold on the way > I've handled it, but it's the best idea I could come up with. > > I'm also introducing some helpers: got_hash_digest_length and > got_hash_digest_string_length that I'm starting to use but will come > in handy way more in the future. > > As always, all these GOT_HASH_SHA1 will be "bubbled up" soon and stop > being hardcoded. ping :) > diff refs/heads/main refs/heads/w > commit - 3fa763e5f0869e218fa64750c375a84b706c6f92 > commit + 429d1d8c1a59c44d846147442d4e6385d2536d61 > blob - 1405a67ac335af59f8e1c2f04ac566ecd07436e7 > blob + 5c5eda14f6f68eb22642b1b51f17e933a403026c > --- lib/got_lib_hash.h > +++ lib/got_lib_hash.h > @@ -28,6 +28,32 @@ struct got_hash { > int got_parse_object_id(struct got_object_id *, const char *, > enum got_hash_algorithm); > > +static inline int > +got_hash_digest_length(enum got_hash_algorithm algo) > +{ > + switch (algo) { > + case GOT_HASH_SHA1: > + return SHA1_DIGEST_LENGTH; > + case GOT_HASH_SHA256: > + return SHA256_DIGEST_LENGTH; > + default: > + return 0; > + } > +} > + > +static inline int > +got_hash_digest_string_length(enum got_hash_algorithm algo) > +{ > + switch (algo) { > + case GOT_HASH_SHA1: > + return SHA1_DIGEST_STRING_LENGTH; > + case GOT_HASH_SHA256: > + return SHA256_DIGEST_STRING_LENGTH; > + default: > + return 0; > + } > +} > + > struct got_hash { > SHA1_CTX sha1_ctx; > SHA2_CTX sha256_ctx; > blob - 188c28e9e67bdc37146176e911d6226b223c40d2 > blob + e1279d24139787a085f780a37632bf4b34e151be > --- lib/got_lib_pack.h > +++ lib/got_lib_pack.h > @@ -67,10 +67,6 @@ struct got_packidx_object_id { > u_int8_t packidx_sha1[SHA1_DIGEST_LENGTH]; > } __attribute__((__packed__)); > > -struct got_packidx_object_id { > - u_int8_t sha1[SHA1_DIGEST_LENGTH]; > -} __attribute__((__packed__)); > - > /* Ignore pack index version 1 which is no longer written by Git. */ > #define GOT_PACKIDX_VERSION 2 > > @@ -90,8 +86,11 @@ struct got_packidx_v2_hdr { > uint32_t *fanout_table; /* values are big endian */ > #define GOT_PACKIDX_V2_FANOUT_TABLE_ITEMS (0xff + 1) > > - /* Sorted SHA1 checksums for each object in the pack file. */ > - struct got_packidx_object_id *sorted_ids; > + /* > + * Sorted hash digest for each object in the pack file. > + * Exact size depends on the repository object format. > + */ > + void *sorted_ids; > > /* CRC32 of the packed representation of each object. */ > uint32_t *crc32; > @@ -129,6 +128,9 @@ struct got_packfile_hdr { > struct got_pack_large_offset_index *sorted_large_offsets; > }; > > +#define GOT_PACKIDX_GET_OBJECT(packidx, n) \ > + ((packidx)->hdr.sorted_ids + (n) * got_hash_digest_length(GOT_HASH_SHA1)) > + > struct got_packfile_hdr { > uint32_t signature; > #define GOT_PACKFILE_SIGNATURE 0x5041434b /* 'P' 'A' 'C' 'K' */ > blob - af03219f49ee499a7c4d1a39e802bc0773762f06 > blob + 65be0482c78ef9765651a657fb61215530a56b4e > --- lib/pack.c > +++ lib/pack.c > @@ -182,14 +182,13 @@ got_packidx_init_hdr(struct got_packidx *p, int verify > remain -= len_fanout; > > nobj = be32toh(h->fanout_table[0xff]); > - len_ids = nobj * sizeof(*h->sorted_ids); > + len_ids = nobj * got_hash_digest_length(algo); > if (len_ids <= nobj || len_ids > remain) { > err = got_error(GOT_ERR_BAD_PACKIDX); > goto done; > } > if (p->map) > - h->sorted_ids = > - (struct got_packidx_object_id *)((uint8_t*)(p->map + offset)); > + h->sorted_ids = p->map + offset; > else { > h->sorted_ids = malloc(len_ids); > if (h->sorted_ids == NULL) { > @@ -498,12 +497,13 @@ got_packidx_get_object_idx(struct got_packidx *packidx > left = be32toh(packidx->hdr.fanout_table[id0 - 1]); > > while (left <= right) { > - struct got_packidx_object_id *oid; > + void *oid; > int i, cmp; > > i = ((left + right) / 2); > - oid = &packidx->hdr.sorted_ids[i]; > - cmp = memcmp(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + oid = GOT_PACKIDX_GET_OBJECT(packidx, i); > + cmp = memcmp(id->sha1, oid, > + got_hash_digest_length(GOT_HASH_SHA1)); > if (cmp == 0) > return i; > else if (cmp > 0) > @@ -653,13 +653,15 @@ got_packidx_get_object_id(struct got_object_id *id, > struct got_packidx *packidx, int idx) > { > uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > - struct got_packidx_object_id *oid; > + void *oid; > + size_t idlen; > > if (idx < 0 || idx >= totobj) > return got_error(GOT_ERR_NO_OBJ); > > - oid = &packidx->hdr.sorted_ids[idx]; > - memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + oid = GOT_PACKIDX_GET_OBJECT(packidx, idx); > + memcpy(id->sha1, oid, idlen); > return NULL; > } > > @@ -672,7 +674,7 @@ got_packidx_match_id_str_prefix(struct got_object_id_q > uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > char hex[3]; > size_t prefix_len = strlen(id_str_prefix); > - struct got_packidx_object_id *oid; > + uint8_t *oid; > uint32_t i = 0; > > STAILQ_INIT(matched_ids); > @@ -688,18 +690,18 @@ got_packidx_match_id_str_prefix(struct got_object_id_q > > if (id0 > 0) > i = be32toh(packidx->hdr.fanout_table[id0 - 1]); > - oid = &packidx->hdr.sorted_ids[i]; > - while (i < totobj && oid->sha1[0] == id0) { > + oid = GOT_PACKIDX_GET_OBJECT(packidx, i); > + while (i < totobj && oid[0] == id0) { > char id_str[SHA1_DIGEST_STRING_LENGTH]; > struct got_object_qid *qid; > int cmp; > > - if (!got_sha1_digest_to_str(oid->sha1, id_str, sizeof(id_str))) > + if (!got_sha1_digest_to_str(oid, id_str, sizeof(id_str))) > return got_error(GOT_ERR_NO_SPACE); > > cmp = strncmp(id_str, id_str_prefix, prefix_len); > if (cmp < 0) { > - oid = &packidx->hdr.sorted_ids[++i]; > + oid = GOT_PACKIDX_GET_OBJECT(packidx, ++i); > continue; > } else if (cmp > 0) > break; > @@ -707,10 +709,11 @@ got_packidx_match_id_str_prefix(struct got_object_id_q > err = got_object_qid_alloc_partial(&qid); > if (err) > break; > - memcpy(qid->id.sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + memcpy(qid->id.sha1, oid, > + got_hash_digest_length(GOT_HASH_SHA1)); > STAILQ_INSERT_TAIL(matched_ids, qid, entry); > > - oid = &packidx->hdr.sorted_ids[++i]; > + oid = GOT_PACKIDX_GET_OBJECT(packidx, ++i); > } > > if (err) > blob - 25889d5c1fc15e6c52fd8f251cc2c9c692d0cbcf > blob + a6918ffc21e8236686568168477e68ee4b901965 > --- lib/pack_index.c > +++ lib/pack_index.c > @@ -474,17 +474,16 @@ find_object_idx(struct got_packidx *packidx, uint8_t * > uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]); > int left = 0, right = nindexed - 1; > int cmp = 0, i = 0; > + void *oid; > > if (id0 > 0) > left = be32toh(packidx->hdr.fanout_table[id0 - 1]); > > while (left <= right) { > - struct got_packidx_object_id *oid; > - > i = ((left + right) / 2); > - oid = &packidx->hdr.sorted_ids[i]; > + oid = GOT_PACKIDX_GET_OBJECT(packidx, i); > > - cmp = memcmp(sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + cmp = memcmp(sha1, oid, got_hash_digest_length(GOT_HASH_SHA1)); > if (cmp == 0) > return -1; /* object already indexed */ > else if (cmp > 0) > @@ -506,8 +505,10 @@ print_packidx(struct got_packidx *packidx) > fprintf(stderr, "object IDs:\n"); > for (i = 0; i < nindexed; i++) { > char hex[SHA1_DIGEST_STRING_LENGTH]; > - got_sha1_digest_to_str(packidx->hdr.sorted_ids[i].sha1, > - hex, sizeof(hex)); > + void *oid; > + > + oid = GOT_PACKIDX_GET_OBJECT(packidx, i); > + got_sha1_digest_to_str(oid, hex, sizeof(hex)); > fprintf(stderr, "%s\n", hex); > } > fprintf(stderr, "\n"); > @@ -535,10 +536,11 @@ add_indexed_object(struct got_packidx *packidx, uint32 > add_indexed_object(struct got_packidx *packidx, uint32_t idx, > struct got_indexed_object *obj) > { > + void *oid; > int i; > > - memcpy(packidx->hdr.sorted_ids[idx].sha1, obj->id.sha1, > - SHA1_DIGEST_LENGTH); > + oid = GOT_PACKIDX_GET_OBJECT(packidx, idx); > + memcpy(oid, obj->id.sha1, got_hash_digest_length(GOT_HASH_SHA1)); > packidx->hdr.crc32[idx] = htobe32(obj->crc); > if (obj->off < GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX) > packidx->hdr.offsets[idx] = htobe32(obj->off); > @@ -593,15 +595,17 @@ update_packidx(struct got_packidx *packidx, uint32_t n > struct got_indexed_object *obj) > { > int idx; > + size_t idlen; > uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]); > > idx = find_object_idx(packidx, obj->id.sha1); > if (idx == -1) > return; /* object already indexed */ > > - memmove(&packidx->hdr.sorted_ids[idx + 1], > - &packidx->hdr.sorted_ids[idx], > - sizeof(struct got_packidx_object_id) * (nindexed - idx)); > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + memmove(packidx->hdr.sorted_ids + (idx + 1) * idlen, > + packidx->hdr.sorted_ids + idx * idlen, > + idlen * (nindexed - idx)); > memmove(&packidx->hdr.offsets[idx + 1], &packidx->hdr.offsets[idx], > sizeof(uint32_t) * (nindexed - idx)); > > @@ -703,7 +707,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE > goto done; > } > packidx.hdr.sorted_ids = calloc(nobj, > - sizeof(struct got_packidx_object_id)); > + got_hash_digest_length(GOT_HASH_SHA1)); > if (packidx.hdr.sorted_ids == NULL) { > err = got_error_from_errno("calloc"); > goto done; > @@ -946,7 +950,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE > if (err) > goto done; > err = got_pack_hwrite(idxfd, packidx.hdr.sorted_ids, > - nobj * SHA1_DIGEST_LENGTH, &ctx); > + nobj * got_hash_digest_length(GOT_HASH_SHA1), &ctx); > if (err) > goto done; > err = got_pack_hwrite(idxfd, packidx.hdr.crc32, > blob - 6a757d85bc1ec107e2faa5b5d8a2f03388bef3d7 > blob + 49925f8b0ef27e9fbec7746b9864f16867c3a53c > --- lib/repository.c > +++ lib/repository.c > @@ -1156,9 +1156,10 @@ add_packidx_bloom_filter(struct got_repository *repo, > /* Minimum size supported by our bloom filter is 1000 entries. */ > bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1); > for (i = 0; i < nobjects; i++) { > - struct got_packidx_object_id *id; > - id = &packidx->hdr.sorted_ids[i]; > - bloom_add(bf->bloom, id->sha1, sizeof(id->sha1)); > + void *oid; > + oid = GOT_PACKIDX_GET_OBJECT(packidx, i); > + bloom_add(bf->bloom, oid, > + got_hash_digest_length(GOT_HASH_SHA1)); > } > > RB_INSERT(got_packidx_bloom_filter_tree, > blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3 > blob + bb16d00bb2cbcef62005ced69f00144a9f994a96 > --- lib/repository_admin.c > +++ lib/repository_admin.c > @@ -558,7 +558,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i > > nobj = be32toh(packidx->hdr.fanout_table[0xff]); > for (i = 0; i < nobj; i++) { > - struct got_packidx_object_id *oid; > + void *oid; > struct got_object_id id, base_id; > off_t offset, base_offset = 0; > uint8_t type; > @@ -570,8 +570,9 @@ got_repo_list_pack(FILE *packfile, struct got_object_i > if (err) > break; > } > - oid = &packidx->hdr.sorted_ids[i]; > - memcpy(id.sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + oid = GOT_PACKIDX_GET_OBJECT(packidx, i); > + memset(&id, 0, sizeof(id)); > + memcpy(id.sha1, oid, got_hash_digest_length(GOT_HASH_SHA1)); > > offset = got_packidx_get_object_offset(packidx, i); > if (offset == -1) {

2023-03-01 18:40 Omar Polo:
towards sha256 pack files: opaquify sorted_ids
- 2023-03-07 13:36 Omar Polo:
  towards sha256 pack files: opaquify sorted_ids
- 2023-03-08 13:44 Stefan Sperling:
  towards sha256 pack files: opaquify sorted_ids
- - 2023-03-08 14:38 Omar Polo:
    towards sha256 pack files: opaquify sorted_ids