towards sha256 pack files: opaquify sorted

ping :) On 2023/03/08 15:38:40 +0100, Omar Polo <op@omarpolo.com> wrote: > On 2023/03/08 14:44:42 +0100, Stefan Sperling <stsp@stsp.name> wrote: > > On Wed, Mar 01, 2023 at 07:40:24PM +0100, Omar Polo wrote: > > > Hello, > > > > > > This is a first step to make handle sha256 object ids in pack files. > > > It is also the messiest and the one I'm less sure about. > > > > > > The `sorted_ids' field is an "array" of 20-bytes SHA1 digests > > > currently, but in sha256 repos it is made of 40-bytes SHA256 digests. > > > > > > I'm trying to make it "opaque" and providing a macro for lower-level > > > code to access it and retiring the struct got_packidx_object_id that > > > is now completely unused. I'm still not completely sold on the way > > > I've handled it, but it's the best idea I could come up with. > > > > Why is GOT_PACKIDX_GET_OBJECT a macro while the other helpers you've > > added are inline functions? > > I don't see a reason why GOT_PACKIDX_GET_OBJECT could not be an inline > > function as well, or even just a regular function, which would return > > a void * the caller can cast if needed, or even a struct got_object_id *. > > This would add type checking of the arguments and also make things easier > > to debug in gdb. > > mostly to avoid polluting other files. making it static inline would > require to include got_hash.h in a few files that technically don't > neede it. A poor reason actually... > > > Since this macro/function returns an object ID wouldn't a better name be > > GOT_PACKIDX_GET_OBJECT_ID/got_packidx_get_object_id? > > because got_packidx_get_object_id already exists ^^" > > The idea was to replace the packidx->hdr.sorted_ids[N] with a > similarly cheap macro call since we have a few places where we loop or > bisect over the sorted_ids and the cost of the bounds and error check > of got_packidx_get_object_id could be heard. > > Actually, I fallen in a cosmetic trap, here's an updated diff with the > macro expanded and it's not bad... > > (regress is still running but i don't expect failures from this, it's > essentially the same thing as the previous diff.) > > diff refs/heads/main refs/heads/w > commit - cee3836880940ba0d1203e0682a43a680132bc27 > commit + bacd69bfabab060f21f9c0f5405a25c8649584e4 > blob - 1405a67ac335af59f8e1c2f04ac566ecd07436e7 > blob + 5c5eda14f6f68eb22642b1b51f17e933a403026c > --- lib/got_lib_hash.h > +++ lib/got_lib_hash.h > @@ -28,6 +28,32 @@ struct got_hash { > int got_parse_object_id(struct got_object_id *, const char *, > enum got_hash_algorithm); > > +static inline int > +got_hash_digest_length(enum got_hash_algorithm algo) > +{ > + switch (algo) { > + case GOT_HASH_SHA1: > + return SHA1_DIGEST_LENGTH; > + case GOT_HASH_SHA256: > + return SHA256_DIGEST_LENGTH; > + default: > + return 0; > + } > +} > + > +static inline int > +got_hash_digest_string_length(enum got_hash_algorithm algo) > +{ > + switch (algo) { > + case GOT_HASH_SHA1: > + return SHA1_DIGEST_STRING_LENGTH; > + case GOT_HASH_SHA256: > + return SHA256_DIGEST_STRING_LENGTH; > + default: > + return 0; > + } > +} > + > struct got_hash { > SHA1_CTX sha1_ctx; > SHA2_CTX sha256_ctx; > blob - 188c28e9e67bdc37146176e911d6226b223c40d2 > blob + 624957e3cdf40254ef2a42563ef7c7b07d90e7b6 > --- lib/got_lib_pack.h > +++ lib/got_lib_pack.h > @@ -67,10 +67,6 @@ struct got_packidx_object_id { > u_int8_t packidx_sha1[SHA1_DIGEST_LENGTH]; > } __attribute__((__packed__)); > > -struct got_packidx_object_id { > - u_int8_t sha1[SHA1_DIGEST_LENGTH]; > -} __attribute__((__packed__)); > - > /* Ignore pack index version 1 which is no longer written by Git. */ > #define GOT_PACKIDX_VERSION 2 > > @@ -90,8 +86,11 @@ struct got_packidx_v2_hdr { > uint32_t *fanout_table; /* values are big endian */ > #define GOT_PACKIDX_V2_FANOUT_TABLE_ITEMS (0xff + 1) > > - /* Sorted SHA1 checksums for each object in the pack file. */ > - struct got_packidx_object_id *sorted_ids; > + /* > + * Sorted hash digest for each object in the pack file. > + * Exact size depends on the repository object format. > + */ > + void *sorted_ids; > > /* CRC32 of the packed representation of each object. */ > uint32_t *crc32; > blob - af03219f49ee499a7c4d1a39e802bc0773762f06 > blob + 034297c396e60c3125397d5036bfe4e252fcfdf9 > --- lib/pack.c > +++ lib/pack.c > @@ -182,14 +182,13 @@ got_packidx_init_hdr(struct got_packidx *p, int verify > remain -= len_fanout; > > nobj = be32toh(h->fanout_table[0xff]); > - len_ids = nobj * sizeof(*h->sorted_ids); > + len_ids = nobj * got_hash_digest_length(algo); > if (len_ids <= nobj || len_ids > remain) { > err = got_error(GOT_ERR_BAD_PACKIDX); > goto done; > } > if (p->map) > - h->sorted_ids = > - (struct got_packidx_object_id *)((uint8_t*)(p->map + offset)); > + h->sorted_ids = p->map + offset; > else { > h->sorted_ids = malloc(len_ids); > if (h->sorted_ids == NULL) { > @@ -493,17 +492,19 @@ got_packidx_get_object_idx(struct got_packidx *packidx > u_int8_t id0 = id->sha1[0]; > uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > int left = 0, right = totobj - 1; > + size_t idlen; > > if (id0 > 0) > left = be32toh(packidx->hdr.fanout_table[id0 - 1]); > > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > while (left <= right) { > - struct got_packidx_object_id *oid; > + void *oid; > int i, cmp; > > i = ((left + right) / 2); > - oid = &packidx->hdr.sorted_ids[i]; > - cmp = memcmp(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + oid = packidx->hdr.sorted_ids + i * idlen; > + cmp = memcmp(id->sha1, oid, idlen); > if (cmp == 0) > return i; > else if (cmp > 0) > @@ -653,13 +654,15 @@ got_packidx_get_object_id(struct got_object_id *id, > struct got_packidx *packidx, int idx) > { > uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > - struct got_packidx_object_id *oid; > + void *oid; > + size_t idlen; > > if (idx < 0 || idx >= totobj) > return got_error(GOT_ERR_NO_OBJ); > > - oid = &packidx->hdr.sorted_ids[idx]; > - memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + oid = packidx->hdr.sorted_ids + idx * idlen; > + memcpy(id->sha1, oid, idlen); > return NULL; > } > > @@ -671,8 +674,8 @@ got_packidx_match_id_str_prefix(struct got_object_id_q > u_int8_t id0; > uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); > char hex[3]; > - size_t prefix_len = strlen(id_str_prefix); > - struct got_packidx_object_id *oid; > + size_t idlen, prefix_len = strlen(id_str_prefix); > + uint8_t *oid; > uint32_t i = 0; > > STAILQ_INIT(matched_ids); > @@ -688,18 +691,21 @@ got_packidx_match_id_str_prefix(struct got_object_id_q > > if (id0 > 0) > i = be32toh(packidx->hdr.fanout_table[id0 - 1]); > - oid = &packidx->hdr.sorted_ids[i]; > - while (i < totobj && oid->sha1[0] == id0) { > + > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + oid = packidx->hdr.sorted_ids + i * idlen; > + while (i < totobj && oid[0] == id0) { > char id_str[SHA1_DIGEST_STRING_LENGTH]; > struct got_object_qid *qid; > int cmp; > > - if (!got_sha1_digest_to_str(oid->sha1, id_str, sizeof(id_str))) > + if (!got_sha1_digest_to_str(oid, id_str, sizeof(id_str))) > return got_error(GOT_ERR_NO_SPACE); > > cmp = strncmp(id_str, id_str_prefix, prefix_len); > if (cmp < 0) { > - oid = &packidx->hdr.sorted_ids[++i]; > + ++i; > + oid = packidx->hdr.sorted_ids + i * idlen; > continue; > } else if (cmp > 0) > break; > @@ -707,10 +713,11 @@ got_packidx_match_id_str_prefix(struct got_object_id_q > err = got_object_qid_alloc_partial(&qid); > if (err) > break; > - memcpy(qid->id.sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + memcpy(qid->id.sha1, oid, idlen); > STAILQ_INSERT_TAIL(matched_ids, qid, entry); > > - oid = &packidx->hdr.sorted_ids[++i]; > + ++i; > + oid = packidx->hdr.sorted_ids + i * idlen; > } > > if (err) > blob - 25889d5c1fc15e6c52fd8f251cc2c9c692d0cbcf > blob + bbbf6e2645aa162075e265229ade29a55f60d739 > --- lib/pack_index.c > +++ lib/pack_index.c > @@ -474,17 +474,18 @@ find_object_idx(struct got_packidx *packidx, uint8_t * > uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]); > int left = 0, right = nindexed - 1; > int cmp = 0, i = 0; > + void *oid; > + size_t idlen; > > if (id0 > 0) > left = be32toh(packidx->hdr.fanout_table[id0 - 1]); > > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > while (left <= right) { > - struct got_packidx_object_id *oid; > - > i = ((left + right) / 2); > - oid = &packidx->hdr.sorted_ids[i]; > + oid = packidx->hdr.sorted_ids + i * idlen; > > - cmp = memcmp(sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + cmp = memcmp(sha1, oid, idlen); > if (cmp == 0) > return -1; /* object already indexed */ > else if (cmp > 0) > @@ -506,8 +507,11 @@ print_packidx(struct got_packidx *packidx) > fprintf(stderr, "object IDs:\n"); > for (i = 0; i < nindexed; i++) { > char hex[SHA1_DIGEST_STRING_LENGTH]; > - got_sha1_digest_to_str(packidx->hdr.sorted_ids[i].sha1, > - hex, sizeof(hex)); > + void *oid; > + > + oid = packidx->hdr.sorted_ids + > + i * got_hash_digest_length(GOT_HASH_SHA1); > + got_sha1_digest_to_str(oid, hex, sizeof(hex)); > fprintf(stderr, "%s\n", hex); > } > fprintf(stderr, "\n"); > @@ -535,10 +539,13 @@ add_indexed_object(struct got_packidx *packidx, uint32 > add_indexed_object(struct got_packidx *packidx, uint32_t idx, > struct got_indexed_object *obj) > { > + void *oid; > int i; > + size_t idlen; > > - memcpy(packidx->hdr.sorted_ids[idx].sha1, obj->id.sha1, > - SHA1_DIGEST_LENGTH); > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + oid = packidx->hdr.sorted_ids + idx * idlen; > + memcpy(oid, obj->id.sha1, idlen); > packidx->hdr.crc32[idx] = htobe32(obj->crc); > if (obj->off < GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX) > packidx->hdr.offsets[idx] = htobe32(obj->off); > @@ -593,15 +600,17 @@ update_packidx(struct got_packidx *packidx, uint32_t n > struct got_indexed_object *obj) > { > int idx; > + size_t idlen; > uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]); > > idx = find_object_idx(packidx, obj->id.sha1); > if (idx == -1) > return; /* object already indexed */ > > - memmove(&packidx->hdr.sorted_ids[idx + 1], > - &packidx->hdr.sorted_ids[idx], > - sizeof(struct got_packidx_object_id) * (nindexed - idx)); > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + memmove(packidx->hdr.sorted_ids + (idx + 1) * idlen, > + packidx->hdr.sorted_ids + idx * idlen, > + idlen * (nindexed - idx)); > memmove(&packidx->hdr.offsets[idx + 1], &packidx->hdr.offsets[idx], > sizeof(uint32_t) * (nindexed - idx)); > > @@ -703,7 +712,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE > goto done; > } > packidx.hdr.sorted_ids = calloc(nobj, > - sizeof(struct got_packidx_object_id)); > + got_hash_digest_length(GOT_HASH_SHA1)); > if (packidx.hdr.sorted_ids == NULL) { > err = got_error_from_errno("calloc"); > goto done; > @@ -946,7 +955,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE > if (err) > goto done; > err = got_pack_hwrite(idxfd, packidx.hdr.sorted_ids, > - nobj * SHA1_DIGEST_LENGTH, &ctx); > + nobj * got_hash_digest_length(GOT_HASH_SHA1), &ctx); > if (err) > goto done; > err = got_pack_hwrite(idxfd, packidx.hdr.crc32, > blob - 6a757d85bc1ec107e2faa5b5d8a2f03388bef3d7 > blob + ba606b8d4113d9346332b7b8d98847343cf07967 > --- lib/repository.c > +++ lib/repository.c > @@ -1118,7 +1118,7 @@ add_packidx_bloom_filter(struct got_repository *repo, > { > int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]); > struct got_packidx_bloom_filter *bf; > - size_t len; > + size_t len, idlen; > > /* > * Don't use bloom filters for very large pack index files. > @@ -1153,12 +1153,14 @@ add_packidx_bloom_filter(struct got_repository *repo, > } > bf->path_len = len; > > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > + > /* Minimum size supported by our bloom filter is 1000 entries. */ > bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1); > for (i = 0; i < nobjects; i++) { > - struct got_packidx_object_id *id; > - id = &packidx->hdr.sorted_ids[i]; > - bloom_add(bf->bloom, id->sha1, sizeof(id->sha1)); > + void *oid; > + oid = packidx->hdr.sorted_ids + i * idlen; > + bloom_add(bf->bloom, oid, idlen); > } > > RB_INSERT(got_packidx_bloom_filter_tree, > blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3 > blob + 4a78540b33f385a58dde782dbe6c4b166b67a8d0 > --- lib/repository_admin.c > +++ lib/repository_admin.c > @@ -532,6 +532,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i > struct got_packidx *packidx = NULL; > struct got_pack *pack = NULL; > uint32_t nobj, i; > + size_t idlen; > > err = got_object_id_str(&id_str, pack_hash); > if (err) > @@ -556,9 +557,10 @@ got_repo_list_pack(FILE *packfile, struct got_object_i > if (err) > goto done; > > + idlen = got_hash_digest_length(GOT_HASH_SHA1); > nobj = be32toh(packidx->hdr.fanout_table[0xff]); > for (i = 0; i < nobj; i++) { > - struct got_packidx_object_id *oid; > + void *oid; > struct got_object_id id, base_id; > off_t offset, base_offset = 0; > uint8_t type; > @@ -570,8 +572,9 @@ got_repo_list_pack(FILE *packfile, struct got_object_i > if (err) > break; > } > - oid = &packidx->hdr.sorted_ids[i]; > - memcpy(id.sha1, oid->sha1, SHA1_DIGEST_LENGTH); > + oid = packidx->hdr.sorted_ids + i * idlen; > + memset(&id, 0, sizeof(id)); > + memcpy(id.sha1, oid, idlen); > > offset = got_packidx_get_object_offset(packidx, i); > if (offset == -1) {

2023-03-08 13:44 Stefan Sperling:
towards sha256 pack files: opaquify sorted_ids
- 2023-03-08 14:38 Omar Polo:
  towards sha256 pack files: opaquify sorted_ids
- - 2023-03-31 14:14 Omar Polo:
    towards sha256 pack files: opaquify sorted_ids