towards sha256 pack files: opaquify sorted

On 2023/03/08 14:44:42 +0100, Stefan Sperling <stsp@stsp.name> wrote: > On Wed, Mar 01, 2023 at 07:40:24PM +0100, Omar Polo wrote: > > Hello, > > > > This is a first step to make handle sha256 object ids in pack files. > > It is also the messiest and the one I'm less sure about. > > > > The `sorted_ids' field is an "array" of 20-bytes SHA1 digests > > currently, but in sha256 repos it is made of 40-bytes SHA256 digests. > > > > I'm trying to make it "opaque" and providing a macro for lower-level > > code to access it and retiring the struct got_packidx_object_id that > > is now completely unused. I'm still not completely sold on the way > > I've handled it, but it's the best idea I could come up with. > > Why is GOT_PACKIDX_GET_OBJECT a macro while the other helpers you've > added are inline functions? > I don't see a reason why GOT_PACKIDX_GET_OBJECT could not be an inline > function as well, or even just a regular function, which would return > a void * the caller can cast if needed, or even a struct got_object_id *. > This would add type checking of the arguments and also make things easier > to debug in gdb. mostly to avoid polluting other files. making it static inline would require to include got_hash.h in a few files that technically don't neede it. A poor reason actually... > Since this macro/function returns an object ID wouldn't a better name be > GOT_PACKIDX_GET_OBJECT_ID/got_packidx_get_object_id? because got_packidx_get_object_id already exists ^^" The idea was to replace the packidx->hdr.sorted_ids[N] with a similarly cheap macro call since we have a few places where we loop or bisect over the sorted_ids and the cost of the bounds and error check of got_packidx_get_object_id could be heard. Actually, I fallen in a cosmetic trap, here's an updated diff with the macro expanded and it's not bad... (regress is still running but i don't expect failures from this, it's essentially the same thing as the previous diff.) diff refs/heads/main refs/heads/w commit - cee3836880940ba0d1203e0682a43a680132bc27 commit + bacd69bfabab060f21f9c0f5405a25c8649584e4 blob - 1405a67ac335af59f8e1c2f04ac566ecd07436e7 blob + 5c5eda14f6f68eb22642b1b51f17e933a403026c --- lib/got_lib_hash.h +++ lib/got_lib_hash.h @@ -28,6 +28,32 @@ struct got_hash { int got_parse_object_id(struct got_object_id *, const char *, enum got_hash_algorithm); +static inline int +got_hash_digest_length(enum got_hash_algorithm algo) +{ + switch (algo) { + case GOT_HASH_SHA1: + return SHA1_DIGEST_LENGTH; + case GOT_HASH_SHA256: + return SHA256_DIGEST_LENGTH; + default: + return 0; + } +} + +static inline int +got_hash_digest_string_length(enum got_hash_algorithm algo) +{ + switch (algo) { + case GOT_HASH_SHA1: + return SHA1_DIGEST_STRING_LENGTH; + case GOT_HASH_SHA256: + return SHA256_DIGEST_STRING_LENGTH; + default: + return 0; + } +} + struct got_hash { SHA1_CTX sha1_ctx; SHA2_CTX sha256_ctx; blob - 188c28e9e67bdc37146176e911d6226b223c40d2 blob + 624957e3cdf40254ef2a42563ef7c7b07d90e7b6 --- lib/got_lib_pack.h +++ lib/got_lib_pack.h @@ -67,10 +67,6 @@ struct got_packidx_object_id { u_int8_t packidx_sha1[SHA1_DIGEST_LENGTH]; } __attribute__((__packed__)); -struct got_packidx_object_id { - u_int8_t sha1[SHA1_DIGEST_LENGTH]; -} __attribute__((__packed__)); - /* Ignore pack index version 1 which is no longer written by Git. */ #define GOT_PACKIDX_VERSION 2 @@ -90,8 +86,11 @@ struct got_packidx_v2_hdr { uint32_t *fanout_table; /* values are big endian */ #define GOT_PACKIDX_V2_FANOUT_TABLE_ITEMS (0xff + 1) - /* Sorted SHA1 checksums for each object in the pack file. */ - struct got_packidx_object_id *sorted_ids; + /* + * Sorted hash digest for each object in the pack file. + * Exact size depends on the repository object format. + */ + void *sorted_ids; /* CRC32 of the packed representation of each object. */ uint32_t *crc32; blob - af03219f49ee499a7c4d1a39e802bc0773762f06 blob + 034297c396e60c3125397d5036bfe4e252fcfdf9 --- lib/pack.c +++ lib/pack.c @@ -182,14 +182,13 @@ got_packidx_init_hdr(struct got_packidx *p, int verify remain -= len_fanout; nobj = be32toh(h->fanout_table[0xff]); - len_ids = nobj * sizeof(*h->sorted_ids); + len_ids = nobj * got_hash_digest_length(algo); if (len_ids <= nobj || len_ids > remain) { err = got_error(GOT_ERR_BAD_PACKIDX); goto done; } if (p->map) - h->sorted_ids = - (struct got_packidx_object_id *)((uint8_t*)(p->map + offset)); + h->sorted_ids = p->map + offset; else { h->sorted_ids = malloc(len_ids); if (h->sorted_ids == NULL) { @@ -493,17 +492,19 @@ got_packidx_get_object_idx(struct got_packidx *packidx u_int8_t id0 = id->sha1[0]; uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); int left = 0, right = totobj - 1; + size_t idlen; if (id0 > 0) left = be32toh(packidx->hdr.fanout_table[id0 - 1]); + idlen = got_hash_digest_length(GOT_HASH_SHA1); while (left <= right) { - struct got_packidx_object_id *oid; + void *oid; int i, cmp; i = ((left + right) / 2); - oid = &packidx->hdr.sorted_ids[i]; - cmp = memcmp(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); + oid = packidx->hdr.sorted_ids + i * idlen; + cmp = memcmp(id->sha1, oid, idlen); if (cmp == 0) return i; else if (cmp > 0) @@ -653,13 +654,15 @@ got_packidx_get_object_id(struct got_object_id *id, struct got_packidx *packidx, int idx) { uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); - struct got_packidx_object_id *oid; + void *oid; + size_t idlen; if (idx < 0 || idx >= totobj) return got_error(GOT_ERR_NO_OBJ); - oid = &packidx->hdr.sorted_ids[idx]; - memcpy(id->sha1, oid->sha1, SHA1_DIGEST_LENGTH); + idlen = got_hash_digest_length(GOT_HASH_SHA1); + oid = packidx->hdr.sorted_ids + idx * idlen; + memcpy(id->sha1, oid, idlen); return NULL; } @@ -671,8 +674,8 @@ got_packidx_match_id_str_prefix(struct got_object_id_q u_int8_t id0; uint32_t totobj = be32toh(packidx->hdr.fanout_table[0xff]); char hex[3]; - size_t prefix_len = strlen(id_str_prefix); - struct got_packidx_object_id *oid; + size_t idlen, prefix_len = strlen(id_str_prefix); + uint8_t *oid; uint32_t i = 0; STAILQ_INIT(matched_ids); @@ -688,18 +691,21 @@ got_packidx_match_id_str_prefix(struct got_object_id_q if (id0 > 0) i = be32toh(packidx->hdr.fanout_table[id0 - 1]); - oid = &packidx->hdr.sorted_ids[i]; - while (i < totobj && oid->sha1[0] == id0) { + + idlen = got_hash_digest_length(GOT_HASH_SHA1); + oid = packidx->hdr.sorted_ids + i * idlen; + while (i < totobj && oid[0] == id0) { char id_str[SHA1_DIGEST_STRING_LENGTH]; struct got_object_qid *qid; int cmp; - if (!got_sha1_digest_to_str(oid->sha1, id_str, sizeof(id_str))) + if (!got_sha1_digest_to_str(oid, id_str, sizeof(id_str))) return got_error(GOT_ERR_NO_SPACE); cmp = strncmp(id_str, id_str_prefix, prefix_len); if (cmp < 0) { - oid = &packidx->hdr.sorted_ids[++i]; + ++i; + oid = packidx->hdr.sorted_ids + i * idlen; continue; } else if (cmp > 0) break; @@ -707,10 +713,11 @@ got_packidx_match_id_str_prefix(struct got_object_id_q err = got_object_qid_alloc_partial(&qid); if (err) break; - memcpy(qid->id.sha1, oid->sha1, SHA1_DIGEST_LENGTH); + memcpy(qid->id.sha1, oid, idlen); STAILQ_INSERT_TAIL(matched_ids, qid, entry); - oid = &packidx->hdr.sorted_ids[++i]; + ++i; + oid = packidx->hdr.sorted_ids + i * idlen; } if (err) blob - 25889d5c1fc15e6c52fd8f251cc2c9c692d0cbcf blob + bbbf6e2645aa162075e265229ade29a55f60d739 --- lib/pack_index.c +++ lib/pack_index.c @@ -474,17 +474,18 @@ find_object_idx(struct got_packidx *packidx, uint8_t * uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]); int left = 0, right = nindexed - 1; int cmp = 0, i = 0; + void *oid; + size_t idlen; if (id0 > 0) left = be32toh(packidx->hdr.fanout_table[id0 - 1]); + idlen = got_hash_digest_length(GOT_HASH_SHA1); while (left <= right) { - struct got_packidx_object_id *oid; - i = ((left + right) / 2); - oid = &packidx->hdr.sorted_ids[i]; + oid = packidx->hdr.sorted_ids + i * idlen; - cmp = memcmp(sha1, oid->sha1, SHA1_DIGEST_LENGTH); + cmp = memcmp(sha1, oid, idlen); if (cmp == 0) return -1; /* object already indexed */ else if (cmp > 0) @@ -506,8 +507,11 @@ print_packidx(struct got_packidx *packidx) fprintf(stderr, "object IDs:\n"); for (i = 0; i < nindexed; i++) { char hex[SHA1_DIGEST_STRING_LENGTH]; - got_sha1_digest_to_str(packidx->hdr.sorted_ids[i].sha1, - hex, sizeof(hex)); + void *oid; + + oid = packidx->hdr.sorted_ids + + i * got_hash_digest_length(GOT_HASH_SHA1); + got_sha1_digest_to_str(oid, hex, sizeof(hex)); fprintf(stderr, "%s\n", hex); } fprintf(stderr, "\n"); @@ -535,10 +539,13 @@ add_indexed_object(struct got_packidx *packidx, uint32 add_indexed_object(struct got_packidx *packidx, uint32_t idx, struct got_indexed_object *obj) { + void *oid; int i; + size_t idlen; - memcpy(packidx->hdr.sorted_ids[idx].sha1, obj->id.sha1, - SHA1_DIGEST_LENGTH); + idlen = got_hash_digest_length(GOT_HASH_SHA1); + oid = packidx->hdr.sorted_ids + idx * idlen; + memcpy(oid, obj->id.sha1, idlen); packidx->hdr.crc32[idx] = htobe32(obj->crc); if (obj->off < GOT_PACKIDX_OFFSET_VAL_IS_LARGE_IDX) packidx->hdr.offsets[idx] = htobe32(obj->off); @@ -593,15 +600,17 @@ update_packidx(struct got_packidx *packidx, uint32_t n struct got_indexed_object *obj) { int idx; + size_t idlen; uint32_t nindexed = be32toh(packidx->hdr.fanout_table[0xff]); idx = find_object_idx(packidx, obj->id.sha1); if (idx == -1) return; /* object already indexed */ - memmove(&packidx->hdr.sorted_ids[idx + 1], - &packidx->hdr.sorted_ids[idx], - sizeof(struct got_packidx_object_id) * (nindexed - idx)); + idlen = got_hash_digest_length(GOT_HASH_SHA1); + memmove(packidx->hdr.sorted_ids + (idx + 1) * idlen, + packidx->hdr.sorted_ids + idx * idlen, + idlen * (nindexed - idx)); memmove(&packidx->hdr.offsets[idx + 1], &packidx->hdr.offsets[idx], sizeof(uint32_t) * (nindexed - idx)); @@ -703,7 +712,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE goto done; } packidx.hdr.sorted_ids = calloc(nobj, - sizeof(struct got_packidx_object_id)); + got_hash_digest_length(GOT_HASH_SHA1)); if (packidx.hdr.sorted_ids == NULL) { err = got_error_from_errno("calloc"); goto done; @@ -946,7 +955,7 @@ got_pack_index(struct got_pack *pack, int idxfd, FILE if (err) goto done; err = got_pack_hwrite(idxfd, packidx.hdr.sorted_ids, - nobj * SHA1_DIGEST_LENGTH, &ctx); + nobj * got_hash_digest_length(GOT_HASH_SHA1), &ctx); if (err) goto done; err = got_pack_hwrite(idxfd, packidx.hdr.crc32, blob - 6a757d85bc1ec107e2faa5b5d8a2f03388bef3d7 blob + ba606b8d4113d9346332b7b8d98847343cf07967 --- lib/repository.c +++ lib/repository.c @@ -1118,7 +1118,7 @@ add_packidx_bloom_filter(struct got_repository *repo, { int i, nobjects = be32toh(packidx->hdr.fanout_table[0xff]); struct got_packidx_bloom_filter *bf; - size_t len; + size_t len, idlen; /* * Don't use bloom filters for very large pack index files. @@ -1153,12 +1153,14 @@ add_packidx_bloom_filter(struct got_repository *repo, } bf->path_len = len; + idlen = got_hash_digest_length(GOT_HASH_SHA1); + /* Minimum size supported by our bloom filter is 1000 entries. */ bloom_init(bf->bloom, nobjects < 1000 ? 1000 : nobjects, 0.1); for (i = 0; i < nobjects; i++) { - struct got_packidx_object_id *id; - id = &packidx->hdr.sorted_ids[i]; - bloom_add(bf->bloom, id->sha1, sizeof(id->sha1)); + void *oid; + oid = packidx->hdr.sorted_ids + i * idlen; + bloom_add(bf->bloom, oid, idlen); } RB_INSERT(got_packidx_bloom_filter_tree, blob - 48bed8a15e5f885c25ca796741f37fb70f1b87e3 blob + 4a78540b33f385a58dde782dbe6c4b166b67a8d0 --- lib/repository_admin.c +++ lib/repository_admin.c @@ -532,6 +532,7 @@ got_repo_list_pack(FILE *packfile, struct got_object_i struct got_packidx *packidx = NULL; struct got_pack *pack = NULL; uint32_t nobj, i; + size_t idlen; err = got_object_id_str(&id_str, pack_hash); if (err) @@ -556,9 +557,10 @@ got_repo_list_pack(FILE *packfile, struct got_object_i if (err) goto done; + idlen = got_hash_digest_length(GOT_HASH_SHA1); nobj = be32toh(packidx->hdr.fanout_table[0xff]); for (i = 0; i < nobj; i++) { - struct got_packidx_object_id *oid; + void *oid; struct got_object_id id, base_id; off_t offset, base_offset = 0; uint8_t type; @@ -570,8 +572,9 @@ got_repo_list_pack(FILE *packfile, struct got_object_i if (err) break; } - oid = &packidx->hdr.sorted_ids[i]; - memcpy(id.sha1, oid->sha1, SHA1_DIGEST_LENGTH); + oid = packidx->hdr.sorted_ids + i * idlen; + memset(&id, 0, sizeof(id)); + memcpy(id.sha1, oid, idlen); offset = got_packidx_get_object_offset(packidx, i); if (offset == -1) {

2023-03-07 13:36 Omar Polo:
towards sha256 pack files: opaquify sorted_ids
2023-03-08 13:44 Stefan Sperling:
towards sha256 pack files: opaquify sorted_ids
- 2023-03-08 14:38 Omar Polo:
  towards sha256 pack files: opaquify sorted_ids
- - 2023-03-31 14:14 Omar Polo:
    towards sha256 pack files: opaquify sorted_ids