From: Mark Jamsek Subject: Re: expose chunk offsets in the diff API To: Game of Trees Date: Mon, 20 Feb 2023 18:52:08 +1100 On 23-02-19 06:34PM, Stefan Sperling wrote: > On Mon, Feb 20, 2023 at 12:18:00AM +1100, Mark Jamsek wrote: > > On 23-02-20 12:05AM, Mark Jamsek wrote: > > > As discussed with stsp, if we expose each chunk's offset, we can more > > > efficiently seek to where we need to start parsing lines rather than > > > skipping each line till we get there. The below diff makes this change. > > > > I completely forgot to explain that atom->pos is indeed set whether > > mmap'd or not--it is just that atom->at is _only_ set when mmap'd. > > This is done in the atomize routines in diff_atomize_text.c. > > Perhaps the comment in the diff_main.h header could be more precise? > > Indeed. The code is good but the comment is misleading. We should > fix the comment in diff.git and eventually sync over the change. > > > > This also requires tightening the check for chunks with only added lines > > > so we don't grab the offset to unchanged lines (i.e., right_count == > > > left_count). On that note, should we also expose the diff_chunk_type() > > > routine? > > I would say yes. Having this in the public diff APi will be useful as > we expose more of the internals to improve caller efficiency. Thanks, I think so too! The below diff amends the misleading comment, makes diff_chunk_type() public, and replaces the handrolled chunk type deduction with it in get_modified_file_content_status(). If ok, I'll commit the diff bits to diff.git and sync them over before committing the worktree.c change. diffstat /home/mark/src/got M lib/diff_internal.h | 0+ 24- M lib/diff_main.c | 16+ 0- M lib/diff_main.h | 12+ 2- M lib/worktree.c | 1+ 4- 4 files changed, 29 insertions(+), 30 deletions(-) diff /home/mark/src/got commit - 2262237761cd6c8c89a4d4263a8d9c05a7a81d70 path + /home/mark/src/got blob - 7c2bfddb8f77a9026ec782026f3775a07e004f5e file + lib/diff_internal.h --- lib/diff_internal.h +++ lib/diff_internal.h @@ -96,30 +96,6 @@ enum diff_chunk_type { #define DIFF_RESULT_ALLOC_BLOCKSIZE 128 -enum diff_chunk_type { - CHUNK_EMPTY, - CHUNK_PLUS, - CHUNK_MINUS, - CHUNK_SAME, - CHUNK_ERROR, -}; - -static inline enum diff_chunk_type -diff_chunk_type(const struct diff_chunk *chunk) -{ - if (!chunk->left_count && !chunk->right_count) - return CHUNK_EMPTY; - if (!chunk->solved) - return CHUNK_ERROR; - if (!chunk->right_count) - return CHUNK_MINUS; - if (!chunk->left_count) - return CHUNK_PLUS; - if (chunk->left_count != chunk->right_count) - return CHUNK_ERROR; - return CHUNK_SAME; -} - struct diff_chunk_context; bool blob - 25d476df152bc9f65ca65f47590adcfc8ef6ae48 file + lib/diff_main.c --- lib/diff_main.c +++ lib/diff_main.c @@ -35,6 +35,22 @@ static int #include "diff_internal.h" #include "diff_debug.h" +inline enum diff_chunk_type +diff_chunk_type(const struct diff_chunk *chunk) +{ + if (!chunk->left_count && !chunk->right_count) + return CHUNK_EMPTY; + if (!chunk->solved) + return CHUNK_ERROR; + if (!chunk->right_count) + return CHUNK_MINUS; + if (!chunk->left_count) + return CHUNK_PLUS; + if (chunk->left_count != chunk->right_count) + return CHUNK_ERROR; + return CHUNK_SAME; +} + static int read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len) { blob - fa9dc982956eab298f65b4b9ffca114a46b50b76 file + lib/diff_main.h --- lib/diff_main.h +++ lib/diff_main.h @@ -28,8 +28,8 @@ struct diff_atom { struct diff_atom { struct diff_data *root; /* back pointer to root diff data */ - off_t pos; /* if not memory-mapped */ - const uint8_t *at; /* if memory-mapped */ + off_t pos; /* set whether memory-mapped or not */ + const uint8_t *at; /* only set if memory-mapped */ off_t len; /* This hash is just a very cheap speed up for finding *mismatching* @@ -142,6 +142,16 @@ struct diff_state; diff_chunk_arraylist_t chunks; }; +enum diff_chunk_type { + CHUNK_EMPTY, + CHUNK_PLUS, + CHUNK_MINUS, + CHUNK_SAME, + CHUNK_ERROR, +}; + +enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c); + struct diff_state; /* Signature of a utility function to divide a file into diff atoms. blob - 010ea00d11c12ab19a0251310a86b1a87bc8a3e9 file + lib/worktree.c --- lib/worktree.c +++ lib/worktree.c @@ -1561,17 +1561,14 @@ get_modified_file_content_status(unsigned char *status struct diff_chunk *c; struct diff_chunk_context cc = {}; off_t pos; - int clc, crc; /* * We can optimise a little by advancing straight * to the next chunk if this one has no added lines. */ c = diff_chunk_get(r, n); - clc = diff_chunk_get_left_count(c); - crc = diff_chunk_get_right_count(c); - if (!crc || crc == clc) { + if (diff_chunk_type(c) != CHUNK_PLUS) { nchunks_parsed = 1; continue; /* removed or unchanged lines */ } -- Mark Jamsek GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68