"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Mark Jamsek <mark@jamsek.com>
Subject:
Re: expose chunk offsets in the diff API
To:
Game of Trees <gameoftrees@openbsd.org>
Date:
Mon, 20 Feb 2023 18:52:08 +1100

Download raw body.

Thread
On 23-02-19 06:34PM, Stefan Sperling wrote:
> On Mon, Feb 20, 2023 at 12:18:00AM +1100, Mark Jamsek wrote:
> > On 23-02-20 12:05AM, Mark Jamsek wrote:
> > > As discussed with stsp, if we expose each chunk's offset, we can more
> > > efficiently seek to where we need to start parsing lines rather than
> > > skipping each line till we get there. The below diff makes this change.
> > 
> > I completely forgot to explain that atom->pos is indeed set whether
> > mmap'd or not--it is just that atom->at is _only_ set when mmap'd.
> > This is done in the atomize routines in diff_atomize_text.c.
> > Perhaps the comment in the diff_main.h header could be more precise?
> 
> Indeed. The code is good but the comment is misleading. We should
> fix the comment in diff.git and eventually sync over the change.
> 
> > > This also requires tightening the check for chunks with only added lines
> > > so we don't grab the offset to unchanged lines (i.e., right_count ==
> > > left_count). On that note, should we also expose the diff_chunk_type()
> > > routine?
> 
> I would say yes. Having this in the public diff APi will be useful as
> we expose more of the internals to improve caller efficiency.

Thanks, I think so too! The below diff amends the misleading comment,
makes diff_chunk_type() public, and replaces the handrolled chunk type
deduction with it in get_modified_file_content_status().

If ok, I'll commit the diff bits to diff.git and sync them over before
committing the worktree.c change.

diffstat /home/mark/src/got
 M  lib/diff_internal.h  |   0+  24-
 M  lib/diff_main.c      |  16+   0-
 M  lib/diff_main.h      |  12+   2-
 M  lib/worktree.c       |   1+   4-

4 files changed, 29 insertions(+), 30 deletions(-)

diff /home/mark/src/got
commit - 2262237761cd6c8c89a4d4263a8d9c05a7a81d70
path + /home/mark/src/got
blob - 7c2bfddb8f77a9026ec782026f3775a07e004f5e
file + lib/diff_internal.h
--- lib/diff_internal.h
+++ lib/diff_internal.h
@@ -96,30 +96,6 @@ enum diff_chunk_type {
 
 #define DIFF_RESULT_ALLOC_BLOCKSIZE 128
 
-enum diff_chunk_type {
-	CHUNK_EMPTY,
-	CHUNK_PLUS,
-	CHUNK_MINUS,
-	CHUNK_SAME,
-	CHUNK_ERROR,
-};
-
-static inline enum diff_chunk_type
-diff_chunk_type(const struct diff_chunk *chunk)
-{
-	if (!chunk->left_count && !chunk->right_count)
-		return CHUNK_EMPTY;
-	if (!chunk->solved)
-		return CHUNK_ERROR;
-	if (!chunk->right_count)
-		return CHUNK_MINUS;
-	if (!chunk->left_count)
-		return CHUNK_PLUS;
-	if (chunk->left_count != chunk->right_count)
-		return CHUNK_ERROR;
-	return CHUNK_SAME;
-}
-
 struct diff_chunk_context;
 
 bool
blob - 25d476df152bc9f65ca65f47590adcfc8ef6ae48
file + lib/diff_main.c
--- lib/diff_main.c
+++ lib/diff_main.c
@@ -35,6 +35,22 @@ static int
 #include "diff_internal.h"
 #include "diff_debug.h"
 
+inline enum diff_chunk_type
+diff_chunk_type(const struct diff_chunk *chunk)
+{
+	if (!chunk->left_count && !chunk->right_count)
+		return CHUNK_EMPTY;
+	if (!chunk->solved)
+		return CHUNK_ERROR;
+	if (!chunk->right_count)
+		return CHUNK_MINUS;
+	if (!chunk->left_count)
+		return CHUNK_PLUS;
+	if (chunk->left_count != chunk->right_count)
+		return CHUNK_ERROR;
+	return CHUNK_SAME;
+}
+
 static int
 read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
 {
blob - fa9dc982956eab298f65b4b9ffca114a46b50b76
file + lib/diff_main.h
--- lib/diff_main.h
+++ lib/diff_main.h
@@ -28,8 +28,8 @@ struct diff_atom {
 struct diff_atom {
 	struct diff_data *root; /* back pointer to root diff data */
 
-	off_t pos;		/* if not memory-mapped */
-	const uint8_t *at;	/* if memory-mapped */
+	off_t pos;		/* set whether memory-mapped or not */
+	const uint8_t *at;	/* only set if memory-mapped */
 	off_t len;
 
 	/* This hash is just a very cheap speed up for finding *mismatching*
@@ -142,6 +142,16 @@ struct diff_state;
 	diff_chunk_arraylist_t chunks;
 };
 
+enum diff_chunk_type {
+	CHUNK_EMPTY,
+	CHUNK_PLUS,
+	CHUNK_MINUS,
+	CHUNK_SAME,
+	CHUNK_ERROR,
+};
+
+enum diff_chunk_type diff_chunk_type(const struct diff_chunk *c);
+
 struct diff_state;
 
 /* Signature of a utility function to divide a file into diff atoms.
blob - 010ea00d11c12ab19a0251310a86b1a87bc8a3e9
file + lib/worktree.c
--- lib/worktree.c
+++ lib/worktree.c
@@ -1561,17 +1561,14 @@ get_modified_file_content_status(unsigned char *status
 		struct diff_chunk *c;
 		struct diff_chunk_context cc = {};
 		off_t pos;
-		int clc, crc;
 
 		/*
 		 * We can optimise a little by advancing straight
 		 * to the next chunk if this one has no added lines.
 		 */
 		c = diff_chunk_get(r, n);
-		clc = diff_chunk_get_left_count(c);
-		crc = diff_chunk_get_right_count(c);
 
-		if (!crc || crc == clc) {
+		if (diff_chunk_type(c) != CHUNK_PLUS) {
 			nchunks_parsed = 1;
 			continue;  /* removed or unchanged lines */
 		}

-- 
Mark Jamsek <fnc.bsdbox.org|got.bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80  E6E2 2930 DC66 86EE CF68