"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: diff: limit search effort for function prototypes
To:
Christian Weisgerber <naddy@mips.inka.de>
Cc:
gameoftrees@openbsd.org
Date:
Wed, 9 Dec 2020 01:16:53 +0100

Download raw body.

Thread
On Mon, Dec 07, 2020 at 07:15:32PM +0100, Christian Weisgerber wrote:
> Stefan Sperling:
> 
> >   git clone --bare https://git.torproject.org/tor.git
> >   cd tor.git && got log -l1 -p -c 9c83cd1993b6ed98301020e5a170df9d5427a1e6
> > 
> > Diffing this commit currently takes a very long time.
> > 
> > This limits the upwards search to 1000 lines, and searches just the
> > first byte of each line being scanned.
> 
> Yes to the first-byte optimization.
> 
> I suspect the problem isn't that we scan back thousands of lines,
> but that we do so thousands of times.  GNU diff caches the last
> function prototype and the line number from which it started
> searching.  The next time it will only scan back as far as that
> line number; if it hasn't found a prototype, it will reuse the
> cached one and update the line number.  Is that an approach we could
> use?

Yes, we could. This is a great idea. Implemented below. This approach
is also much faster than my previous patch.

This becomes a bit easier to implement when we search only one of the
files being diffed, instead of either side. This way, a single line
index is sufficient, otherwise we'd have to track two indices (which
would work, but looks awkward).

So with this diff we only search the 'left' version of the file for
prototypes. But that's probably how other diff tools do it as well.

ok?

diff 86b603da3068dce115470492279dc6f86f17f60b /home/stsp/src/diff
blob - 699cdbdee8d7c7fa45ac1a2cf93547d0a2c9fdc8
file + lib/diff_internal.h
--- lib/diff_internal.h
+++ lib/diff_internal.h
@@ -174,6 +174,7 @@ int diff_output_trailing_newline_msg(struct diff_outpu
 				     FILE *dest,
 				     const struct diff_chunk *c);
 int diff_output_match_function_prototype(char **prototype,
+					 int *last_prototype_idx,
 					 const struct diff_result *result,
 					 const struct diff_chunk_context *cc);
 
blob - e286c6225a8b2095cd86a33a23e4c4031b3f3a0c
file + lib/diff_output.c
--- lib/diff_output.c
+++ lib/diff_output.c
@@ -236,9 +236,9 @@ diff_output_trailing_newline_msg(struct diff_output_in
 }
 
 static bool
-is_function_prototype(const char *buf)
+is_function_prototype(unsigned char ch)
 {
-	return isalpha(buf[0]) || buf[0] == '_' || buf[0] == '$';
+	return (isalpha(ch) || ch == '_' || ch == '$');
 }
 
 #define FUNCTION_CONTEXT_SIZE	55
@@ -246,58 +246,62 @@ is_function_prototype(const char *buf)
 
 int
 diff_output_match_function_prototype(char **prototype,
-    const struct diff_result *result,
+    int *last_prototype_idx, const struct diff_result *result,
     const struct diff_chunk_context *cc)
 {
 	struct diff_atom *start_atom, *atom;
 	const struct diff_data *data;
 	unsigned char buf[FUNCTION_CONTEXT_SIZE];
 	char *state = NULL;
-	int rc, i;
+	int rc, i, ch;
 
 	*prototype = NULL;
 
 	if (result->left->atoms.len > 0 && cc->left.start > 0) {
 		data = result->left;
 		start_atom = &data->atoms.head[cc->left.start - 1];
-	} else if (result->right->atoms.len > 0 && cc->right.start > 0) {
-		data = result->right;
-		start_atom = &data->atoms.head[cc->right.start - 1];
 	} else
 		return DIFF_RC_OK;
 
 	diff_data_foreach_atom_backwards_from(start_atom, atom, data) {
-		for (i = 0; i < atom->len && i < sizeof(buf) - 1; i++) {
-			unsigned int ch;
+		int atom_idx = diff_atom_root_idx(data, atom);
+		if (atom_idx < *last_prototype_idx)
+			break;
+		rc = get_atom_byte(&ch, atom, 0);
+		if (rc)
+			return rc;
+		buf[0] = (unsigned char)ch;
+		if (!is_function_prototype(buf[0]))
+			continue;
+		for (i = 1; i < atom->len && i < sizeof(buf) - 1; i++) {
 			rc = get_atom_byte(&ch, atom, i);
 			if (rc)
 				return rc;
 			if (ch == '\n')
 				break;
-			buf[i] = ch;
+			buf[i] = (unsigned char)ch;
 		}
 		buf[i] = '\0';
-		if (is_function_prototype(buf)) {
-			if (begins_with(buf, "private:")) {
-				if (!state)
-					state = " (private)";
-			} else if (begins_with(buf, "protected:")) {
-				if (!state)
-					state = " (protected)";
-			} else if (begins_with(buf, "public:")) {
-				if (!state)
-					state = " (public)";
-			} else {
-				if (state)  /* don't care about truncation */
-					strlcat(buf, state, sizeof(buf));
-				*prototype = strdup(buf);
-				if (*prototype == NULL)
-					return ENOMEM;
-				return DIFF_RC_OK;
-			}
+		if (begins_with(buf, "private:")) {
+			if (!state)
+				state = " (private)";
+		} else if (begins_with(buf, "protected:")) {
+			if (!state)
+				state = " (protected)";
+		} else if (begins_with(buf, "public:")) {
+			if (!state)
+				state = " (public)";
+		} else {
+			if (state)  /* don't care about truncation */
+				strlcat(buf, state, sizeof(buf));
+			*prototype = strdup(buf);
+			if (*prototype == NULL)
+				return ENOMEM;
+			break;
 		}
 	}
 
+	*last_prototype_idx = diff_atom_root_idx(data, start_atom);
 	return DIFF_RC_OK;
 }
 
blob - 0c30eeafe6f5d012f49287395a528b8490816ccd
file + lib/diff_output_unidiff.c
--- lib/diff_output_unidiff.c
+++ lib/diff_output_unidiff.c
@@ -189,6 +189,8 @@ diff_chunk_context_load_change(struct diff_chunk_conte
 
 struct diff_output_unidiff_state {
 	bool header_printed;
+	char *prototype;
+	int last_prototype_idx;
 };
 
 struct diff_output_unidiff_state *
@@ -206,11 +208,15 @@ void
 diff_output_unidiff_state_reset(struct diff_output_unidiff_state *state)
 {
 	state->header_printed = false;
+	free(state->prototype);
+	state->prototype = NULL;
+	state->last_prototype_idx = 0;
 }
 
 void
 diff_output_unidiff_state_free(struct diff_output_unidiff_state *state)
 {
+	free(state->prototype);
 	free(state);
 }
 
@@ -224,7 +230,6 @@ output_unidiff_chunk(struct diff_output_info *outinfo,
 {
 	int rc, left_start, left_len, right_start, right_len;
 	off_t outoff = 0, *offp;
-	char *prototype = NULL;
 
 	if (diff_range_empty(&cc->left) && diff_range_empty(&cc->right))
 		return DIFF_RC_OK;
@@ -279,34 +284,38 @@ output_unidiff_chunk(struct diff_output_info *outinfo,
 		right_start = cc->right.start + 1;
 
 	if (show_function_prototypes) {
+		char *prototype;
 		rc = diff_output_match_function_prototype(&prototype,
-		    result, cc);
+		    &state->last_prototype_idx, result, cc);
 		if (rc)
 			return rc;
+		if (prototype) {
+			free(state->prototype);
+			state->prototype = prototype;
+		}
 	}
 
 	if (left_len == 1 && right_len == 1) {
 		rc = fprintf(dest, "@@ -%d +%d @@%s%s\n",
 			left_start, right_start,
-			prototype ? " " : "",
-			prototype ? : "");
+			state->prototype ? " " : "",
+			state->prototype ? : "");
 	} else if (left_len == 1 && right_len != 1) {
 		rc = fprintf(dest, "@@ -%d +%d,%d @@%s%s\n",
 			left_start, right_start, right_len,
-			prototype ? " " : "",
-			prototype ? : "");
+			state->prototype ? " " : "",
+			state->prototype ? : "");
 	} else if (left_len != 1 && right_len == 1) {
 		rc = fprintf(dest, "@@ -%d,%d +%d @@%s%s\n",
 			left_start, left_len, right_start,
-			prototype ? " " : "",
-			prototype ? : "");
+			state->prototype ? " " : "",
+			state->prototype ? : "");
 	} else {
 		rc = fprintf(dest, "@@ -%d,%d +%d,%d @@%s%s\n",
 			left_start, left_len, right_start, right_len,
-			prototype ? " " : "",
-			prototype ? : "");
+			state->prototype ? " " : "",
+			state->prototype ? : "");
 	}
-	free(prototype);
 	if (rc < 0)
 		return errno;
 	if (outinfo) {