From: Tom Jones Subject: Re: diff.git: Fix ed script output To: gameoftrees@openbsd.org Date: Wed, 31 Aug 2022 13:24:44 +0100 On Tue, Aug 30, 2022 at 04:55:39PM +0100, Tom Jones wrote: > On Tue, Aug 30, 2022 at 05:35:31PM +0200, Stefan Sperling wrote: > > On Tue, Aug 30, 2022 at 04:28:07PM +0100, Tom Jones wrote: > > > On Tue, Aug 30, 2022 at 05:19:29PM +0200, Stefan Sperling wrote: > > > > On Tue, Aug 30, 2022 at 02:47:58PM +0100, Tom Jones wrote: > > > > > Prior to this change ed script output was in the wrong order, i.e. in > > > > > the order diff_result provides and changes where missing added or > > > > > changed lines. ed edits need to be in reverse order to keep the edited > > > > > file in sync > > > > > > > > As it is, this patch breaks merging on Got, which relies on the > > > > existing output. The existing output is based on the 'ed-style' > > > > script which Caldera diff3(1) generates internally. As you've > > > > discovered this seems to not match traditional diff(1) ed output, > > > > which is unfortunate, and I was unaware of this when I added the > > > > ed output module for the purpose of supporting Got's merging code. > > > > > > > > So we will need both variants. Should we add a flag? > > > > Or would providing two different ed-output modules be better? > > > > > > Does the got version include the changed/added lines? The diff.git one > > > didn't, which made me think it didn't work. > > > > > > In addition to ed script diff also has forward ed (-f) with mangled commands > > > and reverse ed (-n). > > > > > > -f --forward-ed > > > Identical output to that of the -e flag, but in reverse order. > > > It cannot be digested by ed(1). > > > > > > -n Produces a script similar to that of -e, but in the opposite > > > order and with a count of changed lines on each insert or delete > > > command. This is the form used by rcsdiff. > > > > > > Is one of these the input that diff3 requires? > > > > I am not sure which it is. > > I just worked towards what the diff3 code expects, and tested my changes > > by running the Got test suite. The diff3 code used by Got is the same as > > used by OpenRCS and OpenCVS. FreeBSD has this code in usr.bin/diff3/diff3.c. > > Weird. > > I have done some work on diff3 in FreeBSD recently. I'll have a look > into this and respond with a suggestion for how to handle this, it might > be that diff3 is happy just getting the edit ranges from the diff > program and doesn't validate the actual ed script.. > Hi, I have looked into this. getchange in diff3.c takes any line that starts with an integer and treats it as a hunk header. The current diff_output_edscript.c is creating valid classic diff headers rather than ed edits. This seems to mean that while the edscript output works, it is isn't correct just in the way that diff3 wants. I have substituted in the updated diff_output_plain.c from yesterday and am able to pass got's `make regress`. For this case there is an optimisation where diff_output_plain only generates the headers for a hunk, but I'll leave that path for future hackers. Follows is a diff with the changes I made to got, they are minimal beyond copying over the updated diff_output_plain.c - Tom diff --git a/lib/diff3.c b/lib/diff3.c index 3acd3174..ca41b5e1 100644 --- a/lib/diff3.c +++ b/lib/diff3.c @@ -241,7 +241,7 @@ diffreg(BUF **d, const char *path1, const char *path2, } err = got_diffreg_output(NULL, NULL, diffreg_result, 1, 1, "", "", - GOT_DIFF_OUTPUT_EDSCRIPT, 0, outfile); + GOT_DIFF_OUTPUT_PLAIN, 0, outfile); if (err) goto done; diff --git a/lib/diff_output_plain.c b/lib/diff_output_plain.c index 9053d5ed..1ead9121 100644 --- a/lib/diff_output_plain.c +++ b/lib/diff_output_plain.c @@ -27,42 +27,197 @@ #include "diff_internal.h" +static int +output_plain_chunk(struct diff_output_info *outinfo, + FILE *dest, const struct diff_input_info *info, + const struct diff_result *result, + struct diff_chunk_context *cc) +{ + off_t outoff = 0, *offp; + int left_start, left_len, right_start, right_len; + int rc; + bool change = false; + + left_len = cc->left.end - cc->left.start; + if (left_len < 0) + return EINVAL; + else if (result->left->atoms.len == 0) + left_start = 0; + else if (left_len == 0 && cc->left.start > 0) + left_start = cc->left.start; + else if (cc->left.end > 0) + left_start = cc->left.start + 1; + else + left_start = cc->left.start; + + right_len = cc->right.end - cc->right.start; + if (right_len < 0) + return EINVAL; + else if (result->right->atoms.len == 0) + right_start = 0; + else if (right_len == 0 && cc->right.start > 0) + right_start = cc->right.start; + else if (cc->right.end > 0) + right_start = cc->right.start + 1; + else + right_start = cc->right.start; + + if (left_len == 0) { + /* addition */ + if (right_len == 1) { + rc = fprintf(dest, "%da%d\n", left_start, right_start); + } else { + rc = fprintf(dest, "%da%d,%d\n", left_start, + right_start, cc->right.end); + } + } else if (right_len == 0) { + /* deletion */ + if (left_len == 1) { + rc = fprintf(dest, "%dd%d\n", left_start, + right_start); + } else { + rc = fprintf(dest, "%d,%dd%d\n", left_start, + cc->left.end, right_start); + } + } else { + /* change */ + change = true; + if (left_len == 1 && right_len == 1) { + rc = fprintf(dest, "%dc%d\n", left_start, right_start); + } else if (left_len == 1) { + rc = fprintf(dest, "%dc%d,%d\n", left_start, + right_start, cc->right.end); + } else if (right_len == 1) { + rc = fprintf(dest, "%d,%dc%d\n", left_start, + cc->left.end, right_start); + } else { + rc = fprintf(dest, "%d,%dc%d,%d\n", left_start, + cc->left.end, right_start, cc->right.end); + } + } + + /* + * Now write out all the joined chunks. + * + * If the hunk denotes a change, it will come in the form of a deletion + * chunk followed by a addition chunk. Print a marker to break up the + * additions and deletions when this happens. + */ + int c_idx; + for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) { + const struct diff_chunk *c = &result->chunks.head[c_idx]; + if (c->left_count && !c->right_count) + rc = diff_output_lines(outinfo, dest, + c->solved ? "< " : "?", + c->left_start, c->left_count); + else if (c->right_count && !c->left_count) { + if (change) + fprintf(dest, "---\n"); + rc = diff_output_lines(outinfo, dest, + c->solved ? "> " : "?", + c->right_start, c->right_count); + } + if (rc) + return rc; + if (cc->chunk.end == result->chunks.len) { + rc = diff_output_trailing_newline_msg(outinfo, dest, c); + if (rc != DIFF_RC_OK) + return rc; + } + } + + if (rc < 0) + return errno; + if (outinfo) { + ARRAYLIST_ADD(offp, outinfo->line_offsets); + if (offp == NULL) + return ENOMEM; + outoff += rc; + *offp = outoff; + } + + return DIFF_RC_OK; +} + int -diff_output_plain(struct diff_output_info **output_info, FILE *dest, - const struct diff_input_info *info, - const struct diff_result *result) +diff_output_plain(struct diff_output_info **output_info, + FILE *dest, const struct diff_input_info *info, + const struct diff_result *result) { struct diff_output_info *outinfo = NULL; + struct diff_chunk_context cc = {}; + int atomizer_flags = (result->left->atomizer_flags| + result->right->atomizer_flags); + int flags = (result->left->root->diff_flags | + result->right->root->diff_flags); + bool force_text = (flags & DIFF_FLAG_FORCE_TEXT_DATA); + bool have_binary = (atomizer_flags & DIFF_ATOMIZER_FOUND_BINARY_DATA); int i, rc; if (!result) return EINVAL; if (result->rc != DIFF_RC_OK) return result->rc; - + if (output_info) { *output_info = diff_output_info_alloc(); if (*output_info == NULL) - return errno; + return ENOMEM; outinfo = *output_info; } + if (have_binary && !force_text) { + for (i = 0; i < result->chunks.len; i++) { + struct diff_chunk *c = &result->chunks.head[i]; + enum diff_chunk_type t = diff_chunk_type(c); + + if (t != CHUNK_MINUS && t != CHUNK_PLUS) + continue; + + fprintf(dest, "Binary files %s and %s differ\n", + diff_output_get_label_left(info), + diff_output_get_label_right(info)); + break; + } + + return DIFF_RC_OK; + } + for (i = 0; i < result->chunks.len; i++) { - struct diff_chunk *c = &result->chunks.head[i]; - if (c->left_count && c->right_count) - rc = diff_output_lines(outinfo, dest, - c->solved ? " " : "?", - c->left_start, c->left_count); - else if (c->left_count && !c->right_count) - rc = diff_output_lines(outinfo, dest, - c->solved ? "-" : "?", - c->left_start, c->left_count); - else if (c->right_count && !c->left_count) - rc = diff_output_lines(outinfo, dest, - c->solved ? "+" : "?", - c->right_start, c->right_count); - if (rc) + struct diff_chunk *chunk = &result->chunks.head[i]; + enum diff_chunk_type t = diff_chunk_type(chunk); + struct diff_chunk_context next; + + if (t != CHUNK_MINUS && t != CHUNK_PLUS) + continue; + + if (diff_chunk_context_empty(&cc)) { + /* Note down the start point, any number of subsequent + * chunks may be joined up to this chunk by being + * directly adjacent. */ + diff_chunk_context_get(&cc, result, i, 0); + continue; + } + + /* There already is a previous chunk noted down for being + * printed. Does it join up with this one? */ + diff_chunk_context_get(&next, result, i, 0); + + if (diff_chunk_contexts_touch(&cc, &next)) { + /* This next context touches or overlaps the previous + * one, join. */ + diff_chunk_contexts_merge(&cc, &next); + /* When we merge the last chunk we can end up with one + * hanging chunk and have to come back for it after the + * loop */ + continue; + } + rc = output_plain_chunk(outinfo, dest, info, result, &cc); + if (rc != DIFF_RC_OK) return rc; + cc = next; } + if (!diff_chunk_context_empty(&cc)) + return output_plain_chunk(outinfo, dest, info, result, &cc); return DIFF_RC_OK; } diff --git a/lib/diffreg.c b/lib/diffreg.c index 6bd8285d..077eb7fc 100644 --- a/lib/diffreg.c +++ b/lib/diffreg.c @@ -277,8 +277,8 @@ got_diffreg_output(struct got_diff_line **lines, size_t *nlines, if (rc != DIFF_RC_OK) return got_error_set_errno(rc, "diff_output_unidiff"); break; - case GOT_DIFF_OUTPUT_EDSCRIPT: - rc = diff_output_edscript(lines ? &output_info : NULL, + case GOT_DIFF_OUTPUT_PLAIN: + rc = diff_output_plain(lines ? &output_info : NULL, outfile, &info, diff_result->result); if (rc != DIFF_RC_OK) return got_error_set_errno(rc, "diff_output_edscript"); diff --git a/lib/got_lib_diff.h b/lib/got_lib_diff.h index 740595ae..8aae9cbb 100644 --- a/lib/got_lib_diff.h +++ b/lib/got_lib_diff.h @@ -20,7 +20,7 @@ enum got_diff_output_format { GOT_DIFF_OUTPUT_UNIDIFF, - GOT_DIFF_OUTPUT_EDSCRIPT, + GOT_DIFF_OUTPUT_PLAIN, }; struct got_diffreg_result {