"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: new diff implementation
To:
Christian Weisgerber <naddy@mips.inka.de>
Cc:
gameoftrees@openbsd.org
Date:
Sat, 24 Oct 2020 14:50:17 +0200

Download raw body.

Thread
On Fri, Oct 23, 2020 at 04:40:41PM +0200, Christian Weisgerber wrote:
> Stefan Sperling:
> 
> > The patch below was generated with the new diff code :)
> 
> Which already reveals a problem.  The patch doesn't apply, because
> this hunk...
> 
> > --- lib/diff.c
> > +++ lib/diff.c
> [...]
> > @@ -796,8 +834,20 @@ got_diff_objects_as_trees(struct got_object_id *id1, s
> >  	arg.diff_context = diff_context;
> >  	arg.ignore_whitespace = ignore_whitespace;
> >  	arg.outfile = outfile;
> > +	if (want_lineoffsets) {
> > +		arg.line_offsets = *line_offsets;
> > +		arg.nlines = *nlines;
> > +	} else {
> > +		arg.line_offsets = NULL;
> > +		arg.nlines = 0;
> > +	}
> >  	err = got_diff_tree(tree1, tree2, label1, label2, repo,
> > -	    got_diff_blob_output_unidiff, &arg, 1);
> > +	    got_diff_blob_output_unidiff, &arg, 1);
> > +
> > +	if (want_lineoffsets) {
> > +		*line_offsets = arg.line_offsets; /* was likely re-allocated */
> > +		*nlines = arg.nlines;
> > +	}
> >  	    got_diff_blob_output_unidiff, &arg, 1);
> >  done:
> >  	if (tree1)
> >  		got_object_tree_close(tree1);
> 
> ... ends with an extraneous line and patch(1) becomes very confused:

It's not just an extra line of context. This line appears three times,
once as -, once as +, and once as context:

  	    got_diff_blob_output_unidiff, &arg, 1);

The old diff code only shows that line as a context line.

Meanwhile Neels has fixed the segfault in patience diff. The patience
diff code does not have the above problem. The updated patch below was
generated with patience diff and round-trips correctly with a fresh
checkout of the Got source tree.

Performance with patience diff seems better than with the old diff code:

$ time got blame got/got.c >/dev/null
    0m23.80s real     0m14.75s user     0m05.36s system
$ time /usr/local/bin/got blame got/got.c >/dev/null
    0m55.23s real     0m29.53s user     0m17.34s system

I've also fixed a problem where a pointless error message about an fprintf
failure was generated when less was closed with 'q' after 'got diff ..| less'

diff refs/heads/main refs/heads/diff
blob - 0eb11ed98ef6532811ab693d8ca239ff97f63150
blob + 29c7f19e9b90a3fa15fae12f16e5e7654db9f42e
--- got/Makefile
+++ got/Makefile
@@ -9,7 +9,11 @@ SRCS=		got.c blame.c commit_graph.c delta.c diff.c \
 		privsep.c reference.c repository.c sha1.c worktree.c \
 		inflate.c buf.c rcsutil.c diff3.c lockfile.c \
 		deflate.c object_create.c delta_cache.c fetch.c \
-		gotconfig.c
+		gotconfig.c diff_main.c diff_atomize_text.c \
+		diff_myers.c diff_output.c diff_output_plain.c \
+		diff_output_unidiff.c diff_output_edscript.c \
+		diff_patience.c
+
 MAN =		${PROG}.1 got-worktree.5 git-repository.5 got.conf.5
 
 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
blob - 5bfcaac31178afc5018d764b995989379c1ccfe7
blob + d76b7778e7da109dd7488d8a390d1eb6782a3943
--- got/got.c
+++ got/got.c
@@ -3141,8 +3141,8 @@ diff_blobs(struct got_object_id *blob_id1, struct got_
 
 	while (path[0] == '/')
 		path++;
-	err = got_diff_blob(blob1, blob2, path, path, diff_context,
-	    ignore_whitespace, stdout);
+	err = got_diff_blob(NULL, NULL, blob1, blob2, path, path,
+	    diff_context, ignore_whitespace, stdout);
 done:
 	if (blob1)
 		got_object_blob_close(blob1);
@@ -3172,6 +3172,8 @@ diff_trees(struct got_object_id *tree_id1, struct got_
 	arg.diff_context = diff_context;
 	arg.ignore_whitespace = ignore_whitespace;
 	arg.outfile = stdout;
+	arg.line_offsets = NULL;
+	arg.nlines = 0;
 	while (path[0] == '/')
 		path++;
 	err = got_diff_tree(tree1, tree2, path, path, repo,
@@ -4044,9 +4046,9 @@ print_diff(void *arg, unsigned char status, unsigned c
 		default:
 			return got_error(GOT_ERR_FILE_STATUS);
 		}
-		return got_diff_objects_as_blobs(blob_id, staged_blob_id,
-		    label1, label2, a->diff_context, a->ignore_whitespace,
-		    a->repo, stdout);
+		return got_diff_objects_as_blobs(NULL, NULL, blob_id,
+		    staged_blob_id, label1, label2, a->diff_context,
+		    a->ignore_whitespace, a->repo, stdout);
 	}
 
 	if (staged_status == GOT_STATUS_ADD ||
@@ -4308,17 +4310,18 @@ cmd_diff(int argc, char *argv[])
 
 	switch (type1) {
 	case GOT_OBJ_TYPE_BLOB:
-		error = got_diff_objects_as_blobs(id1, id2, NULL, NULL,
-		    diff_context, ignore_whitespace, repo, stdout);
+		error = got_diff_objects_as_blobs(NULL, NULL, id1, id2,
+		    NULL, NULL, diff_context, ignore_whitespace, repo,
+		    stdout);
 		break;
 	case GOT_OBJ_TYPE_TREE:
-		error = got_diff_objects_as_trees(id1, id2, "", "",
-		    diff_context, ignore_whitespace, repo, stdout);
+		error = got_diff_objects_as_trees(NULL, NULL, id1, id2,
+		    "", "", diff_context, ignore_whitespace, repo, stdout);
 		break;
 	case GOT_OBJ_TYPE_COMMIT:
 		printf("diff %s %s\n", label1, label2);
-		error = got_diff_objects_as_commits(id1, id2, diff_context,
-		    ignore_whitespace, repo, stdout);
+		error = got_diff_objects_as_commits(NULL, NULL, id1, id2,
+		    diff_context, ignore_whitespace, repo, stdout);
 		break;
 	default:
 		error = got_error(GOT_ERR_OBJ_TYPE);
blob - fd8185fd7383c6da58510578d7d7f7217653f964
blob + 6e5a0c492ea1eaa3cc496e6131fd71978725ad78
--- include/got_diff.h
+++ include/got_diff.h
@@ -21,9 +21,12 @@
  * If a label is NULL, use the blob's SHA1 checksum instead.
  * The number of context lines to show in the diff must be specified as well.
  * Whitespace differences may optionally be ignored.
+ * If not NULL, the two initial output arguments will be populated with an
+ * array of line offsets for, and the number of lines in, the unidiff text.
  */
-const struct got_error *got_diff_blob(struct got_blob_object *,
-    struct got_blob_object *, const char *, const char *, int, int, FILE *);
+const struct got_error *got_diff_blob(off_t **, size_t *,
+    struct got_blob_object *, struct got_blob_object *,
+    const char *, const char *, int, int, FILE *);
 
 /*
  * Compute the differences between a blob and a file and write unified diff
@@ -61,6 +64,21 @@ struct got_diff_blob_output_unidiff_arg {
 	FILE *outfile;		/* Unidiff text will be written here. */
 	int diff_context;	/* Sets the number of context lines. */
 	int ignore_whitespace;	/* Ignore whitespace differences. */
+
+	/*
+	 * The number of lines contained in produced unidiff text output,
+	 * and an array of byte offsets to each line. May be initialized to
+	 * zero and NULL to ignore line offsets. If not NULL, then the line
+	 * offsets array will be populated. Optionally, the array can be
+	 * pre-populated with line offsets, with nlines > 0 indicating
+	 * the length of the pre-populated array. This is useful if the
+	 * output file already contains some lines of text.
+	 * The array will be grown as needed to accomodate additional line
+	 * offsets, and the last offset found in a pre-populated array will
+	 * be added to all subsequent offsets.
+	 */
+	size_t nlines;
+	off_t *line_offsets;	/* Dispose of with free(3) when done. */
 };
 const struct got_error *got_diff_blob_output_unidiff(void *,
     struct got_blob_object *, struct got_blob_object *,
@@ -105,9 +123,12 @@ const struct got_error *got_diff_tree_collect_changed_
  * the diff output. If a label is NULL, use the blob's SHA1 checksum instead.
  * The number of context lines to show in the diff must be specified as well.
  * Write unified diff text to the provided output FILE.
+ * If not NULL, the two initial output arguments will be populated with an
+ * array of line offsets for, and the number of lines in, the unidiff text.
  */
-const struct got_error *got_diff_objects_as_blobs(struct got_object_id *,
-    struct got_object_id *, const char *, const char *, int, int,
+const struct got_error *got_diff_objects_as_blobs(off_t **, size_t *,
+    struct got_object_id *, struct got_object_id *,
+    const char *, const char *, int, int,
     struct got_repository *, FILE *);
 
 /*
@@ -116,17 +137,22 @@ const struct got_error *got_diff_objects_as_blobs(stru
  * the trees. If a label is NULL, use the blob's SHA1 checksum instead.
  * The number of context lines to show in diffs must be specified.
  * Write unified diff text to the provided output FILE.
+ * If not NULL, the two initial output arguments will be populated with an
+ * array of line offsets for, and the number of lines in, the unidiff text.
  */
-const struct got_error *got_diff_objects_as_trees(struct got_object_id *,
-    struct got_object_id *, char *, char *, int, int,
-    struct got_repository *, FILE *);
+const struct got_error *got_diff_objects_as_trees(off_t **, size_t *,
+    struct got_object_id *, struct got_object_id *, char *, char *,
+    int, int, struct got_repository *, FILE *);
 
 /*
  * Diff two objects, assuming both objects are commits.
  * The number of context lines to show in diffs must be specified.
  * Write unified diff text to the provided output FILE.
+ * If not NULL, the two initial output arguments will be populated with an
+ * array of line offsets for, and the number of lines in, the unidiff text.
  */
-const struct got_error *got_diff_objects_as_commits(struct got_object_id *,
-    struct got_object_id *, int, int, struct got_repository *, FILE *);
+const struct got_error *got_diff_objects_as_commits(off_t **, size_t *,
+    struct got_object_id *, struct got_object_id *, int, int,
+    struct got_repository *, FILE *);
 
 #define GOT_DIFF_MAX_CONTEXT	64
blob - f8f1b5eaab4da84a1833683fc78601583cb35fc1
blob + 88eac60191927adbd26a773cb99990d980d23d3a
--- lib/blame.c
+++ lib/blame.c
@@ -15,10 +15,12 @@
  */
 
 #include <sys/queue.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
 
 #include <sha1.h>
 #include <string.h>
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <time.h>
@@ -38,6 +40,10 @@
 #include "got_lib_object.h"
 #include "got_lib_diff.h"
 
+#ifndef MAX
+#define	MAX(_a,_b) ((_a) > (_b) ? (_a) : (_b))
+#endif
+
 struct got_blame_line {
 	int annotated;
 	struct got_object_id id;
@@ -45,6 +51,10 @@ struct got_blame_line {
 
 struct got_blame {
 	FILE *f;
+	char *map;
+	struct diff_data left_data;
+	struct diff_data right_data;
+	const struct diff_config *cfg;
 	size_t filesize;
 	int nlines;
 	int nannotated;
@@ -77,22 +87,25 @@ annotate_line(struct got_blame *blame, int lineno, str
 }
 
 static const struct got_error *
-blame_changes(struct got_blame *blame, struct got_diff_changes *changes,
+blame_changes(struct got_blame *blame, struct diff_result *diff_result,
     struct got_object_id *commit_id,
     const struct got_error *(*cb)(void *, int, int, struct got_object_id *),
     void *arg)
 {
 	const struct got_error *err = NULL;
-	struct got_diff_change *change;
+	int i, nchunks_used = 0;
 
-	SIMPLEQ_FOREACH(change, &changes->entries, entry) {
-		int c = change->cv.c;
-		int d = change->cv.d;
-		int new_lineno = (c < d ? c : d);
-		int new_length = (c < d ? d - c + 1 : (c == d ? 1 : 0));
+	for (i = 0; i < diff_result->chunks.len; i += nchunks_used) {
+		struct diff_chunk_context cc = {};
+		int lineno, len;
 		int ln;
 
-		for (ln = new_lineno; ln < new_lineno + new_length; ln++) {
+		diff_chunk_context_load_change(&cc, &nchunks_used,
+		    diff_result, i, 0);
+
+		lineno = cc.right.start + 1;
+		len = MAX(0, cc.right.end - cc.right.start);
+		for (ln = lineno; ln < lineno + len; ln++) {
 			err = annotate_line(blame, ln, commit_id, cb, arg);
 			if (err)
 				return err;
@@ -115,7 +128,7 @@ blame_commit(struct got_blame *blame, struct got_objec
 	struct got_object_id *obj_id = NULL;
 	struct got_commit_object *commit = NULL;
 	struct got_blob_object *blob = NULL;
-	struct got_diff_changes *changes = NULL;
+	struct got_diffreg_result *diffreg_result = NULL;
 
 	err = got_object_open_as_commit(&commit, repo, parent_id);
 	if (err)
@@ -146,17 +159,25 @@ blame_commit(struct got_blame *blame, struct got_objec
 		goto done;
 	}
 
-	err = got_diff_blob_file_lines_changed(&changes, blob, blame->f,
-	    blame->filesize);
+	diff_data_free(&blame->left_data);
+	memset(&blame->left_data, 0, sizeof(blame->left_data));
+	err = got_diff_blob_prepared_file(&diffreg_result, &blame->left_data,
+	    blob, &blame->right_data, blame->f, blame->map, blame->filesize,
+	    blame->cfg, 0);
 	if (err)
 		goto done;
 
-	if (changes) {
-		err = blame_changes(blame, changes, id, cb, arg);
-		got_diff_free_changes(changes);
+	if (diffreg_result->result->chunks.len > 0) {
+		err = blame_changes(blame, diffreg_result->result, id, cb, arg);
 	} else if (cb)
 		err = cb(arg, blame->nlines, -1, id);
 done:
+	if (diffreg_result) {
+		const struct got_error *free_err;
+		free_err = got_diffreg_result_free_left(diffreg_result);
+		if (free_err && err == NULL)
+			err = free_err;
+	}
 	if (commit)
 		got_object_commit_close(commit);
 	free(obj_id);
@@ -172,7 +193,11 @@ blame_close(struct got_blame *blame)
 {
 	const struct got_error *err = NULL;
 
-	if (blame->f && fclose(blame->f) != 0)
+	diff_data_free(&blame->left_data);
+	diff_data_free(&blame->right_data);
+	if (blame->map && munmap(blame->map, blame->filesize) == -1)
+		err = got_error_from_errno("munmap");
+	if (blame->f && fclose(blame->f) != 0 && err == NULL)
 		err = got_error_from_errno("fclose");
 	free(blame->lines);
 	free(blame);
@@ -191,7 +216,8 @@ blame_open(struct got_blame **blamep, const char *path
 	struct got_blob_object *blob = NULL;
 	struct got_blame *blame = NULL;
 	struct got_object_id *id = NULL, *pid = NULL;
-	int lineno;
+	int lineno, created;
+	size_t size;
 	struct got_commit_graph *graph = NULL;
 
 	*blamep = NULL;
@@ -227,6 +253,12 @@ blame_open(struct got_blame **blamep, const char *path
 	if (err || blame->nlines == 0)
 		goto done;
 
+	blame->cfg = got_diff_get_config(GOT_DIFF_ALGORITHM_MYERS),
+	err = got_diff_prepare_file(&blame->f, &blame->map, &created, &size,
+	    &blame->right_data, blame->cfg, 0);
+	if (err)
+		goto done;
+
 	/* Don't include \n at EOF in the blame line count. */
 	if (blame->line_offsets[blame->nlines - 1] == blame->filesize)
 		blame->nlines--;
blob - /dev/null
blob + 112da0c2e234000d38b2dea5c3b1112fa045b7a0 (mode 644)
--- /dev/null
+++ lib/arraylist.h
@@ -0,0 +1,115 @@
+/* Auto-reallocating array for arbitrary member types. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/* Usage:
+ *
+ * ARRAYLIST(any_type_t) list;
+ * // OR
+ * typedef ARRAYLIST(any_type_t) any_type_list_t;
+ * any_type_list_t list;
+ *
+ * // pass the number of (at first unused) members to add on each realloc:
+ * ARRAYLIST_INIT(list, 128);
+ * any_type_t *x;
+ * while (bar) {
+ *         // This enlarges the allocated array as needed;
+ *         // list.head may change due to realloc:
+ *         ARRAYLIST_ADD(x, list);
+ *         if (!x)
+ *                 return ENOMEM; 
+ *         *x = random_foo_value;
+ * }
+ * for (i = 0; i < list.len; i++)
+ *         printf("%s", foo_to_str(list.head[i]));
+ * ARRAYLIST_FREE(list);
+ */
+#define ARRAYLIST(MEMBER_TYPE) \
+	struct { \
+		MEMBER_TYPE *head; \
+		MEMBER_TYPE *p; \
+		unsigned int len; \
+		unsigned int allocated; \
+		unsigned int alloc_blocksize; \
+	}
+
+#define ARRAYLIST_INIT(ARRAY_LIST, ALLOC_BLOCKSIZE) do { \
+		(ARRAY_LIST).head = NULL; \
+		(ARRAY_LIST).len = 0; \
+		(ARRAY_LIST).allocated = 0; \
+		(ARRAY_LIST).alloc_blocksize = ALLOC_BLOCKSIZE; \
+	} while(0)
+
+#define ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST) do { \
+		if ((ARRAY_LIST).len && !(ARRAY_LIST).allocated) { \
+			NEW_ITEM_P = NULL; \
+			break; \
+		} \
+		if ((ARRAY_LIST).head == NULL \
+		    || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+			(ARRAY_LIST).p = recallocarray((ARRAY_LIST).head, \
+				(ARRAY_LIST).len, \
+				(ARRAY_LIST).allocated + \
+				((ARRAY_LIST).alloc_blocksize ? : 8), \
+				sizeof(*(ARRAY_LIST).head)); \
+			if ((ARRAY_LIST).p == NULL) { \
+				NEW_ITEM_P = NULL; \
+				break; \
+			} \
+			(ARRAY_LIST).allocated += \
+				(ARRAY_LIST).alloc_blocksize ? : 8; \
+			(ARRAY_LIST).head = (ARRAY_LIST).p; \
+			(ARRAY_LIST).p = NULL; \
+		}; \
+		if ((ARRAY_LIST).head == NULL \
+		    || (ARRAY_LIST).allocated < (ARRAY_LIST).len + 1) { \
+			NEW_ITEM_P = NULL; \
+			break; \
+		} \
+		(NEW_ITEM_P) = &(ARRAY_LIST).head[(ARRAY_LIST).len]; \
+		(ARRAY_LIST).len++; \
+	} while (0)
+
+#define ARRAYLIST_INSERT(NEW_ITEM_P, ARRAY_LIST, AT_IDX) do { \
+		int _at_idx = (AT_IDX); \
+		ARRAYLIST_ADD(NEW_ITEM_P, ARRAY_LIST); \
+		if ((NEW_ITEM_P) \
+		    && _at_idx >= 0 \
+		    && _at_idx < (ARRAY_LIST).len) { \
+			memmove(&(ARRAY_LIST).head[_at_idx + 1], \
+				&(ARRAY_LIST).head[_at_idx], \
+				((ARRAY_LIST).len - 1 - _at_idx) \
+					* sizeof(*(ARRAY_LIST).head)); \
+			(NEW_ITEM_P) = &(ARRAY_LIST).head[_at_idx]; \
+		}; \
+	} while (0)
+
+#define ARRAYLIST_CLEAR(ARRAY_LIST) \
+	(ARRAY_LIST).len = 0
+
+#define ARRAYLIST_FREE(ARRAY_LIST) \
+	do { \
+		if ((ARRAY_LIST).head && (ARRAY_LIST).allocated) \
+			free((ARRAY_LIST).head); \
+		ARRAYLIST_INIT(ARRAY_LIST, (ARRAY_LIST).alloc_blocksize); \
+	} while(0)
+
+#define ARRAYLIST_FOREACH(ITEM_P, ARRAY_LIST) \
+	for ((ITEM_P) = (ARRAY_LIST).head; \
+	     (ITEM_P) - (ARRAY_LIST).head < (ARRAY_LIST).len; \
+	     (ITEM_P)++)
+
+#define ARRAYLIST_IDX(ITEM_P, ARRAY_LIST) ((ITEM_P) - (ARRAY_LIST).head)
blob - 7e5ee06994a5158bc937ce8307bd19f51d7e0ef5
blob + 8e431b84cef0e76870b8054c4318dfac9bc02177
--- lib/diff.c
+++ lib/diff.c
@@ -17,6 +17,7 @@
 #include <sys/queue.h>
 #include <sys/stat.h>
 
+#include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -39,27 +40,47 @@
 #include "got_lib_object.h"
 
 static const struct got_error *
-diff_blobs(struct got_blob_object *blob1, struct got_blob_object *blob2,
+add_line_offset(off_t **line_offsets, size_t *nlines, off_t off)
+{
+	off_t *p;
+
+	p = reallocarray(*line_offsets, *nlines + 1, sizeof(off_t));
+	if (p == NULL)
+		return got_error_from_errno("reallocarray");
+	*line_offsets = p;
+	(*line_offsets)[*nlines] = off;
+	(*nlines)++;
+	return NULL;
+}
+
+static const struct got_error *
+diff_blobs(off_t **line_offsets, size_t *nlines,
+    struct got_diffreg_result **resultp, struct got_blob_object *blob1,
+    struct got_blob_object *blob2,
     const char *label1, const char *label2, mode_t mode1, mode_t mode2,
-    int diff_context, int ignore_whitespace, FILE *outfile,
-    struct got_diff_changes *changes)
+    int diff_context, int ignore_whitespace, FILE *outfile)
 {
-	struct got_diff_state ds;
-	struct got_diff_args args;
-	const struct got_error *err = NULL;
+	const struct got_error *err = NULL, *free_err;
 	FILE *f1 = NULL, *f2 = NULL;
 	char hex1[SHA1_DIGEST_STRING_LENGTH];
 	char hex2[SHA1_DIGEST_STRING_LENGTH];
 	char *idstr1 = NULL, *idstr2 = NULL;
 	size_t size1, size2;
-	int res, flags = 0;
+	struct got_diffreg_result *result;
+	off_t outoff = 0;
+	int n;
+
+	if (line_offsets && *line_offsets && *nlines > 0)
+		outoff = (*line_offsets)[*nlines - 1];
+
+	if (resultp)
+		*resultp = NULL;
 
 	if (blob1) {
 		f1 = got_opentemp();
 		if (f1 == NULL)
 			return got_error_from_errno("got_opentemp");
-	} else
-		flags |= D_EMPTY1;
+	}
 
 	if (blob2) {
 		f2 = got_opentemp();
@@ -68,8 +89,7 @@ diff_blobs(struct got_blob_object *blob1, struct got_b
 			fclose(f1);
 			return err;
 		}
-	} else
-		flags |= D_EMPTY2;
+	}
 
 	size1 = 0;
 	if (blob1) {
@@ -91,27 +111,6 @@ diff_blobs(struct got_blob_object *blob1, struct got_b
 	} else
 		idstr2 = "/dev/null";
 
-	memset(&ds, 0, sizeof(ds));
-	/* XXX should stat buffers be passed in args instead of ds? */
-	ds.stb1.st_mode = S_IFREG;
-	if (blob1)
-		ds.stb1.st_size = size1;
-	ds.stb1.st_mtime = 0; /* XXX */
-
-	ds.stb2.st_mode = S_IFREG;
-	if (blob2)
-		ds.stb2.st_size = size2;
-	ds.stb2.st_mtime = 0; /* XXX */
-
-	memset(&args, 0, sizeof(args));
-	args.diff_format = D_UNIFIED;
-	args.label[0] = label1 ? label1 : idstr1;
-	args.label[1] = label2 ? label2 : idstr2;
-	args.diff_context = diff_context;
-	flags |= D_PROTOTYPE;
-	if (ignore_whitespace)
-		flags |= D_IGNOREBLANKS;
-
 	if (outfile) {
 		char *modestr1 = NULL, *modestr2 = NULL;
 		int modebits;
@@ -137,15 +136,52 @@ diff_blobs(struct got_blob_object *blob1, struct got_b
 				goto done;
 			}
 		}
-		fprintf(outfile, "blob - %s%s\n", idstr1,
+		n = fprintf(outfile, "blob - %s%s\n", idstr1,
 		    modestr1 ? modestr1 : "");
-		fprintf(outfile, "blob + %s%s\n", idstr2,
+		if (n < 0)
+			goto done;
+		outoff += n;
+		if (line_offsets) {
+			err = add_line_offset(line_offsets, nlines, outoff);
+			if (err)
+				goto done;
+		}
+
+		n = fprintf(outfile, "blob + %s%s\n", idstr2,
 		    modestr2 ? modestr2 : "");
+		if (n < 0)
+			goto done;
+		outoff += n;
+		if (line_offsets) {
+			err = add_line_offset(line_offsets, nlines, outoff);
+			if (err)
+				goto done;
+		}
+
 		free(modestr1);
 		free(modestr2);
 	}
-	err = got_diffreg(&res, f1, f2, flags, &args, &ds, outfile, changes);
-	got_diff_state_free(&ds);
+	err = got_diffreg(&result, f1, f2, GOT_DIFF_ALGORITHM_PATIENCE,
+	    ignore_whitespace);
+	if (err)
+		goto done;
+
+	if (outfile) {
+		err = got_diffreg_output(line_offsets, nlines, result, f1, f2,
+		    label1 ? label1 : idstr1,
+		    label2 ? label2 : idstr2,
+		    GOT_DIFF_OUTPUT_UNIDIFF, diff_context, outfile);
+		if (err)
+			goto done;
+	}
+
+	if (resultp && err == NULL)
+		*resultp = result;
+	else {
+		free_err = got_diffreg_result_free(result);
+		if (free_err && err == NULL)
+			err = free_err;
+	}
 done:
 	if (f1 && fclose(f1) != 0 && err == NULL)
 		err = got_error_from_errno("fclose");
@@ -162,45 +198,35 @@ got_diff_blob_output_unidiff(void *arg, struct got_blo
 {
 	struct got_diff_blob_output_unidiff_arg *a = arg;
 
-	return diff_blobs(blob1, blob2, label1, label2, mode1, mode2,
-	    a->diff_context, a->ignore_whitespace, a->outfile, NULL);
+	return diff_blobs(&a->line_offsets, &a->nlines, NULL,
+	    blob1, blob2, label1, label2, mode1, mode2, a->diff_context,
+	    a->ignore_whitespace, a->outfile);
 }
 
 const struct got_error *
-got_diff_blob(struct got_blob_object *blob1, struct got_blob_object *blob2,
+got_diff_blob(off_t **line_offsets, size_t *nlines,
+    struct got_blob_object *blob1, struct got_blob_object *blob2,
     const char *label1, const char *label2, int diff_context,
     int ignore_whitespace, FILE *outfile)
 {
-	return diff_blobs(blob1, blob2, label1, label2, 0, 0, diff_context,
-	    ignore_whitespace, outfile, NULL);
+	return diff_blobs(line_offsets, nlines, NULL, blob1, blob2,
+	    label1, label2, 0, 0, diff_context, ignore_whitespace, outfile);
 }
 
 static const struct got_error *
-alloc_changes(struct got_diff_changes **changes)
-{
-	*changes = calloc(1, sizeof(**changes));
-	if (*changes == NULL)
-		return got_error_from_errno("calloc");
-	SIMPLEQ_INIT(&(*changes)->entries);
-	return NULL;
-}
-
-static const struct got_error *
-diff_blob_file(struct got_diff_changes **changes,
+diff_blob_file(struct got_diffreg_result **resultp,
     struct got_blob_object *blob1, const char *label1, FILE *f2, size_t size2,
     const char *label2, int diff_context, int ignore_whitespace, FILE *outfile)
 {
-	struct got_diff_state ds;
-	struct got_diff_args args;
-	const struct got_error *err = NULL;
+	const struct got_error *err = NULL, *free_err;
 	FILE *f1 = NULL;
 	char hex1[SHA1_DIGEST_STRING_LENGTH];
 	char *idstr1 = NULL;
 	size_t size1;
-	int res, flags = 0;
+	struct got_diffreg_result *result = NULL;
 
-	if (changes)
-		*changes = NULL;
+	if (resultp)
+		*resultp = NULL;
 
 	size1 = 0;
 	if (blob1) {
@@ -213,46 +239,35 @@ diff_blob_file(struct got_diff_changes **changes,
 		if (err)
 			goto done;
 	} else {
-		flags |= D_EMPTY1;
 		idstr1 = "/dev/null";
 	}
-
-	if (f2 == NULL)
-		flags |= D_EMPTY2;
-
-	memset(&ds, 0, sizeof(ds));
-	/* XXX should stat buffers be passed in args instead of ds? */
-	ds.stb1.st_mode = S_IFREG;
-	if (blob1)
-		ds.stb1.st_size = size1;
-	ds.stb1.st_mtime = 0; /* XXX */
-
-	ds.stb2.st_mode = S_IFREG;
-	ds.stb2.st_size = size2;
-	ds.stb2.st_mtime = 0; /* XXX */
-
-	memset(&args, 0, sizeof(args));
-	args.diff_format = D_UNIFIED;
-	args.label[0] = label2;
-	args.label[1] = label2;
-	args.diff_context = diff_context;
-	flags |= D_PROTOTYPE;
-	if (ignore_whitespace)
-		flags |= D_IGNOREBLANKS;
 
 	if (outfile) {
 		fprintf(outfile, "blob - %s\n", label1 ? label1 : idstr1);
 		fprintf(outfile, "file + %s\n",
 		    f2 == NULL ? "/dev/null" : label2);
 	}
-	if (changes) {
-		err = alloc_changes(changes);
+
+	err = got_diffreg(&result, f1, f2, GOT_DIFF_ALGORITHM_PATIENCE, 
+	    ignore_whitespace);
+	if (err)
+		goto done;
+
+	if (outfile) {
+		err = got_diffreg_output(NULL, NULL, result, f1, f2,
+		    label2, label2, GOT_DIFF_OUTPUT_UNIDIFF, diff_context,
+		    outfile);
 		if (err)
-			return err;
+			goto done;
 	}
-	err = got_diffreg(&res, f1, f2, flags, &args, &ds, outfile,
-	    changes ? *changes : NULL);
-	got_diff_state_free(&ds);
+
+	if (resultp && err == NULL)
+		*resultp = result;
+	else if (result) {
+		free_err = got_diffreg_result_free(result);
+		if (free_err && err == NULL)
+			err = free_err;
+	}
 done:
 	if (f1 && fclose(f1) != 0 && err == NULL)
 		err = got_error_from_errno("fclose");
@@ -269,41 +284,57 @@ got_diff_blob_file(struct got_blob_object *blob1, cons
 }
 
 const struct got_error *
-got_diff_blob_file_lines_changed(struct got_diff_changes **changes,
-    struct got_blob_object *blob1, FILE *f2, size_t size2)
+got_diff_blob_prepared_file(struct got_diffreg_result **resultp,
+    struct diff_data *data1, struct got_blob_object *blob1,
+    struct diff_data *data2, FILE *f2, char *p2, size_t size2,
+    const struct diff_config *cfg, int ignore_whitespace)
 {
-	return diff_blob_file(changes, blob1, NULL, f2, size2, NULL,
-	    0, 0, NULL);
-}
+	const struct got_error *err = NULL, *free_err;
+	FILE *f1 = NULL;
+	char hex1[SHA1_DIGEST_STRING_LENGTH];
+	char *idstr1 = NULL, *p1 = NULL;
+	size_t size1, size;
+	struct got_diffreg_result *result = NULL;
+	int f1_created = 0;
 
-const struct got_error *
-got_diff_blob_lines_changed(struct got_diff_changes **changes,
-    struct got_blob_object *blob1, struct got_blob_object *blob2)
-{
-	const struct got_error *err = NULL;
+	*resultp = NULL;
 
-	err = alloc_changes(changes);
+	size1 = 0;
+	if (blob1) {
+		f1 = got_opentemp();
+		if (f1 == NULL)
+			return got_error_from_errno("got_opentemp");
+		idstr1 = got_object_blob_id_str(blob1, hex1, sizeof(hex1));
+		err = got_object_blob_dump_to_file(&size1, NULL, NULL, f1,
+		    blob1);
+		if (err)
+			goto done;
+	} else {
+		idstr1 = "/dev/null";
+	}
+
+	err = got_diff_prepare_file(&f1, &p1, &f1_created, &size,
+	    data1, cfg, ignore_whitespace);
 	if (err)
-		return err;
+		goto done;
 
-	err = diff_blobs(blob1, blob2, NULL, NULL, 0, 0, 3, 0, NULL, *changes);
+	err = got_diffreg_prepared_files(&result, cfg, data1, f1,
+	    p1, size1, data2, f2, p2, size2);
+	if (err)
+		goto done;
+
+	*resultp = result;
+done:
 	if (err) {
-		got_diff_free_changes(*changes);
-		*changes = NULL;
+		if (result)
+			free_err = got_diffreg_result_free_left(result);
+		else
+			free_err = got_diffreg_close(f1, p1, size1, NULL,
+			    NULL, 0);
+		if (free_err && err == NULL)
+			err = free_err;
 	}
 	return err;
-}
-
-void
-got_diff_free_changes(struct got_diff_changes *changes)
-{
-	struct got_diff_change *change;
-	while (!SIMPLEQ_EMPTY(&changes->entries)) {
-		change = SIMPLEQ_FIRST(&changes->entries);
-		SIMPLEQ_REMOVE_HEAD(&changes->entries, entry);
-		free(change);
-	}
-	free(changes);
 }
 
 static const struct got_error *
@@ -741,7 +772,8 @@ got_diff_tree(struct got_tree_object *tree1, struct go
 }
 
 const struct got_error *
-got_diff_objects_as_blobs(struct got_object_id *id1, struct got_object_id *id2,
+got_diff_objects_as_blobs(off_t **line_offsets, size_t *nlines,
+    struct got_object_id *id1, struct got_object_id *id2,
     const char *label1, const char *label2, int diff_context,
     int ignore_whitespace, struct got_repository *repo, FILE *outfile)
 {
@@ -761,8 +793,8 @@ got_diff_objects_as_blobs(struct got_object_id *id1, s
 		if (err)
 			goto done;
 	}
-	err = got_diff_blob(blob1, blob2, label1, label2, diff_context,
-	    ignore_whitespace, outfile);
+	err = got_diff_blob(line_offsets, nlines, blob1, blob2,
+	    label1, label2, diff_context, ignore_whitespace, outfile);
 done:
 	if (blob1)
 		got_object_blob_close(blob1);
@@ -772,13 +804,15 @@ done:
 }
 
 const struct got_error *
-got_diff_objects_as_trees(struct got_object_id *id1, struct got_object_id *id2,
+got_diff_objects_as_trees(off_t **line_offsets, size_t *nlines,
+    struct got_object_id *id1, struct got_object_id *id2,
     char *label1, char *label2, int diff_context, int ignore_whitespace,
     struct got_repository *repo, FILE *outfile)
 {
 	const struct got_error *err;
 	struct got_tree_object *tree1 = NULL, *tree2 = NULL;
 	struct got_diff_blob_output_unidiff_arg arg;
+	int want_lineoffsets = (line_offsets != NULL && *line_offsets != NULL);
 
 	if (id1 == NULL && id2 == NULL)
 		return got_error(GOT_ERR_NO_OBJ);
@@ -796,8 +830,20 @@ got_diff_objects_as_trees(struct got_object_id *id1, s
 	arg.diff_context = diff_context;
 	arg.ignore_whitespace = ignore_whitespace;
 	arg.outfile = outfile;
+	if (want_lineoffsets) {
+		arg.line_offsets = *line_offsets;
+		arg.nlines = *nlines;
+	} else {
+		arg.line_offsets = NULL;
+		arg.nlines = 0;
+	}
 	err = got_diff_tree(tree1, tree2, label1, label2, repo,
 	    got_diff_blob_output_unidiff, &arg, 1);
+
+	if (want_lineoffsets) {
+		*line_offsets = arg.line_offsets; /* was likely re-allocated */
+		*nlines = arg.nlines;
+	}
 done:
 	if (tree1)
 		got_object_tree_close(tree1);
@@ -807,8 +853,9 @@ done:
 }
 
 const struct got_error *
-got_diff_objects_as_commits(struct got_object_id *id1,
-    struct got_object_id *id2, int diff_context, int ignore_whitespace,
+got_diff_objects_as_commits(off_t **line_offsets, size_t *nlines,
+    struct got_object_id *id1, struct got_object_id *id2,
+    int diff_context, int ignore_whitespace,
     struct got_repository *repo, FILE *outfile)
 {
 	const struct got_error *err;
@@ -827,7 +874,7 @@ got_diff_objects_as_commits(struct got_object_id *id1,
 	if (err)
 		goto done;
 
-	err = got_diff_objects_as_trees(
+	err = got_diff_objects_as_trees(line_offsets, nlines,
 	    commit1 ? got_object_commit_get_tree_id(commit1) : NULL,
 	    got_object_commit_get_tree_id(commit2), "", "", diff_context,
 	    ignore_whitespace, repo, outfile);
@@ -840,50 +887,15 @@ done:
 }
 
 const struct got_error *
-got_diff_files(struct got_diff_changes **changes,
-    struct got_diff_state **ds,
-    struct got_diff_args **args,
-    int *flags,
-    FILE *f1, size_t size1, const char *label1,
-    FILE *f2, size_t size2, const char *label2,
-    int diff_context, FILE *outfile)
+got_diff_files(struct got_diffreg_result **resultp,
+    FILE *f1, const char *label1, FILE *f2, const char *label2,
+    int diff_context, int ignore_whitespace, FILE *outfile)
 {
 	const struct got_error *err = NULL;
-	int res;
+	struct got_diffreg_result *diffreg_result = NULL;
 
-	*flags = 0;
-	*ds = calloc(1, sizeof(**ds));
-	if (*ds == NULL)
-		return got_error_from_errno("calloc");
-	*args = calloc(1, sizeof(**args));
-	if (*args == NULL) {
-		err = got_error_from_errno("calloc");
-		goto done;
-	}
-
-	if (changes)
-		*changes = NULL;
-
-	if (f1 == NULL)
-		*flags |= D_EMPTY1;
-
-	if (f2 == NULL)
-		*flags |= D_EMPTY2;
-
-	/* XXX should stat buffers be passed in args instead of ds? */
-	(*ds)->stb1.st_mode = S_IFREG;
-	(*ds)->stb1.st_size = size1;
-	(*ds)->stb1.st_mtime = 0; /* XXX */
-
-	(*ds)->stb2.st_mode = S_IFREG;
-	(*ds)->stb2.st_size = size2;
-	(*ds)->stb2.st_mtime = 0; /* XXX */
-
-	(*args)->diff_format = D_UNIFIED;
-	(*args)->label[0] = label1;
-	(*args)->label[1] = label2;
-	(*args)->diff_context = diff_context;
-	*flags |= D_PROTOTYPE;
+	if (resultp)
+		*resultp = NULL;
 
 	if (outfile) {
 		fprintf(outfile, "file - %s\n",
@@ -891,29 +903,29 @@ got_diff_files(struct got_diff_changes **changes,
 		fprintf(outfile, "file + %s\n",
 		    f2 == NULL ? "/dev/null" : label2);
 	}
-	if (changes) {
-		err = alloc_changes(changes);
+
+	err = got_diffreg(&diffreg_result, f1, f2, GOT_DIFF_ALGORITHM_PATIENCE,
+	    ignore_whitespace);
+	if (err)
+		goto done;
+
+	if (outfile) {
+		err = got_diffreg_output(NULL, NULL, diffreg_result,
+		    f1, f2, label1, label2, GOT_DIFF_OUTPUT_UNIDIFF,
+		    diff_context, outfile);
 		if (err)
 			goto done;
 	}
-	err = got_diffreg(&res, f1, f2, *flags, *args, *ds, outfile,
-	    changes ? *changes : NULL);
+
 done:
-	if (err) {
-		if (*ds) {
-			got_diff_state_free(*ds);
-			free(*ds);
-			*ds = NULL;
-		}
-		if (*args) {
-			free(*args);
-			*args = NULL;
-		}
-		if (changes) {
-			if (*changes)
-				got_diff_free_changes(*changes);
-			*changes = NULL;
-		}
+	if (resultp && err == NULL)
+		*resultp = diffreg_result;
+	else if (diffreg_result) {
+		const struct got_error *free_err;
+		free_err = got_diffreg_result_free(diffreg_result);
+		if (free_err && err == NULL)
+			err = free_err;
 	}
+
 	return err;
 }
blob - e8841fa9d921c07d456480829fe90a68bca1f84a
blob + d685f9e10bd4f51dc68c4cb81c4adbec7998b4fa
--- lib/diff3.c
+++ lib/diff3.c
@@ -202,9 +202,7 @@ diffreg(BUF **d, const char *path1, const char *path2)
 	const struct got_error *err = NULL;
 	FILE *f1 = NULL, *f2 = NULL, *outfile = NULL;
 	char *outpath = NULL;
-	struct got_diff_state ds;
-	struct got_diff_args args;
-	int res;
+	struct got_diffreg_result *diffreg_result = NULL;
 
 	*d = NULL;
 
@@ -224,25 +222,13 @@ diffreg(BUF **d, const char *path1, const char *path2)
 	if (err)
 		goto done;
 
-	memset(&ds, 0, sizeof(ds));
-	/* XXX should stat buffers be passed in args instead of ds? */
-	if (stat(path1, &ds.stb1) == -1) {
-		err = got_error_from_errno2("stat", path1);
+	err = got_diffreg(&diffreg_result, f1, f2,
+	    GOT_DIFF_ALGORITHM_PATIENCE, 0);
+	if (err)
 		goto done;
-	}
-	if (stat(path2, &ds.stb2) == -1) {
-		err = got_error_from_errno2("stat", path2);
-		goto done;
-	}
 
-	memset(&args, 0, sizeof(args));
-	args.diff_format = D_NORMAL;
-	args.label[0] = "";
-	args.label[1] = "";
-	args.diff_context = 0;
-
-	err = got_diffreg(&res, f1, f2, D_FORCEASCII, &args, &ds,
-	    outfile, NULL);
+	err = got_diffreg_output(NULL, NULL, diffreg_result, f1, f2, "", "",
+	    GOT_DIFF_OUTPUT_EDSCRIPT, 0, outfile);
 	if (err)
 		goto done;
 
blob - 6ababe7641483792ecbe0478ab2912f465c8388c
blob + c6318249803d7c84edab9017aff44a259be82283
--- lib/diffreg.c
+++ lib/diffreg.c
@@ -1,1254 +1,390 @@
-/*	$OpenBSD: diffreg.c,v 1.91 2016/03/01 20:57:35 natano Exp $	*/
-
 /*
- * Copyright (C) Caldera International Inc.  2001-2002.
- * All rights reserved.
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ * Copyright (c) 2020 Stefan Sperling <stsp@openbsd.org>
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code and documentation must retain the above
- *    copyright notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed or owned by Caldera
- *	International, Inc.
- * 4. Neither the name of Caldera International, Inc. nor the names of other
- *    contributors may be used to endorse or promote products derived from
- *    this software without specific prior written permission.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
  *
- * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
- * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
- * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
- * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
- * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
- * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- */
-/*-
- * Copyright (c) 1991, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)diffreg.c   8.1 (Berkeley) 6/6/93
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
-#include <sys/types.h>
+#include <sys/mman.h>
 #include <sys/stat.h>
-#include <sys/wait.h>
 #include <sys/queue.h>
 
-#include <ctype.h>
-#include <err.h>
 #include <errno.h>
-#include <fcntl.h>
-#include <paths.h>
-#include <stdarg.h>
-#include <stddef.h>
-#include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <time.h>
-#include <unistd.h>
-#include <limits.h>
-#include <sha1.h>
-#include <zlib.h>
 
-#include "got_error.h"
 #include "got_object.h"
-#include "got_diff.h"
+#include "got_opentemp.h"
+#include "got_error.h"
 
 #include "got_lib_diff.h"
 
-#define MINIMUM(a, b)	(((a) < (b)) ? (a) : (b))
-#define MAXIMUM(a, b)	(((a) > (b)) ? (a) : (b))
+const struct diff_algo_config myers_then_patience;
+const struct diff_algo_config myers_then_myers_divide;
+const struct diff_algo_config patience;
+const struct diff_algo_config myers_divide;
 
-/*
- * diff - compare two files.
- */
+const struct diff_algo_config myers_then_patience = (struct diff_algo_config){
+	.impl = diff_algo_myers,
+	.permitted_state_size = 1024 * 1024 * sizeof(int),
+	.fallback_algo = &patience,
+};
 
-/*
- *	Uses an algorithm due to Harold Stone, which finds
- *	a pair of longest identical subsequences in the two
- *	files.
- *
- *	The major goal is to generate the match vector J.
- *	J[i] is the index of the line in file1 corresponding
- *	to line i file0. J[i] = 0 if there is no
- *	such line in file1.
- *
- *	Lines are hashed so as to work in core. All potential
- *	matches are located by sorting the lines of each file
- *	on the hash (called ``value''). In particular, this
- *	collects the equivalence classes in file1 together.
- *	Subroutine equiv replaces the value of each line in
- *	file0 by the index of the first element of its
- *	matching equivalence in (the reordered) file1.
- *	To save space equiv squeezes file1 into a single
- *	array member in which the equivalence classes
- *	are simply concatenated, except that their first
- *	members are flagged by changing sign.
- *
- *	Next the indices that point into member are unsorted into
- *	array class according to the original order of file0.
- *
- *	The cleverness lies in routine stone. This marches
- *	through the lines of file0, developing a vector klist
- *	of "k-candidates". At step i a k-candidate is a matched
- *	pair of lines x,y (x in file0 y in file1) such that
- *	there is a common subsequence of length k
- *	between the first i lines of file0 and the first y
- *	lines of file1, but there is no such subsequence for
- *	any smaller y. x is the earliest possible mate to y
- *	that occurs in such a subsequence.
- *
- *	Whenever any of the members of the equivalence class of
- *	lines in file1 matable to a line in file0 has serial number
- *	less than the y of some k-candidate, that k-candidate
- *	with the smallest such y is replaced. The new
- *	k-candidate is chained (via pred) to the current
- *	k-1 candidate so that the actual subsequence can
- *	be recovered. When a member has serial number greater
- *	that the y of all k-candidates, the klist is extended.
- *	At the end, the longest subsequence is pulled out
- *	and placed in the array J by unravel
- *
- *	With J in hand, the matches there recorded are
- *	check'ed against reality to assure that no spurious
- *	matches have crept in due to hashing. If they have,
- *	they are broken, and "jackpot" is recorded--a harmless
- *	matter except that a true match for a spuriously
- *	mated line may now be unnecessarily reported as a change.
- *
- *	Much of the complexity of the program comes simply
- *	from trying to minimize core utilization and
- *	maximize the range of doable problems by dynamically
- *	allocating what is needed and reusing what is not.
- *	The core requirements for problems larger than somewhat
- *	are (in words) 2*length(file0) + length(file1) +
- *	3*(number of k-candidates installed),  typically about
- *	6n words for files of length n.
- */
+const struct diff_algo_config myers_then_myers_divide =
+	(struct diff_algo_config){
+	.impl = diff_algo_myers,
+	.permitted_state_size = 1024 * 1024 * sizeof(int),
+	.fallback_algo = &myers_divide,
+};
 
-struct cand {
-	int	x;
-	int	y;
-	int	pred;
+const struct diff_algo_config patience = (struct diff_algo_config){
+	.impl = diff_algo_patience,
+	/* After subdivision, do Patience again: */
+	.inner_algo = &patience,
+	/* If subdivision failed, do Myers Divide et Impera: */
+	.fallback_algo = &myers_then_myers_divide,
 };
 
-struct line {
-	int	serial;
-	int	value;
+const struct diff_algo_config myers_divide = (struct diff_algo_config){
+	.impl = diff_algo_myers_divide,
+	/* When division succeeded, start from the top: */
+	.inner_algo = &myers_then_myers_divide,
+	/* (fallback_algo = NULL implies diff_algo_none). */
 };
 
-static void	 diff_output(FILE *, const char *, ...);
-static int	 output(FILE *, struct got_diff_changes *, struct got_diff_state *, struct got_diff_args *, const char *, FILE *, const char *, FILE *, int);
-static void	 check(struct got_diff_state *, FILE *, FILE *, int);
-static void	 range(FILE *, int, int, char *);
-static void	 uni_range(FILE *, int, int);
-static void	 dump_unified_vec(FILE *, struct got_diff_changes *, struct got_diff_state *, struct got_diff_args *, FILE *, FILE *, int);
-static int	 prepare(struct got_diff_state *, int, FILE *, off_t, int);
-static void	 prune(struct got_diff_state *);
-static void	 equiv(struct line *, int, struct line *, int, int *);
-static void	 unravel(struct got_diff_state *, int);
-static int	 unsort(struct line *, int, int *);
-static int	 change(FILE *, struct got_diff_changes *, struct got_diff_state *, struct got_diff_args *, const char *, FILE *, const char *, FILE *, int, int, int, int, int *);
-static void	 sort(struct line *, int);
-static void	 print_header(FILE *, struct got_diff_state *, struct got_diff_args *, const char *, const char *);
-static int	 asciifile(FILE *);
-static void	 fetch(FILE *, struct got_diff_state *, struct got_diff_args *, long *, int, int, FILE *, int, int);
-static int	 newcand(struct got_diff_state *, int, int, int, int *);
-static int	 search(struct got_diff_state *, int *, int, int);
-static int	 skipline(FILE *);
-static int	 isqrt(int);
-static int	 stone(struct got_diff_state *, int *, int, int *, int *, int);
-static int	 readhash(struct got_diff_state *, FILE *, int);
-static int	 files_differ(struct got_diff_state *, FILE *, FILE *);
-static char	*match_function(struct got_diff_state *, const long *, int, FILE *);
+/* If the state for a forward-Myers is small enough, use Myers, otherwise first
+ * do a Myers-divide. */
+const struct diff_config diff_config_myers_then_myers_divide = {
+	.atomize_func = diff_atomize_text_by_line,
+	.algo = &myers_then_myers_divide,
+};
 
-/*
- * chrtran points to one of 2 translation tables: cup2low if folding upper to
- * lower case clow2low if not folding case
- */
-u_char clow2low[256] = {
-	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
-	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
-	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
-	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
-	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
-	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
-	0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
-	0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
-	0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
-	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
-	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
-	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
-	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
-	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
-	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
-	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
-	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
-	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
-	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
-	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
-	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
-	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
-	0xfd, 0xfe, 0xff
+/* If the state for a forward-Myers is small enough, use Myers, otherwise first
+ * do a Patience. */
+const struct diff_config diff_config_myers_then_patience = {
+	.atomize_func = diff_atomize_text_by_line,
+	.algo = &myers_then_patience,
 };
 
-u_char cup2low[256] = {
-	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
-	0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
-	0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
-	0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
-	0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
-	0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
-	0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
-	0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
-	0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
-	0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
-	0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
-	0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
-	0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
-	0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
-	0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
-	0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
-	0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
-	0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
-	0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
-	0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
-	0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
-	0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
-	0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
-	0xfd, 0xfe, 0xff
+/* Directly force Patience as a first divider of the source file. */
+const struct diff_config diff_config_patience = {
+	.atomize_func = diff_atomize_text_by_line,
+	.algo = &patience,
 };
 
-static void
-diff_output(FILE *outfile, const char *fmt, ...)
+/* Directly force Patience as a first divider of the source file. */
+const struct diff_config diff_config_no_algo = {
+	.atomize_func = diff_atomize_text_by_line,
+};
+
+const struct got_error *
+got_diffreg_close(FILE *f1, char *p1, size_t size1,
+    FILE *f2, char *p2, size_t size2)
 {
-	va_list ap;
+	const struct got_error *err = NULL;
 
-	if (outfile == NULL)
-		return;
-
-	va_start(ap, fmt);
-	vfprintf(outfile, fmt, ap);
-	va_end(ap);
+	if (p1 && munmap(p1, size1) == -1 && err == NULL)
+		err = got_error_from_errno("munmap");
+	if (p2 && munmap(p2, size2) == -1 && err == NULL)
+		err = got_error_from_errno("munmap");
+	if (f1 && fclose(f1) != 0 && err == NULL)
+		err = got_error_from_errno("fclose");
+	if (f2 && fclose(f2) != 0 && err == NULL)
+		err = got_error_from_errno("fclose");
+	return err;
 }
 
-void
-got_diff_state_free(struct got_diff_state *ds)
+const struct diff_config *
+got_diff_get_config(enum got_diff_algorithm algorithm)
 {
-	free(ds->J);
-	free(ds->member);
-	free(ds->class);
-	free(ds->clist);
-	free(ds->klist);
-	free(ds->ixold);
-	free(ds->ixnew);
+	switch (algorithm) {
+	case GOT_DIFF_ALGORITHM_PATIENCE:
+		return &diff_config_patience;
+	case GOT_DIFF_ALGORITHM_MYERS:
+		return &diff_config_myers_then_myers_divide;
+	}
+	return NULL; /* should not happen */
 }
 
 const struct got_error *
-got_diffreg(int *rval, FILE *f1, FILE *f2, int flags,
-    struct got_diff_args *args, struct got_diff_state *ds, FILE *outfile,
-    struct got_diff_changes *changes)
+got_diff_prepare_file(FILE **f, char **p, int *f_created, size_t *size,
+    struct diff_data *diff_data, const struct diff_config *cfg,
+    int ignore_whitespace)
 {
 	const struct got_error *err = NULL;
-	int i, *p;
-	long *lp;
+	struct stat st;
+	int diff_flags = 0, rc;
 
-	*rval = D_SAME;
-	ds->anychange = 0;
-	ds->lastline = 0;
-	ds->lastmatchline = 0;
-	ds->context_vec_ptr = ds->context_vec_start - 1;
-	ds->max_context = GOT_DIFF_MAX_CONTEXT;
-	if (flags & D_IGNORECASE)
-		ds->chrtran = cup2low;
+	*size = 0;
+
+	diff_flags |= DIFF_FLAG_SHOW_PROTOTYPES;
+	if (ignore_whitespace)
+		diff_flags |= DIFF_FLAG_IGNORE_WHITESPACE;
+
+	if (f && *f) {
+		if (fstat(fileno(*f), &st) == -1) {
+			err = got_error_from_errno("fstat");
+			goto done;
+		}
+	#ifndef GOT_DIFF_NO_MMAP
+		*p = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE,
+		    fileno(*f), 0);
+		if (*p == MAP_FAILED)
+	#endif
+			*p = NULL; /* fall back on file I/O */
+	} else {
+		*f_created = 1;
+		st.st_size = 0;
+		*f = got_opentemp();
+		if (*f == NULL) {
+			err = got_error_from_errno("got_opentemp");
+			goto done;
+		}
+	}
+
+	rc = diff_atomize_file(diff_data, cfg, *f, *p, st.st_size, diff_flags);
+	if (rc) {
+		err = got_error_set_errno(rc, "diff_atomize_file");
+		goto done;
+	}
+done:
+	if (err)
+		diff_data_free(diff_data);
 	else
-		ds->chrtran = clow2low;
-	if (S_ISDIR(ds->stb1.st_mode) != S_ISDIR(ds->stb2.st_mode)) {
-		*rval = (S_ISDIR(ds->stb1.st_mode) ? D_MISMATCH1 : D_MISMATCH2);
-		return NULL;
+		*size = st.st_size;
+	return err;
+}
+
+const struct got_error *
+got_diffreg_prepared_files(struct got_diffreg_result **diffreg_result,
+    const struct diff_config *cfg,
+    struct diff_data *left, FILE *f1, char *p1, size_t size1,
+    struct diff_data *right, FILE *f2, char *p2, size_t size2)
+{
+	const struct got_error *err = NULL;
+	struct diff_result *diff_result;
+
+	*diffreg_result = calloc(1, sizeof(**diffreg_result));
+	if (*diffreg_result == NULL)
+		return got_error_from_errno("calloc");
+	
+	diff_result = diff_main(cfg, left, right);
+	if (diff_result == NULL) {
+		err = got_error_set_errno(ENOMEM, "malloc");
+		goto done;
 	}
-	if (!(flags & D_EMPTY1) && f1 == NULL) {
-		args->status |= 2;
-		goto closem;
+	if (diff_result->rc != DIFF_RC_OK) {
+		err = got_error_set_errno(diff_result->rc, "diff");
+		goto done;
 	}
 
-	if (!(flags & D_EMPTY2) && f2 == NULL) {
-		args->status |= 2;
-		goto closem;
+	(*diffreg_result)->result = diff_result;
+	(*diffreg_result)->f1 = f1;
+	(*diffreg_result)->map1 = p1;
+	(*diffreg_result)->size1 = size1;
+	(*diffreg_result)->f2 = f2;
+	(*diffreg_result)->map2 = p2;
+	(*diffreg_result)->size2 = size2;
+done:
+	if (err) {
+		if (diffreg_result) {
+			free(*diffreg_result);
+			*diffreg_result = NULL;
+		}
 	}
+	
+	return err;
+}
 
-	switch (files_differ(ds, f1, f2)) {
-	case 0:
-		goto closem;
-	case 1:
-		break;
-	default:
-		/* error */
-		args->status |= 2;
-		goto closem;
-	}
-
-	if ((flags & D_FORCEASCII) == 0 &&
-	    (!asciifile(f1) || !asciifile(f2))) {
-		*rval = D_BINARY;
-		args->status |= 1;
-		goto closem;
-	}
-	if (prepare(ds, 0, f1, ds->stb1.st_size, flags)) {
-		err = got_error_from_errno("prepare");
-		goto closem;
-	}
-	if (prepare(ds, 1, f2, ds->stb2.st_size, flags)) {
-		err = got_error_from_errno("prepare");
-		goto closem;
-	}
+const struct got_error *
+got_diffreg(struct got_diffreg_result **diffreg_result, FILE *f1, FILE *f2,
+    enum got_diff_algorithm algorithm, int ignore_whitespace)
+{
+	const struct got_error *err = NULL;
+	const struct diff_config *cfg;
+	char *p1 = NULL, *p2 = NULL;
+	int f1_created = 0, f2_created = 0;
+	size_t size1, size2;
+	struct diff_data d_left, d_right;
+	struct diff_data *left, *right;
+	struct diff_result *diff_result;
 
-	prune(ds);
-	sort(ds->sfile[0], ds->slen[0]);
-	sort(ds->sfile[1], ds->slen[1]);
-
-	ds->member = (int *)ds->file[1];
-	equiv(ds->sfile[0], ds->slen[0], ds->sfile[1], ds->slen[1], ds->member);
-	p = reallocarray(ds->member, ds->slen[1] + 2, sizeof(*ds->member));
-	if (p == NULL) {
-		err = got_error_from_errno("reallocarray");
-		goto closem;
+	if (diffreg_result) {
+		*diffreg_result = calloc(1, sizeof(**diffreg_result));
+		if (*diffreg_result == NULL)
+			return got_error_from_errno("calloc");
+		left = &(*diffreg_result)->left;
+		right = &(*diffreg_result)->right;
+	} else {
+		memset(&d_left, 0, sizeof(d_left));
+		memset(&d_right, 0, sizeof(d_right));
+		left = &d_left;
+		right = &d_right;
 	}
-	ds->member = p;
-
-	ds->class = (int *)ds->file[0];
-	if (unsort(ds->sfile[0], ds->slen[0], ds->class)) {
-		err = got_error_from_errno("unsort");
-		goto closem;
+	
+	cfg = got_diff_get_config(algorithm);
+	if (cfg == NULL) {
+		err = got_error(GOT_ERR_NOT_IMPL);
+		goto done;
 	}
-	p = reallocarray(ds->class, ds->slen[0] + 2, sizeof(*ds->class));
-	if (p == NULL) {
-		err = got_error_from_errno("reallocarray");
-		goto closem;
-	}
-	ds->class = p;
 
-	ds->klist = calloc(ds->slen[0] + 2, sizeof(*ds->klist));
-	if (ds->klist == NULL) {
-		err = got_error_from_errno("calloc");
-		goto closem;
-	}
-	ds->clen = 0;
-	ds->clistlen = 100;
-	ds->clist = calloc(ds->clistlen, sizeof(*ds->clist));
-	if (ds->clist == NULL) {
-		err = got_error_from_errno("calloc");
-		goto closem;
-	}
-	i = stone(ds, ds->class, ds->slen[0], ds->member, ds->klist, flags);
-	if (i < 0) {
-		err = got_error_from_errno("stone");
-		goto closem;
-	}
+	err = got_diff_prepare_file(&f1, &p1, &f1_created, &size1,
+	    left, cfg, ignore_whitespace);
+	if (err)
+		goto done;
 
-	p = reallocarray(ds->J, ds->len[0] + 2, sizeof(*ds->J));
-	if (p == NULL) {
-		err = got_error_from_errno("reallocarray");
-		goto closem;
-	}
-	ds->J = p;
-	unravel(ds, ds->klist[i]);
+	err = got_diff_prepare_file(&f2, &p2, &f2_created, &size2,
+	    right, cfg, ignore_whitespace);
+	if (err)
+		goto done;
 
-	lp = reallocarray(ds->ixold, ds->len[0] + 2, sizeof(*ds->ixold));
-	if (lp == NULL) {
-		err = got_error_from_errno("reallocarray");
-		goto closem;
+	diff_result = diff_main(cfg, left, right);
+	if (diff_result == NULL) {
+		err = got_error_set_errno(ENOMEM, "malloc");
+		goto done;
 	}
-	ds->ixold = lp;
-	lp = reallocarray(ds->ixnew, ds->len[1] + 2, sizeof(*ds->ixnew));
-	if (lp == NULL) {
-		err = got_error_from_errno("reallocarray");
-		goto closem;
+	if (diff_result->rc != DIFF_RC_OK) {
+		err = got_error_set_errno(diff_result->rc, "diff");
+		goto done;
 	}
-	ds->ixnew = lp;
-	check(ds, f1, f2, flags);
-	if (output(outfile, changes, ds, args, args->label[0], f1,
-	    args->label[1], f2, flags))
-		err = got_error_from_errno("output");
-closem:
-	if (ds->anychange) {
-		args->status |= 1;
-		if (*rval == D_SAME)
-			*rval = D_DIFFER;
-	}
-	return (err);
-}
 
-/*
- * Check to see if the given files differ.
- * Returns 0 if they are the same, 1 if different, and -1 on error.
- * XXX - could use code from cmp(1) [faster]
- */
-static int
-files_differ(struct got_diff_state *ds, FILE *f1, FILE *f2)
-{
-	char buf1[BUFSIZ], buf2[BUFSIZ];
-	size_t i, j;
-
-	if (f1 == NULL || f2 == NULL || ds->stb1.st_size != ds->stb2.st_size ||
-	    (ds->stb1.st_mode & S_IFMT) != (ds->stb2.st_mode & S_IFMT))
-		return (1);
-	for (;;) {
-		i = fread(buf1, 1, sizeof(buf1), f1);
-		j = fread(buf2, 1, sizeof(buf2), f2);
-		if ((!i && ferror(f1)) || (!j && ferror(f2)))
-			return (-1);
-		if (i != j)
-			return (1);
-		if (i == 0)
-			return (0);
-		if (memcmp(buf1, buf2, i) != 0)
-			return (1);
+	if (diffreg_result) {
+		(*diffreg_result)->result = diff_result;
+		if (f1_created)
+			(*diffreg_result)->f1 = f1;
+		(*diffreg_result)->map1 = p1;
+		(*diffreg_result)->size1 = size1;
+		if (f2_created)
+			(*diffreg_result)->f2 = f2;
+		(*diffreg_result)->map2 = p2;
+		(*diffreg_result)->size2 = size2;
 	}
-}
-
-static int
-prepare(struct got_diff_state *ds, int i, FILE *fd, off_t filesize, int flags)
-{
-	struct line *p, *q;
-	int j, h;
-	size_t sz;
-
-	if (fd != NULL)
-		rewind(fd);
-
-	sz = (filesize <= SIZE_MAX ? filesize : SIZE_MAX) / 25;
-	if (sz < 100)
-		sz = 100;
-
-	p = calloc(sz + 3, sizeof(*p));
-	if (p == NULL)
-		return (-1);
-	for (j = 0; fd != NULL && (h = readhash(ds, fd, flags));) {
-		if (j == sz) {
-			sz = sz * 3 / 2;
-			q = reallocarray(p, sz + 3, sizeof(*p));
-			if (q == NULL) {
-				free(p);
-				return (-1);
-			}
-			p = q;
-		}
-		p[++j].value = h;
+done:
+	if (diffreg_result == NULL) {
+		diff_data_free(left);
+		diff_data_free(right);
 	}
-	ds->len[i] = j;
-	ds->file[i] = p;
-
-	return (0);
-}
-
-static void
-prune(struct got_diff_state *ds)
-{
-	int i, j;
-
-	for (ds->pref = 0; ds->pref < ds->len[0] && ds->pref < ds->len[1] &&
-	    ds->file[0][ds->pref + 1].value == ds->file[1][ds->pref + 1].value;
-	    ds->pref++)
-		;
-	for (ds->suff = 0; ds->suff < ds->len[0] - ds->pref && ds->suff < ds->len[1] - ds->pref &&
-	    ds->file[0][ds->len[0] - ds->suff].value == ds->file[1][ds->len[1] - ds->suff].value;
-	    ds->suff++)
-		;
-	for (j = 0; j < 2; j++) {
-		ds->sfile[j] = ds->file[j] + ds->pref;
-		ds->slen[j] = ds->len[j] - ds->pref - ds->suff;
-		for (i = 0; i <= ds->slen[j]; i++)
-			ds->sfile[j][i].serial = i;
-	}
-}
-
-static void
-equiv(struct line *a, int n, struct line *b, int m, int *c)
-{
-	int i, j;
-
-	i = j = 1;
-	while (i <= n && j <= m) {
-		if (a[i].value < b[j].value)
-			a[i++].value = 0;
-		else if (a[i].value == b[j].value)
-			a[i++].value = j;
-		else
-			j++;
-	}
-	while (i <= n)
-		a[i++].value = 0;
-	b[m + 1].value = 0;
-	j = 0;
-	while (++j <= m) {
-		c[j] = -b[j].serial;
-		while (b[j + 1].value == b[j].value) {
-			j++;
-			c[j] = b[j].serial;
+	if (err) {
+		got_diffreg_close(f1_created ? f1 : NULL, p1, size1,
+		    f2_created ? f2 : NULL, p2, size2);
+		if (diffreg_result) {
+			diff_data_free(left);
+			diff_data_free(right);
+			free(*diffreg_result);
+			*diffreg_result = NULL;
 		}
 	}
-	c[j] = -1;
+	
+	return err;
 }
 
-/* Code taken from ping.c */
-static int
-isqrt(int n)
+const struct got_error *
+got_diffreg_output(off_t **line_offsets, size_t *nlines,
+    struct got_diffreg_result *diff_result, FILE *f1, FILE *f2,
+    const char *path1, const char *path2,
+    enum got_diff_output_format output_format, int context_lines, FILE *outfile)
 {
-	int y, x = 1;
+	struct diff_input_info info = {
+		.left_path = path1,
+		.right_path = path2,
+	};
+	int rc;
+	struct diff_output_info *output_info;
 
-	if (n == 0)
-		return (0);
+	switch (output_format) {
+	case GOT_DIFF_OUTPUT_UNIDIFF:
+		rc = diff_output_unidiff(
+		    line_offsets ? &output_info : NULL, outfile, &info,
+		    diff_result->result, context_lines);
+		if (rc != DIFF_RC_OK)
+			return got_error_set_errno(rc, "diff_output_unidiff");
+		break;
+	case GOT_DIFF_OUTPUT_EDSCRIPT:
+		rc = diff_output_edscript(line_offsets ? &output_info : NULL,
+		    outfile, &info, diff_result->result);
+		if (rc != DIFF_RC_OK)
+			return got_error_set_errno(rc, "diff_output_edscript");
+		break;
 
-	do { /* newton was a stinker */
-		y = x;
-		x = n / x;
-		x += y;
-		x /= 2;
-	} while ((x - y) > 1 || (x - y) < -1);
-
-	return (x);
-}
-
-static int
-stone(struct got_diff_state *ds, int *a, int n, int *b, int *c, int flags)
-{
-	int i, k, y, j, l;
-	int oldc, tc, oldl, sq;
-	u_int numtries, bound;
-	int error;
-
-	if (flags & D_MINIMAL)
-		bound = UINT_MAX;
-	else {
-		sq = isqrt(n);
-		bound = MAXIMUM(256, sq);
 	}
 
-	k = 0;
-	c[0] = newcand(ds, 0, 0, 0, &error);
-	if (error)
-		return -1;
-	for (i = 1; i <= n; i++) {
-		j = a[i];
-		if (j == 0)
-			continue;
-		y = -b[j];
-		oldl = 0;
-		oldc = c[0];
-		numtries = 0;
-		do {
-			if (y <= ds->clist[oldc].y)
-				continue;
-			l = search(ds, c, k, y);
-			if (l != oldl + 1)
-				oldc = c[l - 1];
-			if (l <= k) {
-				if (ds->clist[c[l]].y <= y)
-					continue;
-				tc = c[l];
-				c[l] = newcand(ds, i, y, oldc, &error);
-				if (error)
-					return -1;
-				oldc = tc;
-				oldl = l;
-				numtries++;
-			} else {
-				c[l] = newcand(ds, i, y, oldc, &error);
-				if (error)
-					return -1;
-				k++;
-				break;
-			}
-		} while ((y = b[++j]) > 0 && numtries < bound);
-	}
-	return (k);
-}
-
-static int
-newcand(struct got_diff_state *ds, int x, int y, int pred, int *errorp)
-{
-	struct cand *q;
-
-	if (ds->clen == ds->clistlen) {
-		ds->clistlen = ds->clistlen * 11 / 10;
-		q = reallocarray(ds->clist, ds->clistlen, sizeof(*ds->clist));
-		if (q == NULL) {
-			*errorp = -1;
-			free(ds->clist);
-			ds->clist = NULL;
-			return 0;
-		}
-		ds->clist = q;
-	}
-	q = ds->clist + ds->clen;
-	q->x = x;
-	q->y = y;
-	q->pred = pred;
-	*errorp = 0;
-	return (ds->clen++);
-}
-
-static int
-search(struct got_diff_state *ds, int *c, int k, int y)
-{
-	int i, j, l, t;
-
-	if (ds->clist[c[k]].y < y)	/* quick look for typical case */
-		return (k + 1);
-	i = 0;
-	j = k + 1;
-	for (;;) {
-		l = (i + j) / 2;
-		if (l <= i)
-			break;
-		t = ds->clist[c[l]].y;
-		if (t > y)
-			j = l;
-		else if (t < y)
-			i = l;
-		else
-			return (l);
-	}
-	return (l + 1);
-}
-
-static void
-unravel(struct got_diff_state *ds, int p)
-{
-	struct cand *q;
-	int i;
-
-	for (i = 0; i <= ds->len[0]; i++)
-		ds->J[i] = i <= ds->pref ? i :
-		    i > ds->len[0] - ds->suff ? i + ds->len[1] - ds->len[0] : 0;
-	for (q = ds->clist + p; q->y != 0; q = ds->clist + q->pred)
-		ds->J[q->x + ds->pref] = q->y + ds->pref;
-}
-
-/*
- * Check does double duty:
- *  1.	ferret out any fortuitous correspondences due
- *	to confounding by hashing (which result in "jackpot")
- *  2.  collect random access indexes to the two files
- */
-static void
-check(struct got_diff_state *ds, FILE *f1, FILE *f2, int flags)
-{
-	int i, j, jackpot, c, d;
-	long ctold, ctnew;
-
-	if (f1 != NULL)
-		rewind(f1);
-	if (f2 != NULL)
-		rewind(f2);
-	j = 1;
-	ds->ixold[0] = ds->ixnew[0] = 0;
-	jackpot = 0;
-	ctold = ctnew = 0;
-	for (i = 1; i <= ds->len[0]; i++) {
-		if (ds->J[i] == 0) {
-			ds->ixold[i] = ctold += skipline(f1);
-			continue;
-		}
-		while (j < ds->J[i]) {
-			ds->ixnew[j] = ctnew += skipline(f2);
-			j++;
-		}
-		if (flags & (D_FOLDBLANKS|D_IGNOREBLANKS|D_IGNORECASE)) {
-			for (;;) {
-				c = (f1 == NULL ? EOF : getc(f1));
-				d = (f2 == NULL ? EOF : getc(f2));
+	if (line_offsets && *line_offsets) {
+		if (output_info->line_offsets.len > 0) {
+			off_t prev_offset = 0, *p, *o;
+			int i, len;
+			if (*nlines > 0) {
+				prev_offset = (*line_offsets)[*nlines - 1];
 				/*
-				 * GNU diff ignores a missing newline
-				 * in one file for -b or -w.
+				 * First line offset is always zero. Skip it
+				 * when appending to a pre-populated array.
 				 */
-				if (flags & (D_FOLDBLANKS|D_IGNOREBLANKS)) {
-					if (c == EOF && d == '\n') {
-						ctnew++;
-						break;
-					} else if (c == '\n' && d == EOF) {
-						ctold++;
-						break;
-					}
-				}
-				ctold++;
-				ctnew++;
-				if ((flags & D_FOLDBLANKS) && isspace(c) &&
-				    isspace(d)) {
-					do {
-						if (c == '\n')
-							break;
-						ctold++;
-					} while (f1 && isspace(c = getc(f1)));
-					do {
-						if (d == '\n')
-							break;
-						ctnew++;
-					} while (f2 && isspace(d = getc(f2)));
-				} else if ((flags & D_IGNOREBLANKS)) {
-					while (f1 && isspace(c) && c != '\n') {
-						c = getc(f1);
-						ctold++;
-					}
-					while (f2 && isspace(d) && d != '\n') {
-						d = getc(f2);
-						ctnew++;
-					}
-				}
-				if (ds->chrtran[c] != ds->chrtran[d]) {
-					jackpot++;
-					ds->J[i] = 0;
-					if (c != '\n' && c != EOF)
-						ctold += skipline(f1);
-					if (d != '\n' && c != EOF)
-						ctnew += skipline(f2);
-					break;
-				}
-				if (c == '\n' || c == EOF)
-					break;
-			}
-		} else {
-			for (;;) {
-				ctold++;
-				ctnew++;
-				c = (f1 == NULL ? EOF : getc(f1));
-				d = (f2 == NULL ? EOF : getc(f2));
-				if (c != d) {
-					/* jackpot++; */
-					ds->J[i] = 0;
-					if (c != '\n' && c != EOF)
-						ctold += skipline(f1);
-					if (d != '\n' && c != EOF)
-						ctnew += skipline(f2);
-					break;
-				}
-				if (c == '\n' || c == EOF)
-					break;
-			}
-		}
-		ds->ixold[i] = ctold;
-		ds->ixnew[j] = ctnew;
-		j++;
-	}
-	for (; j <= ds->len[1]; j++)
-		ds->ixnew[j] = ctnew += skipline(f2);
-	/*
-	 * if (jackpot)
-	 *	fprintf(stderr, "jackpot\n");
-	 */
-}
-
-/* shellsort CACM #201 */
-static void
-sort(struct line *a, int n)
-{
-	struct line *ai, *aim, w;
-	int j, m = 0, k;
-
-	if (n == 0)
-		return;
-	for (j = 1; j <= n; j *= 2)
-		m = 2 * j - 1;
-	for (m /= 2; m != 0; m /= 2) {
-		k = n - m;
-		for (j = 1; j <= k; j++) {
-			for (ai = &a[j]; ai > a; ai -= m) {
-				aim = &ai[m];
-				if (aim < ai)
-					break;	/* wraparound */
-				if (aim->value > ai[0].value ||
-				    (aim->value == ai[0].value &&
-					aim->serial > ai[0].serial))
-					break;
-				w.value = ai[0].value;
-				ai[0].value = aim->value;
-				aim->value = w.value;
-				w.serial = ai[0].serial;
-				ai[0].serial = aim->serial;
-				aim->serial = w.serial;
-			}
-		}
-	}
-}
-
-static int
-unsort(struct line *f, int l, int *b)
-{
-	int *a, i;
-
-	a = calloc(l + 1, sizeof(*a));
-	if (a == NULL)
-		return (-1);
-	for (i = 1; i <= l; i++)
-		a[f[i].serial] = f[i].value;
-	for (i = 1; i <= l; i++)
-		b[i] = a[i];
-	free(a);
-
-	return (0);
-}
-
-static int
-skipline(FILE *f)
-{
-	int i, c;
-
-	for (i = 1; f != NULL && (c = getc(f)) != '\n' && c != EOF; i++)
-		continue;
-	return (i);
-}
-
-static int
-output(FILE *outfile, struct got_diff_changes *changes,
-    struct got_diff_state *ds, struct got_diff_args *args,
-    const char *file1, FILE *f1, const char *file2, FILE *f2, int flags)
-{
-	int m, i0, i1, j0, j1;
-	int error = 0;
-
-	if (f1 != NULL)
-		rewind(f1);
-	if (f2 != NULL)
-		rewind(f2);
-	m = ds->len[0];
-	ds->J[0] = 0;
-	ds->J[m + 1] = ds->len[1] + 1;
-	for (i0 = 1; i0 <= m; i0 = i1 + 1) {
-		while (i0 <= m && ds->J[i0] == ds->J[i0 - 1] + 1)
-			i0++;
-		j0 = ds->J[i0 - 1] + 1;
-		i1 = i0 - 1;
-		while (i1 < m && ds->J[i1 + 1] == 0)
-			i1++;
-		j1 = ds->J[i1 + 1] - 1;
-		ds->J[i1] = j1;
-		error = change(outfile, changes, ds, args, file1, f1, file2, f2,
-		    i0, i1, j0, j1, &flags);
-		if (error)
-			return (error);
-	}
-	if (m == 0) {
-		error = change(outfile, changes, ds, args, file1, f1, file2, f2,
-		    1, 0, 1, ds->len[1], &flags);
-		if (error)
-			return (error);
-	}
-	if (ds->anychange != 0 && args->diff_format == D_UNIFIED)
-		dump_unified_vec(outfile, changes, ds, args, f1, f2, flags);
-
-	return (0);
-}
-
-static void
-range(FILE *outfile, int a, int b, char *separator)
-{
-	diff_output(outfile, "%d", a > b ? b : a);
-	if (a < b)
-		diff_output(outfile, "%s%d", separator, b);
-}
-
-static void
-uni_range(FILE *outfile, int a, int b)
-{
-	if (a < b)
-		diff_output(outfile, "%d,%d", a, b - a + 1);
-	else if (a == b)
-		diff_output(outfile, "%d", b);
-	else
-		diff_output(outfile, "%d,0", b);
-}
-
-/*
- * Indicate that there is a difference between lines a and b of the from file
- * to get to lines c to d of the to file.  If a is greater then b then there
- * are no lines in the from file involved and this means that there were
- * lines appended (beginning at b).  If c is greater than d then there are
- * lines missing from the to file.
- */
-static int
-change(FILE *outfile, struct got_diff_changes *changes,
-    struct got_diff_state *ds, struct got_diff_args *args,
-    const char *file1, FILE *f1, const char *file2, FILE *f2,
-    int a, int b, int c, int d, int *pflags)
-{
-	if (a > b && c > d)
-		return (0);
-
-	if (*pflags & D_HEADER) {
-		diff_output(outfile, "%s %s %s\n", args->diffargs, file1, file2);
-		*pflags &= ~D_HEADER;
-	}
-	if (args->diff_format == D_UNIFIED) {
-		/*
-		 * Allocate change records as needed.
-		 */
-		if (ds->context_vec_ptr == ds->context_vec_end - 1) {
-			struct context_vec *cvp;
-			ptrdiff_t offset;
-			offset = ds->context_vec_ptr - ds->context_vec_start;
-			ds->max_context <<= 1;
-			cvp = reallocarray(ds->context_vec_start,
-			    ds->max_context, sizeof(*ds->context_vec_start));
-			if (cvp == NULL) {
-				free(ds->context_vec_start);
-				return (-1);
-			}
-			ds->context_vec_start = cvp;
-			ds->context_vec_end = ds->context_vec_start +
-			    ds->max_context;
-			ds->context_vec_ptr = ds->context_vec_start + offset;
-		}
-		if (ds->anychange == 0) {
-			/*
-			 * Print the context/unidiff header first time through.
-			 */
-			print_header(outfile, ds, args, file1, file2);
-			ds->anychange = 1;
-		} else if (a > ds->context_vec_ptr->b + (2 * args->diff_context) + 1 &&
-		    c > ds->context_vec_ptr->d + (2 * args->diff_context) + 1) {
-			/*
-			 * If this change is more than 'diff_context' lines from the
-			 * previous change, dump the record and reset it.
-			 */
-			dump_unified_vec(outfile, changes, ds, args, f1, f2,
-			    *pflags);
-		}
-		ds->context_vec_ptr++;
-		ds->context_vec_ptr->a = a;
-		ds->context_vec_ptr->b = b;
-		ds->context_vec_ptr->c = c;
-		ds->context_vec_ptr->d = d;
-		return (0);
-	}
-	if (ds->anychange == 0)
-		ds->anychange = 1;
-	if (args->diff_format == D_BRIEF)
-		return (0);
-	if (args->diff_format == D_NORMAL) {
-		range(outfile, a, b, ",");
-		diff_output(outfile, "%c", a > b ? 'a' : c > d ? 'd' : 'c');
-		range(outfile, c, d, ",");
-		diff_output(outfile, "\n");
-		fetch(outfile, ds, args, ds->ixold, a, b, f1, '<', *pflags);
-		if (a <= b && c <= d)
-			diff_output(outfile, "---\n");
-	}
-	fetch(outfile, ds, args, ds->ixnew, c, d, f2,
-	    args->diff_format == D_NORMAL ? '>' : '\0', *pflags);
-	return (0);
-}
-
-static void
-fetch(FILE *outfile, struct got_diff_state *ds, struct got_diff_args *args,
-    long *f, int a, int b, FILE *lb, int ch, int flags)
-{
-	int i, j, c, col, nc;
-
-	if (lb == NULL || a > b)
-		return;
-	for (i = a; i <= b; i++) {
-		fseek(lb, f[i - 1], SEEK_SET);
-		nc = f[i] - f[i - 1];
-		if (ch != '\0') {
-			diff_output(outfile, "%c", ch);
-			if (args->Tflag && (args->diff_format == D_UNIFIED ||
-			    args->diff_format == D_NORMAL))
-				diff_output(outfile, "\t");
-			else if (args->diff_format != D_UNIFIED)
-				diff_output(outfile, " ");
-		}
-		col = 0;
-		for (j = 0; j < nc; j++) {
-			if ((c = getc(lb)) == EOF) {
-				diff_output(outfile, "\n\\ No newline at end of "
-				    "file\n");
-				return;
-			}
-			if (c == '\t' && (flags & D_EXPANDTABS)) {
-				do {
-					diff_output(outfile, " ");
-				} while (++col & 7);
+				o = &output_info->line_offsets.head[1];
+				len = output_info->line_offsets.len - 1;
 			} else {
-				diff_output(outfile, "%c", c);
-				col++;
+				o = &output_info->line_offsets.head[0];
+				len = output_info->line_offsets.len;
 			}
+			p = reallocarray(*line_offsets, *nlines + len,
+			    sizeof(off_t));
+			if (p == NULL)
+				return got_error_from_errno("calloc");
+			for (i = 0; i < len; i++)
+				p[*nlines + i] = o[i] + prev_offset;
+			*line_offsets = p;
+			*nlines += len;
 		}
+		diff_output_info_free(output_info);
 	}
-}
 
-/*
- * Hash function taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578.
- */
-static int
-readhash(struct got_diff_state *ds, FILE *f, int flags)
-{
-	int i, t, space;
-	int sum;
-
-	sum = 1;
-	space = 0;
-	if ((flags & (D_FOLDBLANKS|D_IGNOREBLANKS)) == 0) {
-		if (flags & D_IGNORECASE)
-			for (i = 0; (t = getc(f)) != '\n'; i++) {
-				if (t == EOF) {
-					if (i == 0)
-						return (0);
-					break;
-				}
-				sum = sum * 127 + ds->chrtran[t];
-			}
-		else
-			for (i = 0; (t = getc(f)) != '\n'; i++) {
-				if (t == EOF) {
-					if (i == 0)
-						return (0);
-					break;
-				}
-				sum = sum * 127 + t;
-			}
-	} else {
-		for (i = 0;;) {
-			switch (t = getc(f)) {
-			case '\t':
-			case '\r':
-			case '\v':
-			case '\f':
-			case ' ':
-				space++;
-				continue;
-			default:
-				if (space && (flags & D_IGNOREBLANKS) == 0) {
-					i++;
-					space = 0;
-				}
-				sum = sum * 127 + ds->chrtran[t];
-				i++;
-				continue;
-			case EOF:
-				if (i == 0)
-					return (0);
-				/* FALLTHROUGH */
-			case '\n':
-				break;
-			}
-			break;
-		}
-	}
-	/*
-	 * There is a remote possibility that we end up with a zero sum.
-	 * Zero is used as an EOF marker, so return 1 instead.
-	 */
-	return (sum == 0 ? 1 : sum);
+	return NULL;
 }
 
-static int
-asciifile(FILE *f)
+const struct got_error *
+got_diffreg_result_free(struct got_diffreg_result *diffreg_result)
 {
-	unsigned char buf[BUFSIZ];
-	size_t cnt;
+	const struct got_error *err;
 
-	if (f == NULL)
-		return (1);
-
-	rewind(f);
-	cnt = fread(buf, 1, sizeof(buf), f);
-	return (memchr(buf, '\0', cnt) == NULL);
+	diff_result_free(diffreg_result->result);
+	diff_data_free(&diffreg_result->left);
+	diff_data_free(&diffreg_result->right);
+	err = got_diffreg_close(diffreg_result->f1, diffreg_result->map1,
+	    diffreg_result->size1, diffreg_result->f2,
+	    diffreg_result->map2, diffreg_result->size2);
+	free(diffreg_result);
+	return err;
 }
 
-#define begins_with(s, pre) (strncmp(s, pre, sizeof(pre)-1) == 0)
-
-static char *
-match_function(struct got_diff_state *ds, const long *f, int pos, FILE *fp)
+const struct got_error *
+got_diffreg_result_free_left(struct got_diffreg_result *diffreg_result)
 {
-	unsigned char buf[FUNCTION_CONTEXT_SIZE];
-	size_t nc;
-	int last = ds->lastline;
-	char *state = NULL;
-
-	ds->lastline = pos;
-	while (pos > last) {
-		fseek(fp, f[pos - 1], SEEK_SET);
-		nc = f[pos] - f[pos - 1];
-		if (nc >= sizeof(buf))
-			nc = sizeof(buf) - 1;
-		nc = fread(buf, 1, nc, fp);
-		if (nc > 0) {
-			buf[nc] = '\0';
-			buf[strcspn(buf, "\n")] = '\0';
-			if (isalpha(buf[0]) || buf[0] == '_' || buf[0] == '$') {
-				if (begins_with(buf, "private:")) {
-					if (!state)
-						state = " (private)";
-				} else if (begins_with(buf, "protected:")) {
-					if (!state)
-						state = " (protected)";
-				} else if (begins_with(buf, "public:")) {
-					if (!state)
-						state = " (public)";
-				} else {
-					strlcpy(ds->lastbuf, buf, sizeof ds->lastbuf);
-					if (state)
-						strlcat(ds->lastbuf, state,
-						    sizeof ds->lastbuf);
-					ds->lastmatchline = pos;
-					return ds->lastbuf;
-				}
-			}
-		}
-		pos--;
-	}
-	return ds->lastmatchline > 0 ? ds->lastbuf : NULL;
+	diff_data_free(&diffreg_result->left);
+	memset(&diffreg_result->left, 0, sizeof(diffreg_result->left));
+	return got_diffreg_close(diffreg_result->f1, diffreg_result->map1,
+	    diffreg_result->size1, NULL, NULL, 0);
 }
 
-/* dump accumulated "unified" diff changes */
-static void
-dump_unified_vec(FILE *outfile, struct got_diff_changes *changes,
-    struct got_diff_state *ds, struct got_diff_args *args,
-    FILE *f1, FILE *f2, int flags)
+const struct got_error *
+got_diffreg_result_free_right(struct got_diffreg_result *diffreg_result)
 {
-	struct context_vec *cvp = ds->context_vec_start;
-	int lowa, upb, lowc, upd;
-	int a, b, c, d;
-	char ch, *f;
-
-	if (ds->context_vec_start > ds->context_vec_ptr)
-		return;
-
-	b = d = 0;		/* gcc */
-	lowa = MAXIMUM(1, cvp->a - args->diff_context);
-	upb = MINIMUM(ds->len[0], ds->context_vec_ptr->b + args->diff_context);
-	lowc = MAXIMUM(1, cvp->c - args->diff_context);
-	upd = MINIMUM(ds->len[1], ds->context_vec_ptr->d + args->diff_context);
-
-	diff_output(outfile, "@@ -");
-	uni_range(outfile, lowa, upb);
-	diff_output(outfile, " +");
-	uni_range(outfile, lowc, upd);
-	diff_output(outfile, " @@");
-	if (f1 != NULL && (flags & D_PROTOTYPE)) {
-		f = match_function(ds, ds->ixold, lowa-1, f1);
-		if (f != NULL)
-			diff_output(outfile, " %s", f);
-	}
-	diff_output(outfile, "\n");
-
-	/*
-	 * Output changes in "unified" diff format--the old and new lines
-	 * are printed together.
-	 */
-	for (; cvp <= ds->context_vec_ptr; cvp++) {
-		if (changes) {
-			struct got_diff_change *change;
-			change = calloc(1, sizeof(*change));
-			if (change) {
-				memcpy(&change->cv, cvp, sizeof(change->cv));
-				SIMPLEQ_INSERT_TAIL(&changes->entries, change,
-				    entry);
-				changes->nchanges++;
-			}
-		}
-
-		a = cvp->a;
-		b = cvp->b;
-		c = cvp->c;
-		d = cvp->d;
-
-		/*
-		 * c: both new and old changes
-		 * d: only changes in the old file
-		 * a: only changes in the new file
-		 */
-		if (a <= b && c <= d)
-			ch = 'c';
-		else
-			ch = (a <= b) ? 'd' : 'a';
-
-		switch (ch) {
-		case 'c':
-			fetch(outfile, ds, args, ds->ixold, lowa, a - 1, f1, ' ', flags);
-			fetch(outfile, ds, args, ds->ixold, a, b, f1, '-', flags);
-			fetch(outfile, ds, args, ds->ixnew, c, d, f2, '+', flags);
-			break;
-		case 'd':
-			fetch(outfile, ds, args, ds->ixold, lowa, a - 1, f1, ' ', flags);
-			fetch(outfile, ds, args, ds->ixold, a, b, f1, '-', flags);
-			break;
-		case 'a':
-			fetch(outfile, ds, args, ds->ixnew, lowc, c - 1, f2, ' ', flags);
-			fetch(outfile, ds, args, ds->ixnew, c, d, f2, '+', flags);
-			break;
-		}
-		lowa = b + 1;
-		lowc = d + 1;
-	}
-	fetch(outfile, ds, args, ds->ixnew, d + 1, upd, f2, ' ', flags);
-
-	ds->context_vec_ptr = ds->context_vec_start - 1;
+	diff_data_free(&diffreg_result->right);
+	memset(&diffreg_result->right, 0, sizeof(diffreg_result->right));
+	return got_diffreg_close(NULL, NULL, 0, diffreg_result->f2,
+	    diffreg_result->map2, diffreg_result->size2);
 }
 
 void
-got_diff_dump_change(FILE *outfile, struct got_diff_change *change,
-    struct got_diff_state *ds, struct got_diff_args *args,
-    FILE *f1, FILE *f2, int diff_flags)
+got_diff_dump_change(FILE *outfile, struct diff_chunk *change,
+    FILE *f1, FILE *f2)
 {
-	ds->context_vec_ptr = &change->cv;
-	ds->context_vec_start = &change->cv;
-	ds->context_vec_end = &change->cv;
-
-	/* XXX TODO needs error checking */
-	dump_unified_vec(outfile, NULL, ds, args, f1, f2, diff_flags);
 }
-
-static void
-print_header(FILE *outfile, struct got_diff_state *ds, struct got_diff_args *args,
-    const char *file1, const char *file2)
-{
-	if (args->label[0] != NULL)
-		diff_output(outfile, "--- %s\n", args->label[0]);
-	else
-		diff_output(outfile, "--- %s\t%s", file1,
-		    ctime(&ds->stb1.st_mtime));
-	if (args->label[1] != NULL)
-		diff_output(outfile, "+++ %s\n", args->label[1]);
-	else
-		diff_output(outfile, "+++ %s\t%s", file2,
-		    ctime(&ds->stb2.st_mtime));
-}
blob - /dev/null
blob + a3d25db09a7170f984d78db4fa9d4de2a904607a (mode 644)
--- /dev/null
+++ lib/diff_atomize_text.c
@@ -0,0 +1,179 @@
+/* Split source by line breaks, and calculate a simplistic checksum. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <ctype.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+
+#include "diff_internal.h"
+#include "diff_debug.h"
+
+static int
+diff_data_atomize_text_lines_fd(struct diff_data *d)
+{
+	off_t pos = 0;
+	const off_t end = pos + d->len;
+	unsigned int array_size_estimate = d->len / 50;
+	unsigned int pow2 = 1;
+	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
+
+	while (array_size_estimate >>= 1)
+		pow2++;
+
+	ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+	if (fseek(d->root->f, 0L, SEEK_SET) == -1)
+		return errno;
+
+	while (pos < end) {
+		off_t line_end = pos;
+		unsigned int hash = 0;
+		unsigned char buf[512];
+		size_t r, i;
+		struct diff_atom *atom;
+		int eol = 0;
+
+		while (eol == 0 && line_end < end) {
+			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
+			if (r == 0 && ferror(d->root->f))
+				return errno;
+			i = 0;
+			while (eol == 0 && i < r) {
+				if (buf[i] != '\r' && buf[i] != '\n') {
+					if (!ignore_whitespace
+					    || !isspace(buf[i]))
+						hash = hash * 23 + buf[i];
+					line_end++;
+				} else
+					eol = buf[i];
+				i++;
+			}
+		}
+
+		/* When not at the end of data, the line ending char ('\r' or
+		 * '\n') must follow */
+		if (line_end < end)
+			line_end++;
+		/* If that was an '\r', also pull in any following '\n' */
+		if (line_end < end && eol == '\r') {
+			if (fseeko(d->root->f, line_end, SEEK_SET) == -1)
+				return errno;
+			r = fread(buf, sizeof(char), sizeof(buf), d->root->f);
+			if (r == 0 && ferror(d->root->f))
+				return errno;
+			if (r == 1 && buf[0] == '\n' )
+				line_end++;
+		}
+
+		/* Record the found line as diff atom */
+		ARRAYLIST_ADD(atom, d->atoms);
+		if (!atom)
+			return ENOMEM;
+
+		*atom = (struct diff_atom){
+			.root = d,
+			.pos = pos,
+			.at = NULL,	/* atom data is not memory-mapped */
+			.len = line_end - pos,
+			.hash = hash,
+		};
+
+		/* Starting point for next line: */
+		pos = line_end;
+		if (fseeko(d->root->f, pos, SEEK_SET) == -1)
+			return errno;
+	}
+
+	return DIFF_RC_OK;
+}
+
+static int
+diff_data_atomize_text_lines_mmap(struct diff_data *d)
+{
+	const uint8_t *pos = d->data;
+	const uint8_t *end = pos + d->len;
+	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
+
+	unsigned int array_size_estimate = d->len / 50;
+	unsigned int pow2 = 1;
+	while (array_size_estimate >>= 1)
+		pow2++;
+
+	ARRAYLIST_INIT(d->atoms, 1 << pow2);
+
+	while (pos < end) {
+		const uint8_t *line_end = pos;
+		unsigned int hash = 0;
+
+		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
+			if (!ignore_whitespace
+			    || !isspace(*line_end))
+				hash = hash * 23 + *line_end;
+			line_end++;
+		}
+
+		/* When not at the end of data, the line ending char ('\r' or
+		 * '\n') must follow */
+		if (line_end < end)
+			line_end++;
+		/* If that was an '\r', also pull in any following '\n' */
+		if (line_end < end - 1 && line_end[0] == '\r' &&
+		    line_end[1] == '\n')
+			line_end++;
+
+		/* Record the found line as diff atom */
+		struct diff_atom *atom;
+		ARRAYLIST_ADD(atom, d->atoms);
+		if (!atom)
+			return ENOMEM;
+
+		*atom = (struct diff_atom){
+			.root = d,
+			.pos = (off_t)(pos - d->data),
+			.at = pos,
+			.len = line_end - pos,
+			.hash = hash,
+		};
+
+		/* Starting point for next line: */
+		pos = line_end;
+	}
+
+	return DIFF_RC_OK;
+}
+
+static int
+diff_data_atomize_text_lines(struct diff_data *d)
+{
+	if (d->data == NULL)
+		return diff_data_atomize_text_lines_fd(d);
+	else
+		return diff_data_atomize_text_lines_mmap(d);
+}
+
+int
+diff_atomize_text_by_line(void *func_data, struct diff_data *d)
+{
+	return diff_data_atomize_text_lines(d);
+}
blob - /dev/null
blob + 0ed8a5e2244549f0bc385a02aa8eb6f5d610deac (mode 644)
--- /dev/null
+++ lib/diff_debug.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#define DEBUG 0
+
+#if DEBUG
+#include <stdio.h>
+#include <unistd.h>
+#define print(args...) fprintf(stderr, ##args)
+#define debug print
+#define debug_dump dump
+#define debug_dump_atom dump_atom
+#define debug_dump_atoms dump_atoms
+
+static inline void
+print_atom_byte(unsigned char c) {
+	if (c == '\r')
+		print("\\r");
+	else if (c == '\n')
+		print("\\n");
+	else if ((c < 32 || c >= 127) && (c != '\t'))
+		print("\\x%02x", c);
+	else
+		print("%c", c);
+}
+
+static inline void
+dump_atom(const struct diff_data *left, const struct diff_data *right,
+	  const struct diff_atom *atom)
+{
+	if (!atom) {
+		print("NULL atom\n");
+		return;
+	}
+	if (left)
+		print(" %3u '", diff_atom_root_idx(left, atom));
+
+	if (atom->at == NULL) {
+		off_t remain = atom->len;
+		if (fseek(atom->root->f, atom->pos, SEEK_SET) == -1)
+			abort(); /* cannot return error */
+		while (remain > 0) {
+			char buf[16];
+			size_t r;
+			int i;
+			r = fread(buf, 1, MIN(remain, sizeof(buf)),
+			    atom->root->f);
+			if (r == -1)
+				abort(); /* cannot return error */
+			if (r == 0)
+				break;
+			remain -= r;
+			for (i = 0; i < r; i++)
+				print_atom_byte(buf[i]);
+		}
+	} else {
+		const char *s;
+		for (s = atom->at; s < (const char*)(atom->at + atom->len); s++)
+			print_atom_byte(*s);
+	}
+	print("'\n");
+}
+
+static inline void
+dump_atoms(const struct diff_data *d, struct diff_atom *atom,
+	   unsigned int count)
+{
+	if (count > 42) {
+		dump_atoms(d, atom, 20);
+		print("[%u lines skipped]\n", count - 20 - 20);
+		dump_atoms(d, atom + count - 20, 20);
+		return;
+	} else {
+		struct diff_atom *i;
+		foreach_diff_atom(i, atom, count) {
+			dump_atom(d, NULL, i);
+		}
+	}
+}
+
+static inline void
+dump(struct diff_data *d)
+{
+	dump_atoms(d, d->atoms.head, d->atoms.len);
+}
+
+/* kd is a quadratic space myers matrix from the original Myers algorithm.
+ * kd_forward and kd_backward are linear slices of a myers matrix from the Myers
+ * Divide algorithm.
+ */
+static inline void
+dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+		 int *kd, int *kd_forward, int kd_forward_d,
+		 int *kd_backward, int kd_backward_d)
+{
+	#define COLOR_YELLOW "\033[1;33m"
+	#define COLOR_GREEN "\033[1;32m"
+	#define COLOR_BLUE "\033[1;34m"
+	#define COLOR_RED "\033[1;31m"
+	#define COLOR_END "\033[0;m"
+	int x;
+	int y;
+	print("   ");
+	for (x = 0; x <= l->atoms.len; x++)
+		print("%2d", x % 100);
+	print("\n");
+
+	for (y = 0; y <= r->atoms.len; y++) {
+		print("%3d ", y);
+		for (x = 0; x <= l->atoms.len; x++) {
+
+			/* print d advancements from kd, if any. */
+			char label = 'o';
+			char *color = NULL;
+			if (kd) {
+				int max = l->atoms.len + r->atoms.len;
+				size_t kd_len = max + 1 + max;
+				int *kd_pos = kd;
+				int di;
+#define xk_to_y(X, K) ((X) - (K))
+				for (di = 0; di < max; di++) {
+					int ki;
+					for (ki = di; ki >= -di; ki -= 2) {
+						if (x != kd_pos[ki]
+						    || y != xk_to_y(x, ki))
+							continue;
+						label = '0' + (di % 10);
+						color = COLOR_YELLOW;
+						break;
+					}
+					if (label != 'o')
+						break;
+					kd_pos += kd_len;
+				}
+			}
+			if (kd_forward && kd_forward_d >= 0) {
+#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
+				int ki;
+				for (ki = kd_forward_d;
+				     ki >= -kd_forward_d;
+				     ki -= 2) {
+					if (x != kd_forward[ki])
+						continue;
+					if (y != xk_to_y(x, ki))
+						continue;
+					label = 'F';
+					color = COLOR_GREEN;
+					break;
+				}
+			}
+			if (kd_backward && kd_backward_d >= 0) {
+				int delta = (int)r->atoms.len
+					    - (int)l->atoms.len;
+				int ki;
+				for (ki = kd_backward_d;
+				     ki >= -kd_backward_d;
+				     ki -= 2) {
+					if (x != kd_backward[ki])
+						continue;
+					if (y != xc_to_y(x, ki, delta))
+						continue;
+					if (label == 'o') {
+						label = 'B';
+						color = COLOR_BLUE;
+					} else {
+						label = 'X';
+						color = COLOR_RED;
+					}
+					break;
+				}
+			}
+			if (color)
+				print("%s", color);
+			print("%c", label);
+			if (color)
+				print("%s", COLOR_END);
+			if (x < l->atoms.len)
+				print("-");
+		}
+		print("\n");
+		if (y == r->atoms.len)
+			break;
+
+		print("    ");
+		for (x = 0; x < l->atoms.len; x++) {
+			bool same;
+			diff_atom_same(&same, &l->atoms.head[x],
+			    &r->atoms.head[y]);
+			if (same)
+				print("|\\");
+			else
+				print("| ");
+		}
+		print("|\n");
+	}
+}
+
+static inline void
+debug_dump_myers_graph(const struct diff_data *l, const struct diff_data *r,
+		       int *kd, int *kd_forward, int kd_forward_d,
+		       int *kd_backward, int kd_backward_d)
+{
+	if (l->atoms.len > 99 || r->atoms.len > 99)
+		return;
+	dump_myers_graph(l, r, kd, kd_forward, kd_forward_d,
+			 kd_backward, kd_backward_d);
+}
+
+#else
+#define debug(args...)
+#define debug_dump(args...)
+#define debug_dump_atom(args...)
+#define debug_dump_atoms(args...)
+#define debug_dump_myers_graph(args...)
+#endif
blob - /dev/null
blob + 94ef28c472ae1b07dee34bf8618414ded3037d74 (mode 644)
--- /dev/null
+++ lib/diff_internal.h
@@ -0,0 +1,244 @@
+/* Generic infrastructure to implement various diff algorithms. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#ifndef MAX
+#define MAX(A,B) ((A)>(B)?(A):(B))
+#endif
+#ifndef MIN
+#define MIN(A,B) ((A)<(B)?(A):(B))
+#endif
+
+static inline bool
+diff_range_empty(const struct diff_range *r)
+{
+	return r->start == r->end;
+}
+
+static inline bool
+diff_ranges_touch(const struct diff_range *a, const struct diff_range *b)
+{
+	return (a->end >= b->start) && (a->start <= b->end);
+}
+
+static inline void
+diff_ranges_merge(struct diff_range *a, const struct diff_range *b)
+{
+	*a = (struct diff_range){
+		.start = MIN(a->start, b->start),
+		.end = MAX(a->end, b->end),
+	};
+}
+
+static inline int
+diff_range_len(const struct diff_range *r)
+{
+	if (!r)
+		return 0;
+	return r->end - r->start;
+}
+
+/* List of all possible return codes of a diff invocation. */
+#define DIFF_RC_USE_DIFF_ALGO_FALLBACK	-1
+#define DIFF_RC_OK			0
+/* Any positive return values are errno values from sys/errno.h */
+
+struct diff_data;
+
+struct diff_atom {
+	struct diff_data *root; /* back pointer to root diff data */
+
+	off_t pos;		/* if not memory-mapped */
+	const uint8_t *at;	/* if memory-mapped */
+	off_t len;
+
+	/* This hash is just a very cheap speed up for finding *mismatching*
+	 * atoms. When hashes match, we still need to compare entire atoms to
+	 * find out whether they are indeed identical or not. */
+	unsigned int hash;
+};
+
+int
+diff_atom_cmp(int *cmp,
+	      const struct diff_atom *left,
+	      const struct diff_atom *right);
+
+/* Indicate whether two given diff atoms match. */
+int
+diff_atom_same(bool *same,
+	       const struct diff_atom *left,
+	       const struct diff_atom *right);
+
+/* The atom's index in the entire file. For atoms divided by lines of text, this
+ * yields the line number (starting with 0). Also works for diff_data that
+ * reference only a subsection of a file, always reflecting the global position
+ * in the file (and not the relative position within the subsection). */
+#define diff_atom_root_idx(DIFF_DATA, ATOM) \
+	((ATOM) && ((ATOM) >= (DIFF_DATA)->root->atoms.head) \
+	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->root->atoms.head)) \
+	 : (DIFF_DATA)->root->atoms.len)
+
+/* The atom's index within DIFF_DATA. For atoms divided by lines of text, this
+ * yields the line number (starting with 0). */
+#define diff_atom_idx(DIFF_DATA, ATOM) \
+	((ATOM) && ((ATOM) >= (DIFF_DATA)->atoms.head) \
+	 ? (unsigned int)((ATOM) - ((DIFF_DATA)->atoms.head)) \
+	 : (DIFF_DATA)->atoms.len)
+
+#define foreach_diff_atom(ATOM, FIRST_ATOM, COUNT) \
+	for ((ATOM) = (FIRST_ATOM); \
+	     (ATOM) \
+	     && ((ATOM) >= (FIRST_ATOM)) \
+	     && ((ATOM) - (FIRST_ATOM) < (COUNT)); \
+	     (ATOM)++)
+
+#define diff_data_foreach_atom(ATOM, DIFF_DATA) \
+	foreach_diff_atom(ATOM, (DIFF_DATA)->atoms.head, (DIFF_DATA)->atoms.len)
+
+#define diff_data_foreach_atom_from(FROM, ATOM, DIFF_DATA) \
+	for ((ATOM) = (FROM); \
+	     (ATOM) \
+	     && ((ATOM) >= (DIFF_DATA)->atoms.head) \
+	     && ((ATOM) - (DIFF_DATA)->atoms.head < (DIFF_DATA)->atoms.len); \
+	     (ATOM)++)
+
+#define diff_data_foreach_atom_backwards_from(FROM, ATOM, DIFF_DATA) \
+	for ((ATOM) = (FROM); \
+	     (ATOM) \
+	     && ((ATOM) >= (DIFF_DATA)->atoms.head) \
+	     && ((ATOM) - (DIFF_DATA)->atoms.head >= 0); \
+	     (ATOM)--)
+
+/* A diff chunk represents a set of atoms on the left and/or a set of atoms on
+ * the right.
+ *
+ * If solved == false:
+ * The diff algorithm has divided the source file, and this is a chunk that the
+ * inner_algo should run on next.
+ * The lines on the left should be diffed against the lines on the right.
+ * (If there are no left lines or no right lines, it implies solved == true,
+ * because there is nothing to diff.)
+ *
+ * If solved == true:
+ * If there are only left atoms, it is a chunk removing atoms from the left ("a
+ * minus chunk").
+ * If there are only right atoms, it is a chunk adding atoms from the right ("a
+ * plus chunk").
+ * If there are both left and right lines, it is a chunk of equal content on
+ * both sides, and left_count == right_count:
+ *
+ * - foo  }
+ * - bar  }-- diff_chunk{ left_start = &left.atoms.head[0], left_count = 3,
+ * - baz  }               right_start = NULL, right_count = 0 }
+ *   moo    }
+ *   goo    }-- diff_chunk{ left_start = &left.atoms.head[3], left_count = 3,
+ *   zoo    }              right_start = &right.atoms.head[0], right_count = 3 }
+ *  +loo      }
+ *  +roo      }-- diff_chunk{ left_start = NULL, left_count = 0,
+ *  +too      }            right_start = &right.atoms.head[3], right_count = 3 }
+ *
+ */
+struct diff_chunk {
+	bool solved;
+	struct diff_atom *left_start;
+	unsigned int left_count;
+	struct diff_atom *right_start;
+	unsigned int right_count;
+};
+
+#define DIFF_RESULT_ALLOC_BLOCKSIZE 128
+
+enum diff_chunk_type {
+	CHUNK_EMPTY,
+	CHUNK_PLUS,
+	CHUNK_MINUS,
+	CHUNK_SAME,
+	CHUNK_ERROR,
+};
+
+static inline enum diff_chunk_type
+diff_chunk_type(const struct diff_chunk *chunk)
+{
+	if (!chunk->left_count && !chunk->right_count)
+		return CHUNK_EMPTY;
+	if (!chunk->solved)
+		return CHUNK_ERROR;
+	if (!chunk->right_count)
+		return CHUNK_MINUS;
+	if (!chunk->left_count)
+		return CHUNK_PLUS;
+	if (chunk->left_count != chunk->right_count)
+		return CHUNK_ERROR;
+	return CHUNK_SAME;
+}
+
+struct diff_chunk_context;
+
+bool
+diff_chunk_context_empty(const struct diff_chunk_context *cc);
+
+bool
+diff_chunk_contexts_touch(const struct diff_chunk_context *cc,
+			  const struct diff_chunk_context *other);
+
+void
+diff_chunk_contexts_merge(struct diff_chunk_context *cc,
+			  const struct diff_chunk_context *other);
+
+struct diff_state {
+	/* The final result passed to the original diff caller. */
+	struct diff_result *result;
+
+	/* The root diff_data is in result->left,right, these are (possibly)
+	 * subsections of the root data. */
+	struct diff_data left;
+	struct diff_data right;
+
+	unsigned int recursion_depth_left;
+
+	/* Remaining chunks from one diff algorithm pass, if any solved == false
+	 * chunks came up. */
+	diff_chunk_arraylist_t temp_result;
+
+ 	/* State buffer used by Myers algorithm. */
+	int *kd_buf;
+	size_t kd_buf_size; /* in units of sizeof(int), not bytes */
+};
+
+struct diff_chunk *diff_state_add_chunk(struct diff_state *state, bool solved,
+					struct diff_atom *left_start,
+					unsigned int left_count,
+					struct diff_atom *right_start,
+					unsigned int right_count);
+
+struct diff_output_info;
+
+int diff_output_lines(struct diff_output_info *output_info, FILE *dest,
+		       const char *prefix, struct diff_atom *start_atom,
+		       unsigned int count);
+
+int diff_output_trailing_newline_msg(struct diff_output_info *outinfo,
+				     FILE *dest,
+				     const struct diff_chunk *c);
+int diff_output_match_function_prototype(char **prototype,
+					 const struct diff_result *result,
+					 const struct diff_chunk_context *cc);
+
+struct diff_output_info *diff_output_info_alloc(void);
+
+void
+diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
+			  struct diff_atom *from_atom, unsigned int atoms_count);
blob - /dev/null
blob + 1d0c6fd4faa11438034964c34bfb319969e36bff (mode 644)
--- /dev/null
+++ lib/diff_main.c
@@ -0,0 +1,608 @@
+/* Generic infrastructure to implement various diff algorithms (implementation). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+
+#include <sys/queue.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <unistd.h>
+
+#include <assert.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+
+#include "diff_internal.h"
+#include "diff_debug.h"
+
+static int
+read_at(FILE *f, off_t at_pos, unsigned char *buf, size_t len)
+{
+	int r;
+	if (fseeko(f, at_pos, SEEK_SET) == -1)
+		return errno;
+	r = fread(buf, sizeof(char), len, f);
+	if ((r == 0 || r < len) && ferror(f))
+		return errno;
+	if (r != len)
+		return EIO;
+	return 0;
+}
+
+static int
+buf_cmp(const unsigned char *left, size_t left_len,
+	const unsigned char *right, size_t right_len,
+	bool ignore_whitespace)
+{
+	int cmp;
+
+	if (ignore_whitespace) {
+		int il = 0, ir = 0;
+		while (il < left_len && ir < right_len) {
+			unsigned char cl = left[il];
+			unsigned char cr = right[ir];
+
+			if (isspace(cl) && il < left_len) {
+				il++;
+				continue;
+			}
+			if (isspace(cr) && ir < right_len) {
+				ir++;
+				continue;
+			}
+
+			if (cl > cr)
+				return 1;
+			if (cr > cl)
+				return -1;
+			il++;
+			ir++;
+		}
+		while (il < left_len) {
+			unsigned char cl = left[il++];
+			if (!isspace(cl))
+				return 1;
+		}
+		while (ir < right_len) {
+			unsigned char cr = right[ir++];
+			if (!isspace(cr))
+				return -1;
+		}
+
+		return 0;
+	}
+
+	cmp = memcmp(left, right, MIN(left_len, right_len));
+	if (cmp)
+		return cmp;
+	if (left_len == right_len)
+		return 0;
+	return (left_len > right_len) ? 1 : -1;
+}
+
+int
+diff_atom_cmp(int *cmp,
+	      const struct diff_atom *left,
+	      const struct diff_atom *right)
+{
+	off_t remain_left, remain_right;
+	int flags = (left->root->diff_flags | right->root->diff_flags);
+	bool ignore_whitespace = (flags & DIFF_FLAG_IGNORE_WHITESPACE);
+
+	if (!left->len && !right->len) {
+		*cmp = 0;
+		return 0;
+	}
+	if (!ignore_whitespace) {
+		if (!right->len) {
+			*cmp = 1;
+			return 0;
+		}
+		if (!left->len) {
+			*cmp = -1;
+			return 0;
+		}
+	}
+
+	if (left->at != NULL && right->at != NULL) {
+		*cmp = buf_cmp(left->at, left->len, right->at, right->len,
+		    ignore_whitespace);
+		return 0;
+	}
+
+	remain_left = left->len;
+	remain_right = right->len;
+	while (remain_left > 0 || remain_right > 0) {
+		const size_t chunksz = 8192;
+		unsigned char buf_left[chunksz], buf_right[chunksz];
+		const uint8_t *p_left, *p_right;
+		off_t n_left, n_right;
+		ssize_t r;
+
+		if (!remain_right) {
+			*cmp = 1;
+			return 0;
+		}
+		if (!remain_left) {
+			*cmp = -1;
+			return 0;
+		}
+
+		n_left = MIN(chunksz, remain_left);
+		n_right = MIN(chunksz, remain_right);
+
+		if (left->at == NULL) {
+			r = read_at(left->root->f,
+				    left->pos + (left->len - remain_left),
+				    buf_left, n_left);
+			if (r) {
+				*cmp = 0;
+				return r;
+			}
+			p_left = buf_left;
+		} else {
+			p_left = left->at + (left->len - remain_left);
+		}
+
+		if (right->at == NULL) {
+			r = read_at(right->root->f,
+				    right->pos + (right->len - remain_right),
+				    buf_right, n_right);
+			if (r) {
+				*cmp = 0;
+				return r;
+			}
+			p_right = buf_right;
+		} else {
+			p_right = right->at + (right->len - remain_right);
+		}
+
+		r = buf_cmp(p_left, n_left, p_right, n_right,
+		    ignore_whitespace);
+		if (r) {
+			*cmp = r;
+			return 0;
+		}
+
+		remain_left -= n_left;
+		remain_right -= n_right;
+	}
+
+	*cmp = 0;
+	return 0;
+}
+
+int
+diff_atom_same(bool *same,
+	       const struct diff_atom *left,
+	       const struct diff_atom *right)
+{
+	int cmp;
+	int r;
+	if (left->hash != right->hash) {
+		*same = false;
+		return 0;
+	}
+	r = diff_atom_cmp(&cmp, left, right);
+	if (r) {
+		*same = true;
+		return r;
+	}
+	*same = (cmp == 0);
+	return 0;
+}
+
+static struct diff_chunk *
+diff_state_add_solved_chunk(struct diff_state *state,
+			    const struct diff_chunk *chunk)
+{
+	diff_chunk_arraylist_t *result;
+	struct diff_chunk *new_chunk;
+	enum diff_chunk_type last_t;
+	enum diff_chunk_type new_t;
+
+	/* Append to solved chunks; make sure that adjacent chunks of same type are combined, and that a minus chunk
+	 * never directly follows a plus chunk. */
+	result = &state->result->chunks;
+
+	last_t = diff_chunk_type(&result->head[result->len - 1]);
+	new_t = diff_chunk_type(chunk);
+
+	debug("Add %s chunk:\n", chunk->solved ? "solved" : "UNSOLVED");
+	debug("L\n");
+	debug_dump_atoms(&state->left, chunk->left_start, chunk->left_count);
+	debug("R\n");
+	debug_dump_atoms(&state->right, chunk->right_start, chunk->right_count);
+
+	if (new_t == last_t) {
+		new_chunk = &result->head[result->len - 1];
+		new_chunk->left_count += chunk->left_count;
+		new_chunk->right_count += chunk->right_count;
+		debug("  - added chunk touches previous one of same type, joined:\n");
+		debug("L\n");
+		debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
+		debug("R\n");
+		debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
+	} else if (last_t == CHUNK_PLUS && new_t == CHUNK_MINUS) {
+		enum diff_chunk_type prev_last_t =
+			result->len > 1 ?
+				diff_chunk_type(&result->head[result->len - 2])
+				: CHUNK_EMPTY;
+		/* If a minus-chunk follows a plus-chunk, place it above the plus-chunk->
+		 * Is the one before that also a minus? combine. */
+		if (prev_last_t == CHUNK_MINUS) {
+			new_chunk = &result->head[result->len - 2];
+			new_chunk->left_count += chunk->left_count;
+			new_chunk->right_count += chunk->right_count;
+
+			debug("  - added minus-chunk follows plus-chunk,"
+			      " put before that plus-chunk and joined"
+			      " with preceding minus-chunk:\n");
+			debug("L\n");
+			debug_dump_atoms(&state->left, new_chunk->left_start, new_chunk->left_count);
+			debug("R\n");
+			debug_dump_atoms(&state->right, new_chunk->right_start, new_chunk->right_count);
+		} else {
+			ARRAYLIST_INSERT(new_chunk, *result, result->len - 1);
+			if (!new_chunk)
+				return NULL;
+			*new_chunk = *chunk;
+
+			debug("  - added minus-chunk follows plus-chunk,"
+			      " put before that plus-chunk\n");
+		}
+	} else {
+		ARRAYLIST_ADD(new_chunk, *result);
+		if (!new_chunk)
+			return NULL;
+		*new_chunk = *chunk;
+	}
+	return new_chunk;
+}
+
+/* Even if a left or right side is empty, diff output may need to know the
+ * position in that file.
+ * So left_start or right_start must never be NULL -- pass left_count or
+ * right_count as zero to indicate staying at that position without consuming
+ * any lines. */
+struct diff_chunk *
+diff_state_add_chunk(struct diff_state *state, bool solved,
+		     struct diff_atom *left_start, unsigned int left_count,
+		     struct diff_atom *right_start, unsigned int right_count)
+{
+	diff_chunk_arraylist_t *result = NULL;
+	struct diff_chunk *new_chunk;
+	struct diff_chunk chunk = {
+		.solved = solved,
+		.left_start = left_start,
+		.left_count = left_count,
+		.right_start = right_start,
+		.right_count = right_count,
+	};
+
+	if (!solved || state->temp_result.len) {
+		/* Append to temp_result */
+		result = &state->temp_result;
+		debug("append to temp_result %p L %u  R %u\n", result,
+		      left_count, right_count);
+		debug("L\n");
+		debug_dump_atoms(&state->left, left_start, left_count);
+		debug("R\n");
+		debug_dump_atoms(&state->right, right_start, right_count);
+	} else if (!state->result->chunks.len) {
+		/* Append to final result */
+		result = &state->result->chunks;
+		debug("Add first chunk:\n");
+		debug("L\n");
+		debug_dump_atoms(&state->left, left_start, left_count);
+		debug("R\n");
+		debug_dump_atoms(&state->right, right_start, right_count);
+	}
+	if (result) {
+		ARRAYLIST_ADD(new_chunk, *result);
+		if (!new_chunk)
+			return NULL;
+		*new_chunk = chunk;
+		return new_chunk;
+	}
+
+	return diff_state_add_solved_chunk(state, &chunk);
+}
+
+void
+diff_data_init_root(struct diff_data *d, FILE *f, const uint8_t *data,
+    unsigned long long len, int diff_flags)
+{
+	*d = (struct diff_data){
+		.f = f,
+		.pos = 0,
+		.data = data,
+		.len = len,
+		.root = d,
+		.diff_flags = diff_flags,
+	};
+}
+
+void
+diff_data_init_subsection(struct diff_data *d, struct diff_data *parent,
+			  struct diff_atom *from_atom, unsigned int atoms_count)
+{
+	struct diff_atom *last_atom;
+
+	debug("diff_data %p  parent %p  from_atom %p  atoms_count %u\n",
+	      d, parent, from_atom, atoms_count);
+	debug("  from_atom ");
+	debug_dump_atom(parent, NULL, from_atom);
+
+	if (atoms_count == 0) {
+		*d = (struct diff_data){
+			.f = NULL,
+			.pos = 0,
+			.data = NULL,
+			.len = 0,
+			.root = parent->root,
+			.atoms.head = NULL,
+			.atoms.len = atoms_count,
+		};
+
+		return;
+	}
+
+	last_atom = from_atom + atoms_count - 1;
+	*d = (struct diff_data){
+		.f = NULL,
+		.pos = from_atom->pos,
+		.data = from_atom->at,
+		.len = (last_atom->pos + last_atom->len) - from_atom->pos,
+		.root = parent->root,
+		.atoms.head = from_atom,
+		.atoms.len = atoms_count,
+	};
+
+	debug("subsection:\n");
+	debug_dump(d);
+}
+
+void
+diff_data_free(struct diff_data *diff_data)
+{
+	if (!diff_data)
+		return;
+	if (diff_data->atoms.allocated)
+		ARRAYLIST_FREE(diff_data->atoms);
+}
+
+int
+diff_algo_none(const struct diff_algo_config *algo_config,
+	       struct diff_state *state)
+{
+	debug("\n** %s\n", __func__);
+	debug("left:\n");
+	debug_dump(&state->left);
+	debug("right:\n");
+	debug_dump(&state->right);
+	debug_dump_myers_graph(&state->left, &state->right, NULL, NULL, 0, NULL,
+			       0);
+
+	/* Add a chunk of equal lines, if any */
+	struct diff_atom *l = state->left.atoms.head;
+	unsigned int l_len = state->left.atoms.len;
+	struct diff_atom *r = state->right.atoms.head;
+	unsigned int r_len = state->right.atoms.len;
+	unsigned int equal_atoms_start = 0;
+	unsigned int equal_atoms_end = 0;
+	unsigned int l_idx = 0;
+	unsigned int r_idx = 0;
+
+	while (equal_atoms_start < l_len
+	       && equal_atoms_start < r_len) {
+		int err;
+		bool same;
+		err = diff_atom_same(&same, &l[equal_atoms_start],
+				     &r[equal_atoms_start]);
+		if (err)
+			return err;
+		if (!same)
+			break;
+		equal_atoms_start++;
+	}
+	while (equal_atoms_end < (l_len - equal_atoms_start)
+	       && equal_atoms_end < (r_len - equal_atoms_start)) {
+		int err;
+		bool same;
+		err = diff_atom_same(&same, &l[l_len - 1 - equal_atoms_end],
+				   &r[r_len - 1 - equal_atoms_end]);
+		if (err)
+			return err;
+		if (!same)
+			break;
+		equal_atoms_end++;
+	}
+
+	/* Add a chunk of equal lines at the start */
+	if (equal_atoms_start) {
+		if (!diff_state_add_chunk(state, true,
+					  l, equal_atoms_start,
+					  r, equal_atoms_start))
+			return ENOMEM;
+		l_idx += equal_atoms_start;
+		r_idx += equal_atoms_start;
+	}
+
+	/* Add a "minus" chunk with all lines from the left. */
+	if (equal_atoms_start + equal_atoms_end < l_len) {
+		unsigned int add_len = l_len - equal_atoms_start - equal_atoms_end;
+		if (!diff_state_add_chunk(state, true,
+					  &l[l_idx], add_len,
+					  &r[r_idx], 0))
+			return ENOMEM;
+		l_idx += add_len;
+	}
+
+	/* Add a "plus" chunk with all lines from the right. */
+	if (equal_atoms_start + equal_atoms_end < r_len) {
+		unsigned int add_len = r_len - equal_atoms_start - equal_atoms_end;
+		if (!diff_state_add_chunk(state, true,
+					  &l[l_idx], 0,
+					  &r[r_idx], add_len))
+			return ENOMEM;
+		r_idx += add_len;
+	}
+
+	/* Add a chunk of equal lines at the end */
+	if (equal_atoms_end) {
+		if (!diff_state_add_chunk(state, true,
+					  &l[l_idx], equal_atoms_end,
+					  &r[r_idx], equal_atoms_end))
+			return ENOMEM;
+	}
+
+	return DIFF_RC_OK;
+}
+
+int
+diff_run_algo(const struct diff_algo_config *algo_config,
+	      struct diff_state *state)
+{
+	int rc;
+
+	if (!algo_config || !algo_config->impl
+	    || !state->recursion_depth_left
+	    || !state->left.atoms.len || !state->right.atoms.len) {
+		debug("Fall back to diff_algo_none():%s%s%s\n",
+		      (!algo_config || !algo_config->impl) ? " no-cfg" : "",
+		      (!state->recursion_depth_left) ? " max-depth" : "",
+		      (!state->left.atoms.len || !state->right.atoms.len)?
+			      " trivial" : "");
+		return diff_algo_none(algo_config, state);
+	}
+
+	ARRAYLIST_FREE(state->temp_result);
+	ARRAYLIST_INIT(state->temp_result, DIFF_RESULT_ALLOC_BLOCKSIZE);
+	rc = algo_config->impl(algo_config, state);
+	switch (rc) {
+	case DIFF_RC_USE_DIFF_ALGO_FALLBACK:
+		debug("Got DIFF_RC_USE_DIFF_ALGO_FALLBACK (%p)\n",
+		      algo_config->fallback_algo);
+		rc = diff_run_algo(algo_config->fallback_algo, state);
+		goto return_rc;
+
+	case DIFF_RC_OK:
+		/* continue below */
+		break;
+
+	default:
+		/* some error happened */
+		goto return_rc;
+	}
+
+	/* Pick up any diff chunks that are still unsolved and feed to
+	 * inner_algo.  inner_algo will solve unsolved chunks and append to
+	 * result, and subsequent solved chunks on this level are then appended
+	 * to result afterwards. */
+	int i;
+	for (i = 0; i < state->temp_result.len; i++) {
+		struct diff_chunk *c = &state->temp_result.head[i];
+		if (c->solved) {
+			diff_state_add_solved_chunk(state, c);
+			continue;
+		}
+
+		/* c is an unsolved chunk, feed to inner_algo */
+		struct diff_state inner_state = {
+			.result = state->result,
+			.recursion_depth_left = state->recursion_depth_left - 1,
+			.kd_buf = state->kd_buf,
+			.kd_buf_size = state->kd_buf_size,
+		};
+		diff_data_init_subsection(&inner_state.left, &state->left,
+					  c->left_start, c->left_count);
+		diff_data_init_subsection(&inner_state.right, &state->right,
+					  c->right_start, c->right_count);
+
+		rc = diff_run_algo(algo_config->inner_algo, &inner_state);
+		state->kd_buf = inner_state.kd_buf;
+		state->kd_buf_size = inner_state.kd_buf_size;
+		if (rc != DIFF_RC_OK)
+			goto return_rc;
+	}
+
+	rc = DIFF_RC_OK;
+return_rc:
+	ARRAYLIST_FREE(state->temp_result);
+	return rc;
+}
+
+int
+diff_atomize_file(struct diff_data *d,
+		  const struct diff_config *config,
+		  FILE *f, const uint8_t *data, off_t len, int diff_flags)
+{
+	if (!config->atomize_func)
+		return EINVAL;
+
+	diff_data_init_root(d, f, data, len, diff_flags);
+
+	return config->atomize_func(config->atomize_func_data, d);
+
+}
+
+struct diff_result *
+diff_main(const struct diff_config *config, struct diff_data *left,
+	  struct diff_data *right)
+{
+	struct diff_result *result = malloc(sizeof(struct diff_result));
+	if (!result)
+		return NULL;
+
+	*result = (struct diff_result){};
+	result->left = left;
+	result->right = right;
+
+	struct diff_state state = {
+		.result = result,
+		.recursion_depth_left = config->max_recursion_depth ? : 32,
+		.kd_buf = NULL,
+		.kd_buf_size = 0,
+	};
+	diff_data_init_subsection(&state.left, left,
+				  left->atoms.head,
+				  left->atoms.len);
+	diff_data_init_subsection(&state.right, right,
+				  right->atoms.head,
+				  right->atoms.len);
+
+	result->rc = diff_run_algo(config->algo, &state);
+	free(state.kd_buf);
+
+	return result;
+}
+
+void
+diff_result_free(struct diff_result *result)
+{
+	if (!result)
+		return;
+	ARRAYLIST_FREE(result->chunks);
+	free(result);
+}
blob - /dev/null
blob + 40142164742fb5bb3d4316a5bca80a90d132836c (mode 644)
--- /dev/null
+++ lib/diff_main.h
@@ -0,0 +1,182 @@
+/* Generic infrastructure to implement various diff algorithms. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+struct diff_range {
+	int start;
+	int end;
+};
+
+/* List of all possible return codes of a diff invocation. */
+#define DIFF_RC_USE_DIFF_ALGO_FALLBACK	-1
+#define DIFF_RC_OK			0
+/* Any positive return values are errno values from sys/errno.h */
+
+struct diff_atom;
+
+/* For each file, there is a "root" struct diff_data referencing the entire
+ * file, which the atoms are parsed from. In recursion of diff algorithm, there
+ * may be "child" struct diff_data only referencing a subsection of the file,
+ * re-using the atoms parsing. For "root" structs, atoms_allocated will be
+ * nonzero, indicating that the array of atoms is owned by that struct. For
+ * "child" structs, atoms_allocated == 0, to indicate that the struct is
+ * referencing a subset of atoms. */
+struct diff_data {
+	FILE *f;		/* if root diff_data and not memory-mapped */
+	off_t pos;		/* if not memory-mapped */
+	const uint8_t *data;	/* if memory-mapped */
+	off_t len;
+
+	ARRAYLIST(struct diff_atom) atoms;
+	struct diff_data *root;
+	struct diff_data *current;
+	void *algo_data;
+
+	int diff_flags;
+
+	int err;
+};
+
+#define DIFF_FLAG_IGNORE_WHITESPACE	0x00000001
+#define DIFF_FLAG_SHOW_PROTOTYPES	0x00000002
+
+void diff_data_free(struct diff_data *diff_data);
+
+struct diff_chunk;
+typedef ARRAYLIST(struct diff_chunk) diff_chunk_arraylist_t;
+
+struct diff_result {
+	int rc;
+
+	/*
+	 * Pointers to diff data passed in via diff_main.
+	 * Do not free these diff_data before freeing the diff_result struct.
+	 */
+	struct diff_data *left;
+	struct diff_data *right;
+
+	diff_chunk_arraylist_t chunks;
+};
+
+struct diff_state;
+
+/* Signature of a utility function to divide a file into diff atoms.
+ * An example is diff_atomize_text_by_line() in diff_atomize_text.c.
+ *
+ * func_data: context pointer (free to be used by implementation).
+ * d: struct diff_data with d->data and d->len already set up, and
+ * d->atoms to be created.
+ */
+typedef int (*diff_atomize_func_t)(void *func_data, struct diff_data *d);
+
+extern int diff_atomize_text_by_line(void *func_data, struct diff_data *d);
+
+struct diff_algo_config;
+typedef int (*diff_algo_impl_t)(
+	const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Form a result with all left-side removed and all right-side added, i.e. no
+ * actual diff algorithm involved. */
+int diff_algo_none(const struct diff_algo_config *algo_config,
+	struct diff_state *state);
+
+/* Myers Diff tracing from the start all the way through to the end, requiring
+ * quadratic amounts of memory. This can fail if the required space surpasses
+ * algo_config->permitted_state_size. */
+extern int diff_algo_myers(const struct diff_algo_config *algo_config,
+	struct diff_state *state);
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from
+ * the end to find a midpoint that divides the problem into smaller chunks.
+ * Requires only linear amounts of memory. */
+extern int diff_algo_myers_divide(
+	const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Patience Diff algorithm, which divides a larger diff into smaller chunks. For
+ * very specific scenarios, it may lead to a complete diff result by itself, but
+ * needs a fallback algo to solve chunks that don't have common-unique atoms. */
+extern int diff_algo_patience(
+	const struct diff_algo_config *algo_config, struct diff_state *state);
+
+/* Diff algorithms to use, possibly nested. For example:
+ *
+ * struct diff_algo_config myers, patience, myers_divide;
+ *
+ * myers = (struct diff_algo_config){
+ *         .impl = diff_algo_myers,
+ *         .permitted_state_size = 32 * 1024 * 1024,
+ *         // When too large, do diff_algo_patience:
+ *         .fallback_algo = &patience,
+ * };
+ *
+ * const struct diff_algo_config patience = (struct diff_algo_config){
+ * 	.impl = diff_algo_patience,
+ * 	// After subdivision, do Patience again:
+ * 	.inner_algo = &patience,
+ * 	// If subdivision failed, do Myers Divide et Impera:
+ * 	.fallback_algo = &myers_then_myers_divide,
+ * };
+ * 
+ * const struct diff_algo_config myers_divide = (struct diff_algo_config){
+ * 	.impl = diff_algo_myers_divide,
+ * 	// When division succeeded, start from the top:
+ * 	.inner_algo = &myers_then_myers_divide,
+ * 	// (fallback_algo = NULL implies diff_algo_none).
+ * };
+ * struct diff_config config = {
+ *         .algo = &myers,
+ *         ...
+ * };
+ * diff_main(&config, ...);
+ */
+struct diff_algo_config {
+	diff_algo_impl_t impl;
+
+	/* Fail this algo if it would use more than this amount of memory, and
+	 * instead use fallback_algo (diff_algo_myers). permitted_state_size ==
+	 * 0 means no limitation. */
+	size_t permitted_state_size;
+
+	/* For algorithms that divide into smaller chunks, use this algorithm to
+	 * solve the divided chunks. */
+	const struct diff_algo_config *inner_algo;
+
+	/* If the algorithm fails (e.g. diff_algo_myers_if_small needs too large
+	 * state, or diff_algo_patience can't find any common-unique atoms),
+	 * then use this algorithm instead. */
+	const struct diff_algo_config *fallback_algo;
+};
+
+struct diff_config {
+	diff_atomize_func_t atomize_func;
+	void *atomize_func_data;
+
+	const struct diff_algo_config *algo;
+
+	/* How deep to step into subdivisions of a source file, a paranoia /
+	 * safety measure to guard against infinite loops through diff
+	 * algorithms. When the maximum recursion is reached, employ
+	 * diff_algo_none (i.e. remove all left atoms and add all right atoms).
+	 */
+	unsigned int max_recursion_depth;
+};
+
+int diff_atomize_file(struct diff_data *d, const struct diff_config *config,
+		      FILE *f, const uint8_t *data, off_t len, int diff_flags);
+struct diff_result *diff_main(const struct diff_config *config,
+			      struct diff_data *left,
+			      struct diff_data *right);
+void diff_result_free(struct diff_result *result);
blob - /dev/null
blob + 5eaebfacda33fd3b4931e4077bfad705adb4803c (mode 644)
--- /dev/null
+++ lib/diff_myers.c
@@ -0,0 +1,1459 @@
+/* Myers diff algorithm implementation, invented by Eugene W. Myers [1].
+ * Implementations of both the Myers Divide Et Impera (using linear space)
+ * and the canonical Myers algorithm (using quadratic space). */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+
+#include "diff_internal.h"
+#include "diff_debug.h"
+
+/* Myers' diff algorithm [1] is nicely explained in [2].
+ * [1] http://www.xmailserver.org/diff2.pdf
+ * [2] https://blog.jcoglan.com/2017/02/12/the-myers-diff-algorithm-part-1/ ff.
+ *
+ * Myers approaches finding the smallest diff as a graph problem.
+ * The crux is that the original algorithm requires quadratic amount of memory:
+ * both sides' lengths added, and that squared. So if we're diffing lines of
+ * text, two files with 1000 lines each would blow up to a matrix of about
+ * 2000 * 2000 ints of state, about 16 Mb of RAM to figure out 2 kb of text.
+ * The solution is using Myers' "divide and conquer" extension algorithm, which
+ * does the original traversal from both ends of the files to reach a middle
+ * where these "snakes" touch, hence does not need to backtrace the traversal,
+ * and so gets away with only keeping a single column of that huge state matrix
+ * in memory.
+ */
+
+struct diff_box {
+	unsigned int left_start;
+	unsigned int left_end;
+	unsigned int right_start;
+	unsigned int right_end;
+};
+
+/* If the two contents of a file are A B C D E and X B C Y,
+ * the Myers diff graph looks like:
+ *
+ *   k0  k1
+ *    \   \
+ * k-1     0 1 2 3 4 5
+ *   \      A B C D E
+ *     0   o-o-o-o-o-o
+ *      X  | | | | | |
+ *     1   o-o-o-o-o-o
+ *      B  | |\| | | |
+ *     2   o-o-o-o-o-o
+ *      C  | | |\| | |
+ *     3   o-o-o-o-o-o
+ *      Y  | | | | | |\
+ *     4   o-o-o-o-o-o c1
+ *                  \ \
+ *                 c-1 c0
+ *
+ * Moving right means delete an atom from the left-hand-side,
+ * Moving down means add an atom from the right-hand-side.
+ * Diagonals indicate identical atoms on both sides, the challenge is to use as
+ * many diagonals as possible.
+ *
+ * The original Myers algorithm walks all the way from the top left to the
+ * bottom right, remembers all steps, and then backtraces to find the shortest
+ * path. However, that requires keeping the entire graph in memory, which needs
+ * quadratic space.
+ *
+ * Myers adds a variant that uses linear space -- note, not linear time, only
+ * linear space: walk forward and backward, find a meeting point in the middle,
+ * and recurse on the two separate sections. This is called "divide and
+ * conquer".
+ *
+ * d: the step number, starting with 0, a.k.a. the distance from the starting
+ *    point.
+ * k: relative index in the state array for the forward scan, indicating on
+ *    which diagonal through the diff graph we currently are.
+ * c: relative index in the state array for the backward scan, indicating the
+ *    diagonal number from the bottom up.
+ *
+ * The "divide and conquer" traversal through the Myers graph looks like this:
+ *
+ *      | d=   0   1   2   3      2   1   0
+ *  ----+--------------------------------------------
+ *  k=  |                                      c=
+ *   4  |                                       3
+ *      |
+ *   3  |                 3,0    5,2            2
+ *      |                /          \
+ *   2  |             2,0            5,3        1
+ *      |            /                 \
+ *   1  |         1,0     4,3 >= 4,3    5,4<--  0
+ *      |        /       /          \  /
+ *   0  |  -->0,0     3,3            4,4       -1
+ *      |        \   /              /
+ *  -1  |         0,1     1,2    3,4           -2
+ *      |            \   /
+ *  -2  |             0,2                      -3
+ *      |                \
+ *      |                 0,3
+ *      |  forward->                 <-backward
+ *
+ * x,y pairs here are the coordinates in the Myers graph:
+ * x = atom index in left-side source, y = atom index in the right-side source.
+ *
+ * Only one forward column and one backward column are kept in mem, each need at
+ * most left.len + 1 + right.len items.  Note that each d step occupies either
+ * the even or the odd items of a column: if e.g. the previous column is in the
+ * odd items, the next column is formed in the even items, without overwriting
+ * the previous column's results.
+ *
+ * Also note that from the diagonal index k and the x coordinate, the y
+ * coordinate can be derived:
+ *    y = x - k
+ * Hence the state array only needs to keep the x coordinate, i.e. the position
+ * in the left-hand file, and the y coordinate, i.e. position in the right-hand
+ * file, is derived from the index in the state array.
+ *
+ * The two traces meet at 4,3, the first step (here found in the forward
+ * traversal) where a forward position is on or past a backward traced position
+ * on the same diagonal.
+ *
+ * This divides the problem space into:
+ *
+ *         0 1 2 3 4 5
+ *          A B C D E
+ *     0   o-o-o-o-o
+ *      X  | | | | |
+ *     1   o-o-o-o-o
+ *      B  | |\| | |
+ *     2   o-o-o-o-o
+ *      C  | | |\| |
+ *     3   o-o-o-o-*-o   *: forward and backward meet here
+ *      Y          | |
+ *     4           o-o
+ *
+ * Doing the same on each section lead to:
+ *
+ *         0 1 2 3 4 5
+ *          A B C D E
+ *     0   o-o
+ *      X  | |
+ *     1   o-b    b: backward d=1 first reaches here (sliding up the snake)
+ *      B     \   f: then forward d=2 reaches here (sliding down the snake)
+ *     2       o     As result, the box from b to f is found to be identical;
+ *      C       \    leaving a top box from 0,0 to 1,1 and a bottom trivial
+ *     3         f-o tail 3,3 to 4,3.
+ *
+ *     3           o-*
+ *      Y            |
+ *     4             o   *: forward and backward meet here
+ *
+ * and solving the last top left box gives:
+ *
+ *         0 1 2 3 4 5
+ *          A B C D E           -A
+ *     0   o-o                  +X
+ *      X    |                   B
+ *     1     o                   C
+ *      B     \                 -D
+ *     2       o                -E
+ *      C       \               +Y
+ *     3         o-o-o
+ *      Y            |
+ *     4             o
+ *
+ */
+
+#define xk_to_y(X, K) ((X) - (K))
+#define xc_to_y(X, C, DELTA) ((X) - (C) + (DELTA))
+#define k_to_c(K, DELTA) ((K) + (DELTA))
+#define c_to_k(C, DELTA) ((C) - (DELTA))
+
+/* Do one forwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, modified by this
+ *             function.
+ *             This is carried over between invocations with increasing d.
+ *             kd_forward points at the center of the state array, allowing
+ *             negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting
+ *              point.
+ *              Since forwards is done first, kd_backward will be valid for d -
+ *              1, not d.
+ *              kd_backward points at the center of the state array, allowing
+ *              negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_forward
+ *    should be populated.
+ *    For d == 0, kd_forward[0] is initialized, i.e. the first invocation should
+ *    be for d == 0.
+ * meeting_snake: resulting meeting point, if any.
+ * Return true when a meeting point has been identified.
+ */
+static int
+diff_divide_myers_forward(bool *found_midpoint,
+			  struct diff_data *left, struct diff_data *right,
+			  int *kd_forward, int *kd_backward, int d,
+			  struct diff_box *meeting_snake)
+{
+	int delta = (int)right->atoms.len - (int)left->atoms.len;
+	int k;
+	int x;
+	int prev_x;
+	int prev_y;
+	int x_before_slide;
+	*found_midpoint = false;
+
+	debug("-- %s d=%d\n", __func__, d);
+
+	for (k = d; k >= -d; k -= 2) {
+		if (k < -(int)right->atoms.len || k > (int)left->atoms.len) {
+			/* This diagonal is completely outside of the Myers
+			 * graph, don't calculate it. */
+			if (k < -(int)right->atoms.len)
+				debug(" %d k < -(int)right->atoms.len %d\n", k,
+				      -(int)right->atoms.len);
+			else
+				debug(" %d k > left->atoms.len %d\n", k,
+				      left->atoms.len);
+			if (k < 0) {
+				/* We are traversing negatively, and already
+				 * below the entire graph, nothing will come of
+				 * this. */
+				debug(" break\n");
+				break;
+			}
+			debug(" continue\n");
+			continue;
+		}
+		debug("- k = %d\n", k);
+		if (d == 0) {
+			/* This is the initializing step. There is no prev_k
+			 * yet, get the initial x from the top left of the Myers
+			 * graph. */
+			x = 0;
+			prev_x = x;
+			prev_y = xk_to_y(x, k);
+		}
+		/* Favoring "-" lines first means favoring moving rightwards in
+		 * the Myers graph.
+		 * For this, all k should derive from k - 1, only the bottom
+		 * most k derive from k + 1:
+		 *
+		 *      | d=   0   1   2
+		 *  ----+----------------
+		 *  k=  |
+		 *   2  |             2,0 <-- from prev_k = 2 - 1 = 1
+		 *      |            /
+		 *   1  |         1,0
+		 *      |        /
+		 *   0  |  -->0,0     3,3
+		 *      |       \\   /
+		 *  -1  |         0,1 <-- bottom most for d=1 from
+		 *      |           \\    prev_k = -1 + 1 = 0
+		 *  -2  |             0,2 <-- bottom most for d=2 from
+		 *                            prev_k = -2 + 1 = -1
+		 *
+		 * Except when a k + 1 from a previous run already means a
+		 * further advancement in the graph.
+		 * If k == d, there is no k + 1 and k - 1 is the only option.
+		 * If k < d, use k + 1 in case that yields a larger x. Also use
+		 * k + 1 if k - 1 is outside the graph.
+		 */
+		else if (k > -d
+			 && (k == d
+			     || (k - 1 >= -(int)right->atoms.len
+				 && kd_forward[k - 1] >= kd_forward[k + 1]))) {
+			/* Advance from k - 1.
+			 * From position prev_k, step to the right in the Myers
+			 * graph: x += 1.
+			 */
+			int prev_k = k - 1;
+			prev_x = kd_forward[prev_k];
+			prev_y = xk_to_y(prev_x, prev_k);
+			x = prev_x + 1;
+		} else {
+			/* The bottom most one.
+			 * From position prev_k, step to the bottom in the Myers
+			 * graph: y += 1.
+			 * Incrementing y is achieved by decrementing k while
+			 * keeping the same x.
+			 * (since we're deriving y from y = x - k).
+			 */
+			int prev_k = k + 1;
+			prev_x = kd_forward[prev_k];
+			prev_y = xk_to_y(prev_x, prev_k);
+			x = prev_x;
+		}
+
+		x_before_slide = x;
+		/* Slide down any snake that we might find here. */
+		while (x < left->atoms.len && xk_to_y(x, k) < right->atoms.len) {
+			bool same;
+			int r = diff_atom_same(&same,
+					       &left->atoms.head[x],
+					       &right->atoms.head[
+						xk_to_y(x, k)]);
+			if (r)
+				return r;
+			if (!same)
+				break;
+			x++;
+		}
+		kd_forward[k] = x;
+		if (x_before_slide != x) {
+			debug("  down %d similar lines\n", x - x_before_slide);
+		}
+
+#if DEBUG
+		{
+			int fi;
+			for (fi = d; fi >= k; fi--) {
+				debug("kd_forward[%d] = (%d, %d)\n", fi,
+				      kd_forward[fi], kd_forward[fi] - fi);
+			}
+		}
+#endif
+
+		if (x < 0 || x > left->atoms.len
+		    || xk_to_y(x, k) < 0 || xk_to_y(x, k) > right->atoms.len)
+			continue;
+
+		/* Figured out a new forwards traversal, see if this has gone
+		 * onto or even past a preceding backwards traversal.
+		 *
+		 * If the delta in length is odd, then d and backwards_d hit the
+		 * same state indexes:
+		 *      | d=   0   1   2      1   0
+		 *  ----+----------------    ----------------
+		 *  k=  |                              c=
+		 *   4  |                               3
+		 *      |
+		 *   3  |                               2
+		 *      |                same
+		 *   2  |             2,0====5,3        1
+		 *      |            /          \
+		 *   1  |         1,0            5,4<-- 0
+		 *      |        /              /
+		 *   0  |  -->0,0     3,3====4,4       -1
+		 *      |        \   /
+		 *  -1  |         0,1                  -2
+		 *      |            \
+		 *  -2  |             0,2              -3
+		 *      |
+		 *
+		 * If the delta is even, they end up off-by-one, i.e. on
+		 * different diagonals:
+		 *
+		 *      | d=   0   1   2    1   0
+		 *  ----+----------------  ----------------
+		 *      |                            c=
+		 *   3  |                             3
+		 *      |
+		 *   2  |             2,0 off         2
+		 *      |            /   \\
+		 *   1  |         1,0      4,3        1
+		 *      |        /       //   \
+		 *   0  |  -->0,0     3,3      4,4<-- 0
+		 *      |        \   /        /
+		 *  -1  |         0,1      3,4       -1
+		 *      |            \   //
+		 *  -2  |             0,2            -2
+		 *      |
+		 *
+		 * So in the forward path, we can only match up diagonals when
+		 * the delta is odd.
+		 */
+		if ((delta & 1) == 0)
+			continue;
+		 /* Forwards is done first, so the backwards one was still at
+		  * d - 1. Can't do this for d == 0. */
+		int backwards_d = d - 1;
+		if (backwards_d < 0)
+			continue;
+
+		debug("backwards_d = %d\n", backwards_d);
+
+		/* If both sides have the same length, forward and backward
+		 * start on the same diagonal, meaning the backwards state index
+		 * c == k.
+		 * As soon as the lengths are not the same, the backwards
+		 * traversal starts on a different diagonal, and c = k shifted
+		 * by the difference in length.
+		 */
+		int c = k_to_c(k, delta);
+
+		/* When the file sizes are very different, the traversal trees
+		 * start on far distant diagonals.
+		 * They don't necessarily meet straight on. See whether this
+		 * forward value is on a diagonal that is also valid in
+		 * kd_backward[], and match them if so. */
+		if (c >= -backwards_d && c <= backwards_d) {
+			/* Current k is on a diagonal that exists in
+			 * kd_backward[]. If the two x positions have met or
+			 * passed (forward walked onto or past backward), then
+			 * we've found a midpoint / a mid-box.
+			 *
+			 * When forwards and backwards traversals meet, the
+			 * endpoints of the mid-snake are not the two points in
+			 * kd_forward and kd_backward, but rather the section
+			 * that was slid (if any) of the current
+			 * forward/backward traversal only.
+			 *
+			 * For example:
+			 *
+			 *   o
+			 *    \
+			 *     o
+			 *      \
+			 *       o
+			 *        \
+			 *         o
+			 *          \
+			 *       X o o
+			 *       | | |
+			 *     o-o-o o
+			 *          \|
+			 *           M
+			 *            \
+			 *             o
+			 *              \
+			 *               A o
+			 *               | |
+			 *             o-o-o
+			 *
+			 * The forward traversal reached M from the top and slid
+			 * downwards to A.  The backward traversal already
+			 * reached X, which is not a straight line from M
+			 * anymore, so picking a mid-snake from M to X would
+			 * yield a mistake.
+			 *
+			 * The correct mid-snake is between M and A. M is where
+			 * the forward traversal hit the diagonal that the
+			 * backward traversal has already passed, and A is what
+			 * it reaches when sliding down identical lines.
+			 */
+			int backward_x = kd_backward[c];
+			debug("Compare:  k=%d c=%d  is (%d,%d) >= (%d,%d)?\n",
+			      k, c, x, xk_to_y(x, k), backward_x,
+			      xc_to_y(backward_x, c, delta));
+			if (x >= backward_x) {
+				if (x_before_slide != x) {
+					/* met after sliding up a mid-snake */
+					*meeting_snake = (struct diff_box){
+						.left_start = x_before_slide,
+						.left_end = x,
+						.right_start = xc_to_y(x_before_slide,
+								       c, delta),
+						.right_end = xk_to_y(x, k),
+					};
+				} else {
+					/* met after a side step, non-identical
+					 * line. Mark that as box divider
+					 * instead. This makes sure that
+					 * myers_divide never returns the same
+					 * box that came as input, avoiding
+					 * "infinite" looping. */
+					*meeting_snake = (struct diff_box){
+						.left_start = prev_x,
+						.left_end = x,
+						.right_start = prev_y,
+						.right_end = xk_to_y(x, k),
+					};
+				}
+				debug("HIT x=(%u,%u) - y=(%u,%u)\n",
+				      meeting_snake->left_start,
+				      meeting_snake->right_start,
+				      meeting_snake->left_end,
+				      meeting_snake->right_end);
+				debug_dump_myers_graph(left, right, NULL,
+						       kd_forward, d,
+						       kd_backward, d-1);
+				*found_midpoint = true;
+				return 0;
+			}
+		}
+	}
+
+	debug_dump_myers_graph(left, right, NULL, kd_forward, d,
+			       kd_backward, d-1);
+	return 0;
+}
+
+/* Do one backwards step in the "divide and conquer" graph traversal.
+ * left: the left side to diff.
+ * right: the right side to diff against.
+ * kd_forward: the traversal state for forwards traversal, to find a meeting
+ *             point.
+ *             Since forwards is done first, after this, both kd_forward and
+ *             kd_backward will be valid for d.
+ *             kd_forward points at the center of the state array, allowing
+ *             negative indexes.
+ * kd_backward: the traversal state for backwards traversal, to find a meeting
+ *              point.
+ *              This is carried over between invocations with increasing d.
+ *              kd_backward points at the center of the state array, allowing
+ *              negative indexes.
+ * d: Step or distance counter, indicating for what value of d the kd_backward
+ *    should be populated.
+ *    Before the first invocation, kd_backward[0] shall point at the bottom
+ *    right of the Myers graph (left.len, right.len).
+ *    The first invocation will be for d == 1.
+ * meeting_snake: resulting meeting point, if any.
+ * Return true when a meeting point has been identified.
+ */
+static int
+diff_divide_myers_backward(bool *found_midpoint,
+			   struct diff_data *left, struct diff_data *right,
+			   int *kd_forward, int *kd_backward, int d,
+			   struct diff_box *meeting_snake)
+{
+	int delta = (int)right->atoms.len - (int)left->atoms.len;
+	int c;
+	int x;
+	int prev_x;
+	int prev_y;
+	int x_before_slide;
+
+	*found_midpoint = false;
+
+	debug("-- %s d=%d\n", __func__, d);
+
+	for (c = d; c >= -d; c -= 2) {
+		if (c < -(int)left->atoms.len || c > (int)right->atoms.len) {
+			/* This diagonal is completely outside of the Myers
+			 * graph, don't calculate it. */
+			if (c < -(int)left->atoms.len)
+				debug(" %d c < -(int)left->atoms.len %d\n", c,
+				      -(int)left->atoms.len);
+			else
+				debug(" %d c > right->atoms.len %d\n", c,
+				      right->atoms.len);
+			if (c < 0) {
+				/* We are traversing negatively, and already
+				 * below the entire graph, nothing will come of
+				 * this. */
+				debug(" break\n");
+				break;
+			}
+			debug(" continue\n");
+			continue;
+		}
+		debug("- c = %d\n", c);
+		if (d == 0) {
+			/* This is the initializing step. There is no prev_c
+			 * yet, get the initial x from the bottom right of the
+			 * Myers graph. */
+			x = left->atoms.len;
+			prev_x = x;
+			prev_y = xc_to_y(x, c, delta);
+		}
+		/* Favoring "-" lines first means favoring moving rightwards in
+		 * the Myers graph.
+		 * For this, all c should derive from c - 1, only the bottom
+		 * most c derive from c + 1:
+		 *
+		 *                                  2   1   0
+		 *  ---------------------------------------------------
+		 *                                               c=
+		 *                                                3
+		 *
+		 *         from prev_c = c - 1 --> 5,2            2
+		 *                                    \
+		 *                                     5,3        1
+		 *                                        \
+		 *                                 4,3     5,4<-- 0
+		 *                                    \   /
+		 *  bottom most for d=1 from c + 1 --> 4,4       -1
+		 *                                    /
+		 *         bottom most for d=2 --> 3,4           -2
+		 *
+		 * Except when a c + 1 from a previous run already means a
+		 * further advancement in the graph.
+		 * If c == d, there is no c + 1 and c - 1 is the only option.
+		 * If c < d, use c + 1 in case that yields a larger x.
+		 * Also use c + 1 if c - 1 is outside the graph.
+		 */
+		else if (c > -d && (c == d
+				    || (c - 1 >= -(int)right->atoms.len
+					&& kd_backward[c - 1] <= kd_backward[c + 1]))) {
+			/* A top one.
+			 * From position prev_c, step upwards in the Myers
+			 * graph: y -= 1.
+			 * Decrementing y is achieved by incrementing c while
+			 * keeping the same x. (since we're deriving y from
+			 * y = x - c + delta).
+			 */
+			int prev_c = c - 1;
+			prev_x = kd_backward[prev_c];
+			prev_y = xc_to_y(prev_x, prev_c, delta);
+			x = prev_x;
+		} else {
+			/* The bottom most one.
+			 * From position prev_c, step to the left in the Myers
+			 * graph: x -= 1.
+			 */
+			int prev_c = c + 1;
+			prev_x = kd_backward[prev_c];
+			prev_y = xc_to_y(prev_x, prev_c, delta);
+			x = prev_x - 1;
+		}
+
+		/* Slide up any snake that we might find here (sections of
+		 * identical lines on both sides). */
+		debug("c=%d x-1=%d Yb-1=%d-1=%d\n", c, x-1, xc_to_y(x, c,
+								    delta),
+		      xc_to_y(x, c, delta)-1);
+		if (x > 0) {
+			debug("  l=");
+			debug_dump_atom(left, right, &left->atoms.head[x-1]);
+		}
+		if (xc_to_y(x, c, delta) > 0) {
+			debug("  r=");
+			debug_dump_atom(right, left,
+				&right->atoms.head[xc_to_y(x, c, delta)-1]);
+		}
+		x_before_slide = x;
+		while (x > 0 && xc_to_y(x, c, delta) > 0) {
+			bool same;
+			int r = diff_atom_same(&same,
+					       &left->atoms.head[x-1],
+					       &right->atoms.head[
+						xc_to_y(x, c, delta)-1]);
+			if (r)
+				return r;
+			if (!same)
+				break;
+			x--;
+		}
+		kd_backward[c] = x;
+		if (x_before_slide != x) {
+			debug("  up %d similar lines\n", x_before_slide - x);
+		}
+
+		if (DEBUG) {
+			int fi;
+			for (fi = d; fi >= c; fi--) {
+				debug("kd_backward[%d] = (%d, %d)\n",
+				      fi,
+				      kd_backward[fi],
+				      kd_backward[fi] - fi + delta);
+			}
+		}
+
+		if (x < 0 || x > left->atoms.len
+		    || xc_to_y(x, c, delta) < 0
+		    || xc_to_y(x, c, delta) > right->atoms.len)
+			continue;
+
+		/* Figured out a new backwards traversal, see if this has gone
+		 * onto or even past a preceding forwards traversal.
+		 *
+		 * If the delta in length is even, then d and backwards_d hit
+		 * the same state indexes -- note how this is different from in
+		 * the forwards traversal, because now both d are the same:
+		 *
+		 *      | d=   0   1   2      2   1   0
+		 *  ----+----------------    --------------------
+		 *  k=  |                                  c=
+		 *   4  |
+		 *      |
+		 *   3  |                                   3
+		 *      |                same
+		 *   2  |             2,0====5,2            2
+		 *      |            /          \
+		 *   1  |         1,0            5,3        1
+		 *      |        /              /  \
+		 *   0  |  -->0,0     3,3====4,3    5,4<--  0
+		 *      |        \   /             /
+		 *  -1  |         0,1            4,4       -1
+		 *      |            \
+		 *  -2  |             0,2                  -2
+		 *      |
+		 *                                      -3
+		 * If the delta is odd, they end up off-by-one, i.e. on
+		 * different diagonals.
+		 * So in the backward path, we can only match up diagonals when
+		 * the delta is even.
+		 */
+		if ((delta & 1) != 0)
+			continue;
+		/* Forwards was done first, now both d are the same. */
+		int forwards_d = d;
+
+		/* As soon as the lengths are not the same, the
+		 * backwards traversal starts on a different diagonal,
+		 * and c = k shifted by the difference in length.
+		 */
+		int k = c_to_k(c, delta);
+
+		/* When the file sizes are very different, the traversal trees
+		 * start on far distant diagonals.
+		 * They don't necessarily meet straight on. See whether this
+		 * backward value is also on a valid diagonal in kd_forward[],
+		 * and match them if so. */
+		if (k >= -forwards_d && k <= forwards_d) {
+			/* Current c is on a diagonal that exists in
+			 * kd_forward[]. If the two x positions have met or
+			 * passed (backward walked onto or past forward), then
+			 * we've found a midpoint / a mid-box.
+			 *
+			 * When forwards and backwards traversals meet, the
+			 * endpoints of the mid-snake are not the two points in
+			 * kd_forward and kd_backward, but rather the section
+			 * that was slid (if any) of the current
+			 * forward/backward traversal only.
+			 *
+			 * For example:
+			 *
+			 *   o-o-o
+			 *   | |
+			 *   o A
+			 *   |  \
+			 *   o   o
+			 *        \
+			 *         M
+			 *         |\
+			 *         o o-o-o
+			 *         | | |
+			 *         o o X
+			 *          \
+			 *           o
+			 *            \
+			 *             o
+			 *              \
+			 *               o
+			 *
+			 * The backward traversal reached M from the bottom and
+			 * slid upwards.  The forward traversal already reached
+			 * X, which is not a straight line from M anymore, so
+			 * picking a mid-snake from M to X would yield a
+			 * mistake.
+			 *
+			 * The correct mid-snake is between M and A. M is where
+			 * the backward traversal hit the diagonal that the
+			 * forwards traversal has already passed, and A is what
+			 * it reaches when sliding up identical lines.
+			 */
+
+			int forward_x = kd_forward[k];
+			debug("Compare:  k=%d c=%d  is (%d,%d) >= (%d,%d)?\n",
+			      k, c, forward_x, xk_to_y(forward_x, k),
+			      x, xc_to_y(x, c, delta));
+			if (forward_x >= x) {
+				if (x_before_slide != x) {
+					/* met after sliding down a mid-snake */
+					*meeting_snake = (struct diff_box){
+						.left_start = x,
+						.left_end = x_before_slide,
+						.right_start = xc_to_y(x, c, delta),
+						.right_end = xk_to_y(x_before_slide, k),
+					};
+				} else {
+					/* met after a side step, non-identical
+					 * line. Mark that as box divider
+					 * instead. This makes sure that
+					 * myers_divide never returns the same
+					 * box that came as input, avoiding
+					 * "infinite" looping. */
+					*meeting_snake = (struct diff_box){
+						.left_start = x,
+						.left_end = prev_x,
+						.right_start = xc_to_y(x, c, delta),
+						.right_end = prev_y,
+					};
+				}
+				debug("HIT x=%u,%u - y=%u,%u\n",
+				      meeting_snake->left_start,
+				      meeting_snake->right_start,
+				      meeting_snake->left_end,
+				      meeting_snake->right_end);
+				debug_dump_myers_graph(left, right, NULL,
+						       kd_forward, d,
+						       kd_backward, d);
+				*found_midpoint = true;
+				return 0;
+			}
+		}
+	}
+	debug_dump_myers_graph(left, right, NULL, kd_forward, d, kd_backward,
+			       d);
+	return 0;
+}
+
+/* Integer square root approximation */
+static int
+shift_sqrt(int val)
+{
+	int i;
+        for (i = 1; val > 0; val >>= 2)
+		i <<= 1;
+        return i;
+}
+
+/* Myers "Divide et Impera": tracing forwards from the start and backwards from
+ * the end to find a midpoint that divides the problem into smaller chunks.
+ * Requires only linear amounts of memory. */
+int
+diff_algo_myers_divide(const struct diff_algo_config *algo_config,
+		       struct diff_state *state)
+{
+	int rc = ENOMEM;
+	struct diff_data *left = &state->left;
+	struct diff_data *right = &state->right;
+	int *kd_buf;
+
+	debug("\n** %s\n", __func__);
+	debug("left:\n");
+	debug_dump(left);
+	debug("right:\n");
+	debug_dump(right);
+	debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);
+
+	/* Allocate two columns of a Myers graph, one for the forward and one
+	 * for the backward traversal. */
+	unsigned int max = left->atoms.len + right->atoms.len;
+	size_t kd_len = max + 1;
+	size_t kd_buf_size = kd_len << 1;
+
+	if (state->kd_buf_size < kd_buf_size) {
+		kd_buf = reallocarray(state->kd_buf, kd_buf_size,
+		    sizeof(int));
+		if (!kd_buf)
+			return ENOMEM;
+		state->kd_buf = kd_buf;
+		state->kd_buf_size = kd_buf_size;
+	} else
+		kd_buf = state->kd_buf;
+	int i;
+	for (i = 0; i < kd_buf_size; i++)
+		kd_buf[i] = -1;
+	int *kd_forward = kd_buf;
+	int *kd_backward = kd_buf + kd_len;
+	int max_effort = shift_sqrt(max/2);
+
+	/* The 'k' axis in Myers spans positive and negative indexes, so point
+	 * the kd to the middle.
+	 * It is then possible to index from -max/2 .. max/2. */
+	kd_forward += max/2;
+	kd_backward += max/2;
+
+	int d;
+	struct diff_box mid_snake = {};
+	bool found_midpoint = false;
+	for (d = 0; d <= (max/2); d++) {
+		int r;
+		debug("-- d=%d\n", d);
+		r = diff_divide_myers_forward(&found_midpoint, left, right,
+					      kd_forward, kd_backward, d,
+					      &mid_snake);
+		if (r)
+			return r;
+		if (found_midpoint)
+			break;
+		r = diff_divide_myers_backward(&found_midpoint, left, right,
+					       kd_forward, kd_backward, d,
+					       &mid_snake);
+		if (r)
+			return r;
+		if (found_midpoint)
+			break;
+
+		/* Limit the effort spent looking for a mid snake. If files have
+		 * very few lines in common, the effort spent to find nice mid
+		 * snakes is just not worth it, the diff result will still be
+		 * essentially minus everything on the left, plus everything on
+		 * the right, with a few useless matches here and there. */
+		if (d > max_effort) {
+			/* pick the furthest reaching point from
+			 * kd_forward and kd_backward, and use that as a
+			 * midpoint, to not step into another diff algo
+			 * recursion with unchanged box. */
+			int delta = (int)right->atoms.len - (int)left->atoms.len;
+			int x = 0;
+			int y;
+			int i;
+			int best_forward_i = 0;
+			int best_forward_distance = 0;
+			int best_backward_i = 0;
+			int best_backward_distance = 0;
+			int distance;
+			int best_forward_x;
+			int best_forward_y;
+			int best_backward_x;
+			int best_backward_y;
+
+			debug("~~~ HIT d = %d > max_effort = %d\n", d, max_effort);
+			debug_dump_myers_graph(left, right, NULL,
+					       kd_forward, d,
+					       kd_backward, d);
+
+			for (i = d; i >= -d; i -= 2) {
+				if (i >= -(int)right->atoms.len && i <= (int)left->atoms.len) {
+					x = kd_forward[i];
+					y = xk_to_y(x, i);
+					distance = x + y;
+					if (distance > best_forward_distance) {
+						best_forward_distance = distance;
+						best_forward_i = i;
+					}
+				}
+
+				if (i >= -(int)left->atoms.len && i <= (int)right->atoms.len) {
+					x = kd_backward[i];
+					y = xc_to_y(x, i, delta);
+					distance = (right->atoms.len - x)
+						+ (left->atoms.len - y);
+					if (distance >= best_backward_distance) {
+						best_backward_distance = distance;
+						best_backward_i = i;
+					}
+				}
+			}
+
+			/* The myers-divide didn't meet in the middle. We just
+			 * figured out the places where the forward path
+			 * advanced the most, and the backward path advanced the
+			 * most. Just divide at whichever one of those two is better.
+			 *
+			 *   o-o
+			 *     |
+			 *     o
+			 *      \
+			 *       o
+			 *        \
+			 *         F <-- cut here
+			 *
+			 *
+			 *
+			 *           or here --> B
+			 *                        \
+			 *                         o
+			 *                          \
+			 *                           o
+			 *                           |
+			 *                           o-o
+			 */
+			best_forward_x = kd_forward[best_forward_i];
+			best_forward_y = xk_to_y(best_forward_x, best_forward_i);
+			best_backward_x = kd_backward[best_backward_i];
+			best_backward_y = xc_to_y(best_backward_x, best_backward_i, delta);
+
+			if (best_forward_distance >= best_backward_distance) {
+				x = best_forward_x;
+				y = best_forward_y;
+			} else {
+				x = best_backward_x;
+				y = best_backward_y;
+			}
+
+			debug("max_effort cut at x=%d y=%d\n", x, y);
+			if (x < 0 || y < 0
+			    || x > left->atoms.len || y > right->atoms.len)
+				break;
+
+			found_midpoint = true;
+			mid_snake = (struct diff_box){
+				.left_start = x,
+				.left_end = x,
+				.right_start = y,
+				.right_end = y,
+			};
+			break;
+		}
+	}
+
+	if (!found_midpoint) {
+		/* Divide and conquer failed to find a meeting point. Use the
+		 * fallback_algo defined in the algo_config (leave this to the
+		 * caller). This is just paranoia/sanity, we normally should
+		 * always find a midpoint.
+		 */
+		debug(" no midpoint \n");
+		rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+		goto return_rc;
+	} else {
+		debug(" mid snake L: %u to %u of %u   R: %u to %u of %u\n",
+		      mid_snake.left_start, mid_snake.left_end, left->atoms.len,
+		      mid_snake.right_start, mid_snake.right_end,
+		      right->atoms.len);
+
+		/* Section before the mid-snake.  */
+		debug("Section before the mid-snake\n");
+
+		struct diff_atom *left_atom = &left->atoms.head[0];
+		unsigned int left_section_len = mid_snake.left_start;
+		struct diff_atom *right_atom = &right->atoms.head[0];
+		unsigned int right_section_len = mid_snake.right_start;
+
+		if (left_section_len && right_section_len) {
+			/* Record an unsolved chunk, the caller will apply
+			 * inner_algo() on this chunk. */
+			if (!diff_state_add_chunk(state, false,
+						  left_atom, left_section_len,
+						  right_atom,
+						  right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a
+			 * "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a
+			 * "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom,
+						  right_section_len))
+				goto return_rc;
+		}
+		/* else: left_section_len == 0 and right_section_len == 0, i.e.
+		 * nothing before the mid-snake. */
+
+		if (mid_snake.left_end > mid_snake.left_start
+		    || mid_snake.right_end > mid_snake.right_start) {
+			/* The midpoint is a section of identical data on both
+			 * sides, or a certain differing line: that section
+			 * immediately becomes a solved chunk. */
+			debug("the mid-snake\n");
+			if (!diff_state_add_chunk(state, true,
+				  &left->atoms.head[mid_snake.left_start],
+				  mid_snake.left_end - mid_snake.left_start,
+				  &right->atoms.head[mid_snake.right_start],
+				  mid_snake.right_end - mid_snake.right_start))
+				goto return_rc;
+		}
+
+		/* Section after the mid-snake. */
+		debug("Section after the mid-snake\n");
+		debug("  left_end %u  right_end %u\n",
+		      mid_snake.left_end, mid_snake.right_end);
+		debug("  left_count %u  right_count %u\n",
+		      left->atoms.len, right->atoms.len);
+		left_atom = &left->atoms.head[mid_snake.left_end];
+		left_section_len = left->atoms.len - mid_snake.left_end;
+		right_atom = &right->atoms.head[mid_snake.right_end];
+		right_section_len = right->atoms.len - mid_snake.right_end;
+
+		if (left_section_len && right_section_len) {
+			/* Record an unsolved chunk, the caller will apply
+			 * inner_algo() on this chunk. */
+			if (!diff_state_add_chunk(state, false,
+						  left_atom, left_section_len,
+						  right_atom,
+						  right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a
+			 * "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a
+			 * "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom,
+						  right_section_len))
+				goto return_rc;
+		}
+		/* else: left_section_len == 0 and right_section_len == 0, i.e.
+		 * nothing after the mid-snake. */
+	}
+
+	rc = DIFF_RC_OK;
+
+return_rc:
+	debug("** END %s\n", __func__);
+	return rc;
+}
+
+/* Myers Diff tracing from the start all the way through to the end, requiring
+ * quadratic amounts of memory. This can fail if the required space surpasses
+ * algo_config->permitted_state_size. */
+int
+diff_algo_myers(const struct diff_algo_config *algo_config,
+		struct diff_state *state)
+{
+	/* do a diff_divide_myers_forward() without a _backward(), so that it
+	 * walks forward across the entire files to reach the end. Keep each
+	 * run's state, and do a final backtrace. */
+	int rc = ENOMEM;
+	struct diff_data *left = &state->left;
+	struct diff_data *right = &state->right;
+	int *kd_buf;
+
+	debug("\n** %s\n", __func__);
+	debug("left:\n");
+	debug_dump(left);
+	debug("right:\n");
+	debug_dump(right);
+	debug_dump_myers_graph(left, right, NULL, NULL, 0, NULL, 0);
+
+	/* Allocate two columns of a Myers graph, one for the forward and one
+	 * for the backward traversal. */
+	unsigned int max = left->atoms.len + right->atoms.len;
+	size_t kd_len = max + 1 + max;
+	size_t kd_buf_size = kd_len * kd_len;
+	size_t kd_state_size = kd_buf_size * sizeof(int);
+	debug("state size: %zu\n", kd_state_size);
+	if (kd_buf_size < kd_len /* overflow? */
+	    || kd_state_size > algo_config->permitted_state_size) {
+		debug("state size %zu > permitted_state_size %zu, use fallback_algo\n",
+		      kd_state_size, algo_config->permitted_state_size);
+		return DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+	}
+
+	if (state->kd_buf_size < kd_buf_size) {
+		kd_buf = reallocarray(state->kd_buf, kd_buf_size,
+		    sizeof(int));
+		if (!kd_buf)
+			return ENOMEM;
+		state->kd_buf = kd_buf;
+		state->kd_buf_size = kd_buf_size;
+	} else
+		kd_buf = state->kd_buf;
+
+	int i;
+	for (i = 0; i < kd_buf_size; i++)
+		kd_buf[i] = -1;
+
+	/* The 'k' axis in Myers spans positive and negative indexes, so point
+	 * the kd to the middle.
+	 * It is then possible to index from -max .. max. */
+	int *kd_origin = kd_buf + max;
+	int *kd_column = kd_origin;
+
+	int d;
+	int backtrack_d = -1;
+	int backtrack_k = 0;
+	int k;
+	int x, y;
+	for (d = 0; d <= max; d++, kd_column += kd_len) {
+		debug("-- d=%d\n", d);
+
+		debug("-- %s d=%d\n", __func__, d);
+
+		for (k = d; k >= -d; k -= 2) {
+			if (k < -(int)right->atoms.len
+			    || k > (int)left->atoms.len) {
+				/* This diagonal is completely outside of the
+				 * Myers graph, don't calculate it. */
+				if (k < -(int)right->atoms.len)
+					debug(" %d k <"
+					      " -(int)right->atoms.len %d\n",
+					      k, -(int)right->atoms.len);
+				else
+					debug(" %d k > left->atoms.len %d\n", k,
+					      left->atoms.len);
+				if (k < 0) {
+					/* We are traversing negatively, and
+					 * already below the entire graph,
+					 * nothing will come of this. */
+					debug(" break\n");
+					break;
+				}
+				debug(" continue\n");
+				continue;
+			}
+
+			debug("- k = %d\n", k);
+			if (d == 0) {
+				/* This is the initializing step. There is no
+				 * prev_k yet, get the initial x from the top
+				 * left of the Myers graph. */
+				x = 0;
+			} else {
+				int *kd_prev_column = kd_column - kd_len;
+
+				/* Favoring "-" lines first means favoring
+				 * moving rightwards in the Myers graph.
+				 * For this, all k should derive from k - 1,
+				 * only the bottom most k derive from k + 1:
+				 *
+				 *      | d=   0   1   2
+				 *  ----+----------------
+				 *  k=  |
+				 *   2  |             2,0 <-- from
+				 *      |            /        prev_k = 2 - 1 = 1
+				 *   1  |         1,0
+				 *      |        /
+				 *   0  |  -->0,0     3,3
+				 *      |       \\   /
+				 *  -1  |         0,1 <-- bottom most for d=1
+				 *      |           \\    from prev_k = -1+1 = 0
+				 *  -2  |             0,2 <-- bottom most for
+				 *                            d=2 from
+				 *                            prev_k = -2+1 = -1
+				 *
+				 * Except when a k + 1 from a previous run
+				 * already means a further advancement in the
+				 * graph.
+				 * If k == d, there is no k + 1 and k - 1 is the
+				 * only option.
+				 * If k < d, use k + 1 in case that yields a
+				 * larger x. Also use k + 1 if k - 1 is outside
+				 * the graph.
+				 */
+				if (k > -d
+				    && (k == d
+					|| (k - 1 >= -(int)right->atoms.len
+					    && kd_prev_column[k - 1]
+					       >= kd_prev_column[k + 1]))) {
+					/* Advance from k - 1.
+					 * From position prev_k, step to the
+					 * right in the Myers graph: x += 1.
+					 */
+					int prev_k = k - 1;
+					int prev_x = kd_prev_column[prev_k];
+					x = prev_x + 1;
+				} else {
+					/* The bottom most one.
+					 * From position prev_k, step to the
+					 * bottom in the Myers graph: y += 1.
+					 * Incrementing y is achieved by
+					 * decrementing k while keeping the same
+					 * x. (since we're deriving y from y =
+					 * x - k).
+					 */
+					int prev_k = k + 1;
+					int prev_x = kd_prev_column[prev_k];
+					x = prev_x;
+				}
+			}
+
+			/* Slide down any snake that we might find here. */
+			while (x < left->atoms.len
+			       && xk_to_y(x, k) < right->atoms.len) {
+				bool same;
+				int r = diff_atom_same(&same,
+						       &left->atoms.head[x],
+						       &right->atoms.head[
+							xk_to_y(x, k)]);
+				if (r)
+					return r;
+				if (!same)
+					break;
+				x++;
+			}
+			kd_column[k] = x;
+
+			if (DEBUG) {
+				int fi;
+				for (fi = d; fi >= k; fi-=2) {
+					debug("kd_column[%d] = (%d, %d)\n", fi,
+					      kd_column[fi],
+					      kd_column[fi] - fi);
+				}
+			}
+
+			if (x == left->atoms.len
+			    && xk_to_y(x, k) == right->atoms.len) {
+				/* Found a path */
+				backtrack_d = d;
+				backtrack_k = k;
+				debug("Reached the end at d = %d, k = %d\n",
+				      backtrack_d, backtrack_k);
+				break;
+			}
+		}
+
+		if (backtrack_d >= 0)
+			break;
+	}
+
+	debug_dump_myers_graph(left, right, kd_origin, NULL, 0, NULL, 0);
+
+	/* backtrack. A matrix spanning from start to end of the file is ready:
+	 *
+	 *      | d=   0   1   2   3   4
+	 *  ----+---------------------------------
+	 *  k=  |
+	 *   3  |
+	 *      |
+	 *   2  |             2,0
+	 *      |            /
+	 *   1  |         1,0     4,3
+	 *      |        /       /   \
+	 *   0  |  -->0,0     3,3     4,4 --> backtrack_d = 4, backtrack_k = 0
+	 *      |        \   /   \
+	 *  -1  |         0,1     3,4
+	 *      |            \
+	 *  -2  |             0,2
+	 *      |
+	 *
+	 * From (4,4) backwards, find the previous position that is the largest, and remember it.
+	 *
+	 */
+	for (d = backtrack_d, k = backtrack_k; d >= 0; d--) {
+		x = kd_column[k];
+		y = xk_to_y(x, k);
+
+		/* When the best position is identified, remember it for that
+		 * kd_column.
+		 * That kd_column is no longer needed otherwise, so just
+		 * re-purpose kd_column[0] = x and kd_column[1] = y,
+		 * so that there is no need to allocate more memory.
+		 */
+		kd_column[0] = x;
+		kd_column[1] = y;
+		debug("Backtrack d=%d: xy=(%d, %d)\n",
+		      d, kd_column[0], kd_column[1]);
+
+		/* Don't access memory before kd_buf */
+		if (d == 0)
+			break;
+		int *kd_prev_column = kd_column - kd_len;
+
+		/* When y == 0, backtracking downwards (k-1) is the only way.
+		 * When x == 0, backtracking upwards (k+1) is the only way.
+		 *
+		 *      | d=   0   1   2   3   4
+		 *  ----+---------------------------------
+		 *  k=  |
+		 *   3  |
+		 *      |                ..y == 0
+		 *   2  |             2,0
+		 *      |            /
+		 *   1  |         1,0     4,3
+		 *      |        /       /   \
+		 *   0  |  -->0,0     3,3     4,4 --> backtrack_d = 4,
+		 *      |        \   /   \            backtrack_k = 0
+		 *  -1  |         0,1     3,4
+		 *      |            \
+		 *  -2  |             0,2__
+		 *      |                  x == 0
+		 */
+		if (y == 0
+		    || (x > 0
+			&& kd_prev_column[k - 1] >= kd_prev_column[k + 1])) {
+			k = k - 1;
+			debug("prev k=k-1=%d x=%d y=%d\n",
+			      k, kd_prev_column[k],
+			      xk_to_y(kd_prev_column[k], k));
+		} else {
+			k = k + 1;
+			debug("prev k=k+1=%d x=%d y=%d\n",
+			      k, kd_prev_column[k],
+			      xk_to_y(kd_prev_column[k], k));
+		}
+		kd_column = kd_prev_column;
+	}
+
+	/* Forwards again, this time recording the diff chunks.
+	 * Definitely start from 0,0. kd_column[0] may actually point to the
+	 * bottom of a snake starting at 0,0 */
+	x = 0;
+	y = 0;
+
+	kd_column = kd_origin;
+	for (d = 0; d <= backtrack_d; d++, kd_column += kd_len) {
+		int next_x = kd_column[0];
+		int next_y = kd_column[1];
+		debug("Forward track from xy(%d,%d) to xy(%d,%d)\n",
+		      x, y, next_x, next_y);
+
+		struct diff_atom *left_atom = &left->atoms.head[x];
+		int left_section_len = next_x - x;
+		struct diff_atom *right_atom = &right->atoms.head[y];
+		int right_section_len = next_y - y;
+
+		rc = ENOMEM;
+		if (left_section_len && right_section_len) {
+			/* This must be a snake slide.
+			 * Snake slides have a straight line leading into them
+			 * (except when starting at (0,0)). Find out whether the
+			 * lead-in is horizontal or vertical:
+			 *
+			 *     left
+			 *  ---------->
+			 *  |
+			 * r|   o-o        o
+			 * i|      \       |
+			 * g|       o      o
+			 * h|        \      \
+			 * t|         o      o
+			 *  v
+			 *
+			 * If left_section_len > right_section_len, the lead-in
+			 * is horizontal, meaning first remove one atom from the
+			 * left before sliding down the snake.
+			 * If right_section_len > left_section_len, the lead-in
+			 * is vetical, so add one atom from the right before
+			 * sliding down the snake. */
+			if (left_section_len == right_section_len + 1) {
+				if (!diff_state_add_chunk(state, true,
+							  left_atom, 1,
+							  right_atom, 0))
+					goto return_rc;
+				left_atom++;
+				left_section_len--;
+			} else if (right_section_len == left_section_len + 1) {
+				if (!diff_state_add_chunk(state, true,
+							  left_atom, 0,
+							  right_atom, 1))
+					goto return_rc;
+				right_atom++;
+				right_section_len--;
+			} else if (left_section_len != right_section_len) {
+				/* The numbers are making no sense. Should never
+				 * happen. */
+				rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+				goto return_rc;
+			}
+
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom,
+						  right_section_len))
+				goto return_rc;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a
+			 * "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto return_rc;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a
+			 * "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom,
+						  right_section_len))
+				goto return_rc;
+		}
+
+		x = next_x;
+		y = next_y;
+	}
+
+	rc = DIFF_RC_OK;
+
+return_rc:
+	debug("** END %s rc=%d\n", __func__, rc);
+	return rc;
+}
blob - /dev/null
blob + 92c42e1de1fdc900fa3a3e8cb4cbc92cacb295c7 (mode 644)
--- /dev/null
+++ lib/diff_output.c
@@ -0,0 +1,328 @@
+/* Common parts for printing diff output */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+#include <diff_output.h>
+
+#include "diff_internal.h"
+
+static int
+get_atom_byte(int *ch, struct diff_atom *atom, off_t off)
+{
+	off_t cur;
+
+	if (atom->at != NULL) {
+		*ch = atom->at[off];
+		return 0;
+	}
+
+	cur = ftello(atom->root->f);
+	if (cur == -1)
+		return errno;
+
+	if (cur != atom->pos + off &&
+	    fseeko(atom->root->f, atom->pos + off, SEEK_SET) == -1)
+		return errno;
+
+	*ch = fgetc(atom->root->f);
+	if (*ch == EOF && ferror(atom->root->f))
+		return errno;
+
+	return 0;
+}
+
+int
+diff_output_lines(struct diff_output_info *outinfo, FILE *dest,
+		  const char *prefix, struct diff_atom *start_atom,
+		  unsigned int count)
+{
+	struct diff_atom *atom;
+	off_t outoff = 0, *offp;
+	int rc;
+
+	if (outinfo && outinfo->line_offsets.len > 0) {
+		unsigned int idx = outinfo->line_offsets.len - 1;
+		outoff = outinfo->line_offsets.head[idx];
+	}
+
+	foreach_diff_atom(atom, start_atom, count) {
+		off_t outlen = 0;
+		int i, ch;
+		unsigned int len = atom->len;
+		rc = fprintf(dest, "%s", prefix);
+		if (rc < 0)
+			return errno;
+		outlen += rc;
+		if (len) {
+			rc = get_atom_byte(&ch, atom, len - 1);
+			if (rc)
+				return rc;
+			if (ch == '\n')
+				len--;
+			if (len) {
+				rc = get_atom_byte(&ch, atom, len - 1);
+				if (rc)
+					return rc;
+				if (ch == '\r')
+					len--;
+			}
+		}
+
+		for (i = 0; i < len; i++) {
+			rc = get_atom_byte(&ch, atom, i);
+			if (rc)
+				return rc;
+			rc = fprintf(dest, "%c", (unsigned char)ch);
+			if (rc < 0)
+				return errno;
+			outlen += rc;
+		}
+		rc = fprintf(dest, "\n");
+		if (rc < 0)
+			return errno;
+		outlen += rc;
+		if (outinfo) {
+			ARRAYLIST_ADD(offp, outinfo->line_offsets);
+			if (offp == NULL)
+				return ENOMEM;
+			outoff += outlen;
+			*offp = outoff;
+		}
+	}
+
+	return DIFF_RC_OK;
+}
+
+int
+diff_output_chunk_left_version(struct diff_output_info **output_info,
+			       FILE *dest,
+			       const struct diff_input_info *info,
+			       const struct diff_result *result,
+			       const struct diff_chunk_context *cc)
+{
+	int rc, c_idx;
+	struct diff_output_info *outinfo = NULL;
+
+	if (diff_range_empty(&cc->left))
+		return DIFF_RC_OK;
+
+	if (output_info) {
+		*output_info = diff_output_info_alloc();
+		if (*output_info == NULL)
+			return ENOMEM;
+		outinfo = *output_info;
+	}
+
+	/* Write out all chunks on the left side. */
+	for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
+		const struct diff_chunk *c = &result->chunks.head[c_idx];
+
+		if (c->left_count) {
+			rc = diff_output_lines(outinfo, dest, "",
+			    c->left_start, c->left_count);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return DIFF_RC_OK;
+}
+
+int
+diff_output_chunk_right_version(struct diff_output_info **output_info,
+				FILE *dest,
+				const struct diff_input_info *info,
+				const struct diff_result *result,
+				const struct diff_chunk_context *cc)
+{
+	int rc, c_idx;
+	struct diff_output_info *outinfo = NULL;
+
+	if (diff_range_empty(&cc->right))
+		return DIFF_RC_OK;
+
+	if (output_info) {
+		*output_info = diff_output_info_alloc();
+		if (*output_info == NULL)
+			return ENOMEM;
+		outinfo = *output_info;
+	}
+
+	/* Write out all chunks on the right side. */
+	for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
+		const struct diff_chunk *c = &result->chunks.head[c_idx];
+
+		if (c->right_count) {
+			rc = diff_output_lines(outinfo, dest, "", c->right_start,
+			    c->right_count);
+			if (rc)
+				return rc;
+		}
+	}
+
+	return DIFF_RC_OK;
+}
+
+int
+diff_output_trailing_newline_msg(struct diff_output_info *outinfo, FILE *dest,
+				 const struct diff_chunk *c)
+{
+	enum diff_chunk_type chunk_type = diff_chunk_type(c);
+	struct diff_atom *atom, *start_atom;
+	unsigned int atom_count;
+	int rc, ch;
+	off_t outoff = 0, *offp;
+
+	if (chunk_type == CHUNK_MINUS || chunk_type == CHUNK_SAME) {
+		start_atom = c->left_start;
+		atom_count = c->left_count;
+	} else if (chunk_type == CHUNK_PLUS) {
+		start_atom = c->right_start;
+		atom_count = c->right_count;
+	} else
+		return EINVAL;
+
+	/* Locate the last atom. */
+	if (atom_count == 0)
+		return EINVAL;
+	atom = &start_atom[atom_count - 1];
+
+	rc = get_atom_byte(&ch, atom, atom->len - 1);
+	if (rc != DIFF_RC_OK)
+		return rc;
+
+	if (ch != '\n') {
+		if (outinfo && outinfo->line_offsets.len > 0) {
+			unsigned int idx = outinfo->line_offsets.len - 1;
+			outoff = outinfo->line_offsets.head[idx];
+		}
+		rc = fprintf(dest, "\\ No newline at end of file\n");
+		if (rc < 0)
+			return errno;
+		if (outinfo) {
+			ARRAYLIST_ADD(offp, outinfo->line_offsets);
+			if (offp == NULL)
+				return ENOMEM;
+			outoff += rc;
+			*offp = outoff;
+		}
+	}
+
+	return DIFF_RC_OK;
+}
+
+static bool
+is_function_prototype(const char *buf)
+{
+	return isalpha(buf[0]) || buf[0] == '_' || buf[0] == '$';
+}
+
+#define FUNCTION_CONTEXT_SIZE	55
+#define begins_with(s, pre) (strncmp(s, pre, sizeof(pre)-1) == 0)
+
+int
+diff_output_match_function_prototype(char **prototype,
+    const struct diff_result *result,
+    const struct diff_chunk_context *cc)
+{
+	struct diff_atom *start_atom, *atom;
+	const struct diff_data *data;
+	unsigned char buf[FUNCTION_CONTEXT_SIZE];
+	char *state = NULL;
+	int rc, i;
+
+	*prototype = NULL;
+
+	if (result->left->atoms.len > 0 && cc->left.start > 0) {
+		data = result->left;
+		start_atom = &data->atoms.head[cc->left.start - 1];
+	} else if (result->right->atoms.len > 0 && cc->right.start > 0) {
+		data = result->right;
+		start_atom = &data->atoms.head[cc->right.start - 1];
+	} else
+		return DIFF_RC_OK;
+
+	diff_data_foreach_atom_backwards_from(start_atom, atom, data) {
+		for (i = 0; i < atom->len && i < sizeof(buf) - 1; i++) {
+			unsigned int ch;
+			rc = get_atom_byte(&ch, atom, i);
+			if (rc)
+				return rc;
+			if (ch == '\n')
+				break;
+			buf[i] = ch;
+		}
+		buf[i] = '\0';
+		if (is_function_prototype(buf)) {
+			if (begins_with(buf, "private:")) {
+				if (!state)
+					state = " (private)";
+			} else if (begins_with(buf, "protected:")) {
+				if (!state)
+					state = " (protected)";
+			} else if (begins_with(buf, "public:")) {
+				if (!state)
+					state = " (public)";
+			} else {
+				if (state)  /* don't care about truncation */
+					strlcat(buf, state, sizeof(buf));
+				*prototype = strdup(buf);
+				if (*prototype == NULL)
+					return ENOMEM;
+				return DIFF_RC_OK;
+			}
+		}
+	}
+
+	return DIFF_RC_OK;
+}
+
+struct diff_output_info *
+diff_output_info_alloc(void)
+{
+	struct diff_output_info *output_info;
+	off_t *offp;
+
+	output_info = malloc(sizeof(*output_info));
+	if (output_info != NULL) {
+		ARRAYLIST_INIT(output_info->line_offsets, 128);
+		ARRAYLIST_ADD(offp, output_info->line_offsets);
+		if (offp == NULL) {
+			diff_output_info_free(output_info);
+			return NULL;
+		}
+		*offp = 0;
+	}
+	return output_info;
+}
+
+void
+diff_output_info_free(struct diff_output_info *output_info)
+{
+	ARRAYLIST_FREE(output_info->line_offsets);
+	free(output_info);
+}
blob - ace2137e04157dfffb2d383748e8d9561c074ac1
blob + 6a04918dc0d9b4b4220243d06ed37d787b17cd4a
--- lib/got_lib_diff.h
+++ lib/got_lib_diff.h
@@ -1,137 +1,43 @@
-
-
-/*ROR
- * Copyright (c) 1991, 1993
- *	The Regents of the University of California.  All rights reserved.
+/*
+ * Copyright (c) 2020 Stefan Sperling <stsp@openbsd.org>
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
  *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)diff.h	8.1 (Berkeley) 6/6/93
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  */
 
-#include <sys/types.h>
-#include <regex.h>
+#include "arraylist.h"
+#include "diff_main.h"
+#include "diff_output.h"
 
-/*
- * Output format options
- */
-#define	D_NORMAL	0	/* Normal output */
-#define	D_UNIFIED	3	/* Unified context diff */
-#define	D_BRIEF		6	/* Say if the files differ */
-
-/*
- * Output flags
- */
-#define	D_HEADER	0x001	/* Print a header/footer between files */
-#define	D_EMPTY1	0x002	/* Treat first file as empty (/dev/null) */
-#define	D_EMPTY2	0x004	/* Treat second file as empty (/dev/null) */
-
-/*
- * Command line flags
- */
-#define D_FORCEASCII	0x008	/* Treat file as ascii regardless of content */
-#define D_FOLDBLANKS	0x010	/* Treat all white space as equal */
-#define D_MINIMAL	0x020	/* Make diff as small as possible */
-#define D_IGNORECASE	0x040	/* Case-insensitive matching */
-#define D_PROTOTYPE	0x080	/* Display C function prototype */
-#define D_EXPANDTABS	0x100	/* Expand tabs to spaces */
-#define D_IGNOREBLANKS	0x200	/* Ignore white space changes */
-
-/*
- * Status values for got_diffreg() return values
- */
-#define	D_SAME		0	/* Files are the same */
-#define	D_DIFFER	1	/* Files are different */
-#define	D_BINARY	2	/* Binary files are different */
-#define	D_MISMATCH1	3	/* path1 was a dir, path2 a file */
-#define	D_MISMATCH2	4	/* path1 was a file, path2 a dir */
-#define	D_SKIPPED1	5	/* path1 was a special file */
-#define	D_SKIPPED2	6	/* path2 was a special file */
-
-struct excludes {
-	char *pattern;
-	struct excludes *next;
+enum got_diff_algorithm {
+	GOT_DIFF_ALGORITHM_MYERS,
+	GOT_DIFF_ALGORITHM_PATIENCE,
 };
 
-/*
- * The following struct is used to record change information when
- * doing a "context" or "unified" diff.  (see routine "change" to
- * understand the highly mnemonic field names)
- */
-struct context_vec {
-	int	a;		/* start line in old file */
-	int	b;		/* end line in old file */
-	int	c;		/* start line in new file */
-	int	d;		/* end line in new file */
+enum got_diff_output_format {
+	GOT_DIFF_OUTPUT_UNIDIFF,
+	GOT_DIFF_OUTPUT_EDSCRIPT,
 };
 
-struct got_diff_change {
-	SIMPLEQ_ENTRY(got_diff_change) entry;
-	struct context_vec cv;
-};
-
-struct got_diff_changes {
-	int nchanges;
-	SIMPLEQ_HEAD(, got_diff_change) entries;
-};
-
-struct got_diff_state {
-	int  *J;			/* will be overlaid on class */
-	int  *class;		/* will be overlaid on file[0] */
-	int  *klist;		/* will be overlaid on file[0] after class */
-	int  *member;		/* will be overlaid on file[1] */
-	int   clen;
-	int   len[2];
-	int   pref, suff;	/* length of prefix and suffix */
-	int   slen[2];
-	int   anychange;
-	long *ixnew;		/* will be overlaid on file[1] */
-	long *ixold;		/* will be overlaid on klist */
-	struct cand *clist;	/* merely a free storage pot for candidates */
-	int   clistlen;		/* the length of clist */
-	struct line *sfile[2];	/* shortened by pruning common prefix/suffix */
-	u_char *chrtran;		/* translation table for case-folding */
-	struct context_vec *context_vec_start;
-	struct context_vec *context_vec_end;
-	struct context_vec *context_vec_ptr;
-	struct line *file[2];
-#define FUNCTION_CONTEXT_SIZE	55
-	char lastbuf[FUNCTION_CONTEXT_SIZE];
-	int lastline;
-	int lastmatchline;
-	struct stat stb1, stb2;
-	size_t max_context;
-};
-
-void got_diff_state_free(struct got_diff_state *);
-
-struct got_diff_args {
-	int	 Tflag;
-	int	 diff_format, diff_context, status;
-	char	 *diffargs;
-	const char *label[2];
+struct got_diffreg_result {
+	struct diff_result *result;
+	FILE *f1;
+	char *map1;
+	size_t size1;
+	FILE *f2;
+	char *map2;
+	size_t size2;
+	struct diff_data left;
+	struct diff_data right;
 };
 
 #define GOT_DIFF_CONFLICT_MARKER_BEGIN	"<<<<<<<"
@@ -139,22 +45,33 @@ struct got_diff_args {
 #define GOT_DIFF_CONFLICT_MARKER_SEP	"======="
 #define GOT_DIFF_CONFLICT_MARKER_END	">>>>>>>"
 
-const struct got_error *got_diffreg(int *, FILE *,
-    FILE *, int, struct got_diff_args *, struct got_diff_state *, FILE *,
-    struct got_diff_changes *);
-
-const struct got_error *got_diff_blob_lines_changed(struct got_diff_changes **,
-    struct got_blob_object *, struct got_blob_object *);
-const struct got_error *got_diff_blob_file_lines_changed(struct got_diff_changes **,
-    struct got_blob_object *, FILE *, size_t);
-void got_diff_free_changes(struct got_diff_changes *);
+const struct diff_config *got_diff_get_config(enum got_diff_algorithm);
+const struct got_error *got_diff_prepare_file(FILE **, char **, int *,
+    size_t *, struct diff_data *, const struct diff_config *, int); 
+const struct got_error *got_diffreg_prepared_files(struct got_diffreg_result **,
+    const struct diff_config *, struct diff_data *, FILE *, char *, size_t,
+    struct diff_data *, FILE *, char *, size_t);
+const struct got_error *got_diff_blob_prepared_file(
+    struct got_diffreg_result **, struct diff_data *, struct got_blob_object *,
+    struct diff_data *, FILE *, char *, size_t, const struct diff_config *,
+    int);
+const struct got_error *got_diffreg(struct got_diffreg_result **, FILE *, FILE *,
+    enum got_diff_algorithm, int);
+const struct got_error *got_diffreg_output(off_t **, size_t *,
+    struct got_diffreg_result *, FILE *, FILE *, const char *, const char *,
+    enum got_diff_output_format, int, FILE *);
+const struct got_error *got_diffreg_result_free(struct got_diffreg_result *);
+const struct got_error *got_diffreg_result_free_left(
+    struct got_diffreg_result *);
+const struct got_error *got_diffreg_result_free_right(
+    struct got_diffreg_result *);
+const struct got_error *got_diffreg_close(FILE *, char *, size_t,
+    FILE *, char *, size_t);
 
 const struct got_error *got_merge_diff3(int *, int, const char *, const char *,
     const char *, const char *, const char *, const char *);
 
-const struct got_error *got_diff_files(struct got_diff_changes **,
-    struct got_diff_state **, struct got_diff_args **, int *, FILE *, size_t,
-    const char *, FILE *, size_t, const char *, int, FILE *);
+const struct got_error *got_diff_files(struct got_diffreg_result **, FILE *,
+    const char *, FILE *, const char *, int, int, FILE *);
 
-void got_diff_dump_change(FILE *, struct got_diff_change *,
-    struct got_diff_state *, struct got_diff_args *, FILE *, FILE *, int);
+void got_diff_dump_change(FILE *, struct diff_chunk *, FILE *, FILE *);
blob - /dev/null
blob + 0fa63c3ef700e304d4e3c5950765ea9afd5d8a58 (mode 644)
--- /dev/null
+++ lib/diff_output.h
@@ -0,0 +1,88 @@
+/* Diff output generators and invocation shims. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+struct diff_input_info {
+	const char *left_path;
+	const char *right_path;
+};
+
+struct diff_output_info {
+	/*
+	 * Byte offset to each line in the generated output file.
+	 * The total number of lines in the file is line_offsets.len - 1.
+	 * The last offset in this array corresponds to end-of-file.
+	 */
+	ARRAYLIST(off_t) line_offsets;
+};
+
+void diff_output_info_free(struct diff_output_info *output_info);
+
+struct diff_chunk_context {
+	struct diff_range chunk;
+	struct diff_range left, right;
+};
+
+int diff_output_plain(struct diff_output_info **output_info, FILE *dest,
+			const struct diff_input_info *info,
+			const struct diff_result *result);
+int diff_output_unidiff(struct diff_output_info **output_info,
+			FILE *dest, const struct diff_input_info *info,
+			const struct diff_result *result,
+			unsigned int context_lines);
+int diff_output_edscript(struct diff_output_info **output_info,
+			 FILE *dest, const struct diff_input_info *info,
+			 const struct diff_result *result);
+int diff_chunk_get_left_start(const struct diff_chunk *c,
+			      const struct diff_result *r,
+			      int context_lines);
+int diff_chunk_get_left_end(const struct diff_chunk *c,
+			    const struct diff_result *r,
+			    int context_lines);
+int diff_chunk_get_right_start(const struct diff_chunk *c,
+			       const struct diff_result *r,
+			       int context_lines);
+int diff_chunk_get_right_end(const struct diff_chunk *c,
+			     const struct diff_result *r,
+			     int context_lines);
+void diff_chunk_context_get(struct diff_chunk_context *cc,
+				 const struct diff_result *r,
+				 int chunk_idx, int context_lines);
+void diff_chunk_context_load_change(struct diff_chunk_context *cc,
+				    int *nchunks_used,
+				    struct diff_result *result,
+				    int start_chunk_idx,
+				    int context_lines);
+
+struct diff_output_unidiff_state;
+struct diff_output_unidiff_state *diff_output_unidiff_state_alloc(void);
+void diff_output_unidiff_state_reset(struct diff_output_unidiff_state *state);
+void diff_output_unidiff_state_free(struct diff_output_unidiff_state *state);
+int diff_output_unidiff_chunk(struct diff_output_info **output_info, FILE *dest,
+			  struct diff_output_unidiff_state *state,
+			  const struct diff_input_info *info,
+			  const struct diff_result *result,
+			  const struct diff_chunk_context *cc);
+int diff_output_chunk_left_version(struct diff_output_info **output_info,
+			       FILE *dest,
+			       const struct diff_input_info *info,
+			       const struct diff_result *result,
+			       const struct diff_chunk_context *cc);
+int diff_output_chunk_right_version(struct diff_output_info **output_info,
+				FILE *dest,
+				const struct diff_input_info *info,
+				const struct diff_result *result,
+				const struct diff_chunk_context *cc);
blob - /dev/null
blob + b6e9089caf74f10a59526427cb0c2b8d1fa47a03 (mode 644)
--- /dev/null
+++ lib/diff_output_edscript.c
@@ -0,0 +1,162 @@
+/* Produce ed(1) script output from a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ * Copyright (c) 2020 Stefan Sperling <stsp@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+#include <diff_output.h>
+
+#include "diff_internal.h"
+
+static int
+output_edscript_chunk(struct diff_output_info *outinfo,
+    FILE *dest, const struct diff_input_info *info,
+    const struct diff_result *result,
+    struct diff_chunk_context *cc)
+{
+	off_t outoff = 0, *offp;
+	int left_start, left_len, right_start, right_len;
+	int rc;
+
+	left_len = cc->left.end - cc->left.start;
+	if (left_len < 0)
+		return EINVAL;
+	else if (result->left->atoms.len == 0)
+		left_start = 0;
+	else if (left_len == 0 && cc->left.start > 0)
+		left_start = cc->left.start;
+	else
+		left_start = cc->left.start + 1;
+
+	right_len = cc->right.end - cc->right.start;
+	if (right_len < 0)
+		return EINVAL;
+	else if (result->right->atoms.len == 0)
+		right_start = 0;
+	else if (right_len == 0 && cc->right.start > 0)
+		right_start = cc->right.start;
+	else
+		right_start = cc->right.start + 1;
+
+	if (left_len == 0) {
+		/* addition */
+		if (right_len == 1) {
+			rc = fprintf(dest, "%da%d\n", left_start, right_start);
+		} else {
+			rc = fprintf(dest, "%da%d,%d\n", left_start,
+			    right_start, cc->right.end);
+		}
+	} else if (right_len == 0) {
+		/* deletion */
+		if (left_len == 1) {
+			rc = fprintf(dest, "%dd%d\n", left_start,
+			    right_start);
+		} else {
+			rc = fprintf(dest, "%d,%dd%d\n", left_start,
+			    cc->left.end, right_start);
+		}
+	} else {
+		/* change */
+		if (left_len == 1 && right_len == 1) {
+			rc = fprintf(dest, "%dc%d\n", left_start, right_start);
+		} else if (left_len == 1) {
+			rc = fprintf(dest, "%dc%d,%d\n", left_start,
+			    right_start, cc->right.end);
+		} else if (right_len == 1) {
+			rc = fprintf(dest, "%d,%dc%d\n", left_start,
+			    cc->left.end, right_start);
+		} else {
+			rc = fprintf(dest, "%d,%dc%d,%d\n", left_start,
+			    cc->left.end, right_start, cc->right.end);
+		}
+	}
+	if (rc < 0)
+		return errno;
+	if (outinfo) {
+		ARRAYLIST_ADD(offp, outinfo->line_offsets);
+		if (offp == NULL)
+			return ENOMEM;
+		outoff += rc;
+		*offp = outoff;
+	}
+
+	return DIFF_RC_OK;
+}
+
+int
+diff_output_edscript(struct diff_output_info **output_info,
+    FILE *dest, const struct diff_input_info *info,
+    const struct diff_result *result)
+{
+	struct diff_output_info *outinfo = NULL;
+	struct diff_chunk_context cc = {};
+	int i, rc;
+
+	if (!result)
+		return EINVAL;
+	if (result->rc != DIFF_RC_OK)
+		return result->rc;
+
+	if (output_info) {
+		*output_info = diff_output_info_alloc();
+		if (*output_info == NULL)
+			return ENOMEM;
+		outinfo = *output_info;
+	}
+
+	for (i = 0; i < result->chunks.len; i++) {
+		struct diff_chunk *chunk = &result->chunks.head[i];
+		enum diff_chunk_type t = diff_chunk_type(chunk);
+		struct diff_chunk_context next;
+
+		if (t != CHUNK_MINUS && t != CHUNK_PLUS)
+			continue;
+
+		if (diff_chunk_context_empty(&cc)) {
+			/* Note down the start point, any number of subsequent
+			 * chunks may be joined up to this chunk by being
+			 * directly adjacent. */
+			diff_chunk_context_get(&cc, result, i, 0);
+			continue;
+		}
+
+		/* There already is a previous chunk noted down for being
+		 * printed. Does it join up with this one? */
+		diff_chunk_context_get(&next, result, i, 0);
+
+		if (diff_chunk_contexts_touch(&cc, &next)) {
+			/* This next context touches or overlaps the previous
+			 * one, join. */
+			diff_chunk_contexts_merge(&cc, &next);
+			continue;
+		}
+
+		rc = output_edscript_chunk(outinfo, dest, info, result, &cc);
+		if (rc != DIFF_RC_OK)
+			return rc;
+		cc = next;
+	}
+
+	if (!diff_chunk_context_empty(&cc))
+		return output_edscript_chunk(outinfo, dest, info, result, &cc);
+	return DIFF_RC_OK;
+}
blob - /dev/null
blob + cc478ba4dd635825a2cf8235364f9ddcebb2f6aa (mode 644)
--- /dev/null
+++ lib/diff_output_plain.c
@@ -0,0 +1,68 @@
+/* Output all lines of a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdlib.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+#include <diff_output.h>
+
+#include "diff_internal.h"
+
+int
+diff_output_plain(struct diff_output_info **output_info, FILE *dest,
+		 const struct diff_input_info *info,
+		 const struct diff_result *result)
+{
+	struct diff_output_info *outinfo = NULL;
+	int i, rc;
+
+	if (!result)
+		return EINVAL;
+	if (result->rc != DIFF_RC_OK)
+		return result->rc;
+	
+	if (output_info) {
+		*output_info = diff_output_info_alloc();
+		if (*output_info == NULL)
+			return errno;
+		outinfo = *output_info;
+	}
+
+	for (i = 0; i < result->chunks.len; i++) {
+		struct diff_chunk *c = &result->chunks.head[i];
+		if (c->left_count && c->right_count)
+			rc = diff_output_lines(outinfo, dest,
+					  c->solved ? " " : "?",
+					  c->left_start, c->left_count);
+		else if (c->left_count && !c->right_count)
+			rc = diff_output_lines(outinfo, dest,
+					  c->solved ? "-" : "?",
+					  c->left_start, c->left_count);
+		else if (c->right_count && !c->left_count)
+			rc = diff_output_lines(outinfo, dest,
+					  c->solved ? "+" : "?",
+					  c->right_start, c->right_count);
+		if (rc)
+			return rc;
+	}
+	return DIFF_RC_OK;
+}
blob - /dev/null
blob + c7749377c1c41c1fee60256db169cee8ece8fd9e (mode 644)
--- /dev/null
+++ lib/diff_output_unidiff.c
@@ -0,0 +1,498 @@
+/* Produce a unidiff output from a diff_result. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+#include <diff_output.h>
+
+#include "diff_internal.h"
+#include "diff_debug.h"
+
+bool
+diff_chunk_context_empty(const struct diff_chunk_context *cc)
+{
+	return diff_range_empty(&cc->chunk);
+}
+
+int
+diff_chunk_get_left_start(const struct diff_chunk *c,
+    const struct diff_result *r, int context_lines)
+{
+	int left_start = diff_atom_root_idx(r->left, c->left_start);
+	return MAX(0, left_start - context_lines);
+}
+
+int
+diff_chunk_get_left_end(const struct diff_chunk *c,
+    const struct diff_result *r, int context_lines)
+{
+	int left_start = diff_chunk_get_left_start(c, r, 0);
+	return MIN(r->left->atoms.len,
+	    left_start + c->left_count + context_lines);
+}
+
+int
+diff_chunk_get_right_start(const struct diff_chunk *c,
+    const struct diff_result *r, int context_lines)
+{
+	int right_start = diff_atom_root_idx(r->right, c->right_start);
+	return MAX(0, right_start - context_lines);
+}
+
+int
+diff_chunk_get_right_end(const struct diff_chunk *c,
+    const struct diff_result *r, int context_lines)
+{
+	int right_start = diff_chunk_get_right_start(c, r, 0);
+	return MIN(r->right->atoms.len,
+	    right_start + c->right_count + context_lines);
+}
+
+void
+diff_chunk_context_get(struct diff_chunk_context *cc, const struct diff_result *r,
+		  int chunk_idx, int context_lines)
+{
+	const struct diff_chunk *c = &r->chunks.head[chunk_idx];
+	int left_start = diff_chunk_get_left_start(c, r, context_lines);
+	int left_end = diff_chunk_get_left_end(c, r, context_lines);
+	int right_start = diff_chunk_get_right_start(c, r, context_lines);
+	int right_end = diff_chunk_get_right_end(c, r,  context_lines);
+
+	*cc = (struct diff_chunk_context){
+		.chunk = {
+			.start = chunk_idx,
+			.end = chunk_idx + 1,
+		},
+		.left = {
+			.start = left_start,
+			.end = left_end,
+		},
+		.right = {
+			.start = right_start,
+			.end = right_end,
+		},
+	};
+}
+
+bool
+diff_chunk_contexts_touch(const struct diff_chunk_context *cc,
+			  const struct diff_chunk_context *other)
+{
+	return diff_ranges_touch(&cc->chunk, &other->chunk)
+		|| diff_ranges_touch(&cc->left, &other->left)
+		|| diff_ranges_touch(&cc->right, &other->right);
+}
+
+void
+diff_chunk_contexts_merge(struct diff_chunk_context *cc,
+			  const struct diff_chunk_context *other)
+{
+	diff_ranges_merge(&cc->chunk, &other->chunk);
+	diff_ranges_merge(&cc->left, &other->left);
+	diff_ranges_merge(&cc->right, &other->right);
+}
+
+void
+diff_chunk_context_load_change(struct diff_chunk_context *cc,
+			       int *nchunks_used,
+			       struct diff_result *result,
+			       int start_chunk_idx,
+			       int context_lines)
+{
+	int i;
+	int seen_minus = 0, seen_plus = 0;
+
+	if (nchunks_used)
+		*nchunks_used = 0;
+
+	for (i = start_chunk_idx; i < result->chunks.len; i++) {
+		struct diff_chunk *chunk = &result->chunks.head[i];
+		enum diff_chunk_type t = diff_chunk_type(chunk);
+		struct diff_chunk_context next;
+
+		if (t != CHUNK_MINUS && t != CHUNK_PLUS) {
+			if (nchunks_used)
+				(*nchunks_used)++;
+			if (seen_minus || seen_plus)
+				break;
+			else
+				continue;
+		} else if (t == CHUNK_MINUS)
+			seen_minus = 1;
+		else if (t == CHUNK_PLUS)
+			seen_plus = 1;
+
+		if (diff_chunk_context_empty(cc)) {
+			/* Note down the start point, any number of subsequent
+			 * chunks may be joined up to this chunk by being
+			 * directly adjacent. */
+			diff_chunk_context_get(cc, result, i, context_lines);
+			if (nchunks_used)
+				(*nchunks_used)++;
+			continue;
+		}
+
+		/* There already is a previous chunk noted down for being
+		 * printed. Does it join up with this one? */
+		diff_chunk_context_get(&next, result, i, context_lines);
+
+		if (diff_chunk_contexts_touch(cc, &next)) {
+			/* This next context touches or overlaps the previous
+			 * one, join. */
+			diff_chunk_contexts_merge(cc, &next);
+			if (nchunks_used)
+				(*nchunks_used)++;
+			continue;
+		} else
+			break;
+	}
+}
+
+struct diff_output_unidiff_state {
+	bool header_printed;
+};
+
+struct diff_output_unidiff_state *
+diff_output_unidiff_state_alloc(void)
+{
+	struct diff_output_unidiff_state *state;
+
+	state = calloc(1, sizeof(struct diff_output_unidiff_state));
+	if (state != NULL)
+		diff_output_unidiff_state_reset(state);
+	return state;
+}
+
+void
+diff_output_unidiff_state_reset(struct diff_output_unidiff_state *state)
+{
+	state->header_printed = false;
+}
+
+void
+diff_output_unidiff_state_free(struct diff_output_unidiff_state *state)
+{
+	free(state);
+}
+
+static int
+output_unidiff_chunk(struct diff_output_info *outinfo, FILE *dest,
+		     struct diff_output_unidiff_state *state,
+		     const struct diff_input_info *info,
+		     const struct diff_result *result,
+		     bool print_header, bool show_function_prototypes,
+		     const struct diff_chunk_context *cc)
+{
+	int rc, left_start, left_len, right_start, right_len;
+	off_t outoff = 0, *offp;
+	char *prototype = NULL;
+
+	if (diff_range_empty(&cc->left) && diff_range_empty(&cc->right))
+		return DIFF_RC_OK;
+
+	if (outinfo && outinfo->line_offsets.len > 0) {
+		unsigned int idx = outinfo->line_offsets.len - 1;
+		outoff = outinfo->line_offsets.head[idx];
+	}
+
+	if (print_header && !(state->header_printed)) {
+		rc = fprintf(dest, "--- %s\n", info->left_path ? : "a");
+		if (rc < 0)
+			return errno;
+		if (outinfo) {
+			ARRAYLIST_ADD(offp, outinfo->line_offsets);
+			if (offp == NULL)
+				return ENOMEM;
+			outoff += rc;
+			*offp = outoff;
+
+		}
+		rc = fprintf(dest, "+++ %s\n", info->right_path ? : "b");
+		if (rc < 0)
+			return errno;
+		if (outinfo) {
+			ARRAYLIST_ADD(offp, outinfo->line_offsets);
+			if (offp == NULL)
+				return ENOMEM;
+			outoff += rc;
+			*offp = outoff;
+
+		}
+		state->header_printed = true;
+	}
+
+	left_len = cc->left.end - cc->left.start;
+	if (result->left->atoms.len == 0)
+		left_start = 0;
+	else if (left_len == 0 && cc->left.start > 0)
+		left_start = cc->left.start;
+	else
+		left_start = cc->left.start + 1;
+
+	right_len = cc->right.end - cc->right.start;
+	if (result->right->atoms.len == 0)
+		right_start = 0;
+	else if (right_len == 0 && cc->right.start > 0)
+		right_start = cc->right.start;
+	else
+		right_start = cc->right.start + 1;
+
+	if (show_function_prototypes) {
+		rc = diff_output_match_function_prototype(&prototype,
+		    result, cc);
+		if (rc)
+			return rc;
+	}
+
+	if (left_len == 1 && right_len == 1) {
+		rc = fprintf(dest, "@@ -%d +%d @@%s%s\n",
+			left_start, right_start,
+			prototype ? " " : "",
+			prototype ? : "");
+	} else if (left_len == 1 && right_len != 1) {
+		rc = fprintf(dest, "@@ -%d +%d,%d @@%s%s\n",
+			left_start, right_start, right_len,
+			prototype ? " " : "",
+			prototype ? : "");
+	} else if (left_len != 1 && right_len == 1) {
+		rc = fprintf(dest, "@@ -%d,%d +%d @@%s%s\n",
+			left_start, left_len, right_start,
+			prototype ? " " : "",
+			prototype ? : "");
+	} else {
+		rc = fprintf(dest, "@@ -%d,%d +%d,%d @@%s%s\n",
+			left_start, left_len, right_start, right_len,
+			prototype ? " " : "",
+			prototype ? : "");
+	}
+	free(prototype);
+	if (rc < 0)
+		return errno;
+	if (outinfo) {
+		ARRAYLIST_ADD(offp, outinfo->line_offsets);
+		if (offp == NULL)
+			return ENOMEM;
+		outoff += rc;
+		*offp = outoff;
+
+	}
+
+	/* Got the absolute line numbers where to start printing, and the index
+	 * of the interesting (non-context) chunk.
+	 * To print context lines above the interesting chunk, nipping on the
+	 * previous chunk index may be necessary.
+	 * It is guaranteed to be only context lines where left == right, so it
+	 * suffices to look on the left. */
+	const struct diff_chunk *first_chunk;
+	int chunk_start_line;
+	first_chunk = &result->chunks.head[cc->chunk.start];
+	chunk_start_line = diff_atom_root_idx(result->left,
+					      first_chunk->left_start);
+	if (cc->left.start < chunk_start_line) {
+		rc = diff_output_lines(outinfo, dest, " ",
+				  &result->left->atoms.head[cc->left.start],
+				  chunk_start_line - cc->left.start);
+		if (rc)
+			return rc;
+	}
+
+	/* Now write out all the joined chunks and contexts between them */
+	int c_idx;
+	for (c_idx = cc->chunk.start; c_idx < cc->chunk.end; c_idx++) {
+		const struct diff_chunk *c = &result->chunks.head[c_idx];
+
+		if (c->left_count && c->right_count)
+			rc = diff_output_lines(outinfo, dest,
+					  c->solved ? " " : "?",
+					  c->left_start, c->left_count);
+		else if (c->left_count && !c->right_count)
+			rc = diff_output_lines(outinfo, dest,
+					  c->solved ? "-" : "?",
+					  c->left_start, c->left_count);
+		else if (c->right_count && !c->left_count)
+			rc = diff_output_lines(outinfo, dest,
+					  c->solved ? "+" : "?",
+					  c->right_start, c->right_count);
+		if (rc)
+			return rc;
+
+		if (cc->chunk.end == result->chunks.len) {
+			rc = diff_output_trailing_newline_msg(outinfo, dest, c);
+			if (rc != DIFF_RC_OK)
+				return rc;
+		}
+	}
+
+	/* Trailing context? */
+	const struct diff_chunk *last_chunk;
+	int chunk_end_line;
+	last_chunk = &result->chunks.head[cc->chunk.end - 1];
+	chunk_end_line = diff_atom_root_idx(result->left,
+					    last_chunk->left_start
+					    + last_chunk->left_count);
+	if (cc->left.end > chunk_end_line) {
+		rc = diff_output_lines(outinfo, dest, " ",
+				  &result->left->atoms.head[chunk_end_line],
+				  cc->left.end - chunk_end_line);
+		if (rc)
+			return rc;
+	}
+
+	return DIFF_RC_OK;
+}
+
+int
+diff_output_unidiff_chunk(struct diff_output_info **output_info, FILE *dest,
+			  struct diff_output_unidiff_state *state,
+			  const struct diff_input_info *info,
+			  const struct diff_result *result,
+			  const struct diff_chunk_context *cc)
+{
+	struct diff_output_info *outinfo = NULL;
+	int flags = (result->left->root->diff_flags |
+	    result->right->root->diff_flags);
+	bool show_function_prototypes = (flags & DIFF_FLAG_SHOW_PROTOTYPES);
+
+	if (output_info) {
+		*output_info = diff_output_info_alloc();
+		if (*output_info == NULL)
+			return ENOMEM;
+		outinfo = *output_info;
+	}
+
+	return output_unidiff_chunk(outinfo, dest, state, info,
+	    result, false, show_function_prototypes, cc);
+}
+
+int
+diff_output_unidiff(struct diff_output_info **output_info,
+		    FILE *dest, const struct diff_input_info *info,
+		    const struct diff_result *result,
+		    unsigned int context_lines)
+{
+	struct diff_output_unidiff_state *state;
+	struct diff_chunk_context cc = {};
+	struct diff_output_info *outinfo = NULL;
+	int flags = (result->left->root->diff_flags |
+	    result->right->root->diff_flags);
+	bool show_function_prototypes = (flags & DIFF_FLAG_SHOW_PROTOTYPES);
+	int i;
+
+	if (!result)
+		return EINVAL;
+	if (result->rc != DIFF_RC_OK)
+		return result->rc;
+
+	if (output_info) {
+		*output_info = diff_output_info_alloc();
+		if (*output_info == NULL)
+			return ENOMEM;
+		outinfo = *output_info;
+	}
+
+	state = diff_output_unidiff_state_alloc();
+	if (state == NULL) {
+		if (output_info) {
+			diff_output_info_free(*output_info);
+			*output_info = NULL;
+		}
+		return ENOMEM;
+	}
+
+#if DEBUG
+	for (i = 0; i < result->chunks.len; i++) {
+		struct diff_chunk *c = &result->chunks.head[i];
+		enum diff_chunk_type t = diff_chunk_type(c);
+
+		debug("[%d] %s lines L%d R%d @L %d @R %d\n",
+		      i, (t == CHUNK_MINUS ? "minus" :
+			  (t == CHUNK_PLUS ? "plus" :
+			   (t == CHUNK_SAME ? "same" : "?"))),
+		      c->left_count,
+		      c->right_count,
+		      c->left_start ? diff_atom_root_idx(result->left, c->left_start) : -1,
+		      c->right_start ? diff_atom_root_idx(result->right, c->right_start) : -1);
+	}
+#endif
+
+	for (i = 0; i < result->chunks.len; i++) {
+		struct diff_chunk *c = &result->chunks.head[i];
+		enum diff_chunk_type t = diff_chunk_type(c);
+		struct diff_chunk_context next;
+
+		if (t != CHUNK_MINUS && t != CHUNK_PLUS)
+			continue;
+
+		if (diff_chunk_context_empty(&cc)) {
+			/* These are the first lines being printed.
+			 * Note down the start point, any number of subsequent
+			 * chunks may be joined up to this unidiff chunk by
+			 * context lines or by being directly adjacent. */
+			diff_chunk_context_get(&cc, result, i, context_lines);
+			debug("new chunk to be printed:"
+			      " chunk %d-%d left %d-%d right %d-%d\n",
+			      cc.chunk.start, cc.chunk.end,
+			      cc.left.start, cc.left.end,
+			      cc.right.start, cc.right.end);
+			continue;
+		}
+
+		/* There already is a previous chunk noted down for being
+		 * printed. Does it join up with this one? */
+		diff_chunk_context_get(&next, result, i, context_lines);
+		debug("new chunk to be printed:"
+		      " chunk %d-%d left %d-%d right %d-%d\n",
+		      next.chunk.start, next.chunk.end,
+		      next.left.start, next.left.end,
+		      next.right.start, next.right.end);
+
+		if (diff_chunk_contexts_touch(&cc, &next)) {
+			/* This next context touches or overlaps the previous
+			 * one, join. */
+			diff_chunk_contexts_merge(&cc, &next);
+			debug("new chunk to be printed touches previous chunk,"
+			      " now: left %d-%d right %d-%d\n",
+			      cc.left.start, cc.left.end,
+			      cc.right.start, cc.right.end);
+			continue;
+		}
+
+		/* No touching, so the previous context is complete with a gap
+		 * between it and this next one. Print the previous one and
+		 * start fresh here. */
+		debug("new chunk to be printed does not touch previous chunk;"
+		      " print left %d-%d right %d-%d\n",
+		      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+		output_unidiff_chunk(outinfo, dest, state, info, result,
+		    true, show_function_prototypes, &cc);
+		cc = next;
+		debug("new unprinted chunk is left %d-%d right %d-%d\n",
+		      cc.left.start, cc.left.end, cc.right.start, cc.right.end);
+	}
+
+	if (!diff_chunk_context_empty(&cc))
+		output_unidiff_chunk(outinfo, dest, state, info, result,
+		    true, show_function_prototypes, &cc);
+	diff_output_unidiff_state_free(state);
+	return DIFF_RC_OK;
+}
blob - /dev/null
blob + a845cbd14cd84545c0356b938d9ecc930f7f9890 (mode 644)
--- /dev/null
+++ lib/diff_patience.c
@@ -0,0 +1,777 @@
+/* Implementation of the Patience Diff algorithm invented by Bram Cohen:
+ * Divide a diff problem into smaller chunks by an LCS (Longest Common Sequence)
+ * of common-unique lines. */
+/*
+ * Copyright (c) 2020 Neels Hofmeyr <neels@hofmeyr.de>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <arraylist.h>
+#include <diff_main.h>
+
+#include "diff_internal.h"
+#include "diff_debug.h"
+
+/* Algorithm to find unique lines:
+ * 0: stupidly iterate atoms
+ * 1: qsort
+ * 2: mergesort
+ */
+#define UNIQUE_STRATEGY 1
+
+/* Per-atom state for the Patience Diff algorithm */
+struct atom_patience {
+#if UNIQUE_STRATEGY == 0
+	bool unique_here;
+#endif
+	bool unique_in_both;
+	struct diff_atom *pos_in_other;
+	struct diff_atom *prev_stack;
+	struct diff_range identical_lines;
+};
+
+/* A diff_atom has a backpointer to the root diff_data. That points to the
+ * current diff_data, a possibly smaller section of the root. That current
+ * diff_data->algo_data is a pointer to an array of struct atom_patience. The
+ * atom's index in current diff_data gives the index in the atom_patience array.
+ */
+#define PATIENCE(ATOM) \
+	(((struct atom_patience*)((ATOM)->root->current->algo_data))\
+	 [diff_atom_idx((ATOM)->root->current, ATOM)])
+
+#if UNIQUE_STRATEGY == 0
+
+/* Stupid iteration and comparison of all atoms */
+static int
+diff_atoms_mark_unique(struct diff_data *d, unsigned int *unique_count)
+{
+	struct diff_atom *i;
+	unsigned int count = 0;
+	diff_data_foreach_atom(i, d) {
+		PATIENCE(i).unique_here = true;
+		PATIENCE(i).unique_in_both = true;
+		count++;
+	}
+	diff_data_foreach_atom(i, d) {
+		struct diff_atom *j;
+
+		if (!PATIENCE(i).unique_here)
+			continue;
+
+		diff_data_foreach_atom_from(i + 1, j, d) {
+			bool same;
+			int r = diff_atom_same(&same, i, j);
+			if (r)
+				return r;
+			if (!same)
+				continue;
+			if (PATIENCE(i).unique_here) {
+				PATIENCE(i).unique_here = false;
+				PATIENCE(i).unique_in_both = false;
+				count--;
+			}
+			PATIENCE(j).unique_here = false;
+			PATIENCE(j).unique_in_both = false;
+			count--;
+		}
+	}
+	if (unique_count)
+		*unique_count = count;
+	return 0;
+}
+
+/* Mark those lines as PATIENCE(atom).unique_in_both = true that appear exactly
+ * once in each side. */
+static int
+diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
+			       unsigned int *unique_in_both_count)
+{
+	/* Derive the final unique_in_both count without needing an explicit
+	 * iteration. So this is just some optimiziation to save one iteration
+	 * in the end. */
+	unsigned int unique_in_both;
+	int r;
+
+	r = diff_atoms_mark_unique(left, &unique_in_both);
+	if (r)
+		return r;
+	r = diff_atoms_mark_unique(right, NULL);
+	if (r)
+		return r;
+
+	debug("unique_in_both %u\n", unique_in_both);
+
+	struct diff_atom *i;
+	diff_data_foreach_atom(i, left) {
+		if (!PATIENCE(i).unique_here)
+			continue;
+		struct diff_atom *j;
+		int found_in_b = 0;
+		diff_data_foreach_atom(j, right) {
+			bool same;
+			int r = diff_atom_same(&same, i, j);
+			if (r)
+				return r;
+			if (!same)
+				continue;
+			if (!PATIENCE(j).unique_here) {
+				found_in_b = 2; /* or more */
+				break;
+			} else {
+				found_in_b = 1;
+				PATIENCE(j).pos_in_other = i;
+				PATIENCE(i).pos_in_other = j;
+			}
+		}
+
+		if (found_in_b == 0 || found_in_b > 1) {
+			PATIENCE(i).unique_in_both = false;
+			unique_in_both--;
+			debug("unique_in_both %u  (%d) ", unique_in_both,
+			      found_in_b);
+			debug_dump_atom(left, NULL, i);
+		}
+	}
+
+	/* Still need to unmark right[*]->patience.unique_in_both for atoms that
+	 * don't exist in left */
+	diff_data_foreach_atom(i, right) {
+		if (!PATIENCE(i).unique_here
+		    || !PATIENCE(i).unique_in_both)
+			continue;
+		struct diff_atom *j;
+		bool found_in_a = false;
+		diff_data_foreach_atom(j, left) {
+			bool same;
+			int r;
+			if (!PATIENCE(j).unique_in_both)
+				continue;
+			r = diff_atom_same(&same, i, j);
+			if (r)
+				return r;
+			if (!same)
+				continue;
+			found_in_a = true;
+			break;
+		}
+
+		if (!found_in_a)
+			PATIENCE(i).unique_in_both = false;
+	}
+
+	if (unique_in_both_count)
+		*unique_in_both_count = unique_in_both;
+	return 0;
+}
+
+#else /* UNIQUE_STRATEGY != 0 */
+
+/* Use an optimized sorting algorithm (qsort, mergesort) to find unique lines */
+
+int diff_atoms_compar(const void *_a, const void *_b)
+{
+	const struct diff_atom *a = *(struct diff_atom**)_a;
+	const struct diff_atom *b = *(struct diff_atom**)_b;
+	int cmp;
+	int rc = 0;
+
+	/* If there's been an error (e.g. I/O error) in a previous compar, we
+	 * have no way to abort the sort but just report the rc and stop
+	 * comparing. Make sure to catch errors on either side. If atoms are
+	 * from more than one diff_data, make sure the error, if any, spreads
+	 * to all of them, so we can cut short all future comparisons. */
+	if (a->root->err)
+		rc = a->root->err;
+	if (b->root->err)
+		rc = b->root->err;
+	if (rc) {
+		a->root->err = rc;
+		b->root->err = rc;
+		/* just return 'equal' to not swap more positions */
+		return 0;
+	}
+
+	/* Sort by the simplistic hash */
+	if (a->hash < b->hash)
+		return -1;
+	if (a->hash > b->hash)
+		return 1;
+
+	/* If hashes are the same, the lines may still differ. Do a full cmp. */
+	rc = diff_atom_cmp(&cmp, a, b);
+
+	if (rc) {
+		/* Mark the I/O error so that the caller can find out about it.
+		 * For the case atoms are from more than one diff_data, mark in
+		 * both. */
+		a->root->err = rc;
+		if (a->root != b->root)
+			b->root->err = rc;
+		return 0;
+	}
+
+	return cmp;
+}
+
+/* Sort an array of struct diff_atom* in-place. */
+static int diff_atoms_sort(struct diff_atom *atoms[],
+			   size_t atoms_count)
+{
+#if UNIQUE_STRATEGY == 1
+	qsort(atoms, atoms_count, sizeof(struct diff_atom*), diff_atoms_compar);
+#else
+	mergesort(atoms, atoms_count, sizeof(struct diff_atom*),
+		  diff_atoms_compar);
+#endif
+	return atoms[0]->root->err;
+}
+
+static int
+diff_atoms_mark_unique_in_both(struct diff_data *left, struct diff_data *right,
+			       unsigned int *unique_in_both_count_p)
+{
+	struct diff_atom *a;
+	struct diff_atom *b;
+	struct diff_atom **all_atoms;
+	unsigned int len = 0;
+	unsigned int i;
+	unsigned int unique_in_both_count = 0;
+	int rc;
+
+	all_atoms = calloc(left->atoms.len + right->atoms.len,
+	    sizeof(struct diff_atom *));
+	if (all_atoms == NULL)
+		return ENOMEM;
+
+	left->err = 0;
+	right->err = 0;
+	left->root->err = 0;
+	right->root->err = 0;
+	diff_data_foreach_atom(a, left) {
+		all_atoms[len++] = a;
+	}
+	diff_data_foreach_atom(b, right) {
+		all_atoms[len++] = b;
+	}
+
+	rc = diff_atoms_sort(all_atoms, len);
+	if (rc)
+		goto free_and_exit;
+
+	/* Now we have a sorted array of atom pointers. All similar lines are
+	 * adjacent. Walk through the array and mark those that are unique on
+	 * each side, but exist once in both sources. */
+	for (i = 0; i < len; i++) {
+		bool same;
+		unsigned int next_differing_i;
+		unsigned int last_identical_i;
+		unsigned int j;
+		unsigned int count_first_side = 1;
+		unsigned int count_other_side = 0;
+		a = all_atoms[i];
+		debug("a: ");
+		debug_dump_atom(a->root, NULL, a);
+
+		/* Do as few diff_atom_cmp() as possible: first walk forward
+		 * only using the cheap hash as indicator for differing atoms;
+		 * then walk backwards until hitting an identical atom. */
+		for (next_differing_i = i + 1; next_differing_i < len;
+		     next_differing_i++) {
+			b = all_atoms[next_differing_i];
+			if (a->hash != b->hash)
+				break;
+		}
+		for (last_identical_i = next_differing_i - 1;
+		     last_identical_i > i;
+		     last_identical_i--) {
+			b = all_atoms[last_identical_i];
+			rc = diff_atom_same(&same, a, b);
+			if (rc)
+				goto free_and_exit;
+			if (same)
+				break;
+		}
+		next_differing_i = last_identical_i + 1;
+
+		for (j = i+1; j < next_differing_i; j++) {
+			b = all_atoms[j];
+			/* A following atom is the same. See on which side the
+			 * repetition counts. */
+			if (a->root == b->root)
+				count_first_side ++;
+			else
+				count_other_side ++;
+			debug("b: ");
+			debug_dump_atom(b->root, NULL, b);
+			debug("   count_first_side=%d count_other_side=%d\n",
+			      count_first_side, count_other_side);
+		}
+
+		/* Counted a section of similar atoms, put the results back to
+		 * the atoms. */
+		if ((count_first_side == 1)
+		    && (count_other_side == 1)) {
+			b = all_atoms[i+1];
+			PATIENCE(a).unique_in_both = true;
+			PATIENCE(a).pos_in_other = b;
+			PATIENCE(b).unique_in_both = true;
+			PATIENCE(b).pos_in_other = a;
+			unique_in_both_count++;
+		}
+
+		/* j now points at the first atom after 'a' that is not
+		 * identical to 'a'. j is always > i. */
+		i = j - 1;
+	}
+	*unique_in_both_count_p = unique_in_both_count;
+	rc = 0;
+free_and_exit:
+	free(all_atoms);
+	return rc;
+}
+#endif /* UNIQUE_STRATEGY != 0 */
+
+static int
+diff_atoms_swallow_identical_neighbors(struct diff_data *left,
+				       struct diff_data *right,
+				       unsigned int *unique_in_both_count)
+{
+	debug("trivially combine identical lines"
+	      " around unique_in_both lines\n");
+
+	unsigned int l_idx;
+	unsigned int next_l_idx;
+	/* Only checking against identical-line overlaps on the left; overlaps
+	 * on the right are tolerated and ironed out later. See also the other
+	 * place marked with (1). */
+	unsigned int l_min = 0;
+	for (l_idx = 0; l_idx < left->atoms.len; l_idx = next_l_idx) {
+		next_l_idx = l_idx + 1;
+		struct diff_atom *l = &left->atoms.head[l_idx];
+		struct diff_atom *r;
+
+		if (!PATIENCE(l).unique_in_both)
+			continue;
+
+		debug("check identical lines around\n");
+		debug(" L "); debug_dump_atom(left, right, l);
+
+		unsigned int r_idx = diff_atom_idx(right, PATIENCE(l).pos_in_other);
+		r = &right->atoms.head[r_idx];
+		debug(" R "); debug_dump_atom(right, left, r);
+
+		struct diff_range identical_l;
+		struct diff_range identical_r;
+
+		/* Swallow upwards.
+		 * Each common-unique line swallows identical lines upwards and
+		 * downwards.
+		 * Will never hit another identical common-unique line above on
+		 * the left, because those would already have swallowed this
+		 * common-unique line in a previous iteration.
+		 */
+		for (identical_l.start = l_idx, identical_r.start = r_idx;
+		     identical_l.start > l_min && identical_r.start > 0;
+		     identical_l.start--, identical_r.start--) {
+			bool same;
+			int rc = diff_atom_same(&same,
+				&left->atoms.head[identical_l.start - 1],
+				&right->atoms.head[identical_r.start - 1]);
+			if (rc)
+				return rc;
+			if (!same)
+				break;
+		}
+
+		/* Swallow downwards. Join any common-unique lines in a block of
+		 * matching L/R lines with this one. */
+		for (identical_l.end = l_idx + 1, identical_r.end = r_idx + 1;
+		     identical_l.end < left->atoms.len
+			 && identical_r.end < right->atoms.len;
+		     identical_l.end++, identical_r.end++,
+		     next_l_idx++) {
+			struct diff_atom *l_end;
+			struct diff_atom *r_end;
+			bool same;
+			int rc = diff_atom_same(&same,
+					&left->atoms.head[identical_l.end],
+					&right->atoms.head[identical_r.end]);
+			if (rc)
+				return rc;
+			if (!same)
+				break;
+			l_end = &left->atoms.head[identical_l.end];
+			r_end = &right->atoms.head[identical_r.end];
+			if (!PATIENCE(l_end).unique_in_both)
+				continue;
+			/* A unique_in_both atom is joined with a preceding
+			 * unique_in_both atom, remove the joined atom from
+			 * listing of unique_in_both atoms */
+			PATIENCE(l_end).unique_in_both = false;
+			PATIENCE(r_end).unique_in_both = false;
+			(*unique_in_both_count)--;
+		}
+
+		PATIENCE(l).identical_lines = identical_l;
+		PATIENCE(r).identical_lines = identical_r;
+
+		l_min = identical_l.end;
+
+		if (!diff_range_empty(&PATIENCE(l).identical_lines)) {
+			debug("common-unique line at l=%u r=%u swallowed"
+			      " identical lines l=%u-%u r=%u-%u\n",
+			      l_idx, r_idx,
+			      identical_l.start, identical_l.end,
+			      identical_r.start, identical_r.end);
+		}
+		debug("next_l_idx = %u\n", next_l_idx);
+	}
+	return 0;
+}
+
+/* binary search to find the stack to put this atom "card" on. */
+static int
+find_target_stack(struct diff_atom *atom,
+		  struct diff_atom **patience_stacks,
+		  unsigned int patience_stacks_count)
+{
+	unsigned int lo = 0;
+	unsigned int hi = patience_stacks_count;
+	while (lo < hi) {
+		unsigned int mid = (lo + hi) >> 1;
+
+		if (PATIENCE(patience_stacks[mid]).pos_in_other
+		    < PATIENCE(atom).pos_in_other)
+			lo = mid + 1;
+		else
+			hi = mid;
+	}
+	return lo;
+}
+
+/* Among the lines that appear exactly once in each side, find the longest
+ * streak that appear in both files in the same order (with other stuff allowed
+ * to interleave). Use patience sort for that, as in the Patience Diff
+ * algorithm.
+ * See https://bramcohen.livejournal.com/73318.html and, for a much more
+ * detailed explanation,
+ * https://blog.jcoglan.com/2017/09/19/the-patience-diff-algorithm/ */
+int
+diff_algo_patience(const struct diff_algo_config *algo_config,
+		   struct diff_state *state)
+{
+	int rc;
+	struct diff_data *left = &state->left;
+	struct diff_data *right = &state->right;
+	struct atom_patience *atom_patience_left =
+		calloc(left->atoms.len, sizeof(struct atom_patience));
+	struct atom_patience *atom_patience_right =
+		calloc(right->atoms.len, sizeof(struct atom_patience));
+	unsigned int unique_in_both_count;
+	struct diff_atom **lcs = NULL;
+
+	debug("\n** %s\n", __func__);
+
+	left->root->current = left;
+	right->root->current = right;
+	left->algo_data = atom_patience_left;
+	right->algo_data = atom_patience_right;
+
+	/* Find those lines that appear exactly once in 'left' and exactly once
+	 * in 'right'. */
+	rc = diff_atoms_mark_unique_in_both(left, right, &unique_in_both_count);
+	if (rc)
+		goto free_and_exit;
+
+	debug("unique_in_both_count %u\n", unique_in_both_count);
+	debug("left:\n");
+	debug_dump(left);
+	debug("right:\n");
+	debug_dump(right);
+
+	if (!unique_in_both_count) {
+		/* Cannot apply Patience, tell the caller to use fallback_algo
+		 * instead. */
+		rc = DIFF_RC_USE_DIFF_ALGO_FALLBACK;
+		goto free_and_exit;
+	}
+
+	rc = diff_atoms_swallow_identical_neighbors(left, right,
+						    &unique_in_both_count);
+	if (rc)
+		goto free_and_exit;
+	debug("After swallowing identical neighbors: unique_in_both = %u\n",
+	      unique_in_both_count);
+
+	rc = ENOMEM;
+
+	/* An array of Longest Common Sequence is the result of the below
+	 * subscope: */
+	unsigned int lcs_count = 0;
+	struct diff_atom *lcs_tail = NULL;
+
+	{
+		/* This subscope marks the lifetime of the atom_pointers
+		 * allocation */
+
+		/* One chunk of storage for atom pointers */
+		struct diff_atom **atom_pointers;
+		atom_pointers = recallocarray(NULL, 0, unique_in_both_count * 2,
+					      sizeof(struct diff_atom*));
+		if (atom_pointers == NULL)
+			return ENOMEM;
+		/* Half for the list of atoms that still need to be put on
+		 * stacks */
+		struct diff_atom **uniques = atom_pointers;
+
+		/* Half for the patience sort state's "card stacks" -- we
+		 * remember only each stack's topmost "card" */
+		struct diff_atom **patience_stacks;
+		patience_stacks = atom_pointers + unique_in_both_count;
+		unsigned int patience_stacks_count = 0;
+
+		/* Take all common, unique items from 'left' ... */
+
+		struct diff_atom *atom;
+		struct diff_atom **uniques_end = uniques;
+		diff_data_foreach_atom(atom, left) {
+			if (!PATIENCE(atom).unique_in_both)
+				continue;
+			*uniques_end = atom;
+			uniques_end++;
+		}
+
+		/* ...and sort them to the order found in 'right'.
+		 * The idea is to find the leftmost stack that has a higher line
+		 * number and add it to the stack's top.
+		 * If there is no such stack, open a new one on the right. The
+		 * line number is derived from the atom*, which are array items
+		 * and hence reflect the relative position in the source file.
+		 * So we got the common-uniques from 'left' and sort them
+		 * according to PATIENCE(atom).pos_in_other. */
+		unsigned int i;
+		for (i = 0; i < unique_in_both_count; i++) {
+			atom = uniques[i];
+			unsigned int target_stack;
+			target_stack = find_target_stack(atom, patience_stacks,
+							 patience_stacks_count);
+			assert(target_stack <= patience_stacks_count);
+			patience_stacks[target_stack] = atom;
+			if (target_stack == patience_stacks_count)
+				patience_stacks_count++;
+
+			/* Record a back reference to the next stack on the
+			 * left, which will form the final longest sequence
+			 * later. */
+			PATIENCE(atom).prev_stack = target_stack ?
+				patience_stacks[target_stack - 1] : NULL;
+
+			{
+				int xx;
+				for (xx = 0; xx < patience_stacks_count; xx++) {
+					debug(" %s%d",
+					      (xx == target_stack) ? ">" : "",
+					      diff_atom_idx(right,
+							    PATIENCE(patience_stacks[xx]).pos_in_other));
+				}
+				debug("\n");
+			}
+		}
+
+		/* backtrace through prev_stack references to form the final
+		 * longest common sequence */
+		lcs_tail = patience_stacks[patience_stacks_count - 1];
+		lcs_count = patience_stacks_count;
+
+		/* uniques and patience_stacks are no longer needed.
+		 * Backpointers are in PATIENCE(atom).prev_stack */
+		free(atom_pointers);
+	}
+
+	lcs = recallocarray(NULL, 0, lcs_count, sizeof(struct diff_atom*));
+	struct diff_atom **lcs_backtrace_pos = &lcs[lcs_count - 1];
+	struct diff_atom *atom;
+	for (atom = lcs_tail; atom; atom = PATIENCE(atom).prev_stack, lcs_backtrace_pos--) {
+		assert(lcs_backtrace_pos >= lcs);
+		*lcs_backtrace_pos = atom;
+	}
+
+	unsigned int i;
+	if (DEBUG) {
+		debug("\npatience LCS:\n");
+		for (i = 0; i < lcs_count; i++) {
+			debug("\n L "); debug_dump_atom(left, right, lcs[i]);
+			debug(" R "); debug_dump_atom(right, left,
+						      PATIENCE(lcs[i]).pos_in_other);
+		}
+	}
+
+
+	/* TODO: For each common-unique line found (now listed in lcs), swallow
+	 * lines upwards and downwards that are identical on each side. Requires
+	 * a way to represent atoms being glued to adjacent atoms. */
+
+	debug("\ntraverse LCS, possibly recursing:\n");
+
+	/* Now we have pinned positions in both files at which it makes sense to
+	 * divide the diff problem into smaller chunks. Go into the next round:
+	 * look at each section in turn, trying to again find common-unique
+	 * lines in those smaller sections. As soon as no more are found, the
+	 * remaining smaller sections are solved by Myers. */
+	/* left_pos and right_pos are indexes in left/right->atoms.head until
+	 * which the atoms are already handled (added to result chunks). */
+	unsigned int left_pos = 0;
+	unsigned int right_pos = 0;
+	for (i = 0; i <= lcs_count; i++) {
+		struct diff_atom *atom;
+		struct diff_atom *atom_r;
+		/* left_idx and right_idx are indexes of the start of this
+		 * section of identical lines on both sides.
+		 * left_pos marks the index of the first still unhandled line,
+		 * left_idx is the start of an identical section some way
+		 * further down, and this loop adds an unsolved chunk of
+		 * [left_pos..left_idx[ and a solved chunk of
+		 * [left_idx..identical_lines.end[. */
+		unsigned int left_idx;
+		unsigned int right_idx;
+
+		debug("iteration %u of %u  left_pos %u  right_pos %u\n",
+		      i, lcs_count, left_pos, right_pos);
+
+		if (i < lcs_count) {
+			atom = lcs[i];
+			atom_r = PATIENCE(atom).pos_in_other;
+			debug("lcs[%u] = left[%u] = right[%u]\n", i,
+			      diff_atom_idx(left, atom), diff_atom_idx(right, atom_r));
+			left_idx = PATIENCE(atom).identical_lines.start;
+			right_idx = PATIENCE(atom_r).identical_lines.start;
+			debug(" identical lines l %u-%u  r %u-%u\n",
+			      PATIENCE(atom).identical_lines.start, PATIENCE(atom).identical_lines.end,
+			      PATIENCE(atom_r).identical_lines.start, PATIENCE(atom_r).identical_lines.end);
+		} else {
+			/* There are no more identical lines until the end of
+			 * left and right. */
+			atom = NULL;
+			atom_r = NULL;
+			left_idx = left->atoms.len;
+			right_idx = right->atoms.len;
+		}
+
+		if (right_idx < right_pos) {
+			/* This may happen if common-unique lines were in a
+			 * different order on the right, and then ended up
+			 * consuming the same identical atoms around a pair of
+			 * common-unique atoms more than once.
+			 * See also marker the other place marked with (1). */
+			int already_done_count = right_pos - right_idx;
+			left_idx += already_done_count;
+			right_idx += already_done_count;
+			/* Paranoia: make sure we're skipping just an
+			 * additionally joined identical line around it, and not
+			 * the common-unique line itself. */
+			assert(left_idx <= diff_atom_idx(left, atom));
+		}
+
+		/* 'atom' (if not NULL) now marks an atom that matches on both
+		 * sides according to patience-diff (a common-unique identical
+		 * atom in both files).
+		 * Handle the section before and the atom itself; the section
+		 * after will be handled by the next loop iteration -- note that
+		 * i loops to last element + 1 ("i <= lcs_count"), so that there
+		 * will be another final iteration to pick up the last remaining
+		 * items after the last LCS atom.
+		 */
+
+		debug("iteration %u  left_pos %u  left_idx %u"
+		      "  right_pos %u  right_idx %u\n",
+		      i, left_pos, left_idx, right_pos, right_idx);
+
+		/* Section before the matching atom */
+		struct diff_atom *left_atom = &left->atoms.head[left_pos];
+		unsigned int left_section_len = left_idx - left_pos;
+
+		struct diff_atom *right_atom = &(right->atoms.head[right_pos]);
+		unsigned int right_section_len = right_idx - right_pos;
+
+		if (left_section_len && right_section_len) {
+			/* Record an unsolved chunk, the caller will apply
+			 * inner_algo() on this chunk. */
+			if (!diff_state_add_chunk(state, false,
+						  left_atom, left_section_len,
+						  right_atom,
+						  right_section_len))
+				goto free_and_exit;
+		} else if (left_section_len && !right_section_len) {
+			/* Only left atoms and none on the right, they form a
+			 * "minus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, left_section_len,
+						  right_atom, 0))
+				goto free_and_exit;
+		} else if (!left_section_len && right_section_len) {
+			/* No left atoms, only atoms on the right, they form a
+			 * "plus" chunk, then. */
+			if (!diff_state_add_chunk(state, true,
+						  left_atom, 0,
+						  right_atom, right_section_len))
+				goto free_and_exit;
+		}
+		/* else: left_section_len == 0 and right_section_len == 0, i.e.
+		 * nothing here. */
+
+		/* The atom found to match on both sides forms a chunk of equals
+		 * on each side. In the very last iteration of this loop, there
+		 * is no matching atom, we were just cleaning out the remaining
+		 * lines. */
+		if (atom) {
+			void *ok;
+			ok = diff_state_add_chunk(state, true,
+				left->atoms.head
+				    + PATIENCE(atom).identical_lines.start,
+				diff_range_len(&PATIENCE(atom).identical_lines),
+				right->atoms.head
+				    + PATIENCE(atom_r).identical_lines.start,
+				diff_range_len(&PATIENCE(atom_r).identical_lines));
+			if (!ok)
+				goto free_and_exit;
+			left_pos = PATIENCE(atom).identical_lines.end;
+			right_pos = PATIENCE(atom_r).identical_lines.end;
+		} else {
+			left_pos = left_idx + 1;
+			right_pos = right_idx + 1;
+		}
+		debug("end of iteration %u  left_pos %u  left_idx %u"
+		      "  right_pos %u  right_idx %u\n",
+		      i, left_pos, left_idx, right_pos, right_idx);
+	}
+	debug("** END %s\n", __func__);
+
+	rc = DIFF_RC_OK;
+
+free_and_exit:
+	left->root->current = NULL;
+	right->root->current = NULL;
+	free(atom_patience_left);
+	free(atom_patience_right);
+	if (lcs)
+		free(lcs);
+	return rc;
+}
blob - 54a1c38fbbcfb2ede54f20d464e12db1a15f0447
blob + 569a3c00f996043b4dd11dba2f0d4cb675991cae
--- lib/worktree.c
+++ lib/worktree.c
@@ -4116,47 +4116,62 @@ copy_remaining_content(FILE *f1, FILE *f2, int *line_c
 }
 
 static const struct got_error *
-apply_or_reject_change(int *choice, struct got_diff_change *change, int n,
-    int nchanges, struct got_diff_state *ds, struct got_diff_args *args,
-    int diff_flags, const char *relpath, FILE *f1, FILE *f2, int *line_cur1,
-    int *line_cur2, FILE *outfile, FILE *rejectfile,
+apply_or_reject_change(int *choice, int *nchunks_used,
+    struct diff_result *diff_result, int n,
+    const char *relpath, FILE *f1, FILE *f2, int *line_cur1, int *line_cur2,
+    FILE *outfile, FILE *rejectfile, int changeno, int nchanges,
     got_worktree_patch_cb patch_cb, void *patch_arg)
 {
 	const struct got_error *err = NULL;
-	int start_old = change->cv.a;
-	int end_old = change->cv.b;
-	int start_new = change->cv.c;
-	int end_new = change->cv.d;
-	long pos1, pos2;
+	struct diff_chunk_context cc = {}; 
+	int start_old, end_old, start_new, end_new;
 	FILE *hunkfile;
+	struct diff_output_unidiff_state *diff_state;
+	struct diff_input_info diff_info;
+	int rc;
 
 	*choice = GOT_PATCH_CHOICE_NONE;
 
+	/* Get changed line numbers without context lines for copy_change(). */
+	diff_chunk_context_load_change(&cc, NULL, diff_result, n, 0);
+	start_old = cc.left.start;
+	end_old = cc.left.end;
+	start_new = cc.right.start;
+	end_new = cc.right.end;
+
+	/* Get the same change with context lines for display. */
+	memset(&cc, 0, sizeof(cc));
+	diff_chunk_context_load_change(&cc, nchunks_used, diff_result, n, 3);
+
+	memset(&diff_info, 0, sizeof(diff_info));
+	diff_info.left_path = relpath;
+	diff_info.right_path = relpath;
+
+	diff_state = diff_output_unidiff_state_alloc();
+	if (diff_state == NULL)
+		return got_error_set_errno(ENOMEM,
+		    "diff_output_unidiff_state_alloc");
+
 	hunkfile = got_opentemp();
-	if (hunkfile == NULL)
-		return got_error_from_errno("got_opentemp");
-
-	pos1 = ftell(f1);
-	pos2 = ftell(f2);
-
-	/* XXX TODO needs error checking */
-	got_diff_dump_change(hunkfile, change, ds, args, f1, f2, diff_flags);
-
-	if (fseek(f1, pos1, SEEK_SET) == -1) {
-		err = got_ferror(f1, GOT_ERR_IO);
+	if (hunkfile == NULL) {
+		err = got_error_from_errno("got_opentemp");
 		goto done;
 	}
-	if (fseek(f2, pos2, SEEK_SET) == -1) {
-		err = got_ferror(f1, GOT_ERR_IO);
+
+	rc = diff_output_unidiff_chunk(NULL, hunkfile, diff_state, &diff_info,
+	    diff_result, &cc);
+	if (rc != DIFF_RC_OK) {
+		err = got_error_set_errno(rc, "diff_output_unidiff_chunk");
 		goto done;
 	}
+
 	if (fseek(hunkfile, 0L, SEEK_SET) == -1) {
 		err = got_ferror(hunkfile, GOT_ERR_IO);
 		goto done;
 	}
 
 	err = (*patch_cb)(choice, patch_arg, GOT_STATUS_MODIFY, relpath,
-	    hunkfile, n, nchanges);
+	    hunkfile, changeno, nchanges);
 	if (err)
 		goto done;
 
@@ -4176,6 +4191,7 @@ apply_or_reject_change(int *choice, struct got_diff_ch
 		break;
 	}
 done:
+	diff_output_unidiff_state_free(diff_state);
 	if (hunkfile && fclose(hunkfile) == EOF && err == NULL)
 		err = got_error_from_errno("fclose");
 	return err;
@@ -4198,20 +4214,17 @@ create_patched_content(char **path_outfile, int revers
     const char *relpath, struct got_repository *repo,
     got_worktree_patch_cb patch_cb, void *patch_arg)
 {
-	const struct got_error *err;
+	const struct got_error *err, *free_err;
 	struct got_blob_object *blob = NULL;
 	FILE *f1 = NULL, *f2 = NULL, *outfile = NULL;
 	int fd2 = -1;
 	char link_target[PATH_MAX];
 	ssize_t link_len = 0;
 	char *path1 = NULL, *id_str = NULL;
-	struct stat sb1, sb2;
-	struct got_diff_changes *changes = NULL;
-	struct got_diff_state *ds = NULL;
-	struct got_diff_args *args = NULL;
-	struct got_diff_change *change;
-	int diff_flags = 0, line_cur1 = 1, line_cur2 = 1, have_content = 0;
-	int n = 0;
+	struct stat sb2;
+	struct got_diffreg_result *diffreg_result = NULL;
+	int line_cur1 = 1, line_cur2 = 1, have_content = 0;
+	int i = 0, n = 0, nchunks_used = 0, nchanges = 0;
 
 	*path_outfile = NULL;
 
@@ -4291,13 +4304,8 @@ create_patched_content(char **path_outfile, int revers
 	if (err)
 		goto done;
 
-	if (stat(path1, &sb1) == -1) {
-		err = got_error_from_errno2("stat", path1);
-		goto done;
-	}
-
-	err = got_diff_files(&changes, &ds, &args, &diff_flags,
-	    f1, sb1.st_size, id_str, f2, sb2.st_size, path2, 3, NULL);
+	err = got_diff_files(&diffreg_result, f1, id_str, f2, path2, 3, 0,
+	    NULL);
 	if (err)
 		goto done;
 
@@ -4309,14 +4317,22 @@ create_patched_content(char **path_outfile, int revers
 		return got_ferror(f1, GOT_ERR_IO);
 	if (fseek(f2, 0L, SEEK_SET) == -1)
 		return got_ferror(f2, GOT_ERR_IO);
-	SIMPLEQ_FOREACH(change, &changes->entries, entry) {
+
+	/* Count the number of actual changes in the diff result. */
+	for (n = 0; n < diffreg_result->result->chunks.len; n += nchunks_used) {
+		struct diff_chunk_context cc = {}; 
+		diff_chunk_context_load_change(&cc, &nchunks_used,
+		    diffreg_result->result, n, 0);
+		nchanges++;
+	}
+	for (n = 0; n < diffreg_result->result->chunks.len; n += nchunks_used) {
 		int choice;
-		err = apply_or_reject_change(&choice, change, ++n,
-		    changes->nchanges, ds, args, diff_flags, relpath,
-		    f1, f2, &line_cur1, &line_cur2,
+		err = apply_or_reject_change(&choice, &nchunks_used,
+		    diffreg_result->result, n, relpath, f1, f2,
+		    &line_cur1, &line_cur2,
 		    reverse_patch ? NULL : outfile,
 		    reverse_patch ? outfile : NULL,
-		    patch_cb, patch_arg);
+		    ++i, nchanges, patch_cb, patch_arg);
 		if (err)
 			goto done;
 		if (choice == GOT_PATCH_CHOICE_YES)
@@ -4342,6 +4358,9 @@ done:
 	free(id_str);
 	if (blob)
 		got_object_blob_close(blob);
+	free_err = got_diffreg_result_free(diffreg_result);
+	if (err == NULL)
+		err = free_err;
 	if (f1 && fclose(f1) == EOF && err == NULL)
 		err = got_error_from_errno2("fclose", path1);
 	if (f2 && fclose(f2) == EOF && err == NULL)
@@ -4358,13 +4377,6 @@ done:
 		free(*path_outfile);
 		*path_outfile = NULL;
 	}
-	free(args);
-	if (ds) {
-		got_diff_state_free(ds);
-		free(ds);
-	}
-	if (changes)
-		got_diff_free_changes(changes);
 	free(path1);
 	return err;
 }
@@ -7378,17 +7390,13 @@ create_unstaged_content(char **path_unstaged_content,
     struct got_repository *repo,
     got_worktree_patch_cb patch_cb, void *patch_arg)
 {
-	const struct got_error *err;
+	const struct got_error *err, *free_err;
 	struct got_blob_object *blob = NULL, *staged_blob = NULL;
 	FILE *f1 = NULL, *f2 = NULL, *outfile = NULL, *rejectfile = NULL;
 	char *path1 = NULL, *path2 = NULL, *label1 = NULL;
-	struct stat sb1, sb2;
-	struct got_diff_changes *changes = NULL;
-	struct got_diff_state *ds = NULL;
-	struct got_diff_args *args = NULL;
-	struct got_diff_change *change;
-	int diff_flags = 0, line_cur1 = 1, line_cur2 = 1, n = 0;
-	int have_content = 0, have_rejected_content = 0;
+	struct got_diffreg_result *diffreg_result = NULL;
+	int line_cur1 = 1, line_cur2 = 1, n = 0, nchunks_used = 0;
+	int have_content = 0, have_rejected_content = 0, i = 0, nchanges = 0;
 
 	*path_unstaged_content = NULL;
 	*path_new_staged_content = NULL;
@@ -7420,18 +7428,8 @@ create_unstaged_content(char **path_unstaged_content,
 	if (err)
 		goto done;
 
-	if (stat(path1, &sb1) == -1) {
-		err = got_error_from_errno2("stat", path1);
-		goto done;
-	}
-
-	if (stat(path2, &sb2) == -1) {
-		err = got_error_from_errno2("stat", path2);
-		goto done;
-	}
-
-	err = got_diff_files(&changes, &ds, &args, &diff_flags,
-	    f1, sb1.st_size, label1, f2, sb2.st_size, path2, 3, NULL);
+	err = got_diff_files(&diffreg_result, f1, label1, f2,
+	    path2, 3, 0, NULL);
 	if (err)
 		goto done;
 
@@ -7452,12 +7450,19 @@ create_unstaged_content(char **path_unstaged_content,
 		err = got_ferror(f2, GOT_ERR_IO);
 		goto done;
 	}
-	SIMPLEQ_FOREACH(change, &changes->entries, entry) {
+	/* Count the number of actual changes in the diff result. */
+	for (n = 0; n < diffreg_result->result->chunks.len; n += nchunks_used) {
+		struct diff_chunk_context cc = {}; 
+		diff_chunk_context_load_change(&cc, &nchunks_used,
+		    diffreg_result->result, n, 0);
+		nchanges++;
+	}
+	for (n = 0; n < diffreg_result->result->chunks.len; n += nchunks_used) {
 		int choice;
-		err = apply_or_reject_change(&choice, change, ++n,
-		    changes->nchanges, ds, args, diff_flags, relpath,
-		    f1, f2, &line_cur1, &line_cur2,
-		    outfile, rejectfile, patch_cb, patch_arg);
+		err = apply_or_reject_change(&choice, &nchunks_used,
+		    diffreg_result->result, n, relpath, f1, f2,
+		    &line_cur1, &line_cur2,
+		    outfile, rejectfile, ++i, nchanges, patch_cb, patch_arg);
 		if (err)
 			goto done;
 		if (choice == GOT_PATCH_CHOICE_YES)
@@ -7476,6 +7481,9 @@ done:
 		got_object_blob_close(blob);
 	if (staged_blob)
 		got_object_blob_close(staged_blob);
+	free_err = got_diffreg_result_free(diffreg_result);
+	if (free_err && err == NULL)
+		err = free_err;
 	if (f1 && fclose(f1) == EOF && err == NULL)
 		err = got_error_from_errno2("fclose", path1);
 	if (f2 && fclose(f2) == EOF && err == NULL)
@@ -7504,13 +7512,6 @@ done:
 		free(*path_new_staged_content);
 		*path_new_staged_content = NULL;
 	}
-	free(args);
-	if (ds) {
-		got_diff_state_free(ds);
-		free(ds);
-	}
-	if (changes)
-		got_diff_free_changes(changes);
 	free(path1);
 	free(path2);
 	return err;
blob - 4c63c9313c4b48595f5494758ae00d53b19995d2
blob + 529ce0b4748441cedafca8750001367158498163
--- regress/cmdline/stage.sh
+++ regress/cmdline/stage.sh
@@ -1799,7 +1799,7 @@ EOF
 
 	cat > $testroot/stdout.expected <<EOF
 -----------------------------------------------
-@@ -1,5 +1,5 @@ b
+@@ -1,5 +1,5 @@
  1
 -2
 +a
blob - 41af3bef58e4bdab819c0a659369ee2a3c2f2e2f
blob + 0c38c99e81144fe6d7e8fc5915eab0302808bfc3
--- tog/Makefile
+++ tog/Makefile
@@ -9,7 +9,10 @@ SRCS=		tog.c blame.c commit_graph.c delta.c diff.c \
 		privsep.c reference.c repository.c sha1.c worktree.c \
 		utf8.c inflate.c buf.c rcsutil.c diff3.c \
 		lockfile.c deflate.c object_create.c delta_cache.c \
-		gotconfig.c
+		gotconfig.c diff_main.c diff_atomize_text.c \
+		diff_myers.c diff_output.c diff_output_plain.c \
+		diff_output_unidiff.c diff_output_edscript.c \
+		diff_patience.c
 MAN =		${PROG}.1
 
 CPPFLAGS = -I${.CURDIR}/../include -I${.CURDIR}/../lib
blob - 1b0d1ec057af633bce4d306c98106dd9eb8eedb2
blob + 5c342d04532bc0afd259f34ddbde2e17205b733a
--- tog/tog.c
+++ tog/tog.c
@@ -252,11 +252,10 @@ struct tog_diff_view_state {
 	struct got_repository *repo;
 	struct got_reflist_head *refs;
 	struct tog_colors colors;
-	int nlines;
+	size_t nlines;
 	off_t *line_offsets;
 	int matched_line;
 	int selected_line;
-	size_t filesize;
 
 	/* passed from log view; may be NULL */
 	struct tog_view *log_view;
@@ -2796,19 +2795,23 @@ match_color(struct tog_colors *colors, const char *lin
 }
 
 static const struct got_error *
-draw_file(struct tog_view *view, FILE *f, int *first_displayed_line, int nlines,
-    int selected_line, int max_lines, int *last_displayed_line, int *eof,
-    char *header, struct tog_colors *colors)
+draw_file(struct tog_view *view, FILE *f, int first_displayed_line, int nlines,
+    off_t *line_offsets, int selected_line, int max_lines,
+    int *last_displayed_line, int *eof, char *header, struct tog_colors *colors)
 {
 	const struct got_error *err;
-	int lineno = 0, nprinted = 0;
+	int nprinted = 0;
 	char *line;
 	struct tog_color *tc;
 	size_t len;
 	wchar_t *wline;
 	int width;
+	off_t line_offset;
 
-	rewind(f);
+	line_offset = line_offsets[first_displayed_line - 1];
+	if (fseeko(f, line_offset, SEEK_SET) == -1)
+		return got_error_from_errno("fseek");
+
 	werase(view->window);
 
 	if (header) {
@@ -2831,17 +2834,12 @@ draw_file(struct tog_view *view, FILE *f, int *first_d
 	}
 
 	*eof = 0;
-	while (nprinted < max_lines) {
+	while (max_lines > 0 && nprinted < max_lines) {
 		line = parse_next_line(f, &len);
 		if (line == NULL) {
 			*eof = 1;
 			break;
 		}
-		if (++lineno < *first_displayed_line) {
-			free(line);
-			continue;
-		}
-
 		err = format_line(&wline, &width, line, view->ncols, 0);
 		if (err) {
 			free(line);
@@ -2858,13 +2856,15 @@ draw_file(struct tog_view *view, FILE *f, int *first_d
 			    COLOR_PAIR(tc->colorpair), NULL);
 		if (width <= view->ncols - 1)
 			waddch(view->window, '\n');
-		if (++nprinted == 1)
-			*first_displayed_line = lineno;
+		nprinted++;
 		free(line);
 		free(wline);
 		wline = NULL;
 	}
-	*last_displayed_line = lineno;
+	if (nprinted >= 1)
+		*last_displayed_line = first_displayed_line + (nprinted - 1);
+	else
+		*last_displayed_line = first_displayed_line;
 
 	view_vborder(view);
 
@@ -2949,18 +2949,35 @@ done:
 }
 
 static const struct got_error *
-write_commit_info(struct got_object_id *commit_id,
-    struct got_reflist_head *refs, struct got_repository *repo, FILE *outfile)
+add_line_offset(off_t **line_offsets, size_t *nlines, off_t off)
+{
+	off_t *p;
+
+	p = reallocarray(*line_offsets, *nlines + 1, sizeof(off_t));
+	if (p == NULL)
+		return got_error_from_errno("reallocarray");
+	*line_offsets = p;
+	(*line_offsets)[*nlines] = off;
+	(*nlines)++;
+	return NULL;
+}
+
+static const struct got_error *
+write_commit_info(off_t **line_offsets, size_t *nlines,
+    struct got_object_id *commit_id, struct got_reflist_head *refs,
+    struct got_repository *repo, FILE *outfile)
 {
 	const struct got_error *err = NULL;
 	char datebuf[26], *datestr;
 	struct got_commit_object *commit;
-	char *id_str = NULL, *logmsg = NULL;
+	char *id_str = NULL, *logmsg = NULL, *s = NULL, *line;
 	time_t committer_time;
 	const char *author, *committer;
 	char *refs_str = NULL;
 	struct got_pathlist_head changed_paths;
 	struct got_pathlist_entry *pe;
+	off_t outoff = 0;
+	int n;
 
 	TAILQ_INIT(&changed_paths);
 
@@ -2980,152 +2997,107 @@ write_commit_info(struct got_object_id *commit_id,
 		goto done;
 	}
 
-	if (fprintf(outfile, "commit %s%s%s%s\n", id_str, refs_str ? " (" : "",
-	    refs_str ? refs_str : "", refs_str ? ")" : "") < 0) {
+	err = add_line_offset(line_offsets, nlines, 0);
+	if (err)
+		goto done;
+
+	n = fprintf(outfile, "commit %s%s%s%s\n", id_str, refs_str ? " (" : "",
+	    refs_str ? refs_str : "", refs_str ? ")" : "");
+	if (n < 0) {
 		err = got_error_from_errno("fprintf");
 		goto done;
 	}
-	if (fprintf(outfile, "from: %s\n",
-	    got_object_commit_get_author(commit)) < 0) {
+	outoff += n;
+	err = add_line_offset(line_offsets, nlines, outoff);
+	if (err)
+		goto done;
+
+	n = fprintf(outfile, "from: %s\n",
+	    got_object_commit_get_author(commit));
+	if (n < 0) {
 		err = got_error_from_errno("fprintf");
 		goto done;
 	}
+	outoff += n;
+	err = add_line_offset(line_offsets, nlines, outoff);
+	if (err)
+		goto done;
+
 	committer_time = got_object_commit_get_committer_time(commit);
 	datestr = get_datestr(&committer_time, datebuf);
-	if (datestr && fprintf(outfile, "date: %s UTC\n", datestr) < 0) {
-		err = got_error_from_errno("fprintf");
-		goto done;
+	if (datestr) {
+		n = fprintf(outfile, "date: %s UTC\n", datestr);
+		if (n < 0) {
+			err = got_error_from_errno("fprintf");
+			goto done;
+		}
+		outoff += n;
+		err = add_line_offset(line_offsets, nlines, outoff);
+		if (err)
+			goto done;
 	}
 	author = got_object_commit_get_author(commit);
 	committer = got_object_commit_get_committer(commit);
-	if (strcmp(author, committer) != 0 &&
-	    fprintf(outfile, "via: %s\n", committer) < 0) {
-		err = got_error_from_errno("fprintf");
-		goto done;
+	if (strcmp(author, committer) != 0) {
+		n = fprintf(outfile, "via: %s\n", committer);
+		if (n < 0) {
+			err = got_error_from_errno("fprintf");
+			goto done;
+		}
+		outoff += n;
+		err = add_line_offset(line_offsets, nlines, outoff);
+		if (err)
+			goto done;
 	}
 	err = got_object_commit_get_logmsg(&logmsg, commit);
 	if (err)
 		goto done;
-	if (fprintf(outfile, "%s\n", logmsg) < 0) {
-		err = got_error_from_errno("fprintf");
-		goto done;
+	s = logmsg;
+	while ((line = strsep(&s, "\n")) != NULL) {
+		n = fprintf(outfile, "%s\n", line);
+		if (n < 0) {
+			err = got_error_from_errno("fprintf");
+			goto done;
+		}
+		outoff += n;
+		err = add_line_offset(line_offsets, nlines, outoff);
+		if (err)
+			goto done;
 	}
+
 	err = get_changed_paths(&changed_paths, commit, repo);
 	if (err)
 		goto done;
 	TAILQ_FOREACH(pe, &changed_paths, entry) {
 		struct got_diff_changed_path *cp = pe->data;
-		fprintf(outfile, "%c  %s\n", cp->status, pe->path);
+		n = fprintf(outfile, "%c  %s\n", cp->status, pe->path);
+		if (n < 0) {
+			err = got_error_from_errno("fprintf");
+			goto done;
+		}
+		outoff += n;
+		err = add_line_offset(line_offsets, nlines, outoff);
+		if (err)
+			goto done;
 		free((char *)pe->path);
 		free(pe->data);
 	}
+
 	fputc('\n', outfile);
+	outoff++;
+	err = add_line_offset(line_offsets, nlines, outoff);
 done:
 	got_pathlist_free(&changed_paths);
 	free(id_str);
 	free(logmsg);
 	free(refs_str);
 	got_object_commit_close(commit);
-	return err;
-}
-
-const struct got_error *
-get_filestream_info(size_t *filesize, int *nlines, off_t **line_offsets,
-    FILE *infile)
-{
-	const struct got_error *err = NULL;
-	size_t len, remain;
-	char buf[32768];
-	int i;
-	size_t nalloc = 0;
-	off_t off = 0;
-
-	*line_offsets = NULL;
-	*filesize = 0;
-	*nlines = 0;
-
-	if (fseek(infile, 0, SEEK_END) == -1)
-		return got_error_from_errno("fseek");
-	len = ftell(infile) + 1;
-	if (ferror(infile))
-		return got_error_from_errno("ftell");
-	if (fseek(infile, 0, SEEK_SET) == -1)
-		return got_error_from_errno("fseek");
-
-	if (len == 0)
-		return NULL;
-
-	remain = len;
-	while (remain > 0) {
-		size_t r, n = MIN(remain, sizeof(buf));
-		r = fread(buf, 1, n, infile);
-		if (r == 0) {
-			if (ferror(infile)) {
-				err = got_error_from_errno("fread");
-				goto done;
-			}
-			break;
-		}
-		i = 0;
-		remain -= r;
-
-		if (*line_offsets == NULL) {
-			/* Have some data but perhaps no '\n'. */
-			*nlines = 1;
-			nalloc = len / 40; /* 40-char average line length */
-			*line_offsets = calloc(nalloc, sizeof(**line_offsets));
-			if (*line_offsets == NULL) {
-				err = got_error_from_errno("calloc");
-				goto done;
-			}
-				/* Skip forward over end of first line. */
-			while (i < len) {
-				if (buf[i] == '\n')
-					break;
-				i++;
-			}
-		}
-
-		/* Scan '\n' offsets in remaining chunk of data. */
-		while (i < r) {
-			if (buf[i] != '\n') {
-				i++;
-				continue;
-			}
-			(*nlines)++;
-			if (nalloc < *nlines) {
-				size_t nallocnew = *nlines + (remain / 40);
-				off_t *o = recallocarray(*line_offsets,
-				    nalloc, nallocnew, sizeof(**line_offsets));
-				if (o == NULL) {
-					err = got_error_from_errno(
-					    "recallocarray");
-					goto done;
-				}
-				*line_offsets = o;
-				nalloc = nallocnew;
-			}
-			off = i + 1;
-			(*line_offsets)[*nlines - 1] = off;
-			i++;
-		}
-	}
-
-	if (fflush(infile) != 0) {
-		err = got_error_from_errno("fflush");
-		goto done;
-	}
-	rewind(infile);
-
-	*filesize = len;
-done:
 	if (err) {
 		free(*line_offsets);
 		*line_offsets = NULL;
-		*filesize = 0;
 		*nlines = 0;
 	}
-	return NULL;
+	return err;
 }
 
 static const struct got_error *
@@ -3135,6 +3107,12 @@ create_diff(struct tog_diff_view_state *s)
 	FILE *f = NULL;
 	int obj_type;
 
+	free(s->line_offsets);
+	s->line_offsets = malloc(sizeof(off_t));
+	if (s->line_offsets == NULL)
+		return got_error_from_errno("malloc");
+	s->nlines = 0;
+
 	f = got_opentemp();
 	if (f == NULL) {
 		err = got_error_from_errno("got_opentemp");
@@ -3155,12 +3133,13 @@ create_diff(struct tog_diff_view_state *s)
 
 	switch (obj_type) {
 	case GOT_OBJ_TYPE_BLOB:
-		err = got_diff_objects_as_blobs(s->id1, s->id2, NULL, NULL,
-		    s->diff_context, 0, s->repo, s->f);
+		err = got_diff_objects_as_blobs(&s->line_offsets, &s->nlines,
+		    s->id1, s->id2, NULL, NULL, s->diff_context, 0,
+		    s->repo, s->f);
 		break;
 	case GOT_OBJ_TYPE_TREE:
-		err = got_diff_objects_as_trees(s->id1, s->id2, "", "",
-		    s->diff_context, 0, s->repo, s->f);
+		err = got_diff_objects_as_trees(&s->line_offsets, &s->nlines,
+		    s->id1, s->id2, "", "", s->diff_context, 0, s->repo, s->f);
 		break;
 	case GOT_OBJ_TYPE_COMMIT: {
 		const struct got_object_id_queue *parent_ids;
@@ -3172,15 +3151,17 @@ create_diff(struct tog_diff_view_state *s)
 			goto done;
 		/* Show commit info if we're diffing to a parent/root commit. */
 		if (s->id1 == NULL) {
-			err = write_commit_info(s->id2, s->refs, s->repo, s->f);
+			err = write_commit_info(&s->line_offsets, &s->nlines,
+			    s->id2, s->refs, s->repo, s->f);
 			if (err)
 				goto done;
 		} else {
 			parent_ids = got_object_commit_get_parent_ids(commit2);
 			SIMPLEQ_FOREACH(pid, parent_ids, entry) {
 				if (got_object_id_cmp(s->id1, pid->id) == 0) {
-					err = write_commit_info(s->id2, s->refs,
-					    s->repo, s->f);
+					err = write_commit_info(
+					    &s->line_offsets, &s->nlines,
+					    s->id2, s->refs, s->repo, s->f);
 					if (err)
 						goto done;
 					break;
@@ -3189,8 +3170,8 @@ create_diff(struct tog_diff_view_state *s)
 		}
 		got_object_commit_close(commit2);
 
-		err = got_diff_objects_as_commits(s->id1, s->id2,
-		    s->diff_context, 0, s->repo, s->f);
+		err = got_diff_objects_as_commits(&s->line_offsets, &s->nlines,
+		    s->id1, s->id2, s->diff_context, 0, s->repo, s->f);
 		break;
 	}
 	default:
@@ -3199,8 +3180,6 @@ create_diff(struct tog_diff_view_state *s)
 	}
 	if (err)
 		goto done;
-	err = get_filestream_info(&s->filesize, &s->nlines, &s->line_offsets,
-	    s->f);
 done:
 	if (s->f && fflush(s->f) != 0 && err == NULL)
 		err = got_error_from_errno("fflush");
@@ -3394,6 +3373,8 @@ open_diff_view(struct tog_view *view, struct got_objec
 		show_log_view(log_view); /* draw vborder */
 	diff_view_indicate_progress(view);
 
+	s->line_offsets = NULL;
+	s->nlines = 0;
 	err = create_diff(s);
 	if (err) {
 		free(s->id1);
@@ -3426,6 +3407,8 @@ close_diff_view(struct tog_view *view)
 		err = got_error_from_errno("fclose");
 	free_colors(&s->colors);
 	free(s->line_offsets);
+	s->line_offsets = NULL;
+	s->nlines = 0;
 	return err;
 }
 
@@ -3455,9 +3438,9 @@ show_diff_view(struct tog_view *view)
 	free(id_str1);
 	free(id_str2);
 
-	return draw_file(view, s->f, &s->first_displayed_line, s->nlines,
-	    s->selected_line, view->nlines, &s->last_displayed_line, &s->eof,
-	    header, &s->colors);
+	return draw_file(view, s->f, s->first_displayed_line, s->nlines,
+	    s->line_offsets, s->selected_line, view->nlines,
+	    &s->last_displayed_line, &s->eof, header, &s->colors);
 }
 
 static const struct got_error *