"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Dag-Erling Smørgrav <des@FreeBSD.org>
Subject:
diff: detect input file truncation
To:
gameoftrees@openbsd.org
Date:
Thu, 23 May 2024 05:25:07 +0200

Download raw body.

Thread
A downstream consumer of FreeBSD's GoT-derived diff implementation
reported experiencing SIGBUS crashes in diff.  They didn't provide much
context, but from the stack traces I was given the crash appears to
occur while atomizing mmapped files, and I was able to reproduce it with
a test program that creates and mmaps a file, then truncates it before
calling diff_atomize_file() with the original length.  The atomizer then
crashes when it reaches the end of the page that contains the new EOF.
This turned out to be easy enough to work around, so here are the
patches if you want them.  A regression test can be found here:

https://cgit.freebsd.org/src/tree/lib/libdiff/tests/libdiff_test.c

Note that pos must be volatile to prevent the compiler from assuming
that it is unchanged when sigsetjmp() returns non-zero, so I added
line_start as an optimizable copy of pos inside the loop.  I'm wondering
if this could have been done differently, e.g. by replacing `end = pos`
with `end = *(const uint8_t * volatile *)&pos` so pos itself doesn't
have to be volatile.

DES
-- 
Dag-Erling Smørgrav - des@FreeBSD.org

From f721104def0a3971dfea7616a209d6df4149f307 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= <des@des.no>
Date: Thu, 23 May 2024 05:01:47 +0200
Subject: [PATCH 1/2] Detect and recover from file truncation.

If a memory-mapped file is truncated before we get to the end, the
atomizer may catch SIGBUS.  Detect that, reduce the input length to
what we were actually able to read, and set a flag so the caller can
take further action (e.g. warn the user and / or start over).
---
 include/diff_main.h     |  1 +
 lib/diff_atomize_text.c | 35 ++++++++++++++++++++++++++++++-----
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/include/diff_main.h b/include/diff_main.h
index 04a6c6e..1170058 100644
--- a/include/diff_main.h
+++ b/include/diff_main.h
@@ -118,6 +118,7 @@ struct diff_data {
 
 /* Flags set by file atomizer. */
 #define DIFF_ATOMIZER_FOUND_BINARY_DATA	0x00000001
+#define DIFF_ATOMIZER_FILE_TRUNCATED	0x00000002
 
 /* Flags set by caller of diff_main(). */
 #define DIFF_FLAG_IGNORE_WHITESPACE	0x00000001
diff --git a/lib/diff_atomize_text.c b/lib/diff_atomize_text.c
index 3202310..d8a6973 100644
--- a/lib/diff_atomize_text.c
+++ b/lib/diff_atomize_text.c
@@ -16,6 +16,8 @@
  */
 
 #include <errno.h>
+#include <setjmp.h>
+#include <signal.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -122,10 +124,18 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 	return DIFF_RC_OK;
 }
 
+static sigjmp_buf diff_data_signal_env;
+static void
+diff_data_signal_handler(int sig)
+{
+	siglongjmp(diff_data_signal_env, sig);
+}
+
 static int
 diff_data_atomize_text_lines_mmap(struct diff_data *d)
 {
-	const uint8_t *pos = d->data;
+	struct sigaction act, oact;
+	const uint8_t *volatile pos = d->data;
 	const uint8_t *end = pos + d->len;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
 	bool embedded_nul = false;
@@ -136,8 +146,22 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 
 	ARRAYLIST_INIT(d->atoms, 1 << pow2);
 
+	sigemptyset(&act.sa_mask);
+	act.sa_flags = 0;
+	act.sa_handler = diff_data_signal_handler;
+	sigaction(SIGBUS, &act, &oact);
+	if (sigsetjmp(diff_data_signal_env, 0) > 0) {
+		/*
+		 * The file was truncated while we were reading it.  Set
+		 * the end pointer to the beginning of the line we were
+		 * trying to read, adjust the file length, and set a flag.
+		 */
+		end = pos;
+		d->len = end - d->data;
+		d->atomizer_flags |= DIFF_ATOMIZER_FILE_TRUNCATED;
+	}
 	while (pos < end) {
-		const uint8_t *line_end = pos;
+		const uint8_t *line_start = pos, *line_end = pos;
 		unsigned int hash = 0;
 
 		while (line_end < end && *line_end != '\r' && *line_end != '\n') {
@@ -164,15 +188,16 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 
 		*atom = (struct diff_atom){
 			.root = d,
-			.pos = (off_t)(pos - d->data),
-			.at = pos,
-			.len = line_end - pos,
+			.pos = (off_t)(line_start - d->data),
+			.at = line_start,
+			.len = line_end - line_start,
 			.hash = hash,
 		};
 
 		/* Starting point for next line: */
 		pos = line_end;
 	}
+	sigaction(SIGBUS, &oact, NULL);
 
 	/* File are considered binary if they contain embedded '\0' bytes. */
 	if (embedded_nul)
-- 
2.45.1

From 31eed98fe9f5d69695d9b5bc2702cbb0c78af0e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dag-Erling=20Sm=C3=B8rgrav?= <des@des.no>
Date: Thu, 23 May 2024 05:02:15 +0200
Subject: [PATCH 2/2] Warn if the atomizer detected truncation.

---
 diff/diff.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/diff/diff.c b/diff/diff.c
index eded416..119ca79 100644
--- a/diff/diff.c
+++ b/diff/diff.c
@@ -229,9 +229,13 @@ diffreg(char *file1, char *file2, enum diffreg_algo algo, bool force_text,
 	rc = diff_atomize_file(&left, cfg, f1, str1, st1.st_size, diff_flags);
 	if (rc)
 		goto done;
+	if (left.atomizer_flags & DIFF_ATOMIZER_FILE_TRUNCATED)
+		warnx("%s truncated", file1);
 	rc = diff_atomize_file(&right, cfg, f2, str2, st2.st_size, diff_flags);
 	if (rc)
 		goto done;
+	if (right.atomizer_flags & DIFF_ATOMIZER_FILE_TRUNCATED)
+		warnx("%s truncated", file2);
 
 	result = diff_main(cfg, &left, &right);
 #if 0
-- 
2.45.1