"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
"Omar Polo" <op@omarpolo.com>
Subject:
improve the binary file checks
To:
gameoftrees@openbsd.org
Date:
Thu, 31 Jul 2025 12:02:02 +0200

Download raw body.

Thread
jtt found out that for some "binary files" we still try to print the
diff.  These "binary files" are gpg encrypted files, but luckily
(weirdly?) enough they don't have any NUL byte, yet they have a lot of
other control characters.

This diff attempts at being better at caching binary files.  any control
character, except our friends tab characters, marks the file as binary.

regress passes, but i'm not sure i'm actually liking this diff.  should
i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check?

thoughts?

diff /home/op/w/got
path + /home/op/w/got
commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
blob - 32023105af9438217a65a1bf6b821e4d225516d1
file + lib/diff_atomize_text.c
--- lib/diff_atomize_text.c
+++ lib/diff_atomize_text.c
@@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 	unsigned int array_size_estimate = d->len / 50;
 	unsigned int pow2 = 1;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
-	bool embedded_nul = false;
+	bool isbinary = false;
 
 	while (array_size_estimate >>= 1)
 		pow2++;
@@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 					    || !isspace((unsigned char)buf[i]))
 						hash = diff_atom_hash_update(
 						    hash, buf[i]);
-					if (buf[i] == '\0')
-						embedded_nul = true;
+					if (iscntrl((unsigned char)buf[i]) &&
+					    buf[i] != '\t' && buf[i] != '\v')
+						isbinary = true;
 					line_end++;
 				} else
 					eol = buf[i];
@@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 			return errno;
 	}
 
-	/* File are considered binary if they contain embedded '\0' bytes. */
-	if (embedded_nul)
+	/* File are considered binary if they contain control bytes. */
+	if (isbinary)
 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 
 	return DIFF_RC_OK;
@@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 	const uint8_t *pos = d->data;
 	const uint8_t *end = pos + d->len;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
-	bool embedded_nul = false;
+	bool isbinary = false;
 	unsigned int array_size_estimate = d->len / 50;
 	unsigned int pow2 = 1;
 	while (array_size_estimate >>= 1)
@@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 			if (!ignore_whitespace
 			    || !isspace((unsigned char)*line_end))
 				hash = diff_atom_hash_update(hash, *line_end);
-			if (*line_end == '\0')
-				embedded_nul = true;
+			if (iscntrl((unsigned char)*line_end) &&
+			    *line_end != '\t' && *line_end != '\v')
+				isbinary = true;
 			line_end++;
 		}
 
@@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 		pos = line_end;
 	}
 
-	/* File are considered binary if they contain embedded '\0' bytes. */
-	if (embedded_nul)
+	/* File are considered binary if they contain embedded control bytes. */
+	if (isbinary)
 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 
 	return DIFF_RC_OK;
commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
blob - b58b9970c4f4924c6663e2de3b426bcf71082141
file + lib/object.c
--- lib/object.c
+++ lib/object.c
@@ -21,6 +21,7 @@
 #include <sys/uio.h>
 #include <sys/mman.h>
 
+#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
@@ -374,7 +375,7 @@ const struct got_error *
 got_object_blob_is_binary(int *binary, struct got_blob_object *blob)
 {
 	const struct got_error *err;
-	size_t hdrlen, len;
+	size_t hdrlen, len, i;
 
 	*binary = 0;
 	hdrlen = got_object_blob_get_hdrlen(blob);
@@ -386,7 +387,14 @@ got_object_blob_is_binary(int *binary, struct got_blob
 	if (err)
 		return err;
 
-	*binary = memchr(blob->read_buf, '\0', len) != NULL;
+	for (i = 0; i < len; ++i) {
+		if (iscntrl((unsigned char)blob->read_buf[i]) &&
+		    blob->read_buf[i] != '\t' &&
+		    blob->read_buf[i] != '\v') {
+			*binary = 1;
+			break;
+		}
+	}
 
 	if (fseeko(blob->f, hdrlen, SEEK_SET) == -1)
 		return got_error_from_errno("fseeko");