"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
"Omar Polo" <op@omarpolo.com>
Subject:
Re: improve the binary file checks
To:
Stefan Sperling <stsp@stsp.name>
Cc:
gameoftrees@openbsd.org
Date:
Thu, 31 Jul 2025 14:17:19 +0200

Download raw body.

Thread
Stefan Sperling <stsp@stsp.name> wrote:
> On Thu, Jul 31, 2025 at 12:02:02PM +0200, Omar Polo wrote:
> > jtt found out that for some "binary files" we still try to print the
> > diff.  These "binary files" are gpg encrypted files, but luckily
> > (weirdly?) enough they don't have any NUL byte, yet they have a lot of
> > other control characters.
> > 
> > This diff attempts at being better at caching binary files.  any control
> > character, except our friends tab characters, marks the file as binary.
> > 
> > regress passes, but i'm not sure i'm actually liking this diff.  should
> > i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check?
> > 
> > thoughts?
> 
> If this helps with .gpg files then I don't see why not.
> I cannot think of a better solution.
> 
> Could you use && !isspace() instead of only checking for tab characters?

yes, i can :)

if ok i should commit diff_atomize_text.c to diff.git before and then
sync, right?

diff /home/op/w/got
path + /home/op/w/got
commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
blob - 32023105af9438217a65a1bf6b821e4d225516d1
file + lib/diff_atomize_text.c
--- lib/diff_atomize_text.c
+++ lib/diff_atomize_text.c
@@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 	unsigned int array_size_estimate = d->len / 50;
 	unsigned int pow2 = 1;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
-	bool embedded_nul = false;
+	bool isbinary = false;
 
 	while (array_size_estimate >>= 1)
 		pow2++;
@@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 					    || !isspace((unsigned char)buf[i]))
 						hash = diff_atom_hash_update(
 						    hash, buf[i]);
-					if (buf[i] == '\0')
-						embedded_nul = true;
+					if (iscntrl((unsigned char)buf[i]) &&
+					    !isspace((unsigned char)buf[i]))
+						isbinary = true;
 					line_end++;
 				} else
 					eol = buf[i];
@@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
 			return errno;
 	}
 
-	/* File are considered binary if they contain embedded '\0' bytes. */
-	if (embedded_nul)
+	/* File are considered binary if they contain control bytes. */
+	if (isbinary)
 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 
 	return DIFF_RC_OK;
@@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 	const uint8_t *pos = d->data;
 	const uint8_t *end = pos + d->len;
 	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
-	bool embedded_nul = false;
+	bool isbinary = false;
 	unsigned int array_size_estimate = d->len / 50;
 	unsigned int pow2 = 1;
 	while (array_size_estimate >>= 1)
@@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 			if (!ignore_whitespace
 			    || !isspace((unsigned char)*line_end))
 				hash = diff_atom_hash_update(hash, *line_end);
-			if (*line_end == '\0')
-				embedded_nul = true;
+			if (iscntrl((unsigned char)*line_end) &&
+			    !isspace((unsigned char)*line_end))
+				isbinary = true;
 			line_end++;
 		}
 
@@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
 		pos = line_end;
 	}
 
-	/* File are considered binary if they contain embedded '\0' bytes. */
-	if (embedded_nul)
+	/* File are considered binary if they contain embedded control bytes. */
+	if (isbinary)
 		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
 
 	return DIFF_RC_OK;
commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
blob - b58b9970c4f4924c6663e2de3b426bcf71082141
file + lib/object.c
--- lib/object.c
+++ lib/object.c
@@ -21,6 +21,7 @@
 #include <sys/uio.h>
 #include <sys/mman.h>
 
+#include <ctype.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <stdio.h>
@@ -374,7 +375,7 @@ const struct got_error *
 got_object_blob_is_binary(int *binary, struct got_blob_object *blob)
 {
 	const struct got_error *err;
-	size_t hdrlen, len;
+	size_t hdrlen, len, i;
 
 	*binary = 0;
 	hdrlen = got_object_blob_get_hdrlen(blob);
@@ -386,7 +387,13 @@ got_object_blob_is_binary(int *binary, struct got_blob
 	if (err)
 		return err;
 
-	*binary = memchr(blob->read_buf, '\0', len) != NULL;
+	for (i = 0; i < len; ++i) {
+		if (iscntrl((unsigned char)blob->read_buf[i]) &&
+		    !isspace((unsigned char)blob->read_buf[i])) {
+			*binary = 1;
+			break;
+		}
+	}
 
 	if (fseeko(blob->f, hdrlen, SEEK_SET) == -1)
 		return got_error_from_errno("fseeko");