"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Stefan Sperling <stsp@stsp.name>
Subject:
Re: improve the binary file checks
To:
Omar Polo <op@omarpolo.com>
Cc:
gameoftrees@openbsd.org
Date:
Thu, 31 Jul 2025 12:37:03 +0200

Download raw body.

Thread
On Thu, Jul 31, 2025 at 12:02:02PM +0200, Omar Polo wrote:
> jtt found out that for some "binary files" we still try to print the
> diff.  These "binary files" are gpg encrypted files, but luckily
> (weirdly?) enough they don't have any NUL byte, yet they have a lot of
> other control characters.
> 
> This diff attempts at being better at caching binary files.  any control
> character, except our friends tab characters, marks the file as binary.
> 
> regress passes, but i'm not sure i'm actually liking this diff.  should
> i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check?
> 
> thoughts?

If this helps with .gpg files then I don't see why not.
I cannot think of a better solution.

Could you use && !isspace() instead of only checking for tab characters?


> diff /home/op/w/got
> path + /home/op/w/got
> commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
> blob - 32023105af9438217a65a1bf6b821e4d225516d1
> file + lib/diff_atomize_text.c
> --- lib/diff_atomize_text.c
> +++ lib/diff_atomize_text.c
> @@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
>  	unsigned int array_size_estimate = d->len / 50;
>  	unsigned int pow2 = 1;
>  	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
> -	bool embedded_nul = false;
> +	bool isbinary = false;
>  
>  	while (array_size_estimate >>= 1)
>  		pow2++;
> @@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
>  					    || !isspace((unsigned char)buf[i]))
>  						hash = diff_atom_hash_update(
>  						    hash, buf[i]);
> -					if (buf[i] == '\0')
> -						embedded_nul = true;
> +					if (iscntrl((unsigned char)buf[i]) &&
> +					    buf[i] != '\t' && buf[i] != '\v')
> +						isbinary = true;
>  					line_end++;
>  				} else
>  					eol = buf[i];
> @@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
>  			return errno;
>  	}
>  
> -	/* File are considered binary if they contain embedded '\0' bytes. */
> -	if (embedded_nul)
> +	/* File are considered binary if they contain control bytes. */
> +	if (isbinary)
>  		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
>  
>  	return DIFF_RC_OK;
> @@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
>  	const uint8_t *pos = d->data;
>  	const uint8_t *end = pos + d->len;
>  	bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
> -	bool embedded_nul = false;
> +	bool isbinary = false;
>  	unsigned int array_size_estimate = d->len / 50;
>  	unsigned int pow2 = 1;
>  	while (array_size_estimate >>= 1)
> @@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
>  			if (!ignore_whitespace
>  			    || !isspace((unsigned char)*line_end))
>  				hash = diff_atom_hash_update(hash, *line_end);
> -			if (*line_end == '\0')
> -				embedded_nul = true;
> +			if (iscntrl((unsigned char)*line_end) &&
> +			    *line_end != '\t' && *line_end != '\v')
> +				isbinary = true;
>  			line_end++;
>  		}
>  
> @@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
>  		pos = line_end;
>  	}
>  
> -	/* File are considered binary if they contain embedded '\0' bytes. */
> -	if (embedded_nul)
> +	/* File are considered binary if they contain embedded control bytes. */
> +	if (isbinary)
>  		d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
>  
>  	return DIFF_RC_OK;
> commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
> blob - b58b9970c4f4924c6663e2de3b426bcf71082141
> file + lib/object.c
> --- lib/object.c
> +++ lib/object.c
> @@ -21,6 +21,7 @@
>  #include <sys/uio.h>
>  #include <sys/mman.h>
>  
> +#include <ctype.h>
>  #include <errno.h>
>  #include <fcntl.h>
>  #include <stdio.h>
> @@ -374,7 +375,7 @@ const struct got_error *
>  got_object_blob_is_binary(int *binary, struct got_blob_object *blob)
>  {
>  	const struct got_error *err;
> -	size_t hdrlen, len;
> +	size_t hdrlen, len, i;
>  
>  	*binary = 0;
>  	hdrlen = got_object_blob_get_hdrlen(blob);
> @@ -386,7 +387,14 @@ got_object_blob_is_binary(int *binary, struct got_blob
>  	if (err)
>  		return err;
>  
> -	*binary = memchr(blob->read_buf, '\0', len) != NULL;
> +	for (i = 0; i < len; ++i) {
> +		if (iscntrl((unsigned char)blob->read_buf[i]) &&
> +		    blob->read_buf[i] != '\t' &&
> +		    blob->read_buf[i] != '\v') {
> +			*binary = 1;
> +			break;
> +		}
> +	}
>  
>  	if (fseeko(blob->f, hdrlen, SEEK_SET) == -1)
>  		return got_error_from_errno("fseeko");
> 
>