Download raw body.
improve the binary file checks
On Thu, Jul 31, 2025 at 12:02:02PM +0200, Omar Polo wrote:
> jtt found out that for some "binary files" we still try to print the
> diff. These "binary files" are gpg encrypted files, but luckily
> (weirdly?) enough they don't have any NUL byte, yet they have a lot of
> other control characters.
>
> This diff attempts at being better at caching binary files. any control
> character, except our friends tab characters, marks the file as binary.
>
> regress passes, but i'm not sure i'm actually liking this diff. should
> i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check?
>
> thoughts?
If this helps with .gpg files then I don't see why not.
I cannot think of a better solution.
Could you use && !isspace() instead of only checking for tab characters?
> diff /home/op/w/got
> path + /home/op/w/got
> commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
> blob - 32023105af9438217a65a1bf6b821e4d225516d1
> file + lib/diff_atomize_text.c
> --- lib/diff_atomize_text.c
> +++ lib/diff_atomize_text.c
> @@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
> unsigned int array_size_estimate = d->len / 50;
> unsigned int pow2 = 1;
> bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
> - bool embedded_nul = false;
> + bool isbinary = false;
>
> while (array_size_estimate >>= 1)
> pow2++;
> @@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
> || !isspace((unsigned char)buf[i]))
> hash = diff_atom_hash_update(
> hash, buf[i]);
> - if (buf[i] == '\0')
> - embedded_nul = true;
> + if (iscntrl((unsigned char)buf[i]) &&
> + buf[i] != '\t' && buf[i] != '\v')
> + isbinary = true;
> line_end++;
> } else
> eol = buf[i];
> @@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
> return errno;
> }
>
> - /* File are considered binary if they contain embedded '\0' bytes. */
> - if (embedded_nul)
> + /* File are considered binary if they contain control bytes. */
> + if (isbinary)
> d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
>
> return DIFF_RC_OK;
> @@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
> const uint8_t *pos = d->data;
> const uint8_t *end = pos + d->len;
> bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
> - bool embedded_nul = false;
> + bool isbinary = false;
> unsigned int array_size_estimate = d->len / 50;
> unsigned int pow2 = 1;
> while (array_size_estimate >>= 1)
> @@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
> if (!ignore_whitespace
> || !isspace((unsigned char)*line_end))
> hash = diff_atom_hash_update(hash, *line_end);
> - if (*line_end == '\0')
> - embedded_nul = true;
> + if (iscntrl((unsigned char)*line_end) &&
> + *line_end != '\t' && *line_end != '\v')
> + isbinary = true;
> line_end++;
> }
>
> @@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
> pos = line_end;
> }
>
> - /* File are considered binary if they contain embedded '\0' bytes. */
> - if (embedded_nul)
> + /* File are considered binary if they contain embedded control bytes. */
> + if (isbinary)
> d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
>
> return DIFF_RC_OK;
> commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
> blob - b58b9970c4f4924c6663e2de3b426bcf71082141
> file + lib/object.c
> --- lib/object.c
> +++ lib/object.c
> @@ -21,6 +21,7 @@
> #include <sys/uio.h>
> #include <sys/mman.h>
>
> +#include <ctype.h>
> #include <errno.h>
> #include <fcntl.h>
> #include <stdio.h>
> @@ -374,7 +375,7 @@ const struct got_error *
> got_object_blob_is_binary(int *binary, struct got_blob_object *blob)
> {
> const struct got_error *err;
> - size_t hdrlen, len;
> + size_t hdrlen, len, i;
>
> *binary = 0;
> hdrlen = got_object_blob_get_hdrlen(blob);
> @@ -386,7 +387,14 @@ got_object_blob_is_binary(int *binary, struct got_blob
> if (err)
> return err;
>
> - *binary = memchr(blob->read_buf, '\0', len) != NULL;
> + for (i = 0; i < len; ++i) {
> + if (iscntrl((unsigned char)blob->read_buf[i]) &&
> + blob->read_buf[i] != '\t' &&
> + blob->read_buf[i] != '\v') {
> + *binary = 1;
> + break;
> + }
> + }
>
> if (fseeko(blob->f, hdrlen, SEEK_SET) == -1)
> return got_error_from_errno("fseeko");
>
>
improve the binary file checks