From: Stefan Sperling Subject: Re: improve the binary file checks To: Omar Polo Cc: gameoftrees@openbsd.org Date: Thu, 31 Jul 2025 12:37:03 +0200 On Thu, Jul 31, 2025 at 12:02:02PM +0200, Omar Polo wrote: > jtt found out that for some "binary files" we still try to print the > diff. These "binary files" are gpg encrypted files, but luckily > (weirdly?) enough they don't have any NUL byte, yet they have a lot of > other control characters. > > This diff attempts at being better at caching binary files. any control > character, except our friends tab characters, marks the file as binary. > > regress passes, but i'm not sure i'm actually liking this diff. should > i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check? > > thoughts? If this helps with .gpg files then I don't see why not. I cannot think of a better solution. Could you use && !isspace() instead of only checking for tab characters? > diff /home/op/w/got > path + /home/op/w/got > commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426 > blob - 32023105af9438217a65a1bf6b821e4d225516d1 > file + lib/diff_atomize_text.c > --- lib/diff_atomize_text.c > +++ lib/diff_atomize_text.c > @@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) > unsigned int array_size_estimate = d->len / 50; > unsigned int pow2 = 1; > bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); > - bool embedded_nul = false; > + bool isbinary = false; > > while (array_size_estimate >>= 1) > pow2++; > @@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) > || !isspace((unsigned char)buf[i])) > hash = diff_atom_hash_update( > hash, buf[i]); > - if (buf[i] == '\0') > - embedded_nul = true; > + if (iscntrl((unsigned char)buf[i]) && > + buf[i] != '\t' && buf[i] != '\v') > + isbinary = true; > line_end++; > } else > eol = buf[i]; > @@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) > return errno; > } > > - /* File are considered binary if they contain embedded '\0' bytes. */ > - if (embedded_nul) > + /* File are considered binary if they contain control bytes. */ > + if (isbinary) > d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; > > return DIFF_RC_OK; > @@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) > const uint8_t *pos = d->data; > const uint8_t *end = pos + d->len; > bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); > - bool embedded_nul = false; > + bool isbinary = false; > unsigned int array_size_estimate = d->len / 50; > unsigned int pow2 = 1; > while (array_size_estimate >>= 1) > @@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) > if (!ignore_whitespace > || !isspace((unsigned char)*line_end)) > hash = diff_atom_hash_update(hash, *line_end); > - if (*line_end == '\0') > - embedded_nul = true; > + if (iscntrl((unsigned char)*line_end) && > + *line_end != '\t' && *line_end != '\v') > + isbinary = true; > line_end++; > } > > @@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) > pos = line_end; > } > > - /* File are considered binary if they contain embedded '\0' bytes. */ > - if (embedded_nul) > + /* File are considered binary if they contain embedded control bytes. */ > + if (isbinary) > d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; > > return DIFF_RC_OK; > commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426 > blob - b58b9970c4f4924c6663e2de3b426bcf71082141 > file + lib/object.c > --- lib/object.c > +++ lib/object.c > @@ -21,6 +21,7 @@ > #include > #include > > +#include > #include > #include > #include > @@ -374,7 +375,7 @@ const struct got_error * > got_object_blob_is_binary(int *binary, struct got_blob_object *blob) > { > const struct got_error *err; > - size_t hdrlen, len; > + size_t hdrlen, len, i; > > *binary = 0; > hdrlen = got_object_blob_get_hdrlen(blob); > @@ -386,7 +387,14 @@ got_object_blob_is_binary(int *binary, struct got_blob > if (err) > return err; > > - *binary = memchr(blob->read_buf, '\0', len) != NULL; > + for (i = 0; i < len; ++i) { > + if (iscntrl((unsigned char)blob->read_buf[i]) && > + blob->read_buf[i] != '\t' && > + blob->read_buf[i] != '\v') { > + *binary = 1; > + break; > + } > + } > > if (fseeko(blob->f, hdrlen, SEEK_SET) == -1) > return got_error_from_errno("fseeko"); > >