Download raw body.
improve the binary file checks
Stefan Sperling <stsp@stsp.name> wrote:
> On Thu, Jul 31, 2025 at 12:02:02PM +0200, Omar Polo wrote:
> > jtt found out that for some "binary files" we still try to print the
> > diff. These "binary files" are gpg encrypted files, but luckily
> > (weirdly?) enough they don't have any NUL byte, yet they have a lot of
> > other control characters.
> >
> > This diff attempts at being better at caching binary files. any control
> > character, except our friends tab characters, marks the file as binary.
> >
> > regress passes, but i'm not sure i'm actually liking this diff. should
> > i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check?
> >
> > thoughts?
>
> If this helps with .gpg files then I don't see why not.
> I cannot think of a better solution.
>
> Could you use && !isspace() instead of only checking for tab characters?
yes, i can :)
if ok i should commit diff_atomize_text.c to diff.git before and then
sync, right?
diff /home/op/w/got
path + /home/op/w/got
commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
blob - 32023105af9438217a65a1bf6b821e4d225516d1
file + lib/diff_atomize_text.c
--- lib/diff_atomize_text.c
+++ lib/diff_atomize_text.c
@@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
unsigned int array_size_estimate = d->len / 50;
unsigned int pow2 = 1;
bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
- bool embedded_nul = false;
+ bool isbinary = false;
while (array_size_estimate >>= 1)
pow2++;
@@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
|| !isspace((unsigned char)buf[i]))
hash = diff_atom_hash_update(
hash, buf[i]);
- if (buf[i] == '\0')
- embedded_nul = true;
+ if (iscntrl((unsigned char)buf[i]) &&
+ !isspace((unsigned char)buf[i]))
+ isbinary = true;
line_end++;
} else
eol = buf[i];
@@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d)
return errno;
}
- /* File are considered binary if they contain embedded '\0' bytes. */
- if (embedded_nul)
+ /* File are considered binary if they contain control bytes. */
+ if (isbinary)
d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
return DIFF_RC_OK;
@@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
const uint8_t *pos = d->data;
const uint8_t *end = pos + d->len;
bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE);
- bool embedded_nul = false;
+ bool isbinary = false;
unsigned int array_size_estimate = d->len / 50;
unsigned int pow2 = 1;
while (array_size_estimate >>= 1)
@@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
if (!ignore_whitespace
|| !isspace((unsigned char)*line_end))
hash = diff_atom_hash_update(hash, *line_end);
- if (*line_end == '\0')
- embedded_nul = true;
+ if (iscntrl((unsigned char)*line_end) &&
+ !isspace((unsigned char)*line_end))
+ isbinary = true;
line_end++;
}
@@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d)
pos = line_end;
}
- /* File are considered binary if they contain embedded '\0' bytes. */
- if (embedded_nul)
+ /* File are considered binary if they contain embedded control bytes. */
+ if (isbinary)
d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA;
return DIFF_RC_OK;
commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426
blob - b58b9970c4f4924c6663e2de3b426bcf71082141
file + lib/object.c
--- lib/object.c
+++ lib/object.c
@@ -21,6 +21,7 @@
#include <sys/uio.h>
#include <sys/mman.h>
+#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
@@ -374,7 +375,7 @@ const struct got_error *
got_object_blob_is_binary(int *binary, struct got_blob_object *blob)
{
const struct got_error *err;
- size_t hdrlen, len;
+ size_t hdrlen, len, i;
*binary = 0;
hdrlen = got_object_blob_get_hdrlen(blob);
@@ -386,7 +387,13 @@ got_object_blob_is_binary(int *binary, struct got_blob
if (err)
return err;
- *binary = memchr(blob->read_buf, '\0', len) != NULL;
+ for (i = 0; i < len; ++i) {
+ if (iscntrl((unsigned char)blob->read_buf[i]) &&
+ !isspace((unsigned char)blob->read_buf[i])) {
+ *binary = 1;
+ break;
+ }
+ }
if (fseeko(blob->f, hdrlen, SEEK_SET) == -1)
return got_error_from_errno("fseeko");
improve the binary file checks