Download raw body.
improve the binary file checks
Stefan Sperling <stsp@stsp.name> wrote: > On Thu, Jul 31, 2025 at 12:02:02PM +0200, Omar Polo wrote: > > jtt found out that for some "binary files" we still try to print the > > diff. These "binary files" are gpg encrypted files, but luckily > > (weirdly?) enough they don't have any NUL byte, yet they have a lot of > > other control characters. > > > > This diff attempts at being better at caching binary files. any control > > character, except our friends tab characters, marks the file as binary. > > > > regress passes, but i'm not sure i'm actually liking this diff. should > > i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check? > > > > thoughts? > > If this helps with .gpg files then I don't see why not. > I cannot think of a better solution. > > Could you use && !isspace() instead of only checking for tab characters? yes, i can :) if ok i should commit diff_atomize_text.c to diff.git before and then sync, right? diff /home/op/w/got path + /home/op/w/got commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426 blob - 32023105af9438217a65a1bf6b821e4d225516d1 file + lib/diff_atomize_text.c --- lib/diff_atomize_text.c +++ lib/diff_atomize_text.c @@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - bool embedded_nul = false; + bool isbinary = false; while (array_size_estimate >>= 1) pow2++; @@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) || !isspace((unsigned char)buf[i])) hash = diff_atom_hash_update( hash, buf[i]); - if (buf[i] == '\0') - embedded_nul = true; + if (iscntrl((unsigned char)buf[i]) && + !isspace((unsigned char)buf[i])) + isbinary = true; line_end++; } else eol = buf[i]; @@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) return errno; } - /* File are considered binary if they contain embedded '\0' bytes. */ - if (embedded_nul) + /* File are considered binary if they contain control bytes. */ + if (isbinary) d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; return DIFF_RC_OK; @@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) const uint8_t *pos = d->data; const uint8_t *end = pos + d->len; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - bool embedded_nul = false; + bool isbinary = false; unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; while (array_size_estimate >>= 1) @@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) if (!ignore_whitespace || !isspace((unsigned char)*line_end)) hash = diff_atom_hash_update(hash, *line_end); - if (*line_end == '\0') - embedded_nul = true; + if (iscntrl((unsigned char)*line_end) && + !isspace((unsigned char)*line_end)) + isbinary = true; line_end++; } @@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) pos = line_end; } - /* File are considered binary if they contain embedded '\0' bytes. */ - if (embedded_nul) + /* File are considered binary if they contain embedded control bytes. */ + if (isbinary) d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; return DIFF_RC_OK; commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426 blob - b58b9970c4f4924c6663e2de3b426bcf71082141 file + lib/object.c --- lib/object.c +++ lib/object.c @@ -21,6 +21,7 @@ #include <sys/uio.h> #include <sys/mman.h> +#include <ctype.h> #include <errno.h> #include <fcntl.h> #include <stdio.h> @@ -374,7 +375,7 @@ const struct got_error * got_object_blob_is_binary(int *binary, struct got_blob_object *blob) { const struct got_error *err; - size_t hdrlen, len; + size_t hdrlen, len, i; *binary = 0; hdrlen = got_object_blob_get_hdrlen(blob); @@ -386,7 +387,13 @@ got_object_blob_is_binary(int *binary, struct got_blob if (err) return err; - *binary = memchr(blob->read_buf, '\0', len) != NULL; + for (i = 0; i < len; ++i) { + if (iscntrl((unsigned char)blob->read_buf[i]) && + !isspace((unsigned char)blob->read_buf[i])) { + *binary = 1; + break; + } + } if (fseeko(blob->f, hdrlen, SEEK_SET) == -1) return got_error_from_errno("fseeko");
improve the binary file checks