Download raw body.
improve the binary file checks
jtt found out that for some "binary files" we still try to print the diff. These "binary files" are gpg encrypted files, but luckily (weirdly?) enough they don't have any NUL byte, yet they have a lot of other control characters. This diff attempts at being better at caching binary files. any control character, except our friends tab characters, marks the file as binary. regress passes, but i'm not sure i'm actually liking this diff. should i replace iscntrl() with the hardcoded (x < ' ' && x != 0x7F) check? thoughts? diff /home/op/w/got path + /home/op/w/got commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426 blob - 32023105af9438217a65a1bf6b821e4d225516d1 file + lib/diff_atomize_text.c --- lib/diff_atomize_text.c +++ lib/diff_atomize_text.c @@ -43,7 +43,7 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - bool embedded_nul = false; + bool isbinary = false; while (array_size_estimate >>= 1) pow2++; @@ -72,8 +72,9 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) || !isspace((unsigned char)buf[i])) hash = diff_atom_hash_update( hash, buf[i]); - if (buf[i] == '\0') - embedded_nul = true; + if (iscntrl((unsigned char)buf[i]) && + buf[i] != '\t' && buf[i] != '\v') + isbinary = true; line_end++; } else eol = buf[i]; @@ -115,8 +116,8 @@ diff_data_atomize_text_lines_fd(struct diff_data *d) return errno; } - /* File are considered binary if they contain embedded '\0' bytes. */ - if (embedded_nul) + /* File are considered binary if they contain control bytes. */ + if (isbinary) d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; return DIFF_RC_OK; @@ -128,7 +129,7 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) const uint8_t *pos = d->data; const uint8_t *end = pos + d->len; bool ignore_whitespace = (d->diff_flags & DIFF_FLAG_IGNORE_WHITESPACE); - bool embedded_nul = false; + bool isbinary = false; unsigned int array_size_estimate = d->len / 50; unsigned int pow2 = 1; while (array_size_estimate >>= 1) @@ -144,8 +145,9 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) if (!ignore_whitespace || !isspace((unsigned char)*line_end)) hash = diff_atom_hash_update(hash, *line_end); - if (*line_end == '\0') - embedded_nul = true; + if (iscntrl((unsigned char)*line_end) && + *line_end != '\t' && *line_end != '\v') + isbinary = true; line_end++; } @@ -174,8 +176,8 @@ diff_data_atomize_text_lines_mmap(struct diff_data *d) pos = line_end; } - /* File are considered binary if they contain embedded '\0' bytes. */ - if (embedded_nul) + /* File are considered binary if they contain embedded control bytes. */ + if (isbinary) d->atomizer_flags |= DIFF_ATOMIZER_FOUND_BINARY_DATA; return DIFF_RC_OK; commit - 40e36ce5301442ff9172f57a80b91e1ca8bfd426 blob - b58b9970c4f4924c6663e2de3b426bcf71082141 file + lib/object.c --- lib/object.c +++ lib/object.c @@ -21,6 +21,7 @@ #include <sys/uio.h> #include <sys/mman.h> +#include <ctype.h> #include <errno.h> #include <fcntl.h> #include <stdio.h> @@ -374,7 +375,7 @@ const struct got_error * got_object_blob_is_binary(int *binary, struct got_blob_object *blob) { const struct got_error *err; - size_t hdrlen, len; + size_t hdrlen, len, i; *binary = 0; hdrlen = got_object_blob_get_hdrlen(blob); @@ -386,7 +387,14 @@ got_object_blob_is_binary(int *binary, struct got_blob if (err) return err; - *binary = memchr(blob->read_buf, '\0', len) != NULL; + for (i = 0; i < len; ++i) { + if (iscntrl((unsigned char)blob->read_buf[i]) && + blob->read_buf[i] != '\t' && + blob->read_buf[i] != '\v') { + *binary = 1; + break; + } + } if (fseeko(blob->f, hdrlen, SEEK_SET) == -1) return got_error_from_errno("fseeko");
improve the binary file checks