From: Mark Jamsek Subject: Re: got patch vs binary files To: Omar Polo Cc: gameoftrees@openbsd.org Date: Tue, 30 May 2023 20:25:29 +1000 On 23-05-29 08:52AM, Omar Polo wrote: > ping > > On 2023/05/12 21:06:15 +0200, Omar Polo wrote: > > back in february naddy reported that got patch choked on the perl > > update. > > > > https://marc.gameoftrees.org/thread/1675780522.55927_0.html > > > > The problem was two-fold: > > 1. the "Result too large" for very long lines > > 2. NUL bytes in the middle of the lines > > > > Here I'm adressing the most interesting one, the sencond. I'll handle > > the very long lines case next. > > > > got patch assumes lines can be encoded as strings, so embedded NUL > > bytes effectively truncates the line. diff below changes this > > assumption, by considering lines just binary strings LF-delimited. > > > > While doing this, I took the chance to ship some other minor changes. > > > > The type of the line (i.e. '+', '-' or ' ' aka context) now is a > > separate field instead of being encoded in the first byte of each > > line, this simplifies the various +1/-1 that were needed otherwise. > > > > I've renamed linecmp into lineq and made it boolean: it's simpler to > > write, to use and we'll never be interested in sorting lines anyway. > > > > One quirk is that I'm still allocating the space for a NUL terminator. > > This is not required, and in fact makes pushline() maybe not obvious, > > but it's useful for debugging so it's worth it I think. > > > > The diff is not committable as-is however. I had to disable a check > > in diff3 to allow got_merge_diff3 to proceed on binary files. I'm not > > sure what we want to do in for this, but I believe that even without > > the will to support "binary patches" most of the diff still makes > > sense, as it propagate lengths rather than recomputing strlen() here > > and there and avoids some extra useless allocations in got-read-patch. I really like the change to parsing lines as simply newline delimited binary strings; this is how it's done in fossil too. And I like the other changes, particularly the got_patch_line struct and lineq() simplification. The diff also reads fine. I'm just not sure about the diff3 issue, tbh. Also, test_cherrypick_binary_file regress is tripping on this change. > > > > diff /home/op/w/got > > commit - ba0bed23e515b14cd3fe877c6847785c8c29971e > > path + /home/op/w/got > > blob - 5cd79150cc6476f78c90b997d76b00eaa99fc341 > > file + lib/diff3.c > > --- lib/diff3.c > > +++ lib/diff3.c > > @@ -231,6 +231,7 @@ diffreg(BUF **d, const char *path1, const char *path2, > > if (err) > > goto done; > > > > +#if 0 > > if (diffreg_result) { > > struct diff_result *diff_result = diffreg_result->result; > > int atomizer_flags = (diff_result->left->atomizer_flags | > > @@ -240,6 +241,7 @@ diffreg(BUF **d, const char *path1, const char *path2, > > goto done; > > } > > } > > +#endif > > > > err = got_diffreg_output(NULL, NULL, diffreg_result, 1, 1, "", "", > > GOT_DIFF_OUTPUT_PLAIN, 0, outfile); > > blob - 9f524a6bfc0deae40498090e43fe7f9238eb4e64 > > file + lib/patch.c > > --- lib/patch.c > > +++ lib/patch.c > > @@ -59,6 +59,12 @@ struct got_patch_hunk { > > #define MIN(a, b) ((a) < (b) ? (a) : (b)) > > #endif > > > > +struct got_patch_line { > > + char mode; > > + char *line; > > + size_t len; > > +}; > > + > > struct got_patch_hunk { > > STAILQ_ENTRY(got_patch_hunk) entries; > > const struct got_error *err; > > @@ -72,7 +78,7 @@ struct got_patch_hunk { > > int new_lines; > > size_t len; > > size_t cap; > > - char **lines; > > + struct got_patch_line *lines; > > }; > > > > STAILQ_HEAD(got_patch_hunk_head, got_patch_hunk); > > @@ -128,7 +134,7 @@ patch_free(struct got_patch *p) > > STAILQ_REMOVE_HEAD(&p->head, entries); > > > > for (i = 0; i < h->len; ++i) > > - free(h->lines[i]); > > + free(h->lines[i].line); > > free(h->lines); > > free(h); > > } > > @@ -141,7 +147,7 @@ pushline(struct got_patch_hunk *h, const char *line) > > } > > > > static const struct got_error * > > -pushline(struct got_patch_hunk *h, const char *line) > > +pushline(struct got_patch_hunk *h, const char *line, size_t len) > > { > > void *t; > > size_t newcap; > > @@ -157,10 +163,14 @@ pushline(struct got_patch_hunk *h, const char *line) > > h->cap = newcap; > > } > > > > - if ((t = strdup(line)) == NULL) > > - return got_error_from_errno("strdup"); > > + if ((t = malloc(len - 1)) == NULL) > > + return got_error_from_errno("malloc"); > > + memcpy(t, line + 1, len - 1); /* skip the line type */ > > > > - h->lines[h->len++] = t; > > + h->lines[h->len].mode = *line; > > + h->lines[h->len].line = t; > > + h->lines[h->len].len = len - 2; /* skip line type and trailing NUL */ > > + h->len++; > > return NULL; > > } > > > > @@ -301,7 +311,7 @@ recv_patch(struct imsgbuf *ibuf, int *done, struct got > > } > > > > if (*t != '\\') > > - err = pushline(h, t); > > + err = pushline(h, t, datalen); > > else if (lastmode == '-') > > h->old_nonl = 1; > > else if (lastmode == '+') > > @@ -351,10 +361,10 @@ reverse_patch(struct got_patch *p) > > h->new_nonl = tmp; > > > > for (i = 0; i < h->len; ++i) { > > - if (*h->lines[i] == '+') > > - *h->lines[i] = '-'; > > - else if (*h->lines[i] == '-') > > - *h->lines[i] = '+'; > > + if (h->lines[i].mode == '+') > > + h->lines[i].mode = '-'; > > + else if (h->lines[i].mode == '-') > > + h->lines[i].mode = '+'; > > } > > } > > } > > @@ -392,14 +402,15 @@ static int linecmp(const char *, const char *, int *); > > return NULL; > > } > > > > -static int linecmp(const char *, const char *, int *); > > +static int lineq(struct got_patch_line *, const char *, size_t, int *); > > > > static const struct got_error * > > locate_hunk(FILE *orig, struct got_patch_hunk *h, off_t *pos, int *lineno) > > { > > const struct got_error *err = NULL; > > + struct got_patch_line *l = &h->lines[0]; > > char *line = NULL; > > - char mode = *h->lines[0]; > > + char mode = l->mode; > > size_t linesize = 0; > > ssize_t linelen; > > off_t match = -1; > > @@ -414,12 +425,10 @@ locate_hunk(FILE *orig, struct got_patch_hunk *h, off_ > > err = got_error(GOT_ERR_HUNK_FAILED); > > break; > > } > > - if (line[linelen - 1] == '\n') > > - line[linelen - 1] = '\0'; > > (*lineno)++; > > > > - if ((mode == ' ' && !linecmp(h->lines[0] + 1, line, &mangled)) || > > - (mode == '-' && !linecmp(h->lines[0] + 1, line, &mangled)) || > > + if ((mode == ' ' && lineq(l, line, linelen, &mangled)) || > > + (mode == '-' && lineq(l, line, linelen, &mangled)) || > > (mode == '+' && *lineno == h->old_from)) { > > match = ftello(orig); > > if (match == -1) { > > @@ -449,27 +458,34 @@ linecmp(const char *a, const char *b, int *mangled) > > } > > > > static int > > -linecmp(const char *a, const char *b, int *mangled) > > +lineq(struct got_patch_line *l, const char *b, size_t len, int *mangled) > > { > > - int c; > > + char *a = l->line; > > + size_t i, j; > > > > + if (b[len - 1] == '\n') > > + len--; > > + > > *mangled = 0; > > - c = strcmp(a, b); > > - if (c == 0) > > - return c; > > + if (l->len == len && !memcmp(a, b, len)) > > + return 1; > > > > *mangled = 1; > > + > > + i = j = 0; > > for (;;) { > > - while (*a == '\t' || *a == ' ' || *a == '\f') > > - a++; > > - while (*b == '\t' || *b == ' ' || *b == '\f') > > - b++; > > - if (*a == '\0' || *a != *b) > > + while (i < l->len && > > + (a[i] == '\t' || a[i] == ' ' || a[i] == '\f')) > > + i++; > > + while (j < len && > > + (b[j] == '\t' || b[j] == ' ' || b[j] == '\f')) > > + j++; > > + if (i == l->len || j == len || a[i] != b[j]) > > break; > > - a++, b++; > > + i++, j++; > > } > > > > - return *a - *b; > > + return (i == l->len && j == len); > > } > > > > static const struct got_error * > > @@ -482,7 +498,7 @@ test_hunk(FILE *orig, struct got_patch_hunk *h) > > int mangled; > > > > for (i = 0; i < h->len; ++i) { > > - switch (*h->lines[i]) { > > + switch (h->lines[i].mode) { > > case '+': > > continue; > > case ' ': > > @@ -496,9 +512,7 @@ test_hunk(FILE *orig, struct got_patch_hunk *h) > > GOT_ERR_HUNK_FAILED); > > goto done; > > } > > - if (line[linelen - 1] == '\n') > > - line[linelen - 1] = '\0'; > > - if (linecmp(h->lines[i] + 1, line, &mangled)) { > > + if (!lineq(&h->lines[i], line, linelen, &mangled)) { > > err = got_error(GOT_ERR_HUNK_FAILED); > > goto done; > > } > > @@ -522,13 +536,14 @@ apply_hunk(FILE *orig, FILE *tmp, struct got_patch_hun > > size_t linesize = 0, i, new = 0; > > char *line = NULL; > > char mode; > > + size_t l; > > ssize_t linelen; > > > > if (orig != NULL && fseeko(orig, from, SEEK_SET) == -1) > > return got_error_from_errno("fseeko"); > > > > for (i = 0; i < h->len; ++i) { > > - switch (mode = *h->lines[i]) { > > + switch (mode = h->lines[i].mode) { > > case '-': > > case ' ': > > (*lineno)++; > > @@ -541,18 +556,24 @@ apply_hunk(FILE *orig, FILE *tmp, struct got_patch_hun > > if (line[linelen - 1] == '\n') > > line[linelen - 1] = '\0'; > > t = line; > > - } else > > - t = h->lines[i] + 1; > > + l = linelen - 1; > > + } else { > > + t = h->lines[i].line; > > + l = h->lines[i].len; > > + } > > if (mode == '-') > > continue; > > - if (fprintf(tmp, "%s\n", t) < 0) { > > + if (fwrite(t, 1, l, tmp) != l || > > + fputc('\n', tmp) == EOF) { > > err = got_error_from_errno("fprintf"); > > goto done; > > } > > break; > > case '+': > > new++; > > - if (fprintf(tmp, "%s", h->lines[i] + 1) < 0) { > > + t = h->lines[i].line; > > + l = h->lines[i].len; > > + if (fwrite(t, 1, l, tmp) != l) { > > err = got_error_from_errno("fprintf"); > > goto done; > > } > > blob - a5fe8c2470856e960745214cc536ddbc804c7ef0 > > file + libexec/got-read-patch/got-read-patch.c > > --- libexec/got-read-patch/got-read-patch.c > > +++ libexec/got-read-patch/got-read-patch.c > > @@ -445,23 +445,29 @@ send_line(const char *line) > > } > > > > static const struct got_error * > > -send_line(const char *line) > > +send_line(const char *line, size_t len) > > { > > static const struct got_error *err = NULL; > > - char *p = NULL; > > + struct iovec iov[2]; > > + int iovcnt = 0; > > > > + memset(&iov, 0, sizeof(iov)); > > + > > if (*line != '+' && *line != '-' && *line != ' ' && *line != '\\') { > > - if (asprintf(&p, " %s", line) == -1) > > - return got_error_from_errno("asprintf"); > > - line = p; > > + iov[iovcnt].iov_base = (void *)" "; > > + iov[iovcnt].iov_len = 1; > > + iovcnt++; > > } > > > > - if (imsg_compose(&ibuf, GOT_IMSG_PATCH_LINE, 0, 0, -1, > > - line, strlen(line) + 1) == -1) > > + iov[iovcnt].iov_base = (void *)line; > > + iov[iovcnt].iov_len = len; > > + iovcnt++; > > + > > + if (imsg_composev(&ibuf, GOT_IMSG_PATCH_LINE, 0, 0, -1, > > + iov, iovcnt) == -1) > > err = got_error_from_errno( > > "imsg_compose GOT_IMSG_PATCH_LINE"); > > > > - free(p); > > return err; > > } > > > > @@ -478,7 +484,7 @@ peek_special_line(FILE *fp) > > } > > > > if (ch == '\\') { > > - err = send_line("\\"); > > + err = send_line("\\", 2); > > if (err) > > return err; > > } > > @@ -568,7 +574,7 @@ parse_hunk(FILE *fp, int *done) > > goto done; > > } > > > > - err = send_line(line); > > + err = send_line(line, linelen); > > if (err) > > goto done; > > > > blob - 17db471e5c3aa81e0eb9233b406861cdccd08fcd > > file + regress/cmdline/patch.sh > > --- regress/cmdline/patch.sh > > +++ regress/cmdline/patch.sh > > @@ -1956,6 +1956,57 @@ test_parseargs "$@" > > test_done $testroot 0 > > } > > > > +test_patch_binary() { > > + local testroot=`test_init patch_binary` > > + > > + if ! got checkout "$testroot/repo" "$testroot/wt" >/dev/null; then > > + test_done "$testroot" $ret > > + return 1 > > + fi > > + > > + printf 'e\0ta' > $testroot/wt/eta > > + (cd "$testroot/wt" && got add eta && got commit -m '+eta') >/dev/null > > + ret=$? > > + if [ $ret -ne 0 ]; then > > + echo "got add failed unexpectedly" >&2 > > + test_done "$testroot" $ret > > + return 1 > > + fi > > + > > + printf 'et\0a' | tee "$testroot/wt/eta.expected" > $testroot/wt/eta > > + > > + (cd "$testroot/wt" && got diff -a > patch && got revert eta) >/dev/null > > + ret=$? > > + if [ $ret -ne 0 ]; then > > + echo "got diff or revert failed unexpectedly" >&2 > > + test_done "$testroot" $ret > > + return 1 > > + fi > > + > > + (cd "$testroot/wt" && got patch $testroot/stdout > > + ret=$? > > + if [ $ret -ne 0 ]; then > > + echo "got patch failed unexpectedly" >&2 > > + test_done "$testroot" $ret > > + return 1 > > + fi > > + > > + echo 'G eta' > $testroot/stdout.expected > > + if ! cmp -s "$testroot/stdout.expected" "$testroot/stdout"; then > > + diff -u "$testroot/stdout.expected" "$testroot/stdout" > > + test_done "$testroot" 1 > > + return 1 > > + fi > > + > > + if ! cmp -s "$testroot/wt/eta.expected" "$testroot/wt/eta"; then > > + diff -u "$testroot/wt/eta.expected" "$testroot/wt/eta" > > + test_done "$testroot" 1 > > + return 1 > > + fi > > + > > + test_done "$testroot" 0 > > +} > > + > > test_parseargs "$@" > > run_test test_patch_basic > > run_test test_patch_dont_apply > > @@ -1986,3 +2037,4 @@ run_test test_patch_remove_binary_file > > run_test test_patch_newfile_xbit_git_diff > > run_test test_patch_umask > > run_test test_patch_remove_binary_file > > +run_test test_patch_binary > > -- Mark Jamsek GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68