"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Omar Polo <op@omarpolo.com>
Subject:
Re: got patch vs binary files
To:
Omar Polo <op@omarpolo.com>
Cc:
gameoftrees@openbsd.org
Date:
Sun, 22 Oct 2023 21:08:32 +0200

Download raw body.

Thread
On 2023/05/12 21:06:15 +0200, Omar Polo <op@omarpolo.com> wrote:
> back in february naddy reported that got patch choked on the perl
> update.
> 
> https://marc.gameoftrees.org/thread/1675780522.55927_0.html
> 
> The problem was two-fold:
>   1. the "Result too large" for very long lines
>   2. NUL bytes in the middle of the lines
> 
> Here I'm adressing the most interesting one, the sencond.  I'll handle
> the very long lines case next.
> 
> got patch assumes lines can be encoded as strings, so embedded NUL
> bytes effectively truncates the line.  diff below changes this
> assumption, by considering lines just binary strings LF-delimited.
> 
> While doing this, I took the chance to ship some other minor changes.
> 
> The type of the line (i.e. '+', '-' or ' ' aka context) now is a
> separate field instead of being encoded in the first byte of each
> line, this simplifies the various +1/-1 that were needed otherwise.
> 
> I've renamed linecmp into lineq and made it boolean: it's simpler to
> write, to use and we'll never be interested in sorting lines anyway.
> 
> One quirk is that I'm still allocating the space for a NUL terminator.
> This is not required, and in fact makes pushline() maybe not obvious,
> but it's useful for debugging so it's worth it I think.

I'd like to resume this to eventually make some more progress on the
patch code.

> The diff is not committable as-is however.  I had to disable a check
> in diff3 to allow got_merge_diff3 to proceed on binary files.  I'm not
> sure what we want to do in for this, but I believe that even without
> the will to support "binary patches" most of the diff still makes
> sense, as it propagate lengths rather than recomputing strlen() here
> and there and avoids some extra useless allocations in got-read-patch.

I've dropped this part and the regress test.  To be fair I don't like
the idea of silently switching from diff3 merge to search/replace for
binary files.  Since the diff3 code is not ready to deal with NULs, it's
guarded and will return an error.  If we ever fix it, patch will
immediately start to work with binary files too.

thoughts/oks?


diff /home/op/w/got
commit - 87bd0c08f248a23b64471e77d2b081d5b04b4fa5
path + /home/op/w/got
blob - 9f524a6bfc0deae40498090e43fe7f9238eb4e64
file + lib/patch.c
--- lib/patch.c
+++ lib/patch.c
@@ -59,6 +59,12 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #endif
 
+struct got_patch_line {
+	char	 mode;
+	char	*line;
+	size_t	 len;
+};
+
 struct got_patch_hunk {
 	STAILQ_ENTRY(got_patch_hunk) entries;
 	const struct got_error *err;
@@ -72,7 +78,7 @@ struct got_patch_hunk {
 	int	new_lines;
 	size_t	len;
 	size_t	cap;
-	char	**lines;
+	struct got_patch_line *lines;
 };
 
 STAILQ_HEAD(got_patch_hunk_head, got_patch_hunk);
@@ -128,7 +134,7 @@ patch_free(struct got_patch *p)
 		STAILQ_REMOVE_HEAD(&p->head, entries);
 
 		for (i = 0; i < h->len; ++i)
-			free(h->lines[i]);
+			free(h->lines[i].line);
 		free(h->lines);
 		free(h);
 	}
@@ -141,7 +147,7 @@ patch_free(struct got_patch *p)
 }
 
 static const struct got_error *
-pushline(struct got_patch_hunk *h, const char *line)
+pushline(struct got_patch_hunk *h, const char *line, size_t len)
 {
 	void 	*t;
 	size_t	 newcap;
@@ -157,10 +163,14 @@ pushline(struct got_patch_hunk *h, const char *line)
 		h->cap = newcap;
 	}
 
-	if ((t = strdup(line)) == NULL)
-		return got_error_from_errno("strdup");
+	if ((t = malloc(len - 1)) == NULL)
+		return got_error_from_errno("malloc");
+	memcpy(t, line + 1, len - 1);	/* skip the line type */
 
-	h->lines[h->len++] = t;
+	h->lines[h->len].mode = *line;
+	h->lines[h->len].line = t;
+	h->lines[h->len].len = len - 2;	/* line type and trailing NUL */
+	h->len++;
 	return NULL;
 }
 
@@ -301,7 +311,7 @@ recv_patch(struct imsgbuf *ibuf, int *done, struct got
 			}
 
 			if (*t != '\\')
-				err = pushline(h, t);
+				err = pushline(h, t, datalen);
 			else if (lastmode == '-')
 				h->old_nonl = 1;
 			else if (lastmode == '+')
@@ -351,10 +361,10 @@ reverse_patch(struct got_patch *p)
 		h->new_nonl = tmp;
 
 		for (i = 0; i < h->len; ++i) {
-			if (*h->lines[i] == '+')
-				*h->lines[i] = '-';
-			else if (*h->lines[i] == '-')
-				*h->lines[i] = '+';
+			if (h->lines[i].mode == '+')
+				h->lines[i].mode = '-';
+			else if (h->lines[i].mode == '-')
+				h->lines[i].mode = '+';
 		}
 	}
 }
@@ -392,14 +402,15 @@ copy(FILE *tmp, FILE *orig, off_t copypos, off_t pos)
 	return NULL;
 }
 
-static int linecmp(const char *, const char *, int *);
+static int lineq(struct got_patch_line *, const char *, size_t, int *);
 
 static const struct got_error *
 locate_hunk(FILE *orig, struct got_patch_hunk *h, off_t *pos, int *lineno)
 {
 	const struct got_error *err = NULL;
+	struct got_patch_line *l = &h->lines[0];
 	char *line = NULL;
-	char mode = *h->lines[0];
+	char mode = l->mode;
 	size_t linesize = 0;
 	ssize_t linelen;
 	off_t match = -1;
@@ -414,12 +425,10 @@ locate_hunk(FILE *orig, struct got_patch_hunk *h, off_
 				err = got_error(GOT_ERR_HUNK_FAILED);
 			break;
 		}
-		if (line[linelen - 1] == '\n')
-			line[linelen - 1] = '\0';
 		(*lineno)++;
 
-		if ((mode == ' ' && !linecmp(h->lines[0] + 1, line, &mangled)) ||
-		    (mode == '-' && !linecmp(h->lines[0] + 1, line, &mangled)) ||
+		if ((mode == ' ' && lineq(l, line, linelen, &mangled)) ||
+		    (mode == '-' && lineq(l, line, linelen, &mangled)) ||
 		    (mode == '+' && *lineno == h->old_from)) {
 			match = ftello(orig);
 			if (match == -1) {
@@ -449,27 +458,34 @@ locate_hunk(FILE *orig, struct got_patch_hunk *h, off_
 }
 
 static int
-linecmp(const char *a, const char *b, int *mangled)
+lineq(struct got_patch_line *l, const char *b, size_t len, int *mangled)
 {
-	int c;
+	char *a = l->line;
+	size_t i, j;
 
+	if (len > 00 && b[len - 1] == '\n')
+		len--;
+
 	*mangled = 0;
-	c = strcmp(a, b);
-	if (c == 0)
-		return c;
+	if (l->len == len && !memcmp(a, b, len))
+		return 1;
 
 	*mangled = 1;
+
+	i = j = 0;
 	for (;;) {
-		while (*a == '\t' || *a == ' ' || *a == '\f')
-			a++;
-		while (*b == '\t' || *b == ' ' || *b == '\f')
-			b++;
-		if (*a == '\0' || *a != *b)
+		while (i < l->len &&
+		    (a[i] == '\t' || a[i] == ' ' || a[i] == '\f'))
+			i++;
+		while (j < len &&
+		    (b[j] == '\t' || b[j] == ' ' || b[j] == '\f'))
+			j++;
+		if (i == l->len || j == len || a[i] != b[j])
 			break;
-		a++, b++;
+		i++, j++;
 	}
 
-	return *a - *b;
+	return (i == l->len && j == len);
 }
 
 static const struct got_error *
@@ -482,7 +498,7 @@ test_hunk(FILE *orig, struct got_patch_hunk *h)
 	int mangled;
 
 	for (i = 0; i < h->len; ++i) {
-		switch (*h->lines[i]) {
+		switch (h->lines[i].mode) {
 		case '+':
 			continue;
 		case ' ':
@@ -496,9 +512,7 @@ test_hunk(FILE *orig, struct got_patch_hunk *h)
 					    GOT_ERR_HUNK_FAILED);
 				goto done;
 			}
-			if (line[linelen - 1] == '\n')
-				line[linelen - 1] = '\0';
-			if (linecmp(h->lines[i] + 1, line, &mangled)) {
+			if (!lineq(&h->lines[i], line, linelen, &mangled)) {
 				err = got_error(GOT_ERR_HUNK_FAILED);
 				goto done;
 			}
@@ -522,13 +536,14 @@ apply_hunk(FILE *orig, FILE *tmp, struct got_patch_hun
 	size_t linesize = 0, i, new = 0;
 	char *line = NULL;
 	char mode;
+	size_t l;
 	ssize_t linelen;
 
 	if (orig != NULL && fseeko(orig, from, SEEK_SET) == -1)
 		return got_error_from_errno("fseeko");
 
 	for (i = 0; i < h->len; ++i) {
-		switch (mode = *h->lines[i]) {
+		switch (mode = h->lines[i].mode) {
 		case '-':
 		case ' ':
 			(*lineno)++;
@@ -541,18 +556,24 @@ apply_hunk(FILE *orig, FILE *tmp, struct got_patch_hun
 				if (line[linelen - 1] == '\n')
 					line[linelen - 1] = '\0';
 				t = line;
-			} else
-				t = h->lines[i] + 1;
+				l = linelen - 1;
+			} else {
+				t = h->lines[i].line;
+				l = h->lines[i].len;
+			}
 			if (mode == '-')
 				continue;
-			if (fprintf(tmp, "%s\n", t) < 0) {
+			if (fwrite(t, 1, l, tmp) != l ||
+			    fputc('\n', tmp) == EOF) {
 				err = got_error_from_errno("fprintf");
 				goto done;
 			}
 			break;
 		case '+':
 			new++;
-			if (fprintf(tmp, "%s", h->lines[i] + 1) < 0) {
+			t = h->lines[i].line;
+			l = h->lines[i].len;
+			if (fwrite(t, 1, l, tmp) != l) {
 				err = got_error_from_errno("fprintf");
 				goto done;
 			}
blob - a5fe8c2470856e960745214cc536ddbc804c7ef0
file + libexec/got-read-patch/got-read-patch.c
--- libexec/got-read-patch/got-read-patch.c
+++ libexec/got-read-patch/got-read-patch.c
@@ -445,23 +445,29 @@ parse_hdr(char *s, int *done, struct got_imsg_patch_hu
 }
 
 static const struct got_error *
-send_line(const char *line)
+send_line(const char *line, size_t len)
 {
 	static const struct got_error *err = NULL;
-	char *p = NULL;
+	struct iovec iov[2];
+	int iovcnt = 0;
 
+	memset(&iov, 0, sizeof(iov));
+
 	if (*line != '+' && *line != '-' && *line != ' ' && *line != '\\') {
-		if (asprintf(&p, " %s", line) == -1)
-			return got_error_from_errno("asprintf");
-		line = p;
+		iov[iovcnt].iov_base = (void *)" ";
+		iov[iovcnt].iov_len = 1;
+		iovcnt++;
 	}
 
-	if (imsg_compose(&ibuf, GOT_IMSG_PATCH_LINE, 0, 0, -1,
-	    line, strlen(line) + 1) == -1)
+	iov[iovcnt].iov_base = (void *)line;
+	iov[iovcnt].iov_len = len;
+	iovcnt++;
+
+	if (imsg_composev(&ibuf, GOT_IMSG_PATCH_LINE, 0, 0, -1,
+	    iov, iovcnt) == -1)
 		err = got_error_from_errno(
 		    "imsg_compose GOT_IMSG_PATCH_LINE");
 
-	free(p);
 	return err;
 }
 
@@ -478,7 +484,7 @@ peek_special_line(FILE *fp)
 	}
 
 	if (ch == '\\') {
-		err = send_line("\\");
+		err = send_line("\\", 2);
 		if (err)
 			return err;
 	}
@@ -568,7 +574,7 @@ parse_hunk(FILE *fp, int *done)
 			goto done;
 		}
 
-		err = send_line(line);
+		err = send_line(line, linelen);
 		if (err)
 			goto done;