"GOT", but the "O" is a cute, smiling pufferfish. Index | Thread | Search

From:
Mark Jamsek <mark@jamsek.com>
Subject:
Re: speed up 'got status' a little
To:
Stefan Sperling <stsp@stsp.name>
Cc:
gameoftrees@openbsd.org
Date:
Tue, 21 Jan 2025 17:04:52 +1100

Download raw body.

Thread
Stefan Sperling <stsp@stsp.name> wrote:
> On Mon, Jan 20, 2025 at 05:13:07PM +1100, Mark Jamsek wrote:
> > It'd be good if we could elide the conflict marker detection, too, but
> > bypassing the full content comparison when we know the file size is
> > different is a nice little tweak :)
> 
> Below is another diff as a next step, and a suggestion for future work.
> 
> We can skip the content comparison loop entirely, and always use the
> diff engine which will pass over the content anyway to atomize it.
> 
> In the special case a differently-sized binary is found on disk we can
> skip the diff engine. (We sill detect merge conflicts correctly because
> binaries already in conflict status are replaced by a text file on disk
> which contains conflict markers. Our tests cover this behaviour.)
> 
> This provides a significant speed-up for large modified binaries:
> 
> $ time /usr/local/bin/got st
> M  test.mp4
>     0m14.59s real     0m08.00s user     0m04.96s system
> $ time got st
> M  test.mp4
>     0m04.73s real     0m02.52s user     0m01.62s system

Nice!

And some cursory measurements reveal a marginal albeit consistent
speed-up in the general case too :)

> 
> A future optimization might be to add a diff API which asks for file
> modication status only, rather than the full diff result. This would
> allow the diff engine to stop doing work on modified files earlier,
> as soon as any modification is found. This would speed up more cases.

This also sounds like a good idea.

> 
> M  lib/worktree.c  |  58+  53-

ok for the diff with a small nit and minor suggestions inline;
for convenience, I've included a diff below that you can apply
on top of yours and commit in case you agree

> 
> 1 file changed, 58 insertions(+), 53 deletions(-)
> 
> commit - cdc669650695cff64baae8dfcccf30a57d751f82
> commit + c1142a3476cd388c1eddb581390528dc92d36bdd
> blob - 71a215359b98caf08ed6fafa8841fdd8dbd4caac
> blob + 0d5a5b54769d0ac8294a460110d7bfbac89c9b55
> --- lib/worktree.c
> +++ lib/worktree.c
> @@ -1537,11 +1537,12 @@ done:
>  }
>  
>  /*
> - * Upgrade STATUS_MODIFY to STATUS_CONFLICT if a
> - * conflict marker is found in newly added lines only.
> + * Detect whether a file is in STATUS_MODIFY or in STATUS_CONFLICT.
> + * The latter is determined by the presence of conflict markers on
> + * newly added lines.
>   */
>  static const struct got_error *
> -get_modified_file_content_status(unsigned char *status,
> +get_modified_file_status(unsigned char *status,
>      struct got_blob_object *blob, const char *path, struct stat *sb,
>      FILE *ondisk_file)
>  {
> @@ -1559,9 +1560,6 @@ get_modified_file_content_status(unsigned char *status
>  	size_t linesize = 0;
>  	ssize_t linelen;
>  
> -	if (*status != GOT_STATUS_MODIFY)
> -		return NULL;
> -
>  	f1 = got_opentemp();
>  	if (f1 == NULL)
>  		return got_error_from_errno("got_opentemp");
> @@ -1585,12 +1583,22 @@ get_modified_file_content_status(unsigned char *status
>  		struct diff_chunk_context cc = {};
>  		off_t pos;
>  
> -		/*
> -		 * We can optimise a little by advancing straight
> -		 * to the next chunk if this one has no added lines.
> -		 */
>  		c = diff_chunk_get(r, n);
>  
> +		/*
> +		 * Upgrade NO_CHANGE to MODIFY in case there are
> +		 * chunks which contain changes.
> +		 */
> +		if (*status == GOT_STATUS_NO_CHANGE &&
> +		    (diff_chunk_type(c) == CHUNK_MINUS ||
> +		    diff_chunk_type(c) == CHUNK_PLUS))
> +			*status = GOT_STATUS_MODIFY;
> +
> +		/*
> +		 * We can optimise conflict detection a little by
> +		 * advancing straight to the next chunk if this
> +		 * one has no added lines.
> +		 */
>  		if (diff_chunk_type(c) != CHUNK_PLUS) {
>  			nchunks_parsed = 1;
>  			continue;  /* removed or unchanged lines */
> @@ -1725,18 +1733,42 @@ get_symlink_modification_status(unsigned char *status,
>  }
>  
>  static const struct got_error *
> +file_is_binary(int *is_binary , FILE *f)
> +{
> +	const struct got_error *err = NULL;
> +	char buf[8192];
> +	size_t r;
> +	off_t pos = ftello(f);

Please check ftello(3) for failure and split the function call
from the declaration.

> +
> +	*is_binary = 0;
> +
> +	if (fseek(f, 0L, SEEK_SET) == -1)
> +		return got_ferror(f, GOT_ERR_IO);

it seems fseek() is preferred in got.c but maybe use fseeko(3) instead?

> +
> +	r = fread(buf, 1, sizeof(buf), f);
> +	if (r == 0 && ferror(f))
> +		return got_error_from_errno("fread");
> +
> +	if (r > 0)
> +		*is_binary = memchr(buf, '\0', r) != NULL;
> +
> +	if (fseek(f, pos, SEEK_SET) == -1 && err == NULL)

besides its NULL initialisation, err is not assigned before here in this
routine so we can drop the ` && err == NULL`. We could even drop the
`struct got_error *err` declaration altogether and do:

		return got_ferror(f, GOT_ERR_IO);

	return NULL;

> +		err = got_ferror(f, GOT_ERR_IO);
> +
> +	return err;
> +}
> +
> +static const struct got_error *
>  get_file_status(unsigned char *status, struct stat *sb,
>      struct got_fileindex_entry *ie, const char *abspath,
>      int dirfd, const char *de_name, struct got_repository *repo)
>  {
>  	const struct got_error *err = NULL;
>  	struct got_object_id id;
> -	size_t hdrlen;
>  	int fd = -1, fd1 = -1;
>  	FILE *f = NULL;
>  	uint8_t fbuf[8192];
>  	struct got_blob_object *blob = NULL;
> -	size_t flen, blen;
>  	unsigned char staged_status;
>  
>  	staged_status = get_staged_status(ie);
> @@ -1845,52 +1877,25 @@ get_file_status(unsigned char *status, struct stat *sb
>  	    got_fileindex_entry_filetype_get(ie) ==
>  	    GOT_FILEIDX_MODE_REGULAR_FILE &&
>  	    ie->size != (sb->st_size & 0xffffffff)) {
> +		int is_binary;
> +
> +		/* The size of regular files differs. */
> +		*status = GOT_STATUS_MODIFY;
> +
>  		/*
> -		 * The size of regular files differs. We can skip the full
> -		 * content comparison loop below but still need to check
> -		 * for conflict markers.
> +		 * We can skip the diff engine for binary files on disk.
> +		 * Text files still need to be checked for conflict markers.
>  		 */
> -		*status = GOT_STATUS_MODIFY;
> -	}
> -
> -	hdrlen = got_object_blob_get_hdrlen(blob);
> -	while (*status == GOT_STATUS_NO_CHANGE) {
> -		const uint8_t *bbuf = got_object_blob_get_read_buf(blob);
> -		err = got_object_blob_read_block(&blen, blob);
> -		if (err)
> +		err = file_is_binary(&is_binary, f);
> +		if (err || is_binary)
>  			goto done;
> -		/* Skip length of blob object header first time around. */
> -		flen = fread(fbuf, 1, sizeof(fbuf) - hdrlen, f);
> -		if (flen == 0 && ferror(f)) {
> -			err = got_error_from_errno("fread");
> -			goto done;
> -		}
> -		if (blen - hdrlen == 0) {
> -			if (flen != 0)
> -				*status = GOT_STATUS_MODIFY;
> -			break;
> -		} else if (flen == 0) {
> -			if (blen - hdrlen != 0)
> -				*status = GOT_STATUS_MODIFY;
> -			break;
> -		} else if (blen - hdrlen == flen) {
> -			/* Skip blob object header first time around. */
> -			if (memcmp(bbuf + hdrlen, fbuf, flen) != 0) {
> -				*status = GOT_STATUS_MODIFY;
> -				break;
> -			}
> -		} else {
> -			*status = GOT_STATUS_MODIFY;
> -			break;
> -		}
> -		hdrlen = 0;
>  	}
>  
> -	if (*status == GOT_STATUS_MODIFY) {
> -		rewind(f);
> -		err = get_modified_file_content_status(status, blob, ie->path,
> -		    sb, f);
> -	} else if (xbit_differs(ie, sb->st_mode))
> +	err = get_modified_file_status(status, blob, ie->path, sb, f);
> +	if (err)
> +		goto done;
> +
> +	if (*status == GOT_STATUS_NO_CHANGE && xbit_differs(ie, sb->st_mode))
>  		*status = GOT_STATUS_MODE_CHANGE;
>  done:
>  	if (fd1 != -1 && close(fd1) == -1 && err == NULL)


diff of the abovementioned changes:


M  lib/worktree.c  |  9+  6-

1 file changed, 9 insertions(+), 6 deletions(-)

path + /home/mark/src/got
commit - 0e374d987f11f714d77bc4fa3605c87f6c839628
blob - 0d5a5b54769d0ac8294a460110d7bfbac89c9b55
file + lib/worktree.c
--- lib/worktree.c
+++ lib/worktree.c
@@ -1735,14 +1735,17 @@ get_symlink_modification_status(unsigned char *status,
 static const struct got_error *
 file_is_binary(int *is_binary , FILE *f)
 {
-	const struct got_error *err = NULL;
 	char buf[8192];
 	size_t r;
-	off_t pos = ftello(f);
+	off_t pos;
 
 	*is_binary = 0;
 
-	if (fseek(f, 0L, SEEK_SET) == -1)
+	pos = ftello(f);
+	if (pos == -1)
+		return got_error_from_errno("ftello");
+
+	if (fseeko(f, 0L, SEEK_SET) == -1)
 		return got_ferror(f, GOT_ERR_IO);
 
 	r = fread(buf, 1, sizeof(buf), f);
@@ -1752,10 +1755,10 @@ file_is_binary(int *is_binary , FILE *f)
 	if (r > 0)
 		*is_binary = memchr(buf, '\0', r) != NULL;
 
-	if (fseek(f, pos, SEEK_SET) == -1 && err == NULL)
-		err = got_ferror(f, GOT_ERR_IO);
+	if (fseeko(f, pos, SEEK_SET) == -1)
+		return got_ferror(f, GOT_ERR_IO);
 
-	return err;
+	return NULL;
 }
 
 static const struct got_error *


-- 
Mark Jamsek <https://bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80  E6E2 2930 DC66 86EE CF68