Download raw body.
speed up 'got status' a little
Stefan Sperling <stsp@stsp.name> wrote:
> On Mon, Jan 20, 2025 at 05:13:07PM +1100, Mark Jamsek wrote:
> > It'd be good if we could elide the conflict marker detection, too, but
> > bypassing the full content comparison when we know the file size is
> > different is a nice little tweak :)
>
> Below is another diff as a next step, and a suggestion for future work.
>
> We can skip the content comparison loop entirely, and always use the
> diff engine which will pass over the content anyway to atomize it.
>
> In the special case a differently-sized binary is found on disk we can
> skip the diff engine. (We sill detect merge conflicts correctly because
> binaries already in conflict status are replaced by a text file on disk
> which contains conflict markers. Our tests cover this behaviour.)
>
> This provides a significant speed-up for large modified binaries:
>
> $ time /usr/local/bin/got st
> M test.mp4
> 0m14.59s real 0m08.00s user 0m04.96s system
> $ time got st
> M test.mp4
> 0m04.73s real 0m02.52s user 0m01.62s system
Nice!
And some cursory measurements reveal a marginal albeit consistent
speed-up in the general case too :)
>
> A future optimization might be to add a diff API which asks for file
> modication status only, rather than the full diff result. This would
> allow the diff engine to stop doing work on modified files earlier,
> as soon as any modification is found. This would speed up more cases.
This also sounds like a good idea.
>
> M lib/worktree.c | 58+ 53-
ok for the diff with a small nit and minor suggestions inline;
for convenience, I've included a diff below that you can apply
on top of yours and commit in case you agree
>
> 1 file changed, 58 insertions(+), 53 deletions(-)
>
> commit - cdc669650695cff64baae8dfcccf30a57d751f82
> commit + c1142a3476cd388c1eddb581390528dc92d36bdd
> blob - 71a215359b98caf08ed6fafa8841fdd8dbd4caac
> blob + 0d5a5b54769d0ac8294a460110d7bfbac89c9b55
> --- lib/worktree.c
> +++ lib/worktree.c
> @@ -1537,11 +1537,12 @@ done:
> }
>
> /*
> - * Upgrade STATUS_MODIFY to STATUS_CONFLICT if a
> - * conflict marker is found in newly added lines only.
> + * Detect whether a file is in STATUS_MODIFY or in STATUS_CONFLICT.
> + * The latter is determined by the presence of conflict markers on
> + * newly added lines.
> */
> static const struct got_error *
> -get_modified_file_content_status(unsigned char *status,
> +get_modified_file_status(unsigned char *status,
> struct got_blob_object *blob, const char *path, struct stat *sb,
> FILE *ondisk_file)
> {
> @@ -1559,9 +1560,6 @@ get_modified_file_content_status(unsigned char *status
> size_t linesize = 0;
> ssize_t linelen;
>
> - if (*status != GOT_STATUS_MODIFY)
> - return NULL;
> -
> f1 = got_opentemp();
> if (f1 == NULL)
> return got_error_from_errno("got_opentemp");
> @@ -1585,12 +1583,22 @@ get_modified_file_content_status(unsigned char *status
> struct diff_chunk_context cc = {};
> off_t pos;
>
> - /*
> - * We can optimise a little by advancing straight
> - * to the next chunk if this one has no added lines.
> - */
> c = diff_chunk_get(r, n);
>
> + /*
> + * Upgrade NO_CHANGE to MODIFY in case there are
> + * chunks which contain changes.
> + */
> + if (*status == GOT_STATUS_NO_CHANGE &&
> + (diff_chunk_type(c) == CHUNK_MINUS ||
> + diff_chunk_type(c) == CHUNK_PLUS))
> + *status = GOT_STATUS_MODIFY;
> +
> + /*
> + * We can optimise conflict detection a little by
> + * advancing straight to the next chunk if this
> + * one has no added lines.
> + */
> if (diff_chunk_type(c) != CHUNK_PLUS) {
> nchunks_parsed = 1;
> continue; /* removed or unchanged lines */
> @@ -1725,18 +1733,42 @@ get_symlink_modification_status(unsigned char *status,
> }
>
> static const struct got_error *
> +file_is_binary(int *is_binary , FILE *f)
> +{
> + const struct got_error *err = NULL;
> + char buf[8192];
> + size_t r;
> + off_t pos = ftello(f);
Please check ftello(3) for failure and split the function call
from the declaration.
> +
> + *is_binary = 0;
> +
> + if (fseek(f, 0L, SEEK_SET) == -1)
> + return got_ferror(f, GOT_ERR_IO);
it seems fseek() is preferred in got.c but maybe use fseeko(3) instead?
> +
> + r = fread(buf, 1, sizeof(buf), f);
> + if (r == 0 && ferror(f))
> + return got_error_from_errno("fread");
> +
> + if (r > 0)
> + *is_binary = memchr(buf, '\0', r) != NULL;
> +
> + if (fseek(f, pos, SEEK_SET) == -1 && err == NULL)
besides its NULL initialisation, err is not assigned before here in this
routine so we can drop the ` && err == NULL`. We could even drop the
`struct got_error *err` declaration altogether and do:
return got_ferror(f, GOT_ERR_IO);
return NULL;
> + err = got_ferror(f, GOT_ERR_IO);
> +
> + return err;
> +}
> +
> +static const struct got_error *
> get_file_status(unsigned char *status, struct stat *sb,
> struct got_fileindex_entry *ie, const char *abspath,
> int dirfd, const char *de_name, struct got_repository *repo)
> {
> const struct got_error *err = NULL;
> struct got_object_id id;
> - size_t hdrlen;
> int fd = -1, fd1 = -1;
> FILE *f = NULL;
> uint8_t fbuf[8192];
> struct got_blob_object *blob = NULL;
> - size_t flen, blen;
> unsigned char staged_status;
>
> staged_status = get_staged_status(ie);
> @@ -1845,52 +1877,25 @@ get_file_status(unsigned char *status, struct stat *sb
> got_fileindex_entry_filetype_get(ie) ==
> GOT_FILEIDX_MODE_REGULAR_FILE &&
> ie->size != (sb->st_size & 0xffffffff)) {
> + int is_binary;
> +
> + /* The size of regular files differs. */
> + *status = GOT_STATUS_MODIFY;
> +
> /*
> - * The size of regular files differs. We can skip the full
> - * content comparison loop below but still need to check
> - * for conflict markers.
> + * We can skip the diff engine for binary files on disk.
> + * Text files still need to be checked for conflict markers.
> */
> - *status = GOT_STATUS_MODIFY;
> - }
> -
> - hdrlen = got_object_blob_get_hdrlen(blob);
> - while (*status == GOT_STATUS_NO_CHANGE) {
> - const uint8_t *bbuf = got_object_blob_get_read_buf(blob);
> - err = got_object_blob_read_block(&blen, blob);
> - if (err)
> + err = file_is_binary(&is_binary, f);
> + if (err || is_binary)
> goto done;
> - /* Skip length of blob object header first time around. */
> - flen = fread(fbuf, 1, sizeof(fbuf) - hdrlen, f);
> - if (flen == 0 && ferror(f)) {
> - err = got_error_from_errno("fread");
> - goto done;
> - }
> - if (blen - hdrlen == 0) {
> - if (flen != 0)
> - *status = GOT_STATUS_MODIFY;
> - break;
> - } else if (flen == 0) {
> - if (blen - hdrlen != 0)
> - *status = GOT_STATUS_MODIFY;
> - break;
> - } else if (blen - hdrlen == flen) {
> - /* Skip blob object header first time around. */
> - if (memcmp(bbuf + hdrlen, fbuf, flen) != 0) {
> - *status = GOT_STATUS_MODIFY;
> - break;
> - }
> - } else {
> - *status = GOT_STATUS_MODIFY;
> - break;
> - }
> - hdrlen = 0;
> }
>
> - if (*status == GOT_STATUS_MODIFY) {
> - rewind(f);
> - err = get_modified_file_content_status(status, blob, ie->path,
> - sb, f);
> - } else if (xbit_differs(ie, sb->st_mode))
> + err = get_modified_file_status(status, blob, ie->path, sb, f);
> + if (err)
> + goto done;
> +
> + if (*status == GOT_STATUS_NO_CHANGE && xbit_differs(ie, sb->st_mode))
> *status = GOT_STATUS_MODE_CHANGE;
> done:
> if (fd1 != -1 && close(fd1) == -1 && err == NULL)
diff of the abovementioned changes:
M lib/worktree.c | 9+ 6-
1 file changed, 9 insertions(+), 6 deletions(-)
path + /home/mark/src/got
commit - 0e374d987f11f714d77bc4fa3605c87f6c839628
blob - 0d5a5b54769d0ac8294a460110d7bfbac89c9b55
file + lib/worktree.c
--- lib/worktree.c
+++ lib/worktree.c
@@ -1735,14 +1735,17 @@ get_symlink_modification_status(unsigned char *status,
static const struct got_error *
file_is_binary(int *is_binary , FILE *f)
{
- const struct got_error *err = NULL;
char buf[8192];
size_t r;
- off_t pos = ftello(f);
+ off_t pos;
*is_binary = 0;
- if (fseek(f, 0L, SEEK_SET) == -1)
+ pos = ftello(f);
+ if (pos == -1)
+ return got_error_from_errno("ftello");
+
+ if (fseeko(f, 0L, SEEK_SET) == -1)
return got_ferror(f, GOT_ERR_IO);
r = fread(buf, 1, sizeof(buf), f);
@@ -1752,10 +1755,10 @@ file_is_binary(int *is_binary , FILE *f)
if (r > 0)
*is_binary = memchr(buf, '\0', r) != NULL;
- if (fseek(f, pos, SEEK_SET) == -1 && err == NULL)
- err = got_ferror(f, GOT_ERR_IO);
+ if (fseeko(f, pos, SEEK_SET) == -1)
+ return got_ferror(f, GOT_ERR_IO);
- return err;
+ return NULL;
}
static const struct got_error *
--
Mark Jamsek <https://bsdbox.org>
GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68
speed up 'got status' a little