From: Stefan Sperling Subject: Re: speed up 'got status' a little To: Mark Jamsek Cc: gameoftrees@openbsd.org Date: Mon, 20 Jan 2025 13:25:48 +0100 On Mon, Jan 20, 2025 at 05:13:07PM +1100, Mark Jamsek wrote: > It'd be good if we could elide the conflict marker detection, too, but > bypassing the full content comparison when we know the file size is > different is a nice little tweak :) Below is another diff as a next step, and a suggestion for future work. We can skip the content comparison loop entirely, and always use the diff engine which will pass over the content anyway to atomize it. In the special case a differently-sized binary is found on disk we can skip the diff engine. (We sill detect merge conflicts correctly because binaries already in conflict status are replaced by a text file on disk which contains conflict markers. Our tests cover this behaviour.) This provides a significant speed-up for large modified binaries: $ time /usr/local/bin/got st M test.mp4 0m14.59s real 0m08.00s user 0m04.96s system $ time got st M test.mp4 0m04.73s real 0m02.52s user 0m01.62s system A future optimization might be to add a diff API which asks for file modication status only, rather than the full diff result. This would allow the diff engine to stop doing work on modified files earlier, as soon as any modification is found. This would speed up more cases. M lib/worktree.c | 58+ 53- 1 file changed, 58 insertions(+), 53 deletions(-) commit - cdc669650695cff64baae8dfcccf30a57d751f82 commit + c1142a3476cd388c1eddb581390528dc92d36bdd blob - 71a215359b98caf08ed6fafa8841fdd8dbd4caac blob + 0d5a5b54769d0ac8294a460110d7bfbac89c9b55 --- lib/worktree.c +++ lib/worktree.c @@ -1537,11 +1537,12 @@ done: } /* - * Upgrade STATUS_MODIFY to STATUS_CONFLICT if a - * conflict marker is found in newly added lines only. + * Detect whether a file is in STATUS_MODIFY or in STATUS_CONFLICT. + * The latter is determined by the presence of conflict markers on + * newly added lines. */ static const struct got_error * -get_modified_file_content_status(unsigned char *status, +get_modified_file_status(unsigned char *status, struct got_blob_object *blob, const char *path, struct stat *sb, FILE *ondisk_file) { @@ -1559,9 +1560,6 @@ get_modified_file_content_status(unsigned char *status size_t linesize = 0; ssize_t linelen; - if (*status != GOT_STATUS_MODIFY) - return NULL; - f1 = got_opentemp(); if (f1 == NULL) return got_error_from_errno("got_opentemp"); @@ -1585,12 +1583,22 @@ get_modified_file_content_status(unsigned char *status struct diff_chunk_context cc = {}; off_t pos; - /* - * We can optimise a little by advancing straight - * to the next chunk if this one has no added lines. - */ c = diff_chunk_get(r, n); + /* + * Upgrade NO_CHANGE to MODIFY in case there are + * chunks which contain changes. + */ + if (*status == GOT_STATUS_NO_CHANGE && + (diff_chunk_type(c) == CHUNK_MINUS || + diff_chunk_type(c) == CHUNK_PLUS)) + *status = GOT_STATUS_MODIFY; + + /* + * We can optimise conflict detection a little by + * advancing straight to the next chunk if this + * one has no added lines. + */ if (diff_chunk_type(c) != CHUNK_PLUS) { nchunks_parsed = 1; continue; /* removed or unchanged lines */ @@ -1725,18 +1733,42 @@ get_symlink_modification_status(unsigned char *status, } static const struct got_error * +file_is_binary(int *is_binary , FILE *f) +{ + const struct got_error *err = NULL; + char buf[8192]; + size_t r; + off_t pos = ftello(f); + + *is_binary = 0; + + if (fseek(f, 0L, SEEK_SET) == -1) + return got_ferror(f, GOT_ERR_IO); + + r = fread(buf, 1, sizeof(buf), f); + if (r == 0 && ferror(f)) + return got_error_from_errno("fread"); + + if (r > 0) + *is_binary = memchr(buf, '\0', r) != NULL; + + if (fseek(f, pos, SEEK_SET) == -1 && err == NULL) + err = got_ferror(f, GOT_ERR_IO); + + return err; +} + +static const struct got_error * get_file_status(unsigned char *status, struct stat *sb, struct got_fileindex_entry *ie, const char *abspath, int dirfd, const char *de_name, struct got_repository *repo) { const struct got_error *err = NULL; struct got_object_id id; - size_t hdrlen; int fd = -1, fd1 = -1; FILE *f = NULL; uint8_t fbuf[8192]; struct got_blob_object *blob = NULL; - size_t flen, blen; unsigned char staged_status; staged_status = get_staged_status(ie); @@ -1845,52 +1877,25 @@ get_file_status(unsigned char *status, struct stat *sb got_fileindex_entry_filetype_get(ie) == GOT_FILEIDX_MODE_REGULAR_FILE && ie->size != (sb->st_size & 0xffffffff)) { + int is_binary; + + /* The size of regular files differs. */ + *status = GOT_STATUS_MODIFY; + /* - * The size of regular files differs. We can skip the full - * content comparison loop below but still need to check - * for conflict markers. + * We can skip the diff engine for binary files on disk. + * Text files still need to be checked for conflict markers. */ - *status = GOT_STATUS_MODIFY; - } - - hdrlen = got_object_blob_get_hdrlen(blob); - while (*status == GOT_STATUS_NO_CHANGE) { - const uint8_t *bbuf = got_object_blob_get_read_buf(blob); - err = got_object_blob_read_block(&blen, blob); - if (err) + err = file_is_binary(&is_binary, f); + if (err || is_binary) goto done; - /* Skip length of blob object header first time around. */ - flen = fread(fbuf, 1, sizeof(fbuf) - hdrlen, f); - if (flen == 0 && ferror(f)) { - err = got_error_from_errno("fread"); - goto done; - } - if (blen - hdrlen == 0) { - if (flen != 0) - *status = GOT_STATUS_MODIFY; - break; - } else if (flen == 0) { - if (blen - hdrlen != 0) - *status = GOT_STATUS_MODIFY; - break; - } else if (blen - hdrlen == flen) { - /* Skip blob object header first time around. */ - if (memcmp(bbuf + hdrlen, fbuf, flen) != 0) { - *status = GOT_STATUS_MODIFY; - break; - } - } else { - *status = GOT_STATUS_MODIFY; - break; - } - hdrlen = 0; } - if (*status == GOT_STATUS_MODIFY) { - rewind(f); - err = get_modified_file_content_status(status, blob, ie->path, - sb, f); - } else if (xbit_differs(ie, sb->st_mode)) + err = get_modified_file_status(status, blob, ie->path, sb, f); + if (err) + goto done; + + if (*status == GOT_STATUS_NO_CHANGE && xbit_differs(ie, sb->st_mode)) *status = GOT_STATUS_MODE_CHANGE; done: if (fd1 != -1 && close(fd1) == -1 && err == NULL)