Download raw body.
speed up 'got status' a little
Stefan Sperling <stsp@stsp.name> wrote: > On Mon, Jan 20, 2025 at 05:13:07PM +1100, Mark Jamsek wrote: > > It'd be good if we could elide the conflict marker detection, too, but > > bypassing the full content comparison when we know the file size is > > different is a nice little tweak :) > > Below is another diff as a next step, and a suggestion for future work. > > We can skip the content comparison loop entirely, and always use the > diff engine which will pass over the content anyway to atomize it. > > In the special case a differently-sized binary is found on disk we can > skip the diff engine. (We sill detect merge conflicts correctly because > binaries already in conflict status are replaced by a text file on disk > which contains conflict markers. Our tests cover this behaviour.) > > This provides a significant speed-up for large modified binaries: > > $ time /usr/local/bin/got st > M test.mp4 > 0m14.59s real 0m08.00s user 0m04.96s system > $ time got st > M test.mp4 > 0m04.73s real 0m02.52s user 0m01.62s system Nice! And some cursory measurements reveal a marginal albeit consistent speed-up in the general case too :) > > A future optimization might be to add a diff API which asks for file > modication status only, rather than the full diff result. This would > allow the diff engine to stop doing work on modified files earlier, > as soon as any modification is found. This would speed up more cases. This also sounds like a good idea. > > M lib/worktree.c | 58+ 53- ok for the diff with a small nit and minor suggestions inline; for convenience, I've included a diff below that you can apply on top of yours and commit in case you agree > > 1 file changed, 58 insertions(+), 53 deletions(-) > > commit - cdc669650695cff64baae8dfcccf30a57d751f82 > commit + c1142a3476cd388c1eddb581390528dc92d36bdd > blob - 71a215359b98caf08ed6fafa8841fdd8dbd4caac > blob + 0d5a5b54769d0ac8294a460110d7bfbac89c9b55 > --- lib/worktree.c > +++ lib/worktree.c > @@ -1537,11 +1537,12 @@ done: > } > > /* > - * Upgrade STATUS_MODIFY to STATUS_CONFLICT if a > - * conflict marker is found in newly added lines only. > + * Detect whether a file is in STATUS_MODIFY or in STATUS_CONFLICT. > + * The latter is determined by the presence of conflict markers on > + * newly added lines. > */ > static const struct got_error * > -get_modified_file_content_status(unsigned char *status, > +get_modified_file_status(unsigned char *status, > struct got_blob_object *blob, const char *path, struct stat *sb, > FILE *ondisk_file) > { > @@ -1559,9 +1560,6 @@ get_modified_file_content_status(unsigned char *status > size_t linesize = 0; > ssize_t linelen; > > - if (*status != GOT_STATUS_MODIFY) > - return NULL; > - > f1 = got_opentemp(); > if (f1 == NULL) > return got_error_from_errno("got_opentemp"); > @@ -1585,12 +1583,22 @@ get_modified_file_content_status(unsigned char *status > struct diff_chunk_context cc = {}; > off_t pos; > > - /* > - * We can optimise a little by advancing straight > - * to the next chunk if this one has no added lines. > - */ > c = diff_chunk_get(r, n); > > + /* > + * Upgrade NO_CHANGE to MODIFY in case there are > + * chunks which contain changes. > + */ > + if (*status == GOT_STATUS_NO_CHANGE && > + (diff_chunk_type(c) == CHUNK_MINUS || > + diff_chunk_type(c) == CHUNK_PLUS)) > + *status = GOT_STATUS_MODIFY; > + > + /* > + * We can optimise conflict detection a little by > + * advancing straight to the next chunk if this > + * one has no added lines. > + */ > if (diff_chunk_type(c) != CHUNK_PLUS) { > nchunks_parsed = 1; > continue; /* removed or unchanged lines */ > @@ -1725,18 +1733,42 @@ get_symlink_modification_status(unsigned char *status, > } > > static const struct got_error * > +file_is_binary(int *is_binary , FILE *f) > +{ > + const struct got_error *err = NULL; > + char buf[8192]; > + size_t r; > + off_t pos = ftello(f); Please check ftello(3) for failure and split the function call from the declaration. > + > + *is_binary = 0; > + > + if (fseek(f, 0L, SEEK_SET) == -1) > + return got_ferror(f, GOT_ERR_IO); it seems fseek() is preferred in got.c but maybe use fseeko(3) instead? > + > + r = fread(buf, 1, sizeof(buf), f); > + if (r == 0 && ferror(f)) > + return got_error_from_errno("fread"); > + > + if (r > 0) > + *is_binary = memchr(buf, '\0', r) != NULL; > + > + if (fseek(f, pos, SEEK_SET) == -1 && err == NULL) besides its NULL initialisation, err is not assigned before here in this routine so we can drop the ` && err == NULL`. We could even drop the `struct got_error *err` declaration altogether and do: return got_ferror(f, GOT_ERR_IO); return NULL; > + err = got_ferror(f, GOT_ERR_IO); > + > + return err; > +} > + > +static const struct got_error * > get_file_status(unsigned char *status, struct stat *sb, > struct got_fileindex_entry *ie, const char *abspath, > int dirfd, const char *de_name, struct got_repository *repo) > { > const struct got_error *err = NULL; > struct got_object_id id; > - size_t hdrlen; > int fd = -1, fd1 = -1; > FILE *f = NULL; > uint8_t fbuf[8192]; > struct got_blob_object *blob = NULL; > - size_t flen, blen; > unsigned char staged_status; > > staged_status = get_staged_status(ie); > @@ -1845,52 +1877,25 @@ get_file_status(unsigned char *status, struct stat *sb > got_fileindex_entry_filetype_get(ie) == > GOT_FILEIDX_MODE_REGULAR_FILE && > ie->size != (sb->st_size & 0xffffffff)) { > + int is_binary; > + > + /* The size of regular files differs. */ > + *status = GOT_STATUS_MODIFY; > + > /* > - * The size of regular files differs. We can skip the full > - * content comparison loop below but still need to check > - * for conflict markers. > + * We can skip the diff engine for binary files on disk. > + * Text files still need to be checked for conflict markers. > */ > - *status = GOT_STATUS_MODIFY; > - } > - > - hdrlen = got_object_blob_get_hdrlen(blob); > - while (*status == GOT_STATUS_NO_CHANGE) { > - const uint8_t *bbuf = got_object_blob_get_read_buf(blob); > - err = got_object_blob_read_block(&blen, blob); > - if (err) > + err = file_is_binary(&is_binary, f); > + if (err || is_binary) > goto done; > - /* Skip length of blob object header first time around. */ > - flen = fread(fbuf, 1, sizeof(fbuf) - hdrlen, f); > - if (flen == 0 && ferror(f)) { > - err = got_error_from_errno("fread"); > - goto done; > - } > - if (blen - hdrlen == 0) { > - if (flen != 0) > - *status = GOT_STATUS_MODIFY; > - break; > - } else if (flen == 0) { > - if (blen - hdrlen != 0) > - *status = GOT_STATUS_MODIFY; > - break; > - } else if (blen - hdrlen == flen) { > - /* Skip blob object header first time around. */ > - if (memcmp(bbuf + hdrlen, fbuf, flen) != 0) { > - *status = GOT_STATUS_MODIFY; > - break; > - } > - } else { > - *status = GOT_STATUS_MODIFY; > - break; > - } > - hdrlen = 0; > } > > - if (*status == GOT_STATUS_MODIFY) { > - rewind(f); > - err = get_modified_file_content_status(status, blob, ie->path, > - sb, f); > - } else if (xbit_differs(ie, sb->st_mode)) > + err = get_modified_file_status(status, blob, ie->path, sb, f); > + if (err) > + goto done; > + > + if (*status == GOT_STATUS_NO_CHANGE && xbit_differs(ie, sb->st_mode)) > *status = GOT_STATUS_MODE_CHANGE; > done: > if (fd1 != -1 && close(fd1) == -1 && err == NULL) diff of the abovementioned changes: M lib/worktree.c | 9+ 6- 1 file changed, 9 insertions(+), 6 deletions(-) path + /home/mark/src/got commit - 0e374d987f11f714d77bc4fa3605c87f6c839628 blob - 0d5a5b54769d0ac8294a460110d7bfbac89c9b55 file + lib/worktree.c --- lib/worktree.c +++ lib/worktree.c @@ -1735,14 +1735,17 @@ get_symlink_modification_status(unsigned char *status, static const struct got_error * file_is_binary(int *is_binary , FILE *f) { - const struct got_error *err = NULL; char buf[8192]; size_t r; - off_t pos = ftello(f); + off_t pos; *is_binary = 0; - if (fseek(f, 0L, SEEK_SET) == -1) + pos = ftello(f); + if (pos == -1) + return got_error_from_errno("ftello"); + + if (fseeko(f, 0L, SEEK_SET) == -1) return got_ferror(f, GOT_ERR_IO); r = fread(buf, 1, sizeof(buf), f); @@ -1752,10 +1755,10 @@ file_is_binary(int *is_binary , FILE *f) if (r > 0) *is_binary = memchr(buf, '\0', r) != NULL; - if (fseek(f, pos, SEEK_SET) == -1 && err == NULL) - err = got_ferror(f, GOT_ERR_IO); + if (fseeko(f, pos, SEEK_SET) == -1) + return got_ferror(f, GOT_ERR_IO); - return err; + return NULL; } static const struct got_error * -- Mark Jamsek <https://bsdbox.org> GPG: F2FF 13DE 6A06 C471 CA80 E6E2 2930 DC66 86EE CF68
speed up 'got status' a little