From: Stefan Sperling Subject: fix blame To: gameoftrees@openbsd.org Date: Mon, 16 Nov 2020 16:26:38 +0100 This patch reimplements the blame algorithm. Neels and I discovered that the old algorithm could yield incorrect results. The old blame algorithm assumed that diffs for N:N-1, N:N-2, N:N-3 etc. will produce a consistently growing set + lines across history. This assumption is wrong! Diff algorithms may decide to partition different input files in entirely different ways, depending on the content of lines present in any given file version. This means there isn't a single answer as to how lines in a file should be blamed, either. To see an example of the problem, look at how the definition of struct tog_cmd in tog/tog.c is annocated: 0073) c2301be8 2018-04-30 stsp struct tog_cmd { 0074) c2301be8 2018-04-30 stsp const char *name; 0075) c2301be8 2018-04-30 stsp const struct got_error *(*cmd_main)(int, char *[]); 0076) c2301be8 2018-04-30 stsp void (*cmd_usage)(void); 0077) c2301be8 2018-04-30 stsp }; Commit c2301be8 did not introduce all of the lines shown above, just the line 'const char *name;'. With the patch below, these lines are annotated correctly: 0073) 9f7d7167 2018-04-29 stsp struct tog_cmd { 0074) c2301be8 2018-04-30 stsp const char *name; 0075) 9f7d7167 2018-04-29 stsp const struct got_error *(*cmd_main)(int, char *[]); 0076) 9f7d7167 2018-04-29 stsp void (*cmd_usage)(void); 0077) 9f7d7167 2018-04-29 stsp }; What users expect is that a line which is attributed to commit N appears as a + line in the diff between commit N-1 and N. The new blame algorithm uses a sequence of diffs between commits N:N-1, N-1:N-2, N-2:N-3 etc. in order to meet this expectation. To make this work, Neels came up with a way to map line numbers from older versions of the file to the latest version. This is an initial stupid implementation which does not yet re-use the blob file from N-1 during commit diffing sequence N:N-1, N-1:N-2. This can be optimized later. Testing uncovered a performance issue in got_object_blob_dump_to_file(). This function calls recallocarray() once per line in the blob. This is very expensive and needs to be fixed. For now, since blame only needs to know the number of lines in the blob, and doesn't care about line offsets, I have worked around this by decoupling the line_offsets output paramter from the nlines output parameter. ok? diff refs/heads/main refs/heads/blame blob - 63258946e8404a1df4a6b0bd159a0fcc690a7932 blob + 341dff3c13879436458fa44d3796e3a7a9a14479 --- lib/blame.c +++ lib/blame.c @@ -51,9 +51,7 @@ struct got_blame_line { struct got_blame { FILE *f; - char *map; - struct diff_data left_data; - struct diff_data right_data; + off_t size; const struct diff_config *cfg; size_t filesize; int nlines; @@ -61,6 +59,15 @@ struct got_blame { struct got_blame_line *lines; /* one per line */ off_t *line_offsets; /* one per line */ int ncommits; + + /* + * Map line numbers of an older version of the file to valid line + * numbers in blame->f. This map is updated with each commit we + * traverse throughout the file's history. + * Lines mapped to -1 do not correspond to any line in blame->f. + */ + int *linemap2; + int nlines2; }; static const struct got_error * @@ -71,10 +78,10 @@ annotate_line(struct got_blame *blame, int lineno, str const struct got_error *err = NULL; struct got_blame_line *line; - if (lineno < 1 || lineno > blame->nlines) + if (lineno < 0 || lineno >= blame->nlines) return NULL; - line = &blame->lines[lineno - 1]; + line = &blame->lines[lineno]; if (line->annotated) return NULL; @@ -82,38 +89,54 @@ annotate_line(struct got_blame *blame, int lineno, str line->annotated = 1; blame->nannotated++; if (cb) - err = cb(arg, blame->nlines, lineno, id); + err = cb(arg, blame->nlines, lineno + 1, id); return err; } static const struct got_error * -blame_changes(struct got_blame *blame, struct diff_result *diff_result, - struct got_object_id *commit_id, +blame_changes(struct got_blame *blame, int *linemap1, + struct diff_result *diff_result, struct got_object_id *commit_id, const struct got_error *(*cb)(void *, int, int, struct got_object_id *), void *arg) { const struct got_error *err = NULL; int i; + int idx1 = 0, idx2 = 0; - for (i = 0; i < diff_result->chunks.len; i++) { + for (i = 0; i < diff_result->chunks.len && + blame->nannotated < blame->nlines; i++) { struct diff_chunk *c = diff_chunk_get(diff_result, i); + unsigned int left_start, left_count; unsigned int right_start, right_count; - int lineno, len; - int ln; + int j; - if (diff_chunk_get_left_count(c) != 0) - continue; - - len = diff_chunk_get_right_count(c); - if (len == 0) - continue; - + /* + * We do not need to worry about idx1/idx2 growing out + * of bounds because the diff implementation ensures + * that chunk ranges never exceed the number of lines + * in the left/right input files. + */ + left_start = diff_chunk_get_left_start(c, diff_result, 0); + left_count = diff_chunk_get_left_count(c); right_start = diff_chunk_get_right_start(c, diff_result, 0); right_count = diff_chunk_get_right_count(c); - lineno = right_start + 1; - len = right_count; - for (ln = lineno; ln < lineno + len; ln++) { + if (left_count == right_count) { + for (j = 0; j < left_count; j++) { + linemap1[idx1++] = blame->linemap2[idx2++]; + } + continue; + } + + if (right_count == 0) { + for (j = 0; j < left_count; j++) { + linemap1[idx1++] = -1; + } + continue; + } + + for (j = 0; j < right_count; j++) { + int ln = blame->linemap2[idx2++]; err = annotate_line(blame, ln, commit_id, cb, arg); if (err) return err; @@ -126,73 +149,117 @@ blame_changes(struct got_blame *blame, struct diff_res } static const struct got_error * -blame_commit(struct got_blame *blame, struct got_object_id *parent_id, - struct got_object_id *id, const char *path, struct got_repository *repo, +blame_commit(struct got_blame *blame, struct got_object_id *id, + const char *path, struct got_repository *repo, const struct got_error *(*cb)(void *, int, int, struct got_object_id *), void *arg) { const struct got_error *err = NULL; - struct got_object *obj = NULL; - struct got_object_id *obj_id = NULL; struct got_commit_object *commit = NULL; - struct got_blob_object *blob = NULL; + struct got_object_qid *pid = NULL; + struct got_object_id *blob_id = NULL, *pblob_id = NULL; + struct got_blob_object *blob = NULL, *pblob = NULL; struct got_diffreg_result *diffreg_result = NULL; + FILE *f1 = NULL, *f2 = NULL; + size_t size1, size2; + int nlines1, nlines2; + int *linemap1 = NULL; - err = got_object_open_as_commit(&commit, repo, parent_id); + err = got_object_open_as_commit(&commit, repo, id); if (err) return err; - err = got_object_id_by_path(&obj_id, repo, parent_id, path); + pid = SIMPLEQ_FIRST(got_object_commit_get_parent_ids(commit)); + if (pid == NULL) { + got_object_commit_close(commit); + return NULL; + } + + err = got_object_id_by_path(&blob_id, repo, id, path); if (err) { if (err->code == GOT_ERR_NO_TREE_ENTRY) err = NULL; goto done; } - err = got_object_open(&obj, repo, obj_id); + err = got_object_open_as_blob(&blob, repo, blob_id, 8192); if (err) goto done; - if (obj->type != GOT_OBJ_TYPE_BLOB) { - err = got_error_path(path, GOT_ERR_OBJ_TYPE); + f2 = got_opentemp(); + if (f2 == NULL) { + err = got_error_from_errno("got_opentemp"); goto done; } - - err = got_object_blob_open(&blob, repo, obj, 8192); + err = got_object_blob_dump_to_file(&size2, &nlines2, NULL, + f2, blob); if (err) goto done; - if (fseek(blame->f, 0L, SEEK_SET) == -1) { - err = got_ferror(blame->f, GOT_ERR_IO); + err = got_object_id_by_path(&pblob_id, repo, pid->id, path); + if (err) { + if (err->code == GOT_ERR_NO_TREE_ENTRY) + err = NULL; goto done; } - diff_data_free(&blame->left_data); - memset(&blame->left_data, 0, sizeof(blame->left_data)); - err = got_diff_blob_prepared_file(&diffreg_result, &blame->left_data, - blob, &blame->right_data, blame->f, blame->map, blame->filesize, - blame->cfg, 0); + err = got_object_open_as_blob(&pblob, repo, pblob_id, 8192); if (err) goto done; + f1 = got_opentemp(); + if (f1 == NULL) { + err = got_error_from_errno("got_opentemp"); + goto done; + } + err = got_object_blob_dump_to_file(&size1, &nlines1, NULL, f1, pblob); + if (err) + goto done; + + err = got_diff_files(&diffreg_result, f1, "", f2, "", + 0, 0, NULL); + if (err) + goto done; if (diffreg_result->result->chunks.len > 0) { - err = blame_changes(blame, diffreg_result->result, id, cb, arg); + if (nlines1 > 0) { + linemap1 = calloc(nlines1, sizeof(*linemap1)); + if (linemap1 == NULL) { + err = got_error_from_errno("malloc"); + goto done; + } + } + err = blame_changes(blame, linemap1, + diffreg_result->result, id, cb, arg); + if (err) { + free(linemap1); + goto done; + } + if (linemap1) { + free(blame->linemap2); + blame->linemap2 = linemap1; + blame->nlines2 = nlines1; + } } else if (cb) err = cb(arg, blame->nlines, -1, id); done: if (diffreg_result) { const struct got_error *free_err; - free_err = got_diffreg_result_free_left(diffreg_result); + free_err = got_diffreg_result_free(diffreg_result); if (free_err && err == NULL) err = free_err; } if (commit) got_object_commit_close(commit); - free(obj_id); - if (obj) - got_object_close(obj); + free(blob_id); + free(pblob_id); if (blob) got_object_blob_close(blob); + if (pblob) + got_object_blob_close(pblob); + if (f1 && fclose(f1) != 0 && err == NULL) + err = got_error_from_errno("fclose"); + if (f2 && fclose(f2) != 0 && err == NULL) + err = got_error_from_errno("fclose"); return err; } @@ -201,13 +268,10 @@ blame_close(struct got_blame *blame) { const struct got_error *err = NULL; - diff_data_free(&blame->left_data); - diff_data_free(&blame->right_data); - if (blame->map && munmap(blame->map, blame->filesize) == -1) - err = got_error_from_errno("munmap"); if (blame->f && fclose(blame->f) != 0 && err == NULL) err = got_error_from_errno("fclose"); free(blame->lines); + free(blame->linemap2); free(blame); return err; } @@ -223,16 +287,15 @@ blame_open(struct got_blame **blamep, const char *path struct got_object_id *obj_id = NULL; struct got_blob_object *blob = NULL; struct got_blame *blame = NULL; - struct got_object_id *id = NULL, *pid = NULL; - int lineno, created; - size_t size; + struct got_object_id *id = NULL; + int lineno; struct got_commit_graph *graph = NULL; *blamep = NULL; err = got_object_id_by_path(&obj_id, repo, start_commit_id, path); if (err) - return err; + goto done; err = got_object_open(&obj, repo, obj_id); if (err) @@ -248,8 +311,10 @@ blame_open(struct got_blame **blamep, const char *path goto done; blame = calloc(1, sizeof(*blame)); - if (blame == NULL) - return got_error_from_errno("calloc"); + if (blame == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } blame->f = got_opentemp(); if (blame->f == NULL) { @@ -261,11 +326,7 @@ blame_open(struct got_blame **blamep, const char *path if (err || blame->nlines == 0) goto done; - blame->cfg = got_diff_get_config(GOT_DIFF_ALGORITHM_PATIENCE), - err = got_diff_prepare_file(&blame->f, &blame->map, &created, &size, - &blame->right_data, blame->cfg, 0); - if (err) - goto done; + blame->cfg = got_diff_get_config(GOT_DIFF_ALGORITHM_PATIENCE); /* Don't include \n at EOF in the blame line count. */ if (blame->line_offsets[blame->nlines - 1] == blame->filesize) @@ -277,38 +338,50 @@ blame_open(struct got_blame **blamep, const char *path goto done; } + blame->nlines2 = blame->nlines; + blame->linemap2 = calloc(blame->nlines2, sizeof(*blame->linemap2)); + if (blame->linemap2 == NULL) { + err = got_error_from_errno("calloc"); + goto done; + } + for (lineno = 0; lineno < blame->nlines2; lineno++) + blame->linemap2[lineno] = lineno; + err = got_commit_graph_open(&graph, path, 1); if (err) - return err; + goto done; + err = got_commit_graph_iter_start(graph, start_commit_id, repo, cancel_cb, cancel_arg); if (err) goto done; - id = start_commit_id; for (;;) { - err = got_commit_graph_iter_next(&pid, graph, repo, + struct got_object_id *next_id; + err = got_commit_graph_iter_next(&next_id, graph, repo, cancel_cb, cancel_arg); if (err) { - if (err->code == GOT_ERR_ITER_COMPLETED) + if (err->code == GOT_ERR_ITER_COMPLETED) { err = NULL; - break; + break; + } + goto done; } - if (pid) { - err = blame_commit(blame, pid, id, path, repo, cb, arg); + if (next_id) { + id = next_id; + err = blame_commit(blame, id, path, repo, cb, arg); if (err) { if (err->code == GOT_ERR_ITER_COMPLETED) err = NULL; - break; + goto done; } if (blame->nannotated == blame->nlines) break; } - id = pid; } if (id && blame->nannotated < blame->nlines) { /* Annotate remaining non-annotated lines with last commit. */ - for (lineno = 1; lineno <= blame->nlines; lineno++) { + for (lineno = 0; lineno < blame->nlines; lineno++) { err = annotate_line(blame, lineno, id, cb, arg); if (err) goto done; blob - a8e56d3ae101b242e7f90d82f2276a5ab517a4fb blob + a74eaa79aa8bbba7182e603f859139548a3255cc --- lib/object.c +++ lib/object.c @@ -1300,8 +1300,8 @@ got_object_blob_dump_to_file(size_t *filesize, int *nl break; buf = got_object_blob_get_read_buf(blob); i = hdrlen; - if (line_offsets && nlines) { - if (*line_offsets == NULL) { + if (nlines) { + if (line_offsets && *line_offsets == NULL) { /* Have some data but perhaps no '\n'. */ noffsets = 1; *nlines = 1; @@ -1323,7 +1323,7 @@ got_object_blob_dump_to_file(size_t *filesize, int *nl continue; } (*nlines)++; - if (noffsets < *nlines) { + if (line_offsets && noffsets < *nlines) { off_t *o = recallocarray(*line_offsets, noffsets, *nlines, sizeof(**line_offsets)); @@ -1336,8 +1336,10 @@ got_object_blob_dump_to_file(size_t *filesize, int *nl *line_offsets = o; noffsets = *nlines; } - off = total_len + i - hdrlen + 1; - (*line_offsets)[*nlines - 1] = off; + if (line_offsets) { + off = total_len + i - hdrlen + 1; + (*line_offsets)[*nlines - 1] = off; + } i++; } } blob - 10d305bc5b630ca236e00ba304d323a7c271501d blob + d666c6b3541b953a5c3463d3a8b2c9a0d3ade54c --- regress/cmdline/blame.sh +++ regress/cmdline/blame.sh @@ -882,6 +882,104 @@ test_blame_symlink() { test_done "$testroot" "$ret" } +test_blame_lines_shifted_skip() { + local testroot=`test_init blame_lines_shifted_skip` + + got checkout $testroot/repo $testroot/wt > /dev/null + ret="$?" + if [ "$ret" != "0" ]; then + test_done "$testroot" "$ret" + return 1 + fi + + cat > $testroot/wt/alpha < /dev/null) + local commit1=`git_show_head $testroot/repo` + local short_commit1=`trim_obj_id 32 $commit1` + local author_time=`git_show_author_time $testroot/repo` + + cat > $testroot/wt/alpha < /dev/null) + local commit2=`git_show_head $testroot/repo` + local short_commit2=`trim_obj_id 32 $commit2` + + cat > $testroot/wt/alpha < /dev/null) + local commit3=`git_show_head $testroot/repo` + local short_commit3=`trim_obj_id 32 $commit3` + local author_time=`git_show_author_time $testroot/repo` + + cat > $testroot/wt/alpha < /dev/null) + local commit4=`git_show_head $testroot/repo` + local short_commit4=`trim_obj_id 32 $commit4` + local author_time=`git_show_author_time $testroot/repo` + + cat > $testroot/wt/alpha < /dev/null) + local commit5=`git_show_head $testroot/repo` + local short_commit5=`trim_obj_id 32 $commit5` + local author_time=`git_show_author_time $testroot/repo` + + (cd $testroot/wt && got blame alpha > $testroot/stdout) + + d=`date -r $author_time +"%G-%m-%d"` + echo "1) $short_commit5 $d $GOT_AUTHOR_8 X" > $testroot/stdout.expected + echo "2) $short_commit1 $d $GOT_AUTHOR_8 A" >> $testroot/stdout.expected + echo "3) $short_commit1 $d $GOT_AUTHOR_8 B" >> $testroot/stdout.expected + echo "4) $short_commit1 $d $GOT_AUTHOR_8 C" >> $testroot/stdout.expected + echo "5) $short_commit2 $d $GOT_AUTHOR_8 P" >> $testroot/stdout.expected + echo "6) $short_commit4 $d $GOT_AUTHOR_8 Y" >> $testroot/stdout.expected + echo "7) $short_commit2 $d $GOT_AUTHOR_8 Q" >> $testroot/stdout.expected + + cmp -s $testroot/stdout.expected $testroot/stdout + ret="$?" + if [ "$ret" != "0" ]; then + diff -u $testroot/stdout.expected $testroot/stdout + test_done "$testroot" "$ret" + return 1 + fi + + blame_cmp "$testroot" "alpha" + ret="$?" + test_done "$testroot" "$ret" +} + test_parseargs "$@" run_test test_blame_basic run_test test_blame_tag @@ -895,3 +993,4 @@ run_test test_blame_blame_h run_test test_blame_added_on_branch run_test test_blame_submodule run_test test_blame_symlink +run_test test_blame_lines_shifted_skip