2020-04-06 19:59:54 +03:00
|
|
|
#!/bin/sh
|
|
|
|
|
|
|
|
test_description='git log for a path with Bloom filters'
|
|
|
|
. ./test-lib.sh
|
|
|
|
|
|
|
|
GIT_TEST_COMMIT_GRAPH=0
|
|
|
|
GIT_TEST_COMMIT_GRAPH_CHANGED_PATHS=0
|
|
|
|
|
|
|
|
test_expect_success 'setup test - repo, commits, commit graph, log outputs' '
|
|
|
|
git init &&
|
|
|
|
mkdir A A/B A/B/C &&
|
|
|
|
test_commit c1 A/file1 &&
|
|
|
|
test_commit c2 A/B/file2 &&
|
|
|
|
test_commit c3 A/B/C/file3 &&
|
|
|
|
test_commit c4 A/file1 &&
|
|
|
|
test_commit c5 A/B/file2 &&
|
|
|
|
test_commit c6 A/B/C/file3 &&
|
|
|
|
test_commit c7 A/file1 &&
|
|
|
|
test_commit c8 A/B/file2 &&
|
|
|
|
test_commit c9 A/B/C/file3 &&
|
|
|
|
test_commit c10 file_to_be_deleted &&
|
|
|
|
git checkout -b side HEAD~4 &&
|
|
|
|
test_commit side-1 file4 &&
|
|
|
|
git checkout master &&
|
|
|
|
git merge side &&
|
|
|
|
test_commit c11 file5 &&
|
|
|
|
mv file5 file5_renamed &&
|
|
|
|
git add file5_renamed &&
|
|
|
|
git commit -m "rename" &&
|
|
|
|
rm file_to_be_deleted &&
|
|
|
|
git add . &&
|
|
|
|
git commit -m "file removed" &&
|
|
|
|
git commit-graph write --reachable --changed-paths
|
|
|
|
'
|
|
|
|
graph_read_expect () {
|
|
|
|
NUM_CHUNKS=5
|
|
|
|
cat >expect <<- EOF
|
|
|
|
header: 43475048 1 1 $NUM_CHUNKS 0
|
|
|
|
num_commits: $1
|
|
|
|
chunks: oid_fanout oid_lookup commit_metadata bloom_indexes bloom_data
|
|
|
|
EOF
|
|
|
|
test-tool read-graph >actual &&
|
|
|
|
test_cmp expect actual
|
|
|
|
}
|
|
|
|
|
|
|
|
test_expect_success 'commit-graph write wrote out the bloom chunks' '
|
|
|
|
graph_read_expect 15
|
|
|
|
'
|
|
|
|
|
|
|
|
# Turn off any inherited trace2 settings for this test.
|
|
|
|
sane_unset GIT_TRACE2 GIT_TRACE2_PERF GIT_TRACE2_EVENT
|
|
|
|
sane_unset GIT_TRACE2_PERF_BRIEF
|
|
|
|
sane_unset GIT_TRACE2_CONFIG_PARAMS
|
|
|
|
|
|
|
|
setup () {
|
t4216: use an '&&'-chain
In a759bfa9ee (t4216: add end to end tests for git log with Bloom
filters, 2020-04-06), a 'rm' invocation was added without a
corresponding '&&' chain.
When 'trace.perf' already exists, everything works fine. However, the
function can be executed without 'trace.perf' on disk (eg., when the
subset of tests run is altered with '--run'), and so the bare 'rm'
complains about a missing file.
To remove some noise from the test log, invoke 'rm' with '-f', at which
point it is sensible to place the 'rm -f' in an '&&'-chain, which is
both (1) our usual style, and (2) avoids a broken chain in the future if
more commands are added at the beginning of the function.
Helped-by: Eric Sunshine <sunshine@sunshineco.com>
Helped-by: Jeff King <peff@peff.net>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-09 18:22:50 +03:00
|
|
|
rm -f "$TRASH_DIRECTORY/trace.perf" &&
|
2020-04-06 19:59:54 +03:00
|
|
|
git -c core.commitGraph=false log --pretty="format:%s" $1 >log_wo_bloom &&
|
|
|
|
GIT_TRACE2_PERF="$TRASH_DIRECTORY/trace.perf" git -c core.commitGraph=true log --pretty="format:%s" $1 >log_w_bloom
|
|
|
|
}
|
|
|
|
|
|
|
|
test_bloom_filters_used () {
|
|
|
|
log_args=$1
|
commit-graph: introduce 'get_bloom_filter_settings()'
Many places in the code often need a pointer to the commit-graph's
'struct bloom_filter_settings', in which case they often take the value
from the top-most commit-graph.
In the non-split case, this works as expected. In the split case,
however, things get a little tricky. Not all layers in a chain of
incremental commit-graphs are required to themselves have Bloom data,
and so whether or not some part of the code uses Bloom filters depends
entirely on whether or not the top-most level of the commit-graph chain
has Bloom filters.
This has been the behavior since Bloom filters were introduced, and has
been codified into the tests since a759bfa9ee (t4216: add end to end
tests for git log with Bloom filters, 2020-04-06). In fact, t4216.130
requires that Bloom filters are not used in exactly the case described
earlier.
There is no reason that this needs to be the case, since it is perfectly
valid for commits in an earlier layer to have Bloom filters when commits
in a newer layer do not.
Since Bloom settings are guaranteed in practice to be the same for any
layer in a chain that has Bloom data, it is sufficient to traverse the
'->base_graph' pointer until either (1) a non-null 'struct
bloom_filter_settings *' is found, or (2) until we are at the root of
the commit-graph chain.
Introduce a 'get_bloom_filter_settings()' function that does just this,
and use it instead of purely dereferencing the top-most graph's
'->bloom_filter_settings' pointer.
While we're at it, add an additional test in t5324 to guard against code
in the commit-graph writing machinery that doesn't correctly handle a
NULL 'struct bloom_filter *'.
Co-authored-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-09 18:22:44 +03:00
|
|
|
bloom_trace_prefix="statistics:{\"filter_not_present\":${2:-0},\"maybe\""
|
2020-04-06 19:59:54 +03:00
|
|
|
setup "$log_args" &&
|
|
|
|
grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
|
|
|
|
test_cmp log_wo_bloom log_w_bloom &&
|
|
|
|
test_path_is_file "$TRASH_DIRECTORY/trace.perf"
|
|
|
|
}
|
|
|
|
|
|
|
|
test_bloom_filters_not_used () {
|
|
|
|
log_args=$1
|
|
|
|
setup "$log_args" &&
|
2020-05-20 06:44:42 +03:00
|
|
|
! grep -q "statistics:{\"filter_not_present\":" "$TRASH_DIRECTORY/trace.perf" &&
|
2020-04-06 19:59:54 +03:00
|
|
|
test_cmp log_wo_bloom log_w_bloom
|
|
|
|
}
|
|
|
|
|
|
|
|
for path in A A/B A/B/C A/file1 A/B/file2 A/B/C/file3 file4 file5 file5_renamed file_to_be_deleted
|
|
|
|
do
|
|
|
|
for option in "" \
|
|
|
|
"--all" \
|
|
|
|
"--full-history" \
|
|
|
|
"--full-history --simplify-merges" \
|
|
|
|
"--simplify-merges" \
|
|
|
|
"--simplify-by-decoration" \
|
|
|
|
"--follow" \
|
|
|
|
"--first-parent" \
|
|
|
|
"--topo-order" \
|
|
|
|
"--date-order" \
|
|
|
|
"--author-date-order" \
|
|
|
|
"--ancestry-path side..master"
|
|
|
|
do
|
|
|
|
test_expect_success "git log option: $option for path: $path" '
|
2020-09-09 18:23:10 +03:00
|
|
|
test_bloom_filters_used "$option -- $path" &&
|
|
|
|
test_config commitgraph.readChangedPaths false &&
|
|
|
|
test_bloom_filters_not_used "$option -- $path"
|
2020-04-06 19:59:54 +03:00
|
|
|
'
|
|
|
|
done
|
|
|
|
done
|
|
|
|
|
|
|
|
test_expect_success 'git log -- folder works with and without the trailing slash' '
|
|
|
|
test_bloom_filters_used "-- A" &&
|
|
|
|
test_bloom_filters_used "-- A/"
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success 'git log for path that does not exist. ' '
|
|
|
|
test_bloom_filters_used "-- path_does_not_exist"
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success 'git log with --walk-reflogs does not use Bloom filters' '
|
|
|
|
test_bloom_filters_not_used "--walk-reflogs -- A"
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success 'git log -- multiple path specs does not use Bloom filters' '
|
|
|
|
test_bloom_filters_not_used "-- file4 A/file1"
|
|
|
|
'
|
|
|
|
|
2020-07-01 16:27:29 +03:00
|
|
|
test_expect_success 'git log -- "." pathspec at root does not use Bloom filters' '
|
|
|
|
test_bloom_filters_not_used "-- ."
|
|
|
|
'
|
|
|
|
|
2020-04-06 19:59:54 +03:00
|
|
|
test_expect_success 'git log with wildcard that resolves to a single path uses Bloom filters' '
|
|
|
|
test_bloom_filters_used "-- *4" &&
|
|
|
|
test_bloom_filters_used "-- *renamed"
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success 'git log with wildcard that resolves to a multiple paths does not uses Bloom filters' '
|
|
|
|
test_bloom_filters_not_used "-- *" &&
|
|
|
|
test_bloom_filters_not_used "-- file*"
|
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success 'setup - add commit-graph to the chain without Bloom filters' '
|
|
|
|
test_commit c14 A/anotherFile2 &&
|
|
|
|
test_commit c15 A/B/anotherFile2 &&
|
|
|
|
test_commit c16 A/B/C/anotherFile2 &&
|
2020-07-01 16:27:24 +03:00
|
|
|
git commit-graph write --reachable --split --no-changed-paths &&
|
2020-04-06 19:59:54 +03:00
|
|
|
test_line_count = 2 .git/objects/info/commit-graphs/commit-graph-chain
|
|
|
|
'
|
|
|
|
|
commit-graph: introduce 'get_bloom_filter_settings()'
Many places in the code often need a pointer to the commit-graph's
'struct bloom_filter_settings', in which case they often take the value
from the top-most commit-graph.
In the non-split case, this works as expected. In the split case,
however, things get a little tricky. Not all layers in a chain of
incremental commit-graphs are required to themselves have Bloom data,
and so whether or not some part of the code uses Bloom filters depends
entirely on whether or not the top-most level of the commit-graph chain
has Bloom filters.
This has been the behavior since Bloom filters were introduced, and has
been codified into the tests since a759bfa9ee (t4216: add end to end
tests for git log with Bloom filters, 2020-04-06). In fact, t4216.130
requires that Bloom filters are not used in exactly the case described
earlier.
There is no reason that this needs to be the case, since it is perfectly
valid for commits in an earlier layer to have Bloom filters when commits
in a newer layer do not.
Since Bloom settings are guaranteed in practice to be the same for any
layer in a chain that has Bloom data, it is sufficient to traverse the
'->base_graph' pointer until either (1) a non-null 'struct
bloom_filter_settings *' is found, or (2) until we are at the root of
the commit-graph chain.
Introduce a 'get_bloom_filter_settings()' function that does just this,
and use it instead of purely dereferencing the top-most graph's
'->bloom_filter_settings' pointer.
While we're at it, add an additional test in t5324 to guard against code
in the commit-graph writing machinery that doesn't correctly handle a
NULL 'struct bloom_filter *'.
Co-authored-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-09 18:22:44 +03:00
|
|
|
test_expect_success 'use Bloom filters even if the latest graph does not have Bloom filters' '
|
|
|
|
# Ensure that the number of empty filters is equal to the number of
|
|
|
|
# filters in the latest graph layer to prove that they are loaded (and
|
|
|
|
# ignored).
|
|
|
|
test_bloom_filters_used "-- A/B" 3
|
2020-04-06 19:59:54 +03:00
|
|
|
'
|
|
|
|
|
|
|
|
test_expect_success 'setup - add commit-graph to the chain with Bloom filters' '
|
|
|
|
test_commit c17 A/anotherFile3 &&
|
|
|
|
git commit-graph write --reachable --changed-paths --split &&
|
|
|
|
test_line_count = 3 .git/objects/info/commit-graphs/commit-graph-chain
|
|
|
|
'
|
|
|
|
|
|
|
|
test_bloom_filters_used_when_some_filters_are_missing () {
|
|
|
|
log_args=$1
|
commit-graph: check all leading directories in changed path Bloom filters
The file 'dir/subdir/file' can only be modified if its leading
directories 'dir' and 'dir/subdir' are modified as well.
So when checking modified path Bloom filters looking for commits
modifying a path with multiple path components, then check not only
the full path in the Bloom filters, but all its leading directories as
well. Take care to check these paths in "deepest first" order,
because it's the full path that is least likely to be modified, and
the Bloom filter queries can short circuit sooner.
This can significantly reduce the average false positive rate, by
about an order of magnitude or three(!), and can further speed up
pathspec-limited revision walks. The table below compares the average
false positive rate and runtime of
git rev-list HEAD -- "$path"
before and after this change for 5000+ randomly* selected paths from
each repository:
Average false Average Average
positive rate runtime runtime
before after before after difference
------------------------------------------------------------------
git 3.220% 0.7853% 0.0558s 0.0387s -30.6%
linux 2.453% 0.0296% 0.1046s 0.0766s -26.8%
tensorflow 2.536% 0.6977% 0.0594s 0.0420s -29.2%
*Path selection was done with the following pipeline:
git ls-tree -r --name-only HEAD | sort -R | head -n 5000
The improvements in runtime are much smaller than the improvements in
average false positive rate, as we are clearly reaching diminishing
returns here. However, all these timings depend on that accessing
tree objects is reasonably fast (warm caches). If we had a partial
clone and the tree objects had to be fetched from a promisor remote,
e.g.:
$ git clone --filter=tree:0 --bare file://.../webkit.git webkit.notrees.git
$ git -C webkit.git -c core.modifiedPathBloomFilters=1 \
commit-graph write --reachable
$ cp webkit.git/objects/info/commit-graph webkit.notrees.git/objects/info/
$ git -C webkit.notrees.git -c core.modifiedPathBloomFilters=1 \
rev-list HEAD -- "$path"
then checking all leading path component can reduce the runtime from
over an hour to a few seconds (and this is with the clone and the
promisor on the same machine).
This adjusts the tracing values in t4216-log-bloom.sh, which provides a
concrete way to notice the improvement.
Helped-by: Taylor Blau <me@ttaylorr.com>
Helped-by: René Scharfe <l.s.r@web.de>
Signed-off-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-07-01 16:27:30 +03:00
|
|
|
bloom_trace_prefix="statistics:{\"filter_not_present\":3,\"maybe\":6,\"definitely_not\":8"
|
2020-04-06 19:59:54 +03:00
|
|
|
setup "$log_args" &&
|
|
|
|
grep -q "$bloom_trace_prefix" "$TRASH_DIRECTORY/trace.perf" &&
|
|
|
|
test_cmp log_wo_bloom log_w_bloom
|
|
|
|
}
|
|
|
|
|
|
|
|
test_expect_success 'Use Bloom filters if they exist in the latest but not all commit graphs in the chain.' '
|
|
|
|
test_bloom_filters_used_when_some_filters_are_missing "-- A/B"
|
|
|
|
'
|
|
|
|
|
2020-07-01 16:27:24 +03:00
|
|
|
test_expect_success 'persist filter settings' '
|
|
|
|
test_when_finished rm -rf .git/objects/info/commit-graph* &&
|
|
|
|
rm -rf .git/objects/info/commit-graph* &&
|
|
|
|
GIT_TRACE2_EVENT="$(pwd)/trace2.txt" \
|
|
|
|
GIT_TRACE2_EVENT_NESTING=5 \
|
|
|
|
GIT_TEST_BLOOM_SETTINGS_NUM_HASHES=9 \
|
|
|
|
GIT_TEST_BLOOM_SETTINGS_BITS_PER_ENTRY=15 \
|
|
|
|
git commit-graph write --reachable --changed-paths &&
|
2020-09-17 16:34:42 +03:00
|
|
|
grep "{\"hash_version\":1,\"num_hashes\":9,\"bits_per_entry\":15,\"max_changed_paths\":512" trace2.txt &&
|
2020-07-01 16:27:24 +03:00
|
|
|
GIT_TRACE2_EVENT="$(pwd)/trace2-auto.txt" \
|
|
|
|
GIT_TRACE2_EVENT_NESTING=5 \
|
|
|
|
git commit-graph write --reachable --changed-paths &&
|
2020-09-17 16:34:42 +03:00
|
|
|
grep "{\"hash_version\":1,\"num_hashes\":9,\"bits_per_entry\":15,\"max_changed_paths\":512" trace2-auto.txt
|
2020-07-01 16:27:24 +03:00
|
|
|
'
|
|
|
|
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
test_max_changed_paths () {
|
|
|
|
grep "\"max_changed_paths\":$1" $2
|
|
|
|
}
|
|
|
|
|
|
|
|
test_filter_computed () {
|
|
|
|
grep "\"key\":\"filter-computed\",\"value\":\"$1\"" $2
|
|
|
|
}
|
|
|
|
|
|
|
|
test_filter_trunc_large () {
|
|
|
|
grep "\"key\":\"filter-trunc-large\",\"value\":\"$1\"" $2
|
|
|
|
}
|
|
|
|
|
2020-07-01 16:27:23 +03:00
|
|
|
test_expect_success 'correctly report changes over limit' '
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
git init limits &&
|
2020-07-01 16:27:23 +03:00
|
|
|
(
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
cd limits &&
|
|
|
|
mkdir d &&
|
|
|
|
mkdir d/e &&
|
|
|
|
|
|
|
|
for i in $(test_seq 1 2)
|
2020-07-01 16:27:23 +03:00
|
|
|
do
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
printf $i >d/file$i.txt &&
|
|
|
|
printf $i >d/e/file$i.txt || return 1
|
2020-07-01 16:27:23 +03:00
|
|
|
done &&
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
|
|
|
|
mkdir mode &&
|
|
|
|
printf bash >mode/script.sh &&
|
|
|
|
|
|
|
|
mkdir foo &&
|
|
|
|
touch foo/bar &&
|
|
|
|
touch foo.txt &&
|
|
|
|
|
|
|
|
git add d foo foo.txt mode &&
|
2020-07-01 16:27:23 +03:00
|
|
|
git commit -m "files" &&
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
|
|
|
|
# Commit has 7 file and 4 directory adds
|
|
|
|
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=10 \
|
|
|
|
GIT_TRACE2_EVENT="$(pwd)/trace" \
|
|
|
|
git commit-graph write --reachable --changed-paths &&
|
|
|
|
test_max_changed_paths 10 trace &&
|
|
|
|
test_filter_computed 1 trace &&
|
|
|
|
test_filter_trunc_large 1 trace &&
|
|
|
|
|
|
|
|
for path in $(git ls-tree -r --name-only HEAD)
|
|
|
|
do
|
|
|
|
git -c commitGraph.readChangedPaths=false log \
|
|
|
|
-- $path >expect &&
|
|
|
|
git log -- $path >actual &&
|
|
|
|
test_cmp expect actual || return 1
|
|
|
|
done &&
|
|
|
|
|
|
|
|
# Make a variety of path changes
|
|
|
|
printf new1 >d/e/file1.txt &&
|
|
|
|
printf new2 >d/file2.txt &&
|
|
|
|
rm d/e/file2.txt &&
|
|
|
|
rm -r foo &&
|
|
|
|
printf text >foo &&
|
|
|
|
mkdir f &&
|
|
|
|
printf new1 >f/file1.txt &&
|
|
|
|
|
|
|
|
# including a mode-only change (counts as modified)
|
|
|
|
git update-index --chmod=+x mode/script.sh &&
|
|
|
|
|
|
|
|
git add foo d f &&
|
|
|
|
git commit -m "complicated" &&
|
|
|
|
|
|
|
|
# start from scratch and rebuild
|
|
|
|
rm -f .git/objects/info/commit-graph &&
|
|
|
|
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=10 \
|
|
|
|
GIT_TRACE2_EVENT="$(pwd)/trace-edit" \
|
|
|
|
git commit-graph write --reachable --changed-paths &&
|
|
|
|
test_max_changed_paths 10 trace-edit &&
|
|
|
|
test_filter_computed 2 trace-edit &&
|
|
|
|
test_filter_trunc_large 2 trace-edit &&
|
|
|
|
|
|
|
|
for path in $(git ls-tree -r --name-only HEAD)
|
|
|
|
do
|
|
|
|
git -c commitGraph.readChangedPaths=false log \
|
|
|
|
-- $path >expect &&
|
|
|
|
git log -- $path >actual &&
|
|
|
|
test_cmp expect actual || return 1
|
|
|
|
done &&
|
|
|
|
|
|
|
|
# start from scratch and rebuild
|
|
|
|
rm -f .git/objects/info/commit-graph &&
|
|
|
|
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=11 \
|
|
|
|
GIT_TRACE2_EVENT="$(pwd)/trace-update" \
|
|
|
|
git commit-graph write --reachable --changed-paths &&
|
|
|
|
test_max_changed_paths 11 trace-update &&
|
|
|
|
test_filter_computed 2 trace-update &&
|
|
|
|
test_filter_trunc_large 0 trace-update &&
|
|
|
|
|
|
|
|
for path in $(git ls-tree -r --name-only HEAD)
|
2020-07-01 16:27:23 +03:00
|
|
|
do
|
bloom/diff: properly short-circuit on max_changes
Commit e3696980 (diff: halt tree-diff early after max_changes,
2020-03-30) intended to create a mechanism to short-circuit a diff
calculation after a certain number of paths were modified. By
incrementing a "num_changes" counter throughout the recursive
ll_diff_tree_paths(), this was supposed to match the number of changes
that would be written into the changed-path Bloom filters.
Unfortunately, this was not implemented correctly and instead misses
simple cases like file modifications. This then does not stop very
large changed-path filters from being written (unless they add or remove
many files).
To start, change the implementation in ll_diff_tree_paths() to instead
use the global diff_queue_diff struct's 'nr' member as the count. This
is a way to simplify the logic instead of making more mistakes in the
complicated diff code.
This has a drawback: the diff_queue_diff struct only lists the paths
corresponding to blob changes, not their leading directories. Thus,
get_or_compute_bloom_filter() needs an additional check to see if the
hashmap with the leading directories becomes too large.
One reason why this was not caught by test cases was that the test in
t4216-log-bloom.sh that was supposed to check this "too many changes"
condition only checked this on the initial commit of a repository. The
old logic counted these values correctly. Update this test in a few
ways:
1. Use GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS to reduce the limit,
allowing smaller commits to engage with this logic.
2. Create several interesting cases of edits, adds, removes, and mode
changes (in the second commit). By testing both sides of the
inequality with the *_MAX_CHANGED_PATHS variable, we can see that
the count is exactly correct, so none of these changes are missed
or over-counted.
3. Use the trace2 data value filter_found_large to verify that these
commits are on the correct side of the limit.
Another way to verify the behavior is correct is through performance
tests. By testing on my local copies of the Git repository and the Linux
kernel repository, I could measure the effect of these short-circuits
when computing a fresh commit-graph file with changed-path Bloom filters
using the command
GIT_TEST_BLOOM_SETTINGS_MAX_CHANGED_PATHS=N time \
git commit-graph write --reachable --changed-paths
and reporting the wall time and resulting commit-graph size.
For Git, the results are
| | N=1 | N=10 | N=512 |
|--------|----------------|----------------|----------------|
| HEAD~1 | 10.90s 9.18MB | 11.11s 9.34MB | 11.31s 9.35MB |
| HEAD | 9.21s 8.62MB | 11.11s 9.29MB | 11.29s 9.34MB |
For Linux, the results are
| | N=1 | N=20 | N=512 |
|--------|----------------|---------------|---------------|
| HEAD~1 | 61.28s 64.3MB | 76.9s 72.6MB | 77.6s 72.6MB |
| HEAD | 49.44s 56.3MB | 68.7s 65.9MB | 69.2s 65.9MB |
Naturally, the improvement becomes much less as the limit grows, as
fewer commits satisfy the short-circuit.
Reported-by: SZEDER Gábor <szeder.dev@gmail.com>
Signed-off-by: Derrick Stolee <dstolee@microsoft.com>
Signed-off-by: Taylor Blau <me@ttaylorr.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-09-16 21:07:52 +03:00
|
|
|
git -c commitGraph.readChangedPaths=false log \
|
|
|
|
-- $path >expect &&
|
|
|
|
git log -- $path >actual &&
|
2020-07-01 16:27:23 +03:00
|
|
|
test_cmp expect actual || return 1
|
|
|
|
done
|
|
|
|
)
|
|
|
|
'
|
|
|
|
|
2020-05-08 02:51:02 +03:00
|
|
|
test_done
|