2006-05-01 10:28:15 +04:00
|
|
|
/*
|
|
|
|
* Builtin "git grep"
|
|
|
|
*
|
|
|
|
* Copyright (c) 2006 Junio C Hamano
|
|
|
|
*/
|
2019-01-24 11:29:12 +03:00
|
|
|
#define USE_THE_INDEX_COMPATIBILITY_MACROS
|
2006-05-01 10:28:15 +04:00
|
|
|
#include "cache.h"
|
2017-06-22 21:43:46 +03:00
|
|
|
#include "repository.h"
|
2017-06-14 21:07:36 +03:00
|
|
|
#include "config.h"
|
2006-05-01 10:28:15 +04:00
|
|
|
#include "blob.h"
|
|
|
|
#include "tree.h"
|
|
|
|
#include "commit.h"
|
|
|
|
#include "tag.h"
|
2006-05-02 02:58:29 +04:00
|
|
|
#include "tree-walk.h"
|
2006-05-01 10:28:15 +04:00
|
|
|
#include "builtin.h"
|
2009-05-07 23:46:48 +04:00
|
|
|
#include "parse-options.h"
|
2010-06-12 20:36:51 +04:00
|
|
|
#include "string-list.h"
|
|
|
|
#include "run-command.h"
|
2009-07-02 02:07:24 +04:00
|
|
|
#include "userdiff.h"
|
2006-09-18 03:02:52 +04:00
|
|
|
#include "grep.h"
|
2009-09-05 16:31:17 +04:00
|
|
|
#include "quote.h"
|
2010-02-06 21:40:08 +03:00
|
|
|
#include "dir.h"
|
2013-07-14 12:35:25 +04:00
|
|
|
#include "pathspec.h"
|
2016-12-16 22:03:20 +03:00
|
|
|
#include "submodule.h"
|
2016-12-16 22:03:21 +03:00
|
|
|
#include "submodule-config.h"
|
2018-03-23 20:20:55 +03:00
|
|
|
#include "object-store.h"
|
2020-01-16 05:39:57 +03:00
|
|
|
#include "packfile.h"
|
2010-01-26 01:51:39 +03:00
|
|
|
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a subsequent change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:34 +03:00
|
|
|
static const char *grep_prefix;
|
|
|
|
|
2009-05-07 23:46:48 +04:00
|
|
|
static char const * const grep_usage[] = {
|
2015-01-13 10:44:47 +03:00
|
|
|
N_("git grep [<options>] [-e] <pattern> [<rev>...] [[--] <path>...]"),
|
2009-05-07 23:46:48 +04:00
|
|
|
NULL
|
|
|
|
};
|
|
|
|
|
2016-12-16 22:03:20 +03:00
|
|
|
static int recurse_submodules;
|
|
|
|
|
2015-12-15 18:31:39 +03:00
|
|
|
static int num_threads;
|
2010-01-26 01:51:39 +03:00
|
|
|
|
2015-12-15 18:31:39 +03:00
|
|
|
static pthread_t *threads;
|
2010-01-26 01:51:39 +03:00
|
|
|
|
|
|
|
/* We use one producer thread and THREADS consumer
|
|
|
|
* threads. The producer adds struct work_items to 'todo' and the
|
|
|
|
* consumers pick work items from the same array.
|
|
|
|
*/
|
2011-03-16 10:08:34 +03:00
|
|
|
struct work_item {
|
2012-02-02 12:19:37 +04:00
|
|
|
struct grep_source source;
|
2010-01-26 01:51:39 +03:00
|
|
|
char done;
|
|
|
|
struct strbuf out;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* In the range [todo_done, todo_start) in 'todo' we have work_items
|
|
|
|
* that have been or are processed by a consumer thread. We haven't
|
|
|
|
* written the result for these to stdout yet.
|
|
|
|
*
|
|
|
|
* The work_items in [todo_start, todo_end) are waiting to be picked
|
|
|
|
* up by a consumer thread.
|
|
|
|
*
|
|
|
|
* The ranges are modulo TODO_SIZE.
|
|
|
|
*/
|
|
|
|
#define TODO_SIZE 128
|
|
|
|
static struct work_item todo[TODO_SIZE];
|
|
|
|
static int todo_start;
|
|
|
|
static int todo_end;
|
|
|
|
static int todo_done;
|
|
|
|
|
|
|
|
/* Has all work items been added? */
|
|
|
|
static int all_work_added;
|
|
|
|
|
2021-08-17 00:09:55 +03:00
|
|
|
static struct repository **repos_to_free;
|
|
|
|
static size_t repos_to_free_nr, repos_to_free_alloc;
|
|
|
|
|
2010-01-26 01:51:39 +03:00
|
|
|
/* This lock protects all the variables above. */
|
|
|
|
static pthread_mutex_t grep_mutex;
|
|
|
|
|
2011-10-26 22:45:15 +04:00
|
|
|
static inline void grep_lock(void)
|
|
|
|
{
|
2017-05-25 22:45:35 +03:00
|
|
|
pthread_mutex_lock(&grep_mutex);
|
2011-10-26 22:45:15 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void grep_unlock(void)
|
|
|
|
{
|
2017-05-25 22:45:35 +03:00
|
|
|
pthread_mutex_unlock(&grep_mutex);
|
2011-10-26 22:45:15 +04:00
|
|
|
}
|
|
|
|
|
2010-01-26 01:51:39 +03:00
|
|
|
/* Signalled each time a new work_item is appended to 'todo'. */
static pthread_cond_t cond_add;

/*
 * Signalled once the result of a work_item has been written to
 * stdout.
 */
static pthread_cond_t cond_write;

/* Signalled when everything has been finished. */
static pthread_cond_t cond_result;

/*
 * When set, work_done() drops the first line of the first non-empty
 * output it writes (the leading hunk mark) and then clears the flag.
 */
static int skip_first_line;
|
2010-03-15 19:21:10 +03:00
|
|
|
|
2020-01-16 05:39:59 +03:00
|
|
|
static void add_work(struct grep_opt *opt, struct grep_source *gs)
|
2010-01-26 01:51:39 +03:00
|
|
|
{
|
2020-01-16 05:39:59 +03:00
|
|
|
if (opt->binary != GREP_BINARY_TEXT)
|
|
|
|
grep_source_load_driver(gs, opt->repo->index);
|
|
|
|
|
2010-01-26 01:51:39 +03:00
|
|
|
grep_lock();
|
|
|
|
|
|
|
|
while ((todo_end+1) % ARRAY_SIZE(todo) == todo_done) {
|
|
|
|
pthread_cond_wait(&cond_write, &grep_mutex);
|
|
|
|
}
|
|
|
|
|
2018-02-23 17:47:56 +03:00
|
|
|
todo[todo_end].source = *gs;
|
2010-01-26 01:51:39 +03:00
|
|
|
todo[todo_end].done = 0;
|
|
|
|
strbuf_reset(&todo[todo_end].out);
|
|
|
|
todo_end = (todo_end + 1) % ARRAY_SIZE(todo);
|
|
|
|
|
|
|
|
pthread_cond_signal(&cond_add);
|
|
|
|
grep_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct work_item *get_work(void)
|
|
|
|
{
|
|
|
|
struct work_item *ret;
|
|
|
|
|
|
|
|
grep_lock();
|
|
|
|
while (todo_start == todo_end && !all_work_added) {
|
|
|
|
pthread_cond_wait(&cond_add, &grep_mutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (todo_start == todo_end && all_work_added) {
|
|
|
|
ret = NULL;
|
|
|
|
} else {
|
|
|
|
ret = &todo[todo_start];
|
|
|
|
todo_start = (todo_start + 1) % ARRAY_SIZE(todo);
|
|
|
|
}
|
|
|
|
grep_unlock();
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void work_done(struct work_item *w)
|
|
|
|
{
|
|
|
|
int old_done;
|
|
|
|
|
|
|
|
grep_lock();
|
|
|
|
w->done = 1;
|
|
|
|
old_done = todo_done;
|
|
|
|
for(; todo[todo_done].done && todo_done != todo_start;
|
|
|
|
todo_done = (todo_done+1) % ARRAY_SIZE(todo)) {
|
|
|
|
w = &todo[todo_done];
|
2010-03-15 19:21:10 +03:00
|
|
|
if (w->out.len) {
|
2011-06-05 19:24:15 +04:00
|
|
|
const char *p = w->out.buf;
|
|
|
|
size_t len = w->out.len;
|
|
|
|
|
|
|
|
/* Skip the leading hunk mark of the first file. */
|
|
|
|
if (skip_first_line) {
|
|
|
|
while (len) {
|
|
|
|
len--;
|
|
|
|
if (*p++ == '\n')
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
skip_first_line = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
write_or_die(1, p, len);
|
2010-03-15 19:21:10 +03:00
|
|
|
}
|
2012-02-02 12:19:37 +04:00
|
|
|
grep_source_clear(&w->source);
|
2010-01-26 01:51:39 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
if (old_done != todo_done)
|
|
|
|
pthread_cond_signal(&cond_write);
|
|
|
|
|
|
|
|
if (all_work_added && todo_done == todo_end)
|
|
|
|
pthread_cond_signal(&cond_result);
|
|
|
|
|
|
|
|
grep_unlock();
|
|
|
|
}
|
|
|
|
|
2021-08-17 00:09:55 +03:00
|
|
|
static void free_repos(void)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < repos_to_free_nr; i++) {
|
|
|
|
repo_clear(repos_to_free[i]);
|
|
|
|
free(repos_to_free[i]);
|
|
|
|
}
|
|
|
|
FREE_AND_NULL(repos_to_free);
|
|
|
|
repos_to_free_nr = 0;
|
|
|
|
repos_to_free_alloc = 0;
|
|
|
|
}
|
|
|
|
|
2010-01-26 01:51:39 +03:00
|
|
|
static void *run(void *arg)
|
|
|
|
{
|
|
|
|
int hit = 0;
|
|
|
|
struct grep_opt *opt = arg;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct work_item *w = get_work();
|
|
|
|
if (!w)
|
|
|
|
break;
|
|
|
|
|
|
|
|
opt->output_priv = w;
|
2017-08-02 22:49:23 +03:00
|
|
|
hit |= grep_source(opt, &w->source);
|
2012-02-02 12:19:37 +04:00
|
|
|
grep_source_clear_data(&w->source);
|
2010-01-26 01:51:39 +03:00
|
|
|
work_done(w);
|
|
|
|
}
|
2021-10-22 11:55:39 +03:00
|
|
|
free_grep_patterns(opt);
|
|
|
|
free(opt);
|
2010-01-26 01:51:39 +03:00
|
|
|
|
|
|
|
return (void*) (intptr_t) hit;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void strbuf_out(struct grep_opt *opt, const void *buf, size_t size)
|
|
|
|
{
|
|
|
|
struct work_item *w = opt->output_priv;
|
|
|
|
strbuf_add(&w->out, buf, size);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void start_threads(struct grep_opt *opt)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
pthread_mutex_init(&grep_mutex, NULL);
|
2011-12-13 01:16:07 +04:00
|
|
|
pthread_mutex_init(&grep_attr_mutex, NULL);
|
2010-01-26 01:51:39 +03:00
|
|
|
pthread_cond_init(&cond_add, NULL);
|
|
|
|
pthread_cond_init(&cond_write, NULL);
|
|
|
|
pthread_cond_init(&cond_result, NULL);
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 12:18:29 +04:00
|
|
|
grep_use_locks = 1;
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
enable_obj_read_lock();
|
2010-01-26 01:51:39 +03:00
|
|
|
|
|
|
|
for (i = 0; i < ARRAY_SIZE(todo); i++) {
|
|
|
|
strbuf_init(&todo[i].out, 0);
|
|
|
|
}
|
|
|
|
|
2021-03-13 19:17:22 +03:00
|
|
|
CALLOC_ARRAY(threads, num_threads);
|
2015-12-15 18:31:39 +03:00
|
|
|
for (i = 0; i < num_threads; i++) {
|
2010-01-26 01:51:39 +03:00
|
|
|
int err;
|
|
|
|
struct grep_opt *o = grep_opt_dup(opt);
|
|
|
|
o->output = strbuf_out;
|
|
|
|
compile_grep_patterns(o);
|
|
|
|
err = pthread_create(&threads[i], NULL, run, o);
|
|
|
|
|
|
|
|
if (err)
|
2011-02-23 02:41:55 +03:00
|
|
|
die(_("grep: failed to create thread: %s"),
|
2010-01-26 01:51:39 +03:00
|
|
|
strerror(err));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static int wait_all(void)
|
|
|
|
{
|
|
|
|
int hit = 0;
|
|
|
|
int i;
|
|
|
|
|
2018-11-03 11:48:43 +03:00
|
|
|
if (!HAVE_THREADS)
|
2018-11-03 11:48:44 +03:00
|
|
|
BUG("Never call this function unless you have started threads");
|
2018-11-03 11:48:43 +03:00
|
|
|
|
2010-01-26 01:51:39 +03:00
|
|
|
grep_lock();
|
|
|
|
all_work_added = 1;
|
|
|
|
|
|
|
|
/* Wait until all work is done. */
|
|
|
|
while (todo_done != todo_end)
|
|
|
|
pthread_cond_wait(&cond_result, &grep_mutex);
|
|
|
|
|
|
|
|
/* Wake up all the consumer threads so they can see that there
|
|
|
|
* is no more work to do.
|
|
|
|
*/
|
|
|
|
pthread_cond_broadcast(&cond_add);
|
|
|
|
grep_unlock();
|
|
|
|
|
2015-12-15 18:31:39 +03:00
|
|
|
for (i = 0; i < num_threads; i++) {
|
2010-01-26 01:51:39 +03:00
|
|
|
void *h;
|
|
|
|
pthread_join(threads[i], &h);
|
|
|
|
hit |= (int) (intptr_t) h;
|
|
|
|
}
|
|
|
|
|
2015-12-15 18:31:39 +03:00
|
|
|
free(threads);
|
|
|
|
|
2010-01-26 01:51:39 +03:00
|
|
|
pthread_mutex_destroy(&grep_mutex);
|
2011-12-13 01:16:07 +04:00
|
|
|
pthread_mutex_destroy(&grep_attr_mutex);
|
2010-01-26 01:51:39 +03:00
|
|
|
pthread_cond_destroy(&cond_add);
|
|
|
|
pthread_cond_destroy(&cond_write);
|
|
|
|
pthread_cond_destroy(&cond_result);
|
grep: make locking flag global
The low-level grep code traditionally didn't care about
threading, as it doesn't do any threading itself and didn't
call out to other non-thread-safe code. That changed with
0579f91 (grep: enable threading with -p and -W using lazy
attribute lookup, 2011-12-12), which pushed the lookup of
funcname attributes (which is not thread-safe) into the
low-level grep code.
As a result, the low-level code learned about a new global
"grep_attr_mutex" to serialize access to the attribute code.
A multi-threaded caller (e.g., builtin/grep.c) is expected
to initialize the mutex and set "use_threads" in the
grep_opt structure. The low-level code only uses the lock if
use_threads is set.
However, putting the use_threads flag into the grep_opt
struct is not the most logical place. Whether threading is
in use is not something that matters for each call to
grep_buffer, but is instead global to the whole program
(i.e., if any thread is doing multi-threaded grep, every
other thread, even if it thinks it is doing its own
single-threaded grep, would need to use the locking). In
practice, this distinction isn't a problem for us, because
the only user of multi-threaded grep is "git-grep", which
does nothing except call grep.
This patch turns the opt->use_threads flag into a global
flag. More important than the nit-picking semantic argument
above is that this means that the locking functions don't
need to actually have access to a grep_opt to know whether
to lock. Which in turn can make adding new locks simpler, as
we don't need to pass around a grep_opt.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-02-02 12:18:29 +04:00
|
|
|
grep_use_locks = 0;
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
disable_obj_read_lock();
|
2010-01-26 01:51:39 +03:00
|
|
|
|
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2012-10-10 03:04:12 +04:00
|
|
|
static int grep_cmd_config(const char *var, const char *value, void *cb)
|
|
|
|
{
|
|
|
|
int st = grep_config(var, value, cb);
|
2022-02-16 03:00:35 +03:00
|
|
|
if (git_color_default_config(var, value, NULL) < 0)
|
2012-10-10 03:04:12 +04:00
|
|
|
st = -1;
|
2015-12-15 18:31:39 +03:00
|
|
|
|
|
|
|
if (!strcmp(var, "grep.threads")) {
|
|
|
|
num_threads = git_config_int(var, value);
|
|
|
|
if (num_threads < 0)
|
|
|
|
die(_("invalid number of threads specified (%d) for %s"),
|
|
|
|
num_threads, var);
|
2018-11-03 11:48:44 +03:00
|
|
|
else if (!HAVE_THREADS && num_threads > 1) {
|
2017-05-25 22:45:34 +03:00
|
|
|
/*
|
|
|
|
* TRANSLATORS: %s is the configuration
|
|
|
|
* variable for tweaking threads, currently
|
|
|
|
* grep.threads
|
|
|
|
*/
|
|
|
|
warning(_("no threads support, ignoring %s"), var);
|
2018-11-03 11:48:44 +03:00
|
|
|
num_threads = 1;
|
2017-05-25 22:45:34 +03:00
|
|
|
}
|
2015-12-15 18:31:39 +03:00
|
|
|
}
|
|
|
|
|
2017-06-01 03:30:48 +03:00
|
|
|
if (!strcmp(var, "submodule.recurse"))
|
|
|
|
recurse_submodules = git_config_bool(var, value);
|
|
|
|
|
2012-10-10 03:04:12 +04:00
|
|
|
return st;
|
|
|
|
}
|
|
|
|
|
2020-04-19 09:33:24 +03:00
|
|
|
static void grep_source_name(struct grep_opt *opt, const char *filename,
|
|
|
|
int tree_name_len, struct strbuf *out)
|
|
|
|
{
|
|
|
|
strbuf_reset(out);
|
|
|
|
|
|
|
|
if (opt->null_following_name) {
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a subsequent change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:34 +03:00
|
|
|
if (opt->relative && grep_prefix) {
|
2020-04-19 09:33:24 +03:00
|
|
|
struct strbuf rel_buf = STRBUF_INIT;
|
|
|
|
const char *rel_name =
|
|
|
|
relative_path(filename + tree_name_len,
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a subsequent change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:34 +03:00
|
|
|
grep_prefix, &rel_buf);
|
2020-04-19 09:33:24 +03:00
|
|
|
|
|
|
|
if (tree_name_len)
|
|
|
|
strbuf_add(out, filename, tree_name_len);
|
|
|
|
|
|
|
|
strbuf_addstr(out, rel_name);
|
|
|
|
strbuf_release(&rel_buf);
|
|
|
|
} else {
|
|
|
|
strbuf_addstr(out, filename);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a subsequent change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:34 +03:00
|
|
|
if (opt->relative && grep_prefix)
|
|
|
|
quote_path(filename + tree_name_len, grep_prefix, out, 0);
|
2020-04-19 09:33:24 +03:00
|
|
|
else
|
|
|
|
quote_c_style(filename + tree_name_len, out, NULL, 0);
|
|
|
|
|
|
|
|
if (tree_name_len)
|
|
|
|
strbuf_insert(out, 0, filename, tree_name_len);
|
|
|
|
}
|
|
|
|
|
2017-02-22 02:47:25 +03:00
|
|
|
static int grep_oid(struct grep_opt *opt, const struct object_id *oid,
|
2012-10-12 14:49:38 +04:00
|
|
|
const char *filename, int tree_name_len,
|
|
|
|
const char *path)
|
2010-01-26 01:51:39 +03:00
|
|
|
{
|
|
|
|
struct strbuf pathbuf = STRBUF_INIT;
|
2018-02-23 17:47:56 +03:00
|
|
|
struct grep_source gs;
|
2010-01-26 01:51:39 +03:00
|
|
|
|
2020-04-19 09:33:24 +03:00
|
|
|
grep_source_name(opt, filename, tree_name_len, &pathbuf);
|
2021-08-17 00:09:56 +03:00
|
|
|
grep_source_init_oid(&gs, pathbuf.buf, path, oid, opt->repo);
|
2018-02-23 17:47:57 +03:00
|
|
|
strbuf_release(&pathbuf);
|
2018-02-23 17:47:56 +03:00
|
|
|
|
2018-11-03 11:48:44 +03:00
|
|
|
if (num_threads > 1) {
|
2018-02-23 17:47:56 +03:00
|
|
|
/*
|
|
|
|
* add_work() copies gs and thus assumes ownership of
|
|
|
|
* its fields, so do not call grep_source_clear()
|
|
|
|
*/
|
|
|
|
add_work(opt, &gs);
|
2010-01-26 01:51:39 +03:00
|
|
|
return 0;
|
2018-11-03 11:48:43 +03:00
|
|
|
} else {
|
2010-01-26 01:51:39 +03:00
|
|
|
int hit;
|
2006-05-01 10:28:15 +04:00
|
|
|
|
2012-02-02 12:19:37 +04:00
|
|
|
hit = grep_source(opt, &gs);
|
2007-03-07 04:44:37 +03:00
|
|
|
|
2012-02-02 12:19:37 +04:00
|
|
|
grep_source_clear(&gs);
|
|
|
|
return hit;
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
2010-01-26 01:51:39 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
static int grep_file(struct grep_opt *opt, const char *filename)
|
|
|
|
{
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
2018-02-23 17:47:56 +03:00
|
|
|
struct grep_source gs;
|
2010-01-26 01:51:39 +03:00
|
|
|
|
2020-04-19 09:33:24 +03:00
|
|
|
grep_source_name(opt, filename, 0, &buf);
|
2021-08-17 00:09:53 +03:00
|
|
|
grep_source_init_file(&gs, buf.buf, filename);
|
2018-02-23 17:47:57 +03:00
|
|
|
strbuf_release(&buf);
|
2018-02-23 17:47:56 +03:00
|
|
|
|
2018-11-03 11:48:44 +03:00
|
|
|
if (num_threads > 1) {
|
2018-02-23 17:47:56 +03:00
|
|
|
/*
|
|
|
|
* add_work() copies gs and thus assumes ownership of
|
|
|
|
* its fields, so do not call grep_source_clear()
|
|
|
|
*/
|
|
|
|
add_work(opt, &gs);
|
2010-01-26 01:51:39 +03:00
|
|
|
return 0;
|
2018-11-03 11:48:43 +03:00
|
|
|
} else {
|
2010-01-26 01:51:39 +03:00
|
|
|
int hit;
|
|
|
|
|
2012-02-02 12:19:37 +04:00
|
|
|
hit = grep_source(opt, &gs);
|
|
|
|
|
|
|
|
grep_source_clear(&gs);
|
2010-01-26 01:51:39 +03:00
|
|
|
return hit;
|
|
|
|
}
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
|
|
|
|
2010-06-12 20:36:51 +04:00
|
|
|
static void append_path(struct grep_opt *opt, const void *data, size_t len)
|
|
|
|
{
|
|
|
|
struct string_list *path_list = opt->output_priv;
|
|
|
|
|
|
|
|
if (len == 1 && *(const char *)data == '\0')
|
|
|
|
return;
|
2021-10-22 11:55:41 +03:00
|
|
|
string_list_append_nodup(path_list, xstrndup(data, len));
|
2010-06-12 20:36:51 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
static void run_pager(struct grep_opt *opt, const char *prefix)
|
|
|
|
{
|
|
|
|
struct string_list *path_list = opt->output_priv;
|
2016-02-23 01:44:21 +03:00
|
|
|
struct child_process child = CHILD_PROCESS_INIT;
|
2010-06-12 20:36:51 +04:00
|
|
|
int i, status;
|
|
|
|
|
|
|
|
for (i = 0; i < path_list->nr; i++)
|
2020-07-28 23:24:27 +03:00
|
|
|
strvec_push(&child.args, path_list->items[i].string);
|
2016-02-23 01:44:21 +03:00
|
|
|
child.dir = prefix;
|
|
|
|
child.use_shell = 1;
|
2010-06-12 20:36:51 +04:00
|
|
|
|
2016-02-23 01:44:21 +03:00
|
|
|
status = run_command(&child);
|
2010-06-12 20:36:51 +04:00
|
|
|
if (status)
|
|
|
|
exit(status);
|
|
|
|
}
|
|
|
|
|
2019-01-12 05:13:22 +03:00
|
|
|
/*
 * Forward declarations: grep_submodule() below needs these two
 * functions before their definitions appear.
 */
static int grep_cache(struct grep_opt *opt,
		      const struct pathspec *pathspec,
		      int cached);

static int grep_tree(struct grep_opt *opt,
		     const struct pathspec *pathspec,
		     struct tree_desc *tree,
		     struct strbuf *base,
		     int tn_len,
		     int check_attr);
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2019-01-12 05:13:22 +03:00
|
|
|
static int grep_submodule(struct grep_opt *opt,
|
2017-08-02 22:49:23 +03:00
|
|
|
const struct pathspec *pathspec,
|
|
|
|
const struct object_id *oid,
|
2019-07-30 19:53:27 +03:00
|
|
|
const char *filename, const char *path, int cached)
|
2016-12-16 22:03:20 +03:00
|
|
|
{
|
2021-08-17 00:09:55 +03:00
|
|
|
struct repository *subrepo;
|
2019-01-12 05:13:22 +03:00
|
|
|
struct repository *superproject = opt->repo;
|
|
|
|
struct grep_opt subopt;
|
2021-08-17 00:09:55 +03:00
|
|
|
int hit = 0;
|
2016-12-16 22:03:21 +03:00
|
|
|
|
grep: allow submodule functions to run in parallel
Now that object reading operations are internally protected, the
submodule initialization functions at builtin/grep.c:grep_submodule()
are very close to being thread-safe. Let's take a look at each call and
remove from the critical section what we can, for better performance:
- submodule_from_path() and is_submodule_active() cannot be called in
parallel yet only because they call repo_read_gitmodules() which
contains, in its call stack, operations that would otherwise be in
race condition with object reading (for example parse_object() and
is_promisor_remote()). However, they only call repo_read_gitmodules()
if it wasn't read before. So let's pre-read it before firing the
threads and allow these two functions to safely be called in
parallel.
- repo_submodule_init() is already thread-safe, so remove it from the
critical section without other necessary changes.
- The repo_read_gitmodules(&subrepo) call at grep_submodule() is safe as
no other thread is performing object reading operations in the subrepo
yet. However, threads might be working in the superproject, and this
function calls add_to_alternates_memory() internally, which is racy
with object readings in the superproject. So it must be kept
protected for now. Let's add a "NEEDSWORK" to it, informing why it
cannot be removed from the critical section yet.
- Finally, add_to_alternates_memory() must be kept protected for the
same reason as the item above.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:56 +03:00
|
|
|
if (!is_submodule_active(superproject, path))
|
2017-08-02 22:49:23 +03:00
|
|
|
return 0;
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2021-08-17 00:09:55 +03:00
|
|
|
subrepo = xmalloc(sizeof(*subrepo));
|
2021-09-09 21:47:28 +03:00
|
|
|
if (repo_submodule_init(subrepo, superproject, path, null_oid())) {
|
2021-08-17 00:09:55 +03:00
|
|
|
free(subrepo);
|
2017-08-02 22:49:23 +03:00
|
|
|
return 0;
|
2021-08-17 00:09:55 +03:00
|
|
|
}
|
|
|
|
ALLOC_GROW(repos_to_free, repos_to_free_nr + 1, repos_to_free_alloc);
|
|
|
|
repos_to_free[repos_to_free_nr++] = subrepo;
|
2017-03-17 20:22:55 +03:00
|
|
|
|
grep: allow submodule functions to run in parallel
Now that object reading operations are internally protected, the
submodule initialization functions at builtin/grep.c:grep_submodule()
are very close to being thread-safe. Let's take a look at each call and
remove from the critical section what we can, for better performance:
- submodule_from_path() and is_submodule_active() cannot be called in
parallel yet only because they call repo_read_gitmodules() which
contains, in its call stack, operations that would otherwise be in
race condition with object reading (for example parse_object() and
is_promisor_remote()). However, they only call repo_read_gitmodules()
if it wasn't read before. So let's pre-read it before firing the
threads and allow these two functions to safely be called in
parallel.
- repo_submodule_init() is already thread-safe, so remove it from the
critical section without other necessary changes.
- The repo_read_gitmodules(&subrepo) call at grep_submodule() is safe as
no other thread is performing object reading operations in the subrepo
yet. However, threads might be working in the superproject, and this
function calls add_to_alternates_memory() internally, which is racy
with object readings in the superproject. So it must be kept
protected for now. Let's add a "NEEDSWORK" to it, informing why it
cannot be removed from the critical section yet.
- Finally, add_to_alternates_memory() must be kept protected for the
same reason as the item above.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:56 +03:00
|
|
|
/*
|
|
|
|
* NEEDSWORK: repo_read_gitmodules() might call
|
|
|
|
* add_to_alternates_memory() via config_from_gitmodules(). This
|
|
|
|
* operation causes a race condition with concurrent object readings
|
|
|
|
* performed by the worker threads. That's why we need obj_read_lock()
|
|
|
|
* here. It should be removed once it's no longer necessary to add the
|
|
|
|
* subrepo's odbs to the in-memory alternates list.
|
|
|
|
*/
|
|
|
|
obj_read_lock();
|
builtin/grep.c: integrate with sparse index
Turn on sparse index and remove ensure_full_index().
Before this patch, `git-grep` utilizes the ensure_full_index() method to
expand the index and search all the entries. Because this method
requires walking all the trees and constructing the index, it is the
slow part within the whole command.
To achieve better performance, this patch uses grep_tree() to search the
sparse directory entries and get rid of the ensure_full_index() method.
Why grep_tree() is a better choice over ensure_full_index()?
1) grep_tree() is as correct as ensure_full_index(). grep_tree() looks
into every sparse-directory entry (represented by a tree) recursively
when looping over the index, and the result of doing so matches the
result of expanding the index.
2) grep_tree() utilizes pathspecs to limit the scope of searching.
ensure_full_index() always expands the index, which means it will
always walk all the trees and blobs in the repo without caring if
the user only wants a subset of the content, i.e. using a pathspec.
On the other hand, grep_tree() will only search the contents that
match the pathspec, and thus possibly walking fewer trees.
3) grep_tree() does not construct and copy back a new index, while
ensure_full_index() does. This also saves some time.
----------------
Performance test
- Summary:
p2000 tests demonstrate a ~71% execution time reduction for
`git grep --cached bogus -- "f2/f1/f1/*"` using tree-walking logic.
However, notice that this result varies depending on the pathspec
given. See below "Command used for testing" for more details.
Test HEAD~ HEAD
-------------------------------------------------------
2000.78: git grep ... (full-v3) 0.35 0.39 (≈)
2000.79: git grep ... (full-v4) 0.36 0.30 (≈)
2000.80: git grep ... (sparse-v3) 0.88 0.23 (-73.8%)
2000.81: git grep ... (sparse-v4) 0.83 0.26 (-68.6%)
- Command used for testing:
git grep --cached bogus -- "f2/f1/f1/*"
The reason for specifying a pathspec is that, if we don't specify a
pathspec, then grep_tree() will walk all the trees and blobs to find the
pattern, and the time consumed doing so is not too different from using
the original ensure_full_index() method, which also spends most of the
time walking trees. However, when a pathspec is specified, this latest
logic will only walk the area of trees enclosed by the pathspec, and the
time consumed is reasonably a lot less.
Generally speaking, because the performance gain is acheived by walking
less trees, which are specified by the pathspec, the HEAD time v.s.
HEAD~ time in sparse-v[3|4], should be proportional to
"pathspec enclosed area" v.s. "all area", respectively. Namely, the
wider the <pathspec> is encompassing, the less the performance
difference between HEAD~ and HEAD, and vice versa.
That is, if we don't specify a pathspec, the performance difference [1]
is indistinguishable: both methods walk all the trees and take generally
same amount of time (even with the index construction time included for
ensure_full_index()).
[1] Performance test result without pathspec (hence walking all trees):
Command used:
git grep --cached bogus
Test HEAD~ HEAD
---------------------------------------------------
2000.78: git grep ... (full-v3) 6.17 5.19 (≈)
2000.79: git grep ... (full-v4) 6.19 5.46 (≈)
2000.80: git grep ... (sparse-v3) 6.57 6.44 (≈)
2000.81: git grep ... (sparse-v4) 6.65 6.28 (≈)
--------------------------
NEEDSWORK about submodules
There are a few NEEDSWORKs that belong to improvements beyond this
topic. See the NEEDSWORK in builtin/grep.c::grep_submodule() for
more context. The other two NEEDSWORKs in t1092 are also relative.
Suggested-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Victoria Dye <vdye@github.com>
Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Shaoxuan Yuan <shaoxuan.yuan02@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-23 07:18:42 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* NEEDSWORK: when reading a submodule, the sparsity settings in the
|
|
|
|
* superproject are incorrectly forgotten or misused. For example:
|
|
|
|
*
|
|
|
|
* 1. "command_requires_full_index"
|
|
|
|
* When this setting is turned on for `grep`, only the superproject
|
|
|
|
* knows it. All the submodules are read with their own configs
|
|
|
|
* and get prepare_repo_settings()'d. Therefore, these submodules
|
|
|
|
* "forget" the sparse-index feature switch. As a result, the index
|
|
|
|
* of these submodules are expanded unexpectedly.
|
|
|
|
*
|
|
|
|
* 2. "core_apply_sparse_checkout"
|
|
|
|
* When running `grep` in the superproject, this setting is
|
|
|
|
* populated using the superproject's configs. However, once
|
|
|
|
* initialized, this config is globally accessible and is read by
|
|
|
|
* prepare_repo_settings() for the submodules. For instance, if a
|
|
|
|
* submodule is using a sparse-checkout, however, the superproject
|
|
|
|
* is not, the result is that the config from the superproject will
|
|
|
|
* dictate the behavior for the submodule, making it "forget" its
|
|
|
|
* sparse-checkout state.
|
|
|
|
*
|
|
|
|
* 3. "core_sparse_checkout_cone"
|
|
|
|
* ditto.
|
|
|
|
*
|
|
|
|
* Note that this list is not exhaustive.
|
|
|
|
*/
|
2021-08-17 00:09:55 +03:00
|
|
|
repo_read_gitmodules(subrepo, 0);
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2016-12-16 22:03:21 +03:00
|
|
|
/*
|
2021-08-17 00:09:56 +03:00
|
|
|
* All code paths tested by test code no longer need submodule ODBs to
|
|
|
|
* be added as alternates, but add it to the list just in case.
|
|
|
|
* Submodule ODBs added through add_submodule_odb_by_path() will be
|
|
|
|
* lazily registered as alternates when needed (and except in an
|
|
|
|
* unexpected code interaction, it won't be needed).
|
2016-12-16 22:03:21 +03:00
|
|
|
*/
|
2021-08-17 00:09:55 +03:00
|
|
|
add_submodule_odb_by_path(subrepo->objects->odb->path);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_unlock();
|
2016-12-16 22:03:21 +03:00
|
|
|
|
2019-01-12 05:13:22 +03:00
|
|
|
memcpy(&subopt, opt, sizeof(subopt));
|
2021-08-17 00:09:55 +03:00
|
|
|
subopt.repo = subrepo;
|
2019-01-12 05:13:22 +03:00
|
|
|
|
2017-08-02 22:49:23 +03:00
|
|
|
if (oid) {
|
2021-08-17 00:09:54 +03:00
|
|
|
enum object_type object_type;
|
2017-08-02 22:49:23 +03:00
|
|
|
struct tree_desc tree;
|
|
|
|
void *data;
|
|
|
|
unsigned long size;
|
|
|
|
struct strbuf base = STRBUF_INIT;
|
2016-12-16 22:03:21 +03:00
|
|
|
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_lock();
|
2021-08-17 00:09:55 +03:00
|
|
|
object_type = oid_object_info(subrepo, oid, NULL);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_unlock();
|
2021-08-17 00:09:55 +03:00
|
|
|
data = read_object_with_reference(subrepo,
|
2022-02-05 02:48:34 +03:00
|
|
|
oid, OBJ_TREE,
|
2017-08-02 22:49:23 +03:00
|
|
|
&size, NULL);
|
|
|
|
if (!data)
|
2021-08-17 00:09:54 +03:00
|
|
|
die(_("unable to read tree (%s)"), oid_to_hex(oid));
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2017-08-02 22:49:23 +03:00
|
|
|
strbuf_addstr(&base, filename);
|
|
|
|
strbuf_addch(&base, '/');
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2017-08-02 22:49:23 +03:00
|
|
|
init_tree_desc(&tree, data, size);
|
2019-01-12 05:13:22 +03:00
|
|
|
hit = grep_tree(&subopt, pathspec, &tree, &base, base.len,
|
2021-08-17 00:09:54 +03:00
|
|
|
object_type == OBJ_COMMIT);
|
2017-08-02 22:49:23 +03:00
|
|
|
strbuf_release(&base);
|
|
|
|
free(data);
|
|
|
|
} else {
|
2019-07-30 19:53:27 +03:00
|
|
|
hit = grep_cache(&subopt, pathspec, cached);
|
2016-12-16 22:03:22 +03:00
|
|
|
}
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2017-08-02 22:49:23 +03:00
|
|
|
return hit;
|
2016-12-16 22:03:20 +03:00
|
|
|
}
|
|
|
|
|
2019-01-12 05:13:22 +03:00
|
|
|
static int grep_cache(struct grep_opt *opt,
|
2017-08-02 22:49:23 +03:00
|
|
|
const struct pathspec *pathspec, int cached)
|
2006-05-01 10:28:15 +04:00
|
|
|
{
|
2019-01-12 05:13:22 +03:00
|
|
|
struct repository *repo = opt->repo;
|
2006-05-01 10:28:15 +04:00
|
|
|
int hit = 0;
|
|
|
|
int nr;
|
2016-12-16 22:03:20 +03:00
|
|
|
struct strbuf name = STRBUF_INIT;
|
|
|
|
int name_base_len = 0;
|
2017-08-02 22:49:23 +03:00
|
|
|
if (repo->submodule_prefix) {
|
|
|
|
name_base_len = strlen(repo->submodule_prefix);
|
|
|
|
strbuf_addstr(&name, repo->submodule_prefix);
|
2016-12-16 22:03:20 +03:00
|
|
|
}
|
|
|
|
|
2018-05-15 04:04:25 +03:00
|
|
|
if (repo_read_index(repo) < 0)
|
2018-07-21 10:49:23 +03:00
|
|
|
die(_("index file corrupt"));
|
2006-05-01 10:28:15 +04:00
|
|
|
|
2017-08-02 22:49:23 +03:00
|
|
|
for (nr = 0; nr < repo->index->cache_nr; nr++) {
|
|
|
|
const struct cache_entry *ce = repo->index->cache[nr];
|
2021-02-10 00:33:30 +03:00
|
|
|
|
|
|
|
if (!cached && ce_skip_worktree(ce))
|
|
|
|
continue;
|
|
|
|
|
2016-12-16 22:03:20 +03:00
|
|
|
strbuf_setlen(&name, name_base_len);
|
|
|
|
strbuf_addstr(&name, ce->name);
|
builtin/grep.c: integrate with sparse index
Turn on sparse index and remove ensure_full_index().
Before this patch, `git-grep` utilizes the ensure_full_index() method to
expand the index and search all the entries. Because this method
requires walking all the trees and constructing the index, it is the
slow part within the whole command.
To achieve better performance, this patch uses grep_tree() to search the
sparse directory entries and get rid of the ensure_full_index() method.
Why grep_tree() is a better choice over ensure_full_index()?
1) grep_tree() is as correct as ensure_full_index(). grep_tree() looks
into every sparse-directory entry (represented by a tree) recursively
when looping over the index, and the result of doing so matches the
result of expanding the index.
2) grep_tree() utilizes pathspecs to limit the scope of searching.
ensure_full_index() always expands the index, which means it will
always walk all the trees and blobs in the repo without caring if
the user only wants a subset of the content, i.e. using a pathspec.
On the other hand, grep_tree() will only search the contents that
match the pathspec, and thus possibly walking fewer trees.
3) grep_tree() does not construct and copy back a new index, while
ensure_full_index() does. This also saves some time.
----------------
Performance test
- Summary:
p2000 tests demonstrate a ~71% execution time reduction for
`git grep --cached bogus -- "f2/f1/f1/*"` using tree-walking logic.
However, notice that this result varies depending on the pathspec
given. See below "Command used for testing" for more details.
Test HEAD~ HEAD
-------------------------------------------------------
2000.78: git grep ... (full-v3) 0.35 0.39 (≈)
2000.79: git grep ... (full-v4) 0.36 0.30 (≈)
2000.80: git grep ... (sparse-v3) 0.88 0.23 (-73.8%)
2000.81: git grep ... (sparse-v4) 0.83 0.26 (-68.6%)
- Command used for testing:
git grep --cached bogus -- "f2/f1/f1/*"
The reason for specifying a pathspec is that, if we don't specify a
pathspec, then grep_tree() will walk all the trees and blobs to find the
pattern, and the time consumed doing so is not too different from using
the original ensure_full_index() method, which also spends most of the
time walking trees. However, when a pathspec is specified, this latest
logic will only walk the area of trees enclosed by the pathspec, and the
time consumed is reasonably a lot less.
Generally speaking, because the performance gain is acheived by walking
less trees, which are specified by the pathspec, the HEAD time v.s.
HEAD~ time in sparse-v[3|4], should be proportional to
"pathspec enclosed area" v.s. "all area", respectively. Namely, the
wider the <pathspec> is encompassing, the less the performance
difference between HEAD~ and HEAD, and vice versa.
That is, if we don't specify a pathspec, the performance difference [1]
is indistinguishable: both methods walk all the trees and take generally
same amount of time (even with the index construction time included for
ensure_full_index()).
[1] Performance test result without pathspec (hence walking all trees):
Command used:
git grep --cached bogus
Test HEAD~ HEAD
---------------------------------------------------
2000.78: git grep ... (full-v3) 6.17 5.19 (≈)
2000.79: git grep ... (full-v4) 6.19 5.46 (≈)
2000.80: git grep ... (sparse-v3) 6.57 6.44 (≈)
2000.81: git grep ... (sparse-v4) 6.65 6.28 (≈)
--------------------------
NEEDSWORK about submodules
There are a few NEEDSWORKs that belong to improvements beyond this
topic. See the NEEDSWORK in builtin/grep.c::grep_submodule() for
more context. The other two NEEDSWORKs in t1092 are also relative.
Suggested-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Victoria Dye <vdye@github.com>
Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Shaoxuan Yuan <shaoxuan.yuan02@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-23 07:18:42 +03:00
|
|
|
if (S_ISSPARSEDIR(ce->ce_mode)) {
|
|
|
|
enum object_type type;
|
|
|
|
struct tree_desc tree;
|
|
|
|
void *data;
|
|
|
|
unsigned long size;
|
2016-12-16 22:03:20 +03:00
|
|
|
|
builtin/grep.c: integrate with sparse index
Turn on sparse index and remove ensure_full_index().
Before this patch, `git-grep` utilizes the ensure_full_index() method to
expand the index and search all the entries. Because this method
requires walking all the trees and constructing the index, it is the
slow part within the whole command.
To achieve better performance, this patch uses grep_tree() to search the
sparse directory entries and get rid of the ensure_full_index() method.
Why grep_tree() is a better choice over ensure_full_index()?
1) grep_tree() is as correct as ensure_full_index(). grep_tree() looks
into every sparse-directory entry (represented by a tree) recursively
when looping over the index, and the result of doing so matches the
result of expanding the index.
2) grep_tree() utilizes pathspecs to limit the scope of searching.
ensure_full_index() always expands the index, which means it will
always walk all the trees and blobs in the repo without caring if
the user only wants a subset of the content, i.e. using a pathspec.
On the other hand, grep_tree() will only search the contents that
match the pathspec, and thus possibly walking fewer trees.
3) grep_tree() does not construct and copy back a new index, while
ensure_full_index() does. This also saves some time.
----------------
Performance test
- Summary:
p2000 tests demonstrate a ~71% execution time reduction for
`git grep --cached bogus -- "f2/f1/f1/*"` using tree-walking logic.
However, notice that this result varies depending on the pathspec
given. See below "Command used for testing" for more details.
Test HEAD~ HEAD
-------------------------------------------------------
2000.78: git grep ... (full-v3) 0.35 0.39 (≈)
2000.79: git grep ... (full-v4) 0.36 0.30 (≈)
2000.80: git grep ... (sparse-v3) 0.88 0.23 (-73.8%)
2000.81: git grep ... (sparse-v4) 0.83 0.26 (-68.6%)
- Command used for testing:
git grep --cached bogus -- "f2/f1/f1/*"
The reason for specifying a pathspec is that, if we don't specify a
pathspec, then grep_tree() will walk all the trees and blobs to find the
pattern, and the time consumed doing so is not too different from using
the original ensure_full_index() method, which also spends most of the
time walking trees. However, when a pathspec is specified, this latest
logic will only walk the area of trees enclosed by the pathspec, and the
time consumed is reasonably a lot less.
Generally speaking, because the performance gain is acheived by walking
less trees, which are specified by the pathspec, the HEAD time v.s.
HEAD~ time in sparse-v[3|4], should be proportional to
"pathspec enclosed area" v.s. "all area", respectively. Namely, the
wider the <pathspec> is encompassing, the less the performance
difference between HEAD~ and HEAD, and vice versa.
That is, if we don't specify a pathspec, the performance difference [1]
is indistinguishable: both methods walk all the trees and take generally
same amount of time (even with the index construction time included for
ensure_full_index()).
[1] Performance test result without pathspec (hence walking all trees):
Command used:
git grep --cached bogus
Test HEAD~ HEAD
---------------------------------------------------
2000.78: git grep ... (full-v3) 6.17 5.19 (≈)
2000.79: git grep ... (full-v4) 6.19 5.46 (≈)
2000.80: git grep ... (sparse-v3) 6.57 6.44 (≈)
2000.81: git grep ... (sparse-v4) 6.65 6.28 (≈)
--------------------------
NEEDSWORK about submodules
There are a few NEEDSWORKs that belong to improvements beyond this
topic. See the NEEDSWORK in builtin/grep.c::grep_submodule() for
more context. The other two NEEDSWORKs in t1092 are also relative.
Suggested-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Victoria Dye <vdye@github.com>
Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Shaoxuan Yuan <shaoxuan.yuan02@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-23 07:18:42 +03:00
|
|
|
data = read_object_file(&ce->oid, &type, &size);
|
|
|
|
init_tree_desc(&tree, data, size);
|
|
|
|
|
|
|
|
hit |= grep_tree(opt, pathspec, &tree, &name, 0, 0);
|
|
|
|
strbuf_setlen(&name, name_base_len);
|
|
|
|
strbuf_addstr(&name, ce->name);
|
|
|
|
free(data);
|
|
|
|
} else if (S_ISREG(ce->ce_mode) &&
|
2018-08-13 19:14:34 +03:00
|
|
|
match_pathspec(repo->index, pathspec, name.buf, name.len, 0, NULL,
|
2016-12-16 22:03:20 +03:00
|
|
|
S_ISDIR(ce->ce_mode) ||
|
|
|
|
S_ISGITLINK(ce->ce_mode))) {
|
|
|
|
/*
|
|
|
|
* If CE_VALID is on, we assume worktree file and its
|
|
|
|
* cache entry are identical, even if worktree file has
|
|
|
|
* been modified, so use cache version instead
|
|
|
|
*/
|
2021-02-10 00:33:30 +03:00
|
|
|
if (cached || (ce->ce_flags & CE_VALID)) {
|
2016-12-16 22:03:20 +03:00
|
|
|
if (ce_stage(ce) || ce_intent_to_add(ce))
|
|
|
|
continue;
|
2017-08-02 22:49:23 +03:00
|
|
|
hit |= grep_oid(opt, &ce->oid, name.buf,
|
|
|
|
0, name.buf);
|
2016-12-16 22:03:20 +03:00
|
|
|
} else {
|
2017-08-02 22:49:23 +03:00
|
|
|
hit |= grep_file(opt, name.buf);
|
2016-12-16 22:03:20 +03:00
|
|
|
}
|
|
|
|
} else if (recurse_submodules && S_ISGITLINK(ce->ce_mode) &&
|
2018-08-13 19:14:34 +03:00
|
|
|
submodule_path_match(repo->index, pathspec, name.buf, NULL)) {
|
2019-07-30 19:53:27 +03:00
|
|
|
hit |= grep_submodule(opt, pathspec, NULL, ce->name,
|
|
|
|
ce->name, cached);
|
2016-12-16 22:03:20 +03:00
|
|
|
} else {
|
2006-05-01 10:28:15 +04:00
|
|
|
continue;
|
2006-11-26 23:47:52 +03:00
|
|
|
}
|
2016-12-16 22:03:20 +03:00
|
|
|
|
2006-11-26 23:47:52 +03:00
|
|
|
if (ce_stage(ce)) {
|
|
|
|
do {
|
|
|
|
nr++;
|
2017-08-02 22:49:23 +03:00
|
|
|
} while (nr < repo->index->cache_nr &&
|
|
|
|
!strcmp(ce->name, repo->index->cache[nr]->name));
|
2006-11-26 23:47:52 +03:00
|
|
|
nr--; /* compensate for loop control */
|
|
|
|
}
|
2010-01-26 02:37:23 +03:00
|
|
|
if (hit && opt->status_only)
|
|
|
|
break;
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
2016-12-16 22:03:20 +03:00
|
|
|
|
|
|
|
strbuf_release(&name);
|
2006-05-01 10:28:15 +04:00
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2010-12-15 18:02:51 +03:00
|
|
|
static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
|
2012-10-12 14:49:38 +04:00
|
|
|
struct tree_desc *tree, struct strbuf *base, int tn_len,
|
2019-01-12 05:13:22 +03:00
|
|
|
int check_attr)
|
2006-05-01 10:28:15 +04:00
|
|
|
{
|
2019-01-12 05:13:22 +03:00
|
|
|
struct repository *repo = opt->repo;
|
2011-10-24 10:36:10 +04:00
|
|
|
int hit = 0;
|
|
|
|
enum interesting match = entry_not_interesting;
|
tree_entry(): new tree-walking helper function
This adds a "tree_entry()" function that combines the common operation of
doing a "tree_entry_extract()" + "update_tree_entry()".
It also has a simplified calling convention, designed for simple loops
that traverse over a whole tree: the arguments are pointers to the tree
descriptor and a name_entry structure to fill in, and it returns a boolean
"true" if there was an entry left to be gotten in the tree.
This allows tree traversal with
struct tree_desc desc;
struct name_entry entry;
desc.buf = tree->buffer;
desc.size = tree->size;
while (tree_entry(&desc, &entry)) {
... use "entry.{path, sha1, mode, pathlen}" ...
}
which is not only shorter than writing it out in full, it's hopefully less
error prone too.
[ It's actually a tad faster too - we don't need to recalculate the entry
pathlength in both extract and update, but need to do it only once.
Also, some callers can avoid doing a "strlen()" on the result, since
it's returned as part of the name_entry structure.
However, by now we're talking just 1% speedup on "git-rev-list --objects
--all", and we're definitely at the point where tree walking is no
longer the issue any more. ]
NOTE! Not everybody wants to use this new helper function, since some of
the tree walkers very much on purpose do the descriptor update separately
from the entry extraction. So the "extract + update" sequence still
remains as the core sequence, this is just a simplified interface.
We should probably add a silly two-line inline helper function for
initializing the descriptor from the "struct tree" too, just to cut down
on the noise from that common "desc" initializer.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-05-30 20:45:45 +04:00
|
|
|
struct name_entry entry;
|
2010-12-17 15:44:25 +03:00
|
|
|
int old_baselen = base->len;
|
2016-12-16 22:03:21 +03:00
|
|
|
struct strbuf name = STRBUF_INIT;
|
|
|
|
int name_base_len = 0;
|
2017-08-02 22:49:23 +03:00
|
|
|
if (repo->submodule_prefix) {
|
|
|
|
strbuf_addstr(&name, repo->submodule_prefix);
|
2016-12-16 22:03:21 +03:00
|
|
|
name_base_len = name.len;
|
|
|
|
}
|
2006-05-01 10:28:15 +04:00
|
|
|
|
tree_entry(): new tree-walking helper function
This adds a "tree_entry()" function that combines the common operation of
doing a "tree_entry_extract()" + "update_tree_entry()".
It also has a simplified calling convention, designed for simple loops
that traverse over a whole tree: the arguments are pointers to the tree
descriptor and a name_entry structure to fill in, and it returns a boolean
"true" if there was an entry left to be gotten in the tree.
This allows tree traversal with
struct tree_desc desc;
struct name_entry entry;
desc.buf = tree->buffer;
desc.size = tree->size;
while (tree_entry(&desc, &entry)) {
... use "entry.{path, sha1, mode, pathlen}" ...
}
which is not only shorter than writing it out in full, it's hopefully less
error prone too.
[ It's actually a tad faster too - we don't need to recalculate the entry
pathlength in both extract and update, but need to do it only once.
Also, some callers can avoid doing a "strlen()" on the result, since
it's returned as part of the name_entry structure.
However, by now we're talking just 1% speedup on "git-rev-list --objects
--all", and we're definitely at the point where tree walking is no
longer the issue any more. ]
NOTE! Not everybody wants to use this new helper function, since some of
the tree walkers very much on purpose do the descriptor update separately
from the entry extraction. So the "extract + update" sequence still
remains as the core sequence, this is just a simplified interface.
We should probably add a silly two-line inline helper function for
initializing the descriptor from the "struct tree" too, just to cut down
on the noise from that common "desc" initializer.
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
2006-05-30 20:45:45 +04:00
|
|
|
while (tree_entry(tree, &entry)) {
|
2011-10-24 10:36:09 +04:00
|
|
|
int te_len = tree_entry_len(&entry);
|
2010-12-17 15:44:25 +03:00
|
|
|
|
2011-10-24 10:36:10 +04:00
|
|
|
if (match != all_entries_interesting) {
|
2016-12-16 22:03:21 +03:00
|
|
|
strbuf_addstr(&name, base->buf + tn_len);
|
2018-11-18 19:47:57 +03:00
|
|
|
match = tree_entry_interesting(repo->index,
|
|
|
|
&entry, &name,
|
2016-12-16 22:03:21 +03:00
|
|
|
0, pathspec);
|
|
|
|
strbuf_setlen(&name, name_base_len);
|
|
|
|
|
2011-10-24 10:36:10 +04:00
|
|
|
if (match == all_entries_not_interesting)
|
2011-03-25 12:34:20 +03:00
|
|
|
break;
|
2011-10-24 10:36:10 +04:00
|
|
|
if (match == entry_not_interesting)
|
2010-12-17 15:45:33 +03:00
|
|
|
continue;
|
|
|
|
}
|
2006-05-01 10:28:15 +04:00
|
|
|
|
2010-12-17 15:45:33 +03:00
|
|
|
strbuf_add(base, entry.path, te_len);
|
2006-05-01 23:27:56 +04:00
|
|
|
|
2010-12-17 15:45:33 +03:00
|
|
|
if (S_ISREG(entry.mode)) {
|
2019-01-15 03:39:44 +03:00
|
|
|
hit |= grep_oid(opt, &entry.oid, base->buf, tn_len,
|
2012-10-12 14:49:38 +04:00
|
|
|
check_attr ? base->buf + tn_len : NULL);
|
2016-12-16 22:03:21 +03:00
|
|
|
} else if (S_ISDIR(entry.mode)) {
|
2007-02-26 22:55:59 +03:00
|
|
|
enum object_type type;
|
2006-05-01 10:28:15 +04:00
|
|
|
struct tree_desc sub;
|
|
|
|
void *data;
|
2007-03-21 20:08:25 +03:00
|
|
|
unsigned long size;
|
|
|
|
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
data = read_object_file(&entry.oid, &type, &size);
|
2006-05-01 10:28:15 +04:00
|
|
|
if (!data)
|
2011-02-23 02:41:55 +03:00
|
|
|
die(_("unable to read tree (%s)"),
|
2019-01-15 03:39:44 +03:00
|
|
|
oid_to_hex(&entry.oid));
|
2010-12-17 15:45:33 +03:00
|
|
|
|
|
|
|
strbuf_addch(base, '/');
|
2007-03-21 20:08:25 +03:00
|
|
|
init_tree_desc(&sub, data, size);
|
2012-10-12 14:49:38 +04:00
|
|
|
hit |= grep_tree(opt, pathspec, &sub, base, tn_len,
|
2019-01-12 05:13:22 +03:00
|
|
|
check_attr);
|
2006-05-01 10:28:15 +04:00
|
|
|
free(data);
|
2016-12-16 22:03:21 +03:00
|
|
|
} else if (recurse_submodules && S_ISGITLINK(entry.mode)) {
|
2019-02-07 09:05:22 +03:00
|
|
|
hit |= grep_submodule(opt, pathspec, &entry.oid,
|
2019-07-30 19:53:27 +03:00
|
|
|
base->buf, base->buf + tn_len,
|
|
|
|
1); /* ignored */
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
2016-12-16 22:03:21 +03:00
|
|
|
|
2010-12-17 15:44:25 +03:00
|
|
|
strbuf_setlen(base, old_baselen);
|
|
|
|
|
2010-01-26 02:37:23 +03:00
|
|
|
if (hit && opt->status_only)
|
|
|
|
break;
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
2016-12-16 22:03:21 +03:00
|
|
|
|
|
|
|
strbuf_release(&name);
|
2006-05-01 10:28:15 +04:00
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2010-12-15 18:02:51 +03:00
|
|
|
static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
|
2018-03-29 01:35:27 +03:00
|
|
|
struct object *obj, const char *name, const char *path)
|
2006-05-01 10:28:15 +04:00
|
|
|
{
|
2006-07-12 07:45:31 +04:00
|
|
|
if (obj->type == OBJ_BLOB)
|
2017-02-22 02:47:25 +03:00
|
|
|
return grep_oid(opt, &obj->oid, name, 0, path);
|
2006-07-12 07:45:31 +04:00
|
|
|
if (obj->type == OBJ_COMMIT || obj->type == OBJ_TREE) {
|
2006-05-01 10:28:15 +04:00
|
|
|
struct tree_desc tree;
|
|
|
|
void *data;
|
2007-03-21 20:08:25 +03:00
|
|
|
unsigned long size;
|
2010-12-17 15:44:25 +03:00
|
|
|
struct strbuf base;
|
|
|
|
int hit, len;
|
|
|
|
|
2019-06-27 12:28:47 +03:00
|
|
|
data = read_object_with_reference(opt->repo,
|
2022-02-05 02:48:34 +03:00
|
|
|
&obj->oid, OBJ_TREE,
|
2007-03-21 20:08:25 +03:00
|
|
|
&size, NULL);
|
2006-05-01 10:28:15 +04:00
|
|
|
if (!data)
|
2015-11-10 05:22:28 +03:00
|
|
|
die(_("unable to read tree (%s)"), oid_to_hex(&obj->oid));
|
2010-12-17 15:44:25 +03:00
|
|
|
|
|
|
|
len = name ? strlen(name) : 0;
|
|
|
|
strbuf_init(&base, PATH_MAX + len + 1);
|
|
|
|
if (len) {
|
|
|
|
strbuf_add(&base, name, len);
|
|
|
|
strbuf_addch(&base, ':');
|
|
|
|
}
|
2007-03-21 20:08:25 +03:00
|
|
|
init_tree_desc(&tree, data, size);
|
2012-10-12 14:49:38 +04:00
|
|
|
hit = grep_tree(opt, pathspec, &tree, &base, base.len,
|
2019-01-12 05:13:22 +03:00
|
|
|
obj->type == OBJ_COMMIT);
|
2010-12-17 15:44:25 +03:00
|
|
|
strbuf_release(&base);
|
2006-05-01 10:28:15 +04:00
|
|
|
free(data);
|
|
|
|
return hit;
|
|
|
|
}
|
2018-02-14 21:59:24 +03:00
|
|
|
die(_("unable to grep from object of type %s"), type_name(obj->type));
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
|
|
|
|
2010-12-15 18:02:51 +03:00
|
|
|
static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
|
2010-06-12 20:31:18 +04:00
|
|
|
const struct object_array *list)
|
|
|
|
{
|
|
|
|
unsigned int i;
|
|
|
|
int hit = 0;
|
|
|
|
const unsigned int nr = list->nr;
|
|
|
|
|
|
|
|
for (i = 0; i < nr; i++) {
|
|
|
|
struct object *real_obj;
|
2020-01-16 05:39:51 +03:00
|
|
|
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_lock();
|
2019-01-12 05:13:22 +03:00
|
|
|
real_obj = deref_tag(opt->repo, list->objects[i].item,
|
2018-06-29 04:22:05 +03:00
|
|
|
NULL, 0);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_unlock();
|
2016-12-16 22:03:21 +03:00
|
|
|
|
2020-10-11 19:03:28 +03:00
|
|
|
if (!real_obj) {
|
|
|
|
char hex[GIT_MAX_HEXSZ + 1];
|
|
|
|
const char *name = list->objects[i].name;
|
|
|
|
|
|
|
|
if (!name) {
|
|
|
|
oid_to_hex_r(hex, &list->objects[i].item->oid);
|
|
|
|
name = hex;
|
|
|
|
}
|
|
|
|
die(_("invalid object '%s' given."), name);
|
|
|
|
}
|
|
|
|
|
2016-12-16 22:03:21 +03:00
|
|
|
/* load the gitmodules file for this rev */
|
|
|
|
if (recurse_submodules) {
|
2019-01-12 05:13:22 +03:00
|
|
|
submodule_free(opt->repo);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_lock();
|
2017-07-14 02:49:20 +03:00
|
|
|
gitmodules_config_oid(&real_obj->oid);
|
grep: replace grep_read_mutex by internal obj read lock
git-grep uses 'grep_read_mutex' to protect its calls to object reading
operations. But these have their own internal lock now, which ensures a
better performance (allowing parallel access to more regions). So, let's
remove the former and, instead, activate the latter with
enable_obj_read_lock().
Sections that are currently protected by 'grep_read_mutex' but are not
internally protected by the object reading lock should be surrounded by
obj_read_lock() and obj_read_unlock(). These guarantee mutual exclusion
with object reading operations, keeping the current behavior and
avoiding race conditions. Namely, these places are:
In grep.c:
- fill_textconv() at fill_textconv_grep().
- userdiff_get_textconv() at grep_source_1().
In builtin/grep.c:
- parse_object_or_die() and the submodule functions at
grep_submodule().
- deref_tag() and gitmodules_config_oid() at grep_objects().
If these functions become thread-safe, in the future, we might remove
the locking and probably get some speedup.
Note that some of the submodule functions will already be thread-safe
(or close to being thread-safe) with the internal object reading lock.
However, as some of them will require additional modifications to be
removed from the critical section, this will be done in its own patch.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:54 +03:00
|
|
|
obj_read_unlock();
|
2016-12-16 22:03:21 +03:00
|
|
|
}
|
2018-03-29 01:35:27 +03:00
|
|
|
if (grep_object(opt, pathspec, real_obj, list->objects[i].name,
|
|
|
|
list->objects[i].path)) {
|
2010-06-12 20:31:18 +04:00
|
|
|
hit = 1;
|
|
|
|
if (opt->status_only)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2011-10-05 05:40:41 +04:00
|
|
|
static int grep_directory(struct grep_opt *opt, const struct pathspec *pathspec,
|
grep: turn off gitlink detection for --no-index
If we are running "git grep --no-index" outside of a git
repository, we behave roughly like "grep -r", examining all
files in the current directory and its subdirectories.
However, because we use fill_directory() to do the
recursion, it will skip over any directories which look like
sub-repositories.
For a normal git operation (like "git grep" in a repository)
this makes sense; we do not want to cross the boundary out
of our current repository into a submodule. But for
"--no-index" without a repository, we should look at all
files, including embedded repositories.
There is one exception, though: we probably should _not_
descend into ".git" directories. Doing so is inefficient and
unlikely to turn up useful hits.
This patch drops our use of dir.c's gitlink-detection, but
we do still avoid ".git". That makes us more like tools such
as "ack" or "ag", which also know to avoid cruft in .git.
As a bonus, this also drops our usage of the ref code
when we are outside of a repository, making the transition
to pluggable ref backends cleaner.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-03-07 18:51:21 +03:00
|
|
|
int exc_std, int use_index)
|
2010-02-06 21:40:08 +03:00
|
|
|
{
|
2021-07-01 13:51:27 +03:00
|
|
|
struct dir_struct dir = DIR_INIT;
|
2010-02-06 21:40:08 +03:00
|
|
|
int i, hit = 0;
|
|
|
|
|
grep: turn off gitlink detection for --no-index
If we are running "git grep --no-index" outside of a git
repository, we behave roughly like "grep -r", examining all
files in the current directory and its subdirectories.
However, because we use fill_directory() to do the
recursion, it will skip over any directories which look like
sub-repositories.
For a normal git operation (like "git grep" in a repository)
this makes sense; we do not want to cross the boundary out
of our current repository into a submodule. But for
"--no-index" without a repository, we should look at all
files, including embedded repositories.
There is one exception, though: we probably should _not_
descend into ".git" directories. Doing so is inefficient and
unlikely to turn up useful hits.
This patch drops our use of dir.c's gitlink-detection, but
we do still avoid ".git". That makes us more like tools such
as "ack" or "ag", which also know to avoid cruft in .git.
As a bonus, this also drops our usage of the ref code
when we are outside of a repository, making the transition
to pluggable ref backends cleaner.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-03-07 18:51:21 +03:00
|
|
|
if (!use_index)
|
|
|
|
dir.flags |= DIR_NO_GITLINKS;
|
2011-09-28 00:43:12 +04:00
|
|
|
if (exc_std)
|
|
|
|
setup_standard_excludes(&dir);
|
2010-02-06 21:40:08 +03:00
|
|
|
|
2019-01-12 05:13:22 +03:00
|
|
|
fill_directory(&dir, opt->repo->index, pathspec);
|
2010-02-06 21:40:08 +03:00
|
|
|
for (i = 0; i < dir.nr; i++) {
|
|
|
|
hit |= grep_file(opt, dir.entries[i]->name);
|
|
|
|
if (hit && opt->status_only)
|
|
|
|
break;
|
|
|
|
}
|
dir: fix problematic API to avoid memory leaks
The dir structure seemed to have a number of leaks and problems around
it. First I noticed that parent_hashmap and recursive_hashmap were
being leaked (though Peff noticed and submitted fixes before me). Then
I noticed in the previous commit that clear_directory() was only taking
responsibility for a subset of fields within dir_struct, despite the
fact that entries[] and ignored[] we allocated internally to dir.c.
That, of course, resulted in many callers either leaking or haphazardly
trying to free these arrays and their contents.
Digging further, I found that despite the pretty clear documentation
near the top of dir.h that folks were supposed to call clear_directory()
when the user no longer needed the dir_struct, there were four callers
that didn't bother doing that at all. However, two of them clearly
thought about leaks since they had an UNLEAK(dir) directive, which to me
suggests that the method to free the data was too unclear. I suspect
the non-obviousness of the API and its holes led folks to avoid it,
which then snowballed into further problems with the entries[],
ignored[], parent_hashmap, and recursive_hashmap problems.
Rename clear_directory() to dir_clear() to be more in line with other
data structures in git, and introduce a dir_init() to handle the
suggested memsetting of dir_struct to all zeroes. I hope that a name
like "dir_clear()" is more clear, and that the presence of dir_init()
will provide a hint to those looking at the code that they need to look
for either a dir_clear() or a dir_free() and lead them to find
dir_clear().
Signed-off-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-08-19 01:58:26 +03:00
|
|
|
dir_clear(&dir);
|
2010-02-06 21:40:08 +03:00
|
|
|
return hit;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
static int context_callback(const struct option *opt, const char *arg,
|
|
|
|
int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
|
|
|
int value;
|
|
|
|
const char *endp;
|
|
|
|
|
|
|
|
if (unset) {
|
|
|
|
grep_opt->pre_context = grep_opt->post_context = 0;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
value = strtol(arg, (char **)&endp, 10);
|
|
|
|
if (*endp) {
|
2011-02-23 02:41:55 +03:00
|
|
|
return error(_("switch `%c' expects a numerical value"),
|
2009-05-07 23:46:48 +04:00
|
|
|
opt->short_name);
|
|
|
|
}
|
|
|
|
grep_opt->pre_context = grep_opt->post_context = value;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
static int file_callback(const struct option *opt, const char *arg, int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
int from_stdin;
|
2009-05-07 23:46:48 +04:00
|
|
|
FILE *patterns;
|
|
|
|
int lno = 0;
|
2009-10-16 18:13:25 +04:00
|
|
|
struct strbuf sb = STRBUF_INIT;
|
2009-05-07 23:46:48 +04:00
|
|
|
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
BUG_ON_OPT_NEG(unset);
|
|
|
|
|
|
|
|
from_stdin = !strcmp(arg, "-");
|
2011-03-19 21:33:15 +03:00
|
|
|
patterns = from_stdin ? stdin : fopen(arg, "r");
|
2009-05-07 23:46:48 +04:00
|
|
|
if (!patterns)
|
2011-02-23 02:41:55 +03:00
|
|
|
die_errno(_("cannot open '%s'"), arg);
|
2015-10-28 23:53:47 +03:00
|
|
|
while (strbuf_getline(&sb, patterns) == 0) {
|
2009-05-07 23:46:48 +04:00
|
|
|
/* ignore empty line like grep does */
|
|
|
|
if (sb.len == 0)
|
|
|
|
continue;
|
2010-05-23 01:43:43 +04:00
|
|
|
|
2012-05-21 20:10:09 +04:00
|
|
|
append_grep_pat(grep_opt, sb.buf, sb.len, arg, ++lno,
|
|
|
|
GREP_PATTERN);
|
2009-05-07 23:46:48 +04:00
|
|
|
}
|
2011-03-19 21:33:15 +03:00
|
|
|
if (!from_stdin)
|
|
|
|
fclose(patterns);
|
2009-05-07 23:46:48 +04:00
|
|
|
strbuf_release(&sb);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
static int not_callback(const struct option *opt, const char *arg, int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
BUG_ON_OPT_NEG(unset);
|
|
|
|
BUG_ON_OPT_ARG(arg);
|
2009-05-07 23:46:48 +04:00
|
|
|
append_grep_pattern(grep_opt, "--not", "command line", 0, GREP_NOT);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
static int and_callback(const struct option *opt, const char *arg, int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
BUG_ON_OPT_NEG(unset);
|
|
|
|
BUG_ON_OPT_ARG(arg);
|
2009-05-07 23:46:48 +04:00
|
|
|
append_grep_pattern(grep_opt, "--and", "command line", 0, GREP_AND);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
static int open_callback(const struct option *opt, const char *arg, int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
BUG_ON_OPT_NEG(unset);
|
|
|
|
BUG_ON_OPT_ARG(arg);
|
2009-05-07 23:46:48 +04:00
|
|
|
append_grep_pattern(grep_opt, "(", "command line", 0, GREP_OPEN_PAREN);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
static int close_callback(const struct option *opt, const char *arg, int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
BUG_ON_OPT_NEG(unset);
|
|
|
|
BUG_ON_OPT_ARG(arg);
|
2009-05-07 23:46:48 +04:00
|
|
|
append_grep_pattern(grep_opt, ")", "command line", 0, GREP_CLOSE_PAREN);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-21 02:05:22 +04:00
|
|
|
/*
 * parse-options callback for the "-e <pattern>" option.
 *
 * Appends "arg" to the struct grep_opt stored in opt->value as a
 * GREP_PATTERN entry, tagged with the string "-e option".
 * Negation is not supported: BUG_ON_OPT_NEG() checks "unset", so the
 * option must be declared with PARSE_OPT_NONEG.
 *
 * Always returns 0.
 */
static int pattern_callback(const struct option *opt, const char *arg,
|
|
|
|
int unset)
|
2009-05-07 23:46:48 +04:00
|
|
|
{
|
|
|
|
struct grep_opt *grep_opt = opt->value;
|
assert NOARG/NONEG behavior of parse-options callbacks
When we define a parse-options callback, the flags we put in the option
struct must match what the callback expects. For example, a callback
which does not handle the "unset" parameter should only be used with
PARSE_OPT_NONEG. But since the callback and the option struct are not
defined next to each other, it's easy to get this wrong (as earlier
patches in this series show).
Fortunately, the compiler can help us here: compiling with
-Wunused-parameters can show us which callbacks ignore their "unset"
parameters (and likewise, ones that ignore "arg" expect to be triggered
with PARSE_OPT_NOARG).
But after we've inspected a callback and determined that all of its
callers use the right flags, what do we do next? We'd like to silence
the compiler warning, but do so in a way that will catch any wrong calls
in the future.
We can do that by actually checking those variables and asserting that
they match our expectations. Because this is such a common pattern,
we'll introduce some helper macros. The resulting messages aren't
as descriptive as we could make them, but the file/line information from
BUG() is enough to identify the problem (and anyway, the point is that
these should never be seen).
Each of the annotated callbacks in this patch triggers
-Wunused-parameters, and was manually inspected to make sure all callers
use the correct options (so none of these BUGs should be triggerable).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2018-11-05 09:45:42 +03:00
|
|
|
BUG_ON_OPT_NEG(unset);
|
2009-05-07 23:46:48 +04:00
|
|
|
append_grep_pattern(grep_opt, arg, "-e option", 0, GREP_PATTERN);
|
|
|
|
return 0;
|
|
|
|
}
|
2006-05-01 10:28:15 +04:00
|
|
|
|
2006-07-29 09:44:25 +04:00
|
|
|
int cmd_grep(int argc, const char **argv, const char *prefix)
|
2006-05-01 10:28:15 +04:00
|
|
|
{
|
|
|
|
int hit = 0;
|
2011-09-28 00:43:12 +04:00
|
|
|
int cached = 0, untracked = 0, opt_exclude = -1;
|
2006-05-09 10:55:47 +04:00
|
|
|
int seen_dashdash = 0;
|
2010-01-13 06:06:41 +03:00
|
|
|
int external_grep_allowed__ignored;
|
2010-06-12 20:39:46 +04:00
|
|
|
const char *show_in_pager = NULL, *default_pager = "dummy";
|
2006-05-01 10:28:15 +04:00
|
|
|
struct grep_opt opt;
|
2010-08-29 06:04:17 +04:00
|
|
|
struct object_array list = OBJECT_ARRAY_INIT;
|
2010-12-15 18:02:51 +03:00
|
|
|
struct pathspec pathspec;
|
2021-10-22 11:55:41 +03:00
|
|
|
struct string_list path_list = STRING_LIST_INIT_DUP;
|
2006-05-09 10:55:47 +04:00
|
|
|
int i;
|
2009-05-07 23:46:48 +04:00
|
|
|
int dummy;
|
2010-08-06 07:06:39 +04:00
|
|
|
int use_index = 1;
|
2017-02-15 00:54:36 +03:00
|
|
|
int allow_revs;
|
2011-05-10 05:48:36 +04:00
|
|
|
|
2009-05-07 23:46:48 +04:00
|
|
|
struct option options[] = {
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "cached", &cached,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("search in index instead of in the work tree")),
|
2012-02-28 23:06:09 +04:00
|
|
|
OPT_NEGBIT(0, "no-index", &use_index,
|
2012-08-20 16:32:55 +04:00
|
|
|
N_("find in contents not managed by git"), 1),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "untracked", &untracked,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("search in both tracked and untracked files")),
|
2011-09-28 00:43:12 +04:00
|
|
|
OPT_SET_INT(0, "exclude-standard", &opt_exclude,
|
2015-02-27 17:01:58 +03:00
|
|
|
N_("ignore files specified via '.gitignore'"), 1),
|
2016-12-16 22:03:20 +03:00
|
|
|
OPT_BOOL(0, "recurse-submodules", &recurse_submodules,
|
2017-03-17 20:22:53 +03:00
|
|
|
N_("recursively search in each submodule")),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_GROUP(""),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('v', "invert-match", &opt.invert,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show non-matching lines")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('i', "ignore-case", &opt.ignore_case,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("case insensitive matching")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('w', "word-regexp", &opt.word_regexp,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("match patterns only at word boundaries")),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_SET_INT('a', "text", &opt.binary,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("process binary files as text"), GREP_BINARY_TEXT),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_SET_INT('I', NULL, &opt.binary,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("don't match patterns in binary files"),
|
2009-05-07 23:46:48 +04:00
|
|
|
GREP_BINARY_NOMATCH),
|
2013-05-10 19:10:15 +04:00
|
|
|
OPT_BOOL(0, "textconv", &opt.allow_textconv,
|
|
|
|
N_("process binary files with textconv filters")),
|
2018-10-01 22:15:57 +03:00
|
|
|
OPT_SET_INT('r', "recursive", &opt.max_depth,
|
|
|
|
N_("search in subdirectories (default)"), -1),
|
2012-08-20 16:32:15 +04:00
|
|
|
{ OPTION_INTEGER, 0, "max-depth", &opt.max_depth, N_("depth"),
|
|
|
|
N_("descend at most <depth> levels"), PARSE_OPT_NONEG,
|
grep: Add --max-depth option.
It is useful to grep directories non-recursively, e.g. when one wants to
look for all files in the toplevel directory, but not in any subdirectory,
or in Documentation/, but not in Documentation/technical/.
This patch adds support for the --max-depth <depth> option to git-grep. If it is
given, git-grep descends at most <depth> levels of directories below paths
specified on the command line.
Note that if a path specified on the command line contains wildcards, this option
makes no sense, e.g.
$ git grep -l --max-depth 0 GNU -- 'contrib/*'
(note the quotes) will search all files in contrib/, even in
subdirectories, because '*' matches all files.
Documentation updates, bash-completion and simple test cases are also
provided.
Signed-off-by: Michał Kiedrowicz <michal.kiedrowicz@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2009-07-22 21:52:15 +04:00
|
|
|
NULL, 1 },
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_GROUP(""),
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before, both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variables, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c grep.extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which selects ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:39 +03:00
|
|
|
OPT_SET_INT('E', "extended-regexp", &opt.pattern_type_option,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("use extended POSIX regular expressions"),
|
grep: add a grep.patternType configuration setting
The grep.extendedRegexp configuration setting enables the -E flag on grep
by default but there are no equivalents for the -G, -F and -P flags.
Rather than adding an additional setting for grep.fooRegexp for current
and future pattern matching options, add a grep.patternType setting that
can accept appropriate values for modifying the default grep pattern
matching behavior. The current values are "basic", "extended", "fixed",
"perl" and "default" for setting -G, -E, -F, -P and the default behavior
respectively.
When grep.patternType is set to a value other than "default", the
grep.extendedRegexp setting is ignored. The value of "default" restores
the current default behavior, including the grep.extendedRegexp
behavior.
Signed-off-by: J Smith <dark.panda@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-08-03 18:53:50 +04:00
|
|
|
GREP_PATTERN_TYPE_ERE),
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before, both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variables, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c grep.extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which selects ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:39 +03:00
|
|
|
OPT_SET_INT('G', "basic-regexp", &opt.pattern_type_option,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("use basic POSIX regular expressions (default)"),
|
grep: add a grep.patternType configuration setting
The grep.extendedRegexp configuration setting enables the -E flag on grep
by default but there are no equivalents for the -G, -F and -P flags.
Rather than adding an additional setting for grep.fooRegexp for current
and future pattern matching options, add a grep.patternType setting that
can accept appropriate values for modifying the default grep pattern
matching behavior. The current values are "basic", "extended", "fixed",
"perl" and "default" for setting -G, -E, -F, -P and the default behavior
respectively.
When grep.patternType is set to a value other than "default", the
grep.extendedRegexp setting is ignored. The value of "default" restores
the current default behavior, including the grep.extendedRegexp
behavior.
Signed-off-by: J Smith <dark.panda@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-08-03 18:53:50 +04:00
|
|
|
GREP_PATTERN_TYPE_BRE),
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before, both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variables, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c grep.extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which selects ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:39 +03:00
|
|
|
OPT_SET_INT('F', "fixed-strings", &opt.pattern_type_option,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("interpret patterns as fixed strings"),
|
grep: add a grep.patternType configuration setting
The grep.extendedRegexp configuration setting enables the -E flag on grep
by default but there are no equivalents for the -G, -F and -P flags.
Rather than adding an additional setting for grep.fooRegexp for current
and future pattern matching options, add a grep.patternType setting that
can accept appropriate values for modifying the default grep pattern
matching behavior. The current values are "basic", "extended", "fixed",
"perl" and "default" for setting -G, -E, -F, -P and the default behavior
respectively.
When grep.patternType is set to a value other than "default", the
grep.extendedRegexp setting is ignored. The value of "default" restores
the current default behavior, including the grep.extendedRegexp
behavior.
Signed-off-by: J Smith <dark.panda@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-08-03 18:53:50 +04:00
|
|
|
GREP_PATTERN_TYPE_FIXED),
|
grep: simplify config parsing and option parsing
Simplify the parsing of "grep.patternType" and
"grep.extendedRegexp". This changes no behavior, but gets rid of
complex parsing logic that isn't needed anymore.
When "grep.patternType" was introduced in 84befcd0a4a (grep: add a
grep.patternType configuration setting, 2012-08-03) we promised that:
1. You can set "grep.patternType", and "[setting it to] 'default'
will return to the default matching behavior".
In that context "the default" meant whatever the configuration
system specified before that change, i.e. via grep.extendedRegexp.
2. We'd support the existing "grep.extendedRegexp" option, but ignore
it when the new "grep.patternType" option is set. We said we'd
only ignore the older "grep.extendedRegexp" option "when the
`grep.patternType` option is set to a value other than
'default'".
In a preceding commit we changed grep_config() to be called after
grep_init(), which means that much of the complexity here can go
away.
As before, both "grep.patternType" and "grep.extendedRegexp" are
last-one-wins variables, with "grep.extendedRegexp" yielding to
"grep.patternType", except when "grep.patternType=default".
Note that as the previously added tests indicate this cannot be done
on-the-fly as we see the config variables, without introducing more
state keeping. I.e. if we see:
-c grep.extendedRegexp=false
-c grep.patternType=default
-c grep.extendedRegexp=true
We need to select ERE, since grep.patternType=default unselects that
variable, which normally has higher precedence, but we also need to
select BRE in cases of:
-c grep.extendedRegexp=true \
-c grep.extendedRegexp=false
Which would not be the case for this, which selects ERE:
-c grep.patternType=extended \
-c grep.extendedRegexp=false
Therefore we cannot do this on-the-fly in grep_config without also
introducing tracking variables for not only the pattern type, but what
the source of that pattern type was.
So we need to decide on the pattern after our config was fully
parsed. Let's do that by deferring the decision on the pattern type
until it's time to compile it in compile_regexp().
By that time we've not only parsed the config, but also handled the
command-line options. Those will set "opt.pattern_type_option" (*not*
"opt.extended_regexp_option"!).
At that point all we need to do is see if "grep.patternType" was
UNSPECIFIED in the end (including an explicit "=default"), if so we'll
use the "grep.extendedRegexp" configuration, if any.
See my 07a3d411739 (grep: remove regflags from the public grep_opt
API, 2017-06-29) for addition of the two comments being removed here,
i.e. the complexity noted in that commit is now going away.
1. https://lore.kernel.org/git/patch-v8-09.10-c211bb0c69d-20220118T155211Z-avarab@gmail.com/
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:39 +03:00
|
|
|
OPT_SET_INT('P', "perl-regexp", &opt.pattern_type_option,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("use Perl-compatible regular expressions"),
|
grep: add a grep.patternType configuration setting
The grep.extendedRegexp configuration setting enables the -E flag on grep
by default but there are no equivalents for the -G, -F and -P flags.
Rather than adding an additional setting for grep.fooRegexp for current
and future pattern matching options, add a grep.patternType setting that
can accept appropriate values for modifying the default grep pattern
matching behavior. The current values are "basic", "extended", "fixed",
"perl" and "default" for setting -G, -E, -F, -P and the default behavior
respectively.
When grep.patternType is set to a value other than "default", the
grep.extendedRegexp setting is ignored. The value of "default" restores
the current default behavior, including the grep.extendedRegexp
behavior.
Signed-off-by: J Smith <dark.panda@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2012-08-03 18:53:50 +04:00
|
|
|
GREP_PATTERN_TYPE_PCRE),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_GROUP(""),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('n', "line-number", &opt.linenum, N_("show line numbers")),
|
2018-06-22 18:49:45 +03:00
|
|
|
OPT_BOOL(0, "column", &opt.columnnum, N_("show column number of first match")),
|
2012-08-20 16:32:15 +04:00
|
|
|
OPT_NEGBIT('h', NULL, &opt.pathname, N_("don't show filenames"), 1),
|
|
|
|
OPT_BIT('H', NULL, &opt.pathname, N_("show filenames"), 1),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_NEGBIT(0, "full-name", &opt.relative,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show filenames relative to top directory"), 1),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('l', "files-with-matches", &opt.name_only,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show only filenames instead of matching lines")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "name-only", &opt.name_only,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("synonym for --files-with-matches")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('L', "files-without-match",
|
2009-05-07 23:46:48 +04:00
|
|
|
&opt.unmatch_name_only,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show only the names of files without match")),
|
2018-02-09 14:01:59 +03:00
|
|
|
OPT_BOOL_F('z', "null", &opt.null_following_name,
|
|
|
|
N_("print NUL after filenames"),
|
|
|
|
PARSE_OPT_NOCOMPLETE),
|
2018-07-09 23:33:47 +03:00
|
|
|
OPT_BOOL('o', "only-matching", &opt.only_matching,
|
|
|
|
N_("show only matching parts of a line")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('c', "count", &opt.count,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show the number of matches instead of matching lines")),
|
|
|
|
OPT__COLOR(&opt.color, N_("highlight matches")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "break", &opt.file_break,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("print empty line between matches from different files")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "heading", &opt.heading,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show filename only once above matches from same file")),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_GROUP(""),
|
2012-08-20 16:32:15 +04:00
|
|
|
OPT_CALLBACK('C', "context", &opt, N_("n"),
|
|
|
|
N_("show <n> context lines before and after matches"),
|
2009-05-07 23:46:48 +04:00
|
|
|
context_callback),
|
2011-08-01 21:22:52 +04:00
|
|
|
OPT_INTEGER('B', "before-context", &opt.pre_context,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show <n> context lines before matches")),
|
2011-08-01 21:22:52 +04:00
|
|
|
OPT_INTEGER('A', "after-context", &opt.post_context,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show <n> context lines after matches")),
|
2015-12-15 18:31:39 +03:00
|
|
|
OPT_INTEGER(0, "threads", &num_threads,
|
|
|
|
N_("use <n> worker threads")),
|
2012-08-20 16:32:15 +04:00
|
|
|
OPT_NUMBER_CALLBACK(&opt, N_("shortcut for -C NUM"),
|
2009-05-07 23:46:48 +04:00
|
|
|
context_callback),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('p', "show-function", &opt.funcname,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show a line with the function name before matches")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL('W', "function-context", &opt.funcbody,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show the surrounding function")),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_GROUP(""),
|
2012-08-20 16:32:15 +04:00
|
|
|
OPT_CALLBACK('f', NULL, &opt, N_("file"),
|
|
|
|
N_("read patterns from file"), file_callback),
|
Use OPT_CALLBACK and OPT_CALLBACK_F
In the codebase, there are many options which use OPTION_CALLBACK in a
plain ol' struct definition. However, we have the OPT_CALLBACK and
OPT_CALLBACK_F macros which are meant to abstract these plain struct
definitions away. These macros are useful as they semantically signal to
developers that these are just normal callback options with nothing fancy
happening.
Replace plain struct definitions of OPTION_CALLBACK with OPT_CALLBACK or
OPT_CALLBACK_F where applicable. The heavy lifting was done using the
following (disgusting) shell script:
#!/bin/sh
do_replacement () {
tr '\n' '\r' |
sed -e 's/{\s*OPTION_CALLBACK,\s*\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\s*0,\(\s*[^[:space:]}]*\)\s*}/OPT_CALLBACK(\1,\2,\3,\4,\5,\6)/g' |
sed -e 's/{\s*OPTION_CALLBACK,\s*\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\(\s*[^[:space:]}]*\)\s*}/OPT_CALLBACK_F(\1,\2,\3,\4,\5,\6,\7)/g' |
tr '\r' '\n'
}
for f in $(git ls-files \*.c)
do
do_replacement <"$f" >"$f.tmp"
mv "$f.tmp" "$f"
done
The result was manually inspected and then reformatted to match the
style of the surrounding code. Finally, using
`git grep OPTION_CALLBACK \*.c`, leftover results which were not handled
by the script were manually transformed.
Signed-off-by: Denton Liu <liu.denton@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-28 11:36:28 +03:00
|
|
|
OPT_CALLBACK_F('e', NULL, &opt, N_("pattern"),
|
|
|
|
N_("match <pattern>"), PARSE_OPT_NONEG, pattern_callback),
|
|
|
|
OPT_CALLBACK_F(0, "and", &opt, NULL,
|
|
|
|
N_("combine patterns specified with -e"),
|
|
|
|
PARSE_OPT_NOARG | PARSE_OPT_NONEG, and_callback),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "or", &dummy, ""),
|
Use OPT_CALLBACK and OPT_CALLBACK_F
In the codebase, there are many options which use OPTION_CALLBACK in a
plain ol' struct definition. However, we have the OPT_CALLBACK and
OPT_CALLBACK_F macros which are meant to abstract these plain struct
definitions away. These macros are useful as they semantically signal to
developers that these are just normal callback options with nothing fancy
happening.
Replace plain struct definitions of OPTION_CALLBACK with OPT_CALLBACK or
OPT_CALLBACK_F where applicable. The heavy lifting was done using the
following (disgusting) shell script:
#!/bin/sh
do_replacement () {
tr '\n' '\r' |
sed -e 's/{\s*OPTION_CALLBACK,\s*\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\s*0,\(\s*[^[:space:]}]*\)\s*}/OPT_CALLBACK(\1,\2,\3,\4,\5,\6)/g' |
sed -e 's/{\s*OPTION_CALLBACK,\s*\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\([^,]*\),\(\s*[^[:space:]}]*\)\s*}/OPT_CALLBACK_F(\1,\2,\3,\4,\5,\6,\7)/g' |
tr '\r' '\n'
}
for f in $(git ls-files \*.c)
do
do_replacement <"$f" >"$f.tmp"
mv "$f.tmp" "$f"
done
The result was manually inspected and then reformatted to match the
style of the surrounding code. Finally, using
`git grep OPTION_CALLBACK \*.c`, leftover results which were not handled
by the script were manually transformed.
Signed-off-by: Denton Liu <liu.denton@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-04-28 11:36:28 +03:00
|
|
|
OPT_CALLBACK_F(0, "not", &opt, NULL, "",
|
|
|
|
PARSE_OPT_NOARG | PARSE_OPT_NONEG, not_callback),
|
|
|
|
OPT_CALLBACK_F('(', NULL, &opt, NULL, "",
|
|
|
|
PARSE_OPT_NOARG | PARSE_OPT_NONEG | PARSE_OPT_NODASH,
|
|
|
|
open_callback),
|
|
|
|
OPT_CALLBACK_F(')', NULL, &opt, NULL, "",
|
|
|
|
PARSE_OPT_NOARG | PARSE_OPT_NONEG | PARSE_OPT_NODASH,
|
|
|
|
close_callback),
|
2010-11-08 21:06:54 +03:00
|
|
|
OPT__QUIET(&opt.status_only,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("indicate hit with exit status without output")),
|
2013-08-03 15:51:19 +04:00
|
|
|
OPT_BOOL(0, "all-match", &opt.all_match,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("show only matches from files that match all patterns")),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_GROUP(""),
|
2010-06-12 20:39:46 +04:00
|
|
|
{ OPTION_STRING, 'O', "open-files-in-pager", &show_in_pager,
|
2012-08-20 16:32:15 +04:00
|
|
|
N_("pager"), N_("show matching files in the pager"),
|
2018-02-09 14:01:59 +03:00
|
|
|
PARSE_OPT_OPTARG | PARSE_OPT_NOCOMPLETE,
|
|
|
|
NULL, (intptr_t)default_pager },
|
|
|
|
OPT_BOOL_F(0, "ext-grep", &external_grep_allowed__ignored,
|
|
|
|
N_("allow calling of grep(1) (ignored by this build)"),
|
|
|
|
PARSE_OPT_NOCOMPLETE),
|
2022-06-22 22:47:32 +03:00
|
|
|
OPT_INTEGER('m', "max-count", &opt.max_count,
|
|
|
|
N_("maximum number of results per file")),
|
2009-05-07 23:46:48 +04:00
|
|
|
OPT_END()
|
|
|
|
};
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a subsequent change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:34 +03:00
|
|
|
grep_prefix = prefix;
|
2006-05-01 10:28:15 +04:00
|
|
|
|
built-ins: trust the "prefix" from run_builtin()
Change code in "builtin/grep.c" and "builtin/ls-tree.c" to trust the
"prefix" passed from "run_builtin()". The "prefix" we get from setup.c
is either going to be NULL or a string of length >0, never "".
So we can drop the "prefix && *prefix" checks added for
"builtin/grep.c" in 0d042fecf2f (git-grep: show pathnames relative to
the current directory, 2006-08-11), and for "builtin/ls-tree.c" in
a69dd585fca (ls-tree: chomp leading directories when run from a
subdirectory, 2005-12-23).
As seen in code in revision.c that was added in cd676a51367 (diff
--relative: output paths as relative to the current subdirectory,
2008-02-12) we already have existing code that does away with this
assertion.
This makes it easier to reason about a change to the
"prefix_length" code in grep.c in a subsequent commit, and since we're
going to the trouble of doing that let's leave behind an assert() to
promise this to any future callers.
For "builtin/grep.c" it would be painful to pass the "prefix" down the
callchain of:
cmd_grep -> grep_tree -> grep_submodule -> grep_cache -> grep_oid ->
grep_source_name
So for the code that needs it in grep_source_name() let's add a
"grep_prefix" variable similar to the existing "ls_tree_prefix".
While at it let's move the code in cmd_ls_tree() around so that we
assign to the "ls_tree_prefix" right after declaring the variables,
and stop assigning to "prefix". We only subsequently used that
variable later in the function after clobbering it. Let's just use our
own "grep_prefix" instead.
Let's also add an assert() in git.c, so that we'll make this promise
about the "prefix" to any current and future callers, as well as to
any readers of the code.
Code history:
* The strlen() in "grep.c" hasn't been used since 493b7a08d80 (grep:
accept relative paths outside current working directory, 2009-09-05).
When that code was added in 0d042fecf2f (git-grep: show pathnames
relative to the current directory, 2006-08-11) we used the length.
But since 493b7a08d80 we haven't used it for anything except a
boolean check that we could have done on the "prefix" member
itself.
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-02-16 03:00:34 +03:00
|
|
|
grep_init(&opt, the_repository);
|
2022-02-16 03:00:36 +03:00
|
|
|
git_config(grep_cmd_config, &opt);
|
2009-03-07 15:32:32 +03:00
|
|
|
|
2006-05-01 10:28:15 +04:00
|
|
|
/*
|
2006-05-09 10:55:47 +04:00
|
|
|
* If there is no -- then the paths must exist in the working
|
|
|
|
* tree. If there is no explicit pattern specified with -e or
|
|
|
|
* -f, we take the first unrecognized non option to be the
|
|
|
|
* pattern, but then what follows it must be zero or more
|
|
|
|
* valid refs up to the -- (if exists), and then existing
|
|
|
|
* paths. If there is an explicit pattern, then the first
|
2006-07-10 09:50:18 +04:00
|
|
|
* unrecognized non option is the beginning of the refs list
|
2006-05-09 10:55:47 +04:00
|
|
|
* that continues up to the -- (if exists), and then paths.
|
2006-05-01 10:28:15 +04:00
|
|
|
*/
|
2009-05-23 22:53:12 +04:00
|
|
|
argc = parse_options(argc, argv, prefix, options, grep_usage,
|
2009-05-07 23:46:48 +04:00
|
|
|
PARSE_OPT_KEEP_DASHDASH |
|
2015-11-17 13:25:53 +03:00
|
|
|
PARSE_OPT_STOP_AT_NON_OPTION);
|
2009-05-07 23:46:48 +04:00
|
|
|
|
builtin/grep.c: integrate with sparse index
Turn on sparse index and remove ensure_full_index().
Before this patch, `git-grep` utilizes the ensure_full_index() method to
expand the index and search all the entries. Because this method
requires walking all the trees and constructing the index, it is the
slow part within the whole command.
To achieve better performance, this patch uses grep_tree() to search the
sparse directory entries and get rid of the ensure_full_index() method.
Why grep_tree() is a better choice over ensure_full_index()?
1) grep_tree() is as correct as ensure_full_index(). grep_tree() looks
into every sparse-directory entry (represented by a tree) recursively
when looping over the index, and the result of doing so matches the
result of expanding the index.
2) grep_tree() utilizes pathspecs to limit the scope of searching.
ensure_full_index() always expands the index, which means it will
always walk all the trees and blobs in the repo without caring if
the user only wants a subset of the content, i.e. using a pathspec.
On the other hand, grep_tree() will only search the contents that
match the pathspec, and thus possibly walking fewer trees.
3) grep_tree() does not construct and copy back a new index, while
ensure_full_index() does. This also saves some time.
----------------
Performance test
- Summary:
p2000 tests demonstrate a ~71% execution time reduction for
`git grep --cached bogus -- "f2/f1/f1/*"` using tree-walking logic.
However, notice that this result varies depending on the pathspec
given. See below "Command used for testing" for more details.
Test HEAD~ HEAD
-------------------------------------------------------
2000.78: git grep ... (full-v3) 0.35 0.39 (≈)
2000.79: git grep ... (full-v4) 0.36 0.30 (≈)
2000.80: git grep ... (sparse-v3) 0.88 0.23 (-73.8%)
2000.81: git grep ... (sparse-v4) 0.83 0.26 (-68.6%)
- Command used for testing:
git grep --cached bogus -- "f2/f1/f1/*"
The reason for specifying a pathspec is that, if we don't specify a
pathspec, then grep_tree() will walk all the trees and blobs to find the
pattern, and the time consumed doing so is not too different from using
the original ensure_full_index() method, which also spends most of the
time walking trees. However, when a pathspec is specified, this latest
logic will only walk the area of trees enclosed by the pathspec, and the
time consumed is reasonably a lot less.
Generally speaking, because the performance gain is achieved by walking
fewer trees (only those selected by the pathspec), the ratio of HEAD time
to HEAD~ time in sparse-v[3|4] should be proportional to the ratio of
"pathspec enclosed area" to "all area". Namely, the wider the area the
<pathspec> encompasses, the smaller the performance difference between
HEAD~ and HEAD, and vice versa.
That is, if we don't specify a pathspec, the performance difference [1]
is indistinguishable: both methods walk all the trees and take generally
the same amount of time (even with the index construction time included for
ensure_full_index()).
[1] Performance test result without pathspec (hence walking all trees):
Command used:
git grep --cached bogus
Test HEAD~ HEAD
---------------------------------------------------
2000.78: git grep ... (full-v3) 6.17 5.19 (≈)
2000.79: git grep ... (full-v4) 6.19 5.46 (≈)
2000.80: git grep ... (sparse-v3) 6.57 6.44 (≈)
2000.81: git grep ... (sparse-v4) 6.65 6.28 (≈)
--------------------------
NEEDSWORK about submodules
There are a few NEEDSWORKs that belong to improvements beyond this
topic. See the NEEDSWORK in builtin/grep.c::grep_submodule() for
more context. The other two NEEDSWORKs in t1092 are also related.
Suggested-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Derrick Stolee <derrickstolee@github.com>
Helped-by: Victoria Dye <vdye@github.com>
Helped-by: Elijah Newren <newren@gmail.com>
Signed-off-by: Shaoxuan Yuan <shaoxuan.yuan02@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2022-09-23 07:18:42 +03:00
|
|
|
if (the_repository->gitdir) {
|
|
|
|
prepare_repo_settings(the_repository);
|
|
|
|
the_repository->settings.command_requires_full_index = 0;
|
|
|
|
}
|
|
|
|
|
2016-01-12 13:40:26 +03:00
|
|
|
if (use_index && !startup_info->have_repository) {
|
|
|
|
int fallback = 0;
|
|
|
|
git_config_get_bool("grep.fallbacktonoindex", &fallback);
|
|
|
|
if (fallback)
|
|
|
|
use_index = 0;
|
|
|
|
else
|
|
|
|
/* die the same way as if we did it at the beginning */
|
|
|
|
setup_git_directory();
|
|
|
|
}
|
2020-01-30 16:37:28 +03:00
|
|
|
/* Ignore --recurse-submodules if --no-index is given or implied */
|
|
|
|
if (!use_index)
|
|
|
|
recurse_submodules = 0;
|
2010-02-06 21:40:08 +03:00
|
|
|
|
2010-02-07 07:44:15 +03:00
|
|
|
/*
|
|
|
|
* skip a -- separator; we know it cannot be
|
|
|
|
* separating revisions from pathnames if
|
|
|
|
* we haven't even had any patterns yet
|
|
|
|
*/
|
|
|
|
if (argc > 0 && !opt.pattern_list && !strcmp(argv[0], "--")) {
|
|
|
|
argv++;
|
|
|
|
argc--;
|
|
|
|
}
|
|
|
|
|
2009-05-07 23:46:48 +04:00
|
|
|
/* First unrecognized non-option token */
|
|
|
|
if (argc > 0 && !opt.pattern_list) {
|
|
|
|
append_grep_pattern(&opt, argv[0], "command line", 0,
|
|
|
|
GREP_PATTERN);
|
|
|
|
argv++;
|
|
|
|
argc--;
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
2006-05-09 10:55:47 +04:00
|
|
|
|
2010-06-12 20:39:46 +04:00
|
|
|
if (show_in_pager == default_pager)
|
|
|
|
show_in_pager = git_pager(1);
|
2010-06-12 20:36:51 +04:00
|
|
|
if (show_in_pager) {
|
2010-07-03 06:55:06 +04:00
|
|
|
opt.color = 0;
|
2010-06-12 20:39:46 +04:00
|
|
|
opt.name_only = 1;
|
|
|
|
opt.null_following_name = 1;
|
|
|
|
opt.output_priv = &path_list;
|
|
|
|
opt.output = append_path;
|
2010-06-26 03:41:39 +04:00
|
|
|
string_list_append(&path_list, show_in_pager);
|
2010-06-12 20:36:51 +04:00
|
|
|
}
|
|
|
|
|
2006-05-03 02:40:49 +04:00
|
|
|
if (!opt.pattern_list)
|
2018-07-21 10:49:19 +03:00
|
|
|
die(_("no pattern given"));
|
2010-01-26 01:51:39 +03:00
|
|
|
|
2018-07-09 23:33:47 +03:00
|
|
|
/* --only-matching has no effect with --invert. */
|
|
|
|
if (opt.invert)
|
|
|
|
opt.only_matching = 0;
|
|
|
|
|
grep: fix "--" rev/pathspec disambiguation
If we see "git grep pattern rev -- file" then we apply the
usual rev/pathspec disambiguation rules: any "rev" before
the "--" must be a revision, and we do not need to apply the
verify_non_filename() check.
But there are two bugs here:
1. We keep a seen_dashdash flag to handle this case, but
we set it in the same left-to-right pass over the
arguments in which we parse "rev".
So when we see "rev", we do not yet know that there is
a "--", and we mistakenly complain if there is a
matching file.
We can fix this by making a preliminary pass over the
arguments to find the "--", and only then checking the rev
arguments.
2. If we can't resolve "rev" but there isn't a dashdash,
that's OK. We treat it like a path, and complain later
if it doesn't exist.
But if there _is_ a dashdash, then we know it must be a
rev, and should treat it as such, complaining if it
does not resolve. The current code instead ignores it
and tries to treat it like a path.
This patch fixes both bugs, and tries to comment the parsing
flow a bit better.
It adds tests that cover the two bugs, but also some related
situations (which already worked, but this confirms that our
fixes did not break anything).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-14 09:05:55 +03:00
|
|
|
/*
|
|
|
|
* We have to find "--" in a separate pass, because its presence
|
|
|
|
* influences how we will parse arguments that come before it.
|
|
|
|
*/
|
|
|
|
for (i = 0; i < argc; i++) {
|
|
|
|
if (!strcmp(argv[i], "--")) {
|
|
|
|
seen_dashdash = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Resolve any rev arguments. If we have a dashdash, then everything up
|
|
|
|
* to it must resolve as a rev. If not, then we stop at the first
|
|
|
|
* non-rev and assume everything else is a path.
|
|
|
|
*/
|
2017-02-15 00:54:36 +03:00
|
|
|
allow_revs = use_index && !untracked;
|
2009-05-07 23:46:48 +04:00
|
|
|
for (i = 0; i < argc; i++) {
|
2006-05-09 10:55:47 +04:00
|
|
|
const char *arg = argv[i];
|
2017-02-22 02:47:25 +03:00
|
|
|
struct object_id oid;
|
2013-05-10 19:10:16 +04:00
|
|
|
struct object_context oc;
|
2017-02-14 09:04:17 +03:00
|
|
|
struct object *object;
|
|
|
|
|
2017-02-14 09:03:03 +03:00
|
|
|
if (!strcmp(arg, "--")) {
|
|
|
|
i++;
|
|
|
|
break;
|
|
|
|
}
|
2017-02-14 09:04:17 +03:00
|
|
|
|
2017-02-15 00:54:36 +03:00
|
|
|
if (!allow_revs) {
|
grep: avoid resolving revision names in --no-index case
We disallow the use of revisions with --no-index, but we
don't actually check and complain until well after we've
parsed the revisions.
This is the cause of a few problems:
1. We shouldn't be calling get_sha1() at all when we aren't
in a repository, as it might access the ref or object
databases. For now, this should generally just return
failure, but eventually it will become a BUG().
2. When there's a "--" disambiguator and you're outside a
repository, we'll complain early with "unable to resolve
revision". But we can give a much more specific error.
3. When there isn't a "--" disambiguator, we still do the
normal rev/path checks. This is silly, as we know we
cannot have any revs with --no-index. Everything we see
must be a path.
Outside of a repository this doesn't matter (since we
know it won't resolve), but inside one, we may complain
unnecessarily if a filename happens to also match a
refname.
This patch skips the get_sha1() call entirely in the
no-index case, and behaves as if it failed (with the
exception of giving a better error message).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-14 09:07:29 +03:00
|
|
|
if (seen_dashdash)
|
2017-02-15 00:54:36 +03:00
|
|
|
die(_("--no-index or --untracked cannot be used with revs"));
|
grep: avoid resolving revision names in --no-index case
We disallow the use of revisions with --no-index, but we
don't actually check and complain until well after we've
parsed the revisions.
This is the cause of a few problems:
1. We shouldn't be calling get_sha1() at all when we aren't
in a repository, as it might access the ref or object
databases. For now, this should generally just return
failure, but eventually it will become a BUG().
2. When there's a "--" disambiguator and you're outside a
repository, we'll complain early with "unable to resolve
revision". But we can give a much more specific error.
3. When there isn't a "--" disambiguator, we still do the
normal rev/path checks. This is silly, as we know we
cannot have any revs with --no-index. Everything we see
must be a path.
Outside of a repository this doesn't matter (since we
know it won't resolve), but inside one, we may complain
unnecessarily if a filename happens to also match a
refname.
This patch skips the get_sha1() call entirely in the
no-index case, and behaves as if it failed (with the
exception of giving a better error message).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-14 09:07:29 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2019-01-12 05:13:28 +03:00
|
|
|
if (get_oid_with_context(the_repository, arg,
|
|
|
|
GET_OID_RECORD_PATH,
|
sha1_name: convert get_sha1* to get_oid*
Now that all the callers of get_sha1 directly or indirectly use struct
object_id, rename the functions starting with get_sha1 to start with
get_oid. Convert the internals in sha1_name.c to use struct object_id
as well, and eliminate explicit length checks where possible. Convert a
use of 40 in get_oid_basic to GIT_SHA1_HEXSZ.
Outside of sha1_name.c and cache.h, this transition was made with the
following semantic patch:
@@
expression E1, E2;
@@
- get_sha1(E1, E2.hash)
+ get_oid(E1, &E2)
@@
expression E1, E2;
@@
- get_sha1(E1, E2->hash)
+ get_oid(E1, E2)
@@
expression E1, E2;
@@
- get_sha1_committish(E1, E2.hash)
+ get_oid_committish(E1, &E2)
@@
expression E1, E2;
@@
- get_sha1_committish(E1, E2->hash)
+ get_oid_committish(E1, E2)
@@
expression E1, E2;
@@
- get_sha1_treeish(E1, E2.hash)
+ get_oid_treeish(E1, &E2)
@@
expression E1, E2;
@@
- get_sha1_treeish(E1, E2->hash)
+ get_oid_treeish(E1, E2)
@@
expression E1, E2;
@@
- get_sha1_commit(E1, E2.hash)
+ get_oid_commit(E1, &E2)
@@
expression E1, E2;
@@
- get_sha1_commit(E1, E2->hash)
+ get_oid_commit(E1, E2)
@@
expression E1, E2;
@@
- get_sha1_tree(E1, E2.hash)
+ get_oid_tree(E1, &E2)
@@
expression E1, E2;
@@
- get_sha1_tree(E1, E2->hash)
+ get_oid_tree(E1, E2)
@@
expression E1, E2;
@@
- get_sha1_blob(E1, E2.hash)
+ get_oid_blob(E1, &E2)
@@
expression E1, E2;
@@
- get_sha1_blob(E1, E2->hash)
+ get_oid_blob(E1, E2)
@@
expression E1, E2, E3, E4;
@@
- get_sha1_with_context(E1, E2, E3.hash, E4)
+ get_oid_with_context(E1, E2, &E3, E4)
@@
expression E1, E2, E3, E4;
@@
- get_sha1_with_context(E1, E2, E3->hash, E4)
+ get_oid_with_context(E1, E2, E3, E4)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-07-14 02:49:28 +03:00
|
|
|
&oid, &oc)) {
|
grep: fix "--" rev/pathspec disambiguation
If we see "git grep pattern rev -- file" then we apply the
usual rev/pathspec disambiguation rules: any "rev" before
the "--" must be a revision, and we do not need to apply the
verify_non_filename() check.
But there are two bugs here:
1. We keep a seen_dashdash flag to handle this case, but
we set it in the same left-to-right pass over the
arguments in which we parse "rev".
So when we see "rev", we do not yet know that there is
a "--", and we mistakenly complain if there is a
matching file.
We can fix this by making a preliminary pass over the
arguments to find the "--", and only then checking the rev
arguments.
2. If we can't resolve "rev" but there isn't a dashdash,
that's OK. We treat it like a path, and complain later
if it doesn't exist.
But if there _is_ a dashdash, then we know it must be a
rev, and should treat it as such, complaining if it
does not resolve. The current code instead ignores it
and tries to treat it like a path.
This patch fixes both bugs, and tries to comment the parsing
flow a bit better.
It adds tests that cover the two bugs, but also some related
situations (which already worked, but this confirms that our
fixes did not break anything).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-14 09:05:55 +03:00
|
|
|
if (seen_dashdash)
|
|
|
|
die(_("unable to resolve revision: %s"), arg);
|
2017-02-14 09:04:17 +03:00
|
|
|
break;
|
grep: fix "--" rev/pathspec disambiguation
If we see "git grep pattern rev -- file" then we apply the
usual rev/pathspec disambiguation rules: any "rev" before
the "--" must be a revision, and we do not need to apply the
verify_non_filename() check.
But there are two bugs here:
1. We keep a seen_dashdash flag to handle this case, but
we set it in the same left-to-right pass over the
arguments in which we parse "rev".
So when we see "rev", we do not yet know that there is
a "--", and we mistakenly complain if there is a
matching file.
We can fix this by making a preliminary pass over the
arguments to find the "--", and only then checking the rev
arguments.
2. If we can't resolve "rev" but there isn't a dashdash,
that's OK. We treat it like a path, and complain later
if it doesn't exist.
But if there _is_ a dashdash, then we know it must be a
rev, and should treat it as such, complaining if it
does not resolve. The current code instead ignores it
and tries to treat it like a path.
This patch fixes both bugs, and tries to comment the parsing
flow a bit better.
It adds tests that cover the two bugs, but also some related
situations (which already worked, but this confirms that our
fixes did not break anything).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-14 09:05:55 +03:00
|
|
|
}
|
2017-02-14 09:04:17 +03:00
|
|
|
|
object: convert parse_object* to take struct object_id
Make parse_object, parse_object_or_die, and parse_object_buffer take a
pointer to struct object_id. Remove the temporary variables inserted
earlier, since they are no longer necessary. Transform all of the
callers using the following semantic patch:
@@
expression E1;
@@
- parse_object(E1.hash)
+ parse_object(&E1)
@@
expression E1;
@@
- parse_object(E1->hash)
+ parse_object(E1)
@@
expression E1, E2;
@@
- parse_object_or_die(E1.hash, E2)
+ parse_object_or_die(&E1, E2)
@@
expression E1, E2;
@@
- parse_object_or_die(E1->hash, E2)
+ parse_object_or_die(E1, E2)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1.hash, E2, E3, E4, E5)
+ parse_object_buffer(&E1, E2, E3, E4, E5)
@@
expression E1, E2, E3, E4, E5;
@@
- parse_object_buffer(E1->hash, E2, E3, E4, E5)
+ parse_object_buffer(E1, E2, E3, E4, E5)
Signed-off-by: brian m. carlson <sandals@crustytoothpaste.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-07 01:10:38 +03:00
|
|
|
object = parse_object_or_die(&oid, arg);
|
2017-02-14 09:04:17 +03:00
|
|
|
if (!seen_dashdash)
|
|
|
|
verify_non_filename(prefix, arg);
|
|
|
|
add_object_array_with_path(object, arg, &list, oc.mode, oc.path);
|
2017-05-19 15:54:43 +03:00
|
|
|
free(oc.path);
|
2006-05-02 02:58:29 +04:00
|
|
|
}
|
2006-05-09 10:55:47 +04:00
|
|
|
|
grep: fix "--" rev/pathspec disambiguation
If we see "git grep pattern rev -- file" then we apply the
usual rev/pathspec disambiguation rules: any "rev" before
the "--" must be a revision, and we do not need to apply the
verify_non_filename() check.
But there are two bugs here:
1. We keep a seen_dashdash flag to handle this case, but
we set it in the same left-to-right pass over the
arguments in which we parse "rev".
So when we see "rev", we do not yet know that there is
a "--", and we mistakenly complain if there is a
matching file.
We can fix this by making a preliminary pass over the
arguments to find the "--", and only then checking the rev
arguments.
2. If we can't resolve "rev" but there isn't a dashdash,
that's OK. We treat it like a path, and complain later
if it doesn't exist.
But if there _is_ a dashdash, then we know it must be a
rev, and should treat it as such, complaining if it
does not resolve. The current code instead ignores it
and tries to treat it like a path.
This patch fixes both bugs, and tries to comment the parsing
flow a bit better.
It adds tests that cover the two bugs, but also some related
situations (which already worked, but this confirms that our
fixes did not break anything).
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-02-14 09:05:55 +03:00
|
|
|
/*
|
|
|
|
* Anything left over is presumed to be a path. But in the non-dashdash
|
|
|
|
* "do what I mean" case, we verify and complain when that isn't true.
|
|
|
|
*/
|
2017-02-14 09:02:38 +03:00
|
|
|
if (!seen_dashdash) {
|
|
|
|
int j;
|
|
|
|
for (j = i; j < argc; j++)
|
2017-02-15 00:54:36 +03:00
|
|
|
verify_filename(prefix, argv[j], j == i && allow_revs);
|
2017-02-14 09:02:38 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
parse_pathspec(&pathspec, 0,
|
|
|
|
PATHSPEC_PREFER_CWD |
|
|
|
|
(opt.max_depth != -1 ? PATHSPEC_MAXDEPTH_VALID : 0),
|
|
|
|
prefix, argv + i);
|
|
|
|
pathspec.max_depth = opt.max_depth;
|
|
|
|
pathspec.recursive = 1;
|
2017-12-05 03:07:34 +03:00
|
|
|
pathspec.recurse_submodules = !!recurse_submodules;
|
2017-02-14 09:02:38 +03:00
|
|
|
|
2020-02-14 23:54:20 +03:00
|
|
|
if (recurse_submodules && untracked)
|
|
|
|
die(_("--untracked not supported with --recurse-submodules"));
|
grep: allow submodule functions to run in parallel
Now that object reading operations are internally protected, the
submodule initialization functions at builtin/grep.c:grep_submodule()
are very close to being thread-safe. Let's take a look at each call and
remove from the critical section what we can, for better performance:
- submodule_from_path() and is_submodule_active() cannot be called in
parallel yet only because they call repo_read_gitmodules() which
contains, in its call stack, operations that would otherwise be in
race condition with object reading (for example parse_object() and
is_promisor_remote()). However, they only call repo_read_gitmodules()
if it wasn't read before. So let's pre-read it before firing the
threads and allow these two functions to safely be called in
parallel.
- repo_submodule_init() is already thread-safe, so remove it from the
critical section without other necessary changes.
- The repo_read_gitmodules(&subrepo) call at grep_submodule() is safe as
no other thread is performing object reading operations in the subrepo
yet. However, threads might be working in the superproject, and this
function calls add_to_alternates_memory() internally, which is racy
with object readings in the superproject. So it must be kept
protected for now. Let's add a "NEEDSWORK" to it, informing why it
cannot be removed from the critical section yet.
- Finally, add_to_alternates_memory() must be kept protected for the
same reason as the item above.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:56 +03:00
|
|
|
|
2022-06-22 22:47:32 +03:00
|
|
|
/*
|
|
|
|
* Optimize out the case where the amount of matches is limited to zero.
|
|
|
|
* We do this to keep results consistent with GNU grep(1).
|
|
|
|
*/
|
|
|
|
if (opt.max_count == 0)
|
|
|
|
return 1;
|
|
|
|
|
grep: re-enable threads in non-worktree case
They were disabled at 53b8d93 ("grep: disable threading in non-worktree
case", 12-12-2011), due to observable performance drops (to the point
that using a single thread would be faster than multiple threads). But
now that zlib inflation can be performed in parallel we can regain the
speedup, so let's re-enable threads in non-worktree grep.
Grepping 'abcd[02]' ("Regex 1") and '(static|extern) (int|double) \*'
("Regex 2") at chromium's repository[1] I got:
Threads | Regex 1 | Regex 2
---------|------------|-----------
1 | 17.2920s | 20.9624s
2 | 9.6512s | 11.3184s
4 | 6.7723s | 7.6268s
8** | 6.2886s | 6.9843s
These are all means of 30 executions after 2 warmup runs. All tests were
executed on an i7-7700HQ (quad-core w/ hyper-threading), 16GB of RAM and
SSD, running Manjaro Linux. But to make sure the optimization also
performs well on HDD, the tests were repeated on another machine with an
i5-4210U (dual-core w/ hyper-threading), 8GB of RAM and HDD (SATA III,
5400 rpm), also running Manjaro Linux:
Threads | Regex 1 | Regex 2
---------|------------|-----------
1 | 18.4035s | 22.5368s
2 | 12.5063s | 14.6409s
4** | 10.9136s | 12.7106s
** Note that in these cases we relied on hyper-threading, and that's
probably why we don't see a big difference in time.
Unfortunately, multithreaded git-grep might be slow in the non-worktree
case when --textconv is used and there're too many text conversions.
Probably the reason for this is that the object read lock is used to
protect fill_textconv() and therefore there is a mutual exclusion
between textconv execution and object reading. Because both are
time-consuming operations, not being able to perform them in parallel
can cause performance drops. To inform the users about this (and other
threading details), let's also add a "NOTES ON THREADS" section to
Documentation/git-grep.txt.
[1]: chromium’s repo at commit 03ae96f (“Add filters testing at DSF=2”,
04-06-2019), after a 'git gc' execution.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:58 +03:00
|
|
|
if (show_in_pager) {
|
2018-11-03 11:48:44 +03:00
|
|
|
if (num_threads > 1)
|
|
|
|
warning(_("invalid option combination, ignoring --threads"));
|
|
|
|
num_threads = 1;
|
|
|
|
} else if (!HAVE_THREADS && num_threads > 1) {
|
2017-05-25 22:45:34 +03:00
|
|
|
warning(_("no threads support, ignoring --threads"));
|
2018-11-03 11:48:44 +03:00
|
|
|
num_threads = 1;
|
|
|
|
} else if (num_threads < 0)
|
|
|
|
die(_("invalid number of threads specified (%d)"), num_threads);
|
|
|
|
else if (num_threads == 0)
|
2020-01-16 05:40:00 +03:00
|
|
|
num_threads = HAVE_THREADS ? online_cpus() : 1;
|
2011-12-13 01:16:08 +04:00
|
|
|
|
2018-11-03 11:48:44 +03:00
|
|
|
if (num_threads > 1) {
|
|
|
|
if (!HAVE_THREADS)
|
|
|
|
BUG("Somebody got num_threads calculation wrong!");
|
|
|
|
if (!(opt.name_only || opt.unmatch_name_only || opt.count)
|
|
|
|
&& (opt.pre_context || opt.post_context ||
|
|
|
|
opt.file_break || opt.funcbody))
|
|
|
|
skip_first_line = 1;
|
grep: allow submodule functions to run in parallel
Now that object reading operations are internally protected, the
submodule initialization functions at builtin/grep.c:grep_submodule()
are very close to being thread-safe. Let's take a look at each call and
remove from the critical section what we can, for better performance:
- submodule_from_path() and is_submodule_active() cannot be called in
parallel yet only because they call repo_read_gitmodules() which
contains, in its call stack, operations that would otherwise be in
race condition with object reading (for example parse_object() and
is_promisor_remote()). However, they only call repo_read_gitmodules()
if it wasn't read before. So let's pre-read it before firing the
threads and allow these two functions to safely be called in
parallel.
- repo_submodule_init() is already thread-safe, so remove it from the
critical section without other necessary changes.
- The repo_read_gitmodules(&subrepo) call at grep_submodule() is safe as
no other thread is performing object reading operations in the subrepo
yet. However, threads might be working in the superproject, and this
function calls add_to_alternates_memory() internally, which is racy
with object readings in the superproject. So it must be kept
protected for now. Let's add a "NEEDSWORK" to it, informing why it
cannot be removed from the critical section yet.
- Finally, add_to_alternates_memory() must be kept protected for the
same reason as the item above.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:56 +03:00
|
|
|
|
|
|
|
/*
|
2020-01-16 05:39:57 +03:00
|
|
|
* Pre-read gitmodules (if not read already) and force eager
|
|
|
|
* initialization of packed_git to prevent racy lazy
|
|
|
|
* reading/initialization once worker threads are started.
|
grep: allow submodule functions to run in parallel
Now that object reading operations are internally protected, the
submodule initialization functions at builtin/grep.c:grep_submodule()
are very close to being thread-safe. Let's take a look at each call and
remove from the critical section what we can, for better performance:
- submodule_from_path() and is_submodule_active() cannot be called in
parallel yet only because they call repo_read_gitmodules() which
contains, in its call stack, operations that would otherwise be in
race condition with object reading (for example parse_object() and
is_promisor_remote()). However, they only call repo_read_gitmodules()
if it wasn't read before. So let's pre-read it before firing the
threads and allow these two functions to safely be called in
parallel.
- repo_submodule_init() is already thread-safe, so remove it from the
critical section without other necessary changes.
- The repo_read_gitmodules(&subrepo) call at grep_submodule() is safe as
no other thread is performing object reading operations in the subrepo
yet. However, threads might be working in the superproject, and this
function calls add_to_alternates_memory() internally, which is racy
with object readings in the superproject. So it must be kept
protected for now. Let's add a "NEEDSWORK" to it, informing why it
cannot be removed from the critical section yet.
- Finally, add_to_alternates_memory() must be kept protected for the
same reason as the item above.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:56 +03:00
|
|
|
*/
|
|
|
|
if (recurse_submodules)
|
|
|
|
repo_read_gitmodules(the_repository, 1);
|
2020-01-16 05:39:57 +03:00
|
|
|
if (startup_info->have_repository)
|
|
|
|
(void)get_packed_git(the_repository);
|
grep: allow submodule functions to run in parallel
Now that object reading operations are internally protected, the
submodule initialization functions at builtin/grep.c:grep_submodule()
are very close to being thread-safe. Let's take a look at each call and
remove from the critical section what we can, for better performance:
- submodule_from_path() and is_submodule_active() cannot be called in
parallel yet only because they call repo_read_gitmodules() which
contains, in its call stack, operations that would otherwise be in
race condition with object reading (for example parse_object() and
is_promisor_remote()). However, they only call repo_read_gitmodules()
if it wasn't read before. So let's pre-read it before firing the
threads and allow these two functions to safely be called in
parallel.
- repo_submodule_init() is already thread-safe, so remove it from the
critical section without other necessary changes.
- The repo_read_gitmodules(&subrepo) call at grep_submodule() is safe as
no other thread is performing object reading operations in the subrepo
yet. However, threads might be working in the superproject, and this
function calls add_to_alternates_memory() internally, which is racy
with object readings in the superproject. So it must be kept
protected for now. Let's add a "NEEDSWORK" to it, informing why it
cannot be removed from the critical section yet.
- Finally, add_to_alternates_memory() must be kept protected for the
same reason as the item above.
Signed-off-by: Matheus Tavares <matheus.bernardino@usp.br>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2020-01-16 05:39:56 +03:00
|
|
|
|
2018-11-03 11:48:44 +03:00
|
|
|
start_threads(&opt);
|
2018-11-03 11:48:43 +03:00
|
|
|
} else {
|
grep: don't redundantly compile throwaway patterns under threading
Change the pattern compilation logic under threading so that grep
doesn't compile a pattern it never ends up using on the non-threaded
code path, only to compile it again N times for N threads which will
each use their own copy, ignoring the initially compiled pattern.
This redundant compilation dates back to the initial introduction of
the threaded grep in commit 5b594f457a ("Threaded grep",
2010-01-25).
There was never any reason for doing this redundant work other than an
oversight in the initial commit. Jeff King suggested on-list in
<20170414212325.fefrl3qdjigwyitd@sigill.intra.peff.net> that this
might be needed to check the pattern for sanity before threaded
execution commences.
That's not the case. The pattern is compiled under threading in
start_threads() before any concurrent execution has started by calling
pthread_create(), so if the pattern contains an error we still do the
right thing. I.e. die with one error before any threaded execution has
commenced, instead of e.g. spewing out an error for each of the N threads,
which could be a regression a change like this might inadvertently
introduce.
This change is not meant as an optimization; any performance gains
from this are in the hundreds to thousands of nanoseconds at most. If
we wanted more performance here we could just re-use the compiled
patterns in multiple threads (regcomp(3) is thread-safe), or partially
re-use them and the associated structures in the case of later PCRE
JIT changes.
Rather, it's just to make the code easier to reason about. It's
confusing to debug this under threading & non-threading when the
threading codepaths redundantly compile a pattern which is never used.
The patterns are recompiled as a side-effect of duplicating the whole
grep_opt structure, which is not thread-safe: it is writable and munged
during execution. The grep_opt structure then
points to the grep_pat structure where the pattern or patterns are stored.
I looked into e.g. splitting the API into some "do & alloc threadsafe
stuff", "spawn thread", "do and alloc non-threadsafe stuff", but the
execution time of grep_opt_dup() & pattern compilation is trivial
compared to actually executing the grep, so there was no point. Even
with the more expensive JIT changes to follow, the most expensive PCRE
patterns take something like 0.0X milliseconds to compile at most[1].
The undocumented --debug mode added in commit 17bf35a3c7 ("grep: teach
--debug option to dump the parse tree", 2012-09-13) still works
properly with this change. It only emits debugging info during pattern
compilation, which is now dumped by the pattern compiled just before
the first thread is started.
1. http://sljit.sourceforge.net/pcre.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 23:05:22 +03:00
|
|
|
/*
|
|
|
|
* The compiled patterns on the main path are only
|
|
|
|
* used when not using threading. Otherwise
|
2018-11-03 11:48:44 +03:00
|
|
|
* start_threads() above calls compile_grep_patterns()
|
grep: don't redundantly compile throwaway patterns under threading
Change the pattern compilation logic under threading so that grep
doesn't compile a pattern it never ends up using on the non-threaded
code path, only to compile it again N times for N threads which will
each use their own copy, ignoring the initially compiled pattern.
This redundant compilation dates back to the initial introduction of
the threaded grep in commit 5b594f457a ("Threaded grep",
2010-01-25).
There was never any reason for doing this redundant work other than an
oversight in the initial commit. Jeff King suggested on-list in
<20170414212325.fefrl3qdjigwyitd@sigill.intra.peff.net> that this
might be needed to check the pattern for sanity before threaded
execution commences.
That's not the case. The pattern is compiled under threading in
start_threads() before any concurrent execution has started by calling
pthread_create(), so if the pattern contains an error we still do the
right thing. I.e. die with one error before any threaded execution has
commenced, instead of e.g. spewing out an error for each of the N threads,
which could be a regression a change like this might inadvertently
introduce.
This change is not meant as an optimization; any performance gains
from this are in the hundreds to thousands of nanoseconds at most. If
we wanted more performance here we could just re-use the compiled
patterns in multiple threads (regcomp(3) is thread-safe), or partially
re-use them and the associated structures in the case of later PCRE
JIT changes.
Rather, it's just to make the code easier to reason about. It's
confusing to debug this under threading & non-threading when the
threading codepaths redundantly compile a pattern which is never used.
The patterns are recompiled as a side-effect of duplicating the whole
grep_opt structure, which is not thread-safe: it is writable and munged
during execution. The grep_opt structure then
points to the grep_pat structure where the pattern or patterns are stored.
I looked into e.g. splitting the API into some "do & alloc threadsafe
stuff", "spawn thread", "do and alloc non-threadsafe stuff", but the
execution time of grep_opt_dup() & pattern compilation is trivial
compared to actually executing the grep, so there was no point. Even
with the more expensive JIT changes to follow, the most expensive PCRE
patterns take something like 0.0X milliseconds to compile at most[1].
The undocumented --debug mode added in commit 17bf35a3c7 ("grep: teach
--debug option to dump the parse tree", 2012-09-13) still works
properly with this change. It only emits debugging info during pattern
compilation, which is now dumped by the pattern compiled just before
the first thread is started.
1. http://sljit.sourceforge.net/pcre.html
Signed-off-by: Ævar Arnfjörð Bjarmason <avarab@gmail.com>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2017-05-25 23:05:22 +03:00
|
|
|
* for each thread.
|
|
|
|
*/
|
|
|
|
compile_grep_patterns(&opt);
|
2011-12-13 01:16:08 +04:00
|
|
|
}
|
|
|
|
|
2010-06-12 20:36:51 +04:00
|
|
|
if (show_in_pager && (cached || list.nr))
|
2011-02-23 02:41:56 +03:00
|
|
|
die(_("--open-files-in-pager only works on the worktree"));
|
2010-06-12 20:36:51 +04:00
|
|
|
|
|
|
|
if (show_in_pager && opt.pattern_list && !opt.pattern_list->next) {
|
|
|
|
const char *pager = path_list.items[0].string;
|
|
|
|
int len = strlen(pager);
|
|
|
|
|
|
|
|
if (len > 4 && is_dir_sep(pager[len - 5]))
|
|
|
|
pager += len - 4;
|
|
|
|
|
2011-02-08 09:17:24 +03:00
|
|
|
if (opt.ignore_case && !strcmp("less", pager))
|
|
|
|
string_list_append(&path_list, "-I");
|
|
|
|
|
2010-06-12 20:36:51 +04:00
|
|
|
if (!strcmp("less", pager) || !strcmp("vi", pager)) {
|
|
|
|
struct strbuf buf = STRBUF_INIT;
|
|
|
|
strbuf_addf(&buf, "+/%s%s",
|
|
|
|
strcmp("less", pager) ? "" : "*",
|
|
|
|
opt.pattern_list->pattern);
|
2021-10-22 11:55:41 +03:00
|
|
|
string_list_append_nodup(&path_list,
|
|
|
|
strbuf_detach(&buf, NULL));
|
2010-06-12 20:36:51 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-03-18 21:00:13 +03:00
|
|
|
if (!show_in_pager && !opt.status_only)
|
2010-06-12 20:36:51 +04:00
|
|
|
setup_pager();
|
|
|
|
|
2022-02-01 01:07:46 +03:00
|
|
|
die_for_incompatible_opt3(!use_index, "--no-index",
|
|
|
|
untracked, "--untracked",
|
|
|
|
cached, "--cached");
|
2021-02-08 22:43:28 +03:00
|
|
|
|
2011-09-28 00:43:12 +04:00
|
|
|
if (!use_index || untracked) {
|
|
|
|
int use_exclude = (opt_exclude < 0) ? use_index : !!opt_exclude;
|
grep: turn off gitlink detection for --no-index
If we are running "git grep --no-index" outside of a git
repository, we behave roughly like "grep -r", examining all
files in the current directory and its subdirectories.
However, because we use fill_directory() to do the
recursion, it will skip over any directories which look like
sub-repositories.
For a normal git operation (like "git grep" in a repository)
this makes sense; we do not want to cross the boundary out
of our current repository into a submodule. But for
"--no-index" without a repository, we should look at all
files, including embedded repositories.
There is one exception, though: we probably should _not_
descend into ".git" directories. Doing so is inefficient and
unlikely to turn up useful hits.
This patch drops our use of dir.c's gitlink-detection, but
we do still avoid ".git". That makes us more like tools such
as "ack" or "ag", which also know to avoid cruft in .git.
As a bonus, this also drops our usage of the ref code
when we are outside of a repository, making the transition
to pluggable ref backends cleaner.
Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
2016-03-07 18:51:21 +03:00
|
|
|
hit = grep_directory(&opt, &pathspec, use_exclude, use_index);
|
2011-10-05 05:40:41 +04:00
|
|
|
} else if (0 <= opt_exclude) {
|
2018-07-21 10:49:19 +03:00
|
|
|
die(_("--[no-]exclude-standard cannot be used for tracked contents"));
|
2010-06-12 20:32:11 +04:00
|
|
|
} else if (!list.nr) {
|
2008-08-28 17:04:30 +04:00
|
|
|
if (!cached)
|
|
|
|
setup_work_tree();
|
2010-01-26 01:51:39 +03:00
|
|
|
|
2019-01-12 05:13:22 +03:00
|
|
|
hit = grep_cache(&opt, &pathspec, cached);
|
2010-06-12 20:32:11 +04:00
|
|
|
} else {
|
|
|
|
if (cached)
|
2018-07-21 10:49:19 +03:00
|
|
|
die(_("both --cached and trees are given"));
|
2017-08-02 22:49:23 +03:00
|
|
|
|
2018-03-29 01:35:27 +03:00
|
|
|
hit = grep_objects(&opt, &pathspec, &list);
|
2006-05-01 10:28:15 +04:00
|
|
|
}
|
2010-01-26 01:51:39 +03:00
|
|
|
|
2018-11-03 11:48:44 +03:00
|
|
|
if (num_threads > 1)
|
2010-01-26 01:51:39 +03:00
|
|
|
hit |= wait_all();
|
2010-06-12 20:36:51 +04:00
|
|
|
if (hit && show_in_pager)
|
|
|
|
run_pager(&opt, prefix);
|
2017-04-09 22:59:00 +03:00
|
|
|
clear_pathspec(&pathspec);
|
2021-10-22 11:55:41 +03:00
|
|
|
string_list_clear(&path_list, 0);
|
2006-09-28 03:27:10 +04:00
|
|
|
free_grep_patterns(&opt);
|
2021-10-22 11:55:40 +03:00
|
|
|
object_array_clear(&list);
|
2021-08-17 00:09:55 +03:00
|
|
|
free_repos();
|
2006-05-01 10:28:15 +04:00
|
|
|
return !hit;
|
|
|
|
}
|