зеркало из https://github.com/microsoft/git.git
Merge branch 'nd/icase' into maint

"git grep -i" has been taught to fold case in non-ascii locales correctly.

* nd/icase:
  grep.c: reuse "icase" variable
  diffcore-pickaxe: support case insensitive match on non-ascii
  diffcore-pickaxe: Add regcomp_or_die()
  grep/pcre: support utf-8
  gettext: add is_utf8_locale()
  grep/pcre: prepare locale-dependent tables for icase matching
  grep: rewrite an if/else condition to avoid duplicate expression
  grep/icase: avoid kwsset when -F is specified
  grep/icase: avoid kwsset on literal non-ascii strings
  test-regex: expose full regcomp() to the command line
  test-regex: isolate the bug test code
  grep: break down an "if" stmt in preparation for next changes
This commit is contained in:
Коммит
6cbec0da47
|
@ -7,6 +7,8 @@
|
|||
#include "diffcore.h"
|
||||
#include "xdiff-interface.h"
|
||||
#include "kwset.h"
|
||||
#include "commit.h"
|
||||
#include "quote.h"
|
||||
|
||||
typedef int (*pickaxe_fn)(mmfile_t *one, mmfile_t *two,
|
||||
struct diff_options *o,
|
||||
|
@ -198,6 +200,18 @@ static void pickaxe(struct diff_queue_struct *q, struct diff_options *o,
|
|||
*q = outq;
|
||||
}
|
||||
|
||||
/*
 * Compile `needle` into `regex` using the given regcomp() flags.
 *
 * Returns normally only when compilation succeeded; on failure the
 * regerror() text for the error code is reported through die().
 */
static void regcomp_or_die(regex_t *regex, const char *needle, int cflags)
{
	int err = regcomp(regex, needle, cflags);
	if (err) {
		/* The POSIX.2 people are surely sick */
		char errbuf[1024];
		/*
		 * Deliberately no regfree() here: POSIX leaves the
		 * contents of the regex_t undefined after a failed
		 * regcomp(), so there is nothing that is safe to
		 * release before we die().
		 */
		regerror(err, regex, errbuf, sizeof(errbuf));
		die("invalid regex: %s", errbuf);
	}
}
|
||||
|
||||
void diffcore_pickaxe(struct diff_options *o)
|
||||
{
|
||||
const char *needle = o->pickaxe;
|
||||
|
@ -206,18 +220,19 @@ void diffcore_pickaxe(struct diff_options *o)
|
|||
kwset_t kws = NULL;
|
||||
|
||||
if (opts & (DIFF_PICKAXE_REGEX | DIFF_PICKAXE_KIND_G)) {
|
||||
int err;
|
||||
int cflags = REG_EXTENDED | REG_NEWLINE;
|
||||
if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE))
|
||||
cflags |= REG_ICASE;
|
||||
err = regcomp(&regex, needle, cflags);
|
||||
if (err) {
|
||||
/* The POSIX.2 people are surely sick */
|
||||
char errbuf[1024];
|
||||
regerror(err, &regex, errbuf, 1024);
|
||||
regfree(&regex);
|
||||
die("invalid regex: %s", errbuf);
|
||||
}
|
||||
regcomp_or_die(&regex, needle, cflags);
|
||||
regexp = &regex;
|
||||
} else if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE) &&
|
||||
has_non_ascii(needle)) {
|
||||
struct strbuf sb = STRBUF_INIT;
|
||||
int cflags = REG_NEWLINE | REG_ICASE;
|
||||
|
||||
basic_regex_quote_buf(&sb, needle);
|
||||
regcomp_or_die(&regex, sb.buf, cflags);
|
||||
strbuf_release(&sb);
|
||||
regexp = &regex;
|
||||
} else {
|
||||
kws = kwsalloc(DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE)
|
||||
|
|
24
gettext.c
24
gettext.c
|
@ -18,6 +18,8 @@
|
|||
# endif
|
||||
#endif
|
||||
|
||||
static const char *charset;
|
||||
|
||||
/*
|
||||
* Guess the user's preferred languages from the value in LANGUAGE environment
|
||||
* variable and LC_MESSAGES locale category if NO_GETTEXT is not defined.
|
||||
|
@ -65,7 +67,6 @@ static int test_vsnprintf(const char *fmt, ...)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static const char *charset;
|
||||
static void init_gettext_charset(const char *domain)
|
||||
{
|
||||
/*
|
||||
|
@ -172,8 +173,27 @@ int gettext_width(const char *s)
|
|||
{
|
||||
static int is_utf8 = -1;
|
||||
if (is_utf8 == -1)
|
||||
is_utf8 = !strcmp(charset, "UTF-8");
|
||||
is_utf8 = is_utf8_locale();
|
||||
|
||||
return is_utf8 ? utf8_strwidth(s) : strlen(s);
|
||||
}
|
||||
#endif
|
||||
|
||||
int is_utf8_locale(void)
|
||||
{
|
||||
#ifdef NO_GETTEXT
|
||||
if (!charset) {
|
||||
const char *env = getenv("LC_ALL");
|
||||
if (!env || !*env)
|
||||
env = getenv("LC_CTYPE");
|
||||
if (!env || !*env)
|
||||
env = getenv("LANG");
|
||||
if (!env)
|
||||
env = "";
|
||||
if (strchr(env, '.'))
|
||||
env = strchr(env, '.') + 1;
|
||||
charset = xstrdup(env);
|
||||
}
|
||||
#endif
|
||||
return is_encoding_utf8(charset);
|
||||
}
|
||||
|
|
|
@ -90,5 +90,6 @@ const char *Q_(const char *msgid, const char *plu, unsigned long n)
|
|||
#endif
|
||||
|
||||
const char *get_preferred_languages(void);
|
||||
extern int is_utf8_locale(void);
|
||||
|
||||
#endif
|
||||
|
|
64
grep.c
64
grep.c
|
@ -4,6 +4,8 @@
|
|||
#include "xdiff-interface.h"
|
||||
#include "diff.h"
|
||||
#include "diffcore.h"
|
||||
#include "commit.h"
|
||||
#include "quote.h"
|
||||
|
||||
static int grep_source_load(struct grep_source *gs);
|
||||
static int grep_source_is_binary(struct grep_source *gs);
|
||||
|
@ -322,11 +324,16 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt)
|
|||
int erroffset;
|
||||
int options = PCRE_MULTILINE;
|
||||
|
||||
if (opt->ignore_case)
|
||||
if (opt->ignore_case) {
|
||||
if (has_non_ascii(p->pattern))
|
||||
p->pcre_tables = pcre_maketables();
|
||||
options |= PCRE_CASELESS;
|
||||
}
|
||||
if (is_utf8_locale() && has_non_ascii(p->pattern))
|
||||
options |= PCRE_UTF8;
|
||||
|
||||
p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset,
|
||||
NULL);
|
||||
p->pcre_tables);
|
||||
if (!p->pcre_regexp)
|
||||
compile_regexp_failed(p, error);
|
||||
|
||||
|
@ -360,6 +367,7 @@ static void free_pcre_regexp(struct grep_pat *p)
|
|||
{
|
||||
pcre_free(p->pcre_regexp);
|
||||
pcre_free(p->pcre_extra_info);
|
||||
pcre_free((void *)p->pcre_tables);
|
||||
}
|
||||
#else /* !USE_LIBPCRE */
|
||||
static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt)
|
||||
|
@ -396,26 +404,68 @@ static int is_fixed(const char *s, size_t len)
|
|||
return 1;
|
||||
}
|
||||
|
||||
static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt)
|
||||
{
|
||||
struct strbuf sb = STRBUF_INIT;
|
||||
int err;
|
||||
int regflags;
|
||||
|
||||
basic_regex_quote_buf(&sb, p->pattern);
|
||||
regflags = opt->regflags & ~REG_EXTENDED;
|
||||
if (opt->ignore_case)
|
||||
regflags |= REG_ICASE;
|
||||
err = regcomp(&p->regexp, sb.buf, regflags);
|
||||
if (opt->debug)
|
||||
fprintf(stderr, "fixed %s\n", sb.buf);
|
||||
strbuf_release(&sb);
|
||||
if (err) {
|
||||
char errbuf[1024];
|
||||
regerror(err, &p->regexp, errbuf, sizeof(errbuf));
|
||||
regfree(&p->regexp);
|
||||
compile_regexp_failed(p, errbuf);
|
||||
}
|
||||
}
|
||||
|
||||
static void compile_regexp(struct grep_pat *p, struct grep_opt *opt)
|
||||
{
|
||||
int icase, ascii_only;
|
||||
int err;
|
||||
|
||||
p->word_regexp = opt->word_regexp;
|
||||
p->ignore_case = opt->ignore_case;
|
||||
icase = opt->regflags & REG_ICASE || p->ignore_case;
|
||||
ascii_only = !has_non_ascii(p->pattern);
|
||||
|
||||
/*
|
||||
* Even when -F (fixed) asks us to do a non-regexp search, we
|
||||
* may not be able to correctly case-fold when -i
|
||||
* (ignore-case) is asked (in which case, we'll synthesize a
|
||||
* regexp to match the pattern that matches regexp special
|
||||
* characters literally, while ignoring case differences). On
|
||||
* the other hand, even without -F, if the pattern does not
|
||||
* have any regexp special characters and there is no need for
|
||||
* case-folding search, we can internally turn it into a
|
||||
* simple string match using kws. p->fixed tells us if we
|
||||
* want to use kws.
|
||||
*/
|
||||
if (opt->fixed || is_fixed(p->pattern, p->patternlen))
|
||||
p->fixed = 1;
|
||||
p->fixed = !icase || ascii_only;
|
||||
else
|
||||
p->fixed = 0;
|
||||
|
||||
if (p->fixed) {
|
||||
if (opt->regflags & REG_ICASE || p->ignore_case)
|
||||
p->kws = kwsalloc(tolower_trans_tbl);
|
||||
else
|
||||
p->kws = kwsalloc(NULL);
|
||||
p->kws = kwsalloc(icase ? tolower_trans_tbl : NULL);
|
||||
kwsincr(p->kws, p->pattern, p->patternlen);
|
||||
kwsprep(p->kws);
|
||||
return;
|
||||
} else if (opt->fixed) {
|
||||
/*
|
||||
* We come here when the pattern has the non-ascii
|
||||
* characters we cannot case-fold, and asked to
|
||||
* ignore-case.
|
||||
*/
|
||||
compile_fixed_regexp(p, opt);
|
||||
return;
|
||||
}
|
||||
|
||||
if (opt->pcre) {
|
||||
|
|
1
grep.h
1
grep.h
|
@ -48,6 +48,7 @@ struct grep_pat {
|
|||
regex_t regexp;
|
||||
pcre *pcre_regexp;
|
||||
pcre_extra *pcre_extra_info;
|
||||
const unsigned char *pcre_tables;
|
||||
kwset_t kws;
|
||||
unsigned fixed:1;
|
||||
unsigned ignore_case:1;
|
||||
|
|
37
quote.c
37
quote.c
|
@ -453,3 +453,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src)
|
|||
}
|
||||
strbuf_addch(sb, '"');
|
||||
}
|
||||
|
||||
/*
 * Append to `sb` a copy of `src` with a backslash inserted in front of
 * the characters this function treats as special in a basic regular
 * expression:
 *
 *   - '^' when it is the very first character;
 *   - '[', '.', '\\' and '*' anywhere after the optional leading '^'
 *     and the optional '*' that directly follows it;
 *   - '$' when it is the very last character.
 *
 * A '*' at the start (or right after the leading '^') is copied without
 * a backslash.
 *
 * NOTE(review): for input starting with "^*" the output starts with
 * "\^*"; confirm the regex engine does not read that '*' as a
 * repetition of the escaped '^'.
 */
void basic_regex_quote_buf(struct strbuf *sb, const char *src)
{
	const char *p = src;

	if (*p == '^') {
		/* only beginning '^' is special and needs quoting */
		strbuf_addch(sb, '\\');
		strbuf_addch(sb, *p);
		p++;
	}
	if (*p == '*') {
		/* beginning '*' is not special, no quoting */
		strbuf_addch(sb, *p);
		p++;
	}

	for (; *p; p++) {
		/* only the end '$' is special and needs quoting */
		int trailing_dollar = (*p == '$' && p[1] == '\0');

		if (*p == '[' || *p == '.' || *p == '\\' || *p == '*' ||
		    trailing_dollar)
			strbuf_addch(sb, '\\');
		strbuf_addch(sb, *p);
	}
}
|
||||
|
|
1
quote.h
1
quote.h
|
@ -70,5 +70,6 @@ extern char *quote_path_relative(const char *in, const char *prefix,
|
|||
extern void perl_quote_buf(struct strbuf *sb, const char *src);
|
||||
extern void python_quote_buf(struct strbuf *sb, const char *src);
|
||||
extern void tcl_quote_buf(struct strbuf *sb, const char *src);
|
||||
extern void basic_regex_quote_buf(struct strbuf *sb, const char *src);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -1,6 +1,23 @@
|
|||
#include "git-compat-util.h"
|
||||
#include "gettext.h"
|
||||
|
||||
int main(int argc, char **argv)
|
||||
/* Pairs a flag name given on the test-regex command line with its REG_* bit. */
struct reg_flag {
	const char *name;
	int flag;
};

/*
 * Flag names understood by test-regex.  The list ends with an entry
 * whose name is NULL.  STARTEND is only offered where <regex.h>
 * defines REG_STARTEND.
 */
static struct reg_flag reg_flags[] = {
	{ "EXTENDED",	REG_EXTENDED	},
	{ "NEWLINE",	REG_NEWLINE	},
	{ "ICASE",	REG_ICASE	},
	{ "NOTBOL",	REG_NOTBOL	},
#ifdef REG_STARTEND
	{ "STARTEND",	REG_STARTEND	},
#endif
	{ NULL,		0		}
};
|
||||
|
||||
static int test_regex_bug(void)
|
||||
{
|
||||
char *pat = "[^={} \t]+";
|
||||
char *str = "={}\nfred";
|
||||
|
@ -16,5 +33,43 @@ int main(int argc, char **argv)
|
|||
if (m[0].rm_so == 3) /* matches '\n' when it should not */
|
||||
die("regex bug confirmed: re-build git with NO_REGEX=1");
|
||||
|
||||
exit(0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
const char *pat;
|
||||
const char *str;
|
||||
int flags = 0;
|
||||
regex_t r;
|
||||
regmatch_t m[1];
|
||||
|
||||
if (argc == 2 && !strcmp(argv[1], "--bug"))
|
||||
return test_regex_bug();
|
||||
else if (argc < 3)
|
||||
usage("test-regex --bug\n"
|
||||
"test-regex <pattern> <string> [<options>]");
|
||||
|
||||
argv++;
|
||||
pat = *argv++;
|
||||
str = *argv++;
|
||||
while (*argv) {
|
||||
struct reg_flag *rf;
|
||||
for (rf = reg_flags; rf->name; rf++)
|
||||
if (!strcmp(*argv, rf->name)) {
|
||||
flags |= rf->flag;
|
||||
break;
|
||||
}
|
||||
if (!rf->name)
|
||||
die("do not recognize %s", *argv);
|
||||
argv++;
|
||||
}
|
||||
git_setup_gettext();
|
||||
|
||||
if (regcomp(&r, pat, flags))
|
||||
die("failed regcomp() for pattern '%s'", pat);
|
||||
if (regexec(&r, str, 1, m, 0))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -31,7 +31,7 @@ test_expect_success 'git_mkstemps_mode does not fail if fd 0 is not open' '
|
|||
|
||||
test_expect_success 'check for a bug in the regex routines' '
|
||||
# if this test fails, re-build git with NO_REGEX=1
|
||||
test-regex
|
||||
test-regex --bug
|
||||
'
|
||||
|
||||
test_done
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Exercises "git grep -i" (plain, -F and --perl-regexp) and "git log -i -S"
# with patterns containing non-ASCII characters, with LC_ALL set from
# $is_IS_locale.
test_description='grep icase on non-English locales'
|
||||
|
||||
. ./lib-gettext.sh
|
||||
|
||||
# Write and stage the sample file, then set and export LC_ALL from
# $is_IS_locale.
test_expect_success GETTEXT_LOCALE 'setup' '
|
||||
test_write_lines "TILRAUN: Halló Heimur!" >file &&
|
||||
git add file &&
|
||||
LC_ALL="$is_IS_locale" &&
|
||||
export LC_ALL
|
||||
'
|
||||
|
||||
# Set the REGEX_LOCALE prerequisite only when GETTEXT_LOCALE is available and
# test-regex succeeds in matching "HALLÓ" against "Halló" with the ICASE flag.
test_have_prereq GETTEXT_LOCALE &&
|
||||
test-regex "HALLÓ" "Halló" ICASE &&
|
||||
test_set_prereq REGEX_LOCALE
|
||||
|
||||
# Without -F: the pattern as written in the file and its upper-cased form
# must both match under -i.
test_expect_success REGEX_LOCALE 'grep literal string, no -F' '
|
||||
git grep -i "TILRAUN: Halló Heimur!" &&
|
||||
git grep -i "TILRAUN: HALLÓ HEIMUR!"
|
||||
'
|
||||
|
||||
# --perl-regexp with a non-ASCII pattern: once case-sensitively, then with -i
# in the original case and in upper case.
test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 icase' '
|
||||
git grep --perl-regexp "TILRAUN: H.lló Heimur!" &&
|
||||
git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" &&
|
||||
git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!"
|
||||
'
|
||||
|
||||
# "ó+" has to match both "Halló" (file) and "Hallóó" (file2), so both file
# names are expected in the -l output.
test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 string with "+"' '
|
||||
test_write_lines "TILRAUN: Hallóó Heimur!" >file2 &&
|
||||
git add file2 &&
|
||||
git grep -l --perl-regexp "TILRAUN: H.lló+ Heimur!" >actual &&
|
||||
echo file >expected &&
|
||||
echo file2 >>expected &&
|
||||
test_cmp expected actual
|
||||
'
|
||||
|
||||
# With -i -F and a non-ASCII pattern, the --debug output is expected to contain
# a "fixed <pattern>" line; capture it from stderr and compare it with the
# expected text, for both the original-case and the upper-cased pattern.
test_expect_success REGEX_LOCALE 'grep literal string, with -F' '
|
||||
git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null |
|
||||
grep fixed >debug1 &&
|
||||
test_write_lines "fixed TILRAUN: Halló Heimur!" >expect1 &&
|
||||
test_cmp expect1 debug1 &&
|
||||
|
||||
git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!" 2>&1 >/dev/null |
|
||||
grep fixed >debug2 &&
|
||||
test_write_lines "fixed TILRAUN: HALLÓ HEIMUR!" >expect2 &&
|
||||
test_cmp expect2 debug2
|
||||
'
|
||||
|
||||
# Same check with a pattern full of regex metacharacters: the expected "fixed"
# line spells out which of them come back backslash-quoted.
test_expect_success REGEX_LOCALE 'grep string with regex, with -F' '
|
||||
test_write_lines "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file &&
|
||||
|
||||
git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null |
|
||||
grep fixed >debug1 &&
|
||||
test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 &&
|
||||
test_cmp expect1 debug1 &&
|
||||
|
||||
git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$" 2>&1 >/dev/null |
|
||||
grep fixed >debug2 &&
|
||||
test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 &&
|
||||
test_cmp expect2 debug2
|
||||
'
|
||||
|
||||
# Commit the staged content, then check that "git log -i -S" with an
# upper-cased non-ASCII needle lists that commit.
test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' '
|
||||
git commit -m first &&
|
||||
git log --format=%f -i -S"TILRAUN: HALLÓ HEIMUR!" >actual &&
|
||||
echo first >expected &&
|
||||
test_cmp expected actual
|
||||
'
|
||||
|
||||
test_done
|
|
@ -0,0 +1,19 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Exercises "git grep --perl-regexp -i" with a non-ASCII pattern, with LC_ALL
# set from $is_IS_iso_locale.
test_description='grep icase on non-English locales'
|
||||
|
||||
. ./lib-gettext.sh
|
||||
|
||||
# Write (without a trailing newline) and stage the sample file, then set and
# export LC_ALL from $is_IS_iso_locale.
test_expect_success GETTEXT_ISO_LOCALE 'setup' '
|
||||
printf "TILRAUN: Halló Heimur!" >file &&
|
||||
git add file &&
|
||||
LC_ALL="$is_IS_iso_locale" &&
|
||||
export LC_ALL
|
||||
'
|
||||
|
||||
# Both the original-case and the upper-cased pattern must match under -i.
test_expect_success GETTEXT_ISO_LOCALE,LIBPCRE 'grep pcre string' '
|
||||
git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" &&
|
||||
git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!"
|
||||
'
|
||||
|
||||
test_done
|
Загрузка…
Ссылка в новой задаче