зеркало из https://github.com/github/ruby.git
Fix memory leak in grapheme clusters
[Bug #20150] String#grapheme_cluters and String#each_grapheme_cluster leaks memory because if the string is not UTF-8, then the created regex will not be freed. For example: str = "hello world".encode(Encoding::UTF_32LE) 10.times do 1_000.times do str.grapheme_clusters end puts `ps -o rss= -p #{$$}` end Before: 26000 42256 59008 75792 92528 109232 125936 142672 159392 176160 After: 9264 9504 9808 10000 10128 10224 10352 10544 10704 10896
This commit is contained in:
Родитель
8f4eda5092
Коммит
b3d6128049
98
string.c
98
string.c
|
@ -9344,56 +9344,65 @@ static regex_t *
|
|||
get_reg_grapheme_cluster(rb_encoding *enc)
|
||||
{
|
||||
int encidx = rb_enc_to_index(enc);
|
||||
regex_t *reg_grapheme_cluster = NULL;
|
||||
static regex_t *reg_grapheme_cluster_utf8 = NULL;
|
||||
|
||||
/* synchronize */
|
||||
if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
|
||||
reg_grapheme_cluster = reg_grapheme_cluster_utf8;
|
||||
}
|
||||
if (!reg_grapheme_cluster) {
|
||||
const OnigUChar source_ascii[] = "\\X";
|
||||
OnigErrorInfo einfo;
|
||||
const OnigUChar *source = source_ascii;
|
||||
size_t source_len = sizeof(source_ascii) - 1;
|
||||
switch (encidx) {
|
||||
const OnigUChar source_ascii[] = "\\X";
|
||||
const OnigUChar *source = source_ascii;
|
||||
size_t source_len = sizeof(source_ascii) - 1;
|
||||
|
||||
switch (encidx) {
|
||||
#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
|
||||
#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
|
||||
#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
|
||||
#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
|
||||
#define CASE_UTF(e) \
|
||||
case ENCINDEX_UTF_##e: { \
|
||||
static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
|
||||
source = source_UTF_##e; \
|
||||
source_len = sizeof(source_UTF_##e); \
|
||||
break; \
|
||||
}
|
||||
CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
|
||||
case ENCINDEX_UTF_##e: { \
|
||||
static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
|
||||
source = source_UTF_##e; \
|
||||
source_len = sizeof(source_UTF_##e); \
|
||||
break; \
|
||||
}
|
||||
CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
|
||||
#undef CASE_UTF
|
||||
#undef CHARS_16BE
|
||||
#undef CHARS_16LE
|
||||
#undef CHARS_32BE
|
||||
#undef CHARS_32LE
|
||||
}
|
||||
int r = onig_new(®_grapheme_cluster, source, source + source_len,
|
||||
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
|
||||
if (r) {
|
||||
UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
|
||||
onig_error_code_to_str(message, r, &einfo);
|
||||
rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
|
||||
}
|
||||
if (encidx == rb_utf8_encindex()) {
|
||||
reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
|
||||
}
|
||||
}
|
||||
|
||||
regex_t *reg_grapheme_cluster;
|
||||
OnigErrorInfo einfo;
|
||||
int r = onig_new(®_grapheme_cluster, source, source + source_len,
|
||||
ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
|
||||
if (r) {
|
||||
UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
|
||||
onig_error_code_to_str(message, r, &einfo);
|
||||
rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
|
||||
}
|
||||
|
||||
return reg_grapheme_cluster;
|
||||
}
|
||||
|
||||
static regex_t *
|
||||
get_cached_reg_grapheme_cluster(rb_encoding *enc)
|
||||
{
|
||||
int encidx = rb_enc_to_index(enc);
|
||||
static regex_t *reg_grapheme_cluster_utf8 = NULL;
|
||||
|
||||
if (encidx == rb_utf8_encindex()) {
|
||||
if (!reg_grapheme_cluster_utf8) {
|
||||
reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
|
||||
}
|
||||
|
||||
return reg_grapheme_cluster_utf8;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static VALUE
|
||||
rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
|
||||
{
|
||||
size_t grapheme_cluster_count = 0;
|
||||
regex_t *reg_grapheme_cluster = NULL;
|
||||
rb_encoding *enc = get_encoding(str);
|
||||
const char *ptr, *end;
|
||||
|
||||
|
@ -9401,7 +9410,13 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
|
|||
return rb_str_length(str);
|
||||
}
|
||||
|
||||
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
|
||||
bool cached_reg_grapheme_cluster = true;
|
||||
regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
|
||||
if (!reg_grapheme_cluster) {
|
||||
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
|
||||
cached_reg_grapheme_cluster = false;
|
||||
}
|
||||
|
||||
ptr = RSTRING_PTR(str);
|
||||
end = RSTRING_END(str);
|
||||
|
||||
|
@ -9414,6 +9429,10 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
|
|||
ptr += len;
|
||||
}
|
||||
|
||||
if (!cached_reg_grapheme_cluster) {
|
||||
onig_free(reg_grapheme_cluster);
|
||||
}
|
||||
|
||||
return SIZET2NUM(grapheme_cluster_count);
|
||||
}
|
||||
|
||||
|
@ -9421,7 +9440,6 @@ static VALUE
|
|||
rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
|
||||
{
|
||||
VALUE orig = str;
|
||||
regex_t *reg_grapheme_cluster = NULL;
|
||||
rb_encoding *enc = get_encoding(str);
|
||||
const char *ptr0, *ptr, *end;
|
||||
|
||||
|
@ -9430,7 +9448,14 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
|
|||
}
|
||||
|
||||
if (!ary) str = rb_str_new_frozen(str);
|
||||
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
|
||||
|
||||
bool cached_reg_grapheme_cluster = true;
|
||||
regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
|
||||
if (!reg_grapheme_cluster) {
|
||||
reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
|
||||
cached_reg_grapheme_cluster = false;
|
||||
}
|
||||
|
||||
ptr0 = ptr = RSTRING_PTR(str);
|
||||
end = RSTRING_END(str);
|
||||
|
||||
|
@ -9442,6 +9467,11 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
|
|||
ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
|
||||
ptr += len;
|
||||
}
|
||||
|
||||
if (!cached_reg_grapheme_cluster) {
|
||||
onig_free(reg_grapheme_cluster);
|
||||
}
|
||||
|
||||
RB_GC_GUARD(str);
|
||||
if (ary)
|
||||
return ary;
|
||||
|
|
|
@ -1108,6 +1108,17 @@ CODE
|
|||
assert_equal("C", res[2])
|
||||
end
|
||||
|
||||
def test_grapheme_clusters_memory_leak
|
||||
assert_no_memory_leak([], "", "#{<<~"begin;"}\n#{<<~'end;'}", "[Bug #todo]", rss: true)
|
||||
begin;
|
||||
str = "hello world".encode(Encoding::UTF_32LE)
|
||||
|
||||
10_000.times do
|
||||
str.grapheme_clusters
|
||||
end
|
||||
end;
|
||||
end
|
||||
|
||||
def test_each_line
|
||||
verbose, $VERBOSE = $VERBOSE, nil
|
||||
|
||||
|
|
Загрузка…
Ссылка в новой задаче