* encoding.c (rb_enc_check_str): add for performance.

This function only accepts T_STRING (and T_REGEXP). This patch improves performance of a tiny_segmenter benchmark (num=2) 2.54sec -> 2.42sec on my machine. https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb * encoding.c: add ENC_DEBUG and ENC_ASSERT() macros. * internal.h: add a decl. of rb_enc_check_str(). * string.c (rb_str_plus): use rb_enc_check_str(). * string.c (rb_str_subpat_set): ditto. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@52350 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
2015-10-29 09:10:32 +00:00 · 2015-10-29 09:10:32 +00:00 · 05b9b42918
--- a/17
+++ b/17
@ -1,3 +1,20 @@
 Thu Oct 29 18:05:22 2015  Koichi Sasada  <ko1@atdot.net>
 	* encoding.c (rb_enc_check_str): add for performance.
 	  This function only accepts T_STRING (and T_REGEXP).
 	  This patch improves performance of a tiny_segmenter benchmark
 	  (num=2) 2.54sec -> 2.42sec on my machine.
 	  https://github.com/chezou/TinySegmenter.jl/blob/master/benchmark/benchmark.rb
 	* encoding.c: add ENC_DEBUG and ENC_ASSERT() macros.
 	* internal.h: add a decl. of rb_enc_check_str().
 	* string.c (rb_str_plus): use rb_enc_check_str().
 	* string.c (rb_str_subpat_set): ditto.
 Thu Oct 29 17:16:40 2015  Koichi Sasada  <ko1@atdot.net>
 	* internal.h: export rb_wb_(un)protected_newobj_of()
--- a/encoding.c
+++ b/encoding.c
@ -15,6 +15,12 @@
 #include <ctype.h>
 #include "ruby/util.h"
 #include <assert.h>
 /*
  * ENC_DEBUG: debug switch for this file; it defaults to 0 unless it has
  * already been defined before this point.
  *
  * ENC_ASSERT(expr): forwards to assert(expr) only when ENC_DEBUG is
  * non-zero.  With ENC_DEBUG == 0 the expression is still compiled but is
  * never evaluated.
  */
 #ifndef ENC_DEBUG
 #define ENC_DEBUG 0
 #endif
 #define ENC_ASSERT(expr) do { if (ENC_DEBUG) {assert(expr);} } while (0)
 #undef rb_ascii8bit_encindex
 #undef rb_utf8_encindex
 #undef rb_usascii_encindex
@ -743,6 +749,19 @@ rb_id_encoding(void)
    return id_encoding;
 }
 /*
  * Returns the encoding index recorded on +str+.
  *
  * The inlined index is used as-is unless it equals ENCODING_INLINE_MAX;
  * in that case the index is read from the instance variable named by
  * rb_id_encoding() and converted with NUM2INT().
  */
 static int
 enc_get_index_str(VALUE str)
 {
    const int inlined = ENCODING_GET_INLINED(str);
    VALUE stored;

    if (inlined != ENCODING_INLINE_MAX) {
 	return inlined;
    }

    /* The inlined slot holds the marker value: fetch the index from the ivar. */
    stored = rb_ivar_get(str, rb_id_encoding());
    return NUM2INT(stored);
 }
 int
 rb_enc_get_index(VALUE obj)
 {
@ -758,13 +777,7 @@ rb_enc_get_index(VALUE obj)
      default:
      case T_STRING:
      case T_REGEXP:
-	i = ENCODING_GET_INLINED(obj);
+	i = enc_get_index_str(obj);
 	if (i == ENCODING_INLINE_MAX) {
 	    VALUE iv;
 	    iv = rb_ivar_get(obj, rb_id_encoding());
 	    i = NUM2INT(iv);
 	}
 	break;
      case T_FILE:
 	tmp = rb_funcallv(obj, rb_intern("internal_encoding"), 0, 0);
@ -842,6 +855,21 @@ rb_enc_get(VALUE obj)
    return rb_enc_from_index(rb_enc_get_index(obj));
 }
 static rb_encoding* enc_compatible_str(VALUE str1, VALUE str2);
 /*
  * Returns the encoding that enc_compatible_str() reports as compatible
  * with both +str1+ and +str2+.  Raises rb_eEncCompatError, naming both
  * encodings, when there is no such encoding.
  *
  * Both arguments are expected to be T_STRING; this is verified only when
  * ENC_DEBUG is enabled.
  * NOTE(review): the ChangeLog entry says T_REGEXP is accepted as well, but
  * the debug assertions admit T_STRING only -- confirm which is intended.
  */
 rb_encoding*
 rb_enc_check_str(VALUE str1, VALUE str2)
 {
    rb_encoding *enc;

    /*
     * Validate the arguments before enc_compatible_str() touches them, so a
     * debug build reports a wrong type instead of first using it.
     */
    ENC_ASSERT(TYPE(str1) == T_STRING);
    ENC_ASSERT(TYPE(str2) == T_STRING);

    enc = enc_compatible_str(str1, str2);
    if (!enc) {
 	rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
 		 rb_enc_name(rb_enc_get(str1)),
 		 rb_enc_name(rb_enc_get(str2)));
    }
    return enc;
 }
 rb_encoding*
 rb_enc_check(VALUE str1, VALUE str2)
 {
@ -853,40 +881,28 @@ rb_enc_check(VALUE str1, VALUE str2)
    return enc;
 }
-rb_encoding*
+static rb_encoding*
-rb_enc_compatible(VALUE str1, VALUE str2)
+enc_compatible_latter(VALUE str1, VALUE str2, int idx1, int idx2)
 {
    int idx1, idx2;
    rb_encoding *enc1, *enc2;
    int isstr1, isstr2;
-
+    rb_encoding *enc1 = rb_enc_from_index(idx1);
-    idx1 = rb_enc_get_index(str1);
+    rb_encoding *enc2 = rb_enc_from_index(idx2);
    idx2 = rb_enc_get_index(str2);
    if (idx1 < 0 || idx2 < 0)
        return 0;
    if (idx1 == idx2) {
 	return rb_enc_from_index(idx1);
    }
    enc1 = rb_enc_from_index(idx1);
    enc2 = rb_enc_from_index(idx2);
    isstr2 = RB_TYPE_P(str2, T_STRING);
    if (isstr2 && RSTRING_LEN(str2) == 0)
-	return enc1;
+      return enc1;
    isstr1 = RB_TYPE_P(str1, T_STRING);
    if (isstr1 && RSTRING_LEN(str1) == 0)
-	return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
+      return (rb_enc_asciicompat(enc1) && rb_enc_str_asciionly_p(str2)) ? enc1 : enc2;
    if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
 	return 0;
    }
    /* objects whose encoding is the same of contents */
    if (!isstr2 && idx2 == ENCINDEX_US_ASCII)
-	return enc1;
+      return enc1;
    if (!isstr1 && idx1 == ENCINDEX_US_ASCII)
-	return enc2;
+      return enc2;
    if (!isstr1) {
 	VALUE tmp = str1;
@ -915,11 +931,44 @@ rb_enc_compatible(VALUE str1, VALUE str2)
 	    }
 	}
 	if (cr1 == ENC_CODERANGE_7BIT)
-	    return enc2;
+	  return enc2;
    }
    return 0;
 }
 /*
  * Compatibility check that reads both encoding indexes through
  * enc_get_index_str().
  *
  * Returns 0 when either index is negative, the shared encoding when both
  * indexes are equal, and otherwise whatever enc_compatible_latter()
  * decides for the pair.
  */
 static rb_encoding*
 enc_compatible_str(VALUE str1, VALUE str2)
 {
    const int lhs = enc_get_index_str(str1);
    const int rhs = enc_get_index_str(str2);

    if (lhs < 0 || rhs < 0) {
 	return 0;
    }

    /* Identical indexes need no further analysis. */
    return (lhs == rhs)
 	? rb_enc_from_index(lhs)
 	: enc_compatible_latter(str1, str2, lhs, rhs);
 }
 /*
  * Compatibility check that reads both encoding indexes through
  * rb_enc_get_index().
  *
  * Returns 0 when either index is negative, the shared encoding when both
  * indexes are equal, and otherwise the result of enc_compatible_latter().
  */
 rb_encoding*
 rb_enc_compatible(VALUE str1, VALUE str2)
 {
    const int first = rb_enc_get_index(str1);
    const int second = rb_enc_get_index(str2);
    rb_encoding *result;

    if (!(first >= 0 && second >= 0)) {
 	return 0;
    }

    if (first != second) {
 	/* Different encodings: defer to the detailed comparison. */
 	result = enc_compatible_latter(str1, str2, first, second);
    }
    else {
 	result = rb_enc_from_index(first);
    }
    return result;
 }
 void
 rb_enc_copy(VALUE obj1, VALUE obj2)
 {
--- a/internal.h
+++ b/internal.h
@ -716,6 +716,7 @@ void Init_ext(void);
 ID rb_id_encoding(void);
 void rb_gc_mark_encodings(void);
 rb_encoding *rb_enc_get_from_index(int index);
 rb_encoding *rb_enc_check_str(VALUE str1, VALUE str2);
 int rb_encdb_replicate(const char *alias, const char *orig);
 int rb_encdb_alias(const char *alias, const char *orig);
 int rb_encdb_dummy(const char *name);
--- a/string.c
+++ b/string.c
@ -1598,7 +1598,7 @@ rb_str_plus(VALUE str1, VALUE str2)
    long len1, len2;
    StringValue(str2);
-    enc = rb_enc_check(str1, str2);
+    enc = rb_enc_check_str(str1, str2);
    RSTRING_GETMEM(str1, ptr1, len1);
    RSTRING_GETMEM(str2, ptr2, len2);
    str3 = rb_str_new(0, len1+len2);
@ -4072,7 +4072,7 @@ rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
    end = END(nth);
    len = end - start;
    StringValue(val);
-    enc = rb_enc_check(str, val);
+    enc = rb_enc_check_str(str, val);
    rb_str_splice_0(str, start, len, val);
    rb_enc_associate(str, enc);
 }