* encoding.c (rb_enc_precise_mbclen): new function for mbclen with
  validation.

* include/ruby/encoding.h (rb_enc_precise_mbclen): declared.
  (MBCLEN_CHARFOUND): new macro.
  (MBCLEN_INVALID): new macro.
  (MBCLEN_NEEDMORE): new macro.

* include/ruby/oniguruma.h (OnigEncodingTypeST): replace mbc_enc_len
  by precise_mbc_enc_len.
  (ONIGENC_PRECISE_MBC_ENC_LEN): new macro.
  (ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND): new macro.
  (ONIGENC_CONSTRUCT_MBCLEN_INVALID): new macro.
  (ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE): new macro.
  (ONIGENC_MBCLEN_CHARFOUND): new macro.
  (ONIGENC_MBCLEN_INVALID): new macro.
  (ONIGENC_MBCLEN_NEEDMORE): new macro.
  (ONIGENC_MBC_ENC_LEN): use ONIGENC_PRECISE_MBC_ENC_LEN.

* enc/euc_jp.c: validation implemented.

* enc/sjis.c: ditto.

* enc/utf8.c: ditto.

* string.c (rb_str_inspect): use rb_enc_precise_mbclen for invalid
  encoding.
  (rb_str_valid_encoding_p): new method String#valid_encoding?.

* io.c (rb_io_getc): use rb_enc_precise_mbclen.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14119 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2007-12-06 09:28:26 +00:00
Родитель de4ec68991
Коммит 69406aad50
10 изменённых файлов: 585 добавлений и 140 удалений

Просмотреть файл

@ -1,3 +1,36 @@
Thu Dec 6 18:22:11 2007 Tanaka Akira <akr@fsij.org>
* encoding.c (rb_enc_precise_mbclen): new function for mbclen with
validation.
* include/ruby/encoding.h (rb_enc_precise_mbclen): declared.
(MBCLEN_CHARFOUND): new macro.
(MBCLEN_INVALID): new macro.
(MBCLEN_NEEDMORE): new macro.
* include/ruby/oniguruma.h (OnigEncodingTypeST): replace mbc_enc_len
by precise_mbc_enc_len.
(ONIGENC_PRECISE_MBC_ENC_LEN): new macro.
(ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND): new macro.
(ONIGENC_CONSTRUCT_MBCLEN_INVALID): new macro.
(ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE): new macro.
(ONIGENC_MBCLEN_CHARFOUND): new macro.
(ONIGENC_MBCLEN_INVALID): new macro.
(ONIGENC_MBCLEN_NEEDMORE): new macro.
(ONIGENC_MBC_ENC_LEN): use ONIGENC_PRECISE_MBC_ENC_LEN.
* enc/euc_jp.c: validation implemented.
* enc/sjis.c: ditto.
* enc/utf8.c: ditto.
* string.c (rb_str_inspect): use rb_enc_precise_mbclen for invalid
encoding.
(rb_str_valid_encoding_p): new method String#valid_encoding?.
* io.c (rb_io_getc): use rb_enc_precise_mbclen.
Thu Dec 6 01:37:23 2007 Nobuyoshi Nakada <nobu@ruby-lang.org> Thu Dec 6 01:37:23 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
* regparse.c (i_apply_case_fold): fix for negative character class. a * regparse.c (i_apply_case_fold): fix for negative character class. a

Просмотреть файл

@ -50,10 +50,85 @@ static const int EncLen_EUCJP[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
}; };
/*
 * States of the EUC-JP validating automaton encoded in trans[].
 * FAILURE and ACCEPT are negative so that mbc_enc_len can detect the end of
 * a scan with "s < 0"; S0..S2 are also used as row indices into trans[].
 *   S0: looking at the first byte of a character.
 *   S1: one more byte in 0xa1..0xfe completes the character.
 *   S2: entered on lead byte 0x8f; a byte in 0xa1..0xfe moves to S1.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1, S2 } state_t;
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
{ /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, 1, 2,
/* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
},
{ /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F
},
{ /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* a */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* c */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F
},
};
#undef A
#undef F
static int static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{ {
return EncLen_EUCJP[*p]; int firstbyte = *p++;
state_t s;
s = trans[0][firstbyte];
if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-1);
s = trans[s][*p++];
if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_EUCJP[firstbyte]-2);
s = trans[s][*p++];
return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
} }
static OnigCodePoint static OnigCodePoint

Просмотреть файл

@ -70,10 +70,62 @@ static const char SJIS_CAN_BE_TRAIL_TABLE[256] = {
#define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1) #define SJIS_ISMB_FIRST(byte) (EncLen_SJIS[byte] > 1)
#define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)] #define SJIS_ISMB_TRAIL(byte) SJIS_CAN_BE_TRAIL_TABLE[(byte)]
/*
 * States of the Shift_JIS validating automaton encoded in trans[].
 * FAILURE and ACCEPT are negative so that mbc_enc_len can detect the end of
 * a scan with "s < 0"; S0 and S1 are also used as row indices into trans[].
 *   S0: looking at the first byte of a character.
 *   S1: entered on lead bytes 0x81..0x9f and 0xe0..0xfc; a trail byte in
 *       0x40..0x7e or 0x80..0xfc completes the character.
 */
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
{ /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 8 */ F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* a */ F, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* e */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, F, F, F
},
{ /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, F,
/* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* c */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* d */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* e */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* f */ A, A, A, A, A, A, A, A, A, A, A, A, A, F, F, F
}
};
#undef A
#undef F
static int static int
mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{ {
return EncLen_SJIS[*p]; int firstbyte = *p++;
state_t s;
s = trans[0][firstbyte];
if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_SJIS[firstbyte]-1);
s = trans[s][*p++];
return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
} }
static int static int

Просмотреть файл

@ -56,13 +56,189 @@ static const int EncLen_UTF8[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
}; };
/*
 * States of the UTF-8 validating automaton encoded in trans[].
 * FAILURE is -2 and ACCEPT is -1, so utf8_mbc_enc_len can detect the end of
 * a scan with "s < 0"; S0..S7 (0..7) are also used as row indices into trans[].
 *   S0: looking at the first byte of a character.
 *   S1: one byte in 0x80..0xbf completes the character (entered on 0xc2..0xdf).
 *   S2: entered on 0xe0; next byte must be in 0xa0..0xbf, then S1.
 *   S3: entered on 0xe1..0xec, 0xee, 0xef; next byte in 0x80..0xbf, then S1.
 *   S4: entered on 0xed; next byte must be in 0x80..0x9f, then S1.
 *   S5: entered on 0xf0; next byte must be in 0x90..0xbf, then S3.
 *   S6: entered on 0xf1..0xf3; next byte in 0x80..0xbf, then S3.
 *   S7: entered on 0xf4; next byte must be in 0x80..0x8f, then S3.
 * The narrowed ranges in S2, S4, S5 and S7 are what reject overlong forms,
 * the 0xed 0xa0..0xbf range, and lead byte 0xf4 followed by 0x90 or above.
 */
typedef enum {
FAILURE = -2,
ACCEPT,
S0, S1, S2, S3,
S4, S5, S6, S7
} state_t;
#define A ACCEPT
#define F FAILURE
static const signed char trans[][0x100] = {
{ /* S0 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 1 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 2 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 3 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 4 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 5 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 6 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 7 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* c */ F, F, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* d */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* e */ 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
/* f */ 5, 6, 6, 6, 7, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S1 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* 9 */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* a */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* b */ A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S2 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S3 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* a */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* b */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S4 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* 9 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
/* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S5 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S6 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 9 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* a */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* b */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
{ /* S7 0 1 2 3 4 5 6 7 8 9 a b c d e f */
/* 0 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 1 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 2 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 3 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 4 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 5 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 6 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 7 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* 8 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
/* 9 */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* a */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* b */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* c */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* d */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* e */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F,
/* f */ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F
},
};
#undef A
#undef F
static int static int
utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc) utf8_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc)
{ {
return EncLen_UTF8[*p]; int firstbyte = *p++;
state_t s;
s = trans[0][firstbyte];
if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-1);
s = trans[s][*p++];
if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(2) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-2);
s = trans[s][*p++];
if (s < 0) return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(3) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_UTF8[firstbyte]-3);
s = trans[s][*p++];
return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(4) :
ONIGENC_CONSTRUCT_MBCLEN_INVALID();
} }
static int static int

Просмотреть файл

@ -494,6 +494,12 @@ rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
return n; return n;
} }
/*
 * Validating counterpart of rb_enc_mbclen.
 *
 * Hands the byte range [p, e) to the encoding's precise_mbc_enc_len callback
 * and returns its raw result unchanged.  The result is not a plain length:
 * callers decode it with MBCLEN_CHARFOUND, MBCLEN_INVALID and
 * MBCLEN_NEEDMORE.
 */
int
rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
{
    const UChar *first = (const UChar *)p;
    const UChar *limit = (const UChar *)e;

    return ONIGENC_PRECISE_MBC_ENC_LEN(enc, first, limit);
}
int int
rb_enc_codelen(int c, rb_encoding *enc) rb_enc_codelen(int c, rb_encoding *enc)
{ {

Просмотреть файл

@ -68,9 +68,15 @@ rb_encoding * rb_enc_find(const char *name);
#define rb_enc_mbminlen(enc) (enc)->min_enc_len #define rb_enc_mbminlen(enc) (enc)->min_enc_len
#define rb_enc_mbmaxlen(enc) (enc)->max_enc_len #define rb_enc_mbmaxlen(enc) (enc)->max_enc_len
/* ptr,encoding -> mbclen */ /* ptr,endptr,encoding -> mbclen */
int rb_enc_mbclen(const char*, const char *, rb_encoding*); int rb_enc_mbclen(const char*, const char *, rb_encoding*);
/*
 * ptr,endptr,encoding -> raw result classifying the bytes at ptr as a
 * complete character, an invalid byte sequence, or a truncated character.
 * Decode the result with the MBCLEN_* macros that follow.
 */
int rb_enc_precise_mbclen(const char*, const char *, rb_encoding*);
/* character length in bytes when ret reports a complete character, else 0 */
#define MBCLEN_CHARFOUND(ret) ONIGENC_MBCLEN_CHARFOUND(ret)
/* non-zero when ret reports an invalid byte sequence */
#define MBCLEN_INVALID(ret) ONIGENC_MBCLEN_INVALID(ret)
/* missing-byte count reported by the encoding when ret reports a truncated
 * character, else 0 */
#define MBCLEN_NEEDMORE(ret) ONIGENC_MBCLEN_NEEDMORE(ret)
/* code,encoding -> codelen */ /* code,encoding -> codelen */
int rb_enc_codelen(int, rb_encoding*); int rb_enc_codelen(int, rb_encoding*);

Просмотреть файл

@ -144,7 +144,7 @@ typedef struct {
typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg); typedef int (*OnigApplyAllCaseFoldFunc)(OnigCodePoint from, OnigCodePoint* to, int to_len, void* arg);
typedef struct OnigEncodingTypeST { typedef struct OnigEncodingTypeST {
int (*mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc); int (*precise_mbc_enc_len)(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc);
const char* name; const char* name;
int max_enc_len; int max_enc_len;
int min_enc_len; int min_enc_len;
@ -282,7 +282,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodingGB18030;
#define ONIGENC_STEP_BACK(enc,start,s,n) \ #define ONIGENC_STEP_BACK(enc,start,s,n) \
onigenc_step_back((enc),(start),(s),(n)) onigenc_step_back((enc),(start),(s),(n))
#define ONIGENC_MBC_ENC_LEN(enc,p,e) (enc)->mbc_enc_len(p,e,enc)
/*
 * Result encoding shared by every precise_mbc_enc_len implementation:
 *   n  > 0   a complete, valid character of n bytes was found
 *   -1       the bytes cannot start a valid character
 *   -1 - n   the character is truncated and n (> 0) more bytes are needed
 *
 * The CONSTRUCT_* macros build such a value; the MBCLEN_* macros decode it.
 */
#define ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) (n)
#define ONIGENC_CONSTRUCT_MBCLEN_INVALID() (-1)
/* The argument must be parenthesized: callers pass expressions such as
 * "EncLen[firstbyte]-1", and "-1-EncLen[firstbyte]-1" would subtract the
 * trailing constant instead of adding it back. */
#define ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n) (-1-(n))

/* Length of the character found, or 0 if r is not a "character found" result. */
static inline int onigenc_mbclen_charfound(int r) { return 0 < r ? r : 0; }
/* Number of missing bytes, or 0 if r is not a "need more" result. */
static inline int onigenc_mbclen_needmore(int r) { return r < -1 ? -1 - r : 0; }

#define ONIGENC_MBCLEN_CHARFOUND(r) onigenc_mbclen_charfound(r)
#define ONIGENC_MBCLEN_INVALID(r) ((r) == -1)
#define ONIGENC_MBCLEN_NEEDMORE(r) onigenc_mbclen_needmore(r)
#define ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e) (enc)->precise_mbc_enc_len(p,e,enc)
/*
 * Lenient character length built on top of the precise callback, for callers
 * that always need a positive step:
 *   - an invalid sequence is stepped over as a single byte;
 *   - a truncated character yields the bytes available (e - p) plus the
 *     number the encoding reports as still missing;
 *   - otherwise the length of the character found is returned.
 */
static inline int onigenc_mbclen_recover(const OnigUChar* p,const OnigUChar* e, struct OnigEncodingTypeST* enc)
{
    int ret = ONIGENC_PRECISE_MBC_ENC_LEN(enc,p,e);
    int missing = ONIGENC_MBCLEN_NEEDMORE(ret);

    if (ONIGENC_MBCLEN_INVALID(ret)) {
        return 1;
    }
    if (missing != 0) {
        return (int)(e - p + missing);
    }
    return ONIGENC_MBCLEN_CHARFOUND(ret);
}
#define ONIGENC_MBC_ENC_LEN(enc,p,e) onigenc_mbclen_recover(p,e,enc)
#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len) #define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len)
#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc) #define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc)
#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len) #define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len)

32
io.c
Просмотреть файл

@ -2127,7 +2127,7 @@ rb_io_getc(VALUE io)
{ {
rb_encoding *enc; rb_encoding *enc;
rb_io_t *fptr; rb_io_t *fptr;
int n, left; int r, n;
VALUE str; VALUE str;
GetOpenFile(io, fptr); GetOpenFile(io, fptr);
@ -2138,22 +2138,30 @@ rb_io_getc(VALUE io)
if (io_fillbuf(fptr) < 0) { if (io_fillbuf(fptr) < 0) {
return Qnil; return Qnil;
} }
n = rb_enc_mbclen(fptr->rbuf+fptr->rbuf_off, fptr->rbuf+fptr->rbuf_len, enc); r = rb_enc_precise_mbclen(fptr->rbuf+fptr->rbuf_off, fptr->rbuf+fptr->rbuf_off+fptr->rbuf_len, enc);
if (n < fptr->rbuf_len) { if ((n = MBCLEN_CHARFOUND(r)) != 0 && n <= fptr->rbuf_len) {
str = rb_str_new(fptr->rbuf+fptr->rbuf_off, n); str = rb_str_new(fptr->rbuf+fptr->rbuf_off, n);
fptr->rbuf_off += n; fptr->rbuf_off += n;
fptr->rbuf_len -= n; fptr->rbuf_len -= n;
} }
else if (MBCLEN_NEEDMORE(r)) {
str = rb_str_new(fptr->rbuf+fptr->rbuf_off, fptr->rbuf_len);
fptr->rbuf_len = 0;
getc_needmore:
if (io_fillbuf(fptr) != -1) {
rb_str_cat(str, fptr->rbuf+fptr->rbuf_off, 1);
fptr->rbuf_off++;
fptr->rbuf_len--;
r = rb_enc_precise_mbclen(RSTRING_PTR(str), RSTRING_PTR(str)+RSTRING_LEN(str), enc);
if (MBCLEN_NEEDMORE(r)) {
goto getc_needmore;
}
}
}
else { else {
str = rb_str_new(0, n); str = rb_str_new(fptr->rbuf+fptr->rbuf_off, 1);
left = fptr->rbuf_len; fptr->rbuf_off++;
MEMCPY(RSTRING_PTR(str), fptr->rbuf+fptr->rbuf_off, char, left); fptr->rbuf_len--;
if (io_fillbuf(fptr) < 0) {
return Qnil;
}
MEMCPY(RSTRING_PTR(str)+left, fptr->rbuf, char, n-left);
fptr->rbuf_off += left;
fptr->rbuf_len -= left;
} }
rb_enc_associate(str, enc); rb_enc_associate(str, enc);

Просмотреть файл

@ -2919,10 +2919,20 @@ rb_str_inspect(VALUE str)
str_cat_char(result, '"', enc); str_cat_char(result, '"', enc);
p = RSTRING_PTR(str); pend = RSTRING_END(str); p = RSTRING_PTR(str); pend = RSTRING_END(str);
while (p < pend) { while (p < pend) {
int c = rb_enc_codepoint(p, pend, enc); int c;
int n = rb_enc_codelen(c, enc); int n;
int cc; int cc;
n = rb_enc_precise_mbclen(p, pend, enc);
if (!MBCLEN_CHARFOUND(n)) {
p++;
n = 1;
goto escape_codepoint;
}
c = rb_enc_codepoint(p, pend, enc);
n = rb_enc_codelen(c, enc);
p += n; p += n;
if (c == '"'|| c == '\\' || if (c == '"'|| c == '\\' ||
(c == '#' && (cc = rb_enc_codepoint(p,pend,enc), (c == '#' && (cc = rb_enc_codepoint(p,pend,enc),
@ -2954,19 +2964,21 @@ rb_str_inspect(VALUE str)
prefix_escape(result, 'e', enc); prefix_escape(result, 'e', enc);
} }
else if (rb_enc_isprint(c, enc)) { else if (rb_enc_isprint(c, enc)) {
char buf[5]; rb_str_buf_cat(result, p-n, n);
rb_enc_mbcput(c, buf, enc);
rb_str_buf_cat(result, buf, n);
} }
else { else {
char buf[5]; char buf[5];
char *s = buf; char *s;
char *q;
sprintf(buf, "\\%03o", c & 0377); escape_codepoint:
while (*s) { for (q = p-n; q < p; q++) {
str_cat_char(result, *s++, enc); s = buf;
} sprintf(buf, "\\%03o", *q & 0377);
while (*s) {
str_cat_char(result, *s++, enc);
}
}
} }
} }
str_cat_char(result, '"', enc); str_cat_char(result, '"', enc);
@ -5232,6 +5244,25 @@ rb_str_force_encoding(VALUE str, VALUE enc)
return str; return str;
} }
/*
 * Implementation of String#valid_encoding?.
 *
 * Walks the string one character at a time using the validating length
 * function of the string's encoding.  Returns Qfalse as soon as the bytes at
 * the current position are not a complete character (invalid or truncated),
 * and Qtrue when the whole string, including an empty one, has been consumed.
 */
static VALUE
rb_str_valid_encoding_p(VALUE str)
{
    const char *cur = RSTRING_PTR(str);
    const char *const limit = RSTRING_END(str);
    rb_encoding *enc = rb_enc_get(str);

    while (cur < limit) {
        int r = rb_enc_precise_mbclen(cur, limit, enc);
        int len = MBCLEN_CHARFOUND(r);

        if (len == 0) {
            return Qfalse;
        }
        cur += len;  /* len is the byte length of the character just checked */
    }
    return Qtrue;
}
/********************************************************************** /**********************************************************************
* Document-class: Symbol * Document-class: Symbol
* *
@ -5644,6 +5675,7 @@ Init_String(void)
rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */ rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1); rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
id_to_s = rb_intern("to_s"); id_to_s = rb_intern("to_s");

Просмотреть файл

@ -26,14 +26,46 @@ class TestM17N < Test::Unit::TestCase
end end
def test_string_mixed_unicode def test_string_mixed_unicode
assert_raise(SyntaxError) { eval(a(%{"\xc0\xa0\\u{6666}"})) } assert_raise(SyntaxError) { eval(a(%{"\xc2\xa0\\u{6666}"})) }
assert_raise(SyntaxError) { eval(e(%{"\xc0\xa0\\u{6666}"})) } assert_raise(SyntaxError) { eval(e(%{"\xc2\xa0\\u{6666}"})) }
assert_raise(SyntaxError) { eval(s(%{"\xc0\xa0\\u{6666}"})) } assert_raise(SyntaxError) { eval(s(%{"\xc2\xa0\\u{6666}"})) }
assert_nothing_raised { eval(u(%{"\xc0\xa0\\u{6666}"})) } assert_nothing_raised { eval(u(%{"\xc2\xa0\\u{6666}"})) }
assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc0\xa0"})) } assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc2\xa0"})) }
assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc0\xa0"})) } assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc2\xa0"})) }
assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc0\xa0"})) } assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc2\xa0"})) }
assert_nothing_raised { eval(u(%{"\\u{6666}\xc0\xa0"})) } assert_nothing_raised { eval(u(%{"\\u{6666}\xc2\xa0"})) }
end
def test_string_inspect
assert_equal('"\376"', e("\xfe").inspect)
assert_equal('"\216"', e("\x8e").inspect)
assert_equal('"\217"', e("\x8f").inspect)
assert_equal('"\217\241"', e("\x8f\xa1").inspect)
assert_equal('"\357"', s("\xef").inspect)
assert_equal('"\302"', u("\xc2").inspect)
assert_equal('"\340\200"', u("\xe0\x80").inspect)
assert_equal('"\360\200\200"', u("\xf0\x80\x80").inspect)
assert_equal('"\370\200\200\200"', u("\xf8\x80\x80\x80").inspect)
assert_equal('"\374\200\200\200\200"', u("\xfc\x80\x80\x80\x80").inspect)
assert_equal('"\376 "', e("\xfe ").inspect)
assert_equal('"\216 "', e("\x8e ").inspect)
assert_equal('"\217 "', e("\x8f ").inspect)
assert_equal('"\217\241 "', e("\x8f\xa1 ").inspect)
assert_equal('"\357 "', s("\xef ").inspect)
assert_equal('"\302 "', u("\xc2 ").inspect)
assert_equal('"\340\200 "', u("\xe0\x80 ").inspect)
assert_equal('"\360\200\200 "', u("\xf0\x80\x80 ").inspect)
assert_equal('"\370\200\200\200 "', u("\xf8\x80\x80\x80 ").inspect)
assert_equal('"\374\200\200\200\200 "', u("\xfc\x80\x80\x80\x80 ").inspect)
assert_equal(e("\"\\241\x8f\xa1\xa1\""), e("\xa1\x8f\xa1\xa1").inspect)
assert_equal('"\201."', s("\x81.").inspect)
assert_equal(s("\"\x81@\""), s("\x81@").inspect)
assert_equal('"\374"', u("\xfc").inspect)
end end
def test_regexp_too_short_multibyte_character def test_regexp_too_short_multibyte_character
@ -42,27 +74,27 @@ class TestM17N < Test::Unit::TestCase
assert_raise(SyntaxError) { eval('/\x8f/e') } assert_raise(SyntaxError) { eval('/\x8f/e') }
assert_raise(SyntaxError) { eval('/\x8f\xa1/e') } assert_raise(SyntaxError) { eval('/\x8f\xa1/e') }
assert_raise(SyntaxError) { eval('/\xef/s') } assert_raise(SyntaxError) { eval('/\xef/s') }
assert_raise(SyntaxError) { eval('/\xc0/u') } assert_raise(SyntaxError) { eval('/\xc2/u') }
assert_raise(SyntaxError) { eval('/\xe0\x80/u') } assert_raise(SyntaxError) { eval('/\xe0\x80/u') }
assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') }
assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } #assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') }
assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } #assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') }
# raw 8bit # raw 8bit
assert_raise(SyntaxError) { eval("/\xfe/e") } assert_raise(SyntaxError) { eval("/\xfe/e") }
assert_raise(SyntaxError) { eval("/\xc0/u") } assert_raise(SyntaxError) { eval("/\xc2/u") }
# invalid suffix # invalid suffix
assert_raise(SyntaxError) { eval('/\xc0\xff/u') } assert_raise(SyntaxError) { eval('/\xc2\xff/u') }
assert_raise(SyntaxError) { eval('/\xc0 /u') } assert_raise(SyntaxError) { eval('/\xc2 /u') }
#assert_raise(SyntaxError) { eval('/\xc0\x20/u') } #assert_raise(SyntaxError) { eval('/\xc2\x20/u') }
end end
def assert_regexp_generic_encoding(r) def assert_regexp_generic_encoding(r)
assert(!r.fixed_encoding?) assert(!r.fixed_encoding?)
%w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename|
# "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. # "\xc2\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8.
assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(ename) } assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(ename) }
} }
end end
@ -71,9 +103,9 @@ class TestM17N < Test::Unit::TestCase
%w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename| %w[ASCII-8BIT EUC-JP Shift_JIS UTF-8].each {|ename|
enc = Encoding.find(ename) enc = Encoding.find(ename)
if enc == r.encoding if enc == r.encoding
assert_nothing_raised { r =~ "\xc0\xa1".force_encoding(enc) } assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(enc) }
else else
assert_raise(ArgumentError) { r =~ "\xc0\xa1".force_encoding(enc) } assert_raise(ArgumentError) { r =~ "\xc2\xa1".force_encoding(enc) }
end end
} }
end end
@ -115,77 +147,77 @@ class TestM17N < Test::Unit::TestCase
assert_equal(0, r =~ e("a")) assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a")) assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a")) assert_equal(0, r =~ u("a"))
assert_equal(nil, r =~ a("\xc0\xa1")) assert_equal(nil, r =~ a("\xc2\xa1"))
assert_equal(nil, r =~ e("\xc0\xa1")) assert_equal(nil, r =~ e("\xc2\xa1"))
assert_equal(nil, r =~ s("\xc0\xa1")) assert_equal(nil, r =~ s("\xc2\xa1"))
assert_equal(nil, r =~ u("\xc0\xa1")) assert_equal(nil, r =~ u("\xc2\xa1"))
} }
end end
def test_regexp_ascii def test_regexp_ascii
assert_regexp_fixed_ascii8bit(/a/n) assert_regexp_fixed_ascii8bit(/a/n)
assert_regexp_fixed_ascii8bit(/\xc0\xa1/n) assert_regexp_fixed_ascii8bit(/\xc2\xa1/n)
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/}))) assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/})))
assert_regexp_fixed_ascii8bit(eval(a(%{/\xc0\xa1/n}))) assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/n})))
assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc0\xa1/}))) assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc2\xa1/})))
[/a/n].each {|r| [/a/n].each {|r|
assert_equal(0, r =~ a("a")) assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a")) assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a")) assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a")) assert_equal(0, r =~ u("a"))
assert_equal(nil, r =~ a("\xc0\xa1")) assert_equal(nil, r =~ a("\xc2\xa1"))
assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } assert_raise(ArgumentError) { r =~ e("\xc2\xa1") }
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
} }
[/\xc0\xa1/n, eval(a(%{/\xc0\xa1/})), eval(a(%{/\xc0\xa1/n}))].each {|r| [/\xc2\xa1/n, eval(a(%{/\xc2\xa1/})), eval(a(%{/\xc2\xa1/n}))].each {|r|
assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a")) assert_equal(nil, r =~ s("a"))
assert_equal(nil, r =~ u("a")) assert_equal(nil, r =~ u("a"))
assert_equal(0, r =~ a("\xc0\xa1")) assert_equal(0, r =~ a("\xc2\xa1"))
assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } assert_raise(ArgumentError) { r =~ e("\xc2\xa1") }
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
} }
end end
def test_regexp_euc def test_regexp_euc
assert_regexp_fixed_eucjp(/a/e) assert_regexp_fixed_eucjp(/a/e)
assert_regexp_fixed_eucjp(/\xc0\xa1/e) assert_regexp_fixed_eucjp(/\xc2\xa1/e)
assert_regexp_fixed_eucjp(eval(e(%{/\xc0\xa1/}))) assert_regexp_fixed_eucjp(eval(e(%{/\xc2\xa1/})))
assert_regexp_fixed_eucjp(eval(e(%q{/\xc0\xa1/}))) assert_regexp_fixed_eucjp(eval(e(%q{/\xc2\xa1/})))
[/a/e].each {|r| [/a/e].each {|r|
assert_equal(0, r =~ a("a")) assert_equal(0, r =~ a("a"))
assert_equal(0, r =~ e("a")) assert_equal(0, r =~ e("a"))
assert_equal(0, r =~ s("a")) assert_equal(0, r =~ s("a"))
assert_equal(0, r =~ u("a")) assert_equal(0, r =~ u("a"))
assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } assert_raise(ArgumentError) { r =~ a("\xc2\xa1") }
assert_equal(nil, r =~ e("\xc0\xa1")) assert_equal(nil, r =~ e("\xc2\xa1"))
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
} }
[/\xc0\xa1/e, eval(e(%{/\xc0\xa1/})), eval(e(%q{/\xc0\xa1/}))].each {|r| [/\xc2\xa1/e, eval(e(%{/\xc2\xa1/})), eval(e(%q{/\xc2\xa1/}))].each {|r|
assert_equal(nil, r =~ a("a")) assert_equal(nil, r =~ a("a"))
assert_equal(nil, r =~ e("a")) assert_equal(nil, r =~ e("a"))
assert_equal(nil, r =~ s("a")) assert_equal(nil, r =~ s("a"))
assert_equal(nil, r =~ u("a")) assert_equal(nil, r =~ u("a"))
assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } assert_raise(ArgumentError) { r =~ a("\xc2\xa1") }
assert_equal(0, r =~ e("\xc0\xa1")) assert_equal(0, r =~ e("\xc2\xa1"))
assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } assert_raise(ArgumentError) { r =~ s("\xc2\xa1") }
assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } assert_raise(ArgumentError) { r =~ u("\xc2\xa1") }
} }
end end
def test_regexp_sjis def test_regexp_sjis
assert_regexp_fixed_sjis(/a/s) assert_regexp_fixed_sjis(/a/s)
assert_regexp_fixed_sjis(/\xc0\xa1/s) assert_regexp_fixed_sjis(/\xc2\xa1/s)
assert_regexp_fixed_sjis(eval(s(%{/\xc0\xa1/}))) assert_regexp_fixed_sjis(eval(s(%{/\xc2\xa1/})))
assert_regexp_fixed_sjis(eval(s(%q{/\xc0\xa1/}))) assert_regexp_fixed_sjis(eval(s(%q{/\xc2\xa1/})))
end end
def test_begin_end_offset def test_begin_end_offset
@ -223,10 +255,10 @@ class TestM17N < Test::Unit::TestCase
assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding) assert_encoding("ASCII-8BIT", Regexp.quote(s("a")).encoding)
assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding) assert_encoding("ASCII-8BIT", Regexp.quote(u("a")).encoding)
assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc0\xa1")).encoding) assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc2\xa1")).encoding)
assert_encoding("EUC-JP", Regexp.quote(e("\xc0\xa1")).encoding) assert_encoding("EUC-JP", Regexp.quote(e("\xc2\xa1")).encoding)
assert_encoding("Shift_JIS", Regexp.quote(s("\xc0\xa1")).encoding) assert_encoding("Shift_JIS", Regexp.quote(s("\xc2\xa1")).encoding)
assert_encoding("UTF-8", Regexp.quote(u("\xc0\xa1")).encoding) assert_encoding("UTF-8", Regexp.quote(u("\xc2\xa1")).encoding)
end end
def test_union_0 def test_union_0
@ -254,10 +286,10 @@ class TestM17N < Test::Unit::TestCase
end end
def test_union_1_nonascii_string def test_union_1_nonascii_string
assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc0\xa1"))) assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc2\xa1")))
assert_regexp_fixed_eucjp(Regexp.union(e("\xc0\xa1"))) assert_regexp_fixed_eucjp(Regexp.union(e("\xc2\xa1")))
assert_regexp_fixed_sjis(Regexp.union(s("\xc0\xa1"))) assert_regexp_fixed_sjis(Regexp.union(s("\xc2\xa1")))
assert_regexp_fixed_utf8(Regexp.union(u("\xc0\xa1"))) assert_regexp_fixed_utf8(Regexp.union(u("\xc2\xa1")))
end end
def test_union_1_regexp def test_union_1_regexp
@ -271,7 +303,7 @@ class TestM17N < Test::Unit::TestCase
def test_union_2 def test_union_2
ary = [ ary = [
a(""), e(""), s(""), u(""), a(""), e(""), s(""), u(""),
a("\xc0\xa1"), e("\xc0\xa1"), s("\xc0\xa1"), u("\xc0\xa1") a("\xc2\xa1"), e("\xc2\xa1"), s("\xc2\xa1"), u("\xc2\xa1")
] ]
ary.each {|s1| ary.each {|s1|
ary.each {|s2| ary.each {|s2|
@ -304,26 +336,26 @@ class TestM17N < Test::Unit::TestCase
def test_dynamic_ascii_regexp def test_dynamic_ascii_regexp
assert_regexp_fixed_ascii8bit(/#{}/n) assert_regexp_fixed_ascii8bit(/#{}/n)
assert_regexp_fixed_ascii8bit(/#{}\xc0\xa1/n) assert_regexp_fixed_ascii8bit(/#{}\xc2\xa1/n)
assert_regexp_fixed_ascii8bit(/\xc0\xa1#{}/n) assert_regexp_fixed_ascii8bit(/\xc2\xa1#{}/n)
#assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/s') } #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/s') }
#assert_raise(SyntaxError) { s1, s2 = s('\xc0'), s('\xa1'); /#{s1}#{s2}/ } #assert_raise(SyntaxError) { s1, s2 = s('\xc2'), s('\xa1'); /#{s1}#{s2}/ }
end end
def test_dynamic_eucjp_regexp def test_dynamic_eucjp_regexp
assert_regexp_fixed_eucjp(/#{}/e) assert_regexp_fixed_eucjp(/#{}/e)
assert_regexp_fixed_eucjp(/#{}\xc0\xa1/e) assert_regexp_fixed_eucjp(/#{}\xc2\xa1/e)
assert_regexp_fixed_eucjp(/\xc0\xa1#{}/e) assert_regexp_fixed_eucjp(/\xc2\xa1#{}/e)
assert_raise(RegexpError) { eval('/\xc0#{}/e') } assert_raise(RegexpError) { eval('/\xc2#{}/e') }
assert_raise(RegexpError) { eval('/#{}\xc0/e') } assert_raise(RegexpError) { eval('/#{}\xc2/e') }
#assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/e') } #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/e') }
#assert_raise(SyntaxError) { s1, s2 = e('\xc0'), e('\xa1'); /#{s1}#{s2}/ } #assert_raise(SyntaxError) { s1, s2 = e('\xc2'), e('\xa1'); /#{s1}#{s2}/ }
end end
def test_dynamic_sjis_regexp def test_dynamic_sjis_regexp
assert_regexp_fixed_sjis(/#{}/s) assert_regexp_fixed_sjis(/#{}/s)
assert_regexp_fixed_sjis(/#{}\xc0\xa1/s) assert_regexp_fixed_sjis(/#{}\xc2\xa1/s)
assert_regexp_fixed_sjis(/\xc0\xa1#{}/s) assert_regexp_fixed_sjis(/\xc2\xa1#{}/s)
assert_raise(RegexpError) { eval('/\x81#{}/s') } assert_raise(RegexpError) { eval('/\x81#{}/s') }
assert_raise(RegexpError) { eval('/#{}\x81/s') } assert_raise(RegexpError) { eval('/#{}\x81/s') }
#assert_raise(SyntaxError) { eval('/\x81#{}\xa1/s') } #assert_raise(SyntaxError) { eval('/\x81#{}\xa1/s') }
@ -332,49 +364,49 @@ class TestM17N < Test::Unit::TestCase
def test_dynamic_utf8_regexp def test_dynamic_utf8_regexp
assert_regexp_fixed_utf8(/#{}/u) assert_regexp_fixed_utf8(/#{}/u)
assert_regexp_fixed_utf8(/#{}\xc0\xa1/u) assert_regexp_fixed_utf8(/#{}\xc2\xa1/u)
assert_regexp_fixed_utf8(/\xc0\xa1#{}/u) assert_regexp_fixed_utf8(/\xc2\xa1#{}/u)
assert_raise(RegexpError) { eval('/\xc0#{}/u') } assert_raise(RegexpError) { eval('/\xc2#{}/u') }
assert_raise(RegexpError) { eval('/#{}\xc0/u') } assert_raise(RegexpError) { eval('/#{}\xc2/u') }
#assert_raise(SyntaxError) { eval('/\xc0#{}\xa1/u') } #assert_raise(SyntaxError) { eval('/\xc2#{}\xa1/u') }
#assert_raise(SyntaxError) { s1, s2 = u('\xc0'), u('\xa1'); /#{s1}#{s2}/ } #assert_raise(SyntaxError) { s1, s2 = u('\xc2'), u('\xa1'); /#{s1}#{s2}/ }
end end
def test_regexp_mixed_unicode def test_regexp_mixed_unicode
assert_raise(SyntaxError) { eval(a(%{/\xc0\xa0\\u{6666}/})) } assert_raise(SyntaxError) { eval(a(%{/\xc2\xa0\\u{6666}/})) }
assert_raise(SyntaxError) { eval(e(%{/\xc0\xa0\\u{6666}/})) } assert_raise(SyntaxError) { eval(e(%{/\xc2\xa0\\u{6666}/})) }
assert_raise(SyntaxError) { eval(s(%{/\xc0\xa0\\u{6666}/})) } assert_raise(SyntaxError) { eval(s(%{/\xc2\xa0\\u{6666}/})) }
assert_nothing_raised { eval(u(%{/\xc0\xa0\\u{6666}/})) } assert_nothing_raised { eval(u(%{/\xc2\xa0\\u{6666}/})) }
assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc0\xa0/})) } assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc2\xa0/})) }
assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc0\xa0/})) } assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc2\xa0/})) }
assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc0\xa0/})) } assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc2\xa0/})) }
assert_nothing_raised { eval(u(%{/\\u{6666}\xc0\xa0/})) } assert_nothing_raised { eval(u(%{/\\u{6666}\xc2\xa0/})) }
assert_raise(SyntaxError) { eval(a(%{/\\xc0\\xa0\\u{6666}/})) } assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa0\\u{6666}/})) }
assert_raise(SyntaxError) { eval(e(%{/\\xc0\\xa0\\u{6666}/})) } assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa0\\u{6666}/})) }
assert_raise(SyntaxError) { eval(s(%{/\\xc0\\xa0\\u{6666}/})) } assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa0\\u{6666}/})) }
assert_nothing_raised { eval(u(%{/\\xc0\\xa0\\u{6666}/})) } assert_nothing_raised { eval(u(%{/\\xc2\\xa0\\u{6666}/})) }
assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc0\\xa0/})) } assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc2\\xa0/})) }
assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc0\\xa0/})) } assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc2\\xa0/})) }
assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc0\\xa0/})) } assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc2\\xa0/})) }
assert_nothing_raised { eval(u(%{/\\u{6666}\\xc0\\xa0/})) } assert_nothing_raised { eval(u(%{/\\u{6666}\\xc2\\xa0/})) }
assert_raise(SyntaxError) { eval(a(%{/\xc0\xa0#{}\\u{6666}/})) } assert_raise(SyntaxError) { eval(a(%{/\xc2\xa0#{}\\u{6666}/})) }
assert_raise(SyntaxError) { eval(e(%{/\xc0\xa0#{}\\u{6666}/})) } assert_raise(SyntaxError) { eval(e(%{/\xc2\xa0#{}\\u{6666}/})) }
assert_raise(SyntaxError) { eval(s(%{/\xc0\xa0#{}\\u{6666}/})) } assert_raise(SyntaxError) { eval(s(%{/\xc2\xa0#{}\\u{6666}/})) }
assert_nothing_raised { eval(u(%{/\xc0\xa0#{}\\u{6666}/})) } assert_nothing_raised { eval(u(%{/\xc2\xa0#{}\\u{6666}/})) }
assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\xc0\xa0/})) } assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\xc2\xa0/})) }
assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\xc0\xa0/})) } assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\xc2\xa0/})) }
assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\xc0\xa0/})) } assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\xc2\xa0/})) }
assert_nothing_raised { eval(u(%{/\\u{6666}#{}\xc0\xa0/})) } assert_nothing_raised { eval(u(%{/\\u{6666}#{}\xc2\xa0/})) }
assert_raise(SyntaxError) { eval(a(%{/\\xc0\\xa0#{}\\u{6666}/})) } assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa0#{}\\u{6666}/})) }
assert_raise(SyntaxError) { eval(e(%{/\\xc0\\xa0#{}\\u{6666}/})) } assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa0#{}\\u{6666}/})) }
assert_raise(SyntaxError) { eval(s(%{/\\xc0\\xa0#{}\\u{6666}/})) } assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa0#{}\\u{6666}/})) }
assert_nothing_raised { eval(u(%{/\\xc0\\xa0#{}\\u{6666}/})) } assert_nothing_raised { eval(u(%{/\\xc2\\xa0#{}\\u{6666}/})) }
assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\\xc0\\xa0/})) } assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{}\\xc2\\xa0/})) }
assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\\xc0\\xa0/})) } assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{}\\xc2\\xa0/})) }
assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\\xc0\\xa0/})) } assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{}\\xc2\\xa0/})) }
assert_nothing_raised { eval(u(%{/\\u{6666}#{}\\xc0\\xa0/})) } assert_nothing_raised { eval(u(%{/\\u{6666}#{}\\xc2\\xa0/})) }
end end
end end