зеркало из https://github.com/github/ruby.git
* transcode.c: register_functional_transcoder() added.
(init_transcoder_table): register ISO-2022-JP. (str_transcode): add preprocessor and postprocessor. * transcode_data_japanese.c: add ISO-2022-JP support. * transcode_data.h: moved transcoder and transcoding definitions from transcode.c. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14607 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
Родитель
f23bc6b2b5
Коммит
be86e3de33
11
ChangeLog
11
ChangeLog
|
@ -1,3 +1,14 @@
|
|||
Mon Dec 24 22:46:42 2007 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* transcode.c: register_functional_transcoder() added.
|
||||
(init_transcoder_table): register ISO-2022-JP.
|
||||
(str_transcode): add preprocessor and postprocessor.
|
||||
|
||||
* transcode_data_japanese.c: add ISO-2022-JP support.
|
||||
|
||||
* transcode_data.h: moved transcoder and transcoding definitions from
|
||||
transcode.c.
|
||||
|
||||
Mon Dec 24 20:29:28 2007 Koichi Sasada <ko1@atdot.net>
|
||||
|
||||
* test/io/nonblock/test_flush.rb: fix test for 1.9.
|
||||
|
|
84
transcode.c
84
transcode.c
|
@ -60,22 +60,17 @@ extern const BYTE_LOOKUP rb_from_EUC_JP;
|
|||
extern const BYTE_LOOKUP rb_to_SHIFT_JIS;
|
||||
extern const BYTE_LOOKUP rb_to_EUC_JP;
|
||||
|
||||
/*
 * ISO-2022-JP hooks, defined outside this translation unit.  They are
 * installed as the preprocessor / postprocessor of the ISO-2022-JP
 * transcoders registered in init_transcoder_table().
 */
extern void from_iso_2022_jp_transcoder_preprocessor(
    char **in_pos, char **out_pos, char *in_stop, char *out_stop,
    struct transcoder_st *transcoder, struct transcoding *my_transcoding);

extern void to_iso_2022_jp_transcoder_postprocessor(
    char **in_pos, char **out_pos, char *in_stop, char *out_stop,
    struct transcoder_st *transcoder, struct transcoding *my_transcoding);
|
||||
|
||||
/* declarations probably need to go into separate header file, e.g. transcode.h */
|
||||
|
||||
/* static structure, one per supported encoding pair */
|
||||
typedef struct {
|
||||
const char *from_encoding;
|
||||
const char *to_encoding;
|
||||
const BYTE_LOOKUP *conv_tree_start;
|
||||
int max_output;
|
||||
int from_utf8;
|
||||
} transcoder;
|
||||
|
||||
/* todo: dynamic structure, one per conversion (stream) */
|
||||
|
||||
/* in the future, add some mechanism for dynamically adding stuff here */
|
||||
#define MAX_TRANSCODERS 33 /* todo: fix: this number has to be adjusted by hand */
|
||||
#define MAX_TRANSCODERS 35 /* todo: fix: this number has to be adjusted by hand */
|
||||
static transcoder transcoder_table[MAX_TRANSCODERS];
|
||||
|
||||
/* not sure why it's not possible to do relocatable initializations */
|
||||
|
@ -99,6 +94,29 @@ register_transcoder(const char *from_e, const char *to_e,
|
|||
n++;
|
||||
}
|
||||
|
||||
static void
|
||||
register_functional_transcoder(const char *from_e, const char *to_e,
|
||||
const BYTE_LOOKUP *tree_start, int max_output, int from_utf8,
|
||||
void (*preprocessor)(char**, char**, char*, char*, transcoder*, transcoding*),
|
||||
void (*postprocessor)(char**, char**, char*, char*, transcoder*, transcoding*))
|
||||
{
|
||||
static int n = 0;
|
||||
if (n >= MAX_TRANSCODERS) {
|
||||
/* we are initializing, is it okay to use rb_raise here? */
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
|
||||
}
|
||||
transcoder_table[n].from_encoding = from_e;
|
||||
transcoder_table[n].to_encoding = to_e;
|
||||
transcoder_table[n].conv_tree_start = tree_start;
|
||||
transcoder_table[n].max_output = max_output;
|
||||
transcoder_table[n].from_utf8 = from_utf8;
|
||||
transcoder_table[n].conv_tree_start = tree_start;
|
||||
transcoder_table[n].preprocessor = preprocessor;
|
||||
transcoder_table[n].postprocessor = postprocessor;
|
||||
|
||||
n++;
|
||||
}
|
||||
|
||||
static void
|
||||
init_transcoder_table(void)
|
||||
{
|
||||
|
@ -135,6 +153,10 @@ init_transcoder_table(void)
|
|||
register_transcoder("EUC-JP", "UTF-8", &rb_from_EUC_JP, 3, 0);
|
||||
register_transcoder("UTF-8", "SHIFT_JIS", &rb_to_SHIFT_JIS, 2, 1);
|
||||
register_transcoder("UTF-8", "EUC-JP", &rb_to_EUC_JP, 2, 1);
|
||||
register_functional_transcoder("ISO-2022-JP", "UTF-8", &rb_from_EUC_JP,
|
||||
8, 0, &from_iso_2022_jp_transcoder_preprocessor, NULL);
|
||||
register_functional_transcoder("UTF-8", "ISO-2022-JP", &rb_to_EUC_JP,
|
||||
8, 1, NULL, &to_iso_2022_jp_transcoder_postprocessor);
|
||||
|
||||
register_transcoder(NULL, NULL, NULL, 0, 0);
|
||||
}
|
||||
|
@ -165,14 +187,6 @@ transcode_dispatch(const char* from_encoding, const char* to_encoding)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
/* dynamic structure, one per conversion (similar to iconv_t) */
|
||||
/* may carry conversion state (e.g. for iso-2022-jp) */
|
||||
typedef struct transcoding {
|
||||
VALUE ruby_string_dest; /* the String used as the conversion destination,
|
||||
or NULL if something else is being converted */
|
||||
char *(*flush_func)(struct transcoding*, int, int);
|
||||
} transcoding;
|
||||
|
||||
|
||||
/*
|
||||
* Transcoding engine logic
|
||||
|
@ -331,6 +345,23 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
|
|||
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
|
||||
}
|
||||
|
||||
if (my_transcoder->preprocessor)
|
||||
{
|
||||
fromp = sp = RSTRING_PTR(str);
|
||||
slen = RSTRING_LEN(str);
|
||||
blen = slen + 30; /* len + margin */
|
||||
dest = rb_str_tmp_new(blen);
|
||||
bp = RSTRING_PTR(dest);
|
||||
my_transcoding.ruby_string_dest = dest;
|
||||
(*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
|
||||
if (fromp != sp+slen) {
|
||||
rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
|
||||
}
|
||||
buf = RSTRING_PTR(dest);
|
||||
*bp = '\0';
|
||||
rb_str_set_len(dest, bp - buf);
|
||||
str = dest;
|
||||
}
|
||||
fromp = sp = RSTRING_PTR(str);
|
||||
slen = RSTRING_LEN(str);
|
||||
blen = slen + 30; /* len + margin */
|
||||
|
@ -346,6 +377,23 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
|
|||
buf = RSTRING_PTR(dest);
|
||||
*bp = '\0';
|
||||
rb_str_set_len(dest, bp - buf);
|
||||
if (my_transcoder->postprocessor)
|
||||
{
|
||||
str = dest;
|
||||
fromp = sp = RSTRING_PTR(str);
|
||||
slen = RSTRING_LEN(str);
|
||||
blen = slen + 30; /* len + margin */
|
||||
dest = rb_str_tmp_new(blen);
|
||||
bp = RSTRING_PTR(dest);
|
||||
my_transcoding.ruby_string_dest = dest;
|
||||
(*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
|
||||
if (fromp != sp+slen) {
|
||||
rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
|
||||
}
|
||||
buf = RSTRING_PTR(dest);
|
||||
*bp = '\0';
|
||||
rb_str_set_len(dest, bp - buf);
|
||||
}
|
||||
|
||||
if (encoding_equal(my_transcoder->to_encoding, to_e)) {
|
||||
final_encoding = 1;
|
||||
|
|
|
@ -1,3 +1,20 @@
|
|||
/**********************************************************************
|
||||
|
||||
transcode_data.h -
|
||||
|
||||
$Author$
|
||||
$Date$
|
||||
created at: Mon 10 Dec 2007 14:01:47 JST 2007
|
||||
|
||||
Copyright (C) 2007 Martin Duerst
|
||||
|
||||
**********************************************************************/
|
||||
|
||||
#include "ruby/ruby.h"
|
||||
|
||||
#ifndef RUBY_TRANSCODE_DATA_H
|
||||
#define RUBY_TRANSCODE_DATA_H 1
|
||||
|
||||
typedef unsigned char base_element;
|
||||
|
||||
typedef struct byte_lookup {
|
||||
|
@ -37,3 +54,25 @@ typedef struct byte_lookup {
|
|||
#define TWOTRAIL /* legal but undefined if two more trailing UTF-8 */
|
||||
#define THREETRAIL /* legal but undefined if three more trailing UTF-8 */
|
||||
|
||||
/* dynamic structure, one per conversion (similar to iconv_t) */
|
||||
/* may carry conversion state (e.g. for iso-2022-jp) */
|
||||
typedef struct transcoding {
|
||||
VALUE ruby_string_dest; /* the String used as the conversion destination,
|
||||
or NULL if something else is being converted */
|
||||
char *(*flush_func)(struct transcoding*, int, int);
|
||||
} transcoding;
|
||||
|
||||
/* static structure, one per supported encoding pair */
|
||||
typedef struct transcoder_st{
|
||||
const char *from_encoding;
|
||||
const char *to_encoding;
|
||||
const BYTE_LOOKUP *conv_tree_start;
|
||||
int max_output;
|
||||
int from_utf8;
|
||||
void (*preprocessor)(char**, char**, char*, char*,
|
||||
struct transcoder_st *transcoder, struct transcoding*);
|
||||
void (*postprocessor)(char**, char**, char*, char*,
|
||||
struct transcoder_st *transcoder, struct transcoding*);
|
||||
} transcoder;
|
||||
|
||||
#endif /* RUBY_TRANSCODE_DATA_H */
|
||||
|
|
|
@ -23618,4 +23618,209 @@ rb_to_EUC_JP = {
|
|||
to_EUC_JP_infos
|
||||
};
|
||||
|
||||
/* Footprint (bytes): gross: 212680, saved: 50764, net: 161916 */
|
||||
/*
 * Pack an escape-sequence prefix and one more byte into a single int:
 * the prefix is shifted up by 8 bits and the byte fills the low 8 bits.
 * Both arguments are parenthesized so that arbitrary expressions can be
 * passed without operator-precedence surprises.
 */
#define ISO_2022_ENCODING(escseq, byte) (((escseq) << 8) | (byte))

/*
 * Byte(s) that follow ESC and precede the final byte of an escape
 * sequence.  Single-character members are that character itself; the
 * "DM" members pack '$' and a second character with ISO_2022_ENCODING.
 * The member names presumably follow the ISO/IEC 2022 (ECMA-35)
 * function acronyms -- confirm against the standard.
 */
enum ISO_2022_ESCSEQ {
    ISO_2022_CZD   = '!',
    ISO_2022_C1D   = '"',
    ISO_2022_GZD4  = '(',
    ISO_2022_G1D4  = ')',
    ISO_2022_G2D4  = '*',
    ISO_2022_G3D4  = '+',
    ISO_2022_G1D6  = '-',
    ISO_2022_G2D6  = '.',
    ISO_2022_G3D6  = '/',
    ISO_2022_GZDM4 = ISO_2022_ENCODING('$', '('),
    ISO_2022_G1DM4 = ISO_2022_ENCODING('$', ')'),
    ISO_2022_G2DM4 = ISO_2022_ENCODING('$', '*'),
    ISO_2022_G3DM4 = ISO_2022_ENCODING('$', '+'),
    ISO_2022_G1DM6 = ISO_2022_ENCODING('$', '-'),
    ISO_2022_G2DM6 = ISO_2022_ENCODING('$', '.'),
    ISO_2022_G3DM6 = ISO_2022_ENCODING('$', '/'),
    ISO_2022_DOCS  = ISO_2022_ENCODING('%', 'I'),
    ISO_2022_IRR   = '&'
};

/*
 * Mode values: an escape prefix from the enum above packed together with
 * the final byte of the sequence.  Each value identifies the character
 * set named by the macro.
 */
#define ISO_2022_GZ_ASCII               ISO_2022_ENCODING(ISO_2022_GZD4, 'B')
#define ISO_2022_GZ_JIS_X_0201_Katakana ISO_2022_ENCODING(ISO_2022_GZD4, 'I')
#define ISO_2022_GZ_JIS_X_0201_Roman    ISO_2022_ENCODING(ISO_2022_GZD4, 'J')
#define ISO_2022_GZ_JIS_C_6226_1978     ISO_2022_ENCODING(ISO_2022_GZDM4, '@')
#define ISO_2022_GZ_JIS_X_0208_1983     ISO_2022_ENCODING(ISO_2022_GZDM4, 'B')
#define ISO_2022_GZ_JIS_X_0212_1990     ISO_2022_ENCODING(ISO_2022_GZDM4, 'D')
#define ISO_2022_GZ_JIS_X_0213_2000_1   ISO_2022_ENCODING(ISO_2022_GZDM4, 'O')
#define ISO_2022_GZ_JIS_X_0213_2000_2   ISO_2022_ENCODING(ISO_2022_GZDM4, 'P')
#define ISO_2022_GZ_JIS_X_0213_2004_1   ISO_2022_ENCODING(ISO_2022_GZDM4, 'Q')
|
||||
|
||||
static int
|
||||
get_iso_2022_mode(char **in_pos)
|
||||
{
|
||||
int new_mode;
|
||||
char *in_p = *in_pos;
|
||||
switch (*in_p++)
|
||||
{
|
||||
case '(':
|
||||
switch (*in_p++)
|
||||
{
|
||||
case 'B': case 'I': case 'J':
|
||||
new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1));
|
||||
break;
|
||||
default:
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC ( %c)", *(in_p-1));
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case '$':
|
||||
switch (*in_p++)
|
||||
{
|
||||
case '@': case 'A': case 'B':
|
||||
new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
|
||||
break;
|
||||
case '(':
|
||||
switch (*in_p++)
|
||||
{
|
||||
case 'D': case 'O': case 'P': case 'Q':
|
||||
new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
|
||||
break;
|
||||
default:
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC $ ( %c)", *(in_p-1));
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC $ %c)", *(in_p-1));
|
||||
break;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported (ESC %c)", *(in_p-1));
|
||||
break;
|
||||
}
|
||||
*in_pos = in_p;
|
||||
return new_mode;
|
||||
}
|
||||
|
||||
void
|
||||
from_iso_2022_jp_transcoder_preprocessor(char **in_pos, char **out_pos,
|
||||
char *in_stop, char *out_stop,
|
||||
transcoder *my_transcoder,
|
||||
transcoding *my_transcoding)
|
||||
{
|
||||
char *in_p = *in_pos, *out_p = *out_pos;
|
||||
int cur_mode = ISO_2022_GZ_ASCII;
|
||||
unsigned char c1;
|
||||
char *out_s = out_stop - my_transcoder->max_output + 1;
|
||||
while (in_p < in_stop) {
|
||||
if (out_p >= out_s) {
|
||||
int len = (out_p - *out_pos);
|
||||
int new_len = (len + my_transcoder->max_output) * 2;
|
||||
*out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
|
||||
out_p = *out_pos + len;
|
||||
out_s = *out_pos + new_len - my_transcoder->max_output;
|
||||
}
|
||||
c1 = *in_p++;
|
||||
if (c1 == 0x1B) {
|
||||
cur_mode = get_iso_2022_mode(&in_p);
|
||||
} else if (c1 == 0x1E || c1 == 0x1F) {
|
||||
/* SHIFT */
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "shift is not supported");
|
||||
} else if (c1 >= 0x80) {
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "illegal byte sequence");
|
||||
} else {
|
||||
switch (cur_mode) {
|
||||
case ISO_2022_GZ_ASCII:
|
||||
case ISO_2022_GZ_JIS_X_0201_Roman:
|
||||
*out_p++ = c1;
|
||||
break;
|
||||
case ISO_2022_GZ_JIS_X_0201_Katakana:
|
||||
*out_p++ = 0x8E;
|
||||
*out_p++ = c1 | 0x80;
|
||||
break;
|
||||
case ISO_2022_GZ_JIS_X_0212_1990:
|
||||
*out_p++ = 0x8F;
|
||||
case ISO_2022_GZ_JIS_C_6226_1978:
|
||||
case ISO_2022_GZ_JIS_X_0208_1983:
|
||||
*out_p++ = c1 | 0x80;
|
||||
*out_p++ = *in_p++ | 0x80;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* cleanup */
|
||||
*in_pos = in_p;
|
||||
*out_pos = out_p;
|
||||
}
|
||||
|
||||
static int
|
||||
select_iso_2022_mode(char **out_pos, int new_mode)
|
||||
{
|
||||
char *out_p = *out_pos;
|
||||
*out_p++ = '\e';
|
||||
switch (new_mode>>8)
|
||||
{
|
||||
case ISO_2022_GZD4:
|
||||
*out_p++ = new_mode >> 8;
|
||||
*out_p++ = new_mode & 0x7F;
|
||||
break;
|
||||
case ISO_2022_GZDM4:
|
||||
*out_p++ = new_mode >> 16;
|
||||
if ((new_mode & 0x7F) != '@' &&
|
||||
(new_mode & 0x7F) != 'A' &&
|
||||
(new_mode & 0x7F) != 'B')
|
||||
{
|
||||
*out_p++ = (new_mode>>8) & 0x7F;
|
||||
}
|
||||
*out_p++ = new_mode & 0x7F;
|
||||
break;
|
||||
default:
|
||||
rb_raise(rb_eRuntimeError /*change exception*/, "this mode is not supported.");
|
||||
break;
|
||||
}
|
||||
*out_pos = out_p;
|
||||
return new_mode;
|
||||
}
|
||||
|
||||
void
|
||||
to_iso_2022_jp_transcoder_postprocessor(char **in_pos, char **out_pos,
|
||||
char *in_stop, char *out_stop,
|
||||
transcoder *my_transcoder,
|
||||
transcoding *my_transcoding)
|
||||
{
|
||||
char *in_p = *in_pos, *out_p = *out_pos;
|
||||
int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0;
|
||||
unsigned char next_byte;
|
||||
char *out_s = out_stop - my_transcoder->max_output + 1;
|
||||
while (in_p < in_stop) {
|
||||
if (out_p >= out_s) {
|
||||
int len = (out_p - *out_pos);
|
||||
int new_len = (len + my_transcoder->max_output) * 2;
|
||||
*out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
|
||||
out_p = *out_pos + len;
|
||||
out_s = *out_pos + new_len - my_transcoder->max_output;
|
||||
}
|
||||
next_byte = *in_p++;
|
||||
if (next_byte < 0x80) {
|
||||
new_mode = ISO_2022_GZ_ASCII;
|
||||
} else if (next_byte == 0x8E) {
|
||||
new_mode = ISO_2022_GZ_JIS_X_0201_Katakana;
|
||||
next_byte = *in_p++;
|
||||
} else if (next_byte == 0x8F) {
|
||||
new_mode = ISO_2022_GZ_JIS_X_0212_1990;
|
||||
next_byte = *in_p++;
|
||||
} else {
|
||||
new_mode = ISO_2022_GZ_JIS_X_0208_1983;
|
||||
}
|
||||
if (cur_mode != new_mode)
|
||||
cur_mode = select_iso_2022_mode(&out_p, new_mode);
|
||||
if (cur_mode < 0xFFFF) {
|
||||
*out_p++ = next_byte & 0x7F;
|
||||
} else {
|
||||
*out_p++ = next_byte & 0x7F;
|
||||
*out_p++ = *in_p++ & 0x7F;
|
||||
}
|
||||
}
|
||||
if (cur_mode != ISO_2022_GZ_ASCII)
|
||||
cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII);
|
||||
/* cleanup */
|
||||
*in_pos = in_p;
|
||||
*out_pos = out_p;
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче