ruby/ext/psych/psych_parser.c

598 строки
15 KiB
C

#include <psych.h>
/* Class object for the parser; assigned in Init_psych_parser. */
VALUE cPsychParser;
/* Exception class raised on parse failure; looked up in Init_psych_parser. */
VALUE ePsychSyntaxError;
/*
 * Method-name IDs used for rb_funcall dispatch. Each one is interned once in
 * Init_psych_parser: id_read and id_path are sent to the YAML source object,
 * the remaining IDs are sent to the event handler.
 */
static ID id_read;
static ID id_path;
static ID id_empty;
static ID id_start_stream;
static ID id_end_stream;
static ID id_start_document;
static ID id_end_document;
static ID id_alias;
static ID id_scalar;
static ID id_start_sequence;
static ID id_end_sequence;
static ID id_start_mapping;
static ID id_end_mapping;
/*
 * Tag the Ruby string _str with the encoding index _yaml_enc and, when
 * _internal_enc is non-null, replace _str with the result of exporting it to
 * that encoding. _str must be an assignable lvalue; _str and _internal_enc
 * are each expanded more than once, so pass side-effect-free expressions.
 */
#define PSYCH_TRANSCODE(_str, _yaml_enc, _internal_enc) \
    do { \
        rb_enc_associate_index((_str), (_yaml_enc)); \
        if (_internal_enc) { \
            (_str) = rb_str_export_to_enc((_str), (_internal_enc)); \
        } \
    } while (0)
/*
 * Read callback installed with yaml_parser_set_input(): pulls the next chunk
 * of input by calling `read(size)` on the Ruby object passed as `data`.
 *
 * data - the Ruby IO-like object, smuggled through as a void pointer
 * buf  - destination buffer supplied by the caller; holds at most `size` bytes
 * size - maximum number of bytes that may be written to buf
 * read - out: number of bytes actually stored in buf (0 when `read` gave nil)
 *
 * Returns 1 on success (including end of input) and 0 when the object handed
 * back more bytes than were requested, so that the oversized result is
 * reported as a read failure instead of being copied past the end of buf.
 */
static int io_reader(void * data, unsigned char *buf, size_t size, size_t *read)
{
    VALUE io = (VALUE)data;
    /* SIZET2NUM keeps the full size_t range; INT2NUM would narrow it to int. */
    VALUE string = rb_funcall(io, id_read, 1, SIZET2NUM(size));

    *read = 0;

    if (!NIL_P(string)) {
        const char *bytes = StringValuePtr(string);
        size_t length = (size_t)RSTRING_LEN(string);

        /* A misbehaving #read may ignore the requested length; never
         * write beyond the buffer the caller gave us. */
        if (length > size) {
            return 0;
        }

        memcpy(buf, bytes, length);
        *read = length;
    }

    return 1;
}
/*
 * Free callback referenced from psych_parser_type: tears down the libyaml
 * parser state stored at `ptr`, then releases the struct itself with xfree.
 */
static void dealloc(void * ptr)
{
    yaml_parser_t *state = (yaml_parser_t *)ptr;

    yaml_parser_delete(state);
    xfree(state);
}
/*
 * Disabled (compiled out by `#if 0`): placeholder size callback for the
 * parser's typed-data descriptor. As written it reports 0 bytes and does not
 * inspect the parser it is given.
 */
#if 0
static size_t memsize(const void *ptr)
{
const yaml_parser_t *parser = ptr;
/* TODO: calculate parser's size */
return 0;
}
#endif
/*
 * Typed-data descriptor used by allocate(), parse() and mark() to wrap and
 * unwrap the yaml_parser_t owned by each parser object.
 */
static const rb_data_type_t psych_parser_type = {
/* Type name reported for wrapped objects. */
"Psych/parser",
/* Callback table: only the free callback (dealloc) is provided. */
{0, dealloc, 0,},
0, 0,
#ifdef RUBY_TYPED_FREE_IMMEDIATELY
/* Only set on Ruby versions that define this flag. */
RUBY_TYPED_FREE_IMMEDIATELY,
#endif
};
/*
 * Allocation function for the parser class: creates an instance of `klass`
 * wrapping a new yaml_parser_t and initializes that libyaml parser.
 *
 * Returns the new Ruby object.
 */
static VALUE allocate(VALUE klass)
{
    yaml_parser_t *state;
    VALUE wrapper;

    wrapper = TypedData_Make_Struct(klass, yaml_parser_t, &psych_parser_type, state);
    yaml_parser_initialize(state);

    return wrapper;
}
/*
 * Build (but do not raise) an ePsychSyntaxError describing the parser's
 * current error state.
 *
 * parser - libyaml parser whose context_mark, problem_offset, problem and
 *          context fields are read
 * path   - Ruby value passed through as the first constructor argument
 *
 * Line and column are reported 1-based (context_mark value + 1). The problem
 * and context messages become US-ASCII strings, or nil when libyaml left
 * them unset.
 *
 * Returns the new exception object.
 */
static VALUE make_exception(yaml_parser_t * parser, VALUE path)
{
    size_t line = parser->context_mark.line + 1;
    size_t column = parser->context_mark.column + 1;

    /* These are size_t quantities: SIZET2NUM converts them without the
     * narrowing to int that INT2NUM would apply. */
    return rb_funcall(ePsychSyntaxError, rb_intern("new"), 6,
        path,
        SIZET2NUM(line),
        SIZET2NUM(column),
        SIZET2NUM(parser->problem_offset),
        parser->problem ? rb_usascii_str_new2(parser->problem) : Qnil,
        parser->context ? rb_usascii_str_new2(parser->context) : Qnil);
}
#ifdef HAVE_RUBY_ENCODING_H
/*
 * Choose the libyaml input encoding for a Ruby string source.
 *
 * src             - the YAML source string
 * parser_encoding - out: YAML_UTF8_ENCODING, YAML_UTF16LE_ENCODING or
 *                   YAML_UTF16BE_ENCODING
 *
 * Strings already in UTF-8, UTF-16LE or UTF-16BE are returned unchanged with
 * the matching constant; any other string is exported to UTF-8 first.
 *
 * Returns the string the parser should read from.
 */
static VALUE transcode_string(VALUE src, int * parser_encoding)
{
    int utf8_idx = rb_utf8_encindex();
    int utf16le_idx = rb_enc_find_index("UTF-16LE");
    int utf16be_idx = rb_enc_find_index("UTF-16BE");
    int src_idx = rb_enc_get_index(src);

    if (src_idx == utf8_idx) {
        *parser_encoding = YAML_UTF8_ENCODING;
    }
    else if (src_idx == utf16le_idx) {
        *parser_encoding = YAML_UTF16LE_ENCODING;
    }
    else if (src_idx == utf16be_idx) {
        *parser_encoding = YAML_UTF16BE_ENCODING;
    }
    else {
        /* Not an encoding handled above: convert to UTF-8. */
        src = rb_str_export_to_enc(src, rb_utf8_encoding());
        RB_GC_GUARD(src);
        *parser_encoding = YAML_UTF8_ENCODING;
    }

    return src;
}
/*
 * Choose the libyaml input encoding for an IO-like source by asking it for
 * its external_encoding.
 *
 * src             - the IO-like object (returned unchanged)
 * parser_encoding - out: the YAML_*_ENCODING constant to configure
 *
 * US-ASCII and UTF-8 map to YAML_UTF8_ENCODING, UTF-16LE/BE map to their
 * respective constants, and everything else (including ASCII-8BIT, which is
 * also assumed when no encoding is reported) maps to YAML_ANY_ENCODING.
 */
static VALUE transcode_io(VALUE src, int * parser_encoding)
{
    VALUE ext_enc = rb_funcall(src, rb_intern("external_encoding"), 0);
    int enc_idx;
    int selected;

    /* if no encoding is returned, assume ascii8bit. */
    enc_idx = NIL_P(ext_enc) ? rb_ascii8bit_encindex()
                             : rb_to_encoding_index(ext_enc);

    if (enc_idx == rb_usascii_encindex() || enc_idx == rb_utf8_encindex()) {
        /* US-ASCII is treated as UTF-8. */
        selected = YAML_UTF8_ENCODING;
    }
    else if (enc_idx == rb_enc_find_index("UTF-16LE")) {
        selected = YAML_UTF16LE_ENCODING;
    }
    else if (enc_idx == rb_enc_find_index("UTF-16BE")) {
        selected = YAML_UTF16BE_ENCODING;
    }
    else {
        /* ASCII-8BIT, or an encoding we don't know how to handle:
         * fall back to YAML_ANY_ENCODING. */
        selected = YAML_ANY_ENCODING;
    }

    *parser_encoding = selected;
    return src;
}
#endif
/*
 * rb_protect trampoline. `pointer` is a VALUE array { handler, encoding };
 * sends start_stream(encoding) to the handler and returns its result.
 */
static VALUE protected_start_stream(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];
    VALUE encoding = packed[1];

    return rb_funcall(receiver, id_start_stream, 1, encoding);
}
/*
 * rb_protect trampoline. `pointer` is a VALUE array whose first slot is the
 * handler and whose next three slots are the start_document arguments.
 */
static VALUE protected_start_document(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];

    return rb_funcall3(receiver, id_start_document, 3, &packed[1]);
}
/*
 * rb_protect trampoline. `pointer` is a VALUE array { handler, implicit };
 * sends end_document(implicit) to the handler and returns its result.
 */
static VALUE protected_end_document(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];
    VALUE implicit = packed[1];

    return rb_funcall(receiver, id_end_document, 1, implicit);
}
/*
 * rb_protect trampoline. `pointer` is a VALUE array { handler, anchor };
 * sends alias(anchor) to the handler and returns its result.
 */
static VALUE protected_alias(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];
    VALUE anchor = packed[1];

    return rb_funcall(receiver, id_alias, 1, anchor);
}
/*
 * rb_protect trampoline. `pointer` is a VALUE array whose first slot is the
 * handler and whose next six slots are the scalar arguments.
 */
static VALUE protected_scalar(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];

    return rb_funcall3(receiver, id_scalar, 6, &packed[1]);
}
/*
 * rb_protect trampoline. `pointer` is a VALUE array whose first slot is the
 * handler and whose next four slots are the start_sequence arguments.
 */
static VALUE protected_start_sequence(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];

    return rb_funcall3(receiver, id_start_sequence, 4, &packed[1]);
}
/*
 * rb_protect trampoline: sends end_sequence (no arguments) to `handler` and
 * returns its result.
 */
static VALUE protected_end_sequence(VALUE handler)
{
    VALUE result = rb_funcall(handler, id_end_sequence, 0);

    return result;
}
/*
 * rb_protect trampoline. `pointer` is a VALUE array whose first slot is the
 * handler and whose next four slots are the start_mapping arguments.
 */
static VALUE protected_start_mapping(VALUE pointer)
{
    VALUE *packed = (VALUE *)pointer;
    VALUE receiver = packed[0];

    return rb_funcall3(receiver, id_start_mapping, 4, &packed[1]);
}
/*
 * rb_protect trampoline: sends end_mapping (no arguments) to `handler` and
 * returns its result.
 */
static VALUE protected_end_mapping(VALUE handler)
{
    VALUE result = rb_funcall(handler, id_end_mapping, 0);

    return result;
}
/*
 * rb_protect trampoline: sends empty (no arguments) to `handler` and returns
 * its result.
 */
static VALUE protected_empty(VALUE handler)
{
    VALUE result = rb_funcall(handler, id_empty, 0);

    return result;
}
/*
 * rb_protect trampoline: sends end_stream (no arguments) to `handler` and
 * returns its result.
 */
static VALUE protected_end_stream(VALUE handler)
{
    VALUE result = rb_funcall(handler, id_end_stream, 0);

    return result;
}
/*
 * call-seq:
 *    parser.parse(yaml)
 *    parser.parse(yaml, path)
 *
 * Parse the YAML document contained in +yaml+.  Events will be called on
 * the handler set on the parser instance.
 *
 * +yaml+ may be any object responding to +read+, or a value convertible to
 * a String.  +path+ is used when reporting syntax errors; when omitted it is
 * taken from <tt>yaml.path</tt> if available, otherwise "<unknown>".
 *
 * Raises Psych::SyntaxError when the underlying parser reports an error, and
 * re-raises any exception raised by a handler callback.  Returns the parser.
 *
 * See Psych::Parser and Psych::Parser#handler
 */
static VALUE parse(int argc, VALUE *argv, VALUE self)
{
    VALUE yaml, path;
    yaml_parser_t * parser;
    yaml_event_t event;
    int done = 0;
    int tainted = 0;
    int state = 0;
    int parser_encoding = YAML_ANY_ENCODING;
#ifdef HAVE_RUBY_ENCODING_H
    /* Strings built from event data are tagged as UTF-8 and, when a default
     * internal encoding is set, converted to it (see PSYCH_TRANSCODE). */
    int encoding = rb_utf8_encindex();
    rb_encoding * internal_enc = rb_default_internal_encoding();
#endif
    VALUE handler = rb_iv_get(self, "@handler");

    /* One required argument (yaml) and one optional (path). */
    if (rb_scan_args(argc, argv, "11", &yaml, &path) == 1) {
        if(rb_respond_to(yaml, id_path))
            path = rb_funcall(yaml, id_path, 0);
        else
            path = rb_str_new2("<unknown>");
    }

    TypedData_Get_Struct(self, yaml_parser_t, &psych_parser_type, parser);

    /* Reset the parser so every call starts from a clean state. */
    yaml_parser_delete(parser);
    yaml_parser_initialize(parser);

    if (OBJ_TAINTED(yaml)) tainted = 1;

    if (rb_respond_to(yaml, id_read)) {
#ifdef HAVE_RUBY_ENCODING_H
        yaml = transcode_io(yaml, &parser_encoding);
        yaml_parser_set_encoding(parser, parser_encoding);
#endif
        yaml_parser_set_input(parser, io_reader, (void *)yaml);
        if (RTEST(rb_obj_is_kind_of(yaml, rb_cIO))) tainted = 1;
    } else {
        StringValue(yaml);
#ifdef HAVE_RUBY_ENCODING_H
        yaml = transcode_string(yaml, &parser_encoding);
        yaml_parser_set_encoding(parser, parser_encoding);
#endif
        yaml_parser_set_input_string(
            parser,
            (const unsigned char *)RSTRING_PTR(yaml),
            (size_t)RSTRING_LEN(yaml)
        );
    }

    while(!done) {
        if(!yaml_parser_parse(parser, &event)) {
            VALUE exception;

            /* Capture the error details before resetting the parser. */
            exception = make_exception(parser, path);
            yaml_parser_delete(parser);
            yaml_parser_initialize(parser);

            rb_exc_raise(exception);
        }

        /* Each handler call goes through rb_protect so that the event can be
         * released (yaml_event_delete below) before any exception raised by
         * the handler is propagated. */
        switch(event.type) {
          case YAML_STREAM_START_EVENT:
            {
                VALUE args[2];

                args[0] = handler;
                args[1] = INT2NUM((long)event.data.stream_start.encoding);
                rb_protect(protected_start_stream, (VALUE)args, &state);
            }
            break;
          case YAML_DOCUMENT_START_EVENT:
            {
                VALUE args[4];
                /* Get a list of tag directives (if any) */
                VALUE tag_directives = rb_ary_new();
                /* Grab the document version */
                VALUE version = event.data.document_start.version_directive ?
                    rb_ary_new3(
                        (long)2,
                        INT2NUM((long)event.data.document_start.version_directive->major),
                        INT2NUM((long)event.data.document_start.version_directive->minor)
                    ) : rb_ary_new();

                if(event.data.document_start.tag_directives.start) {
                    yaml_tag_directive_t *start =
                        event.data.document_start.tag_directives.start;
                    yaml_tag_directive_t *end =
                        event.data.document_start.tag_directives.end;
                    for(; start != end; start++) {
                        VALUE handle = Qnil;
                        VALUE prefix = Qnil;
                        if(start->handle) {
                            handle = rb_str_new2((const char *)start->handle);
                            if (tainted) OBJ_TAINT(handle);
#ifdef HAVE_RUBY_ENCODING_H
                            PSYCH_TRANSCODE(handle, encoding, internal_enc);
#endif
                        }

                        if(start->prefix) {
                            prefix = rb_str_new2((const char *)start->prefix);
                            if (tainted) OBJ_TAINT(prefix);
#ifdef HAVE_RUBY_ENCODING_H
                            PSYCH_TRANSCODE(prefix, encoding, internal_enc);
#endif
                        }

                        rb_ary_push(tag_directives, rb_ary_new3((long)2, handle, prefix));
                    }
                }
                args[0] = handler;
                args[1] = version;
                args[2] = tag_directives;
                args[3] = event.data.document_start.implicit == 1 ? Qtrue : Qfalse;
                rb_protect(protected_start_document, (VALUE)args, &state);
            }
            break;
          case YAML_DOCUMENT_END_EVENT:
            {
                VALUE args[2];

                args[0] = handler;
                args[1] = event.data.document_end.implicit == 1 ? Qtrue : Qfalse;
                rb_protect(protected_end_document, (VALUE)args, &state);
            }
            break;
          case YAML_ALIAS_EVENT:
            {
                VALUE args[2];
                VALUE alias = Qnil;
                if(event.data.alias.anchor) {
                    alias = rb_str_new2((const char *)event.data.alias.anchor);
                    if (tainted) OBJ_TAINT(alias);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(alias, encoding, internal_enc);
#endif
                }

                args[0] = handler;
                args[1] = alias;
                rb_protect(protected_alias, (VALUE)args, &state);
            }
            break;
          case YAML_SCALAR_EVENT:
            {
                VALUE args[7];
                VALUE anchor = Qnil;
                VALUE tag = Qnil;
                VALUE plain_implicit, quoted_implicit, style;
                /* Length-based construction: the value is not treated as a
                 * NUL-terminated C string. */
                VALUE val = rb_str_new(
                    (const char *)event.data.scalar.value,
                    (long)event.data.scalar.length
                );
                if (tainted) OBJ_TAINT(val);

#ifdef HAVE_RUBY_ENCODING_H
                PSYCH_TRANSCODE(val, encoding, internal_enc);
#endif

                if(event.data.scalar.anchor) {
                    anchor = rb_str_new2((const char *)event.data.scalar.anchor);
                    if (tainted) OBJ_TAINT(anchor);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(anchor, encoding, internal_enc);
#endif
                }

                if(event.data.scalar.tag) {
                    tag = rb_str_new2((const char *)event.data.scalar.tag);
                    if (tainted) OBJ_TAINT(tag);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(tag, encoding, internal_enc);
#endif
                }

                plain_implicit =
                    event.data.scalar.plain_implicit == 0 ? Qfalse : Qtrue;

                quoted_implicit =
                    event.data.scalar.quoted_implicit == 0 ? Qfalse : Qtrue;

                style = INT2NUM((long)event.data.scalar.style);

                args[0] = handler;
                args[1] = val;
                args[2] = anchor;
                args[3] = tag;
                args[4] = plain_implicit;
                args[5] = quoted_implicit;
                args[6] = style;
                rb_protect(protected_scalar, (VALUE)args, &state);
            }
            break;
          case YAML_SEQUENCE_START_EVENT:
            {
                VALUE args[5];
                VALUE anchor = Qnil;
                VALUE tag = Qnil;
                VALUE implicit, style;
                if(event.data.sequence_start.anchor) {
                    anchor = rb_str_new2((const char *)event.data.sequence_start.anchor);
                    if (tainted) OBJ_TAINT(anchor);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(anchor, encoding, internal_enc);
#endif
                }

                if(event.data.sequence_start.tag) {
                    tag = rb_str_new2((const char *)event.data.sequence_start.tag);
                    if (tainted) OBJ_TAINT(tag);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(tag, encoding, internal_enc);
#endif
                }

                implicit =
                    event.data.sequence_start.implicit == 0 ? Qfalse : Qtrue;

                style = INT2NUM((long)event.data.sequence_start.style);

                args[0] = handler;
                args[1] = anchor;
                args[2] = tag;
                args[3] = implicit;
                args[4] = style;

                rb_protect(protected_start_sequence, (VALUE)args, &state);
            }
            break;
          case YAML_SEQUENCE_END_EVENT:
            rb_protect(protected_end_sequence, handler, &state);
            break;
          case YAML_MAPPING_START_EVENT:
            {
                VALUE args[5];
                VALUE anchor = Qnil;
                VALUE tag = Qnil;
                VALUE implicit, style;
                if(event.data.mapping_start.anchor) {
                    anchor = rb_str_new2((const char *)event.data.mapping_start.anchor);
                    if (tainted) OBJ_TAINT(anchor);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(anchor, encoding, internal_enc);
#endif
                }

                if(event.data.mapping_start.tag) {
                    tag = rb_str_new2((const char *)event.data.mapping_start.tag);
                    if (tainted) OBJ_TAINT(tag);
#ifdef HAVE_RUBY_ENCODING_H
                    PSYCH_TRANSCODE(tag, encoding, internal_enc);
#endif
                }

                implicit =
                    event.data.mapping_start.implicit == 0 ? Qfalse : Qtrue;

                style = INT2NUM((long)event.data.mapping_start.style);

                args[0] = handler;
                args[1] = anchor;
                args[2] = tag;
                args[3] = implicit;
                args[4] = style;

                rb_protect(protected_start_mapping, (VALUE)args, &state);
            }
            break;
          case YAML_MAPPING_END_EVENT:
            rb_protect(protected_end_mapping, handler, &state);
            break;
          case YAML_NO_EVENT:
            rb_protect(protected_empty, handler, &state);
            break;
          case YAML_STREAM_END_EVENT:
            rb_protect(protected_end_stream, handler, &state);
            done = 1;
            break;
        }
        yaml_event_delete(&event);
        /* Re-raise whatever the handler raised, now that the event is freed. */
        if (state) rb_jump_tag(state);
    }

    /* The parser was handed a raw pointer to the string's bytes (or the IO
     * VALUE itself) and keeps using it for the whole loop above, while `yaml`
     * is otherwise unused after input setup. Keep it visibly alive so a
     * freshly transcoded string cannot be collected during handler calls. */
    RB_GC_GUARD(yaml);

    return self;
}
/*
 * call-seq:
 *    parser.mark # => #<Psych::Parser::Mark>
 *
 * Returns a Psych::Parser::Mark object that contains line, column, and index
 * information.
 *
 * The Mark is constructed with the parser's current index, line and column,
 * in that order.
 */
static VALUE mark(VALUE self)
{
    VALUE mark_klass;
    VALUE args[3];
    yaml_parser_t * parser;

    TypedData_Get_Struct(self, yaml_parser_t, &psych_parser_type, parser);
    mark_klass = rb_const_get_at(cPsychParser, rb_intern("Mark"));

    /* The mark fields are size_t: SIZET2NUM converts them without the
     * narrowing to int that INT2NUM would apply. */
    args[0] = SIZET2NUM(parser->mark.index);
    args[1] = SIZET2NUM(parser->mark.line);
    args[2] = SIZET2NUM(parser->mark.column);

    return rb_class_new_instance(3, args, mark_klass);
}
/*
 * Extension initializer for the parser: defines the Parser class under
 * mPsych with its allocator, encoding constants and instance methods, loads
 * psych/syntax_error to obtain the SyntaxError class, and interns the
 * method-name IDs used by this file.
 */
void Init_psych_parser(void)
{
/* Never compiled; presumably kept so documentation tooling can see where
 * the Psych module comes from -- TODO confirm. */
#if 0
mPsych = rb_define_module("Psych");
#endif
cPsychParser = rb_define_class_under(mPsych, "Parser", rb_cObject);
rb_define_alloc_func(cPsychParser, allocate);
/* Any encoding: Let the parser choose the encoding */
rb_define_const(cPsychParser, "ANY", INT2NUM(YAML_ANY_ENCODING));
/* UTF-8 Encoding */
rb_define_const(cPsychParser, "UTF8", INT2NUM(YAML_UTF8_ENCODING));
/* UTF-16-LE Encoding with BOM */
rb_define_const(cPsychParser, "UTF16LE", INT2NUM(YAML_UTF16LE_ENCODING));
/* UTF-16-BE Encoding with BOM */
rb_define_const(cPsychParser, "UTF16BE", INT2NUM(YAML_UTF16BE_ENCODING));
/* The required file is expected to define SyntaxError under mPsych, which
 * the constant lookup on the next line relies on. */
rb_require("psych/syntax_error");
ePsychSyntaxError = rb_const_get(mPsych, rb_intern("SyntaxError"));
/* parse takes a variable number of arguments (-1); mark takes none. */
rb_define_method(cPsychParser, "parse", parse, -1);
rb_define_method(cPsychParser, "mark", mark, 0);
/* Intern the method names once so later calls can reuse the IDs. */
id_read = rb_intern("read");
id_path = rb_intern("path");
id_empty = rb_intern("empty");
id_start_stream = rb_intern("start_stream");
id_end_stream = rb_intern("end_stream");
id_start_document = rb_intern("start_document");
id_end_document = rb_intern("end_document");
id_alias = rb_intern("alias");
id_scalar = rb_intern("scalar");
id_start_sequence = rb_intern("start_sequence");
id_end_sequence = rb_intern("end_sequence");
id_start_mapping = rb_intern("start_mapping");
id_end_mapping = rb_intern("end_mapping");
}
/* vim: set noet sws=4 sw=4: */