[ruby/prism] Provide options for reducing size

https://github.com/ruby/prism/commit/592128de4d
This commit is contained in:
Kevin Newton 2024-03-20 10:08:13 -04:00
Родитель 0e8b6c62a4
Коммит af7bf9e0d8
17 изменённых файлов: 393 добавлений и 254 удалений

Просмотреть файл

@ -2358,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
#ifndef PRISM_ENCODING_EXCLUDE_FULL
static pm_unicode_codepoint_t
pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
if (b[0] < 0x80) {
@ -2452,6 +2454,8 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
}
}
#endif
#undef UNICODE_ALPHA_CODEPOINTS_LENGTH
#undef UNICODE_ALNUM_CODEPOINTS_LENGTH
#undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
@ -2480,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
};
#ifndef PRISM_ENCODING_EXCLUDE_FULL
/**
* Each element of the following table contains a bitfield that indicates a
* piece of information about the corresponding CP850 character.
@ -3918,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258)
PRISM_ENCODING_TABLE(windows_874)
#undef PRISM_ENCODING_TABLE
#endif
/**
* Returns the size of the next character in the ASCII encoding. This basically
@ -3975,6 +3982,122 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_
return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
}
/**
 * For a lot of encodings the default is that they are a single byte long no
 * matter what the codepoint, so this function is shared between them.
 *
 * Both arguments are ignored (hence the unused attributes); the reported width
 * is always 1.
 */
static size_t
pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
return 1;
}
/**
 * Returns the size of the next character in the EUC-JP encoding, or 0 if a
 * character cannot be decoded from the given bytes.
 *
 * b points at the first byte of the character and n is the number of bytes
 * available starting at b. The first byte is always read; the second and third
 * bytes are only read when n says they are available.
 */
static size_t
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
    // These are the single byte characters.
    if (b[0] < 0x80) {
        return 1;
    }

    // These are the double byte characters.
    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
        return 2;
    }

    // These are the triple byte characters. Each of the two trailing bytes has
    // to be checked against both ends of the 0xA1..0xFE range; the upper bound
    // of the second byte was previously tested against b[2] by mistake, which
    // let sequences such as 8F FF A1 through.
    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[1] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
        return 3;
    }

    return 0;
}
/**
 * Returns true if the next character in the EUC-JP encoding is an uppercase
 * character, and false otherwise (including when no character can be decoded).
 *
 * Single byte characters are delegated to the ASCII check. Double byte
 * characters are uppercase when they fall into one of three fixed lead/trail
 * byte ranges. Triple byte characters are never reported as uppercase.
 */
static bool
pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
    switch (pm_encoding_euc_jp_char_width(b, n)) {
        case 1:
            return pm_encoding_ascii_isupper_char(b, n);
        case 2: {
            const uint8_t lead = b[0];
            const uint8_t trail = b[1];

            if (lead == 0xA3) return (trail >= 0xC1 && trail <= 0xDA);
            if (lead == 0xA6) return (trail >= 0xA1 && trail <= 0xB8);
            if (lead == 0xA7) return (trail >= 0xA1 && trail <= 0xC1);
            return false;
        }
        default:
            return false;
    }
}
/**
 * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
 * character cannot be decoded from the given bytes.
 *
 * The first byte is always read; the second byte is only inspected when n says
 * it is available and the first byte is a valid lead byte.
 */
static size_t
pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
    const uint8_t lead = b[0];

    // Single byte characters: everything below 0x80 plus the 0xA1..0xDF range.
    if (lead < 0x80 || (lead >= 0xA1 && lead <= 0xDF)) {
        return 1;
    }

    // Anything else needs a second byte to be decodable.
    if (n <= 1) {
        return 0;
    }

    // Lead bytes of double byte characters live in 0x81..0x9F and 0xE0..0xFC.
    const bool lead_ok = (lead >= 0x81 && lead <= 0x9F) || (lead >= 0xE0 && lead <= 0xFC);
    if (!lead_ok) {
        return 0;
    }

    // Trail bytes live in 0x40..0xFC with 0x7F excluded.
    const uint8_t trail = b[1];
    const bool trail_ok = (trail >= 0x40) && (trail <= 0xFC) && (trail != 0x7F);

    return trail_ok ? 2 : 0;
}
/**
 * Returns the size of the next character in the Shift_JIS encoding if it is an
 * alphanumeric character, or 0 otherwise.
 *
 * Every decodable double byte character, and every single byte character at or
 * above 0x80, is treated as alphanumeric. Single bytes below 0x80 are handed to
 * pm_encoding_ascii_alnum_char.
 */
static size_t
pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    // Undecodable (0) and double byte (2) results are passed straight through.
    if (width != 1) {
        return width;
    }

    if (b[0] >= 0x80) {
        return 1;
    }

    return pm_encoding_ascii_alnum_char(b, n) ? 1 : 0;
}
/**
 * Returns the size of the next character in the Shift_JIS encoding if it is an
 * alphabetical character, or 0 otherwise.
 *
 * Every decodable double byte character, and every single byte character at or
 * above 0x80, is treated as alphabetical. Single bytes below 0x80 are handed to
 * pm_encoding_ascii_alpha_char.
 */
static size_t
pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    // Undecodable (0) and double byte (2) results are passed straight through.
    if (width != 1) {
        return width;
    }

    if (b[0] >= 0x80) {
        return 1;
    }

    return pm_encoding_ascii_alpha_char(b, n) ? 1 : 0;
}
/**
 * Returns true if the next character in the Shift_JIS encoding is an uppercase
 * character, and false otherwise (including when no character can be decoded).
 *
 * Single byte characters are delegated to the ASCII check. Double byte
 * characters are uppercase when they fall into one of three fixed lead/trail
 * byte ranges.
 */
static bool
pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    if (width == 1) {
        return pm_encoding_ascii_isupper_char(b, n);
    } else if (width == 2) {
        return (
            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
        );
    } else {
        // Nothing could be decoded. Say so explicitly rather than returning the
        // size_t width from a bool function and relying on implicit conversion;
        // this matches the EUC-JP variant.
        return false;
    }
}
#ifndef PRISM_ENCODING_EXCLUDE_FULL
/**
* Certain encodings are equivalent to ASCII below 0x80, so it works for our
* purposes to have a function here that first checks the bounds and then falls
@ -3985,15 +4108,6 @@ pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
}
/**
 * For a lot of encodings the default is that they are a single byte long no
 * matter what the codepoint, so this function is shared between them.
 *
 * Both arguments are ignored (hence the unused attributes); the reported width
 * is always 1.
 */
static size_t
pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
return 1;
}
/**
* Returns the size of the next character in the Big5 encoding, or 0 if a
* character cannot be decoded from the given bytes.
@ -4075,51 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
return 0;
}
/**
 * Returns the size of the next character in the EUC-JP encoding, or 0 if a
 * character cannot be decoded from the given bytes.
 *
 * b points at the first byte of the character and n is the number of bytes
 * available starting at b. The first byte is always read; the second and third
 * bytes are only read when n says they are available.
 */
static size_t
pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
    // These are the single byte characters.
    if (b[0] < 0x80) {
        return 1;
    }

    // These are the double byte characters.
    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
        return 2;
    }

    // These are the triple byte characters. Each of the two trailing bytes has
    // to be checked against both ends of the 0xA1..0xFE range; the upper bound
    // of the second byte was previously tested against b[2] by mistake, which
    // let sequences such as 8F FF A1 through.
    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[1] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
        return 3;
    }

    return 0;
}
/**
 * Returns true if the next character in the EUC-JP encoding is an uppercase
 * character, and false otherwise (including when no character can be decoded).
 *
 * Single byte characters are delegated to the ASCII check. Double byte
 * characters are uppercase when they fall into one of three fixed lead/trail
 * byte ranges. Triple byte characters are never reported as uppercase.
 */
static bool
pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
    switch (pm_encoding_euc_jp_char_width(b, n)) {
        case 1:
            return pm_encoding_ascii_isupper_char(b, n);
        case 2: {
            const uint8_t lead = b[0];
            const uint8_t trail = b[1];

            if (lead == 0xA3) return (trail >= 0xC1 && trail <= 0xDA);
            if (lead == 0xA6) return (trail >= 0xA1 && trail <= 0xB8);
            if (lead == 0xA7) return (trail >= 0xA1 && trail <= 0xC1);
            return false;
        }
        default:
            return false;
    }
}
/**
* Returns the size of the next character in the EUC-KR encoding, or 0 if a
* character cannot be decoded from the given bytes.
@ -4218,65 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
return 0;
}
/**
 * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
 * character cannot be decoded from the given bytes.
 *
 * The first byte is always read; the second byte is only inspected when n says
 * it is available and the first byte is a valid lead byte.
 */
static size_t
pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
    const uint8_t lead = b[0];

    // Single byte characters: everything below 0x80 plus the 0xA1..0xDF range.
    if (lead < 0x80 || (lead >= 0xA1 && lead <= 0xDF)) {
        return 1;
    }

    // Anything else needs a second byte to be decodable.
    if (n <= 1) {
        return 0;
    }

    // Lead bytes of double byte characters live in 0x81..0x9F and 0xE0..0xFC.
    const bool lead_ok = (lead >= 0x81 && lead <= 0x9F) || (lead >= 0xE0 && lead <= 0xFC);
    if (!lead_ok) {
        return 0;
    }

    // Trail bytes live in 0x40..0xFC with 0x7F excluded.
    const uint8_t trail = b[1];
    const bool trail_ok = (trail >= 0x40) && (trail <= 0xFC) && (trail != 0x7F);

    return trail_ok ? 2 : 0;
}
/**
 * Returns the size of the next character in the Shift_JIS encoding if it is an
 * alphanumeric character, or 0 otherwise.
 *
 * Every decodable double byte character, and every single byte character at or
 * above 0x80, is treated as alphanumeric. Single bytes below 0x80 are handed to
 * pm_encoding_ascii_alnum_char.
 */
static size_t
pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    // Undecodable (0) and double byte (2) results are passed straight through.
    if (width != 1) {
        return width;
    }

    if (b[0] >= 0x80) {
        return 1;
    }

    return pm_encoding_ascii_alnum_char(b, n) ? 1 : 0;
}
/**
 * Returns the size of the next character in the Shift_JIS encoding if it is an
 * alphabetical character, or 0 otherwise.
 *
 * Every decodable double byte character, and every single byte character at or
 * above 0x80, is treated as alphabetical. Single bytes below 0x80 are handed to
 * pm_encoding_ascii_alpha_char.
 */
static size_t
pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    // Undecodable (0) and double byte (2) results are passed straight through.
    if (width != 1) {
        return width;
    }

    if (b[0] >= 0x80) {
        return 1;
    }

    return pm_encoding_ascii_alpha_char(b, n) ? 1 : 0;
}
/**
 * Returns true if the next character in the Shift_JIS encoding is an uppercase
 * character, and false otherwise (including when no character can be decoded).
 *
 * Single byte characters are delegated to the ASCII check. Double byte
 * characters are uppercase when they fall into one of three fixed lead/trail
 * byte ranges.
 */
static bool
pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
    size_t width = pm_encoding_shift_jis_char_width(b, n);

    if (width == 1) {
        return pm_encoding_ascii_isupper_char(b, n);
    } else if (width == 2) {
        return (
            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
        );
    } else {
        // Nothing could be decoded. Say so explicitly rather than returning the
        // size_t width from a bool function and relying on implicit conversion;
        // this matches the EUC-JP variant.
        return false;
    }
}
#endif
/**
* This is the table of all of the encodings that prism supports.
@ -4290,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_utf_8_isupper_char,
.multibyte = true
},
[PM_ENCODING_US_ASCII] = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
.alpha_char = pm_encoding_ascii_alpha_char,
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_ASCII_8BIT] = {
.name = "ASCII-8BIT",
.char_width = pm_encoding_single_char_width,
@ -4298,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_EUC_JP] = {
.name = "EUC-JP",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_WINDOWS_31J] = {
.name = "Windows-31J",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
#ifndef PRISM_ENCODING_EXCLUDE_FULL
[PM_ENCODING_BIG5] = {
.name = "Big5",
.char_width = pm_encoding_big5_char_width,
@ -4394,14 +4431,6 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_ascii_isupper_char_7bit,
.multibyte = true
},
[PM_ENCODING_EUC_JP] = {
.name = "EUC-JP",
.char_width = pm_encoding_euc_jp_char_width,
.alnum_char = pm_encoding_ascii_alnum_char_7bit,
.alpha_char = pm_encoding_ascii_alpha_char_7bit,
.isupper_char = pm_encoding_euc_jp_isupper_char,
.multibyte = true
},
[PM_ENCODING_EUC_JP_MS] = {
.name = "eucJP-ms",
.char_width = pm_encoding_euc_jp_char_width,
@ -4874,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_tis_620_isupper_char,
.multibyte = false
},
[PM_ENCODING_US_ASCII] = {
.name = "US-ASCII",
.char_width = pm_encoding_ascii_char_width,
.alnum_char = pm_encoding_ascii_alnum_char,
.alpha_char = pm_encoding_ascii_alpha_char,
.isupper_char = pm_encoding_ascii_isupper_char,
.multibyte = false
},
[PM_ENCODING_UTF8_MAC] = {
.name = "UTF8-MAC",
.char_width = pm_encoding_utf_8_char_width,
@ -4986,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_windows_1258_isupper_char,
.multibyte = false
},
[PM_ENCODING_WINDOWS_31J] = {
.name = "Windows-31J",
.char_width = pm_encoding_shift_jis_char_width,
.alnum_char = pm_encoding_shift_jis_alnum_char,
.alpha_char = pm_encoding_shift_jis_alpha_char,
.isupper_char = pm_encoding_shift_jis_isupper_char,
.multibyte = true
},
[PM_ENCODING_WINDOWS_874] = {
.name = "Windows-874",
.char_width = pm_encoding_single_char_width,
@ -5002,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = {
.isupper_char = pm_encoding_windows_874_isupper_char,
.multibyte = false
}
#endif
};
/**
@ -5016,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
// UTF-8 can contain extra information at the end about the platform it is
// encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
#ifndef PRISM_ENCODING_EXCLUDE_FULL
// We need to explicitly handle UTF-8-HFS, as that one needs to switch
// over to being UTF8-MAC.
if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
return &pm_encodings[PM_ENCODING_UTF8_MAC];
}
#endif
// Otherwise we'll return the default UTF-8 encoding.
return PM_ENCODING_UTF_8_ENTRY;
@ -5040,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
break;
case 'B': case 'b':
ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("Big5", PM_ENCODING_BIG5);
ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
#endif
break;
case 'C': case 'c':
ENCODING1("CP65001", PM_ENCODING_UTF_8);
ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("CESU-8", PM_ENCODING_CESU_8);
ENCODING1("CP437", PM_ENCODING_IBM437);
ENCODING1("CP720", PM_ENCODING_IBM720);
@ -5064,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
ENCODING1("CP878", PM_ENCODING_KOI8_R);
ENCODING1("CP863", PM_ENCODING_IBM863);
ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
ENCODING1("CP936", PM_ENCODING_GBK);
ENCODING1("CP949", PM_ENCODING_CP949);
ENCODING1("CP950", PM_ENCODING_CP950);
@ -5079,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
ENCODING1("CP51932", PM_ENCODING_CP51932);
ENCODING1("CP65001", PM_ENCODING_UTF_8);
#endif
break;
case 'E': case 'e':
ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
#endif
break;
case 'G': case 'g':
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("GBK", PM_ENCODING_GBK);
ENCODING1("GB12345", PM_ENCODING_GB12345);
ENCODING1("GB18030", PM_ENCODING_GB18030);
ENCODING1("GB1988", PM_ENCODING_GB1988);
ENCODING1("GB2312", PM_ENCODING_GB2312);
#endif
break;
case 'I': case 'i':
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("IBM437", PM_ENCODING_IBM437);
ENCODING1("IBM720", PM_ENCODING_IBM720);
ENCODING1("IBM737", PM_ENCODING_IBM737);
@ -5129,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
#endif
break;
case 'K': case 'k':
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
#endif
break;
case 'M': case 'm':
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@ -5147,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("macThai", PM_ENCODING_MAC_THAI);
ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
#endif
break;
case 'P': case 'p':
ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
break;
case 'S': case 's':
ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
#endif
break;
case 'T': case 't':
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("TIS-620", PM_ENCODING_TIS_620);
#endif
break;
case 'U': case 'u':
ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
#endif
break;
case 'W': case 'w':
ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
#ifndef PRISM_ENCODING_EXCLUDE_FULL
ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@ -5182,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
#endif
break;
case '6':
ENCODING1("646", PM_ENCODING_US_ASCII);

Просмотреть файл

@ -135,7 +135,14 @@ extern const uint8_t pm_encoding_unicode_table[256];
*/
typedef enum {
PM_ENCODING_UTF_8 = 0,
PM_ENCODING_US_ASCII,
PM_ENCODING_ASCII_8BIT,
PM_ENCODING_EUC_JP,
PM_ENCODING_WINDOWS_31J,
// We optionally support excluding the full set of encodings to only support the
// minimum necessary to process Ruby code without encoding comments.
#ifndef PRISM_ENCODING_EXCLUDE_FULL
PM_ENCODING_BIG5,
PM_ENCODING_BIG5_HKSCS,
PM_ENCODING_BIG5_UAO,
@ -148,7 +155,6 @@ typedef enum {
PM_ENCODING_CP950,
PM_ENCODING_CP951,
PM_ENCODING_EMACS_MULE,
PM_ENCODING_EUC_JP,
PM_ENCODING_EUC_JP_MS,
PM_ENCODING_EUC_JIS_2004,
PM_ENCODING_EUC_KR,
@ -208,7 +214,6 @@ typedef enum {
PM_ENCODING_STATELESS_ISO_2022_JP,
PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
PM_ENCODING_TIS_620,
PM_ENCODING_US_ASCII,
PM_ENCODING_UTF8_MAC,
PM_ENCODING_UTF8_DOCOMO,
PM_ENCODING_UTF8_KDDI,
@ -222,8 +227,9 @@ typedef enum {
PM_ENCODING_WINDOWS_1256,
PM_ENCODING_WINDOWS_1257,
PM_ENCODING_WINDOWS_1258,
PM_ENCODING_WINDOWS_31J,
PM_ENCODING_WINDOWS_874,
#endif
PM_ENCODING_MAXIMUM
} pm_encoding_type_t;

Просмотреть файл

@ -311,7 +311,7 @@ dump(int argc, VALUE *argv, VALUE self) {
pm_options_t options = { 0 };
string_options(argc, argv, &input, &options);
#ifdef PRISM_DEBUG_MODE_BUILD
#ifdef PRISM_BUILD_DEBUG
size_t length = pm_string_length(&input);
char* dup = xmalloc(length);
memcpy(dup, pm_string_source(&input), length);
@ -320,7 +320,7 @@ dump(int argc, VALUE *argv, VALUE self) {
VALUE value = dump_input(&input, &options);
#ifdef PRISM_DEBUG_MODE_BUILD
#ifdef PRISM_BUILD_DEBUG
xfree(dup);
#endif
@ -737,7 +737,7 @@ parse(int argc, VALUE *argv, VALUE self) {
pm_options_t options = { 0 };
string_options(argc, argv, &input, &options);
#ifdef PRISM_DEBUG_MODE_BUILD
#ifdef PRISM_BUILD_DEBUG
size_t length = pm_string_length(&input);
char* dup = xmalloc(length);
memcpy(dup, pm_string_source(&input), length);
@ -746,7 +746,7 @@ parse(int argc, VALUE *argv, VALUE self) {
VALUE value = parse_input(&input, &options);
#ifdef PRISM_DEBUG_MODE_BUILD
#ifdef PRISM_BUILD_DEBUG
xfree(dup);
#endif

Просмотреть файл

@ -1,16 +1,43 @@
#include "prism/pack.h"
// We optionally support parsing String#pack templates. For systems that don't
// want or need this functionality, it can be turned off with the
// PRISM_EXCLUDE_PACK define.
#ifdef PRISM_EXCLUDE_PACK
// Empty placeholder that takes the place of the real parser when
// PRISM_EXCLUDE_PACK is defined.
// NOTE(review): presumably kept so this translation unit is never empty and the
// symbol still exists — confirm.
void pm_pack_parse(void) {}
#else
#include <stdbool.h>
#include <errno.h>
static uintmax_t
strtoumaxc(const char **format);

/**
 * Parse a run of ASCII decimal digits starting at *format and advance *format
 * past every digit consumed. Returns 0 without moving *format when the first
 * character is not a digit.
 *
 * If the value does not fit in a uintmax_t, errno is set to ERANGE, the result
 * saturates at UINTMAX_MAX (as strtoumax does), and the remaining digits are
 * still consumed. errno is never cleared here, so callers that want to detect
 * overflow have to reset it before calling.
 */
static uintmax_t
strtoumaxc(const char **format) {
    uintmax_t value = 0;

    while (**format >= '0' && **format <= '9') {
        uintmax_t digit = (uintmax_t) (**format - '0');

        // value * 10 + digit fits exactly when value <= (UINTMAX_MAX - digit) / 10.
        // Checking only value > UINTMAX_MAX / 10 misses the case where the
        // multiplication fits but adding the digit wraps around.
        if (value > (UINTMAX_MAX - digit) / 10) {
            errno = ERANGE;
            value = UINTMAX_MAX;
        } else {
            value = value * 10 + digit;
        }

        (*format)++;
    }

    return value;
}
PRISM_EXPORTED_FUNCTION pm_pack_result
pm_pack_parse(pm_pack_variant variant, const char **format, const char *format_end,
pm_pack_type *type, pm_pack_signed *signed_type, pm_pack_endian *endian, pm_pack_size *size,
pm_pack_length_type *length_type, uint64_t *length, pm_pack_encoding *encoding) {
pm_pack_parse(
pm_pack_variant variant,
const char **format,
const char *format_end,
pm_pack_type *type,
pm_pack_signed *signed_type,
pm_pack_endian *endian,
pm_pack_size *size,
pm_pack_length_type *length_type,
uint64_t *length,
pm_pack_encoding *encoding
) {
if (*encoding == PM_PACK_ENCODING_START) {
*encoding = PM_PACK_ENCODING_US_ASCII;
}
@ -479,15 +506,4 @@ pm_size_to_native(pm_pack_size size) {
}
}
/**
 * Parse a run of ASCII decimal digits starting at *format and advance *format
 * past every digit consumed. Returns 0 without moving *format when the first
 * character is not a digit.
 *
 * If the value does not fit in a uintmax_t, errno is set to ERANGE, the result
 * saturates at UINTMAX_MAX (as strtoumax does), and the remaining digits are
 * still consumed. errno is never cleared here, so callers that want to detect
 * overflow have to reset it before calling.
 */
static uintmax_t
strtoumaxc(const char **format) {
    uintmax_t value = 0;

    while (**format >= '0' && **format <= '9') {
        uintmax_t digit = (uintmax_t) (**format - '0');

        // value * 10 + digit fits exactly when value <= (UINTMAX_MAX - digit) / 10.
        // Checking only value > UINTMAX_MAX / 10 misses the case where the
        // multiplication fits but adding the digit wraps around.
        if (value > (UINTMAX_MAX - digit) / 10) {
            errno = ERANGE;
            value = UINTMAX_MAX;
        } else {
            value = value * 10 + digit;
        }

        (*format)++;
    }

    return value;
}
#endif

Просмотреть файл

@ -6,6 +6,15 @@
#ifndef PRISM_PACK_H
#define PRISM_PACK_H
// We optionally support parsing String#pack templates. For systems that don't
// want or need this functionality, it can be turned off with the
// PRISM_EXCLUDE_PACK define.
#ifdef PRISM_EXCLUDE_PACK
void pm_pack_parse(void);
#else
#include "prism/defines.h"
#include <stdint.h>
@ -150,3 +159,5 @@ pm_pack_parse(
PRISM_EXPORTED_FUNCTION size_t pm_size_to_native(pm_pack_size size);
#endif
#endif

Просмотреть файл

@ -6,6 +6,12 @@
#ifndef PRISM_PRETTYPRINT_H
#define PRISM_PRETTYPRINT_H
#ifdef PRISM_EXCLUDE_PRETTYPRINT
void pm_prettyprint(void);
#else
#include "prism/defines.h"
#include <stdio.h>
@ -24,3 +30,5 @@
PRISM_EXPORTED_FUNCTION void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node);
#endif
#endif

Просмотреть файл

@ -19316,6 +19316,41 @@ pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse
return node;
}
/**
 * Parse the source and return true if it parses without errors or warnings.
 *
 * source and size describe the bytes to parse. data is handed to
 * pm_options_read to fill in the parser options.
 * NOTE(review): presumably data is the serialized options blob — confirm
 * against pm_options_read.
 */
PRISM_EXPORTED_FUNCTION bool
pm_parse_success_p(const uint8_t *source, size_t size, const char *data) {
pm_options_t options = { 0 };
pm_options_read(&options, data);
pm_parser_t parser;
pm_parser_init(&parser, source, size, &options);
// The tree itself is not needed here, so it is destroyed right after parsing.
pm_node_t *node = pm_parse(&parser);
pm_node_destroy(&parser, node);
// Both lists are fields of the parser, so the verdict is captured before the
// parser is freed.
bool result = parser.error_list.size == 0 && parser.warning_list.size == 0;
pm_parser_free(&parser);
pm_options_free(&options);
return result;
}
#undef PM_CASE_KEYWORD
#undef PM_CASE_OPERATOR
#undef PM_CASE_WRITABLE
#undef PM_STRING_EMPTY
#undef PM_LOCATION_NODE_BASE_VALUE
#undef PM_LOCATION_NODE_VALUE
#undef PM_LOCATION_NULL_VALUE
#undef PM_LOCATION_TOKEN_VALUE
// We optionally support serializing to a binary string. For systems that don't
// want or need this functionality, it can be turned off with the
// PRISM_EXCLUDE_SERIALIZATION define.
#ifndef PRISM_EXCLUDE_SERIALIZATION
static inline void
pm_serialize_header(pm_buffer_t *buffer) {
pm_buffer_append_string(buffer, "PRISM", 5);
@ -19402,14 +19437,7 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s
pm_options_free(&options);
}
#undef PM_CASE_KEYWORD
#undef PM_CASE_OPERATOR
#undef PM_CASE_WRITABLE
#undef PM_STRING_EMPTY
#undef PM_LOCATION_NODE_BASE_VALUE
#undef PM_LOCATION_NODE_VALUE
#undef PM_LOCATION_NULL_VALUE
#undef PM_LOCATION_TOKEN_VALUE
#endif
/** An error that is going to be formatted into the output. */
typedef struct {

Просмотреть файл

@ -98,6 +98,11 @@ typedef char * (pm_parse_stream_fgets_t)(char *string, int size, void *stream);
*/
PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options);
// We optionally support serializing to a binary string. For systems that don't
// want or need this functionality, it can be turned off with the
// PRISM_EXCLUDE_SERIALIZATION define.
#ifndef PRISM_EXCLUDE_SERIALIZATION
/**
* Parse and serialize the AST represented by the source that is read out of the
* given stream into the given buffer.
@ -185,6 +190,8 @@ PRISM_EXPORTED_FUNCTION void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t
*/
PRISM_EXPORTED_FUNCTION void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data);
#endif
/**
* Parse the source and return true if it parses without errors or warnings.
*
@ -220,6 +227,10 @@ const char * pm_token_type_human(pm_token_type_t token_type);
*/
PRISM_EXPORTED_FUNCTION void pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool colorize);
// We optionally support dumping to JSON. For systems that don't want or need
// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define.
#ifndef PRISM_EXCLUDE_JSON
/**
* Dump JSON to the given buffer.
*
@ -229,6 +240,8 @@ PRISM_EXPORTED_FUNCTION void pm_parser_errors_format(const pm_parser_t *parser,
*/
PRISM_EXPORTED_FUNCTION void pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node);
#endif
/**
* @mainpage
*

Просмотреть файл

@ -247,6 +247,10 @@ pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *nod
}
}
// We optionally support dumping to JSON. For systems that don't want or need
// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define.
#ifndef PRISM_EXCLUDE_JSON
static void
pm_dump_json_constant(pm_buffer_t *buffer, const pm_parser_t *parser, pm_constant_id_t constant_id) {
const pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id);
@ -360,3 +364,5 @@ pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *no
break;
}
}
#endif

Просмотреть файл

@ -1,6 +1,15 @@
<%# encoding: ASCII -%>
#include "prism/prettyprint.h"
// We optionally support pretty printing nodes. For systems that don't want or
// need this functionality, it can be turned off with the
// PRISM_EXCLUDE_PRETTYPRINT define.
#ifdef PRISM_EXCLUDE_PRETTYPRINT
// Empty placeholder that takes the place of the real pretty printer when
// PRISM_EXCLUDE_PRETTYPRINT is defined.
// NOTE(review): presumably kept so this translation unit is never empty and the
// symbol still exists — confirm.
void pm_prettyprint(void) {}
#else
static inline void
prettyprint_location(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_location_t *location) {
pm_line_column_t start = pm_newline_list_line_column(&parser->newline_list, location->start, parser->start_line);
@ -154,3 +163,5 @@ pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_n
prettyprint_node(output_buffer, parser, node, &prefix_buffer);
pm_buffer_free(&prefix_buffer);
}
#endif

Просмотреть файл

@ -1,5 +1,10 @@
#include "prism.h"
// We optionally support serializing to a binary string. For systems that don't
// want or need this functionality, it can be turned off with the
// PRISM_EXCLUDE_SERIALIZATION define.
#ifndef PRISM_EXCLUDE_SERIALIZATION
#include <stdio.h>
static inline uint32_t
@ -394,23 +399,4 @@ pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size,
pm_options_free(&options);
}
/**
 * Parse the source and return true if it parses without errors or warnings.
 *
 * source and size describe the bytes to parse. data is handed to
 * pm_options_read to fill in the parser options.
 * NOTE(review): presumably data is the serialized options blob — confirm
 * against pm_options_read.
 */
PRISM_EXPORTED_FUNCTION bool
pm_parse_success_p(const uint8_t *source, size_t size, const char *data) {
pm_options_t options = { 0 };
pm_options_read(&options, data);
pm_parser_t parser;
pm_parser_init(&parser, source, size, &options);
// The tree itself is not needed here, so it is destroyed right after parsing.
pm_node_t *node = pm_parse(&parser);
pm_node_destroy(&parser, node);
// Both lists are fields of the parser, so the verdict is captured before the
// parser is freed.
bool result = parser.error_list.size == 0 && parser.warning_list.size == 0;
pm_parser_free(&parser);
pm_options_free(&options);
return result;
}
#endif

Просмотреть файл

@ -9,10 +9,13 @@ module Prism
codepoints_1byte = 0...0x100
encodings = {
Encoding::ASCII_8BIT => codepoints_1byte,
Encoding::US_ASCII => codepoints_1byte,
Encoding::Windows_1253 => codepoints_1byte
Encoding::US_ASCII => codepoints_1byte
}
if !ENV["PRISM_BUILD_MINIMAL"]
encodings[Encoding::Windows_1253] = codepoints_1byte
end
# By default we don't test every codepoint in these encodings because it
# takes a very long time.
if ENV["PRISM_TEST_ALL_ENCODINGS"]
@ -205,21 +208,6 @@ module Prism
assert_equal Encoding.find("utf-8"), actual
end
# This test may be a little confusing. Basically when we use our strpbrk, it
# takes into account the encoding of the file.
def test_strpbrk_multibyte
result = Prism.parse(<<~RUBY)
# encoding: Shift_JIS
%w[\x81\x5c]
RUBY
assert(result.errors.empty?)
assert_equal(
(+"\x81\x5c").force_encoding(Encoding::Shift_JIS),
result.value.statements.body.first.elements.first.unescaped
)
end
def test_utf_8_variations
%w[
utf-8-unix
@ -238,22 +226,39 @@ module Prism
assert_equal Encoding.find("ascii-8bit"), encoding
end
def test_slice_encoding
slice = Prism.parse("# encoding: Shift_JIS\n").value.slice
assert_equal (+"").force_encoding(Encoding::SHIFT_JIS), slice
assert_equal Encoding::SHIFT_JIS, slice.encoding
end
if !ENV["PRISM_BUILD_MINIMAL"]
# This test may be a little confusing. Basically when we use our strpbrk,
# it takes into account the encoding of the file.
def test_strpbrk_multibyte
result = Prism.parse(<<~RUBY)
# encoding: Shift_JIS
%w[\x81\x5c]
RUBY
def test_multibyte_escapes
[
["'", "'"],
["\"", "\""],
["`", "`"],
["/", "/"],
["<<'HERE'\n", "\nHERE"],
["<<-HERE\n", "\nHERE"]
].each do |opening, closing|
assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n")
assert(result.errors.empty?)
assert_equal(
(+"\x81\x5c").force_encoding(Encoding::Shift_JIS),
result.value.statements.body.first.elements.first.unescaped
)
end
def test_slice_encoding
slice = Prism.parse("# encoding: Shift_JIS\n").value.slice
assert_equal (+"").force_encoding(Encoding::SHIFT_JIS), slice
assert_equal Encoding::SHIFT_JIS, slice.encoding
end
def test_multibyte_escapes
[
["'", "'"],
["\"", "\""],
["`", "`"],
["/", "/"],
["<<'HERE'\n", "\nHERE"],
["<<-HERE\n", "\nHERE"]
].each do |opening, closing|
assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n")
end
end
end

Просмотреть файл

@ -1,9 +1,12 @@
# frozen_string_literal: true
return if ENV["PRISM_BUILD_MINIMAL"]
require_relative "test_helper"
module Prism
# These tests are simply to exercise snippets found by the fuzzer that caused invalid memory access.
# These tests are simply to exercise snippets found by the fuzzer that caused
# invalid memory access.
class FuzzerTest < TestCase
def self.snippet(name, source)
define_method(:"test_fuzzer_#{name}") { Prism.dump(source) }

Просмотреть файл

@ -17,11 +17,11 @@ module Prism
"# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-",
"# -*- foo: bar; encoding: ascii -*-",
"# coding \t \r \v : \t \v \r ascii-8bit",
"# vim: filetype=ruby, fileencoding=big5, tabsize=3, shiftwidth=3"
"# vim: filetype=ruby, fileencoding=windows-31j, tabsize=3, shiftwidth=3"
]
examples.each do |example|
define_method(:"test_magic_comment_#{example}") do
examples.each.with_index(1) do |example, index|
define_method(:"test_magic_comment_#{index}") do
assert_magic_comment(example)
end
end

Просмотреть файл

@ -75,19 +75,21 @@ module Prism
assert_equal 5, tokens.length
end
def test_dump_file
assert_nothing_raised do
Prism.dump_file(__FILE__)
end
if !ENV["PRISM_BUILD_MINIMAL"]
def test_dump_file
assert_nothing_raised do
Prism.dump_file(__FILE__)
end
error = assert_raise Errno::ENOENT do
Prism.dump_file("idontexist.rb")
end
error = assert_raise Errno::ENOENT do
Prism.dump_file("idontexist.rb")
end
assert_equal "No such file or directory - idontexist.rb", error.message
assert_equal "No such file or directory - idontexist.rb", error.message
assert_raise TypeError do
Prism.dump_file(nil)
assert_raise TypeError do
Prism.dump_file(nil)
end
end
end
@ -259,9 +261,11 @@ module Prism
warn("Created snapshot at #{snapshot}.")
end
# Next, assert that the value can be serialized and deserialized without
# changing the shape of the tree.
assert_equal_nodes(result.value, Prism.load(source, Prism.dump(source, filepath: relative)).value)
if !ENV["PRISM_BUILD_MINIMAL"]
# Next, assert that the value can be serialized and deserialized
# without changing the shape of the tree.
assert_equal_nodes(result.value, Prism.load(source, Prism.dump(source, filepath: relative)).value)
end
# Next, check that the location ranges of each node in the tree are a
# superset of their respective child nodes.
@ -318,7 +322,9 @@ module Prism
result = Prism.parse(snippet, filepath: relative)
assert_empty result.errors
assert_equal_nodes(result.value, Prism.load(snippet, Prism.dump(snippet, filepath: relative)).value)
if !ENV["PRISM_BUILD_MINIMAL"]
assert_equal_nodes(result.value, Prism.load(snippet, Prism.dump(snippet, filepath: relative)).value)
end
end
end
end

Просмотреть файл

@ -4,20 +4,22 @@ require_relative "test_helper"
module Prism
class RubyAPITest < TestCase
def test_ruby_api
filepath = __FILE__
source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8)
if !ENV["PRISM_BUILD_MINIMAL"]
def test_ruby_api
filepath = __FILE__
source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8)
assert_equal Prism.lex(source, filepath: filepath).value, Prism.lex_file(filepath).value
assert_equal Prism.dump(source, filepath: filepath), Prism.dump_file(filepath)
assert_equal Prism.lex(source, filepath: filepath).value, Prism.lex_file(filepath).value
assert_equal Prism.dump(source, filepath: filepath), Prism.dump_file(filepath)
serialized = Prism.dump(source, filepath: filepath)
ast1 = Prism.load(source, serialized).value
ast2 = Prism.parse(source, filepath: filepath).value
ast3 = Prism.parse_file(filepath).value
serialized = Prism.dump(source, filepath: filepath)
ast1 = Prism.load(source, serialized).value
ast2 = Prism.parse(source, filepath: filepath).value
ast3 = Prism.parse_file(filepath).value
assert_equal_nodes ast1, ast2
assert_equal_nodes ast2, ast3
assert_equal_nodes ast1, ast2
assert_equal_nodes ast2, ast3
end
end
def test_parse_success?

Просмотреть файл

@ -54,7 +54,7 @@ module Prism
def test_source_encoding
assert_equal "#<Encoding:UTF-8>", static_inspect("__ENCODING__")
assert_equal "#<Encoding:Shift_JIS>", static_inspect("__ENCODING__", encoding: "Shift_JIS")
assert_equal "#<Encoding:Windows-31J>", static_inspect("__ENCODING__", encoding: "Windows-31J")
end
def test_source_file