From cd4316a51f73817a04071bbf4e6e56e235e0430d Mon Sep 17 00:00:00 2001 From: Ryan Garver Date: Fri, 17 Nov 2023 20:35:28 -0800 Subject: [PATCH] [ruby/prism] Big5 HKSCS encoding https://github.com/ruby/prism/commit/3ca9823eb4 --- lib/prism/prism.gemspec | 1 + prism/enc/pm_big5_hkscs.c | 54 +++++++++++++++++++++++++++++++++++++ prism/enc/pm_encoding.h | 1 + prism/prism.c | 1 + test/prism/encoding_test.rb | 1 + 5 files changed, 58 insertions(+) create mode 100644 prism/enc/pm_big5_hkscs.c diff --git a/lib/prism/prism.gemspec b/lib/prism/prism.gemspec index 4b0f87d442..5f3c666e45 100644 --- a/lib/prism/prism.gemspec +++ b/lib/prism/prism.gemspec @@ -85,6 +85,7 @@ Gem::Specification.new do |spec| "lib/prism/visitor.rb", "src/diagnostic.c", "src/enc/pm_big5.c", + "src/enc/pm_big5_hkscs.c", "src/enc/pm_cp51932.c", "src/enc/pm_euc_jp.c", "src/enc/pm_gbk.c", diff --git a/prism/enc/pm_big5_hkscs.c b/prism/enc/pm_big5_hkscs.c new file mode 100644 index 0000000000..e6a13737ed --- /dev/null +++ b/prism/enc/pm_big5_hkscs.c @@ -0,0 +1,54 @@ +#include "prism/enc/pm_encoding.h" + +static size_t +pm_encoding_big5_hkscs_char_width(const uint8_t *b, ptrdiff_t n) { + // These are the single byte characters. + if (*b < 0x80) { + return 1; + } + + // These are the double byte characters. 
+ if ((n > 1) && (b[0] >= 0x87 && b[0] <= 0xFE) && + ((b[1] >= 0x40 && b[1] <= 0x7E) || (b[1] >= 0xA1 && b[1] <= 0xFE))) { + return 2; + } + + return 0; +} + +static size_t +pm_encoding_big5_hkscs_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (pm_encoding_big5_hkscs_char_width(b, n) == 1) { + return pm_encoding_ascii_alpha_char(b, n); + } else { + return 0; + } +} + +static size_t +pm_encoding_big5_hkscs_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (pm_encoding_big5_hkscs_char_width(b, n) == 1) { + return pm_encoding_ascii_alnum_char(b, n); + } else { + return 0; + } +} + +static bool +pm_encoding_big5_hkscs_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (pm_encoding_big5_hkscs_char_width(b, n) == 1) { + return pm_encoding_ascii_isupper_char(b, n); + } else { + return false; + } +} + +/** Big5-HKSCS encoding */ +pm_encoding_t pm_encoding_big5_hkscs = { + .name = "big5-hkscs", + .char_width = pm_encoding_big5_hkscs_char_width, + .alnum_char = pm_encoding_big5_hkscs_alnum_char, + .alpha_char = pm_encoding_big5_hkscs_alpha_char, + .isupper_char = pm_encoding_big5_hkscs_isupper_char, + .multibyte = true +}; diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 957fa794f6..93599fc87f 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -158,6 +158,7 @@ extern const uint8_t pm_encoding_unicode_table[256]; extern pm_encoding_t pm_encoding_ascii; extern pm_encoding_t pm_encoding_ascii_8bit; extern pm_encoding_t pm_encoding_big5; +extern pm_encoding_t pm_encoding_big5_hkscs; extern pm_encoding_t pm_encoding_cp51932; extern pm_encoding_t pm_encoding_cp850; extern pm_encoding_t pm_encoding_cp852; diff --git a/prism/prism.c b/prism/prism.c index a219ffa52a..45be6cfb23 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6062,6 +6062,7 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star case 'B': case 'b': ENCODING1("BINARY", pm_encoding_ascii_8bit); ENCODING1("Big5", pm_encoding_big5); + ENCODING1("Big5-HKSCS", 
pm_encoding_big5_hkscs); break; case 'C': case 'c': ENCODING1("CP437", pm_encoding_ibm437); diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index c2b90aca68..9d8f6bd809 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -57,6 +57,7 @@ module Prism Encoding::Windows_1258 => 0x00...0x100, Encoding::Windows_874 => 0x00...0x100, Encoding::Big5 => 0x00...0x10000, + Encoding::Big5_HKSCS => 0x00...0x10000, Encoding::CP51932 => 0x00...0x10000, Encoding::GBK => 0x00...0x10000, Encoding::Shift_JIS => 0x00...0x10000,