[ruby/prism] Provide options for reducing size

https://github.com/ruby/prism/commit/592128de4d
2024-03-20 10:08:13 -04:00 · 2024-03-20 10:08:13 -04:00 · af7bf9e0d8
--- a/prism/encoding.c
+++ b/prism/encoding.c
@ -2358,6 +2358,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
    }
 }

+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
 static pm_unicode_codepoint_t
 pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
    if (b[0] < 0x80) {
@ -2452,6 +2454,8 @@ pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) {
    }
 }

+#endif
+
 #undef UNICODE_ALPHA_CODEPOINTS_LENGTH
 #undef UNICODE_ALNUM_CODEPOINTS_LENGTH
 #undef UNICODE_ISUPPER_CODEPOINTS_LENGTH
@ -2480,6 +2484,8 @@ static const uint8_t pm_encoding_ascii_table[256] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
 };

+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
 /**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding CP850 character.
@ -3918,6 +3924,7 @@ PRISM_ENCODING_TABLE(windows_1258)
 PRISM_ENCODING_TABLE(windows_874)

 #undef PRISM_ENCODING_TABLE
+#endif

 /**
 * Returns the size of the next character in the ASCII encoding. This basically
@ -3975,6 +3982,122 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_
    return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT);
 }

+/**
+ * For a lot of encodings the default is that they are a single byte long no
+ * matter what the codepoint, so this function is shared between them.
+ */
+static size_t
+pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
+    return 1;
+}
+
+/**
+ * Returns the size of the next character in the EUC-JP encoding, or 0 if a
+ * character cannot be decoded from the given bytes.
+ */
+static size_t
+pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
+    // These are the single byte characters.
+    if (*b < 0x80) {
+        return 1;
+    }
+
+    // These are the double byte characters.
+    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
+        return 2;
+    }
+
+    // These are the triple byte characters.
+    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
+        return 3;
+    }
+
+    return 0;
+}
+
+/**
+ * Returns the size of the next character in the EUC-JP encoding if it is an
+ * uppercase character.
+ */
+static bool
+pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_euc_jp_char_width(b, n);
+
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
+            (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
+            (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
+        );
+    } else {
+        return false;
+    }
+}
+
+/**
+ * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
+ * character cannot be decoded from the given bytes.
+ */
+static size_t
+pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
+    // These are the single byte characters.
+    if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
+        return 1;
+    }
+
+    // These are the double byte characters.
+    if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
+        return 2;
+    }
+
+    return 0;
+}
+
+/**
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
+ * alphanumeric character.
+ */
+static size_t
+pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
+}
+
+/**
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
+ * alphabetical character.
+ */
+static size_t
+pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
+}
+
+/**
+ * Returns the size of the next character in the Shift_JIS encoding if it is an
+ * uppercase character.
+ */
+static bool
+pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
+    size_t width = pm_encoding_shift_jis_char_width(b, n);
+
+    if (width == 1) {
+        return pm_encoding_ascii_isupper_char(b, n);
+    } else if (width == 2) {
+        return (
+            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
+            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
+            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
+        );
+    } else {
+        return width;
+    }
+}
+
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+
 /**
 * Certain encodings are equivalent to ASCII below 0x80, so it works for our
 * purposes to have a function here that first checks the bounds and then falls
@ -3985,15 +4108,6 @@ pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) {
    return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n);
 }

-/**
- * For a lot of encodings the default is that they are a single byte long no
- * matter what the codepoint, so this function is shared between them.
- */
-static size_t
-pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) {
-    return 1;
-}
-
 /**
 * Returns the size of the next character in the Big5 encoding, or 0 if a
 * character cannot be decoded from the given bytes.
@ -4075,51 +4189,6 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) {
    return 0;
 }

-/**
- * Returns the size of the next character in the EUC-JP encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
-    // These are the single byte characters.
-    if (*b < 0x80) {
-        return 1;
-    }
-
-    // These are the double byte characters.
-    if ((n > 1) && ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && (b[1] >= 0xA1 && b[1] <= 0xFE)) {
-        return 2;
-    }
-
-    // These are the triple byte characters.
-    if ((n > 2) && (b[0] == 0x8F) && (b[1] >= 0xA1 && b[2] <= 0xFE) && (b[2] >= 0xA1 && b[2] <= 0xFE)) {
-        return 3;
-    }
-
-    return 0;
-}
-
-/**
- * Returns the size of the next character in the EUC-JP encoding if it is an
- * uppercase character.
- */
-static bool
-pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_euc_jp_char_width(b, n);
-
-    if (width == 1) {
-        return pm_encoding_ascii_isupper_char(b, n);
-    } else if (width == 2) {
-        return (
-            (b[0] == 0xA3 && b[1] >= 0xC1 && b[1] <= 0xDA) ||
-            (b[0] == 0xA6 && b[1] >= 0xA1 && b[1] <= 0xB8) ||
-            (b[0] == 0xA7 && b[1] >= 0xA1 && b[1] <= 0xC1)
-        );
-    } else {
-        return false;
-    }
-}
-
 /**
 * Returns the size of the next character in the EUC-KR encoding, or 0 if a
 * character cannot be decoded from the given bytes.
@ -4218,65 +4287,7 @@ pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) {
    return 0;
 }

-/**
- * Returns the size of the next character in the Shift_JIS encoding, or 0 if a
- * character cannot be decoded from the given bytes.
- */
-static size_t
-pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
-    // These are the single byte characters.
-    if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) {
-        return 1;
-    }
-
-    // These are the double byte characters.
-    if ((n > 1) && ((b[0] >= 0x81 && b[0] <= 0x9F) || (b[0] >= 0xE0 && b[0] <= 0xFC)) && (b[1] >= 0x40 && b[1] <= 0xFC && b[1] != 0x7F)) {
-        return 2;
-    }
-
-    return 0;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding if it is an
- * alphanumeric character.
- */
-static size_t
-pm_encoding_shift_jis_alnum_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_shift_jis_char_width(b, n);
-    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alnum_char(b, n)) : width;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding if it is an
- * alphabetical character.
- */
-static size_t
-pm_encoding_shift_jis_alpha_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_shift_jis_char_width(b, n);
-    return width == 1 ? ((b[0] >= 0x80) || pm_encoding_ascii_alpha_char(b, n)) : width;
-}
-
-/**
- * Returns the size of the next character in the Shift_JIS encoding if it is an
- * uppercase character.
- */
-static bool
-pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) {
-    size_t width = pm_encoding_shift_jis_char_width(b, n);
-
-    if (width == 1) {
-        return pm_encoding_ascii_isupper_char(b, n);
-    } else if (width == 2) {
-        return (
-            ((b[0] == 0x82) && (b[1] >= 0x60 && b[1] <= 0x79)) ||
-            ((b[0] == 0x83) && (b[1] >= 0x9F && b[1] <= 0xB6)) ||
-            ((b[0] == 0x84) && (b[1] >= 0x40 && b[1] <= 0x60))
-        );
-    } else {
-        return width;
-    }
-}
+#endif

 /**
 * This is the table of all of the encodings that prism supports.
@ -4290,6 +4301,14 @@ const pm_encoding_t pm_encodings[] = {
        .isupper_char = pm_encoding_utf_8_isupper_char,
        .multibyte = true
    },
+    [PM_ENCODING_US_ASCII] = {
+        .name = "US-ASCII",
+        .char_width = pm_encoding_ascii_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char,
+        .alpha_char = pm_encoding_ascii_alpha_char,
+        .isupper_char = pm_encoding_ascii_isupper_char,
+        .multibyte = false
+    },
    [PM_ENCODING_ASCII_8BIT] = {
        .name = "ASCII-8BIT",
        .char_width = pm_encoding_single_char_width,
@ -4298,6 +4317,24 @@ const pm_encoding_t pm_encodings[] = {
        .isupper_char = pm_encoding_ascii_isupper_char,
        .multibyte = false
    },
+    [PM_ENCODING_EUC_JP] = {
+        .name = "EUC-JP",
+        .char_width = pm_encoding_euc_jp_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
+        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
+        .isupper_char = pm_encoding_euc_jp_isupper_char,
+        .multibyte = true
+    },
+    [PM_ENCODING_WINDOWS_31J] = {
+        .name = "Windows-31J",
+        .char_width = pm_encoding_shift_jis_char_width,
+        .alnum_char = pm_encoding_shift_jis_alnum_char,
+        .alpha_char = pm_encoding_shift_jis_alpha_char,
+        .isupper_char = pm_encoding_shift_jis_isupper_char,
+        .multibyte = true
+    },
+
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
    [PM_ENCODING_BIG5] = {
        .name = "Big5",
        .char_width = pm_encoding_big5_char_width,
@ -4394,14 +4431,6 @@ const pm_encoding_t pm_encodings[] = {
        .isupper_char = pm_encoding_ascii_isupper_char_7bit,
        .multibyte = true
    },
-    [PM_ENCODING_EUC_JP] = {
-        .name = "EUC-JP",
-        .char_width = pm_encoding_euc_jp_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char_7bit,
-        .alpha_char = pm_encoding_ascii_alpha_char_7bit,
-        .isupper_char = pm_encoding_euc_jp_isupper_char,
-        .multibyte = true
-    },
    [PM_ENCODING_EUC_JP_MS] = {
        .name = "eucJP-ms",
        .char_width = pm_encoding_euc_jp_char_width,
@ -4874,14 +4903,6 @@ const pm_encoding_t pm_encodings[] = {
        .isupper_char = pm_encoding_tis_620_isupper_char,
        .multibyte = false
    },
-    [PM_ENCODING_US_ASCII] = {
-        .name = "US-ASCII",
-        .char_width = pm_encoding_ascii_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char,
-        .alpha_char = pm_encoding_ascii_alpha_char,
-        .isupper_char = pm_encoding_ascii_isupper_char,
-        .multibyte = false
-    },
    [PM_ENCODING_UTF8_MAC] = {
        .name = "UTF8-MAC",
        .char_width = pm_encoding_utf_8_char_width,
@ -4986,14 +5007,6 @@ const pm_encoding_t pm_encodings[] = {
        .isupper_char = pm_encoding_windows_1258_isupper_char,
        .multibyte = false
    },
-    [PM_ENCODING_WINDOWS_31J] = {
-        .name = "Windows-31J",
-        .char_width = pm_encoding_shift_jis_char_width,
-        .alnum_char = pm_encoding_shift_jis_alnum_char,
-        .alpha_char = pm_encoding_shift_jis_alpha_char,
-        .isupper_char = pm_encoding_shift_jis_isupper_char,
-        .multibyte = true
-    },
    [PM_ENCODING_WINDOWS_874] = {
        .name = "Windows-874",
        .char_width = pm_encoding_single_char_width,
@ -5002,6 +5015,7 @@ const pm_encoding_t pm_encodings[] = {
        .isupper_char = pm_encoding_windows_874_isupper_char,
        .multibyte = false
    }
+#endif
 };

 /**
@ -5016,11 +5030,13 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
    // UTF-8 can contain extra information at the end about the platform it is
    // encoded on, such as UTF-8-MAC or UTF-8-UNIX. We'll ignore those suffixes.
    if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "UTF-8", 5) == 0)) {
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
        // We need to explicitly handle UTF-8-HFS, as that one needs to switch
        // over to being UTF8-MAC.
        if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-HFS", 4) == 0)) {
            return &pm_encodings[PM_ENCODING_UTF8_MAC];
        }
+#endif

        // Otherwise we'll return the default UTF-8 encoding.
        return PM_ENCODING_UTF_8_ENTRY;
@ -5040,11 +5056,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                break;
            case 'B': case 'b':
                ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("Big5", PM_ENCODING_BIG5);
                ENCODING2("Big5-HKSCS", "Big5-HKSCS:2008", PM_ENCODING_BIG5_HKSCS);
                ENCODING1("Big5-UAO", PM_ENCODING_BIG5_UAO);
+#endif
                break;
            case 'C': case 'c':
+                ENCODING1("CP65001", PM_ENCODING_UTF_8);
+                ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("CESU-8", PM_ENCODING_CESU_8);
                ENCODING1("CP437", PM_ENCODING_IBM437);
                ENCODING1("CP720", PM_ENCODING_IBM720);
@ -5064,7 +5085,6 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                ENCODING1("CP874", PM_ENCODING_WINDOWS_874);
                ENCODING1("CP878", PM_ENCODING_KOI8_R);
                ENCODING1("CP863", PM_ENCODING_IBM863);
-                ENCODING2("CP932", "csWindows31J", PM_ENCODING_WINDOWS_31J);
                ENCODING1("CP936", PM_ENCODING_GBK);
                ENCODING1("CP949", PM_ENCODING_CP949);
                ENCODING1("CP950", PM_ENCODING_CP950);
@ -5079,25 +5099,30 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                ENCODING1("CP1257", PM_ENCODING_WINDOWS_1257);
                ENCODING1("CP1258", PM_ENCODING_WINDOWS_1258);
                ENCODING1("CP51932", PM_ENCODING_CP51932);
-                ENCODING1("CP65001", PM_ENCODING_UTF_8);
+#endif
                break;
            case 'E': case 'e':
                ENCODING2("EUC-JP", "eucJP", PM_ENCODING_EUC_JP);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING2("eucJP-ms", "euc-jp-ms", PM_ENCODING_EUC_JP_MS);
                ENCODING2("EUC-JIS-2004", "EUC-JISX0213", PM_ENCODING_EUC_JIS_2004);
                ENCODING2("EUC-KR", "eucKR", PM_ENCODING_EUC_KR);
                ENCODING2("EUC-CN", "eucCN", PM_ENCODING_GB2312);
                ENCODING2("EUC-TW", "eucTW", PM_ENCODING_EUC_TW);
                ENCODING1("Emacs-Mule", PM_ENCODING_EMACS_MULE);
+#endif
                break;
            case 'G': case 'g':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("GBK", PM_ENCODING_GBK);
                ENCODING1("GB12345", PM_ENCODING_GB12345);
                ENCODING1("GB18030", PM_ENCODING_GB18030);
                ENCODING1("GB1988", PM_ENCODING_GB1988);
                ENCODING1("GB2312", PM_ENCODING_GB2312);
+#endif
                break;
            case 'I': case 'i':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("IBM437", PM_ENCODING_IBM437);
                ENCODING1("IBM720", PM_ENCODING_IBM720);
                ENCODING1("IBM737", PM_ENCODING_IBM737);
@ -5129,12 +5154,16 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                ENCODING2("ISO-8859-14", "ISO8859-14", PM_ENCODING_ISO_8859_14);
                ENCODING2("ISO-8859-15", "ISO8859-15", PM_ENCODING_ISO_8859_15);
                ENCODING2("ISO-8859-16", "ISO8859-16", PM_ENCODING_ISO_8859_16);
+#endif
                break;
            case 'K': case 'k':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("KOI8-R", PM_ENCODING_KOI8_R);
                ENCODING1("KOI8-U", PM_ENCODING_KOI8_U);
+#endif
                break;
            case 'M': case 'm':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("macCentEuro", PM_ENCODING_MAC_CENT_EURO);
                ENCODING1("macCroatian", PM_ENCODING_MAC_CROATIAN);
                ENCODING1("macCyrillic", PM_ENCODING_MAC_CYRILLIC);
@ -5147,31 +5176,39 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                ENCODING1("macThai", PM_ENCODING_MAC_THAI);
                ENCODING1("macTurkish", PM_ENCODING_MAC_TURKISH);
                ENCODING1("macUkraine", PM_ENCODING_MAC_UKRAINE);
+#endif
                break;
            case 'P': case 'p':
                ENCODING1("PCK", PM_ENCODING_WINDOWS_31J);
                break;
            case 'S': case 's':
-                ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
                ENCODING1("SJIS", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+                ENCODING1("Shift_JIS", PM_ENCODING_SHIFT_JIS);
                ENCODING1("SJIS-DoCoMo", PM_ENCODING_SJIS_DOCOMO);
                ENCODING1("SJIS-KDDI", PM_ENCODING_SJIS_KDDI);
                ENCODING1("SJIS-SoftBank", PM_ENCODING_SJIS_SOFTBANK);
                ENCODING1("stateless-ISO-2022-JP", PM_ENCODING_STATELESS_ISO_2022_JP);
                ENCODING1("stateless-ISO-2022-JP-KDDI", PM_ENCODING_STATELESS_ISO_2022_JP_KDDI);
+#endif
                break;
            case 'T': case 't':
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("TIS-620", PM_ENCODING_TIS_620);
+#endif
                break;
            case 'U': case 'u':
                ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
                ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
                ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
                ENCODING1("UTF8-SoftBank", PM_ENCODING_UTF8_SOFTBANK);
+#endif
                break;
            case 'W': case 'w':
                ENCODING1("Windows-31J", PM_ENCODING_WINDOWS_31J);
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
                ENCODING1("Windows-874", PM_ENCODING_WINDOWS_874);
                ENCODING1("Windows-1250", PM_ENCODING_WINDOWS_1250);
                ENCODING1("Windows-1251", PM_ENCODING_WINDOWS_1251);
@ -5182,6 +5219,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                ENCODING1("Windows-1256", PM_ENCODING_WINDOWS_1256);
                ENCODING1("Windows-1257", PM_ENCODING_WINDOWS_1257);
                ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
+#endif
                break;
            case '6':
                ENCODING1("646", PM_ENCODING_US_ASCII);
--- a/prism/encoding.h
+++ b/prism/encoding.h
@ -135,7 +135,14 @@ extern const uint8_t pm_encoding_unicode_table[256];
 */
 typedef enum {
    PM_ENCODING_UTF_8 = 0,
+    PM_ENCODING_US_ASCII,
    PM_ENCODING_ASCII_8BIT,
+    PM_ENCODING_EUC_JP,
+    PM_ENCODING_WINDOWS_31J,
+
+// We optionally support excluding the full set of encodings to only support the
+// minimum necessary to process Ruby code without encoding comments.
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
    PM_ENCODING_BIG5,
    PM_ENCODING_BIG5_HKSCS,
    PM_ENCODING_BIG5_UAO,
@ -148,7 +155,6 @@ typedef enum {
    PM_ENCODING_CP950,
    PM_ENCODING_CP951,
    PM_ENCODING_EMACS_MULE,
-    PM_ENCODING_EUC_JP,
    PM_ENCODING_EUC_JP_MS,
    PM_ENCODING_EUC_JIS_2004,
    PM_ENCODING_EUC_KR,
@ -208,7 +214,6 @@ typedef enum {
    PM_ENCODING_STATELESS_ISO_2022_JP,
    PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
    PM_ENCODING_TIS_620,
-    PM_ENCODING_US_ASCII,
    PM_ENCODING_UTF8_MAC,
    PM_ENCODING_UTF8_DOCOMO,
    PM_ENCODING_UTF8_KDDI,
@ -222,8 +227,9 @@ typedef enum {
    PM_ENCODING_WINDOWS_1256,
    PM_ENCODING_WINDOWS_1257,
    PM_ENCODING_WINDOWS_1258,
-    PM_ENCODING_WINDOWS_31J,
    PM_ENCODING_WINDOWS_874,
+#endif
+
    PM_ENCODING_MAXIMUM
 } pm_encoding_type_t;

--- a/prism/extension.c
+++ b/prism/extension.c
@ -311,7 +311,7 @@ dump(int argc, VALUE *argv, VALUE self) {
    pm_options_t options = { 0 };
    string_options(argc, argv, &input, &options);

-#ifdef PRISM_DEBUG_MODE_BUILD
+#ifdef PRISM_BUILD_DEBUG
    size_t length = pm_string_length(&input);
    char* dup = xmalloc(length);
    memcpy(dup, pm_string_source(&input), length);
@ -320,7 +320,7 @@ dump(int argc, VALUE *argv, VALUE self) {

    VALUE value = dump_input(&input, &options);

-#ifdef PRISM_DEBUG_MODE_BUILD
+#ifdef PRISM_BUILD_DEBUG
    xfree(dup);
 #endif

@ -737,7 +737,7 @@ parse(int argc, VALUE *argv, VALUE self) {
    pm_options_t options = { 0 };
    string_options(argc, argv, &input, &options);

-#ifdef PRISM_DEBUG_MODE_BUILD
+#ifdef PRISM_BUILD_DEBUG
    size_t length = pm_string_length(&input);
    char* dup = xmalloc(length);
    memcpy(dup, pm_string_source(&input), length);
@ -746,7 +746,7 @@ parse(int argc, VALUE *argv, VALUE self) {

    VALUE value = parse_input(&input, &options);

-#ifdef PRISM_DEBUG_MODE_BUILD
+#ifdef PRISM_BUILD_DEBUG
    xfree(dup);
 #endif

--- a/prism/pack.c
+++ b/prism/pack.c
@ -1,16 +1,43 @@
 #include "prism/pack.h"

+// We optionally support parsing String#pack templates. For systems that don't
+// want or need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_PACK define.
+#ifdef PRISM_EXCLUDE_PACK
+
+void pm_pack_parse(void) {}
+
+#else
+
 #include <stdbool.h>
 #include <errno.h>

 static uintmax_t
-strtoumaxc(const char **format);
+strtoumaxc(const char **format) {
+    uintmax_t value = 0;
+    while (**format >= '0' && **format <= '9') {
+        if (value > UINTMAX_MAX / 10) {
+            errno = ERANGE;
+        }
+        value = value * 10 + ((uintmax_t) (**format - '0'));
+        (*format)++;
+    }
+    return value;
+}

 PRISM_EXPORTED_FUNCTION pm_pack_result
-pm_pack_parse(pm_pack_variant variant, const char **format, const char *format_end,
-                            pm_pack_type *type, pm_pack_signed *signed_type, pm_pack_endian *endian, pm_pack_size *size,
-                            pm_pack_length_type *length_type, uint64_t *length, pm_pack_encoding *encoding) {
-
+pm_pack_parse(
+    pm_pack_variant variant,
+    const char **format,
+    const char *format_end,
+    pm_pack_type *type,
+    pm_pack_signed *signed_type,
+    pm_pack_endian *endian,
+    pm_pack_size *size,
+    pm_pack_length_type *length_type,
+    uint64_t *length,
+    pm_pack_encoding *encoding
+) {
    if (*encoding == PM_PACK_ENCODING_START) {
        *encoding = PM_PACK_ENCODING_US_ASCII;
    }
@ -479,15 +506,4 @@ pm_size_to_native(pm_pack_size size) {
    }
 }

-static uintmax_t
-strtoumaxc(const char **format) {
-    uintmax_t value = 0;
-    while (**format >= '0' && **format <= '9') {
-        if (value > UINTMAX_MAX / 10) {
-            errno = ERANGE;
-        }
-        value = value * 10 + ((uintmax_t) (**format - '0'));
-        (*format)++;
-    }
-    return value;
-}
+#endif
--- a/prism/pack.h
+++ b/prism/pack.h
@ -6,6 +6,15 @@
 #ifndef PRISM_PACK_H
 #define PRISM_PACK_H

+// We optionally support parsing String#pack templates. For systems that don't
+// want or need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_PACK define.
+#ifdef PRISM_EXCLUDE_PACK
+
+void pm_pack_parse(void);
+
+#else
+
 #include "prism/defines.h"

 #include <stdint.h>
@ -150,3 +159,5 @@ pm_pack_parse(
 PRISM_EXPORTED_FUNCTION size_t pm_size_to_native(pm_pack_size size);

 #endif
+
+#endif
--- a/prism/prettyprint.h
+++ b/prism/prettyprint.h
@ -6,6 +6,12 @@
 #ifndef PRISM_PRETTYPRINT_H
 #define PRISM_PRETTYPRINT_H

+#ifdef PRISM_EXCLUDE_PRETTYPRINT
+
+void pm_prettyprint(void);
+
+#else
+
 #include "prism/defines.h"

 #include <stdio.h>
@ -24,3 +30,5 @@
 PRISM_EXPORTED_FUNCTION void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node);

 #endif
+
+#endif
--- a/prism/prism.c
+++ b/prism/prism.c
@ -19316,6 +19316,41 @@ pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse
    return node;
 }

+/**
+ * Parse the source and return true if it parses without errors or warnings.
+ */
+PRISM_EXPORTED_FUNCTION bool
+pm_parse_success_p(const uint8_t *source, size_t size, const char *data) {
+    pm_options_t options = { 0 };
+    pm_options_read(&options, data);
+
+    pm_parser_t parser;
+    pm_parser_init(&parser, source, size, &options);
+
+    pm_node_t *node = pm_parse(&parser);
+    pm_node_destroy(&parser, node);
+
+    bool result = parser.error_list.size == 0 && parser.warning_list.size == 0;
+    pm_parser_free(&parser);
+    pm_options_free(&options);
+
+    return result;
+}
+
+#undef PM_CASE_KEYWORD
+#undef PM_CASE_OPERATOR
+#undef PM_CASE_WRITABLE
+#undef PM_STRING_EMPTY
+#undef PM_LOCATION_NODE_BASE_VALUE
+#undef PM_LOCATION_NODE_VALUE
+#undef PM_LOCATION_NULL_VALUE
+#undef PM_LOCATION_TOKEN_VALUE
+
+// We optionally support serializing to a binary string. For systems that don't
+// want or need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_SERIALIZATION define.
+#ifndef PRISM_EXCLUDE_SERIALIZATION
+
 static inline void
 pm_serialize_header(pm_buffer_t *buffer) {
    pm_buffer_append_string(buffer, "PRISM", 5);
@ -19402,14 +19437,7 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s
    pm_options_free(&options);
 }

-#undef PM_CASE_KEYWORD
-#undef PM_CASE_OPERATOR
-#undef PM_CASE_WRITABLE
-#undef PM_STRING_EMPTY
-#undef PM_LOCATION_NODE_BASE_VALUE
-#undef PM_LOCATION_NODE_VALUE
-#undef PM_LOCATION_NULL_VALUE
-#undef PM_LOCATION_TOKEN_VALUE
+#endif

 /** An error that is going to be formatted into the output. */
 typedef struct {
--- a/prism/prism.h
+++ b/prism/prism.h
@ -98,6 +98,11 @@ typedef char * (pm_parse_stream_fgets_t)(char *string, int size, void *stream);
 */
 PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *fgets, const pm_options_t *options);

+// We optionally support serializing to a binary string. For systems that don't
+// want or need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_SERIALIZATION define.
+#ifndef PRISM_EXCLUDE_SERIALIZATION
+
 /**
 * Parse and serialize the AST represented by the source that is read out of the
 * given stream into to the given buffer.
@ -185,6 +190,8 @@ PRISM_EXPORTED_FUNCTION void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t
 */
 PRISM_EXPORTED_FUNCTION void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data);

+#endif
+
 /**
 * Parse the source and return true if it parses without errors or warnings.
 *
@ -220,6 +227,10 @@ const char * pm_token_type_human(pm_token_type_t token_type);
 */
 PRISM_EXPORTED_FUNCTION void pm_parser_errors_format(const pm_parser_t *parser, pm_buffer_t *buffer, bool colorize);

+// We optionally support dumping to JSON. For systems that don't want or need
+// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define.
+#ifndef PRISM_EXCLUDE_JSON
+
 /**
 * Dump JSON to the given buffer.
 *
@ -229,6 +240,8 @@ PRISM_EXPORTED_FUNCTION void pm_parser_errors_format(const pm_parser_t *parser,
 */
 PRISM_EXPORTED_FUNCTION void pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node);

+#endif
+
 /**
 * @mainpage
 *
--- a/prism/templates/src/node.c.erb
+++ b/prism/templates/src/node.c.erb
@ -247,6 +247,10 @@ pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *nod
    }
 }

+// We optionally support dumping to JSON. For systems that don't want or need
+// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define.
+#ifndef PRISM_EXCLUDE_JSON
+
 static void
 pm_dump_json_constant(pm_buffer_t *buffer, const pm_parser_t *parser, pm_constant_id_t constant_id) {
    const pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id);
@ -360,3 +364,5 @@ pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *no
            break;
    }
 }
+
+#endif
--- a/prism/templates/src/prettyprint.c.erb
+++ b/prism/templates/src/prettyprint.c.erb
@ -1,6 +1,15 @@
 <%# encoding: ASCII -%>
 #include "prism/prettyprint.h"

+// We optionally support pretty printing nodes. For systems that don't want or
+// need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_PRETTYPRINT define.
+#ifdef PRISM_EXCLUDE_PRETTYPRINT
+
+void pm_prettyprint(void) {}
+
+#else
+
 static inline void
 prettyprint_location(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_location_t *location) {
    pm_line_column_t start = pm_newline_list_line_column(&parser->newline_list, location->start, parser->start_line);
@ -154,3 +163,5 @@ pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_n
    prettyprint_node(output_buffer, parser, node, &prefix_buffer);
    pm_buffer_free(&prefix_buffer);
 }
+
+#endif
--- a/prism/templates/src/serialize.c.erb
+++ b/prism/templates/src/serialize.c.erb
@ -1,5 +1,10 @@
 #include "prism.h"

+// We optionally support serializing to a binary string. For systems that don't
+// want or need this functionality, it can be turned off with the
+// PRISM_EXCLUDE_SERIALIZATION define.
+#ifndef PRISM_EXCLUDE_SERIALIZATION
+
 #include <stdio.h>

 static inline uint32_t
@ -394,23 +399,4 @@ pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size,
    pm_options_free(&options);
 }

-/**
- * Parse the source and return true if it parses without errors or warnings.
- */
-PRISM_EXPORTED_FUNCTION bool
-pm_parse_success_p(const uint8_t *source, size_t size, const char *data) {
-    pm_options_t options = { 0 };
-    pm_options_read(&options, data);
-
-    pm_parser_t parser;
-    pm_parser_init(&parser, source, size, &options);
-
-    pm_node_t *node = pm_parse(&parser);
-    pm_node_destroy(&parser, node);
-
-    bool result = parser.error_list.size == 0 && parser.warning_list.size == 0;
-    pm_parser_free(&parser);
-    pm_options_free(&options);
-
-    return result;
-}
+#endif
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@ -9,10 +9,13 @@ module Prism
    codepoints_1byte = 0...0x100
    encodings = {
      Encoding::ASCII_8BIT =>   codepoints_1byte,
-      Encoding::US_ASCII =>     codepoints_1byte,
-      Encoding::Windows_1253 => codepoints_1byte
+      Encoding::US_ASCII =>     codepoints_1byte
    }

+    if !ENV["PRISM_BUILD_MINIMAL"]
+      encodings[Encoding::Windows_1253] = codepoints_1byte
+    end
+
    # By default we don't test every codepoint in these encodings because it
    # takes a very long time.
    if ENV["PRISM_TEST_ALL_ENCODINGS"]
@ -205,21 +208,6 @@ module Prism
      assert_equal Encoding.find("utf-8"), actual
    end

-    # This test may be a little confusing. Basically when we use our strpbrk, it
-    # takes into account the encoding of the file.
-    def test_strpbrk_multibyte
-      result = Prism.parse(<<~RUBY)
-        # encoding: Shift_JIS
-        %w[\x81\x5c]
-      RUBY
-
-      assert(result.errors.empty?)
-      assert_equal(
-        (+"\x81\x5c").force_encoding(Encoding::Shift_JIS),
-        result.value.statements.body.first.elements.first.unescaped
-      )
-    end
-
    def test_utf_8_variations
      %w[
        utf-8-unix
@ -238,22 +226,39 @@ module Prism
      assert_equal Encoding.find("ascii-8bit"), encoding
    end

-    def test_slice_encoding
-      slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice
-      assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice
-      assert_equal Encoding::SHIFT_JIS, slice.encoding
-    end
+    if !ENV["PRISM_BUILD_MINIMAL"]
+      # This test may be a little confusing. Basically when we use our strpbrk,
+      # it takes into account the encoding of the file.
+      def test_strpbrk_multibyte
+        result = Prism.parse(<<~RUBY)
+          # encoding: Shift_JIS
+          %w[\x81\x5c]
+        RUBY

-    def test_multibyte_escapes
-      [
-        ["'", "'"],
-        ["\"", "\""],
-        ["`", "`"],
-        ["/", "/"],
-        ["<<'HERE'\n", "\nHERE"],
-        ["<<-HERE\n", "\nHERE"]
-      ].each do |opening, closing|
-        assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n")
+        assert(result.errors.empty?)
+        assert_equal(
+          (+"\x81\x5c").force_encoding(Encoding::Shift_JIS),
+          result.value.statements.body.first.elements.first.unescaped
+        )
+      end
+
+      def test_slice_encoding
+        slice = Prism.parse("# encoding: Shift_JIS\nア").value.slice
+        assert_equal (+"ア").force_encoding(Encoding::SHIFT_JIS), slice
+        assert_equal Encoding::SHIFT_JIS, slice.encoding
+      end
+
+      def test_multibyte_escapes
+        [
+          ["'", "'"],
+          ["\"", "\""],
+          ["`", "`"],
+          ["/", "/"],
+          ["<<'HERE'\n", "\nHERE"],
+          ["<<-HERE\n", "\nHERE"]
+        ].each do |opening, closing|
+          assert Prism.parse_success?("# encoding: shift_jis\n'\\\x82\xA0'\n")
+        end
      end
    end

--- a/test/prism/fuzzer_test.rb
+++ b/test/prism/fuzzer_test.rb
@ -1,9 +1,12 @@
 # frozen_string_literal: true

+return if ENV["PRISM_BUILD_MINIMAL"]
+
 require_relative "test_helper"

 module Prism
-  # These tests are simply to exercise snippets found by the fuzzer that caused invalid memory access.
+  # These tests are simply to exercise snippets found by the fuzzer that caused
+  # invalid memory access.
  class FuzzerTest < TestCase
    def self.snippet(name, source)
      define_method(:"test_fuzzer_#{name}") { Prism.dump(source) }
--- a/test/prism/magic_comment_test.rb
+++ b/test/prism/magic_comment_test.rb
@ -17,11 +17,11 @@ module Prism
      "# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-",
      "# -*- foo: bar; encoding: ascii -*-",
      "# coding \t \r  \v   :     \t \v    \r   ascii-8bit",
-      "# vim: filetype=ruby, fileencoding=big5, tabsize=3, shiftwidth=3"
+      "# vim: filetype=ruby, fileencoding=windows-31j, tabsize=3, shiftwidth=3"
    ]

-    examples.each do |example|
-      define_method(:"test_magic_comment_#{example}") do
+    examples.each.with_index(1) do |example, index|
+      define_method(:"test_magic_comment_#{index}") do
        assert_magic_comment(example)
      end
    end
--- a/test/prism/parse_test.rb
+++ b/test/prism/parse_test.rb
@ -75,19 +75,21 @@ module Prism
      assert_equal 5, tokens.length
    end

-    def test_dump_file
-      assert_nothing_raised do
-        Prism.dump_file(__FILE__)
-      end
+    if !ENV["PRISM_BUILD_MINIMAL"]
+      def test_dump_file
+        assert_nothing_raised do
+          Prism.dump_file(__FILE__)
+        end

-      error = assert_raise Errno::ENOENT do
-        Prism.dump_file("idontexist.rb")
-      end
+        error = assert_raise Errno::ENOENT do
+          Prism.dump_file("idontexist.rb")
+        end

-      assert_equal "No such file or directory - idontexist.rb", error.message
+        assert_equal "No such file or directory - idontexist.rb", error.message

-      assert_raise TypeError do
-        Prism.dump_file(nil)
+        assert_raise TypeError do
+          Prism.dump_file(nil)
+        end
      end
    end

@ -259,9 +261,11 @@ module Prism
          warn("Created snapshot at #{snapshot}.")
        end

-        # Next, assert that the value can be serialized and deserialized without
-        # changing the shape of the tree.
-        assert_equal_nodes(result.value, Prism.load(source, Prism.dump(source, filepath: relative)).value)
+        if !ENV["PRISM_BUILD_MINIMAL"]
+          # Next, assert that the value can be serialized and deserialized
+          # without changing the shape of the tree.
+          assert_equal_nodes(result.value, Prism.load(source, Prism.dump(source, filepath: relative)).value)
+        end

        # Next, check that the location ranges of each node in the tree are a
        # superset of their respective child nodes.
@ -318,7 +322,9 @@ module Prism
          result = Prism.parse(snippet, filepath: relative)
          assert_empty result.errors

-          assert_equal_nodes(result.value, Prism.load(snippet, Prism.dump(snippet, filepath: relative)).value)
+          if !ENV["PRISM_BUILD_MINIMAL"]
+            assert_equal_nodes(result.value, Prism.load(snippet, Prism.dump(snippet, filepath: relative)).value)
+          end
        end
      end
    end
--- a/test/prism/ruby_api_test.rb
+++ b/test/prism/ruby_api_test.rb
@ -4,20 +4,22 @@ require_relative "test_helper"

 module Prism
  class RubyAPITest < TestCase
-    def test_ruby_api
-      filepath = __FILE__
-      source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8)
+    if !ENV["PRISM_BUILD_MINIMAL"]
+      def test_ruby_api
+        filepath = __FILE__
+        source = File.read(filepath, binmode: true, external_encoding: Encoding::UTF_8)

-      assert_equal Prism.lex(source, filepath: filepath).value, Prism.lex_file(filepath).value
-      assert_equal Prism.dump(source, filepath: filepath), Prism.dump_file(filepath)
+        assert_equal Prism.lex(source, filepath: filepath).value, Prism.lex_file(filepath).value
+        assert_equal Prism.dump(source, filepath: filepath), Prism.dump_file(filepath)

-      serialized = Prism.dump(source, filepath: filepath)
-      ast1 = Prism.load(source, serialized).value
-      ast2 = Prism.parse(source, filepath: filepath).value
-      ast3 = Prism.parse_file(filepath).value
+        serialized = Prism.dump(source, filepath: filepath)
+        ast1 = Prism.load(source, serialized).value
+        ast2 = Prism.parse(source, filepath: filepath).value
+        ast3 = Prism.parse_file(filepath).value

-      assert_equal_nodes ast1, ast2
-      assert_equal_nodes ast2, ast3
+        assert_equal_nodes ast1, ast2
+        assert_equal_nodes ast2, ast3
+      end
    end

    def test_parse_success?
--- a/test/prism/static_inspect_test.rb
+++ b/test/prism/static_inspect_test.rb
@ -54,7 +54,7 @@ module Prism

    def test_source_encoding
      assert_equal "#<Encoding:UTF-8>", static_inspect("__ENCODING__")
-      assert_equal "#<Encoding:Shift_JIS>", static_inspect("__ENCODING__", encoding: "Shift_JIS")
+      assert_equal "#<Encoding:Windows-31J>", static_inspect("__ENCODING__", encoding: "Windows-31J")
    end

    def test_source_file