2023-10-31 20:26:31 +03:00
|
|
|
/**
|
2023-11-30 19:36:10 +03:00
|
|
|
* @file encoding.h
|
2023-10-31 20:26:31 +03:00
|
|
|
*
|
|
|
|
* The encoding interface and implementations used by the parser.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
#ifndef PRISM_ENCODING_H
|
|
|
|
#define PRISM_ENCODING_H
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
#include "prism/defines.h"
|
2023-11-30 20:00:44 +03:00
|
|
|
#include "prism/util/pm_strncasecmp.h"
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-08-15 20:00:54 +03:00
|
|
|
#include <assert.h>
|
2023-06-20 18:53:02 +03:00
|
|
|
#include <stdbool.h>
|
|
|
|
#include <stddef.h>
|
|
|
|
#include <stdint.h>
|
|
|
|
|
2023-10-31 15:54:52 +03:00
|
|
|
/**
 * The set of callbacks and properties that together describe one encoding.
 * The parser uses these to find out how many bytes the character at the
 * current position occupies. The width-returning callbacks yield the number of
 * bytes consumed, or 0 when the upcoming bytes are not valid for the encoding
 * and the character class being asked about.
 */
typedef struct {
    /**
     * Width in bytes of the next character, provided it is valid in this
     * encoding. Reads at most n bytes; n is assumed to be at least 1.
     */
    size_t (*char_width)(const uint8_t *b, ptrdiff_t n);

    /**
     * Width in bytes of the next character, provided it is valid in this
     * encoding and is alphabetical. Reads at most n bytes; n is assumed to be
     * at least 1.
     */
    size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * Width in bytes of the next character, provided it is valid in this
     * encoding and is alphanumeric. Reads at most n bytes; n is assumed to be
     * at least 1.
     */
    size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * True when the next character is valid in this encoding and is an
     * uppercase character. Reads at most n bytes; n is assumed to be at
     * least 1.
     */
    bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);

    /**
     * The encoding's name. It should match a value accepted by Encoding.find
     * in Ruby.
     */
    const char *name;

    /**
     * Whether this is a multibyte encoding.
     */
    bool multibyte;
} pm_encoding_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 20:26:31 +03:00
|
|
|
/**
 * All of the lookup tables use the first (lowest-order) bit of each embedded
 * byte to indicate whether the codepoint is alphabetical.
 *
 * The expansion is parenthesized so the macro behaves as a single value when
 * it is combined with operators that bind tighter than `<<`.
 */
#define PRISM_ENCODING_ALPHABETIC_BIT (1 << 0)
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/**
 * All of the lookup tables use the second bit of each embedded byte to
 * indicate whether the codepoint is alphanumeric.
 *
 * The expansion is parenthesized so the macro behaves as a single value when
 * it is combined with operators that bind tighter than `<<`.
 */
#define PRISM_ENCODING_ALPHANUMERIC_BIT (1 << 1)
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/**
 * All of the lookup tables use the third bit of each embedded byte to indicate
 * whether the codepoint is uppercase.
 *
 * The expansion is parenthesized so the macro behaves as a single value when
 * it is combined with operators that bind tighter than `<<`.
 */
#define PRISM_ENCODING_UPPERCASE_BIT (1 << 2)
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 15:54:52 +03:00
|
|
|
/**
 * Report the byte width of the next UTF-8 character when that character is
 * alphabetical.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns The number of bytes the next character occupies if it is valid in
 *     the encoding, or 0 if it is not.
 */
size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
|
2023-10-31 15:54:52 +03:00
|
|
|
|
|
|
|
/**
 * Report the byte width of the next UTF-8 character when that character is
 * alphanumeric.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns The number of bytes the next character occupies if it is valid in
 *     the encoding, or 0 if it is not.
 */
size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
|
2023-10-31 15:54:52 +03:00
|
|
|
|
|
|
|
/**
 * Report whether the next character in the UTF-8 encoding is an uppercase
 * character.
 *
 * @param b The bytes to read.
 * @param n The number of bytes that can be read.
 * @returns True if the next character is valid in the encoding and is an
 *     uppercase character, or false if it is not.
 */
bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 15:54:52 +03:00
|
|
|
/**
 * Lookup table shared by the UTF-8 encoding file and the parser itself, which
 * references it directly to speed up processing of the default encoding. Each
 * entry indicates whether a character is alphabetical, alphanumeric, or
 * uppercase in unicode mappings.
 */
extern const uint8_t pm_encoding_unicode_table[256];
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-11-30 20:00:44 +03:00
|
|
|
/**
 * The complete list of encodings that prism supports. Enumerators are numbered
 * consecutively starting from PM_ENCODING_UTF_8 at 0.
 */
typedef enum {
    PM_ENCODING_UTF_8 = 0,
    PM_ENCODING_ASCII,
    PM_ENCODING_ASCII_8BIT,
    PM_ENCODING_BIG5,
    PM_ENCODING_BIG5_HKSCS,
    PM_ENCODING_BIG5_UAO,
    PM_ENCODING_CESU_8,
    PM_ENCODING_CP51932,
    PM_ENCODING_CP850,
    PM_ENCODING_CP852,
    PM_ENCODING_CP855,
    PM_ENCODING_CP949,
    PM_ENCODING_CP950,
    PM_ENCODING_CP951,
    PM_ENCODING_EMACS_MULE,
    PM_ENCODING_EUC_JP,
    PM_ENCODING_EUC_JP_MS,
    PM_ENCODING_EUC_JIS_2004,
    PM_ENCODING_EUC_KR,
    PM_ENCODING_EUC_TW,
    PM_ENCODING_GB12345,
    PM_ENCODING_GB18030,
    PM_ENCODING_GB1988,
    PM_ENCODING_GB2312,
    PM_ENCODING_GBK,
    PM_ENCODING_IBM437,
    PM_ENCODING_IBM720,
    PM_ENCODING_IBM737,
    PM_ENCODING_IBM775,
    PM_ENCODING_IBM852,
    PM_ENCODING_IBM855,
    PM_ENCODING_IBM857,
    PM_ENCODING_IBM860,
    PM_ENCODING_IBM861,
    PM_ENCODING_IBM862,
    PM_ENCODING_IBM863,
    PM_ENCODING_IBM864,
    PM_ENCODING_IBM865,
    PM_ENCODING_IBM866,
    PM_ENCODING_IBM869,
    PM_ENCODING_ISO_8859_1,
    PM_ENCODING_ISO_8859_2,
    PM_ENCODING_ISO_8859_3,
    PM_ENCODING_ISO_8859_4,
    PM_ENCODING_ISO_8859_5,
    PM_ENCODING_ISO_8859_6,
    PM_ENCODING_ISO_8859_7,
    PM_ENCODING_ISO_8859_8,
    PM_ENCODING_ISO_8859_9,
    PM_ENCODING_ISO_8859_10,
    PM_ENCODING_ISO_8859_11,
    PM_ENCODING_ISO_8859_13,
    PM_ENCODING_ISO_8859_14,
    PM_ENCODING_ISO_8859_15,
    PM_ENCODING_ISO_8859_16,
    PM_ENCODING_KOI8_R,
    PM_ENCODING_KOI8_U,
    PM_ENCODING_MAC_CENT_EURO,
    PM_ENCODING_MAC_CROATIAN,
    PM_ENCODING_MAC_CYRILLIC,
    PM_ENCODING_MAC_GREEK,
    PM_ENCODING_MAC_ICELAND,
    PM_ENCODING_MAC_JAPANESE,
    PM_ENCODING_MAC_ROMAN,
    PM_ENCODING_MAC_ROMANIA,
    PM_ENCODING_MAC_THAI,
    PM_ENCODING_MAC_TURKISH,
    PM_ENCODING_MAC_UKRAINE,
    PM_ENCODING_SHIFT_JIS,
    PM_ENCODING_SJIS_DOCOMO,
    PM_ENCODING_SJIS_KDDI,
    PM_ENCODING_SJIS_SOFTBANK,
    PM_ENCODING_STATELESS_ISO_2022_JP,
    PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
    PM_ENCODING_TIS_620,
    PM_ENCODING_UTF8_MAC,
    PM_ENCODING_UTF8_DOCOMO,
    PM_ENCODING_UTF8_KDDI,
    PM_ENCODING_UTF8_SOFTBANK,
    PM_ENCODING_WINDOWS_1250,
    PM_ENCODING_WINDOWS_1251,
    PM_ENCODING_WINDOWS_1252,
    PM_ENCODING_WINDOWS_1253,
    PM_ENCODING_WINDOWS_1254,
    PM_ENCODING_WINDOWS_1255,
    PM_ENCODING_WINDOWS_1256,
    PM_ENCODING_WINDOWS_1257,
    PM_ENCODING_WINDOWS_1258,
    PM_ENCODING_WINDOWS_31J,
    PM_ENCODING_WINDOWS_874
} pm_encoding_type_t;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* This is the default UTF-8 encoding. We need it to quickly create parsers.
|
|
|
|
*/
|
|
|
|
extern const pm_encoding_t *pm_encoding_utf_8;
|
2023-11-30 20:00:44 +03:00
|
|
|
|
|
|
|
/**
|
|
|
|
* Parse the given name of an encoding and return a pointer to the corresponding
|
|
|
|
* encoding struct if one can be found, otherwise return NULL.
|
|
|
|
*
|
|
|
|
* @param start A pointer to the first byte of the name.
|
|
|
|
* @param end A pointer to the last byte of the name.
|
2023-11-30 20:50:49 +03:00
|
|
|
* @returns A pointer to the encoding struct if one is found, otherwise NULL.
|
2023-11-30 20:00:44 +03:00
|
|
|
*/
|
2023-11-30 20:50:49 +03:00
|
|
|
const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
#endif
|