Added utf8toUnicode tfn
This commit is contained in:
Родитель
499819e232
Коммит
781a506bc9
2
CHANGES
2
CHANGES
|
@ -11,6 +11,8 @@ XX NNN 2012 - 2.7.0-rc3
|
|||
|
||||
* Added IIS and Nginx platform code.
|
||||
|
||||
* Added new transformation utf8toUnicode.
|
||||
|
||||
23 Jul 2012 - 2.6.7
|
||||
-------------------
|
||||
|
||||
|
|
|
@ -74,6 +74,233 @@ static unsigned char *c2x(unsigned what, unsigned char *where);
|
|||
static unsigned char x2c(unsigned char *what);
|
||||
static unsigned char xsingle2c(unsigned char *what);
|
||||
|
||||
/** \brief Validate IPv4 Netmask
|
||||
*
|
||||
* \param mp Pointer to memory pool
|
||||
* \param input Pointer to input data
|
||||
* \param input_len Input data length
|
||||
* \param changed Set if data is changed
|
||||
*
|
||||
* \retval rval On Success
|
||||
*/
|
||||
char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed) {
|
||||
int unicode_len = 0, length = 0;
|
||||
unsigned int d = 0, count = 0;
|
||||
unsigned char c, *utf;
|
||||
char *rval, *data;
|
||||
unsigned int i, len, j;
|
||||
unsigned int bytes_left = input_len;
|
||||
unsigned char *unicode = NULL;
|
||||
|
||||
*changed = 0;
|
||||
|
||||
len = input_len * 7 + 1;
|
||||
data = rval = apr_palloc(mp, len);
|
||||
if (rval == NULL) return NULL;
|
||||
|
||||
|
||||
if (input == NULL) return NULL;
|
||||
|
||||
for(i = 0; i < bytes_left;) {
|
||||
unicode_len = 0; d = 0;
|
||||
utf = (unsigned char *)&input[i];
|
||||
|
||||
c = *utf;
|
||||
|
||||
/* If first byte begins with binary 0 it is single byte encoding */
|
||||
if ((c & 0x80) == 0) {
|
||||
/* single byte unicode (7 bit ASCII equivilent) has no validation */
|
||||
count++;
|
||||
if(count <= len)
|
||||
*data++ = c;
|
||||
}
|
||||
/* If first byte begins with binary 110 it is two byte encoding*/
|
||||
else if ((c & 0xE0) == 0xC0) {
|
||||
/* check we have at least two bytes */
|
||||
if (bytes_left < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
||||
/* check second byte starts with binary 10 */
|
||||
else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||
else {
|
||||
unicode_len = 2;
|
||||
count+=6;
|
||||
if(count <= len) {
|
||||
/* compute character number */
|
||||
d = ((c & 0x1F) << 6) | (*(utf + 1) & 0x3F);
|
||||
*data++ = '%';
|
||||
*data++ = 'u';
|
||||
unicode = apr_psprintf(mp, "%x", d);
|
||||
length = strlen(unicode);
|
||||
|
||||
switch(length) {
|
||||
case 1:
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 2:
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 3:
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 4:
|
||||
case 5:
|
||||
break;
|
||||
}
|
||||
|
||||
for(j=0; j<length; j++) {
|
||||
*data++ = unicode[j];
|
||||
}
|
||||
|
||||
*changed = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* If first byte begins with binary 1110 it is three byte encoding */
|
||||
else if ((c & 0xF0) == 0xE0) {
|
||||
/* check we have at least three bytes */
|
||||
if (bytes_left < 3) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
||||
/* check second byte starts with binary 10 */
|
||||
else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||
/* check third byte starts with binary 10 */
|
||||
else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||
else {
|
||||
unicode_len = 3;
|
||||
count+=6;
|
||||
if(count <= len) {
|
||||
/* compute character number */
|
||||
d = ((c & 0x0F) << 12) | ((*(utf + 1) & 0x3F) << 6) | (*(utf + 2) & 0x3F);
|
||||
*data++ = '%';
|
||||
*data++ = 'u';
|
||||
unicode = apr_psprintf(mp, "%x", d);
|
||||
length = strlen(unicode);
|
||||
|
||||
switch(length) {
|
||||
case 1:
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 2:
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 3:
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 4:
|
||||
case 5:
|
||||
break;
|
||||
}
|
||||
|
||||
for(j=0; j<length; j++) {
|
||||
*data++ = unicode[j];
|
||||
}
|
||||
|
||||
*changed = 1;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
/* If first byte begins with binary 11110 it is four byte encoding */
|
||||
else if ((c & 0xF8) == 0xF0) {
|
||||
/* restrict characters to UTF-8 range (U+0000 - U+10FFFF)*/
|
||||
if (c >= 0xF5) {
|
||||
*data++ = c;
|
||||
}
|
||||
/* check we have at least four bytes */
|
||||
if (bytes_left < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
|
||||
/* check second byte starts with binary 10 */
|
||||
else if (((*(utf + 1)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||
/* check third byte starts with binary 10 */
|
||||
else if (((*(utf + 2)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||
/* check forth byte starts with binary 10 */
|
||||
else if (((*(utf + 3)) & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
|
||||
else {
|
||||
unicode_len = 4;
|
||||
count+=7;
|
||||
if(count <= len) {
|
||||
/* compute character number */
|
||||
d = ((c & 0x07) << 18) | ((*(utf + 1) & 0x3F) << 12) | ((*(utf + 2) & 0x3F) < 6) | (*(utf + 3) & 0x3F);
|
||||
*data++ = '%';
|
||||
*data++ = 'u';
|
||||
unicode = apr_psprintf(mp, "%x", d);
|
||||
length = strlen(unicode);
|
||||
|
||||
switch(length) {
|
||||
case 1:
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 2:
|
||||
*data++ = '0';
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 3:
|
||||
*data++ = '0';
|
||||
break;
|
||||
case 4:
|
||||
case 5:
|
||||
break;
|
||||
}
|
||||
|
||||
for(j=0; j<length; j++) {
|
||||
*data++ = unicode[j];
|
||||
}
|
||||
|
||||
*changed = 1;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
/* any other first byte is invalid (RFC 3629) */
|
||||
else {
|
||||
count++;
|
||||
if(count <= len)
|
||||
*data++ = c;
|
||||
}
|
||||
|
||||
/* invalid UTF-8 character number range (RFC 3629) */
|
||||
if ((d >= 0xD800) && (d <= 0xDFFF)) {
|
||||
count++;
|
||||
if(count <= len)
|
||||
*data++ = c;
|
||||
}
|
||||
|
||||
/* check for overlong */
|
||||
if ((unicode_len == 4) && (d < 0x010000)) {
|
||||
/* four byte could be represented with less bytes */
|
||||
count++;
|
||||
if(count <= len)
|
||||
*data++ = c;
|
||||
}
|
||||
else if ((unicode_len == 3) && (d < 0x0800)) {
|
||||
/* three byte could be represented with less bytes */
|
||||
count++;
|
||||
if(count <= len)
|
||||
*data++ = c;
|
||||
}
|
||||
else if ((unicode_len == 2) && (d < 0x80)) {
|
||||
/* two byte could be represented with less bytes */
|
||||
count++;
|
||||
if(count <= len)
|
||||
*data++ = c;
|
||||
}
|
||||
|
||||
if(unicode_len > 0) {
|
||||
i += unicode_len;
|
||||
} else {
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
*data ='\0';
|
||||
|
||||
return rval;
|
||||
}
|
||||
|
||||
/** \brief Validate IPv4 Netmask
|
||||
*
|
||||
* \param ip_strv6 Pointer to ipv6 address
|
||||
|
|
|
@ -40,6 +40,14 @@ int DSOLOCAL inet_pton(int family, const char *src, void *dst);
|
|||
#endif
|
||||
#endif
|
||||
|
||||
/* Negative status stored in place of a sequence length by the UTF-8 helpers:
 * fewer bytes are available than the lead byte calls for. */
#define UNICODE_ERROR_CHARACTERS_MISSING -1
|
||||
/* A continuation byte does not start with binary 10. */
#define UNICODE_ERROR_INVALID_ENCODING -2
|
||||
/* NOTE(review): presumably reported for overlong encodings; not referenced by
 * utf8_unicode_inplace_ex - confirm against the validateUtf8Encoding code. */
#define UNICODE_ERROR_OVERLONG_CHARACTER -3
|
||||
/* NOTE(review): presumably reported for code points outside the permitted
 * range; not referenced by utf8_unicode_inplace_ex - confirm at the use site. */
#define UNICODE_ERROR_RESTRICTED_CHARACTER -4
|
||||
/* NOTE(review): presumably a generic decoding failure; not referenced by
 * utf8_unicode_inplace_ex - confirm at the use site. */
#define UNICODE_ERROR_DECODING_ERROR -5
|
||||
|
||||
/* Rewrites well-formed UTF-8 multi-byte sequences of input as %uXXXX escapes
 * into a new NUL-terminated buffer allocated from mp. Returns NULL when input
 * is NULL or the allocation fails; *changed is set to 1 when an escape was
 * written and to 0 otherwise. */
char DSOLOCAL *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed);
|
||||
|
||||
char DSOLOCAL *m_strcasestr(const char *haystack, const char *needle);
|
||||
|
||||
int DSOLOCAL normalize_path_inplace(unsigned char *input, int len, int win, int *changed);
|
||||
|
|
|
@ -4024,12 +4024,6 @@ static int msre_op_validateUrlEncoding_execute(modsec_rec *msr, msre_rule *rule,
|
|||
|
||||
/* validateUtf8Encoding */
|
||||
|
||||
#define UNICODE_ERROR_CHARACTERS_MISSING -1
|
||||
#define UNICODE_ERROR_INVALID_ENCODING -2
|
||||
#define UNICODE_ERROR_OVERLONG_CHARACTER -3
|
||||
#define UNICODE_ERROR_RESTRICTED_CHARACTER -4
|
||||
#define UNICODE_ERROR_DECODING_ERROR -5
|
||||
|
||||
/* NOTE: This is over-commented for ease of verification */
|
||||
static int detect_utf8_character(const unsigned char *p_read, unsigned int length) {
|
||||
int unicode_len = 0;
|
||||
|
|
|
@ -495,6 +495,18 @@ static int msre_fn_urlDecodeUni_execute(apr_pool_t *mptmp, unsigned char *input,
|
|||
return changed;
|
||||
}
|
||||
|
||||
/* utf8toUnicode */

/**
 * Transformation entry point: replaces UTF-8 multi-byte sequences of the
 * input with %uXXXX escapes via utf8_unicode_inplace_ex().
 *
 * On success *rval points at the converted string and *rval_len holds its
 * length. If the conversion returns NULL the input is handed back untouched
 * and 0 is returned, so callers never receive a NULL result from a
 * non-NULL input.
 *
 * NOTE(review): the result length is taken with strlen(), so output is cut
 * at the first NUL byte copied from the input - confirm whether inputs with
 * embedded NULs need to be supported.
 *
 * Returns non-zero when the data was changed, 0 otherwise.
 */
static int msre_fn_utf8Unicode_execute(apr_pool_t *mptmp, unsigned char *input,
    long int input_len, char **rval, long int *rval_len)
{
    int changed = 0;
    char *converted;

    converted = utf8_unicode_inplace_ex(mptmp, input, input_len, &changed);

    if (converted == NULL) {
        /* no converted buffer: fall back to the original data instead of
         * passing NULL to strlen() */
        *rval = (char *)input;
        *rval_len = ((input != NULL) && (input_len > 0)) ? input_len : 0;
        return 0;
    }

    *rval = converted;
    *rval_len = strlen(converted);

    return changed;
}
|
||||
|
||||
|
||||
/* urlEncode */
|
||||
|
||||
static int msre_fn_urlEncode_execute(apr_pool_t *mptmp, unsigned char *input,
|
||||
|
@ -1018,6 +1030,12 @@ void msre_engine_register_default_tfns(msre_engine *engine) {
|
|||
msre_fn_urlDecodeUni_execute
|
||||
);
|
||||
|
||||
/* Utf8Unicode */
|
||||
msre_engine_tfn_register(engine,
|
||||
"Utf8toUnicode",
|
||||
msre_fn_utf8Unicode_execute
|
||||
);
|
||||
|
||||
/* urlEncode */
|
||||
msre_engine_tfn_register(engine,
|
||||
"urlEncode",
|
||||
|
|
Загрузка…
Ссылка в новой задаче