Added utf8toUnicode tfn

2012-08-24 17:02:13 +00:00 · 2012-08-24 17:02:13 +00:00 · 781a506bc9
--- a/2
+++ b/2
@ -11,6 +11,8 @@ XX NNN 2012 - 2.7.0-rc3

 * Added IIS and Ngnix platform code.

+ * Added new transformation utf8toUnicode.
+
 23 Jul 2012 - 2.6.7
 -------------------

--- a/apache2/msc_util.c
+++ b/apache2/msc_util.c
@ -74,6 +74,233 @@ static unsigned char *c2x(unsigned what, unsigned char *where);
 static unsigned char x2c(unsigned char *what);
 static unsigned char xsingle2c(unsigned char *what);

+/* "%u" followed by up to eight hex digits of a 32-bit value */
+#define UTF8_UNICODE_ESCAPE_MAX 10
+
+/** \brief Append one raw byte to the output if there is still room
+ *
+ * \param out Current write position
+ * \param end Position reserved for the terminating NUL
+ * \param c Byte to append
+ *
+ * \retval out Updated write position
+ */
+static char *utf8_unicode_put_byte(char *out, const char *end, unsigned char c) {
+    if (out < end) *out++ = (char)c;
+    return out;
+}
+
+/** \brief Append a code point as "%u" followed by lower-case hex digits
+ *
+ * The value is zero padded to four digits; larger values use as many
+ * digits as they need. The caller guarantees that at least
+ * UTF8_UNICODE_ESCAPE_MAX characters are available at out.
+ *
+ * \param out Current write position
+ * \param code_point Value to encode
+ *
+ * \retval out Updated write position
+ */
+static char *utf8_unicode_put_escape(char *out, unsigned int code_point) {
+    static const char hex[] = "0123456789abcdef";
+    int shift = 12;
+
+    *out++ = '%';
+    *out++ = 'u';
+
+    /* start above the fourth digit only when the value needs more than 16 bits */
+    while ((shift < 28) && ((code_point >> (shift + 4)) != 0)) shift += 4;
+
+    for (; shift >= 0; shift -= 4) {
+        *out++ = hex[(code_point >> shift) & 0x0F];
+    }
+
+    return out;
+}
+
+/** \brief Re-encode UTF-8 multi-byte sequences as %uHHHH escapes
+ *
+ * Despite the name, the input is never modified: the result is written to
+ * a buffer allocated from the pool.
+ *
+ *  - 7-bit ASCII bytes and bytes that cannot start a sequence are copied.
+ *  - A well formed two, three or four byte sequence is replaced by "%u"
+ *    and the code point in lower-case hex (at least four digits).
+ *  - A lead byte whose sequence is truncated or has a bad continuation
+ *    byte is skipped; scanning resumes at the following byte.
+ *  - Lead bytes 0xF5..0xF7, surrogate code points and overlong encodings
+ *    additionally get the raw lead byte copied to the output.
+ *
+ * \param mp Pointer to memory pool
+ * \param input Pointer to input data
+ * \param input_len Input data length
+ * \param changed Set to 1 if at least one sequence was re-encoded, else 0
+ *
+ * \retval rval NUL terminated result on success
+ * \retval NULL If input is NULL or the buffer cannot be allocated
+ */
+char *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed) {
+    size_t in_len, cap, i;
+    char *rval, *data, *end;
+
+    *changed = 0;
+
+    /* nothing to do without input; checked before touching the pool */
+    if (input == NULL) return NULL;
+
+    /* a negative length is treated as empty input */
+    in_len = (input_len > 0) ? (size_t)input_len : 0;
+
+    /* refuse lengths whose worst-case output size would wrap around */
+    if (in_len > (((size_t)-1) - 1) / 7) return NULL;
+
+    /* seven characters per input byte exceeds the worst case; +1 for the NUL */
+    cap = in_len * 7 + 1;
+    data = rval = apr_palloc(mp, cap);
+    if (rval == NULL) return NULL;
+    end = rval + cap - 1;
+
+    for (i = 0; i < in_len; ) {
+        const unsigned char *utf = &input[i];
+        /* bytes available from the current position, not from the start */
+        const size_t remaining = in_len - i;
+        const unsigned char c = *utf;
+        unsigned int d = 0;
+        int unicode_len = 0;
+
+        /* If first byte begins with binary 0 it is single byte encoding */
+        if ((c & 0x80) == 0) {
+            /* single byte unicode (7 bit ASCII equivalent) has no validation */
+            data = utf8_unicode_put_byte(data, end, c);
+        }
+        /* If first byte begins with binary 110 it is two byte encoding */
+        else if ((c & 0xE0) == 0xC0) {
+            /* check we have at least two bytes */
+            if (remaining < 2) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            /* check second byte starts with binary 10 */
+            else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            else {
+                unicode_len = 2;
+                d = ((c & 0x1F) << 6) | (utf[1] & 0x3F);
+            }
+        }
+        /* If first byte begins with binary 1110 it is three byte encoding */
+        else if ((c & 0xF0) == 0xE0) {
+            /* check we have at least three bytes */
+            if (remaining < 3) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            /* check second byte starts with binary 10 */
+            else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            /* check third byte starts with binary 10 */
+            else if ((utf[2] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            else {
+                unicode_len = 3;
+                d = ((c & 0x0F) << 12) | ((utf[1] & 0x3F) << 6) | (utf[2] & 0x3F);
+            }
+        }
+        /* If first byte begins with binary 11110 it is four byte encoding */
+        else if ((c & 0xF8) == 0xF0) {
+            /* lead bytes beyond the UTF-8 range (U+0000 - U+10FFFF) are also copied raw */
+            if (c >= 0xF5) {
+                data = utf8_unicode_put_byte(data, end, c);
+            }
+            /* check we have at least four bytes */
+            if (remaining < 4) unicode_len = UNICODE_ERROR_CHARACTERS_MISSING;
+            /* check second byte starts with binary 10 */
+            else if ((utf[1] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            /* check third byte starts with binary 10 */
+            else if ((utf[2] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            /* check fourth byte starts with binary 10 */
+            else if ((utf[3] & 0xC0) != 0x80) unicode_len = UNICODE_ERROR_INVALID_ENCODING;
+            else {
+                unicode_len = 4;
+                d = ((c & 0x07) << 18) | ((utf[1] & 0x3F) << 12) | ((utf[2] & 0x3F) << 6) | (utf[3] & 0x3F);
+            }
+        }
+        /* any other first byte is invalid (RFC 3629) and is copied as is */
+        else {
+            data = utf8_unicode_put_byte(data, end, c);
+        }
+
+        /* a well formed sequence becomes a %u escape */
+        if ((unicode_len > 0) && ((size_t)(end - data) >= UTF8_UNICODE_ESCAPE_MAX)) {
+            data = utf8_unicode_put_escape(data, d);
+            *changed = 1;
+        }
+
+        /* invalid UTF-8 character number range (RFC 3629) */
+        if ((d >= 0xD800) && (d <= 0xDFFF)) {
+            data = utf8_unicode_put_byte(data, end, c);
+        }
+
+        /* overlong: the code point could be represented with fewer bytes */
+        if (((unicode_len == 4) && (d < 0x010000))
+            || ((unicode_len == 3) && (d < 0x0800))
+            || ((unicode_len == 2) && (d < 0x80)))
+        {
+            data = utf8_unicode_put_byte(data, end, c);
+        }
+
+        /* skip the whole sequence when valid, otherwise a single byte */
+        i += (unicode_len > 0) ? (size_t)unicode_len : 1;
+    }
+
+    *data = '\0';
+
+    return rval;
+}
+
 /** \brief Validate IPv4 Netmask
 *
 * \param ip_strv6 Pointer to ipv6 address
--- a/apache2/msc_util.h
+++ b/apache2/msc_util.h
@ -40,6 +40,14 @@ int DSOLOCAL inet_pton(int family, const char *src, void *dst);
 #endif
 #endif

+/*
+ * UTF-8 decoding error codes, shared by the validateUtf8Encoding operator
+ * and the UTF-8 to %u transformation helper. All values are negative and
+ * parenthesised so they expand safely inside any expression.
+ */
+#define UNICODE_ERROR_CHARACTERS_MISSING    (-1)
+#define UNICODE_ERROR_INVALID_ENCODING      (-2)
+#define UNICODE_ERROR_OVERLONG_CHARACTER    (-3)
+#define UNICODE_ERROR_RESTRICTED_CHARACTER  (-4)
+#define UNICODE_ERROR_DECODING_ERROR        (-5)
+
+/*
+ * Re-encodes UTF-8 multi-byte sequences found in input as "%u" followed by
+ * the code point in hex, writing the result to a NUL terminated buffer
+ * allocated from mp (the input buffer itself is only read).
+ * *changed is set to 1 when at least one sequence was re-encoded, else 0.
+ * Returns NULL when input is NULL or the buffer cannot be allocated.
+ */
+char DSOLOCAL *utf8_unicode_inplace_ex(apr_pool_t *mp, unsigned char *input, long int input_len, int *changed);
+
 char DSOLOCAL *m_strcasestr(const char *haystack, const char *needle);

 int DSOLOCAL normalize_path_inplace(unsigned char *input, int len, int win, int *changed);
--- a/apache2/re_operators.c
+++ b/apache2/re_operators.c
@ -4024,12 +4024,6 @@ static int msre_op_validateUrlEncoding_execute(modsec_rec *msr, msre_rule *rule,

 /* validateUtf8Encoding */

-#define UNICODE_ERROR_CHARACTERS_MISSING    -1
-#define UNICODE_ERROR_INVALID_ENCODING      -2
-#define UNICODE_ERROR_OVERLONG_CHARACTER    -3
-#define UNICODE_ERROR_RESTRICTED_CHARACTER  -4
-#define UNICODE_ERROR_DECODING_ERROR        -5
-
 /* NOTE: This is over-commented for ease of verification */
 static int detect_utf8_character(const unsigned char *p_read, unsigned int length) {
    int unicode_len = 0;
--- a/apache2/re_tfns.c
+++ b/apache2/re_tfns.c
@ -495,6 +495,18 @@ static int msre_fn_urlDecodeUni_execute(apr_pool_t *mptmp, unsigned char *input,
    return changed;
 }

+/* utf8toUnicode */
+
+/**
+ * Transformation entry point: replaces UTF-8 multi-byte sequences in the
+ * input with %u escapes by delegating to utf8_unicode_inplace_ex().
+ *
+ * On return *rval / *rval_len describe the transformed value. When the
+ * helper cannot produce a result (it returns NULL) the input is handed
+ * back untouched instead of dereferencing NULL.
+ *
+ * Returns non-zero if the value was changed, 0 otherwise.
+ */
+static int msre_fn_utf8Unicode_execute(apr_pool_t *mptmp, unsigned char *input,
+    long int input_len, char **rval, long int *rval_len)
+{
+    int changed = 0;
+
+    *rval = utf8_unicode_inplace_ex(mptmp, input, input_len, &changed);
+    if (*rval == NULL) {
+        /* no converted buffer available: pass the original data through */
+        *rval = (char *)input;
+        *rval_len = input_len;
+        return 0;
+    }
+
+    /* NOTE: the helper copies NUL bytes through, so the length stops at the first one */
+    *rval_len = (long int)strlen(*rval);
+
+    return changed;
+}
+
+
 /* urlEncode */

 static int msre_fn_urlEncode_execute(apr_pool_t *mptmp, unsigned char *input,
@ -1018,6 +1030,12 @@ void msre_engine_register_default_tfns(msre_engine *engine) {
        msre_fn_urlDecodeUni_execute
    );

+    /* Utf8toUnicode */
+    msre_engine_tfn_register(engine,
+        "Utf8toUnicode",
+        msre_fn_utf8Unicode_execute
+    );
+
    /* urlEncode */
    msre_engine_tfn_register(engine,
        "urlEncode",