pjs/directory/c-sdk/ldap/libraries/libldap/utf8.c

/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s): 
 */
/* utf8.c - misc. UTF-8 "string" functions. */
#include "ldap.h"
#include "ldap-int.h"

/* Lookup table: encoded length, in bytes, of a UTF-8 character, selected by
 * the top six bits of its first byte, i.e. UTF8len[(byte >> 2) & 0x3F].
 *   0x00-0x7F (ASCII)              -> 1
 *   0x80-0xBF (continuation byte)  -> 0 (not a valid first byte)
 *   0xC0-0xDF -> 2    0xE0-0xEF -> 3    0xF0-0xF7 -> 4
 *   0xF8-0xFB -> 5    0xFC-0xFF -> 6
 * The table is only ever read, so it is declared const.
 */
static const char UTF8len[64]
= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};

int
LDAP_CALL
ldap_utf8len (const char* s)
     /* Return the length, in char's, of the character that starts at s.
        The length is the distance from s to the position reported by
        ldap_utf8next() for the same pointer.
     */
{
    const char* following = ldap_utf8next((char*)s);
    return (int)(following - s);
}

char*
LDAP_CALL
ldap_utf8next (char* s)
     /* Return a pointer to the character that immediately follows *s.
        Works for any valid UTF-8 character, including '\0' and ASCII.
        Malformed input is tolerated: scanning stops at the first byte
        that is not a continuation byte (10xxxxxx), and that byte is
        returned as the start of the next character.  When s itself points
        at a continuation byte (the middle of a character), up to five
        further continuation bytes are skipped, as for a 6-byte character.
     */
{
    unsigned char* cursor = (unsigned char*)s;
    const int length = UTF8len [(*cursor >> 2) & 0x3F];
    /* Number of continuation bytes to look for after the first byte. */
    int remaining = (length == 0) ? 5 : length - 1;

    while (remaining-- > 0) {
        ++cursor;
        if ((*cursor & 0xC0) != 0x80) {
            /* Sequence ended early: this byte starts the next character. */
            return (char*) cursor;
        }
    }
    /* cursor is on the last byte of the character; step past it. */
    return (char*) (cursor + 1);
}

char*
LDAP_CALL
ldap_utf8prev (char* s)
     /* Return a pointer to the character that immediately precedes *s.
        Steps backwards over continuation bytes (10xxxxxx) until a byte
        that is not a continuation byte is found, moving at most six bytes
        so that malformed input cannot make the scan run away.
        The caller must ensure that the bytes before s are readable.
     */
{
    unsigned char* cursor = (unsigned char*)s;
    int steps = 0;

    do {
        --cursor;
        ++steps;
    } while (((*cursor & 0xC0) == 0x80) && (steps < 6));

    return (char*) cursor;
}

int
LDAP_CALL
ldap_utf8copy (char* dst, const char* src)
     /* Copy one character from src to dst and return the number of char's
        copied (at most six).  No terminating 0 is appended.
        Works for any valid UTF-8 character, including '\0' and ASCII.
        Malformed input is tolerated: copying stops before the first byte
        that is not a continuation byte (10xxxxxx).  When src points at a
        continuation byte, it is handled like the start of a 6-byte
        character.
     */
{
    const unsigned char* const start = (const unsigned char*)src;
    const unsigned char* from = start;
    const int length = UTF8len [(*from >> 2) & 0x3F];
    /* Number of continuation bytes expected after the first byte. */
    int remaining = (length == 0) ? 5 : length - 1;

    while (remaining-- > 0) {
        *dst++ = *from++;
        if ((*from & 0xC0) != 0x80) {
            /* Sequence ended early: do not copy the following byte. */
            return (int)(from - start);
        }
    }
    /* Last (or only) byte of the character. */
    *dst = *from++;
    return (int)(from - start);
}

size_t
LDAP_CALL
ldap_utf8characters (const char* src)
     /* Return the number of UTF-8 characters in the 0-terminated array src.
        The terminating 0 is not counted.
     */
{
    char* cursor = (char*)src;
    size_t count = 0;

    while (*cursor) {
        ++count;
        LDAP_UTF8INC(cursor);   /* advance by one character */
    }
    return count;
}

unsigned long LDAP_CALL
ldap_utf8getcc( const char** src )
     /* Decode the character at **src, advance *src past the bytes that were
        consumed, and return the decoded value.
        The payload bits of the first byte are taken, then six more bits
        are appended for each continuation byte (10xxxxxx) that follows,
        up to the number the first byte calls for.  Decoding stops early
        at the first byte that is not a continuation byte; the bits
        gathered so far are returned.  When *src points at a continuation
        byte, its low six bits are used and up to five more continuation
        bytes are consumed.
     */
{
    /* Both tables are indexed by the UTF8len value of the first byte
     * (0 means the first byte is itself a continuation byte).
     */
    static const unsigned char lead_mask[7] =
        {0x3F, 0xFF, 0x1F, 0x0F, 0x07, 0x03, 0x01};
    static const unsigned char trail_max[7] =
        {5, 0, 1, 2, 3, 4, 5};

    const unsigned char* cursor = (const unsigned char*)*src;
    const int length = UTF8len [(*cursor >> 2) & 0x3F];
    unsigned long code = (unsigned long)(*cursor & lead_mask[length]);
    int remaining = trail_max[length];

    ++cursor;
    while (remaining-- > 0 && (*cursor & 0xC0) == 0x80) {
        code = (code << 6) | (unsigned long)(*cursor & 0x3F);
        ++cursor;
    }

    *src = (const char*)cursor;
    return code;
}

char*
LDAP_CALL
ldap_utf8strtok_r( char* sp, const char* brk, char** next)
     /* strtok_r()-style tokenizer that compares whole characters rather
        than single bytes.  The scan position is kept by the caller in *next.
          sp   - string to tokenize, or NULL to continue from *next.
          brk  - 0-terminated list of delimiter characters.
          next - receives the position just past the delimiter that ended
                 the returned token, or NULL once the end of the string
                 has been reached.
        Returns the start of the next token, or NULL if sp and *next are
        both NULL or only delimiters remain.
        The string is modified: a 0 is stored at the position
        LDAP_UTF8PREV(sp) yields for the delimiter that ends a token.
        NOTE(review): assumes LDAP_UTF8GETC / LDAP_UTF8GETCC read one
        character and advance their pointer argument, and that
        LDAP_UTF8PREV returns the previous position without modifying sp;
        those macros are defined outside this file -- confirm.
     */
{
    const char *bp;
    unsigned long sc, bc;
    char *tok;

    /* No new string given: resume from the saved position, if any. */
    if (sp == NULL && (sp = *next) == NULL)
      return NULL;

    /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
  cont:
    sc = LDAP_UTF8GETC(sp);
    for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
	if (sc == bc)
	  goto cont; /* sc is a delimiter: fetch the next character */
    }

    if (sc == 0) { /* no non-delimiter characters */
	*next = NULL;
	return NULL;
    }
    /* sp was advanced past the token's first character; step back to it. */
    tok = LDAP_UTF8PREV(sp);

    /* Scan token; roughly, sp += strcspn(sp, brk)
     * Note that brk must be 0-terminated; we stop if we see that, too.
     */
    while (1) {
	sc = LDAP_UTF8GETC(sp);
	bp = brk;
	do {
	    /* The terminating 0 of brk is compared too, so the end of the
	     * string (sc == 0) is detected by this same test.
	     */
	    if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
		if (sc == 0) {
		    *next = NULL;
		} else {
		    *next = sp;
		    *(LDAP_UTF8PREV(sp)) = 0;
		}
		return tok;
	    }
	} while (bc != 0);
    }
    /* NOTREACHED */
}

int
LDAP_CALL
ldap_utf8isalnum( char* s )
     /* Return 1 if the byte at s is an ASCII letter or decimal digit,
        0 otherwise.  Any byte with the high bit set (part of a multi-byte
        character) yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if ((ch & 0x80) != 0) {
        return 0;
    }
    return ('A' <= ch && ch <= 'Z')
        || ('a' <= ch && ch <= 'z')
        || ('0' <= ch && ch <= '9');
}

int
LDAP_CALL
ldap_utf8isalpha( char* s )
     /* Return 1 if the byte at s is an ASCII letter (A-Z or a-z),
        0 otherwise.  Any byte with the high bit set yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if ((ch & 0x80) != 0) {
        return 0;
    }
    return ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z');
}

int
LDAP_CALL
ldap_utf8isdigit( char* s )
     /* Return 1 if the byte at s is an ASCII decimal digit (0-9),
        0 otherwise.  Any byte with the high bit set yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if ((ch & 0x80) != 0) {
        return 0;
    }
    return ('0' <= ch && ch <= '9');
}

int
LDAP_CALL
ldap_utf8isxdigit( char* s )
     /* Return 1 if the byte at s is an ASCII hexadecimal digit
        (0-9, A-F or a-f), 0 otherwise.  Any byte with the high bit set
        yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if ((ch & 0x80) != 0) {
        return 0;
    }
    return ('0' <= ch && ch <= '9')
        || ('A' <= ch && ch <= 'F')
        || ('a' <= ch && ch <= 'f');
}

int
LDAP_CALL
ldap_utf8isspace( char* s )
     /* Return 1 if the byte at s is one of the ASCII white-space
        characters space, \t, \n, \r, \v or \f; 0 otherwise.  Any byte
        with the high bit set yields 0.
     */
{
    const unsigned char ch = *(const unsigned char*)s;

    if ((ch & 0x80) != 0) {
        return 0;
    }
    return ch == ' '  || ch == '\t' || ch == '\n'
        || ch == '\r' || ch == '\v' || ch == '\f';
}
Added Directory (ldap) SDK source files for public release 1998-05-28 08:23:42 +04:00			`/* -- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 --`
			`*`
updating license boilerplate 1999-11-02 04:46:24 +03:00			`* The contents of this file are subject to the Netscape Public`
			`* License Version 1.1 (the "License"); you may not use this file`
			`* except in compliance with the License. You may obtain a copy of`
			`* the License at http://www.mozilla.org/NPL/`
Added Directory (ldap) SDK source files for public release 1998-05-28 08:23:42 +04:00			`*`
updating license boilerplate 1999-11-02 04:46:24 +03:00			`* Software distributed under the License is distributed on an "AS`
			`* IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or`
			`* implied. See the License for the specific language governing`
			`* rights and limitations under the License.`
Added Directory (ldap) SDK source files for public release 1998-05-28 08:23:42 +04:00			`*`
updating license boilerplate 1999-11-02 04:46:24 +03:00			`* The Original Code is mozilla.org code.`
			`*`
			`* The Initial Developer of the Original Code is Netscape`
Added Directory (ldap) SDK source files for public release 1998-05-28 08:23:42 +04:00			`* Communications Corporation. Portions created by Netscape are`
updating license boilerplate 1999-11-02 04:46:24 +03:00			`* Copyright (C) 1998 Netscape Communications Corporation. All`
			`* Rights Reserved.`
			`*`
			`* Contributor(s):`
Added Directory (ldap) SDK source files for public release 1998-05-28 08:23:42 +04:00			`*/`
			`/* uft8.c - misc. utf8 "string" functions. */`
			`#include "ldap.h"`
Merge from LDAPCSDK_19981015_BRANCH 1998-11-23 01:03:18 +03:00			`#include "ldap-int.h"`
Added Directory (ldap) SDK source files for public release 1998-05-28 08:23:42 +04:00
			`static char UTF8len[64]`
			`= {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,`
			`0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,`
			`2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6};`

			`int`
			`LDAP_CALL`
			`ldap_utf8len (const char* s)`
			`/* Return the number of char's in the character at s. /`
			`{`
			`return ldap_utf8next((char*)s) - s;`
			`}`

			`char*`
			`LDAP_CALL`
			`ldap_utf8next (char* s)`
			`/* Return a pointer to the character immediately following *s.`
			`Handle any valid UTF-8 character, including '\0' and ASCII.`
			`Try to handle a misaligned pointer or a malformed character.`
			`*/`
			`{`
			`register unsigned char* next = (unsigned char*)s;`
			`switch (UTF8len [(*next >> 2) & 0x3F]) {`
			`case 0: /* erroneous: s points to the middle of a character. */`
			`case 6: if ((*++next & 0xC0) != 0x80) break;`
			`case 5: if ((*++next & 0xC0) != 0x80) break;`
			`case 4: if ((*++next & 0xC0) != 0x80) break;`
			`case 3: if ((*++next & 0xC0) != 0x80) break;`
			`case 2: if ((*++next & 0xC0) != 0x80) break;`
			`case 1: ++next;`
			`}`
			`return (char*) next;`
			`}`

			`char*`
			`LDAP_CALL`
			`ldap_utf8prev (char* s)`
			`/* Return a pointer to the character immediately preceding *s.`
			`Handle any valid UTF-8 character, including '\0' and ASCII.`
			`Try to handle a misaligned pointer or a malformed character.`
			`*/`
			`{`
			`register unsigned char* prev = (unsigned char*)s;`
			`unsigned char* limit = prev - 6;`
			`while (((*--prev & 0xC0) == 0x80) && (prev != limit)) {`
			`;`
			`}`
			`return (char*) prev;`
			`}`

			`int`
			`LDAP_CALL`
			`ldap_utf8copy (char* dst, const char* src)`
			`/* Copy a character from src to dst; return the number of char's copied.`
			`Handle any valid UTF-8 character, including '\0' and ASCII.`
			`Try to handle a misaligned pointer or a malformed character.`
			`*/`
			`{`
			`register const unsigned char* s = (const unsigned char*)src;`
			`switch (UTF8len [(*s >> 2) & 0x3F]) {`
			`case 0: /* erroneous: s points to the middle of a character. */`
			`case 6: dst++ = s++; if ((*s & 0xC0) != 0x80) break;`
			`case 5: dst++ = s++; if ((*s & 0xC0) != 0x80) break;`
			`case 4: dst++ = s++; if ((*s & 0xC0) != 0x80) break;`
			`case 3: dst++ = s++; if ((*s & 0xC0) != 0x80) break;`
			`case 2: dst++ = s++; if ((*s & 0xC0) != 0x80) break;`
			`case 1: dst = s++;`
			`}`
			`return s - (const unsigned char*)src;`
			`}`

			`size_t`
			`LDAP_CALL`
			`ldap_utf8characters (const char* src)`
			`/* Return the number of UTF-8 characters in the 0-terminated array s. */`
			`{`
			`register char* s = (char*)src;`
			`size_t n;`
			`for (n = 0; *s; LDAP_UTF8INC(s)) ++n;`
			`return n;`
			`}`

			`unsigned long LDAP_CALL`
			`ldap_utf8getcc( const char** src )`
			`{`
			`register unsigned long c;`
			`register const unsigned char* s = (const unsigned char)src;`
			`switch (UTF8len [(*s >> 2) & 0x3F]) {`
			`case 0: /* erroneous: s points to the middle of a character. */`
			`c = (*s++) & 0x3F; goto more5;`
			`case 1: c = (*s++); break;`
			`case 2: c = (*s++) & 0x1F; goto more1;`
			`case 3: c = (*s++) & 0x0F; goto more2;`
			`case 4: c = (*s++) & 0x07; goto more3;`
			`case 5: c = (*s++) & 0x03; goto more4;`
			`case 6: c = (*s++) & 0x01; goto more5;`
			`more5: if ((s & 0xC0) != 0x80) break; c = (c << 6) \| ((s++) & 0x3F);`
			`more4: if ((s & 0xC0) != 0x80) break; c = (c << 6) \| ((s++) & 0x3F);`
			`more3: if ((s & 0xC0) != 0x80) break; c = (c << 6) \| ((s++) & 0x3F);`
			`more2: if ((s & 0xC0) != 0x80) break; c = (c << 6) \| ((s++) & 0x3F);`
			`more1: if ((s & 0xC0) != 0x80) break; c = (c << 6) \| ((s++) & 0x3F);`
			`break;`
			`}`
			`src = (const char)s;`
			`return c;`
			`}`

			`char*`
			`LDAP_CALL`
			`ldap_utf8strtok_r( char* sp, const char* brk, char** next)`
			`{`
			`const char *bp;`
			`unsigned long sc, bc;`
			`char *tok;`

			`if (sp == NULL && (sp = *next) == NULL)`
			`return NULL;`

			`/* Skip leading delimiters; roughly, sp += strspn(sp, brk) */`
			`cont:`
			`sc = LDAP_UTF8GETC(sp);`
			`for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {`
			`if (sc == bc)`
			`goto cont;`
			`}`

			`if (sc == 0) { /* no non-delimiter characters */`
			`*next = NULL;`
			`return NULL;`
			`}`
			`tok = LDAP_UTF8PREV(sp);`

			`/* Scan token; roughly, sp += strcspn(sp, brk)`
			`* Note that brk must be 0-terminated; we stop if we see that, too.`
			`*/`
			`while (1) {`
			`sc = LDAP_UTF8GETC(sp);`
			`bp = brk;`
			`do {`
			`if ((bc = LDAP_UTF8GETCC(bp)) == sc) {`
			`if (sc == 0) {`
			`*next = NULL;`
			`} else {`
			`*next = sp;`
			`*(LDAP_UTF8PREV(sp)) = 0;`
			`}`
			`return tok;`
			`}`
			`} while (bc != 0);`
			`}`
			`/* NOTREACHED */`
			`}`

			`int`
			`LDAP_CALL`
			`ldap_utf8isalnum( char* s )`
			`{`
			`register unsigned char c = (unsigned char)s;`
			`if (0x80 & c) return 0;`
			`if (c >= 'A' && c <= 'Z') return 1;`
			`if (c >= 'a' && c <= 'z') return 1;`
			`if (c >= '0' && c <= '9') return 1;`
			`return 0;`
			`}`

			`int`
			`LDAP_CALL`
			`ldap_utf8isalpha( char* s )`
			`{`
			`register unsigned char c = (unsigned char)s;`
			`if (0x80 & c) return 0;`
			`if (c >= 'A' && c <= 'Z') return 1;`
			`if (c >= 'a' && c <= 'z') return 1;`
			`return 0;`
			`}`

			`int`
			`LDAP_CALL`
			`ldap_utf8isdigit( char* s )`
			`{`
			`register unsigned char c = (unsigned char)s;`
			`if (0x80 & c) return 0;`
			`if (c >= '0' && c <= '9') return 1;`
			`return 0;`
			`}`

			`int`
			`LDAP_CALL`
			`ldap_utf8isxdigit( char* s )`
			`{`
			`register unsigned char c = (unsigned char)s;`
			`if (0x80 & c) return 0;`
			`if (c >= '0' && c <= '9') return 1;`
			`if (c >= 'A' && c <= 'F') return 1;`
			`if (c >= 'a' && c <= 'f') return 1;`
			`return 0;`
			`}`

			`int`
			`LDAP_CALL`
			`ldap_utf8isspace( char* s )`
			`{`
			`register unsigned char c = (unsigned char)s;`
			`if (0x80 & c) return 0;`
			`switch (c) {`
			`case ' ':`
			`case '\t':`
			`case '\n':`
			`case '\r':`
			`case '\v':`
			`case '\f':`
			`return 1;`
			`default: break;`
			`}`
			`return 0;`
			`}`