/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * The contents of this file are subject to the Netscape Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/NPL/
 *
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 *
 * The Original Code is mozilla.org code.
 *
 * The Initial Developer of the Original Code is Netscape
 * Communications Corporation.  Portions created by Netscape are
 * Copyright (C) 1998 Netscape Communications Corporation. All
 * Rights Reserved.
 *
 * Contributor(s):
 */

/* utf8.c - misc. utf8 "string" functions. */
#include "ldap.h"
#include "ldap-int.h"

/* Longest byte sequence this module recognizes for a single character. */
enum { UTF8_MAX_BYTES = 6 };

/*
 * Sequence length implied by a byte, indexed by the byte's top six bits
 * ((byte >> 2) & 0x3F):
 *   0x00-0x7F -> 1 (ASCII)
 *   0x80-0xBF -> 0 (continuation byte; not a legal first byte)
 *   0xC0-0xDF -> 2, 0xE0-0xEF -> 3, 0xF0-0xF7 -> 4,
 *   0xF8-0xFB -> 5, 0xFC-0xFF -> 6
 */
static const char UTF8len[64] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 5, 6
};

/*
 * Return how many continuation bytes (10xxxxxx) may follow the byte at *p.
 * A stray continuation byte (table entry 0: p points into the middle of a
 * character) is treated like the first byte of the longest sequence, so
 * that callers skip over the remainder of the damaged character.
 */
static int
utf8_max_trailing (const unsigned char* p)
{
    int len = UTF8len [(*p >> 2) & 0x3F];

    return (len == 0) ? (UTF8_MAX_BYTES - 1) : (len - 1);
}

/*
 * Return the number of char's in the character at *s.
 */
int
LDAP_CALL
ldap_utf8len (const char* s)
{
    return (int) (ldap_utf8next((char*)s) - s);
}

/*
 * Return a pointer to the character immediately following *s.
 * Handle any valid UTF-8 character, including '\0' and ASCII.
 * Try to handle a misaligned pointer or a malformed character:
 * scanning stops at the first byte that is not a continuation byte,
 * even if the first byte promised a longer sequence.
 */
char*
LDAP_CALL
ldap_utf8next (char* s)
{
    unsigned char* next = (unsigned char*)s;
    int trailing = utf8_max_trailing (next);

    while (trailing-- > 0) {
        if ((*++next & 0xC0) != 0x80) {
            /* Sequence ended early; next already points at the
               byte that begins the following character. */
            return (char*) next;
        }
    }
    /* Step past the final byte of the character. */
    return (char*) (next + 1);
}

/*
 * Return a pointer to the character immediately preceding *s.
 * Handle any valid UTF-8 character, including '\0' and ASCII.
 * Try to handle a misaligned pointer or a malformed character.
 *
 * Walks backwards over continuation bytes, at most UTF8_MAX_BYTES bytes
 * in total.  The caller must guarantee that the bytes examined lie inside
 * the buffer; this function has no way to know where the buffer begins.
 */
char*
LDAP_CALL
ldap_utf8prev (char* s)
{
    unsigned char* prev = (unsigned char*)s;
    int steps;

    /* Count steps instead of precomputing "s - 6": forming a pointer in
       front of the buffer is undefined even if it is never dereferenced. */
    for (steps = 0; steps < UTF8_MAX_BYTES; ++steps) {
        --prev;
        if ((*prev & 0xC0) != 0x80) {
            break;  /* reached a byte that is not a continuation byte */
        }
    }
    return (char*) prev;
}

/*
 * Copy a character from src to dst; return the number of char's copied.
 * Handle any valid UTF-8 character, including '\0' and ASCII.
 * Try to handle a misaligned pointer or a malformed character: a
 * sequence that ends early is copied only up to the point where it ends.
 * No terminating 0 is appended to dst.
 */
int
LDAP_CALL
ldap_utf8copy (char* dst, const char* src)
{
    const unsigned char* s = (const unsigned char*)src;
    int trailing = utf8_max_trailing (s);

    while (trailing-- > 0) {
        *dst++ = *s++;
        if ((*s & 0xC0) != 0x80) {
            /* Truncated sequence: the byte at *s belongs to the next
               character and is not copied. */
            return (int) (s - (const unsigned char*)src);
        }
    }
    *dst = *s++;    /* final (or only) byte of the character */
    return (int) (s - (const unsigned char*)src);
}

/*
 * Return the number of UTF-8 characters in the 0-terminated array src.
 */
size_t
LDAP_CALL
ldap_utf8characters (const char* src)
{
    char* s = (char*)src;
    size_t n = 0;

    while (*s) {
        ++n;
        LDAP_UTF8INC(s);    /* macro from ldap.h; expected to advance s
                               by one character */
    }
    return n;
}

/*
 * Decode the character at **src, advance *src past the bytes consumed,
 * and return the decoded value.
 *
 * The payload bits of the first byte are combined with six bits from each
 * following continuation byte.  Decoding stops early at the first byte
 * that is not a continuation byte.  A stray continuation byte in first
 * position contributes its low six bits and may be followed by up to five
 * more continuation bytes.  No check is made for overlong or otherwise
 * invalid encodings.
 */
unsigned long
LDAP_CALL
ldap_utf8getcc( const char** src )
{
    /* Payload mask for the first byte, indexed by its UTF8len entry. */
    static const unsigned char lead_mask[UTF8_MAX_BYTES + 1] = {
        0x3F,   /* 0: stray continuation byte */
        0x7F,   /* 1: ASCII                   */
        0x1F,   /* 2-byte sequence            */
        0x0F,   /* 3-byte sequence            */
        0x07,   /* 4-byte sequence            */
        0x03,   /* 5-byte sequence            */
        0x01    /* 6-byte sequence            */
    };
    const unsigned char* s = (const unsigned char*)*src;
    int trailing = utf8_max_trailing (s);
    unsigned long c;

    c = (unsigned long) (*s & lead_mask [(int) UTF8len [(*s >> 2) & 0x3F]]);
    ++s;

    while (trailing-- > 0 && (*s & 0xC0) == 0x80) {
        c = (c << 6) | ((*s++) & 0x3F);
    }

    *src = (const char*)s;
    return c;
}

/*
 * UTF-8 aware counterpart of strtok_r().
 *
 * sp    string to tokenize, or NULL to continue with the position saved
 *       in *next by an earlier call.
 * brk   0-terminated set of delimiter characters.
 * next  in/out: receives the position at which the following call
 *       should resume, or NULL when the string is exhausted.
 *
 * Returns a pointer to the next token, or NULL if none remains.  The
 * string is modified: a 0 is stored over the first byte of the delimiter
 * that ends the token.
 *
 * LDAP_UTF8GETC, LDAP_UTF8GETCC and LDAP_UTF8PREV come from ldap.h; the
 * code relies on the first two returning the character at their argument
 * and advancing the argument past it.
 */
char*
LDAP_CALL
ldap_utf8strtok_r( char* sp, const char* brk, char** next)
{
    const char *bp;
    unsigned long sc, bc;
    char *tok;

    if (sp == NULL && (sp = *next) == NULL)
        return NULL;

    /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
    do {
        sc = LDAP_UTF8GETC(sp);
        for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
            if (sc == bc)
                break;      /* sc is a delimiter: fetch the next one */
        }
    } while (bc != 0);      /* bc == 0 here means sc matched nothing */

    if (sc == 0) {          /* no non-delimiter characters */
        *next = NULL;
        return NULL;
    }
    /* sp is already past the first token character; step back to it. */
    tok = LDAP_UTF8PREV(sp);

    /* Scan token; roughly, sp += strcspn(sp, brk)
     * Note that brk must be 0-terminated; we stop if we see that, too.
     */
    while (1) {
        sc = LDAP_UTF8GETC(sp);
        bp = brk;
        do {
            if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
                if (sc == 0) {
                    /* End of string: nothing left for a later call. */
                    *next = NULL;
                } else {
                    /* Resume after the delimiter, and terminate the
                       token where the delimiter starts. */
                    *next = sp;
                    *(LDAP_UTF8PREV(sp)) = 0;
                }
                return tok;
            }
        } while (bc != 0);
    }
    /* NOTREACHED */
}

/*
 * Return 1 if the character at *s is an ASCII letter or digit, else 0.
 * Any character outside ASCII (high bit set) yields 0.
 */
int
LDAP_CALL
ldap_utf8isalnum( char* s )
{
    unsigned char c = *(unsigned char*)s;

    if (0x80 & c) return 0;
    if (c >= 'A' && c <= 'Z') return 1;
    if (c >= 'a' && c <= 'z') return 1;
    if (c >= '0' && c <= '9') return 1;
    return 0;
}

/*
 * Return 1 if the character at *s is an ASCII letter, else 0.
 * Any character outside ASCII (high bit set) yields 0.
 */
int
LDAP_CALL
ldap_utf8isalpha( char* s )
{
    unsigned char c = *(unsigned char*)s;

    if (0x80 & c) return 0;
    if (c >= 'A' && c <= 'Z') return 1;
    if (c >= 'a' && c <= 'z') return 1;
    return 0;
}

/*
 * Return 1 if the character at *s is an ASCII decimal digit, else 0.
 * Any character outside ASCII (high bit set) yields 0.
 */
int
LDAP_CALL
ldap_utf8isdigit( char* s )
{
    unsigned char c = *(unsigned char*)s;

    if (0x80 & c) return 0;
    if (c >= '0' && c <= '9') return 1;
    return 0;
}

/*
 * Return 1 if the character at *s is an ASCII hexadecimal digit
 * (0-9, A-F, a-f), else 0.
 * Any character outside ASCII (high bit set) yields 0.
 */
int
LDAP_CALL
ldap_utf8isxdigit( char* s )
{
    unsigned char c = *(unsigned char*)s;

    if (0x80 & c) return 0;
    if (c >= '0' && c <= '9') return 1;
    if (c >= 'A' && c <= 'F') return 1;
    if (c >= 'a' && c <= 'f') return 1;
    return 0;
}

/*
 * Return 1 if the character at *s is ASCII white space (space, tab,
 * newline, carriage return, vertical tab or form feed), else 0.
 * Any character outside ASCII (high bit set) yields 0.
 */
int
LDAP_CALL
ldap_utf8isspace( char* s )
{
    unsigned char c = *(unsigned char*)s;

    if (0x80 & c) return 0;
    switch (c) {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
      case '\v':
      case '\f':
        return 1;
      default:
        break;
    }
    return 0;
}