Add StringBeginsWith, StringEndsWith, and UTF8ToNewUnicode. Move UTF8 handling utilities to separate file. b=131293 r=jst sr=alecf,jag a=asa b=131293

This commit is contained in:
dbaron%dbaron.org 2003-05-21 22:20:27 +00:00
Родитель 1a81e285f2
Коммит 77bf6f4e26
13 изменённых файлов: 679 добавлений и 408 удалений

Просмотреть файл

@ -44,6 +44,7 @@
#include "nsString.h"
#include "nsReadableUtils.h"
#include "nsDebug.h"
#include "nsUTF8Utils.h"
#ifndef nsCharTraits_h___
#include "nsCharTraits.h"
@ -1348,61 +1349,6 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
}
}
/**
 * A character sink (usable with |copy_string|) that counts how many 16-bit
 * units UTF-8 input will occupy once converted.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes are
 * not validated here. A four-byte sequence is counted as two units; every
 * other sequence is counted as one. If the input is malformed, |Length()|
 * reports 0 and all further input is ignored.
 */
class CalculateUTF8Length
  {
    public:
      typedef nsACString::char_type value_type;

      CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }

      // Number of 16-bit units counted so far (0 after an error).
      size_t Length() const { return mLength; }

      /**
       * Counts the |N| bytes starting at |start|.
       *
       * @return the number of bytes consumed; always |N| once an error has
       *         been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
            // ignore any further requests
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          while ( p < end )
            {
              PRUint32 sequenceLength;
              size_t unitCount = 1;

              if ( UTF8traits::isASCII(*p) )
                sequenceLength = 1;
              else if ( UTF8traits::is2byte(*p) )
                sequenceLength = 2;
              else if ( UTF8traits::is3byte(*p) )
                sequenceLength = 3;
              else if ( UTF8traits::is4byte(*p) )
                {
                  sequenceLength = 4;
                  unitCount = 2;
                }
              else if ( UTF8traits::is5byte(*p) )
                sequenceLength = 5;
              else if ( UTF8traits::is6byte(*p) )
                sequenceLength = 6;
              else
                break;

                // A sequence that would run past the fragment is an error.
                // Detect it *before* advancing so that |p| never points
                // further than one past the end of the input.
              if ( PRUint32(end - p) < sequenceLength )
                break;

              p += sequenceLength;
              mLength += unitCount;
            }

          if ( p != end )
            {
              NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
              mErrorEncountered = PR_TRUE;
              mLength = 0;
              return N;
            }
          return p - start;
        }

    private:
      size_t mLength;
      PRBool mErrorEncountered;
  };
void
NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
{

Просмотреть файл

@ -76,18 +76,6 @@
#include "nsStr.h"
/**
 * Classifies a single byte of UTF-8 text by the bit pattern of its
 * high-order bits.
 */
class UTF8traits
  {
    public:
        // 0xxxxxxx
      static PRBool isASCII(char c) { return hasPrefix(c, 0x80, 0x00); }
        // 10xxxxxx
      static PRBool isInSeq(char c) { return hasPrefix(c, 0xC0, 0x80); }
        // 110xxxxx
      static PRBool is2byte(char c) { return hasPrefix(c, 0xE0, 0xC0); }
        // 1110xxxx
      static PRBool is3byte(char c) { return hasPrefix(c, 0xF0, 0xE0); }
        // 11110xxx
      static PRBool is4byte(char c) { return hasPrefix(c, 0xF8, 0xF0); }
        // 111110xx
      static PRBool is5byte(char c) { return hasPrefix(c, 0xFC, 0xF8); }
        // 1111110x
      static PRBool is6byte(char c) { return hasPrefix(c, 0xFE, 0xFC); }

    private:
        // True when the bits of |c| selected by |mask| equal |prefix|.
      static PRBool hasPrefix(char c, int mask, int prefix)
        {
          return (c & mask) == prefix;
        }
  };
#ifdef STANDALONE_MI_STRING_TESTS
// Minimal stand-in for |nsAFlatString|; the destructor must carry the class's own name.
class nsAFlatString { public: virtual ~nsAFlatString() { } };
#endif
@ -566,140 +554,4 @@ class NS_COM NS_ConvertUTF8toUCS2
NS_ConvertUTF8toUCS2( PRUnichar );
};
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
/**
 * A character sink (usable with |copy_string|) that decodes UTF-8 input and
 * writes the result as 16-bit units into a caller-supplied buffer.
 *
 * The destination buffer is not bounds-checked here; the caller must provide
 * one that is large enough. Once malformed input has been seen the converter
 * produces no further output and swallows all remaining input.
 */
class ConvertUTF8toUCS2
  {
    public:
      typedef nsACString::char_type value_type;
      typedef nsAString::char_type  buffer_type;

      ConvertUTF8toUCS2( buffer_type* aBuffer )
          : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}

      // Number of 16-bit units written to the buffer so far.
      size_t Length() const { return mBuffer - mStart; }

      /**
       * Decodes the |N| bytes starting at |start| and appends the result to
       * the destination buffer.
       *
       * @return the number of input bytes consumed; always |N| once an error
       *         has been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          for ( ; p != end /* && *p */; )
            {
              char c = *p++;

              if ( UTF8traits::isASCII(c) )
                {
                  *mBuffer++ = buffer_type(c);
                  continue;
                }

              PRUint32 ucs4;
              PRUint32 minUcs4;    // smallest value that legitimately needs this sequence length
              PRInt32 state = 0;   // number of continuation bytes still expected

              if ( UTF8traits::is2byte(c) )
                {
                  ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
                  state = 1;
                  minUcs4 = 0x00000080;
                }
              else if ( UTF8traits::is3byte(c) )
                {
                  ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
                  state = 2;
                  minUcs4 = 0x00000800;
                }
              else if ( UTF8traits::is4byte(c) )
                {
                  ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
                  state = 3;
                  minUcs4 = 0x00010000;
                }
              else if ( UTF8traits::is5byte(c) )
                {
                  ucs4 = (PRUint32(c) << 24) & 0x03000000L;
                  state = 4;
                  minUcs4 = 0x00200000;
                }
              else if ( UTF8traits::is6byte(c) )
                {
                  ucs4 = (PRUint32(c) << 30) & 0x40000000L;
                  state = 5;
                  minUcs4 = 0x04000000;
                }
              else
                {
                  NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                  mErrorEncountered = PR_TRUE;
                  return N;
                }

              while ( state-- )
                {
                    // A sequence cut short by the end of the input must not
                    // be completed from whatever lies beyond |end|.
                  if ( p == end )
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }

                  c = *p++;

                  if ( UTF8traits::isInSeq(c) )
                    {
                      PRInt32 shift = state * 6;
                      ucs4 |= (PRUint32(c) & 0x3F) << shift;
                    }
                  else
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }
                }

              if ( ucs4 < minUcs4 )
                {
                    // Overlong sequence
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 <= 0xD7FF )
                {
                  *mBuffer++ = ucs4;
                }
              else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
                {
                    // Surrogates
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
                {
                    // Prohibited characters
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 >= PLANE1_BASE )
                {
                  if ( ucs4 >= 0x00110000 )
                    *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                  else
                    {
                        // surrogate, see unicode specification 3.7 for following math.
                      ucs4 -= PLANE1_BASE;
                      *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
                      *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
                    }
                }
              else
                {
                  if ( ucs4 != 0xFEFF ) // ignore BOM
                    *mBuffer++ = ucs4;
                }
            }
          return p - start;
        }

    private:
      buffer_type* mStart;          // first unit of the destination buffer
      buffer_type* mBuffer;         // next unit to be written
      PRBool mErrorEncountered;
  };
#endif /* !defined(nsString2_h__) */

Просмотреть файл

@ -59,6 +59,7 @@ EXPORTS = \
nsSharableString.h \
nsSharedBufferList.h \
nsSlidingString.h \
nsUTF8Utils.h \
nsXPIDLString.h \
$(NULL)

Просмотреть файл

@ -107,11 +107,28 @@ NS_COM PRUnichar* ToNewUnicode( const nsAString& aSource );
* This conversion is not well defined; but it reproduces legacy string behavior.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
*
* @param aSource an 8-bit wide string
* @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
* @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
*/
NS_COM PRUnichar* ToNewUnicode( const nsACString& aSource );
/**
* Returns a new |PRUnichar| buffer containing a zero-terminated copy
* of |aSource|.
*
* Allocates and returns a new |PRUnichar| buffer which you must free with
* |nsMemory::Free|. Performs an encoding conversion by decoding the
* UTF-8 sequences in |aSource| into 16-bit units while copying to
* your new buffer; characters outside the BMP are written as surrogate
* pairs and invalid code points as U+FFFD. The new buffer is
* zero-terminated, but that may not help you if |aSource| contains
* embedded nulls.
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
*/
NS_COM PRUnichar* UTF8ToNewUnicode( const nsACString& aSource );
/**
* Copies |aLength| 16-bit characters from the start of |aSource| to the
* |PRUnichar| buffer |aDest|.
@ -281,6 +298,15 @@ NS_COM PRUint32 CountCharInReadable( const nsAString& aStr,
NS_COM PRUint32 CountCharInReadable( const nsACString& aStr,
char aChar );
NS_COM PRBool StringBeginsWith( const nsAString& aSource,
const nsAString& aSubstring);
NS_COM PRBool StringBeginsWith( const nsACString& aSource,
const nsACString& aSubstring);
NS_COM PRBool StringEndsWith( const nsAString& aSource,
const nsAString& aSubstring);
NS_COM PRBool StringEndsWith( const nsACString& aSource,
const nsACString& aSubstring);
NS_COM PRUint32 HashString( const nsAString& aStr );
NS_COM PRUint32 HashString( const nsACString& aStr );

245
string/public/nsUTF8Utils.h Normal file
Просмотреть файл

@ -0,0 +1,245 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Peter Annema <jaggernaut@netscape.com> (original author)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_
/**
 * Classifies a single byte of UTF-8 text by the bit pattern of its
 * high-order bits.
 */
class UTF8traits
  {
    public:
        // 0xxxxxxx
      static PRBool isASCII(char c) { return hasPrefix(c, 0x80, 0x00); }
        // 10xxxxxx
      static PRBool isInSeq(char c) { return hasPrefix(c, 0xC0, 0x80); }
        // 110xxxxx
      static PRBool is2byte(char c) { return hasPrefix(c, 0xE0, 0xC0); }
        // 1110xxxx
      static PRBool is3byte(char c) { return hasPrefix(c, 0xF0, 0xE0); }
        // 11110xxx
      static PRBool is4byte(char c) { return hasPrefix(c, 0xF8, 0xF0); }
        // 111110xx
      static PRBool is5byte(char c) { return hasPrefix(c, 0xFC, 0xF8); }
        // 1111110x
      static PRBool is6byte(char c) { return hasPrefix(c, 0xFE, 0xFC); }

    private:
        // True when the bits of |c| selected by |mask| equal |prefix|.
      static PRBool hasPrefix(char c, int mask, int prefix)
        {
          return (c & mask) == prefix;
        }
  };
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
/**
 * A character sink (usable with |copy_string|) that decodes UTF-8 input and
 * writes the result as 16-bit units into a caller-supplied buffer.
 *
 * The destination buffer is not bounds-checked here; the caller must provide
 * one that is large enough. Once malformed input has been seen the converter
 * produces no further output and swallows all remaining input.
 */
class ConvertUTF8toUCS2
  {
    public:
      typedef nsACString::char_type value_type;
      typedef nsAString::char_type  buffer_type;

      ConvertUTF8toUCS2( buffer_type* aBuffer )
          : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}

      // Number of 16-bit units written to the buffer so far.
      size_t Length() const { return mBuffer - mStart; }

      /**
       * Decodes the |N| bytes starting at |start| and appends the result to
       * the destination buffer.
       *
       * @return the number of input bytes consumed; always |N| once an error
       *         has been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          for ( ; p != end /* && *p */; )
            {
              char c = *p++;

              if ( UTF8traits::isASCII(c) )
                {
                  *mBuffer++ = buffer_type(c);
                  continue;
                }

              PRUint32 ucs4;
              PRUint32 minUcs4;    // smallest value that legitimately needs this sequence length
              PRInt32 state = 0;   // number of continuation bytes still expected

              if ( UTF8traits::is2byte(c) )
                {
                  ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
                  state = 1;
                  minUcs4 = 0x00000080;
                }
              else if ( UTF8traits::is3byte(c) )
                {
                  ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
                  state = 2;
                  minUcs4 = 0x00000800;
                }
              else if ( UTF8traits::is4byte(c) )
                {
                  ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
                  state = 3;
                  minUcs4 = 0x00010000;
                }
              else if ( UTF8traits::is5byte(c) )
                {
                  ucs4 = (PRUint32(c) << 24) & 0x03000000L;
                  state = 4;
                  minUcs4 = 0x00200000;
                }
              else if ( UTF8traits::is6byte(c) )
                {
                  ucs4 = (PRUint32(c) << 30) & 0x40000000L;
                  state = 5;
                  minUcs4 = 0x04000000;
                }
              else
                {
                  NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                  mErrorEncountered = PR_TRUE;
                  return N;
                }

              while ( state-- )
                {
                    // A sequence cut short by the end of the input must not
                    // be completed from whatever lies beyond |end|.
                  if ( p == end )
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }

                  c = *p++;

                  if ( UTF8traits::isInSeq(c) )
                    {
                      PRInt32 shift = state * 6;
                      ucs4 |= (PRUint32(c) & 0x3F) << shift;
                    }
                  else
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }
                }

              if ( ucs4 < minUcs4 )
                {
                    // Overlong sequence
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 <= 0xD7FF )
                {
                  *mBuffer++ = ucs4;
                }
              else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
                {
                    // Surrogates
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
                {
                    // Prohibited characters
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 >= PLANE1_BASE )
                {
                  if ( ucs4 >= 0x00110000 )
                    *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                  else
                    {
                        // surrogate, see unicode specification 3.7 for following math.
                      ucs4 -= PLANE1_BASE;
                      *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
                      *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
                    }
                }
              else
                {
                  if ( ucs4 != 0xFEFF ) // ignore BOM
                    *mBuffer++ = ucs4;
                }
            }
          return p - start;
        }

    private:
      buffer_type* mStart;          // first unit of the destination buffer
      buffer_type* mBuffer;         // next unit to be written
      PRBool mErrorEncountered;
  };
/**
 * A character sink (usable with |copy_string|) that counts how many 16-bit
 * units UTF-8 input will occupy once converted.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes are
 * not validated here. A four-byte sequence is counted as two units; every
 * other sequence is counted as one. If the input is malformed, |Length()|
 * reports 0 and all further input is ignored.
 */
class CalculateUTF8Length
  {
    public:
      typedef nsACString::char_type value_type;

      CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }

      // Number of 16-bit units counted so far (0 after an error).
      size_t Length() const { return mLength; }

      /**
       * Counts the |N| bytes starting at |start|.
       *
       * @return the number of bytes consumed; always |N| once an error has
       *         been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
            // ignore any further requests
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          while ( p < end )
            {
              PRUint32 sequenceLength;
              size_t unitCount = 1;

              if ( UTF8traits::isASCII(*p) )
                sequenceLength = 1;
              else if ( UTF8traits::is2byte(*p) )
                sequenceLength = 2;
              else if ( UTF8traits::is3byte(*p) )
                sequenceLength = 3;
              else if ( UTF8traits::is4byte(*p) )
                {
                  sequenceLength = 4;
                  unitCount = 2;
                }
              else if ( UTF8traits::is5byte(*p) )
                sequenceLength = 5;
              else if ( UTF8traits::is6byte(*p) )
                sequenceLength = 6;
              else
                break;

                // A sequence that would run past the fragment is an error.
                // Detect it *before* advancing so that |p| never points
                // further than one past the end of the input.
              if ( PRUint32(end - p) < sequenceLength )
                break;

              p += sequenceLength;
              mLength += unitCount;
            }

          if ( p != end )
            {
              NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
              mErrorEncountered = PR_TRUE;
              mLength = 0;
              return N;
            }
          return p - start;
        }

    private:
      size_t mLength;
      PRBool mErrorEncountered;
  };
#endif /* !defined(nsUTF8Utils_h_) */

Просмотреть файл

@ -25,6 +25,7 @@
#include "nsMemory.h"
#include "nsString.h"
#include "nsCRT.h"
#include "nsUTF8Utils.h"
#ifndef nsStringTraits_h___
#include "nsStringTraits.h"
@ -208,6 +209,8 @@ NS_COM
char*
ToNewUTF8String( const nsAString& aSource )
{
// XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
// refactored so that we can use it here without a double-copy.
NS_ConvertUCS2toUTF8 temp(aSource);
char* result;
@ -268,6 +271,26 @@ ToNewUnicode( const nsACString& aSource )
return result;
}
/**
 * Returns a newly allocated, zero-terminated |PRUnichar| copy of the UTF-8
 * text in |aSource|. The caller owns the buffer and releases it with
 * |nsMemory::Free|.
 *
 * Returns 0 if the allocation fails. If |aSource| is empty, or the length
 * calculation rejects it (its |Length()| is then 0), the result is an empty
 * string.
 */
NS_COM
PRUnichar*
UTF8ToNewUnicode( const nsACString& aSource )
  {
    nsACString::const_iterator start, end;
    CalculateUTF8Length calculator;
    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
                calculator);

    PRUnichar *result = NS_STATIC_CAST(PRUnichar*,
        nsMemory::Alloc(sizeof(PRUnichar) * (calculator.Length() + 1)));
    if ( !result )
      return 0;

      // A count of zero means there is nothing to convert, or that the
      // calculator rejected the input. In the latter case the buffer only
      // has room for the terminator, so the converter must not run.
    if ( calculator.Length() == 0 )
      {
        result[0] = PRUnichar(0);
        return result;
      }

    ConvertUTF8toUCS2 converter(result);
    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
                converter);

    NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");

      // Terminate after what was actually written; the converter may emit
      // fewer units than were counted (e.g. it drops a BOM).
    result[converter.Length()] = PRUnichar(0);

    return result;
  }
NS_COM
PRUnichar*
CopyUnicodeTo( const nsAString& aSource, PRUint32 aSrcOffset, PRUnichar* aDest, PRUint32 aLength )
@ -1080,6 +1103,48 @@ CountCharInReadable( const nsACString& aStr,
return count;
}
/**
 * Reports whether the leading characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringBeginsWith( const nsAString& aSource, const nsAString& aSubstring)
  {
    const nsAString::size_type sourceLength = aSource.Length();
    const nsAString::size_type prefixLength = aSubstring.Length();

    if ( prefixLength <= sourceLength )
      return Substring(aSource, 0, prefixLength) == aSubstring;

    return PR_FALSE;
  }
/**
 * Reports whether the leading characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringBeginsWith( const nsACString& aSource, const nsACString& aSubstring)
  {
    const nsACString::size_type sourceLength = aSource.Length();
    const nsACString::size_type prefixLength = aSubstring.Length();

    if ( prefixLength <= sourceLength )
      return Substring(aSource, 0, prefixLength) == aSubstring;

    return PR_FALSE;
  }
/**
 * Reports whether the trailing characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringEndsWith( const nsAString& aSource, const nsAString& aSubstring)
  {
    const nsAString::size_type sourceLength = aSource.Length();
    const nsAString::size_type suffixLength = aSubstring.Length();

    if ( suffixLength <= sourceLength )
      return Substring(aSource, sourceLength - suffixLength, suffixLength) == aSubstring;

    return PR_FALSE;
  }
/**
 * Reports whether the trailing characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringEndsWith( const nsACString& aSource, const nsACString& aSubstring)
  {
    const nsACString::size_type sourceLength = aSource.Length();
    const nsACString::size_type suffixLength = aSubstring.Length();

    if ( suffixLength <= sourceLength )
      return Substring(aSource, sourceLength - suffixLength, suffixLength) == aSubstring;

    return PR_FALSE;
  }
template <class CharT>
class CalculateHashCode
{

Просмотреть файл

@ -42,6 +42,7 @@
#include "nsIServiceManager.h"
#include "nsString.h"
#include "nsCRT.h"
#include "nsUTF8Utils.h"
#include <fcntl.h>
#if defined(NS_WIN32) || defined(XP_OS2_VACPP)
#include <io.h>

Просмотреть файл

@ -44,6 +44,7 @@
#include "nsString.h"
#include "nsReadableUtils.h"
#include "nsDebug.h"
#include "nsUTF8Utils.h"
#ifndef nsCharTraits_h___
#include "nsCharTraits.h"
@ -1348,61 +1349,6 @@ NS_ConvertASCIItoUCS2::NS_ConvertASCIItoUCS2( const nsACString& aCString )
}
}
/**
 * A character sink (usable with |copy_string|) that counts how many 16-bit
 * units UTF-8 input will occupy once converted.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes are
 * not validated here. A four-byte sequence is counted as two units; every
 * other sequence is counted as one. If the input is malformed, |Length()|
 * reports 0 and all further input is ignored.
 */
class CalculateUTF8Length
  {
    public:
      typedef nsACString::char_type value_type;

      CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }

      // Number of 16-bit units counted so far (0 after an error).
      size_t Length() const { return mLength; }

      /**
       * Counts the |N| bytes starting at |start|.
       *
       * @return the number of bytes consumed; always |N| once an error has
       *         been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
            // ignore any further requests
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          while ( p < end )
            {
              PRUint32 sequenceLength;
              size_t unitCount = 1;

              if ( UTF8traits::isASCII(*p) )
                sequenceLength = 1;
              else if ( UTF8traits::is2byte(*p) )
                sequenceLength = 2;
              else if ( UTF8traits::is3byte(*p) )
                sequenceLength = 3;
              else if ( UTF8traits::is4byte(*p) )
                {
                  sequenceLength = 4;
                  unitCount = 2;
                }
              else if ( UTF8traits::is5byte(*p) )
                sequenceLength = 5;
              else if ( UTF8traits::is6byte(*p) )
                sequenceLength = 6;
              else
                break;

                // A sequence that would run past the fragment is an error.
                // Detect it *before* advancing so that |p| never points
                // further than one past the end of the input.
              if ( PRUint32(end - p) < sequenceLength )
                break;

              p += sequenceLength;
              mLength += unitCount;
            }

          if ( p != end )
            {
              NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
              mErrorEncountered = PR_TRUE;
              mLength = 0;
              return N;
            }
          return p - start;
        }

    private:
      size_t mLength;
      PRBool mErrorEncountered;
  };
void
NS_ConvertUTF8toUCS2::Init( const nsACString& aCString )
{

Просмотреть файл

@ -76,18 +76,6 @@
#include "nsStr.h"
/**
 * Classifies a single byte of UTF-8 text by the bit pattern of its
 * high-order bits.
 */
class UTF8traits
  {
    public:
        // 0xxxxxxx
      static PRBool isASCII(char c) { return hasPrefix(c, 0x80, 0x00); }
        // 10xxxxxx
      static PRBool isInSeq(char c) { return hasPrefix(c, 0xC0, 0x80); }
        // 110xxxxx
      static PRBool is2byte(char c) { return hasPrefix(c, 0xE0, 0xC0); }
        // 1110xxxx
      static PRBool is3byte(char c) { return hasPrefix(c, 0xF0, 0xE0); }
        // 11110xxx
      static PRBool is4byte(char c) { return hasPrefix(c, 0xF8, 0xF0); }
        // 111110xx
      static PRBool is5byte(char c) { return hasPrefix(c, 0xFC, 0xF8); }
        // 1111110x
      static PRBool is6byte(char c) { return hasPrefix(c, 0xFE, 0xFC); }

    private:
        // True when the bits of |c| selected by |mask| equal |prefix|.
      static PRBool hasPrefix(char c, int mask, int prefix)
        {
          return (c & mask) == prefix;
        }
  };
#ifdef STANDALONE_MI_STRING_TESTS
// Minimal stand-in for |nsAFlatString|; the destructor must carry the class's own name.
class nsAFlatString { public: virtual ~nsAFlatString() { } };
#endif
@ -566,140 +554,4 @@ class NS_COM NS_ConvertUTF8toUCS2
NS_ConvertUTF8toUCS2( PRUnichar );
};
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
/**
 * A character sink (usable with |copy_string|) that decodes UTF-8 input and
 * writes the result as 16-bit units into a caller-supplied buffer.
 *
 * The destination buffer is not bounds-checked here; the caller must provide
 * one that is large enough. Once malformed input has been seen the converter
 * produces no further output and swallows all remaining input.
 */
class ConvertUTF8toUCS2
  {
    public:
      typedef nsACString::char_type value_type;
      typedef nsAString::char_type  buffer_type;

      ConvertUTF8toUCS2( buffer_type* aBuffer )
          : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}

      // Number of 16-bit units written to the buffer so far.
      size_t Length() const { return mBuffer - mStart; }

      /**
       * Decodes the |N| bytes starting at |start| and appends the result to
       * the destination buffer.
       *
       * @return the number of input bytes consumed; always |N| once an error
       *         has been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          for ( ; p != end /* && *p */; )
            {
              char c = *p++;

              if ( UTF8traits::isASCII(c) )
                {
                  *mBuffer++ = buffer_type(c);
                  continue;
                }

              PRUint32 ucs4;
              PRUint32 minUcs4;    // smallest value that legitimately needs this sequence length
              PRInt32 state = 0;   // number of continuation bytes still expected

              if ( UTF8traits::is2byte(c) )
                {
                  ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
                  state = 1;
                  minUcs4 = 0x00000080;
                }
              else if ( UTF8traits::is3byte(c) )
                {
                  ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
                  state = 2;
                  minUcs4 = 0x00000800;
                }
              else if ( UTF8traits::is4byte(c) )
                {
                  ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
                  state = 3;
                  minUcs4 = 0x00010000;
                }
              else if ( UTF8traits::is5byte(c) )
                {
                  ucs4 = (PRUint32(c) << 24) & 0x03000000L;
                  state = 4;
                  minUcs4 = 0x00200000;
                }
              else if ( UTF8traits::is6byte(c) )
                {
                  ucs4 = (PRUint32(c) << 30) & 0x40000000L;
                  state = 5;
                  minUcs4 = 0x04000000;
                }
              else
                {
                  NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                  mErrorEncountered = PR_TRUE;
                  return N;
                }

              while ( state-- )
                {
                    // A sequence cut short by the end of the input must not
                    // be completed from whatever lies beyond |end|.
                  if ( p == end )
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }

                  c = *p++;

                  if ( UTF8traits::isInSeq(c) )
                    {
                      PRInt32 shift = state * 6;
                      ucs4 |= (PRUint32(c) & 0x3F) << shift;
                    }
                  else
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }
                }

              if ( ucs4 < minUcs4 )
                {
                    // Overlong sequence
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 <= 0xD7FF )
                {
                  *mBuffer++ = ucs4;
                }
              else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
                {
                    // Surrogates
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
                {
                    // Prohibited characters
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 >= PLANE1_BASE )
                {
                  if ( ucs4 >= 0x00110000 )
                    *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                  else
                    {
                        // surrogate, see unicode specification 3.7 for following math.
                      ucs4 -= PLANE1_BASE;
                      *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
                      *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
                    }
                }
              else
                {
                  if ( ucs4 != 0xFEFF ) // ignore BOM
                    *mBuffer++ = ucs4;
                }
            }
          return p - start;
        }

    private:
      buffer_type* mStart;          // first unit of the destination buffer
      buffer_type* mBuffer;         // next unit to be written
      PRBool mErrorEncountered;
  };
#endif /* !defined(nsString2_h__) */

Просмотреть файл

@ -59,6 +59,7 @@ EXPORTS = \
nsSharableString.h \
nsSharedBufferList.h \
nsSlidingString.h \
nsUTF8Utils.h \
nsXPIDLString.h \
$(NULL)

Просмотреть файл

@ -107,11 +107,28 @@ NS_COM PRUnichar* ToNewUnicode( const nsAString& aSource );
* This conversion is not well defined; but it reproduces legacy string behavior.
* The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls.
*
* @param aSource an 8-bit wide string
* @param aSource an 8-bit wide string (a C-string, NOT UTF-8)
* @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
*/
NS_COM PRUnichar* ToNewUnicode( const nsACString& aSource );
/**
* Returns a new |PRUnichar| buffer containing a zero-terminated copy
* of |aSource|.
*
* Allocates and returns a new |PRUnichar| buffer which you must free with
* |nsMemory::Free|. Performs an encoding conversion by decoding the
* UTF-8 sequences in |aSource| into 16-bit units while copying to
* your new buffer; characters outside the BMP are written as surrogate
* pairs and invalid code points as U+FFFD. The new buffer is
* zero-terminated, but that may not help you if |aSource| contains
* embedded nulls.
*
* @param aSource an 8-bit wide string, UTF-8 encoded
* @return a new |PRUnichar| buffer you must free with |nsMemory::Free|.
*/
NS_COM PRUnichar* UTF8ToNewUnicode( const nsACString& aSource );
/**
* Copies |aLength| 16-bit characters from the start of |aSource| to the
* |PRUnichar| buffer |aDest|.
@ -281,6 +298,15 @@ NS_COM PRUint32 CountCharInReadable( const nsAString& aStr,
NS_COM PRUint32 CountCharInReadable( const nsACString& aStr,
char aChar );
NS_COM PRBool StringBeginsWith( const nsAString& aSource,
const nsAString& aSubstring);
NS_COM PRBool StringBeginsWith( const nsACString& aSource,
const nsACString& aSubstring);
NS_COM PRBool StringEndsWith( const nsAString& aSource,
const nsAString& aSubstring);
NS_COM PRBool StringEndsWith( const nsACString& aSource,
const nsACString& aSubstring);
NS_COM PRUint32 HashString( const nsAString& aStr );
NS_COM PRUint32 HashString( const nsACString& aStr );

Просмотреть файл

@ -0,0 +1,245 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* ***** BEGIN LICENSE BLOCK *****
* Version: NPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Netscape Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/NPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is mozilla.org code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Peter Annema <jaggernaut@netscape.com> (original author)
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the NPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the NPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef nsUTF8Utils_h_
#define nsUTF8Utils_h_
/**
 * Classifies a single byte of UTF-8 text by the bit pattern of its
 * high-order bits.
 */
class UTF8traits
  {
    public:
        // 0xxxxxxx
      static PRBool isASCII(char c) { return hasPrefix(c, 0x80, 0x00); }
        // 10xxxxxx
      static PRBool isInSeq(char c) { return hasPrefix(c, 0xC0, 0x80); }
        // 110xxxxx
      static PRBool is2byte(char c) { return hasPrefix(c, 0xE0, 0xC0); }
        // 1110xxxx
      static PRBool is3byte(char c) { return hasPrefix(c, 0xF0, 0xE0); }
        // 11110xxx
      static PRBool is4byte(char c) { return hasPrefix(c, 0xF8, 0xF0); }
        // 111110xx
      static PRBool is5byte(char c) { return hasPrefix(c, 0xFC, 0xF8); }
        // 1111110x
      static PRBool is6byte(char c) { return hasPrefix(c, 0xFE, 0xFC); }

    private:
        // True when the bits of |c| selected by |mask| equal |prefix|.
      static PRBool hasPrefix(char c, int mask, int prefix)
        {
          return (c & mask) == prefix;
        }
  };
#define PLANE1_BASE 0x00010000
#define UCS2_REPLACEMENT_CHAR 0xfffd
/**
 * A character sink (usable with |copy_string|) that decodes UTF-8 input and
 * writes the result as 16-bit units into a caller-supplied buffer.
 *
 * The destination buffer is not bounds-checked here; the caller must provide
 * one that is large enough. Once malformed input has been seen the converter
 * produces no further output and swallows all remaining input.
 */
class ConvertUTF8toUCS2
  {
    public:
      typedef nsACString::char_type value_type;
      typedef nsAString::char_type  buffer_type;

      ConvertUTF8toUCS2( buffer_type* aBuffer )
          : mStart(aBuffer), mBuffer(aBuffer), mErrorEncountered(PR_FALSE) {}

      // Number of 16-bit units written to the buffer so far.
      size_t Length() const { return mBuffer - mStart; }

      /**
       * Decodes the |N| bytes starting at |start| and appends the result to
       * the destination buffer.
       *
       * @return the number of input bytes consumed; always |N| once an error
       *         has been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          for ( ; p != end /* && *p */; )
            {
              char c = *p++;

              if ( UTF8traits::isASCII(c) )
                {
                  *mBuffer++ = buffer_type(c);
                  continue;
                }

              PRUint32 ucs4;
              PRUint32 minUcs4;    // smallest value that legitimately needs this sequence length
              PRInt32 state = 0;   // number of continuation bytes still expected

              if ( UTF8traits::is2byte(c) )
                {
                  ucs4 = (PRUint32(c) << 6) & 0x000007C0L;
                  state = 1;
                  minUcs4 = 0x00000080;
                }
              else if ( UTF8traits::is3byte(c) )
                {
                  ucs4 = (PRUint32(c) << 12) & 0x0000F000L;
                  state = 2;
                  minUcs4 = 0x00000800;
                }
              else if ( UTF8traits::is4byte(c) )
                {
                  ucs4 = (PRUint32(c) << 18) & 0x001F0000L;
                  state = 3;
                  minUcs4 = 0x00010000;
                }
              else if ( UTF8traits::is5byte(c) )
                {
                  ucs4 = (PRUint32(c) << 24) & 0x03000000L;
                  state = 4;
                  minUcs4 = 0x00200000;
                }
              else if ( UTF8traits::is6byte(c) )
                {
                  ucs4 = (PRUint32(c) << 30) & 0x40000000L;
                  state = 5;
                  minUcs4 = 0x04000000;
                }
              else
                {
                  NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
                  mErrorEncountered = PR_TRUE;
                  return N;
                }

              while ( state-- )
                {
                    // A sequence cut short by the end of the input must not
                    // be completed from whatever lies beyond |end|.
                  if ( p == end )
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }

                  c = *p++;

                  if ( UTF8traits::isInSeq(c) )
                    {
                      PRInt32 shift = state * 6;
                      ucs4 |= (PRUint32(c) & 0x3F) << shift;
                    }
                  else
                    {
                      NS_ERROR("not a UTF8 string");
                      mErrorEncountered = PR_TRUE;
                      return N;
                    }
                }

              if ( ucs4 < minUcs4 )
                {
                    // Overlong sequence
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 <= 0xD7FF )
                {
                  *mBuffer++ = ucs4;
                }
              else if ( /* ucs4 >= 0xD800 && */ ucs4 <= 0xDFFF )
                {
                    // Surrogates
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 == 0xFFFE || ucs4 == 0xFFFF )
                {
                    // Prohibited characters
                  *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                }
              else if ( ucs4 >= PLANE1_BASE )
                {
                  if ( ucs4 >= 0x00110000 )
                    *mBuffer++ = UCS2_REPLACEMENT_CHAR;
                  else
                    {
                        // surrogate, see unicode specification 3.7 for following math.
                      ucs4 -= PLANE1_BASE;
                      *mBuffer++ = (PRUnichar)(ucs4 >> 10) | 0xd800u;
                      *mBuffer++ = (PRUnichar)(ucs4 & 0x3ff) | 0xdc00u;
                    }
                }
              else
                {
                  if ( ucs4 != 0xFEFF ) // ignore BOM
                    *mBuffer++ = ucs4;
                }
            }
          return p - start;
        }

    private:
      buffer_type* mStart;          // first unit of the destination buffer
      buffer_type* mBuffer;         // next unit to be written
      PRBool mErrorEncountered;
  };
/**
 * A character sink (usable with |copy_string|) that counts how many 16-bit
 * units UTF-8 input will occupy once converted.
 *
 * Only the lead byte of each sequence is inspected; continuation bytes are
 * not validated here. A four-byte sequence is counted as two units; every
 * other sequence is counted as one. If the input is malformed, |Length()|
 * reports 0 and all further input is ignored.
 */
class CalculateUTF8Length
  {
    public:
      typedef nsACString::char_type value_type;

      CalculateUTF8Length() : mLength(0), mErrorEncountered(PR_FALSE) { }

      // Number of 16-bit units counted so far (0 after an error).
      size_t Length() const { return mLength; }

      /**
       * Counts the |N| bytes starting at |start|.
       *
       * @return the number of bytes consumed; always |N| once an error has
       *         been seen, so that the caller does not retry.
       */
      PRUint32 write( const value_type* start, PRUint32 N )
        {
            // ignore any further requests
          if ( mErrorEncountered )
            return N;

            // algorithm assumes utf8 units won't
            // be spread across fragments
          const value_type* p = start;
          const value_type* end = start + N;
          while ( p < end )
            {
              PRUint32 sequenceLength;
              size_t unitCount = 1;

              if ( UTF8traits::isASCII(*p) )
                sequenceLength = 1;
              else if ( UTF8traits::is2byte(*p) )
                sequenceLength = 2;
              else if ( UTF8traits::is3byte(*p) )
                sequenceLength = 3;
              else if ( UTF8traits::is4byte(*p) )
                {
                  sequenceLength = 4;
                  unitCount = 2;
                }
              else if ( UTF8traits::is5byte(*p) )
                sequenceLength = 5;
              else if ( UTF8traits::is6byte(*p) )
                sequenceLength = 6;
              else
                break;

                // A sequence that would run past the fragment is an error.
                // Detect it *before* advancing so that |p| never points
                // further than one past the end of the input.
              if ( PRUint32(end - p) < sequenceLength )
                break;

              p += sequenceLength;
              mLength += unitCount;
            }

          if ( p != end )
            {
              NS_ERROR("Not a UTF-8 string. This code should only be used for converting from known UTF-8 strings.");
              mErrorEncountered = PR_TRUE;
              mLength = 0;
              return N;
            }
          return p - start;
        }

    private:
      size_t mLength;
      PRBool mErrorEncountered;
  };
#endif /* !defined(nsUTF8Utils_h_) */

Просмотреть файл

@ -25,6 +25,7 @@
#include "nsMemory.h"
#include "nsString.h"
#include "nsCRT.h"
#include "nsUTF8Utils.h"
#ifndef nsStringTraits_h___
#include "nsStringTraits.h"
@ -208,6 +209,8 @@ NS_COM
char*
ToNewUTF8String( const nsAString& aSource )
{
// XXX The conversion code in NS_ConvertUCS2toUTF8 needs to be
// refactored so that we can use it here without a double-copy.
NS_ConvertUCS2toUTF8 temp(aSource);
char* result;
@ -268,6 +271,26 @@ ToNewUnicode( const nsACString& aSource )
return result;
}
/**
 * Returns a newly allocated, zero-terminated |PRUnichar| copy of the UTF-8
 * text in |aSource|. The caller owns the buffer and releases it with
 * |nsMemory::Free|.
 *
 * Returns 0 if the allocation fails. If |aSource| is empty, or the length
 * calculation rejects it (its |Length()| is then 0), the result is an empty
 * string.
 */
NS_COM
PRUnichar*
UTF8ToNewUnicode( const nsACString& aSource )
  {
    nsACString::const_iterator start, end;
    CalculateUTF8Length calculator;
    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
                calculator);

    PRUnichar *result = NS_STATIC_CAST(PRUnichar*,
        nsMemory::Alloc(sizeof(PRUnichar) * (calculator.Length() + 1)));
    if ( !result )
      return 0;

      // A count of zero means there is nothing to convert, or that the
      // calculator rejected the input. In the latter case the buffer only
      // has room for the terminator, so the converter must not run.
    if ( calculator.Length() == 0 )
      {
        result[0] = PRUnichar(0);
        return result;
      }

    ConvertUTF8toUCS2 converter(result);
    copy_string(aSource.BeginReading(start), aSource.EndReading(end),
                converter);

    NS_ASSERTION(calculator.Length() == converter.Length(), "length mismatch");

      // Terminate after what was actually written; the converter may emit
      // fewer units than were counted (e.g. it drops a BOM).
    result[converter.Length()] = PRUnichar(0);

    return result;
  }
NS_COM
PRUnichar*
CopyUnicodeTo( const nsAString& aSource, PRUint32 aSrcOffset, PRUnichar* aDest, PRUint32 aLength )
@ -1080,6 +1103,48 @@ CountCharInReadable( const nsACString& aStr,
return count;
}
/**
 * Reports whether the leading characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringBeginsWith( const nsAString& aSource, const nsAString& aSubstring)
  {
    const nsAString::size_type sourceLength = aSource.Length();
    const nsAString::size_type prefixLength = aSubstring.Length();

    if ( prefixLength <= sourceLength )
      return Substring(aSource, 0, prefixLength) == aSubstring;

    return PR_FALSE;
  }
/**
 * Reports whether the leading characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringBeginsWith( const nsACString& aSource, const nsACString& aSubstring)
  {
    const nsACString::size_type sourceLength = aSource.Length();
    const nsACString::size_type prefixLength = aSubstring.Length();

    if ( prefixLength <= sourceLength )
      return Substring(aSource, 0, prefixLength) == aSubstring;

    return PR_FALSE;
  }
/**
 * Reports whether the trailing characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringEndsWith( const nsAString& aSource, const nsAString& aSubstring)
  {
    const nsAString::size_type sourceLength = aSource.Length();
    const nsAString::size_type suffixLength = aSubstring.Length();

    if ( suffixLength <= sourceLength )
      return Substring(aSource, sourceLength - suffixLength, suffixLength) == aSubstring;

    return PR_FALSE;
  }
/**
 * Reports whether the trailing characters of |aSource| compare equal to
 * |aSubstring|. A substring longer than the source never matches.
 */
NS_COM PRBool
StringEndsWith( const nsACString& aSource, const nsACString& aSubstring)
  {
    const nsACString::size_type sourceLength = aSource.Length();
    const nsACString::size_type suffixLength = aSubstring.Length();

    if ( suffixLength <= sourceLength )
      return Substring(aSource, sourceLength - suffixLength, suffixLength) == aSubstring;

    return PR_FALSE;
  }
template <class CharT>
class CalculateHashCode
{