pjs/modules/libnls/headers/unistring.h

/*
*****************************************************************************************
*                                                                                       *
* COPYRIGHT:                                                                            *
*   (C) Copyright Taligent, Inc.,  1997                                                 *
*   (C) Copyright International Business Machines Corporation,  1996                    *
*   Licensed Material - Program-Property of IBM - All Rights Reserved.                  *
*   US Government Users Restricted Rights - Use, duplication, or disclosure             *
*   restricted by GSA ADP Schedule Contract with IBM Corp.                              *
*                                                                                       *
*****************************************************************************************
*
*  FILE NAME : unistring.h
*
*	Modification History:
*
*	Date		Name		Description
*	02/05/97	aliu		Added UnicodeString streamIn and streamOut methods.
*	03/26/97	aliu		Added indexOf(UniChar,).
*	04/24/97	aliu		Numerous changes per code review.
*   05/06/97    helena      Added isBogus().
*****************************************************************************************
*/

#ifndef _UNISTRING
#define _UNISTRING

#include <limits.h>
#include <stdlib.h>
#include <iostream.h>
#include <stdio.h>
#include "ptypes.h"
class Locale;

/**
 * Simple Unicode string class. This is a simple class that encapsulates a
 * Unicode string, allowing the user to manipulate it and allowing it to grow
 * and shrink without the user having to worry about this.
 * <P>
 * The char* interfaces on this class work with either the Latin1 (ISO 8859-1)
 * character set or a host character set.  The host character set may be any
 * 8-bit character set for which TPlatformUtilities::mapHostTo8859_1() and
 * TPlatformUtilities::map8859_1ToHost() have been defined; the default
 * implementation maps to and from EBCDIC as defined in RFC 1345.  If the
 * host character set is used, then incoming characters are mapped to Unicode,
 * and outgoing characters are mapped back to the host character set.
 * <P>
 * All inbound transcoding of char* data is done by zero-extending the incoming
 * characters, and all outbound transcoding is done by truncating the top byte
 * from the characters.
 */
#ifdef NLS_MAC
#pragma export on
#endif

class T_UTILITY_API UnicodeString {
	public:
       /**
         * Standard operator new.  This function is only provided because the
         * special operator new would otherwise hide it.  This function just
         * turns around and calls the global operator new function.
         */
        void*                       operator new(size_t size);

        /**
         * Placement new.  This version of operator new just returns the "location"
         * parameter unchanged as its result.  It ignores the "size" parameter.
         * This function is here only to allow stack allocation of UnicodeStrings
         * through the C wrapper interface.  DO NOT CALL THIS FUNCTION FROM C++
         * UNLESS YOU'RE SURE YOU KNOW WHAT YOU'RE DOING!
         * @param size Ignored.  There's no way this function can check the size
         *             of the block you pass to it.  This function trusts you've
         *             allocated enough space at that location to hold a Unicode-
         *             String object.
         * @param location The location where you want the new UnicodeString to
         *                 be stored.  Typically this will be a local variable on
         *                 the stack.  This function trusts that there's enough
         *                 location to hold a UnicodeString object.
         * @return Whatever was passed in for "location".
         */
        void*                       operator new(size_t size, void* location);

									UnicodeString();
									UnicodeString(const UnicodeString&	that);
									UnicodeString(const UniChar*	that);
									UnicodeString(const UniChar*	that,
														t_int32		thatLength);
									UnicodeString(const char*	that); // Must be null-terminated

		/**
         * External-buffer constructor.  This constructor allows UnicodeString to
         * use storage provided by the client as its character buffer, rather than
         * allocating its own storage.  The client passes a pointer to the storage,
         * along with the number of characters currently stored in it (we don't
         * use null termination to determine the string length, and the string is
         * not ever guaranteed to be null-terminated) and the number of characters
         * the storage is capable of holding.
         * <P>
         * WARNING:  Do not change the characters in the buffer during the period
         *           that the UnicodeString it active.  Doing so may lead to
         *           undefined results.
         * <P>
         * WARNING:  If the string grows beyond the capacity of the buffer passed
         *           to this constructor, UnicodeString will allocate its own storage,
         *           and no subsequent changes to the UnicodeString will be reflected
         *           in the buffer passed to this constructor (UnicodeString itself
         *           will continue to work right, however.
         * <P>
         * WARNING:  The string stored in the client-owned buffer is never guaranteed
         *           to be null-terminated.
         * @param charBuffer    A pointer to a range of storage that the new UnicodeString
         *                      should use as its character-storage buffer.  The client
         *                      retains responsibility for deleting this storage after
         *                      the UnicodeString goes away.
         * @param numCharsInBuffer  The number of characters currently stored in charBuffer.
         * @param bufferCapabity    The number of characters the buffer if capable of
         *                          holding.  This must be greater than or equal to
         *                          numCharsInBuffer, but this isn't checked.
         */
                                    UnicodeString(UniChar*  charBuffer,
                                                  t_int32   numCharsInBuffer,
                                                  t_int32   bufferCapacity);


		/* Creates a UnicodeString from a given const char* buffer and an
		 * encoding name.
		 * Netscape added method.
		 * <P>
		 * @param that		A null-terminated char buffer in a given encoding
		 * @param encoding	name for the encoding used for buffer
		 *
		 */
								
									UnicodeString(const char*	that, 
														const char* encoding);
                                    ~UnicodeString() { if (!fClientOwnsStorage)
                                                         delete [] fChars; }
									
        UnicodeString&              operator=(const UnicodeString&  that);
        
        /**
         * Compares a UnicodeString to something else.  All versions of compare()
         * do bitwise comparison; internationally-sensitive comparison requires
         * the Collation library. The offset and length parameters are pinned to
         * permissible values if they are out of range.
         */
        t_int8                      compare(const UnicodeString&    that) const;
        t_int8                      compare(TextOffset              thisOffset,
                                            t_int32                 thisLength,
                                            const UnicodeString&    that,
                                            TextOffset              thatOffset,
                                            t_int32                 thatLength) const;
        t_int8                      compare(const UniChar*          that) const; // Must be null-terminated
        t_int8                      compare(const UniChar*          that,
                                            t_int32                 thatLength) const;
        t_int8                      compare(const char*             that) const;

        /**
         * Compares substrings of two UnicodeStrings. Same as compare(), but
         * takes starting and ending offsets instead of starting offsets and
         * character counts. The characters from the starting offset up to, but
         * not including the ending offset are compared. The start and limit
         * parameters are pinned to permissible values if they are out of range.
         */
		t_int8						compareBetween(	TextOffset				thisStart,
													TextOffset				thisLimit,
													const UnicodeString&	that,
													TextOffset				thatStart,
													TextOffset				thatLimit) const;

		/**
		 * Comparison operators.  All of these operators map through to compare().
		 */
		t_bool						operator==(const UnicodeString&	that) const;
		t_bool						operator!=(const UnicodeString&	that) const;
		t_bool						operator>(const UnicodeString&	that) const;
		t_bool						operator<(const UnicodeString&	that) const;
		t_bool						operator>=(const UnicodeString&	that) const;
		t_bool						operator<=(const UnicodeString&	that) const;

        /**
         * Returns the offset within this String of the first occurrence of the
         * specified substring "that". The search begins with the character at fromIndex
	 * and examines at most forLength characters.  Returns -1 if "that" is not found.
         */
		TextOffset					indexOf(const UnicodeString&	that,
											TextOffset				fromOffset = 0,
											t_uint32				forLength = -1) const;
		TextOffset					indexOf(UniChar					character,
											TextOffset				fromOffset = 0,
											t_uint32				forLength = -1) const;
        /**
         * Returns the offset within this String of the last occurrence of the
         * specified substring "that". The search begins with the character before fromOffset
	 * and examines at most forLength characters (moving backward from fromOffset).
	 * Returns -1 if "that" is not found.
         */
		TextOffset					lastIndexOf(const UnicodeString&	that,
												TextOffset				fromOffset = T_INT32_MAX,
												t_uint32				forLength = -1) const;
		TextOffset					lastIndexOf(UniChar					character,
												TextOffset				fromOffset = T_INT32_MAX,
												t_uint32				forLength = -1) const;

		
		/**
		 * Returns true if "that" appears in its entirety at the beginning of "this"
		 */
		t_bool						startsWith(const UnicodeString&	that) const;
		
		/**
		 * Returns true if "that" appears in its entirety at the end of "this"
		 */
		t_bool						endsWith(const UnicodeString&	that) const;
												
		/**
		 * Stores in "that" a copy of "this" that has had leading and trailing whitespace
		 * removed from it.  "this" itself is unaffected.
		 */
		UnicodeString&				trim(UnicodeString& that) const;

        /**
         * Trims leading and trailing whitespace from this UnicodeString.
         */
		void						trim();

		/**
		 * If the string is shorter than targetLength, adds enough copies of padChar to the
		 * beginning to make the length targetLength and returns true; otherwise returns false.
		 */
		t_bool						padLeading(	t_int32	targetLength,
												UniChar	padChar = ' ');

		/**
		 * If the string is shorter than targetLength, adds enough copies of padChar to the
		 * end to make the length targetLength and returns true; otherwise returns false.
		 */
		t_bool						padTrailing(t_int32	targetLength,
												UniChar	padChar = ' ');

		/**
		 * If the string is longer than targetLength, deletes enough characters from the
		 * end to make the length targetLength and returns true; otherwise returns false.
		 */
		t_bool						truncate(t_int32	targetLength);


        /**
         * Allows UnicodeString to be used with interfaces that use UniChar*.
         * Returns a pointer to the UnicodeString's internal storage. This
         * storage is still owned by the UnicodeString, and the caller is not
         * allowed to change it. The string returned by this function is
         * correctly null-terminated.
         */
									operator const UniChar*() const;

        /**
         * Extracts the characters from a UnicodeString without copying. Returns
         * a pointer to the UnicodeString's internal storage. The caller
         * acquires ownership of this storage and is responsible for deleting
         * it. The UnicodeString is set to empty by this operation. WARNING: The
         * string returned is not null-terminated unless the caller explicitly
         * adds a null character to the end with operator+=().
         */
		UniChar*					orphanStorage() ;

        /**
         * Extracts a substring.  Extracts the specified substring of the
         * UnicodeString into the storage referred to by extractInto. The offset
         * and length parameters are pinned to permissible values if they are
         * out of range.
         * <P>
         * NOTE: No null byte is written to UniChar* extractInto. If you want
         * extractInto to have a null-terminated string you should do
         * extractInto[len]=0, where len is the actual number of characters
         * extracted.
         */
		UnicodeString&				extract(	TextOffset		thisOffset,
												t_int32			thisLength,
												UnicodeString&	extractInto) const;
		void						extract(	TextOffset		thisOffset,
												t_int32			thisLength,
												UniChar*		extractInto) const;

        /**
         * This version of extract() extracts into an array of char. The
         * characters are converted from UniChar to char by truncating the
         * high-order byte (in other words, this function assumes the Unicode
         * data being converted is all from the Latin1 character set). The
         * offset and length parameters are pinned to permissible values if they
         * are out of range.
         * <P>
         * NOTE: No null byte is written. If you want extractInto to have a
         * null-terminated string you should do extractInto[len]=0, where len is
         * the actual number of characters extracted.
         */
		void						extract(	TextOffset		thisOffset,
												t_int32			thisLength,
												char*			extractInto) const;

        /**
         * Extract a substring. Same as extract(), but the substring is
         * specified as starting and ending offsets [start, limit). That is,
         * from the starting offset up to, but not including, the ending offset.
         * The start and limit parameters are pinned to permissible values if
         * they are out of range.
         */
		UnicodeString&				extractBetween(	TextOffset		start,
													TextOffset		limit,
													UnicodeString&	extractInto) const;
		
        /**
         * Return the character at the given offset of this string. If the
         * offset is out of range, return 0 (for the const method) or a
         * reference to a UniChar having the value 0 (for the non-const method).
         */
		UniChar						operator[](TextOffset	offset) const;
		UniChar&					operator[](TextOffset	offset);
		
		/**
		 * Append a string or character.  The specfied string or character is added
		 * to the end of the string.
		 */
		UnicodeString&				operator+=(const UnicodeString&	that);
		UnicodeString&				operator+=(UniChar	that);
		
		/**
		 * Insert a string.  The contents of "that" are inserted into *this so that
		 * the first character from "that" occurs at thisOffset.  If thisOffset is out
		 * of range, the new characters are added at the end.
		 */
		UnicodeString&				insert(	TextOffset				thisOffset,
											const UnicodeString&	that);
		
        /**
         * Remove part of this string. remove() with no arguments removes all
         * characters of this string. Note: The storage is not removed, but the
         * logical length, and possibly the contents, are altered.
         */
		UnicodeString&				remove();
		UnicodeString&				remove(	TextOffset	offset,
											t_int32		length = T_INT32_MAX);

        /**
         * Delete characters. Same as remove(), but the range of characters to
         * delete is specified as a pair of starting and ending offsets [start,
         * limit), rather than a starting offset and a character count. That is,
         * from the starting offset up to, but not including, the ending offset.
         * The start and limit parameters are pinned to permissible values if
         * they are out of range.
         */
		UnicodeString&				removeBetween(	TextOffset	start = 0,
													TextOffset	limit = T_INT32_MAX);

		/**
		 * Replace characters.  Replaces the characters in the range specified by
		 * thisOffset and thisLength with the characters in "that" (or the specfied
		 * subrange of "that").  All parameters are pinned to permissible values
		 * if necessary.  If the source and replacement text are different lengths,
		 * the string will be lengthened or shortened as necessary.
		 */
		UnicodeString&				replace(	TextOffset				thisOffset,
												t_int32					thisLength,
												const UnicodeString&	that,
												TextOffset				thatOffset = 0,
												t_int32					thatLength = T_INT32_MAX);
		UnicodeString&				replace(	TextOffset		thisOfset,
												t_int32			thisLength,
												const UniChar*	that);
		UnicodeString&				replace(	TextOffset		thisOffset,
												t_int32			thisLength,
												const UniChar*	that,
												t_int32			thatLength);

		UnicodeString&				replace(	TextOffset	thisOffset,
												t_int32		thisLength,
												const char*	that);

        /**
         * Replace characters. Same as replace(), but the affected subranges are
         * specified as pairs of starting and ending offsets [start, limit)
         * rather than starting offsets and lengths. That is, from the starting
         * offset up to, but not including, the ending offset. The start and
         * limit parameters are pinned to permissible values if they are out of
         * range.
         */
		UnicodeString&				replaceBetween(	TextOffset				thisStart,
													TextOffset				thisLimit,
													const UnicodeString&	that,
													TextOffset				thatStart = 0,
													TextOffset				thatLimit = T_INT32_MAX);
		
 		/**
		 * Replaces all occurrences of "oldText" in the string in the range defined by
		 * fromOffset and forLength with "newText".
		 */
		void						findAndReplace( const UnicodeString&	oldText,
													const UnicodeString&	newText,
													TextOffset				fromOffset = 0,
													t_uint32				forLength = -1);
        /**
         * Reverse the characters in this string in place. That is, "abcd"
         * becomes "dcba". Return a reference to this string.
         */
		UnicodeString&				reverse();
		UnicodeString&				reverse(TextOffset from,
                                            TextOffset to);

        /**
         * Convert this string to uppercase or lowercase. The methods which take
         * no arguments use the default Locale. (These methods cannot take a
         * default argument of Locale::getDefault() because that would create a
         * circular class dependency between UnicodeString and Locale.)
         */
		UnicodeString&				toUpper();
		UnicodeString&				toUpper(const Locale& locale);
		UnicodeString&				toLower();
		UnicodeString&				toLower(const Locale& locale);
		
        /**
         * Return the length of this string. This will always be a non-negative
         * number.
         */
		t_int32						size() const;

        /**
         * Return the hash code for this string. This is used by hash tables
         * which use this object as a key. The hash code is cached, and
         * recomputed when necessary. For this reason, this method may alter the
         * physical object, even though it is semantically const.
         */
		t_int32						hashCode() const;
		
	    /**
         * Returns the number of display cells the specified substring takes up.
         * This function is designed for Asian text and properly takes into account
         * halfwidth and fullwidth variants of various CJK characters and the combining
         * behavior of the Hangul Jamo characters (with some limitations; see
         * documentation for Unicode::getCellWidth()).
         * <P>
         * In order to avoid dealing
         * with fractions, this function can either be construed to return twice the
         * actual number of display cells or to treat a "cell" as the width of a halfwidth
         * character rather than the width of a fullwidth character.
         * <P>
         * The "asian" parameter controls whether characters considered NEUTRAL by
         * the Unicode class are treated as halfwidth or fullwidth here.  If you set
         * "asian" to FALSE, neutrals are treated as halfwidth, and this function returns
         * a close approximation of how many Latin display cells the text will take up
         * in a monospaced font.
		 */
		t_int32						numDisplayCells(TextOffset	fromOffset = 0,
													t_int32		forLength = T_INT32_MAX,
													t_bool		asian = TRUE) const;
       /**
         * The streamIn and streamOut methods read and write objects of this
         * class as binary, platform-dependent data in the iostream. The stream
         * must be in ios::binary mode for this to work. These methods are not
         * intended for general public use; they are used by the framework to
         * improve performance by storing certain objects in binary files.
         */
		void                        streamOut(FILE* os) const;
        void                        streamIn(FILE* is);

		/**
		 * Returns TRUE if the string resize failed.  It is very important
		 * to check if a unicode string is valid after modification.  
		 */
		t_bool						isBogus() const;

	/*
	 * Additional Netscape routines
	 */
		/** Converts the String to a char* using a target encoding */
		char*						toCString(const char* encoding) const;

		/** Compare case insensitive. Still diacrit sensitive. Is not locale sensitive.
		 *	All versions of compare() do bitwise comparison; internationally-
		 *	sensitive comparison requires the Collation library. */
		int							compareIgnoreCase(const UnicodeString&	that) const;
		int							compareIgnoreCase(const UniChar*	that, 
												t_int32 thatLength) const;
		int							compareIgnoreCase(const UniChar*	that) const;
		int							compareIgnoreCase(const char*	that, 
												const char* encoding) const;
		/* Assumes a LATIN-1 string */
		int	
									compareIgnoreCase(const char*	that) const;


	private:
		/* Netscape Private */
		char*						toCStringTruncate() const;

		static t_int32				lengthOf(const UniChar*	chars);
		static t_int32				lengthOf(const char*	chars);
		void						resize(t_int32 	newLength);
		void						setToBogus(void);
		static void					copy(	const UniChar*	from,
											UniChar*		to,
											t_int32			numChars);
		static void					copy(	const char*	from,
											UniChar*	to,
											t_int32		numChars);
		static void					copy(	const UniChar*	from,
											char*			to,
											t_int32			numChars);

		t_int8						doCompare(	const UniChar*	thiss,
												t_int32			thisLength,
												const UniChar*	that,
												t_int32			thatLength) const;
		static const t_int32		kInvalidHashCode;
		static const t_int32		kEmptyHashCode;
		static UniChar				fgErrorChar;
	
		UniChar*					fChars;
		t_int32						fSize;
		t_int32						fCapacity;
		t_int32						fHashCode;
        t_bool                      fClientOwnsStorage;
		t_bool						fBogus;
};

#ifdef NLS_MAC
#pragma export off
#endif

/**
 * Write the contents of a UnicodeString to an ostream. This functions writes
 * the characters in a UnicodeString to an ostream. The UniChars in the
 * UnicodeString are truncated to char, leading to undefined results with
 * anything not in the Latin1 character set.
 */
NLSUNIAPI_PUBLIC(ostream&) operator<<(ostream&				stream,
							  const UnicodeString&	string);

//----------------------------------------------------
// operator new
//----------------------------------------------------

inline void*
UnicodeString::operator new(size_t size)
{
    return ::operator new(size);
}

inline void*
UnicodeString::operator new(size_t size, void* location)
{
    // WARNING: Do not use this operator unless you're sure you know what you're
    // doing!  It just passes "location" through blindly.  If there isn't enough
    // free space at "location" to hold a UnicodeString (or if "location" is
    // somehow invalid), you're in trouble!
    return location;
}
//----------------------------------------------------
// Fast append
//----------------------------------------------------

inline UnicodeString&
UnicodeString::operator+=(UniChar	that)
{
	if (fSize < fCapacity) {
		fChars[fSize++] = that;
		fHashCode = kInvalidHashCode;
	} else {
		resize(fSize + 1);
		if (!fBogus)                   // change required for HP-UX         
		 fChars[fSize - 1] = that;
	} 
	return *this;
}

//----------------------------------------------------
// Character access
//----------------------------------------------------

inline UniChar
UnicodeString::operator[](TextOffset	offset) const
{
	// Cast to unsigned in order to detect negative values.
	// Assume fSize >= 0.
	return ((t_uint32)offset < (t_uint32)fSize) ? fChars[offset] : 0;
}

inline UniChar&
UnicodeString::operator[](TextOffset	offset)
{
	// Cast to unsigned in order to detect negative values
	// Assume fSize >= 0.
	
	UniChar& result = fgErrorChar;
	if ((t_uint32)offset < (t_uint32)fSize)
	{
		fHashCode = kInvalidHashCode;
		result =  fChars[offset];
	} else
	{
	        fgErrorChar = 0; // Always reset this to zero in case the caller has modified it
		result = fgErrorChar;
	}
	
	return result;
}

//----------------------------------------------------
// Other inline methods
//----------------------------------------------------

inline UnicodeString&
UnicodeString::remove()
{
	fSize = 0;
	fBogus = FALSE;
	return *this;
}

inline t_int32
UnicodeString::size() const
{
	return fSize;
}

inline t_int8
UnicodeString::compare(const UnicodeString& that) const
{
    return doCompare(fChars, fSize, that.fChars, that.fSize);
}

inline t_bool
UnicodeString::operator==(const UnicodeString&  that) const
{
    // Check fSize first to avoid the call to compare in many cases
    return fSize == that.fSize && compare(that) == 0;
}


inline t_bool
UnicodeString::operator!=(const UnicodeString&	that) const
{
	return compare(that) != 0;
}

inline t_bool
UnicodeString::operator>(const UnicodeString&	that) const
{
	return compare(that) == 1;
}

inline t_bool
UnicodeString::operator<(const UnicodeString&	that) const
{
	return compare(that) == -1;
}

inline t_bool
UnicodeString::operator<=(const UnicodeString&	that) const
{
	return compare(that) != 1;
}

inline t_bool
UnicodeString::operator>=(const UnicodeString&	that) const
{
	return compare(that) != -1;
}

inline t_bool
UnicodeString::isBogus() const { return fBogus; }

/**
 * The arrayCopy() methods copy an array of UnicodeString OBJECTS (not
 * pointers).
 */
inline void arrayCopy(const UnicodeString* src, UnicodeString* dst, t_int32 count)
	{ while (count-- > 0) *dst++ = *src++; }

inline void arrayCopy(const UnicodeString* src, t_int32 srcStart, UnicodeString* dst, t_int32 dstStart, t_int32 count)
	{ arrayCopy(src+srcStart, dst+dstStart, count); }

#endif