/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ /* vim: set ts=8 sts=2 et sw=2 tw=80: */ /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ // IWYU pragma: private, include "nsString.h" #ifndef nsReadableUtils_h___ #define nsReadableUtils_h___ /** * I guess all the routines in this file are all mis-named. * According to our conventions, they should be |NS_xxx|. */ #include "mozilla/Assertions.h" #include "nsAString.h" #include "nsTArrayForwardDeclare.h" // Can't include mozilla/Encoding.h here extern "C" { size_t encoding_utf8_valid_up_to(uint8_t const* buffer, size_t buffer_len); size_t encoding_ascii_valid_up_to(uint8_t const* buffer, size_t buffer_len); } inline size_t Distance(const nsReadingIterator& aStart, const nsReadingIterator& aEnd) { MOZ_ASSERT(aStart.get() <= aEnd.get()); return static_cast(aEnd.get() - aStart.get()); } inline size_t Distance(const nsReadingIterator& aStart, const nsReadingIterator& aEnd) { MOZ_ASSERT(aStart.get() <= aEnd.get()); return static_cast(aEnd.get() - aStart.get()); } void LossyCopyUTF16toASCII(const nsAString& aSource, nsACString& aDest); void CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest); MOZ_MUST_USE bool CopyASCIItoUTF16(const nsACString& aSource, nsAString& aDest, const mozilla::fallible_t&); void LossyCopyUTF16toASCII(const char16ptr_t aSource, nsACString& aDest); void CopyASCIItoUTF16(const char* aSource, nsAString& aDest); void CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest); MOZ_MUST_USE bool CopyUTF16toUTF8(const nsAString& aSource, nsACString& aDest, const mozilla::fallible_t&); void CopyUTF8toUTF16(const nsACString& aSource, nsAString& aDest); void CopyUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest); void CopyUTF8toUTF16(const char* aSource, nsAString& aDest); void LossyAppendUTF16toASCII(const nsAString& aSource, nsACString& aDest); void AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest); MOZ_MUST_USE bool AppendASCIItoUTF16(const nsACString& aSource, nsAString& aDest, const mozilla::fallible_t&); void LossyAppendUTF16toASCII(const char16ptr_t aSource, nsACString& aDest); MOZ_MUST_USE bool AppendASCIItoUTF16(const char* aSource, nsAString& aDest, const mozilla::fallible_t&); void AppendASCIItoUTF16(const char* aSource, nsAString& aDest); void AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest); MOZ_MUST_USE bool AppendUTF16toUTF8(const nsAString& aSource, nsACString& aDest, const mozilla::fallible_t&); void AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest); MOZ_MUST_USE bool AppendUTF8toUTF16(const nsACString& aSource, nsAString& aDest, const mozilla::fallible_t&); void AppendUTF16toUTF8(const char16ptr_t aSource, nsACString& aDest); void AppendUTF8toUTF16(const char* aSource, nsAString& aDest); /** * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. * * Allocates and returns a new |char| buffer which you must free with |free|. * Performs a lossy encoding conversion by chopping 16-bit wide characters down to 8-bits wide while copying |aSource| to your new buffer. * This conversion is not well defined; but it reproduces legacy string behavior. * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls. * * @param aSource a 16-bit wide string * @return a new |char| buffer you must free with |free|. */ char* ToNewCString(const nsAString& aSource); /** * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. * * Allocates and returns a new |char| buffer which you must free with |free|. * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls. * * @param aSource an 8-bit wide string * @return a new |char| buffer you must free with |free|. */ char* ToNewCString(const nsACString& aSource); /** * Returns a new |char| buffer containing a zero-terminated copy of |aSource|. * * Allocates and returns a new |char| buffer which you must free with * |free|. * Performs an encoding conversion from a UTF-16 string to a UTF-8 string * copying |aSource| to your new buffer. * The new buffer is zero-terminated, but that may not help you if |aSource| * contains embedded nulls. * * @param aSource a UTF-16 string (made of char16_t's) * @param aUTF8Count the number of 8-bit units that was returned * @return a new |char| buffer you must free with |free|. */ char* ToNewUTF8String(const nsAString& aSource, uint32_t* aUTF8Count = nullptr); /** * Returns a new |char16_t| buffer containing a zero-terminated copy of * |aSource|. * * Allocates and returns a new |char16_t| buffer which you must free with * |free|. * The new buffer is zero-terminated, but that may not help you if |aSource| * contains embedded nulls. * * @param aSource a UTF-16 string * @return a new |char16_t| buffer you must free with |free|. */ char16_t* ToNewUnicode(const nsAString& aSource); /** * Returns a new |char16_t| buffer containing a zero-terminated copy of |aSource|. * * Allocates and returns a new |char16_t| buffer which you must free with |free|. * Performs an encoding conversion by 0-padding 8-bit wide characters up to 16-bits wide while copying |aSource| to your new buffer. * This conversion is not well defined; but it reproduces legacy string behavior. * The new buffer is zero-terminated, but that may not help you if |aSource| contains embedded nulls. * * @param aSource an 8-bit wide string (a C-string, NOT UTF-8) * @return a new |char16_t| buffer you must free with |free|. */ char16_t* ToNewUnicode(const nsACString& aSource); /** * Returns the required length for a char16_t buffer holding * a copy of aSource, using UTF-8 to UTF-16 conversion. * The length does NOT include any space for zero-termination. * * @param aSource an 8-bit wide string, UTF-8 encoded * @return length of UTF-16 encoded string copy, not zero-terminated */ uint32_t CalcUTF8ToUnicodeLength(const nsACString& aSource); /** * Copies the source string into the specified buffer, converting UTF-8 to * UTF-16 in the process. The conversion is well defined for valid UTF-8 * strings. * The copied string will be zero-terminated! Any embedded nulls will be * copied nonetheless. It is the caller's responsiblity to ensure the buffer * is large enough to hold the string copy plus one char16_t for * zero-termination! * * @see CalcUTF8ToUnicodeLength( const nsACString& ) * @see UTF8ToNewUnicode( const nsACString&, uint32_t* ) * * @param aSource an 8-bit wide string, UTF-8 encoded * @param aBuffer the buffer holding the converted string copy * @param aUTF16Count receiving optionally the number of 16-bit units that * were copied * @return aBuffer pointer, for convenience */ char16_t* UTF8ToUnicodeBuffer(const nsACString& aSource, char16_t* aBuffer, uint32_t* aUTF16Count = nullptr); /** * Returns a new |char16_t| buffer containing a zero-terminated copy * of |aSource|. * * Allocates and returns a new |char| buffer which you must free with * |free|. Performs an encoding conversion from UTF-8 to UTF-16 * while copying |aSource| to your new buffer. This conversion is well defined * for a valid UTF-8 string. The new buffer is zero-terminated, but that * may not help you if |aSource| contains embedded nulls. * * @param aSource an 8-bit wide string, UTF-8 encoded * @param aUTF16Count the number of 16-bit units that was returned * @return a new |char16_t| buffer you must free with |free|. * (UTF-16 encoded) */ char16_t* UTF8ToNewUnicode(const nsACString& aSource, uint32_t* aUTF16Count = nullptr); /** * Copies |aLength| 16-bit code units from the start of |aSource| to the * |char16_t| buffer |aDest|. * * After this operation |aDest| is not null terminated. * * @param aSource a UTF-16 string * @param aSrcOffset start offset in the source string * @param aDest a |char16_t| buffer * @param aLength the number of 16-bit code units to copy * @return pointer to destination buffer - identical to |aDest| */ char16_t* CopyUnicodeTo(const nsAString& aSource, uint32_t aSrcOffset, char16_t* aDest, uint32_t aLength); /** * Copies 16-bit characters between iterators |aSrcStart| and * |aSrcEnd| to the writable string |aDest|. Similar to the * |nsString::Mid| method. * * After this operation |aDest| is not null terminated. * * @param aSrcStart start source iterator * @param aSrcEnd end source iterator * @param aDest destination for the copy */ void CopyUnicodeTo(const nsAString::const_iterator& aSrcStart, const nsAString::const_iterator& aSrcEnd, nsAString& aDest); /** * Appends 16-bit characters between iterators |aSrcStart| and * |aSrcEnd| to the writable string |aDest|. * * After this operation |aDest| is not null terminated. * * @param aSrcStart start source iterator * @param aSrcEnd end source iterator * @param aDest destination for the copy */ void AppendUnicodeTo(const nsAString::const_iterator& aSrcStart, const nsAString::const_iterator& aSrcEnd, nsAString& aDest); /** * Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F). * * @param aString a 16-bit wide string to scan */ bool IsASCII(const nsAString& aString); /** * Returns |true| if |aString| contains only ASCII characters, that is, characters in the range (0x00, 0x7F). * * @param aString a 8-bit wide string to scan */ inline bool IsASCII(const nsACString& aString) { size_t length = aString.Length(); const uint8_t* ptr = reinterpret_cast(aString.BeginReading()); // For short strings, calling into Rust is a pessimization, and the SIMD // code won't have a chance to kick in anyway. Additionally, handling the // case of the empty string here makes null-checking ptr unnecessary. // (Passing nullptr to Rust would technically be UB.) if (length < 16) { size_t accu = 0; for (size_t i = 0; i < length; i++) { accu |= ptr[i]; } return accu < 0x80; } // This is not quite optimal, because it's not fail-fast when the by-register // check already finds non-ASCII. Also, input to this function is almost // always ASCII, so even the by-register check wouldn't need to be fail-fast // and could be more like the loop above. return length == encoding_ascii_valid_up_to(ptr, length); } /** * Returns |true| if |aString| is a valid UTF-8 string. * * Note that this doesn't check whether the string might look like a valid * string in another encoding, too, e.g. ISO-2022-JP. * * @param aString an 8-bit wide string to scan */ inline bool IsUTF8(const nsACString& aString) { size_t length = aString.Length(); const uint8_t* ptr = reinterpret_cast(aString.BeginReading()); // For short strings, calling into Rust is a pessimization, and the SIMD // code won't have a chance to kick in anyway. Additionally, handling the // case of the empty string here makes null-checking ptr unnecessary. // (Passing nullptr to Rust would technically be UB.) if (length < 16) { for (size_t i = 0; i < length; i++) { if (ptr[i] >= 0x80) { ptr += i; length -= i; goto end; } } return true; } end: return length == encoding_utf8_valid_up_to(ptr, length); } bool ParseString(const nsACString& aAstring, char aDelimiter, nsTArray& aArray); /** * Converts case in place in the argument string. */ void ToUpperCase(nsACString&); void ToLowerCase(nsACString&); void ToUpperCase(nsACString&); void ToLowerCase(nsACString&); /** * Converts case from string aSource to aDest. */ void ToUpperCase(const nsACString& aSource, nsACString& aDest); void ToLowerCase(const nsACString& aSource, nsACString& aDest); /** * Finds the leftmost occurrence of |aPattern|, if any in the range |aSearchStart|..|aSearchEnd|. * * Returns |true| if a match was found, and adjusts |aSearchStart| and |aSearchEnd| to * point to the match. If no match was found, returns |false| and makes |aSearchStart == aSearchEnd|. * * Currently, this is equivalent to the O(m*n) implementation previously on |ns[C]String|. * If we need something faster, then we can implement that later. */ bool FindInReadable(const nsAString& aPattern, nsAString::const_iterator&, nsAString::const_iterator&, const nsStringComparator& = nsDefaultStringComparator()); bool FindInReadable(const nsACString& aPattern, nsACString::const_iterator&, nsACString::const_iterator&, const nsCStringComparator& = nsDefaultCStringComparator()); /* sometimes we don't care about where the string was, just that we * found it or not */ inline bool FindInReadable(const nsAString& aPattern, const nsAString& aSource, const nsStringComparator& aCompare = nsDefaultStringComparator()) { nsAString::const_iterator start, end; aSource.BeginReading(start); aSource.EndReading(end); return FindInReadable(aPattern, start, end, aCompare); } inline bool FindInReadable(const nsACString& aPattern, const nsACString& aSource, const nsCStringComparator& aCompare = nsDefaultCStringComparator()) { nsACString::const_iterator start, end; aSource.BeginReading(start); aSource.EndReading(end); return FindInReadable(aPattern, start, end, aCompare); } bool CaseInsensitiveFindInReadable(const nsACString& aPattern, nsACString::const_iterator&, nsACString::const_iterator&); /** * Finds the rightmost occurrence of |aPattern| * Returns |true| if a match was found, and adjusts |aSearchStart| and |aSearchEnd| to * point to the match. If no match was found, returns |false| and makes |aSearchStart == aSearchEnd|. * */ bool RFindInReadable(const nsAString& aPattern, nsAString::const_iterator&, nsAString::const_iterator&, const nsStringComparator& = nsDefaultStringComparator()); bool RFindInReadable(const nsACString& aPattern, nsACString::const_iterator&, nsACString::const_iterator&, const nsCStringComparator& = nsDefaultCStringComparator()); /** * Finds the leftmost occurrence of |aChar|, if any in the range * |aSearchStart|..|aSearchEnd|. * * Returns |true| if a match was found, and adjusts |aSearchStart| to * point to the match. If no match was found, returns |false| and * makes |aSearchStart == aSearchEnd|. */ bool FindCharInReadable(char16_t aChar, nsAString::const_iterator& aSearchStart, const nsAString::const_iterator& aSearchEnd); bool FindCharInReadable(char aChar, nsACString::const_iterator& aSearchStart, const nsACString::const_iterator& aSearchEnd); /** * Finds the number of occurences of |aChar| in the string |aStr| */ uint32_t CountCharInReadable(const nsAString& aStr, char16_t aChar); uint32_t CountCharInReadable(const nsACString& aStr, char aChar); bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring); bool StringBeginsWith(const nsAString& aSource, const nsAString& aSubstring, const nsStringComparator& aComparator); bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring); bool StringBeginsWith(const nsACString& aSource, const nsACString& aSubstring, const nsCStringComparator& aComparator); bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring); bool StringEndsWith(const nsAString& aSource, const nsAString& aSubstring, const nsStringComparator& aComparator); bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring); bool StringEndsWith(const nsACString& aSource, const nsACString& aSubstring, const nsCStringComparator& aComparator); const nsString& EmptyString(); const nsCString& EmptyCString(); const nsString& VoidString(); const nsCString& VoidCString(); /** * Compare a UTF-8 string to an UTF-16 string. * * Returns 0 if the strings are equal, -1 if aUTF8String is less * than aUTF16Count, and 1 in the reverse case. In case of fatal * error (eg the strings are not valid UTF8 and UTF16 respectively), * this method will return INT32_MIN. */ int32_t CompareUTF8toUTF16(const nsACString& aUTF8String, const nsAString& aUTF16String); void AppendUCS4ToUTF16(const uint32_t aSource, nsAString& aDest); template inline bool EnsureStringLength(T& aStr, uint32_t aLen) { aStr.SetLength(aLen); return (aStr.Length() == aLen); } #endif // !defined(nsReadableUtils_h___)