gecko-dev/intl/lwbrk/nsPangoBreaker.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "nsComplexBreaker.h"

#include <pango/pango-break.h>
#include "nsUTF8Utils.h"
#include "nsString.h"
#include "nsTArray.h"

/**
 * Computes line-break opportunities for a run of text by asking Pango for
 * its logical attributes.
 *
 * @param aText        UTF-16 text to analyse; asserted to be non-null.
 * @param aLength      Number of UTF-16 code units in aText.
 * @param aBreakBefore Output array with room for aLength entries. Entry i is
 *                     set to a non-zero value when Pango reports a line break
 *                     opportunity before the character starting at code unit
 *                     i, and to zero otherwise. The second code unit of a
 *                     surrogate pair is always zero (never break inside a
 *                     pair).
 */
void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,
                             uint8_t* aBreakBefore) {
  NS_ASSERTION(aText, "aText shouldn't be null");

  // Default every position to "no break"; the loop below only fills in the
  // positions it actually visits.
  memset(aBreakBefore, 0, aLength * sizeof(uint8_t));

  // Pango wants one attr per character plus one trailing attr. The number of
  // characters can never exceed the number of UTF-16 code units, so
  // aLength + 1 entries is always enough.
  AutoTArray<PangoLogAttr, 2000> attrBuffer;
  if (!attrBuffer.AppendElements(aLength + 1)) return;

  NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);

  const gchar* p = aUTF8.Data();
  const gchar* end = p + aUTF8.Length();
  uint32_t u16Offset = 0;

  static PangoLanguage* language = pango_language_from_string("en");

  while (p < end) {
    PangoLogAttr* attr = attrBuffer.Elements();
    pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());

    while (p < end) {
      NS_ASSERTION(u16Offset < aLength,
                   "UTF-16 offset ran past the end of the input text");

      const uint8_t isLineBreak = attr->is_line_break;
      ++attr;

      // We're iterating over text obtained from NS_ConvertUTF16toUTF8,
      // so we know we have valid UTF-8 and don't need to check for
      // errors.
      uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);

      aBreakBefore[u16Offset++] = isLineBreak;

      // Keep the UTF-16 cursor in step with the UTF-8 cursor by using the
      // decoded code point rather than re-inspecting aText: only code points
      // above the BMP occupy two UTF-16 code units. This assumes the
      // converter emits exactly one (BMP) replacement character per unpaired
      // surrogate, so a lone surrogate is correctly counted as one unit and
      // we never write past aBreakBefore[aLength - 1].
      if (ch > 0xFFFF) {
        aBreakBefore[u16Offset++] = false;  // No break inside a surrogate pair
      }

      if (!ch) {
        // pango_break (pango 1.16.2) only analyses text before the
        // first NUL (but sets one extra attr). Workaround loop to call
        // pango_break again to analyse after the NUL is done somewhere else
        // (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).
        // So, we do the same here for pango_get_log_attrs: leave the inner
        // loop and let the outer loop re-run Pango on the remaining text.
        break;
      }
    }
  }
}
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */`
Bug 716478 - update licence to MPL 2. 2012-05-21 15:12:37 +04:00			`/* This Source Code Form is subject to the terms of the Mozilla Public`
			`* License, v. 2.0. If a copy of the MPL was not distributed with this`
			`* file, You can obtain one at http://mozilla.org/MPL/2.0/. */`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00
			`#include "nsComplexBreaker.h"`

			`#include <pango/pango-break.h>`
			`#include "nsUTF8Utils.h"`
			`#include "nsString.h"`
			`#include "nsTArray.h"`

Bug 927728 - Part 1: Replace PRUnichar with char16_t; r=roc This patch was automatically generated by the following script: #!/bin/bash # Command to convert PRUnichar to char16_t function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "modules/libmar" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name prtypes.h \ ! -name Char16.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRUnichar char16_t 2014-01-04 19:02:17 +04:00			`void NS_GetComplexLineBreaks(const char16_t* aText, uint32_t aLength,`
Bug 579517 - Part 1: Automated conversion of NSPR numeric types to stdint types in Gecko; r=bsmedberg This patch was generated by a script. Here's the source of the script for future reference: function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name nsXPCOMCID.h \ ! -name prtypes.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRInt8 int8_t convert PRUint8 uint8_t convert PRInt16 int16_t convert PRUint16 uint16_t convert PRInt32 int32_t convert PRUint32 uint32_t convert PRInt64 int64_t convert PRUint64 uint64_t convert PRIntn int convert PRUintn unsigned convert PRSize size_t convert PROffset32 int32_t convert PROffset64 int64_t convert PRPtrdiff ptrdiff_t convert PRFloat64 double 2012-08-22 19:56:38 +04:00			`uint8_t* aBreakBefore) {`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`NS_ASSERTION(aText, "aText shouldn't be null");`

Bug 579517 - Part 1: Automated conversion of NSPR numeric types to stdint types in Gecko; r=bsmedberg This patch was generated by a script. Here's the source of the script for future reference: function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name nsXPCOMCID.h \ ! -name prtypes.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRInt8 int8_t convert PRUint8 uint8_t convert PRInt16 int16_t convert PRUint16 uint16_t convert PRInt32 int32_t convert PRUint32 uint32_t convert PRInt64 int64_t convert PRUint64 uint64_t convert PRIntn int convert PRUintn unsigned convert PRSize size_t convert PROffset32 int32_t convert PROffset64 int64_t convert PRPtrdiff ptrdiff_t convert PRFloat64 double 2012-08-22 19:56:38 +04:00			`memset(aBreakBefore, false, aLength * sizeof(uint8_t));`
Bug 336959. Exit properly if we get OOM. patch by Theppitak Karoonboonyanan, r+sr=roc,a=damon 2007-09-04 07:58:52 +04:00
Bug 1235261 - Part 1: Rename nsAutoTArray to AutoTArray. r=froydnj 2016-02-02 18:36:30 +03:00			`AutoTArray<PangoLogAttr, 2000> attrBuffer;`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`if (!attrBuffer.AppendElements(aLength + 1)) return;`

			`NS_ConvertUTF16toUTF8 aUTF8(aText, aLength);`

			`const gchar* p = aUTF8.Data();`
			`const gchar* end = p + aUTF8.Length();`
Bug 579517 - Part 1: Automated conversion of NSPR numeric types to stdint types in Gecko; r=bsmedberg This patch was generated by a script. Here's the source of the script for future reference: function convert() { echo "Converting $1 to $2..." find . ! -wholename "nsprpub" \ ! -wholename "security/nss" \ ! -wholename "/.hg" \ ! -wholename "obj-ff-dbg" \ ! -name nsXPCOMCID.h \ ! -name prtypes.h \ -type f \ \( -iname ".cpp" \ -o -iname ".h" \ -o -iname ".c" \ -o -iname ".cc" \ -o -iname ".idl" \ -o -iname ".ipdl" \ -o -iname ".ipdlh" \ -o -iname "*.mm" \) \| \ xargs -n 1 sed -i -e "s/\b$1\b/$2/g" } convert PRInt8 int8_t convert PRUint8 uint8_t convert PRInt16 int16_t convert PRUint16 uint16_t convert PRInt32 int32_t convert PRUint32 uint32_t convert PRInt64 int64_t convert PRUint64 uint64_t convert PRIntn int convert PRUintn unsigned convert PRSize size_t convert PROffset32 int32_t convert PROffset64 int64_t convert PRPtrdiff ptrdiff_t convert PRFloat64 double 2012-08-22 19:56:38 +04:00			`uint32_t u16Offset = 0;`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00
			`static PangoLanguage* language = pango_language_from_string("en");`

			`while (p < end) {`
			`PangoLogAttr* attr = attrBuffer.Elements();`
			`pango_get_log_attrs(p, end - p, -1, language, attr, attrBuffer.Length());`

			`while (p < end) {`
			`aBreakBefore[u16Offset] = attr->is_line_break;`
			`if (NS_IS_LOW_SURROGATE(aText[u16Offset]))`
Bug 690892 - Replace PR_TRUE/PR_FALSE with true/false on mozilla-central; rs=dbaron Landing on a CLOSED TREE 2011-10-17 18:59:28 +04:00			`aBreakBefore[++u16Offset] = false; // Skip high surrogate`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`++u16Offset;`

Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj. Correctness improvements: * UTF errors are handled safely per spec instead of dangerously truncating strings. * There are fewer converter implementations. Performance improvements: * The old code did exact buffer length math, which meant doing UTF math twice on each input string (once for length calculation and another time for conversion). Exact length math is more complicated when handling errors properly, which the old code didn't do. The new code does UTF math on the string content only once (when converting) but risks allocating more than once. There are heuristics in place to lower the probability of reallocation in cases where the double math avoidance isn't enough of a saving to absorb an allocation and memcpy. * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized but a single non-ASCII code point pessimized the rest of the string. The new code tries to get back on the fast ASCII path. * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range input to eliminate an operation from the inner loop on x86/x86_64. * When assigning to a pre-existing string, the new code tries to reuse the old buffer instead of first releasing the old buffer and then allocating a new one. * When reallocating from the new code, the memcpy covers only the data that is part of the logical length of the old string instead of memcpying the whole capacity. (For old callers old excess memcpy behavior is preserved due to bogus callers. See bug 1472113.) * UTF-8 strings in XPConnect that are in the Latin1 range are passed to SpiderMonkey as Latin1. New features: * Conversion between UTF-8 and Latin1 is added in order to enable faster future interop between Rust code (or otherwise UTF-8-using code) and text node and SpiderMonkey code that uses Latin1. 
MozReview-Commit-ID: JaJuExfILM9 2018-07-06 10:44:43 +03:00			`// We're iterating over text obtained from NS_ConvertUTF16toUTF8,`
			`// so we know we have valid UTF-8 and don't need to check for`
			`// errors.`
			`uint32_t ch = UTF8CharEnumerator::NextChar(&p, end);`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`++attr;`

Bug 1402247 - Use encoding_rs for XPCOM string encoding conversions. r=Nika,erahm,froydnj. Correctness improvements: * UTF errors are handled safely per spec instead of dangerously truncating strings. * There are fewer converter implementations. Performance improvements: * The old code did exact buffer length math, which meant doing UTF math twice on each input string (once for length calculation and another time for conversion). Exact length math is more complicated when handling errors properly, which the old code didn't do. The new code does UTF math on the string content only once (when converting) but risks allocating more than once. There are heuristics in place to lower the probability of reallocation in cases where the double math avoidance isn't enough of a saving to absorb an allocation and memcpy. * Previously, in UTF-16 <-> UTF-8 conversions, an ASCII prefix was optimized but a single non-ASCII code point pessimized the rest of the string. The new code tries to get back on the fast ASCII path. * UTF-16 to Latin1 conversion guarantees less about handling of out-of-range input to eliminate an operation from the inner loop on x86/x86_64. * When assigning to a pre-existing string, the new code tries to reuse the old buffer instead of first releasing the old buffer and then allocating a new one. * When reallocating from the new code, the memcpy covers only the data that is part of the logical length of the old string instead of memcpying the whole capacity. (For old callers old excess memcpy behavior is preserved due to bogus callers. See bug 1472113.) * UTF-8 strings in XPConnect that are in the Latin1 range are passed to SpiderMonkey as Latin1. New features: * Conversion between UTF-8 and Latin1 is added in order to enable faster future interop between Rust code (or otherwise UTF-8-using code) and text node and SpiderMonkey code that uses Latin1. MozReview-Commit-ID: JaJuExfILM9 2018-07-06 10:44:43 +03:00			`if (!ch) {`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`// pango_break (pango 1.16.2) only analyses text before the`
			`// first NUL (but sets one extra attr). Workaround loop to call`
			`// pango_break again to analyse after the NUL is done somewhere else`
Bug 948466: Rename gfxPangoFonts to gfxFontconfigFonts. r=nical --HG-- rename : gfx/thebes/gfxPangoFonts.cpp => gfx/thebes/gfxFontconfigFonts.cpp rename : gfx/thebes/gfxPangoFonts.h => gfx/thebes/gfxFontconfigFonts.h 2015-05-20 18:44:09 +03:00			`// (gfx/thebes/gfxFontconfigFonts.cpp: SetupClusterBoundaries()).`
Bug 336959. Use Pango to break inside Thai/Lao runs. Patch by Theppitak Karoonboonyanan, r+sr=roc 2007-07-19 07:26:51 +04:00			`// So, we do the same here for pango_get_log_attrs.`
			`break;`
			`}`
			`}`
			`}`
			`}`