Bug 1322992 - Implement locale-specific casing behavior for Lithuanian, and add more WPT tests for it. r=m_kato

Differential Revision: https://phabricator.services.mozilla.com/D32129

--HG--
rename : testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-039-ref.html => testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-044-ref.html
rename : testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-039.html => testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-044.html
extra : moz-landing-system : lando
2019-05-27 01:35:03 +00:00 · 2019-05-27 01:35:03 +00:00 · 5fe29c880d
--- a/layout/generic/nsTextRunTransformations.cpp
+++ b/layout/generic/nsTextRunTransformations.cpp
@ -219,11 +219,12 @@ gfxTextRunFactory::Parameters GetParametersForInner(
 // exhibit the behavior in question; multiple lang tags may map to the
 // same setting here, if the behavior is shared by other languages.
 enum LanguageSpecificCasingBehavior {
-  eLSCB_None,    // default non-lang-specific behavior
-  eLSCB_Dutch,   // treat "ij" digraph as a unit for capitalization
-  eLSCB_Greek,   // strip accent when uppercasing Greek vowels
-  eLSCB_Irish,   // keep prefix letters as lowercase when uppercasing Irish
-  eLSCB_Turkish  // preserve dotted/dotless-i distinction in uppercase
+  eLSCB_None,       // default non-lang-specific behavior
+  eLSCB_Dutch,      // treat "ij" digraph as a unit for capitalization
+  eLSCB_Greek,      // strip accent when uppercasing Greek vowels
+  eLSCB_Irish,      // keep prefix letters as lowercase when uppercasing Irish
+  eLSCB_Turkish,    // preserve dotted/dotless-i distinction in uppercase
+  eLSCB_Lithuanian  // keep dot on accented lowercase i/j; drop it in uppercase
 };

 static LanguageSpecificCasingBehavior GetCasingFor(const nsAtom* aLang) {
@ -244,6 +245,9 @@ static LanguageSpecificCasingBehavior GetCasingFor(const nsAtom* aLang) {
  if (aLang == nsGkAtoms::ga) {
    return eLSCB_Irish;
  }
+  if (aLang == nsGkAtoms::lt_) {
+    return eLSCB_Lithuanian;
+  }

  // Is there a region subtag we should ignore?
  nsAtomString langStr(const_cast<nsAtom*>(aLang));
@ -277,6 +281,8 @@ bool nsCaseTransformTextRunFactory::TransformString(
  bool prevIsLetter = false;
  bool ntPrefix = false;  // true immediately after a word-initial 'n' or 't'
                          // when doing Irish lowercasing
+  bool seenSoftDotted = false;  // true immediately after a soft-dotted letter
+                                // (I/J/I-ogonek) is case-mapped for Lithuanian
  uint32_t sigmaIndex = uint32_t(-1);
  nsUGenCategory cat;

@ -353,6 +359,60 @@ bool nsCaseTransformTextRunFactory::TransformString(
          }
        }

+        if (languageSpecificCasing == eLSCB_Lithuanian) {
+          // clang-format off
+          /* From SpecialCasing.txt:
+           * # Introduce an explicit dot above when lowercasing capital I's and J's
+           * # whenever there are more accents above.
+           * # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
+           *
+           * 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
+           * 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
+           * 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
+           * 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
+           * 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
+           * 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
+           */
+          // clang-format on
+          if (ch == 'I' || ch == 'J' || ch == 0x012E) {
+            ch = ToLowerCase(ch);
+            prevIsLetter = true;
+            seenSoftDotted = true;
+            sigmaIndex = uint32_t(-1);
+            break;
+          }
+          if (ch == 0x00CC) {
+            aConvertedString.Append('i');
+            aConvertedString.Append(0x0307);
+            extraChars += 2;
+            ch = 0x0300;
+            prevIsLetter = true;
+            seenSoftDotted = false;
+            sigmaIndex = uint32_t(-1);
+            break;
+          }
+          if (ch == 0x00CD) {
+            aConvertedString.Append('i');
+            aConvertedString.Append(0x0307);
+            extraChars += 2;
+            ch = 0x0301;
+            prevIsLetter = true;
+            seenSoftDotted = false;
+            sigmaIndex = uint32_t(-1);
+            break;
+          }
+          if (ch == 0x0128) {
+            aConvertedString.Append('i');
+            aConvertedString.Append(0x0307);
+            extraChars += 2;
+            ch = 0x0303;
+            prevIsLetter = true;
+            seenSoftDotted = false;
+            sigmaIndex = uint32_t(-1);
+            break;
+          }
+        }
+
        cat = mozilla::unicode::GetGenCategory(ch);

        if (languageSpecificCasing == eLSCB_Irish &&
@ -371,6 +431,15 @@ bool nsCaseTransformTextRunFactory::TransformString(
          ntPrefix = false;
        }

+        if (seenSoftDotted && cat == nsUGenCategory::kMark) {
+          // The seenSoftDotted flag will only be set in Lithuanian mode.
+          if (ch == 0x0300 || ch == 0x0301 || ch == 0x0303) {
+            aConvertedString.Append(0x0307);
+            ++extraChars;
+          }
+        }
+        seenSoftDotted = false;
+
        // Special lowercasing behavior for Greek Sigma: note that this is
        // listed as context-sensitive in Unicode's SpecialCasing.txt, but is
        // *not* a language-specific mapping; it applies regardless of the
@ -463,6 +532,26 @@ bool nsCaseTransformTextRunFactory::TransformString(
          break;
        }

+        if (languageSpecificCasing == eLSCB_Lithuanian) {
+          /*
+           * # Remove DOT ABOVE after "i" with upper or titlecase
+           *
+           * 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
+           */
+          if (ch == 'i' || ch == 'j' || ch == 0x012F) {
+            seenSoftDotted = true;
+            ch = ToTitleCase(ch);
+            break;
+          }
+          if (seenSoftDotted) {
+            seenSoftDotted = false;
+            if (ch == 0x0307) {
+              ch = uint32_t(-1);
+              break;
+            }
+          }
+        }
+
        if (languageSpecificCasing == eLSCB_Irish) {
          bool mark;
          uint8_t action;
@ -565,6 +654,25 @@ bool nsCaseTransformTextRunFactory::TransformString(
              capitalizeDutchIJ = true;
              break;
            }
+            if (languageSpecificCasing == eLSCB_Lithuanian) {
+              /*
+               * # Remove DOT ABOVE after "i" with upper or titlecase
+               *
+               * 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
+               */
+              if (ch == 'i' || ch == 'j' || ch == 0x012F) {
+                seenSoftDotted = true;
+                ch = ToTitleCase(ch);
+                break;
+              }
+              if (seenSoftDotted) {
+                seenSoftDotted = false;
+                if (ch == 0x0307) {
+                  ch = uint32_t(-1);
+                  break;
+                }
+              }
+            }

            mcm = mozilla::unicode::SpecialTitle(ch);
            if (mcm) {
--- a/testing/web-platform/meta/css/css-text/text-transform/text-transform-upperlower-039.html.ini
+++ b/testing/web-platform/meta/css/css-text/text-transform/text-transform-upperlower-039.html.ini
@ -1,2 +0,0 @@
-[text-transform-upperlower-039.html]
-  expected: FAIL
--- a/testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-039-ref.html
+++ b/testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-039-ref.html
@ -17,7 +17,10 @@
 </head>
 <body>
 <p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
-<div class="test" lang="lt"><span>&#x69;&#x307;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x69;&#x307;&#x301; &#x69;&#x307;&#x301;</span> <span>&#x69;&#x307;&#x303; &#x69;&#x307;&#x303;</span></div>
+<div class="test" lang="lt">
+  <span>&#x69;&#x307;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x69;&#x307;&#x301; &#x69;&#x307;&#x301;</span> <span>&#x69;&#x307;&#x303; &#x69;&#x307;&#x303;</span>
+  <span>&#x69;&#x307;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x6A;&#x307;&#x301; &#x6A;&#x307;&#x301;</span> <span>&#x12F;&#x307;&#x303; &#x12F;&#x307;&#x303;</span>
+</div>
 <!--Notes:
 The language of the test box is set to Lithuanian (lt)
 -->
--- a/testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-044-ref.html
+++ b/testing/web-platform/tests/css/css-text/text-transform/reference/text-transform-upperlower-044-ref.html
@ -0,0 +1,33 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8"/>
+<title>CSS3 Text, text transform: Lithuanian, uppercase</title>
+<link rel='author' title='Jonathan Kew' href='mailto:jkew@mozilla.com'>
+<style type='text/css'>
+@font-face {
+	font-family: 'webfont';
+	src: url('/fonts/DoulosSIL-R.woff') format('woff');
+	font-weight: normal;
+	font-style: normal;
+	}
+.test, .ref { font-size: 200%; line-height: 2.5em; font-family: webfont, serif; }
+.test span, .ref span { margin-right: 1em; white-space: nowrap; }
+</style>
+</head>
+<body>
+<p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
+<div class="test" lang="lt">
+  <span>&#x49;&#x300; &#x49;&#x300;</span>
+  <span>&#x49;&#x301; &#x49;&#x301;</span>
+  <span>&#x49;&#x303; &#x49;&#x303;</span>
+  <span>&#x49; &#x49;</span>
+  <span>&#x4A; &#x4A;</span>
+  <span>&#x12E; &#x12E;</span>
+  <span>X&#x307; X&#x307;</span>
+</div>
+<!--Notes:
+The language of the test box is set to Lithuanian (lt)
+-->
+</body>
+</html>
--- a/testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-039.html
+++ b/testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-039.html
@ -22,7 +22,10 @@
 </head>
 <body>
 <p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
-<div class="test" lang="lt"><span>&#xCC; &#x69;&#x307;&#x300;</span> <span>&#xCD; &#x69;&#x307;&#x301;</span> <span>&#x128; &#x69;&#x307;&#x303;</span></div>
+<div class="test" lang="lt">
+  <span>&#xCC; &#x69;&#x307;&#x300;</span> <span>&#xCD; &#x69;&#x307;&#x301;</span> <span>&#x128; &#x69;&#x307;&#x303;</span>
+  <span>&#x49;&#x300; &#x69;&#x307;&#x300;</span> <span>&#x4A;&#x301; &#x6A;&#x307;&#x301;</span> <span>&#x12E;&#x303; &#x12F;&#x307;&#x303;</span>
+</div>
 <!--Notes:
 The language of the test box is set to Lithuanian (lt)
 -->
--- a/testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-044.html
+++ b/testing/web-platform/tests/css/css-text/text-transform/text-transform-upperlower-044.html
@ -0,0 +1,38 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="utf-8"/>
+<title>CSS3 Text, text transform: Lithuanian, uppercase</title>
+<meta name="assert" content="text-transform: uppercase will uppercase Lithuanian as described in Unicode's SpecialCasing.txt .">
+<link rel='author' title='Jonathan Kew' href='mailto:jkew@mozilla.com'>
+<link rel='help' href='https://drafts.csswg.org/css-text-3/#text-transform'>
+<link rel="match" href="reference/text-transform-upperlower-044-ref.html">
+<style type='text/css'>
+@font-face {
+	font-family: 'webfont';
+	src: url('/fonts/DoulosSIL-R.woff') format('woff');
+	font-weight: normal;
+	font-style: normal;
+	}
+.test, .ref { font-size: 200%; line-height: 2.5em; font-family: webfont, serif; }
+.test span, .ref span { margin-right: 1em; white-space: nowrap; }
+/* the CSS above is not part of the test */
+.test { text-transform: uppercase; }
+</style>
+</head>
+<body>
+<p class="instructions">Test passes if both characters in each pair match. If you are missing a font glyph for a character, ignore that pair, but report which characters were ignored.</p>
+<div class="test" lang="lt">
+  <span>&#x69;&#x307;&#x300; &#x49;&#x300;</span>
+  <span>&#x69;&#x307;&#x301; &#x49;&#x301;</span>
+  <span>&#x69;&#x307;&#x303; &#x49;&#x303;</span>
+  <span>&#x69;&#x307; &#x49;</span>
+  <span>&#x6A;&#x307; &#x4A;</span>
+  <span>&#x12F;&#x307; &#x12E;</span>
+  <span>x&#x307; X&#x307;</span> <!-- check that dot isn't deleted in other contexts -->
+</div>
+<!--Notes:
+The language of the test box is set to Lithuanian (lt)
+-->
+</body>
+</html>
--- a/xpcom/ds/StaticAtoms.py
+++ b/xpcom/ds/StaticAtoms.py
@ -2118,6 +2118,7 @@ STATIC_ATOMS = [
    Atom("crh", "crh"),
    # Atom("el", "el"),  # "el" is present above
    Atom("ga", "ga"),
+    # Atom("lt", "lt"),  # "lt" is present above (atom name "lt_")
    Atom("nl", "nl"),

    # mathematical language, used for MathML