Bug 1550532 - Avoid auto-hyphenating capitalized words, except for German. r=emilio,mats

This affects a number of our existing reftests, so we'll need to update those
to not expect auto-hyphenation of a sentence-initial (capitalized) word.

(Hyphenation behavior is not sufficiently well-specified for this to be tested
at the WPT level, so we just use Gecko-specific reftests.)

Differential Revision: https://phabricator.services.mozilla.com/D30912

--HG--
extra : moz-landing-system : lando
This commit is contained in:
Jonathan Kew 2019-05-13 16:35:44 +00:00
Родитель fa6db29146
Коммит c5d721dff7
24 изменённых файлов: 123 добавлений и 96 удалений

Просмотреть файл

@ -80,6 +80,9 @@ already_AddRefed<nsHyphenator> nsHyphenationManager::GetHyphenator(
if (hyph) {
return hyph.forget();
}
nsAutoCString hyphCapPref("intl.hyphenate-capitalized.");
hyphCapPref.Append(nsAtomCString(aLocale));
bool hyphenateCapitalized = Preferences::GetBool(hyphCapPref.get());
nsCOMPtr<nsIURI> uri = mPatternFiles.Get(aLocale);
if (!uri) {
RefPtr<nsAtom> alias = mHyphAliases.Get(aLocale);
@ -111,7 +114,7 @@ already_AddRefed<nsHyphenator> nsHyphenationManager::GetHyphenator(
}
}
}
hyph = new nsHyphenator(uri);
hyph = new nsHyphenator(uri, hyphenateCapitalized);
if (hyph->IsValid()) {
mHyphenators.Put(aLocale, hyph);
return hyph.forget();

Просмотреть файл

@ -11,7 +11,8 @@
#include "hyphen.h"
nsHyphenator::nsHyphenator(nsIURI* aURI) : mDict(nullptr) {
nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized)
: mDict(nullptr), mHyphenateCapitalized(aHyphenateCapitalized) {
nsCString uriSpec;
nsresult rv = aURI->GetSpec(uriSpec);
if (NS_FAILED(rv)) {
@ -70,79 +71,95 @@ nsresult nsHyphenator::Hyphenate(const nsAString& aString,
}
if (inWord) {
// Convert the word to utf-8 for libhyphen, lowercasing it as we go
// so that it will match the (lowercased) patterns (bug 1105644).
nsAutoCString utf8;
const char16_t* const begin = aString.BeginReading();
const char16_t* cur = begin + wordStart;
const char16_t* end = begin + wordLimit;
while (cur < end) {
uint32_t ch = *cur++;
if (NS_IS_HIGH_SURROGATE(ch)) {
if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
ch = SURROGATE_TO_UCS4(ch, *cur++);
} else {
ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
}
} else if (NS_IS_LOW_SURROGATE(ch)) {
ch = 0xfffd; // unpaired surrogate
}
// XXX What about language-specific casing? Consider Turkish I/i...
// In practice, it looks like the current patterns will not be
// affected by this, as they treat dotted and undotted i similarly.
ch = ToLowerCase(ch);
if (ch < 0x80) { // U+0000 - U+007F
utf8.Append(ch);
} else if (ch < 0x0800) { // U+0100 - U+07FF
utf8.Append(0xC0 | (ch >> 6));
utf8.Append(0x80 | (0x003F & ch));
} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
utf8.Append(0xE0 | (ch >> 12));
utf8.Append(0x80 | (0x003F & (ch >> 6)));
utf8.Append(0x80 | (0x003F & ch));
} else {
utf8.Append(0xF0 | (ch >> 18));
utf8.Append(0x80 | (0x003F & (ch >> 12)));
utf8.Append(0x80 | (0x003F & (ch >> 6)));
utf8.Append(0x80 | (0x003F & ch));
}
}
AutoTArray<char, 200> utf8hyphens;
utf8hyphens.SetLength(utf8.Length() + 5);
char** rep = nullptr;
int* pos = nullptr;
int* cut = nullptr;
int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(),
utf8.Length(), utf8hyphens.Elements(),
nullptr, &rep, &pos, &cut);
if (!err) {
// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
// from utf8 code unit indexing (which would match the utf8 input
// string directly) to Unicode character indexing.
// We then need to convert this to utf16 code unit offsets for Gecko.
const char* hyphPtr = utf8hyphens.Elements();
const char16_t* cur = begin + wordStart;
const char16_t* end = begin + wordLimit;
while (cur < end) {
if (*hyphPtr & 0x01) {
aHyphens[cur - begin] = true;
}
cur++;
if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
NS_IS_HIGH_SURROGATE(*(cur - 1))) {
cur++;
}
hyphPtr++;
}
}
HyphenateWord(aString, wordStart, wordLimit, aHyphens);
inWord = false;
}
inWord = false;
}
return NS_OK;
}
void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
uint32_t aLimit, nsTArray<bool>& aHyphens) {
// Convert word from aStart and aLimit in aString to utf-8 for libhyphen,
// lowercasing it as we go so that it will match the (lowercased) patterns
// (bug 1105644).
nsAutoCString utf8;
const char16_t* const begin = aString.BeginReading();
const char16_t* cur = begin + aStart;
const char16_t* end = begin + aLimit;
bool firstLetter = true;
while (cur < end) {
uint32_t ch = *cur++;
if (NS_IS_HIGH_SURROGATE(ch)) {
if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
ch = SURROGATE_TO_UCS4(ch, *cur++);
} else {
ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
}
} else if (NS_IS_LOW_SURROGATE(ch)) {
ch = 0xfffd; // unpaired surrogate
}
// XXX What about language-specific casing? Consider Turkish I/i...
// In practice, it looks like the current patterns will not be
// affected by this, as they treat dotted and undotted i similarly.
uint32_t origCh = ch;
ch = ToLowerCase(ch);
// Avoid hyphenating capitalized words (bug 1550532) unless explicitly
// allowed by prefs for the language in use.
if (firstLetter) {
if (!mHyphenateCapitalized && ch != origCh) {
return;
}
firstLetter = false;
}
if (ch < 0x80) { // U+0000 - U+007F
utf8.Append(ch);
} else if (ch < 0x0800) { // U+0100 - U+07FF
utf8.Append(0xC0 | (ch >> 6));
utf8.Append(0x80 | (0x003F & ch));
} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
utf8.Append(0xE0 | (ch >> 12));
utf8.Append(0x80 | (0x003F & (ch >> 6)));
utf8.Append(0x80 | (0x003F & ch));
} else {
utf8.Append(0xF0 | (ch >> 18));
utf8.Append(0x80 | (0x003F & (ch >> 12)));
utf8.Append(0x80 | (0x003F & (ch >> 6)));
utf8.Append(0x80 | (0x003F & ch));
}
}
AutoTArray<char, 200> utf8hyphens;
utf8hyphens.SetLength(utf8.Length() + 5);
char** rep = nullptr;
int* pos = nullptr;
int* cut = nullptr;
int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(),
utf8.Length(), utf8hyphens.Elements(),
nullptr, &rep, &pos, &cut);
if (!err) {
// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
// from utf8 code unit indexing (which would match the utf8 input
// string directly) to Unicode character indexing.
// We then need to convert this to utf16 code unit offsets for Gecko.
const char* hyphPtr = utf8hyphens.Elements();
const char16_t* cur = begin + aStart;
const char16_t* end = begin + aLimit;
while (cur < end) {
if (*hyphPtr & 0x01) {
aHyphens[cur - begin] = true;
}
cur++;
if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
NS_IS_HIGH_SURROGATE(*(cur - 1))) {
cur++;
}
hyphPtr++;
}
}
}

Просмотреть файл

@ -14,7 +14,7 @@ class nsIURI;
class nsHyphenator {
public:
explicit nsHyphenator(nsIURI* aURI);
nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized);
NS_INLINE_DECL_REFCOUNTING(nsHyphenator)
@ -25,8 +25,11 @@ class nsHyphenator {
private:
~nsHyphenator();
protected:
void HyphenateWord(const nsAString& aString, uint32_t aStart,
uint32_t aLimit, nsTArray<bool>& aHyphens);
void* mDict;
bool mHyphenateCapitalized;
};
#endif // nsHyphenator_h__

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="af">
Al&shy;le mens&shy;li&shy;ke we&shy;sens word vry, met ge&shy;ly&shy;ke waar&shy;dig&shy;heid en reg&shy;te, ge&shy;bo&shy;re.
Alle mens&shy;li&shy;ke we&shy;sens word vry, met ge&shy;ly&shy;ke waar&shy;dig&shy;heid en reg&shy;te, ge&shy;bo&shy;re.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="bg">
Всич&shy;ки хо&shy;ра се раж&shy;дат сво&shy;бод&shy;ни и рав&shy;ни по дос&shy;тойн&shy;с&shy;т&shy;во и пра&shy;ва.
Всички хо&shy;ра се раж&shy;дат сво&shy;бод&shy;ни и рав&shy;ни по дос&shy;тойн&shy;с&shy;т&shy;во и пра&shy;ва.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="cy">
Gen&shy;ir pawb yn rhydd ac yn gyd&shy;radd â'i gil&shy;ydd mewn urdd&shy;as a hawl&shy;iau.
Genir pawb yn rhydd ac yn gyd&shy;radd â'i gil&shy;ydd mewn urdd&shy;as a hawl&shy;iau.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="da">
Al&shy;le men&shy;ne&shy;sker er født frie og li&shy;ge i vær&shy;dig&shy;hed og ret&shy;tig&shy;he&shy;der.
Alle men&shy;ne&shy;sker er født frie og li&shy;ge i vær&shy;dig&shy;hed og ret&shy;tig&shy;he&shy;der.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="es">
To&shy;dos los se&shy;res hu&shy;ma&shy;nos na&shy;cen li&shy;bres e igua&shy;les en dig&shy;ni&shy;dad y de&shy;re&shy;chos
Todos los se&shy;res hu&shy;ma&shy;nos na&shy;cen li&shy;bres e igua&shy;les en dig&shy;ni&shy;dad y de&shy;re&shy;chos
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="fi">
Kaik&shy;ki ih&shy;mi&shy;set syn&shy;ty&shy;vät va&shy;pai&shy;na ja ta&shy;sa&shy;ver&shy;tai&shy;si&shy;na ar&shy;vol&shy;taan ja oi&shy;keuk&shy;sil&shy;taan.
Kaikki ih&shy;mi&shy;set syn&shy;ty&shy;vät va&shy;pai&shy;na ja ta&shy;sa&shy;ver&shy;tai&shy;si&shy;na ar&shy;vol&shy;taan ja oi&shy;keuk&shy;sil&shy;taan.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="gl">
&shy;do&shy;los se&shy;res hu&shy;ma&shy;nos na&shy;cen li&shy;bres e iguais en dig&shy;ni&shy;da&shy;de e de&shy;rei&shy;tos
Tódolos se&shy;res hu&shy;ma&shy;nos na&shy;cen li&shy;bres e iguais en dig&shy;ni&shy;da&shy;de e de&shy;rei&shy;tos
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="hu">
Min&shy;den em&shy;be&shy;ri lény sza&shy;ba&shy;don szü&shy;le&shy;tik és egyen&shy;lő mél&shy;&shy;&shy;ga és jo&shy;ga van.
Minden em&shy;be&shy;ri lény sza&shy;ba&shy;don szü&shy;le&shy;tik és egyen&shy;lő mél&shy;&shy;&shy;ga és jo&shy;ga van.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="ia">
To&shy;te le es&shy;se&shy;res hu&shy;man na&shy;sce li&shy;be&shy;re e equal in dig&shy;ni&shy;ta&shy;te e in de&shy;rec&shy;tos
Tote le es&shy;se&shy;res hu&shy;man na&shy;sce li&shy;be&shy;re e equal in dig&shy;ni&shy;ta&shy;te e in de&shy;rec&shy;tos
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="it">
Tut&shy;ti gli es&shy;se&shy;ri uma&shy;ni na&shy;sco&shy;no li&shy;be&shy;ri ed egua&shy;li in di&shy;gni&shy;tà e di&shy;rit&shy;ti.
Tutti gli es&shy;se&shy;ri uma&shy;ni na&shy;sco&shy;no li&shy;be&shy;ri ed egua&shy;li in di&shy;gni&shy;tà e di&shy;rit&shy;ti.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="kmr">
He&shy;mû mi&shy;rov azad û di we&shy;qar û ma&shy;fan de we&shy;k&shy;hev tên din&shy;ya&shy;
Hemû mi&shy;rov azad û di we&shy;qar û ma&shy;fan de we&shy;k&shy;hev tên din&shy;ya&shy;
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="la">
Om&shy;nes ho&shy;mi&shy;nes di&shy;gni&shy;ta&shy;te et iu&shy;re li&shy;be&shy;ri et pa&shy;res na&shy;scun&shy;tur
Omnes ho&shy;mi&shy;nes di&shy;gni&shy;ta&shy;te et iu&shy;re li&shy;be&shy;ri et pa&shy;res na&shy;scun&shy;tur
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="lt">
Vi&shy;si žmo&shy;nės gims&shy;ta lais&shy;vi ir ly&shy;gūs sa&shy;vo oru&shy;mu ir tei&shy;&shy;mis.
Visi žmo&shy;nės gims&shy;ta lais&shy;vi ir ly&shy;gūs sa&shy;vo oru&shy;mu ir tei&shy;&shy;mis.
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="nl">
Al&shy;le men&shy;sen wor&shy;den vrij en ge&shy;lijk in waar&shy;dig&shy;heid en rech&shy;ten ge&shy;bo&shy;ren
Alle men&shy;sen wor&shy;den vrij en ge&shy;lijk in waar&shy;dig&shy;heid en rech&shy;ten ge&shy;bo&shy;ren
</div>
</body>
</html>

Просмотреть файл

@ -11,8 +11,7 @@ body {
</style>
</head>
<body>
<div style="white-space:pre-wrap;">Uni-
kod
<div style="white-space:pre-wrap;">Unikod
przy-
pi-
su-

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="pt">
To&shy;dos os se&shy;res hu&shy;ma&shy;nos nas&shy;cem li&shy;vres e iguais em dig&shy;ni&shy;da&shy;de e em di&shy;rei&shy;tos
Todos os se&shy;res hu&shy;ma&shy;nos nas&shy;cem li&shy;vres e iguais em dig&shy;ni&shy;da&shy;de e em di&shy;rei&shy;tos
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="sv">
Al&shy;la män&shy;ni&shy;skor äro föd&shy;da fria och li&shy;ka i vär&shy;de och rät&shy;tig&shy;he&shy;ter
Alla män&shy;ni&shy;skor äro föd&shy;da fria och li&shy;ka i vär&shy;de och rät&shy;tig&shy;he&shy;ter
</div>
</body>
</html>

Просмотреть файл

@ -5,7 +5,7 @@
</head>
<body>
<div style="width:1em; hyphens:manual;" lang="tr">
&shy;tün in&shy;san&shy;lar hür, hay&shy;si&shy;yet ve hak&shy;lar ba&shy;kı&shy;mın&shy;dan eşit do&shy;ğar&shy;lar.
Bütün in&shy;san&shy;lar hür, hay&shy;si&shy;yet ve hak&shy;lar ba&shy;kı&shy;mın&shy;dan eşit do&shy;ğar&shy;lar.
</div>
</body>
</html>

Просмотреть файл

@ -11,7 +11,7 @@ code {
</head>
<body lang="en-us">
<code style="width:100ch;">
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTU-<br />VWXYZsupercalifragilisticexpialidocious-<br />ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstu-<br />vwxyzsupercalifragilisticexpialidocious-<br />abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
</code>
</body>
</html>

Просмотреть файл

@ -15,7 +15,7 @@ code {
manual hyphenation opportunities even if they are within an extreme long word.
-->
<code style="width:100ch;">
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZsuper&shy;cali&shy;fragi&shy;listic&shy;expiali&shy;docious&shy;ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzsuper&shy;cali&shy;fragi&shy;listic&shy;expiali&shy;docious&shy;abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
</code>
</body>
</html>

Просмотреть файл

@ -2497,6 +2497,11 @@ pref("intl.hyphenation-alias.no-*", "nb");
pref("intl.hyphenation-alias.nb-*", "nb");
pref("intl.hyphenation-alias.nn-*", "nn");
// In German, we allow hyphenation of capitalized words; otherwise not.
pref("intl.hyphenate-capitalized.de-1996", true);
pref("intl.hyphenate-capitalized.de-1901", true);
pref("intl.hyphenate-capitalized.de-CH", true);
// All prefs of default font should be "auto".
pref("font.name.serif.ar", "");
pref("font.name.sans-serif.ar", "");