зеркало из https://github.com/mozilla/gecko-dev.git
Bug 1550532 - Avoid auto-hyphenating capitalized words, except for German. r=emilio,mats
This affects a number of our existing reftests, so we'll need to update those to not expect auto-hyphenation of a sentence-initial (capitalized) word. (Hyphenation behavior is not sufficiently well-specified for this to be tested at the WPT level, so we just use Gecko-specific reftests.) Differential Revision: https://phabricator.services.mozilla.com/D30912 --HG-- extra : moz-landing-system : lando
This commit is contained in:
Родитель
fa6db29146
Коммит
c5d721dff7
|
@ -80,6 +80,9 @@ already_AddRefed<nsHyphenator> nsHyphenationManager::GetHyphenator(
|
|||
if (hyph) {
|
||||
return hyph.forget();
|
||||
}
|
||||
nsAutoCString hyphCapPref("intl.hyphenate-capitalized.");
|
||||
hyphCapPref.Append(nsAtomCString(aLocale));
|
||||
bool hyphenateCapitalized = Preferences::GetBool(hyphCapPref.get());
|
||||
nsCOMPtr<nsIURI> uri = mPatternFiles.Get(aLocale);
|
||||
if (!uri) {
|
||||
RefPtr<nsAtom> alias = mHyphAliases.Get(aLocale);
|
||||
|
@ -111,7 +114,7 @@ already_AddRefed<nsHyphenator> nsHyphenationManager::GetHyphenator(
|
|||
}
|
||||
}
|
||||
}
|
||||
hyph = new nsHyphenator(uri);
|
||||
hyph = new nsHyphenator(uri, hyphenateCapitalized);
|
||||
if (hyph->IsValid()) {
|
||||
mHyphenators.Put(aLocale, hyph);
|
||||
return hyph.forget();
|
||||
|
|
|
@ -11,7 +11,8 @@
|
|||
|
||||
#include "hyphen.h"
|
||||
|
||||
nsHyphenator::nsHyphenator(nsIURI* aURI) : mDict(nullptr) {
|
||||
nsHyphenator::nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized)
|
||||
: mDict(nullptr), mHyphenateCapitalized(aHyphenateCapitalized) {
|
||||
nsCString uriSpec;
|
||||
nsresult rv = aURI->GetSpec(uriSpec);
|
||||
if (NS_FAILED(rv)) {
|
||||
|
@ -70,79 +71,95 @@ nsresult nsHyphenator::Hyphenate(const nsAString& aString,
|
|||
}
|
||||
|
||||
if (inWord) {
|
||||
// Convert the word to utf-8 for libhyphen, lowercasing it as we go
|
||||
// so that it will match the (lowercased) patterns (bug 1105644).
|
||||
nsAutoCString utf8;
|
||||
const char16_t* const begin = aString.BeginReading();
|
||||
const char16_t* cur = begin + wordStart;
|
||||
const char16_t* end = begin + wordLimit;
|
||||
while (cur < end) {
|
||||
uint32_t ch = *cur++;
|
||||
|
||||
if (NS_IS_HIGH_SURROGATE(ch)) {
|
||||
if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
|
||||
ch = SURROGATE_TO_UCS4(ch, *cur++);
|
||||
} else {
|
||||
ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
|
||||
}
|
||||
} else if (NS_IS_LOW_SURROGATE(ch)) {
|
||||
ch = 0xfffd; // unpaired surrogate
|
||||
}
|
||||
|
||||
// XXX What about language-specific casing? Consider Turkish I/i...
|
||||
// In practice, it looks like the current patterns will not be
|
||||
// affected by this, as they treat dotted and undotted i similarly.
|
||||
ch = ToLowerCase(ch);
|
||||
|
||||
if (ch < 0x80) { // U+0000 - U+007F
|
||||
utf8.Append(ch);
|
||||
} else if (ch < 0x0800) { // U+0100 - U+07FF
|
||||
utf8.Append(0xC0 | (ch >> 6));
|
||||
utf8.Append(0x80 | (0x003F & ch));
|
||||
} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
|
||||
utf8.Append(0xE0 | (ch >> 12));
|
||||
utf8.Append(0x80 | (0x003F & (ch >> 6)));
|
||||
utf8.Append(0x80 | (0x003F & ch));
|
||||
} else {
|
||||
utf8.Append(0xF0 | (ch >> 18));
|
||||
utf8.Append(0x80 | (0x003F & (ch >> 12)));
|
||||
utf8.Append(0x80 | (0x003F & (ch >> 6)));
|
||||
utf8.Append(0x80 | (0x003F & ch));
|
||||
}
|
||||
}
|
||||
|
||||
AutoTArray<char, 200> utf8hyphens;
|
||||
utf8hyphens.SetLength(utf8.Length() + 5);
|
||||
char** rep = nullptr;
|
||||
int* pos = nullptr;
|
||||
int* cut = nullptr;
|
||||
int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(),
|
||||
utf8.Length(), utf8hyphens.Elements(),
|
||||
nullptr, &rep, &pos, &cut);
|
||||
if (!err) {
|
||||
// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
|
||||
// from utf8 code unit indexing (which would match the utf8 input
|
||||
// string directly) to Unicode character indexing.
|
||||
// We then need to convert this to utf16 code unit offsets for Gecko.
|
||||
const char* hyphPtr = utf8hyphens.Elements();
|
||||
const char16_t* cur = begin + wordStart;
|
||||
const char16_t* end = begin + wordLimit;
|
||||
while (cur < end) {
|
||||
if (*hyphPtr & 0x01) {
|
||||
aHyphens[cur - begin] = true;
|
||||
}
|
||||
cur++;
|
||||
if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
|
||||
NS_IS_HIGH_SURROGATE(*(cur - 1))) {
|
||||
cur++;
|
||||
}
|
||||
hyphPtr++;
|
||||
}
|
||||
}
|
||||
HyphenateWord(aString, wordStart, wordLimit, aHyphens);
|
||||
inWord = false;
|
||||
}
|
||||
|
||||
inWord = false;
|
||||
}
|
||||
|
||||
return NS_OK;
|
||||
}
|
||||
|
||||
// Finds automatic hyphenation points for one word of aString and records
// them in aHyphens.
//
// aString  - the text containing the word (UTF-16).
// aStart   - UTF-16 code unit offset of the first unit of the word.
// aLimit   - UTF-16 code unit offset one past the last unit of the word.
// aHyphens - entries are set to true at offsets (relative to the start of
//            aString) where libhyphen reports a break; entries are never
//            cleared here. Presumably the caller has already sized this
//            array to cover aString - TODO confirm against the caller.
//
// If the first character of the word is changed by ToLowerCase and
// mHyphenateCapitalized is false, the function returns before calling
// libhyphen, leaving aHyphens untouched for this word.
void nsHyphenator::HyphenateWord(const nsAString& aString, uint32_t aStart,
|
||||
uint32_t aLimit, nsTArray<bool>& aHyphens) {
|
||||
// Convert word from aStart and aLimit in aString to utf-8 for libhyphen,
|
||||
// lowercasing it as we go so that it will match the (lowercased) patterns
|
||||
// (bug 1105644).
|
||||
nsAutoCString utf8;
|
||||
const char16_t* const begin = aString.BeginReading();
|
||||
const char16_t* cur = begin + aStart;
|
||||
const char16_t* end = begin + aLimit;
|
||||
// True until the first character of the word has been examined.
bool firstLetter = true;
|
||||
while (cur < end) {
|
||||
uint32_t ch = *cur++;
|
||||

||||
// Decode one character from UTF-16: a valid surrogate pair consumes two
// code units, anything unpaired becomes U+FFFD.
if (NS_IS_HIGH_SURROGATE(ch)) {
|
||||
if (cur < end && NS_IS_LOW_SURROGATE(*cur)) {
|
||||
ch = SURROGATE_TO_UCS4(ch, *cur++);
|
||||
} else {
|
||||
ch = 0xfffd; // unpaired surrogate, treat as REPLACEMENT CHAR
|
||||
}
|
||||
} else if (NS_IS_LOW_SURROGATE(ch)) {
|
||||
ch = 0xfffd; // unpaired surrogate
|
||||
}
|
||||

||||
// XXX What about language-specific casing? Consider Turkish I/i...
|
||||
// In practice, it looks like the current patterns will not be
|
||||
// affected by this, as they treat dotted and undotted i similarly.
|
||||
// Keep the original value so a case change can be detected below.
uint32_t origCh = ch;
|
||||
ch = ToLowerCase(ch);
|
||||

||||
// Avoid hyphenating capitalized words (bug 1550532) unless explicitly
|
||||
// allowed by prefs for the language in use.
|
||||
// Only the first character is tested; "capitalized" here means that
// lowercasing changed it.
if (firstLetter) {
|
||||
if (!mHyphenateCapitalized && ch != origCh) {
|
||||
return;
|
||||
}
|
||||
firstLetter = false;
|
||||
}
|
||||

||||
// Append the lowercased character to utf8 as 1 to 4 UTF-8 bytes.
if (ch < 0x80) { // U+0000 - U+007F
|
||||
utf8.Append(ch);
|
||||
} else if (ch < 0x0800) { // U+0080 - U+07FF
|
||||
utf8.Append(0xC0 | (ch >> 6));
|
||||
utf8.Append(0x80 | (0x003F & ch));
|
||||
} else if (ch < 0x10000) { // U+0800 - U+D7FF,U+E000 - U+FFFF
|
||||
utf8.Append(0xE0 | (ch >> 12));
|
||||
utf8.Append(0x80 | (0x003F & (ch >> 6)));
|
||||
utf8.Append(0x80 | (0x003F & ch));
|
||||
} else {
|
||||
utf8.Append(0xF0 | (ch >> 18));
|
||||
utf8.Append(0x80 | (0x003F & (ch >> 12)));
|
||||
utf8.Append(0x80 | (0x003F & (ch >> 6)));
|
||||
utf8.Append(0x80 | (0x003F & ch));
|
||||
}
|
||||
}
|
||||

||||
AutoTArray<char, 200> utf8hyphens;
|
||||
// NOTE(review): the extra 5 bytes are presumably the slack libhyphen
// requires for its output buffer - confirm against hyphen.h.
utf8hyphens.SetLength(utf8.Length() + 5);
|
||||
// Out-parameters for hnj_hyphen_hyphenate2; none of them is read again in
// this function.
// NOTE(review): if libhyphen can allocate through these pointers, nothing
// here frees them - confirm against the libhyphen API.
char** rep = nullptr;
|
||||
int* pos = nullptr;
|
||||
int* cut = nullptr;
|
||||
int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict, utf8.BeginReading(),
|
||||
utf8.Length(), utf8hyphens.Elements(),
|
||||
nullptr, &rep, &pos, &cut);
|
||||
// A nonzero result leaves aHyphens unchanged for this word.
if (!err) {
|
||||
// Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
|
||||
// from utf8 code unit indexing (which would match the utf8 input
|
||||
// string directly) to Unicode character indexing.
|
||||
// We then need to convert this to utf16 code unit offsets for Gecko.
|
||||
const char* hyphPtr = utf8hyphens.Elements();
|
||||
const char16_t* cur = begin + aStart;
|
||||
const char16_t* end = begin + aLimit;
|
||||
// Walk the word one character at a time; hyphPtr advances once per
// character while cur advances one or two UTF-16 code units.
while (cur < end) {
|
||||
// The low bit of the entry marks a hyphenation point; it is recorded at
// the offset of the character's first code unit.
if (*hyphPtr & 0x01) {
|
||||
aHyphens[cur - begin] = true;
|
||||
}
|
||||
cur++;
|
||||
// Step over the low half of a valid surrogate pair so the pair counts
// as a single character.
if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
|
||||
NS_IS_HIGH_SURROGATE(*(cur - 1))) {
|
||||
cur++;
|
||||
}
|
||||
hyphPtr++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ class nsIURI;
|
|||
|
||||
class nsHyphenator {
|
||||
public:
|
||||
explicit nsHyphenator(nsIURI* aURI);
|
||||
nsHyphenator(nsIURI* aURI, bool aHyphenateCapitalized);
|
||||
|
||||
NS_INLINE_DECL_REFCOUNTING(nsHyphenator)
|
||||
|
||||
|
@ -25,8 +25,11 @@ class nsHyphenator {
|
|||
private:
|
||||
~nsHyphenator();
|
||||
|
||||
protected:
|
||||
void HyphenateWord(const nsAString& aString, uint32_t aStart,
|
||||
uint32_t aLimit, nsTArray<bool>& aHyphens);
|
||||
|
||||
void* mDict;
|
||||
bool mHyphenateCapitalized;
|
||||
};
|
||||
|
||||
#endif // nsHyphenator_h__
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="af">
|
||||
Al­le mens­li­ke we­sens word vry, met ge­ly­ke waar­dig­heid en reg­te, ge­bo­re.
|
||||
Alle mens­li­ke we­sens word vry, met ge­ly­ke waar­dig­heid en reg­te, ge­bo­re.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="bg">
|
||||
Всич­ки хо­ра се раж­дат сво­бод­ни и рав­ни по дос­тойн­с­т­во и пра­ва.
|
||||
Всички хо­ра се раж­дат сво­бод­ни и рав­ни по дос­тойн­с­т­во и пра­ва.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="cy">
|
||||
Gen­ir pawb yn rhydd ac yn gyd­radd â'i gil­ydd mewn urdd­as a hawl­iau.
|
||||
Genir pawb yn rhydd ac yn gyd­radd â'i gil­ydd mewn urdd­as a hawl­iau.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="da">
|
||||
Al­le men­ne­sker er født frie og li­ge i vær­dig­hed og ret­tig­he­der.
|
||||
Alle men­ne­sker er født frie og li­ge i vær­dig­hed og ret­tig­he­der.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="es">
|
||||
To­dos los se­res hu­ma­nos na­cen li­bres e igua­les en dig­ni­dad y de­re­chos
|
||||
Todos los se­res hu­ma­nos na­cen li­bres e igua­les en dig­ni­dad y de­re­chos
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="fi">
|
||||
Kaik­ki ih­mi­set syn­ty­vät va­pai­na ja ta­sa­ver­tai­si­na ar­vol­taan ja oi­keuk­sil­taan.
|
||||
Kaikki ih­mi­set syn­ty­vät va­pai­na ja ta­sa­ver­tai­si­na ar­vol­taan ja oi­keuk­sil­taan.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="gl">
|
||||
Tó­do­los se­res hu­ma­nos na­cen li­bres e iguais en dig­ni­da­de e de­rei­tos
|
||||
Tódolos se­res hu­ma­nos na­cen li­bres e iguais en dig­ni­da­de e de­rei­tos
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="hu">
|
||||
Min­den em­be­ri lény sza­ba­don szü­le­tik és egyen­lő mél­tó­sá­ga és jo­ga van.
|
||||
Minden em­be­ri lény sza­ba­don szü­le­tik és egyen­lő mél­tó­sá­ga és jo­ga van.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="ia">
|
||||
To­te le es­se­res hu­man na­sce li­be­re e equal in dig­ni­ta­te e in de­rec­tos
|
||||
Tote le es­se­res hu­man na­sce li­be­re e equal in dig­ni­ta­te e in de­rec­tos
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="it">
|
||||
Tut­ti gli es­se­ri uma­ni na­sco­no li­be­ri ed egua­li in di­gni­tà e di­rit­ti.
|
||||
Tutti gli es­se­ri uma­ni na­sco­no li­be­ri ed egua­li in di­gni­tà e di­rit­ti.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="kmr">
|
||||
He­mû mi­rov azad û di we­qar û ma­fan de we­k­hev tên din­ya­yê
|
||||
Hemû mi­rov azad û di we­qar û ma­fan de we­k­hev tên din­ya­yê
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="la">
|
||||
Om­nes ho­mi­nes di­gni­ta­te et iu­re li­be­ri et pa­res na­scun­tur
|
||||
Omnes ho­mi­nes di­gni­ta­te et iu­re li­be­ri et pa­res na­scun­tur
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="lt">
|
||||
Vi­si žmo­nės gims­ta lais­vi ir ly­gūs sa­vo oru­mu ir tei­sė­mis.
|
||||
Visi žmo­nės gims­ta lais­vi ir ly­gūs sa­vo oru­mu ir tei­sė­mis.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="nl">
|
||||
Al­le men­sen wor­den vrij en ge­lijk in waar­dig­heid en rech­ten ge­bo­ren
|
||||
Alle men­sen wor­den vrij en ge­lijk in waar­dig­heid en rech­ten ge­bo­ren
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -11,8 +11,7 @@ body {
|
|||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div style="white-space:pre-wrap;">Uni-
|
||||
kod
|
||||
<div style="white-space:pre-wrap;">Unikod
|
||||
przy-
|
||||
pi-
|
||||
su-
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="pt">
|
||||
To­dos os se­res hu­ma­nos nas­cem li­vres e iguais em dig­ni­da­de e em di­rei­tos
|
||||
Todos os se­res hu­ma­nos nas­cem li­vres e iguais em dig­ni­da­de e em di­rei­tos
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="sv">
|
||||
Al­la män­ni­skor äro föd­da fria och li­ka i vär­de och rät­tig­he­ter
|
||||
Alla män­ni­skor äro föd­da fria och li­ka i vär­de och rät­tig­he­ter
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
</head>
|
||||
<body>
|
||||
<div style="width:1em; hyphens:manual;" lang="tr">
|
||||
Bü­tün in­san­lar hür, hay­si­yet ve hak­lar ba­kı­mın­dan eşit do­ğar­lar.
|
||||
Bütün in­san­lar hür, hay­si­yet ve hak­lar ba­kı­mın­dan eşit do­ğar­lar.
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -11,7 +11,7 @@ code {
|
|||
</head>
|
||||
<body lang="en-us">
|
||||
<code style="width:100ch;">
|
||||
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTU-<br />VWXYZsupercalifragilisticexpialidocious-<br />ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstu-<br />vwxyzsupercalifragilisticexpialidocious-<br />abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
|
||||
</code>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -15,7 +15,7 @@ code {
|
|||
manual hyphenation opportunities even if they are within an extreme long word.
|
||||
-->
|
||||
<code style="width:100ch;">
|
||||
ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZsuper­cali­fragi­listic­expiali­docious­ABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZABCDEFGHIJKLMNOPQRSTUVWXYZ
|
||||
abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzsuper­cali­fragi­listic­expiali­docious­abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz
|
||||
</code>
|
||||
</body>
|
||||
</html>
|
||||
|
|
|
@ -2497,6 +2497,11 @@ pref("intl.hyphenation-alias.no-*", "nb");
|
|||
pref("intl.hyphenation-alias.nb-*", "nb");
|
||||
pref("intl.hyphenation-alias.nn-*", "nn");
|
||||
|
||||
// In German, we allow hyphenation of capitalized words; otherwise not.
|
||||
pref("intl.hyphenate-capitalized.de-1996", true);
|
||||
pref("intl.hyphenate-capitalized.de-1901", true);
|
||||
pref("intl.hyphenate-capitalized.de-CH", true);
|
||||
|
||||
// All prefs of default font should be "auto".
|
||||
pref("font.name.serif.ar", "");
|
||||
pref("font.name.sans-serif.ar", "");
|
||||
|
|
Загрузка…
Ссылка в новой задаче