Mirror of https://github.com/mozilla/gecko-dev.git
Backed out 11 changesets (bug 1763783) for causing gtest failures. CLOSED TREE
Backed out changeset 87e552902463 (bug 1763783) Backed out changeset ed05d313926c (bug 1763783) Backed out changeset 89b4fcf7e929 (bug 1763783) Backed out changeset e0dc696a1d53 (bug 1763783) Backed out changeset e8a743f53265 (bug 1763783) Backed out changeset 5d3f1290d8ba (bug 1763783) Backed out changeset 42ef2c926e71 (bug 1763783) Backed out changeset e346cf03c468 (bug 1763783) Backed out changeset 81c196a9f7c5 (bug 1763783) Backed out changeset 51e22286e481 (bug 1763783) Backed out changeset a2a8cd91233c (bug 1763783)
Parent: 83e72b1d28
Commit: 15bdebfa75

CLOBBER
@@ -22,4 +22,4 @@
 # changes to stick? As of bug 928195, this shouldn't be necessary! Please
 # don't change CLOBBER for WebIDL changes any more.
 
-Bug 1763783 - Update to ICU 71 requires clobber
+Merge day clobber 2022-04-04

Binary data
config/external/icu/data/icudt71l.dat → config/external/icu/data/icudt70l.dat (vendored)
Binary file not shown.

@@ -318,7 +318,6 @@ EXPORTS.unicode += [
     '/intl/icu/source/i18n/unicode/ulocdata.h',
     '/intl/icu/source/i18n/unicode/umsg.h',
     '/intl/icu/source/i18n/unicode/unirepl.h',
-    '/intl/icu/source/i18n/unicode/unounclass.h',
     '/intl/icu/source/i18n/unicode/unum.h',
     '/intl/icu/source/i18n/unicode/unumberformatter.h',
     '/intl/icu/source/i18n/unicode/unumberrangeformatter.h',

@@ -419,7 +419,8 @@ class DateTimeFormat final {
    * plan to remove it.
    */
   template <typename B>
-  ICUResult GetOriginalSkeleton(B& aBuffer) {
+  ICUResult GetOriginalSkeleton(B& aBuffer,
+                                Maybe<HourCycle> aHourCycle = Nothing()) {
     static_assert(std::is_same_v<typename B::CharType, char16_t>);
     if (mOriginalSkeleton.length() == 0) {
       // Generate a skeleton from the resolved pattern, there was no originally
@@ -435,6 +436,10 @@ class DateTimeFormat final {
     if (!FillBuffer(mOriginalSkeleton, aBuffer)) {
       return Err(ICUError::OutOfMemory);
     }
+    if (aHourCycle) {
+      DateTimeFormat::ReplaceHourSymbol(Span(aBuffer.data(), aBuffer.length()),
+                                        *aHourCycle);
+    }
     return Ok();
   }
   /**

@@ -1,6 +1,6 @@
 // Generated by make_intl_data.py. DO NOT EDIT.
-// Version: CLDR-41
-// URL: https://unicode.org/Public/cldr/41/core.zip
+// Version: CLDR-40
+// URL: https://unicode.org/Public/cldr/40/core.zip
 
 #include "mozilla/Assertions.h"
 #include "mozilla/Span.h"
@@ -99,8 +99,8 @@ static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
 #endif
 
 // Mappings from language subtags to preferred values.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::LanguageMapping(LanguageSubtag& language) {
   MOZ_ASSERT(IsStructurallyValidLanguageTag(language.Span()));
   MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.Span()));
@@ -219,8 +219,8 @@ bool mozilla::intl::Locale::LanguageMapping(LanguageSubtag& language) {
 }
 
 // Language subtags with complex mappings.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::ComplexLanguageMapping(const LanguageSubtag& language) {
   MOZ_ASSERT(IsStructurallyValidLanguageTag(language.Span()));
   MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language.Span()));
@@ -241,8 +241,8 @@ bool mozilla::intl::Locale::ComplexLanguageMapping(const LanguageSubtag& languag
 }
 
 // Mappings from script subtags to preferred values.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::ScriptMapping(ScriptSubtag& script) {
   MOZ_ASSERT(IsStructurallyValidScriptTag(script.Span()));
   MOZ_ASSERT(IsCanonicallyCasedScriptTag(script.Span()));
@@ -257,8 +257,8 @@ bool mozilla::intl::Locale::ScriptMapping(ScriptSubtag& script) {
 }
 
 // Mappings from region subtags to preferred values.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::RegionMapping(RegionSubtag& region) {
   MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
   MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
@@ -357,8 +357,8 @@ bool mozilla::intl::Locale::RegionMapping(RegionSubtag& region) {
 }
 
 // Region subtags with complex mappings.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::ComplexRegionMapping(const RegionSubtag& region) {
   MOZ_ASSERT(IsStructurallyValidRegionTag(region.Span()));
   MOZ_ASSERT(IsCanonicallyCasedRegionTag(region.Span()));
@@ -380,8 +380,8 @@ bool mozilla::intl::Locale::ComplexRegionMapping(const RegionSubtag& region) {
 }
 
 // Language subtags with complex mappings.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 void mozilla::intl::Locale::PerformComplexLanguageMappings() {
   MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
   MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
@@ -416,8 +416,8 @@ void mozilla::intl::Locale::PerformComplexLanguageMappings() {
 }
 
 // Region subtags with complex mappings.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 void mozilla::intl::Locale::PerformComplexRegionMappings() {
   MOZ_ASSERT(IsStructurallyValidLanguageTag(Language().Span()));
   MOZ_ASSERT(IsCanonicallyCasedLanguageTag(Language().Span()));
@@ -643,8 +643,8 @@ static bool IsLessThan(const T& a, const U& b) {
 }
 
 // Mappings from variant subtags to preferred values.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::PerformVariantMappings() {
   // The variant subtags need to be sorted for binary search.
   MOZ_ASSERT(std::is_sorted(mVariants.begin(), mVariants.end(),
@@ -707,8 +707,8 @@ bool mozilla::intl::Locale::PerformVariantMappings() {
 }
 
 // Canonicalize legacy locale identifiers.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::UpdateLegacyMappings() {
   // We're mapping legacy tags to non-legacy form here.
   // Other tags remain unchanged.
@@ -865,8 +865,8 @@ bool mozilla::intl::Locale::UpdateLegacyMappings() {
 }
 
 // Mappings from legacy sign languages.
-// Derived from CLDR Supplemental Data, version 41.
-// https://unicode.org/Public/cldr/41/core.zip
+// Derived from CLDR Supplemental Data, version 40.
+// https://unicode.org/Public/cldr/40/core.zip
 bool mozilla::intl::Locale::SignLanguageMapping(LanguageSubtag& language,
                                                 const RegionSubtag& region) {
   MOZ_ASSERT(language.EqualTo("sgn"));
@@ -1112,18 +1112,9 @@ const char* mozilla::intl::Locale::ReplaceTransformExtensionType(
}
}
else if (IsTransformKey(key, "m0")) {
if (IsTransformType(type, "beta-metsehaf")) {
return "betamets";
}
if (IsTransformType(type, "ies-jes")) {
return "iesjes";
}
if (IsTransformType(type, "names")) {
return "prprname";
}
if (IsTransformType(type, "tekie-alibekit")) {
return "tekieali";
}
}
return nullptr;
}

@@ -378,7 +378,7 @@ Maybe<NumberPartType> GetPartTypeForNumberField(UNumberFormatFields fieldName,
       return Some(NumberPartType::Unit);
     case UNUM_COMPACT_FIELD:
       return Some(NumberPartType::Compact);
-#ifndef U_HIDE_DRAFT_API
+#if !MOZ_SYSTEM_ICU
     case UNUM_APPROXIMATELY_SIGN_FIELD:
       return Some(NumberPartType::ApproximatelySign);
 #endif

@@ -169,8 +169,13 @@ Result<int32_t, ICUError> TimeZone::GetUTCOffsetMs(int64_t aLocalMilliseconds) {
   // time starts or the time zone offset is increased due to a time zone rule
   // change), t_local must be interpreted using the time zone offset before the
   // transition.
+#ifndef U_HIDE_DRAFT_API
   constexpr UTimeZoneLocalOption skippedTime = UCAL_TZ_LOCAL_FORMER;
   constexpr UTimeZoneLocalOption repeatedTime = UCAL_TZ_LOCAL_FORMER;
+#else
+  constexpr UTimeZoneLocalOption skippedTime = UTimeZoneLocalOption(0x4);
+  constexpr UTimeZoneLocalOption repeatedTime = UTimeZoneLocalOption(0x4);
+#endif
 
   UDate date = UDate(aLocalMilliseconds);
 

@@ -0,0 +1,56 @@
# Add a new UNumberFormatFields constant for the approximately sign.
#
# https://unicode-org.atlassian.net/browse/ICU-21765


diff --git a/intl/icu/source/i18n/number_affixutils.cpp b/intl/icu/source/i18n/number_affixutils.cpp
--- a/intl/icu/source/i18n/number_affixutils.cpp
+++ b/intl/icu/source/i18n/number_affixutils.cpp
@@ -131,17 +131,17 @@ UnicodeString AffixUtils::escape(const U
Field AffixUtils::getFieldForType(AffixPatternType type) {
switch (type) {
case TYPE_MINUS_SIGN:
return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
case TYPE_PLUS_SIGN:
return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
case TYPE_APPROXIMATELY_SIGN:
// TODO: Introduce a new field for the approximately sign?
- return {UFIELD_CATEGORY_NUMBER, UNUM_SIGN_FIELD};
+ return {UFIELD_CATEGORY_NUMBER, UNUM_APPROXIMATELY_SIGN_FIELD};
case TYPE_PERCENT:
return {UFIELD_CATEGORY_NUMBER, UNUM_PERCENT_FIELD};
case TYPE_PERMILLE:
return {UFIELD_CATEGORY_NUMBER, UNUM_PERMILL_FIELD};
case TYPE_CURRENCY_SINGLE:
return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
case TYPE_CURRENCY_DOUBLE:
return {UFIELD_CATEGORY_NUMBER, UNUM_CURRENCY_FIELD};
diff --git a/intl/icu/source/i18n/unicode/unum.h b/intl/icu/source/i18n/unicode/unum.h
--- a/intl/icu/source/i18n/unicode/unum.h
+++ b/intl/icu/source/i18n/unicode/unum.h
@@ -397,22 +397,24 @@ typedef enum UNumberFormatFields {
UNUM_PERMILL_FIELD,
/** @stable ICU 49 */
UNUM_SIGN_FIELD,
/** @stable ICU 64 */
UNUM_MEASURE_UNIT_FIELD,
/** @stable ICU 64 */
UNUM_COMPACT_FIELD,

+ UNUM_APPROXIMATELY_SIGN_FIELD,
+
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the highest normal UNumberFormatFields value.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
- UNUM_FIELD_COUNT = UNUM_SIGN_FIELD + 3
+ UNUM_FIELD_COUNT = UNUM_SIGN_FIELD + 4
#endif /* U_HIDE_DEPRECATED_API */
} UNumberFormatFields;


/**
* Selectors with special numeric values to use locale default minimum grouping
* digits for the DecimalFormat/UNumberFormat setMinimumGroupingDigits method.
* Do not use these constants with the [U]NumberFormatter API.

@@ -1,5 +1,5 @@
-commit c205e7ee49a7086a28b9c275fcfdac9ca3dc815d
-Author: yumaoka <y.umaoka@gmail.com>
-Date: Wed Mar 30 14:47:46 2022 -0400
+commit a56dde820dc35665a66f2e9ee8ba58e75049b668
+Author: Shane F. Carr <shane@unicode.org>
+Date: Wed Oct 27 15:02:46 2021 -0700
 
-ICU-21971 Added a new numeric currecny code SLE/695 for Sierra Leone Leone.
+ICU-21579 Fix warnings in number formatting code

@ -79,7 +79,6 @@ UnhandledEngine::findBreaks( UText *text,
|
|||
int32_t /* startPos */,
|
||||
int32_t endPos,
|
||||
UVector32 &/*foundBreaks*/,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode &status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
UChar32 c = utext_current32(text);
|
||||
|
|
|
@ -75,7 +75,6 @@ class LanguageBreakEngine : public UMemory {
|
|||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode &status) const = 0;
|
||||
|
||||
};
|
||||
|
@ -195,7 +194,6 @@ class UnhandledEngine : public LanguageBreakEngine {
|
|||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode &status) const override;
|
||||
|
||||
/**
|
||||
|
|
|
@ -30,7 +30,6 @@
|
|||
#include "unicode/ures.h"
|
||||
#include "unicode/ustring.h"
|
||||
#include "unicode/filteredbrk.h"
|
||||
#include "bytesinkutil.h"
|
||||
#include "ucln_cmn.h"
|
||||
#include "cstring.h"
|
||||
#include "umutex.h"
|
||||
|
@ -116,7 +115,7 @@ BreakIterator::buildInstance(const Locale& loc, const char *type, UErrorCode &st
|
|||
}
|
||||
|
||||
// Create a RuleBasedBreakIterator
|
||||
result = new RuleBasedBreakIterator(file, uprv_strstr(type, "phrase") != NULL, status);
|
||||
result = new RuleBasedBreakIterator(file, status);
|
||||
|
||||
// If there is a result, set the valid locale and actual locale, and the kind
|
||||
if (U_SUCCESS(status) && result != NULL) {
|
||||
|
@ -409,6 +408,7 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
if (U_FAILURE(status)) {
|
||||
return NULL;
|
||||
}
|
||||
char lbType[kKeyValueLenMax];
|
||||
|
||||
BreakIterator *result = NULL;
|
||||
switch (kind) {
|
||||
|
@ -428,29 +428,18 @@ BreakIterator::makeInstance(const Locale& loc, int32_t kind, UErrorCode& status)
|
|||
break;
|
||||
case UBRK_LINE:
|
||||
{
|
||||
char lb_lw[kKeyValueLenMax];
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_LINE);
|
||||
uprv_strcpy(lb_lw, "line");
|
||||
uprv_strcpy(lbType, "line");
|
||||
char lbKeyValue[kKeyValueLenMax] = {0};
|
||||
UErrorCode kvStatus = U_ZERO_ERROR;
|
||||
CharString value;
|
||||
CharStringByteSink valueSink(&value);
|
||||
loc.getKeywordValue("lb", valueSink, kvStatus);
|
||||
if (U_SUCCESS(kvStatus) && (value == "strict" || value == "normal" || value == "loose")) {
|
||||
uprv_strcat(lb_lw, "_");
|
||||
uprv_strcat(lb_lw, value.data());
|
||||
int32_t kLen = loc.getKeywordValue("lb", lbKeyValue, kKeyValueLenMax, kvStatus);
|
||||
if (U_SUCCESS(kvStatus) && kLen > 0 && (uprv_strcmp(lbKeyValue,"strict")==0 || uprv_strcmp(lbKeyValue,"normal")==0 || uprv_strcmp(lbKeyValue,"loose")==0)) {
|
||||
uprv_strcat(lbType, "_");
|
||||
uprv_strcat(lbType, lbKeyValue);
|
||||
}
|
||||
// lw=phrase is only supported in Japanese.
|
||||
if (uprv_strcmp(loc.getLanguage(), "ja") == 0) {
|
||||
value.clear();
|
||||
loc.getKeywordValue("lw", valueSink, kvStatus);
|
||||
if (U_SUCCESS(kvStatus) && value == "phrase") {
|
||||
uprv_strcat(lb_lw, "_");
|
||||
uprv_strcat(lb_lw, value.data());
|
||||
}
|
||||
}
|
||||
result = BreakIterator::buildInstance(loc, lb_lw, status);
|
||||
result = BreakIterator::buildInstance(loc, lbType, status);
|
||||
|
||||
UTRACE_DATA1(UTRACE_INFO, "lb_lw=%s", lb_lw);
|
||||
UTRACE_DATA1(UTRACE_INFO, "lb=%s", lbKeyValue);
|
||||
UTRACE_EXIT_STATUS(status);
|
||||
}
|
||||
break;
|
||||
|
|
|
@ -58,7 +58,7 @@
|
|||
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc71d.dll</OutputFile>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc70d.dll</OutputFile>
|
||||
<ProgramDatabaseFile>.\..\..\$(IcuLibOutputDir)\icuucd.pdb</ProgramDatabaseFile>
|
||||
<ImportLibrary>..\..\$(IcuLibOutputDir)\icuucd.lib</ImportLibrary>
|
||||
</Link>
|
||||
|
@ -70,7 +70,7 @@
|
|||
<FunctionLevelLinking>true</FunctionLevelLinking>
|
||||
</ClCompile>
|
||||
<Link>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc71.dll</OutputFile>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc70.dll</OutputFile>
|
||||
<ProgramDatabaseFile>.\..\..\$(IcuLibOutputDir)\icuuc.pdb</ProgramDatabaseFile>
|
||||
<ImportLibrary>..\..\$(IcuLibOutputDir)\icuuc.lib</ImportLibrary>
|
||||
</Link>
|
||||
|
|
|
@ -125,7 +125,7 @@
|
|||
<Link>
|
||||
<EnableCOMDATFolding>true</EnableCOMDATFolding>
|
||||
<AdditionalDependencies>vccorlib.lib;msvcrt.lib;vcruntime.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc71.dll</OutputFile>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc70.dll</OutputFile>
|
||||
<ProgramDatabaseFile>.\..\..\$(IcuLibOutputDir)\icuuc.pdb</ProgramDatabaseFile>
|
||||
<ImportLibrary>..\..\$(IcuLibOutputDir)\icuuc.lib</ImportLibrary>
|
||||
</Link>
|
||||
|
@ -148,7 +148,7 @@
|
|||
<Link>
|
||||
<GenerateDebugInformation>true</GenerateDebugInformation>
|
||||
<AdditionalDependencies>vccorlibd.lib;msvcrtd.lib;vcruntimed.lib;%(AdditionalDependencies)</AdditionalDependencies>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc71d.dll</OutputFile>
|
||||
<OutputFile>..\..\$(IcuBinOutputDir)\icuuc70d.dll</OutputFile>
|
||||
<ProgramDatabaseFile>.\..\..\$(IcuLibOutputDir)\icuucd.pdb</ProgramDatabaseFile>
|
||||
<ImportLibrary>..\..\$(IcuLibOutputDir)\icuucd.lib</ImportLibrary>
|
||||
</Link>
|
||||
|
|
|
@ -17,10 +17,7 @@
|
|||
#include "dictbe.h"
|
||||
#include "unicode/uniset.h"
|
||||
#include "unicode/chariter.h"
|
||||
#include "unicode/resbund.h"
|
||||
#include "unicode/ubrk.h"
|
||||
#include "unicode/usetiter.h"
|
||||
#include "ubrkimpl.h"
|
||||
#include "utracimp.h"
|
||||
#include "uvectr32.h"
|
||||
#include "uvector.h"
|
||||
|
@ -51,7 +48,6 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
|||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
(void)startPos; // TODO: remove this param?
|
||||
|
@ -72,7 +68,7 @@ DictionaryBreakEngine::findBreaks( UText *text,
|
|||
}
|
||||
rangeStart = start;
|
||||
rangeEnd = current;
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
|
||||
result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
|
||||
utext_setNativeIndex(text, current);
|
||||
|
||||
return result;
|
||||
|
@ -203,13 +199,13 @@ ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
|
||||
UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
|
||||
fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(thaiWordSet);
|
||||
setCharacters(fThaiWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = thaiWordSet;
|
||||
fEndWordSet = fThaiWordSet;
|
||||
fEndWordSet.remove(0x0E31); // MAI HAN-AKAT
|
||||
fEndWordSet.remove(0x0E40, 0x0E44); // SARA E through SARA AI MAIMALAI
|
||||
fBeginWordSet.add(0x0E01, 0x0E2E); // KO KAI through HO NOKHUK
|
||||
|
@ -234,7 +230,6 @@ ThaiBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
utext_setNativeIndex(text, rangeStart);
|
||||
|
@ -446,13 +441,13 @@ LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &s
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
|
||||
UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
|
||||
fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(laoWordSet);
|
||||
setCharacters(fLaoWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = laoWordSet;
|
||||
fEndWordSet = fLaoWordSet;
|
||||
fEndWordSet.remove(0x0EC0, 0x0EC4); // prefix vowels
|
||||
fBeginWordSet.add(0x0E81, 0x0EAE); // basic consonants (including holes for corresponding Thai characters)
|
||||
fBeginWordSet.add(0x0EDC, 0x0EDD); // digraph consonants (no Thai equivalent)
|
||||
|
@ -474,7 +469,6 @@ LaoBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
|
||||
|
@ -643,13 +637,14 @@ BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErro
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
|
||||
fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
|
||||
fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(fEndWordSet);
|
||||
setCharacters(fBurmeseWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = fBurmeseWordSet;
|
||||
fBeginWordSet.add(0x1000, 0x102A); // basic consonants and independent vowels
|
||||
|
||||
// Compact for caching.
|
||||
fMarkSet.compact();
|
||||
|
@ -667,7 +662,6 @@ BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode& status ) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
|
||||
|
@ -836,13 +830,13 @@ KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCod
|
|||
{
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
|
||||
UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
|
||||
fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
setCharacters(khmerWordSet);
|
||||
setCharacters(fKhmerWordSet);
|
||||
}
|
||||
fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
|
||||
fMarkSet.add(0x0020);
|
||||
fEndWordSet = khmerWordSet;
|
||||
fEndWordSet = fKhmerWordSet;
|
||||
fBeginWordSet.add(0x1780, 0x17B3);
|
||||
//fBeginWordSet.add(0x17A3, 0x17A4); // deprecated vowels
|
||||
//fEndWordSet.remove(0x17A5, 0x17A9); // Khmer independent vowels that can't end a word
|
||||
|
@ -873,7 +867,6 @@ KhmerBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode& status ) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
|
||||
|
@ -1057,27 +1050,25 @@ CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType
|
|||
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
|
||||
UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
|
||||
UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
|
||||
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
|
||||
// Korean dictionary only includes Hangul syllables
|
||||
fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
|
||||
fHangulWordSet.compact();
|
||||
// Digits, open puncutation and Alphabetic characters.
|
||||
fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
|
||||
UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
|
||||
fDigitOrOpenPunctuationOrAlphabetSet.compact();
|
||||
fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
|
||||
fClosePunctuationSet.compact();
|
||||
fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
|
||||
fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
|
||||
fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
|
||||
fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
|
||||
nfkcNorm2 = Normalizer2::getNFKCInstance(status);
|
||||
|
||||
// handle Korean and Japanese/Chinese using different dictionaries
|
||||
if (type == kKorean) {
|
||||
if (U_SUCCESS(status)) {
|
||||
if (U_SUCCESS(status)) {
|
||||
// handle Korean and Japanese/Chinese using different dictionaries
|
||||
if (type == kKorean) {
|
||||
setCharacters(fHangulWordSet);
|
||||
}
|
||||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
|
||||
if (U_SUCCESS(status)) {
|
||||
} else { //Chinese and Japanese
|
||||
UnicodeSet cjSet;
|
||||
cjSet.addAll(fHanWordSet);
|
||||
cjSet.addAll(fKatakanaWordSet);
|
||||
cjSet.addAll(fHiraganaWordSet);
|
||||
cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
|
||||
setCharacters(cjSet);
|
||||
initJapanesePhraseParameter(status);
|
||||
}
|
||||
}
|
||||
UTRACE_EXIT_STATUS(status);
|
||||
|
@ -1105,12 +1096,14 @@ static inline bool isKatakana(UChar32 value) {
|
|||
(value >= 0xFF66 && value <= 0xFF9f);
|
||||
}
|
||||
|
||||
|
||||
// Function for accessing internal utext flags.
|
||||
// Replicates an internal UText function.
|
||||
|
||||
static inline int32_t utext_i32_flag(int32_t bitIndex) {
|
||||
return (int32_t)1 << bitIndex;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* @param text A UText representing the text
|
||||
|
@ -1124,7 +1117,6 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
if (rangeStart >= rangeEnd) {
|
||||
|
@ -1355,31 +1347,6 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
|
||||
t_boundary.addElement(numCodePts, status);
|
||||
numBreaks++;
|
||||
} else if (isPhraseBreaking) {
|
||||
t_boundary.addElement(numCodePts, status);
|
||||
if(U_SUCCESS(status)) {
|
||||
numBreaks++;
|
||||
int32_t prevIdx = numCodePts;
|
||||
|
||||
int32_t codeUnitIdx = -1;
|
||||
int32_t prevCodeUnitIdx = -1;
|
||||
int32_t length = -1;
|
||||
for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
|
||||
codeUnitIdx = inString.moveIndex32(0, i);
|
||||
prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
|
||||
// Calculate the length by using the code unit.
|
||||
length = prevCodeUnitIdx - codeUnitIdx;
|
||||
prevIdx = i;
|
||||
// Keep the breakpoint if the pattern is not in the fSkipSet and continuous Katakana
|
||||
// characters don't occur.
|
||||
if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
|
||||
&& (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
|
||||
|| !isKatakana(inString.char32At(codeUnitIdx)))) {
|
||||
t_boundary.addElement(i, status);
|
||||
numBreaks++;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
|
||||
t_boundary.addElement(i, status);
|
||||
|
@ -1400,8 +1367,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
// while reversing t_boundary and pushing values to foundBreaks.
|
||||
int32_t prevCPPos = -1;
|
||||
int32_t prevUTextPos = -1;
|
||||
int32_t correctedNumBreaks = 0;
|
||||
for (int32_t i = numBreaks - 1; i >= 0; i--) {
|
||||
for (int32_t i = numBreaks-1; i >= 0; i--) {
|
||||
int32_t cpPos = t_boundary.elementAti(i);
|
||||
U_ASSERT(cpPos > prevCPPos);
|
||||
int32_t utextPos = inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
|
||||
|
@ -1409,15 +1375,7 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
if (utextPos > prevUTextPos) {
|
||||
// Boundaries are added to foundBreaks output in ascending order.
|
||||
U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
|
||||
// In phrase breaking, there has to be a breakpoint between Cj character and close
|
||||
// punctuation.
|
||||
// E.g.[携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
|
||||
if (utextPos != rangeStart
|
||||
|| (isPhraseBreaking && utextPos > 0
|
||||
&& fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
|
||||
foundBreaks.push(utextPos, status);
|
||||
correctedNumBreaks++;
|
||||
}
|
||||
foundBreaks.push(utextPos, status);
|
||||
} else {
|
||||
// Normalization expanded the input text, the dictionary found a boundary
|
||||
// within the expansion, giving two boundaries with the same index in the
|
||||
|
@ -1429,52 +1387,9 @@ CjkBreakEngine::divideUpDictionaryRange( UText *inText,
|
|||
}
|
||||
(void)prevCPPos; // suppress compiler warnings about unused variable
|
||||
|
||||
UChar32 nextChar = utext_char32At(inText, rangeEnd);
|
||||
if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
|
||||
// In phrase breaking, there has to be a breakpoint between Cj character and
|
||||
// the number/open punctuation.
|
||||
// E.g. る文字「そうだ、京都」->る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and「
|
||||
// E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
|
||||
// E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode!-> breakpoint between が and U
|
||||
if (isPhraseBreaking) {
|
||||
if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
|
||||
foundBreaks.popi();
|
||||
correctedNumBreaks--;
|
||||
}
|
||||
} else {
|
||||
foundBreaks.popi();
|
||||
correctedNumBreaks--;
|
||||
}
|
||||
}
|
||||
|
||||
// inString goes out of scope
|
||||
// inputMap goes out of scope
|
||||
return correctedNumBreaks;
|
||||
}
|
||||
|
||||
void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
|
||||
loadJapaneseExtensions(error);
|
||||
loadHiragana(error);
|
||||
}
|
||||
|
||||
void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
|
||||
const char* tag = "extensions";
|
||||
ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
|
||||
if (U_SUCCESS(error)) {
|
||||
ResourceBundle bundle = ja.get(tag, error);
|
||||
while (U_SUCCESS(error) && bundle.hasNext()) {
|
||||
fSkipSet.puti(bundle.getNextString(error), 1, error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void CjkBreakEngine::loadHiragana(UErrorCode& error) {
|
||||
UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
|
||||
hiraganaWordSet.compact();
|
||||
UnicodeSetIterator iterator(hiraganaWordSet);
|
||||
while (iterator.next()) {
|
||||
fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
|
||||
}
|
||||
return numBreaks;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -15,7 +15,6 @@
|
|||
#include "unicode/utext.h"
|
||||
|
||||
#include "brkeng.h"
|
||||
#include "hash.h"
|
||||
#include "uvectr32.h"
|
||||
|
||||
U_NAMESPACE_BEGIN
|
||||
|
@ -81,7 +80,6 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status ) const override;
|
||||
|
||||
protected:
|
||||
|
@ -107,7 +105,6 @@ class DictionaryBreakEngine : public LanguageBreakEngine {
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const = 0;
|
||||
|
||||
};
|
||||
|
@ -130,6 +127,7 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fThaiWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fSuffixSet;
|
||||
|
@ -166,7 +164,6 @@ class ThaiBreakEngine : public DictionaryBreakEngine {
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
@ -189,6 +186,7 @@ class LaoBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fLaoWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
|
@ -224,7 +222,6 @@ class LaoBreakEngine : public DictionaryBreakEngine {
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
@ -247,6 +244,7 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fBurmeseWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
|
@ -282,7 +280,6 @@ class BurmeseBreakEngine : public DictionaryBreakEngine {
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
@ -305,6 +302,7 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
|
||||
UnicodeSet fKhmerWordSet;
|
||||
UnicodeSet fEndWordSet;
|
||||
UnicodeSet fBeginWordSet;
|
||||
UnicodeSet fMarkSet;
|
||||
|
@ -340,7 +338,6 @@ class KhmerBreakEngine : public DictionaryBreakEngine {
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
@ -369,22 +366,13 @@ class CjkBreakEngine : public DictionaryBreakEngine {
|
|||
* @internal
|
||||
*/
|
||||
UnicodeSet fHangulWordSet;
|
||||
UnicodeSet fDigitOrOpenPunctuationOrAlphabetSet;
|
||||
UnicodeSet fClosePunctuationSet;
|
||||
UnicodeSet fHanWordSet;
|
||||
UnicodeSet fKatakanaWordSet;
|
||||
UnicodeSet fHiraganaWordSet;
|
||||
|
||||
DictionaryMatcher *fDictionary;
|
||||
const Normalizer2 *nfkcNorm2;
|
||||
|
||||
private:
|
||||
// Load Japanese extensions.
|
||||
void loadJapaneseExtensions(UErrorCode& error);
|
||||
// Load Japanese Hiragana.
|
||||
void loadHiragana(UErrorCode& error);
|
||||
// Initialize fSkipSet by loading Japanese Hiragana and extensions.
|
||||
void initJapanesePhraseParameter(UErrorCode& error);
|
||||
|
||||
Hashtable fSkipSet;
|
||||
|
||||
public:
|
||||
|
||||
/**
|
||||
|
@ -416,7 +404,6 @@ class CjkBreakEngine : public DictionaryBreakEngine {
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
|
||||
};
|
||||
|
|
|
@ -168,9 +168,12 @@ void LocaleMatcher::Builder::clearSupportedLocales() {
|
|||
bool LocaleMatcher::Builder::ensureSupportedLocaleVector() {
|
||||
if (U_FAILURE(errorCode_)) { return false; }
|
||||
if (supportedLocales_ != nullptr) { return true; }
|
||||
LocalPointer<UVector> lpSupportedLocales(new UVector(uprv_deleteUObject, nullptr, errorCode_), errorCode_);
|
||||
supportedLocales_ = new UVector(uprv_deleteUObject, nullptr, errorCode_);
|
||||
if (U_FAILURE(errorCode_)) { return false; }
|
||||
supportedLocales_ = lpSupportedLocales.orphan();
|
||||
if (supportedLocales_ == nullptr) {
|
||||
errorCode_ = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -184,8 +187,9 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin
|
|||
for (int32_t i = 0; i < length; ++i) {
|
||||
Locale *locale = list.orphanLocaleAt(i);
|
||||
if (locale == nullptr) { continue; }
|
||||
supportedLocales_->adoptElement(locale, errorCode_);
|
||||
supportedLocales_->addElementX(locale, errorCode_);
|
||||
if (U_FAILURE(errorCode_)) {
|
||||
delete locale;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -193,21 +197,35 @@ LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocalesFromListStrin
|
|||
}
|
||||
|
||||
LocaleMatcher::Builder &LocaleMatcher::Builder::setSupportedLocales(Locale::Iterator &locales) {
|
||||
if (ensureSupportedLocaleVector()) {
|
||||
clearSupportedLocales();
|
||||
while (locales.hasNext() && U_SUCCESS(errorCode_)) {
|
||||
const Locale &locale = locales.next();
|
||||
LocalPointer<Locale> clone (locale.clone(), errorCode_);
|
||||
supportedLocales_->adoptElement(clone.orphan(), errorCode_);
|
||||
if (U_FAILURE(errorCode_)) { return *this; }
|
||||
clearSupportedLocales();
|
||||
if (!ensureSupportedLocaleVector()) { return *this; }
|
||||
while (locales.hasNext()) {
|
||||
const Locale &locale = locales.next();
|
||||
Locale *clone = locale.clone();
|
||||
if (clone == nullptr) {
|
||||
errorCode_ = U_MEMORY_ALLOCATION_ERROR;
|
||||
break;
|
||||
}
|
||||
supportedLocales_->addElementX(clone, errorCode_);
|
||||
if (U_FAILURE(errorCode_)) {
|
||||
delete clone;
|
||||
break;
|
||||
}
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
LocaleMatcher::Builder &LocaleMatcher::Builder::addSupportedLocale(const Locale &locale) {
|
||||
if (ensureSupportedLocaleVector()) {
|
||||
LocalPointer<Locale> clone(locale.clone(), errorCode_);
|
||||
supportedLocales_->adoptElement(clone.orphan(), errorCode_);
|
||||
if (!ensureSupportedLocaleVector()) { return *this; }
|
||||
Locale *clone = locale.clone();
|
||||
if (clone == nullptr) {
|
||||
errorCode_ = U_MEMORY_ALLOCATION_ERROR;
|
||||
return *this;
|
||||
}
|
||||
supportedLocales_->addElementX(clone, errorCode_);
|
||||
if (U_FAILURE(errorCode_)) {
|
||||
delete clone;
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
|
|
@ -1204,11 +1204,14 @@ AliasReplacer::parseLanguageReplacement(
|
|||
// We have multiple field so we have to allocate and parse
|
||||
CharString* str = new CharString(
|
||||
replacement, (int32_t)uprv_strlen(replacement), status);
|
||||
LocalPointer<CharString> lpStr(str, status);
|
||||
toBeFreed.adoptElement(lpStr.orphan(), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
if (str == nullptr) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
toBeFreed.addElementX(str, status);
|
||||
char* data = str->data();
|
||||
replacedLanguage = (const char*) data;
|
||||
char* endOfField = uprv_strchr(data, '_');
|
||||
|
@ -1417,9 +1420,12 @@ AliasReplacer::replaceTerritory(UVector& toBeFreed, UErrorCode& status)
|
|||
(int32_t)(firstSpace - replacement), status), status);
|
||||
}
|
||||
if (U_FAILURE(status)) { return false; }
|
||||
if (item.isNull()) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return false;
|
||||
}
|
||||
replacedRegion = item->data();
|
||||
toBeFreed.adoptElement(item.orphan(), status);
|
||||
if (U_FAILURE(status)) { return false; }
|
||||
toBeFreed.addElementX(item.orphan(), status);
|
||||
}
|
||||
U_ASSERT(!same(region, replacedRegion));
|
||||
region = replacedRegion;
|
||||
|
@ -1653,10 +1659,10 @@ AliasReplacer::replace(const Locale& locale, CharString& out, UErrorCode& status
|
|||
while ((end = uprv_strchr(start, SEP_CHAR)) != nullptr &&
|
||||
U_SUCCESS(status)) {
|
||||
*end = NULL_CHAR; // null terminate inside variantsBuff
|
||||
variants.addElement(start, status);
|
||||
variants.addElementX(start, status);
|
||||
start = end + 1;
|
||||
}
|
||||
variants.addElement(start, status);
|
||||
variants.addElementX(start, status);
|
||||
}
|
||||
if (U_FAILURE(status)) { return false; }
|
||||
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
// © 2021 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html
|
||||
|
||||
#include <complex>
|
||||
#include <utility>
|
||||
#include <ctgmath>
|
||||
|
||||
#include "unicode/utypes.h"
|
||||
|
||||
|
@ -639,7 +639,6 @@ LSTMBreakEngine::divideUpDictionaryRange( UText *text,
|
|||
int32_t startPos,
|
||||
int32_t endPos,
|
||||
UVector32 &foundBreaks,
|
||||
UBool /* isPhraseBreaking */,
|
||||
UErrorCode& status) const {
|
||||
if (U_FAILURE(status)) return 0;
|
||||
int32_t beginFoundBreakSize = foundBreaks.size();
|
||||
|
|
|
@ -62,7 +62,6 @@ protected:
|
|||
int32_t rangeStart,
|
||||
int32_t rangeEnd,
|
||||
UVector32 &foundBreaks,
|
||||
UBool isPhraseBreaking,
|
||||
UErrorCode& status) const override;
|
||||
private:
|
||||
const LSTMData* fData;
|
||||
|
|
|
@ -2496,18 +2496,15 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode
|
|||
// origin is not the first character, or it is U+0000.
|
||||
UnicodeSet *set;
|
||||
if((canonValue&CANON_HAS_SET)==0) {
|
||||
LocalPointer<UnicodeSet> lpSet(new UnicodeSet, errorCode);
|
||||
set=lpSet.getAlias();
|
||||
if(U_FAILURE(errorCode)) {
|
||||
set=new UnicodeSet;
|
||||
if(set==NULL) {
|
||||
errorCode=U_MEMORY_ALLOCATION_ERROR;
|
||||
return;
|
||||
}
|
||||
UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
|
||||
canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
|
||||
umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
|
||||
canonStartSets.adoptElement(lpSet.orphan(), errorCode);
|
||||
if (U_FAILURE(errorCode)) {
|
||||
return;
|
||||
}
|
||||
canonStartSets.addElementX(set, errorCode);
|
||||
if(firstOrigin!=0) {
|
||||
set->add(firstOrigin);
|
||||
}
|
||||
|
|
|
@ -82,19 +82,6 @@ RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode
|
|||
}
|
||||
}
|
||||
|
||||
//-------------------------------------------------------------------------------
|
||||
//
|
||||
// Constructor from a UDataMemory handle to precompiled break rules
|
||||
// stored in an ICU data file. This construcotr is private API,
|
||||
// only for internal use.
|
||||
//
|
||||
//-------------------------------------------------------------------------------
|
||||
RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UBool isPhraseBreaking,
|
||||
UErrorCode &status) : RuleBasedBreakIterator(udm, status)
|
||||
{
|
||||
fIsPhraseBreaking = isPhraseBreaking;
|
||||
}
|
||||
|
||||
//
|
||||
// Construct from precompiled binary rules (tables). This constructor is public API,
|
||||
// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules().
|
||||
|
@ -335,7 +322,6 @@ void RuleBasedBreakIterator::init(UErrorCode &status) {
|
|||
fBreakCache = nullptr;
|
||||
fDictionaryCache = nullptr;
|
||||
fLookAheadMatches = nullptr;
|
||||
fIsPhraseBreaking = false;
|
||||
|
||||
// Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER.
|
||||
// fText = UTEXT_INITIALIZER;
|
||||
|
|
|
@ -163,7 +163,7 @@ void RuleBasedBreakIterator::DictionaryCache::populateDictionary(int32_t startPo
|
|||
// Ask the language object if there are any breaks. It will add them to the cache and
|
||||
// leave the text pointer on the other side of its range, ready to search for the next one.
|
||||
if (lbe != NULL) {
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, fBI->fIsPhraseBreaking, status);
|
||||
foundBreakCount += lbe->findBreaks(text, rangeStart, rangeEnd, fBreaks, status);
|
||||
}
|
||||
|
||||
// Reload the loop variables for the next go-round
|
||||
|
|
|
@ -625,7 +625,10 @@ ICUService::getVisibleIDs(UVector& result, const UnicodeString* matchID, UErrorC
|
|||
}
|
||||
}
|
||||
|
||||
LocalPointer<UnicodeString> idClone(id->clone(), status);
|
||||
LocalPointer<UnicodeString> idClone(new UnicodeString(*id), status);
|
||||
if (U_SUCCESS(status) && idClone->isBogus()) {
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
}
|
||||
result.adoptElement(idClone.orphan(), status);
|
||||
}
|
||||
delete fallbackKey;
|
||||
|
|
|
@ -179,8 +179,7 @@ private:
|
|||
|
||||
length = other._ids.size();
|
||||
for(i = 0; i < length; ++i) {
|
||||
LocalPointer<UnicodeString> clonedId(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
|
||||
_ids.adoptElement(clonedId.orphan(), status);
|
||||
_ids.addElementX(((UnicodeString *)other._ids.elementAt(i))->clone(), status);
|
||||
}
|
||||
|
||||
if(U_SUCCESS(status)) {
|
||||
|
|
|
@ -49,11 +49,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
|
|||
if (acceptsListener(*l)) {
|
||||
Mutex lmx(¬ifyLock);
|
||||
if (listeners == NULL) {
|
||||
LocalPointer<UVector> lpListeners(new UVector(5, status), status);
|
||||
if (U_FAILURE(status)) {
|
||||
return;
|
||||
}
|
||||
listeners = lpListeners.orphan();
|
||||
listeners = new UVector(5, status);
|
||||
} else {
|
||||
for (int i = 0, e = listeners->size(); i < e; ++i) {
|
||||
const EventListener* el = (const EventListener*)(listeners->elementAt(i));
|
||||
|
@ -63,7 +59,7 @@ ICUNotifier::addListener(const EventListener* l, UErrorCode& status)
|
|||
}
|
||||
}
|
||||
|
||||
listeners->addElement((void*)l, status); // cast away const
|
||||
listeners->addElementX((void*)l, status); // cast away const
|
||||
}
|
||||
#ifdef NOTIFIER_DEBUG
|
||||
else {
|
||||
|
@ -106,11 +102,13 @@ ICUNotifier::removeListener(const EventListener *l, UErrorCode& status)
|
|||
void
|
||||
ICUNotifier::notifyChanged(void)
|
||||
{
|
||||
Mutex lmx(¬ifyLock);
|
||||
if (listeners != NULL) {
|
||||
for (int i = 0, e = listeners->size(); i < e; ++i) {
|
||||
EventListener* el = (EventListener*)listeners->elementAt(i);
|
||||
notifyListener(*el);
|
||||
Mutex lmx(¬ifyLock);
|
||||
if (listeners != NULL) {
|
||||
for (int i = 0, e = listeners->size(); i < e; ++i) {
|
||||
EventListener* el = (EventListener*)listeners->elementAt(i);
|
||||
notifyListener(*el);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -168,7 +168,7 @@ ubrk_safeClone(
|
|||
BreakIterator *newBI = ((BreakIterator *)bi)->clone();
|
||||
if (newBI == NULL) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
} else if (pBufferSize != NULL) {
|
||||
} else {
|
||||
*status = U_SAFECLONE_ALLOCATED_WARNING;
|
||||
}
|
||||
return (UBreakIterator *)newBI;
|
||||
|
@ -176,7 +176,15 @@ ubrk_safeClone(
|
|||
|
||||
U_CAPI UBreakIterator * U_EXPORT2
|
||||
ubrk_clone(const UBreakIterator *bi, UErrorCode *status) {
|
||||
return ubrk_safeClone(bi, nullptr, nullptr, status);
|
||||
if (U_FAILURE(*status)) {
|
||||
return nullptr;
|
||||
}
|
||||
BreakIterator *newBI = ((BreakIterator *)bi)->clone();
|
||||
if (newBI == nullptr) {
|
||||
*status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return nullptr;
|
||||
}
|
||||
return (UBreakIterator *)newBI;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -22,14 +22,27 @@
|
|||
#include "unicode/utypes.h"
|
||||
#include "unicode/unistr.h"
|
||||
#include "unicode/uset.h"
|
||||
#include "unicode/udata.h" /* UDataInfo */
|
||||
#include "unicode/utf16.h"
|
||||
#include "cmemory.h"
|
||||
#include "uassert.h"
|
||||
#include "ucase.h"
|
||||
#include "ucmndata.h" /* DataHeader */
|
||||
#include "udatamem.h"
|
||||
#include "umutex.h"
|
||||
#include "uassert.h"
|
||||
#include "cmemory.h"
|
||||
#include "utrie2.h"
|
||||
#include "ucase.h"
|
||||
|
||||
/* ucase_props_data.h is machine-generated by genprops/casepropsbuilder.cpp */
|
||||
struct UCaseProps {
|
||||
UDataMemory *mem;
|
||||
const int32_t *indexes;
|
||||
const uint16_t *exceptions;
|
||||
const uint16_t *unfold;
|
||||
|
||||
UTrie2 trie;
|
||||
uint8_t formatVersion[4];
|
||||
};
|
||||
|
||||
/* ucase_props_data.h is machine-generated by gencase --csource */
|
||||
#define INCLUDED_FROM_UCASE_CPP
|
||||
#include "ucase_props_data.h"
|
||||
|
||||
|
@ -64,13 +77,6 @@ ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
|
|||
|
||||
/* data access primitives --------------------------------------------------- */
|
||||
|
||||
U_CAPI const struct UCaseProps * U_EXPORT2
|
||||
ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength) {
|
||||
*pExceptionsLength = UPRV_LENGTHOF(ucase_props_exceptions);
|
||||
*pUnfoldLength = UPRV_LENGTHOF(ucase_props_unfold);
|
||||
return &ucase_props_singleton;
|
||||
}
|
||||
|
||||
U_CFUNC const UTrie2 * U_EXPORT2
|
||||
ucase_getTrie() {
|
||||
return &ucase_props_singleton.trie;
|
||||
|
@ -684,7 +690,7 @@ ucase_isCaseSensitive(UChar32 c) {
|
|||
* - The general category of C is
|
||||
* Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
|
||||
* Letter Modifier (Lm), or Symbol Modifier (Sk)
|
||||
* - C is one of the following characters
|
||||
* - C is one of the following characters
|
||||
* U+0027 APOSTROPHE
|
||||
* U+00AD SOFT HYPHEN (SHY)
|
||||
* U+2019 RIGHT SINGLE QUOTATION MARK
|
||||
|
@ -1058,8 +1064,6 @@ ucase_toFullLower(UChar32 c,
|
|||
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
|
||||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
// Reset the output pointer in case it was uninitialized.
|
||||
*pString=nullptr;
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
|
@ -1144,6 +1148,7 @@ ucase_toFullLower(UChar32 c,
|
|||
0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
|
||||
0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
|
||||
*/
|
||||
*pString=nullptr;
|
||||
return 0; /* remove the dot (continue without output) */
|
||||
} else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
|
||||
/*
|
||||
|
@ -1210,8 +1215,6 @@ toUpperOrTitle(UChar32 c,
|
|||
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
|
||||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
// Reset the output pointer in case it was uninitialized.
|
||||
*pString=nullptr;
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
|
||||
|
@ -1249,6 +1252,7 @@ toUpperOrTitle(UChar32 c,
|
|||
|
||||
0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
|
||||
*/
|
||||
*pString=nullptr;
|
||||
return 0; /* remove the dot (continue without output) */
|
||||
} else if(c==0x0587) {
|
||||
// See ICU-13416:
|
||||
|
@ -1445,8 +1449,6 @@ ucase_toFullFolding(UChar32 c,
|
|||
// The sign of the result has meaning, input must be non-negative so that it can be returned as is.
|
||||
U_ASSERT(c >= 0);
|
||||
UChar32 result=c;
|
||||
// Reset the output pointer in case it was uninitialized.
|
||||
*pString=nullptr;
|
||||
uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
|
||||
if(!UCASE_HAS_EXCEPTION(props)) {
|
||||
if(UCASE_IS_UPPER_OR_TITLE(props)) {
|
||||
|
@ -1540,7 +1542,7 @@ U_CAPI UChar32 U_EXPORT2
|
|||
u_tolower(UChar32 c) {
|
||||
return ucase_tolower(c);
|
||||
}
|
||||
|
||||
|
||||
/* Transforms the Unicode character to its upper case equivalent.*/
|
||||
U_CAPI UChar32 U_EXPORT2
|
||||
u_toupper(UChar32 c) {
|
||||
|
|
|
@ -312,21 +312,6 @@ UCaseMapFull(UChar32 c,
|
|||
|
||||
U_CDECL_END
|
||||
|
||||
/* for icuexportdata -------------------------------------------------------- */
|
||||
|
||||
struct UCaseProps {
|
||||
void *mem; // TODO: was unused, and type UDataMemory -- remove
|
||||
const int32_t *indexes;
|
||||
const uint16_t *exceptions;
|
||||
const uint16_t *unfold;
|
||||
|
||||
UTrie2 trie;
|
||||
uint8_t formatVersion[4];
|
||||
};
|
||||
|
||||
U_CAPI const struct UCaseProps * U_EXPORT2
|
||||
ucase_getSingleton(int32_t *pExceptionsLength, int32_t *pUnfoldLength);
|
||||
|
||||
/* file definitions --------------------------------------------------------- */
|
||||
|
||||
#define UCASE_DATA_NAME "ucase"
|
||||
|
|
|
@ -112,7 +112,8 @@ ucasemap_setLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) {
|
|||
if(length==sizeof(csm->locale)) {
|
||||
*pErrorCode=U_BUFFER_OVERFLOW_ERROR;
|
||||
}
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
if(U_SUCCESS(*pErrorCode)) {
|
||||
csm->caseLocale=UCASE_LOC_UNKNOWN;
|
||||
csm->caseLocale = ucase_getCaseLocale(csm->locale);
|
||||
} else {
|
||||
csm->locale[0]=0;
|
||||
|
@ -419,97 +420,6 @@ void toUpper(int32_t caseLocale, uint32_t options,
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
namespace {
|
||||
|
||||
constexpr uint8_t ACUTE_BYTE0 = u8"\u0301"[0];
|
||||
|
||||
constexpr uint8_t ACUTE_BYTE1 = u8"\u0301"[1];
|
||||
|
||||
/**
|
||||
* Input: c is a letter I with or without acute accent.
|
||||
* start is the index in src after c, and is less than segmentLimit.
|
||||
* If a plain i/I is followed by a plain j/J,
|
||||
* or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
|
||||
* then we output accordingly.
|
||||
*
|
||||
* @return the src index after the titlecased sequence, or the start index if no Dutch IJ
|
||||
*/
|
||||
int32_t maybeTitleDutchIJ(const uint8_t *src, UChar32 c, int32_t start, int32_t segmentLimit,
|
||||
ByteSink &sink, uint32_t options, icu::Edits *edits, UErrorCode &errorCode) {
|
||||
U_ASSERT(start < segmentLimit);
|
||||
|
||||
int32_t index = start;
|
||||
bool withAcute = false;
|
||||
|
||||
// If the conditions are met, then the following variables tell us what to output.
|
||||
int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
|
||||
bool doTitleJ = false; // true if the j needs to be titlecased
|
||||
int32_t unchanged2 = 0; // after the j (0 or 1)
|
||||
|
||||
// next character after the first letter
|
||||
UChar32 c2;
|
||||
c2 = src[index++];
|
||||
|
||||
// Is the first letter an i/I with accent?
|
||||
if (c == u'I') {
|
||||
if (c2 == ACUTE_BYTE0 && index < segmentLimit && src[index++] == ACUTE_BYTE1) {
|
||||
withAcute = true;
|
||||
unchanged1 = 2; // ACUTE is 2 code units in UTF-8
|
||||
if (index == segmentLimit) { return start; }
|
||||
c2 = src[index++];
|
||||
}
|
||||
} else { // Í
|
||||
withAcute = true;
|
||||
}
|
||||
|
||||
// Is the next character a j/J?
|
||||
if (c2 == u'j') {
|
||||
doTitleJ = true;
|
||||
} else if (c2 == u'J') {
|
||||
++unchanged1;
|
||||
} else {
|
||||
return start;
|
||||
}
|
||||
|
||||
// A plain i/I must be followed by a plain j/J.
|
||||
// An i/I with acute must be followed by a j/J with acute.
|
||||
if (withAcute) {
|
||||
if ((index + 1) >= segmentLimit || src[index++] != ACUTE_BYTE0 || src[index++] != ACUTE_BYTE1) {
|
||||
return start;
|
||||
}
|
||||
if (doTitleJ) {
|
||||
unchanged2 = 2; // ACUTE is 2 code units in UTF-8
|
||||
} else {
|
||||
unchanged1 = unchanged1 + 2; // ACUTE is 2 code units in UTF-8
|
||||
}
|
||||
}
|
||||
|
||||
// There must not be another combining mark.
|
||||
if (index < segmentLimit) {
|
||||
int32_t cp;
|
||||
int32_t i = index;
|
||||
U8_NEXT(src, i, segmentLimit, cp);
|
||||
uint32_t typeMask = U_GET_GC_MASK(cp);
|
||||
if ((typeMask & U_GC_M_MASK) != 0) {
|
||||
return start;
|
||||
}
|
||||
}
|
||||
|
||||
// Output the rest of the Dutch IJ.
|
||||
ByteSinkUtil::appendUnchanged(src + start, unchanged1, sink, options, edits, errorCode);
|
||||
start += unchanged1;
|
||||
if (doTitleJ) {
|
||||
ByteSinkUtil::appendCodePoint(1, u'J', sink, edits);
|
||||
++start;
|
||||
}
|
||||
ByteSinkUtil::appendUnchanged(src + start, unchanged2, sink, options, edits, errorCode);
|
||||
|
||||
U_ASSERT(start + unchanged2 == index);
|
||||
return index;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_CFUNC void U_CALLCONV
|
||||
ucasemap_internalUTF8ToTitle(
|
||||
int32_t caseLocale, uint32_t options, BreakIterator *iter,
|
||||
|
@@ -594,14 +504,19 @@ ucasemap_internalUTF8ToTitle(
}

/* Special case Dutch IJ titlecasing */
if (titleLimit < index &&
caseLocale == UCASE_LOC_DUTCH) {
if (c < 0) {
c = ~c;
}

if (c == u'I' || c == u'Í') {
titleLimit = maybeTitleDutchIJ(src, c, titleLimit, index, sink, options, edits, errorCode);
if (titleStart+1 < index &&
caseLocale == UCASE_LOC_DUTCH &&
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
if (src[titleStart+1] == 0x006A) {
ByteSinkUtil::appendCodePoint(1, 0x004A, sink, edits);
titleLimit++;
} else if (src[titleStart+1] == 0x004A) {
// Keep the capital J from getting lowercased.
if (!ByteSinkUtil::appendUnchanged(src+titleStart+1, 1,
sink, options, edits, errorCode)) {
return;
}
titleLimit++;
}
}
@@ -252,10 +252,7 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
UTRACE_EXIT_STATUS(*status);
return NULL;
}
// If pBufferSize was NULL as the input, pBufferSize is set to &stackBufferSize in this function.
if (pBufferSize != &stackBufferSize) {
*status = U_SAFECLONE_ALLOCATED_WARNING;
}
*status = U_SAFECLONE_ALLOCATED_WARNING;

/* record the fact that memory was allocated */
*pBufferSize = bufferSizeNeeded;
@@ -320,11 +317,7 @@ ucnv_safeClone(const UConverter* cnv, void *stackBuffer, int32_t *pBufferSize, U
return localConverter;
}

U_CAPI UConverter* U_EXPORT2
ucnv_clone(const UConverter* cnv, UErrorCode *status)
{
return ucnv_safeClone(cnv, nullptr, nullptr, status);
}


/*Decreases the reference counter in the shared immutable section of the object
*and frees the mutable part*/
@@ -254,7 +254,7 @@ currSymbolsEquiv_cleanup(void)
}

/**
* Deleter for IsoCodeEntry
* Deleter for OlsonToMetaMappingEntry
*/
static void U_CALLCONV
deleteIsoCodeEntry(void *obj) {
@@ -186,10 +186,10 @@ NULL
};

static const char* const DEPRECATED_LANGUAGES[]={
"in", "iw", "ji", "jw", "mo", NULL, NULL
"in", "iw", "ji", "jw", NULL, NULL
};
static const char* const REPLACEMENT_LANGUAGES[]={
"id", "he", "yi", "jv", "ro", NULL, NULL
"id", "he", "yi", "jv", NULL, NULL
};

/**
@@ -444,7 +444,7 @@ static const char * const COUNTRIES_3[] = {
/* "VA", "VC", "VE", "VG", "VI", "VN", "VU", "WF", */
"VAT", "VCT", "VEN", "VGB", "VIR", "VNM", "VUT", "WLF",
/* "WS", "XK", "YE", "YT", "ZA", "ZM", "ZW", */
"WSM", "XKK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
"WSM", "XXK", "YEM", "MYT", "ZAF", "ZMB", "ZWE",
NULL,
/* "AN", "BU", "CS", "FX", "RO", "SU", "TP", "YD", "YU", "ZR" */
"ANT", "BUR", "SCG", "FXX", "ROM", "SUN", "TMP", "YMD", "YUG", "ZAR",
@@ -461,13 +461,13 @@ public:
* Option for whether to include or ignore one-way (fallback) match data.
* By default, they are included.
*
* @param matchDirection the match direction to set.
* @param direction the match direction to set.
* @return this Builder object
* @stable ICU 67
*/
Builder &setDirection(ULocMatchDirection matchDirection) {
Builder &setDirection(ULocMatchDirection direction) {
if (U_SUCCESS(errorCode_)) {
direction_ = matchDirection;
direction_ = direction;
}
return *this;
}
@@ -147,11 +147,6 @@ private:
*/
int32_t *fLookAheadMatches;

/**
* A flag to indicate if phrase based breaking is enabled.
*/
UBool fIsPhraseBreaking;

//=======================================================================
// constructors
//=======================================================================
@@ -168,21 +163,6 @@ private:
*/
RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);

/**
* This constructor uses the udata interface to create a BreakIterator
* whose internal tables live in a memory-mapped file. "image" is an
* ICU UDataMemory handle for the pre-compiled break iterator tables.
* @param image handle to the memory image for the break iterator data.
* Ownership of the UDataMemory handle passes to the Break Iterator,
* which will be responsible for closing it when it is no longer needed.
* @param status Information on any errors encountered.
* @param isPhraseBreaking true if phrase based breaking is required, otherwise false.
* @see udata_open
* @see #getBinaryRules
* @internal (private)
*/
RuleBasedBreakIterator(UDataMemory* image, UBool isPhraseBreaking, UErrorCode &status);

/** @internal */
friend class RBBIRuleBuilder;
/** @internal */
@@ -312,12 +312,11 @@ ubrk_openBinaryRules(const uint8_t *binaryRules, int32_t rulesLength,
* If *pBufferSize is not enough for a stack-based safe clone,
* new memory will be allocated.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used
* if pBufferSize != NULL and any allocations were necessary
* An informational status value, U_SAFECLONE_ALLOCATED_ERROR, is used if any allocations were necessary.
* @return pointer to the new clone
* @deprecated ICU 69 Use ubrk_clone() instead.
*/
U_DEPRECATED UBreakIterator * U_EXPORT2
U_CAPI UBreakIterator * U_EXPORT2
ubrk_safeClone(
const UBreakIterator *bi,
void *stackBuffer,
@@ -326,17 +325,21 @@ ubrk_safeClone(

#endif /* U_HIDE_DEPRECATED_API */

#ifndef U_HIDE_DRAFT_API

/**
* Thread safe cloning operation.
* @param bi iterator to be cloned
* @param status to indicate whether the operation went on smoothly or there were errors
* @return pointer to the new clone
* @stable ICU 69
* @draft ICU 69
*/
U_CAPI UBreakIterator * U_EXPORT2
ubrk_clone(const UBreakIterator *bi,
UErrorCode *status);

#endif // U_HIDE_DRAFT_API

#ifndef U_HIDE_DEPRECATED_API

/**
@@ -477,7 +477,7 @@ ucnv_openCCSID(int32_t codepage,
*
* <p>The name will NOT be looked up in the alias mechanism, nor will the converter be
* stored in the converter cache or the alias table. The only way to open further converters
* is call this function multiple times, or use the ucnv_clone() function to clone a
* is call this function multiple times, or use the ucnv_safeClone() function to clone a
* 'primary' converter.</p>
*
* <p>A future version of ICU may add alias table lookups and/or caching
@@ -493,27 +493,13 @@ ucnv_openCCSID(int32_t codepage,
* @return the created Unicode converter object, or <TT>NULL</TT> if an error occurred
* @see udata_open
* @see ucnv_open
* @see ucnv_clone
* @see ucnv_safeClone
* @see ucnv_close
* @stable ICU 2.2
*/
U_CAPI UConverter* U_EXPORT2
ucnv_openPackage(const char *packageName, const char *converterName, UErrorCode *err);

/**
* Thread safe converter cloning operation.
*
* You must ucnv_close() the clone.
*
* @param cnv converter to be cloned
* @param status to indicate whether the operation went on smoothly or there were errors
* @return pointer to the new clone
* @stable ICU 71
*/
U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *status);

#ifndef U_HIDE_DEPRECATED_API

/**
* Thread safe converter cloning operation.
* For most efficient operation, pass in a stackBuffer (and a *pBufferSize)
@@ -546,19 +532,21 @@ U_CAPI UConverter* U_EXPORT2 ucnv_clone(const UConverter *cnv, UErrorCode *statu
* pointer to size of allocated space.
* @param status to indicate whether the operation went on smoothly or there were errors
* An informational status value, U_SAFECLONE_ALLOCATED_WARNING,
* is used if pBufferSize != NULL and any allocations were necessary
* is used if any allocations were necessary.
* However, it is better to check if *pBufferSize grew for checking for
* allocations because warning codes can be overridden by subsequent
* function calls.
* @return pointer to the new clone
* @deprecated ICU 71 Use ucnv_clone() instead.
* @stable ICU 2.0
*/
U_DEPRECATED UConverter * U_EXPORT2
U_CAPI UConverter * U_EXPORT2
ucnv_safeClone(const UConverter *cnv,
void *stackBuffer,
int32_t *pBufferSize,
UErrorCode *status);

#ifndef U_HIDE_DEPRECATED_API

/**
* \def U_CNV_SAFECLONE_BUFFERSIZE
* Definition of a buffer size that is designed to be large enough for
@@ -1229,6 +1229,7 @@ public:
*/
UnicodeSet& retain(UChar32 c);

#ifndef U_HIDE_DRAFT_API
/**
* Retains only the specified string from this set if it is present.
* Upon return this set will be empty if it did not contain s, or
@@ -1237,9 +1238,10 @@ public:
*
* @param s the source string
* @return this object, for chaining
* @stable ICU 69
* @draft ICU 69
*/
UnicodeSet& retain(const UnicodeString &s);
#endif // U_HIDE_DRAFT_API

/**
* Removes the specified range from this set if it is present.
@@ -567,7 +567,6 @@
#define ucase_addStringCaseClosure U_ICU_ENTRY_POINT_RENAME(ucase_addStringCaseClosure)
#define ucase_fold U_ICU_ENTRY_POINT_RENAME(ucase_fold)
#define ucase_getCaseLocale U_ICU_ENTRY_POINT_RENAME(ucase_getCaseLocale)
#define ucase_getSingleton U_ICU_ENTRY_POINT_RENAME(ucase_getSingleton)
#define ucase_getTrie U_ICU_ENTRY_POINT_RENAME(ucase_getTrie)
#define ucase_getType U_ICU_ENTRY_POINT_RENAME(ucase_getType)
#define ucase_getTypeOrIgnorable U_ICU_ENTRY_POINT_RENAME(ucase_getTypeOrIgnorable)
@@ -631,7 +630,6 @@
#define ucnv_cbFromUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbFromUWriteUChars)
#define ucnv_cbToUWriteSub U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteSub)
#define ucnv_cbToUWriteUChars U_ICU_ENTRY_POINT_RENAME(ucnv_cbToUWriteUChars)
#define ucnv_clone U_ICU_ENTRY_POINT_RENAME(ucnv_clone)
#define ucnv_close U_ICU_ENTRY_POINT_RENAME(ucnv_close)
#define ucnv_compareNames U_ICU_ENTRY_POINT_RENAME(ucnv_compareNames)
#define ucnv_convert U_ICU_ENTRY_POINT_RENAME(ucnv_convert)
@@ -727,7 +725,6 @@
#define ucnvsel_selectForString U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForString)
#define ucnvsel_selectForUTF8 U_ICU_ENTRY_POINT_RENAME(ucnvsel_selectForUTF8)
#define ucnvsel_serialize U_ICU_ENTRY_POINT_RENAME(ucnvsel_serialize)
#define ucol_clone U_ICU_ENTRY_POINT_RENAME(ucol_clone)
#define ucol_cloneBinary U_ICU_ENTRY_POINT_RENAME(ucol_cloneBinary)
#define ucol_close U_ICU_ENTRY_POINT_RENAME(ucol_close)
#define ucol_closeElements U_ICU_ENTRY_POINT_RENAME(ucol_closeElements)
@@ -907,7 +904,6 @@
#define udatpg_getBestPattern U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPattern)
#define udatpg_getBestPatternWithOptions U_ICU_ENTRY_POINT_RENAME(udatpg_getBestPatternWithOptions)
#define udatpg_getDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormat)
#define udatpg_getDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_getDateTimeFormatForStyle)
#define udatpg_getDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_getDecimal)
#define udatpg_getDefaultHourCycle U_ICU_ENTRY_POINT_RENAME(udatpg_getDefaultHourCycle)
#define udatpg_getFieldDisplayName U_ICU_ENTRY_POINT_RENAME(udatpg_getFieldDisplayName)
@@ -922,7 +918,6 @@
#define udatpg_setAppendItemFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemFormat)
#define udatpg_setAppendItemName U_ICU_ENTRY_POINT_RENAME(udatpg_setAppendItemName)
#define udatpg_setDateTimeFormat U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormat)
#define udatpg_setDateTimeFormatForStyle U_ICU_ENTRY_POINT_RENAME(udatpg_setDateTimeFormatForStyle)
#define udatpg_setDecimal U_ICU_ENTRY_POINT_RENAME(udatpg_setDecimal)
#define udict_swap U_ICU_ENTRY_POINT_RENAME(udict_swap)
#define udtitvfmt_close U_ICU_ENTRY_POINT_RENAME(udtitvfmt_close)
@ -628,6 +628,7 @@ uset_removeRange(USet* set, UChar32 start, UChar32 end);
|
|||
U_CAPI void U_EXPORT2
|
||||
uset_removeString(USet* set, const UChar* str, int32_t strLen);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Removes EACH of the characters in this string. Note: "ch" == {"c", "h"}
|
||||
* A frozen set will not be modified.
|
||||
|
@ -635,10 +636,11 @@ uset_removeString(USet* set, const UChar* str, int32_t strLen);
|
|||
* @param set the object to be modified
|
||||
* @param str the string
|
||||
* @param length the length of the string, or -1 if NUL-terminated
|
||||
* @stable ICU 69
|
||||
* @draft ICU 69
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_removeAllCodePoints(USet *set, const UChar *str, int32_t length);
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Removes from this set all of its elements that are contained in the
|
||||
|
@ -669,6 +671,7 @@ uset_removeAll(USet* set, const USet* removeSet);
|
|||
U_CAPI void U_EXPORT2
|
||||
uset_retain(USet* set, UChar32 start, UChar32 end);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Retains only the specified string from this set if it is present.
|
||||
* Upon return this set will be empty if it did not contain s, or
|
||||
|
@ -678,7 +681,7 @@ uset_retain(USet* set, UChar32 start, UChar32 end);
|
|||
* @param set the object to be modified
|
||||
* @param str the string
|
||||
* @param length the length of the string, or -1 if NUL-terminated
|
||||
* @stable ICU 69
|
||||
* @draft ICU 69
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_retainString(USet *set, const UChar *str, int32_t length);
|
||||
|
@ -690,10 +693,11 @@ uset_retainString(USet *set, const UChar *str, int32_t length);
|
|||
* @param set the object to be modified
|
||||
* @param str the string
|
||||
* @param length the length of the string, or -1 if NUL-terminated
|
||||
* @stable ICU 69
|
||||
* @draft ICU 69
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_retainAllCodePoints(USet *set, const UChar *str, int32_t length);
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Retains only the elements in this set that are contained in the
|
||||
|
@ -737,6 +741,7 @@ uset_compact(USet* set);
|
|||
U_CAPI void U_EXPORT2
|
||||
uset_complement(USet* set);
|
||||
|
||||
#ifndef U_HIDE_DRAFT_API
|
||||
/**
|
||||
* Complements the specified range in this set. Any character in
|
||||
* the range will be removed if it is in this set, or will be
|
||||
|
@ -748,7 +753,7 @@ uset_complement(USet* set);
|
|||
* @param set the object to be modified
|
||||
* @param start first character, inclusive, of range
|
||||
* @param end last character, inclusive, of range
|
||||
* @stable ICU 69
|
||||
* @draft ICU 69
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_complementRange(USet *set, UChar32 start, UChar32 end);
|
||||
|
@ -761,7 +766,7 @@ uset_complementRange(USet *set, UChar32 start, UChar32 end);
|
|||
* @param set the object to be modified
|
||||
* @param str the string
|
||||
* @param length the length of the string, or -1 if NUL-terminated
|
||||
* @stable ICU 69
|
||||
* @draft ICU 69
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_complementString(USet *set, const UChar *str, int32_t length);
|
||||
|
@ -773,10 +778,11 @@ uset_complementString(USet *set, const UChar *str, int32_t length);
|
|||
* @param set the object to be modified
|
||||
* @param str the string
|
||||
* @param length the length of the string, or -1 if NUL-terminated
|
||||
* @stable ICU 69
|
||||
* @draft ICU 69
|
||||
*/
|
||||
U_CAPI void U_EXPORT2
|
||||
uset_complementAllCodePoints(USet *set, const UChar *str, int32_t length);
|
||||
#endif // U_HIDE_DRAFT_API
|
||||
|
||||
/**
|
||||
* Complements in this set all elements contained in the specified
|
||||
|
|
|
@@ -60,7 +60,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.4
*/
#define U_ICU_VERSION_MAJOR_NUM 71
#define U_ICU_VERSION_MAJOR_NUM 70

/** The current ICU minor version as an integer.
* This value will change in the subsequent releases of ICU
@@ -86,7 +86,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.6
*/
#define U_ICU_VERSION_SUFFIX _71
#define U_ICU_VERSION_SUFFIX _70

/**
* \def U_DEF2_ICU_ENTRY_POINT_RENAME
@@ -139,7 +139,7 @@
* This value will change in the subsequent releases of ICU
* @stable ICU 2.4
*/
#define U_ICU_VERSION "71.1"
#define U_ICU_VERSION "70.1"

/**
* The current ICU library major version number as a string, for library name suffixes.
@@ -152,13 +152,13 @@
*
* @stable ICU 2.6
*/
#define U_ICU_VERSION_SHORT "71"
#define U_ICU_VERSION_SHORT "70"

#ifndef U_HIDE_INTERNAL_API
/** Data version in ICU4C.
* @internal ICU 4.4 Internal Use Only
**/
#define U_ICU_DATA_VERSION "71.1"
#define U_ICU_DATA_VERSION "70.1"
#endif /* U_HIDE_INTERNAL_API */

/*===========================================================================
@@ -334,8 +334,7 @@ Replaceable::clone() const {
// UnicodeString overrides clone() with a real implementation
UnicodeString *
UnicodeString::clone() const {
LocalPointer<UnicodeString> clonedString(new UnicodeString(*this));
return clonedString.isValid() && !clonedString->isBogus() ? clonedString.orphan() : nullptr;
return new UnicodeString(*this);
}

//========================================
@@ -1977,12 +1976,7 @@ The vector deleting destructor is already a part of UObject,
but defining it here makes sure that it is included with this object file.
This makes sure that static library dependencies are kept to a minimum.
*/
#if defined(__clang__) || U_GCC_MAJOR_MINOR >= 1100
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
static void uprv_UnicodeStringDummy(void) {
delete [] (new UnicodeString[2]);
}
#pragma GCC diagnostic pop
#endif
#endif
@@ -36,12 +36,6 @@
#include "ustr_imp.h"
#include "uassert.h"

/**
* Code point for COMBINING ACUTE ACCENT
* @internal
*/
#define ACUTE u'\u0301'

U_NAMESPACE_BEGIN

namespace {
@ -402,94 +396,6 @@ U_NAMESPACE_USE
|
|||
|
||||
#if !UCONFIG_NO_BREAK_ITERATION
|
||||
|
||||
namespace {
|
||||
|
||||
/**
|
||||
* Input: c is a letter I with or without acute accent.
|
||||
* start is the index in src after c, and is less than segmentLimit.
|
||||
* If a plain i/I is followed by a plain j/J,
|
||||
* or an i/I with acute (precomposed or decomposed) is followed by a j/J with acute,
|
||||
* then we output accordingly.
|
||||
*
|
||||
* @return the src index after the titlecased sequence, or the start index if no Dutch IJ
|
||||
*/
|
||||
int32_t maybeTitleDutchIJ(const UChar *src, UChar32 c, int32_t start, int32_t segmentLimit,
|
||||
UChar *dest, int32_t &destIndex, int32_t destCapacity, uint32_t options,
|
||||
icu::Edits *edits) {
|
||||
U_ASSERT(start < segmentLimit);
|
||||
|
||||
int32_t index = start;
|
||||
bool withAcute = false;
|
||||
|
||||
// If the conditions are met, then the following variables tell us what to output.
|
||||
int32_t unchanged1 = 0; // code units before the j, or the whole sequence (0..3)
|
||||
bool doTitleJ = false; // true if the j needs to be titlecased
|
||||
int32_t unchanged2 = 0; // after the j (0 or 1)
|
||||
|
||||
// next character after the first letter
|
||||
UChar c2 = src[index++];
|
||||
|
||||
// Is the first letter an i/I with accent?
|
||||
if (c == u'I') {
|
||||
if (c2 == ACUTE) {
|
||||
withAcute = true;
|
||||
unchanged1 = 1;
|
||||
if (index == segmentLimit) { return start; }
|
||||
c2 = src[index++];
|
||||
}
|
||||
} else { // Í
|
||||
withAcute = true;
|
||||
}
|
||||
|
||||
// Is the next character a j/J?
|
||||
if (c2 == u'j') {
|
||||
doTitleJ = true;
|
||||
} else if (c2 == u'J') {
|
||||
++unchanged1;
|
||||
} else {
|
||||
return start;
|
||||
}
|
||||
|
||||
// A plain i/I must be followed by a plain j/J.
|
||||
// An i/I with acute must be followed by a j/J with acute.
|
||||
if (withAcute) {
|
||||
if (index == segmentLimit || src[index++] != ACUTE) { return start; }
|
||||
if (doTitleJ) {
|
||||
unchanged2 = 1;
|
||||
} else {
|
||||
++unchanged1;
|
||||
}
|
||||
}
|
||||
|
||||
// There must not be another combining mark.
|
||||
if (index < segmentLimit) {
|
||||
int32_t cp;
|
||||
int32_t i = index;
|
||||
U16_NEXT(src, i, segmentLimit, cp);
|
||||
uint32_t typeMask = U_GET_GC_MASK(cp);
|
||||
if ((typeMask & U_GC_M_MASK) != 0) {
|
||||
return start;
|
||||
}
|
||||
}
|
||||
|
||||
// Output the rest of the Dutch IJ.
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged1, options, edits);
|
||||
start += unchanged1;
|
||||
if (doTitleJ) {
|
||||
destIndex = appendUChar(dest, destIndex, destCapacity, u'J');
|
||||
if (edits != nullptr) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
++start;
|
||||
}
|
||||
destIndex = appendUnchanged(dest, destIndex, destCapacity, src + start, unchanged2, options, edits);
|
||||
|
||||
U_ASSERT(start + unchanged2 == index);
|
||||
return index;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
U_CFUNC int32_t U_CALLCONV
|
||||
ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *iter,
|
||||
UChar *dest, int32_t destCapacity,
|
||||
|
@ -506,14 +412,14 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
|
|||
csc.limit=srcLength;
|
||||
int32_t destIndex=0;
|
||||
int32_t prev=0;
|
||||
bool isFirstIndex=true;
|
||||
UBool isFirstIndex=TRUE;
|
||||
|
||||
/* titlecasing loop */
|
||||
while(prev<srcLength) {
|
||||
/* find next index where to titlecase */
|
||||
int32_t index;
|
||||
if(isFirstIndex) {
|
||||
isFirstIndex=false;
|
||||
isFirstIndex=FALSE;
|
||||
index=iter->first();
|
||||
} else {
|
||||
index=iter->next();
|
||||
|
@ -540,7 +446,7 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
|
|||
// Stop with titleStart<titleLimit<=index
|
||||
// if there is a character to be titlecased,
|
||||
// or else stop with titleStart==titleLimit==index.
|
||||
bool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
|
||||
UBool toCased = (options&U_TITLECASE_ADJUST_TO_CASED) != 0;
|
||||
while (toCased ? UCASE_NONE==ucase_getType(c) : !ustrcase_isLNS(c)) {
|
||||
titleStart=titleLimit;
|
||||
if(titleLimit==index) {
|
||||
|
@ -573,15 +479,27 @@ ustrcase_internalToTitle(int32_t caseLocale, uint32_t options, BreakIterator *it
|
|||
|
||||
/* Special case Dutch IJ titlecasing */
|
||||
if (titleStart+1 < index &&
|
||||
caseLocale == UCASE_LOC_DUTCH) {
|
||||
if (c < 0) {
|
||||
c = ~c;
|
||||
}
|
||||
|
||||
if (c == u'I' || c == u'Í') {
|
||||
titleLimit = maybeTitleDutchIJ(src, c, titleStart + 1, index,
|
||||
dest, destIndex, destCapacity, options,
|
||||
edits);
|
||||
caseLocale == UCASE_LOC_DUTCH &&
|
||||
(src[titleStart] == 0x0049 || src[titleStart] == 0x0069)) {
|
||||
if (src[titleStart+1] == 0x006A) {
|
||||
destIndex=appendUChar(dest, destIndex, destCapacity, 0x004A);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
if(edits!=NULL) {
|
||||
edits->addReplace(1, 1);
|
||||
}
|
||||
titleLimit++;
|
||||
} else if (src[titleStart+1] == 0x004A) {
|
||||
// Keep the capital J from getting lowercased.
|
||||
destIndex=appendUnchanged(dest, destIndex, destCapacity,
|
||||
src+titleStart+1, 1, options, edits);
|
||||
if(destIndex<0) {
|
||||
errorCode=U_INDEX_OUTOFBOUNDS_ERROR;
|
||||
return 0;
|
||||
}
|
||||
titleLimit++;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -99,6 +99,14 @@ bool UVector::operator==(const UVector& other) const {
|
|||
return true;
|
||||
}
|
||||
|
||||
// TODO: delete this function once all call sites have been migrated to the
|
||||
// new addElement().
|
||||
void UVector::addElementX(void* obj, UErrorCode &status) {
|
||||
if (ensureCapacityX(count + 1, status)) {
|
||||
elements[count++].pointer = obj;
|
||||
}
|
||||
}
|
||||
|
||||
void UVector::addElement(void* obj, UErrorCode &status) {
|
||||
U_ASSERT(deleter == nullptr);
|
||||
if (ensureCapacity(count + 1, status)) {
|
||||
|
@ -323,6 +331,38 @@ int32_t UVector::indexOf(UElement key, int32_t startIndex, int8_t hint) const {
|
|||
return -1;
|
||||
}
|
||||
|
||||
UBool UVector::ensureCapacityX(int32_t minimumCapacity, UErrorCode &status) {
|
||||
if (minimumCapacity < 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
if (capacity < minimumCapacity) {
|
||||
if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
int32_t newCap = capacity * 2;
|
||||
if (newCap < minimumCapacity) {
|
||||
newCap = minimumCapacity;
|
||||
}
|
||||
if (newCap > (int32_t)(INT32_MAX / sizeof(UElement))) { // integer overflow check
|
||||
// We keep the original memory contents on bad minimumCapacity.
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
UElement* newElems = (UElement *)uprv_realloc(elements, sizeof(UElement)*newCap);
|
||||
if (newElems == nullptr) {
|
||||
// We keep the original contents on the memory failure on realloc or bad minimumCapacity.
|
||||
status = U_MEMORY_ALLOCATION_ERROR;
|
||||
return FALSE;
|
||||
}
|
||||
elements = newElems;
|
||||
capacity = newCap;
|
||||
}
|
||||
return TRUE;
|
||||
}
|
||||
|
||||
|
||||
UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
|
||||
if (U_FAILURE(status)) {
|
||||
return false;
|
||||
|
@ -330,7 +370,7 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
|
|||
if (minimumCapacity < 0) {
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (capacity < minimumCapacity) {
|
||||
if (capacity > (INT32_MAX - 1) / 2) { // integer overflow check
|
||||
status = U_ILLEGAL_ARGUMENT_ERROR;
|
||||
|
@ -356,7 +396,6 @@ UBool UVector::ensureCapacity(int32_t minimumCapacity, UErrorCode &status) {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Change the size of this vector as follows: If newSize is smaller,
|
||||
* then truncate the array, possibly deleting held elements for i >=
|
||||
|
|
|
@@ -123,6 +123,12 @@ public:
// java.util.Vector API
//------------------------------------------------------------

/*
* Old version of addElement, with non-standard error handling.
* Will be removed once all uses have been switched to the new addElement().
*/
void addElementX(void* obj, UErrorCode &status);

/**
* Add an element at the end of the vector.
* For use only with vectors that do not adopt their elements, which is to say,
@@ -191,6 +197,12 @@ public:

inline UBool isEmpty(void) const {return count == 0;}

/*
* Old version of ensureCapacity, with non-standard error handling.
* Will be removed once all uses have been switched to the new ensureCapacity().
*/
UBool ensureCapacityX(int32_t minimumCapacity, UErrorCode &status);

UBool ensureCapacity(int32_t minimumCapacity, UErrorCode &status);

/**
@@ -83,7 +83,7 @@ void UVector32::assign(const UVector32& other, UErrorCode &ec) {
}


bool UVector32::operator==(const UVector32& other) const {
bool UVector32::operator==(const UVector32& other) {
int32_t i;
if (count != other.count) return false;
for (i=0; i<count; ++i) {
@@ -86,12 +86,12 @@ public:
* equal if they are of the same size and all elements are equal,
* as compared using this object's comparer.
*/
bool operator==(const UVector32& other) const;
bool operator==(const UVector32& other);

/**
* Equivalent to !operator==()
*/
inline bool operator!=(const UVector32& other) const;
inline bool operator!=(const UVector32& other);

//------------------------------------------------------------
// java.util.Vector API
@@ -268,7 +268,7 @@ inline int32_t UVector32::lastElementi(void) const {
return elementAti(count-1);
}

inline bool UVector32::operator!=(const UVector32& other) const {
inline bool UVector32::operator!=(const UVector32& other) {
return !operator==(other);
}
@ -1,6 +1,6 @@
|
|||
#! /bin/sh
|
||||
# Guess values for system-dependent variables and create Makefiles.
|
||||
# Generated by GNU Autoconf 2.69 for ICU 71.1.
|
||||
# Generated by GNU Autoconf 2.69 for ICU 70.1.
|
||||
#
|
||||
# Report bugs to <http://icu-project.org/bugs>.
|
||||
#
|
||||
|
@ -582,8 +582,8 @@ MAKEFLAGS=
|
|||
# Identity of this package.
|
||||
PACKAGE_NAME='ICU'
|
||||
PACKAGE_TARNAME='International Components for Unicode'
|
||||
PACKAGE_VERSION='71.1'
|
||||
PACKAGE_STRING='ICU 71.1'
|
||||
PACKAGE_VERSION='70.1'
|
||||
PACKAGE_STRING='ICU 70.1'
|
||||
PACKAGE_BUGREPORT='http://icu-project.org/bugs'
|
||||
PACKAGE_URL='http://icu-project.org'
|
||||
|
||||
|
@ -1375,7 +1375,7 @@ if test "$ac_init_help" = "long"; then
|
|||
# Omit some internal or obsolete options to make the list less imposing.
|
||||
# This message is too long to be a string in the A/UX 3.1 sh.
|
||||
cat <<_ACEOF
|
||||
\`configure' configures ICU 71.1 to adapt to many kinds of systems.
|
||||
\`configure' configures ICU 70.1 to adapt to many kinds of systems.
|
||||
|
||||
Usage: $0 [OPTION]... [VAR=VALUE]...
|
||||
|
||||
|
@ -1442,7 +1442,7 @@ fi
|
|||
|
||||
if test -n "$ac_init_help"; then
|
||||
case $ac_init_help in
|
||||
short | recursive ) echo "Configuration of ICU 71.1:";;
|
||||
short | recursive ) echo "Configuration of ICU 70.1:";;
|
||||
esac
|
||||
cat <<\_ACEOF
|
||||
|
||||
|
@ -1580,7 +1580,7 @@ fi
|
|||
test -n "$ac_init_help" && exit $ac_status
|
||||
if $ac_init_version; then
|
||||
cat <<\_ACEOF
|
||||
ICU configure 71.1
|
||||
ICU configure 70.1
|
||||
generated by GNU Autoconf 2.69
|
||||
|
||||
Copyright (C) 2012 Free Software Foundation, Inc.
|
||||
|
@ -2326,7 +2326,7 @@ cat >config.log <<_ACEOF
|
|||
This file contains any messages produced by compilers while
|
||||
running configure, to aid debugging if configure makes a mistake.
|
||||
|
||||
It was created by ICU $as_me 71.1, which was
|
||||
It was created by ICU $as_me 70.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
$ $0 $@
|
||||
|
@ -6166,6 +6166,11 @@ $as_echo "$as_me: Adding CXXFLAGS option -std=c++11" >&6;}
|
|||
else
|
||||
CXXFLAGS="$OLD_CXXFLAGS"
|
||||
fi
|
||||
case "${host}" in
|
||||
*-*-solaris*)
|
||||
CXXFLAGS="$OLD_CXXFLAGS"
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
fi
|
||||
|
||||
|
@ -8607,7 +8612,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
|
|||
# report actual input values of CONFIG_FILES etc. instead of their
|
||||
# values after options handling.
|
||||
ac_log="
|
||||
This file was extended by ICU $as_me 71.1, which was
|
||||
This file was extended by ICU $as_me 70.1, which was
|
||||
generated by GNU Autoconf 2.69. Invocation command line was
|
||||
|
||||
CONFIG_FILES = $CONFIG_FILES
|
||||
|
@ -8661,7 +8666,7 @@ _ACEOF
|
|||
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
|
||||
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
|
||||
ac_cs_version="\\
|
||||
ICU config.status 71.1
|
||||
ICU config.status 70.1
|
||||
configured by $0, generated by GNU Autoconf 2.69,
|
||||
with options \\"\$ac_cs_config\\"
|
||||
|
||||
|
|
|
@ -527,6 +527,11 @@ if [[ "$GXX" = yes ]]; then
|
|||
else
|
||||
CXXFLAGS="$OLD_CXXFLAGS"
|
||||
fi
|
||||
case "${host}" in
|
||||
*-*-solaris*)
|
||||
CXXFLAGS="$OLD_CXXFLAGS"
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
fi
|
||||
|
||||
|
|
|
@@ -3,5 +3,5 @@
// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml

{
"cldrVersion": "41"
"cldrVersion": "40"
}
@@ -23139,7 +23139,6 @@
ໂຢຮີມ
ໂຢະ
ຣ້ອຍ
ຣະຄັງ
ຣະມາ
ຣະເມດ
ຣັກກັດ
@ -5,237 +5,7 @@ ja{
|
|||
boundaries{
|
||||
line:process(dependency){"line_normal.brk"}
|
||||
line_loose:process(dependency){"line_loose_cj.brk"}
|
||||
line_loose_phrase:process(dependency){"line_loose_phrase_cj.brk"}
|
||||
line_normal:process(dependency){"line_normal_cj.brk"}
|
||||
line_normal_phrase:process(dependency){"line_normal_phrase_cj.brk"}
|
||||
line_phrase:process(dependency){"line_phrase_cj.brk"}
|
||||
line_strict:process(dependency){"line_cj.brk"}
|
||||
line_strict_phrase:process(dependency){"line_phrase_cj.brk"}
|
||||
}
|
||||
extensions{
|
||||
"かい",
|
||||
"かしら",
|
||||
"から",
|
||||
"かれい",
|
||||
"かれつ",
|
||||
"かれる",
|
||||
"かれん",
|
||||
"きり",
|
||||
"くらい",
|
||||
"ぐらい",
|
||||
"けれど",
|
||||
"けれども",
|
||||
"こそ",
|
||||
"さえ",
|
||||
"しか",
|
||||
"した",
|
||||
"ずつ",
|
||||
"せる",
|
||||
"せん",
|
||||
"たい",
|
||||
"たがる",
|
||||
"たく",
|
||||
"たら",
|
||||
"たり",
|
||||
"たれ",
|
||||
"たれる",
|
||||
"だけ",
|
||||
"だに",
|
||||
"だの",
|
||||
"だり",
|
||||
"つつ",
|
||||
"てる",
|
||||
"です",
|
||||
"でも",
|
||||
"ところが",
|
||||
"ところで",
|
||||
"とも",
|
||||
"ない",
|
||||
"なか",
|
||||
"ながら",
|
||||
"なく",
|
||||
"なし",
|
||||
"なぞ",
|
||||
"など",
|
||||
"なら",
|
||||
"なり",
|
||||
"なれる",
|
||||
"なんぞ",
|
||||
"ねる",
|
||||
"ので",
|
||||
"のに",
|
||||
"のみ",
|
||||
"はれる",
|
||||
"ばかり",
|
||||
"へる",
|
||||
"ほど",
|
||||
"まい",
|
||||
"まう",
|
||||
"まし",
|
||||
"ます",
|
||||
"まっ",
|
||||
"まで",
|
||||
"まま",
|
||||
"まれ",
|
||||
"もん",
|
||||
"やら",
|
||||
"やれる",
|
||||
"よう",
|
||||
"より",
|
||||
"らしい",
|
||||
"られる",
|
||||
"れる",
|
||||
"ろう",
|
||||
"わっ",
|
||||
"わな",
|
||||
"わら",
|
||||
"わり",
|
||||
"わる",
|
||||
"われ",
|
||||
"われと",
|
||||
"われる",
|
||||
"わん",
|
||||
"えたい",
|
||||
"えて",
|
||||
"える",
|
||||
"けた",
|
||||
"けたい",
|
||||
"ける",
|
||||
"させる",
|
||||
"そうだ",
|
||||
"っきゃ",
|
||||
"っきり",
|
||||
"っけ",
|
||||
"っす",
|
||||
"ったらしい",
|
||||
"っちゅう",
|
||||
"って",
|
||||
"っていう",
|
||||
"ってか",
|
||||
"ってな",
|
||||
"っと",
|
||||
"っぱなし",
|
||||
"っぷり",
|
||||
"っぽい",
|
||||
"にあう",
|
||||
"にあがる",
|
||||
"にあたって",
|
||||
"にあたり",
|
||||
"にあたりまして",
|
||||
"にあたります",
|
||||
"にあたる",
|
||||
"において",
|
||||
"におきまして",
|
||||
"における",
|
||||
"にかけ",
|
||||
"にかけて",
|
||||
"にかけまして",
|
||||
"にたいして",
|
||||
"にたいしまして",
|
||||
"にたいします",
|
||||
"にたいする",
|
||||
"について",
|
||||
"につき",
|
||||
"につきまして",
|
||||
"につけ",
|
||||
"につれ",
|
||||
"につれて",
|
||||
"にて",
|
||||
"にとって",
|
||||
"にとり",
|
||||
"にとりまして",
|
||||
"にまつわります",
|
||||
"にまつわる",
|
||||
"にもかかわらず",
|
||||
"にゃ",
|
||||
"によって",
|
||||
"により",
|
||||
"によりまして",
|
||||
"によります",
|
||||
"による",
|
||||
"にわたって",
|
||||
"にわたり",
|
||||
"にわたりまして",
|
||||
"にわたります",
|
||||
"にわたる",
|
||||
"に対し",
|
||||
"に対して",
|
||||
"に対しまして",
|
||||
"に対します",
|
||||
"に対する",
|
||||
"に当たって",
|
||||
"に当たり",
|
||||
"に当たりまして",
|
||||
"に当たります",
|
||||
"に当たる",
|
||||
"に従い",
|
||||
"に従いまして",
|
||||
"に従います",
|
||||
"に従う",
|
||||
"に従って",
|
||||
"に関し",
|
||||
"に関して",
|
||||
"に関しまして",
|
||||
"に関します",
|
||||
"に関する",
|
||||
"に際し",
|
||||
"に際して",
|
||||
"ものの",
|
||||
"ろうし",
|
||||
"ろうと",
|
||||
"われと",
|
||||
"をの",
|
||||
"をめぐって",
|
||||
"をめぐりまして",
|
||||
"をめぐります",
|
||||
"をめぐる",
|
||||
"をもちまして",
|
||||
"をもって",
|
||||
"を以て",
|
||||
"を通して",
|
||||
"を通しまして",
|
||||
"を通じ",
|
||||
"を通じて",
|
||||
"を通じまして",
|
||||
"んじゃ",
|
||||
"んで",
|
||||
"々宮",
|
||||
"々家",
|
||||
"え目",
|
||||
"が丘",
|
||||
"が台",
|
||||
"が床",
|
||||
"が浜",
|
||||
"の内",
|
||||
"の山公園",
|
||||
"の峰",
|
||||
"の森",
|
||||
"の沢",
|
||||
"の通り",
|
||||
"の里",
|
||||
"ヵ国",
|
||||
"ヵ年",
|
||||
"ヵ所",
|
||||
"ヵ月",
|
||||
"ヵ村",
|
||||
"ヵ条",
|
||||
"ヶ丘",
|
||||
"ヶ国",
|
||||
"ヶ島",
|
||||
"ヶ年",
|
||||
"ヶ所",
|
||||
"ヶ月",
|
||||
"ヶ村",
|
||||
"ヶ条",
|
||||
"ージ",
|
||||
"ーズ",
|
||||
"ータ",
|
||||
"ード",
|
||||
"ーニャ",
|
||||
"ープランス",
|
||||
"ーユ",
|
||||
"ーランド",
|
||||
"ーリンズ",
|
||||
"ーン",
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -2,7 +2,7 @@
// License & terms of use: http://www.unicode.org/copyright.html
// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
root{
Version{"41"}
Version{"40"}
boundaries{
grapheme:process(dependency){"char.brk"}
line:process(dependency){"line.brk"}
@ -17,8 +17,7 @@
|
|||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * between ID and hyphens 2010 & 2013 (both BA)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
|
@ -136,7 +135,7 @@ $CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
|||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
|
@ -239,7 +238,7 @@ $OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
|||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
|
@ -295,13 +294,11 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $NSX, so don't include it.
|
||||
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
|
||||
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
|
||||
$ID $CM* ($BA | $HY | $NS);
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $BAX | $HY | $NS);
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
@ -377,9 +374,9 @@ $CP30 $CM* ($ALPlus | $HL | $NU);
|
|||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $ZWJ {eof}];
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
|
|
@ -1,406 +0,0 @@
|
|||
# Copyright (C) 2022 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# file: line_loose_phrase_cj.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
|
||||
# for Unicode 14.0, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule 20.9
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=loose (BCP47 -u-lb-loose) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * between ID and hyphens 2010 & 2013 (both BA)
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before iteration marks 3005, 303B, 309D, 309E, 30FD, 30FE (all NS)
|
||||
# * between characters of LineBreak class IN such as 2026
|
||||
# * before some centered punct 203C, 2047, 2048, 2049, 30FB, FF1A, FF1B,
|
||||
# FF65 (all NS) and FF01, FF1F (both EX).
|
||||
# * before suffix characters with LineBreak class PO and EastAsianWidth A,F,W;
|
||||
# this includes: 00B0 2030 2032 2033 2035 2103 2109 FE6A FF05 FFE0
|
||||
# * after prefix characters with LineBreak class PR and EastAsianWidth A,F,W;
|
||||
# this includes: 00A4 00B1 20AC 2116 FE69 FF04 FFE1 FFE5 FFE6
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
#
|
||||
# The content is the same as line_loose_cj.txt except the following
|
||||
# 1. Add CJK into dictionary.
|
||||
# 2. Add East Asian Width with class F, W and H into $ALPlus.
|
||||
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EXX = [\uFF01 \uFF1F];
|
||||
$EX = [[:LineBreak = Exclamation:] - $EXX];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
# CSS Loose tailoring: CJ resolves to ID
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
$NSX = [\u301C \u30A0 \u3005 \u303B \u309D \u309E \u30FD \u30FE \u203C \u2047 \u2048 \u2049 \u30FB \uFF1A \uFF1B \uFF65];
|
||||
$NS = [[:LineBreak = Nonstarter:] - $NSX];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
$POX = [\u00B0 \u2030 \u2032 \u2033 \u2035 \u2103 \u2109 \uFE6A \uFF05 \uFFE0];
|
||||
$PO = [[:LineBreak = Postfix_Numeric:] - $POX];
|
||||
$PRX = [\u00A4 \u00B1 \u20AC \u2116 \uFE69 \uFF04 \uFFE1 \uFFE5 \uFFE6];
|
||||
$PR = [[:LineBreak = Prefix_Numeric:] - $PRX];
|
||||
$QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general categor of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
$CMX = [[$CM] - [$ZWJ]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA) and $dictionaryCJK.
|
||||
|
||||
# Add CJK dictionary
|
||||
$Han = [:Han:];
|
||||
$Katakana = [:Katakana:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana \u30fc];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
# Let fullwidth-ASCII digits and letters be part of words.
|
||||
$FW_alphanum = [\uff10-\uff19\uff21-\uff3a\uff41-\uff5a];
|
||||
$ALPlus = [$AL $AI $SG $XX $FW_alphanum [$dictionary-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $BAX $HY $NS $IN $NU $PR $PO $POX $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
# LB 13 Don't break before ']' or '!' or '/', even after spaces.
|
||||
#
|
||||
# Do not include $EXX here
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
# Note subtle interaction with "SP IS /" rules in LB14a.
|
||||
# This rule consumes the SP, chaining happens on the IS, effectivley overriding the SP IS rules,
|
||||
# which is the desired behavior.
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
|
||||
# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
|
||||
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $BAX $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
#
|
||||
# LB 14b Do not break before numeric separators (IS), even after spaces.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
$SP $IS $CM* $ZWJ [^$CM $NU];
|
||||
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $NSX, so don't include it.
|
||||
# And DO allow breaks between ID and $BAX, so split out the handling of ID and do not include $BAX for them.
|
||||
[$LB20NonBreaks - $ID] $CM* ($BA | $BAX | $HY | $NS);
|
||||
$ID $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $BAX | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
[$LB20NonBreaks - $IN] $CM* $IN; # line_loose tailoring
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
# Do not include $POX here
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
# Do not include $PRX here
|
||||
($PR | $PO | $POX) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO | $POX); # TODO: should this be ($PR | $PRX | $PO)
|
||||
^$CM+ ($PR | $PO | $POX); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
# Here do not include $PRX at the beginning or $POX at the end
|
||||
(($PR | $PO | $POX) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PRX | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat korean Syllable Block the same as ID (don't break it)
|
||||
# Do not include $POX or $PRX here
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $BAX $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
|
||||
$EB $CM* $EM;
|
||||
$ExtPictUnassigned $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
|
@ -17,7 +17,7 @@
|
|||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# * before hyphens 2010 & 2013 (both BA) and 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
|
||||
#
|
||||
|
@ -29,7 +29,8 @@
|
|||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$BAX = [\u2010 \u2013];
|
||||
$BA = [[:LineBreak = Break_After:] - $BAX];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
|
@ -183,7 +184,7 @@ $GL $CM* .;
|
|||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
[[$LB8NonBreaks] - [$SP $BA $BAX $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
@ -281,7 +282,7 @@ $LB20NonBreaks = [$LB18NonBreaks - $CB];
|
|||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $NSX, so don't include it
|
||||
# DO allow breaks here before $BAX and $NSX, so don't include them
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
|
@ -293,7 +294,7 @@ $BB $CM* $LB20NonBreaks;
|
|||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
$HL $CM* ($HY | $BA | $BAX) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
|
|
|
@ -1,385 +0,0 @@
|
|||
# Copyright (C) 2022 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# file: line_normal_phrase_cj.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
|
||||
# for Unicode 14.0, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule 20.9
|
||||
#
|
||||
# This tailors the line break behavior to correspond to CSS
|
||||
# line-break=normal (BCP47 -u-lb-normal) as defined for Chinese & Japanese.
|
||||
# It sets characters of class CJ to behave like ID.
|
||||
# In addition, it allows breaks:
|
||||
# * before 301C, 30A0 (both NS)
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
#
|
||||
# The content is the same as line_normal_cj.txt except the following
|
||||
# 1. Add CJK into dictionary.
|
||||
# 2. Add East Asian Width with class F, W and H into $ALPlus.
|
||||
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
# CSS Normal tailoring: CJ resolves to ID
|
||||
$ID = [[:LineBreak = Ideographic:] $CJ];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
$NSX = [\u301C \u30A0];
|
||||
$NS = [[:LineBreak = Nonstarter:] - $NSX];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
$CMX = [[$CM] - [$ZWJ]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA) and $dictionaryCJK.
|
||||
|
||||
# Add CJK dictionary
|
||||
$Han = [:Han:];
|
||||
$Katakana = [:Katakana:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana \u30fc];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
# Let fullwidth-ASCII digits and letters be part of words.
|
||||
$FW_alphanum = [\uff10-\uff19\uff21-\uff3a\uff41-\uff5a];
|
||||
$ALPlus = [$AL $AI $SG $XX $FW_alphanum [$dictionary-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
|
||||
# LB 13 Don't break before ']' or '!' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
# Note subtle interaction with "SP IS /" rules in LB14a.
|
||||
# This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules,
|
||||
# which is the desired behavior.
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
|
||||
# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
|
||||
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
#
|
||||
# LB 14b Do not break before numeric separators (IS), even after spaces.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
$SP $IS $CM* $ZWJ [^$CM $NU];
|
||||
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
# Do not break between closing punctuation and $NS, even with intervening spaces
|
||||
# But DO allow a break between closing punctuation and $NSX, don't include it here
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
# DO allow breaks here before $NSX, so don't include it
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
$LB20NonBreaks $CM* $IN;
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
|
||||
$EB $CM* $EM;
|
||||
$ExtPictUnassigned $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
|
@ -1,377 +0,0 @@
|
|||
# Copyright (C) 2022 and later: Unicode, Inc. and others.
|
||||
# License & terms of use: http://www.unicode.org/copyright.html
|
||||
#
|
||||
# file: line_phrase_cj.txt
|
||||
#
|
||||
# Line Breaking Rules
|
||||
# Implement default line breaking as defined by
|
||||
# Unicode Standard Annex #14 (https://www.unicode.org/reports/tr14/)
|
||||
# for Unicode 14.0, with the following modification:
|
||||
#
|
||||
# Boundaries between hyphens and following letters are suppressed when
|
||||
# there is a boundary preceding the hyphen. See rule 20.9
|
||||
#
|
||||
# This corresponds to CSS line-break-word-handling=phrase (BCP47 -u-lw-phrase).
|
||||
# It sets characters of class CJ to behave like NS.
|
||||
# It allows breaking before 201C and after 201D, for zh_Hans, zh_Hant, and ja.
|
||||
#
|
||||
# The content is the same as line_cj.txt except the following
|
||||
# 1. Add CJK into dictionary.
|
||||
# 2. Add East Asian Width with class F, W and H into $ALPlus.
|
||||
#
|
||||
# Character Classes defined by TR 14.
|
||||
#
|
||||
|
||||
!!chain;
|
||||
!!quoted_literals_only;
|
||||
|
||||
$AI = [:LineBreak = Ambiguous:];
|
||||
$AL = [:LineBreak = Alphabetic:];
|
||||
$BA = [:LineBreak = Break_After:];
|
||||
$HH = [\u2010]; # \u2010 is HYPHEN, default line break is BA.
|
||||
$BB = [:LineBreak = Break_Before:];
|
||||
$BK = [:LineBreak = Mandatory_Break:];
|
||||
$B2 = [:LineBreak = Break_Both:];
|
||||
$CB = [:LineBreak = Contingent_Break:];
|
||||
$CJ = [:LineBreak = Conditional_Japanese_Starter:];
|
||||
$CL = [[:LineBreak = Close_Punctuation:] \u201d];
|
||||
# $CM = [:LineBreak = Combining_Mark:];
|
||||
$CP = [:LineBreak = Close_Parenthesis:];
|
||||
$CR = [:LineBreak = Carriage_Return:];
|
||||
$EB = [:LineBreak = EB:];
|
||||
$EM = [:LineBreak = EM:];
|
||||
$EX = [:LineBreak = Exclamation:];
|
||||
$GL = [:LineBreak = Glue:];
|
||||
$HL = [:LineBreak = Hebrew_Letter:];
|
||||
$HY = [:LineBreak = Hyphen:];
|
||||
$H2 = [:LineBreak = H2:];
|
||||
$H3 = [:LineBreak = H3:];
|
||||
$ID = [:LineBreak = Ideographic:];
|
||||
$IN = [:LineBreak = Inseperable:];
|
||||
$IS = [:LineBreak = Infix_Numeric:];
|
||||
$JL = [:LineBreak = JL:];
|
||||
$JV = [:LineBreak = JV:];
|
||||
$JT = [:LineBreak = JT:];
|
||||
$LF = [:LineBreak = Line_Feed:];
|
||||
$NL = [:LineBreak = Next_Line:];
|
||||
# NS includes CJ for CSS strict line breaking.
|
||||
$NS = [[:LineBreak = Nonstarter:] $CJ];
|
||||
$NU = [:LineBreak = Numeric:];
|
||||
$OP = [[:LineBreak = Open_Punctuation:] \u201c];
|
||||
$PO = [:LineBreak = Postfix_Numeric:];
|
||||
$PR = [:LineBreak = Prefix_Numeric:];
|
||||
$QU = [[:LineBreak = Quotation:] - [\u201c\u201d]];
|
||||
$RI = [:LineBreak = Regional_Indicator:];
|
||||
$SA = [:LineBreak = Complex_Context:];
|
||||
$SG = [:LineBreak = Surrogate:];
|
||||
$SP = [:LineBreak = Space:];
|
||||
$SY = [:LineBreak = Break_Symbols:];
|
||||
$WJ = [:LineBreak = Word_Joiner:];
|
||||
$XX = [:LineBreak = Unknown:];
|
||||
$ZW = [:LineBreak = ZWSpace:];
|
||||
$ZWJ = [:LineBreak = ZWJ:];
|
||||
|
||||
# OP30 and CP30 are variants of OP and CP that appear in-line in rule LB30 from UAX 14,
|
||||
# without a formal name. Because ICU rules require multiple uses of the expressions,
|
||||
# give them a single definition with a name
|
||||
|
||||
$OP30 = [$OP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
$CP30 = [$CP - [\p{ea=F}\p{ea=W}\p{ea=H}]];
|
||||
|
||||
$ExtPictUnassigned = [\p{Extended_Pictographic} & \p{Cn}];
|
||||
|
||||
# By LB9, a ZWJ also behaves as a CM. Including it in the definition of CM avoids having to explicitly
|
||||
# list it in the numerous rules that use CM.
|
||||
# By LB1, SA characters with general category of Mn or Mc also resolve to CM.
|
||||
|
||||
$CM = [[:LineBreak = Combining_Mark:] $ZWJ [$SA & [[:Mn:][:Mc:]]]];
|
||||
$CMX = [[$CM] - [$ZWJ]];
|
||||
|
||||
# Dictionary character set, for triggering language-based break engines. Currently
|
||||
# limited to LineBreak=Complex_Context (SA) and $dictionaryCJK.
|
||||
|
||||
# Add CJK dictionary
|
||||
$Han = [:Han:];
|
||||
$Katakana = [:Katakana:];
|
||||
$Hiragana = [:Hiragana:];
|
||||
$HangulSyllable = [\uac00-\ud7a3];
|
||||
$ComplexContext = [:LineBreak = Complex_Context:];
|
||||
$KanaKanji = [$Han $Hiragana $Katakana \u30fc];
|
||||
$dictionaryCJK = [$KanaKanji $HangulSyllable];
|
||||
$dictionary = [$ComplexContext $dictionaryCJK];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB1. By default, treat AI (characters with ambiguous east Asian width),
|
||||
# SA (Dictionary chars, excluding Mn and Mc)
|
||||
# SG (Unpaired Surrogates)
|
||||
# XX (Unknown, unassigned)
|
||||
# as $AL (Alphabetic)
|
||||
#
|
||||
# Let fullwidth-ASCII digits and letters be part of words.
|
||||
$FW_alphanum = [\uff10-\uff19\uff21-\uff3a\uff41-\uff5a];
|
||||
$ALPlus = [$AL $AI $SG $XX $FW_alphanum [$dictionary-[[:Mn:][:Mc:]]]];
|
||||
|
||||
|
||||
## -------------------------------------------------
|
||||
|
||||
#
|
||||
# CAN_CM is the set of characters that may combine with CM combining chars.
|
||||
# Note that Linebreak UAX 14's concept of a combining char and the rules
|
||||
# for what they can combine with are _very_ different from the rest of Unicode.
|
||||
#
|
||||
# Note that $CM itself is left out of this set. If CM is needed as a base
|
||||
# it must be listed separately in the rule.
|
||||
#
|
||||
$CAN_CM = [^$SP $BK $CR $LF $NL $ZW $CM]; # Bases that can take CMs
|
||||
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM]; # Bases that can't take CMs
|
||||
|
||||
#
|
||||
# AL_FOLLOW set of chars that can unconditionally follow an AL
|
||||
# Needed in rules where stand-alone $CM s are treated as AL.
|
||||
#
|
||||
$AL_FOLLOW = [$BK $CR $LF $NL $ZW $SP $CL $CP $EX $HL $IS $SY $WJ $GL $OP30 $QU $BA $HY $NS $IN $NU $PR $PO $ALPlus];
|
||||
|
||||
|
||||
#
|
||||
# Rule LB 4, 5 Mandatory (Hard) breaks.
|
||||
#
|
||||
$LB4Breaks = [$BK $CR $LF $NL];
|
||||
$LB4NonBreaks = [^$BK $CR $LF $NL $CM];
|
||||
$CR $LF {100};
|
||||
|
||||
#
|
||||
# LB 6 Do not break before hard line breaks.
|
||||
#
|
||||
$LB4NonBreaks? $LB4Breaks {100}; # LB 5 do not break before hard breaks.
|
||||
$CAN_CM $CM* $LB4Breaks {100};
|
||||
^$CM+ $LB4Breaks {100};
|
||||
|
||||
# LB 7 x SP
|
||||
# x ZW
|
||||
$LB4NonBreaks [$SP $ZW];
|
||||
$CAN_CM $CM* [$SP $ZW];
|
||||
^$CM+ [$SP $ZW];
|
||||
|
||||
#
|
||||
# LB 8 Break after zero width space
|
||||
# ZW SP* ÷
|
||||
#
|
||||
$LB8Breaks = [$LB4Breaks $ZW];
|
||||
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];
|
||||
$ZW $SP* / [^$SP $ZW $LB4Breaks];
|
||||
|
||||
# LB 8a ZWJ x Do not break Emoji ZWJ sequences.
|
||||
#
|
||||
$ZWJ [^$CM];
|
||||
|
||||
# LB 9 Combining marks. X $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
|
||||
# $CM not covered by the above needs to behave like $AL
|
||||
# See definition of $CAN_CM.
|
||||
|
||||
$CAN_CM $CM+; # Stick together any combining sequences that don't match other rules.
|
||||
^$CM+;
|
||||
|
||||
#
|
||||
# LB 11 Do not break before or after WORD JOINER & related characters.
|
||||
#
|
||||
$CAN_CM $CM* $WJ;
|
||||
$LB8NonBreaks $WJ;
|
||||
^$CM+ $WJ;
|
||||
|
||||
$WJ $CM* .;
|
||||
|
||||
#
|
||||
# LB 12 Do not break after NBSP and related characters.
|
||||
# GL x
|
||||
#
|
||||
$GL $CM* .;
|
||||
|
||||
#
|
||||
# LB 12a Do not break before NBSP and related characters ...
|
||||
# [^SP BA HY] x GL
|
||||
#
|
||||
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GL;
|
||||
^$CM+ $GL;
|
||||
|
||||
|
||||
|
||||
|
||||
# LB 13 Don't break before ']' or '!' or '/', even after spaces.
|
||||
#
|
||||
$LB8NonBreaks $CL;
|
||||
$CAN_CM $CM* $CL;
|
||||
^$CM+ $CL; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $CP;
|
||||
$CAN_CM $CM* $CP;
|
||||
^$CM+ $CP; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $EX;
|
||||
$CAN_CM $CM* $EX;
|
||||
^$CM+ $EX; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
$LB8NonBreaks $SY;
|
||||
$CAN_CM $CM* $SY;
|
||||
^$CM+ $SY; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
#
|
||||
# LB 14 Do not break after OP, even after spaces
|
||||
# Note subtle interaction with "SP IS /" rules in LB14a.
|
||||
# This rule consumes the SP, chaining happens on the IS, effectively overriding the SP IS rules,
|
||||
# which is the desired behavior.
|
||||
#
|
||||
$OP $CM* $SP* .;
|
||||
|
||||
$OP $CM* $SP+ $CM+ $AL_FOLLOW?; # by rule 10, stand-alone CM behaves as AL
|
||||
# by rule 8, CM following a SP is stand-alone.
|
||||
|
||||
|
||||
# LB 14a Force a break before start of a number with a leading decimal pt, e.g. " .23"
|
||||
# Note: would be simpler to express as "$SP / $IS $CM* $NU;", but ICU rules have limitations.
|
||||
# See issue ICU-20303
|
||||
|
||||
|
||||
$CanFollowIS = [$BK $CR $LF $NL $SP $ZW $WJ $GL $CL $CP $EX $IS $SY $QU $BA $HY $NS $ALPlus $HL $IN];
|
||||
$SP $IS / [^ $CanFollowIS $NU $CM];
|
||||
$SP $IS $CM* $CMX / [^ $CanFollowIS $NU $CM];
|
||||
|
||||
#
|
||||
# LB 14b Do not break before numeric separators (IS), even after spaces.
|
||||
|
||||
[$LB8NonBreaks - $SP] $IS;
|
||||
$SP $IS $CM* [$CanFollowIS {eof}];
|
||||
$SP $IS $CM* $ZWJ [^$CM $NU];
|
||||
|
||||
$CAN_CM $CM* $IS;
|
||||
^$CM+ $IS; # by rule 10, stand-alone CM behaves as AL
|
||||
|
||||
|
||||
# LB 15
|
||||
$QU $CM* $SP* $OP;
|
||||
|
||||
# LB 16
|
||||
($CL | $CP) $CM* $SP* $NS;
|
||||
|
||||
# LB 17
|
||||
$B2 $CM* $SP* $B2;
|
||||
|
||||
#
|
||||
# LB 18 Break after spaces.
|
||||
#
|
||||
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
|
||||
$LB18Breaks = [$LB8Breaks $SP];
|
||||
|
||||
|
||||
# LB 19
|
||||
# x QU
|
||||
$LB18NonBreaks $CM* $QU;
|
||||
^$CM+ $QU;
|
||||
|
||||
# QU x
|
||||
$QU $CM* .;
|
||||
|
||||
# LB 20
|
||||
# <break> $CB
|
||||
# $CB <break>
|
||||
#
|
||||
$LB20NonBreaks = [$LB18NonBreaks - $CB];
|
||||
|
||||
# LB 20.09 Don't break between Hyphens and Letters when there is a break preceding the hyphen.
|
||||
# Originally added as a Finnish tailoring, now promoted to default ICU behavior.
|
||||
# Note: this is not default UAX-14 behaviour. See issue ICU-8151.
|
||||
#
|
||||
^($HY | $HH) $CM* $ALPlus;
|
||||
|
||||
# LB 21 x (BA | HY | NS)
|
||||
# BB x
|
||||
#
|
||||
$LB20NonBreaks $CM* ($BA | $HY | $NS);
|
||||
|
||||
|
||||
^$CM+ ($BA | $HY | $NS);
|
||||
|
||||
$BB $CM* [^$CB]; # $BB x
|
||||
$BB $CM* $LB20NonBreaks;
|
||||
|
||||
# LB 21a Don't break after Hebrew + Hyphen
|
||||
# HL (HY | BA) x
|
||||
#
|
||||
$HL $CM* ($HY | $BA) $CM* [^$CB]?;
|
||||
|
||||
# LB 21b (forward) Don't break between SY and HL
|
||||
# (break between HL and SY already disallowed by LB 13 above)
|
||||
$SY $CM* $HL;
|
||||
|
||||
# LB 22 Do not break before ellipses
|
||||
#
|
||||
$LB20NonBreaks $CM* $IN;
|
||||
^$CM+ $IN;
|
||||
|
||||
|
||||
# LB 23
|
||||
#
|
||||
($ALPlus | $HL) $CM* $NU;
|
||||
^$CM+ $NU; # Rule 10, any otherwise unattached CM behaves as AL
|
||||
$NU $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 23a
|
||||
#
|
||||
$PR $CM* ($ID | $EB | $EM);
|
||||
($ID | $EB | $EM) $CM* $PO;
|
||||
|
||||
|
||||
#
|
||||
# LB 24
|
||||
#
|
||||
($PR | $PO) $CM* ($ALPlus | $HL);
|
||||
($ALPlus | $HL) $CM* ($PR | $PO);
|
||||
^$CM+ ($PR | $PO); # Rule 10, any otherwise unattached CM behaves as AL
|
||||
|
||||
#
|
||||
# LB 25 Numbers.
|
||||
#
|
||||
(($PR | $PO) $CM*)? (($OP | $HY) $CM*)? ($IS $CM*)? $NU ($CM* ($NU | $SY | $IS))*
|
||||
($CM* ($CL | $CP))? ($CM* ($PR | $PO))?;
|
||||
|
||||
# LB 26 Do not break a Korean syllable
|
||||
#
|
||||
$JL $CM* ($JL | $JV | $H2 | $H3);
|
||||
($JV | $H2) $CM* ($JV | $JT);
|
||||
($JT | $H3) $CM* $JT;
|
||||
|
||||
# LB 27 Treat Korean Syllable Block the same as ID (don't break it)
|
||||
($JL | $JV | $JT | $H2 | $H3) $CM* $PO;
|
||||
$PR $CM* ($JL | $JV | $JT | $H2 | $H3);
|
||||
|
||||
|
||||
# LB 28 Do not break between alphabetics
|
||||
#
|
||||
($ALPlus | $HL) $CM* ($ALPlus | $HL);
|
||||
^$CM+ ($ALPlus | $HL); # The $CM+ is from rule 10, an unattached CM is treated as AL
|
||||
|
||||
# LB 29
|
||||
$IS $CM* ($ALPlus | $HL);
|
||||
|
||||
# LB 30
|
||||
($ALPlus | $HL | $NU) $CM* $OP30;
|
||||
^$CM+ $OP30; # The $CM+ is from rule 10, an unattached CM is treated as AL.
|
||||
$CP30 $CM* ($ALPlus | $HL | $NU);
|
||||
|
||||
# LB 30a Do not break between regional indicators. Break after pairs of them.
|
||||
# Tricky interaction with LB8a: ZWJ x . together with ZWJ acting like a CM.
|
||||
$RI $CM* $RI / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$CM-$ZWJ] / [[^$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $CM]];
|
||||
$RI $CM* $RI $CM* [$BK $CR $LF $NL $SP $ZW $WJ $CL $CP $EX $IS $SY $GL $QU $BA $HY $NS $IN $ZWJ {eof}];
|
||||
# note: the preceding rule includes {eof} rather than having the last [set] term qualified with '?'
|
||||
# because of the chain-out behavior difference. The rule must chain out only from the [set characters],
|
||||
# not from the preceding $RI or $CM, which it would be able to do if the set were optional.
|
||||
|
||||
# LB30b Do not break between an emoji base (or potential emoji) and an emoji modifier.
|
||||
$EB $CM* $EM;
|
||||
$ExtPictUnassigned $CM* $EM;
|
||||
|
||||
# LB 31 Break everywhere else.
|
||||
# Match a single code point if no other rule applies.
|
||||
.;
|
|
@ -138,44 +138,7 @@ export ICU4C_DIR=$HOME/icu-myfork/icu4c
|
|||
export ICU4J_ROOT=$HOME/icu-myfork/icu4j
|
||||
export TOOLS_ROOT=$HOME/icu-myfork/tools
|
||||
|
||||
# 1d. Directory for logs/notes (create if does not exist)
|
||||
|
||||
export NOTES=...(some directory)...
|
||||
mkdir -p $NOTES
|
||||
|
||||
# 2a. Configure ICU4C, build and test without new data first, to verify that
|
||||
# there are no pre-existing errors. Here <platform> is the runConfigureICU
|
||||
# code for the platform you are building, e.g. Linux, MacOSX, Cygwin.
|
||||
# (optionally build with debug enabled)
|
||||
|
||||
cd $ICU4C_DIR/source
|
||||
./runConfigureICU [--enable-debug] <platform>
|
||||
make clean
|
||||
make check 2>&1 | tee $NOTES/icu4c-oldData-makeCheck.txt
|
||||
|
||||
# 2b. Now with ICU4J, build and test without new data first, to verify that
|
||||
# there are no pre-existing errors (or at least to have the pre-existing errors
|
||||
# as a base for comparison):
|
||||
|
||||
cd $ICU4J_ROOT
|
||||
ant clean
|
||||
ant check 2>&1 | tee $NOTES/icu4j-oldData-antCheck.txt
|
||||
|
||||
# 3. Make pre-adjustments as necessary
|
||||
# 3a. Copy latest relevant CLDR dtds to ICU
|
||||
cp -p $CLDR_DIR/common/dtd/ldml.dtd $ICU4C_DIR/source/data/dtd/cldr/common/dtd/
|
||||
cp -p $CLDR_DIR/common/dtd/ldmlICU.dtd $ICU4C_DIR/source/data/dtd/cldr/common/dtd/
|
||||
|
||||
# 3b. Update the cldr-icu tooling to use the latest tagged version of ICU
|
||||
open $TOOLS_ROOT/cldr/cldr-to-icu/pom.xml
|
||||
# search for icu4j-for-cldr and update to the latest tagged version per instructions
|
||||
|
||||
# 3c. Update the build for any new icu version, added locales, etc.
|
||||
open $TOOLS_ROOT/cldr/cldr-to-icu/build-icu-data.xml
|
||||
# update icuVersion, icuDataVersion if necessary
|
||||
# update lists of locales to include if necessary
|
||||
|
||||
# 4. Build and install the CLDR jar
|
||||
# 2. Build and install the CLDR jar
|
||||
|
||||
cd $TOOLS_ROOT/cldr
|
||||
ant install-cldr-libs
|
||||
|
@ -183,7 +146,16 @@ ant install-cldr-libs
|
|||
See the $TOOLS_ROOT/cldr/lib/README.txt file for more information on the CLDR
|
||||
jar and the install-cldr-jars.sh script.
|
||||
|
||||
# 5a. Generate the CLDR production data. This process uses ant with ICU's
|
||||
# 3. Configure ICU4C, build and test without new data first, to verify that
|
||||
# there are no pre-existing errors. Here <platform> is the runConfigureICU
|
||||
# code for the platform you are building, e.g. Linux, MacOSX, Cygwin.
|
||||
|
||||
cd $ICU4C_DIR/source
|
||||
./runConfigureICU <platform>
|
||||
make clean
|
||||
make check 2>&1 | tee /tmp/icu4c-oldData-makeCheck.txt
|
||||
|
||||
# 4a. Generate the CLDR production data. This process uses ant with ICU's
|
||||
# data/build.xml
|
||||
#
|
||||
# Running "ant cleanprod" is necessary to clean out the production data directory
|
||||
|
@ -195,9 +167,9 @@ jar and the install-cldr-jars.sh script.
|
|||
cd $ICU4C_DIR/source/data
|
||||
ant cleanprod
|
||||
ant setup
|
||||
ant proddata 2>&1 | tee $NOTES/cldr-newData-proddataLog.txt
|
||||
ant proddata 2>&1 | tee /tmp/cldr-newData-proddataLog.txt
|
||||
|
||||
# 5b. Build the new ICU4C data files; these include .txt files and .py files.
|
||||
# 4b. Build the new ICU4C data files; these include .txt files and .py files.
|
||||
# These new files will replace whatever was already present in the ICU4C sources.
|
||||
# This process uses the LdmlConverter in $TOOLS_ROOT/cldr/cldr-to-icu/;
|
||||
# see $TOOLS_ROOT/cldr/cldr-to-icu/README.txt
|
||||
|
@ -215,58 +187,59 @@ ant proddata 2>&1 | tee $NOTES/cldr-newData-proddataLog.txt
|
|||
# build-icu-data.xml file, such as adding new locales etc.
|
||||
|
||||
cd $TOOLS_ROOT/cldr/cldr-to-icu
|
||||
ant -f build-icu-data.xml -DcldrDataDir="$CLDR_TMP_DIR/production" | tee $NOTES/cldr-newData-builddataLog.txt
|
||||
ant -f build-icu-data.xml -DcldrDataDir="$CLDR_TMP_DIR/production" | tee /tmp/cldr-newData-builddataLog.txt
|
||||
|
||||
# 5c. Update the CLDR testData files needed by ICU4C and ICU4J tests, ensuring
|
||||
# 4c. Update the CLDR testData files needed by ICU4C and ICU4J tests, ensuring
|
||||
# they're representative of the newest CLDR data.
|
||||
|
||||
cd $TOOLS_ROOT/cldr
|
||||
ant copy-cldr-testdata
|
||||
|
||||
# 5d. Copy from CLDR common/testData/localeIdentifiers/localeCanonicalization.txt
|
||||
# 4d. Copy from CLDR common/testData/localeIdentifiers/localeCanonicalization.txt
|
||||
# into icu4c/source/test/testdata/localeCanonicalization.txt
|
||||
# and icu4j/main/tests/core/src/com/ibm/icu/dev/data/unicode/localeCanonicalization.txt
|
||||
# and add the following line to the beginning of these two files
|
||||
# # File copied from cldr common/testData/localeIdentifiers/localeCanonicalization.txt
|
||||
|
||||
# 5e. For the time being, manually re-add the lstm entries in data/brkitr/root.txt
|
||||
# 4e. For the time being, manually re-add the lstm entries in data/brkitr/root.txt
|
||||
|
||||
open $ICU4C_DIR/source/data/brkitr/root.txt
|
||||
|
||||
# paste the following block after the dictionaries block and before the final closing '}':
|
||||
# paste the following block at the end, after the dictionaries block>
|
||||
lstm{
|
||||
Thai{"Thai_graphclust_model4_heavy.res"}
|
||||
Mymr{"Burmese_graphclust_model5_heavy.res"}
|
||||
}
|
||||
|
||||
# 6. Check which data files have modifications, which have been added or removed
|
||||
# 5. Check which data files have modifications, which have been added or removed
|
||||
# (if there are no changes, you may not need to proceed further). Make sure the
|
||||
# list seems reasonable.
|
||||
|
||||
cd $ICU4C_DIR/..
|
||||
cd $ICU4C_DIR/source/data
|
||||
git status
|
||||
|
||||
# 6a. You may also want to check which files were modified in CLDR production data:
|
||||
# 5a. You may also want to check which files were modified in CLDR production data:
|
||||
|
||||
cd $CLDR_TMP_DIR
|
||||
git status
|
||||
|
||||
# 7. Fix any errors, investigate any warnings.
|
||||
# 6. Fix any errors, investigate any warnings.
|
||||
#
|
||||
# Fixing may entail modifying CLDR source data or TOOLS_ROOT config files or
|
||||
# tooling.
|
||||
|
||||
# 8. Now rebuild ICU4C with the new data and run make check tests.
|
||||
# 7. Now rebuild ICU4C with the new data and run make check tests.
|
||||
# Again, keep a log so you can investigate the errors.
|
||||
cd $ICU4C_DIR/source
|
||||
|
||||
# 8a. If any files were added or removed (likely), re-run configure:
|
||||
./runConfigureICU [--enable-debug] <platform>
|
||||
# 7a. If any files were added or removed (likely), re-run configure:
|
||||
./runConfigureICU <platform>
|
||||
make clean
|
||||
|
||||
# 8b. Now do the rebuild.
|
||||
make check 2>&1 | tee $NOTES/icu4c-newData-makeCheck.txt
|
||||
# 7b. Now do the rebuild.
|
||||
make check 2>&1 | tee /tmp/icu4c-newData-makeCheck.txt
|
||||
|
||||
# 9. Investigate each test case failure. The first run processing new CLDR data
|
||||
# 8. Investigate each test case failure. The first run processing new CLDR data
|
||||
# from the Survey Tool can result in thousands of failures (in many cases, one
|
||||
# CLDR data fix can resolve hundreds of test failures). If the error is caused
|
||||
# by bad CLDR data, then file a CLDR bug, fix the data, and regenerate from
|
||||
|
@ -276,9 +249,9 @@ make check 2>&1 | tee $NOTES/icu4c-newData-makeCheck.txt
|
|||
# Note that if the new data has any differences in structure, you will have to
|
||||
# update test/testdata/structLocale.txt or /tsutil/cldrtest/TestLocaleStructure
|
||||
# may fail.
|
||||
# Repeat steps 4-8 until there are no errors.
|
||||
# Repeat steps 4-7 until there are no errors.
|
||||
|
||||
# 10. You can also run the make check tests in exhaustive mode. As an alternative
|
||||
# 9. You can also run the make check tests in exhaustive mode. As an alternative
|
||||
# you can run them as part of the pre-merge tests by adding the following as a
|
||||
# comment in the pull request: "/azp run CI-Exhaustive". You should do one or the
|
||||
# other; the exhaustive tests are *not* run automatically on each pull request,
|
||||
|
@ -287,10 +260,17 @@ make check 2>&1 | tee $NOTES/icu4c-newData-makeCheck.txt
|
|||
cd $ICU4C_DIR/source
|
||||
export INTLTEST_OPTS="-e"
|
||||
export CINTLTST_OPTS="-e"
|
||||
make check 2>&1 | tee $NOTES/icu4c-newData-makeCheckEx.txt
|
||||
make check 2>&1 | tee /tmp/icu4c-newData-makeCheckEx.txt
|
||||
|
||||
# 11. Again, investigate each failure, fixing CLDR data or ICU test cases as
|
||||
# appropriate, and repeating steps 4-8 and 10 until there are no errors.
|
||||
# 10. Again, investigate each failure, fixing CLDR data or ICU test cases as
|
||||
# appropriate, and repeating steps 4-7 and 9 until there are no errors.
|
||||
|
||||
# 11. Now with ICU4J, build and test without new data first, to verify that
|
||||
# there are no pre-existing errors (or at least to have the pre-existing errors
|
||||
# as a base for comparison):
|
||||
|
||||
cd $ICU4J_ROOT
|
||||
ant check 2>&1 | tee /tmp/icu4j-oldData-antCheck.txt
|
||||
|
||||
# 12. Transfer the data to ICU4J:
|
||||
cd $ICU4C_DIR/source
|
||||
|
@ -311,7 +291,7 @@ make icu4j-data-install
|
|||
# Keep a log so you can investigate the errors.
|
||||
|
||||
cd $ICU4J_ROOT
|
||||
ant check 2>&1 | tee $NOTES/icu4j-newData-antCheck.txt
|
||||
ant check 2>&1 | tee /tmp/icu4j-newData-antCheck.txt
|
||||
|
||||
# 14. Investigate test case failures; fix test cases and repeat from step 12,
|
||||
# or fix CLDR data and repeat from step 4, as appropriate, until there are no
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
// Generated using tools/cldr/cldr-to-icu/build-icu-data.xml
|
||||
|
||||
{
|
||||
"cldrVersion": "41",
|
||||
"cldrVersion": "40",
|
||||
"aliases": {
|
||||
"ars": "ar_SA",
|
||||
"in": "id",
|
||||
|
|
|
@ -5,7 +5,7 @@ af{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"&N<<<ʼn"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ am{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"[reorder Ethi]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ ar{
|
|||
"&ت<<ة<<<ﺔ<<<ﺓ"
|
||||
"&ي<<ى<<<ﯨ<<<ﯩ<<<ﻰ<<<ﻯ<<<ﲐ<<<ﱝ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{
|
||||
|
@ -397,7 +397,7 @@ ar{
|
|||
"&ۓ=ﮰ=ﮱ"
|
||||
"&ۀ=ﮤ=ﮥ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ as{
|
|||
"&[before 1]ত<ৎ=ত্\u200D"
|
||||
"&হ<ক্ষ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ az{
|
|||
"[import az-u-co-standard]"
|
||||
"[reorder others]"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{
|
||||
|
@ -26,7 +26,7 @@ az{
|
|||
"&H<x<<<X"
|
||||
"&Z<w<<<W"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ be{
|
|||
"&Е<ё<<<Ё"
|
||||
"&у<ў<<<Ў"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ bg{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"[reorder Cyrl]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ bn{
|
|||
"[reorder Beng Deva Guru Gujr Orya Taml Telu Knda Mlym Sinh]"
|
||||
"&ঔ<ং<ঃ<ঁ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
traditional{
|
||||
Sequence{
|
||||
|
@ -629,7 +629,7 @@ bn{
|
|||
"&যৌ<<<য়ৌ"
|
||||
"&য্<<<য়্"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -65,7 +65,7 @@ bo{
|
|||
"&ྲཱྀ=ཷ"
|
||||
"&ླཱྀ=ཹ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ br{
|
|||
"&C<ch<<<Ch<<<CH<c''h=c\u02BCh=c\u2019h<<<C''h=C\u02BCh=C\u2019h<<<C'"
|
||||
"'H=C\u02BCH=C\u2019H"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,11 +5,11 @@ bs{
|
|||
collations{
|
||||
search{
|
||||
Sequence{"[import hr-u-co-search]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{"[import hr]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ bs_Cyrl{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"[import sr]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ ca{
|
|||
"[import und-u-co-search]"
|
||||
"&L<ŀ=l·<<<Ŀ=L·"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ ceb{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"&N<ñ<<<Ñ<ng<<<Ng<<<NG"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ chr{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"[reorder Cher]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ cs{
|
|||
"&S<š<<<Š"
|
||||
"&Z<ž<<<Ž"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ cy{
|
|||
"&R<rh<<<Rh<<<RH"
|
||||
"&T<th<<<Th<<<TH"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ da{
|
|||
"[import da-u-co-standard]"
|
||||
"[caseFirst off]"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{
|
||||
|
@ -21,7 +21,7 @@ da{
|
|||
"&[before 1]ǀ<æ<<<Æ<<ä<<<Ä<ø<<<Ø<<ö<<<Ö<<ő<<<Ő<å<<<Å<<<aa<<<Aa<<<AA"
|
||||
"&oe<<œ<<<Œ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,14 +9,14 @@ de{
|
|||
"&OE<<ö<<<Ö"
|
||||
"&UE<<ü<<<Ü"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
search{
|
||||
Sequence{
|
||||
"[import und-u-co-search]"
|
||||
"[import de-u-co-phonebk]"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,7 +10,7 @@ de_AT{
|
|||
"&u<ü<<<Ü"
|
||||
"&ss<ß<<<ẞ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -14,7 +14,7 @@ dsb{
|
|||
"&S<š<<<Š<ś<<<Ś"
|
||||
"&Z<ž<<<Ž<ź<<<Ź"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ ee{
|
|||
"&T<ts<<<Ts<<<TS"
|
||||
"&V<ʋ<<<Ʋ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ el{
|
|||
"[normalization on]"
|
||||
"[reorder Grek]"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ en_US_POSIX{
|
|||
"&A<*'\u0020'-'/'<*0-'@'<*ABCDEFGHIJKLMNOPQRSTUVWXYZ<*'['-'`'<*abcdefghijklmnopqrstuvwxyz"
|
||||
"<*'{'-'\u007F'"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,7 +12,7 @@ eo{
|
|||
"&S<ŝ<<<Ŝ"
|
||||
"&U<ŭ<<<Ŭ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,11 +8,11 @@ es{
|
|||
"[import und-u-co-search]"
|
||||
"&N<ñ<<<Ñ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{"&N<ñ<<<Ñ"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
traditional{
|
||||
Sequence{
|
||||
|
@ -20,7 +20,7 @@ es{
|
|||
"&C<ch<<<Ch<<<CH"
|
||||
"&l<ll<<<Ll<<<LL"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ et{
|
|||
"&[before 1]T<š<<<Š<z<<<Z<ž<<<Ž"
|
||||
"&[before 1]X<õ<<<Õ<ä<<<Ä<ö<<<Ö<ü<<<Ü"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ fa{
|
|||
"&ۏ<ه<<ە<<ہ<<ة<<ۃ<<ۀ<<ھ"
|
||||
"&ی<<*ىےيېۑۍێ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ fa_AF{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"[import ps]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -143,7 +143,7 @@ ff_Adlm{
|
|||
"&𞤵<𞤵𞥅"
|
||||
"&𞤵𞥅<<𞤵𞤵"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ fi{
|
|||
"[import und-u-co-search]"
|
||||
"[import fi-u-co-trad]"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{
|
||||
|
@ -20,7 +20,7 @@ fi{
|
|||
"&Z\u0335<<ʒ<<<Ʒ"
|
||||
"&[before 1]ǀ<å<<<Å<ä<<<Ä<<æ<<<Æ<ö<<<Ö<<ø<<<Ø"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
traditional{
|
||||
Sequence{
|
||||
|
@ -31,7 +31,7 @@ fi{
|
|||
"&Y<<ü<<<Ü<<ű<<<Ű"
|
||||
"&[before 1]ǀ<å<<<Å<ä<<<Ä<<æ<<<Æ<ö<<<Ö<<ø<<<Ø<<ő<<<Ő<<õ<<<Õ<<œ<<<Œ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ fil{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"&N<ñ<<<Ñ<ng<<<Ng<<<NG"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ fo{
|
|||
"[import und-u-co-search]"
|
||||
"[import fo-u-co-standard]"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{
|
||||
|
@ -18,7 +18,7 @@ fo{
|
|||
"&Y<<ü<<<Ü<<ű<<<Ű"
|
||||
"&[before 1]ǀ<æ<<<Æ<<ä<<<Ä<<ę<<<Ę<ø<<<Ø<<ö<<<Ö<<ő<<<Ő<<œ<<<Œ<å<<<Å<<<aa<<<Aa<<<AA"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,7 +5,7 @@ fr_CA{
|
|||
collations{
|
||||
standard{
|
||||
Sequence{"[backwards 2]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -5,11 +5,11 @@ gl{
|
|||
collations{
|
||||
search{
|
||||
Sequence{"[import es-u-co-search]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{"[import es]"}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ gu{
|
|||
"[reorder Gujr Deva Beng Guru Orya Taml Telu Knda Mlym Sinh]"
|
||||
"&ૐ<ં<<ઁ<ઃ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,7 +12,7 @@ ha{
|
|||
"&T<ts<<<Ts<<<TS"
|
||||
"&Y<ƴ<<<ʼy<<<''y<<<Ƴ<<<ʼY<<<''Y"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -8,7 +8,7 @@ haw{
|
|||
"&a<e<<<E<i<<<I<o<<<O<u<<<U"
|
||||
"&w<ʻ"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ he{
|
|||
"&״"
|
||||
"<<'\u0022'"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
standard{
|
||||
Sequence{
|
||||
|
@ -20,7 +20,7 @@ he{
|
|||
"&[before 2]''<<׳"
|
||||
"&[before 2]'\u0022'<<״"
|
||||
}
|
||||
Version{"41"}
|
||||
Version{"40"}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Some files were not shown because too many files changed in this diff.