// gecko-dev/intl/unicharutil/util/GreekCasing.cpp
// (listing metadata: 318 lines, 9.8 KiB, C++)

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "GreekCasing.h"
#include "nsUnicharUtils.h"
#include "nsUnicodeProperties.h"
// Code points used by the custom Greek uppercase mapping; see bug 307039 for
// details.
//
// Naming: *_TONOS constants are the accented forms in the 0x03xx range and
// *_OXIA constants are the accented forms in the 0x1Fxx range. Each vowel's
// lowercase and uppercase forms are listed together.

// Alpha
#define GREEK_LOWER_ALPHA                   0x03B1
#define GREEK_LOWER_ALPHA_TONOS             0x03AC
#define GREEK_LOWER_ALPHA_OXIA              0x1F71
#define GREEK_UPPER_ALPHA                   0x0391
#define GREEK_UPPER_ALPHA_TONOS             0x0386
#define GREEK_UPPER_ALPHA_OXIA              0x1FBB

// Epsilon
#define GREEK_LOWER_EPSILON                 0x03B5
#define GREEK_LOWER_EPSILON_TONOS           0x03AD
#define GREEK_LOWER_EPSILON_OXIA            0x1F73
#define GREEK_UPPER_EPSILON                 0x0395
#define GREEK_UPPER_EPSILON_TONOS           0x0388
#define GREEK_UPPER_EPSILON_OXIA            0x1FC9

// Eta
#define GREEK_LOWER_ETA                     0x03B7
#define GREEK_LOWER_ETA_TONOS               0x03AE
#define GREEK_LOWER_ETA_OXIA                0x1F75
#define GREEK_UPPER_ETA                     0x0397
#define GREEK_UPPER_ETA_TONOS               0x0389
#define GREEK_UPPER_ETA_OXIA                0x1FCB

// Iota (including the forms carrying a dialytika)
#define GREEK_LOWER_IOTA                    0x03B9
#define GREEK_LOWER_IOTA_TONOS              0x03AF
#define GREEK_LOWER_IOTA_OXIA               0x1F77
#define GREEK_LOWER_IOTA_DIALYTIKA          0x03CA
#define GREEK_LOWER_IOTA_DIALYTIKA_TONOS    0x0390
#define GREEK_LOWER_IOTA_DIALYTIKA_OXIA     0x1FD3
#define GREEK_UPPER_IOTA                    0x0399
#define GREEK_UPPER_IOTA_TONOS              0x038A
#define GREEK_UPPER_IOTA_OXIA               0x1FDB
#define GREEK_UPPER_IOTA_DIALYTIKA          0x03AA

// Omicron
#define GREEK_LOWER_OMICRON                 0x03BF
#define GREEK_LOWER_OMICRON_TONOS           0x03CC
#define GREEK_LOWER_OMICRON_OXIA            0x1F79
#define GREEK_UPPER_OMICRON                 0x039F
#define GREEK_UPPER_OMICRON_TONOS           0x038C
#define GREEK_UPPER_OMICRON_OXIA            0x1FF9

// Upsilon (including the forms carrying a dialytika)
#define GREEK_LOWER_UPSILON                 0x03C5
#define GREEK_LOWER_UPSILON_TONOS           0x03CD
#define GREEK_LOWER_UPSILON_OXIA            0x1F7B
#define GREEK_LOWER_UPSILON_DIALYTIKA       0x03CB
#define GREEK_LOWER_UPSILON_DIALYTIKA_TONOS 0x03B0
#define GREEK_LOWER_UPSILON_DIALYTIKA_OXIA  0x1FE3
#define GREEK_UPPER_UPSILON                 0x03A5
#define GREEK_UPPER_UPSILON_TONOS           0x038E
#define GREEK_UPPER_UPSILON_OXIA            0x1FEB
#define GREEK_UPPER_UPSILON_DIALYTIKA       0x03AB

// Omega
#define GREEK_LOWER_OMEGA                   0x03C9
#define GREEK_LOWER_OMEGA_TONOS             0x03CE
#define GREEK_LOWER_OMEGA_OXIA              0x1F7D
#define GREEK_UPPER_OMEGA                   0x03A9
#define GREEK_UPPER_OMEGA_TONOS             0x038F
#define GREEK_UPPER_OMEGA_OXIA              0x1FFB

// Combining marks
#define COMBINING_ACUTE_ACCENT              0x0301
#define COMBINING_DIAERESIS                 0x0308
#define COMBINING_ACUTE_TONE_MARK           0x0341
#define COMBINING_GREEK_DIALYTIKA_TONOS     0x0344
namespace mozilla {

// Greek-aware uppercasing of a single code point, driven by a small state
// machine that the caller threads through successive calls via aState.
//
// aCh              - the code point to convert.
// aState           - in/out: the state left by the previous call; updated to
//                    reflect aCh (which vowel was seen, whether it carried an
//                    accent, whether we are inside a word, ...).
// aMarkEtaPos      - out: set to true only when an eta with tonos is seen
//                    while aState is kStart; in that case the accented
//                    capital is returned for now and the state becomes
//                    kEtaAccMarked.
// aUpdateMarkedEta - out: set to true only when the incoming state is
//                    kEtaAccMarked and aCh is a letter or a mark.
//                    NOTE(review): presumably the caller then strips the
//                    tonos from the eta it marked earlier - confirm at the
//                    call site.
//
// Returns the uppercase code point, or uint32_t(-1) when aCh is an acute
// accent that must be omitted from the result string.
uint32_t
GreekCasing::UpperCase(uint32_t aCh, GreekCasing::State& aState,
                       bool& aMarkEtaPos, bool& aUpdateMarkedEta)
{
  // Sentinel telling the caller to omit this char from the result string.
  const uint32_t omitChar = uint32_t(-1);

  aMarkEtaPos = false;
  aUpdateMarkedEta = false;

  // Classify aCh once: letters and marks keep us inside a word, every other
  // category takes us back to the start state.
  const uint8_t category = unicode::GetGeneralCategory(aCh);
  bool isLetterOrMark = false;
  switch (category) {
    case HB_UNICODE_GENERAL_CATEGORY_LOWERCASE_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_MODIFIER_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_OTHER_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_TITLECASE_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_UPPERCASE_LETTER:
    case HB_UNICODE_GENERAL_CATEGORY_SPACING_MARK:
    case HB_UNICODE_GENERAL_CATEGORY_ENCLOSING_MARK:
    case HB_UNICODE_GENERAL_CATEGORY_NON_SPACING_MARK:
      isLetterOrMark = true;
      break;
    default:
      break;
  }

  // The previous call saw an eta with tonos in the start state and marked it.
  // Report whether a letter or mark follows it, then continue as if an
  // ordinary accented eta had been seen.
  if (aState == kEtaAccMarked) {
    aUpdateMarkedEta = isLetterOrMark;
    aState = kEtaAcc;
  }

  switch (aCh) {
    // --- unaccented vowels: just remember which vowel was seen ---
    case GREEK_LOWER_ALPHA:
    case GREEK_UPPER_ALPHA:
      aState = kAlpha;
      return GREEK_UPPER_ALPHA;

    case GREEK_LOWER_EPSILON:
    case GREEK_UPPER_EPSILON:
      aState = kEpsilon;
      return GREEK_UPPER_EPSILON;

    case GREEK_LOWER_ETA:
    case GREEK_UPPER_ETA:
      aState = kEta;
      return GREEK_UPPER_ETA;

    case GREEK_UPPER_IOTA:
      aState = kIota;
      return GREEK_UPPER_IOTA;

    case GREEK_LOWER_OMICRON:
    case GREEK_UPPER_OMICRON:
      aState = kOmicron;
      return GREEK_UPPER_OMICRON;

    case GREEK_UPPER_UPSILON:
      if (aState == kOmicron) {
        aState = kOmicronUpsilon;
      } else {
        aState = kUpsilon;
      }
      return GREEK_UPPER_UPSILON;

    case GREEK_LOWER_OMEGA:
    case GREEK_UPPER_OMEGA:
      aState = kOmega;
      return GREEK_UPPER_OMEGA;

    // --- lowercase iota / upsilon: possibly the second vowel of a
    //     diphthong, in which case a dialytika is added after an accented
    //     first vowel ---
    case GREEK_LOWER_IOTA:
      if (aState == kAlphaAcc || aState == kEpsilonAcc ||
          aState == kOmicronAcc || aState == kUpsilonAcc) {
        aState = kInWord;
        return GREEK_UPPER_IOTA_DIALYTIKA;
      }
      aState = kIota;
      return GREEK_UPPER_IOTA;

    case GREEK_LOWER_UPSILON:
      if (aState == kAlphaAcc || aState == kEpsilonAcc ||
          aState == kEtaAcc || aState == kOmicronAcc) {
        aState = kInWord;
        return GREEK_UPPER_UPSILON_DIALYTIKA;
      }
      if (aState == kOmicron) {
        aState = kOmicronUpsilon;
      } else {
        aState = kUpsilon;
      }
      return GREEK_UPPER_UPSILON;

    // --- characters that already carry (or are) a diaeresis ---
    case GREEK_LOWER_IOTA_DIALYTIKA:
    case GREEK_UPPER_IOTA_DIALYTIKA:
    case GREEK_LOWER_UPSILON_DIALYTIKA:
    case GREEK_UPPER_UPSILON_DIALYTIKA:
    case COMBINING_DIAERESIS:
      aState = kDiaeresis;
      return ToUpperCase(aCh);

    // --- combining acute: dropped when it follows a vowel or a diaeresis,
    //     recording the accent in the state for diphthong detection ---
    case COMBINING_ACUTE_ACCENT:
    case COMBINING_ACUTE_TONE_MARK: {
      bool stripAccent = true;
      switch (aState) {
        case kAlpha:
          aState = kAlphaAcc;
          break;
        case kEpsilon:
          aState = kEpsilonAcc;
          break;
        case kEta:
          aState = kEtaAcc;
          break;
        case kIota:
          aState = kIotaAcc;
          break;
        case kOmicron:
          aState = kOmicronAcc;
          break;
        case kUpsilon:
          aState = kUpsilonAcc;
          break;
        case kOmega:
          aState = kOmegaAcc;
          break;
        case kOmicronUpsilon:  // this completed a diphthong
        case kDiaeresis:
          aState = kInWord;
          break;
        default:
          // Not preceded by a vowel or diaeresis: handled like any other
          // character below.
          stripAccent = false;
          break;
      }
      if (stripAccent) {
        return omitChar;
      }
      break;
    }

    // --- diaeresis + accent: keep the diaeresis, drop the accent, and
    //     go to kInWord so no diphthong is formed with a following vowel ---
    case GREEK_LOWER_IOTA_DIALYTIKA_TONOS:
    case GREEK_LOWER_IOTA_DIALYTIKA_OXIA:
      aState = kInWord;
      return GREEK_UPPER_IOTA_DIALYTIKA;

    case GREEK_LOWER_UPSILON_DIALYTIKA_TONOS:
    case GREEK_LOWER_UPSILON_DIALYTIKA_OXIA:
      aState = kInWord;
      return GREEK_UPPER_UPSILON_DIALYTIKA;

    case COMBINING_GREEK_DIALYTIKA_TONOS:
      aState = kInWord;
      return COMBINING_DIAERESIS;

    // --- accented vowels: drop the accent but remember the vowel, so that a
    //     following iota/upsilon can receive a dialytika ---
    case GREEK_LOWER_ALPHA_TONOS:
    case GREEK_LOWER_ALPHA_OXIA:
    case GREEK_UPPER_ALPHA_TONOS:
    case GREEK_UPPER_ALPHA_OXIA:
      aState = kAlphaAcc;
      return GREEK_UPPER_ALPHA;

    case GREEK_LOWER_EPSILON_TONOS:
    case GREEK_LOWER_EPSILON_OXIA:
    case GREEK_UPPER_EPSILON_TONOS:
    case GREEK_UPPER_EPSILON_OXIA:
      aState = kEpsilonAcc;
      return GREEK_UPPER_EPSILON;

    case GREEK_LOWER_ETA_TONOS:
    case GREEK_UPPER_ETA_TONOS:
    case GREEK_LOWER_ETA_OXIA:
    case GREEK_UPPER_ETA_OXIA:
      // Only the tonos forms, and only in the start state, are treated as a
      // disjunctive eta for now: the accent is kept and the position is
      // marked in case the tonos has to be removed later.
      if (aState == kStart &&
          (aCh == GREEK_LOWER_ETA_TONOS || aCh == GREEK_UPPER_ETA_TONOS)) {
        aState = kEtaAccMarked;
        aMarkEtaPos = true;
        return GREEK_UPPER_ETA_TONOS;
      }
      // Everything else simply loses the accent.
      aState = kEtaAcc;
      return GREEK_UPPER_ETA;

    case GREEK_LOWER_IOTA_TONOS:
    case GREEK_LOWER_IOTA_OXIA:
    case GREEK_UPPER_IOTA_TONOS:
    case GREEK_UPPER_IOTA_OXIA:
      aState = kIotaAcc;
      return GREEK_UPPER_IOTA;

    case GREEK_LOWER_OMICRON_TONOS:
    case GREEK_LOWER_OMICRON_OXIA:
    case GREEK_UPPER_OMICRON_TONOS:
    case GREEK_UPPER_OMICRON_OXIA:
      aState = kOmicronAcc;
      return GREEK_UPPER_OMICRON;

    case GREEK_LOWER_UPSILON_TONOS:
    case GREEK_LOWER_UPSILON_OXIA:
    case GREEK_UPPER_UPSILON_TONOS:
    case GREEK_UPPER_UPSILON_OXIA:
      if (aState == kOmicron) {
        aState = kInWord;  // this completed a diphthong
      } else {
        aState = kUpsilonAcc;
      }
      return GREEK_UPPER_UPSILON;

    case GREEK_LOWER_OMEGA_TONOS:
    case GREEK_LOWER_OMEGA_OXIA:
    case GREEK_UPPER_OMEGA_TONOS:
    case GREEK_UPPER_OMEGA_OXIA:
      aState = kOmegaAcc;
      return GREEK_UPPER_OMEGA;
  }

  // Any character without special handling resets the state to kInWord
  // (letters and marks) or kStart (everything else) and takes the standard
  // uppercase mapping.
  if (isLetterOrMark) {
    aState = kInWord;
  } else {
    aState = kStart;
  }
  return ToUpperCase(aCh);
}

} // namespace mozilla