// Mirror of https://github.com/microsoft/STL.git (253 lines, 10 KiB, C++)
// codecvt facet for UTF-8 multibyte code, UCS wide-character code
|
|
|
|
// Copyright (c) Microsoft Corporation.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
|
|
#pragma once
|
|
#ifndef _CVT_UTF8_
|
|
#define _CVT_UTF8_
|
|
#include <yvals_core.h>
|
|
#if _STL_COMPILER_PREPROCESSOR
|
|
#include <codecvt>
|
|
#include <cwchar>
|
|
#include <locale>
|
|
|
|
#pragma pack(push, _CRT_PACKING)
|
|
#pragma warning(push, _STL_WARNING_LEVEL)
|
|
#pragma warning(disable : _STL_DISABLED_WARNINGS)
|
|
_STL_DISABLE_CLANG_WARNINGS
|
|
#pragma push_macro("new")
|
|
#undef new
|
|
|
|
#if !_HAS_IF_CONSTEXPR
|
|
#pragma warning(disable : 4127) // conditional expression is constant
|
|
#endif // !_HAS_IF_CONSTEXPR
|
|
|
|
namespace stdext {
|
|
namespace cvt {
|
|
|
|
// Conversion-state type used by codecvt_utf8 below. The facet reinterprets the state as bytes and uses the
// first byte as a flag: 0 until the first character has been converted, 1 afterwards.
using _Statype = _CSTD mbstate_t;
|
|
|
|
_STL_DISABLE_DEPRECATED_WARNING
|
|
// CLASS TEMPLATE codecvt_utf8
|
|
// Facet converting between wide characters of type _Elem and UTF-8 byte sequences.
//   _Elem    - wide-character type produced by do_in and consumed by do_out
//   _Maxcode - largest character value accepted; larger values make do_in/do_out return error
//   _Mode    - codecvt_mode flags; consume_header / generate_header control handling of the
//              UTF-8 header (bytes EF BB BF, i.e. the encoded value 0xfeff)
// The first byte of the conversion state is used as a flag: it is 0 until the first character has been
// processed (and the header, if any, consumed or generated) and 1 afterwards.
// Besides the usual 1- to 4-byte forms, lead bytes announcing 4 or 5 continuation bytes are both
// accepted by do_in and produced by do_out.
template <class _Elem, unsigned long _Maxcode = 0xffffffff, _STD codecvt_mode _Mode = _STD codecvt_mode{}>
class codecvt_utf8
    : public _STD codecvt<_Elem, char, _Statype> { // facet for converting between _Elem and UTF-8 byte sequences
public:
    using _Mybase     = _STD codecvt<_Elem, char, _Statype>;
    using result      = typename _Mybase::result;
    using _Byte       = char;
    using intern_type = _Elem;
    using extern_type = _Byte;
    using state_type  = _Statype;

    // _Refs is forwarded unchanged to the codecvt base-class constructor.
    explicit codecvt_utf8(size_t _Refs = 0) : _Mybase(_Refs) {}

    virtual ~codecvt_utf8() noexcept {}

protected:
    // Convert bytes [_First1, _Last1) to wide characters stored in [_First2, _Last2).
    // On return _Mid1 / _Mid2 tell how far the input / output have been advanced.
    // Returns:
    //   error   - a continuation byte (0x80-0xbf) appears where a lead byte is expected, a lead byte is
    //             followed by a byte that is not a continuation byte, or the decoded value exceeds _Maxcode
    //   partial - no input was consumed (nothing to read, no room to write, or the first sequence is truncated)
    //   ok      - at least one input byte was consumed
    virtual result do_in(_Statype& _State, const _Byte* _First1, const _Byte* _Last1, const _Byte*& _Mid1,
        _Elem* _First2, _Elem* _Last2, _Elem*& _Mid2) const override {
        // convert bytes [_First1, _Last1) to [_First2, _Last2)
        char* _Pstate = reinterpret_cast<char*>(&_State);
        _Mid1         = _First1;
        _Mid2         = _First2;

        while (_Mid1 != _Last1 && _Mid2 != _Last2) { // convert a multibyte sequence
            unsigned long _By = static_cast<unsigned char>(*_Mid1);
            unsigned long _Ch; // value being assembled
            int _Nextra; // number of continuation bytes announced by the lead byte

            if (_By < 0x80) {
                _Ch     = _By;
                _Nextra = 0;
            } else if (_By < 0xc0) { // 0x80-0xbf not first byte
                ++_Mid1;
                return _Mybase::error;
            } else if (_By < 0xe0) {
                _Ch     = _By & 0x1f;
                _Nextra = 1;
            } else if (_By < 0xf0) {
                _Ch     = _By & 0x0f;
                _Nextra = 2;
            } else if (_By < 0xf8) {
                _Ch     = _By & 0x07;
                _Nextra = 3;
            } else {
                _Ch     = _By & 0x03;
                _Nextra = _By < 0xfc ? 4 : 5;
            }

            if (_Nextra == 0) {
                ++_Mid1;
            } else if (_Last1 - _Mid1 < _Nextra + 1) {
                break; // not enough input
            } else {
                // fold in the low six bits of each continuation byte
                for (++_Mid1; 0 < _Nextra; --_Nextra, ++_Mid1) {
                    if ((_By = static_cast<unsigned char>(*_Mid1)) < 0x80 || 0xc0 <= _By) {
                        return _Mybase::error; // not continuation byte
                    } else {
                        _Ch = _Ch << 6 | (_By & 0x3f);
                    }
                }
            }

            if (*_Pstate == 0) { // first time, maybe look for and consume header
                *_Pstate = 1;

                if _CONSTEXPR_IF ((_Mode & _STD consume_header) != 0) {
                    if (_Ch == 0xfeff) { // drop header and retry
                        result _Ans = do_in(_State, _Mid1, _Last1, _Mid1, _First2, _Last2, _Mid2);

                        if (_Ans == _Mybase::partial) { // roll back header determination
                            *_Pstate = 0;
                            _Mid1    = _First1;
                        }
                        return _Ans;
                    }
                }
            }

            if (_Maxcode < _Ch) {
                return _Mybase::error; // code too large
            }

            *_Mid2++ = static_cast<_Elem>(_Ch);
        }

        return _First1 == _Mid1 ? _Mybase::partial : _Mybase::ok;
    }

    // Convert wide characters [_First1, _Last1) to UTF-8 bytes stored in [_First2, _Last2).
    // On return _Mid1 / _Mid2 tell how far the input / output have been advanced.
    // When _Mode contains generate_header, the bytes EF BB BF are written before the first character
    // converted with a fresh state.
    // Returns:
    //   error   - a character exceeds _Maxcode
    //   partial - no input was consumed (nothing to read, or no room for the first character and,
    //             if requested, the header preceding it)
    //   ok      - at least one character was converted
    virtual result do_out(_Statype& _State, const _Elem* _First1, const _Elem* _Last1, const _Elem*& _Mid1,
        _Byte* _First2, _Byte* _Last2, _Byte*& _Mid2) const override {
        // convert [_First1, _Last1) to bytes [_First2, _Last2)
        char* _Pstate = reinterpret_cast<char*>(&_State);
        _Mid1         = _First1;
        _Mid2         = _First2;

        while (_Mid1 != _Last1 && _Mid2 != _Last2) { // convert and put a wide char
            _Byte _By; // lead byte
            int _Nextra; // number of continuation bytes that follow the lead byte
            unsigned long _Ch = static_cast<unsigned long>(*_Mid1);

            if (_Maxcode < _Ch) {
                return _Mybase::error;
            }

            if (_Ch < 0x0080) {
                _By     = static_cast<_Byte>(_Ch);
                _Nextra = 0;
            } else if (_Ch < 0x0800) {
                _By     = static_cast<_Byte>(0xc0 | _Ch >> 6);
                _Nextra = 1;
            } else if (_Ch < 0x00010000) {
                _By     = static_cast<_Byte>(0xe0 | _Ch >> 12);
                _Nextra = 2;
            } else if (_Ch < 0x00200000) {
                _By     = static_cast<_Byte>(0xf0 | _Ch >> 18);
                _Nextra = 3;
            } else if (_Ch < 0x04000000) {
                _By     = static_cast<_Byte>(0xf8 | _Ch >> 24);
                _Nextra = 4;
            } else {
                _By     = static_cast<_Byte>(0xfc | (_Ch >> 30 & 0x03));
                _Nextra = 5;
            }

            if (*_Pstate == 0) { // first time, maybe generate header
                if _CONSTEXPR_IF ((_Mode & _STD generate_header) != 0) {
                    if (_Last2 - _Mid2 < 3 + 1 + _Nextra) {
                        // Not enough room for both. The state is deliberately left at 0 so that a retry
                        // with a larger buffer still emits the header; marking it as done here would
                        // silently drop the header.
                        return _Mybase::partial;
                    }

                    // prepend header
                    *_Mid2++ = '\xef';
                    *_Mid2++ = '\xbb';
                    *_Mid2++ = '\xbf';
                }

                *_Pstate = 1; // header (if requested) has been written
            }

            if (_Last2 - _Mid2 < 1 + _Nextra) {
                break; // not enough room for output
            }

            ++_Mid1;
            // lead byte, then six bits per continuation byte, most significant group first
            for (*_Mid2++ = _By; 0 < _Nextra;) {
                *_Mid2++ = static_cast<_Byte>((_Ch >> 6 * --_Nextra & 0x3f) | 0x80);
            }
        }

        return _First1 == _Mid1 ? _Mybase::partial : _Mybase::ok;
    }

    // Generate bytes to return to the default shift state: nothing is written, _Mid2 is set to _First2
    // and ok is returned.
    virtual result do_unshift(_Statype&, _Byte* _First2, _Byte*, _Byte*& _Mid2) const override {
        // generate bytes to return to default shift state
        _Mid2 = _First2;
        return _Mybase::ok;
    }

    // Return min(_Count, number of wide characters that bytes [_First1, _Last1) convert to).
    // Works on a copy of _State, converting one wide character at a time through do_in; the caller's
    // state object is not modified.
    virtual int do_length(
        _Statype& _State, const _Byte* _First1, const _Byte* _Last1, size_t _Count) const noexcept override {
        // return min(_Count, converted length of bytes [_First1, _Last1))
        size_t _Wchars    = 0;
        _Statype _Mystate = _State;

        while (_Wchars < _Count && _First1 != _Last1) { // convert another wide character
            const _Byte* _Mid1;
            _Elem* _Mid2;
            _Elem _Ch;

            switch (do_in(_Mystate, _First1, _Last1, _Mid1, &_Ch, &_Ch + 1,
                _Mid2)) { // test result of single wide-char conversion
            case _Mybase::noconv:
                return static_cast<int>(_Wchars + (_Last1 - _First1));

            case _Mybase::ok:
                if (_Mid2 == &_Ch + 1) {
                    ++_Wchars; // replacement do_in might not convert one
                }

                _First1 = _Mid1;
                break;

            default:
                return static_cast<int>(_Wchars); // error or partial
            }
        }

        return static_cast<int>(_Wchars);
    }

    virtual bool do_always_noconv() const noexcept override { // return true if conversions never change input
        return false;
    }

    // Return maximum length required for a conversion: 6 bytes for one character, plus 3 for the header
    // when either header flag is set in _Mode.
    virtual int do_max_length() const noexcept override { // return maximum length required for a conversion
        if _CONSTEXPR_IF ((_Mode & (_STD consume_header | _STD generate_header)) != 0) {
            return 9;
        } else {
            return 6;
        }
    }

    virtual int do_encoding() const noexcept override { // return length of code sequence (from codecvt)
        if _CONSTEXPR_IF ((_Mode & (_STD consume_header | _STD generate_header)) != 0) {
            return -1; // -1 => state dependent
        } else {
            return 0; // 0 => varying length
        }
    }
};
|
|
_STL_RESTORE_DEPRECATED_WARNING
|
|
} // namespace cvt
|
|
} // namespace stdext
|
|
|
|
#pragma pop_macro("new")
|
|
_STL_RESTORE_CLANG_WARNINGS
|
|
#pragma warning(pop)
|
|
#pragma pack(pop)
|
|
|
|
#endif // _STL_COMPILER_PREPROCESSOR
|
|
#endif // _CVT_UTF8_
|