// STL/stl/inc/cvt/utf16

// codecvt facet for UTF-16 multibyte code, UCS wide-character code

// Copyright (c) Microsoft Corporation.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#pragma once
#ifndef _CVT_UTF16_
#define _CVT_UTF16_
#include <yvals_core.h>
#if _STL_COMPILER_PREPROCESSOR
#include <codecvt>
#include <cwchar>
#include <locale>

#pragma pack(push, _CRT_PACKING)
#pragma warning(push, _STL_WARNING_LEVEL)
#pragma warning(disable : _STL_DISABLED_WARNINGS)
_STL_DISABLE_CLANG_WARNINGS
#pragma push_macro("new")
#undef new

#if !_HAS_IF_CONSTEXPR
#pragma warning(disable : 4127) // conditional expression is constant
#endif // !_HAS_IF_CONSTEXPR

namespace stdext {
    namespace cvt {


        // Conversion state type used as the state_type of codecvt_utf16 (an alias for mbstate_t).
        using _Statype = _CSTD mbstate_t;

        _STL_DISABLE_DEPRECATED_WARNING
        // CLASS TEMPLATE codecvt_utf16
        // Facet converting between a UTF-16 byte stream (external side, two bytes per 16-bit word) and elements of
        // type _Elem holding code points (internal side).
        //   _Elem    - internal character type.
        //   _Maxcode - largest code point accepted; larger values make do_in/do_out return error.
        //   _Mode    - bitmask of _STD codecvt_mode flags (little_endian, consume_header, generate_header).
        // The first byte of the conversion state records the byte order in effect: 0 means "not yet determined",
        // otherwise it holds _STD _Little_first or _STD _Big_first.
        template <class _Elem, unsigned long _Maxcode = 0x10ffff, _STD codecvt_mode _Mode = _STD codecvt_mode{}>
        class codecvt_utf16
            : public _STD
                  codecvt<_Elem, char, _Statype> { // facet for converting between _Elem and UTF-16 multibyte sequences
            enum { _Bytes_per_word = 2 }; // size in bytes of one UTF-16 word on the external side

        public:
            using _Mybase     = _STD codecvt<_Elem, char, _Statype>;
            using result      = typename _Mybase::result;
            using _Byte       = char;
            using intern_type = _Elem;
            using extern_type = _Byte;
            using state_type  = _Statype;

            // construct the facet, forwarding the reference-count argument to the base codecvt
            explicit codecvt_utf16(size_t _Refs = 0) : _Mybase(_Refs) {}

            virtual ~codecvt_utf16() noexcept {}

        protected:
            // Decode UTF-16 bytes [_First1, _Last1) into elements [_First2, _Last2).
            // On return _Mid1/_Mid2 point one past the last byte consumed / element written.
            // Returns:
            //   error   - a first-of-pair word is not followed by a second-of-pair word, or the decoded value
            //             exceeds _Maxcode
            //   partial - no input was consumed
            //   ok      - at least some input was consumed
            virtual result do_in(_Statype& _State, const _Byte* _First1, const _Byte* _Last1, const _Byte*& _Mid1,
                _Elem* _First2, _Elem* _Last2, _Elem*& _Mid2) const override {
                // convert bytes [_First1, _Last1) to [_First2, _Last2)
                char* _Pstate = reinterpret_cast<char*>(&_State); // first state byte holds the byte order
                _Mid1         = _First1;
                _Mid2         = _First2;

                while (_Bytes_per_word <= _Last1 - _Mid1 && _Mid2 != _Last2) { // convert a multibyte sequence
                    const auto _Ptr = reinterpret_cast<const unsigned char*>(_Mid1);
                    unsigned long _Ch;
                    unsigned short _Ch0;
                    unsigned short _Ch1;

                    if (*_Pstate == _STD _Little_first) {
                        _Ch0 = static_cast<unsigned short>(_Ptr[1] << 8 | _Ptr[0]);
                    } else if (*_Pstate == _STD _Big_first) {
                        _Ch0 = static_cast<unsigned short>(_Ptr[0] << 8 | _Ptr[1]);
                    } else { // no header seen yet, try preferred mode
                        constexpr unsigned char _Default_endian = static_cast<unsigned char>(
                            (_Mode & _STD little_endian) != 0 ? _STD _Little_first : _STD _Big_first);

                        if _CONSTEXPR_IF ((_Mode & _STD little_endian) != 0) {
                            _Ch0 = static_cast<unsigned short>(_Ptr[1] << 8 | _Ptr[0]);
                        } else {
                            _Ch0 = static_cast<unsigned short>(_Ptr[0] << 8 | _Ptr[1]);
                        }

                        if _CONSTEXPR_IF ((_Mode & _STD consume_header) == 0) {
                            *_Pstate = _Default_endian; // headers are not consumed, commit to preferred order
                        } else if (_Ch0 != 0xfeff && _Ch0 != 0xfffe) {
                            *_Pstate = _Default_endian; // first word is not a header, commit to preferred order
                        } else { // consume header, fixate on endianness, and retry
                            _Mid1 += _Bytes_per_word;
                            // 0xfeff read in the preferred order confirms that order; 0xfffe means the bytes are
                            // swapped, so select the other order (3 - x maps one endianness code to the other)
                            *_Pstate = static_cast<char>(
                                _Ch0 == 0xfeff ? _Default_endian : static_cast<unsigned char>(3 - _Default_endian));
                            result _Ans = do_in(_State, _Mid1, _Last1, _Mid1, _First2, _Last2, _Mid2);

                            if (_Ans == _Mybase::partial) { // not enough bytes, roll back header
                                *_Pstate = 0;
                                _Mid1    = _First1;
                            }
                            return _Ans;
                        }
                    }

                    if (_Ch0 < 0xd800 || 0xdc00 <= _Ch0) { // one word, consume bytes
                        _Mid1 += _Bytes_per_word;
                        _Ch = _Ch0;
                    } else if (_Last1 - _Mid1 < 2 * _Bytes_per_word) {
                        break; // first word of a pair, but the second word has not arrived yet
                    } else { // get second word
                        if (*_Pstate == _STD _Little_first) {
                            _Ch1 = static_cast<unsigned short>(_Ptr[3] << 8 | _Ptr[2]);
                        } else {
                            _Ch1 = static_cast<unsigned short>(_Ptr[2] << 8 | _Ptr[3]);
                        }

                        if (_Ch1 < 0xdc00 || 0xe000 <= _Ch1) {
                            return _Mybase::error; // second word is not in [0xdc00, 0xe000)
                        }

                        _Mid1 += 2 * _Bytes_per_word;
                        // combine the pair: high 10 bits (plus the 0x10000 offset, i.e. 0x40 << 10) and low 10 bits
                        _Ch = static_cast<unsigned long>(_Ch0 - 0xd800 + 0x0040) << 10 | (_Ch1 - 0xdc00);
                    }

                    if (_Maxcode < _Ch) {
                        return _Mybase::error; // code too large
                    }
                    *_Mid2++ = static_cast<_Elem>(_Ch);
                }

                return _First1 == _Mid1 ? _Mybase::partial : _Mybase::ok;
            }

            // Encode elements [_First1, _Last1) as UTF-16 bytes into [_First2, _Last2).
            // On return _Mid1/_Mid2 point one past the last element consumed / byte written.
            // On the first call with a zero state the byte order is fixed from _Mode and, when generate_header is
            // set, a header word is written first.
            // Returns:
            //   error   - an element exceeds min(_Maxcode, 0x10ffff) or lies in [0xd800, 0xdc00)
            //   partial - there is no room for the header plus one character, no room for a two-word character,
            //             or no element was consumed
            //   ok      - at least one element was consumed
            virtual result do_out(_Statype& _State, const _Elem* _First1, const _Elem* _Last1, const _Elem*& _Mid1,
                _Byte* _First2, _Byte* _Last2, _Byte*& _Mid2) const override {
                // convert [_First1, _Last1) to bytes [_First2, _Last2)
                char* _Pstate = reinterpret_cast<char*>(&_State); // first state byte holds the byte order
                _Mid1         = _First1;
                _Mid2         = _First2;

                if (*_Pstate == 0) { // determine endianness once, maybe generate header
                    if _CONSTEXPR_IF ((_Mode & _STD generate_header) != 0) {
                        if (_Last2 - _Mid2 < 3 * _Bytes_per_word) {
                            // Check for room BEFORE touching the state: a non-zero state would make the next call
                            // skip this branch, so the header would be silently lost when the caller retries.
                            return _Mybase::partial; // not enough room for both
                        }
                    }

                    if _CONSTEXPR_IF ((_Mode & _STD little_endian) != 0) {
                        *_Pstate = _STD _Little_first;
                    } else {
                        *_Pstate = _STD _Big_first;
                    }

                    if _CONSTEXPR_IF ((_Mode & _STD generate_header) != 0) {
                        if (*_Pstate == _STD _Little_first) { // put header LS byte first
                            *_Mid2++ = '\xff';
                            *_Mid2++ = '\xfe';
                        } else { // put header MS byte first
                            *_Mid2++ = '\xfe';
                            *_Mid2++ = '\xff';
                        }
                    }
                }

                while (_Mid1 != _Last1 && _Bytes_per_word <= _Last2 - _Mid2) { // convert and put a wide char
                    bool _Extra       = false; // true when the character needs a pair of words
                    unsigned long _Ch = static_cast<unsigned long>(*_Mid1++);

                    if ((_Maxcode < 0x10ffff ? _Maxcode : 0x10ffff) < _Ch) {
                        return _Mybase::error; // value too large
                    } else if (_Ch <= 0xffff) { // one word, can't be code for first of two
                        if (0xd800 <= _Ch && _Ch < 0xdc00) {
                            return _Mybase::error;
                        }
                    } else if (_Last2 - _Mid2 < 2 * _Bytes_per_word) { // not enough room for two-word output, back up
                        --_Mid1;
                        return _Mybase::partial;
                    } else {
                        _Extra = true;
                    }

                    if (*_Pstate == _STD _Little_first) {
                        if (!_Extra) { // put a single word LS byte first
                            *_Mid2++ = static_cast<_Byte>(_Ch);
                            *_Mid2++ = static_cast<_Byte>(_Ch >> 8);
                        } else { // put a pair of words LS byte first
                            // first word: 0xd800 | (high bits of _Ch with the 0x10000 offset removed)
                            unsigned short _Ch0 =
                                static_cast<unsigned short>(0xd800 | static_cast<unsigned short>(_Ch >> 10) - 0x0040);
                            *_Mid2++ = static_cast<_Byte>(_Ch0);
                            *_Mid2++ = static_cast<_Byte>(_Ch0 >> 8);

                            // second word: 0xdc00 | (low 10 bits of _Ch)
                            _Ch0 = static_cast<unsigned short>(0xdc00 | (static_cast<unsigned short>(_Ch) & 0x03ff));
                            *_Mid2++ = static_cast<_Byte>(_Ch0);
                            *_Mid2++ = static_cast<_Byte>(_Ch0 >> 8);
                        }
                    } else {
                        if (!_Extra) { // put a single word MS byte first
                            *_Mid2++ = static_cast<_Byte>(_Ch >> 8);
                            *_Mid2++ = static_cast<_Byte>(_Ch);
                        } else { // put a pair of words MS byte first
                            // first word: 0xd800 | (high bits of _Ch with the 0x10000 offset removed)
                            unsigned short _Ch0 =
                                static_cast<unsigned short>(0xd800 | static_cast<unsigned short>(_Ch >> 10) - 0x0040);
                            *_Mid2++ = static_cast<_Byte>(_Ch0 >> 8);
                            *_Mid2++ = static_cast<_Byte>(_Ch0);

                            // second word: 0xdc00 | (low 10 bits of _Ch)
                            _Ch0 = static_cast<unsigned short>(0xdc00 | (static_cast<unsigned short>(_Ch) & 0x03ff));
                            *_Mid2++ = static_cast<_Byte>(_Ch0 >> 8);
                            *_Mid2++ = static_cast<_Byte>(_Ch0);
                        }
                    }
                }

                return _First1 == _Mid1 ? _Mybase::partial : _Mybase::ok;
            }

            // Writes no bytes: sets _Mid2 to _First2 and reports ok.
            virtual result do_unshift(_Statype&, _Byte* _First2, _Byte*, _Byte*& _Mid2) const override {
                // generate bytes to return to default shift state
                _Mid2 = _First2;
                return _Mybase::ok;
            }

            // Counts how many elements (at most _Count) do_in would produce from bytes [_First1, _Last1),
            // converting one element at a time into a scratch element.
            // Works on a copy of _State, so the caller's state object is not modified.
            // NOTE(review): the value returned is a count of converted elements, not of bytes consumed - confirm
            // that this is what callers expect.
            virtual int do_length(
                _Statype& _State, const _Byte* _First1, const _Byte* _Last1, size_t _Count) const noexcept override {
                // return min(_Count, converted length of bytes [_First1, _Last1))
                size_t _Wchars    = 0;
                _Statype _Mystate = _State;

                while (_Wchars < _Count && _First1 != _Last1) { // convert another wide char
                    const _Byte* _Mid1;
                    _Elem* _Mid2;
                    _Elem _Ch;

                    switch (do_in(_Mystate, _First1, _Last1, _Mid1, &_Ch, &_Ch + 1,
                        _Mid2)) { // test result of single wide-char conversion
                    case _Mybase::noconv:
                        return static_cast<int>(_Wchars + (_Last1 - _First1));

                    case _Mybase::ok:
                        if (_Mid2 == &_Ch + 1) {
                            ++_Wchars; // replacement do_in might not convert one
                        }

                        _First1 = _Mid1;
                        break;

                    default:
                        return static_cast<int>(_Wchars); // error or partial
                    }
                }

                return static_cast<int>(_Wchars);
            }

            virtual bool do_always_noconv() const noexcept override { // return true if conversions never change input
                return false;
            }

            // Returns 3 * _Bytes_per_word when _Mode has consume_header or generate_header set, otherwise
            // 6 * _Bytes_per_word.
            // NOTE(review): the headerless case reports the larger bound, which looks inverted given that only the
            // header case needs an extra word - confirm before relying on or changing these values.
            virtual int do_max_length() const noexcept override { // return maximum length required for a conversion
                if _CONSTEXPR_IF ((_Mode & (_STD consume_header | _STD generate_header)) != 0) {
                    return 3 * _Bytes_per_word;
                } else {
                    return 6 * _Bytes_per_word;
                }
            }

            // Returns -1 when _Mode has consume_header or generate_header set, otherwise 0.
            virtual int do_encoding() const noexcept override { // return length of code sequence (from codecvt)
                if _CONSTEXPR_IF ((_Mode & (_STD consume_header | _STD generate_header)) != 0) {
                    return -1; // -1 => state dependent
                } else {
                    return 0; // 0 => varying length
                }
            }
        };
        _STL_RESTORE_DEPRECATED_WARNING
    } // namespace cvt
} // namespace stdext

#pragma pop_macro("new")
_STL_RESTORE_CLANG_WARNINGS
#pragma warning(pop)
#pragma pack(pop)

#endif // _STL_COMPILER_PREPROCESSOR
#endif // _CVT_UTF16_