gecko-dev/mfbt/Utf8.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * UTF-8-related functionality, including a type-safe structure representing a
 * UTF-8 code unit.
 */

#ifndef mozilla_Utf8_h
#define mozilla_Utf8_h

#include "mozilla/Types.h" // for MFBT_API

#include <limits.h> // for CHAR_BIT
#include <stddef.h> // for size_t
#include <stdint.h> // for uint8_t

namespace mozilla {

// Forward declaration; the full definition of |Utf8Unit| appears below.
union Utf8Unit;

// |Utf8Unit| wraps a single |char|, and a UTF-8 code unit is an 8-bit number,
// so refuse to compile anywhere |char| is not exactly eight bits wide.
static_assert(CHAR_BIT == 8,
              "Utf8Unit won't work so well with non-octet chars");

/**
 * A code unit within a UTF-8 encoded string.  (A code unit is the smallest
 * unit within the Unicode encoding of a string.  For UTF-8 this is an 8-bit
 * number; for UTF-16 it would be a 16-bit number.)
 *
 * This is *not* the same as a single code point: in UTF-8, non-ASCII code
 * points are constituted by multiple code units.
 */
union Utf8Unit
{
private:
  // Why a union around a plain |char|?  Between the C++ object model and the
  // C++ rules on which types may be used to access an object of a given
  // actual type, (almost?) no other representation does the job.
  //
  // Hard requirements for a UTF-8 code unit representation:
  //
  //   1. "Compatibility" with UTF-8-encoded C++ character/string literals.
  //      A properly encoded C++ literal should be accessible through
  //      |Utf8Unit| and friends, and the contents of |Utf8Unit| and friends
  //      (particularly UnicodeData) should be accessible as C++ character
  //      types.
  //   2. Conversion between |Utf8Unit| and friends and |char|/|char*| happens
  //      only by explicit operation.
  //   3. For overload resolution and for template type equivalence (i.e.
  //      whether |X<T>| and |X<U>| are the same type, given
  //      |template<class> class X|), |Utf8Unit| is distinct from the C++
  //      character types.
  //
  // Nice-to-haves (at least for the moment):
  //
  //   4. An unsigned representation: signed types can give rise to undefined
  //      behavior, and Unicode code points and code units are unsigned.
  //   5. Conversion of |Utf8Unit| and friends to/from |unsigned char| and
  //      |unsigned char*|, to serve APIs that (because of #4 above) treat
  //      those types as the "natural" choice for UTF-8 data.
  //
  // By #1, |Utf8Unit| has to "incorporate" a C++ character type, i.e. one of
  // |{,{un,}signed} char|.[0]  |uint8_t| is unsuitable because it might not
  // be a C++ character type.
  //
  // By #2 and #3, |Utf8Unit| can't itself *be* such a type, nor a typedef to
  // one (a typedef only aliases a type; it never generates a *new* one).  A
  // compound type is therefore required.
  //
  // Which compound type, and which character type inside it, is constrained
  // by C++14 [basic.lval]p10.  That paragraph defines how objects may be
  // accessed, relating the dynamic type in memory to the actual type used to
  // access it:
  //
  //     If a program attempts to access the stored value of an object
  //     through a glvalue of other than one of the following types the
  //     behavior is undefined:
  //
  //       1. the dynamic type of the object,
  //       2. a cv-qualified version of the dynamic type of the object,
  //       ...other types irrelevant here...
  //       3. an aggregate or union type that includes one of the
  //          aforementioned types among its elements or non-static data
  //          members (including, recursively, an element or non-static
  //          data member of a subaggregate or contained union),
  //       ...more irrelevant types...
  //       4. a char or unsigned char type.
  //
  // Access rule #4 permits reading (wrapped) UTF-8 data as
  // |char|/|unsigned char| whatever the representation.  (What values such a
  // read observes is set aside for a moment.)  (Access rule #2 allows |const|
  // on either the dynamic type or the accessing type.)  (|signed char| is
  // ignored here: it is really only useful for small signed numbers, not
  // characters.)
  //
  // What happens when contents are interpreted as |char|/|unsigned char|
  // contrary to the type actually stored there?  C++14 [basic.fundamental]p1
  // requires character types be identically aligned/sized, and C++14
  // [basic.fundamental]p3 requires |signed char| and |unsigned char| have the
  // same value representation.  C++ stops short of requiring identical
  // bitwise representation, though.  Assuming it would be fine in practice,
  // but this verges on C++ spec bits best not *relied* on for correctness, if
  // possible.
  //
  // Hence |Utf8Unit|'s contents are exposed only as |char| and |char*|, never
  // as |unsigned char*|.  |unsigned char| values are instead exposed safely,
  // by fully-defined *integral conversion* (C++14 [conv.integral]p2).
  // Integral conversion in the other direction, |unsigned char| → |char|, has
  // only implementation-defined behavior.  Not depending on that would be
  // better, but given twos-complement won, it should be okay.  (Besides,
  // |unsigned char*| is awkward enough to work with for strings that it
  // probably doesn't appear in string manipulation much anyway, only in
  // places that should really use |Utf8Unit| directly.)
  //
  // Going the other way -- interpreting |char| or |char*| data through
  // |Utf8Unit| -- isn't tricky provided |Utf8Unit| contains a |char| (as
  // decided above), courtesy of access rule #3: an "aggregate or union" that
  // contains a |char| will work.  Oddly, an aggregate is not an option: C++14
  // [dcl.init.aggr]p1 says aggregates must have "no private or protected
  // non-static data members", and the inner |char| is meant to stay hidden.
  // That rules out a |struct|, so only |union| remains.
  //
  // (Enums are not "an aggregate or union type".  So, [maybe surprisingly],
  // |Utf8Unit| can't be an enum class with |char| underlying type: no license
  // is given to treat |char| memory as such an |enum|'s memory.)
  //
  // Conclusion: |Utf8Unit| is a union type with a |char| non-static data
  // member.  All requirements are satisfied.  Some nice-to-haves come along
  // too: a |Utf8Unit| can be created from an |unsigned char|, and it is
  // convertible to |unsigned char|.  Others do not: |unsigned char| is not
  // used internally, and an existing |unsigned char| or pointer to one cannot
  // be wrapped.  Those probably *could* be had by relying harder on
  // implementation-defined behaviors, but for now C++'s main character type
  // is privileged over some conceptual purity.
  //
  // 0. A proposal exists for a UTF-8 character type distinct from the
  //    existing C++ narrow character types:
  //
  //      http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html
  //
  //    It hasn't been standardized (and might never be), and none of the
  //    compilers we really care about have implemented it.  With luck, the
  //    implementation can someday be changed to it without too much
  //    trouble...
  char mValue;

public:
  /** Wrap a raw |char| as a UTF-8 code unit. */
  explicit constexpr Utf8Unit(char aUnit) : mValue(aUnit) {}

  /**
   * Wrap a raw |unsigned char| as a UTF-8 code unit.
   *
   * As discussed in the rationale above, the cast in the initializer is an
   * integral conversion with implementation-defined semantics; we
   * regretfully but unavoidably assume it does what we want it to.
   */
  explicit constexpr Utf8Unit(unsigned char aUnit)
    : mValue(static_cast<char>(aUnit))
  {}

  /** Code units compare equal exactly when their wrapped |char|s do. */
  constexpr bool operator==(const Utf8Unit& aOther) const
  {
    return aOther.mValue == mValue;
  }

  /** Code units compare unequal exactly when their wrapped |char|s differ. */
  constexpr bool operator!=(const Utf8Unit& aOther) const
  {
    return mValue != aOther.mValue;
  }

  /**
   * Convert a UTF-8 code unit to a raw char.
   *
   * Nothing but a |char| is ever permitted to be written into this location,
   * so reading it back as one is both permissible and yields the desired
   * value.
   */
  constexpr char toChar() const { return mValue; }

  /**
   * Convert a UTF-8 code unit to a raw unsigned char.
   *
   * As discussed in the rationale above, this is well-defined integral
   * conversion.
   */
  constexpr unsigned char toUnsignedChar() const
  {
    return static_cast<unsigned char>(mValue);
  }

  /**
   * Convert a UTF-8 code unit to a uint8_t.
   *
   * Built on |toUnsignedChar()|; every step is well-defined integral
   * conversion, as discussed in the rationale above.
   */
  constexpr uint8_t toUint8() const
  {
    return static_cast<uint8_t>(toUnsignedChar());
  }

  // |&mValue| is currently not exposed.  |UnicodeData| sort of does expose
  // it, but that's a somewhat separate concern, justified in different
  // comments in that other code.
};

/**
 * Returns true if the given length-delimited memory consists of a valid UTF-8
 * string, false otherwise.
 *
 * A valid UTF-8 string contains no overlong-encoded code points (as one would
 * expect) and contains no code unit sequence encoding a UTF-16 surrogate.  The
 * string *may* contain U+0000 NULL code points.
 *
 * |aCodeUnits| points at the start of the memory to examine and |aCount| gives
 * its length.  NOTE(review): the parameter names suggest the length is
 * counted in UTF-8 code units (i.e. bytes) -- confirm against the definition,
 * which is not part of this header.
 */
extern MFBT_API bool
IsValidUtf8(const void* aCodeUnits, size_t aCount);

} // namespace mozilla

#endif /* mozilla_Utf8_h */
Bug 1426909 - Introduce a new mfbt/Utf8.h header for UTF-8-related functionality, including a UTF-8 code unit type that is compatible with, but doesn't directly interconvert with, \|char\|. r=froydnj --HG-- extra : rebase_source : 00760186060d7e72fac783af3fc5595a6057feca 2018-01-11 21:29:53 +03:00			`/* -- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -- */`
			`/* vim: set ts=8 sts=2 et sw=2 tw=80: */`
			`/* This Source Code Form is subject to the terms of the Mozilla Public`
			`* License, v. 2.0. If a copy of the MPL was not distributed with this`
			`* file, You can obtain one at http://mozilla.org/MPL/2.0/. */`

			`/*`
			`* UTF-8-related functionality, including a type-safe structure representing a`
			`* UTF-8 code unit.`
			`*/`

			`#ifndef mozilla_Utf8_h`
			`#define mozilla_Utf8_h`

			`#include "mozilla/Types.h" // for MFBT_API`

			`#include <limits.h> // for CHAR_BIT`
			`#include <stddef.h> // for size_t`
			`#include <stdint.h> // for uint8_t`

			`namespace mozilla {`

			`union Utf8Unit;`

			`static_assert(CHAR_BIT == 8,`
			`"Utf8Unit won't work so well with non-octet chars");`

			`/**`
			`* A code unit within a UTF-8 encoded string. (A code unit is the smallest`
			`* unit within the Unicode encoding of a string. For UTF-8 this is an 8-bit`
			`* number; for UTF-16 it would be a 16-bit number.)`
			`*`
			`* This is not the same as a single code point: in UTF-8, non-ASCII code`
			`* points are constituted by multiple code units.`
			`*/`
			`union Utf8Unit`
			`{`
			`private:`
			`// Utf8Unit is a union wrapping a raw \|char\|. The C++ object model and C++`
			`// requirements as to how objects may be accessed with respect to their actual`
			`// types (almost?) uniquely compel this choice.`
			`//`
			`// Our requirements for a UTF-8 code unit representation are:`
			`//`
			`// 1. It must be "compatible" with C++ character/string literals that use`
			`// the UTF-8 encoding. Given a properly encoded C++ literal, you should`
			`// be able to use \|Utf8Unit\| and friends to access it; given \|Utf8Unit\|`
			`// and friends (particularly UnicodeData), you should be able to access`
			`// C++ character types for their contents.`
			`// 2. \|Utf8Unit\| and friends must convert to/from \|char\| and \|char*\| only by`
			`// explicit operation.`
			`// 3. \|Utf8Unit\| must participate in overload resolution and template type`
			`// equivalence (that is, given \|template<class> class X\|, when \|X<T>\| and`
			`// \|X<U>\| are the same type) distinctly from the C++ character types.`
			`//`
			`// And a few nice-to-haves (at least for the moment):`
			`//`
			`// 4. The representation should use unsigned numbers, to avoid undefined`
			`// behavior that can arise with signed types, and because Unicode code`
			`// points and code units are unsigned.`
			`// 5. \|Utf8Unit\| and friends should be convertible to/from \|unsigned char\|`
			`// and \|unsigned char*\|, for APIs that (because of #4 above) use those`
			`// types as the "natural" choice for UTF-8 data.`
			`//`
			`// #1 requires that \|Utf8Unit\| "incorporate" a C++ character type: one of`
			`// \|{,{un,}signed} char\|.[0] \|uint8_t\| won't work because it might not be a`
			`// C++ character type.`
			`//`
			`// #2 and #3 mean that \|Utf8Unit\| can't be such a type (or a typedef to one:`
			`// typedefs don't generate new types, just type aliases). This requires a`
			`// compound type.`
			`//`
			`// The ultimate representation (and character type in it) is constrained by`
			`// C++14 [basic.lval]p10 that defines how objects may be accessed, with`
			`// respect to the dynamic type in memory and the actual type used to access`
			`// them. It reads:`
			`//`
			`// If a program attempts to access the stored value of an object`
			`// through a glvalue of other than one of the following types the`
			`// behavior is undefined:`
			`//`
			`// 1. the dynamic type of the object,`
			`// 2. a cv-qualified version of the dynamic type of the object,`
			`// ...other types irrelevant here...`
			`// 3. an aggregate or union type that includes one of the`
			`// aforementioned types among its elements or non-static data`
			`// members (including, recursively, an element or non-static`
			`// data member of a subaggregate or contained union),`
			`// ...more irrelevant types...`
			`// 4. a char or unsigned char type.`
			`//`
			`// Accessing (wrapped) UTF-8 data as \|char\|/\|unsigned char\| is allowed no`
			`// matter the representation by #4. (Briefly set aside what values are seen.)`
			`// (And #2 allows \|const\| on either the dynamic type or the accessing type.)`
			`// (\|signed char\| is really only useful for small signed numbers, not`
			`// characters, so we ignore it.)`
			`//`
			`// If we interpret contents as \|char\|/\|unsigned char\| contrary to the actual`
			`// type stored there, what happens? C++14 [basic.fundamental]p1 requires`
			`// character types be identically aligned/sized; C++14 [basic.fundamental]p3`
			`// requires \|signed char\| and \|unsigned char\| have the same value`
			`// representation. C++ doesn't require identical bitwise representation, tho.`
			`// Practically we could assume it, but this verges on C++ spec bits best not`
			`// relied on for correctness, if possible.`
			`//`
			`// So we don't expose \|Utf8Unit\|'s contents as \|unsigned char*\|: only \|char\|`
			`// and \|char*\|. Instead we safely expose \|unsigned char\| by fully-defined`
			`// integral conversion (C++14 [conv.integral]p2). Integral conversion from`
			`// \|unsigned char\| → \|char\| has only implementation-defined behavior. It'd be`
			`// better not to depend on that, but given twos-complement won, it should be`
			`// okay. (Also \|unsigned char*\| is awkward enough to work with for strings`
			`// that it probably doesn't appear in string manipulation much anyway, only in`
			`// places that should really use \|Utf8Unit\| directly.)`
			`//`
			`// The opposite direction -- interpreting \|char\| or \|char*\| data through`
			`// \|Utf8Unit\| -- isn't tricky as long as \|Utf8Unit\| contains a \|char\| as`
			`// decided above, using #3. An "aggregate or union" will work that contains a`
			`// \|char\|. Oddly, an aggregate won't work: C++14 [dcl.init.aggr]p1 says`
			`// aggregates must have "no private or protected non-static data members", and`
			`// we want to keep the inner \|char\| hidden. So a \|struct\| is out, and only`
			`// \|union\| remains.`
			`//`
			`// (Enums are not "an aggregate or union type", so [maybe surprisingly] we`
			`// can't make \|Utf8Unit\| an enum class with \|char\| underlying type, because we`
			`// are given no license to treat \|char\| memory as such an \|enum\|'s memory.)`
			`//`
			`// Therefore \|Utf8Unit\| is a union type with a \|char\| non-static data member.`
			`// This satisfies all our requirements. It also supports the nice-to-haves of`
			`// creating a \|Utf8Unit\| from an \|unsigned char\|, and being convertible to`
			`// \|unsigned char\|. It doesn't satisfy the nice-to-haves of using an`
			`// \|unsigned char\| internally, nor of letting us wrap an existing`
			`// \|unsigned char\| or pointer to one. We probably could do these, if we`
			`// were willing to rely harder on implementation-defined behaviors, but for`
			`// now we privilege C++'s main character type over some conceptual purity.`
			`//`
			`// 0. There's a proposal for a UTF-8 character type distinct from the existing`
			`// C++ narrow character types:`
			`//`
			`// http://open-std.org/JTC1/SC22/WG21/docs/papers/2016/p0482r0.html`
			`//`
			`// but it hasn't been standardized (and might never be), and none of the`
			`// compilers we really care about have implemented it. Maybe someday we`
			`// can change our implementation to it without too much trouble, if we're`
			`// lucky...`
			`char mValue;`

			`public:`
			`explicit constexpr Utf8Unit(char aUnit)`
			`: mValue(aUnit)`
			`{}`

			`explicit constexpr Utf8Unit(unsigned char aUnit)`
			`: mValue(static_cast<char>(aUnit))`
			`{`
			`// Per the above comment, the prior cast is integral conversion with`
			`// implementation-defined semantics, and we regretfully but unavoidably`
			`// assume the conversion does what we want it to.`
			`}`

			`constexpr bool operator==(const Utf8Unit& aOther) const`
			`{`
			`return mValue == aOther.mValue;`
			`}`

			`constexpr bool operator!=(const Utf8Unit& aOther) const`
			`{`
			`return !(*this == aOther);`
			`}`

			`/** Convert a UTF-8 code unit to a raw char. */`
			`constexpr char toChar() const`
			`{`
			`// Only a \|char\| is ever permitted to be written into this location, so this`
			`// is both permissible and returns the desired value.`
			`return mValue;`
			`}`

			`/** Convert a UTF-8 code unit to a raw unsigned char. */`
			`constexpr unsigned char toUnsignedChar() const`
			`{`
			`// Per the above comment, this is well-defined integral conversion.`
			`return static_cast<unsigned char>(mValue);`
			`}`

			`/** Convert a UTF-8 code unit to a uint8_t. */`
			`constexpr uint8_t toUint8() const`
			`{`
			`// Per the above comment, this is well-defined integral conversion.`
			`return static_cast<uint8_t>(mValue);`
			`}`

			`// We currently don't expose \|&mValue\|. \|UnicodeData\| sort of does, but`
			`// that's a somewhat separate concern, justified in different comments in`
			`// that other code.`
			`};`

			`/**`
			`* Returns true if the given length-delimited memory consists of a valid UTF-8`
			`* string, false otherwise.`
			`*`
			`* A valid UTF-8 string contains no overlong-encoded code points (as one would`
			`* expect) and contains no code unit sequence encoding a UTF-16 surrogate. The`
			`* string may contain U+0000 NULL code points.`
			`*/`
			`extern MFBT_API bool`
			`IsValidUtf8(const void* aCodeUnits, size_t aCount);`

			`} // namespace mozilla`

			`#endif /* mozilla_Utf8_h */`