Bug 1562606 - Implement unsigned LEB128 functions working over iterators - r=gregtatum

The new ProfileBuffer data structure will need to store block sizes (usually small, LEB128 uses fewer bytes for small numbers), and the circular buffer will provide iterators that hide the wrapping-around. Differential Revision: https://phabricator.services.mozilla.com/D36473 --HG-- extra : moz-landing-system : lando
2019-07-03 14:49:10 +00:00 · 2019-07-03 14:49:10 +00:00 · e610854d8e
--- a/mozglue/baseprofiler/moz.build
+++ b/mozglue/baseprofiler/moz.build
@ -83,6 +83,7 @@ EXPORTS += [

 EXPORTS.mozilla += [
    'public/BaseProfilerCounts.h',
+    'public/leb128iterator.h',
    'public/PowerOfTwo.h',
 ]

--- a/mozglue/baseprofiler/public/leb128iterator.h
+++ b/mozglue/baseprofiler/public/leb128iterator.h
@ -0,0 +1,147 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* vim: set ts=8 sts=2 et sw=2 tw=80: */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+// LEB128 utilities that can read/write unsigned LEB128 numbers from/to
+// iterators.
+//
+// LEB128 = Little Endian Base 128, where small numbers take few bytes, but
+// large numbers are still allowed, which is ideal when serializing numbers that
+// are likely to be small.
+// Each byte contains 7 bits from the number, starting at the "little end"; the
+// top bit is 0 for the last byte, 1 otherwise.
+// Numbers 0-127 only take 1 byte. 128-16383 take 2 bytes. Etc.
+//
+// Iterators only need to provide:
+// - `*it` to return a reference to the next byte to be read from or written to.
+// - `++it` to advance the iterator after a byte is read or written.
+//
+// The caller must always provide sufficient space to write any number, by:
+// - pre-allocating a large enough buffer, or
+// - allocating more space when `++it` reaches the end and/or `*it` is invoked
+//   after the end, or
+// - moving the underlying pointer to an appropriate location (e.g., wrapping
+//   around a circular buffer).
+// The caller must also provide enough bytes to read a full value (i.e., at
+// least one byte should have its top bit unset), and a type large enough to
+// hold the stored value.
+//
+// Note: There are insufficient checks for validity! These functions are
+// intended to be used together, i.e., the user should only `ReadULEB128()` from
+// a sufficiently-large buffer that the same user filled with `WriteULEB128()`.
+// Using with externally-sourced data (e.g., DWARF) is *not* recommended.
+//
+// https://en.wikipedia.org/wiki/LEB128
+
+#ifndef leb128iterator_h
+#define leb128iterator_h
+
+#include <climits>
+#include <cstdint>
+#include <limits>
+#include <type_traits>
+
+namespace mozilla {
+
+// Number of bytes needed to represent `aValue` in unsigned LEB128.
+template <typename T>
+constexpr uint_fast8_t ULEB128Size(T aValue) {
+  static_assert(!std::numeric_limits<T>::is_signed,
+                "ULEB128Size only takes unsigned types");
+  // We need one output byte per 7 bits of non-zero value. So we just remove
+  // 7 least significant bits at a time until the value becomes zero.
+  // Note the special case of 0, which still needs 1 output byte; this is done
+  // by counting one byte and shifting before we first check for 0.
+  uint_fast8_t size = 0;
+  for (;;) {
+    size += 1;
+    aValue >>= 7;
+    // Expecting small values, so it should be more likely that `aValue == 0`.
+    if (MOZ_LIKELY(aValue == 0)) {
+      return size;
+    }
+  }
+}
+
+// Maximum number of LEB128 bytes needed to represent any value of type `T`.
+template <typename T>
+constexpr uint_fast8_t ULEB128MaxSize() {
+  return ULEB128Size<T>(std::numeric_limits<T>::max());
+}
+
+// Write `aValue` in unsigned LEB128 to `aIterator`.
+// The iterator will be moved past the last byte written.
+template <typename T, typename It>
+void WriteULEB128(T aValue, It& aIterator) {
+  static_assert(!std::numeric_limits<T>::is_signed,
+                "WriteULEB128 only takes unsigned types");
+  using IteratorValue = std::remove_reference_t<decltype(*aIterator)>;
+  static_assert(sizeof(IteratorValue) == 1,
+                "WriteULEB128 expects an iterator to single bytes");
+  // 0. Don't test for 0 yet, as we want to output one byte for it.
+  for (;;) {
+    // 1. Extract the 7 least significant bits.
+    const uint_fast8_t byte = aValue & 0x7Fu;
+    // 2. Remove them from `aValue`.
+    aValue >>= 7;
+    // 3. Write the 7 bits, and set the 8th bit if `aValue` is not 0 yet
+    // (meaning there will be more bytes after this one).
+    // Expecting small values, so it should be more likely that `aValue == 0`.
+    // Note: No absolute need to force-cast to IteratorValue, because we have
+    // only changed the bottom 8 bits above. However the compiler could warn
+    // about a narrowing conversion from potentially-multibyte uint_fast8_t down
+    // to whatever single-byte type `*aIterator` expects; we make it explicit.
+    *aIterator = static_cast<IteratorValue>(
+        MOZ_LIKELY(aValue == 0) ? byte : (byte | 0x80u));
+    // 4. Always advance the iterator to the next byte.
+    ++aIterator;
+    // 5. We're done if `aValue` is 0.
+    // Expecting small values, so it should be more likely that `aValue == 0`.
+    if (MOZ_LIKELY(aValue == 0)) {
+      return;
+    }
+  }
+}
+
+// Read an unsigned LEB128 value of type `T` from `aIterator`.
+// The iterator will be moved past the last byte read.
+template <typename T, typename It>
+T ReadULEB128(It& aIterator) {
+  static_assert(!std::numeric_limits<T>::is_signed,
+                "ReadULEB128 must return an unsigned type");
+  using IteratorValue = std::remove_reference_t<decltype(*aIterator)>;
+  static_assert(sizeof(IteratorValue) == 1,
+                "ReadULEB128 expects an iterator to single bytes");
+  // Incoming bits will be added to `result`...
+  T result = 0;
+  // ... starting with the least significant bits.
+  uint_fast8_t shift = 0;
+  for (;;) {
+    // 1. Read one byte from the iterator.
+    // `static_cast` just in case IteratorValue is not implicitly convertible to
+    // uint_fast8_t. It wouldn't matter if the sign was extended, we're only
+    // dealing with the bottom 8 bits below.
+    const uint_fast8_t byte = static_cast<uint_fast8_t>(*aIterator);
+    // 2. Always advance the iterator.
+    ++aIterator;
+    // 3. Extract the 7 bits of value, and shift them in place into `result`.
+    result |= static_cast<T>(byte & 0x7fu) << shift;
+    // 4. If the 8th bit is *not* set, this was the last byte.
+    // Expecting small values, so it should be more likely that the bit is off.
+    if (MOZ_LIKELY((byte & 0x80u) == 0)) {
+      return result;
+    }
+    // There are more bytes to read.
+    // 5. Next byte will contain more significant bits above the past 7.
+    shift += 7;
+    // Safety check that we're not going to shift by >= the width of `T` in
+    // bits, which is Undefined Behavior in C++.
+    MOZ_ASSERT(shift < CHAR_BIT * sizeof(T));
+  }
+}
+
+}  // namespace mozilla
+
+#endif  // leb128iterator_h
--- a/mozglue/tests/TestBaseProfiler.cpp
+++ b/mozglue/tests/TestBaseProfiler.cpp
@ -8,6 +8,7 @@

 #ifdef MOZ_BASE_PROFILER

+#  include "mozilla/leb128iterator.h"
 #  include "mozilla/PowerOfTwo.h"

 #  include "mozilla/Attributes.h"
@ -150,6 +151,105 @@ void TestPowerOfTwo() {
  printf("TestPowerOfTwo done\n");
 }

+void TestLEB128() {
+  printf("TestLEB128...\n");
+
+  MOZ_RELEASE_ASSERT(ULEB128MaxSize<uint8_t>() == 2);
+  MOZ_RELEASE_ASSERT(ULEB128MaxSize<uint16_t>() == 3);
+  MOZ_RELEASE_ASSERT(ULEB128MaxSize<uint32_t>() == 5);
+  MOZ_RELEASE_ASSERT(ULEB128MaxSize<uint64_t>() == 10);
+
+  struct TestDataU64 {
+    uint64_t mValue;
+    unsigned mSize;
+    const char* mBytes;
+  };
+  // clang-format off
+  TestDataU64 tests[] = {
+    // Small numbers should keep their normal byte representation.
+    {                  0u,  1, "\0" },
+    {                  1u,  1, "\x01" },
+
+    // 0111 1111 (127, or 0x7F) is the highest number that fits into a single
+    // LEB128 byte. It gets encoded as 0111 1111, note the most significant bit
+    // is off.
+    {               0x7Fu,  1, "\x7F" },
+
+    // Next number: 128, or 0x80.
+    //   Original data representation:  1000 0000
+    //     Broken up into groups of 7:         1  0000000
+    // Padded with 0 (msB) or 1 (lsB):  00000001 10000000
+    //            Byte representation:  0x01     0x80
+    //            Little endian order:  -> 0x80 0x01
+    {               0x80u,  2, "\x80\x01" },
+
+    // Next: 129, or 0x81 (showing that we don't lose low bits.)
+    //   Original data representation:  1000 0001
+    //     Broken up into groups of 7:         1  0000001
+    // Padded with 0 (msB) or 1 (lsB):  00000001 10000001
+    //            Byte representation:  0x01     0x81
+    //            Little endian order:  -> 0x81 0x01
+    {               0x81u,  2, "\x81\x01" },
+
+    // Highest 8-bit number: 255, or 0xFF.
+    //   Original data representation:  1111 1111
+    //     Broken up into groups of 7:         1  1111111
+    // Padded with 0 (msB) or 1 (lsB):  00000001 11111111
+    //            Byte representation:  0x01     0xFF
+    //            Little endian order:  -> 0xFF 0x01
+    {               0xFFu,  2, "\xFF\x01" },
+
+    // Next: 256, or 0x100.
+    //   Original data representation:  1 0000 0000
+    //     Broken up into groups of 7:        10  0000000
+    // Padded with 0 (msB) or 1 (lsB):  00000010 10000000
+    //            Byte representation:  0x02     0x80
+    //            Little endian order:  -> 0x80 0x02
+    {              0x100u,  2, "\x80\x02" },
+
+    // Highest 32-bit number: 0xFFFFFFFF (4 bytes, all bits set).
+    // Original: 1111 1111 1111 1111 1111 1111 1111 1111
+    // Groups:     1111  1111111  1111111  1111111  1111111
+    // Padded: 00001111 11111111 11111111 11111111 11111111
+    // Bytes:  0x0F     0xFF     0xFF     0xFF     0xFF
+    // Little Endian: -> 0xFF 0xFF 0xFF 0xFF 0x0F
+    {         0xFFFFFFFFu,  5, "\xFF\xFF\xFF\xFF\x0F" },
+
+    // Highest 64-bit number: 0xFFFFFFFFFFFFFFFF (8 bytes, all bits set).
+    // 64 bits, that's 9 groups of 7 bits, plus 1 (most significant) bit.
+    { 0xFFFFFFFFFFFFFFFFu, 10, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x01" }
+  };
+  // clang-format on
+
+  for (const TestDataU64& test : tests) {
+    MOZ_RELEASE_ASSERT(ULEB128Size(test.mValue) == test.mSize);
+    // Prepare a buffer that can accommodate the largest-possible LEB128.
+    uint8_t buffer[ULEB128MaxSize<uint64_t>()];
+    // Use a pointer into the buffer as iterator.
+    uint8_t* p = buffer;
+    // And write the LEB128.
+    WriteULEB128(test.mValue, p);
+    // Pointer (iterator) should have advanced just past the expected LEB128
+    // size.
+    MOZ_RELEASE_ASSERT(p == buffer + test.mSize);
+    // Check expected bytes.
+    for (unsigned i = 0; i < test.mSize; ++i) {
+      MOZ_RELEASE_ASSERT(buffer[i] == uint8_t(test.mBytes[i]));
+    }
+    // Move pointer (iterator) back to start of buffer.
+    p = buffer;
+    // And read the LEB128 we wrote above.
+    uint64_t read = ReadULEB128<uint64_t>(p);
+    // Pointer (iterator) should have also advanced just past the expected
+    // LEB128 size.
+    MOZ_RELEASE_ASSERT(p == buffer + test.mSize);
+    // And check the read value.
+    MOZ_RELEASE_ASSERT(read == test.mValue);
+  }
+
+  printf("TestLEB128 done\n");
+}
+
 // Increase the depth, to a maximum (to avoid too-deep recursion).
 static constexpr size_t NextDepth(size_t aDepth) {
  constexpr size_t MAX_DEPTH = 128;
@ -185,6 +285,7 @@ void TestProfiler() {
  // Test dependencies.
  TestPowerOfTwoMask();
  TestPowerOfTwo();
+  TestLEB128();

  {
    printf("profiler_init()...\n");