gecko-dev/mfbt/SIMD.cpp

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mozilla/SIMD.h"

#include <stdint.h>
#include <type_traits>

#include "mozilla/SSE.h"

namespace mozilla {

#ifdef MOZILLA_PRESUME_SSE2

#  include <immintrin.h>

const __m128i* Cast128(uintptr_t ptr) {
  return reinterpret_cast<const __m128i*>(ptr);
}

template <typename T>
T GetAs(uintptr_t ptr) {
  return *reinterpret_cast<const T*>(ptr);
}

// Akin to ceil/floor, AlignDown/AlignUp will return the original pointer if it
// is already aligned.
uintptr_t AlignDown16(uintptr_t ptr) { return ptr & ~0xf; }

uintptr_t AlignUp16(uintptr_t ptr) { return AlignDown16(ptr + 0xf); }

template <typename CharType>
__m128i CmpEq128(__m128i a, __m128i b) {
  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
  if (sizeof(CharType) == 1) {
    return _mm_cmpeq_epi8(a, b);
  }
  return _mm_cmpeq_epi16(a, b);
}

#  ifdef __GNUC__

// Earlier versions of GCC are missing the _mm_loadu_si32 instruction. This
// workaround from Peter Cordes (https://stackoverflow.com/a/72837992) compiles
// down to the same instructions. We could just replace _mm_loadu_si32
__m128i Load32BitsIntoXMM(uintptr_t ptr) {
  int tmp;
  memcpy(&tmp, reinterpret_cast<const void*>(ptr),
         sizeof(tmp));            // unaligned aliasing-safe load
  return _mm_cvtsi32_si128(tmp);  // efficient on GCC/clang/MSVC
}

#  else

__m128i Load32BitsIntoXMM(uintptr_t ptr) {
  return _mm_loadu_si32(Cast128(ptr));
}

#  endif

const char* Check4x4Chars(__m128i needle, uintptr_t a, uintptr_t b, uintptr_t c,
                          uintptr_t d) {
  __m128i haystackA = Load32BitsIntoXMM(a);
  __m128i cmpA = CmpEq128<char>(needle, haystackA);
  __m128i haystackB = Load32BitsIntoXMM(b);
  __m128i cmpB = CmpEq128<char>(needle, haystackB);
  __m128i haystackC = Load32BitsIntoXMM(c);
  __m128i cmpC = CmpEq128<char>(needle, haystackC);
  __m128i haystackD = Load32BitsIntoXMM(d);
  __m128i cmpD = CmpEq128<char>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask & 0xf) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask & 0xf) {
      return reinterpret_cast<const char*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

template <typename CharType>
const CharType* Check4x16Bytes(__m128i needle, uintptr_t a, uintptr_t b,
                               uintptr_t c, uintptr_t d) {
  __m128i haystackA = _mm_loadu_si128(Cast128(a));
  __m128i cmpA = CmpEq128<CharType>(needle, haystackA);
  __m128i haystackB = _mm_loadu_si128(Cast128(b));
  __m128i cmpB = CmpEq128<CharType>(needle, haystackB);
  __m128i haystackC = _mm_loadu_si128(Cast128(c));
  __m128i cmpC = CmpEq128<CharType>(needle, haystackC);
  __m128i haystackD = _mm_loadu_si128(Cast128(d));
  __m128i cmpD = CmpEq128<CharType>(needle, haystackD);
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  __m128i or_cd = _mm_or_si128(cmpC, cmpD);
  __m128i or_abcd = _mm_or_si128(or_ab, or_cd);
  int orMask = _mm_movemask_epi8(or_abcd);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpC);
    if (cmpMask) {
      return reinterpret_cast<const CharType*>(c + __builtin_ctz(cmpMask));
    }
    cmpMask = _mm_movemask_epi8(cmpD);
    if (cmpMask) {
      return reinterpret_cast<const CharType*>(d + __builtin_ctz(cmpMask));
    }
  }

  return nullptr;
}

enum class HaystackOverlap {
  Overlapping,
  Sequential,
};

// Check two 16-byte chunks for the two-byte sequence loaded into needle1
// followed by needle1. `carryOut` is an optional pointer which we will
// populate based on whether the last character of b matches needle1. This
// should be provided on subsequent calls via `carryIn` so we can detect cases
// where the last byte of b's 16-byte chunk is needle1 and the first byte of
// the next a's 16-byte chunk is needle2. `overlap` and whether
// `carryIn`/`carryOut` are NULL should be knowable at compile time to avoid
// branching.
template <typename CharType>
const CharType* Check2x2x16Bytes(__m128i needle1, __m128i needle2, uintptr_t a,
                                 uintptr_t b, __m128i* carryIn,
                                 __m128i* carryOut, HaystackOverlap overlap) {
  const int shiftRightAmount = 16 - sizeof(CharType);
  const int shiftLeftAmount = sizeof(CharType);
  __m128i haystackA = _mm_loadu_si128(Cast128(a));
  __m128i cmpA1 = CmpEq128<CharType>(needle1, haystackA);
  __m128i cmpA2 = CmpEq128<CharType>(needle2, haystackA);
  __m128i cmpA;
  if (carryIn) {
    cmpA = _mm_and_si128(
        _mm_or_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), *carryIn), cmpA2);
  } else {
    cmpA = _mm_and_si128(_mm_bslli_si128(cmpA1, shiftLeftAmount), cmpA2);
  }
  __m128i haystackB = _mm_loadu_si128(Cast128(b));
  __m128i cmpB1 = CmpEq128<CharType>(needle1, haystackB);
  __m128i cmpB2 = CmpEq128<CharType>(needle2, haystackB);
  __m128i cmpB;
  if (overlap == HaystackOverlap::Overlapping) {
    cmpB = _mm_and_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), cmpB2);
  } else {
    MOZ_ASSERT(overlap == HaystackOverlap::Sequential);
    __m128i carryAB = _mm_bsrli_si128(cmpA1, shiftRightAmount);
    cmpB = _mm_and_si128(
        _mm_or_si128(_mm_bslli_si128(cmpB1, shiftLeftAmount), carryAB), cmpB2);
  }
  __m128i or_ab = _mm_or_si128(cmpA, cmpB);
  int orMask = _mm_movemask_epi8(or_ab);
  if (orMask) {
    int cmpMask;
    cmpMask = _mm_movemask_epi8(cmpA);
    if (cmpMask) {
      return reinterpret_cast<const CharType*>(a + __builtin_ctz(cmpMask) -
                                               shiftLeftAmount);
    }
    cmpMask = _mm_movemask_epi8(cmpB);
    if (cmpMask) {
      return reinterpret_cast<const CharType*>(b + __builtin_ctz(cmpMask) -
                                               shiftLeftAmount);
    }
  }

  if (carryOut) {
    _mm_store_si128(carryOut, _mm_bsrli_si128(cmpB1, shiftRightAmount));
  }

  return nullptr;
}

template <typename CharType>
const CharType* FindInBuffer(const CharType* ptr, CharType value,
                             size_t length) {
  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
  static_assert(std::is_unsigned<CharType>::value);
  uint64_t splat64;
  if (sizeof(CharType) == 1) {
    splat64 = 0x0101010101010101llu;
  } else {
    splat64 = 0x0001000100010001llu;
  }

  // Load our needle into a 16-byte register
  uint64_t u64_value = static_cast<uint64_t>(value) * splat64;
  int64_t i64_value = *reinterpret_cast<int64_t*>(&u64_value);
  __m128i needle = _mm_set_epi64x(i64_value, i64_value);

  size_t numBytes = length * sizeof(CharType);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  if ((sizeof(CharType) > 1 && numBytes < 16) || numBytes < 4) {
    while (cur < end) {
      if (GetAs<CharType>(cur) == value) {
        return reinterpret_cast<const CharType*>(cur);
      }
      cur += sizeof(CharType);
    }
    return nullptr;
  }

  if (numBytes < 16) {
    // NOTE: here and below, we have some bit fiddling which could look a
    // little weird. The important thing to note though is it's just a trick
    // for getting the number 4 if numBytes is greater than or equal to 8,
    // and 0 otherwise. This lets us fully cover the range without any
    // branching for the case where numBytes is in [4,8), and [8,16). We get
    // four ranges from this - if numbytes > 8, we get:
    //   [0,4), [4,8], [end - 8), [end - 4)
    // and if numbytes < 8, we get
    //   [0,4), [0,4), [end - 4), [end - 4)
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 8) >> 1);
    uintptr_t c = end - 4 - ((numBytes & 8) >> 1);
    uintptr_t d = end - 4;
    const char* charResult = Check4x4Chars(needle, a, b, c, d);
    // Note: we ensure above that sizeof(CharType) == 1 here, so this is
    // either char to char or char to something like a uint8_t.
    return reinterpret_cast<const CharType*>(charResult);
  }

  if (numBytes < 64) {
    // NOTE: see the above explanation of the similar chunk of code, but in
    // this case, replace 8 with 32 and 4 with 16.
    uintptr_t a = cur;
    uintptr_t b = cur + ((numBytes & 32) >> 1);
    uintptr_t c = end - 16 - ((numBytes & 32) >> 1);
    uintptr_t d = end - 16;
    return Check4x16Bytes<CharType>(needle, a, b, c, d);
  }

  // Get the initial unaligned load out of the way. This will overlap with the
  // aligned stuff below, but the overlapped part should effectively be free
  // (relative to a mispredict from doing a byte-by-byte loop).
  __m128i haystack = _mm_loadu_si128(Cast128(cur));
  __m128i cmp = CmpEq128<CharType>(needle, haystack);
  int cmpMask = _mm_movemask_epi8(cmp);
  if (cmpMask) {
    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp16(cur);

  // The address of the final 48-63 bytes. We overlap this with what we check in
  // our hot loop below to avoid branching. Again, the overlap should be
  // negligible compared with a branch mispredict.
  uintptr_t tailStartPtr = AlignDown16(end - 48);
  uintptr_t tailEndPtr = end - 16;

  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 16;
    uintptr_t c = cur + 32;
    uintptr_t d = cur + 48;
    const CharType* result = Check4x16Bytes<CharType>(needle, a, b, c, d);
    if (result) {
      return result;
    }
    cur += 64;
  }

  uintptr_t a = tailStartPtr;
  uintptr_t b = tailStartPtr + 16;
  uintptr_t c = tailStartPtr + 32;
  uintptr_t d = tailEndPtr;
  return Check4x16Bytes<CharType>(needle, a, b, c, d);
}

template <typename CharType>
const CharType* TwoByteLoop(uintptr_t start, uintptr_t end, CharType v1,
                            CharType v2);

template <>
const unsigned char* TwoByteLoop<unsigned char>(uintptr_t start, uintptr_t end,
                                                unsigned char v1,
                                                unsigned char v2) {
  uintptr_t cur = start;
  uintptr_t preEnd = end - sizeof(unsigned char);
  while (cur < preEnd) {
    // NOTE: this should only ever be called on little endian architectures.
    static_assert(MOZ_LITTLE_ENDIAN());
    uint16_t pattern =
        static_cast<uint16_t>(v1) | (static_cast<uint16_t>(v2) << 8);
    if (GetAs<uint16_t>(cur) == pattern) {
      return reinterpret_cast<const unsigned char*>(cur);
    }
    cur += sizeof(unsigned char);
  }
  return nullptr;
}

template <>
const char16_t* TwoByteLoop<char16_t>(uintptr_t start, uintptr_t end,
                                      char16_t v1, char16_t v2) {
  uintptr_t cur = start;
  uintptr_t preEnd = end - sizeof(char16_t);
  while (cur < preEnd) {
    // NOTE: this should only ever be called on little endian architectures
    static_assert(MOZ_LITTLE_ENDIAN());
    uint32_t pattern =
        static_cast<uint32_t>(v1) | (static_cast<uint32_t>(v2) << 16);
    if (GetAs<uint32_t>(cur) == pattern) {
      return reinterpret_cast<const char16_t*>(cur);
    }
    cur += sizeof(char16_t);
  }
  return nullptr;
}

template <typename CharType>
const CharType* FindTwoInBuffer(const CharType* ptr, CharType v1, CharType v2,
                                size_t length) {
  static_assert(sizeof(CharType) == 1 || sizeof(CharType) == 2);
  static_assert(std::is_unsigned<CharType>::value);
  uint64_t splat64;
  if (sizeof(CharType) == 1) {
    splat64 = 0x0101010101010101llu;
  } else {
    splat64 = 0x0001000100010001llu;
  }

  // Load our needle into a 16-byte register
  uint64_t u64_v1 = static_cast<uint64_t>(v1) * splat64;
  int64_t i64_v1 = *reinterpret_cast<int64_t*>(&u64_v1);
  __m128i needle1 = _mm_set_epi64x(i64_v1, i64_v1);
  uint64_t u64_v2 = static_cast<uint64_t>(v2) * splat64;
  int64_t i64_v2 = *reinterpret_cast<int64_t*>(&u64_v2);
  __m128i needle2 = _mm_set_epi64x(i64_v2, i64_v2);

  size_t numBytes = length * sizeof(CharType);
  uintptr_t cur = reinterpret_cast<uintptr_t>(ptr);
  uintptr_t end = cur + numBytes;

  if (numBytes < 16) {
    return TwoByteLoop<CharType>(cur, end, v1, v2);
  }

  if (numBytes < 32) {
    uintptr_t a = cur;
    uintptr_t b = end - 16;
    return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, nullptr, nullptr,
                                      HaystackOverlap::Overlapping);
  }

  // Get the initial unaligned load out of the way. This will likely overlap
  // with the aligned stuff below, but the overlapped part should effectively
  // be free.
  __m128i haystack = _mm_loadu_si128(Cast128(cur));
  __m128i cmp1 = CmpEq128<CharType>(needle1, haystack);
  __m128i cmp2 = CmpEq128<CharType>(needle2, haystack);
  int cmpMask1 = _mm_movemask_epi8(cmp1);
  int cmpMask2 = _mm_movemask_epi8(cmp2);
  int cmpMask = (cmpMask1 << sizeof(CharType)) & cmpMask2;
  if (cmpMask) {
    return reinterpret_cast<const CharType*>(cur + __builtin_ctz(cmpMask) -
                                             sizeof(CharType));
  }

  // Now we're working with aligned memory. Hooray! \o/
  cur = AlignUp16(cur);

  // The address of the final 48-63 bytes. We overlap this with what we check in
  // our hot loop below to avoid branching. Again, the overlap should be
  // negligible compared with a branch mispredict.
  uintptr_t tailEndPtr = end - 16;
  uintptr_t tailStartPtr = AlignDown16(tailEndPtr);

  __m128i cmpMaskCarry = _mm_set1_epi32(0);
  while (cur < tailStartPtr) {
    uintptr_t a = cur;
    uintptr_t b = cur + 16;
    const CharType* result =
        Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
                                   &cmpMaskCarry, HaystackOverlap::Sequential);
    if (result) {
      return result;
    }
    cur += 32;
  }

  uint32_t carry = (cur == tailStartPtr) ? 0xffffffff : 0;
  __m128i wideCarry = Load32BitsIntoXMM(reinterpret_cast<uintptr_t>(&carry));
  cmpMaskCarry = _mm_and_si128(cmpMaskCarry, wideCarry);
  uintptr_t a = tailStartPtr;
  uintptr_t b = tailEndPtr;
  return Check2x2x16Bytes<CharType>(needle1, needle2, a, b, &cmpMaskCarry,
                                    nullptr, HaystackOverlap::Overlapping);
}

const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
  // Signed chars are just really annoying to do bit logic with. Convert to
  // unsigned at the outermost scope so we don't have to worry about it.
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uvalue = static_cast<unsigned char>(value);
  const unsigned char* uresult =
      FindInBuffer<unsigned char>(uptr, uvalue, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                               size_t length) {
  return FindInBuffer<char16_t>(ptr, value, length);
}

const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
  // Signed chars are just really annoying to do bit logic with. Convert to
  // unsigned at the outermost scope so we don't have to worry about it.
  const unsigned char* uptr = reinterpret_cast<const unsigned char*>(ptr);
  unsigned char uv1 = static_cast<unsigned char>(v1);
  unsigned char uv2 = static_cast<unsigned char>(v2);
  const unsigned char* uresult =
      FindTwoInBuffer<unsigned char>(uptr, uv1, uv2, length);
  return reinterpret_cast<const char*>(uresult);
}

const char16_t* SIMD::memchr2x16(const char16_t* ptr, char16_t v1, char16_t v2,
                                 size_t length) {
  return FindTwoInBuffer<char16_t>(ptr, v1, v2, length);
}

#else

#  include <cstring>

const char* SIMD::memchr8(const char* ptr, char value, size_t length) {
  const void* result = ::memchr(reinterpret_cast<const void*>(ptr),
                                static_cast<int>(value), length);
  return reinterpret_cast<const char*>(result);
}

const char16_t* SIMD::memchr16(const char16_t* ptr, char16_t value,
                               size_t length) {
  const char16_t* end = ptr + length;
  while (ptr < end) {
    if (*ptr == value) {
      return ptr;
    }
    ptr++;
  }
  return nullptr;
}

const char* SIMD::memchr2x8(const char* ptr, char v1, char v2, size_t length) {
  const char* end = ptr + length - 1;
  while (ptr < end) {
    ptr = memchr8(ptr, v1, end - ptr);
    if (!ptr) {
      return nullptr;
    }
    if (ptr[1] == v2) {
      return ptr;
    }
    ptr++;
  }
  return nullptr;
}

const char16_t* SIMD::memchr2x16(const char16_t* ptr, char16_t v1, char16_t v2,
                                 size_t length) {
  const char16_t* end = ptr + length - 1;
  while (ptr < end) {
    ptr = memchr16(ptr, v1, end - ptr);
    if (!ptr) {
      return nullptr;
    }
    if (ptr[1] == v2) {
      return ptr;
    }
    ptr++;
  }
  return nullptr;
}

#endif

}  // namespace mozilla