Bug 1641140 - Implement the .bitmask instructions. r=jseward

For general background see the bug's description. This patch implements the iNxM.bitmask instructions for x86+x64 in baseline+ion, and adds some simple test cases. Differential Revision: https://phabricator.services.mozilla.com/D77784
2020-06-08 08:02:37 +00:00 · 2020-06-08 08:02:37 +00:00 · e34701fc0d
--- a/js/src/jit-test/tests/wasm/binary.js
+++ b/js/src/jit-test/tests/wasm/binary.js
@ -304,8 +304,8 @@ if (!wasmSimdSupported()) {
 } else {
    let reservedSimd = [
        0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e,
-        0x5f, 0x64, 0x67, 0x68, 0x69, 0x6a, 0x74, 0x75, 0x7a, 0x7c, 0x7d, 0x7e,
-        0x7f, 0x84, 0x94, 0x9a, 0x9c, 0x9d, 0x9e, 0x9f, 0xa4, 0xa5, 0xa6, 0xaf,
+        0x5f, 0x67, 0x68, 0x69, 0x6a, 0x74, 0x75, 0x7a, 0x7c, 0x7d, 0x7e,
+        0x7f, 0x94, 0x9a, 0x9c, 0x9d, 0x9e, 0x9f, 0xa5, 0xa6, 0xaf,
        0xb0, 0xb2, 0xb3, 0xb4, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc2,
        0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcf, 0xd0, 0xd2, 0xd3,
        0xd4, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xea,
--- a/js/src/jit-test/tests/wasm/simd/ad-hack.js
+++ b/js/src/jit-test/tests/wasm/simd/ad-hack.js
@ -218,6 +218,12 @@ function shru(count, width) {
    }
 }

+function popcount(n) {  // number of set bits in n, via parallel bit sums
+  const pairs = n - ((n >> 1) & 0x55555555);  // 2-bit partial counts
+  const nibbles = (pairs & 0x33333333) + ((pairs >> 2) & 0x33333333);  // 4-bit
+  return ((((nibbles + (nibbles >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24);
+}
+
 // For each input array, a set of arrays of the proper length for v128, with
 // values in range but possibly of the wrong signedness (eg, for Int8Array, 128
 // is in range but is really -128).  Also a unary operator `rectify` that
@ -822,6 +828,54 @@ for ( let dope of [1, 7, 32, 195 ] ) {
    assertEq(ins.exports.alltrue_i32x4(), 0);
 }

+// Bitmask: the high bit of lane i becomes bit i of the i32 result
+
+var ins = wasmEvalText(`
+  (module
+    (memory (export "mem") 1 1)
+    (func (export "bitmask_i8x16") (result i32)
+      (i8x16.bitmask (v128.load (i32.const 16))))
+    (func (export "bitmask_i16x8") (result i32)
+      (i16x8.bitmask (v128.load (i32.const 16))))
+    (func (export "bitmask_i32x4") (result i32)
+      (i32x4.bitmask (v128.load (i32.const 16)))))`);
+
+var mem8 = new Uint8Array(ins.exports.mem.buffer);
+var mem16 = new Uint16Array(ins.exports.mem.buffer);
+var mem32 = new Uint32Array(ins.exports.mem.buffer);
+
+set(mem8, 16, iota(16).map((_) => 0));  // no high bit set in any lane shape
+assertEq(ins.exports.bitmask_i8x16(), 0);
+assertEq(ins.exports.bitmask_i16x8(), 0);
+assertEq(ins.exports.bitmask_i32x4(), 0);
+
+set(mem8, 16, iota(16).map((_) => 0x80));  // only the high bit of every byte
+assertEq(ins.exports.bitmask_i8x16(), 0xFFFF);
+
+set(mem8, 16, iota(16).map((_) => 0x7F));  // every bit except the high bit
+assertEq(ins.exports.bitmask_i8x16(), 0);
+
+set(mem8, 16, iota(16).map((i) => popcount(i) === 1 ? 0x80 : 0));  // lanes 1, 2, 4, 8
+assertEq(ins.exports.bitmask_i8x16(), (1 << 1) | (1 << 2) | (1 << 4) | (1 << 8));
+
+set(mem16, 8, iota(8).map((_) => 0x8000));  // Uint16Array index 8 == byte offset 16
+assertEq(ins.exports.bitmask_i16x8(), 0xFF);
+
+set(mem16, 8, iota(8).map((_) => 0x7FFF));
+assertEq(ins.exports.bitmask_i16x8(), 0);
+
+set(mem16, 8, iota(8).map((i) => popcount(i) === 1 ? 0x8000 : 0));  // lanes 1, 2, 4
+assertEq(ins.exports.bitmask_i16x8(), (1 << 1) | (1 << 2) | (1 << 4));
+
+set(mem32, 4, iota(4).map((_) => 0x80000000));  // Uint32Array index 4 == byte offset 16
+assertEq(ins.exports.bitmask_i32x4(), 0xF);
+
+set(mem32, 4, iota(4).map((_) => 0x7FFFFFFF));
+assertEq(ins.exports.bitmask_i32x4(), 0);
+
+set(mem32, 4, iota(4).map((i) => popcount(i) === 1 ? 0x80000000 : 0));  // lanes 1, 2
+assertEq(ins.exports.bitmask_i32x4(), (1 << 1) | (1 << 2));
+
 // Shifts
 //
 // lhs is v128 in memory
--- a/js/src/jit/MacroAssembler.h
+++ b/js/src/jit/MacroAssembler.h
@ -2309,6 +2309,17 @@ class MacroAssembler : public MacroAssemblerSpecific {
  inline void allTrueInt32x4(FloatRegister src, Register dest)
      DEFINED_ON(x86_shared);

+  // Bitmask, i.e., extract and compress the high bits of all lanes into dest
+
+  inline void bitmaskInt8x16(FloatRegister src, Register dest)
+      DEFINED_ON(x86_shared);
+
+  inline void bitmaskInt16x8(FloatRegister src, Register dest)
+      DEFINED_ON(x86_shared);
+
+  inline void bitmaskInt32x4(FloatRegister src, Register dest)
+      DEFINED_ON(x86_shared);
+
  // Comparisons (integer and floating-point)

  inline void compareInt8x16(Assembler::Condition cond, FloatRegister rhs,
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@ -3194,6 +3194,15 @@ void CodeGenerator::visitWasmReduceSimd128(LWasmReduceSimd128* ins) {
    case wasm::SimdOp::I32x4AllTrue:
      masm.allTrueInt32x4(src, ToRegister(dest));
      break;
+    case wasm::SimdOp::I8x16Bitmask:
+      masm.bitmaskInt8x16(src, ToRegister(dest));
+      break;
+    case wasm::SimdOp::I16x8Bitmask:
+      masm.bitmaskInt16x8(src, ToRegister(dest));
+      break;
+    case wasm::SimdOp::I32x4Bitmask:
+      masm.bitmaskInt32x4(src, ToRegister(dest));
+      break;
    case wasm::SimdOp::I8x16ExtractLaneS:
      masm.extractLaneInt8x16(imm, src, ToRegister(dest));
      break;
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
@ -1374,6 +1374,28 @@ void MacroAssembler::allTrueInt32x4(FloatRegister src, Register dest) {
  movzbl(dest, dest);
 }

+// Bitmask - gather the high bit of every lane into a scalar register
+
+void MacroAssembler::bitmaskInt8x16(FloatRegister src, Register dest) {
+  vpmovmskb(src, dest);  // high bit of each byte lane of src -> bits of dest
+}
+
+void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) {
+  ScratchSimd128Scope scratch(*this);
+  // Work on a copy so src is preserved.  A three-instruction sequence (use
+  // scratch as a don't-care input, then shift instead of mask) is possible
+  // but creates a false dependency on the old value of scratch; the better
+  // fix is to allow src to be clobbered.
+  moveSimd128(src, scratch);
+  vpacksswb(Operand(scratch), scratch, scratch);  // words -> bytes, sign kept
+  vpmovmskb(scratch, dest);                       // 16 bits; halves identical
+  andl(Imm32(0xFF), dest);                        // keep one bit per i16 lane
+}
+
+void MacroAssembler::bitmaskInt32x4(FloatRegister src, Register dest) {
+  vmovmskps(src, dest);  // high bit of each 32-bit lane -> low bits of dest
+}
+
 // Swizzle - permute with variable indices

 void MacroAssembler::swizzleInt8x16(FloatRegister rhs, FloatRegister lhsDest,
--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@ -13093,6 +13093,18 @@ static void AllTrueI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
  masm.allTrueInt32x4(rs, rd);
 }

+static void BitmaskI8x16(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
+  masm.bitmaskInt8x16(rs, rd);  // rd <- i8x16.bitmask(rs)
+}
+
+static void BitmaskI16x8(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
+  masm.bitmaskInt16x8(rs, rd);  // rd <- i16x8.bitmask(rs)
+}
+
+static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
+  masm.bitmaskInt32x4(rs, rd);  // rd <- i32x4.bitmask(rs)
+}
+
 static void Swizzle(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
                    RegV128 temp) {
  masm.swizzleInt8x16(rs, rsd, temp);
@ -14236,6 +14248,12 @@ bool BaseCompiler::emitBody() {
            CHECK_NEXT(dispatchVectorReduction(AllTrueI16x8));
          case uint32_t(SimdOp::I32x4AllTrue):
            CHECK_NEXT(dispatchVectorReduction(AllTrueI32x4));
+          case uint32_t(SimdOp::I8x16Bitmask):
+            CHECK_NEXT(dispatchVectorReduction(BitmaskI8x16));
+          case uint32_t(SimdOp::I16x8Bitmask):
+            CHECK_NEXT(dispatchVectorReduction(BitmaskI16x8));
+          case uint32_t(SimdOp::I32x4Bitmask):
+            CHECK_NEXT(dispatchVectorReduction(BitmaskI32x4));
          case uint32_t(SimdOp::I8x16ReplaceLane):
            CHECK_NEXT(dispatchReplaceLane(ReplaceLaneI8x16, ValType::I32, 16));
          case uint32_t(SimdOp::I16x8ReplaceLane):
--- a/js/src/wasm/WasmConstants.h
+++ b/js/src/wasm/WasmConstants.h
@ -505,7 +505,7 @@ enum class SimdOp {
  I8x16Neg = 0x61,
  I8x16AnyTrue = 0x62,
  I8x16AllTrue = 0x63,
-  // Bitmask = 0x64
+  I8x16Bitmask = 0x64,
  I8x16NarrowSI16x8 = 0x65,
  I8x16NarrowUI16x8 = 0x66,
  // Widen = 0x67
@ -537,7 +537,7 @@ enum class SimdOp {
  I16x8Neg = 0x81,
  I16x8AnyTrue = 0x82,
  I16x8AllTrue = 0x83,
-  // Bitmask = 0x84
+  I16x8Bitmask = 0x84,
  I16x8NarrowSI32x4 = 0x85,
  I16x8NarrowUI32x4 = 0x86,
  I16x8WidenLowSI8x16 = 0x87,
@ -569,7 +569,7 @@ enum class SimdOp {
  I32x4Neg = 0xa1,
  I32x4AnyTrue = 0xa2,
  I32x4AllTrue = 0xa3,
-  // Bitmask = 0xa4
+  I32x4Bitmask = 0xa4,
  // Narrow = 0xa5
  // Narrow = 0xa6
  I32x4WidenLowSI16x8 = 0xa7,
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@ -4877,6 +4877,9 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
          case uint32_t(SimdOp::I8x16AllTrue):
          case uint32_t(SimdOp::I16x8AllTrue):
          case uint32_t(SimdOp::I32x4AllTrue):
+          case uint32_t(SimdOp::I8x16Bitmask):
+          case uint32_t(SimdOp::I16x8Bitmask):
+          case uint32_t(SimdOp::I32x4Bitmask):
            CHECK(EmitReduceSimd128(f, SimdOp(op.b1)));
          case uint32_t(SimdOp::I8x16Shl):
          case uint32_t(SimdOp::I8x16ShrS):
--- a/js/src/wasm/WasmOpIter.cpp
+++ b/js/src/wasm/WasmOpIter.cpp
@ -314,6 +314,9 @@ OpKind wasm::Classify(OpBytes op) {
        case SimdOp::I16x8AllTrue:
        case SimdOp::I32x4AnyTrue:
        case SimdOp::I32x4AllTrue:
+        case SimdOp::I8x16Bitmask:
+        case SimdOp::I16x8Bitmask:
+        case SimdOp::I32x4Bitmask:
          WASM_SIMD_OP(OpKind::Conversion);
        case SimdOp::I8x16ReplaceLane:
        case SimdOp::I16x8ReplaceLane:
--- a/js/src/wasm/WasmValidate.cpp
+++ b/js/src/wasm/WasmValidate.cpp
@ -921,6 +921,9 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
          case uint32_t(SimdOp::I16x8AllTrue):
          case uint32_t(SimdOp::I32x4AnyTrue):
          case uint32_t(SimdOp::I32x4AllTrue):
+          case uint32_t(SimdOp::I8x16Bitmask):
+          case uint32_t(SimdOp::I16x8Bitmask):
+          case uint32_t(SimdOp::I32x4Bitmask):
            CHECK(iter.readConversion(ValType::V128, ValType::I32, &nothing));

          case uint32_t(SimdOp::I8x16ReplaceLane):