Bug 1691489 - Implement SIMD i8x16.popcnt op. r=lth

Differential Revision: https://phabricator.services.mozilla.com/D104770
2021-03-16 20:24:22 +00:00 · 2021-03-16 20:24:22 +00:00 · 82dc6f238e
--- a/js/src/jit-test/tests/wasm/binary.js
+++ b/js/src/jit-test/tests/wasm/binary.js
@ -303,8 +303,7 @@ if (!wasmSimdEnabled()) {
    }
 } else {
    let reservedSimd = [
-        0x62, 0x9a,
-        0xa2, 0xa5, 0xa6, 0xaf,
+        0x9a, 0xa2, 0xa5, 0xa6, 0xaf,
        0xb0, 0xb2, 0xb3, 0xb4, 0xbb,
        0xc0, 0xc2, 0xc5, 0xc6, 0xcf,
        0xd0, 0xd2, 0xd3, 0xd4, 0xd8, 0xd9, 0xda, 0xdb,
--- a/js/src/jit-test/tests/wasm/simd/ad-hack-non-cranelift.js
+++ b/js/src/jit-test/tests/wasm/simd/ad-hack-non-cranelift.js
@ -452,6 +452,22 @@ assertSame(get(mem32, 8, 8), [
 ]);


+// i8x16.popcnt: each result byte is the number of set bits in the input byte
+
+var ins = wasmEvalText(`
+  (module
+    (memory (export "mem") 1 1)
+    (func (export "i8x16_popcnt")
+      (v128.store (i32.const 0) (i8x16.popcnt (v128.load (i32.const 16)) )))
+  )`);
+
+var mem8 = new Int8Array(ins.exports.mem.buffer);
+
+set(mem8, 16, [0, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 3, -1, 0xF0, 0x11, 0xFE, 0x0F, 0xE]);
+ins.exports.i8x16_popcnt();  // loads v128 at byte 16, stores result at byte 0
+assertSame(get(mem8, 0, 16), [0,1,1,1,1,1,1,1,1,2,8,4,2,7,4,3]);
+
+
 /// Double-precision conversion instructions.
 /// f64x2.convert_low_i32x4_{u,s} / i32x4.trunc_sat_f64x2_{u,s}_zero
 /// f32x4.demote_f64x2_zero / f64x2.promote_low_f32x4
--- a/js/src/jit/MacroAssembler.h
+++ b/js/src/jit/MacroAssembler.h
@ -2539,6 +2539,14 @@ class MacroAssembler : public MacroAssemblerSpecific {
  inline void bitwiseSelectSimd128(FloatRegister onTrue, FloatRegister onFalse,
                                   FloatRegister maskDest) DEFINED_ON(arm64);

+  // Population count, ie, number of set bits in each 8-bit lane
+
+  inline void popcntInt8x16(FloatRegister src, FloatRegister dest,
+                            FloatRegister temp) DEFINED_ON(x86_shared);
+
+  inline void popcntInt8x16(FloatRegister src, FloatRegister dest)
+      DEFINED_ON(arm64);
+
  // Any lane true, ie, any bit set

  inline void anyTrueSimd128(FloatRegister src, Register dest)
--- a/js/src/jit/arm64/MacroAssembler-arm64-inl.h
+++ b/js/src/jit/arm64/MacroAssembler-arm64-inl.h
@ -2544,6 +2544,12 @@ void MacroAssembler::bitwiseSelectSimd128(FloatRegister onTrue,
  Bsl(Simd16B(maskDest), Simd16B(onTrue), Simd16B(onFalse));
 }

+// Population count, ie, number of set bits in each 8-bit lane
+
+void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest) {
+  Cnt(Simd16B(dest), Simd16B(src));  // destination operand comes first
+}
+
 // Any lane true, ie, any bit set

 void MacroAssembler::anyTrueSimd128(FloatRegister src, Register dest_) {
--- a/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
+++ b/js/src/jit/x86-shared/CodeGenerator-x86-shared.cpp
@ -3341,6 +3341,9 @@ void CodeGenerator::visitWasmUnarySimd128(LWasmUnarySimd128* ins) {
    case wasm::SimdOp::V128Not:
      masm.bitwiseNotSimd128(src, dest);
      break;
+    case wasm::SimdOp::I8x16Popcnt:
+      masm.popcntInt8x16(src, dest, ToFloatRegister(ins->temp()));
+      break;
    case wasm::SimdOp::I8x16Abs:
      masm.absInt8x16(src, dest);
      break;
--- a/js/src/jit/x86-shared/Lowering-x86-shared.cpp
+++ b/js/src/jit/x86-shared/Lowering-x86-shared.cpp
@ -1295,6 +1295,7 @@ void LIRGenerator::visitWasmUnarySimd128(MWasmUnarySimd128* ins) {
    case wasm::SimdOp::I32x4TruncUSatF32x4:
    case wasm::SimdOp::I32x4TruncSatF64x2SZero:
    case wasm::SimdOp::I32x4TruncSatF64x2UZero:
+    case wasm::SimdOp::I8x16Popcnt:
      tempReg = tempSimd128();
      // Prefer src == dest to avoid an unconditional src->dest move.
      useAtStart = true;
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-SIMD.cpp
@ -1359,3 +1359,20 @@ void MacroAssemblerX86Shared::unsignedTruncSatFloat64x2ToInt32x4(
  vaddpd(Operand(temp), dest, dest);
  vshufps(0x88, scratch, dest, dest);
 }
+
+// Compute the population count of each of the 16 bytes of `src` into `output`
+// by splitting every byte into two nibbles and looking each nibble up in a
+// 16-entry bit-count table with vpshufb, then adding the two results.
+//
+// `temp` and the SIMD scratch register are clobbered. `src` may be the same
+// register as `output` (the x86 lowering for this op prefers src == dest), so
+// every read of `src` must happen before the first write to `output`.
+void MacroAssemblerX86Shared::popcntInt8x16(FloatRegister src,
+                                            FloatRegister temp,
+                                            FloatRegister output) {
+  ScratchSimd128Scope scratch(asMasm());
+  // The nibble mask is an integer byte splat that only feeds integer ops, so
+  // load it with the Int variant rather than the Float one.
+  asMasm().loadConstantSimd128Int(SimdConstant::SplatX16(0x0f), scratch);
+  // temp = low nibble of every byte of src.
+  asMasm().moveSimd128Int(src, temp);
+  vpand(scratch, temp, temp);
+  // scratch = high nibble of every byte of src, still in the upper four bits.
+  // This is the last read of src.
+  vpandn(src, scratch, scratch);
+  // counts[n] is the number of set bits in the 4-bit value n.
+  int8_t counts[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
+  asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), output);
+  // Move the high nibbles down so they can index the table. The shift works on
+  // 16-bit lanes, but the low nibbles were cleared above, so only zero bits
+  // cross a byte boundary.
+  vpsrlw(Imm32(4), scratch, scratch);
+  // output = counts[low nibble] for every byte.
+  vpshufb(temp, output, output);
+  // temp is free again: reload the table into it and look up the high nibbles.
+  asMasm().loadConstantSimd128(SimdConstant::CreateX16(counts), temp);
+  vpshufb(scratch, temp, temp);
+  // Per-byte sum of the two nibble counts.
+  vpaddb(Operand(temp), output, output);
+}
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared-inl.h
@ -2206,6 +2206,13 @@ void MacroAssembler::bitwiseSelectSimd128(FloatRegister mask,
  MacroAssemblerX86Shared::selectSimd128(mask, onTrue, onFalse, temp, dest);
 }

+// Population count; the shared helper's parameter order is (src, temp, output)
+
+void MacroAssembler::popcntInt8x16(FloatRegister src, FloatRegister dest,
+                                   FloatRegister temp) {
+  MacroAssemblerX86Shared::popcntInt8x16(src, temp, dest);
+}
+
 // Comparisons (integer and floating-point)

 void MacroAssembler::compareInt8x16(Assembler::Condition cond,
--- a/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
+++ b/js/src/jit/x86-shared/MacroAssembler-x86-shared.h
@ -547,6 +547,8 @@ class MacroAssemblerX86Shared : public Assembler {
  void selectSimd128(FloatRegister mask, FloatRegister onTrue,
                     FloatRegister onFalse, FloatRegister temp,
                     FloatRegister output);
+  void popcntInt8x16(FloatRegister src, FloatRegister temp,
+                     FloatRegister output);  // note: temp precedes output

  // SIMD inline methods private to the implementation, that appear to be used.

--- a/js/src/wasm/WasmBaselineCompile.cpp
+++ b/js/src/wasm/WasmBaselineCompile.cpp
@ -14879,6 +14879,17 @@ static void WidenHighUI32x4(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
  masm.unsignedWidenHighInt32x4(rs, rd);
 }

+#  if defined(JS_CODEGEN_ARM64)
+static void PopcntI8x16(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
+  masm.popcntInt8x16(rs, rd);
+}
+#  else  // the non-ARM64 masm op takes an extra temp register
+static void PopcntI8x16(MacroAssembler& masm, RegV128 rs, RegV128 rd,
+                        RegV128 temp) {
+  masm.popcntInt8x16(rs, rd, temp);
+}
+#  endif  // JS_CODEGEN_ARM64
+
 static void AbsI8x16(MacroAssembler& masm, RegV128 rs, RegV128 rd) {
  masm.absInt8x16(rs, rd);
 }
@ -16814,6 +16825,8 @@ bool BaseCompiler::emitBody() {
            CHECK_NEXT(dispatchVectorUnary(SqrtF64x2));
          case uint32_t(SimdOp::V128Not):
            CHECK_NEXT(dispatchVectorUnary(NotV128));
+          case uint32_t(SimdOp::I8x16Popcnt):
+            CHECK_NEXT(dispatchVectorUnary(PopcntI8x16));
          case uint32_t(SimdOp::I8x16Abs):
            CHECK_NEXT(dispatchVectorUnary(AbsI8x16));
          case uint32_t(SimdOp::I16x8Abs):
--- a/js/src/wasm/WasmConstants.h
+++ b/js/src/wasm/WasmConstants.h
@ -575,7 +575,7 @@ enum class SimdOp {
  F64x2PromoteLowF32x4 = 0x5f,
  I8x16Abs = 0x60,
  I8x16Neg = 0x61,
-  // Unused = 0x62
+  I8x16Popcnt = 0x62,
  I8x16AllTrue = 0x63,
  I8x16Bitmask = 0x64,
  I8x16NarrowSI16x8 = 0x65,
--- a/js/src/wasm/WasmIonCompile.cpp
+++ b/js/src/wasm/WasmIonCompile.cpp
@ -5112,6 +5112,7 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
          case uint32_t(SimdOp::F64x2Neg):
          case uint32_t(SimdOp::F64x2Sqrt):
          case uint32_t(SimdOp::V128Not):
+          case uint32_t(SimdOp::I8x16Popcnt):
          case uint32_t(SimdOp::I8x16Abs):
          case uint32_t(SimdOp::I16x8Abs):
          case uint32_t(SimdOp::I32x4Abs):
--- a/js/src/wasm/WasmOpIter.cpp
+++ b/js/src/wasm/WasmOpIter.cpp
@ -515,6 +515,7 @@ OpKind wasm::Classify(OpBytes op) {
        case SimdOp::F64x2Neg:
        case SimdOp::F64x2Sqrt:
        case SimdOp::V128Not:
+        case SimdOp::I8x16Popcnt:
        case SimdOp::I8x16Abs:
        case SimdOp::I16x8Abs:
        case SimdOp::I32x4Abs:
--- a/js/src/wasm/WasmValidate.cpp
+++ b/js/src/wasm/WasmValidate.cpp
@ -1191,6 +1191,7 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
          case uint32_t(SimdOp::F64x2Neg):
          case uint32_t(SimdOp::F64x2Sqrt):
          case uint32_t(SimdOp::V128Not):
+          case uint32_t(SimdOp::I8x16Popcnt):
          case uint32_t(SimdOp::I8x16Abs):
          case uint32_t(SimdOp::I16x8Abs):
          case uint32_t(SimdOp::I32x4Abs):