Mirror of https://github.com/mozilla/gecko-dev.git
Bug 1641140 - Implement the .bitmask instructions. r=jseward
For general background see the bug's description. This patch implements the iNxM.bitmask instructions for x86+x64 in baseline+ion, and adds some simple test cases.

Differential Revision: https://phabricator.services.mozilla.com/D77784
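For orientation, and not part of the patch itself: iNxM.bitmask consumes a v128 and produces an i32 whose bit i is the high (sign) bit of lane i, with the remaining result bits zero. A minimal scalar sketch in JavaScript (the helper name `bitmaskLanes` is invented for illustration):

```js
// Scalar reference model of iNxM.bitmask: bit i of the result is the top
// (sign) bit of lane i.  `lanes` holds the M lane values and `laneBits` is
// the lane width (8, 16, or 32).  Illustrative only.
function bitmaskLanes(lanes, laneBits) {
  let mask = 0;
  for (let i = 0; i < lanes.length; i++) {
    if (lanes[i] & (1 << (laneBits - 1))) {
      mask |= 1 << i;
    }
  }
  return mask;
}

// Sixteen 0x80 bytes set every bit: matches the 0xFFFF expected by the tests.
console.log(bitmaskLanes(new Array(16).fill(0x80), 8).toString(16)); // "ffff"
```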
This commit is contained in:
Parent 0cda75506a
Commit e34701fc0d
@@ -304,8 +304,8 @@ if (!wasmSimdSupported()) {
 } else {
   let reservedSimd = [
     0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e,
-    0x5f, 0x64, 0x67, 0x68, 0x69, 0x6a, 0x74, 0x75, 0x7a, 0x7c, 0x7d, 0x7e,
-    0x7f, 0x84, 0x94, 0x9a, 0x9c, 0x9d, 0x9e, 0x9f, 0xa4, 0xa5, 0xa6, 0xaf,
+    0x5f, 0x67, 0x68, 0x69, 0x6a, 0x74, 0x75, 0x7a, 0x7c, 0x7d, 0x7e,
+    0x7f, 0x94, 0x9a, 0x9c, 0x9d, 0x9e, 0x9f, 0xa5, 0xa6, 0xaf,
     0xb0, 0xb2, 0xb3, 0xb4, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc2,
     0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcf, 0xd0, 0xd2, 0xd3,
     0xd4, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xea,
@@ -218,6 +218,12 @@ function shru(count, width) {
   }
 }
 
+function popcount(n) {
+  n = n - ((n >> 1) & 0x55555555)
+  n = (n & 0x33333333) + ((n >> 2) & 0x33333333)
+  return ((n + (n >> 4) & 0xF0F0F0F) * 0x1010101) >> 24
+}
+
 // For each input array, a set of arrays of the proper length for v128, with
 // values in range but possibly of the wrong signedness (eg, for Int8Array, 128
 // is in range but is really -128). Also a unary operator `rectify` that
@@ -822,6 +828,54 @@ for ( let dope of [1, 7, 32, 195 ] ) {
     assertEq(ins.exports.alltrue_i32x4(), 0);
 }
 
+// Bitmask
+
+var ins = wasmEvalText(`
+  (module
+    (memory (export "mem") 1 1)
+    (func (export "bitmask_i8x16") (result i32)
+      (i8x16.bitmask (v128.load (i32.const 16))))
+    (func (export "bitmask_i16x8") (result i32)
+      (i16x8.bitmask (v128.load (i32.const 16))))
+    (func (export "bitmask_i32x4") (result i32)
+      (i32x4.bitmask (v128.load (i32.const 16)))))`);
+
+var mem8 = new Uint8Array(ins.exports.mem.buffer);
+var mem16 = new Uint16Array(ins.exports.mem.buffer);
+var mem32 = new Uint32Array(ins.exports.mem.buffer);
+
+set(mem8, 16, iota(16).map((_) => 0));
+assertEq(ins.exports.bitmask_i8x16(), 0);
+assertEq(ins.exports.bitmask_i16x8(), 0);
+assertEq(ins.exports.bitmask_i32x4(), 0);
+
+set(mem8, 16, iota(16).map((_) => 0x80));
+assertEq(ins.exports.bitmask_i8x16(), 0xFFFF);
+
+set(mem8, 16, iota(16).map((_) => 0x7F));
+assertEq(ins.exports.bitmask_i8x16(), 0);
+
+set(mem8, 16, iota(16).map((i) => popcount(i) == 1 ? 0x80 : 0));
+assertEq(ins.exports.bitmask_i8x16(), (1 << 1) | (1 << 2) | (1 << 4) | (1 << 8));
+
+set(mem16, 8, iota(8).map((i) => 0x8000))
+assertEq(ins.exports.bitmask_i16x8(), 0xFF)
+
+set(mem16, 8, iota(8).map((i) => 0x7FFF))
+assertEq(ins.exports.bitmask_i16x8(), 0)
+
+set(mem16, 8, iota(8).map((i) => popcount(i) == 1 ? 0x8000 : 0))
+assertEq(ins.exports.bitmask_i16x8(), (1 << 1) | (1 << 2) | (1 << 4));
+
+set(mem32, 4, iota(4).map((_) => 0x80000000))
+assertEq(ins.exports.bitmask_i32x4(), 0xF);
+
+set(mem32, 4, iota(4).map((_) => 0x7FFFFFFF))
+assertEq(ins.exports.bitmask_i32x4(), 0);
+
+set(mem32, 4, iota(4).map((i) => popcount(i) == 1 ? 0x80000000 : 0))
+assertEq(ins.exports.bitmask_i32x4(), (1 << 1) | (1 << 2));
+
 // Shifts
 //
 // lhs is v128 in memory
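The popcount-driven inputs above set the sign bit only in lanes whose index is a power of two (lanes 1, 2, 4 and 8 in the i8x16 case), which is where expectations like `(1 << 1) | (1 << 2) | (1 << 4) | (1 << 8)` come from. A self-contained sketch of that arithmetic, reusing the patch's popcount helper:

```js
// Same popcount as the test harness adds above.
function popcount(n) {
  n = n - ((n >> 1) & 0x55555555);
  n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
  return ((n + (n >> 4) & 0xF0F0F0F) * 0x1010101) >> 24;
}

// Lanes whose index has exactly one bit set contribute that bit to the mask.
let expected = 0;
for (let i = 0; i < 16; i++) {
  if (popcount(i) == 1) expected |= 1 << i; // i = 1, 2, 4, 8
}
console.log(expected === ((1 << 1) | (1 << 2) | (1 << 4) | (1 << 8))); // true
```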
@@ -2309,6 +2309,17 @@ class MacroAssembler : public MacroAssemblerSpecific {
   inline void allTrueInt32x4(FloatRegister src, Register dest)
       DEFINED_ON(x86_shared);
 
+  // Bitmask, ie extract and compress high bits of all lanes
+
+  inline void bitmaskInt8x16(FloatRegister src, Register dest)
+      DEFINED_ON(x86_shared);
+
+  inline void bitmaskInt16x8(FloatRegister src, Register dest)
+      DEFINED_ON(x86_shared);
+
+  inline void bitmaskInt32x4(FloatRegister src, Register dest)
+      DEFINED_ON(x86_shared);
+
   // Comparisons (integer and floating-point)
 
   inline void compareInt8x16(Assembler::Condition cond, FloatRegister rhs,
@@ -3194,6 +3194,15 @@ void CodeGenerator::visitWasmReduceSimd128(LWasmReduceSimd128* ins) {
     case wasm::SimdOp::I32x4AllTrue:
       masm.allTrueInt32x4(src, ToRegister(dest));
       break;
+    case wasm::SimdOp::I8x16Bitmask:
+      masm.bitmaskInt8x16(src, ToRegister(dest));
+      break;
+    case wasm::SimdOp::I16x8Bitmask:
+      masm.bitmaskInt16x8(src, ToRegister(dest));
+      break;
+    case wasm::SimdOp::I32x4Bitmask:
+      masm.bitmaskInt32x4(src, ToRegister(dest));
+      break;
     case wasm::SimdOp::I8x16ExtractLaneS:
       masm.extractLaneInt8x16(imm, src, ToRegister(dest));
       break;
@@ -1374,6 +1374,28 @@ void MacroAssembler::allTrueInt32x4(FloatRegister src, Register dest) {
   movzbl(dest, dest);
 }
 
+// Bitmask
+
+void MacroAssembler::bitmaskInt8x16(FloatRegister src, Register dest) {
+  vpmovmskb(src, dest);
+}
+
+void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) {
+  ScratchSimd128Scope scratch(*this);
+  // A three-instruction sequence is possible by using scratch as a don't-care
+  // input and shifting rather than masking at the end, but creates a false
+  // dependency on the old value of scratch. The better fix is to allow src to
+  // be clobbered.
+  moveSimd128(src, scratch);
+  vpacksswb(Operand(scratch), scratch, scratch);
+  vpmovmskb(scratch, dest);
+  andl(Imm32(0xFF), dest);
+}
+
+void MacroAssembler::bitmaskInt32x4(FloatRegister src, Register dest) {
+  vmovmskps(src, dest);
+}
+
 // Swizzle - permute with variable indices
 
 void MacroAssembler::swizzleInt8x16(FloatRegister rhs, FloatRegister lhsDest,
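The i16x8 case above relies on a packing trick: vpacksswb narrows each signed 16-bit lane to a signed-saturated byte, which preserves the sign, vpmovmskb then gathers the sign bits of all 16 bytes, and because both halves of the pack come from the same register, the low 8 bits of the movemask correspond to the original 8 lanes; the trailing andl discards the duplicated upper byte. A rough scalar model of that reasoning (a sketch only, not code from the patch; `bitmaskI16x8Model` is an invented name):

```js
// Model of the i16x8.bitmask lowering: pack with signed saturation (keeps the
// sign), take a byte-wise sign-bit movemask, then keep the low 8 bits.
const satToI8 = (x) => Math.min(127, Math.max(-128, x)); // per-lane vpacksswb

function bitmaskI16x8Model(lanes /* 8 signed 16-bit values */) {
  // Both halves of the packed register come from the same source, so the
  // 16 packed bytes are the 8 lanes, duplicated.
  const bytes = lanes.map(satToI8).concat(lanes.map(satToI8));
  let movemask = 0;                       // vpmovmskb: one bit per byte's sign
  bytes.forEach((b, i) => { if (b < 0) movemask |= 1 << i; });
  return movemask & 0xff;                 // andl(Imm32(0xFF), dest)
}

// Negative lanes (sign bit set) map to 1-bits in the result.
console.log(bitmaskI16x8Model([-1, 2, -3, 4, -5, 6, -7, 8]).toString(2)); // "1010101"
```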
@@ -13093,6 +13093,18 @@ static void AllTrueI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
   masm.allTrueInt32x4(rs, rd);
 }
 
+static void BitmaskI8x16(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
+  masm.bitmaskInt8x16(rs, rd);
+}
+
+static void BitmaskI16x8(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
+  masm.bitmaskInt16x8(rs, rd);
+}
+
+static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
+  masm.bitmaskInt32x4(rs, rd);
+}
+
 static void Swizzle(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
                     RegV128 temp) {
   masm.swizzleInt8x16(rs, rsd, temp);
@@ -14236,6 +14248,12 @@ bool BaseCompiler::emitBody() {
         CHECK_NEXT(dispatchVectorReduction(AllTrueI16x8));
       case uint32_t(SimdOp::I32x4AllTrue):
         CHECK_NEXT(dispatchVectorReduction(AllTrueI32x4));
+      case uint32_t(SimdOp::I8x16Bitmask):
+        CHECK_NEXT(dispatchVectorReduction(BitmaskI8x16));
+      case uint32_t(SimdOp::I16x8Bitmask):
+        CHECK_NEXT(dispatchVectorReduction(BitmaskI16x8));
+      case uint32_t(SimdOp::I32x4Bitmask):
+        CHECK_NEXT(dispatchVectorReduction(BitmaskI32x4));
       case uint32_t(SimdOp::I8x16ReplaceLane):
         CHECK_NEXT(dispatchReplaceLane(ReplaceLaneI8x16, ValType::I32, 16));
       case uint32_t(SimdOp::I16x8ReplaceLane):
@@ -505,7 +505,7 @@ enum class SimdOp {
   I8x16Neg = 0x61,
   I8x16AnyTrue = 0x62,
   I8x16AllTrue = 0x63,
-  // Bitmask = 0x64
+  I8x16Bitmask = 0x64,
   I8x16NarrowSI16x8 = 0x65,
   I8x16NarrowUI16x8 = 0x66,
   // Widen = 0x67
@@ -537,7 +537,7 @@ enum class SimdOp {
   I16x8Neg = 0x81,
   I16x8AnyTrue = 0x82,
   I16x8AllTrue = 0x83,
-  // Bitmask = 0x84
+  I16x8Bitmask = 0x84,
   I16x8NarrowSI32x4 = 0x85,
   I16x8NarrowUI32x4 = 0x86,
   I16x8WidenLowSI8x16 = 0x87,
@@ -569,7 +569,7 @@ enum class SimdOp {
   I32x4Neg = 0xa1,
   I32x4AnyTrue = 0xa2,
   I32x4AllTrue = 0xa3,
-  // Bitmask = 0xa4
+  I32x4Bitmask = 0xa4,
   // Narrow = 0xa5
   // Narrow = 0xa6
   I32x4WidenLowSI16x8 = 0xa7,
@@ -4877,6 +4877,9 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
       case uint32_t(SimdOp::I8x16AllTrue):
       case uint32_t(SimdOp::I16x8AllTrue):
       case uint32_t(SimdOp::I32x4AllTrue):
+      case uint32_t(SimdOp::I8x16Bitmask):
+      case uint32_t(SimdOp::I16x8Bitmask):
+      case uint32_t(SimdOp::I32x4Bitmask):
        CHECK(EmitReduceSimd128(f, SimdOp(op.b1)));
       case uint32_t(SimdOp::I8x16Shl):
       case uint32_t(SimdOp::I8x16ShrS):
@@ -314,6 +314,9 @@ OpKind wasm::Classify(OpBytes op) {
     case SimdOp::I16x8AllTrue:
     case SimdOp::I32x4AnyTrue:
     case SimdOp::I32x4AllTrue:
+    case SimdOp::I8x16Bitmask:
+    case SimdOp::I16x8Bitmask:
+    case SimdOp::I32x4Bitmask:
       WASM_SIMD_OP(OpKind::Conversion);
     case SimdOp::I8x16ReplaceLane:
     case SimdOp::I16x8ReplaceLane:
@@ -921,6 +921,9 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
       case uint32_t(SimdOp::I16x8AllTrue):
       case uint32_t(SimdOp::I32x4AnyTrue):
       case uint32_t(SimdOp::I32x4AllTrue):
+      case uint32_t(SimdOp::I8x16Bitmask):
+      case uint32_t(SimdOp::I16x8Bitmask):
+      case uint32_t(SimdOp::I32x4Bitmask):
        CHECK(iter.readConversion(ValType::V128, ValType::I32, &nothing));
 
       case uint32_t(SimdOp::I8x16ReplaceLane):