Bug 1641140 - Implement the .bitmask instructions. r=jseward

For general background see the bug's description.  This patch
implements the iNxM.bitmask instructions for x86+x64 in baseline+ion,
and adds some simple test cases.

Differential Revision: https://phabricator.services.mozilla.com/D77784
This commit is contained in:
Lars T Hansen 2020-06-08 08:02:37 +00:00
Родитель 0cda75506a
Коммит e34701fc0d
10 изменённых файлов: 128 добавлений и 5 удалений

Просмотреть файл

@ -304,8 +304,8 @@ if (!wasmSimdSupported()) {
} else {
let reservedSimd = [
0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e,
0x5f, 0x64, 0x67, 0x68, 0x69, 0x6a, 0x74, 0x75, 0x7a, 0x7c, 0x7d, 0x7e,
0x7f, 0x84, 0x94, 0x9a, 0x9c, 0x9d, 0x9e, 0x9f, 0xa4, 0xa5, 0xa6, 0xaf,
0x5f, 0x67, 0x68, 0x69, 0x6a, 0x74, 0x75, 0x7a, 0x7c, 0x7d, 0x7e,
0x7f, 0x94, 0x9a, 0x9c, 0x9d, 0x9e, 0x9f, 0xa5, 0xa6, 0xaf,
0xb0, 0xb2, 0xb3, 0xb4, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc2,
0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcf, 0xd0, 0xd2, 0xd3,
0xd4, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xea,

Просмотреть файл

@ -218,6 +218,12 @@ function shru(count, width) {
}
}
// Population count: the number of 1 bits in the 32-bit integer value of `n`.
// Uses the usual parallel reduction: 2-bit sums, then 4-bit sums, then 8-bit
// sums, and finally a multiply that accumulates the four byte sums into the
// top byte of the 32-bit result.
function popcount(n) {
    const pairSums = n - ((n >> 1) & 0x55555555);
    const nibbleSums = (pairSums & 0x33333333) + ((pairSums >> 2) & 0x33333333);
    const byteSums = (nibbleSums + (nibbleSums >> 4)) & 0xF0F0F0F;
    return (byteSums * 0x1010101) >> 24;
}
// For each input array, a set of arrays of the proper length for v128, with
// values in range but possibly of the wrong signedness (eg, for Int8Array, 128
// is in range but is really -128). Also a unary operator `rectify` that
@ -822,6 +828,54 @@ for ( let dope of [1, 7, 32, 195 ] ) {
assertEq(ins.exports.alltrue_i32x4(), 0);
}
// Bitmask
//
// Every exported function loads a v128 from byte offset 16 of the memory and
// applies one of the iNxM.bitmask operations to it.  The expected values below
// have bit i of the result set exactly when lane i was stored with its high bit
// set; lanes holding the largest value without the high bit produce zero.

var ins = wasmEvalText(`
(module
(memory (export "mem") 1 1)
(func (export "bitmask_i8x16") (result i32)
(i8x16.bitmask (v128.load (i32.const 16))))
(func (export "bitmask_i16x8") (result i32)
(i16x8.bitmask (v128.load (i32.const 16))))
(func (export "bitmask_i32x4") (result i32)
(i32x4.bitmask (v128.load (i32.const 16)))))`);

// Three views of the same wasm memory, one per lane width.
var mem8 = new Uint8Array(ins.exports.mem.buffer);
var mem16 = new Uint16Array(ins.exports.mem.buffer);
var mem32 = new Uint32Array(ins.exports.mem.buffer);

{
    const {bitmask_i8x16, bitmask_i16x8, bitmask_i32x4} = ins.exports;

    // `count` lanes that all hold `value`.
    const splat = (count, value) => iota(count).map(() => value);

    // `count` lanes holding `value` where the lane index has exactly one bit
    // set (1, 2, 4, ...) and zero everywhere else.
    const sparse = (count, value) =>
        iota(count).map((lane) => popcount(lane) == 1 ? value : 0);

    // An all-zero vector yields zero for every lane width.
    set(mem8, 16, splat(16, 0));
    for (const bitmask of [bitmask_i8x16, bitmask_i16x8, bitmask_i32x4]) {
        assertEq(bitmask(), 0);
    }

    // One row per lane width.  `at` is the position passed to set() for each
    // view (presumably an element index, so 16, 8 and 4 all name byte offset
    // 16 -- confirm against set()).  `high` is a lane with only its top bit
    // set and `belowHigh` is the largest lane value with the top bit clear.
    const cases = [
        {view: mem8, at: 16, lanes: 16, high: 0x80, belowHigh: 0x7F,
         bitmask: bitmask_i8x16, allSet: 0xFFFF,
         sparseSet: (1 << 1) | (1 << 2) | (1 << 4) | (1 << 8)},
        {view: mem16, at: 8, lanes: 8, high: 0x8000, belowHigh: 0x7FFF,
         bitmask: bitmask_i16x8, allSet: 0xFF,
         sparseSet: (1 << 1) | (1 << 2) | (1 << 4)},
        {view: mem32, at: 4, lanes: 4, high: 0x80000000, belowHigh: 0x7FFFFFFF,
         bitmask: bitmask_i32x4, allSet: 0xF,
         sparseSet: (1 << 1) | (1 << 2)},
    ];

    for (const {view, at, lanes, high, belowHigh, bitmask, allSet, sparseSet} of cases) {
        // High bit set in every lane: every result bit is set.
        set(view, at, splat(lanes, high));
        assertEq(bitmask(), allSet);

        // Every bit except the high bit set: nothing is extracted.
        set(view, at, splat(lanes, belowHigh));
        assertEq(bitmask(), 0);

        // High bit set only in lanes 1, 2, 4, ...: the result bit positions
        // match the lane indices.
        set(view, at, sparse(lanes, high));
        assertEq(bitmask(), sparseSet);
    }
}
// Shifts
//
// lhs is v128 in memory

Просмотреть файл

@ -2309,6 +2309,17 @@ class MacroAssembler : public MacroAssemblerSpecific {
inline void allTrueInt32x4(FloatRegister src, Register dest)
DEFINED_ON(x86_shared);
// Bitmask, i.e., extract the high bit of every lane of `src` and compress
// those bits into the integer register `dest`.  One variant per lane shape
// (16 x 8-bit, 8 x 16-bit, 4 x 32-bit); all three are currently defined only
// for x86_shared.
inline void bitmaskInt8x16(FloatRegister src, Register dest)
DEFINED_ON(x86_shared);
inline void bitmaskInt16x8(FloatRegister src, Register dest)
DEFINED_ON(x86_shared);
inline void bitmaskInt32x4(FloatRegister src, Register dest)
DEFINED_ON(x86_shared);
// Comparisons (integer and floating-point)
inline void compareInt8x16(Assembler::Condition cond, FloatRegister rhs,

Просмотреть файл

@ -3194,6 +3194,15 @@ void CodeGenerator::visitWasmReduceSimd128(LWasmReduceSimd128* ins) {
case wasm::SimdOp::I32x4AllTrue:
masm.allTrueInt32x4(src, ToRegister(dest));
break;
case wasm::SimdOp::I8x16Bitmask:
masm.bitmaskInt8x16(src, ToRegister(dest));
break;
case wasm::SimdOp::I16x8Bitmask:
masm.bitmaskInt16x8(src, ToRegister(dest));
break;
case wasm::SimdOp::I32x4Bitmask:
masm.bitmaskInt32x4(src, ToRegister(dest));
break;
case wasm::SimdOp::I8x16ExtractLaneS:
masm.extractLaneInt8x16(imm, src, ToRegister(dest));
break;

Просмотреть файл

@ -1374,6 +1374,28 @@ void MacroAssembler::allTrueInt32x4(FloatRegister src, Register dest) {
movzbl(dest, dest);
}
// Bitmask
// Bitmask of sixteen 8-bit lanes: a single vpmovmskb from `src` into `dest`.
// Per the x86 PMOVMSKB definition, that instruction gathers the most
// significant bit of each byte of the source vector into the low 16 bits of
// the destination, so no further fixup is emitted here.
void MacroAssembler::bitmaskInt8x16(FloatRegister src, Register dest) {
vpmovmskb(src, dest);
}
// Bitmask of eight 16-bit lanes.  There is no direct word-granularity
// equivalent of vpmovmskb used here, so the lanes are first narrowed to bytes
// in the scratch register and the byte mask is then extracted from that.
// `src` is only read; all intermediate work happens in the scratch register.
void MacroAssembler::bitmaskInt16x8(FloatRegister src, Register dest) {
ScratchSimd128Scope scratch(*this);
// A three-instruction sequence is possible by using scratch as a don't-care
// input and shifting rather than masking at the end, but creates a false
// dependency on the old value of scratch. The better fix is to allow src to
// be clobbered.
moveSimd128(src, scratch);
// Pack scratch with itself.  Per the x86 PACKSSWB definition the narrowing
// uses signed saturation, which keeps the sign (high) bit of every lane, and
// because both inputs are the same register the eight narrowed bytes appear
// in both halves of the result.
vpacksswb(Operand(scratch), scratch, scratch);
vpmovmskb(scratch, dest);
// Keep only the low eight mask bits; the upper eight come from the duplicated
// half of the packed vector.
andl(Imm32(0xFF), dest);
}
// Bitmask of four 32-bit lanes: a single vmovmskps from `src` into `dest`.
// Per the x86 MOVMSKPS definition, that instruction extracts the sign bit of
// each 32-bit element, which is the same bit position as the high bit of a
// 32-bit integer lane.
void MacroAssembler::bitmaskInt32x4(FloatRegister src, Register dest) {
vmovmskps(src, dest);
}
// Swizzle - permute with variable indices
void MacroAssembler::swizzleInt8x16(FloatRegister rhs, FloatRegister lhsDest,

Просмотреть файл

@ -13093,6 +13093,18 @@ static void AllTrueI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
masm.allTrueInt32x4(rs, rd);
}
// Baseline-compiler adapter for i8x16.bitmask: forwards the vector source `rs`
// and the i32 destination `rd` to MacroAssembler::bitmaskInt8x16.  Passed to
// dispatchVectorReduction from emitBody.
static void BitmaskI8x16(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
masm.bitmaskInt8x16(rs, rd);
}
// Baseline-compiler adapter for i16x8.bitmask: forwards the vector source `rs`
// and the i32 destination `rd` to MacroAssembler::bitmaskInt16x8.  Passed to
// dispatchVectorReduction from emitBody.
static void BitmaskI16x8(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
masm.bitmaskInt16x8(rs, rd);
}
// Baseline-compiler adapter for i32x4.bitmask: forwards the vector source `rs`
// and the i32 destination `rd` to MacroAssembler::bitmaskInt32x4.  Passed to
// dispatchVectorReduction from emitBody.
static void BitmaskI32x4(MacroAssembler& masm, RegV128 rs, RegI32 rd) {
masm.bitmaskInt32x4(rs, rd);
}
static void Swizzle(MacroAssembler& masm, RegV128 rs, RegV128 rsd,
RegV128 temp) {
masm.swizzleInt8x16(rs, rsd, temp);
@ -14236,6 +14248,12 @@ bool BaseCompiler::emitBody() {
CHECK_NEXT(dispatchVectorReduction(AllTrueI16x8));
case uint32_t(SimdOp::I32x4AllTrue):
CHECK_NEXT(dispatchVectorReduction(AllTrueI32x4));
case uint32_t(SimdOp::I8x16Bitmask):
CHECK_NEXT(dispatchVectorReduction(BitmaskI8x16));
case uint32_t(SimdOp::I16x8Bitmask):
CHECK_NEXT(dispatchVectorReduction(BitmaskI16x8));
case uint32_t(SimdOp::I32x4Bitmask):
CHECK_NEXT(dispatchVectorReduction(BitmaskI32x4));
case uint32_t(SimdOp::I8x16ReplaceLane):
CHECK_NEXT(dispatchReplaceLane(ReplaceLaneI8x16, ValType::I32, 16));
case uint32_t(SimdOp::I16x8ReplaceLane):

Просмотреть файл

@ -505,7 +505,7 @@ enum class SimdOp {
I8x16Neg = 0x61,
I8x16AnyTrue = 0x62,
I8x16AllTrue = 0x63,
// Bitmask = 0x64
I8x16Bitmask = 0x64,
I8x16NarrowSI16x8 = 0x65,
I8x16NarrowUI16x8 = 0x66,
// Widen = 0x67
@ -537,7 +537,7 @@ enum class SimdOp {
I16x8Neg = 0x81,
I16x8AnyTrue = 0x82,
I16x8AllTrue = 0x83,
// Bitmask = 0x84
I16x8Bitmask = 0x84,
I16x8NarrowSI32x4 = 0x85,
I16x8NarrowUI32x4 = 0x86,
I16x8WidenLowSI8x16 = 0x87,
@ -569,7 +569,7 @@ enum class SimdOp {
I32x4Neg = 0xa1,
I32x4AnyTrue = 0xa2,
I32x4AllTrue = 0xa3,
// Bitmask = 0xa4
I32x4Bitmask = 0xa4,
// Narrow = 0xa5
// Narrow = 0xa6
I32x4WidenLowSI16x8 = 0xa7,

Просмотреть файл

@ -4877,6 +4877,9 @@ static bool EmitBodyExprs(FunctionCompiler& f) {
case uint32_t(SimdOp::I8x16AllTrue):
case uint32_t(SimdOp::I16x8AllTrue):
case uint32_t(SimdOp::I32x4AllTrue):
case uint32_t(SimdOp::I8x16Bitmask):
case uint32_t(SimdOp::I16x8Bitmask):
case uint32_t(SimdOp::I32x4Bitmask):
CHECK(EmitReduceSimd128(f, SimdOp(op.b1)));
case uint32_t(SimdOp::I8x16Shl):
case uint32_t(SimdOp::I8x16ShrS):

Просмотреть файл

@ -314,6 +314,9 @@ OpKind wasm::Classify(OpBytes op) {
case SimdOp::I16x8AllTrue:
case SimdOp::I32x4AnyTrue:
case SimdOp::I32x4AllTrue:
case SimdOp::I8x16Bitmask:
case SimdOp::I16x8Bitmask:
case SimdOp::I32x4Bitmask:
WASM_SIMD_OP(OpKind::Conversion);
case SimdOp::I8x16ReplaceLane:
case SimdOp::I16x8ReplaceLane:

Просмотреть файл

@ -921,6 +921,9 @@ static bool DecodeFunctionBodyExprs(const ModuleEnvironment& env,
case uint32_t(SimdOp::I16x8AllTrue):
case uint32_t(SimdOp::I32x4AnyTrue):
case uint32_t(SimdOp::I32x4AllTrue):
case uint32_t(SimdOp::I8x16Bitmask):
case uint32_t(SimdOp::I16x8Bitmask):
case uint32_t(SimdOp::I32x4Bitmask):
CHECK(iter.readConversion(ValType::V128, ValType::I32, &nothing));
case uint32_t(SimdOp::I8x16ReplaceLane):