Bug 1141121 - Immediate operands to atomics, x86 and x64. r=h4writer

Lars T Hansen 2015-03-25 10:51:12 +01:00
Parent fb023ad02c
Commit 50332264f3
5 changed files with 101 additions and 48 deletions

View file

@@ -38,6 +38,18 @@ function f2(ia, k) {
Atomics.sub(ia, 2, 1);
}
function f4(ia, k) {
// For effect, variable value. The generated code on x86/x64
// should be one LOCK ORB. (On ARM, there should be no
// sign-extend of the current value in the cell, otherwise this is
// still a LDREX/STREX loop.)
Atomics.or(ia, 6, k);
// Ditto constant value. Here the LOCK ORB should have an
// immediate operand.
Atomics.or(ia, 6, 1);
}
function g(ia, k) {
// For its value, variable value. The generated code on x86/x64
// should be one LOCK XADDB.
@@ -61,6 +73,16 @@ function g2(ia, k) {
sum += Atomics.sub(ia, 3, 1);
}
function g4(ia, k) {
// For its value, variable value. The generated code on x86/x64
// should be a loop around ORB ; CMPXCHGB
sum += Atomics.or(ia, 7, k);
// Ditto constant value. Here the ORB in the loop should have
// an immediate operand.
sum += Atomics.or(ia, 7, 1);
}
function mod(stdlib, ffi, heap) {
"use asm";
@@ -92,6 +114,8 @@ for ( var i=0 ; i < 10000 ; i++ ) {
g2(i8a, i % 10);
f3(i % 10);
g3(i % 10);
f4(i8a, i % 10);
g4(i8a, i % 10);
}
assertEq(i8a[0], ((10000 + 10000*4.5) << 24) >> 24);
@@ -100,3 +124,5 @@ assertEq(i8a[2], ((-10000 + -10000*4.5) << 24) >> 24);
assertEq(i8a[3], ((-10000 + -10000*4.5) << 24) >> 24);
assertEq(i8a[4], ((10000 + 10000*4.5) << 24) >> 24);
assertEq(i8a[5], ((10000 + 10000*4.5) << 24) >> 24);
assertEq(i8a[6], 15);
assertEq(i8a[7], 15);
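
For reference (not part of the commit), the same pattern expressed with C++ std::atomic shows why the constant case can become an immediate: when the result is unused, a fetch_or on a byte cell can compile to a single LOCK ORB, and a constant operand can be folded into that instruction as an immediate instead of being loaded into a register first.

#include <atomic>
#include <cstdint>

std::atomic<uint8_t> cell{0};

void or_for_effect(uint8_t k) {
    cell.fetch_or(k);  // variable operand: LOCK ORB with a register source
    cell.fetch_or(1);  // constant operand: LOCK ORB with an immediate
}

uint8_t or_for_value(uint8_t k) {
    // When the old value is needed, OR cannot use XADD, so this typically
    // becomes a CMPXCHGB loop; the constant can still appear as an immediate
    // in the ORB inside the loop, which is what g4 above checks for.
    return cell.fetch_or(k) + cell.fetch_or(1);
}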

View file

@@ -5059,6 +5059,8 @@ class LAtomicTypedArrayElementBinop : public LInstructionHelper<1, 3, 2>
public:
LIR_HEADER(AtomicTypedArrayElementBinop)
static const int32_t valueOp = 2;
LAtomicTypedArrayElementBinop(const LAllocation &elements, const LAllocation &index,
const LAllocation &value, const LDefinition &temp1,
const LDefinition &temp2)
@@ -5077,6 +5079,7 @@ class LAtomicTypedArrayElementBinop : public LInstructionHelper<1, 3, 2>
return getOperand(1);
}
const LAllocation *value() {
MOZ_ASSERT(valueOp == 2);
return getOperand(2);
}
const LDefinition *temp1() {
@@ -6491,6 +6494,9 @@ class LAsmJSAtomicBinopHeap : public LInstructionHelper<1, 2, 2>
{
public:
LIR_HEADER(AsmJSAtomicBinopHeap);
static const int32_t valueOp = 1;
LAsmJSAtomicBinopHeap(const LAllocation &ptr, const LAllocation &value,
const LDefinition &temp)
{
@@ -6503,6 +6509,7 @@ class LAsmJSAtomicBinopHeap : public LInstructionHelper<1, 2, 2>
return getOperand(0);
}
const LAllocation *value() {
MOZ_ASSERT(valueOp == 1);
return getOperand(1);
}
const LDefinition *temp() {
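
The new valueOp constants give the lowering code a named index for the 'value' operand, and the asserts above keep that constant in sync with the getOperand() calls. A minimal sketch of how the constant is consumed (the actual call sites appear in the lowering changes below; this line is lifted from them):

// Tie the instruction's output to operand #valueOp, so the register holding
// 'value' is overwritten with the result (which is what XADD does anyway)
// instead of forcing the output into a fixed register such as eax/rax.
defineReuseInput(lir, ins, LAsmJSAtomicBinopHeap::valueOp);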

View file

@@ -434,16 +434,13 @@ LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElemen
//
// We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
// LOCK OR, or LOCK XOR. We can do this even for the Uint32 case.
//
// If the operand is 8-bit we shall need to use an 8-bit register
// for it on x86 systems.
if (!ins->hasUses()) {
LAllocation value;
if (useI386ByteRegisters && ins->isByteArray())
if (useI386ByteRegisters && ins->isByteArray() && !ins->value()->isConstant())
value = useFixed(ins->value(), ebx);
else
value = useRegister(ins->value());
value = useRegisterOrConstant(ins->value());
LAtomicTypedArrayElementBinopForEffect *lir =
new(alloc()) LAtomicTypedArrayElementBinopForEffect(elements, index, value);
@@ -459,8 +456,7 @@ LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElemen
// movl src, output
// lock xaddl output, mem
//
// For the 8-bit variants XADD needs a byte register for the
// output only.
// For the 8-bit variants XADD needs a byte register for the output.
//
// For AND/OR/XOR we need to use a CMPXCHG loop:
//
@@ -484,23 +480,18 @@ LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElemen
// - eax is the first temp
// - we also need a second temp
//
// For simplicity we force the 'value' into a byte register if the
// array has 1-byte elements, though that could be worked around.
//
// For simplicity we also choose fixed byte registers even when
// any available byte register would have been OK.
//
// There are optimization opportunities:
// - better register allocation and instruction selection, Bug #1077036.
// - better register allocation in the x86 8-bit case, Bug #1077036.
bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
bool fixedOutput = true;
bool reuseInput = false;
LDefinition tempDef1 = LDefinition::BogusTemp();
LDefinition tempDef2 = LDefinition::BogusTemp();
LAllocation value;
if (ins->arrayType() == Scalar::Uint32 && IsFloatingPointType(ins->type())) {
value = useRegister(ins->value());
value = useRegisterOrConstant(ins->value());
fixedOutput = false;
if (bitOp) {
tempDef1 = tempFixed(eax);
@@ -509,13 +500,22 @@ LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElemen
tempDef1 = temp();
}
} else if (useI386ByteRegisters && ins->isByteArray()) {
value = useFixed(ins->value(), ebx);
if (ins->value()->isConstant())
value = useRegisterOrConstant(ins->value());
else
value = useFixed(ins->value(), ebx);
if (bitOp)
tempDef1 = tempFixed(ecx);
} else if (bitOp) {
value = useRegisterOrConstant(ins->value());
tempDef1 = temp();
} else if (ins->value()->isConstant()) {
fixedOutput = false;
value = useRegisterOrConstant(ins->value());
} else {
value = useRegister(ins->value());
if (bitOp)
tempDef1 = temp();
fixedOutput = false;
reuseInput = true;
value = useRegisterAtStart(ins->value());
}
LAtomicTypedArrayElementBinop *lir =
@@ -523,6 +523,8 @@ LIRGeneratorX86Shared::lowerAtomicTypedArrayElementBinop(MAtomicTypedArrayElemen
if (fixedOutput)
defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
else if (reuseInput)
defineReuseInput(lir, ins, LAtomicTypedArrayElementBinop::valueOp);
else
define(lir, ins);
}
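
The reuseInput path leans on a property of XADD noted in the comments above: after LOCK XADD the source register holds the old contents of the memory cell, so when 'value' is already in a register it can double as the output. A hedged C++ illustration of the resulting single-instruction fetch-add (plain std::atomic, not SpiderMonkey code):

#include <atomic>
#include <cstdint>

// On x86/x64 this compiles to a single LOCK XADD; the register that carried
// 'value' comes back holding the old contents of 'mem', which is exactly the
// behaviour defineReuseInput exposes to the register allocator.
uint32_t fetch_add_via_xadd(std::atomic<uint32_t> &mem, uint32_t value) {
    return mem.fetch_add(value);
}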

View file

@@ -233,42 +233,56 @@ LIRGeneratorX64::visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins)
if (!ins->hasUses()) {
LAsmJSAtomicBinopHeapForEffect *lir =
new(alloc()) LAsmJSAtomicBinopHeapForEffect(useRegister(ptr),
useRegister(ins->value()));
useRegisterOrConstant(ins->value()));
add(lir, ins);
return;
}
// Case 2: the result of the operation is used.
//
// For ADD and SUB we'll use XADD (with word and byte ops as appropriate):
// For ADD and SUB we'll use XADD with word and byte ops as
// appropriate. Any output register can be used and if value is a
// register it's best if it's the same as output:
//
// movl value, output
// movl value, output ; if value != output
// lock xaddl output, mem
//
// For AND/OR/XOR we need to use a CMPXCHG loop:
// For AND/OR/XOR we need to use a CMPXCHG loop, and the output is
// always in rax:
//
// movl *mem, eax
// L: mov eax, temp
// movl *mem, rax
// L: mov rax, temp
// andl value, temp
// lock cmpxchg temp, mem ; reads eax also
// lock cmpxchg temp, mem ; reads rax also
// jnz L
// ; result in eax
// ; result in rax
//
// Note the placement of L, cmpxchg will update eax with *mem if
// Note the placement of L, cmpxchg will update rax with *mem if
// *mem does not have the expected value, so reloading it at the
// top of the loop would be redundant.
//
// We want to fix eax as the output. We also need a temp for
// the intermediate value.
bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
LAllocation value = useRegister(ins->value());
LDefinition tempDef = bitOp ? temp() : LDefinition::BogusTemp();
bool reuseInput = false;
LAllocation value;
if (bitOp || ins->value()->isConstant()) {
value = useRegisterOrConstant(ins->value());
} else {
reuseInput = true;
value = useRegisterAtStart(ins->value());
}
LAsmJSAtomicBinopHeap *lir =
new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr), value, tempDef);
new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr),
value,
bitOp ? temp() : LDefinition::BogusTemp());
defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
if (reuseInput)
defineReuseInput(lir, ins, LAsmJSAtomicBinopHeap::valueOp);
else if (bitOp)
defineFixed(lir, ins, LAllocation(AnyRegister(rax)));
else
define(lir, ins);
}
void
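
The CMPXCHG loop in the comment above has a direct C++ analogue using compare_exchange; the sketch below (not from the commit) mirrors its structure, including the detail that a failed CMPXCHG reloads the expected value, so the loop does not re-read memory at the top:

#include <atomic>
#include <cstdint>

uint32_t fetch_or_via_cmpxchg(std::atomic<uint32_t> &mem, uint32_t value) {
    uint32_t old = mem.load();                            // movl *mem, rax
    uint32_t desired;
    do {
        desired = old | value;                            // L: mov rax, temp ; orl value, temp
        // compare_exchange_weak writes the current *mem back into 'old' on
        // failure, just as CMPXCHG updates rax, so no explicit reload here.
    } while (!mem.compare_exchange_weak(old, desired));   // lock cmpxchg temp, mem ; jnz L
    return old;                                           // result in rax
}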

View file

@@ -317,13 +317,14 @@ LIRGeneratorX86::visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins)
// Case 1: the result of the operation is not used.
//
// We'll emit a single instruction: LOCK ADD, LOCK SUB, LOCK AND,
// LOCK OR, or LOCK XOR.
//
// For the 8-bit variant the ops need a byte register for the
// value; just pin the value to ebx.
// LOCK OR, or LOCK XOR. These can all take an immediate.
if (!ins->hasUses()) {
LAllocation value = byteArray ? useFixed(ins->value(), ebx) : useRegister(ins->value());
LAllocation value;
if (byteArray && !ins->value()->isConstant())
value = useFixed(ins->value(), ebx);
else
value = useRegisterOrConstant(ins->value());
LAsmJSAtomicBinopHeapForEffect *lir =
new(alloc()) LAsmJSAtomicBinopHeapForEffect(useRegister(ptr), value);
lir->setAddrTemp(temp());
@@ -361,31 +362,34 @@ LIRGeneratorX86::visitAsmJSAtomicBinopHeap(MAsmJSAtomicBinopHeap *ins)
// For the 8-bit variants the temp must have a byte register.
//
// There are optimization opportunities:
// - better register allocation and instruction selection, Bug #1077036.
// - better 8-bit register allocation and instruction selection, Bug #1077036.
bool bitOp = !(ins->operation() == AtomicFetchAddOp || ins->operation() == AtomicFetchSubOp);
LDefinition tempDef = LDefinition::BogusTemp();
LAllocation value;
// Optimization opportunity: "value" need not be pinned to something that
// has a byte register unless the back-end insists on using a byte move
// for the setup or the payload computation, which really it need not do.
if (byteArray) {
value = useFixed(ins->value(), ebx);
if (bitOp)
tempDef = tempFixed(ecx);
} else {
value = useRegister(ins->value());
} else if (bitOp || ins->value()->isConstant()) {
value = useRegisterOrConstant(ins->value());
if (bitOp)
tempDef = temp();
} else {
value = useRegisterAtStart(ins->value());
}
LAsmJSAtomicBinopHeap *lir =
new(alloc()) LAsmJSAtomicBinopHeap(useRegister(ptr), value, tempDef);
lir->setAddrTemp(temp());
defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
if (byteArray || bitOp)
defineFixed(lir, ins, LAllocation(AnyRegister(eax)));
else if (ins->value()->isConstant())
define(lir, ins);
else
defineReuseInput(lir, ins, LAsmJSAtomicBinopHeap::valueOp);
}
void
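
As a final illustration (again plain std::atomic, an assumption rather than the asm.js heap machinery), the 8-bit cases show why the byte-register pinning above exists: XADDB only needs a byte-addressable output, while the CMPXCHGB loop needs the accumulator and the temp to be byte-addressable too, which on x86-32 means fixing registers like ebx and ecx unless the operand is an immediate.

#include <atomic>
#include <cstdint>

std::atomic<uint8_t> byte_cell{0};

// One LOCK XADDB; only the output register must be byte-addressable.
uint8_t add8(uint8_t v) { return byte_cell.fetch_add(v); }

// A CMPXCHGB loop; the expected value lives in al and the new value also
// needs a byte register, hence the fixed ebx/ecx choices in the lowering.
uint8_t or8(uint8_t v) { return byte_cell.fetch_or(v); }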