Mirror of https://github.com/mozilla/gecko-dev.git
Bug 860965 - Move 1D ParallelArray operations to Array. (r=luke)
This commit is contained in:
Parent
bd907d5829
Commit
82db710cfc
@@ -983,6 +983,7 @@ selfhosting_srcs := \

selfhosted_out_h_deps := \
  $(selfhosting_srcs) \
  $(srcdir)/builtin/ParallelArrayCommonOps.js \
  $(srcdir)/js.msg \
  $(srcdir)/builtin/embedjs.py \
  $(NULL)

@@ -391,3 +391,96 @@ function ArrayStaticReduceRight(list, callbackfn) {
  else
    return callFunction(ArrayReduceRight, list, callbackfn);
}

#ifdef ENABLE_PARALLEL_JS

/* Include the common 1D operations. */
#define PA_LENGTH(a) (a.length)
#define PA_GET(a, i) (a[i])
#define PA_NEW(length, buffer, offset) buffer

#define PA_MAP_NAME ArrayParallelMap
#define PA_REDUCE_NAME ArrayParallelReduce
#define PA_SCAN_NAME ArrayParallelScan
#define PA_SCATTER_NAME ArrayParallelScatter
#define PA_FILTER_NAME ArrayParallelFilter

#include "ParallelArrayCommonOps.js"

#undef PA_LENGTH
#undef PA_GET
#undef PA_NEW
#undef PA_MAP_NAME
#undef PA_REDUCE_NAME
#undef PA_SCAN_NAME
#undef PA_SCATTER_NAME
#undef PA_FILTER_NAME

/**
 * "Comprehension form": This is the function invoked for |Array.pbuild(len,
 * fn)|. It creates a new array with length |len| where index |i| is equal to
 * |fn(i)|.
 *
 * The final |mode| argument is an internal argument used only
 * during our unit-testing.
 */
function ArrayStaticParallelBuild(length, func, mode) {
  if (!IS_UINT32(length))
    ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(1, func));

  var buffer = NewDenseArray(length);

  parallel: for (;;) {
    // Avoid parallel compilation if we are already nested in another
    // parallel section or the user told us not to parallelize. The
    // use of a for (;;) loop is working around some ion limitations:
    //
    // - Breaking out of named blocks does not currently work (bug 684384);
    // - Unreachable Code Elim. can't properly handle if (a && b) (bug 669796)
    if (ShouldForceSequential())
      break parallel;
    if (!TRY_PARALLEL(mode))
      break parallel;

    var chunks = ComputeNumChunks(length);
    var numSlices = ParallelSlices();
    var info = ComputeAllSliceBounds(chunks, numSlices);
    ParallelDo(constructSlice, CheckParallel(mode));
    return buffer;
  }

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);
  fill(0, length);
  return buffer;

  function constructSlice(sliceId, numSlices, warmup) {
    var chunkPos = info[SLICE_POS(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];

    if (warmup && chunkEnd > chunkPos)
      chunkEnd = chunkPos + 1;

    while (chunkPos < chunkEnd) {
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      fill(indexStart, indexEnd);
      UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
    }
  }

  function fill(indexStart, indexEnd) {
    for (var i = indexStart; i < indexEnd; i++)
      UnsafeSetElement(buffer, i, func(i));
  }
}

/*
 * Mark the comprehension form as clone-at-callsite. See note in
 * ParallelArrayCommonOps.js
 */
SetScriptHints(ArrayStaticParallelBuild, { cloneAtCallsite: true });

#endif /* ENABLE_PARALLEL_JS */
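For orientation, this is roughly what the shared template yields for Array once the preprocessor substitutes the PA_* macros defined above. This is an editorial sketch of the sequential-fallback path only (the parallel path is elided), not the literal preprocessed output:

function ArrayParallelMap(func, mode) {            // PA_MAP_NAME
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));

  var self = ToObject(this);
  var length = self.length;                        // PA_LENGTH(self) => (self.length)
  var buffer = NewDenseArray(length);

  // ... parallel path elided; see the template in ParallelArrayCommonOps.js below ...

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);                   // also a macro; defined in the shared file
  for (var i = 0; i < length; i++)
    UnsafeSetElement(buffer, i, func(self[i], i, self));  // PA_GET(self, i) => (self[i])
  return buffer;                                   // PA_NEW(length, buffer, 0) => buffer
}
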
@@ -4,71 +4,28 @@

// FIXME(bug 844882): Parallel array properties should not be exposed.

// The mode asserts options object.
#define TRY_PARALLEL(MODE) \
  ((!MODE || MODE.mode === "par"))
#define ASSERT_SEQUENTIAL_IS_OK(MODE) \
  do { if (MODE) AssertSequentialIsOK(MODE) } while(false)
/* Include the common 1D operations. */
#define PA_LENGTH(a) (a.shape[0])
#define PA_GET(a, i) (a.get(i))
#define PA_NEW(length, buffer, offset) \
  (NewParallelArray(ParallelArrayView, [length], buffer, offset))

// Slice array: see ComputeAllSliceBounds()
#define SLICE_INFO(START, END) START, END, START, 0
#define SLICE_START(ID) ((ID << 2) + 0)
#define SLICE_END(ID) ((ID << 2) + 1)
#define SLICE_POS(ID) ((ID << 2) + 2)
#define PA_MAP_NAME ParallelArrayMap
#define PA_REDUCE_NAME ParallelArrayReduce
#define PA_SCAN_NAME ParallelArrayScan
#define PA_SCATTER_NAME ParallelArrayScatter
#define PA_FILTER_NAME ParallelArrayFilter

// How many items at a time do we do recomp. for parallel execution.
// Note that filter currently assumes that this is no greater than 32
// in order to make use of a bitset.
#define CHUNK_SHIFT 5
#define CHUNK_SIZE 32
#include "ParallelArrayCommonOps.js"

// Safe versions of ARRAY.push(ELEMENT)
#define ARRAY_PUSH(ARRAY, ELEMENT) \
  callFunction(std_Array_push, ARRAY, ELEMENT);
#define ARRAY_SLICE(ARRAY, ELEMENT) \
  callFunction(std_Array_slice, ARRAY, ELEMENT);

/**
 * Determine the number of chunks of size CHUNK_SIZE;
 * note that the final chunk may be smaller than CHUNK_SIZE.
 */
function ComputeNumChunks(length) {
  var chunks = length >>> CHUNK_SHIFT;
  if (chunks << CHUNK_SHIFT === length)
    return chunks;
  return chunks + 1;
}

/**
 * Computes the bounds for slice |sliceIndex| of |numItems| items,
 * assuming |numSlices| total slices. If numItems is not evenly
 * divisible by numSlices, then the final thread may have a bit of
 * extra work.
 */
function ComputeSliceBounds(numItems, sliceIndex, numSlices) {
  var sliceWidth = (numItems / numSlices) | 0;
  var startIndex = sliceWidth * sliceIndex;
  var endIndex = sliceIndex === numSlices - 1 ? numItems : sliceWidth * (sliceIndex + 1);
  return [startIndex, endIndex];
}

/**
 * Divides |numItems| items amongst |numSlices| slices. The result
 * is an array containing multiple values per slice: the start
 * index, end index, current position, and some padding. The
 * current position is initially the same as the start index. To
 * access the values for a particular slice, use the macros
 * SLICE_START() and so forth.
 */
function ComputeAllSliceBounds(numItems, numSlices) {
  // FIXME(bug 844890): Use typed arrays here.
  var info = [];
  for (var i = 0; i < numSlices; i++) {
    var [start, end] = ComputeSliceBounds(numItems, i, numSlices);
    ARRAY_PUSH(info, SLICE_INFO(start, end));
  }
  return info;
}
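A small worked example of the chunking and slicing helpers above (the numbers are chosen for illustration, not taken from the patch): with CHUNK_SHIFT = 5 and CHUNK_SIZE = 32, a 100-element input divided over 2 slices behaves as follows.

// ComputeNumChunks(100): 100 >>> 5 === 3, and 3 << 5 === 96 !== 100, so we
// round up to 4 chunks (three full chunks of 32 plus a final chunk of 4).
//
// ComputeAllSliceBounds(4, 2) then splits those 4 chunks across 2 slices:
//   ComputeSliceBounds(4, 0, 2) => [0, 2]   (sliceWidth = (4 / 2) | 0 = 2)
//   ComputeSliceBounds(4, 1, 2) => [2, 4]   (the last slice absorbs any remainder)
//
// The flat info array holds SLICE_INFO(start, end) per slice, i.e.
// [start, end, pos, padding] with pos initially equal to start:
//   info = [0, 2, 0, 0,   2, 4, 2, 0]
//
// The accessor macros index into that layout; for slice 1:
//   SLICE_START(1) === 4, SLICE_END(1) === 5, SLICE_POS(1) === 6,
// so info[SLICE_POS(1)] starts at 2 and is bumped as each chunk completes.
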
#undef PA_LENGTH
#undef PA_GET
#undef PA_NEW
#undef PA_MAP_NAME
#undef PA_REDUCE_NAME
#undef PA_SCAN_NAME
#undef PA_SCATTER_NAME
#undef PA_FILTER_NAME

/**
 * Compute the partial products in reverse order.

@@ -91,7 +48,6 @@ function ComputeProducts(shape) {
 * array containing the N-dimensional index that maps to |index1d|.
 */
function ComputeIndices(shape, index1d) {

  var products = ComputeProducts(shape);
  var l = shape.length;


@@ -102,7 +58,7 @@ function ComputeIndices(shape, index1d) {
    var stride = products[l - i - 1];

    // Compute how many steps of width stride we could take.
    var index = (index1d / stride) | 0;
    var index = TO_INT32(index1d / stride);
    ARRAY_PUSH(result, index);

    // Adjust remaining indices for smaller dimensions.

@@ -143,7 +99,7 @@ function ParallelArrayConstructEmpty() {
 */
function ParallelArrayConstructFromArray(buffer) {
  var buffer = ToObject(buffer);
  var length = buffer.length >>> 0;
  var length = TO_UINT32(buffer.length);
  if (length !== buffer.length)
    ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");


@@ -165,6 +121,11 @@ function ParallelArrayConstructFromArray(buffer) {
 * the ion code that does inlining of PA constructors.
 */
function ParallelArrayConstructFromFunction(shape, func) {
  // Note that due to how DecompileArg works, it must be called in the nearest
  // builtin frame, and so cannot be factored out into
  // ParallelArrayConstructFromComprehension.
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(1, func));
  return ParallelArrayConstructFromComprehension(this, shape, func, undefined);
}


@@ -173,6 +134,9 @@ function ParallelArrayConstructFromFunction(shape, func) {
 * case where 3 arguments are supplied.
 */
function ParallelArrayConstructFromFunctionMode(shape, func, mode) {
  // See DecompileArg note above.
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(1, func));
  return ParallelArrayConstructFromComprehension(this, shape, func, mode);
}


@@ -188,10 +152,8 @@ function ParallelArrayConstructFromFunctionMode(shape, func, mode) {
 * during our unit-testing.
 */
function ParallelArrayConstructFromComprehension(self, shape, func, mode) {
  // FIXME(bug 844887): Check |IsCallable(func)|

  if (typeof shape === "number") {
    var length = shape >>> 0;
    var length = TO_UINT32(shape);
    if (length !== shape)
      ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
    ParallelArrayBuild(self, [length], func, mode);

@@ -201,7 +163,7 @@ function ParallelArrayConstructFromComprehension(self, shape, func, mode) {
    var shape1 = [];
    for (var i = 0, l = shape.length; i < l; i++) {
      var s0 = shape[i];
      var s1 = s0 >>> 0;
      var s1 = TO_UINT32(s0);
      if (s1 !== s0)
        ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
      ARRAY_PUSH(shape1, s1);

@@ -327,7 +289,7 @@ function ParallelArrayBuild(self, shape, func, mode) {
  }

  function fill2(indexStart, indexEnd) {
    var x = (indexStart / yDimension) | 0;
    var x = TO_INT32(indexStart / yDimension);
    var y = indexStart - x * yDimension;
    for (var i = indexStart; i < indexEnd; i++) {
      UnsafeSetElement(buffer, i, func(x, y));

@@ -339,9 +301,9 @@ function ParallelArrayBuild(self, shape, func, mode) {
  }

  function fill3(indexStart, indexEnd) {
    var x = (indexStart / (yDimension * zDimension)) | 0;
    var x = TO_INT32(indexStart / (yDimension * zDimension));
    var r = indexStart - x * yDimension * zDimension;
    var y = (r / zDimension) | 0;
    var y = TO_INT32(r / zDimension);
    var z = r - y * zDimension;
    for (var i = indexStart; i < indexEnd; i++) {
      UnsafeSetElement(buffer, i, func(x, y, z));

@ -365,734 +327,6 @@ function ParallelArrayBuild(self, shape, func, mode) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new parallel array by applying |func(e, i, self)| for each
|
||||
* element |e| with index |i|. Note that
|
||||
* this always operates on the outermost dimension only.
|
||||
*/
|
||||
function ParallelArrayMap(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
ForkJoin(mapSlice, CheckParallel(mode));
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
for (var i = 0; i < length; i++) {
|
||||
// Note: Unlike JS arrays, parallel arrays cannot have holes.
|
||||
var v = func(self.get(i), i, self);
|
||||
UnsafeSetElement(buffer, i, v);
|
||||
}
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
|
||||
function mapSlice(sliceId, numSlices, warmup) {
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 1)
|
||||
chunkEnd = chunkPos + 1;
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
|
||||
for (var i = indexStart; i < indexEnd; i++)
|
||||
UnsafeSetElement(buffer, i, func(self.get(i), i, self));
|
||||
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces the elements in a parallel array's outermost dimension
|
||||
* using the given reduction function.
|
||||
*/
|
||||
function ParallelArrayReduce(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
var subreductions = NewDenseArray(numSlices);
|
||||
ForkJoin(reduceSlice, CheckParallel(mode));
|
||||
var accumulator = subreductions[0];
|
||||
for (var i = 1; i < numSlices; i++)
|
||||
accumulator = func(accumulator, subreductions[i]);
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
var accumulator = self.get(0);
|
||||
for (var i = 1; i < length; i++)
|
||||
accumulator = func(accumulator, self.get(i));
|
||||
return accumulator;
|
||||
|
||||
function reduceSlice(sliceId, numSlices, warmup) {
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
// (*) This function is carefully designed so that the warmup
|
||||
// (which executes with chunkStart === chunkPos) will execute all
|
||||
// potential loads and stores. In particular, the warmup run
|
||||
// processes two chunks rather than one. Moreover, it stores
|
||||
// accumulator into subreductions and then loads it again to
|
||||
// ensure that the load is executed during the warmup, as it will
|
||||
// certainly be executed during subsequent runs.
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 2)
|
||||
chunkEnd = chunkPos + 2;
|
||||
|
||||
if (chunkStart === chunkPos) {
|
||||
var indexPos = chunkStart << CHUNK_SHIFT;
|
||||
var accumulator = reduceChunk(self.get(indexPos), indexPos + 1, indexPos + CHUNK_SIZE);
|
||||
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator, // see (*) above
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
|
||||
var accumulator = subreductions[sliceId]; // see (*) above
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexPos = chunkPos << CHUNK_SHIFT;
|
||||
accumulator = reduceChunk(accumulator, indexPos, indexPos + CHUNK_SIZE);
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator,
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
function reduceChunk(accumulator, from, to) {
|
||||
to = std_Math_min(to, length);
|
||||
for (var i = from; i < to; i++)
|
||||
accumulator = func(accumulator, self.get(i));
|
||||
return accumulator;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* |scan()| returns an array [s_0, ..., s_N] where
|
||||
* |s_i| is equal to the reduction (as per |reduce()|)
|
||||
* of elements |0..i|. This is the generalization
|
||||
* of partial sum.
|
||||
*/
|
||||
function ParallelArrayScan(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
|
||||
// Scan slices individually (see comment on phase1()).
|
||||
ForkJoin(phase1, CheckParallel(mode));
|
||||
|
||||
// Compute intermediates array (see comment on phase2()).
|
||||
var intermediates = [];
|
||||
var accumulator = buffer[finalElement(0)];
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
for (var i = 1; i < numSlices - 1; i++) {
|
||||
accumulator = func(accumulator, buffer[finalElement(i)]);
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
}
|
||||
|
||||
// Reset the current position information for each slice, but
|
||||
// convert from chunks to indices (see comment on phase2()).
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
info[SLICE_POS(i)] = info[SLICE_START(i)] << CHUNK_SHIFT;
|
||||
info[SLICE_END(i)] = info[SLICE_END(i)] << CHUNK_SHIFT;
|
||||
}
|
||||
info[SLICE_END(numSlices - 1)] = std_Math_min(info[SLICE_END(numSlices - 1)], length);
|
||||
|
||||
// Complete each slice using intermediates array (see comment on phase2()).
|
||||
ForkJoin(phase2, CheckParallel(mode));
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
scan(self.get(0), 0, length);
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
|
||||
function scan(accumulator, start, end) {
|
||||
UnsafeSetElement(buffer, start, accumulator);
|
||||
for (var i = start + 1; i < end; i++) {
|
||||
accumulator = func(accumulator, self.get(i));
|
||||
UnsafeSetElement(buffer, i, accumulator);
|
||||
}
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
/**
|
||||
* In phase 1, we divide the source array into |numSlices| slices and
|
||||
* compute scan on each slice sequentially as if it were the entire
|
||||
* array. This function is responsible for computing one of those
|
||||
* slices.
|
||||
*
|
||||
* So, if we have an array [A,B,C,D,E,F,G,H,I], |numSlices == 3|,
|
||||
* and our function |func| is sum, then we would wind up computing a
|
||||
* result array like:
|
||||
*
|
||||
* [A, A+B, A+B+C, D, D+E, D+E+F, G, G+H, G+H+I]
|
||||
* ^~~~~~~~~~~~^ ^~~~~~~~~~~~^ ^~~~~~~~~~~~~^
|
||||
* Slice 0 Slice 1 Slice 2
|
||||
*
|
||||
* Read on in phase2 to see what we do next!
|
||||
*/
|
||||
function phase1(sliceId, numSlices, warmup) {
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 2)
|
||||
chunkEnd = chunkPos + 2;
|
||||
|
||||
if (chunkPos == chunkStart) {
|
||||
// For the first chunk, the accumulator begins as the value in
|
||||
// the input at the start of the chunk.
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
scan(self.get(indexStart), indexStart, indexEnd);
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
// For each subsequent chunk, the accumulator begins as the
|
||||
// combination of the final value of prev chunk and the value in
|
||||
// the input at the start of this chunk. Note that this loop is
|
||||
// written as simple as possible, at the cost of an extra read
|
||||
// from the buffer per iteration.
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
var accumulator = func(buffer[indexStart - 1], self.get(indexStart));
|
||||
scan(accumulator, indexStart, indexEnd);
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the index of the final element computed by the slice |sliceId|.
|
||||
*/
|
||||
function finalElement(sliceId) {
|
||||
var chunkEnd = info[SLICE_END(sliceId)]; // last chunk written by |sliceId| is endChunk - 1
|
||||
var indexStart = std_Math_min(chunkEnd << CHUNK_SHIFT, length);
|
||||
return indexStart - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* After computing the phase1 results, we compute an
|
||||
* |intermediates| array. |intermediates[i]| contains the result
|
||||
* of reducing the final value from each preceding slice j<i with
|
||||
* the final value of slice i. So, to continue our previous
|
||||
* example, the intermediates array would contain:
|
||||
*
|
||||
* [A+B+C, (A+B+C)+(D+E+F), ((A+B+C)+(D+E+F))+(G+H+I)]
|
||||
*
|
||||
* Here I have used parenthesization to make clear the order of
|
||||
* evaluation in each case.
|
||||
*
|
||||
* An aside: currently the intermediates array is computed
|
||||
* sequentially. In principle, we could compute it in parallel,
|
||||
* at the cost of doing duplicate work. This did not seem
|
||||
* particularly advantageous to me, particularly as the number
|
||||
* of slices is typically quite small (one per core), so I opted
|
||||
* to just compute it sequentially.
|
||||
*
|
||||
* Phase 2 combines the results of phase1 with the intermediates
|
||||
* array to produce the final scan results. The idea is to
|
||||
* reiterate over each element S[i] in the slice |sliceId|, which
|
||||
* currently contains the result of reducing with S[0]...S[i]
|
||||
* (where S0 is the first thing in the slice), and combine that
|
||||
* with |intermediate[sliceId-1]|, which represents the result of
|
||||
* reducing everything in the input array prior to the slice.
|
||||
*
|
||||
* To continue with our example, in phase 1 we computed slice 1 to
|
||||
* be [D, D+E, D+E+F]. We will combine those results with
|
||||
* |intermediates[1-1]|, which is |A+B+C|, so that the final
|
||||
* result is [(A+B+C)+D, (A+B+C)+(D+E), (A+B+C)+(D+E+F)]. Again I
|
||||
* am using parentheses to clarify how these results were reduced.
|
||||
*
|
||||
* SUBTLE: Because we are mutating |buffer| in place, we have to
|
||||
* be very careful about bailouts! We cannot checkpoint a chunk
|
||||
* at a time as we do elsewhere because that assumes it is safe to
|
||||
* replay the portion of a chunk which was already processed.
|
||||
* Therefore, in this phase, we track the current position at an
|
||||
* index granularity, although this requires two memory writes per
|
||||
* index.
|
||||
*/
|
||||
function phase2(sliceId, numSlices, warmup) {
|
||||
if (sliceId == 0)
|
||||
return; // No work to do for the 0th slice.
|
||||
|
||||
var indexPos = info[SLICE_POS(sliceId)];
|
||||
var indexEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup)
|
||||
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
|
||||
|
||||
var intermediate = intermediates[sliceId - 1];
|
||||
for (; indexPos < indexEnd; indexPos++) {
|
||||
UnsafeSetElement(buffer, indexPos, func(intermediate, buffer[indexPos]),
|
||||
info, SLICE_POS(sliceId), indexPos + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* |scatter()| redistributes the elements in the parallel array
|
||||
* into a new parallel array.
|
||||
*
|
||||
* - targets: The index targets[i] indicates where the ith element
|
||||
* should appear in the result.
|
||||
*
|
||||
* - defaultValue: what value to use for indices in the output array that
|
||||
* are never targeted.
|
||||
*
|
||||
* - conflictFunc: The conflict function. Used to resolve what
|
||||
* happens if two indices i and j in the source array are targeted
|
||||
* as the same destination (i.e., targets[i] == targets[j]), then
|
||||
* the final result is determined by applying func(targets[i],
|
||||
* targets[j]). If no conflict function is provided, it is an error
|
||||
* if targets[i] == targets[j].
|
||||
*
|
||||
* - length: length of the output array (if not specified, uses the
|
||||
* length of the input).
|
||||
*
|
||||
* - mode: internal debugging specification.
|
||||
*/
|
||||
function ParallelArrayScatter(targets, defaultValue, conflictFunc, length, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check targets is array-like
|
||||
// FIXME(bug 844887): Check |IsCallable(conflictFunc)|
|
||||
|
||||
var self = this;
|
||||
|
||||
if (length === undefined)
|
||||
length = self.shape[0];
|
||||
|
||||
// The Divide-Scatter-Vector strategy:
|
||||
// 1. Slice |targets| array of indices ("scatter-vector") into N
|
||||
// parts.
|
||||
// 2. Each of the N threads prepares an output buffer and a
|
||||
// write-log.
|
||||
// 3. Each thread scatters according to one of the N parts into its
|
||||
// own output buffer, tracking written indices in the write-log
|
||||
// and resolving any resulting local collisions in parallel.
|
||||
// 4. Merge the parts (either in parallel or sequentially), using
|
||||
// the write-logs as both the basis for finding merge-inputs and
|
||||
// for detecting collisions.
|
||||
|
||||
// The Divide-Output-Range strategy:
|
||||
// 1. Slice the range of indices [0..|length|-1] into N parts.
|
||||
// Allocate a single shared output buffer of length |length|.
|
||||
// 2. Each of the N threads scans (the entirety of) the |targets|
|
||||
// array, seeking occurrences of indices from that thread's part
|
||||
// of the range, and writing the results into the shared output
|
||||
// buffer.
|
||||
// 3. Since each thread has its own portion of the output range,
|
||||
// every collision that occurs can be handled thread-locally.
|
||||
|
||||
// SO:
|
||||
//
|
||||
// If |targets.length| >> |length|, Divide-Scatter-Vector seems like
|
||||
// a clear win over Divide-Output-Range, since for the latter, the
|
||||
// expense of redundantly scanning the |targets| will diminish the
|
||||
// gain from processing |length| in parallel, while for the former,
|
||||
// the total expense of building separate output buffers and the
|
||||
// merging post-process is small compared to the gain from
|
||||
// processing |targets| in parallel.
|
||||
//
|
||||
// If |targets.length| << |length|, then Divide-Output-Range seems
|
||||
// like it *could* win over Divide-Scatter-Vector. (But when is
|
||||
// |targets.length| << |length| or even |targets.length| < |length|?
|
||||
// Seems like an odd situation and an uncommon case at best.)
|
||||
//
|
||||
// The unanswered question is which strategy performs better when
|
||||
// |targets.length| approximately equals |length|, especially for
|
||||
// special cases like collision-free scatters and permutations.
|
||||
|
||||
if (targets.length >>> 0 !== targets.length)
|
||||
ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");
|
||||
|
||||
var targetsLength = std_Math_min(targets.length, self.length);
|
||||
|
||||
if (length >>> 0 !== length)
|
||||
ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
if (forceDivideScatterVector())
|
||||
return parDivideScatterVector();
|
||||
else if (forceDivideOutputRange())
|
||||
return parDivideOutputRange();
|
||||
else if (conflictFunc === undefined && targetsLength < length)
|
||||
return parDivideOutputRange();
|
||||
return parDivideScatterVector();
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
return seq();
|
||||
|
||||
function forceDivideScatterVector() {
|
||||
return mode && mode.strategy && mode.strategy == "divide-scatter-vector";
|
||||
}
|
||||
|
||||
function forceDivideOutputRange() {
|
||||
return mode && mode.strategy && mode.strategy == "divide-output-range";
|
||||
}
|
||||
|
||||
function collide(elem1, elem2) {
|
||||
if (conflictFunc === undefined)
|
||||
ThrowError(JSMSG_PAR_ARRAY_SCATTER_CONFLICT);
|
||||
|
||||
return conflictFunc(elem1, elem2);
|
||||
}
|
||||
|
||||
|
||||
function parDivideOutputRange() {
|
||||
var chunks = ComputeNumChunks(targetsLength);
|
||||
var numSlices = ForkJoinSlices();
|
||||
var checkpoints = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
UnsafeSetElement(checkpoints, i, 0);
|
||||
|
||||
var buffer = NewDenseArray(length);
|
||||
var conflicts = NewDenseArray(length);
|
||||
|
||||
for (var i = 0; i < length; i++) {
|
||||
UnsafeSetElement(buffer, i, defaultValue);
|
||||
UnsafeSetElement(conflicts, i, false);
|
||||
}
|
||||
|
||||
ForkJoin(fill, CheckParallel(mode));
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
|
||||
function fill(sliceId, numSlices, warmup) {
|
||||
var indexPos = checkpoints[sliceId];
|
||||
var indexEnd = targetsLength;
|
||||
if (warmup)
|
||||
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
|
||||
|
||||
// Range in the output for which we are responsible:
|
||||
var [outputStart, outputEnd] = ComputeSliceBounds(length, sliceId, numSlices);
|
||||
|
||||
for (; indexPos < indexEnd; indexPos++) {
|
||||
var x = self.get(indexPos);
|
||||
var t = checkTarget(indexPos, targets[indexPos]);
|
||||
if (t < outputStart || t >= outputEnd)
|
||||
continue;
|
||||
if (conflicts[t])
|
||||
x = collide(x, buffer[t]);
|
||||
UnsafeSetElement(buffer, t, x,
|
||||
conflicts, t, true,
|
||||
checkpoints, sliceId, indexPos + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function parDivideScatterVector() {
|
||||
// Subtle: because we will be mutating the localBuffers and
|
||||
// conflict arrays in place, we can never replay an entry in the
|
||||
// target array for fear of inducing a conflict where none existed
|
||||
// before. Therefore, we must proceed not by chunks but rather by
|
||||
// individual indices.
|
||||
var numSlices = ForkJoinSlices();
|
||||
var info = ComputeAllSliceBounds(targetsLength, numSlices);
|
||||
|
||||
// FIXME(bug 844890): Use typed arrays here.
|
||||
var localBuffers = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
UnsafeSetElement(localBuffers, i, NewDenseArray(length));
|
||||
var localConflicts = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
var conflicts_i = NewDenseArray(length);
|
||||
for (var j = 0; j < length; j++)
|
||||
UnsafeSetElement(conflicts_i, j, false);
|
||||
UnsafeSetElement(localConflicts, i, conflicts_i);
|
||||
}
|
||||
|
||||
// Initialize the 0th buffer, which will become the output. For
|
||||
// the other buffers, we track which parts have been written to
|
||||
// using the conflict buffer so they do not need to be
|
||||
// initialized.
|
||||
var outputBuffer = localBuffers[0];
|
||||
for (var i = 0; i < length; i++)
|
||||
UnsafeSetElement(outputBuffer, i, defaultValue);
|
||||
|
||||
ForkJoin(fill, CheckParallel(mode));
|
||||
mergeBuffers();
|
||||
return NewParallelArray(ParallelArrayView, [length], outputBuffer, 0);
|
||||
|
||||
function fill(sliceId, numSlices, warmup) {
|
||||
var indexPos = info[SLICE_POS(sliceId)];
|
||||
var indexEnd = info[SLICE_END(sliceId)];
|
||||
if (warmup)
|
||||
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
|
||||
|
||||
var localbuffer = localBuffers[sliceId];
|
||||
var conflicts = localConflicts[sliceId];
|
||||
while (indexPos < indexEnd) {
|
||||
var x = self.get(indexPos);
|
||||
var t = checkTarget(indexPos, targets[indexPos]);
|
||||
if (conflicts[t])
|
||||
x = collide(x, localbuffer[t]);
|
||||
UnsafeSetElement(localbuffer, t, x,
|
||||
conflicts, t, true,
|
||||
info, SLICE_POS(sliceId), ++indexPos);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge buffers 1..NUMSLICES into buffer 0. In principle, we could
|
||||
* parallelize the merge work as well. But for this first cut,
|
||||
* just do the merge sequentially.
|
||||
*/
|
||||
function mergeBuffers() {
|
||||
var buffer = localBuffers[0];
|
||||
var conflicts = localConflicts[0];
|
||||
for (var i = 1; i < numSlices; i++) {
|
||||
var otherbuffer = localBuffers[i];
|
||||
var otherconflicts = localConflicts[i];
|
||||
for (var j = 0; j < length; j++) {
|
||||
if (otherconflicts[j]) {
|
||||
if (conflicts[j]) {
|
||||
buffer[j] = collide(otherbuffer[j], buffer[j]);
|
||||
} else {
|
||||
buffer[j] = otherbuffer[j];
|
||||
conflicts[j] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function seq() {
|
||||
var buffer = NewDenseArray(length);
|
||||
var conflicts = NewDenseArray(length);
|
||||
|
||||
for (var i = 0; i < length; i++) {
|
||||
UnsafeSetElement(buffer, i, defaultValue);
|
||||
UnsafeSetElement(conflicts, i, false);
|
||||
}
|
||||
|
||||
for (var i = 0; i < targetsLength; i++) {
|
||||
var x = self.get(i);
|
||||
var t = checkTarget(i, targets[i]);
|
||||
if (conflicts[t])
|
||||
x = collide(x, buffer[t]);
|
||||
|
||||
UnsafeSetElement(buffer, t, x,
|
||||
conflicts, t, true);
|
||||
}
|
||||
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
}
|
||||
|
||||
function checkTarget(i, t) {
|
||||
if (TO_INT32(t) !== t)
|
||||
ThrowError(JSMSG_PAR_ARRAY_SCATTER_BAD_TARGET, i);
|
||||
|
||||
if (t < 0 || t >= length)
|
||||
ThrowError(JSMSG_PAR_ARRAY_SCATTER_BOUNDS);
|
||||
|
||||
// It's not enough to return t, as -0 | 0 === -0.
|
||||
return TO_INT32(t);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The familiar filter() operation applied across the outermost
|
||||
* dimension.
|
||||
*/
|
||||
function ParallelArrayFilter(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices * 2)
|
||||
break parallel;
|
||||
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
|
||||
// Step 1. Compute which items from each slice of the result
|
||||
// buffer should be preserved. When we're done, we have an array
|
||||
// |survivors| containing a bitset for each chunk, indicating
|
||||
// which members of the chunk survived. We also keep an array
|
||||
// |counts| containing the total number of items that are being
|
||||
// preserved from within one slice.
|
||||
//
|
||||
// FIXME(bug 844890): Use typed arrays here.
|
||||
var counts = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
UnsafeSetElement(counts, i, 0);
|
||||
var survivors = NewDenseArray(chunks);
|
||||
ForkJoin(findSurvivorsInSlice, CheckParallel(mode));
|
||||
|
||||
// Step 2. Compress the slices into one contiguous set.
|
||||
var count = 0;
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
count += counts[i];
|
||||
var buffer = NewDenseArray(count);
|
||||
if (count > 0)
|
||||
ForkJoin(copySurvivorsInSlice, CheckParallel(mode));
|
||||
|
||||
return NewParallelArray(ParallelArrayView, [count], buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
var buffer = [];
|
||||
for (var i = 0; i < length; i++) {
|
||||
var elem = self.get(i);
|
||||
if (func(elem, i, self))
|
||||
ARRAY_PUSH(buffer, elem);
|
||||
}
|
||||
return NewParallelArray(ParallelArrayView, [buffer.length], buffer, 0);
|
||||
|
||||
/**
|
||||
* As described above, our goal is to determine which items we
|
||||
* will preserve from a given slice. We do this one chunk at a
|
||||
* time. When we finish a chunk, we record our current count and
|
||||
* the next chunk sliceId, lest we should bail.
|
||||
*/
|
||||
function findSurvivorsInSlice(sliceId, numSlices, warmup) {
|
||||
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos)
|
||||
chunkEnd = chunkPos + 1;
|
||||
|
||||
var count = counts[sliceId];
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
var chunkBits = 0;
|
||||
|
||||
for (var bit = 0; indexStart + bit < indexEnd; bit++) {
|
||||
var keep = !!func(self.get(indexStart + bit), indexStart + bit, self);
|
||||
chunkBits |= keep << bit;
|
||||
count += keep;
|
||||
}
|
||||
|
||||
UnsafeSetElement(survivors, chunkPos, chunkBits,
|
||||
counts, sliceId, count,
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
function copySurvivorsInSlice(sliceId, numSlices, warmup) {
|
||||
// Copies the survivors from this slice into the correct position.
|
||||
// Note that this is an idempotent operation that does not invoke
|
||||
// user code. Therefore, we don't expect bailouts and make an
|
||||
// effort to proceed chunk by chunk or avoid duplicating work.
|
||||
|
||||
// During warmup, we only execute with sliceId 0. This would fail to
|
||||
// execute the loop below. Therefore, during warmup, we
|
||||
// substitute 1 for the sliceId.
|
||||
if (warmup && sliceId == 0 && numSlices != 1)
|
||||
sliceId = 1;
|
||||
|
||||
// Total up the items preserved by previous slices.
|
||||
var count = 0;
|
||||
if (sliceId > 0) { // FIXME(#819219)---work around a bug in Ion's range checks
|
||||
for (var i = 0; i < sliceId; i++)
|
||||
count += counts[i];
|
||||
}
|
||||
|
||||
// Compute the final index we expect to write.
|
||||
var total = count + counts[sliceId];
|
||||
if (count == total)
|
||||
return;
|
||||
|
||||
// Iterate over the chunks assigned to us. Read the bitset for
|
||||
// each chunk. Copy values where a 1 appears until we have
|
||||
// written all the values that we expect to. We can just iterate
|
||||
// from 0...CHUNK_SIZE without fear of a truncated final chunk
|
||||
// because we are already checking for when count==total.
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
for (var chunk = chunkStart; chunk < chunkEnd; chunk++) {
|
||||
var chunkBits = survivors[chunk];
|
||||
if (!chunkBits)
|
||||
continue;
|
||||
|
||||
var indexStart = chunk << CHUNK_SHIFT;
|
||||
for (var i = 0; i < CHUNK_SIZE; i++) {
|
||||
if (chunkBits & (1 << i)) {
|
||||
UnsafeSetElement(buffer, count++, self.get(indexStart + i));
|
||||
if (count == total)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Divides the outermost dimension into two dimensions. Does not copy
|
||||
* or affect the underlying data, just how it is divided amongst
|
||||
|
@ -1101,11 +335,11 @@ function ParallelArrayFilter(func, mode) {
|
|||
* N must be evenly divisible by 4 in that case.
|
||||
*/
|
||||
function ParallelArrayPartition(amount) {
|
||||
if (amount >>> 0 !== amount)
|
||||
if (!IS_UINT32(amount))
|
||||
ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
|
||||
|
||||
var length = this.shape[0];
|
||||
var partitions = (length / amount) | 0;
|
||||
var partitions = TO_INT32(length / amount);
|
||||
|
||||
if (partitions * amount !== length)
|
||||
ThrowError(JSMSG_PAR_ARRAY_BAD_PARTITION);
|
||||
|
@ -1242,46 +476,9 @@ function ParallelArrayToString() {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal debugging tool: checks that the given `mode` permits
|
||||
* sequential execution
|
||||
*/
|
||||
function AssertSequentialIsOK(mode) {
|
||||
if (mode && mode.mode && mode.mode !== "seq" && ParallelTestsShouldPass())
|
||||
ThrowError(JSMSG_WRONG_VALUE, "parallel execution", "sequential was forced");
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal debugging tool: returns a function to be supplied to
|
||||
* ForkJoin() that will check that the parallel results
|
||||
* bailout/succeed as expected. Returns null if no mode is supplied
|
||||
* or we are building with some strange IF_DEF configuration such that
|
||||
* we don't expect parallel execution to work.
|
||||
*/
|
||||
function CheckParallel(mode) {
|
||||
if (!mode || !ParallelTestsShouldPass())
|
||||
return null;
|
||||
|
||||
return function(result, bailouts, causes) {
|
||||
if (!("expect" in mode) || mode.expect === "any") {
|
||||
return; // Ignore result when unspecified or unimportant.
|
||||
} else if (mode.expect === "mixed" && result !== "disqualified") {
|
||||
return; // "mixed" means that it may bailout, may succeed
|
||||
} else if (result === mode.expect) {
|
||||
return;
|
||||
}
|
||||
|
||||
ThrowError(JSMSG_WRONG_VALUE, mode.expect,
|
||||
result+":"+bailouts+":"+causes);
|
||||
};
|
||||
}
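For context, a hedged sketch of how a test might drive these hooks through the optional |mode| argument (|pa| stands for a ParallelArray instance); the property names |mode| and |expect| come from the code above, but the exact set of result strings other than "disqualified" is not shown in this patch and is assumed here:

// pa.map(kernel, { mode: "par", expect: "any" });
//   - TRY_PARALLEL(mode) holds, so the parallel path is attempted;
//   - CheckParallel(mode) returns a callback that ignores the outcome,
//     since mode.expect is "any".
//
// pa.map(kernel, { mode: "seq" });
//   - TRY_PARALLEL(mode) is false, so execution falls through to the
//     sequential path, and ASSERT_SEQUENTIAL_IS_OK(mode) calls
//     AssertSequentialIsOK, which throws only if mode.mode is some
//     non-"seq" value while ParallelTestsShouldPass().
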
|
||||
|
||||
/*
|
||||
* Mark the main operations as clone-at-callsite for better precision.
|
||||
* This is slightly overkill, as all that we really need is to
|
||||
* specialize to the receiver and the elemental function, but in
|
||||
* practice this is likely not so different, since element functions
|
||||
* are often used in exactly one place.
|
||||
* Mark the comprehension form and friends as clone-at-callsite. See note in
|
||||
* ParallelArrayCommonOps.js
|
||||
*/
|
||||
SetScriptHints(ParallelArrayConstructEmpty, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayConstructFromArray, { cloneAtCallsite: true });
|
||||
|
@ -1290,11 +487,6 @@ SetScriptHints(ParallelArrayConstructFromFunctionMode, { cloneAtCallsite: true }
|
|||
SetScriptHints(ParallelArrayConstructFromComprehension, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayView, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayBuild, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayMap, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayReduce, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayScan, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayScatter, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayFilter, { cloneAtCallsite: true });
|
||||
|
||||
/*
|
||||
* Mark the common getters as clone-at-callsite and inline. This is
|
||||
|
|
|
@@ -0,0 +1,881 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * This file includes utility functions, macros, and "templates" for the
 * following 1D parallel operations which operate on the outermost dimension,
 * so that they may be shared between the Array and Matrix operations.
 *
 * - map
 * - reduce
 * - scan
 * - scatter
 * - filter
 *
 * Operations that are more closely tied to the dimensionality of the data,
 * like build, are in the constructors' respective files.
 *
 * This file is designed to be #include'd with the following macros defined:
 *
 * PA_LENGTH(array)
 * PA_GET(array, index)
 * PA_NEW(length, buffer, offset)
 *
 * The internal names for the functions should also be defined:
 *
 * PA_MAP_NAME
 * PA_REDUCE_NAME
 * PA_SCAN_NAME
 * PA_SCATTER_NAME
 * PA_FILTER_NAME
 *
 * See Array.js for a usage example.
 */

/* The mode asserts options object. */
#define TRY_PARALLEL(MODE) \
  ((!MODE || MODE.mode === "par"))
#define ASSERT_SEQUENTIAL_IS_OK(MODE) \
  do { if (MODE) AssertSequentialIsOK(MODE) } while(false)

/* Slice array: see ComputeAllSliceBounds() */
#define SLICE_INFO(START, END) START, END, START, 0
#define SLICE_START(ID) ((ID << 2) + 0)
#define SLICE_END(ID) ((ID << 2) + 1)
#define SLICE_POS(ID) ((ID << 2) + 2)

/*
 * How many items at a time do we do recomp. for parallel execution.
 * Note that filter currently assumes that this is no greater than 32
 * in order to make use of a bitset.
 */
#define CHUNK_SHIFT 5
#define CHUNK_SIZE 32

/* Safe versions of ARRAY.push(ELEMENT) */
#define ARRAY_PUSH(ARRAY, ELEMENT) \
  callFunction(std_Array_push, ARRAY, ELEMENT);
#define ARRAY_SLICE(ARRAY, ELEMENT) \
  callFunction(std_Array_slice, ARRAY, ELEMENT);

/**
|
||||
* Determine the number of chunks of size CHUNK_SIZE;
|
||||
* note that the final chunk may be smaller than CHUNK_SIZE.
|
||||
*/
|
||||
function ComputeNumChunks(length) {
|
||||
var chunks = length >>> CHUNK_SHIFT;
|
||||
if (chunks << CHUNK_SHIFT === length)
|
||||
return chunks;
|
||||
return chunks + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the bounds for slice |sliceIndex| of |numItems| items,
|
||||
* assuming |numSlices| total slices. If numItems is not evenly
|
||||
* divisible by numSlices, then the final thread may have a bit of
|
||||
* extra work.
|
||||
*/
|
||||
function ComputeSliceBounds(numItems, sliceIndex, numSlices) {
|
||||
var sliceWidth = TO_INT32(numItems / numSlices);
|
||||
var startIndex = sliceWidth * sliceIndex;
|
||||
var endIndex = sliceIndex === numSlices - 1 ? numItems : sliceWidth * (sliceIndex + 1);
|
||||
return [startIndex, endIndex];
|
||||
}
|
||||
|
||||
/**
|
||||
* Divides |numItems| items amongst |numSlices| slices. The result
|
||||
* is an array containing multiple values per slice: the start
|
||||
* index, end index, current position, and some padding. The
|
||||
* current position is initially the same as the start index. To
|
||||
* access the values for a particular slice, use the macros
|
||||
* SLICE_START() and so forth.
|
||||
*/
|
||||
function ComputeAllSliceBounds(numItems, numSlices) {
|
||||
// FIXME(bug 844890): Use typed arrays here.
|
||||
var info = [];
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
var [start, end] = ComputeSliceBounds(numItems, i, numSlices);
|
||||
ARRAY_PUSH(info, SLICE_INFO(start, end));
|
||||
}
|
||||
return info;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new array by applying |func(e, i, self)| for each element |e|
|
||||
* with index |i|.
|
||||
*/
|
||||
function PA_MAP_NAME(func, mode) {
|
||||
if (!IsCallable(func))
|
||||
ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));
|
||||
|
||||
var self = ToObject(this);
|
||||
var length = PA_LENGTH(self);
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) {
|
||||
// Avoid parallel compilation if we are already nested in another
|
||||
// parallel section or the user told us not to parallelize. The
|
||||
// use of a for (;;) loop is working around some ion limitations:
|
||||
//
|
||||
// - Breaking out of named blocks does not currently work (bug 684384);
|
||||
// - Unreachable Code Elim. can't properly handle if (a && b) (bug 669796)
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
ForkJoin(mapSlice, CheckParallel(mode));
|
||||
return PA_NEW(length, buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
for (var i = 0; i < length; i++) {
|
||||
// Note: Unlike JS arrays, parallel arrays cannot have holes.
|
||||
var v = func(PA_GET(self, i), i, self);
|
||||
UnsafeSetElement(buffer, i, v);
|
||||
}
|
||||
return PA_NEW(length, buffer, 0);
|
||||
|
||||
function mapSlice(sliceId, numSlices, warmup) {
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 1)
|
||||
chunkEnd = chunkPos + 1;
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
|
||||
for (var i = indexStart; i < indexEnd; i++)
|
||||
UnsafeSetElement(buffer, i, func(PA_GET(self, i), i, self));
|
||||
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces the elements in an array in parallel. Order is not fixed and |func|
|
||||
* is assumed to be associative.
|
||||
*/
|
||||
function PA_REDUCE_NAME(func, mode) {
|
||||
if (!IsCallable(func))
|
||||
ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));
|
||||
|
||||
var self = ToObject(this);
|
||||
var length = PA_LENGTH(self);
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
parallel: for (;;) { // see map to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
var subreductions = NewDenseArray(numSlices);
|
||||
ForkJoin(reduceSlice, CheckParallel(mode));
|
||||
var accumulator = subreductions[0];
|
||||
for (var i = 1; i < numSlices; i++)
|
||||
accumulator = func(accumulator, subreductions[i]);
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
var accumulator = PA_GET(self, 0);
|
||||
for (var i = 1; i < length; i++)
|
||||
accumulator = func(accumulator, PA_GET(self, i));
|
||||
return accumulator;
|
||||
|
||||
function reduceSlice(sliceId, numSlices, warmup) {
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
// (*) This function is carefully designed so that the warmup
|
||||
// (which executes with chunkStart === chunkPos) will execute all
|
||||
// potential loads and stores. In particular, the warmup run
|
||||
// processes two chunks rather than one. Moreover, it stores
|
||||
// accumulator into subreductions and then loads it again to
|
||||
// ensure that the load is executed during the warmup, as it will
|
||||
// certainly be executed during subsequent runs.
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 2)
|
||||
chunkEnd = chunkPos + 2;
|
||||
|
||||
if (chunkStart === chunkPos) {
|
||||
var indexPos = chunkStart << CHUNK_SHIFT;
|
||||
var accumulator = reduceChunk(PA_GET(self, indexPos), indexPos + 1, indexPos + CHUNK_SIZE);
|
||||
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator, // see (*) above
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
|
||||
var accumulator = subreductions[sliceId]; // see (*) above
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexPos = chunkPos << CHUNK_SHIFT;
|
||||
accumulator = reduceChunk(accumulator, indexPos, indexPos + CHUNK_SIZE);
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator,
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
function reduceChunk(accumulator, from, to) {
|
||||
to = std_Math_min(to, length);
|
||||
for (var i = from; i < to; i++)
|
||||
accumulator = func(accumulator, PA_GET(self, i));
|
||||
return accumulator;
|
||||
}
|
||||
}
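A short trace of the parallel reduce path above, with illustrative numbers that are not taken from the patch:

// Suppose PA_LENGTH(self) = 96 (exactly 3 chunks of CHUNK_SIZE = 32) and
// ForkJoinSlices() = 3, so ComputeAllSliceBounds(3, 3) gives each slice one
// chunk.  With func = function (a, b) { return a + b; } over elements e0..e95:
//
//   reduceSlice(0) stores subreductions[0] = e0  + ... + e31
//   reduceSlice(1) stores subreductions[1] = e32 + ... + e63
//   reduceSlice(2) stores subreductions[2] = e64 + ... + e95
//
// and the driver then folds the partial results left to right:
//
//   accumulator = func(func(subreductions[0], subreductions[1]),
//                      subreductions[2]);
//
// This is why |func| is assumed to be associative: the grouping differs from
// the sequential fallback's strictly left-to-right reduction, even though the
// element order within each slice is preserved.
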
|
||||
|
||||
/**
|
||||
* Returns an array [s_0, ..., s_N] where |s_i| is equal to the reduction (as
|
||||
* per |reduce()|) of elements |0..i|. This is the generalization of partial
|
||||
* sum.
|
||||
*/
|
||||
function PA_SCAN_NAME(func, mode) {
|
||||
if (!IsCallable(func))
|
||||
ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));
|
||||
|
||||
var self = ToObject(this);
|
||||
var length = PA_LENGTH(self);
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) { // see map to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
|
||||
// Scan slices individually (see comment on phase1()).
|
||||
ForkJoin(phase1, CheckParallel(mode));
|
||||
|
||||
// Compute intermediates array (see comment on phase2()).
|
||||
var intermediates = [];
|
||||
var accumulator = buffer[finalElement(0)];
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
for (var i = 1; i < numSlices - 1; i++) {
|
||||
accumulator = func(accumulator, buffer[finalElement(i)]);
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
}
|
||||
|
||||
// Reset the current position information for each slice, but
|
||||
// convert from chunks to indices (see comment on phase2()).
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
info[SLICE_POS(i)] = info[SLICE_START(i)] << CHUNK_SHIFT;
|
||||
info[SLICE_END(i)] = info[SLICE_END(i)] << CHUNK_SHIFT;
|
||||
}
|
||||
info[SLICE_END(numSlices - 1)] = std_Math_min(info[SLICE_END(numSlices - 1)], length);
|
||||
|
||||
// Complete each slice using intermediates array (see comment on phase2()).
|
||||
ForkJoin(phase2, CheckParallel(mode));
|
||||
return PA_NEW(length, buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
scan(PA_GET(self, 0), 0, length);
|
||||
return PA_NEW(length, buffer, 0);
|
||||
|
||||
function scan(accumulator, start, end) {
|
||||
UnsafeSetElement(buffer, start, accumulator);
|
||||
for (var i = start + 1; i < end; i++) {
|
||||
accumulator = func(accumulator, PA_GET(self, i));
|
||||
UnsafeSetElement(buffer, i, accumulator);
|
||||
}
|
||||
return accumulator;
|
||||
}

  /**
   * In phase 1, we divide the source array into |numSlices| slices and
   * compute scan on each slice sequentially as if it were the entire
   * array. This function is responsible for computing one of those
   * slices.
   *
   * So, if we have an array [A,B,C,D,E,F,G,H,I], |numSlices == 3|,
   * and our function |func| is sum, then we would wind up computing a
   * result array like:
   *
   *     [A, A+B, A+B+C, D, D+E, D+E+F, G, G+H, G+H+I]
   *      ^~~~~~~~~~~~^  ^~~~~~~~~~~~^  ^~~~~~~~~~~~^
   *         Slice 0        Slice 1        Slice 2
   *
   * Read on in phase2 to see what we do next!
   */
  function phase1(sliceId, numSlices, warmup) {
    var chunkStart = info[SLICE_START(sliceId)];
    var chunkPos = info[SLICE_POS(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];

    if (warmup && chunkEnd > chunkPos + 2)
      chunkEnd = chunkPos + 2;

    if (chunkPos == chunkStart) {
      // For the first chunk, the accumulator begins as the value in
      // the input at the start of the chunk.
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      scan(PA_GET(self, indexStart), indexStart, indexEnd);
      UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
    }

    while (chunkPos < chunkEnd) {
      // For each subsequent chunk, the accumulator begins as the
      // combination of the final value of the previous chunk and the
      // value in the input at the start of this chunk. Note that this
      // loop is written as simply as possible, at the cost of an extra
      // read from the buffer per iteration.
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      var accumulator = func(buffer[indexStart - 1], PA_GET(self, indexStart));
      scan(accumulator, indexStart, indexEnd);
      UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
    }
  }

  /**
   * Computes the index of the final element computed by the slice |sliceId|.
   */
  function finalElement(sliceId) {
    var chunkEnd = info[SLICE_END(sliceId)]; // last chunk written by |sliceId| is chunkEnd - 1
    var indexStart = std_Math_min(chunkEnd << CHUNK_SHIFT, length);
    return indexStart - 1;
  }
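
  /*
   * Worked example (illustrative only; the concrete CHUNK_SHIFT/CHUNK_SIZE
   * values are defined elsewhere and assumed here to be 5 and 32): if
   * info[SLICE_END(sliceId)] is 3, the slice covers chunks 0..2, i.e.
   * indices 0..95, so finalElement() returns min(3 << 5, length) - 1,
   * which is 95 when length >= 96 and length - 1 when the final chunk
   * is short.
   */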

  /**
   * After computing the phase1 results, we compute an
   * |intermediates| array. |intermediates[i]| contains the result
   * of reducing the final value from each preceding slice j<i with
   * the final value of slice i. So, to continue our previous
   * example, the intermediates array would contain:
   *
   *     [A+B+C, (A+B+C)+(D+E+F), ((A+B+C)+(D+E+F))+(G+H+I)]
   *
   * Here I have used parenthesization to make clear the order of
   * evaluation in each case.
   *
   * An aside: currently the intermediates array is computed
   * sequentially. In principle, we could compute it in parallel,
   * at the cost of doing duplicate work. This did not seem
   * particularly advantageous to me, especially as the number
   * of slices is typically quite small (one per core), so I opted
   * to just compute it sequentially.
   *
   * Phase 2 combines the results of phase1 with the intermediates
   * array to produce the final scan results. The idea is to
   * reiterate over each element S[i] in the slice |sliceId|, which
   * currently contains the result of reducing S[0]...S[i]
   * (where S[0] is the first element in the slice), and combine that
   * with |intermediates[sliceId-1]|, which represents the result of
   * reducing everything in the input array prior to the slice.
   *
   * To continue with our example, in phase 1 we computed slice 1 to
   * be [D, D+E, D+E+F]. We will combine those results with
   * |intermediates[1-1]|, which is |A+B+C|, so that the final
   * result is [(A+B+C)+D, (A+B+C)+(D+E), (A+B+C)+(D+E+F)]. Again I
   * am using parentheses to clarify how these results were reduced.
   *
   * SUBTLE: Because we are mutating |buffer| in place, we have to
   * be very careful about bailouts! We cannot checkpoint a chunk
   * at a time as we do elsewhere because that assumes it is safe to
   * replay the portion of a chunk which was already processed.
   * Therefore, in this phase, we track the current position at an
   * index granularity, although this requires two memory writes per
   * index.
   */
  function phase2(sliceId, numSlices, warmup) {
    if (sliceId == 0)
      return; // No work to do for the 0th slice.

    var indexPos = info[SLICE_POS(sliceId)];
    var indexEnd = info[SLICE_END(sliceId)];

    if (warmup)
      indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);

    var intermediate = intermediates[sliceId - 1];
    for (; indexPos < indexEnd; indexPos++) {
      UnsafeSetElement(buffer, indexPos, func(intermediate, buffer[indexPos]),
                       info, SLICE_POS(sliceId), indexPos + 1);
    }
  }
}
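
/*
 * Illustrative sketch (not part of this patch): with a sum function, the
 * scan above produces the running reductions shown in the phase1() and
 * phase2() comments. Assuming the operation is exposed as
 * |Array.prototype.pscan| (see the JSFunctionSpec hunk further below), a
 * call might look like:
 *
 *   var sums = [1, 2, 3, 4, 5].pscan(function (a, b) { return a + b; });
 *   // sums is [1, 3, 6, 10, 15]: element i holds the reduction of
 *   // elements 0..i, whether the parallel path or the sequential
 *   // scan() fallback executed.
 */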

/**
 * |scatter()| redistributes the elements in the array into a new array.
 *
 * - targets: The index targets[i] indicates where the ith element
 *   should appear in the result.
 *
 * - defaultValue: what value to use for indices in the output array that
 *   are never targeted.
 *
 * - conflictFunc: The conflict function. If two indices i and j in the
 *   source array are targeted at the same destination (i.e.,
 *   targets[i] == targets[j]), the final result for that destination is
 *   determined by applying the conflict function to the two conflicting
 *   elements. If no conflict function is provided, it is an error if
 *   targets[i] == targets[j].
 *
 * - length: length of the output array (if not specified, uses the
 *   length of the input).
 *
 * - mode: internal debugging specification.
 */
function PA_SCATTER_NAME(targets, defaultValue, conflictFunc, length, mode) {
  // FIXME(bug 844887): Check targets is array-like

  if (conflictFunc && !IsCallable(conflictFunc))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(2, conflictFunc));

  var self = ToObject(this);

  if (length === undefined)
    length = PA_LENGTH(self);

  // The Divide-Scatter-Vector strategy:
  // 1. Slice |targets| array of indices ("scatter-vector") into N
  //    parts.
  // 2. Each of the N threads prepares an output buffer and a
  //    write-log.
  // 3. Each thread scatters according to one of the N parts into its
  //    own output buffer, tracking written indices in the write-log
  //    and resolving any resulting local collisions in parallel.
  // 4. Merge the parts (either in parallel or sequentially), using
  //    the write-logs as both the basis for finding merge-inputs and
  //    for detecting collisions.

  // The Divide-Output-Range strategy:
  // 1. Slice the range of indices [0..|length|-1] into N parts.
  //    Allocate a single shared output buffer of length |length|.
  // 2. Each of the N threads scans (the entirety of) the |targets|
  //    array, seeking occurrences of indices from that thread's part
  //    of the range, and writing the results into the shared output
  //    buffer.
  // 3. Since each thread has its own portion of the output range,
  //    every collision that occurs can be handled thread-locally.

  // SO:
  //
  // If |targets.length| >> |length|, Divide-Scatter-Vector seems like
  // a clear win over Divide-Output-Range, since for the latter, the
  // expense of redundantly scanning the |targets| will diminish the
  // gain from processing |length| in parallel, while for the former,
  // the total expense of building separate output buffers and the
  // merging post-process is small compared to the gain from
  // processing |targets| in parallel.
  //
  // If |targets.length| << |length|, then Divide-Output-Range seems
  // like it *could* win over Divide-Scatter-Vector. (But when is
  // |targets.length| << |length| or even |targets.length| < |length|?
  // Seems like an odd situation and an uncommon case at best.)
  //
  // The unanswered question is which strategy performs better when
  // |targets.length| approximately equals |length|, especially for
  // special cases like collision-free scatters and permutations.

  if (!IS_UINT32(targets.length))
    ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");

  var targetsLength = std_Math_min(targets.length, self.length);

  if (!IS_UINT32(length))
    ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");

  parallel: for (;;) { // see map to explain why for(;;) etc
    if (ShouldForceSequential())
      break parallel;
    if (!TRY_PARALLEL(mode))
      break parallel;

    if (forceDivideScatterVector())
      return parDivideScatterVector();
    else if (forceDivideOutputRange())
      return parDivideOutputRange();
    else if (conflictFunc === undefined && targetsLength < length)
      return parDivideOutputRange();
    return parDivideScatterVector();
  }

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);
  return seq();

  function forceDivideScatterVector() {
    return mode && mode.strategy && mode.strategy == "divide-scatter-vector";
  }

  function forceDivideOutputRange() {
    return mode && mode.strategy && mode.strategy == "divide-output-range";
  }

  function collide(elem1, elem2) {
    if (conflictFunc === undefined)
      ThrowError(JSMSG_PAR_ARRAY_SCATTER_CONFLICT);

    return conflictFunc(elem1, elem2);
  }

  function parDivideOutputRange() {
    var chunks = ComputeNumChunks(targetsLength);
    var numSlices = ForkJoinSlices();
    var checkpoints = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++)
      UnsafeSetElement(checkpoints, i, 0);

    var buffer = NewDenseArray(length);
    var conflicts = NewDenseArray(length);

    for (var i = 0; i < length; i++) {
      UnsafeSetElement(buffer, i, defaultValue);
      UnsafeSetElement(conflicts, i, false);
    }

    ForkJoin(fill, CheckParallel(mode));
    return PA_NEW(length, buffer, 0);

    function fill(sliceId, numSlices, warmup) {
      var indexPos = checkpoints[sliceId];
      var indexEnd = targetsLength;
      if (warmup)
        indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);

      // Range in the output for which we are responsible:
      var [outputStart, outputEnd] = ComputeSliceBounds(length, sliceId, numSlices);

      for (; indexPos < indexEnd; indexPos++) {
        var x = PA_GET(self, indexPos);
        var t = checkTarget(indexPos, targets[indexPos]);
        if (t < outputStart || t >= outputEnd)
          continue;
        if (conflicts[t])
          x = collide(x, buffer[t]);
        UnsafeSetElement(buffer, t, x,
                         conflicts, t, true,
                         checkpoints, sliceId, indexPos + 1);
      }
    }
  }

  function parDivideScatterVector() {
    // Subtle: because we will be mutating the localBuffers and
    // conflict arrays in place, we can never replay an entry in the
    // target array for fear of inducing a conflict where none existed
    // before. Therefore, we must proceed not by chunks but rather by
    // individual indices.
    var numSlices = ForkJoinSlices();
    var info = ComputeAllSliceBounds(targetsLength, numSlices);

    // FIXME(bug 844890): Use typed arrays here.
    var localBuffers = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++)
      UnsafeSetElement(localBuffers, i, NewDenseArray(length));
    var localConflicts = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++) {
      var conflicts_i = NewDenseArray(length);
      for (var j = 0; j < length; j++)
        UnsafeSetElement(conflicts_i, j, false);
      UnsafeSetElement(localConflicts, i, conflicts_i);
    }

    // Initialize the 0th buffer, which will become the output. For
    // the other buffers, we track which parts have been written to
    // using the conflict buffer so they do not need to be
    // initialized.
    var outputBuffer = localBuffers[0];
    for (var i = 0; i < length; i++)
      UnsafeSetElement(outputBuffer, i, defaultValue);

    ForkJoin(fill, CheckParallel(mode));
    mergeBuffers();
    return PA_NEW(length, outputBuffer, 0);

    function fill(sliceId, numSlices, warmup) {
      var indexPos = info[SLICE_POS(sliceId)];
      var indexEnd = info[SLICE_END(sliceId)];
      if (warmup)
        indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);

      var localbuffer = localBuffers[sliceId];
      var conflicts = localConflicts[sliceId];
      while (indexPos < indexEnd) {
        var x = PA_GET(self, indexPos);
        var t = checkTarget(indexPos, targets[indexPos]);
        if (conflicts[t])
          x = collide(x, localbuffer[t]);
        UnsafeSetElement(localbuffer, t, x,
                         conflicts, t, true,
                         info, SLICE_POS(sliceId), ++indexPos);
      }
    }

    /**
     * Merge buffers 1..NUMSLICES into buffer 0. In principle, we could
     * parallelize the merge work as well. But for this first cut,
     * just do the merge sequentially.
     */
    function mergeBuffers() {
      var buffer = localBuffers[0];
      var conflicts = localConflicts[0];
      for (var i = 1; i < numSlices; i++) {
        var otherbuffer = localBuffers[i];
        var otherconflicts = localConflicts[i];
        for (var j = 0; j < length; j++) {
          if (otherconflicts[j]) {
            if (conflicts[j]) {
              buffer[j] = collide(otherbuffer[j], buffer[j]);
            } else {
              buffer[j] = otherbuffer[j];
              conflicts[j] = true;
            }
          }
        }
      }
    }
  }

  function seq() {
    var buffer = NewDenseArray(length);
    var conflicts = NewDenseArray(length);

    for (var i = 0; i < length; i++) {
      UnsafeSetElement(buffer, i, defaultValue);
      UnsafeSetElement(conflicts, i, false);
    }

    for (var i = 0; i < targetsLength; i++) {
      var x = PA_GET(self, i);
      var t = checkTarget(i, targets[i]);
      if (conflicts[t])
        x = collide(x, buffer[t]);

      UnsafeSetElement(buffer, t, x,
                       conflicts, t, true);
    }

    return PA_NEW(length, buffer, 0);
  }

  function checkTarget(i, t) {
    if (TO_INT32(t) !== t)
      ThrowError(JSMSG_PAR_ARRAY_SCATTER_BAD_TARGET, i);

    if (t < 0 || t >= length)
      ThrowError(JSMSG_PAR_ARRAY_SCATTER_BOUNDS);

    // It's not enough to return t: t may be -0, which passes the check
    // above (since -0 === 0), so normalize it with TO_INT32.
    return TO_INT32(t);
  }
}
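
/*
 * Illustrative sketch (not part of this patch): assuming the operation is
 * exposed as |Array.prototype.pscatter| (see the JSFunctionSpec hunk further
 * below), a permutation and a conflict-resolving scatter might look like:
 *
 *   // Reverse a 3-element array: element i goes to index 2 - i.
 *   ["a", "b", "c"].pscatter([2, 1, 0], undefined);  // ["c", "b", "a"]
 *
 *   // Two elements target index 0; the collision is resolved by summing.
 *   // Index 2 is never targeted, so it receives the defaultValue (0).
 *   [1, 2, 3].pscatter([0, 0, 1], 0,
 *                      function (x, y) { return x + y; },
 *                      3);                            // [3, 3, 0]
 */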

/**
 * The familiar filter operation applied in parallel.
 */
function PA_FILTER_NAME(func, mode) {
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));

  var self = ToObject(this);
  var length = PA_LENGTH(self);

  parallel: for (;;) { // see map to explain why for(;;) etc
    if (ShouldForceSequential())
      break parallel;
    if (!TRY_PARALLEL(mode))
      break parallel;

    var chunks = ComputeNumChunks(length);
    var numSlices = ForkJoinSlices();
    if (chunks < numSlices * 2)
      break parallel;

    var info = ComputeAllSliceBounds(chunks, numSlices);

    // Step 1. Compute which items from each slice of the result
    // buffer should be preserved. When we're done, we have an array
    // |survivors| containing a bitset for each chunk, indicating
    // which members of the chunk survived. We also keep an array
    // |counts| containing the total number of items that are being
    // preserved from within one slice.
    //
    // FIXME(bug 844890): Use typed arrays here.
    var counts = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++)
      UnsafeSetElement(counts, i, 0);
    var survivors = NewDenseArray(chunks);
    ForkJoin(findSurvivorsInSlice, CheckParallel(mode));

    // Step 2. Compress the slices into one contiguous set.
    var count = 0;
    for (var i = 0; i < numSlices; i++)
      count += counts[i];
    var buffer = NewDenseArray(count);
    if (count > 0)
      ForkJoin(copySurvivorsInSlice, CheckParallel(mode));

    return PA_NEW(count, buffer, 0);
  }

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);
  var buffer = [];
  for (var i = 0; i < length; i++) {
    var elem = PA_GET(self, i);
    if (func(elem, i, self))
      ARRAY_PUSH(buffer, elem);
  }
  return PA_NEW(buffer.length, buffer, 0);

  /**
   * As described above, our goal is to determine which items we
   * will preserve from a given slice. We do this one chunk at a
   * time. When we finish a chunk, we record our current count and
   * the next chunk position, in case we should bail.
   */
  function findSurvivorsInSlice(sliceId, numSlices, warmup) {
    var chunkPos = info[SLICE_POS(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];

    if (warmup && chunkEnd > chunkPos)
      chunkEnd = chunkPos + 1;

    var count = counts[sliceId];
    while (chunkPos < chunkEnd) {
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      var chunkBits = 0;

      for (var bit = 0; indexStart + bit < indexEnd; bit++) {
        var keep = !!func(PA_GET(self, indexStart + bit), indexStart + bit, self);
        chunkBits |= keep << bit;
        count += keep;
      }

      UnsafeSetElement(survivors, chunkPos, chunkBits,
                       counts, sliceId, count,
                       info, SLICE_POS(sliceId), ++chunkPos);
    }
  }

  function copySurvivorsInSlice(sliceId, numSlices, warmup) {
    // Copies the survivors from this slice into the correct position.
    // Note that this is an idempotent operation that does not invoke
    // user code. Therefore, we don't expect bailouts and make no
    // effort to proceed chunk by chunk or avoid duplicating work.

    // During warmup, we only execute with sliceId 0. This would fail to
    // execute the loop below. Therefore, during warmup, we
    // substitute 1 for the sliceId.
    if (warmup && sliceId == 0 && numSlices != 1)
      sliceId = 1;

    // Total up the items preserved by previous slices.
    var count = 0;
    if (sliceId > 0) { // FIXME(#819219)---work around a bug in Ion's range checks
      for (var i = 0; i < sliceId; i++)
        count += counts[i];
    }

    // Compute the final index we expect to write.
    var total = count + counts[sliceId];
    if (count == total)
      return;

    // Iterate over the chunks assigned to us. Read the bitset for
    // each chunk. Copy values where a 1 appears until we have
    // written all the values that we expect to. We can just iterate
    // from 0...CHUNK_SIZE without fear of a truncated final chunk
    // because we are already checking for when count==total.
    var chunkStart = info[SLICE_START(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];
    for (var chunk = chunkStart; chunk < chunkEnd; chunk++) {
      var chunkBits = survivors[chunk];
      if (!chunkBits)
        continue;

      var indexStart = chunk << CHUNK_SHIFT;
      for (var i = 0; i < CHUNK_SIZE; i++) {
        if (chunkBits & (1 << i)) {
          UnsafeSetElement(buffer, count++, PA_GET(self, indexStart + i));
          if (count == total)
            break;
        }
      }
    }
  }
}
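
/*
 * Illustrative sketch (not part of this patch): assuming the operation is
 * exposed as |Array.prototype.pfilter| (see the JSFunctionSpec hunk further
 * below), it behaves like the familiar Array.prototype.filter:
 *
 *   var evens = [1, 2, 3, 4, 5, 6].pfilter(function (x) { return x % 2 == 0; });
 *   // evens is [2, 4, 6]. Internally, step 1 records survivors as one
 *   // bitset per chunk (bit i of survivors[c] is set when element
 *   // (c << CHUNK_SHIFT) + i passes |func|), and step 2 copies the
 *   // surviving elements into a dense result of length |count|.
 */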

/**
 * Internal debugging tool: checks that the given |mode| permits
 * sequential execution.
 */
function AssertSequentialIsOK(mode) {
  if (mode && mode.mode && mode.mode !== "seq" && ParallelTestsShouldPass())
    ThrowError(JSMSG_WRONG_VALUE, "parallel execution", "sequential was forced");
}

/**
 * Internal debugging tool: returns a function to be supplied to
 * ForkJoin() that will check that the parallel results
 * bailout/succeed as expected. Returns null if no mode is supplied
 * or we are building with some strange ifdef configuration such that
 * we don't expect parallel execution to work.
 */
function CheckParallel(mode) {
  if (!mode || !ParallelTestsShouldPass())
    return null;

  return function(result, bailouts, causes) {
    if (!("expect" in mode) || mode.expect === "any") {
      return; // Ignore result when unspecified or unimportant.
    } else if (mode.expect === "mixed" && result !== "disqualified") {
      return; // "mixed" means that it may bail out or may succeed.
    } else if (result === mode.expect) {
      return;
    }

    ThrowError(JSMSG_WRONG_VALUE, mode.expect,
               result + ":" + bailouts + ":" + causes);
  };
}
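
/*
 * Illustrative sketch (not part of this patch): the |mode| argument threaded
 * through the operations above is a unit-testing hook. A hypothetical test
 * might pass an object such as:
 *
 *   var mode = { expect: "any" };   // accept either outcome
 *   [1, 2, 3, 4].pfilter(function (x) { return x > 2; }, mode);
 *
 * CheckParallel(mode) then returns a callback for ForkJoin() that throws
 * unless the reported result matches mode.expect ("any" and "mixed" relax
 * the check as described above), and AssertSequentialIsOK(mode) throws if
 * the sequential fallback runs when mode.mode was set to something other
 * than "seq" (and parallel tests are expected to pass).
 */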

/*
 * Mark the main operations as clone-at-callsite for better precision.
 * This is slightly overkill, as all that we really need is to
 * specialize to the receiver and the elemental function, but in
 * practice this is likely not so different, since element functions
 * are often used in exactly one place.
 */
SetScriptHints(PA_MAP_NAME,     { cloneAtCallsite: true });
SetScriptHints(PA_REDUCE_NAME,  { cloneAtCallsite: true });
SetScriptHints(PA_SCAN_NAME,    { cloneAtCallsite: true });
SetScriptHints(PA_SCATTER_NAME, { cloneAtCallsite: true });
SetScriptHints(PA_FILTER_NAME,  { cloneAtCallsite: true });

@ -26,6 +26,7 @@
/* Utility macros */
#define TO_INT32(x) (x | 0)
#define TO_UINT32(x) (x >>> 0)
#define IS_UINT32(x) (x >>> 0 === x)

/* cache built-in functions before applications can change them */
var std_isFinite = isFinite;
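
/*
 * Illustrative note (not part of this patch): IS_UINT32 relies on |>>> 0|
 * mapping a value onto the uint32 range, so the test passes only for
 * integers that survive that mapping unchanged. For example:
 *
 *   IS_UINT32(5);           // (5 >>> 0) === 5        -> true
 *   IS_UINT32(-1);          // (4294967295) === -1    -> false
 *   IS_UINT32(4294967296);  // (0) === 4294967296     -> false
 *   IS_UINT32(1.5);         // (1) === 1.5            -> false
 */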

@ -1,4 +1,4 @@
// |jit-test| error: RangeError
// |jit-test| error: TypeError
//
// Run with --ion-eager.
if (getBuildConfiguration().parallelJS) {

@ -14,5 +14,5 @@ if (getBuildConfiguration().parallelJS) {
  } catch(exc1) {}
  reportCompare();
} else {
  throw new RangeError();
  throw new TypeError();
}

@ -2759,6 +2759,15 @@ static const JSFunctionSpec array_methods[] = {
    {"some",        {NULL, NULL},       1,0, "ArraySome"},
    {"every",       {NULL, NULL},       1,0, "ArrayEvery"},

#ifdef ENABLE_PARALLEL_JS
    /* Parallelizable and pure methods. */
    {"pmap",        {NULL, NULL},       2,0, "ArrayParallelMap"},
    {"preduce",     {NULL, NULL},       2,0, "ArrayParallelReduce"},
    {"pscan",       {NULL, NULL},       2,0, "ArrayParallelScan"},
    {"pscatter",    {NULL, NULL},       5,0, "ArrayParallelScatter"},
    {"pfilter",     {NULL, NULL},       2,0, "ArrayParallelFilter"},
#endif

    JS_FN("iterator",   JS_ArrayIterator,   0,0),
    JS_FS_END
};

@ -2773,6 +2782,12 @@ static const JSFunctionSpec array_static_methods[] = {
    {"some",        {NULL, NULL},       2,0, "ArrayStaticSome"},
    {"reduce",      {NULL, NULL},       2,0, "ArrayStaticReduce"},
    {"reduceRight", {NULL, NULL},       2,0, "ArrayStaticReduceRight"},

#ifdef ENABLE_PARALLEL_JS
    /* Parallelizable and pure static methods. */
    {"pbuild",      {NULL, NULL},       3,0, "ArrayStaticParallelBuild"},
#endif

    JS_FS_END
};