Mirror of https://github.com/mozilla/gecko-dev.git
Bug 860965 - Move 1D ParallelArray operations to Array. (r=luke)
This commit is contained in:
Parent
bd907d5829
Commit
82db710cfc
@@ -983,6 +983,7 @@ selfhosting_srcs := \

selfhosted_out_h_deps := \
  $(selfhosting_srcs) \
  $(srcdir)/builtin/ParallelArrayCommonOps.js \
  $(srcdir)/js.msg \
  $(srcdir)/builtin/embedjs.py \
  $(NULL)

@@ -391,3 +391,96 @@ function ArrayStaticReduceRight(list, callbackfn) {
  else
    return callFunction(ArrayReduceRight, list, callbackfn);
}

#ifdef ENABLE_PARALLEL_JS

/* Include the common 1D operations. */
#define PA_LENGTH(a) (a.length)
#define PA_GET(a, i) (a[i])
#define PA_NEW(length, buffer, offset) buffer

#define PA_MAP_NAME ArrayParallelMap
#define PA_REDUCE_NAME ArrayParallelReduce
#define PA_SCAN_NAME ArrayParallelScan
#define PA_SCATTER_NAME ArrayParallelScatter
#define PA_FILTER_NAME ArrayParallelFilter

#include "ParallelArrayCommonOps.js"

#undef PA_LENGTH
#undef PA_GET
#undef PA_NEW
#undef PA_MAP_NAME
#undef PA_REDUCE_NAME
#undef PA_SCAN_NAME
#undef PA_SCATTER_NAME
#undef PA_FILTER_NAME

/**
 * "Comprehension form": This is the function invoked for |Array.pbuild(len,
 * fn)|. It creates a new array with length |len| where index |i| is equal to
 * |fn(i)|.
 *
 * The final |mode| argument is an internal argument used only
 * during our unit-testing.
 */
function ArrayStaticParallelBuild(length, func, mode) {
  if (!IS_UINT32(length))
    ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(1, func));

  var buffer = NewDenseArray(length);

  parallel: for (;;) {
    // Avoid parallel compilation if we are already nested in another
    // parallel section or the user told us not to parallelize. The
    // use of a for (;;) loop is working around some ion limitations:
    //
    // - Breaking out of named blocks does not currently work (bug 684384);
    // - Unreachable Code Elim. can't properly handle if (a && b) (bug 669796)
    if (ShouldForceSequential())
      break parallel;
    if (!TRY_PARALLEL(mode))
      break parallel;

    var chunks = ComputeNumChunks(length);
    var numSlices = ParallelSlices();
    var info = ComputeAllSliceBounds(chunks, numSlices);
    ParallelDo(constructSlice, CheckParallel(mode));
    return buffer;
  }

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);
  fill(0, length);
  return buffer;

  function constructSlice(sliceId, numSlices, warmup) {
    var chunkPos = info[SLICE_POS(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];

    if (warmup && chunkEnd > chunkPos)
      chunkEnd = chunkPos + 1;

    while (chunkPos < chunkEnd) {
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      fill(indexStart, indexEnd);
      UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
    }
  }

  function fill(indexStart, indexEnd) {
    for (var i = indexStart; i < indexEnd; i++)
      UnsafeSetElement(buffer, i, func(i));
  }
}

/*
 * Mark the comprehension form as clone-at-callsite. See note in
 * ParallelArrayCommonOps.js
 */
SetScriptHints(ArrayStaticParallelBuild, { cloneAtCallsite: true });

#endif /* ENABLE_PARALLEL_JS */
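For orientation, this is roughly what the shared template yields for Array once the preprocessor substitutes the PA_* macros defined above. This is an editorial sketch of the sequential-fallback path only (the parallel path is elided), not the literal preprocessed output:

function ArrayParallelMap(func, mode) {            // PA_MAP_NAME
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));

  var self = ToObject(this);
  var length = self.length;                        // PA_LENGTH(self) => (self.length)
  var buffer = NewDenseArray(length);

  // ... parallel path elided; see the template in ParallelArrayCommonOps.js below ...

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);                   // also a macro; defined in the shared file
  for (var i = 0; i < length; i++)
    UnsafeSetElement(buffer, i, func(self[i], i, self));  // PA_GET(self, i) => (self[i])
  return buffer;                                   // PA_NEW(length, buffer, 0) => buffer
}
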
@@ -4,71 +4,28 @@

// FIXME(bug 844882): Parallel array properties should not be exposed.

// The mode asserts options object.
#define TRY_PARALLEL(MODE) \
  ((!MODE || MODE.mode === "par"))
#define ASSERT_SEQUENTIAL_IS_OK(MODE) \
  do { if (MODE) AssertSequentialIsOK(MODE) } while(false)
/* Include the common 1D operations. */
#define PA_LENGTH(a) (a.shape[0])
#define PA_GET(a, i) (a.get(i))
#define PA_NEW(length, buffer, offset) \
  (NewParallelArray(ParallelArrayView, [length], buffer, offset))

// Slice array: see ComputeAllSliceBounds()
#define SLICE_INFO(START, END) START, END, START, 0
#define SLICE_START(ID) ((ID << 2) + 0)
#define SLICE_END(ID) ((ID << 2) + 1)
#define SLICE_POS(ID) ((ID << 2) + 2)
#define PA_MAP_NAME ParallelArrayMap
#define PA_REDUCE_NAME ParallelArrayReduce
#define PA_SCAN_NAME ParallelArrayScan
#define PA_SCATTER_NAME ParallelArrayScatter
#define PA_FILTER_NAME ParallelArrayFilter

// How many items at a time do we do recomp. for parallel execution.
// Note that filter currently assumes that this is no greater than 32
// in order to make use of a bitset.
#define CHUNK_SHIFT 5
#define CHUNK_SIZE 32
#include "ParallelArrayCommonOps.js"

// Safe versions of ARRAY.push(ELEMENT)
#define ARRAY_PUSH(ARRAY, ELEMENT) \
  callFunction(std_Array_push, ARRAY, ELEMENT);
#define ARRAY_SLICE(ARRAY, ELEMENT) \
  callFunction(std_Array_slice, ARRAY, ELEMENT);

/**
 * Determine the number of chunks of size CHUNK_SIZE;
 * note that the final chunk may be smaller than CHUNK_SIZE.
 */
function ComputeNumChunks(length) {
  var chunks = length >>> CHUNK_SHIFT;
  if (chunks << CHUNK_SHIFT === length)
    return chunks;
  return chunks + 1;
}

/**
 * Computes the bounds for slice |sliceIndex| of |numItems| items,
 * assuming |numSlices| total slices. If numItems is not evenly
 * divisible by numSlices, then the final thread may have a bit of
 * extra work.
 */
function ComputeSliceBounds(numItems, sliceIndex, numSlices) {
  var sliceWidth = (numItems / numSlices) | 0;
  var startIndex = sliceWidth * sliceIndex;
  var endIndex = sliceIndex === numSlices - 1 ? numItems : sliceWidth * (sliceIndex + 1);
  return [startIndex, endIndex];
}

/**
 * Divides |numItems| items amongst |numSlices| slices. The result
 * is an array containing multiple values per slice: the start
 * index, end index, current position, and some padding. The
 * current position is initially the same as the start index. To
 * access the values for a particular slice, use the macros
 * SLICE_START() and so forth.
 */
function ComputeAllSliceBounds(numItems, numSlices) {
  // FIXME(bug 844890): Use typed arrays here.
  var info = [];
  for (var i = 0; i < numSlices; i++) {
    var [start, end] = ComputeSliceBounds(numItems, i, numSlices);
    ARRAY_PUSH(info, SLICE_INFO(start, end));
  }
  return info;
}
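A small worked example of the chunking and slicing helpers above (the numbers are chosen for illustration, not taken from the patch): with CHUNK_SHIFT = 5 and CHUNK_SIZE = 32, a 100-element input divided over 2 slices behaves as follows.

// ComputeNumChunks(100): 100 >>> 5 === 3, and 3 << 5 === 96 !== 100, so we
// round up to 4 chunks (three full chunks of 32 plus a final chunk of 4).
//
// ComputeAllSliceBounds(4, 2) then splits those 4 chunks across 2 slices:
//   ComputeSliceBounds(4, 0, 2) => [0, 2]   (sliceWidth = (4 / 2) | 0 = 2)
//   ComputeSliceBounds(4, 1, 2) => [2, 4]   (the last slice absorbs any remainder)
//
// The flat info array holds SLICE_INFO(start, end) per slice, i.e.
// [start, end, pos, padding] with pos initially equal to start:
//   info = [0, 2, 0, 0,   2, 4, 2, 0]
//
// The accessor macros index into that layout; for slice 1:
//   SLICE_START(1) === 4, SLICE_END(1) === 5, SLICE_POS(1) === 6,
// so info[SLICE_POS(1)] starts at 2 and is bumped as each chunk completes.
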
#undef PA_LENGTH
#undef PA_GET
#undef PA_NEW
#undef PA_MAP_NAME
#undef PA_REDUCE_NAME
#undef PA_SCAN_NAME
#undef PA_SCATTER_NAME
#undef PA_FILTER_NAME

/**
 * Compute the partial products in reverse order.

@@ -91,7 +48,6 @@ function ComputeProducts(shape) {
 * array containing the N-dimensional index that maps to |index1d|.
 */
function ComputeIndices(shape, index1d) {

  var products = ComputeProducts(shape);
  var l = shape.length;


@@ -102,7 +58,7 @@ function ComputeIndices(shape, index1d) {
    var stride = products[l - i - 1];

    // Compute how many steps of width stride we could take.
    var index = (index1d / stride) | 0;
    var index = TO_INT32(index1d / stride);
    ARRAY_PUSH(result, index);

    // Adjust remaining indices for smaller dimensions.

@@ -143,7 +99,7 @@ function ParallelArrayConstructEmpty() {
 */
function ParallelArrayConstructFromArray(buffer) {
  var buffer = ToObject(buffer);
  var length = buffer.length >>> 0;
  var length = TO_UINT32(buffer.length);
  if (length !== buffer.length)
    ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");


@@ -165,6 +121,11 @@ function ParallelArrayConstructFromArray(buffer) {
 * the ion code that does inlining of PA constructors.
 */
function ParallelArrayConstructFromFunction(shape, func) {
  // Note that due to how DecompileArg works, it must be called in the nearest
  // builtin frame, and so cannot be factored out into
  // ParallelArrayConstructFromComprehension.
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(1, func));
  return ParallelArrayConstructFromComprehension(this, shape, func, undefined);
}


@@ -173,6 +134,9 @@ function ParallelArrayConstructFromFunction(shape, func) {
 * case where 3 arguments are supplied.
 */
function ParallelArrayConstructFromFunctionMode(shape, func, mode) {
  // See DecompileArg note above.
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(1, func));
  return ParallelArrayConstructFromComprehension(this, shape, func, mode);
}


@@ -188,10 +152,8 @@ function ParallelArrayConstructFromFunctionMode(shape, func, mode) {
 * during our unit-testing.
 */
function ParallelArrayConstructFromComprehension(self, shape, func, mode) {
  // FIXME(bug 844887): Check |IsCallable(func)|

  if (typeof shape === "number") {
    var length = shape >>> 0;
    var length = TO_UINT32(shape);
    if (length !== shape)
      ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
    ParallelArrayBuild(self, [length], func, mode);

@@ -201,7 +163,7 @@ function ParallelArrayConstructFromComprehension(self, shape, func, mode) {
    var shape1 = [];
    for (var i = 0, l = shape.length; i < l; i++) {
      var s0 = shape[i];
      var s1 = s0 >>> 0;
      var s1 = TO_UINT32(s0);
      if (s1 !== s0)
        ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
      ARRAY_PUSH(shape1, s1);

@@ -327,7 +289,7 @@ function ParallelArrayBuild(self, shape, func, mode) {
  }

  function fill2(indexStart, indexEnd) {
    var x = (indexStart / yDimension) | 0;
    var x = TO_INT32(indexStart / yDimension);
    var y = indexStart - x * yDimension;
    for (var i = indexStart; i < indexEnd; i++) {
      UnsafeSetElement(buffer, i, func(x, y));

@@ -339,9 +301,9 @@ function ParallelArrayBuild(self, shape, func, mode) {
  }

  function fill3(indexStart, indexEnd) {
    var x = (indexStart / (yDimension * zDimension)) | 0;
    var x = TO_INT32(indexStart / (yDimension * zDimension));
    var r = indexStart - x * yDimension * zDimension;
    var y = (r / zDimension) | 0;
    var y = TO_INT32(r / zDimension);
    var z = r - y * zDimension;
    for (var i = indexStart; i < indexEnd; i++) {
      UnsafeSetElement(buffer, i, func(x, y, z));

@ -365,734 +327,6 @@ function ParallelArrayBuild(self, shape, func, mode) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new parallel array by applying |func(e, i, self)| for each
|
||||
* element |e| with index |i|. Note that
|
||||
* this always operates on the outermost dimension only.
|
||||
*/
|
||||
function ParallelArrayMap(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
ForkJoin(mapSlice, CheckParallel(mode));
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
for (var i = 0; i < length; i++) {
|
||||
// Note: Unlike JS arrays, parallel arrays cannot have holes.
|
||||
var v = func(self.get(i), i, self);
|
||||
UnsafeSetElement(buffer, i, v);
|
||||
}
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
|
||||
function mapSlice(sliceId, numSlices, warmup) {
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 1)
|
||||
chunkEnd = chunkPos + 1;
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
|
||||
for (var i = indexStart; i < indexEnd; i++)
|
||||
UnsafeSetElement(buffer, i, func(self.get(i), i, self));
|
||||
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces the elements in a parallel array's outermost dimension
|
||||
* using the given reduction function.
|
||||
*/
|
||||
function ParallelArrayReduce(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
var subreductions = NewDenseArray(numSlices);
|
||||
ForkJoin(reduceSlice, CheckParallel(mode));
|
||||
var accumulator = subreductions[0];
|
||||
for (var i = 1; i < numSlices; i++)
|
||||
accumulator = func(accumulator, subreductions[i]);
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
var accumulator = self.get(0);
|
||||
for (var i = 1; i < length; i++)
|
||||
accumulator = func(accumulator, self.get(i));
|
||||
return accumulator;
|
||||
|
||||
function reduceSlice(sliceId, numSlices, warmup) {
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
// (*) This function is carefully designed so that the warmup
|
||||
// (which executes with chunkStart === chunkPos) will execute all
|
||||
// potential loads and stores. In particular, the warmup run
|
||||
// processes two chunks rather than one. Moreover, it stores
|
||||
// accumulator into subreductions and then loads it again to
|
||||
// ensure that the load is executed during the warmup, as it will
|
||||
// certainly be executed during subsequent runs.
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 2)
|
||||
chunkEnd = chunkPos + 2;
|
||||
|
||||
if (chunkStart === chunkPos) {
|
||||
var indexPos = chunkStart << CHUNK_SHIFT;
|
||||
var accumulator = reduceChunk(self.get(indexPos), indexPos + 1, indexPos + CHUNK_SIZE);
|
||||
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator, // see (*) above
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
|
||||
var accumulator = subreductions[sliceId]; // see (*) above
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexPos = chunkPos << CHUNK_SHIFT;
|
||||
accumulator = reduceChunk(accumulator, indexPos, indexPos + CHUNK_SIZE);
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator,
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
function reduceChunk(accumulator, from, to) {
|
||||
to = std_Math_min(to, length);
|
||||
for (var i = from; i < to; i++)
|
||||
accumulator = func(accumulator, self.get(i));
|
||||
return accumulator;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* |scan()| returns an array [s_0, ..., s_N] where
|
||||
* |s_i| is equal to the reduction (as per |reduce()|)
|
||||
* of elements |0..i|. This is the generalization
|
||||
* of partial sum.
|
||||
*/
|
||||
function ParallelArrayScan(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
|
||||
// Scan slices individually (see comment on phase1()).
|
||||
ForkJoin(phase1, CheckParallel(mode));
|
||||
|
||||
// Compute intermediates array (see comment on phase2()).
|
||||
var intermediates = [];
|
||||
var accumulator = buffer[finalElement(0)];
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
for (var i = 1; i < numSlices - 1; i++) {
|
||||
accumulator = func(accumulator, buffer[finalElement(i)]);
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
}
|
||||
|
||||
// Reset the current position information for each slice, but
|
||||
// convert from chunks to indices (see comment on phase2()).
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
info[SLICE_POS(i)] = info[SLICE_START(i)] << CHUNK_SHIFT;
|
||||
info[SLICE_END(i)] = info[SLICE_END(i)] << CHUNK_SHIFT;
|
||||
}
|
||||
info[SLICE_END(numSlices - 1)] = std_Math_min(info[SLICE_END(numSlices - 1)], length);
|
||||
|
||||
// Complete each slice using intermediates array (see comment on phase2()).
|
||||
ForkJoin(phase2, CheckParallel(mode));
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
scan(self.get(0), 0, length);
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
|
||||
function scan(accumulator, start, end) {
|
||||
UnsafeSetElement(buffer, start, accumulator);
|
||||
for (var i = start + 1; i < end; i++) {
|
||||
accumulator = func(accumulator, self.get(i));
|
||||
UnsafeSetElement(buffer, i, accumulator);
|
||||
}
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
/**
|
||||
* In phase 1, we divide the source array into |numSlices| slices and
|
||||
* compute scan on each slice sequentially as if it were the entire
|
||||
* array. This function is responsible for computing one of those
|
||||
* slices.
|
||||
*
|
||||
* So, if we have an array [A,B,C,D,E,F,G,H,I], |numSlices == 3|,
|
||||
* and our function |func| is sum, then we would wind up computing a
|
||||
* result array like:
|
||||
*
|
||||
* [A, A+B, A+B+C, D, D+E, D+E+F, G, G+H, G+H+I]
|
||||
* ^~~~~~~~~~~~^ ^~~~~~~~~~~~^ ^~~~~~~~~~~~~^
|
||||
* Slice 0 Slice 1 Slice 2
|
||||
*
|
||||
* Read on in phase2 to see what we do next!
|
||||
*/
|
||||
function phase1(sliceId, numSlices, warmup) {
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 2)
|
||||
chunkEnd = chunkPos + 2;
|
||||
|
||||
if (chunkPos == chunkStart) {
|
||||
// For the first chunk, the accumulator begins as the value in
|
||||
// the input at the start of the chunk.
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
scan(self.get(indexStart), indexStart, indexEnd);
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
// For each subsequent chunk, the accumulator begins as the
|
||||
// combination of the final value of prev chunk and the value in
|
||||
// the input at the start of this chunk. Note that this loop is
|
||||
// written as simple as possible, at the cost of an extra read
|
||||
// from the buffer per iteration.
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
var accumulator = func(buffer[indexStart - 1], self.get(indexStart));
|
||||
scan(accumulator, indexStart, indexEnd);
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the index of the final element computed by the slice |sliceId|.
|
||||
*/
|
||||
function finalElement(sliceId) {
|
||||
var chunkEnd = info[SLICE_END(sliceId)]; // last chunk written by |sliceId| is endChunk - 1
|
||||
var indexStart = std_Math_min(chunkEnd << CHUNK_SHIFT, length);
|
||||
return indexStart - 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* After computing the phase1 results, we compute an
|
||||
* |intermediates| array. |intermediates[i]| contains the result
|
||||
* of reducing the final value from each preceding slice j<i with
|
||||
* the final value of slice i. So, to continue our previous
|
||||
* example, the intermediates array would contain:
|
||||
*
|
||||
* [A+B+C, (A+B+C)+(D+E+F), ((A+B+C)+(D+E+F))+(G+H+I)]
|
||||
*
|
||||
* Here I have used parenthesization to make clear the order of
|
||||
* evaluation in each case.
|
||||
*
|
||||
* An aside: currently the intermediates array is computed
|
||||
* sequentially. In principle, we could compute it in parallel,
|
||||
* at the cost of doing duplicate work. This did not seem
|
||||
* particularly advantageous to me, particularly as the number
|
||||
* of slices is typically quite small (one per core), so I opted
|
||||
* to just compute it sequentially.
|
||||
*
|
||||
* Phase 2 combines the results of phase1 with the intermediates
|
||||
* array to produce the final scan results. The idea is to
|
||||
* reiterate over each element S[i] in the slice |sliceId|, which
|
||||
* currently contains the result of reducing with S[0]...S[i]
|
||||
* (where S0 is the first thing in the slice), and combine that
|
||||
* with |intermediate[sliceId-1]|, which represents the result of
|
||||
* reducing everything in the input array prior to the slice.
|
||||
*
|
||||
* To continue with our example, in phase 1 we computed slice 1 to
|
||||
* be [D, D+E, D+E+F]. We will combine those results with
|
||||
* |intermediates[1-1]|, which is |A+B+C|, so that the final
|
||||
* result is [(A+B+C)+D, (A+B+C)+(D+E), (A+B+C)+(D+E+F)]. Again I
|
||||
* am using parentheses to clarify how these results were reduced.
|
||||
*
|
||||
* SUBTLE: Because we are mutating |buffer| in place, we have to
|
||||
* be very careful about bailouts! We cannot checkpoint a chunk
|
||||
* at a time as we do elsewhere because that assumes it is safe to
|
||||
* replay the portion of a chunk which was already processed.
|
||||
* Therefore, in this phase, we track the current position at an
|
||||
* index granularity, although this requires two memory writes per
|
||||
* index.
|
||||
*/
|
||||
function phase2(sliceId, numSlices, warmup) {
|
||||
if (sliceId == 0)
|
||||
return; // No work to do for the 0th slice.
|
||||
|
||||
var indexPos = info[SLICE_POS(sliceId)];
|
||||
var indexEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup)
|
||||
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
|
||||
|
||||
var intermediate = intermediates[sliceId - 1];
|
||||
for (; indexPos < indexEnd; indexPos++) {
|
||||
UnsafeSetElement(buffer, indexPos, func(intermediate, buffer[indexPos]),
|
||||
info, SLICE_POS(sliceId), indexPos + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* |scatter()| redistributes the elements in the parallel array
|
||||
* into a new parallel array.
|
||||
*
|
||||
* - targets: The index targets[i] indicates where the ith element
|
||||
* should appear in the result.
|
||||
*
|
||||
* - defaultValue: what value to use for indices in the output array that
|
||||
* are never targeted.
|
||||
*
|
||||
* - conflictFunc: The conflict function. Used to resolve what
|
||||
* happens if two indices i and j in the source array are targeted
|
||||
* as the same destination (i.e., targets[i] == targets[j]), then
|
||||
* the final result is determined by applying func(targets[i],
|
||||
* targets[j]). If no conflict function is provided, it is an error
|
||||
* if targets[i] == targets[j].
|
||||
*
|
||||
* - length: length of the output array (if not specified, uses the
|
||||
* length of the input).
|
||||
*
|
||||
* - mode: internal debugging specification.
|
||||
*/
|
||||
function ParallelArrayScatter(targets, defaultValue, conflictFunc, length, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check targets is array-like
|
||||
// FIXME(bug 844887): Check |IsCallable(conflictFunc)|
|
||||
|
||||
var self = this;
|
||||
|
||||
if (length === undefined)
|
||||
length = self.shape[0];
|
||||
|
||||
// The Divide-Scatter-Vector strategy:
|
||||
// 1. Slice |targets| array of indices ("scatter-vector") into N
|
||||
// parts.
|
||||
// 2. Each of the N threads prepares an output buffer and a
|
||||
// write-log.
|
||||
// 3. Each thread scatters according to one of the N parts into its
|
||||
// own output buffer, tracking written indices in the write-log
|
||||
// and resolving any resulting local collisions in parallel.
|
||||
// 4. Merge the parts (either in parallel or sequentially), using
|
||||
// the write-logs as both the basis for finding merge-inputs and
|
||||
// for detecting collisions.
|
||||
|
||||
// The Divide-Output-Range strategy:
|
||||
// 1. Slice the range of indices [0..|length|-1] into N parts.
|
||||
// Allocate a single shared output buffer of length |length|.
|
||||
// 2. Each of the N threads scans (the entirety of) the |targets|
|
||||
// array, seeking occurrences of indices from that thread's part
|
||||
// of the range, and writing the results into the shared output
|
||||
// buffer.
|
||||
// 3. Since each thread has its own portion of the output range,
|
||||
// every collision that occurs can be handled thread-locally.
|
||||
|
||||
// SO:
|
||||
//
|
||||
// If |targets.length| >> |length|, Divide-Scatter-Vector seems like
|
||||
// a clear win over Divide-Output-Range, since for the latter, the
|
||||
// expense of redundantly scanning the |targets| will diminish the
|
||||
// gain from processing |length| in parallel, while for the former,
|
||||
// the total expense of building separate output buffers and the
|
||||
// merging post-process is small compared to the gain from
|
||||
// processing |targets| in parallel.
|
||||
//
|
||||
// If |targets.length| << |length|, then Divide-Output-Range seems
|
||||
// like it *could* win over Divide-Scatter-Vector. (But when is
|
||||
// |targets.length| << |length| or even |targets.length| < |length|?
|
||||
// Seems like an odd situation and an uncommon case at best.)
|
||||
//
|
||||
// The unanswered question is which strategy performs better when
|
||||
// |targets.length| approximately equals |length|, especially for
|
||||
// special cases like collision-free scatters and permutations.
|
||||
|
||||
if (targets.length >>> 0 !== targets.length)
|
||||
ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");
|
||||
|
||||
var targetsLength = std_Math_min(targets.length, self.length);
|
||||
|
||||
if (length >>> 0 !== length)
|
||||
ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
if (forceDivideScatterVector())
|
||||
return parDivideScatterVector();
|
||||
else if (forceDivideOutputRange())
|
||||
return parDivideOutputRange();
|
||||
else if (conflictFunc === undefined && targetsLength < length)
|
||||
return parDivideOutputRange();
|
||||
return parDivideScatterVector();
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
return seq();
|
||||
|
||||
function forceDivideScatterVector() {
|
||||
return mode && mode.strategy && mode.strategy == "divide-scatter-vector";
|
||||
}
|
||||
|
||||
function forceDivideOutputRange() {
|
||||
return mode && mode.strategy && mode.strategy == "divide-output-range";
|
||||
}
|
||||
|
||||
function collide(elem1, elem2) {
|
||||
if (conflictFunc === undefined)
|
||||
ThrowError(JSMSG_PAR_ARRAY_SCATTER_CONFLICT);
|
||||
|
||||
return conflictFunc(elem1, elem2);
|
||||
}
|
||||
|
||||
|
||||
function parDivideOutputRange() {
|
||||
var chunks = ComputeNumChunks(targetsLength);
|
||||
var numSlices = ForkJoinSlices();
|
||||
var checkpoints = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
UnsafeSetElement(checkpoints, i, 0);
|
||||
|
||||
var buffer = NewDenseArray(length);
|
||||
var conflicts = NewDenseArray(length);
|
||||
|
||||
for (var i = 0; i < length; i++) {
|
||||
UnsafeSetElement(buffer, i, defaultValue);
|
||||
UnsafeSetElement(conflicts, i, false);
|
||||
}
|
||||
|
||||
ForkJoin(fill, CheckParallel(mode));
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
|
||||
function fill(sliceId, numSlices, warmup) {
|
||||
var indexPos = checkpoints[sliceId];
|
||||
var indexEnd = targetsLength;
|
||||
if (warmup)
|
||||
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
|
||||
|
||||
// Range in the output for which we are responsible:
|
||||
var [outputStart, outputEnd] = ComputeSliceBounds(length, sliceId, numSlices);
|
||||
|
||||
for (; indexPos < indexEnd; indexPos++) {
|
||||
var x = self.get(indexPos);
|
||||
var t = checkTarget(indexPos, targets[indexPos]);
|
||||
if (t < outputStart || t >= outputEnd)
|
||||
continue;
|
||||
if (conflicts[t])
|
||||
x = collide(x, buffer[t]);
|
||||
UnsafeSetElement(buffer, t, x,
|
||||
conflicts, t, true,
|
||||
checkpoints, sliceId, indexPos + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function parDivideScatterVector() {
|
||||
// Subtle: because we will be mutating the localBuffers and
|
||||
// conflict arrays in place, we can never replay an entry in the
|
||||
// target array for fear of inducing a conflict where none existed
|
||||
// before. Therefore, we must proceed not by chunks but rather by
|
||||
// individual indices.
|
||||
var numSlices = ForkJoinSlices();
|
||||
var info = ComputeAllSliceBounds(targetsLength, numSlices);
|
||||
|
||||
// FIXME(bug 844890): Use typed arrays here.
|
||||
var localBuffers = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
UnsafeSetElement(localBuffers, i, NewDenseArray(length));
|
||||
var localConflicts = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
var conflicts_i = NewDenseArray(length);
|
||||
for (var j = 0; j < length; j++)
|
||||
UnsafeSetElement(conflicts_i, j, false);
|
||||
UnsafeSetElement(localConflicts, i, conflicts_i);
|
||||
}
|
||||
|
||||
// Initialize the 0th buffer, which will become the output. For
|
||||
// the other buffers, we track which parts have been written to
|
||||
// using the conflict buffer so they do not need to be
|
||||
// initialized.
|
||||
var outputBuffer = localBuffers[0];
|
||||
for (var i = 0; i < length; i++)
|
||||
UnsafeSetElement(outputBuffer, i, defaultValue);
|
||||
|
||||
ForkJoin(fill, CheckParallel(mode));
|
||||
mergeBuffers();
|
||||
return NewParallelArray(ParallelArrayView, [length], outputBuffer, 0);
|
||||
|
||||
function fill(sliceId, numSlices, warmup) {
|
||||
var indexPos = info[SLICE_POS(sliceId)];
|
||||
var indexEnd = info[SLICE_END(sliceId)];
|
||||
if (warmup)
|
||||
indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);
|
||||
|
||||
var localbuffer = localBuffers[sliceId];
|
||||
var conflicts = localConflicts[sliceId];
|
||||
while (indexPos < indexEnd) {
|
||||
var x = self.get(indexPos);
|
||||
var t = checkTarget(indexPos, targets[indexPos]);
|
||||
if (conflicts[t])
|
||||
x = collide(x, localbuffer[t]);
|
||||
UnsafeSetElement(localbuffer, t, x,
|
||||
conflicts, t, true,
|
||||
info, SLICE_POS(sliceId), ++indexPos);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Merge buffers 1..NUMSLICES into buffer 0. In principle, we could
|
||||
* parallelize the merge work as well. But for this first cut,
|
||||
* just do the merge sequentially.
|
||||
*/
|
||||
function mergeBuffers() {
|
||||
var buffer = localBuffers[0];
|
||||
var conflicts = localConflicts[0];
|
||||
for (var i = 1; i < numSlices; i++) {
|
||||
var otherbuffer = localBuffers[i];
|
||||
var otherconflicts = localConflicts[i];
|
||||
for (var j = 0; j < length; j++) {
|
||||
if (otherconflicts[j]) {
|
||||
if (conflicts[j]) {
|
||||
buffer[j] = collide(otherbuffer[j], buffer[j]);
|
||||
} else {
|
||||
buffer[j] = otherbuffer[j];
|
||||
conflicts[j] = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function seq() {
|
||||
var buffer = NewDenseArray(length);
|
||||
var conflicts = NewDenseArray(length);
|
||||
|
||||
for (var i = 0; i < length; i++) {
|
||||
UnsafeSetElement(buffer, i, defaultValue);
|
||||
UnsafeSetElement(conflicts, i, false);
|
||||
}
|
||||
|
||||
for (var i = 0; i < targetsLength; i++) {
|
||||
var x = self.get(i);
|
||||
var t = checkTarget(i, targets[i]);
|
||||
if (conflicts[t])
|
||||
x = collide(x, buffer[t]);
|
||||
|
||||
UnsafeSetElement(buffer, t, x,
|
||||
conflicts, t, true);
|
||||
}
|
||||
|
||||
return NewParallelArray(ParallelArrayView, [length], buffer, 0);
|
||||
}
|
||||
|
||||
function checkTarget(i, t) {
|
||||
if (TO_INT32(t) !== t)
|
||||
ThrowError(JSMSG_PAR_ARRAY_SCATTER_BAD_TARGET, i);
|
||||
|
||||
if (t < 0 || t >= length)
|
||||
ThrowError(JSMSG_PAR_ARRAY_SCATTER_BOUNDS);
|
||||
|
||||
// It's not enough to return t, as -0 | 0 === -0.
|
||||
return TO_INT32(t);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The familiar filter() operation applied across the outermost
|
||||
* dimension.
|
||||
*/
|
||||
function ParallelArrayFilter(func, mode) {
|
||||
// FIXME(bug 844887): Check |this instanceof ParallelArray|
|
||||
// FIXME(bug 844887): Check |IsCallable(func)|
|
||||
|
||||
var self = this;
|
||||
var length = self.shape[0];
|
||||
|
||||
parallel: for (;;) { // see ParallelArrayBuild() to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices * 2)
|
||||
break parallel;
|
||||
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
|
||||
// Step 1. Compute which items from each slice of the result
|
||||
// buffer should be preserved. When we're done, we have an array
|
||||
// |survivors| containing a bitset for each chunk, indicating
|
||||
// which members of the chunk survived. We also keep an array
|
||||
// |counts| containing the total number of items that are being
|
||||
// preserved from within one slice.
|
||||
//
|
||||
// FIXME(bug 844890): Use typed arrays here.
|
||||
var counts = NewDenseArray(numSlices);
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
UnsafeSetElement(counts, i, 0);
|
||||
var survivors = NewDenseArray(chunks);
|
||||
ForkJoin(findSurvivorsInSlice, CheckParallel(mode));
|
||||
|
||||
// Step 2. Compress the slices into one contiguous set.
|
||||
var count = 0;
|
||||
for (var i = 0; i < numSlices; i++)
|
||||
count += counts[i];
|
||||
var buffer = NewDenseArray(count);
|
||||
if (count > 0)
|
||||
ForkJoin(copySurvivorsInSlice, CheckParallel(mode));
|
||||
|
||||
return NewParallelArray(ParallelArrayView, [count], buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
var buffer = [];
|
||||
for (var i = 0; i < length; i++) {
|
||||
var elem = self.get(i);
|
||||
if (func(elem, i, self))
|
||||
ARRAY_PUSH(buffer, elem);
|
||||
}
|
||||
return NewParallelArray(ParallelArrayView, [buffer.length], buffer, 0);
|
||||
|
||||
/**
|
||||
* As described above, our goal is to determine which items we
|
||||
* will preserve from a given slice. We do this one chunk at a
|
||||
* time. When we finish a chunk, we record our current count and
|
||||
* the next chunk sliceId, lest we should bail.
|
||||
*/
|
||||
function findSurvivorsInSlice(sliceId, numSlices, warmup) {
|
||||
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos)
|
||||
chunkEnd = chunkPos + 1;
|
||||
|
||||
var count = counts[sliceId];
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
var chunkBits = 0;
|
||||
|
||||
for (var bit = 0; indexStart + bit < indexEnd; bit++) {
|
||||
var keep = !!func(self.get(indexStart + bit), indexStart + bit, self);
|
||||
chunkBits |= keep << bit;
|
||||
count += keep;
|
||||
}
|
||||
|
||||
UnsafeSetElement(survivors, chunkPos, chunkBits,
|
||||
counts, sliceId, count,
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
function copySurvivorsInSlice(sliceId, numSlices, warmup) {
|
||||
// Copies the survivors from this slice into the correct position.
|
||||
// Note that this is an idempotent operation that does not invoke
|
||||
// user code. Therefore, we don't expect bailouts and make an
|
||||
// effort to proceed chunk by chunk or avoid duplicating work.
|
||||
|
||||
// During warmup, we only execute with sliceId 0. This would fail to
|
||||
// execute the loop below. Therefore, during warmup, we
|
||||
// substitute 1 for the sliceId.
|
||||
if (warmup && sliceId == 0 && numSlices != 1)
|
||||
sliceId = 1;
|
||||
|
||||
// Total up the items preserved by previous slices.
|
||||
var count = 0;
|
||||
if (sliceId > 0) { // FIXME(#819219)---work around a bug in Ion's range checks
|
||||
for (var i = 0; i < sliceId; i++)
|
||||
count += counts[i];
|
||||
}
|
||||
|
||||
// Compute the final index we expect to write.
|
||||
var total = count + counts[sliceId];
|
||||
if (count == total)
|
||||
return;
|
||||
|
||||
// Iterate over the chunks assigned to us. Read the bitset for
|
||||
// each chunk. Copy values where a 1 appears until we have
|
||||
// written all the values that we expect to. We can just iterate
|
||||
// from 0...CHUNK_SIZE without fear of a truncated final chunk
|
||||
// because we are already checking for when count==total.
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
for (var chunk = chunkStart; chunk < chunkEnd; chunk++) {
|
||||
var chunkBits = survivors[chunk];
|
||||
if (!chunkBits)
|
||||
continue;
|
||||
|
||||
var indexStart = chunk << CHUNK_SHIFT;
|
||||
for (var i = 0; i < CHUNK_SIZE; i++) {
|
||||
if (chunkBits & (1 << i)) {
|
||||
UnsafeSetElement(buffer, count++, self.get(indexStart + i));
|
||||
if (count == total)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Divides the outermost dimension into two dimensions. Does not copy
|
||||
* or affect the underlying data, just how it is divided amongst
|
||||
|
@ -1101,11 +335,11 @@ function ParallelArrayFilter(func, mode) {
|
|||
* N must be evenly divisible by 4 in that case.
|
||||
*/
|
||||
function ParallelArrayPartition(amount) {
|
||||
if (amount >>> 0 !== amount)
|
||||
if (!IS_UINT32(amount))
|
||||
ThrowError(JSMSG_PAR_ARRAY_BAD_ARG, "");
|
||||
|
||||
var length = this.shape[0];
|
||||
var partitions = (length / amount) | 0;
|
||||
var partitions = TO_INT32(length / amount);
|
||||
|
||||
if (partitions * amount !== length)
|
||||
ThrowError(JSMSG_PAR_ARRAY_BAD_PARTITION);
|
||||
|
@ -1242,46 +476,9 @@ function ParallelArrayToString() {
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal debugging tool: checks that the given `mode` permits
|
||||
* sequential execution
|
||||
*/
|
||||
function AssertSequentialIsOK(mode) {
|
||||
if (mode && mode.mode && mode.mode !== "seq" && ParallelTestsShouldPass())
|
||||
ThrowError(JSMSG_WRONG_VALUE, "parallel execution", "sequential was forced");
|
||||
}
|
||||
|
||||
/**
|
||||
* Internal debugging tool: returns a function to be supplied to
|
||||
* ForkJoin() that will check that the parallel results
|
||||
* bailout/succeed as expected. Returns null if no mode is supplied
|
||||
* or we are building with some strange IF_DEF configuration such that
|
||||
* we don't expect parallel execution to work.
|
||||
*/
|
||||
function CheckParallel(mode) {
|
||||
if (!mode || !ParallelTestsShouldPass())
|
||||
return null;
|
||||
|
||||
return function(result, bailouts, causes) {
|
||||
if (!("expect" in mode) || mode.expect === "any") {
|
||||
return; // Ignore result when unspecified or unimportant.
|
||||
} else if (mode.expect === "mixed" && result !== "disqualified") {
|
||||
return; // "mixed" means that it may bailout, may succeed
|
||||
} else if (result === mode.expect) {
|
||||
return;
|
||||
}
|
||||
|
||||
ThrowError(JSMSG_WRONG_VALUE, mode.expect,
|
||||
result+":"+bailouts+":"+causes);
|
||||
};
|
||||
}
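For context, a hedged sketch of how a test might drive these hooks through the optional |mode| argument (|pa| stands for a ParallelArray instance); the property names |mode| and |expect| come from the code above, but the exact set of result strings other than "disqualified" is not shown in this patch and is assumed here:

// pa.map(kernel, { mode: "par", expect: "any" });
//   - TRY_PARALLEL(mode) holds, so the parallel path is attempted;
//   - CheckParallel(mode) returns a callback that ignores the outcome,
//     since mode.expect is "any".
//
// pa.map(kernel, { mode: "seq" });
//   - TRY_PARALLEL(mode) is false, so execution falls through to the
//     sequential path, and ASSERT_SEQUENTIAL_IS_OK(mode) calls
//     AssertSequentialIsOK, which throws only if mode.mode is some
//     non-"seq" value while ParallelTestsShouldPass().
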
|
||||
|
||||
/*
|
||||
* Mark the main operations as clone-at-callsite for better precision.
|
||||
* This is slightly overkill, as all that we really need is to
|
||||
* specialize to the receiver and the elemental function, but in
|
||||
* practice this is likely not so different, since element functions
|
||||
* are often used in exactly one place.
|
||||
* Mark the comprehension form and friends as clone-at-callsite. See note in
|
||||
* ParallelArrayCommonOps.js
|
||||
*/
|
||||
SetScriptHints(ParallelArrayConstructEmpty, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayConstructFromArray, { cloneAtCallsite: true });
|
||||
|
@ -1290,11 +487,6 @@ SetScriptHints(ParallelArrayConstructFromFunctionMode, { cloneAtCallsite: true }
|
|||
SetScriptHints(ParallelArrayConstructFromComprehension, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayView, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayBuild, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayMap, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayReduce, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayScan, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayScatter, { cloneAtCallsite: true });
|
||||
SetScriptHints(ParallelArrayFilter, { cloneAtCallsite: true });
|
||||
|
||||
/*
|
||||
* Mark the common getters as clone-at-callsite and inline. This is
|
||||
|
|
|
@@ -0,0 +1,881 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

/*
 * This file includes utility functions, macros, and "templates" for the
 * following 1D parallel operations which operate on the outermost dimension,
 * so that they may be shared between the Array and Matrix operations.
 *
 * - map
 * - reduce
 * - scan
 * - scatter
 * - filter
 *
 * Operations that are more closely tied to the dimensionality of the data,
 * like build, are in the constructors' respective files.
 *
 * This file is designed to be #include'd with the following macros defined:
 *
 * PA_LENGTH(array)
 * PA_GET(array, index)
 * PA_NEW(length, buffer, offset)
 *
 * The internal names for the functions should also be defined:
 *
 * PA_MAP_NAME
 * PA_REDUCE_NAME
 * PA_SCAN_NAME
 * PA_SCATTER_NAME
 * PA_FILTER_NAME
 *
 * See Array.js for a usage example.
 */

/* The mode asserts options object. */
#define TRY_PARALLEL(MODE) \
  ((!MODE || MODE.mode === "par"))
#define ASSERT_SEQUENTIAL_IS_OK(MODE) \
  do { if (MODE) AssertSequentialIsOK(MODE) } while(false)

/* Slice array: see ComputeAllSliceBounds() */
#define SLICE_INFO(START, END) START, END, START, 0
#define SLICE_START(ID) ((ID << 2) + 0)
#define SLICE_END(ID) ((ID << 2) + 1)
#define SLICE_POS(ID) ((ID << 2) + 2)

/*
 * How many items at a time do we do recomp. for parallel execution.
 * Note that filter currently assumes that this is no greater than 32
 * in order to make use of a bitset.
 */
#define CHUNK_SHIFT 5
#define CHUNK_SIZE 32

/* Safe versions of ARRAY.push(ELEMENT) */
#define ARRAY_PUSH(ARRAY, ELEMENT) \
  callFunction(std_Array_push, ARRAY, ELEMENT);
#define ARRAY_SLICE(ARRAY, ELEMENT) \
  callFunction(std_Array_slice, ARRAY, ELEMENT);

/**
|
||||
* Determine the number of chunks of size CHUNK_SIZE;
|
||||
* note that the final chunk may be smaller than CHUNK_SIZE.
|
||||
*/
|
||||
function ComputeNumChunks(length) {
|
||||
var chunks = length >>> CHUNK_SHIFT;
|
||||
if (chunks << CHUNK_SHIFT === length)
|
||||
return chunks;
|
||||
return chunks + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the bounds for slice |sliceIndex| of |numItems| items,
|
||||
* assuming |numSlices| total slices. If numItems is not evenly
|
||||
* divisible by numSlices, then the final thread may have a bit of
|
||||
* extra work.
|
||||
*/
|
||||
function ComputeSliceBounds(numItems, sliceIndex, numSlices) {
|
||||
var sliceWidth = TO_INT32(numItems / numSlices);
|
||||
var startIndex = sliceWidth * sliceIndex;
|
||||
var endIndex = sliceIndex === numSlices - 1 ? numItems : sliceWidth * (sliceIndex + 1);
|
||||
return [startIndex, endIndex];
|
||||
}
|
||||
|
||||
/**
|
||||
* Divides |numItems| items amongst |numSlices| slices. The result
|
||||
* is an array containing multiple values per slice: the start
|
||||
* index, end index, current position, and some padding. The
|
||||
* current position is initially the same as the start index. To
|
||||
* access the values for a particular slice, use the macros
|
||||
* SLICE_START() and so forth.
|
||||
*/
|
||||
function ComputeAllSliceBounds(numItems, numSlices) {
|
||||
// FIXME(bug 844890): Use typed arrays here.
|
||||
var info = [];
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
var [start, end] = ComputeSliceBounds(numItems, i, numSlices);
|
||||
ARRAY_PUSH(info, SLICE_INFO(start, end));
|
||||
}
|
||||
return info;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new array by applying |func(e, i, self)| for each element |e|
|
||||
* with index |i|.
|
||||
*/
|
||||
function PA_MAP_NAME(func, mode) {
|
||||
if (!IsCallable(func))
|
||||
ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));
|
||||
|
||||
var self = ToObject(this);
|
||||
var length = PA_LENGTH(self);
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) {
|
||||
// Avoid parallel compilation if we are already nested in another
|
||||
// parallel section or the user told us not to parallelize. The
|
||||
// use of a for (;;) loop is working around some ion limitations:
|
||||
//
|
||||
// - Breaking out of named blocks does not currently work (bug 684384);
|
||||
// - Unreachable Code Elim. can't properly handle if (a && b) (bug 669796)
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
ForkJoin(mapSlice, CheckParallel(mode));
|
||||
return PA_NEW(length, buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
for (var i = 0; i < length; i++) {
|
||||
// Note: Unlike JS arrays, parallel arrays cannot have holes.
|
||||
var v = func(PA_GET(self, i), i, self);
|
||||
UnsafeSetElement(buffer, i, v);
|
||||
}
|
||||
return PA_NEW(length, buffer, 0);
|
||||
|
||||
function mapSlice(sliceId, numSlices, warmup) {
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 1)
|
||||
chunkEnd = chunkPos + 1;
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexStart = chunkPos << CHUNK_SHIFT;
|
||||
var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
|
||||
|
||||
for (var i = indexStart; i < indexEnd; i++)
|
||||
UnsafeSetElement(buffer, i, func(PA_GET(self, i), i, self));
|
||||
|
||||
UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reduces the elements in an array in parallel. Order is not fixed and |func|
|
||||
* is assumed to be associative.
|
||||
*/
|
||||
function PA_REDUCE_NAME(func, mode) {
|
||||
if (!IsCallable(func))
|
||||
ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));
|
||||
|
||||
var self = ToObject(this);
|
||||
var length = PA_LENGTH(self);
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
parallel: for (;;) { // see map to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
var subreductions = NewDenseArray(numSlices);
|
||||
ForkJoin(reduceSlice, CheckParallel(mode));
|
||||
var accumulator = subreductions[0];
|
||||
for (var i = 1; i < numSlices; i++)
|
||||
accumulator = func(accumulator, subreductions[i]);
|
||||
return accumulator;
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
var accumulator = PA_GET(self, 0);
|
||||
for (var i = 1; i < length; i++)
|
||||
accumulator = func(accumulator, PA_GET(self, i));
|
||||
return accumulator;
|
||||
|
||||
function reduceSlice(sliceId, numSlices, warmup) {
|
||||
var chunkStart = info[SLICE_START(sliceId)];
|
||||
var chunkPos = info[SLICE_POS(sliceId)];
|
||||
var chunkEnd = info[SLICE_END(sliceId)];
|
||||
|
||||
// (*) This function is carefully designed so that the warmup
|
||||
// (which executes with chunkStart === chunkPos) will execute all
|
||||
// potential loads and stores. In particular, the warmup run
|
||||
// processes two chunks rather than one. Moreover, it stores
|
||||
// accumulator into subreductions and then loads it again to
|
||||
// ensure that the load is executed during the warmup, as it will
|
||||
// certainly be executed during subsequent runs.
|
||||
|
||||
if (warmup && chunkEnd > chunkPos + 2)
|
||||
chunkEnd = chunkPos + 2;
|
||||
|
||||
if (chunkStart === chunkPos) {
|
||||
var indexPos = chunkStart << CHUNK_SHIFT;
|
||||
var accumulator = reduceChunk(PA_GET(self, indexPos), indexPos + 1, indexPos + CHUNK_SIZE);
|
||||
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator, // see (*) above
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
|
||||
var accumulator = subreductions[sliceId]; // see (*) above
|
||||
|
||||
while (chunkPos < chunkEnd) {
|
||||
var indexPos = chunkPos << CHUNK_SHIFT;
|
||||
accumulator = reduceChunk(accumulator, indexPos, indexPos + CHUNK_SIZE);
|
||||
UnsafeSetElement(subreductions, sliceId, accumulator,
|
||||
info, SLICE_POS(sliceId), ++chunkPos);
|
||||
}
|
||||
}
|
||||
|
||||
function reduceChunk(accumulator, from, to) {
|
||||
to = std_Math_min(to, length);
|
||||
for (var i = from; i < to; i++)
|
||||
accumulator = func(accumulator, PA_GET(self, i));
|
||||
return accumulator;
|
||||
}
|
||||
}
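A short trace of the parallel reduce path above, with illustrative numbers that are not taken from the patch:

// Suppose PA_LENGTH(self) = 96 (exactly 3 chunks of CHUNK_SIZE = 32) and
// ForkJoinSlices() = 3, so ComputeAllSliceBounds(3, 3) gives each slice one
// chunk.  With func = function (a, b) { return a + b; } over elements e0..e95:
//
//   reduceSlice(0) stores subreductions[0] = e0  + ... + e31
//   reduceSlice(1) stores subreductions[1] = e32 + ... + e63
//   reduceSlice(2) stores subreductions[2] = e64 + ... + e95
//
// and the driver then folds the partial results left to right:
//
//   accumulator = func(func(subreductions[0], subreductions[1]),
//                      subreductions[2]);
//
// This is why |func| is assumed to be associative: the grouping differs from
// the sequential fallback's strictly left-to-right reduction, even though the
// element order within each slice is preserved.
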
|
||||
|
||||
/**
|
||||
* Returns an array [s_0, ..., s_N] where |s_i| is equal to the reduction (as
|
||||
* per |reduce()|) of elements |0..i|. This is the generalization of partial
|
||||
* sum.
|
||||
*/
|
||||
function PA_SCAN_NAME(func, mode) {
|
||||
if (!IsCallable(func))
|
||||
ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));
|
||||
|
||||
var self = ToObject(this);
|
||||
var length = PA_LENGTH(self);
|
||||
|
||||
if (length === 0)
|
||||
ThrowError(JSMSG_PAR_ARRAY_REDUCE_EMPTY);
|
||||
|
||||
var buffer = NewDenseArray(length);
|
||||
|
||||
parallel: for (;;) { // see map to explain why for(;;) etc
|
||||
if (ShouldForceSequential())
|
||||
break parallel;
|
||||
if (!TRY_PARALLEL(mode))
|
||||
break parallel;
|
||||
|
||||
var chunks = ComputeNumChunks(length);
|
||||
var numSlices = ForkJoinSlices();
|
||||
if (chunks < numSlices)
|
||||
break parallel;
|
||||
var info = ComputeAllSliceBounds(chunks, numSlices);
|
||||
|
||||
// Scan slices individually (see comment on phase1()).
|
||||
ForkJoin(phase1, CheckParallel(mode));
|
||||
|
||||
// Compute intermediates array (see comment on phase2()).
|
||||
var intermediates = [];
|
||||
var accumulator = buffer[finalElement(0)];
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
for (var i = 1; i < numSlices - 1; i++) {
|
||||
accumulator = func(accumulator, buffer[finalElement(i)]);
|
||||
ARRAY_PUSH(intermediates, accumulator);
|
||||
}
|
||||
|
||||
// Reset the current position information for each slice, but
|
||||
// convert from chunks to indices (see comment on phase2()).
|
||||
for (var i = 0; i < numSlices; i++) {
|
||||
info[SLICE_POS(i)] = info[SLICE_START(i)] << CHUNK_SHIFT;
|
||||
info[SLICE_END(i)] = info[SLICE_END(i)] << CHUNK_SHIFT;
|
||||
}
|
||||
info[SLICE_END(numSlices - 1)] = std_Math_min(info[SLICE_END(numSlices - 1)], length);
|
||||
|
||||
// Complete each slice using intermediates array (see comment on phase2()).
|
||||
ForkJoin(phase2, CheckParallel(mode));
|
||||
return PA_NEW(length, buffer, 0);
|
||||
}
|
||||
|
||||
// Sequential fallback:
|
||||
ASSERT_SEQUENTIAL_IS_OK(mode);
|
||||
scan(PA_GET(self, 0), 0, length);
|
||||
return PA_NEW(length, buffer, 0);
|
||||
|
||||
function scan(accumulator, start, end) {
|
||||
UnsafeSetElement(buffer, start, accumulator);
|
||||
for (var i = start + 1; i < end; i++) {
|
||||
accumulator = func(accumulator, PA_GET(self, i));
|
||||
UnsafeSetElement(buffer, i, accumulator);
|
||||
}
|
||||
return accumulator;
|
||||
}

  /**
   * In phase 1, we divide the source array into |numSlices| slices and
   * compute scan on each slice sequentially as if it were the entire
   * array. This function is responsible for computing one of those
   * slices.
   *
   * So, if we have an array [A,B,C,D,E,F,G,H,I], |numSlices == 3|,
   * and our function |func| is sum, then we would wind up computing a
   * result array like:
   *
   *     [A, A+B, A+B+C, D, D+E, D+E+F, G, G+H, G+H+I]
   *      ^~~~~~~~~~~~^  ^~~~~~~~~~~~^  ^~~~~~~~~~~~^
   *         Slice 0        Slice 1        Slice 2
   *
   * Read on in phase2 to see what we do next!
   */
  function phase1(sliceId, numSlices, warmup) {
    var chunkStart = info[SLICE_START(sliceId)];
    var chunkPos = info[SLICE_POS(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];

    if (warmup && chunkEnd > chunkPos + 2)
      chunkEnd = chunkPos + 2;

    if (chunkPos == chunkStart) {
      // For the first chunk, the accumulator begins as the value in
      // the input at the start of the chunk.
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      scan(PA_GET(self, indexStart), indexStart, indexEnd);
      UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
    }

    while (chunkPos < chunkEnd) {
      // For each subsequent chunk, the accumulator begins as the
      // combination of the final value of the previous chunk and the
      // value in the input at the start of this chunk. Note that this
      // loop is written as simply as possible, at the cost of an extra
      // read from the buffer per iteration.
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      var accumulator = func(buffer[indexStart - 1], PA_GET(self, indexStart));
      scan(accumulator, indexStart, indexEnd);
      UnsafeSetElement(info, SLICE_POS(sliceId), ++chunkPos);
    }
  }

  /**
   * Computes the index of the final element computed by the slice |sliceId|.
   */
  function finalElement(sliceId) {
    var chunkEnd = info[SLICE_END(sliceId)]; // last chunk written by |sliceId| is chunkEnd - 1
    var indexStart = std_Math_min(chunkEnd << CHUNK_SHIFT, length);
    return indexStart - 1;
  }
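
  /*
   * Worked example (illustrative only; the concrete CHUNK_SHIFT/CHUNK_SIZE
   * values are defined elsewhere and assumed here to be 5 and 32): if
   * info[SLICE_END(sliceId)] is 3, the slice covers chunks 0..2, i.e.
   * indices 0..95, so finalElement() returns min(3 << 5, length) - 1,
   * which is 95 when length >= 96 and length - 1 when the final chunk
   * is short.
   */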

  /**
   * After computing the phase1 results, we compute an
   * |intermediates| array. |intermediates[i]| contains the result
   * of reducing the final value from each preceding slice j<i with
   * the final value of slice i. So, to continue our previous
   * example, the intermediates array would contain:
   *
   *     [A+B+C, (A+B+C)+(D+E+F), ((A+B+C)+(D+E+F))+(G+H+I)]
   *
   * Here I have used parenthesization to make clear the order of
   * evaluation in each case.
   *
   * An aside: currently the intermediates array is computed
   * sequentially. In principle, we could compute it in parallel,
   * at the cost of doing duplicate work. This did not seem
   * particularly advantageous to me, especially as the number
   * of slices is typically quite small (one per core), so I opted
   * to just compute it sequentially.
   *
   * Phase 2 combines the results of phase1 with the intermediates
   * array to produce the final scan results. The idea is to
   * reiterate over each element S[i] in the slice |sliceId|, which
   * currently contains the result of reducing S[0]...S[i]
   * (where S[0] is the first element in the slice), and combine that
   * with |intermediates[sliceId-1]|, which represents the result of
   * reducing everything in the input array prior to the slice.
   *
   * To continue with our example, in phase 1 we computed slice 1 to
   * be [D, D+E, D+E+F]. We will combine those results with
   * |intermediates[1-1]|, which is |A+B+C|, so that the final
   * result is [(A+B+C)+D, (A+B+C)+(D+E), (A+B+C)+(D+E+F)]. Again I
   * am using parentheses to clarify how these results were reduced.
   *
   * SUBTLE: Because we are mutating |buffer| in place, we have to
   * be very careful about bailouts! We cannot checkpoint a chunk
   * at a time as we do elsewhere because that assumes it is safe to
   * replay the portion of a chunk which was already processed.
   * Therefore, in this phase, we track the current position at an
   * index granularity, although this requires two memory writes per
   * index.
   */
  function phase2(sliceId, numSlices, warmup) {
    if (sliceId == 0)
      return; // No work to do for the 0th slice.

    var indexPos = info[SLICE_POS(sliceId)];
    var indexEnd = info[SLICE_END(sliceId)];

    if (warmup)
      indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);

    var intermediate = intermediates[sliceId - 1];
    for (; indexPos < indexEnd; indexPos++) {
      UnsafeSetElement(buffer, indexPos, func(intermediate, buffer[indexPos]),
                       info, SLICE_POS(sliceId), indexPos + 1);
    }
  }
}
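
/*
 * Illustrative sketch (not part of this patch): with a sum function, the
 * scan above produces the running reductions shown in the phase1() and
 * phase2() comments. Assuming the operation is exposed as
 * |Array.prototype.pscan| (see the JSFunctionSpec hunk further below), a
 * call might look like:
 *
 *   var sums = [1, 2, 3, 4, 5].pscan(function (a, b) { return a + b; });
 *   // sums is [1, 3, 6, 10, 15]: element i holds the reduction of
 *   // elements 0..i, whether the parallel path or the sequential
 *   // scan() fallback executed.
 */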

/**
 * |scatter()| redistributes the elements in the array into a new array.
 *
 * - targets: The index targets[i] indicates where the ith element
 *   should appear in the result.
 *
 * - defaultValue: what value to use for indices in the output array that
 *   are never targeted.
 *
 * - conflictFunc: The conflict function. If two indices i and j in the
 *   source array are targeted at the same destination (i.e.,
 *   targets[i] == targets[j]), the final result for that destination is
 *   determined by applying the conflict function to the two conflicting
 *   elements. If no conflict function is provided, it is an error if
 *   targets[i] == targets[j].
 *
 * - length: length of the output array (if not specified, uses the
 *   length of the input).
 *
 * - mode: internal debugging specification.
 */
function PA_SCATTER_NAME(targets, defaultValue, conflictFunc, length, mode) {
  // FIXME(bug 844887): Check targets is array-like

  if (conflictFunc && !IsCallable(conflictFunc))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(2, conflictFunc));

  var self = ToObject(this);

  if (length === undefined)
    length = PA_LENGTH(self);

  // The Divide-Scatter-Vector strategy:
  // 1. Slice |targets| array of indices ("scatter-vector") into N
  //    parts.
  // 2. Each of the N threads prepares an output buffer and a
  //    write-log.
  // 3. Each thread scatters according to one of the N parts into its
  //    own output buffer, tracking written indices in the write-log
  //    and resolving any resulting local collisions in parallel.
  // 4. Merge the parts (either in parallel or sequentially), using
  //    the write-logs as both the basis for finding merge-inputs and
  //    for detecting collisions.

  // The Divide-Output-Range strategy:
  // 1. Slice the range of indices [0..|length|-1] into N parts.
  //    Allocate a single shared output buffer of length |length|.
  // 2. Each of the N threads scans (the entirety of) the |targets|
  //    array, seeking occurrences of indices from that thread's part
  //    of the range, and writing the results into the shared output
  //    buffer.
  // 3. Since each thread has its own portion of the output range,
  //    every collision that occurs can be handled thread-locally.

  // SO:
  //
  // If |targets.length| >> |length|, Divide-Scatter-Vector seems like
  // a clear win over Divide-Output-Range, since for the latter, the
  // expense of redundantly scanning the |targets| will diminish the
  // gain from processing |length| in parallel, while for the former,
  // the total expense of building separate output buffers and the
  // merging post-process is small compared to the gain from
  // processing |targets| in parallel.
  //
  // If |targets.length| << |length|, then Divide-Output-Range seems
  // like it *could* win over Divide-Scatter-Vector. (But when is
  // |targets.length| << |length| or even |targets.length| < |length|?
  // Seems like an odd situation and an uncommon case at best.)
  //
  // The unanswered question is which strategy performs better when
  // |targets.length| approximately equals |length|, especially for
  // special cases like collision-free scatters and permutations.

  if (!IS_UINT32(targets.length))
    ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");

  var targetsLength = std_Math_min(targets.length, self.length);

  if (!IS_UINT32(length))
    ThrowError(JSMSG_BAD_ARRAY_LENGTH, ".prototype.scatter");

  parallel: for (;;) { // see map to explain why for(;;) etc
    if (ShouldForceSequential())
      break parallel;
    if (!TRY_PARALLEL(mode))
      break parallel;

    if (forceDivideScatterVector())
      return parDivideScatterVector();
    else if (forceDivideOutputRange())
      return parDivideOutputRange();
    else if (conflictFunc === undefined && targetsLength < length)
      return parDivideOutputRange();
    return parDivideScatterVector();
  }

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);
  return seq();

  function forceDivideScatterVector() {
    return mode && mode.strategy && mode.strategy == "divide-scatter-vector";
  }

  function forceDivideOutputRange() {
    return mode && mode.strategy && mode.strategy == "divide-output-range";
  }

  function collide(elem1, elem2) {
    if (conflictFunc === undefined)
      ThrowError(JSMSG_PAR_ARRAY_SCATTER_CONFLICT);

    return conflictFunc(elem1, elem2);
  }

  function parDivideOutputRange() {
    var chunks = ComputeNumChunks(targetsLength);
    var numSlices = ForkJoinSlices();
    var checkpoints = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++)
      UnsafeSetElement(checkpoints, i, 0);

    var buffer = NewDenseArray(length);
    var conflicts = NewDenseArray(length);

    for (var i = 0; i < length; i++) {
      UnsafeSetElement(buffer, i, defaultValue);
      UnsafeSetElement(conflicts, i, false);
    }

    ForkJoin(fill, CheckParallel(mode));
    return PA_NEW(length, buffer, 0);

    function fill(sliceId, numSlices, warmup) {
      var indexPos = checkpoints[sliceId];
      var indexEnd = targetsLength;
      if (warmup)
        indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);

      // Range in the output for which we are responsible:
      var [outputStart, outputEnd] = ComputeSliceBounds(length, sliceId, numSlices);

      for (; indexPos < indexEnd; indexPos++) {
        var x = PA_GET(self, indexPos);
        var t = checkTarget(indexPos, targets[indexPos]);
        if (t < outputStart || t >= outputEnd)
          continue;
        if (conflicts[t])
          x = collide(x, buffer[t]);
        UnsafeSetElement(buffer, t, x,
                         conflicts, t, true,
                         checkpoints, sliceId, indexPos + 1);
      }
    }
  }

  function parDivideScatterVector() {
    // Subtle: because we will be mutating the localBuffers and
    // conflict arrays in place, we can never replay an entry in the
    // target array for fear of inducing a conflict where none existed
    // before. Therefore, we must proceed not by chunks but rather by
    // individual indices.
    var numSlices = ForkJoinSlices();
    var info = ComputeAllSliceBounds(targetsLength, numSlices);

    // FIXME(bug 844890): Use typed arrays here.
    var localBuffers = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++)
      UnsafeSetElement(localBuffers, i, NewDenseArray(length));
    var localConflicts = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++) {
      var conflicts_i = NewDenseArray(length);
      for (var j = 0; j < length; j++)
        UnsafeSetElement(conflicts_i, j, false);
      UnsafeSetElement(localConflicts, i, conflicts_i);
    }

    // Initialize the 0th buffer, which will become the output. For
    // the other buffers, we track which parts have been written to
    // using the conflict buffer so they do not need to be
    // initialized.
    var outputBuffer = localBuffers[0];
    for (var i = 0; i < length; i++)
      UnsafeSetElement(outputBuffer, i, defaultValue);

    ForkJoin(fill, CheckParallel(mode));
    mergeBuffers();
    return PA_NEW(length, outputBuffer, 0);

    function fill(sliceId, numSlices, warmup) {
      var indexPos = info[SLICE_POS(sliceId)];
      var indexEnd = info[SLICE_END(sliceId)];
      if (warmup)
        indexEnd = std_Math_min(indexEnd, indexPos + CHUNK_SIZE);

      var localbuffer = localBuffers[sliceId];
      var conflicts = localConflicts[sliceId];
      while (indexPos < indexEnd) {
        var x = PA_GET(self, indexPos);
        var t = checkTarget(indexPos, targets[indexPos]);
        if (conflicts[t])
          x = collide(x, localbuffer[t]);
        UnsafeSetElement(localbuffer, t, x,
                         conflicts, t, true,
                         info, SLICE_POS(sliceId), ++indexPos);
      }
    }

    /**
     * Merge buffers 1..NUMSLICES into buffer 0. In principle, we could
     * parallelize the merge work as well. But for this first cut,
     * just do the merge sequentially.
     */
    function mergeBuffers() {
      var buffer = localBuffers[0];
      var conflicts = localConflicts[0];
      for (var i = 1; i < numSlices; i++) {
        var otherbuffer = localBuffers[i];
        var otherconflicts = localConflicts[i];
        for (var j = 0; j < length; j++) {
          if (otherconflicts[j]) {
            if (conflicts[j]) {
              buffer[j] = collide(otherbuffer[j], buffer[j]);
            } else {
              buffer[j] = otherbuffer[j];
              conflicts[j] = true;
            }
          }
        }
      }
    }
  }

  function seq() {
    var buffer = NewDenseArray(length);
    var conflicts = NewDenseArray(length);

    for (var i = 0; i < length; i++) {
      UnsafeSetElement(buffer, i, defaultValue);
      UnsafeSetElement(conflicts, i, false);
    }

    for (var i = 0; i < targetsLength; i++) {
      var x = PA_GET(self, i);
      var t = checkTarget(i, targets[i]);
      if (conflicts[t])
        x = collide(x, buffer[t]);

      UnsafeSetElement(buffer, t, x,
                       conflicts, t, true);
    }

    return PA_NEW(length, buffer, 0);
  }

  function checkTarget(i, t) {
    if (TO_INT32(t) !== t)
      ThrowError(JSMSG_PAR_ARRAY_SCATTER_BAD_TARGET, i);

    if (t < 0 || t >= length)
      ThrowError(JSMSG_PAR_ARRAY_SCATTER_BOUNDS);

    // It's not enough to return t: t may be -0, which passes the check
    // above (since -0 === 0), so normalize it with TO_INT32.
    return TO_INT32(t);
  }
}
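
/*
 * Illustrative sketch (not part of this patch): assuming the operation is
 * exposed as |Array.prototype.pscatter| (see the JSFunctionSpec hunk further
 * below), a permutation and a conflict-resolving scatter might look like:
 *
 *   // Reverse a 3-element array: element i goes to index 2 - i.
 *   ["a", "b", "c"].pscatter([2, 1, 0], undefined);  // ["c", "b", "a"]
 *
 *   // Two elements target index 0; the collision is resolved by summing.
 *   // Index 2 is never targeted, so it receives the defaultValue (0).
 *   [1, 2, 3].pscatter([0, 0, 1], 0,
 *                      function (x, y) { return x + y; },
 *                      3);                            // [3, 3, 0]
 */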

/**
 * The familiar filter operation applied in parallel.
 */
function PA_FILTER_NAME(func, mode) {
  if (!IsCallable(func))
    ThrowError(JSMSG_NOT_FUNCTION, DecompileArg(0, func));

  var self = ToObject(this);
  var length = PA_LENGTH(self);

  parallel: for (;;) { // see map to explain why for(;;) etc
    if (ShouldForceSequential())
      break parallel;
    if (!TRY_PARALLEL(mode))
      break parallel;

    var chunks = ComputeNumChunks(length);
    var numSlices = ForkJoinSlices();
    if (chunks < numSlices * 2)
      break parallel;

    var info = ComputeAllSliceBounds(chunks, numSlices);

    // Step 1. Compute which items from each slice of the result
    // buffer should be preserved. When we're done, we have an array
    // |survivors| containing a bitset for each chunk, indicating
    // which members of the chunk survived. We also keep an array
    // |counts| containing the total number of items that are being
    // preserved from within one slice.
    //
    // FIXME(bug 844890): Use typed arrays here.
    var counts = NewDenseArray(numSlices);
    for (var i = 0; i < numSlices; i++)
      UnsafeSetElement(counts, i, 0);
    var survivors = NewDenseArray(chunks);
    ForkJoin(findSurvivorsInSlice, CheckParallel(mode));

    // Step 2. Compress the slices into one contiguous set.
    var count = 0;
    for (var i = 0; i < numSlices; i++)
      count += counts[i];
    var buffer = NewDenseArray(count);
    if (count > 0)
      ForkJoin(copySurvivorsInSlice, CheckParallel(mode));

    return PA_NEW(count, buffer, 0);
  }

  // Sequential fallback:
  ASSERT_SEQUENTIAL_IS_OK(mode);
  var buffer = [];
  for (var i = 0; i < length; i++) {
    var elem = PA_GET(self, i);
    if (func(elem, i, self))
      ARRAY_PUSH(buffer, elem);
  }
  return PA_NEW(buffer.length, buffer, 0);

  /**
   * As described above, our goal is to determine which items we
   * will preserve from a given slice. We do this one chunk at a
   * time. When we finish a chunk, we record our current count and
   * the next chunk position, in case we should bail.
   */
  function findSurvivorsInSlice(sliceId, numSlices, warmup) {
    var chunkPos = info[SLICE_POS(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];

    if (warmup && chunkEnd > chunkPos)
      chunkEnd = chunkPos + 1;

    var count = counts[sliceId];
    while (chunkPos < chunkEnd) {
      var indexStart = chunkPos << CHUNK_SHIFT;
      var indexEnd = std_Math_min(indexStart + CHUNK_SIZE, length);
      var chunkBits = 0;

      for (var bit = 0; indexStart + bit < indexEnd; bit++) {
        var keep = !!func(PA_GET(self, indexStart + bit), indexStart + bit, self);
        chunkBits |= keep << bit;
        count += keep;
      }

      UnsafeSetElement(survivors, chunkPos, chunkBits,
                       counts, sliceId, count,
                       info, SLICE_POS(sliceId), ++chunkPos);
    }
  }

  function copySurvivorsInSlice(sliceId, numSlices, warmup) {
    // Copies the survivors from this slice into the correct position.
    // Note that this is an idempotent operation that does not invoke
    // user code. Therefore, we don't expect bailouts and make no
    // effort to proceed chunk by chunk or avoid duplicating work.

    // During warmup, we only execute with sliceId 0. This would fail to
    // execute the loop below. Therefore, during warmup, we
    // substitute 1 for the sliceId.
    if (warmup && sliceId == 0 && numSlices != 1)
      sliceId = 1;

    // Total up the items preserved by previous slices.
    var count = 0;
    if (sliceId > 0) { // FIXME(#819219)---work around a bug in Ion's range checks
      for (var i = 0; i < sliceId; i++)
        count += counts[i];
    }

    // Compute the final index we expect to write.
    var total = count + counts[sliceId];
    if (count == total)
      return;

    // Iterate over the chunks assigned to us. Read the bitset for
    // each chunk. Copy values where a 1 appears until we have
    // written all the values that we expect to. We can just iterate
    // from 0...CHUNK_SIZE without fear of a truncated final chunk
    // because we are already checking for when count==total.
    var chunkStart = info[SLICE_START(sliceId)];
    var chunkEnd = info[SLICE_END(sliceId)];
    for (var chunk = chunkStart; chunk < chunkEnd; chunk++) {
      var chunkBits = survivors[chunk];
      if (!chunkBits)
        continue;

      var indexStart = chunk << CHUNK_SHIFT;
      for (var i = 0; i < CHUNK_SIZE; i++) {
        if (chunkBits & (1 << i)) {
          UnsafeSetElement(buffer, count++, PA_GET(self, indexStart + i));
          if (count == total)
            break;
        }
      }
    }
  }
}
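
/*
 * Illustrative sketch (not part of this patch): assuming the operation is
 * exposed as |Array.prototype.pfilter| (see the JSFunctionSpec hunk further
 * below), it behaves like the familiar Array.prototype.filter:
 *
 *   var evens = [1, 2, 3, 4, 5, 6].pfilter(function (x) { return x % 2 == 0; });
 *   // evens is [2, 4, 6]. Internally, step 1 records survivors as one
 *   // bitset per chunk (bit i of survivors[c] is set when element
 *   // (c << CHUNK_SHIFT) + i passes |func|), and step 2 copies the
 *   // surviving elements into a dense result of length |count|.
 */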

/**
 * Internal debugging tool: checks that the given |mode| permits
 * sequential execution.
 */
function AssertSequentialIsOK(mode) {
  if (mode && mode.mode && mode.mode !== "seq" && ParallelTestsShouldPass())
    ThrowError(JSMSG_WRONG_VALUE, "parallel execution", "sequential was forced");
}

/**
 * Internal debugging tool: returns a function to be supplied to
 * ForkJoin() that will check that the parallel results
 * bailout/succeed as expected. Returns null if no mode is supplied
 * or we are building with some strange ifdef configuration such that
 * we don't expect parallel execution to work.
 */
function CheckParallel(mode) {
  if (!mode || !ParallelTestsShouldPass())
    return null;

  return function(result, bailouts, causes) {
    if (!("expect" in mode) || mode.expect === "any") {
      return; // Ignore result when unspecified or unimportant.
    } else if (mode.expect === "mixed" && result !== "disqualified") {
      return; // "mixed" means that it may bail out or may succeed.
    } else if (result === mode.expect) {
      return;
    }

    ThrowError(JSMSG_WRONG_VALUE, mode.expect,
               result + ":" + bailouts + ":" + causes);
  };
}
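
/*
 * Illustrative sketch (not part of this patch): the |mode| argument threaded
 * through the operations above is a unit-testing hook. A hypothetical test
 * might pass an object such as:
 *
 *   var mode = { expect: "any" };   // accept either outcome
 *   [1, 2, 3, 4].pfilter(function (x) { return x > 2; }, mode);
 *
 * CheckParallel(mode) then returns a callback for ForkJoin() that throws
 * unless the reported result matches mode.expect ("any" and "mixed" relax
 * the check as described above), and AssertSequentialIsOK(mode) throws if
 * the sequential fallback runs when mode.mode was set to something other
 * than "seq" (and parallel tests are expected to pass).
 */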

/*
 * Mark the main operations as clone-at-callsite for better precision.
 * This is slightly overkill, as all that we really need is to
 * specialize to the receiver and the elemental function, but in
 * practice this is likely not so different, since element functions
 * are often used in exactly one place.
 */
SetScriptHints(PA_MAP_NAME,     { cloneAtCallsite: true });
SetScriptHints(PA_REDUCE_NAME,  { cloneAtCallsite: true });
SetScriptHints(PA_SCAN_NAME,    { cloneAtCallsite: true });
SetScriptHints(PA_SCATTER_NAME, { cloneAtCallsite: true });
SetScriptHints(PA_FILTER_NAME,  { cloneAtCallsite: true });

@ -26,6 +26,7 @@
/* Utility macros */
#define TO_INT32(x) (x | 0)
#define TO_UINT32(x) (x >>> 0)
#define IS_UINT32(x) (x >>> 0 === x)

/* cache built-in functions before applications can change them */
var std_isFinite = isFinite;
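
/*
 * Illustrative note (not part of this patch): IS_UINT32 relies on |>>> 0|
 * mapping a value onto the uint32 range, so the test passes only for
 * integers that survive that mapping unchanged. For example:
 *
 *   IS_UINT32(5);           // (5 >>> 0) === 5        -> true
 *   IS_UINT32(-1);          // (4294967295) === -1    -> false
 *   IS_UINT32(4294967296);  // (0) === 4294967296     -> false
 *   IS_UINT32(1.5);         // (1) === 1.5            -> false
 */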

@ -1,4 +1,4 @@
// |jit-test| error: RangeError
// |jit-test| error: TypeError
//
// Run with --ion-eager.
if (getBuildConfiguration().parallelJS) {

@ -14,5 +14,5 @@ if (getBuildConfiguration().parallelJS) {
  } catch(exc1) {}
  reportCompare();
} else {
  throw new RangeError();
  throw new TypeError();
}

@ -2759,6 +2759,15 @@ static const JSFunctionSpec array_methods[] = {
    {"some",        {NULL, NULL},       1,0, "ArraySome"},
    {"every",       {NULL, NULL},       1,0, "ArrayEvery"},

#ifdef ENABLE_PARALLEL_JS
    /* Parallelizable and pure methods. */
    {"pmap",        {NULL, NULL},       2,0, "ArrayParallelMap"},
    {"preduce",     {NULL, NULL},       2,0, "ArrayParallelReduce"},
    {"pscan",       {NULL, NULL},       2,0, "ArrayParallelScan"},
    {"pscatter",    {NULL, NULL},       5,0, "ArrayParallelScatter"},
    {"pfilter",     {NULL, NULL},       2,0, "ArrayParallelFilter"},
#endif

    JS_FN("iterator",   JS_ArrayIterator,   0,0),
    JS_FS_END
};

@ -2773,6 +2782,12 @@ static const JSFunctionSpec array_static_methods[] = {
    {"some",        {NULL, NULL},       2,0, "ArrayStaticSome"},
    {"reduce",      {NULL, NULL},       2,0, "ArrayStaticReduce"},
    {"reduceRight", {NULL, NULL},       2,0, "ArrayStaticReduceRight"},

#ifdef ENABLE_PARALLEL_JS
    /* Parallelizable and pure static methods. */
    {"pbuild",      {NULL, NULL},       3,0, "ArrayStaticParallelBuild"},
#endif

    JS_FS_END
};