d3/lib/science/science.stats.js

720 строки
18 KiB
JavaScript
Исходник Ответственный История

Этот файл содержит невидимые символы Юникода!

Этот файл содержит невидимые символы Юникода, которые могут быть отображены не так, как показано ниже. Если это намеренно, можете спокойно проигнорировать это предупреждение. Используйте кнопку Экранировать, чтобы показать скрытые символы.

(function(){science.stats = {};
// Bandwidth selectors for Gaussian kernels.
// Based on R's implementations in `stats.bw`.
science.stats.bandwidth = {
// Silverman, B. W. (1986) Density Estimation. London: Chapman and Hall.
nrd0: function(x) {
var hi = Math.sqrt(science.stats.variance(x));
if (!(lo = Math.min(hi, science.stats.iqr(x) / 1.34)))
(lo = hi) || (lo = Math.abs(x[1])) || (lo = 1);
return .9 * lo * Math.pow(x.length, -.2);
},
// Scott, D. W. (1992) Multivariate Density Estimation: Theory, Practice, and
// Visualization. Wiley.
nrd: function(x) {
var h = science.stats.iqr(x) / 1.34;
return 1.06 * Math.min(Math.sqrt(science.stats.variance(x)), h)
* Math.pow(x.length, -1/5);
}
};
science.stats.distance = {
euclidean: function(a, b) {
var n = a.length,
i = -1,
s = 0,
x;
while (++i < n) {
x = a[i] - b[i];
s += x * x;
}
return Math.sqrt(s);
},
manhattan: function(a, b) {
var n = a.length,
i = -1,
s = 0;
while (++i < n) s += Math.abs(a[i] - b[i]);
return s;
},
minkowski: function(p) {
return function(a, b) {
var n = a.length,
i = -1,
s = 0;
while (++i < n) s += Math.pow(Math.abs(a[i] - b[i]), p);
return Math.pow(s, 1 / p);
};
},
chebyshev: function(a, b) {
var n = a.length,
i = -1,
max = 0,
x;
while (++i < n) {
x = Math.abs(a[i] - b[i]);
if (x > max) max = x;
}
return max;
},
hamming: function(a, b) {
var n = a.length,
i = -1,
d = 0;
while (++i < n) if (a[i] !== b[i]) d++;
return d;
},
jaccard: function(a, b) {
var n = a.length,
i = -1,
s = 0;
while (++i < n) if (a[i] === b[i]) s++;
return s / n;
},
braycurtis: function(a, b) {
var n = a.length,
i = -1,
s0 = 0,
s1 = 0,
ai,
bi;
while (++i < n) {
ai = a[i];
bi = b[i];
s0 += Math.abs(ai - bi);
s1 += Math.abs(ai + bi);
}
return s0 / s1;
}
};
// Based on implementation in http://picomath.org/.
science.stats.erf = function(x) {
var a1 = 0.254829592,
a2 = -0.284496736,
a3 = 1.421413741,
a4 = -1.453152027,
a5 = 1.061405429,
p = 0.3275911;
// Save the sign of x
var sign = x < 0 ? -1 : 1;
if (x < 0) {
sign = -1;
x = -x;
}
// A&S formula 7.1.26
var t = 1 / (1 + p * x);
return sign * (
1 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1)
* t * Math.exp(-x * x));
};
science.stats.phi = function(x) {
return .5 * (1 + science.stats.erf(x / Math.SQRT2));
};
// See <http://en.wikipedia.org/wiki/Kernel_(statistics)>.
science.stats.kernel = {
uniform: function(u) {
if (u <= 1 && u >= -1) return .5;
return 0;
},
triangular: function(u) {
if (u <= 1 && u >= -1) return 1 - Math.abs(u);
return 0;
},
epanechnikov: function(u) {
if (u <= 1 && u >= -1) return .75 * (1 - u * u);
return 0;
},
quartic: function(u) {
if (u <= 1 && u >= -1) {
var tmp = 1 - u * u;
return (15 / 16) * tmp * tmp;
}
return 0;
},
triweight: function(u) {
if (u <= 1 && u >= -1) {
var tmp = 1 - u * u;
return (35 / 32) * tmp * tmp * tmp;
}
return 0;
},
gaussian: function(u) {
return 1 / Math.sqrt(2 * Math.PI) * Math.exp(-.5 * u * u);
},
cosine: function(u) {
if (u <= 1 && u >= -1) return Math.PI / 4 * Math.cos(Math.PI / 2 * u);
return 0;
}
};
// http://exploringdata.net/den_trac.htm
science.stats.kde = function() {
var kernel = science.stats.kernel.gaussian,
sample = [],
bandwidth = science.stats.bandwidth.nrd;
function kde(points, i) {
var bw = bandwidth.call(this, sample);
return points.map(function(x) {
var i = -1,
y = 0,
n = sample.length;
while (++i < n) {
y += kernel((x - sample[i]) / bw);
}
return [x, y / bw / n];
});
}
kde.kernel = function(x) {
if (!arguments.length) return kernel;
kernel = x;
return kde;
};
kde.sample = function(x) {
if (!arguments.length) return sample;
sample = x;
return kde;
};
kde.bandwidth = function(x) {
if (!arguments.length) return bandwidth;
bandwidth = science.functor(x);
return kde;
};
return kde;
};
// Based on figue implementation by Jean-Yves Delort.
// http://code.google.com/p/figue/
science.stats.kmeans = function() {
var distance = science.stats.distance.euclidean,
maxIterations = 1000,
k = 1;
function kmeans(vectors) {
var n = vectors.length,
assignments = [],
clusterSizes = [],
repeat = 1,
iterations = 0,
centroids = science_stats_kmeansRandom(k, vectors),
newCentroids,
i,
j,
x,
d,
min,
best;
while (repeat && iterations < maxIterations) {
// Assignment step.
j = -1; while (++j < k) {
clusterSizes[j] = 0;
}
i = -1; while (++i < n) {
x = vectors[i];
min = Infinity;
j = -1; while (++j < k) {
d = distance.call(this, centroids[j], x);
if (d < min) {
min = d;
best = j;
}
}
clusterSizes[assignments[i] = best]++;
}
// Update centroids step.
newCentroids = [];
i = -1; while (++i < n) {
x = assignments[i];
d = newCentroids[x];
if (d == null) newCentroids[x] = vectors[i].slice();
else {
j = -1; while (++j < d.length) {
d[j] += vectors[i][j];
}
}
}
j = -1; while (++j < k) {
x = newCentroids[j];
d = 1 / clusterSizes[j];
i = -1; while (++i < x.length) x[i] *= d;
}
// Check convergence.
repeat = 0;
j = -1; while (++j < k) {
if (!science_stats_kmeansCompare(newCentroids[j], centroids[j])) {
repeat = 1;
break;
}
}
centroids = newCentroids;
iterations++;
}
return {assignments: assignments, centroids: centroids};
}
kmeans.k = function(x) {
if (!arguments.length) return k;
k = x;
return kmeans;
};
kmeans.distance = function(x) {
if (!arguments.length) return distance;
distance = x;
return kmeans;
};
return kmeans;
};
function science_stats_kmeansCompare(a, b) {
if (!a || !b || a.length !== b.length) return false;
var n = a.length,
i = -1;
while (++i < n) if (a[i] !== b[i]) return false;
return true;
}
// Returns an array of k distinct vectors randomly selected from the input
// array of vectors. Returns null if k > n or if there are less than k distinct
// objects in vectors.
function science_stats_kmeansRandom(k, vectors) {
var n = vectors.length;
if (k > n) return null;
var selected_vectors = [];
var selected_indices = [];
var tested_indices = {};
var tested = 0;
var selected = 0;
var i,
vector,
select;
while (selected < k) {
if (tested === n) return null;
var random_index = Math.floor(Math.random() * n);
if (random_index in tested_indices) continue;
tested_indices[random_index] = 1;
tested++;
vector = vectors[random_index];
select = true;
for (i = 0; i < selected; i++) {
if (science_stats_kmeansCompare(vector, selected_vectors[i])) {
select = false;
break;
}
}
if (select) {
selected_vectors[selected] = vector;
selected_indices[selected] = random_index;
selected++;
}
}
return selected_vectors;
}
science.stats.hcluster = function() {
var distance = science.stats.distance.euclidean,
linkage = "simple"; // simple, complete or average
function hcluster(vectors) {
var n = vectors.length,
dMin = [],
cSize = [],
distMatrix = [],
clusters = [],
c1,
c2,
c1Cluster,
c2Cluster,
p,
root,
i,
j;
// Initialise distance matrix and vector of closest clusters.
i = -1; while (++i < n) {
dMin[i] = 0;
distMatrix[i] = [];
j = -1; while (++j < n) {
distMatrix[i][j] = i === j ? Infinity : distance(vectors[i] , vectors[j]);
if (distMatrix[i][dMin[i]] > distMatrix[i][j]) dMin[i] = j;
}
}
// create leaves of the tree
i = -1; while (++i < n) {
clusters[i] = [];
clusters[i][0] = {
left: null,
right: null,
dist: 0,
centroid: vectors[i],
size: 1,
depth: 0
};
cSize[i] = 1;
}
// Main loop
for (p = 0; p < n-1; p++) {
// find the closest pair of clusters
c1 = 0;
for (i = 0; i < n; i++) {
if (distMatrix[i][dMin[i]] < distMatrix[c1][dMin[c1]]) c1 = i;
}
c2 = dMin[c1];
// create node to store cluster info
c1Cluster = clusters[c1][0];
c2Cluster = clusters[c2][0];
newCluster = {
left: c1Cluster,
right: c2Cluster,
dist: distMatrix[c1][c2],
centroid: calculateCentroid(c1Cluster.size, c1Cluster.centroid,
c2Cluster.size, c2Cluster.centroid),
size: c1Cluster.size + c2Cluster.size,
depth: 1 + Math.max(c1Cluster.depth, c2Cluster.depth)
};
clusters[c1].splice(0, 0, newCluster);
cSize[c1] += cSize[c2];
// overwrite row c1 with respect to the linkage type
for (j = 0; j < n; j++) {
switch (linkage) {
case "single":
if (distMatrix[c1][j] > distMatrix[c2][j])
distMatrix[j][c1] = distMatrix[c1][j] = distMatrix[c2][j];
break;
case "complete":
if (distMatrix[c1][j] < distMatrix[c2][j])
distMatrix[j][c1] = distMatrix[c1][j] = distMatrix[c2][j];
break;
case "average":
distMatrix[j][c1] = distMatrix[c1][j] = (cSize[c1] * distMatrix[c1][j] + cSize[c2] * distMatrix[c2][j]) / (cSize[c1] + cSize[j]);
break;
}
}
distMatrix[c1][c1] = Infinity;
// infinity ­out old row c2 and column c2
for (i = 0; i < n; i++)
distMatrix[i][c2] = distMatrix[c2][i] = Infinity;
// update dmin and replace ones that previous pointed to c2 to point to c1
for (j = 0; j < n; j++) {
if (dMin[j] == c2) dMin[j] = c1;
if (distMatrix[c1][j] < distMatrix[c1][dMin[c1]]) dMin[c1] = j;
}
// keep track of the last added cluster
root = newCluster;
}
return root;
}
hcluster.distance = function(x) {
if (!arguments.length) return distance;
distance = x;
return hcluster;
};
return hcluster;
};
function calculateCentroid(c1Size, c1Centroid, c2Size, c2Centroid) {
var newCentroid = [],
newSize = c1Size + c2Size,
n = c1Centroid.length,
i = -1;
while (++i < n) {
newCentroid[i] = (c1Size * c1Centroid[i] + c2Size * c2Centroid[i]) / newSize;
}
return newCentroid;
}
science.stats.iqr = function(x) {
var quartiles = science.stats.quantiles(x, [.25, .75]);
return quartiles[1] - quartiles[0];
};
// Based on org.apache.commons.math.analysis.interpolation.LoessInterpolator
// from http://commons.apache.org/math/
science.stats.loess = function() {
var bandwidth = .3,
robustnessIters = 2,
accuracy = 1e-12;
function smooth(xval, yval, weights) {
var n = xval.length,
i;
if (n !== yval.length) throw {error: "Mismatched array lengths"};
if (n == 0) throw {error: "At least one point required."};
if (arguments.length < 3) {
weights = [];
i = -1; while (++i < n) weights[i] = 1;
}
science_stats_loessFiniteReal(xval);
science_stats_loessFiniteReal(yval);
science_stats_loessFiniteReal(weights);
science_stats_loessStrictlyIncreasing(xval);
if (n == 1) return [yval[0]];
if (n == 2) return [yval[0], yval[1]];
var bandwidthInPoints = Math.floor(bandwidth * n);
if (bandwidthInPoints < 2) throw {error: "Bandwidth too small."};
var res = [],
residuals = [],
robustnessWeights = [];
// Do an initial fit and 'robustnessIters' robustness iterations.
// This is equivalent to doing 'robustnessIters+1' robustness iterations
// starting with all robustness weights set to 1.
i = -1; while (++i < n) {
res[i] = 0;
residuals[i] = 0;
robustnessWeights[i] = 1;
}
var iter = -1;
while (++iter <= robustnessIters) {
var bandwidthInterval = [0, bandwidthInPoints - 1];
// At each x, compute a local weighted linear regression
var x;
i = -1; while (++i < n) {
x = xval[i];
// Find out the interval of source points on which
// a regression is to be made.
if (i > 0) {
science_stats_loessUpdateBandwidthInterval(xval, weights, i, bandwidthInterval);
}
var ileft = bandwidthInterval[0],
iright = bandwidthInterval[1];
// Compute the point of the bandwidth interval that is
// farthest from x
var edge = (xval[i] - xval[ileft]) > (xval[iright] - xval[i]) ? ileft : iright;
// Compute a least-squares linear fit weighted by
// the product of robustness weights and the tricube
// weight function.
// See http://en.wikipedia.org/wiki/Linear_regression
// (section "Univariate linear case")
// and http://en.wikipedia.org/wiki/Weighted_least_squares
// (section "Weighted least squares")
var sumWeights = 0,
sumX = 0,
sumXSquared = 0,
sumY = 0,
sumXY = 0,
denom = Math.abs(1 / (xval[edge] - x));
for (var k = ileft; k <= iright; ++k) {
var xk = xval[k],
yk = yval[k],
dist = k < i ? x - xk : xk - x,
w = science_stats_loessTricube(dist * denom) * robustnessWeights[k] * weights[k],
xkw = xk * w;
sumWeights += w;
sumX += xkw;
sumXSquared += xk * xkw;
sumY += yk * w;
sumXY += yk * xkw;
}
var meanX = sumX / sumWeights,
meanY = sumY / sumWeights,
meanXY = sumXY / sumWeights,
meanXSquared = sumXSquared / sumWeights;
var beta = (Math.sqrt(Math.abs(meanXSquared - meanX * meanX)) < accuracy)
? 0 : ((meanXY - meanX * meanY) / (meanXSquared - meanX * meanX));
var alpha = meanY - beta * meanX;
res[i] = beta * x + alpha;
residuals[i] = Math.abs(yval[i] - res[i]);
}
// No need to recompute the robustness weights at the last
// iteration, they won't be needed anymore
if (iter === robustnessIters) {
break;
}
// Recompute the robustness weights.
// Find the median residual.
var sortedResiduals = residuals.slice();
sortedResiduals.sort();
var medianResidual = sortedResiduals[Math.floor(n / 2)];
if (Math.abs(medianResidual) < accuracy)
break;
var arg,
w;
i = -1; while (++i < n) {
arg = residuals[i] / (6 * medianResidual);
robustnessWeights[i] = (arg >= 1) ? 0 : ((w = 1 - arg * arg) * w);
}
}
return res;
}
smooth.bandwidth = function(x) {
if (!arguments.length) return x;
bandwidth = x;
return smooth;
};
smooth.robustnessIterations = function(x) {
if (!arguments.length) return x;
robustnessIters = x;
return smooth;
};
smooth.accuracy = function(x) {
if (!arguments.length) return x;
accuracy = x;
return smooth;
};
return smooth;
};
function science_stats_loessFiniteReal(values) {
var n = values.length,
i = -1;
while (++i < n) if (!isFinite(values[i])) return false;
return true;
}
function science_stats_loessStrictlyIncreasing(xval) {
var n = xval.length,
i = 0;
while (++i < n) if (xval[i - 1] >= xval[i]) return false;
return true;
}
// Compute the tricube weight function.
// http://en.wikipedia.org/wiki/Local_regression#Weight_function
function science_stats_loessTricube(x) {
return (x = 1 - x * x * x) * x * x;
}
// Given an index interval into xval that embraces a certain number of
// points closest to xval[i-1], update the interval so that it embraces
// the same number of points closest to xval[i], ignoring zero weights.
function science_stats_loessUpdateBandwidthInterval(
xval, weights, i, bandwidthInterval) {
var left = bandwidthInterval[0],
right = bandwidthInterval[1];
// The right edge should be adjusted if the next point to the right
// is closer to xval[i] than the leftmost point of the current interval
var nextRight = science_stats_loessNextNonzero(weights, right);
if ((nextRight < xval.length) && (xval[nextRight] - xval[i]) < (xval[i] - xval[left])) {
var nextLeft = science_stats_loessNextNonzero(weights, left);
bandwidthInterval[0] = nextLeft;
bandwidthInterval[1] = nextRight;
}
}
function science_stats_loessNextNonzero(weights, i) {
var j = i + 1;
while (j < weights.length && weights[j] === 0) j++;
return j;
}
// Welford's algorithm.
science.stats.mean = function(x) {
var n = x.length;
if (n === 0) return NaN;
var m = 0,
i = -1;
while (++i < n) m += (x[i] - m) / (i + 1);
return m;
};
science.stats.median = function(x) {
return science.stats.quantiles(x, [.5])[0];
};
science.stats.mode = function(x) {
x = x.slice().sort(science.ascending);
var mode,
n = x.length,
i = -1,
l = i,
last = null,
max = 0,
tmp,
v;
while (++i < n) {
if ((v = x[i]) !== last) {
if ((tmp = i - l) > max) {
max = tmp;
mode = last;
}
last = v;
l = i;
}
}
return mode;
};
// Uses R's quantile algorithm type=7.
science.stats.quantiles = function(d, quantiles) {
d = d.slice().sort(science.ascending);
var n_1 = d.length - 1;
return quantiles.map(function(q) {
if (q === 0) return d[0];
else if (q === 1) return d[n_1];
var index = 1 + q * n_1,
lo = Math.floor(index),
h = index - lo,
a = d[lo - 1];
return h === 0 ? a : a + h * (d[lo] - a);
});
};
// Unbiased estimate of a sample's variance.
// Also known as the sample variance, where the denominator is n - 1.
science.stats.variance = function(x) {
var n = x.length;
if (n < 1) return NaN;
if (n === 1) return 0;
var mean = science.stats.mean(x),
i = -1,
s = 0;
while (++i < n) {
var v = x[i] - mean;
s += v * v;
}
return s / (n - 1);
};
})()