QuantileEstimator.GetProbLessThan uses empirical cdf

This commit is contained in:
Tom Minka 2018-10-29 15:50:53 +00:00
Родитель c966b93213
Коммит 0114565ad3
3 изменённых файлов: 116 добавлений и 45 удалений

Просмотреть файл

@ -111,7 +111,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
/// <param name="upperItem"></param>
/// <param name="n"></param>
/// <returns></returns>
internal static double GetQuantile(double probability, long lowerIndex, double lowerItem, double upperItem, long n)
internal static double GetQuantile(double probability, double lowerIndex, double lowerItem, double upperItem, long n)
{
if (n == 1) throw new ArgumentOutOfRangeException("n == 1");
double pos = MMath.LargestDoubleProduct(n - 1, probability);

Просмотреть файл

@ -53,6 +53,13 @@ namespace Microsoft.ML.Probabilistic.Distributions
[DataMember]
public readonly double MaximumError;
/// <summary>
/// 0 = point mass on each data point (i/n)
/// 1 = interpolate (i/n + (i+1)/n)/2 = (i+0.5)/n
/// 2 = interpolate i/(n-1)
/// </summary>
private static int InterpolationType = 0;
/// <summary>
/// Creates a new QuantileEstimator.
/// </summary>
@ -79,15 +86,32 @@ namespace Microsoft.ML.Probabilistic.Distributions
{
double lowerItem;
double upperItem;
long lowerIndex;
long lowerRank;
long lowerWeight, minLowerWeight, upperWeight, minUpperWeight;
long itemCount;
GetAdjacentItems(x, out lowerItem, out upperItem, out lowerIndex, out lowerWeight, out upperWeight, out minLowerWeight, out minUpperWeight, out itemCount);
if (lowerIndex < 0) return 0;
if (x == lowerItem) return (double)(lowerIndex - lowerWeight + 1) / (itemCount - 1);
// interpolate between the ranks of lowerItem and upperItem
double frac = (x - lowerItem) / (upperItem - lowerItem);
return (lowerIndex + frac) / (itemCount - 1);
GetAdjacentItems(x, out lowerItem, out upperItem, out lowerRank, out lowerWeight, out upperWeight, out minLowerWeight, out minUpperWeight, out itemCount);
if (lowerRank == 0) return 0;
if (lowerRank == itemCount) return 1;
if (InterpolationType == 0)
{
return (double)lowerRank / itemCount;
}
else if (InterpolationType == 1)
{
// interpolate between the ranks of lowerItem and upperItem
// lowerItem has rank (lowerRank - 0.5) from above
// upperItem has rank (lowerRank + 0.5) from below
double frac = (x - lowerItem) / (upperItem - lowerItem);
return (lowerRank - 0.5 + frac) / itemCount;
}
else
{
// interpolate between the ranks of lowerItem and upperItem
// lowerItem has rank (lowerRank - 1)/(itemCount-1) from above
// upperItem has rank lowerRank/(itemCount-1) from below
double frac = (x - lowerItem) / (upperItem - lowerItem);
return (lowerRank - 1 + frac) / (itemCount - 1);
}
}
/// <summary>
@ -113,6 +137,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
}
}
if (probability == 1.0) return MMath.NextDouble(upperBound);
if (probability == 0.0) return lowerBound;
if (double.IsPositiveInfinity(lowerBound)) throw new Exception("QuantileEstimator has no data");
if (lowerBound == upperBound) return upperBound;
// use bisection
@ -121,32 +146,83 @@ namespace Microsoft.ML.Probabilistic.Distributions
double x = (lowerBound + upperBound) / 2;
double lowerItem;
double upperItem;
long lowerIndex;
long lowerRank;
long lowerWeight, minLowerWeight, upperWeight, minUpperWeight;
long itemCount;
GetAdjacentItems(x, out lowerItem, out upperItem, out lowerIndex, out lowerWeight, out upperWeight, out minLowerWeight, out minUpperWeight, out itemCount);
double scaledProbability = MMath.LargestDoubleProduct(itemCount - 1, probability);
// probability of lowerItem ranges from (lowerIndex - lowerWeight + 1) / (itemCount - 1)
// to lowerIndex / (itemCount - 1).
if ((scaledProbability >= lowerIndex - lowerWeight + 1) && (scaledProbability < lowerIndex))
return lowerItem;
// probability of upperItem ranges from (lowerIndex + 1) / (itemCount - 1)
// to (lowerIndex + upperWeight) / (itemCount - 1)
if ((scaledProbability >= lowerIndex + 1) && (scaledProbability <= lowerIndex + upperWeight))
return upperItem;
// solve for frac in (lowerIndex + frac) / (itemCount - 1) == probability
double frac = scaledProbability - lowerIndex;
if (frac < 0)
GetAdjacentItems(x, out lowerItem, out upperItem, out lowerRank, out lowerWeight, out upperWeight, out minLowerWeight, out minUpperWeight, out itemCount);
if (InterpolationType == 0)
{
upperBound = x;
double probabilityLessThanLowerItem = (double)(lowerRank - lowerWeight) / itemCount;
if (probability < probabilityLessThanLowerItem)
{
upperBound = lowerItem;
}
else
{
double probabilityLessThanUpperItem = (double)lowerRank / itemCount;
if (probability < probabilityLessThanUpperItem) return lowerItem;
double probabilityLessThanOrEqualUpperItem = (double)(lowerRank + upperWeight) / itemCount;
if (probability < probabilityLessThanOrEqualUpperItem) return upperItem;
lowerBound = upperItem;
}
}
else if (frac > 1)
else if (InterpolationType == 1)
{
lowerBound = x;
// Find frac such that (lowerRank - 0.5 + frac) / itemCount == probability
double scaledProbability = MMath.LargestDoubleProduct(itemCount, probability);
if (scaledProbability < 0.5) return lowerBound;
if (scaledProbability >= itemCount - 0.5) return upperBound;
// probability of lowerItem ranges from (lowerRank-lowerWeight+0.5) / itemCount
// to (lowerRank - 0.5) / itemCount
//if (scaledProbability == lowerRank - lowerWeight + 0.5) return lowerItem;
if ((scaledProbability > lowerRank - lowerWeight + 0.5) && (scaledProbability < lowerRank - 0.5))
return lowerItem;
// probability of upperItem ranges from (lowerRank + 0.5) / itemCount
// to (lowerRank + upperWeight - 0.5) / itemCount
if (scaledProbability == lowerRank+0.5) return upperItem;
if ((scaledProbability > lowerRank+0.5) && (scaledProbability < lowerRank + upperWeight - 0.5))
return upperItem;
double frac = scaledProbability - (lowerRank - 0.5);
if (frac < 0)
{
upperBound = x;
}
else if (frac > 1)
{
lowerBound = x;
}
else
{
return OuterQuantiles.GetQuantile(probability, lowerRank - 0.5, lowerItem, upperItem, itemCount+1);
}
}
else
{
return OuterQuantiles.GetQuantile(probability, lowerIndex, lowerItem, upperItem, itemCount);
double scaledProbability = MMath.LargestDoubleProduct(itemCount - 1, probability);
// probability of lowerItem ranges from (lowerRank-lowerWeight) / (itemCount - 1)
// to (lowerRank - 1) / (itemCount - 1).
if (scaledProbability == lowerRank - lowerWeight) return lowerItem;
if ((scaledProbability > lowerRank - lowerWeight) && (scaledProbability < lowerRank - 1))
return lowerItem;
// probability of upperItem ranges from lowerRank / (itemCount - 1)
// to (lowerRank + upperWeight-1) / (itemCount - 1)
if (scaledProbability == lowerRank) return upperItem;
if ((scaledProbability > lowerRank) && (scaledProbability < lowerRank + upperWeight - 1))
return upperItem;
// solve for frac in (lowerRank-1 + frac) / (itemCount - 1) == probability
double frac = scaledProbability - (lowerRank - 1);
if (frac < 0)
{
upperBound = x;
}
else if (frac > 1)
{
lowerBound = x;
}
else
{
return OuterQuantiles.GetQuantile(probability, lowerRank - 1, lowerItem, upperItem, itemCount);
}
}
}
}
@ -255,17 +331,17 @@ namespace Microsoft.ML.Probabilistic.Distributions
///
/// </summary>
/// <param name="x"></param>
/// <param name="lowerItem">The largest item that is less than or equal to x.</param>
/// <param name="upperItem">The smallest item that is greater than x.</param>
/// <param name="lowerIndex"></param>
/// <param name="lowerWeight">The total weight of items equal to lowerItem, excluding the item with lowest weight.</param>
/// <param name="lowerItem">The largest item that is less than x.</param>
/// <param name="upperItem">The smallest item that is greater than or equal to x.</param>
/// <param name="lowerRank">The total weight of items less than x.</param>
/// <param name="lowerWeight">The total weight of items equal to lowerItem.</param>
/// <param name="upperWeight">The total weight of items equal to upperItem.</param>
/// <param name="minLowerWeight">The smallest weight of items equal to lowerItem.</param>
/// <param name="minUpperWeight">The smallest weight of items equal to upperItem.</param>
/// <param name="itemCount"></param>
private void GetAdjacentItems(double x, out double lowerItem, out double upperItem, out long lowerIndex, out long lowerWeight, out long upperWeight, out long minLowerWeight, out long minUpperWeight, out long itemCount)
private void GetAdjacentItems(double x, out double lowerItem, out double upperItem, out long lowerRank, out long lowerWeight, out long upperWeight, out long minLowerWeight, out long minUpperWeight, out long itemCount)
{
long lowerRank = 0;
lowerRank = 0;
long weight = (1L << lowestBufferHeight);
itemCount = 0;
lowerItem = double.NegativeInfinity;
@ -282,7 +358,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
for (int i = 0; i < count; i++)
{
double item = buffer[i];
if (item <= x)
if (item < x)
{
lowerRank += weight;
if (item > lowerItem)
@ -316,7 +392,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
if (bufferIndex == lowestBufferIndex) break;
}
if (itemCount == 0) throw new Exception("QuantileEstimator has no data");
lowerIndex = lowerRank - 1;
}
private void AddToBuffer(int bufferIndex, double item)

Просмотреть файл

@ -165,10 +165,10 @@ namespace Microsoft.ML.Probabilistic.Tests
CheckGetQuantile(inner, inner, (int)Math.Ceiling(100.0/8), (int)Math.Floor(100.0*7/8));
var est = new QuantileEstimator(0.01);
est.AddRange(x);
Assert.Equal(0.25, est.GetProbLessThan(middle));
Assert.Equal(est.GetQuantile(0.3), middle);
Assert.Equal(est.GetQuantile(0.5), middle);
Assert.Equal(est.GetQuantile(0.7), middle);
// InterpolationType==1 returns NextDouble(middle)
Assert.Equal(est.GetQuantile(0.7), middle, 1e-15);
CheckGetQuantile(est, est);
}
@ -199,8 +199,6 @@ namespace Microsoft.ML.Probabilistic.Tests
Assert.Equal(0.0, est.GetProbLessThan(first));
Assert.Equal(first, est.GetQuantile(0.0));
Assert.Equal(0.5, est.GetProbLessThan(between));
Assert.Equal(between, est.GetQuantile(0.5));
Assert.Equal(2.0 / 3, est.GetProbLessThan(second));
Assert.Equal(second, est.GetQuantile(2.0 / 3));
Assert.Equal(1.0, est.GetProbLessThan(next));
Assert.Equal(next, est.GetQuantile(1.0));
@ -278,12 +276,10 @@ namespace Microsoft.ML.Probabilistic.Tests
[Fact]
public void QuantileEstimatorTest()
{
foreach (int n in new[] { 20, 10000 })
foreach(double maximumError in new[] { 0.05, 0.01, 0.005, 0.001 })
{
QuantileEstimator(0.05, n);
QuantileEstimator(0.01, n);
QuantileEstimator(0.005, n);
QuantileEstimator(0.001, n);
int n = (int)(2.0 / maximumError);
QuantileEstimator(maximumError, n);
}
}
@ -302,7 +298,7 @@ namespace Microsoft.ML.Probabilistic.Tests
x.Add(sample);
est.Add(sample);
}
CheckProbLessThan(est, x, (n < 40) ? 0 : maximumError);
CheckProbLessThan(est, x, maximumError);
}
private void CheckProbLessThan(CanGetProbLessThan canGetProbLessThan, List<double> x, double maximumError)
@ -324,7 +320,7 @@ namespace Microsoft.ML.Probabilistic.Tests
maxError = System.Math.Max(maxError, error);
}
Console.WriteLine($"max rank error = {maxError}");
Assert.True(maxError <= 2 * maximumError);
Assert.True(maxError <= maximumError);
}
[Fact]