зеркало из https://github.com/dotnet/infer.git
Remove DiscreteChar.ProbabilityOutsideRanges (#188)
A lot of code existed that had to treat `ProbabilityOutsideRanges` in a special way. Now, in cases where `ProbabilityOutsideRanges` was non-zero, the missing ranges are added to cover the whole char domain. `ProbabilityOutsideRanges` had one useful property: it needed half as many ranges to represent distributions covering the whole domain. This property is never used in real code (and even if it were, it would be in very few places), so reducing code complexity trumps the small performance/space gain.
This commit is contained in:
Родитель
b74d02b98d
Коммит
419419327d
|
@ -132,8 +132,6 @@ namespace Microsoft.ML.Probabilistic.Distributions.Automata
|
|||
{
|
||||
var distribution = transition.ElementDistribution.Value;
|
||||
var ranges = distribution.Ranges;
|
||||
var commonValueStart = (int)char.MinValue;
|
||||
var commonValue = distribution.ProbabilityOutsideRanges;
|
||||
var weightBase = transition.Weight * sourceStateResidualWeight;
|
||||
|
||||
void AddEndPoints(int start, int end, int destinationIndex, Weight weight)
|
||||
|
@ -144,24 +142,12 @@ namespace Microsoft.ML.Probabilistic.Distributions.Automata
|
|||
|
||||
foreach (var range in ranges)
|
||||
{
|
||||
if (range.StartInclusive > commonValueStart && !commonValue.IsZero)
|
||||
{
|
||||
AddEndPoints(commonValueStart, range.StartInclusive, transition.DestinationStateIndex, commonValue);
|
||||
}
|
||||
|
||||
// Add segment endpoints
|
||||
var pieceValue = range.Probability;
|
||||
if (!pieceValue.IsZero)
|
||||
{
|
||||
AddEndPoints(range.StartInclusive, range.EndExclusive, transition.DestinationStateIndex, pieceValue);
|
||||
}
|
||||
|
||||
commonValueStart = range.EndExclusive;
|
||||
}
|
||||
|
||||
if (!commonValue.IsZero && (ranges.Count == 0 || ranges[ranges.Count - 1].EndExclusive != DiscreteChar.CharRangeEndExclusive))
|
||||
{
|
||||
AddEndPoints(commonValueStart, char.MaxValue + 1, transition.DestinationStateIndex, commonValue);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -2,26 +2,23 @@
|
|||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Runtime.CompilerServices;
|
||||
using Microsoft.ML.Probabilistic.Distributions.Automata;
|
||||
|
||||
namespace Microsoft.ML.Probabilistic.Distributions
|
||||
{
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Collections.Specialized;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Text;
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
using Collections;
|
||||
using Math;
|
||||
using Utilities;
|
||||
using Factors.Attributes;
|
||||
using Microsoft.ML.Probabilistic.Distributions.Automata;
|
||||
using Microsoft.ML.Probabilistic.Collections;
|
||||
using Microsoft.ML.Probabilistic.Math;
|
||||
using Microsoft.ML.Probabilistic.Utilities;
|
||||
using Microsoft.ML.Probabilistic.Factors.Attributes;
|
||||
|
||||
using Serialization;
|
||||
using Microsoft.ML.Probabilistic.Serialization;
|
||||
|
||||
/// <summary>
|
||||
/// Represents a distribution over characters.
|
||||
|
@ -135,17 +132,16 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="DiscreteChar"/> class
|
||||
/// with a given list of constant probability character ranges and the probability of characters outside ranges.
|
||||
/// with a given list of constant probability character ranges.
|
||||
/// </summary>
|
||||
/// <param name="probabilityOutsideRanges">The probability of characters outside the given ranges.</param>
|
||||
/// <param name="ranges">The constant-probability character ranges.</param>
|
||||
/// <param name="rangeCount">The number of valid elements in the <paramref name="ranges"/> array.</param>
|
||||
/// <remarks>
|
||||
/// The probabilities need to be normalized. The character ranges need to be sorted.
|
||||
/// The created objects takes ownership of the character range list.
|
||||
/// </remarks>
|
||||
private DiscreteChar(Weight probabilityOutsideRanges, ReadOnlyArray<CharRange> ranges, int rangeCount) =>
|
||||
this.data_ = Storage.Create(ranges, probabilityOutsideRanges);
|
||||
private DiscreteChar(ReadOnlyArray<CharRange> ranges, int rangeCount) =>
|
||||
this.data_ = Storage.Create(ranges);
|
||||
|
||||
private DiscreteChar(Storage storage) => this.data_ = storage;
|
||||
|
||||
|
@ -191,11 +187,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
|
||||
#region Distribution properties
|
||||
|
||||
/// <summary>
|
||||
/// Gets the probability assigned to characters outside ranges returned by <see cref="Ranges"/>.
|
||||
/// </summary>
|
||||
public Weight ProbabilityOutsideRanges => this.Data.ProbabilityOutsideRanges;
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the point mass represented by the distribution.
|
||||
/// </summary>
|
||||
|
@ -224,18 +215,17 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
#region Factory methods
|
||||
|
||||
/// <summary>
|
||||
/// Creates a distribution given a list of constant probability character ranges and the probability of characters outside those ranges.
|
||||
/// Creates a distribution given a list of constant probability character ranges.
|
||||
/// </summary>
|
||||
/// <param name="probabilityOutsideRanges">The probability of characters outside the given ranges.</param>
|
||||
/// <param name="ranges">The constant-probability character ranges.</param>
|
||||
/// <remarks>The probabilities do not need to be normalized. The character ranges do not need to be sorted.</remarks>
|
||||
/// <returns>The created distribution.</returns>
|
||||
[Construction("ProbabilityOutsideRanges", "Ranges")]
|
||||
public static DiscreteChar Create(Weight probabilityOutsideRanges, IEnumerable<CharRange> ranges)
|
||||
[Construction("Ranges")]
|
||||
public static DiscreteChar Create(IEnumerable<CharRange> ranges)
|
||||
{
|
||||
Argument.CheckIfNotNull(ranges, "ranges");
|
||||
|
||||
var builder = new StorageBuilder(probabilityOutsideRanges);
|
||||
var builder = StorageBuilder.Create();
|
||||
foreach (var range in ranges)
|
||||
{
|
||||
builder.AddRange(range);
|
||||
|
@ -366,7 +356,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
public static DiscreteChar UniformOver(IEnumerable<char> chars)
|
||||
{
|
||||
Argument.CheckIfNotNull(chars, nameof(chars));
|
||||
return Create(Weight.Zero, chars.Select(c => new CharRange(c, c + 1, Weight.One)));
|
||||
return Create(chars.Select(c => new CharRange(c, c + 1, Weight.One)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -434,15 +424,24 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
Argument.CheckIfNotNull(vector, "vector");
|
||||
Argument.CheckIfValid(vector.Count <= CharRangeEndExclusive, "The given vector is of invalid size.");
|
||||
|
||||
var builder = new StorageBuilder(Weight.FromValue(vector.CommonValue));
|
||||
var commonValue = Weight.FromValue(vector.CommonValue);
|
||||
int prevEnd = 0;
|
||||
|
||||
var builder = StorageBuilder.Create();
|
||||
foreach (var piece in vector.Pieces)
|
||||
{
|
||||
if (prevEnd != piece.Start && !commonValue.IsZero)
|
||||
{
|
||||
builder.AddRange(new CharRange(prevEnd, piece.Start, commonValue));
|
||||
}
|
||||
|
||||
builder.AddRange(new CharRange(piece.Start, piece.End + 1, Weight.FromValue(piece.Value)));
|
||||
prevEnd = piece.End + 1;
|
||||
}
|
||||
|
||||
if (vector.Count < CharRangeEndExclusive && Math.Abs(vector.CommonValue) > Eps)
|
||||
if (prevEnd < vector.Count && !commonValue.IsZero)
|
||||
{
|
||||
builder.AddRange(new CharRange(vector.Count, CharRangeEndExclusive, Weight.Zero));
|
||||
builder.AddRange(new CharRange(prevEnd, vector.Count, commonValue));
|
||||
}
|
||||
|
||||
return new DiscreteChar(builder.GetResult());
|
||||
|
@ -491,7 +490,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </returns>
|
||||
public bool IsUniform()
|
||||
{
|
||||
foreach (var range in this.Data.Ranges)
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
if (Math.Abs(range.Probability.LogValue - UniformProb.LogValue) > Eps)
|
||||
{
|
||||
|
@ -530,15 +529,11 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
return;
|
||||
}
|
||||
|
||||
var probabilityOutsideRanges = distribution1.Data.ProbabilityOutsideRanges * distribution2.Data.ProbabilityOutsideRanges;
|
||||
var builder = new StorageBuilder(probabilityOutsideRanges);
|
||||
var builder = StorageBuilder.Create();
|
||||
foreach (var pair in CharRangePair.CombinedRanges(distribution1, distribution2))
|
||||
{
|
||||
var probProduct = pair.Probability1 * pair.Probability2;
|
||||
if (Math.Abs(probProduct.LogValue - probabilityOutsideRanges.LogValue) > Eps)
|
||||
{
|
||||
builder.AddRange(new CharRange(pair.StartInclusive, pair.EndExclusive, probProduct));
|
||||
}
|
||||
builder.AddRange(new CharRange(pair.StartInclusive, pair.EndExclusive, probProduct));
|
||||
}
|
||||
|
||||
this.Data = builder.GetResult();
|
||||
|
@ -586,16 +581,11 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
var invW = Weight.Inverse(weight1 + weight2);
|
||||
weight1 *= invW;
|
||||
weight2 *= invW;
|
||||
var probabilityOutsideRanges =
|
||||
(weight1 * distribution1.Data.ProbabilityOutsideRanges) + (weight2 * distribution2.Data.ProbabilityOutsideRanges);
|
||||
var builder = new StorageBuilder(probabilityOutsideRanges);
|
||||
var builder = StorageBuilder.Create();
|
||||
foreach (var pair in CharRangePair.CombinedRanges(distribution1, distribution2, false))
|
||||
{
|
||||
var probSum = (weight1 * pair.Probability1) + (weight2 * pair.Probability2);
|
||||
if (Math.Abs(probSum.LogValue - probabilityOutsideRanges.LogValue) > Eps)
|
||||
{
|
||||
builder.AddRange(new CharRange(pair.StartInclusive, pair.EndExclusive, probSum));
|
||||
}
|
||||
builder.AddRange(new CharRange(pair.StartInclusive, pair.EndExclusive, probSum));
|
||||
}
|
||||
|
||||
this.Data = builder.GetResult();
|
||||
|
@ -642,9 +632,8 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <param name="distribution">The distribution which support will be used to setup the current distribution.</param>
|
||||
public void SetToPartialUniformOf(DiscreteChar distribution)
|
||||
{
|
||||
var builder = new StorageBuilder(
|
||||
distribution.Data.ProbabilityOutsideRanges.IsZero ? Weight.Zero : Weight.One);
|
||||
foreach (var range in distribution.Data.Ranges)
|
||||
var builder = StorageBuilder.Create();
|
||||
foreach (var range in distribution.Ranges)
|
||||
{
|
||||
builder.AddRange(
|
||||
new CharRange(
|
||||
|
@ -663,10 +652,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
public bool IsPartialUniform()
|
||||
{
|
||||
Weight? commonProb = null;
|
||||
bool hasCommonValues = false;
|
||||
int prevRangeEnd = 0;
|
||||
var data = this.Data;
|
||||
foreach (var range in data.Ranges)
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
if (commonProb.HasValue && !range.Probability.IsZero && Math.Abs(commonProb.Value.LogValue - range.Probability.LogValue) > Eps)
|
||||
{
|
||||
|
@ -674,16 +660,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
}
|
||||
|
||||
commonProb = range.Probability;
|
||||
hasCommonValues |= range.StartInclusive > prevRangeEnd;
|
||||
prevRangeEnd = range.EndExclusive;
|
||||
}
|
||||
|
||||
hasCommonValues |= prevRangeEnd < CharRangeEndExclusive;
|
||||
|
||||
if (hasCommonValues && commonProb.HasValue && !data.ProbabilityOutsideRanges.IsZero &&
|
||||
Math.Abs(commonProb.Value.LogValue - data.ProbabilityOutsideRanges.LogValue) > Eps)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
|
@ -697,16 +673,12 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <param name="forceProper">Specifies whether the ratio must be proper.</param>
|
||||
public void SetToRatio(DiscreteChar numerator, DiscreteChar denominator, bool forceProper = false)
|
||||
{
|
||||
var probabilityOutsideRanges = DivideProb(numerator.Data.ProbabilityOutsideRanges, denominator.Data.ProbabilityOutsideRanges);
|
||||
var builder = new StorageBuilder(probabilityOutsideRanges);
|
||||
var builder = StorageBuilder.Create();
|
||||
|
||||
foreach (var pair in CharRangePair.CombinedRanges(numerator, denominator))
|
||||
foreach (var pair in CharRangePair.CombinedRanges(numerator, denominator, false))
|
||||
{
|
||||
var probRatio = DivideProb(pair.Probability1, pair.Probability2);
|
||||
if (Math.Abs(probRatio.LogValue - probabilityOutsideRanges.LogValue) > Eps)
|
||||
{
|
||||
builder.AddRange(new CharRange(pair.StartInclusive, pair.EndExclusive, probRatio));
|
||||
}
|
||||
builder.AddRange(new CharRange(pair.StartInclusive, pair.EndExclusive, probRatio));
|
||||
}
|
||||
|
||||
this.Data = builder.GetResult();
|
||||
|
@ -719,32 +691,29 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <param name="power">The power.</param>
|
||||
public void SetToPower(DiscreteChar distribution, double power)
|
||||
{
|
||||
var builder = new StorageBuilder(Weight.Zero);
|
||||
|
||||
bool hasCommonValues = false;
|
||||
int prevRangeEnd = 0;
|
||||
foreach (var range in distribution.Data.Ranges)
|
||||
if (power == 0)
|
||||
{
|
||||
if (range.Probability.IsZero && power < 0)
|
||||
this.SetToUniform();
|
||||
return;
|
||||
}
|
||||
|
||||
var builder = StorageBuilder.Create();
|
||||
|
||||
var prevRangeEnd = 0;
|
||||
foreach (var range in distribution.Ranges)
|
||||
{
|
||||
if ((prevRangeEnd != range.StartInclusive || range.Probability.IsZero) && power < 0)
|
||||
{
|
||||
throw new DivideByZeroException();
|
||||
}
|
||||
|
||||
builder.AddRange(new CharRange(range.StartInclusive, range.EndExclusive, Weight.Pow(range.Probability, power)));
|
||||
|
||||
hasCommonValues |= range.StartInclusive > prevRangeEnd;
|
||||
prevRangeEnd = range.EndExclusive;
|
||||
}
|
||||
|
||||
hasCommonValues |= prevRangeEnd < CharRangeEndExclusive;
|
||||
if (hasCommonValues)
|
||||
if (prevRangeEnd != CharRangeEndExclusive && power < 0)
|
||||
{
|
||||
if (distribution.Data.ProbabilityOutsideRanges.IsZero && power < 0)
|
||||
{
|
||||
throw new DivideByZeroException();
|
||||
}
|
||||
|
||||
builder.ProbabilityOutsideRanges = Weight.Pow(distribution.Data.ProbabilityOutsideRanges, power);
|
||||
throw new DivideByZeroException();
|
||||
}
|
||||
|
||||
this.Data = builder.GetResult();
|
||||
|
@ -794,14 +763,9 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
public double GetAverageLog(DiscreteChar distribution)
|
||||
{
|
||||
double result = 0;
|
||||
foreach (var pair in CharRangePair.CombinedRanges(this, distribution, true))
|
||||
foreach (var pair in CharRangePair.CombinedRanges(this, distribution, false))
|
||||
{
|
||||
if (pair.Probability2.IsZero)
|
||||
{
|
||||
return double.NegativeInfinity;
|
||||
}
|
||||
|
||||
double product = ValueTimesLogValue(pair.Probability1, pair.Probability2);
|
||||
var product = ValueTimesLogValue(pair.Probability1, pair.Probability2);
|
||||
result += product * (pair.EndExclusive - pair.StartInclusive);
|
||||
}
|
||||
|
||||
|
@ -814,36 +778,18 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <returns>The mode.</returns>
|
||||
public char GetMode()
|
||||
{
|
||||
bool hasCommonValues = false;
|
||||
int prevRangeEnd = 0;
|
||||
char mode = '\0';
|
||||
char charOutOfRanges = '\0';
|
||||
var maxProb = Weight.Zero;
|
||||
var data = this.Data;
|
||||
foreach (var range in data.Ranges)
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
if (range.Probability > maxProb)
|
||||
{
|
||||
mode = (char)range.StartInclusive;
|
||||
maxProb = range.Probability;
|
||||
}
|
||||
|
||||
if (range.StartInclusive > prevRangeEnd)
|
||||
{
|
||||
hasCommonValues = true;
|
||||
charOutOfRanges = (char)prevRangeEnd;
|
||||
}
|
||||
|
||||
prevRangeEnd = range.EndExclusive;
|
||||
}
|
||||
|
||||
if (prevRangeEnd < CharRangeEndExclusive)
|
||||
{
|
||||
hasCommonValues = true;
|
||||
charOutOfRanges = (char)prevRangeEnd;
|
||||
}
|
||||
|
||||
return hasCommonValues && data.ProbabilityOutsideRanges > maxProb ? charOutOfRanges : mode;
|
||||
return mode;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -854,7 +800,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
{
|
||||
var sampleProb = Rand.Double();
|
||||
|
||||
foreach (var interval in EnumerateCharRanges())
|
||||
foreach (var interval in this.Ranges)
|
||||
{
|
||||
var intervalLength = Weight.FromValue(interval.EndExclusive - interval.StartInclusive);
|
||||
var prob = intervalLength * interval.Probability;
|
||||
|
@ -868,21 +814,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
throw new Exception();
|
||||
}
|
||||
|
||||
private IEnumerable<CharRange> EnumerateCharRanges()
|
||||
{
|
||||
var prevRangeEnd = 0;
|
||||
var data = this.Data;
|
||||
var probabilityOutsideRanges = data.ProbabilityOutsideRanges;
|
||||
foreach (var range in data.Ranges)
|
||||
{
|
||||
yield return new CharRange(prevRangeEnd, range.StartInclusive, probabilityOutsideRanges);
|
||||
yield return new CharRange(range.StartInclusive, range.EndExclusive, range.Probability);
|
||||
prevRangeEnd = range.EndExclusive;
|
||||
}
|
||||
|
||||
yield return new CharRange(prevRangeEnd, CharRangeEndExclusive, probabilityOutsideRanges);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Draws a sample from the distribution.
|
||||
/// </summary>
|
||||
|
@ -902,21 +833,8 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <returns>The character values with non-zero mass.</returns>
|
||||
public IEnumerable<char> EnumerateSupport()
|
||||
{
|
||||
int prevRangeEnd = 0;
|
||||
|
||||
var data = this.Data;
|
||||
var probabilityOutsideRanges = data.ProbabilityOutsideRanges;
|
||||
|
||||
foreach (var range in data.Ranges)
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
if (!probabilityOutsideRanges.IsZero)
|
||||
{
|
||||
for (int j = prevRangeEnd; j < range.StartInclusive; j++)
|
||||
{
|
||||
yield return (char)j;
|
||||
}
|
||||
}
|
||||
|
||||
if (!range.Probability.IsZero)
|
||||
{
|
||||
for (int j = range.StartInclusive; j < range.EndExclusive; j++)
|
||||
|
@ -924,16 +842,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
yield return (char)j;
|
||||
}
|
||||
}
|
||||
|
||||
prevRangeEnd = range.EndExclusive;
|
||||
}
|
||||
|
||||
if (!probabilityOutsideRanges.IsZero)
|
||||
{
|
||||
for (int j = prevRangeEnd; j < CharRangeEndExclusive; j++)
|
||||
{
|
||||
yield return (char)j;
|
||||
}
|
||||
}
|
||||
}
|
||||
#endregion
|
||||
|
@ -943,9 +851,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <summary>
|
||||
/// Gets an array of character ranges with associated probabilities.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// See <see cref="ProbabilityOutsideRanges"/> for the probability of characters not covered by the returned ranges.
|
||||
/// </remarks>
|
||||
/// <value>An array of character ranges with associated probabilities.</value>
|
||||
public ReadOnlyArray<CharRange> Ranges => this.Data.Ranges;
|
||||
|
||||
|
@ -979,7 +884,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
return new DiscreteChar(StorageCache.UpperComplement);
|
||||
default:
|
||||
// TODO: decent implementation
|
||||
var ranges = unnormalizedCharDist.Data.Ranges;
|
||||
var ranges = unnormalizedCharDist.Ranges;
|
||||
var probVector = PiecewiseVector.Zero(CharRangeEndExclusive);
|
||||
foreach (var range in ranges)
|
||||
{
|
||||
|
@ -1001,9 +906,8 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
public PiecewiseVector GetProbs()
|
||||
{
|
||||
// TODO: replace with GetLogProbs()
|
||||
var data = this.Data;
|
||||
var result = PiecewiseVector.Constant(CharRangeEndExclusive, data.ProbabilityOutsideRanges.Value);
|
||||
foreach (var range in data.Ranges)
|
||||
var result = PiecewiseVector.Constant(CharRangeEndExclusive, 0);
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
result.Pieces.Add(new ConstantVector(range.StartInclusive, range.EndExclusive - 1, range.Probability.Value));
|
||||
}
|
||||
|
@ -1111,8 +1015,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
|
||||
private Weight FindProb(char value)
|
||||
{
|
||||
var data = this.Data;
|
||||
foreach (var range in data.Ranges)
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
if (range.StartInclusive <= value && range.EndExclusive > value)
|
||||
{
|
||||
|
@ -1120,7 +1023,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
}
|
||||
}
|
||||
|
||||
return data.ProbabilityOutsideRanges;
|
||||
return Weight.Zero;
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
@ -1170,7 +1073,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </summary>
|
||||
[Serializable]
|
||||
[DataContract]
|
||||
public struct CharRange
|
||||
public struct CharRange : IComparable<CharRange>
|
||||
{
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the <see cref="CharRange"/> struct
|
||||
|
@ -1193,18 +1096,27 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <summary>
|
||||
/// Gets or sets the start of the range (inclusive).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Setter is required for DataContractSerializer
|
||||
/// </remarks>
|
||||
[DataMember]
|
||||
public int StartInclusive { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the end of the range (exclusive).
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Setter is required for DataContractSerializer
|
||||
/// </remarks>
|
||||
[DataMember]
|
||||
public int EndExclusive { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the probability associated with the range.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// Setter is required for DataContractSerializer
|
||||
/// </remarks>
|
||||
[DataMember]
|
||||
public Weight Probability { get; private set; }
|
||||
|
||||
|
@ -1221,6 +1133,9 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
return sb.ToString();
|
||||
}
|
||||
|
||||
public int CompareTo(CharRange that) =>
|
||||
this.StartInclusive.CompareTo(that.StartInclusive);
|
||||
|
||||
internal void AppendToString(StringBuilder stringBuilder)
|
||||
{
|
||||
stringBuilder.Append('[');
|
||||
|
@ -1265,22 +1180,22 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// <summary>
|
||||
/// Gets or sets the start of the ranges (inclusive).
|
||||
/// </summary>
|
||||
public int StartInclusive { get; set; }
|
||||
public int StartInclusive { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the end of the ranges (exclusive).
|
||||
/// </summary>
|
||||
public int EndExclusive { get; set; }
|
||||
public int EndExclusive { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the probability value associated with the first range.
|
||||
/// </summary>
|
||||
public Weight Probability1 { get; set; }
|
||||
public Weight Probability1 { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the probability value associated with the second range.
|
||||
/// </summary>
|
||||
public Weight Probability2 { get; set; }
|
||||
public Weight Probability2 { get; private set; }
|
||||
|
||||
/// <summary>
|
||||
/// Gets a string that represents this character range.
|
||||
|
@ -1295,7 +1210,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
return sb.ToString();
|
||||
}
|
||||
|
||||
internal void AppendToString(StringBuilder stringBuilder)
|
||||
private void AppendToString(StringBuilder stringBuilder)
|
||||
{
|
||||
stringBuilder.Append('[');
|
||||
AppendChar(stringBuilder, (char)this.StartInclusive);
|
||||
|
@ -1322,14 +1237,15 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </summary>
|
||||
/// <param name="distribution1">The first distribution.</param>
|
||||
/// <param name="distribution2">The second distribution</param>
|
||||
/// <param name="excludeZeroProb">Whether to exclude non-intersectng ranges in the case where both distibrutions have zero probability outside their ranges.</param>
|
||||
/// <returns></returns>
|
||||
/// <param name="excludeZeroProb">
|
||||
/// Whether to exclude non-interesting ranges with zero probability.
|
||||
/// </param>
|
||||
public static IEnumerable<CharRangePair> CombinedRanges(DiscreteChar distribution1, DiscreteChar distribution2, bool excludeZeroProb = true) =>
|
||||
CombinedRanges(distribution1.Data, distribution2.Data, excludeZeroProb);
|
||||
|
||||
internal static IEnumerable<CharRangePair> CombinedRanges(Storage state1, Storage state2, bool excludeZeroProb)
|
||||
{
|
||||
if (excludeZeroProb && state1.ProbabilityOutsideRanges.IsZero && state2.ProbabilityOutsideRanges.IsZero)
|
||||
if (excludeZeroProb)
|
||||
{
|
||||
int rangeIndex1 = 0;
|
||||
int rangeIndex2 = 0;
|
||||
|
@ -1435,7 +1351,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
if (state1.Ranges[rangeIndex1].StartInclusive > currentStartInclusive)
|
||||
{
|
||||
currentEndExclusive = state1.Ranges[rangeIndex1].StartInclusive;
|
||||
currentProbability1 = state1.ProbabilityOutsideRanges;
|
||||
currentProbability1 = Weight.Zero;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1445,7 +1361,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
}
|
||||
else
|
||||
{
|
||||
currentProbability1 = state1.ProbabilityOutsideRanges;
|
||||
currentProbability1 = Weight.Zero;
|
||||
currentEndExclusive = CharRangeEndExclusive;
|
||||
}
|
||||
|
||||
|
@ -1454,7 +1370,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
if (state2.Ranges[rangeIndex2].StartInclusive > currentStartInclusive)
|
||||
{
|
||||
currentEndExclusive = Math.Min(currentEndExclusive, state2.Ranges[rangeIndex2].StartInclusive);
|
||||
currentProbability2 = state2.ProbabilityOutsideRanges;
|
||||
currentProbability2 = Weight.Zero;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -1464,7 +1380,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
}
|
||||
else
|
||||
{
|
||||
currentProbability2 = state2.ProbabilityOutsideRanges;
|
||||
currentProbability2 = Weight.Zero;
|
||||
}
|
||||
|
||||
yield return new CharRangePair()
|
||||
|
@ -1528,14 +1444,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </remarks>
|
||||
public ReadOnlyArray<CharRange> Ranges { get; }
|
||||
|
||||
/// <summary>
|
||||
/// The probability of a character outside character ranges defined by <see cref="Ranges"/>.
|
||||
/// </summary>
|
||||
/// <remarks>
|
||||
/// The character probabilities must be kept normalized by applying <see cref="StorageBuilder.NormalizeProbabilities"/> when necessary.
|
||||
/// </remarks>
|
||||
public Weight ProbabilityOutsideRanges { get; }
|
||||
|
||||
public char? Point { get; }
|
||||
|
||||
// Following 3 members are not immutable and can be recalculated on-demand
|
||||
|
@ -1549,14 +1457,12 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
|
||||
private Storage(
|
||||
ReadOnlyArray<CharRange> ranges,
|
||||
Weight probabilityOutsideRanges,
|
||||
char? point,
|
||||
CharClasses charClasses,
|
||||
string regexRepresentation,
|
||||
string symbolRepresentation)
|
||||
{
|
||||
this.Ranges = ranges;
|
||||
this.ProbabilityOutsideRanges = probabilityOutsideRanges;
|
||||
this.Point = point;
|
||||
this.CharClasses = charClasses;
|
||||
this.regexRepresentation = regexRepresentation;
|
||||
|
@ -1565,26 +1471,24 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
|
||||
public static Storage CreateUncached(
|
||||
ReadOnlyArray<CharRange> ranges,
|
||||
Weight probabilityOutsideRanges,
|
||||
char? point,
|
||||
CharClasses charClasses = CharClasses.Unknown,
|
||||
string regexRepresentation = null,
|
||||
string symbolRepresentation = null)
|
||||
{
|
||||
Debug.Assert(point.HasValue == IsRangesPointMass(ranges));
|
||||
return new Storage(ranges, probabilityOutsideRanges, point, charClasses, regexRepresentation, symbolRepresentation);
|
||||
return new Storage(ranges, point, charClasses, regexRepresentation, symbolRepresentation);
|
||||
}
|
||||
|
||||
public static Storage Create(
|
||||
ReadOnlyArray<CharRange> ranges,
|
||||
Weight probabilityOutsideRanges,
|
||||
CharClasses charClaasses = CharClasses.Unknown,
|
||||
CharClasses charClasses = CharClasses.Unknown,
|
||||
string regexRepresentation = null,
|
||||
string symbolRepresentation = null)
|
||||
{
|
||||
return IsRangesPointMass(ranges)
|
||||
? CreatePoint((char)ranges[0].StartInclusive, ranges)
|
||||
: CreateUncached(ranges, probabilityOutsideRanges, null, charClaasses, regexRepresentation, symbolRepresentation);
|
||||
: CreateUncached(ranges, null, charClasses, regexRepresentation, symbolRepresentation);
|
||||
}
|
||||
|
||||
public static Storage CreatePoint(char point, ReadOnlyArray<CharRange> ranges) =>
|
||||
|
@ -1603,8 +1507,8 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
var startEndPairsArray = startEndPairs.ToArray();
|
||||
Argument.CheckIfValid(startEndPairsArray.Length % 2 == 0, "startEndPairs", "The number of characters must be even.");
|
||||
|
||||
var builder = new StorageBuilder(
|
||||
Weight.Zero, charClasses, regexRepresentation, symbolRepresentation);
|
||||
var builder = StorageBuilder.Create(
|
||||
charClasses, regexRepresentation, symbolRepresentation);
|
||||
for (int i = 0; i < startEndPairsArray.Length; i += 2)
|
||||
{
|
||||
var startInclusive = startEndPairsArray[i];
|
||||
|
@ -1626,10 +1530,20 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
public Storage Complement()
|
||||
{
|
||||
// Must use StorageBuilder, because need to Normalize probabilities
|
||||
var builder = new StorageBuilder(this.ProbabilityOutsideRanges.IsZero ? Weight.One : Weight.Zero);
|
||||
var builder = StorageBuilder.Create();
|
||||
int prevEnd = 0;
|
||||
foreach (var range in this.Ranges)
|
||||
{
|
||||
builder.AddRange(new CharRange(range.StartInclusive, range.EndExclusive, range.Probability.IsZero ? Weight.One : Weight.Zero));
|
||||
if (range.StartInclusive != prevEnd)
|
||||
{
|
||||
builder.AddRange(new CharRange(prevEnd, range.StartInclusive, Weight.One));
|
||||
prevEnd = range.EndExclusive;
|
||||
}
|
||||
}
|
||||
|
||||
if (prevEnd != CharRangeEndExclusive)
|
||||
{
|
||||
builder.AddRange(new CharRange(prevEnd, CharRangeEndExclusive, Weight.One));
|
||||
}
|
||||
|
||||
return builder.GetResult();
|
||||
|
@ -1661,7 +1575,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
#region Properties
|
||||
|
||||
// TODO: Assumes that there are no ranges with zero probability
|
||||
// TODO: also assumes that a point is not represented by zero-probability ranges and a non-zero value outside of ranges
|
||||
private static bool IsRangesPointMass(ReadOnlyArray<CharRange> ranges) =>
|
||||
ranges.Count > 0 && Math.Abs(ranges[0].Probability.LogValue - Weight.One.LogValue) < Eps;
|
||||
|
||||
|
@ -1698,27 +1611,19 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
public static Storage FromSerializationInfo(SerializationInfo info) =>
|
||||
Storage.Create(
|
||||
(CharRange[]) info.GetValue(nameof(Ranges), typeof(CharRange[])),
|
||||
(Weight) info.GetValue(nameof(ProbabilityOutsideRanges), typeof(Weight)),
|
||||
(CharClasses) info.GetValue(nameof(CharClasses), typeof(CharClasses)));
|
||||
|
||||
public void GetObjectData(SerializationInfo info)
|
||||
{
|
||||
info.AddValue(nameof(this.Ranges), this.Ranges.CloneArray());
|
||||
info.AddValue(nameof(this.ProbabilityOutsideRanges), this.ProbabilityOutsideRanges);
|
||||
info.AddValue(nameof(this.CharClasses), this.CharClasses);
|
||||
}
|
||||
|
||||
/// <summary>
/// Writes this object through the supplied primitive writers.
/// </summary>
/// <remarks>
/// Values are written in this order: property mask, range count, every range, range count
/// again, character classes (as an integer), and the log-value of the probability outside ranges.
/// </remarks>
/// <param name="writeInt32">Callback that receives each integer value.</param>
/// <param name="writeDouble">
/// Callback that receives each double value; it is also passed to every range's own writer.
/// </param>
public void Write(Action<int> writeInt32, Action<double> writeDouble)
{
    // Bit 0 of the mask marks the presence of ranges; it is always set because ranges can never be null.
    const int HasRangesMask = 1 << 0;
    var flags = new BitVector32();
    flags[HasRangesMask] = true;
    writeInt32(flags.Data);

    writeInt32(this.Ranges.Count);
    foreach (var range in this.Ranges)
    {
        range.Write(writeInt32, writeDouble);
    }

    // The count is written a second time for compatibility with old readers.
    writeInt32(this.Ranges.Count);

    var classesAsInt = (int)this.CharClasses;
    writeInt32(classesAsInt);

    writeDouble(this.ProbabilityOutsideRanges.LogValue);
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -1726,37 +1631,18 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </summary>
|
||||
public static Storage Read(Func<int> readInt32, Func<double> readDouble)
|
||||
{
|
||||
var propertyMask = new BitVector32(readInt32());
|
||||
var idx = 0;
|
||||
var hasRanges = propertyMask[1 << idx++];
|
||||
CharRange[] ranges = null;
|
||||
|
||||
if (hasRanges)
|
||||
var nRanges = readInt32();
|
||||
ranges = new CharRange[nRanges];
|
||||
for (var i = 0; i < nRanges; i++)
|
||||
{
|
||||
var nRanges = readInt32();
|
||||
ranges = new CharRange[nRanges];
|
||||
for (var i = 0; i < nRanges; i++)
|
||||
{
|
||||
ranges[i] = CharRange.Read(readInt32, readDouble);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
ranges = new CharRange[0];
|
||||
ranges[i] = CharRange.Read(readInt32, readDouble);
|
||||
}
|
||||
|
||||
// In old internal representation rangeCount could be different from serialized array
|
||||
var rangesCount = readInt32();
|
||||
if (rangesCount != ranges.Length)
|
||||
{
|
||||
var newRanges = new CharRange[rangesCount];
|
||||
Array.Copy(ranges, newRanges, rangesCount);
|
||||
ranges = newRanges;
|
||||
}
|
||||
var charClasses = (CharClasses)readInt32();
|
||||
var probabilityOutsideRanges = Weight.FromLogValue(readDouble());
|
||||
|
||||
return Storage.Create(ranges, probabilityOutsideRanges, charClasses);
|
||||
return Storage.Create(ranges, charClasses);
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
@ -1787,13 +1673,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
range.AppendToString(stringBuilder);
|
||||
stringBuilder.Append(' ');
|
||||
}
|
||||
|
||||
if (stringBuilder.Length > 0)
|
||||
{
|
||||
stringBuilder.Append("Otherwise: ");
|
||||
}
|
||||
|
||||
stringBuilder.Append(this.ProbabilityOutsideRanges);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1945,8 +1824,7 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
string WordCharRanges(string baseRange) => baseRange + "09__";
|
||||
|
||||
Uniform = Storage.CreateUncached(
|
||||
new CharRange[] { },
|
||||
UniformProb,
|
||||
new CharRange[] { new CharRange(char.MinValue, CharRangeEndExclusive, UniformProb) },
|
||||
null,
|
||||
CharClasses.Uniform,
|
||||
UniformRegexRepresentation,
|
||||
|
@ -1972,7 +1850,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
var upperComplement = Upper.Complement();
|
||||
UpperComplement = Storage.CreateUncached(
|
||||
upperComplement.Ranges,
|
||||
upperComplement.ProbabilityOutsideRanges,
|
||||
null,
|
||||
regexRepresentation: @"[^\p{Lu}]",
|
||||
symbolRepresentation: "🡻");
|
||||
|
@ -1988,7 +1865,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
ranges.IsNull
|
||||
? new ReadOnlyArray<CharRange>(new[] { new CharRange(point, point + 1, Weight.One) })
|
||||
: ranges,
|
||||
Weight.Zero,
|
||||
point);
|
||||
}
|
||||
|
||||
|
@ -2081,32 +1957,39 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </summary>
|
||||
private readonly string symbolRepresentation;
|
||||
|
||||
/// <summary>
|
||||
/// The probability of a character outside character ranges defined by <see cref="ranges"/>.
|
||||
/// </summary>
|
||||
public Weight ProbabilityOutsideRanges { private get; set; }
|
||||
|
||||
#endregion
|
||||
|
||||
public StorageBuilder(
|
||||
Weight probabilityOutsideRanges,
|
||||
CharClasses charClasses = CharClasses.Unknown,
|
||||
string regexRepresentation = null,
|
||||
string symbolRepresentation = null)
|
||||
CharClasses charClasses,
|
||||
string regexRepresentation,
|
||||
string symbolRepresentation)
|
||||
{
|
||||
this.ProbabilityOutsideRanges = probabilityOutsideRanges;
|
||||
this.ranges = new List<CharRange>();
|
||||
this.charClasses = charClasses;
|
||||
this.regexRepresentation = regexRepresentation;
|
||||
this.symbolRepresentation = symbolRepresentation;
|
||||
}
|
||||
|
||||
/// <summary>
/// Factory method that creates a new <see cref="StorageBuilder"/>.
/// </summary>
/// <param name="charClasses">Forwarded to the builder's constructor; defaults to <c>CharClasses.Unknown</c>.</param>
/// <param name="regexRepresentation">Forwarded to the builder's constructor; defaults to <see langword="null"/>.</param>
/// <param name="symbolRepresentation">Forwarded to the builder's constructor; defaults to <see langword="null"/>.</param>
/// <returns>A builder constructed with the three supplied values.</returns>
public static StorageBuilder Create(
    CharClasses charClasses = CharClasses.Unknown,
    string regexRepresentation = null,
    string symbolRepresentation = null) =>
    new StorageBuilder(charClasses, regexRepresentation, symbolRepresentation);
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
|
||||
/// Adds a new character range to <see cref="ranges"/>
|
||||
/// </summary>
|
||||
public void AddRange(CharRange range) => this.ranges.Add(range);
|
||||
public void AddRange(CharRange range)
{
    // Ranges whose probability is zero are dropped and never stored.
    if (range.Probability.IsZero)
    {
        return;
    }

    this.ranges.Add(range);
}
|
||||
|
||||
/// <summary>
|
||||
/// Sorts ranges by StartInclusive, checks that they are non-overlapping, cover valid characters only
|
||||
|
@ -2117,7 +2000,9 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
/// </remarks>
|
||||
public void SortAndCheckRanges()
|
||||
{
|
||||
this.ranges.Sort((s1, s2) => Comparer<int>.Default.Compare(s1.StartInclusive, s2.StartInclusive));
|
||||
Debug.Assert(this.ranges.Count > 0);
|
||||
|
||||
this.ranges.Sort();
|
||||
|
||||
var prevRangeEnd = 0;
|
||||
foreach (var range in this.ranges)
|
||||
|
@ -2139,7 +2024,6 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
this.NormalizeProbabilities();
|
||||
return Storage.Create(
|
||||
this.ranges.ToArray(),
|
||||
this.ProbabilityOutsideRanges,
|
||||
this.charClasses,
|
||||
this.regexRepresentation,
|
||||
this.symbolRepresentation);
|
||||
|
@ -2158,17 +2042,15 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
for (var i = 0; i < this.ranges.Count; ++i)
|
||||
{
|
||||
var range = this.ranges[i];
|
||||
if (Math.Abs(range.Probability.LogValue - this.ProbabilityOutsideRanges.LogValue) < Eps)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (newRangeCount > 0)
|
||||
{
|
||||
var prevRange = this.ranges[newRangeCount - 1];
|
||||
if (range.StartInclusive == prevRange.EndExclusive && Math.Abs(range.Probability.LogValue - prevRange.Probability.LogValue) < Eps)
|
||||
if (range.StartInclusive == prevRange.EndExclusive &&
|
||||
Math.Abs(range.Probability.LogValue - prevRange.Probability.LogValue) < Eps)
|
||||
{
|
||||
this.ranges[newRangeCount - 1] = new CharRange(prevRange.StartInclusive, range.EndExclusive, prevRange.Probability);
|
||||
this.ranges[newRangeCount - 1] = new CharRange(
|
||||
prevRange.StartInclusive, range.EndExclusive, prevRange.Probability);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
@ -2185,11 +2067,11 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
private void NormalizeProbabilities()
|
||||
{
|
||||
var normalizer = this.ComputeInvNormalizer();
|
||||
|
||||
this.ProbabilityOutsideRanges *= normalizer;
|
||||
for (int i = 0; i < this.ranges.Count; ++i)
|
||||
{
|
||||
this.ranges[i] = new CharRange(this.ranges[i].StartInclusive, this.ranges[i].EndExclusive, this.ranges[i].Probability * normalizer);
|
||||
var range = this.ranges[i];
|
||||
this.ranges[i] = new CharRange(
|
||||
range.StartInclusive, range.EndExclusive, range.Probability * normalizer);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2200,17 +2082,12 @@ namespace Microsoft.ML.Probabilistic.Distributions
|
|||
private Weight ComputeInvNormalizer()
|
||||
{
|
||||
Weight normalizer = Weight.Zero;
|
||||
var prevRangeEnd = 0;
|
||||
|
||||
foreach (var range in this.ranges)
|
||||
{
|
||||
normalizer += Weight.FromValue(range.StartInclusive - prevRangeEnd) * this.ProbabilityOutsideRanges;
|
||||
normalizer += Weight.FromValue(range.EndExclusive - range.StartInclusive) * range.Probability;
|
||||
prevRangeEnd = range.EndExclusive;
|
||||
}
|
||||
|
||||
normalizer += Weight.FromValue(CharRangeEndExclusive - prevRangeEnd) * this.ProbabilityOutsideRanges;
|
||||
|
||||
if (normalizer.IsZero)
|
||||
{
|
||||
throw new AllZeroException("A character distribution that is zero everywhere has been produced.");
|
||||
|
|
Загрузка…
Ссылка в новой задаче