// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using Microsoft.ML.Calibrators;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.Data.Conversion;
using Microsoft.ML.FastTree.Utils;
using Microsoft.ML.Internal.Internallearn;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Model;
using Microsoft.ML.Model.OnnxConverter;
using Microsoft.ML.Model.Pfa;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;
using Microsoft.ML.TreePredictor;
using Newtonsoft.Json.Linq;

// All of these reviews apply in general to the fast tree and random forest implementations.
// REVIEW: Decouple the train method in Application.cs so that the boosting and random forest logic are separate.
// REVIEW: Do we need to keep all the fast-tree-based testers?

namespace Microsoft.ML.Trainers.FastTree
{
    [BestFriend]
    internal delegate void SignatureTreeEnsembleTrainer();

    /// <summary>
    /// FastTreeTrainerBase is a generic class, so its instantiations cannot share a single static object.
    /// This class provides one object, common to all of them, that can be used for locking.
    /// </summary>
    internal static class FastTreeShared
    {
        public static readonly object TrainLock = new object();
    }

    public abstract class FastTreeTrainerBase<TOptions, TTransformer, TModel> :
        TrainerEstimatorBaseWithGroupId<TTransformer, TModel>
        where TTransformer : ISingleFeaturePredictionTransformer<TModel>
        where TOptions : TreeOptions, new()
        where TModel : class
    {
        private protected readonly TOptions FastTreeTrainerOptions;
        private protected readonly bool AllowGC;
        private protected int FeatureCount;
        private protected InternalTreeEnsemble TrainedEnsemble;
        private protected RoleMappedData ValidData;

        /// <summary>
        /// If not null, this is a test data set passed in from the training context. It will be converted to one element in
        /// <see cref="Tests"/> by calling <see cref="ExamplesToFastTreeBins.GetCompatibleDataset"/> in <see cref="InitializeTests"/>.
        /// </summary>
        private protected RoleMappedData TestData;

        private protected IParallelTraining ParallelTraining;
        private protected OptimizationAlgorithm OptimizationAlgorithm;
        private protected Dataset TrainSet;
        private protected Dataset ValidSet;

        /// <summary>
        /// Data sets used to evaluate the prediction scores produced by the trained model during the training process.
        /// </summary>
        private protected Dataset[] TestSets;

        private protected int[] FeatureMap;

        /// <summary>
        /// In the training process, <see cref="TrainSet"/>, <see cref="ValidSet"/>, and <see cref="TestSets"/> are
        /// converted into <see cref="Tests"/> for efficient model evaluation.
        /// </summary>
        private protected List<Test> Tests;

        private protected TestHistory PruningTest;
        private protected int[] CategoricalFeatures;

        // Tests for early stopping.
        private protected Test TrainTest;
        private protected Test ValidTest;

        private protected double[] InitTrainScores;
        private protected double[] InitValidScores;
        private protected double[][] InitTestScores;
        private protected InternalTreeEnsemble Ensemble;

        private protected bool HasValidSet => ValidSet != null;

        private const string RegisterName = "FastTreeTraining";

        // Random number generator used for active feature selection.
        private Random _featureSelectionRandom;

        private protected string InnerOptions => CmdParser.GetSettings(Host, FastTreeTrainerOptions, new TOptions());

        public override TrainerInfo Info { get; }

        private protected virtual bool NeedCalibration => false;

        /// <summary>
        /// Constructor to use when instantiating the classes deriving from here through the API.
        /// </summary>
        private protected FastTreeTrainerBase(IHostEnvironment env,
            SchemaShape.Column label,
            string featureColumnName,
            string exampleWeightColumnName,
            string rowGroupColumnName,
            int numberOfLeaves,
            int numberOfTrees,
            int minimumExampleCountPerLeaf)
            : base(Contracts.CheckRef(env, nameof(env)).Register(RegisterName), TrainerUtils.MakeR4VecFeature(featureColumnName), label, TrainerUtils.MakeR4ScalarWeightColumn(exampleWeightColumnName), TrainerUtils.MakeU4ScalarColumn(rowGroupColumnName))
        {
            FastTreeTrainerOptions = new TOptions();

            // Override the option defaults with the directly provided values.
            FastTreeTrainerOptions.NumberOfLeaves = numberOfLeaves;
            FastTreeTrainerOptions.NumberOfTrees = numberOfTrees;
            FastTreeTrainerOptions.MinimumExampleCountPerLeaf = minimumExampleCountPerLeaf;

            FastTreeTrainerOptions.LabelColumnName = label.Name;
            FastTreeTrainerOptions.FeatureColumnName = featureColumnName;
            FastTreeTrainerOptions.ExampleWeightColumnName = exampleWeightColumnName;
            FastTreeTrainerOptions.RowGroupColumnName = rowGroupColumnName;

            // The discretization step renders this trainer non-parametric, and therefore it does not need normalization.
            // Also, since it builds its own internal discretized columnar structures, it cannot benefit from caching.
            // Finally, even the binary classifiers, being logit-boosted, tend not to benefit from external calibration.
            Info = new TrainerInfo(normalization: false, caching: false, calibration: NeedCalibration, supportValid: true, supportTest: true);
            // REVIEW: CLR 4.6 has a bug that is only exposed in Scope: if we trigger GC.Collect in the Scope environment
            // with memory consumption above 5GB, the GC gets stuck in an infinite loop.
            // Before, we could check for a specific type of environment here, but now it is internal, so we will need another
            // mechanism to detect that we are running in Scope.
            AllowGC = true;

            Initialize(env);
        }

        /// <summary>
        /// Constructor that is used when invoking the classes deriving from this, through maml.
        /// </summary>
        private protected FastTreeTrainerBase(IHostEnvironment env, TOptions options, SchemaShape.Column label)
            : base(Contracts.CheckRef(env, nameof(env)).Register(RegisterName), TrainerUtils.MakeR4VecFeature(options.FeatureColumnName), label, TrainerUtils.MakeR4ScalarWeightColumn(options.ExampleWeightColumnName),
                TrainerUtils.MakeU4ScalarColumn(options.RowGroupColumnName))
        {
            Host.CheckValue(options, nameof(options));
            FastTreeTrainerOptions = options;
            // The discretization step renders this trainer non-parametric, and therefore it does not need normalization.
            // Also, since it builds its own internal discretized columnar structures, it cannot benefit from caching.
            // Finally, even the binary classifiers, being logit-boosted, tend not to benefit from external calibration.
            Info = new TrainerInfo(normalization: false, caching: false, calibration: NeedCalibration, supportValid: true, supportTest: true);
            // REVIEW: CLR 4.6 has a bug that is only exposed in Scope: if we trigger GC.Collect in the Scope environment
            // with memory consumption above 5GB, the GC gets stuck in an infinite loop.
            // Before, we could check for a specific type of environment here, but now it is internal, so we will need another
            // mechanism to detect that we are running in Scope.
            AllowGC = true;

            Initialize(env);
        }

        private protected abstract void PrepareLabels(IChannel ch);

        private protected abstract void InitializeTests();

        private protected abstract Test ConstructTestForTrainingData();

        private protected abstract OptimizationAlgorithm ConstructOptimizationAlgorithm(IChannel ch);

        private protected abstract TreeLearner ConstructTreeLearner(IChannel ch);

        private protected abstract ObjectiveFunctionBase ConstructObjFunc(IChannel ch);

        private protected virtual float GetMaxLabel()
        {
            return float.PositiveInfinity;
        }

        private void Initialize(IHostEnvironment env)
        {
            ParallelTraining = FastTreeTrainerOptions.ParallelTrainer != null ? FastTreeTrainerOptions.ParallelTrainer.CreateComponent(env) : new SingleTrainer();
            ParallelTraining.InitEnvironment();

            Tests = new List<Test>();

            InitializeThreads(FastTreeTrainerOptions.NumberOfThreads ?? Environment.ProcessorCount);
        }

        private protected void ConvertData(RoleMappedData trainData)
        {
            AnnotationUtils.TryGetCategoricalFeatureIndices(trainData.Schema.Schema, trainData.Schema.Feature.Value.Index, out CategoricalFeatures);
            var useTranspose = UseTranspose(FastTreeTrainerOptions.DiskTranspose, trainData) && (ValidData == null || UseTranspose(FastTreeTrainerOptions.DiskTranspose, ValidData));
            var instanceConverter = new ExamplesToFastTreeBins(Host, FastTreeTrainerOptions.MaximumBinCountPerFeature, useTranspose, !FastTreeTrainerOptions.FeatureFlocks, FastTreeTrainerOptions.MinimumExampleCountPerLeaf, GetMaxLabel());

            TrainSet = instanceConverter.FindBinsAndReturnDataset(trainData, PredictionKind, ParallelTraining, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit);
            FeatureMap = instanceConverter.FeatureMap;
            if (ValidData != null)
                ValidSet = instanceConverter.GetCompatibleDataset(ValidData, PredictionKind, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit);
            if (TestData != null)
                TestSets = new[] { instanceConverter.GetCompatibleDataset(TestData, PredictionKind, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit) };
        }

        private bool UseTranspose(bool? useTranspose, RoleMappedData data)
        {
            Host.AssertValue(data);
            Host.Assert(data.Schema.Feature.HasValue);

            if (useTranspose.HasValue)
                return useTranspose.Value;

            var itdv = data.Data as ITransposeDataView;
            return itdv?.GetSlotType(data.Schema.Feature.Value.Index) != null;
        }
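
        // In effect, an explicit DiskTranspose setting always wins; otherwise we transpose
        // only when the underlying data view can already serve the feature column as slots
        // (i.e. column-wise), in which case no extra disk transposition has to be paid for.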

        private protected void TrainCore(IChannel ch)
        {
            Contracts.CheckValue(ch, nameof(ch));
            // REVIEW: Get rid of this lock once we completely remove all static classes from FastTree, such as BlockingThreadPool.
            lock (FastTreeShared.TrainLock)
            {
                using (Timer.Time(TimerEvent.TotalInitialization))
                {
                    CheckOptions(ch);
                    PrintPrologInfo(ch);

                    Initialize(ch);
                    if (FastTreeTrainerOptions.MemoryStatistics)
                        PrintMemoryStats(ch);
                }
                using (Timer.Time(TimerEvent.TotalTrain))
                    Train(ch);
                if (FastTreeTrainerOptions.ExecutionTime)
                    PrintExecutionTime(ch);
                TrainedEnsemble = Ensemble;
                if (FeatureMap != null)
                    TrainedEnsemble.RemapFeatures(FeatureMap);
                ParallelTraining.FinalizeEnvironment();
            }
        }

        private protected virtual bool ShouldStop(IChannel ch, ref EarlyStoppingRuleBase earlyStopping, ref int bestIteration)
        {
            bestIteration = Ensemble.NumTrees;
            return false;
        }

        private protected virtual int GetBestIteration(IChannel ch) => Ensemble.NumTrees;

        private protected virtual void InitializeThreads(int numThreads)
        {
            ThreadTaskManager.Initialize(numThreads);
        }

        private protected virtual void PrintExecutionTime(IChannel ch)
        {
            ch.Info("Execution time breakdown:\n{0}", Timer.GetString());
        }

        private protected virtual void CheckOptions(IChannel ch)
        {
            FastTreeTrainerOptions.Check(ch);

            IntArray.CompatibilityLevel = FastTreeTrainerOptions.FeatureCompressionLevel;

            // Adjust derived option values.
            if (FastTreeTrainerOptions.HistogramPoolSize < 2)
                FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumberOfLeaves * 2 / 3;
            if (FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumberOfLeaves - 1)
                FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumberOfLeaves - 1;

            if (FastTreeTrainerOptions.BaggingSize > 0)
            {
                int bagCount = FastTreeTrainerOptions.NumberOfTrees / FastTreeTrainerOptions.BaggingSize;
                if (bagCount * FastTreeTrainerOptions.BaggingSize != FastTreeTrainerOptions.NumberOfTrees)
                    throw ch.Except("Number of trees should be a multiple of the bagging size");
            }
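
            // For example, NumberOfTrees = 100 with BaggingSize = 20 gives bagCount = 5 and is
            // accepted (a fresh bag is drawn every 20 trees), whereas NumberOfTrees = 100 with
            // BaggingSize = 30 gives 3 * 30 = 90 != 100 and throws above.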

            if (!(0 <= FastTreeTrainerOptions.GainConfidenceLevel && FastTreeTrainerOptions.GainConfidenceLevel < 1))
                throw ch.Except("Gain confidence level must be in the range [0,1)");

#if OLD_DATALOAD
#if !NO_STORE
            if (_args.offloadBinsToFileStore)
            {
                if (!string.IsNullOrEmpty(_args.offloadBinsDirectory) && !Directory.Exists(_args.offloadBinsDirectory))
                {
                    try
                    {
                        Directory.CreateDirectory(_args.offloadBinsDirectory);
                    }
                    catch (Exception e)
                    {
                        throw ch.Except(e, "Failure creating bins offload directory {0} - Exception {1}", _args.offloadBinsDirectory, e.Message);
                    }
                }
            }
#endif
#endif
        }

        /// <summary>
        /// A virtual method that is used to print the header of the test graph.
        /// Applications that need to print the test graph are supposed to override
        /// it to print a specific test graph header.
        /// </summary>
        /// <returns>String representation of the test graph header.</returns>
        private protected virtual string GetTestGraphHeader() => string.Empty;

        /// <summary>
        /// A virtual method that is used to print a single line of the test graph.
        /// Applications that need to print the test graph are supposed to override
        /// it to print a specific line of the test graph after a new iteration is finished.
        /// </summary>
        /// <returns>String representation of a line of the test graph.</returns>
        private protected virtual string GetTestGraphLine() => string.Empty;

        /// <summary>
        /// A virtual method that is used to compute test results after each iteration is finished.
        /// </summary>
        private protected virtual void ComputeTests()
        {
        }

        private protected void PrintTestGraph(IChannel ch)
        {
            // We compute the tests regardless of whether the test graph is printed.
            ComputeTests();

            if (!FastTreeTrainerOptions.PrintTestGraph)
                return;

            if (Ensemble.NumTrees == 0)
                ch.Info(GetTestGraphHeader());
            else
                ch.Info(GetTestGraphLine());
        }

        private protected virtual void Initialize(IChannel ch)
        {
            #region Load/Initialize State

            using (Timer.Time(TimerEvent.InitializeLabels))
                PrepareLabels(ch);
            using (Timer.Time(TimerEvent.InitializeTraining))
            {
                InitializeEnsemble();
                OptimizationAlgorithm = ConstructOptimizationAlgorithm(ch);
            }
            using (Timer.Time(TimerEvent.InitializeTests))
                InitializeTests();
            if (AllowGC)
            {
                GC.Collect(2, GCCollectionMode.Forced);
                GC.Collect(2, GCCollectionMode.Forced);
            }

            #endregion
        }

#if !NO_STORE
        /// <summary>
        /// Calculates the percentage of feature bins that will fit into memory based on the memory currently available on the machine.
        /// </summary>
        /// <returns>A float number between 0 and 1 indicating the percentage of features to load.
        /// The number will not be smaller than two times the feature fraction value.</returns>
        private float GetFeaturePercentInMemory(IChannel ch)
        {
            const float maxFeaturePercentValue = 1.0f;

            float availableMemory = GetMachineAvailableBytes();

            ch.Info("Available memory in the machine is = {0} bytes", availableMemory.ToString("N", CultureInfo.InvariantCulture));

            float minFeaturePercentThreshold = _args.preloadFeatureBinsBeforeTraining ? (float)_args.featureFraction * 2 : (float)_args.featureFraction;

            if (minFeaturePercentThreshold >= maxFeaturePercentValue)
            {
                return maxFeaturePercentValue;
            }

            // Initial free memory allowance in bytes for the single and parallel fastrank modes.
            float freeMemoryAllowance = 1024 * 1024 * 512;

            if (_optimizationAlgorithm.TreeLearner != null)
            {
                // Get the size of memory in bytes needed by the tree learner's internal data structures.
                freeMemoryAllowance += _optimizationAlgorithm.TreeLearner.GetSizeOfReservedMemory();
            }

            availableMemory = (availableMemory > freeMemoryAllowance) ? availableMemory - freeMemoryAllowance : 0;

            long featureSize = TrainSet.FeatureSetSize;

            if (ValidSet != null)
            {
                featureSize += ValidSet.FeatureSetSize;
            }

            if (TestSets != null)
            {
                foreach (var item in TestSets)
                {
                    featureSize += item.FeatureSetSize;
                }
            }

            ch.Info("Total feature bins size is = {0} bytes", featureSize.ToString("N", CultureInfo.InvariantCulture));

            return Math.Min(Math.Max(minFeaturePercentThreshold, availableMemory / featureSize), maxFeaturePercentValue);
        }
#endif

        private protected bool[] GetActiveFeatures()
        {
            var activeFeatures = Utils.CreateArray(TrainSet.NumFeatures, true);
            if (FastTreeTrainerOptions.FeatureFraction < 1.0)
            {
                if (_featureSelectionRandom == null)
                    _featureSelectionRandom = new Random(FastTreeTrainerOptions.FeatureSelectionSeed);

                for (int i = 0; i < TrainSet.NumFeatures; ++i)
                {
                    if (activeFeatures[i])
                        activeFeatures[i] = _featureSelectionRandom.NextDouble() <= FastTreeTrainerOptions.FeatureFraction;
                }
            }

            return activeFeatures;
        }
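
        // A small worked example of the sampling above, assuming FeatureFraction = 0.7 and three
        // features: each feature is kept independently with probability 0.7, so draws of
        // NextDouble() = 0.31, 0.92, 0.55 yield activeFeatures = { true, false, true }.
        // With FeatureFraction = 1.0 no RNG is ever created and every feature stays active.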

        private string GetDatasetStatistics(Dataset set)
        {
            long datasetSize = set.SizeInBytes();
            int skeletonSize = set.Skeleton.SizeInBytes();
            return string.Format("set contains {0} query-doc pairs in {1} queries with {2} features and uses {3} MB ({4} MB for features)",
                set.NumDocs, set.NumQueries, set.NumFeatures, datasetSize / 1024 / 1024, (datasetSize - skeletonSize) / 1024 / 1024);
        }

        private protected virtual void PrintMemoryStats(IChannel ch)
        {
            Contracts.AssertValue(ch);
            ch.Trace("Training {0}", GetDatasetStatistics(TrainSet));

            if (ValidSet != null)
                ch.Trace("Validation {0}", GetDatasetStatistics(ValidSet));
            if (TestSets != null)
            {
                for (int i = 0; i < TestSets.Length; ++i)
                    ch.Trace("ComputeTests[{1}] {0}",
                        GetDatasetStatistics(TestSets[i]), i);
            }

            if (AllowGC)
                ch.Trace("GC Total Memory = {0} MB", GC.GetTotalMemory(true) / 1024 / 1024);
            Process currentProcess = Process.GetCurrentProcess();
            ch.Trace("Working Set = {0} MB", currentProcess.WorkingSet64 / 1024 / 1024);
            ch.Trace("Virtual Memory = {0} MB",
                currentProcess.VirtualMemorySize64 / 1024 / 1024);
            ch.Trace("Private Memory = {0} MB",
                currentProcess.PrivateMemorySize64 / 1024 / 1024);
            ch.Trace("Peak Working Set = {0} MB", currentProcess.PeakWorkingSet64 / 1024 / 1024);
            ch.Trace("Peak Virtual Memory = {0} MB",
                currentProcess.PeakVirtualMemorySize64 / 1024 / 1024);
        }

        private protected bool AreSamplesWeighted(IChannel ch)
        {
            return TrainSet.SampleWeights != null;
        }

        private void InitializeEnsemble()
        {
            Ensemble = new InternalTreeEnsemble();
        }

        /// <summary>
        /// Creates a (possibly trivial) weights wrapper for gradient target values.
        /// </summary>
        private protected virtual IGradientAdjuster MakeGradientWrapper(IChannel ch)
        {
            if (AreSamplesWeighted(ch))
                return new QueryWeightsGradientWrapper();
            else
                return new TrivialGradientWrapper();
        }
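
        // Roughly speaking, the wrapper chosen above is applied to the gradient targets before a
        // tree is fit: the trivial wrapper passes them through unchanged, while the query-weights
        // wrapper scales each target by its example's weight so that weighted examples pull the
        // ensemble harder. (A rough description only; see the wrapper implementations for details.)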

#if !NO_STORE
        /// <summary>
        /// Unloads the feature bins being used in the current iteration.
        /// </summary>
        /// <param name="featureToUnload">Boolean array indicating the features to unload.</param>
        private void UnloadFeatureBins(bool[] featureToUnload)
        {
            foreach (ScoreTracker scoreTracker in this._optimizationAlgorithm.TrackedScores)
            {
                for (int i = 0; i < scoreTracker.Dataset.Features.Length; i++)
                {
                    if (featureToUnload[i])
                    {
                        // Only return buffers to the pool that were allocated using the pool.
                        // So far only the types of IntArrays below have buffer pool support.
                        // This is to avoid unexpected leaks in case a new IntArray is added but we are not allocating it from the pool.
                        if (scoreTracker.Dataset.Features[i].Bins is DenseIntArray ||
                            scoreTracker.Dataset.Features[i].Bins is DeltaSparseIntArray ||
                            scoreTracker.Dataset.Features[i].Bins is DeltaRepeatIntArray)
                        {
                            scoreTracker.Dataset.Features[i].Bins.ReturnBuffer();
                            scoreTracker.Dataset.Features[i].Bins = null;
                        }
                    }
                }
            }
        }

        /// <summary>
        /// Worker thread delegate that loads features for the next training iteration.
        /// </summary>
        /// <param name="state">Thread state object.</param>
        private void LazyFeatureLoad(object state)
        {
            bool[] featuresToLoad = (bool[])state;

            foreach (ScoreTracker scoreTracker in this._optimizationAlgorithm.TrackedScores)
            {
                for (int i = 0; i < scoreTracker.Dataset.Features.Length; i++)
                {
                    if (featuresToLoad[i])
                    {
                        // Just touching the Bins property so the feature bins are loaded into memory.
                        IntArray bins = scoreTracker.Dataset.Features[i].Bins;
                    }
                }
            }
        }

        /// <summary>
        /// Iterates through the feature sets needed in future tree training iterations (i.e. in ActiveFeatureSetQueue),
        /// using the same order as they were enqueued, and returns the initial active features based on the percentage parameter.
        /// </summary>
        /// <param name="pctFeatureThreshold">A float value between 0 and 1 indicating the maximum percentage of features to return.</param>
        /// <returns>Array indicating the calculated feature list.</returns>
        private bool[] GetNextFeaturesByThreshold(float pctFeatureThreshold)
        {
            int totalUniqueFeatureCount = 0;
            bool[] nextActiveFeatures = new bool[TrainSet.NumFeatures];

            if (pctFeatureThreshold == 1.0f)
            {
                // Return all features to load.
                return nextActiveFeatures.Select(x => true).ToArray();
            }

            int maxNumberOfFeatures = (int)(pctFeatureThreshold * TrainSet.NumFeatures);

            for (int i = 0; i < _activeFeatureSetQueue.Count; i++)
            {
                bool[] tempActiveFeatures = _activeFeatureSetQueue.ElementAt(i);

                for (int j = 0; j < tempActiveFeatures.Length; j++)
                {
                    if (tempActiveFeatures[j] && !nextActiveFeatures[j])
                    {
                        nextActiveFeatures[j] = true;
                        if (totalUniqueFeatureCount++ > maxNumberOfFeatures)
                            return nextActiveFeatures;
                    }
                }
            }

            return nextActiveFeatures;
        }
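
        // Worked example: with 10 features and pctFeatureThreshold = 0.5, maxNumberOfFeatures = 5;
        // the queue is scanned in FIFO order and the union of upcoming active-feature sets is
        // accumulated until more than 5 distinct features have been marked, which bounds how many
        // feature bins need to be resident in memory at once.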

        /// <summary>
        /// Adds several items to the ActiveFeature queue.
        /// </summary>
        /// <param name="numberOfItems">Number of items to add.</param>
        private void GenerateActiveFeatureLists(int numberOfItems)
        {
            for (int i = 0; i < numberOfItems; i++)
            {
                _activeFeatureSetQueue.Enqueue(GetActiveFeatures());
            }
        }
#endif

        private protected virtual BaggingProvider CreateBaggingProvider()
        {
            Contracts.Assert(FastTreeTrainerOptions.BaggingSize > 0);
            return new BaggingProvider(TrainSet, FastTreeTrainerOptions.NumberOfLeaves, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.BaggingExampleFraction);
        }

        private protected virtual bool ShouldRandomStartOptimizer()
        {
            return false;
        }

        private protected virtual void Train(IChannel ch)
        {
            Contracts.AssertValue(ch);
            int numTotalTrees = FastTreeTrainerOptions.NumberOfTrees;

            ch.Info(
                "Reserved memory for tree learner: {0} bytes",
                OptimizationAlgorithm.TreeLearner.GetSizeOfReservedMemory());

#if !NO_STORE
            if (_args.offloadBinsToFileStore)
            {
                // Initialize the feature percent to load before loading any features.
                _featurePercentToLoad = GetFeaturePercentInMemory(ch);
                ch.Info("Using featurePercentToLoad = {0} ", _featurePercentToLoad);
            }
#endif

            // Random starting point.
            bool revertRandomStart = false;
            if (Ensemble.NumTrees < numTotalTrees && ShouldRandomStartOptimizer())
            {
                ch.Info("Randomizing start point");
                OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.Seed, false);
                revertRandomStart = true;
            }

            ch.Info("Starting to train ...");

            BaggingProvider baggingProvider = FastTreeTrainerOptions.BaggingSize > 0 ? CreateBaggingProvider() : null;

#if OLD_DATALOAD
#if !NO_STORE
            // Preload.
            GenerateActiveFeatureLists(_args.numTrees);
            Thread featureLoadThread = null;

            // Initial feature load.
            if (_args.offloadBinsToFileStore)
            {
                FileObjectStore<IntArrayFormatter>.GetDefaultInstance().SealObjectStore();
                if (_args.preloadFeatureBinsBeforeTraining)
                {
                    StartFeatureLoadThread(GetNextFeaturesByThreshold(_featurePercentToLoad)).Join();
                }
            }
#endif
#endif

            EarlyStoppingRuleBase earlyStoppingRule = null;
            int bestIteration = 0;
            int emptyTrees = 0;
            using (var pch = Host.StartProgressChannel("FastTree training"))
            {
                pch.SetHeader(new ProgressHeader("trees"), e => e.SetProgress(0, Ensemble.NumTrees, numTotalTrees));
                while (Ensemble.NumTrees < numTotalTrees)
                {
                    ch.Trace($"numTotalTrees left: {numTotalTrees}");
                    Host.CheckAlive();
                    using (Timer.Time(TimerEvent.Iteration))
                    {
#if NO_STORE
                        bool[] activeFeatures = GetActiveFeatures();
#else
                        bool[] activeFeatures = _activeFeatureSetQueue.Dequeue();
#endif

                        if (FastTreeTrainerOptions.BaggingSize > 0 && Ensemble.NumTrees % FastTreeTrainerOptions.BaggingSize == 0)
                        {
                            baggingProvider.GenerateNewBag();
                            OptimizationAlgorithm.TreeLearner.Partitioning =
                                baggingProvider.GetCurrentTrainingPartition();
                        }

#if !NO_STORE
                        if (_args.offloadBinsToFileStore)
                        {
                            featureLoadThread = StartFeatureLoadThread(GetNextFeaturesByThreshold(_featurePercentToLoad));
                            if (!_args.preloadFeatureBinsBeforeTraining)
                                featureLoadThread.Join();
                        }
#endif

                        // Call the weak learner.
                        var tree = OptimizationAlgorithm.TrainingIteration(ch, activeFeatures);
                        if (tree == null)
                        {
                            emptyTrees++;
                            numTotalTrees--;
                        }
                        else if (FastTreeTrainerOptions.BaggingSize > 0 && Ensemble.Trees.Count() > 0)
                        {
                            ch.Assert(Ensemble.Trees.Last() == tree);
                            Ensemble.Trees.Last()
                                .AddOutputsToScores(OptimizationAlgorithm.TrainingScores.Dataset,
                                    OptimizationAlgorithm.TrainingScores.Scores,
                                    baggingProvider.GetCurrentOutOfBagPartition().Documents);
                        }

                        Host.CheckAlive();
                        CustomizedTrainingIteration(tree);

                        using (Timer.Time(TimerEvent.Test))
                        {
                            PrintIterationMessage(ch, pch);
                            PrintTestResults(ch);
                        }

                        // Revert the randomized start.
                        if (revertRandomStart)
                        {
                            revertRandomStart = false;
                            ch.Info("Reverting random score assignment");
                            OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.Seed, true);
                        }

#if !NO_STORE
                        if (_args.offloadBinsToFileStore)
                        {
                            // Unload only features that are not needed for the next iteration.
                            bool[] featuresToUnload = activeFeatures;

                            if (_args.preloadFeatureBinsBeforeTraining)
                            {
                                featuresToUnload =
                                    activeFeatures.Zip(GetNextFeaturesByThreshold(_featurePercentToLoad),
                                        (current, next) => current && !next).ToArray();
                            }

                            UnloadFeatureBins(featuresToUnload);

                            if (featureLoadThread != null &&
                                _args.preloadFeatureBinsBeforeTraining)
                            {
                                // Wait for the features needed for the next iteration to load.
                                featureLoadThread.Join();
                            }
                        }
#endif
                        if (ShouldStop(ch, ref earlyStoppingRule, ref bestIteration))
                            break;
                    }
                }

                if (emptyTrees > 0)
                {
                    ch.Warning("{0} of the boosting iterations failed to grow a tree. This is commonly because the " +
                        "minimum documents in leaf hyperparameter was set too high for this dataset.", emptyTrees);
                }
            }

            Host.CheckAlive();
            if (earlyStoppingRule != null)
            {
                Contracts.Assert(numTotalTrees == 0 || bestIteration > 0);
                // REVIEW: Need to reconcile with future progress reporting changes.
                ch.Info("The training is stopped at {0} and iteration {1} is picked",
                    Ensemble.NumTrees, bestIteration);
            }
            else
            {
                bestIteration = GetBestIteration(ch);
            }

            Host.CheckAlive();
            OptimizationAlgorithm.FinalizeLearning(bestIteration);

            Host.CheckAlive();
            Ensemble.PopulateRawThresholds(TrainSet);

            Host.CheckAlive();
            ParallelTraining.FinalizeTreeLearner();
        }

#if !NO_STORE
        /// <summary>
        /// Gets the available-bytes performance counter on the local machine.
        /// </summary>
        /// <returns>The number of available bytes.</returns>
        private float GetMachineAvailableBytes()
        {
            using (var availableBytes = new System.Diagnostics.PerformanceCounter("Memory", "Available Bytes", true))
            {
                return availableBytes.NextValue();
            }
        }
#endif

        // This method is called at the end of each training iteration, with the tree that was learnt on that iteration.
        // Note that this tree can be null if no tree was learnt this iteration.
        private protected virtual void CustomizedTrainingIteration(InternalRegressionTree tree)
        {
        }

        private protected virtual void PrintIterationMessage(IChannel ch, IProgressChannel pch)
        {
            // REVIEW: Report some metrics, not just the number of trees?
            int iteration = Ensemble.NumTrees;
            if (iteration % 50 == 49)
                pch.Checkpoint(iteration + 1);
        }
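
        // The modulus above fires at iteration = 49, 99, 149, ..., and pch.Checkpoint reports
        // iteration + 1, i.e. progress is checkpointed as 50, 100, 150, ... trees.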

        private protected virtual void PrintTestResults(IChannel ch)
        {
            if (FastTreeTrainerOptions.TestFrequency != int.MaxValue && (Ensemble.NumTrees % FastTreeTrainerOptions.TestFrequency == 0 || Ensemble.NumTrees == FastTreeTrainerOptions.NumberOfTrees))
            {
                var sb = new StringBuilder();
                using (var sw = new StringWriter(sb))
                {
                    foreach (var t in Tests)
                    {
                        var results = t.ComputeTests();
                        sw.Write(t.FormatInfoString());
                    }
                }

                if (sb.Length > 0)
                    ch.Info(sb.ToString());
            }
        }

        private protected virtual void PrintPrologInfo(IChannel ch)
        {
            Contracts.AssertValue(ch);
            ch.Trace("Host = {0}", Environment.MachineName);
            ch.Trace("CommandLine = {0}", CmdParser.GetSettings(Host, FastTreeTrainerOptions, new TOptions()));
            ch.Trace("GCSettings.IsServerGC = {0}", System.Runtime.GCSettings.IsServerGC);
            ch.Trace("{0}", FastTreeTrainerOptions);
        }

        private protected ScoreTracker ConstructScoreTracker(Dataset set)
        {
            // Find the score tracker matching this dataset; if none is found, construct one.
            ScoreTracker st = null;
            if (set == TrainSet)
                st = OptimizationAlgorithm.GetScoreTracker("train", TrainSet, InitTrainScores);
            else if (set == ValidSet)
                st = OptimizationAlgorithm.GetScoreTracker("valid", ValidSet, InitValidScores);
            else
            {
                for (int t = 0; t < TestSets.Length; ++t)
                {
                    if (set == TestSets[t])
                    {
                        double[] initTestScores = InitTestScores?[t];
                        st = OptimizationAlgorithm.GetScoreTracker(string.Format("test[{0}]", t), TestSets[t], initTestScores);
                    }
                }
            }
            Contracts.Check(st != null, "unknown dataset passed to ConstructScoreTracker");
            return st;
        }

        private double[] ComputeScoresSmart(IChannel ch, Dataset set)
        {
            if (!FastTreeTrainerOptions.CompressEnsemble)
            {
                foreach (var st in OptimizationAlgorithm.TrackedScores)
                    if (st.Dataset == set)
                    {
                        ch.Trace("Computing scores fast");
                        return st.Scores;
                    }
            }
            return ComputeScoresSlow(ch, set);
        }
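
        // The fast path above is possible because each tracked ScoreTracker is kept up to date
        // incrementally as trees are added during training. When the ensemble is compressed
        // (CompressEnsemble), those cached scores may no longer match the final model, so we
        // fall back to the full re-evaluation below.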

        private double[] ComputeScoresSlow(IChannel ch, Dataset set)
        {
            ch.Trace("Computing scores slow");
            double[] scores = new double[set.NumDocs];
            Ensemble.GetOutputs(set, scores);
            double[] initScores = GetInitScores(set);
            if (initScores != null)
            {
                Contracts.Check(scores.Length == initScores.Length, "Length of initScores and scores mismatch");
                for (int i = 0; i < scores.Length; i++)
                    scores[i] += initScores[i];
            }
            return scores;
        }

        private double[] GetInitScores(Dataset set)
        {
            if (set == TrainSet)
                return InitTrainScores;
            if (set == ValidSet)
                return InitValidScores;
            for (int i = 0; TestSets != null && i < TestSets.Length; i++)
            {
                if (set == TestSets[i])
                    return InitTestScores?[i];
            }
            throw Contracts.Except("Queried for unknown set");
        }
    }

    internal abstract class DataConverter
    {
        private protected readonly int NumFeatures;
        public abstract int NumExamples { get; }

        private protected readonly float MaxLabel;

        private protected readonly PredictionKind PredictionKind;

        /// <summary>
        /// The per-feature bin upper bounds. Implementations may differ on when all of the items
        /// in this array are initialized to non-null values, but it must happen no later
        /// than immediately after we return from <see cref="GetDataset"/>.
        /// </summary>
        public readonly double[][] BinUpperBounds;

        /// <summary>
        /// In the event that any features are filtered, this will contain the feature map, where
        /// the indices are the indices of features within the dataset and the tree as we are
        /// learning, and the values are the indices of the features within the original input
        /// data. This array is used to "rehydrate" the tree once we finish training, so that the
        /// feature indices are once again over the full set of features, as opposed to the subset
        /// of features we actually trained on. This can be null in the event that no filtering
        /// occurred.
        /// </summary>
        /// <seealso cref="InternalTreeEnsemble.RemapFeatures"/>
        public int[] FeatureMap;

        private protected readonly IHost Host;

        private protected readonly int[] CategoricalFeatureIndices;

        private protected readonly bool CategoricalSplit;

        private protected bool UsingMaxLabel
        {
            get { return MaxLabel != float.PositiveInfinity; }
        }

        private DataConverter(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel,
            PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
        {
            Contracts.AssertValue(host, "host");
            Host = host;
            Host.CheckValue(data, nameof(data));
            data.CheckFeatureFloatVector(out int featLen);
            data.CheckOptFloatWeight();
            data.CheckOptGroup();

            NumFeatures = featLen;
            if (binUpperBounds != null)
            {
                Host.AssertValue(binUpperBounds);
                Host.Assert(Utils.Size(binUpperBounds) == NumFeatures);
                Host.Assert(binUpperBounds.All(b => b != null));
                BinUpperBounds = binUpperBounds;
            }
            else
                BinUpperBounds = new double[NumFeatures][];
            MaxLabel = maxLabel;
            PredictionKind = kind;
            CategoricalSplit = categoricalSplit;
            CategoricalFeatureIndices = categoricalFeatureIndices;
        }

        public static DataConverter Create(RoleMappedData data, IHost host, int maxBins,
            float maxLabel, bool diskTranspose, bool noFlocks, int minDocsPerLeaf, PredictionKind kind,
            IParallelTraining parallelTraining, int[] categoricalFeatureIndices, bool categoricalSplit)
        {
            Contracts.AssertValue(host, "host");
            host.AssertValue(data);
            host.Assert(maxBins > 0);
            DataConverter conv;
            using (var ch = host.Start("CreateConverter"))
            {
                if (!diskTranspose)
                    conv = new MemImpl(data, host, maxBins, maxLabel, noFlocks, minDocsPerLeaf, kind,
                        parallelTraining, categoricalFeatureIndices, categoricalSplit);
                else
                    conv = new DiskImpl(data, host, maxBins, maxLabel, kind, parallelTraining, categoricalFeatureIndices, categoricalSplit);
            }
            return conv;
        }

        public static DataConverter Create(RoleMappedData data, IHost host, double[][] binUpperBounds,
            float maxLabel, bool diskTranspose, bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
        {
            Contracts.AssertValue(host, "host");
            host.AssertValue(data);
            DataConverter conv;
            using (var ch = host.Start("CreateConverter"))
            {
                if (!diskTranspose)
                    conv = new MemImpl(data, host, binUpperBounds, maxLabel, noFlocks, kind, categoricalFeatureIndices, categoricalSplit);
                else
                    conv = new DiskImpl(data, host, binUpperBounds, maxLabel, kind, categoricalFeatureIndices, categoricalSplit);
            }
            return conv;
        }

        public abstract Dataset GetDataset();

        /// <summary>
        /// Bins an input vector of feature values.
        /// </summary>
        /// <param name="binFinder">The instance of the bin finder to use.</param>
        /// <param name="values">The values for one particular feature across all examples.</param>
        /// <param name="maxBins">The maximum number of bins to find.</param>
        /// <param name="minDocsPerLeaf">The minimum number of documents per leaf.</param>
        /// <param name="upperBounds">The bin upper bounds, maximum length will be <paramref name="maxBins"/>.</param>
        /// <returns>Whether finding the bins was successful or not. It will be unsuccessful iff <paramref name="values"/>
        /// has any missing values. In that event, the out parameters will be left as null.</returns>
        private protected static bool CalculateBins(BinFinder binFinder, in VBuffer<double> values, int maxBins, int minDocsPerLeaf,
            out double[] upperBounds)
        {
            return binFinder.FindBins(in values, maxBins, minDocsPerLeaf, out upperBounds);
        }

        private static IEnumerable<KeyValuePair<int, int>> NonZeroBinnedValuesForSparse(ReadOnlySpan<double> values, ReadOnlySpan<int> indices, double[] binUpperBounds)
        {
            Contracts.Assert(values.Length == indices.Length);
            Contracts.Assert(Algorithms.FindFirstGE(binUpperBounds, 0) == 0);
            var result = new List<KeyValuePair<int, int>>();
            for (int i = 0; i < values.Length; ++i)
            {
                int ge = Algorithms.FindFirstGE(binUpperBounds, values[i]);
                if (ge != 0)
                    result.Add(new KeyValuePair<int, int>(indices[i], ge));
            }
            return result;
        }
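
        // Binning is a lower-bound search: FindFirstGE returns the first bin whose upper bound is
        // greater than or equal to the value. For example, with binUpperBounds = { 0, 1.5, 3 }:
        // value 0 -> bin 0, value 1.2 -> bin 1, value 2.8 -> bin 2. The assert above demands that
        // zero map to bin 0, which is exactly what lets the implicit zeros of a sparse vector be
        // skipped here.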

        private FeatureFlockBase CreateOneHotFlock(IChannel ch,
            List<int> features, int[] binnedValues, int[] lastOn, ValuesList[] instanceList,
            ref int[] forwardIndexerWork, ref VBuffer<double> temp, bool categorical)
        {
            Contracts.AssertValue(ch);
            ch.Assert(0 <= features.Min() && features.Max() < NumFeatures);
            ch.Assert(features.Count > 0);

            if (features.Count == 1)
            {
                // Singleton.
                int fi = features[0];
                var values = instanceList[fi];
                values.CopyTo(NumExamples, ref temp);
                return CreateSingletonFlock(ch, in temp, binnedValues, BinUpperBounds[fi]);
            }
            // Multiple features: one-hot.
            int[] hotFeatureStarts = new int[features.Count + 1];
            // The position 0 is reserved as the "cold" position for all features in the slot.
            // This corresponds to all features being in their first bin (for example, cold). So the
            // first feature's "hotness" starts at 1. HOWEVER, for the purpose of defining the
            // bins, we start with this array computed off by one. Once we define the bins, we
            // will correct it.
            hotFeatureStarts[0] = 0;
            // There are as many hot positions per feature as there are bin upper
            // bounds, minus 1. (The first bin is the "cold" position.)
            for (int i = 1; i < hotFeatureStarts.Length; ++i)
                hotFeatureStarts[i] = hotFeatureStarts[i - 1] + BinUpperBounds[features[i - 1]].Length - 1;
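
            // For example, two features with 3 and 4 bin upper bounds respectively (2 and 3 "hot"
            // bins each) give hotFeatureStarts = { 0, 2, 5 } at this point; the post-binning
            // correction below turns it into { 1, 3, 6 }.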
            IntArrayBits flockBits = IntArray.NumBitsNeeded(hotFeatureStarts[hotFeatureStarts.Length - 1] + 1);

            int min = features[0];
            int lim = features[features.Count - 1] + 1;
            var ind = new ValuesList.ForwardIndexer(instanceList, features.ToArray(), ref forwardIndexerWork);
            int[] f2sf = Utils.CreateArray(lim - min, -1);
            for (int i = 0; i < features.Count; ++i)
                f2sf[features[i] - min] = i;

            int hotCount = 0;
            for (int i = 0; i < lastOn.Length; ++i)
            {
                int fi = lastOn[i];
                if (fi < min || fi >= lim)
                {
                    // All of the features would bin to 0, so we're in the "cold" position.
                    binnedValues[i] = 0;
#if false // This would be a very nice test to have, but for some situations it's too slow, even for debug builds. Consider reactivating temporarily if actively working on flocks.
                    // Assert that all the features really would be cold for this position.
                    Contracts.Assert(Enumerable.Range(min, lim - min).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
                    continue;
                }
                ch.Assert(min <= fi && fi < lim);
                int subfeature = f2sf[fi - min];
                ch.Assert(subfeature >= 0);
                double val = ind[subfeature, i];
#if false // Same note, too slow even for debug builds.
                // Assert that all the other features really would be cold for this position.
                Contracts.Assert(Enumerable.Range(min, fi - min).Concat(Enumerable.Range(fi + 1, lim - (fi + 1))).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
                double[] bub = BinUpperBounds[fi];
                ch.Assert(bub.Length > 1);
                int bin = Algorithms.FindFirstGE(bub, val);
                ch.Assert(0 < bin && bin < bub.Length); // If 0, it should not have been considered "on", so what the heck?
                binnedValues[i] = hotFeatureStarts[subfeature] + bin;
                hotCount++;
            }
#if DEBUG
            int limBin = (1 << (int)flockBits);
            Contracts.Assert(flockBits == IntArrayBits.Bits32 || binnedValues.All(b => b < limBin));
#endif
            // Correct the hot feature starts now that we're done binning.
            for (int f = 0; f < hotFeatureStarts.Length; ++f)
                hotFeatureStarts[f]++;
            // Construct the int array of binned values.
            const double sparsifyThreshold = 0.7;

            IntArrayType type = hotCount < (1 - sparsifyThreshold) * NumExamples
                ? IntArrayType.Sparse
                : IntArrayType.Dense;
            IntArray bins = IntArray.New(NumExamples, type, flockBits, binnedValues);

            var bups = features.Select(fi => BinUpperBounds[fi]).ToArray(features.Count);
            return new OneHotFeatureFlock(bins, hotFeatureStarts, bups, categorical);
        }

        private FeatureFlockBase CreateOneHotFlockCategorical(IChannel ch,
            List<int> features, int[] binnedValues, int[] lastOn, bool categorical)
        {
            Contracts.AssertValue(ch);
            ch.Assert(0 <= features.Min() && features.Max() < NumFeatures);
            ch.Assert(features.Count > 1);

            // Multiple features: one-hot.
            int[] hotFeatureStarts = new int[features.Count + 1];
            // The position 0 is reserved as the "cold" position for all features in the slot.
            // This corresponds to all features being in their first bin (for example, cold). So the
            // first feature's "hotness" starts at 1. HOWEVER, for the purpose of defining the
            // bins, we start with this array computed off by one. Once we define the bins, we
            // will correct it.
            hotFeatureStarts[0] = 0;
            // There are as many hot positions per feature as there are bin upper
            // bounds, minus 1. (The first bin is the "cold" position.)
            for (int i = 1; i < hotFeatureStarts.Length; ++i)
                hotFeatureStarts[i] = hotFeatureStarts[i - 1] + BinUpperBounds[features[i - 1]].Length - 1;
            IntArrayBits flockBits = IntArray.NumBitsNeeded(hotFeatureStarts[hotFeatureStarts.Length - 1] + 1);

            int min = features[0];
            int lim = features[features.Count - 1] + 1;
            int[] f2sf = Utils.CreateArray(lim - min, -1);
            for (int i = 0; i < features.Count; ++i)
                f2sf[features[i] - min] = i;

            int hotCount = 0;
            for (int i = 0; i < lastOn.Length; ++i)
            {
                int fi = lastOn[i];
                if (fi < min || fi >= lim)
                {
                    // All of the features would bin to 0, so we're in the "cold" position.
                    binnedValues[i] = 0;
#if false // This would be a very nice test to have, but for some situations it's too slow, even for debug builds. Consider reactivating temporarily if actively working on flocks.
                    // Assert that all the features really would be cold for this position.
                    Contracts.Assert(Enumerable.Range(min, lim - min).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
                    continue;
                }
                ch.Assert(min <= fi && fi < lim);
                int subfeature = f2sf[fi - min];
                ch.Assert(subfeature >= 0);
#if false // Same note, too slow even for debug builds.
                // Assert that all the other features really would be cold for this position.
                Contracts.Assert(Enumerable.Range(min, fi - min).Concat(Enumerable.Range(fi + 1, lim - (fi + 1))).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
                double[] bub = BinUpperBounds[fi];
                ch.Assert(bub.Length == 2);
                // REVIEW: Leaving out the check of the value, to reduce memory consumption, and taking a
                // leap of faith based on what the user told us.
                binnedValues[i] = hotFeatureStarts[subfeature] + 1;
                hotCount++;
            }
#if DEBUG
            int limBin = (1 << (int)flockBits);
            Contracts.Assert(flockBits == IntArrayBits.Bits32 || binnedValues.All(b => b < limBin));
#endif
            // Correct the hot feature starts now that we're done binning.
            for (int f = 0; f < hotFeatureStarts.Length; ++f)
                hotFeatureStarts[f]++;
            // Construct the int array of binned values.
            const double sparsifyThreshold = 0.7;

            IntArrayType type = hotCount < (1 - sparsifyThreshold) * NumExamples
                ? IntArrayType.Sparse
                : IntArrayType.Dense;
            IntArray bins = IntArray.New(NumExamples, type, flockBits, binnedValues);

            var bups = features.Select(fi => BinUpperBounds[fi]).ToArray(features.Count);
            return new OneHotFeatureFlock(bins, hotFeatureStarts, bups, categorical);
        }

        /// <summary>
        /// Create a new feature flock with a given name, values and specified bin bounds.
        /// </summary>
        /// <param name="ch">The channel to log to.</param>
        /// <param name="values">The values for this feature, that will be binned.</param>
        /// <param name="binnedValues">A working array of length equal to the length of the input feature vector.</param>
        /// <param name="binUpperBounds">The upper bounds of the binning of this feature.</param>
        /// <returns>A binned derived feature vector.</returns>
        private protected static SingletonFeatureFlock CreateSingletonFlock(IChannel ch, in VBuffer<double> values, int[] binnedValues,
            double[] binUpperBounds)
        {
            Contracts.AssertValue(ch);
            ch.Assert(Utils.Size(binUpperBounds) > 0);
            ch.AssertValue(binnedValues);
            ch.Assert(binnedValues.Length == values.Length);

            // TODO: Consider trying to speed up FindFirstGE by making a "map" like is done in the fastrank code.
            // TODO: Cache binnedValues.
            int zeroBin = Algorithms.FindFirstGE(binUpperBounds, 0);
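
            // zeroBin is the bin that a zero value (implicit or explicit) falls into. For example,
            // binUpperBounds = { -1, 0.5, 3 } gives zeroBin = 1, while any bound set whose first
            // upper bound is >= 0 gives zeroBin = 0, enabling the sparse fast path below to leave
            // implicit zeros untouched.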

            // TODO: Make this a settable parameter / use the sparsifyThreshold already in the parameters.
            const double sparsifyThreshold = 0.7;

            IntArray bins = null;

            var valuesValues = values.GetValues();
            var numBitsNeeded = IntArray.NumBitsNeeded(binUpperBounds.Length);
            if (numBitsNeeded == IntArrayBits.Bits0)
                bins = new Dense0BitIntArray(values.Length);
            else if (!values.IsDense && zeroBin == 0 && valuesValues.Length < (1 - sparsifyThreshold) * values.Length)
            {
                // Special code to go straight from our own sparse format to a sparse IntArray.
                // Note: requires zeroBin to be 0 because that's what's assumed in the FastTree code.
                var nonZeroValues = NonZeroBinnedValuesForSparse(valuesValues, values.GetIndices(), binUpperBounds);
                bins = new DeltaSparseIntArray(values.Length, numBitsNeeded, nonZeroValues);
            }
            else
            {
                // Fill the binnedValues array and convert using the normal IntArray code.
                int firstBinCount = 0;
                if (!values.IsDense)
                {
                    if (zeroBin != 0)
                    {
                        for (int i = 0; i < values.Length; i++)
                            binnedValues[i] = zeroBin;
                    }
                    else
                        Array.Clear(binnedValues, 0, values.Length);
                    var valuesIndices = values.GetIndices();
                    for (int i = 0; i < valuesValues.Length; ++i)
                    {
                        if ((binnedValues[valuesIndices[i]] = Algorithms.FindFirstGE(binUpperBounds, valuesValues[i])) == 0)
                            firstBinCount++;
                    }
                    if (zeroBin == 0)
                        firstBinCount += values.Length - valuesValues.Length;
                }
                else
                {
                    for (int i = 0; i < valuesValues.Length; i++)
                    {
                        if (valuesValues[i] == 0)
                            binnedValues[i] = zeroBin;
                        else
                            binnedValues[i] = Algorithms.FindFirstGE(binUpperBounds, valuesValues[i]);
                        if (binnedValues[i] == 0)
                            firstBinCount++;
                    }
                }
                // This sparsity check came from the FastRank code.
                double firstBinFrac = (double)firstBinCount / binnedValues.Length;
                IntArrayType arrayType = firstBinFrac > sparsifyThreshold ? IntArrayType.Sparse : IntArrayType.Dense;
                bins = IntArray.New(values.Length, arrayType, IntArray.NumBitsNeeded(binUpperBounds.Length), binnedValues);
            }
            return new SingletonFeatureFlock(bins, binUpperBounds);
        }

        private sealed class DiskImpl : DataConverter
        {
            private readonly int _numExamples;
            private readonly Dataset _dataset;

            public override int NumExamples { get { return _numExamples; } }

            public DiskImpl(RoleMappedData data, IHost host, int maxBins, float maxLabel, PredictionKind kind,
                IParallelTraining parallelTraining, int[] categoricalFeatureIndices, bool categoricalSplit)
                : base(data, host, null, maxLabel, kind, categoricalFeatureIndices, categoricalSplit)
            {
                // Use parallel training for the training data.
                Host.AssertValue(parallelTraining);
                _dataset = Construct(data, ref _numExamples, maxBins, parallelTraining);
            }

            public DiskImpl(RoleMappedData data, IHost host,
                double[][] binUpperBounds, float maxLabel, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
                : base(data, host, binUpperBounds, maxLabel, kind, categoricalFeatureIndices, categoricalSplit)
            {
                _dataset = Construct(data, ref _numExamples, -1, null);
            }

            public override Dataset GetDataset()
            {
                return _dataset;
            }

            private static int AddColumnIfNeeded(DataViewSchema.Column? info, List<int> toTranspose)
            {
                if (!info.HasValue)
                    return -1;
                // It is entirely possible that a single column could have two roles,
                // and so be added twice, but this case is handled by the transposer.
                var idx = info.Value.Index;
                toTranspose.Add(idx);
                return idx;
            }

            private ValueMapper<VBuffer<T1>, VBuffer<T2>> GetCopier<T1, T2>(DataViewType itemType1, DataViewType itemType2)
            {
                var conv = Conversions.DefaultInstance.GetStandardConversion<T1, T2>(itemType1, itemType2, out bool identity);
                if (identity)
                {
                    ValueMapper<VBuffer<T1>, VBuffer<T1>> identityResult =
                        (in VBuffer<T1> src, ref VBuffer<T1> dst) => src.CopyTo(ref dst);
                    return (ValueMapper<VBuffer<T1>, VBuffer<T2>>)(object)identityResult;
                }
                return
                    (in VBuffer<T1> src, ref VBuffer<T2> dst) =>
                    {
                        var srcValues = src.GetValues();
                        var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length);
                        if (srcValues.Length > 0)
                        {
                            if (!src.IsDense)
                            {
                                src.GetIndices().CopyTo(editor.Indices);
                            }
                            for (int i = 0; i < srcValues.Length; ++i)
                                conv(in srcValues[i], ref editor.Values[i]);
                        }
                        dst = editor.Commit();
                    };
            }

            private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxBins, IParallelTraining parallelTraining)
            {
                Host.CheckAlive();
                Host.AssertValue(examples);
                Host.Assert(examples.Schema.Feature.HasValue);

                if (parallelTraining == null)
                    Host.AssertValue(BinUpperBounds);

                Dataset result;
                using (var ch = Host.Start("Conversion"))
                {
                    // Add a missing value filter on the features.
                    // REVIEW: Possibly filter out missing labels, but we don't do this in the current FastTree conversion.
                    //var missingArgs = new MissingValueFilter.Arguments();
                    //missingArgs.column = new string[] { examples.Schema.Feature.Name };
                    //IDataView data = new MissingValueFilter(missingArgs, Host, examples.Data);
                    IDataView data = examples.Data;

                    // Convert the label column, if one exists.
                    var labelName = examples.Schema.Label?.Name;
                    if (labelName != null)
                    {
                        var convArgs = new LabelConvertTransform.Arguments();
                        var convCol = new LabelConvertTransform.Column() { Name = labelName, Source = labelName };
                        convArgs.Columns = new LabelConvertTransform.Column[] { convCol };
                        data = new LabelConvertTransform(Host, convArgs, data);
                    }
                    // Convert the group column, if one exists.
                    if (examples.Schema.Group?.Name is string groupName)
                        data = new TypeConvertingTransformer(Host, new TypeConvertingEstimator.ColumnOptions(groupName, DataKind.UInt64, groupName)).Transform(data);

                    // Since we've passed it through a few transforms, reconstitute the mapping on the
                    // newly transformed data.
                    examples = new RoleMappedData(data, examples.Schema.GetColumnRoleNames());

                    // Get the index of the columns in the transposed view, while we're at it composing
                    // the list of the columns we want to transpose.
                    var toTranspose = new List<int>();
                    int featIdx = AddColumnIfNeeded(examples.Schema.Feature, toTranspose);
                    int labelIdx = AddColumnIfNeeded(examples.Schema.Label, toTranspose);
                    int groupIdx = AddColumnIfNeeded(examples.Schema.Group, toTranspose);
                    int weightIdx = AddColumnIfNeeded(examples.Schema.Weight, toTranspose);
                    Host.Assert(1 <= toTranspose.Count && toTranspose.Count <= 4);
                    ch.Info("Changing data from row-wise to column-wise on disk");
                    // Note that if these columns are already transposed, then this will be a no-op.
                    using (Transposer trans = Transposer.Create(Host, data, false, toTranspose.ToArray()))
                    {
                        VBuffer<float> temp = default(VBuffer<float>);
                        // Construct the derived features.
                        var features = new FeatureFlockBase[NumFeatures];
                        BinFinder finder = new BinFinder();
                        FeaturesToContentMap fmap = new FeaturesToContentMap(examples.Schema);

                        var hasMissingPred = Conversions.DefaultInstance.GetHasMissingPredicate<float>(((ITransposeDataView)trans).GetSlotType(featIdx));
                        // There is no good mechanism to filter out rows with missing feature values on transposed data.
                        // So, we instead perform one featurization pass which, if successful, will remain one pass but,
                        // if we ever encounter missing values, will become a "detect missing features" pass, which will
                        // in turn inform a necessary secondary featurization pass.
                        SlotDropper slotDropper = null;
                        bool[] localConstructBinFeatures = Utils.CreateArray<bool>(NumFeatures, true);

                        if (parallelTraining != null)
                            localConstructBinFeatures = parallelTraining.GetLocalBinConstructionFeatures(NumFeatures);

                        using (var pch = Host.StartProgressChannel("FastTree disk-based bins initialization"))
                        {
                            for (; ; )
                            {
                                bool hasMissing = false;
                                using (var cursor = trans.GetSlotCursor(featIdx))
                                {
                                    HashSet<int> constructed = new HashSet<int>();
                                    var getter = SubsetGetter(cursor.GetGetter<float>(), slotDropper);
                                    numExamples = slotDropper?.DstLength ?? trans.RowCount;

                                    // Perhaps we should change the binning to just work over singles.
                                    VBuffer<double> doubleTemp = default(VBuffer<double>);
                                    var copier = GetCopier<float, double>(NumberDataViewType.Single, NumberDataViewType.Double);
                                    int iFeature = 0;
                                    pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, features.Length));
                                    while (cursor.MoveNext())
                                    {
                                        Host.CheckAlive();
                                        iFeature = cursor.SlotIndex;
                                        if (!localConstructBinFeatures[iFeature])
                                            continue;

                                        Host.Assert(iFeature < features.Length);
                                        Host.Assert(features[iFeature] == null);
                                        getter(ref temp);
                                        Host.Assert(temp.Length == numExamples);

                                        // First get the bin bounds, constructing them if they do not exist.
                                        if (BinUpperBounds[iFeature] == null)
                                        {
                                            constructed.Add(iFeature);
                                            ch.Assert(maxBins > 0);
                                            finder = finder ?? new BinFinder();
                                            // Must copy over, as bin calculation is potentially destructive.
                                            copier(in temp, ref doubleTemp);
                                            hasMissing = !CalculateBins(finder, in doubleTemp, maxBins, 0,
                                                out BinUpperBounds[iFeature]);
                                        }
                                        else
                                            hasMissing = hasMissingPred(in temp);

                                        if (hasMissing)
                                        {
                                            // Let's just be a little extra safe, since it's so easy to check, and the results if there
                                            // is a bug in the upstream pipeline would be very severe.
                                            ch.Check(slotDropper == null,
                                                "Multiple passes over the data seem to be producing different data. There is a bug in the upstream pipeline.");

                                            // Destroy any constructed bin upper bounds. We'll calculate them over the next pass.
                                            foreach (var i in constructed)
                                                BinUpperBounds[i] = null;
                                            // Determine what rows have missing values.
                                            slotDropper = ConstructDropSlotRanges(cursor, getter, ref temp);
                                            ch.Assert(slotDropper.DstLength < temp.Length);
                                            ch.Warning("{0} of {1} examples will be skipped due to missing feature values",
                                                temp.Length - slotDropper.DstLength, temp.Length);

                                            break;
                                        }
                                        Host.AssertValue(BinUpperBounds[iFeature]);
                                    }
                                }
                                if (hasMissing == false)
                                    break;
                            }

                            // Sync up the global boundaries.
                            if (parallelTraining != null)
                                parallelTraining.SyncGlobalBoundary(NumFeatures, maxBins, BinUpperBounds);
|
|
|
|
List<FeatureFlockBase> flocks = new List<FeatureFlockBase>();
|
|
using (var cursor = trans.GetSlotCursor(featIdx))
|
|
using (var catCursor = trans.GetSlotCursor(featIdx))
|
|
{
|
|
var getter = SubsetGetter(cursor.GetGetter<float>(), slotDropper);
|
|
var catGetter = SubsetGetter(catCursor.GetGetter<float>(), slotDropper);
|
|
numExamples = slotDropper?.DstLength ?? trans.RowCount;
|
|
|
|
// Perhaps we should change the binning to just work over singles.
|
|
VBuffer<double> doubleTemp = default(VBuffer<double>);
|
|
|
|
int[] binnedValues = new int[numExamples];
|
|
var copier = GetCopier<float, double>(NumberDataViewType.Single, NumberDataViewType.Double);
|
|
int iFeature = 0;
|
|
if (CategoricalSplit && CategoricalFeatureIndices != null)
|
|
{
|
|
int[] lastOn = new int[NumExamples];
|
|
for (int i = 0; i < lastOn.Length; ++i)
|
|
lastOn[i] = -1;
|
|
List<int> pending = new List<int>();
|
|
int catRangeIndex = 0;
|
|
for (iFeature = 0; iFeature < NumFeatures;)
|
|
{
|
|
Host.CheckAlive();
|
|
|
|
if (catRangeIndex < CategoricalFeatureIndices.Length &&
|
|
CategoricalFeatureIndices[catRangeIndex] == iFeature)
|
|
{
|
|
pending.Clear();
|
|
bool oneHot = true;
|
|
for (int iFeatureLocal = iFeature;
|
|
iFeatureLocal <= CategoricalFeatureIndices[catRangeIndex + 1];
|
|
++iFeatureLocal)
|
|
{
|
|
double[] bup = BinUpperBounds[iFeatureLocal];
|
|
if (bup.Length == 1)
|
|
{
|
|
// This is a trivial feature. Skip it.
|
|
continue;
|
|
}
|
|
Contracts.Assert(Utils.Size(bup) > 0);
|
|
|
|
double firstBin = bup[0];
|
|
GetFeatureValues(catCursor, iFeatureLocal, catGetter, ref temp, ref doubleTemp, copier);
|
|
bool add = false;
|
|
var doubleTempValues = doubleTemp.GetValues();
|
|
var doubleTempIndices = doubleTemp.GetIndices();
|
|
for (int index = 0; index < doubleTempValues.Length; ++index)
|
|
{
|
|
if (doubleTempValues[index] <= firstBin)
|
|
continue;
|
|
|
|
int iindex = doubleTemp.IsDense ? index : doubleTempIndices[index];
|
|
int last = lastOn[iindex];
|
|
|
|
if (doubleTempValues[index] != 1 || (last != -1 && last >= iFeature))
|
|
{
|
|
catRangeIndex += 2;
|
|
pending.Clear();
|
|
oneHot = false;
|
|
break;
|
|
}
|
|
|
|
lastOn[iindex] = iFeatureLocal;
|
|
add = true;
|
|
}
|
|
|
|
if (!oneHot)
|
|
break;
|
|
|
|
if (add)
|
|
pending.Add(iFeatureLocal);
|
|
}
|
|
|
|
if (!oneHot)
|
|
continue;
|
|
|
|
if (pending.Count > 0)
|
|
{
|
|
flocks.Add(CreateOneHotFlockCategorical(ch, pending, binnedValues,
|
|
lastOn, true));
|
|
}
|
|
iFeature = CategoricalFeatureIndices[catRangeIndex + 1] + 1;
|
|
catRangeIndex += 2;
|
|
}
|
|
else
|
|
{
|
|
GetFeatureValues(cursor, iFeature, getter, ref temp, ref doubleTemp, copier);
|
|
double[] upperBounds = BinUpperBounds[iFeature++];
|
|
Host.AssertValue(upperBounds);
|
|
if (upperBounds.Length == 1)
|
|
continue; //trivial feature, skip it.
|
|
|
|
flocks.Add(CreateSingletonFlock(ch, in doubleTemp, binnedValues, upperBounds));
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (int i = 0; i < NumFeatures; i++)
|
|
{
|
|
Host.CheckAlive();
|
|
GetFeatureValues(cursor, i, getter, ref temp, ref doubleTemp, copier);
|
|
double[] upperBounds = BinUpperBounds[i];
|
|
Host.AssertValue(upperBounds);
|
|
if (upperBounds.Length == 1)
|
|
continue; //trivial feature, skip it.
|
|
|
|
flocks.Add(CreateSingletonFlock(ch, in doubleTemp, binnedValues, upperBounds));
|
|
}
|
|
}
|
|
|
|
Contracts.Assert(FeatureMap == null);
|
|
|
|
FeatureMap = Enumerable.Range(0, NumFeatures).Where(f => BinUpperBounds[f].Length > 1).ToArray();
|
|
features = flocks.ToArray();
|
|
}
|
|
}
|
|
|
|
// Construct the labels.
|
|
short[] ratings = new short[numExamples];
|
|
double[] actualLabels = new double[numExamples];
|
|
|
|
if (labelIdx >= 0)
|
|
{
|
|
trans.GetSingleSlotValue<float>(labelIdx, ref temp);
|
|
slotDropper?.DropSlots(ref temp, ref temp);
|
|
|
|
var tempValues = temp.GetValues();
|
|
var tempIndices = temp.GetIndices();
|
|
for (int i = 0; i < tempValues.Length; ++i)
|
|
{
|
|
int ii = temp.IsDense ? i : tempIndices[i];
|
|
var label = tempValues[i];
|
|
if (UsingMaxLabel && !(0 <= label && label <= MaxLabel))
|
|
throw Host.Except("Found invalid label {0}. Value should be between 0 and {1}, inclusive.", label, MaxLabel);
|
|
ratings[ii] = (short)label;
|
|
actualLabels[ii] = (double)label;
|
|
}
|
|
}
|
|
|
|
// Construct the boundaries and query IDs.
|
|
int[] boundaries;
|
|
ulong[] qids;
|
|
if (PredictionKind == PredictionKind.Ranking)
|
|
{
|
|
if (groupIdx < 0)
|
|
throw ch.Except("You need to provide {0} column for Ranking problem", DefaultColumnNames.GroupId);
|
|
VBuffer<ulong> groupIds = default(VBuffer<ulong>);
|
|
trans.GetSingleSlotValue<ulong>(groupIdx, ref groupIds);
|
|
slotDropper?.DropSlots(ref groupIds, ref groupIds);
|
|
|
|
ConstructBoundariesAndQueryIds(in groupIds, out boundaries, out qids);
|
|
}
|
|
else
|
|
{
|
|
if (groupIdx >= 0)
|
|
ch.Warning("This is not ranking problem, Group Id '{0}' column will be ignored", examples.Schema.Group.Value.Name);
|
|
const int queryChunkSize = 100;
|
|
qids = new ulong[(numExamples - 1) / queryChunkSize + 1];
|
|
boundaries = new int[qids.Length + 1];
|
|
for (int i = 0; i < qids.Length; ++i)
|
|
{
|
|
qids[i] = (ulong)i;
|
|
boundaries[i + 1] = boundaries[i] + queryChunkSize;
|
|
}
|
|
boundaries[boundaries.Length - 1] = numExamples;
|
|
}
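                        // Illustrative note (not from the original source): with numExamples = 250 and
                        // queryChunkSize = 100, the synthetic grouping above yields qids = { 0, 1, 2 }
                        // and boundaries = { 0, 100, 200, 250 }, i.e. query i spans the row range
                        // [boundaries[i], boundaries[i + 1]).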
                        // Construct the doc IDs. Doesn't really matter what these are.
                        ulong[] dids = Enumerable.Range(0, numExamples).Select(d => (ulong)d).ToArray(numExamples);

                        var skeleton = new Dataset.DatasetSkeleton(ratings, boundaries, qids, dids, new double[0][], actualLabels);

                        Host.Assert(features.All(f => f != null));
                        result = new Dataset(skeleton, features);
                    }
                }
                return result;
            }

            private void GetFeatureValues(SlotCursor cursor, int iFeature, ValueGetter<VBuffer<float>> getter,
                ref VBuffer<float> temp, ref VBuffer<double> doubleTemp, ValueMapper<VBuffer<float>, VBuffer<double>> copier)
            {
                while (cursor.MoveNext())
                {
                    Contracts.Assert(iFeature >= cursor.SlotIndex);

                    if (iFeature == cursor.SlotIndex)
                        break;
                }

                Contracts.Assert(cursor.SlotIndex == iFeature);

                getter(ref temp);
                copier(in temp, ref doubleTemp);
            }

            private static ValueGetter<VBuffer<T>> SubsetGetter<T>(ValueGetter<VBuffer<T>> getter, SlotDropper slotDropper)
            {
                if (slotDropper == null)
                    return getter;

                return slotDropper.SubsetGetter(getter);
            }

            /// <summary>
            /// Returns a slot dropper object that has ranges of slots to be dropped,
            /// based on an examination of the feature values.
            /// </summary>
            private static SlotDropper ConstructDropSlotRanges(SlotCursor cursor,
                ValueGetter<VBuffer<float>> getter, ref VBuffer<float> temp)
            {
                // The iteration here is slightly different from a usual cursor iteration. Here, temp
                // already holds the value at the cursor's current position, which we don't want to
                // re-fetch, since the cursor has already been advanced.
                Contracts.Assert(cursor.SlotIndex >= 0);
                BitArray rowHasMissing = new BitArray(temp.Length);
                for (; ; )
                {
                    foreach (var kv in temp.Items())
                    {
                        if (float.IsNaN(kv.Value))
                            rowHasMissing.Set(kv.Key, true);
                    }
                    if (!cursor.MoveNext())
                        break;
                    getter(ref temp);
                }

                List<int> minSlots = new List<int>();
                List<int> maxSlots = new List<int>();
                bool previousBit = false;
                for (int i = 0; i < rowHasMissing.Length; i++)
                {
                    bool currentBit = rowHasMissing.Get(i);
                    if (currentBit && !previousBit)
                    {
                        minSlots.Add(i);
                        maxSlots.Add(i);
                    }
                    else if (currentBit)
                        maxSlots[maxSlots.Count - 1] = i;

                    previousBit = currentBit;
                }

                Contracts.Assert(maxSlots.Count == minSlots.Count);

                return new SlotDropper(temp.Length, minSlots.ToArray(), maxSlots.ToArray());
            }
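            // Illustrative note (not from the original source): if rows 2, 3 and 7 contain NaNs,
            // the scan above produces minSlots = { 2, 7 } and maxSlots = { 3, 7 }, so the
            // SlotDropper removes the inclusive ranges [2, 3] and [7, 7] from every column.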

            private static void ConstructBoundariesAndQueryIds(in VBuffer<ulong> groupIds, out int[] boundariesArray, out ulong[] qidsArray)
            {
                List<ulong> qids = new List<ulong>();
                List<int> boundaries = new List<int>();

                ulong last = 0;
                if (groupIds.Length > 0)
                    groupIds.GetItemOrDefault(0, ref last);
                int count = 0;
                foreach (ulong groupId in groupIds.DenseValues())
                {
                    if (count == 0 || last != groupId)
                    {
                        qids.Add(last = groupId);
                        boundaries.Add(count);
                    }
                    count++;
                }
                boundaries.Add(count);
                qidsArray = qids.ToArray();
                boundariesArray = boundaries.ToArray();
            }
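            // Illustrative note (not from the original source): for group ids { 3, 3, 7, 7, 7, 9 }
            // the loop above yields qids = { 3, 7, 9 } and boundaries = { 0, 2, 5, 6 }, so query q
            // occupies rows [boundaries[q], boundaries[q + 1]).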
        }

        // REVIEW: Our data conversion is extremely inefficient. Fix it!
        private sealed class MemImpl : DataConverter
        {
            private readonly RoleMappedData _data;

            // _instanceList[feature] is the vector of values for the given feature.
            private readonly ValuesList[] _instanceList;

            private readonly List<short> _targetsList;
            private readonly List<double> _actualTargets;
            private readonly List<double> _weights;
            private readonly List<int> _boundaries;
            private readonly long _numMissingInstances;
            private readonly int _numExamples;
            private readonly bool _noFlocks;
            private readonly int _minDocsPerLeaf;

            public override int NumExamples
            {
                get { return _numExamples; }
            }

            private MemImpl(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel, bool dummy,
                bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
                : base(data, host, binUpperBounds, maxLabel, kind, categoricalFeatureIndices, categoricalSplit)
            {
                _data = data;
                // Array of List<double> objects for each feature, containing values for that feature over all rows.
                _instanceList = new ValuesList[NumFeatures];
                for (int i = 0; i < _instanceList.Length; i++)
                    _instanceList[i] = new ValuesList();
                // Labels.
                _targetsList = new List<short>();
                _actualTargets = new List<double>();
                _weights = data.Schema.Weight != null ? new List<double>() : null;
                _boundaries = new List<int>();
                _noFlocks = noFlocks;

                MakeBoundariesAndCheckLabels(out _numMissingInstances, out long numInstances);
                if (numInstances > Utils.ArrayMaxSize)
                    throw Host.ExceptParam(nameof(data), "Input data had {0} rows, but at most {1} rows can be accommodated", numInstances, Utils.ArrayMaxSize);
                _numExamples = (int)numInstances;
            }

            public MemImpl(RoleMappedData data, IHost host, int maxBins, float maxLabel, bool noFlocks, int minDocsPerLeaf,
                PredictionKind kind, IParallelTraining parallelTraining, int[] categoricalFeatureIndices, bool categoricalSplit)
                : this(data, host, null, maxLabel, dummy: true, noFlocks: noFlocks, kind: kind,
                    categoricalFeatureIndices: categoricalFeatureIndices, categoricalSplit: categoricalSplit)
            {
                // Convert features to binned values.
                _minDocsPerLeaf = minDocsPerLeaf;
                InitializeBins(maxBins, parallelTraining);
            }

            public MemImpl(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel,
                bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
                : this(data, host, binUpperBounds, maxLabel, dummy: true, noFlocks: noFlocks, kind: kind,
                    categoricalFeatureIndices: categoricalFeatureIndices, categoricalSplit: categoricalSplit)
            {
                Host.AssertValue(binUpperBounds);
            }

            private void MakeBoundariesAndCheckLabels(out long missingInstances, out long totalInstances)
            {
                using (var ch = Host.Start("InitBoundariesAndLabels"))
                using (var pch = Host.StartProgressChannel("FastTree data preparation"))
                {
                    long featureValues = 0;
                    // Warn at about 2 GB usage.
                    const long featureValuesWarnThreshold = (2L << 30) / sizeof(double);
                    bool featureValuesWarned = false;
                    const string featureValuesWarning = "We seem to be processing a lot of data. Consider using the FastTree diskTranspose+ (or dt+) option, for slower but more memory efficient transposition.";
                    const int queryChunkSize = 100;

                    // Populate the feature values array and labels.
                    ch.Info("Changing data from row-wise to column-wise");

                    long pos = 0;
                    double rowCountDbl = (double?)_data.Data.GetRowCount() ?? double.NaN;
                    pch.SetHeader(new ProgressHeader("examples"),
                        e => e.SetProgress(0, pos, rowCountDbl));
                    // REVIEW: Should we ignore rows with bad label, weight, or group? The previous code seemed to let
                    // them through (but filtered out bad features).
                    CursOpt curOptions = CursOpt.Label | CursOpt.Features;
                    bool hasGroup = false;
                    if (PredictionKind == PredictionKind.Ranking)
                    {
                        hasGroup = _data.Schema.Group != null;

                        if (hasGroup)
                            curOptions |= CursOpt.Group;
                    }
                    else
                    {
                        if (_data.Schema.Group != null)
                            ch.Warning("This is not a ranking problem; the Group Id '{0}' column will be ignored", _data.Schema.Group.Value.Name);
                    }

                    if (_data.Schema.Weight.HasValue)
                        curOptions |= CursOpt.Weight;

                    using (var cursor = new FloatLabelCursor(_data, curOptions))
                    {
                        ulong groupPrev = 0;

                        while (cursor.MoveNext())
                        {
                            pos = cursor.KeptRowCount - 1;
                            int index = checked((int)pos);
                            ch.Assert(pos >= 0);

                            // If we have no group, then the group number should not change.
                            Host.Assert(hasGroup || cursor.Group == groupPrev);
                            if (hasGroup)
                            {
                                // If we are either at the start of iteration, or a new
                                // group has started, add the boundary and register the
                                // new group identifier.
                                if (pos == 0 || cursor.Group != groupPrev)
                                {
                                    _boundaries.Add(index);
                                    groupPrev = cursor.Group;
                                }
                            }
                            else if (pos % queryChunkSize == 0)
                            {
                                // If there are no groups, it is best to just put the
                                // boundaries at regular intervals.
                                _boundaries.Add(index);
                            }

                            if (UsingMaxLabel)
                            {
                                if (cursor.Label < 0 || cursor.Label > MaxLabel)
                                    throw ch.Except("Found invalid label {0}. Value should be between 0 and {1}, inclusive.", cursor.Label, MaxLabel);
                            }

                            foreach (var kvp in cursor.Features.Items())
                                _instanceList[kvp.Key].Add(index, kvp.Value);

                            _actualTargets.Add(cursor.Label);
                            if (_weights != null)
                                _weights.Add(cursor.Weight);
                            _targetsList.Add((short)cursor.Label);
                            featureValues += cursor.Features.GetValues().Length;

                            if (featureValues > featureValuesWarnThreshold && !featureValuesWarned)
                            {
                                ch.Warning(featureValuesWarning);
                                featureValuesWarned = true;
                            }
                        }

                        _boundaries.Add(checked((int)cursor.KeptRowCount));
                        totalInstances = cursor.KeptRowCount;
                        missingInstances = cursor.BadFeaturesRowCount;
                    }

                    ch.Check(totalInstances > 0, "All instances skipped due to missing features.");

                    if (missingInstances > 0)
                        ch.Warning("Skipped {0} instances with missing features during training", missingInstances);
                }
            }

            private void InitializeBins(int maxBins, IParallelTraining parallelTraining)
            {
                // Find upper bounds for each bin for each feature.
                using (var ch = Host.Start("InitBins"))
                using (var pch = Host.StartProgressChannel("FastTree in-memory bins initialization"))
                {
                    BinFinder binFinder = new BinFinder();
                    VBuffer<double> temp = default(VBuffer<double>);
                    int len = _numExamples;
                    bool[] localConstructBinFeatures = parallelTraining.GetLocalBinConstructionFeatures(NumFeatures);
                    int iFeature = 0;
                    pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, NumFeatures));
                    List<int> trivialFeatures = new List<int>();
                    for (iFeature = 0; iFeature < NumFeatures; iFeature++)
                    {
                        Host.CheckAlive();
                        if (!localConstructBinFeatures[iFeature])
                            continue;
                        // The following strange call will actually sparsify.
                        _instanceList[iFeature].CopyTo(len, ref temp);
                        // REVIEW: In principle we could also put the min docs per leaf information
                        // into here, and collapse bins somehow as we determine the bins, so that "trivial"
                        // bins on the head or tail of the bin distribution are never actually considered.
                        CalculateBins(binFinder, in temp, maxBins, _minDocsPerLeaf,
                            out double[] binUpperBounds);
                        BinUpperBounds[iFeature] = binUpperBounds;
                    }
                    parallelTraining.SyncGlobalBoundary(NumFeatures, maxBins, BinUpperBounds);
                }
            }

            public override Dataset GetDataset()
            {
                using (var ch = Host.Start("BinFeatures"))
                using (var pch = Host.StartProgressChannel("FastTree feature conversion"))
                {
                    FeatureFlockBase[] flocks = CreateFlocks(ch, pch).ToArray();
                    ch.Trace("{0} features stored in {1} flocks.", NumFeatures, flocks.Length);
                    return new Dataset(CreateDatasetSkeleton(), flocks);
                }
            }

            private NHotFeatureFlock CreateNHotFlock(IChannel ch, List<int> features)
            {
                Contracts.AssertValue(ch);
                ch.Assert(Utils.Size(features) > 1);

                // Copied from the one-hot construction above.
                int[] hotFeatureStarts = new int[features.Count + 1];
                for (int i = 1; i < hotFeatureStarts.Length; ++i)
                    hotFeatureStarts[i] = hotFeatureStarts[i - 1] + BinUpperBounds[features[i - 1]].Length - 1;
                IntArrayBits flockBits = IntArray.NumBitsNeeded(hotFeatureStarts[hotFeatureStarts.Length - 1] + 1);

                var kvEnums = new IEnumerator<KeyValuePair<int, int>>[features.Count];
                var delta = new List<byte>();
                var values = new List<int>();

                try
                {
                    for (int i = 0; i < features.Count; ++i)
                        kvEnums[i] = _instanceList[features[i]].Binned(BinUpperBounds[features[i]], NumExamples).GetEnumerator();
                    Heap<int> heap = new Heap<int>(
                        (i, j) =>
                        {
                            ch.AssertValue(kvEnums[i]);
                            ch.AssertValue(kvEnums[j]);
                            int irow = kvEnums[i].Current.Key;
                            int jrow = kvEnums[j].Current.Key;
                            if (irow == jrow) // If we're on the same row, prefer the "smaller" feature.
                                return j < i;
                            // Earlier rows should go first.
                            return jrow < irow;
                        });
                    // Do the initial population of the heap.
                    for (int i = 0; i < kvEnums.Length; ++i)
                    {
                        if (kvEnums[i].MoveNext())
                            heap.Add(i);
                        else
                        {
                            kvEnums[i].Dispose();
                            kvEnums[i] = null;
                        }
                    }
                    // Iteratively build the delta-sparse and int arrays.
                    // REVIEW: Could be hinted as having capacity count hot, but may do more harm than good.
                    int last = 0;
                    while (heap.Count > 0)
                    {
                        int i = heap.Pop();
                        var kvEnum = kvEnums[i];
                        ch.AssertValue(kvEnum);
                        var kvp = kvEnum.Current;
                        ch.Assert(kvp.Key >= last);
                        ch.Assert(kvp.Value > 0);
                        while (kvp.Key - last > Byte.MaxValue)
                        {
                            delta.Add(Byte.MaxValue);
                            values.Add(0);
                            last += Byte.MaxValue;
                        }
                        ch.Assert(kvp.Key - last <= Byte.MaxValue);
                        // Note that kvp.Key - last might be zero, in the case where we are representing multiple
                        // values for a single row.
                        delta.Add((byte)(kvp.Key - last));
                        values.Add(kvp.Value + hotFeatureStarts[i]);
                        ch.Assert(kvp.Key > last || values.Count == 1 || values[values.Count - 1] > values[values.Count - 2]);
                        last = kvp.Key;
                        if (kvEnum.MoveNext())
                            heap.Add(i);
                        else
                        {
                            kvEnum.Dispose();
                            kvEnums[i] = null;
                        }
                    }
                }
                finally
                {
                    // Need to dispose the enumerators.
                    foreach (var enumerator in kvEnums)
                    {
                        if (enumerator != null)
                            enumerator.Dispose();
                    }
                }

                // Correct the hot feature starts now that we're done binning.
                for (int f = 0; f < hotFeatureStarts.Length; ++f)
                    hotFeatureStarts[f]++;
                var denseBins = (DenseIntArray)IntArray.New(values.Count, IntArrayType.Dense, flockBits, values);
                var bups = features.Select(fi => BinUpperBounds[fi]).ToArray(features.Count);
                return new NHotFeatureFlock(denseBins, delta.ToArray(), NumExamples, hotFeatureStarts, bups);
            }
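            // Illustrative note (not from the original source): the delta/values arrays form a
            // byte-delta sparse encoding. If the only hot rows are 3 (bin 2) and 300 (bin 5), the
            // loop above emits delta = { 3, 255, 42 } and values = { 2 + start, 0, 5 + start }:
            // gaps larger than Byte.MaxValue are padded with (255, 0) filler pairs, and a zero
            // delta marks another hot value on the same row.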

            private IEnumerable<FeatureFlockBase> CreateFlocks(IChannel ch, IProgressChannel pch)
            {
                int iFeature = 0;
                FeatureMap = Enumerable.Range(0, NumFeatures).Where(f => BinUpperBounds[f].Length > 1).ToArray();

                foreach (FeatureFlockBase flock in CreateFlocksCore(ch, pch))
                {
                    Contracts.Assert(flock.Count > 0);
                    Contracts.Assert(iFeature + flock.Count <= FeatureMap.Length);
                    int min = FeatureMap[iFeature];
                    int lim = iFeature + flock.Count == FeatureMap.Length
                        ? NumFeatures
                        : FeatureMap[iFeature + flock.Count];
                    for (int i = min; i < lim; ++i)
                        _instanceList[i] = null;
                    iFeature += flock.Count;
                    yield return flock;
                }
                ch.Assert(iFeature <= NumFeatures); // Some could have been filtered.
                ch.Assert(iFeature == FeatureMap.Length);
                if (iFeature == 0)
                {
                    // It is possible to filter out all features. In such a case we introduce a dummy
                    // "trivial" feature, so that the learning code downstream does not choke.
                    yield return new SingletonFeatureFlock(new Dense0BitIntArray(NumExamples), BinUpperBounds[0]);
                    FeatureMap = new[] { 0 };
                }
            }

            private IEnumerable<FeatureFlockBase> CreateFlocksCore(IChannel ch, IProgressChannel pch)
            {
                int iFeature = 0;
                pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, NumFeatures));
                VBuffer<double> temp = default(VBuffer<double>);
                // Working array for bins.
                int[] binnedValues = new int[NumExamples];

                if (_noFlocks)
                {
                    for (iFeature = 0; iFeature < NumFeatures; ++iFeature)
                    {
                        var bup = BinUpperBounds[iFeature];
                        ch.Assert(Utils.Size(bup) > 0);
                        if (bup.Length == 1) // Trivial.
                            continue;
                        var values = _instanceList[iFeature];
                        _instanceList[iFeature] = null;
                        values.CopyTo(NumExamples, ref temp);
                        yield return CreateSingletonFlock(ch, in temp, binnedValues, bup);
                    }
                    yield break;
                }

                List<int> pending = new List<int>();
                int[] forwardIndexerWork = null;

                if (CategoricalSplit && CategoricalFeatureIndices != null)
                {
                    int[] lastOn = new int[NumExamples];
                    for (int i = 0; i < lastOn.Length; ++i)
                        lastOn[i] = -1;

                    int catRangeIndex = 0;
                    for (iFeature = 0; iFeature < NumFeatures;)
                    {
                        if (catRangeIndex < CategoricalFeatureIndices.Length)
                        {
                            if (CategoricalFeatureIndices[catRangeIndex] == iFeature)
                            {
                                bool isOneHot = true;
                                for (int iFeatureLocal = iFeature;
                                    iFeatureLocal <= CategoricalFeatureIndices[catRangeIndex + 1];
                                    ++iFeatureLocal)
                                {
                                    double[] bup = BinUpperBounds[iFeatureLocal];
                                    if (bup.Length == 1)
                                    {
                                        // This is a trivial feature. Skip it.
                                        continue;
                                    }
                                    Contracts.Assert(Utils.Size(bup) > 0);

                                    double firstBin = bup[0];
                                    using (IEnumerator<int> hotEnumerator = _instanceList[iFeatureLocal].AllIndicesGT(NumExamples, firstBin).GetEnumerator())
                                    {
                                        while (hotEnumerator.MoveNext())
                                        {
                                            int last = lastOn[hotEnumerator.Current];

                                            // Not a one-hot flock; bail.
                                            if (last >= iFeature)
                                            {
                                                isOneHot = false;
                                                pending.Clear();
                                                break;
                                            }

                                            lastOn[hotEnumerator.Current] = iFeatureLocal;
                                        }
                                    }

                                    pending.Add(iFeatureLocal);
                                }

                                if (pending.Count > 0)
                                {
                                    yield return CreateOneHotFlock(ch, pending, binnedValues, lastOn, _instanceList,
                                        ref forwardIndexerWork, ref temp, true);

                                    pending.Clear();
                                }

                                if (isOneHot)
                                    iFeature = CategoricalFeatureIndices[catRangeIndex + 1] + 1;

                                catRangeIndex += 2;
                            }
                            else
                            {
                                foreach (var flock in CreateFlocksCore(ch, pch, iFeature, CategoricalFeatureIndices[catRangeIndex]))
                                    yield return flock;

                                iFeature = CategoricalFeatureIndices[catRangeIndex];
                            }
                        }
                        else
                        {
                            foreach (var flock in CreateFlocksCore(ch, pch, iFeature, NumFeatures))
                                yield return flock;

                            iFeature = NumFeatures;
                        }
                    }
                }
                else
                {
                    foreach (var flock in CreateFlocksCore(ch, pch, 0, NumFeatures))
                        yield return flock;
                }
            }

            private IEnumerable<FeatureFlockBase> CreateFlocksCore(IChannel ch, IProgressChannel pch, int startFeatureIndex, int featureLim)
            {
                int iFeature = startFeatureIndex;
                VBuffer<double> temp = default(VBuffer<double>);
                // Working array for bins.
                int[] binnedValues = new int[NumExamples];
                // Holds, for each example, the feature that was last "on", that is, that will have
                // to be explicitly represented. This was the last feature for which AllIndicesGT
                // returned the index.
                int[] lastOn = new int[NumExamples];
                for (int i = 0; i < lastOn.Length; ++i)
                    lastOn[i] = -1;
                int[] forwardIndexerWork = null;
                // Which features are pending flock creation?
                List<int> pending = new List<int>();

                Func<FeatureFlockBase> createOneHotFlock =
                    () => CreateOneHotFlock(ch, pending, binnedValues, lastOn, _instanceList,
                        ref forwardIndexerWork, ref temp, false);

                Func<FeatureFlockBase> createNHotFlock =
                    () => CreateNHotFlock(ch, pending);

                // The exclusive upper bound of the features that have already been incorporated
                // into a flock.
                int limMade = startFeatureIndex;
                int countBins = 1; // Count of bins we'll need to represent. Starts at 1, accumulates "hot" features.
                // Tracking for n-hot flocks.
                long countHotRows = 0; // The count of hot "rows".
                long hotNThreshold = (long)(0.1 * NumExamples);
                bool canBeOneHot = true;

                Func<FeatureFlockBase> createFlock =
                    () =>
                    {
                        ch.Assert(pending.Count > 0);
                        FeatureFlockBase flock;
                        if (canBeOneHot)
                            flock = createOneHotFlock();
                        else
                            flock = createNHotFlock();
                        canBeOneHot = true;
                        limMade = iFeature;
                        pending.Clear();
                        countHotRows = 0;
                        countBins = 1;
                        return flock;
                    };

                for (; iFeature < featureLim; ++iFeature)
                {
                    Host.CheckAlive();
                    double[] bup = BinUpperBounds[iFeature];
                    Contracts.Assert(Utils.Size(bup) > 0);
                    if (bup.Length == 1)
                    {
                        // This is a trivial feature. Skip it.
                        continue;
                    }
                    ValuesList values = _instanceList[iFeature];

                    if (countBins > Utils.ArrayMaxSize - (bup.Length - 1))
                    {
                        // It can happen that a flock could be created with more than Utils.ArrayMaxSize
                        // bins, in the case where we bin over a training dataset with many features with
                        // many bins (for example, 1 million features with 10k bins each), and then in a subsequent
                        // validation dataset we have these features suddenly become one-hot. Practically
                        // this will never happen, of course, but it is still possible. If this ever happens,
                        // we create the flock before this becomes an issue.
                        ch.Assert(0 < countBins && countBins <= Utils.ArrayMaxSize);
                        ch.Assert(limMade < iFeature);
                        ch.Assert(pending.Count > 0);
                        yield return createFlock();
                    }
                    countBins += bup.Length - 1;
                    double firstBin = bup[0];
                    int localHotRows = 0;
                    // The number of bits we would use if we incorporated the current feature into the
                    // existing running flock.
                    IntArrayBits newBits = IntArray.NumBitsNeeded(countBins);

                    if (canBeOneHot)
                    {
                        using (IEnumerator<int> hotEnumerator = values.AllIndicesGT(NumExamples, firstBin).GetEnumerator())
                        {
                            if (pending.Count > 0)
                            {
                                // There are prior features we haven't yet flocked. So we are still contemplating
                                // "flocking" this prior feature with this feature (and possibly features beyond).
                                // The enumeration will need to run the appropriate checks.
                                while (hotEnumerator.MoveNext())
                                {
                                    int i = hotEnumerator.Current;
                                    ++localHotRows;
                                    var last = lastOn[i];
                                    Contracts.Assert(last < iFeature);
                                    if (last >= limMade)
                                    {
                                        // We've encountered an overlapping feature. We now need to decide whether we want
                                        // to continue accumulating into a flock and so make this an n-hot flock, or cut it off
                                        // now and create a one-hot flock.
                                        if (countHotRows < hotNThreshold)
                                        {
                                            // We may want to create an N-hot flock.
                                            int superLocalHot = values.CountIndicesGT(NumExamples, firstBin);
                                            if (countHotRows + superLocalHot < hotNThreshold)
                                            {
                                                // If this succeeds, we want to create an N-hot flock including this.
                                                canBeOneHot = false;
                                                localHotRows = superLocalHot;
                                                break; // Future iterations will create the n-hot.
                                            }
                                            // If the test above failed, then we want to create a one-hot of [limMade, iFeature),
                                            // and keep going with this feature.
                                        }

                                        // We've decided to create a one-hot flock. Before continuing to fill in lastOn, use
                                        // lastOn in its current state to create a flock from limMade inclusive to iFeature
                                        // exclusive, and make iFeature the new limMade. Note that we continue to fill in
                                        // lastOn once we finish this.
                                        ch.Assert(limMade < iFeature);
                                        ch.Assert(canBeOneHot);
                                        yield return createFlock();
                                        lastOn[i] = iFeature;
                                        // Now that we've made the flock, there's no need to continually check against
                                        // lastOn[i]'s prior values. Fall through to the limMade == iFeature case.
                                        break;
                                    }
                                    lastOn[i] = iFeature;
                                }
                            }

                            if (canBeOneHot)
                            {
                                // In the event that hotEnumerator was exhausted in the above loop, the following is a no-op.
                                while (hotEnumerator.MoveNext())
                                {
                                    // There is no prior feature to flock, so there's no need to track anything yet.
                                    // Just populate lastOn appropriately.
                                    ++localHotRows;
                                    lastOn[hotEnumerator.Current] = iFeature;
                                }
                            }
                        }
                        ch.Assert(values.CountIndicesGT(NumExamples, firstBin) == localHotRows);
                        pending.Add(iFeature); // Have not yet flocked this feature.
                    }
                    else
                    {
                        // No need to track in lastOn, since we're no longer contemplating this being one-hot.
                        ch.Assert(limMade < iFeature);
                        ch.Assert(countHotRows < hotNThreshold);
                        ch.Assert(!canBeOneHot);
                        localHotRows = values.CountIndicesGT(NumExamples, firstBin);
                        if (countHotRows + localHotRows >= hotNThreshold)
                        {
                            // Too dense if we add iFeature to the mix. Make an n-hot of [limMade, iFeature),
                            // then decrement iFeature so that we reconsider it in light of being a candidate
                            // for one-hot or singleton. Do not add to pending, as its status will be considered
                            // in the next pass.
                            yield return createFlock();
                            --iFeature;
                        }
                        else // Have not yet flocked this feature.
                            pending.Add(iFeature);
                    }
                    countHotRows += localHotRows;
                }
                Contracts.Assert(limMade < featureLim);
                if (pending.Count > 0)
                    yield return createFlock();
            }
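            // Illustrative note (not from the original source): lastOn[row] records the most recent
            // feature that was "hot" on that row. While accumulating candidates for a one-hot flock,
            // finding lastOn[row] >= limMade for a new hot row means two unflocked features overlap
            // on the same row, so the candidates cannot form a one-hot flock: either the pending
            // features are emitted as a one-hot flock immediately, or accumulation continues toward
            // an n-hot flock if the total hot count stays below hotNThreshold (10% of the rows).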

            /// <summary>
            /// Create an artificial metadata object to pad the Dataset.
            /// </summary>
            private Dataset.DatasetSkeleton CreateDatasetSkeleton()
            {
                ulong[] docIds = new ulong[_numExamples]; // All zeros is fine.
                ulong[] queryIds = new ulong[_boundaries.Count - 1]; // All zeros is fine.
                var ds = UsingMaxLabel
                    ? new Dataset.DatasetSkeleton(_targetsList.ToArray(), _boundaries.ToArray(), queryIds, docIds, new double[0][])
                    : new Dataset.DatasetSkeleton(_targetsList.ToArray(), _boundaries.ToArray(), queryIds, docIds, new double[0][], _actualTargets.ToArray());
                // AP TODO: Change it to have weights = null when the dataset is unweighted, in order to avoid a potentially long memory scan.
                if (_weights != null)
                    ds.SampleWeights = _weights.ToArray();
                return ds;
            }
        }

        // REVIEW: Change this, as well as the bin finding code and bin upper bounds, to be float instead of double.
        /// <summary>
        /// A mutable list of (index, value) pairs that may be kept sparse or dense.
        /// </summary>
        private sealed class ValuesList
        {
            private bool _isSparse;
            private List<double> _dense;
            private int _nonZeroElements; // When dense, the number of non-zero elements (for determining when to sparsify).
            private List<KeyValuePair<int, double>> _sparse;

            public ValuesList()
            {
                _dense = new List<double>();
            }

            public void Add(int index, double value)
            {
                if (!_isSparse)
                {
                    // Check if adding this element will make the array sparse.
                    if (ShouldSparsify(_nonZeroElements + 1, index + 1))
                        Sparsify();
                    else
                    {
                        // Add zeros if needed.
                        while (_dense.Count < index)
                            _dense.Add(default(double));
                        // Add the value.
                        _dense.Add(value);
                        if (value != 0)
                            _nonZeroElements++;
                        return;
                    }
                }
                // Note that this may also happen because we just sparsified.
                Contracts.Assert(_isSparse);
                if (value != 0)
                    _sparse.Add(new KeyValuePair<int, double>(index, value));
            }

            private bool ShouldSparsify(int nonZeroElements, int totalElements)
            {
                // TODO: We need a better solution here. Also, maybe we should start sparse and become dense instead?
                return (double)nonZeroElements / totalElements < 0.25 && totalElements > 10;
            }

            private void Sparsify()
            {
                _sparse = new List<KeyValuePair<int, double>>(_nonZeroElements);
                for (int i = 0; i < _dense.Count; i++)
                {
                    if (_dense[i] != 0)
                        _sparse.Add(new KeyValuePair<int, double>(i, _dense[i]));
                }
                _isSparse = true;
                _dense = null;
            }
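            // Illustrative note (not from the original source): with 40 rows stored densely and 5
            // non-zeros, adding a sixth non-zero at index 40 calls ShouldSparsify(6, 41); since
            // 6 / 41 is about 0.15 < 0.25 and 41 > 10, the list switches to the sparse form.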

            /// <summary>
            /// Returns the count of all positions whose values are greater than an indicated value.
            /// </summary>
            /// <param name="length">The limit of indices to check</param>
            /// <param name="gtValue">The value against which the greater-than
            /// comparison is made</param>
            /// <returns>The count of all indices in the range of 0 to <paramref name="length"/>
            /// exclusive whose values are greater than <paramref name="gtValue"/></returns>
            public int CountIndicesGT(int length, double gtValue)
            {
                Contracts.Assert(0 <= length);
                if (_isSparse)
                {
                    Contracts.Assert(_sparse.Count == 0 || _sparse[_sparse.Count - 1].Key < length);
                    return _sparse.Count(kvp => kvp.Value > gtValue) + (0 > gtValue ? length - _sparse.Count : 0);
                }
                else
                {
                    Contracts.Assert(_dense.Count <= length);
                    return _dense.Count(v => v > gtValue) + (0 > gtValue ? length - _dense.Count : 0);
                }
            }
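            // Illustrative note (not from the original source): the (0 > gtValue ? ...) term counts
            // the implicit entries. For a sparse list of length 6 with explicit pairs
            // { (1, 3.0), (4, -2.0) } and gtValue = -5, both explicit values qualify and the four
            // implicit zeros also exceed -5, so the method returns 2 + (6 - 2) = 6.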

            /// <summary>
            /// Returns all indices whose values are greater than an indicated value.
            /// </summary>
            /// <param name="lim">The limit of indices to return</param>
            /// <param name="gtValue">The value against which the greater-than
            /// comparison is made</param>
            /// <returns>All indices in the range of 0 to <paramref name="lim"/> exclusive
            /// whose values are greater than <paramref name="gtValue"/>, in
            /// increasing order</returns>
            public IEnumerable<int> AllIndicesGT(int lim, double gtValue)
            {
                Contracts.Assert(0 <= lim);
                if (_isSparse)
                {
                    Contracts.Assert(_sparse.Count == 0 || _sparse[_sparse.Count - 1].Key < lim);
                    if (0 > gtValue)
                    {
                        // All implicitly defined sparse values will have to be returned.
                        int prev = -1;
                        foreach (var kvp in _sparse)
                        {
                            Contracts.Assert(prev < kvp.Key);
                            while (++prev < kvp.Key)
                                yield return prev;
                            if (kvp.Value > gtValue)
                                yield return kvp.Key;
                        }
                        // Return the "leftovers."
                        while (++prev < lim)
                            yield return prev;
                    }
                    else
                    {
                        // Only explicitly defined values have to be returned.
                        foreach (var kvp in _sparse)
                        {
                            if (kvp.Value > gtValue)
                                yield return kvp.Key;
                        }
                    }
                }
                else
                {
                    Contracts.Assert(_dense.Count <= lim);
                    for (int i = 0; i < _dense.Count; ++i)
                    {
                        if (_dense[i] > gtValue)
                            yield return i;
                    }
                    if (0 > gtValue)
                    {
                        // All implicitly defined post-dense values will have to be returned,
                        // assuming there are any (this set is only non-empty when _dense.Count < lim).
                        for (int i = _dense.Count; i < lim; ++i)
                            yield return i;
                    }
                }
            }

            public void CopyTo(int length, ref VBuffer<double> dst)
            {
                Contracts.Assert(0 <= length);
                VBufferEditor<double> editor;
                if (!_isSparse)
                {
                    Contracts.Assert(_dense.Count <= length);
                    if (ShouldSparsify(_nonZeroElements, length))
                        Sparsify();
                    else
                    {
                        editor = VBufferEditor.Create(ref dst, length);
                        if (_dense.Count < length)
                        {
                            _dense.CopyTo(editor.Values);
                            editor.Values.Slice(_dense.Count, length - _dense.Count).Clear();
                        }
                        else
                            _dense.CopyTo(editor.Values, length);
                        dst = editor.Commit();
                        return;
                    }
                }
                int count = _sparse.Count;
                Contracts.Assert(count <= length);
                editor = VBufferEditor.Create(ref dst, length, count);
                for (int i = 0; i < _sparse.Count; ++i)
                {
                    editor.Indices[i] = _sparse[i].Key;
                    editor.Values[i] = _sparse[i].Value;
                }
                Contracts.Assert(Utils.IsIncreasing(0, editor.Indices, count, length));
                dst = editor.Commit();
            }

            /// <summary>
            /// An enumerable of the row/bin pairs of every row with a non-zero bin, according to the
            /// binning passed into this method.
            /// </summary>
            /// <param name="binUpperBounds">The binning to use for the enumeration</param>
            /// <param name="length">The number of rows in this feature</param>
            /// <returns>An enumerable that yields a pair for every row index and binned value,
            /// where the row indices are increasing and the binned values are positive</returns>
            public IEnumerable<KeyValuePair<int, int>> Binned(double[] binUpperBounds, int length)
            {
                Contracts.Assert(Utils.Size(binUpperBounds) > 0);
                Contracts.Assert(0 <= length);

                int zeroBin = Algorithms.FindFirstGE(binUpperBounds, 0);
                IntArrayBits numBitsNeeded = IntArray.NumBitsNeeded(binUpperBounds.Length);
                if (numBitsNeeded == IntArrayBits.Bits0)
                    yield break;
                if (!_isSparse)
                {
                    Contracts.Assert(_dense.Count <= length);
                    if (ShouldSparsify(_nonZeroElements, length))
                        Sparsify();
                }

                if (_isSparse)
                {
                    Contracts.AssertValue(_sparse);
                    if (zeroBin == 0)
                    {
                        // We can skip all implicit values in the sparse representation.
                        foreach (var kvp in _sparse)
                        {
                            Contracts.Assert(kvp.Key < length);
                            int binned = Algorithms.FindFirstGE(binUpperBounds, kvp.Value);
                            if (binned > 0)
                                yield return new KeyValuePair<int, int>(kvp.Key, binned);
                        }
                        yield break;
                    }

                    Contracts.Assert(zeroBin != 0);
                    int last = -1;
                    foreach (var kvp in _sparse)
                    {
                        Contracts.Assert(kvp.Key < length);
                        while (++last < kvp.Key)
                            yield return new KeyValuePair<int, int>(last, zeroBin);
                        int binned = Algorithms.FindFirstGE(binUpperBounds, kvp.Value);
                        if (binned > 0)
                            yield return new KeyValuePair<int, int>(kvp.Key, binned);
                    }
                    while (++last < length)
                        yield return new KeyValuePair<int, int>(last, zeroBin);

                    yield break;
                }
                Contracts.Assert(!_isSparse);
                Contracts.AssertValue(_dense);
                Contracts.Assert(_dense.Count <= length);
                for (int i = 0; i < _dense.Count; ++i)
                {
                    int binned = Algorithms.FindFirstGE(binUpperBounds, _dense[i]);
                    if (binned > 0)
                        yield return new KeyValuePair<int, int>(i, binned);
                }
                if (zeroBin > 0)
                {
                    for (int i = _dense.Count; i < length; ++i)
                        yield return new KeyValuePair<int, int>(i, zeroBin);
                }
            }
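            // Illustrative note (not from the original source): with binUpperBounds = { 0.5, 2.5,
            // double.MaxValue }, FindFirstGE maps a value to the first bin whose upper bound is at
            // least that value, so 0 falls in bin 0 (zeroBin == 0, implicit zeros can be skipped),
            // 1.7 falls in bin 1, and 3.0 falls in bin 2.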

            public sealed class ForwardIndexer
            {
                // All of the values lists. We only address the features listed in _featureIndices.
                private readonly ValuesList[] _values;
                // Parallel to _featureIndices; indicates the index where we should start to look for
                // the next value, if the corresponding values list is sparse. If the corresponding
                // values list is dense, the entry at this position is not used.
                private readonly int[] _perFeaturePosition;
                private readonly int[] _featureIndices;
#if DEBUG
                // Holds, for each feature, the row index at which it was previously accessed.
                // Purely for validation purposes.
                private readonly int[] _lastRow;
#endif

                /// <summary>
                /// Access the value of a particular feature, at a particular row.
                /// </summary>
                /// <param name="featureIndex">A feature index; this indexes not the global feature space,
                /// but the subset of features specified at construction time</param>
                /// <param name="rowIndex">The row index to access, which must be non-decreasing, and must
                /// indeed be actually increasing for access on the same feature (for example, if you have two features,
                /// it is OK to access <c>[1, 5]</c>, then <c>[0, 5]</c>, but once this is done you cannot
                /// access the same feature at the same position)</param>
                /// <returns>The value of the feature at the given row, or 0 if it is not explicitly stored</returns>
                public double this[int featureIndex, int rowIndex]
                {
                    get
                    {
                        Contracts.Assert(0 <= featureIndex && featureIndex < _featureIndices.Length);
                        Contracts.Assert(rowIndex >= 0);
                        var values = _values[_featureIndices[featureIndex]];
#if DEBUG
                        int lastRow = _lastRow[featureIndex];
                        Contracts.Assert(rowIndex > lastRow);
                        _lastRow[featureIndex] = rowIndex;
#endif
                        if (!values._isSparse)
                            return rowIndex < values._dense.Count ? values._dense[rowIndex] : 0;
                        int last = _perFeaturePosition[featureIndex];
                        var sp = values._sparse;
#if DEBUG
                        // The next value of _sparse (assuming there is one) should be past the last access.
                        // That is, sp[last].Key, if it exists, must be greater than lastRow.
                        Contracts.Assert(sp.Count <= last || sp[last].Key > lastRow);
#endif
                        while (last < sp.Count)
                        {
                            var s = sp[last++];
                            if (s.Key < rowIndex)
                                continue;
                            if (s.Key > rowIndex)
                            {
                                // We'd previously put last past this element; put it back a bit.
                                last--;
                                break;
                            }
                            Contracts.Assert(s.Key == rowIndex);
                            _perFeaturePosition[featureIndex] = last;
                            return s.Value;
                        }
                        _perFeaturePosition[featureIndex] = last;
                        return 0;
                    }
                }

                /// <summary>
                /// Initializes a forward indexer.
                /// </summary>
                /// <param name="values">Holds the values of the features</param>
                /// <param name="features">The array of feature indices this will index</param>
                /// <param name="workArray">A possibly shared working array; once used by this forward
                /// indexer it should not be used by any previously created forward indexer</param>
                public ForwardIndexer(ValuesList[] values, int[] features, ref int[] workArray)
                {
                    Contracts.AssertValue(values);
                    Contracts.AssertValueOrNull(workArray);
                    Contracts.AssertValue(features);
                    Contracts.Assert(Utils.IsIncreasing(0, features, values.Length));
                    Contracts.Assert(features.All(i => values[i] != null));
                    _values = values;
                    _featureIndices = features;
                    Utils.EnsureSize(ref workArray, _featureIndices.Length, keepOld: false);
                    Contracts.AssertValue(workArray); // Should be initialized now.
                    _perFeaturePosition = workArray;
                    Array.Clear(_perFeaturePosition, 0, _featureIndices.Length);
#if DEBUG
                    _lastRow = new int[features.Length];
                    for (int i = 0; i < _lastRow.Length; ++i)
                        _lastRow[i] = -1;
#endif
                }
            }
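            // Illustrative note (not from the original source): a typical use of the indexer above,
            // assuming two values lists and a shared work array named "valuesLists"/"work" (both
            // hypothetical locals). Row indices must strictly increase per feature, matching the
            // forward-only contract asserted in DEBUG builds.
            //
            //     int[] work = null;
            //     var indexer = new ForwardIndexer(valuesLists, new[] { 0, 1 }, ref work);
            //     double v0 = indexer[0, 5];  // Feature 0 at row 5.
            //     double v1 = indexer[1, 5];  // Feature 1 at row 5 (OK: different feature).
            //     double v2 = indexer[0, 9];  // Feature 0 again, at a strictly later row.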
        }
    }

    internal sealed class ExamplesToFastTreeBins
    {
        private readonly int _maxBins;
        private readonly float _maxLabel;
        private readonly IHost _host;
        private readonly bool _diskTranspose;
        private readonly bool _noFlocks;
        private readonly int _minDocsPerLeaf;

        /// <summary> Bin boundaries </summary>
        public double[][] BinUpperBounds
        {
            get;
            private set;
        }

        public int[] FeatureMap { get; private set; }

        public ExamplesToFastTreeBins(IHostEnvironment env, int maxBins, bool diskTranspose, bool noFlocks, int minDocsPerLeaf, float maxLabel)
        {
            Contracts.AssertValue(env);
            _host = env.Register("Converter");

            _maxBins = maxBins;
            _maxLabel = maxLabel;
            _diskTranspose = diskTranspose;
            _noFlocks = noFlocks;
            _minDocsPerLeaf = minDocsPerLeaf;
        }

        public Dataset FindBinsAndReturnDataset(RoleMappedData data, PredictionKind kind, IParallelTraining parallelTraining,
            int[] categoricalFeatureIndices, bool categoricalSplit)
        {
            using (var ch = _host.Start("InitDataset"))
            {
                ch.Info("Making per-feature arrays");
                var convData = DataConverter.Create(data, _host, _maxBins, _maxLabel, _diskTranspose, _noFlocks,
                    _minDocsPerLeaf, kind, parallelTraining, categoricalFeatureIndices, categoricalSplit);

                ch.Info("Processed {0} instances", convData.NumExamples);
                ch.Info("Binning and forming Feature objects");
                Dataset d = convData.GetDataset();
                BinUpperBounds = convData.BinUpperBounds;
                FeatureMap = convData.FeatureMap;
                return d;
            }
        }

        public Dataset GetCompatibleDataset(RoleMappedData data, PredictionKind kind, int[] categoricalFeatures, bool categoricalSplit)
        {
            _host.AssertValue(BinUpperBounds);
            var convData = DataConverter.Create(data, _host, BinUpperBounds, _maxLabel, _diskTranspose, _noFlocks, kind,
                categoricalFeatures, categoricalSplit);

            return convData.GetDataset();
        }
    }
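    // Illustrative sketch (not from the original source): the intended call pattern for the
    // converter above, assuming an IHostEnvironment "env", a single-machine IParallelTraining
    // implementation "parallel", and RoleMappedData "trainData"/"validData" are in scope.
    //
    //     var converter = new ExamplesToFastTreeBins(env, maxBins: 255, diskTranspose: false,
    //         noFlocks: false, minDocsPerLeaf: 10, maxLabel: 1);
    //     // The first call finds the bin boundaries and converts the training data.
    //     Dataset trainSet = converter.FindBinsAndReturnDataset(trainData,
    //         PredictionKind.BinaryClassification, parallel, null, categoricalSplit: false);
    //     // Later calls reuse those boundaries so validation/test data bin identically.
    //     Dataset validSet = converter.GetCompatibleDataset(validData,
    //         PredictionKind.BinaryClassification, null, categoricalSplit: false);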

    public abstract class TreeEnsembleModelParameters :
        ModelParametersBase<float>,
        IValueMapper,
        ICanSaveInTextFormat,
        ICanSaveInIniFormat,
        ICanSaveInSourceCode,
        ICanSaveSummary,
        ICanGetSummaryInKeyValuePairs,
        ITreeEnsemble,
        IPredictorWithFeatureWeights<float>,
        IFeatureContributionMapper,
        ICalculateFeatureContribution,
        ISingleCanSavePfa,
        ISingleCanSaveOnnx
    {
        // The two properties below are necessary for the tree Visualizer.
        [BestFriend]
        internal InternalTreeEnsemble TrainedEnsemble { get; }

        int ITreeEnsemble.NumTrees => TrainedEnsemble.NumTrees;

        // The inner options string is used only for documentation purposes when saving comments to INI files.
        private protected readonly string InnerOptions;

        // The total number of features used in training (takes the value of zero if the
        // written version of the loaded model is less than VerNumFeaturesSerialized).
        private protected readonly int NumFeatures;

        // The maximum split-feature index over the trees of the trained ensemble.
        private protected readonly int MaxSplitFeatIdx;

        private protected abstract uint VerNumFeaturesSerialized { get; }

        private protected abstract uint VerDefaultValueSerialized { get; }

        private protected abstract uint VerCategoricalSplitSerialized { get; }

        [BestFriend]
        internal readonly DataViewType InputType;
        DataViewType IValueMapper.InputType => InputType;

        [BestFriend]
        internal readonly DataViewType OutputType;
        DataViewType IValueMapper.OutputType => OutputType;

        bool ICanSavePfa.CanSavePfa => true;

        bool ICanSaveOnnx.CanSaveOnnx(OnnxContext ctx) => true;

        /// <summary>
        /// Used to determine the contribution of each feature to the score of an example by <see cref="FeatureContributionCalculatingTransformer"/>.
        /// The calculation of feature contribution essentially consists in determining which splits in the tree have the most impact
        /// on the final score and assigning the value of the impact to the features determining the split. More precisely, the contribution of a feature
        /// is equal to the change in score produced by exploring the opposite sub-tree every time a decision node for the given feature is encountered.
        /// Consider a simple case with a single decision tree that has a decision node for the binary feature F1. Given an example that has feature F1
        /// equal to true, we can calculate the score it would have obtained if we chose the subtree corresponding to the feature F1 being equal to false
        /// while keeping the other features constant. The contribution of feature F1 for the given example is the difference between the original score
        /// and the score obtained by taking the opposite decision at the node corresponding to feature F1. This algorithm extends naturally to models with
        /// many decision trees.
        /// </summary>
        FeatureContributionCalculator ICalculateFeatureContribution.FeatureContributionCalculator => new FeatureContributionCalculator(this);

        /// The following function is used in both FastTree and LightGBM so <see cref="BestFriendAttribute"/> is required.
        [BestFriend]
        private protected TreeEnsembleModelParameters(IHostEnvironment env, string name, InternalTreeEnsemble trainedEnsemble, int numFeatures, string innerArgs)
            : base(env, name)
        {
            Host.CheckValue(trainedEnsemble, nameof(trainedEnsemble));
            Host.CheckParam(numFeatures > 0, nameof(numFeatures), "must be positive");
            Host.CheckValueOrNull(innerArgs);

            // REVIEW: When we make the predictor wrapper, we may want to further "optimize"
            // the trained ensemble to, for instance, resize arrays so that they are of the length
            // of the actual number of leaves/nodes, or remove unnecessary arrays, and so forth.
            TrainedEnsemble = trainedEnsemble;
            InnerOptions = innerArgs;
            NumFeatures = numFeatures;

            MaxSplitFeatIdx = trainedEnsemble.GetMaxFeatureIndex();
            Contracts.Assert(NumFeatures > MaxSplitFeatIdx);

            InputType = new VectorDataViewType(NumberDataViewType.Single, NumFeatures);
            OutputType = NumberDataViewType.Single;
        }

        private protected TreeEnsembleModelParameters(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver)
            : base(env, name, ctx)
        {
            // *** Binary format ***
            // Ensemble
            // int: Inner args string id
            // int: Number of features (VerNumFeaturesSerialized)
            // <PredictionKind> specific stuff
            ctx.CheckVersionInfo(ver);
            bool usingDefaultValues = false;
            bool categoricalSplits = false;
            if (ctx.Header.ModelVerWritten >= VerDefaultValueSerialized)
                usingDefaultValues = true;

            if (ctx.Header.ModelVerWritten >= VerCategoricalSplitSerialized)
                categoricalSplits = true;

            TrainedEnsemble = new InternalTreeEnsemble(ctx, usingDefaultValues, categoricalSplits);
            MaxSplitFeatIdx = TrainedEnsemble.GetMaxFeatureIndex();

            InnerOptions = ctx.LoadStringOrNull();
            if (ctx.Header.ModelVerWritten >= VerNumFeaturesSerialized)
            {
                NumFeatures = ctx.Reader.ReadInt32();
                // It is possible that the number of features is 0 when an old model is loaded and then saved with the new version.
                Host.CheckDecode(NumFeatures >= 0);
            }

            // In the days of TLC <= 2.7, before we had a data pipeline, there was
            // some auxiliary structure called the "ContentMap." This structure is
            // no longer necessary or helpful, since the data pipeline in TLC >= 3.0
            // is supposed to be independent of any predictor-specific tricks.

            InputType = new VectorDataViewType(NumberDataViewType.Single, NumFeatures);
            OutputType = NumberDataViewType.Single;
        }

        [BestFriend]
        private protected override void SaveCore(ModelSaveContext ctx)
        {
            base.SaveCore(ctx);

            // *** Binary format ***
            // Ensemble
            // int: Inner args string id
            // int: Number of features (VerNumFeaturesSerialized)
            // <PredictionKind> specific stuff
            TrainedEnsemble.Save(ctx);
            ctx.SaveStringOrNull(InnerOptions);
            Host.Assert(NumFeatures >= 0);
            ctx.Writer.Write(NumFeatures);
        }

        ValueMapper<TIn, TOut> IValueMapper.GetMapper<TIn, TOut>()
        {
            Host.Check(typeof(TIn) == typeof(VBuffer<float>));
            Host.Check(typeof(TOut) == typeof(float));

            ValueMapper<VBuffer<float>, float> del = Map;
            return (ValueMapper<TIn, TOut>)(Delegate)del;
        }

        private protected virtual void Map(in VBuffer<float> src, ref float dst)
        {
            int inputVectorSize = InputType.GetVectorSize();
            if (inputVectorSize > 0)
                Host.Check(src.Length == inputVectorSize);
            else
                Host.Check(src.Length > MaxSplitFeatIdx);

            dst = (float)TrainedEnsemble.GetOutput(in src);
        }

        ValueMapper<TSrc, VBuffer<float>> IFeatureContributionMapper.GetFeatureContributionMapper<TSrc, TDst>(int top, int bottom, bool normalize)
        {
            Host.Check(typeof(TSrc) == typeof(VBuffer<float>));
            Host.Check(typeof(TDst) == typeof(VBuffer<float>));
            Host.Check(top >= 0, "top must be non-negative");
            Host.Check(bottom >= 0, "bottom must be non-negative");

            BufferBuilder<float> builder = null;
            ValueMapper<VBuffer<float>, VBuffer<float>> del =
                (in VBuffer<float> src, ref VBuffer<float> dst) =>
                {
                    FeatureContributionMap(in src, ref dst, ref builder);
                    Numeric.VectorUtils.SparsifyNormalize(ref dst, top, bottom, normalize);
                };
            return (ValueMapper<TSrc, VBuffer<float>>)(Delegate)del;
        }

        private void FeatureContributionMap(in VBuffer<float> src, ref VBuffer<float> dst, ref BufferBuilder<float> builder)
        {
            int inputVectorSize = InputType.GetVectorSize();
            if (inputVectorSize > 0)
                Host.Check(src.Length == inputVectorSize);
            else
                Host.Check(src.Length > MaxSplitFeatIdx);

            TrainedEnsemble.GetFeatureContributions(in src, ref dst, ref builder);
        }

        /// <summary>
        /// Write out a C# representation of the ensemble.
        /// </summary>
        void ICanSaveInSourceCode.SaveAsCode(TextWriter writer, RoleMappedSchema schema)
        {
            Host.CheckValueOrNull(schema);
            SaveEnsembleAsCode(writer, schema);
        }

        /// <summary>
        /// Output the INI model to a given writer.
        /// </summary>
        void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema)
        {
            Host.CheckValue(writer, nameof(writer));
            Host.CheckValueOrNull(schema);
            ((ICanSaveInIniFormat)this).SaveAsIni(writer, schema);
        }

        /// <summary>
        /// Output the INI model to a given writer.
        /// </summary>
        void ICanSaveInIniFormat.SaveAsIni(TextWriter writer, RoleMappedSchema schema, ICalibrator calibrator)
        {
            Host.CheckValue(writer, nameof(writer));
            var ensembleIni = FastTreeIniFileUtils.TreeEnsembleToIni(Host, TrainedEnsemble, schema, calibrator,
                InnerOptions, appendFeatureGain: true, includeZeroGainFeatures: false);
            writer.WriteLine(ensembleIni);
        }

        JToken ISingleCanSavePfa.SaveAsPfa(BoundPfaContext ctx, JToken input)
        {
            Host.CheckValue(ctx, nameof(ctx));
            Host.CheckValue(input, nameof(input));
            return TrainedEnsemble.AsPfa(ctx, input);
        }

        private enum NodeMode
        {
            [Description("BRANCH_LEQ")]
            BranchLEq,
            [Description("BRANCH_LT")]
            BranchLT,
            [Description("BRANCH_GTE")]
            BranchGte,
            [Description("BRANCH_GT")]
            BranchGT,
            [Description("BRANCH_EQ")]
            BranchEq,
            [Description("BRANCH_NEQ")]
            BranchNeq,
            [Description("LEAF")]
            Leaf
        };

        private enum PostTransform
        {
            [Description("NONE")]
            None,
            [Description("SOFTMAX")]
            SoftMax,
            [Description("LOGISTIC")]
            Logistic,
            [Description("SOFTMAX_ZERO")]
            SoftMaxZero
        }

        private enum AggregateFunction
        {
            [Description("AVERAGE")]
            Average,
            [Description("SUM")]
            Sum,
            [Description("MIN")]
            Min,
            [Description("MAX")]
            Max
        }
|
|
|
|
private protected virtual bool SaveAsOnnx(OnnxContext ctx, string[] outputNames, string featureColumn)
|
|
{
|
|
Host.CheckValue(ctx, nameof(ctx));
|
|
Host.Check(Utils.Size(outputNames) >= 1);
|
|
|
|
const int minimumOpSetVersion = 9;
|
|
ctx.CheckOpSetVersion(minimumOpSetVersion, "TreeEnsembleModelParameters");
|
|
|
|
//Nodes.
|
|
var nodesTreeids = new List<long>();
|
|
var nodesIds = new List<long>();
|
|
var nodesFeatureIds = new List<long>();
|
|
var nodeModes = new List<string>();
|
|
var nodesValues = new List<double>();
|
|
var nodeHitrates = new List<long>();
|
|
var missingValueTracksTrue = new List<bool>();
|
|
var nodesTrueNodeIds = new List<long>();
|
|
var nodesFalseNodeIds = new List<long>();
|
|
var nodesBaseValues = new List<float>();
|
|
|
|
//Leafs.
|
|
var classTreeIds = new List<long>();
|
|
var classNodeIds = new List<long>();
|
|
var classIds = new List<long>();
|
|
var classWeights = new List<double>();
|
|
|
|
int treeIndex = -1;
|
|
foreach (var tree in TrainedEnsemble.Trees)
|
|
{
|
|
treeIndex++;
|
|
for (int nodeIndex = 0; nodeIndex < tree.NumNodes; nodeIndex++)
|
|
{
|
|
nodesTreeids.Add(treeIndex);
|
|
nodeModes.Add(NodeMode.BranchLEq.GetDescription());
|
|
nodesIds.Add(nodeIndex);
|
|
nodesFeatureIds.Add(tree.SplitFeature(nodeIndex));
|
|
nodesValues.Add(tree.RawThresholds[nodeIndex]);
|
|
nodesTrueNodeIds.Add(tree.LteChild[nodeIndex] < 0 ? ~tree.LteChild[nodeIndex] + tree.NumNodes : tree.LteChild[nodeIndex]);
|
|
nodesFalseNodeIds.Add(tree.GtChild[nodeIndex] < 0 ? ~tree.GtChild[nodeIndex] + tree.NumNodes : tree.GtChild[nodeIndex]);
                    if (tree.DefaultValueForMissing?[nodeIndex] <= tree.RawThresholds[nodeIndex])
                        missingValueTracksTrue.Add(true);
                    else
                        missingValueTracksTrue.Add(false);

                    nodeHitrates.Add(0);
                }
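
                // Each leaf appears twice in the export: as a LEAF entry in the nodes_* arrays and
                // as a target entry whose weight is the leaf value summed across trees at scoring time.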
                for (int leafIndex = 0; leafIndex < tree.NumLeaves; leafIndex++)
                {
                    int nodeIndex = tree.NumNodes + leafIndex;
                    nodesTreeids.Add(treeIndex);
                    nodesBaseValues.Add(0);
                    nodeModes.Add(NodeMode.Leaf.GetDescription());
                    nodesIds.Add(nodeIndex);
                    nodesFeatureIds.Add(0);
                    nodesValues.Add(0);
                    nodesTrueNodeIds.Add(0);
                    nodesFalseNodeIds.Add(0);
                    missingValueTracksTrue.Add(false);
                    nodeHitrates.Add(0);

                    classTreeIds.Add(treeIndex);
                    classNodeIds.Add(nodeIndex);
                    classIds.Add(0);
                    classWeights.Add(tree.LeafValues[leafIndex]);
                }
            }

            string opType = "TreeEnsembleRegressor";
            string scoreVarName = (Utils.Size(outputNames) >= 2) ? outputNames[1] : outputNames[0]; // Get Score from PredictedLabel and/or Score columns.
            var node = ctx.CreateNode(opType, new[] { featureColumn }, new[] { scoreVarName }, ctx.GetNodeName(opType));
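
            // A single TreeEnsembleRegressor node carries the whole ensemble; the nodes_* and
            // target_* attributes below are parallel arrays with one entry per tree node
            // (or per leaf, for the target_* arrays).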
            node.AddAttribute("post_transform", PostTransform.None.GetDescription());
            node.AddAttribute("n_targets", 1);
            node.AddAttribute("base_values", new List<float>() { 0 });
            node.AddAttribute("aggregate_function", AggregateFunction.Sum.GetDescription());
            node.AddAttribute("nodes_treeids", nodesTreeids);
            node.AddAttribute("nodes_nodeids", nodesIds);
            node.AddAttribute("nodes_featureids", nodesFeatureIds);
            node.AddAttribute("nodes_modes", nodeModes);
            node.AddAttribute("nodes_values", nodesValues);
            node.AddAttribute("nodes_truenodeids", nodesTrueNodeIds);
            node.AddAttribute("nodes_falsenodeids", nodesFalseNodeIds);
            node.AddAttribute("nodes_missing_value_tracks_true", missingValueTracksTrue);
            node.AddAttribute("target_treeids", classTreeIds);
            node.AddAttribute("target_nodeids", classNodeIds);
            node.AddAttribute("target_ids", classIds);
            node.AddAttribute("target_weights", classWeights);

            return true;
        }

        bool ISingleCanSaveOnnx.SaveAsOnnx(OnnxContext ctx, string[] outputNames, string featureColumn)
        {
            return SaveAsOnnx(ctx, outputNames, featureColumn);
        }

        void ICanSaveSummary.SaveSummary(TextWriter writer, RoleMappedSchema schema)
        {
            writer.WriteLine();
            writer.WriteLine("Per-feature gain summary for the boosted tree ensemble:");

            foreach (var pair in ((ICanGetSummaryInKeyValuePairs)this).GetSummaryInKeyValuePairs(schema))
            {
                Host.Assert(pair.Value is double);
                writer.WriteLine("\t{0}\t{1}", pair.Key, (double)pair.Value);
            }
        }

        private IEnumerable<KeyValuePair<string, double>> GetSortedFeatureGains(RoleMappedSchema schema)
        {
            var gainMap = new FeatureToGainMap(TrainedEnsemble.Trees.ToList(), normalize: true);

            var names = default(VBuffer<ReadOnlyMemory<char>>);
            AnnotationUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, NumFeatures, ref names);
            var ordered = gainMap.OrderByDescending(pair => pair.Value);
            double max = ordered.FirstOrDefault().Value;
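            // Report sqrt(gain), rescaled so the most important feature gets the value 1.0.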
            double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max));
            foreach (var pair in ordered)
            {
                var name = names.GetItemOrDefault(pair.Key).ToString();
                if (string.IsNullOrEmpty(name))
                    name = $"f{pair.Key}";
                yield return new KeyValuePair<string, double>(name, Math.Sqrt(pair.Value) * normFactor);
            }
        }

        /// <inheritdoc/>
        IList<KeyValuePair<string, object>> ICanGetSummaryInKeyValuePairs.GetSummaryInKeyValuePairs(RoleMappedSchema schema)
        {
            List<KeyValuePair<string, object>> results = new List<KeyValuePair<string, object>>();

            var ordered = GetSortedFeatureGains(schema);
            foreach (var pair in ordered)
                results.Add(new KeyValuePair<string, object>(pair.Key, pair.Value));
            return results;
        }

        /// <summary>
        /// Returns a C# representation of the ensemble.
        /// </summary>
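        /// <remarks>
        /// The generated code is one nested conditional expression per tree followed by their sum.
        /// For a two-tree ensemble the output looks roughly like the following (feature names and
        /// thresholds are illustrative, not taken from a real model):
        /// <code>
        /// double treeOutput0=((f3 > 0.5) ? 0.25 : -0.1);
        /// double treeOutput1=((f1 > 2) ? ((f0 > 1) ? 0.2 : 0) : -0.3);
        /// double output = treeOutput0+treeOutput1;
        /// </code>
        /// </remarks>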
        private void SaveEnsembleAsCode(TextWriter writer, RoleMappedSchema schema)
        {
            Host.AssertValueOrNull(schema);

            var names = default(VBuffer<ReadOnlyMemory<char>>);
            AnnotationUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, NumFeatures, ref names);

            int i = 0;
            foreach (InternalRegressionTree tree in TrainedEnsemble.Trees)
            {
                writer.Write("double treeOutput{0}=", i);
                SaveTreeAsCode(tree, writer, in names);
                writer.Write(";\n");
                i++;
            }
            writer.Write("double output = ");
            for (int j = 0; j < i; j++)
                writer.Write((j > 0 ? "+" : "") + "treeOutput" + j);
            writer.Write(";");
        }

        /// <summary>
        /// Converts a single tree to code; the conversion recurses through <see cref="ToCSharp"/>.
        /// </summary>
        private void SaveTreeAsCode(InternalRegressionTree tree, TextWriter writer, in VBuffer<ReadOnlyMemory<char>> names)
        {
            ToCSharp(tree, writer, 0, in names);
        }

        // Converts a subtree into a C# expression. A negative node index denotes a leaf
        // (the bitwise complement of the leaf index), which terminates the recursion.
        private void ToCSharp(InternalRegressionTree tree, TextWriter writer, int node, in VBuffer<ReadOnlyMemory<char>> names)
        {
            if (node < 0)
            {
                writer.Write(FloatUtils.ToRoundTripString(tree.LeafValue(~node)));
            }
            else
            {
                var name = names.GetItemOrDefault(tree.SplitFeature(node)).ToString();
                if (string.IsNullOrEmpty(name))
                    name = $"f{tree.SplitFeature(node)}";

                writer.Write("(({0} > {1}) ? ", name, FloatUtils.ToRoundTripString(tree.RawThreshold(node)));
                ToCSharp(tree, writer, tree.GetGtChildForNode(node), in names);
                writer.Write(" : ");
                ToCSharp(tree, writer, tree.GetLteChildForNode(node), in names);
                writer.Write(")");
            }
        }

        /// <summary>
        /// Get the cumulative split gains for each feature across all trees.
        /// </summary>
        /// <param name="weights">A <see cref="VBuffer{T}"/> to hold the cumulative split gain value for each feature.
        /// The i-th element in <paramref name="weights"/> stores the cumulative split gain of the i-th feature.</param>
        public void GetFeatureWeights(ref VBuffer<float> weights)
        {
            var numFeatures = Math.Max(NumFeatures, MaxSplitFeatIdx + 1);
            FeatureToGainMap gainMap = new FeatureToGainMap(TrainedEnsemble.Trees.ToList(), normalize: true);

            // If there are no trees or no splits, there are no gains.
            if (gainMap.Count == 0)
            {
                VBufferUtils.Resize(ref weights, numFeatures, 0);
                return;
            }
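
            // Same normalization as GetSortedFeatureGains: report sqrt(gain) rescaled so the largest
            // gain maps to 1.0, keeping the two feature-importance views consistent with each other.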
            double max = gainMap.Values.Max();
            double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max));
            var bldr = new BufferBuilder<float>(R4Adder.Instance);
            bldr.Reset(numFeatures, false);
            foreach (var pair in gainMap)
                bldr.AddFeature(pair.Key, (float)(Math.Sqrt(pair.Value) * normFactor));
            bldr.GetResult(ref weights);
        }

        ITree[] ITreeEnsemble.GetTrees()
        {
            return TrainedEnsemble.Trees.Select(k => new Tree(k)).ToArray();
        }

        [BestFriend]
        internal float GetLeafValue(int treeId, int leafId)
        {
            return (float)TrainedEnsemble.GetTreeAt(treeId).LeafValue(leafId);
        }

        /// <summary>
        /// Returns the leaf node in the requested tree for the given feature vector, and populates 'path' with the list of
        /// internal nodes in the path from the root to that leaf. If 'path' is null a new list is initialized. All elements
        /// in 'path' are cleared before filling in the current path nodes.
        /// </summary>
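        /// <remarks>
        /// A minimal usage sketch (the <c>model</c> variable is illustrative); the same
        /// <c>path</c> list can be reused across calls, since it is cleared before refilling:
        /// <code>
        /// List&lt;int&gt; path = null;
        /// int leaf = model.GetLeaf(treeId: 0, in features, ref path);
        /// // path now holds the internal-node indices from the root down to the leaf.
        /// </code>
        /// </remarks>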
        [BestFriend]
        internal int GetLeaf(int treeId, in VBuffer<float> features, ref List<int> path)
        {
            return TrainedEnsemble.GetTreeAt(treeId).GetLeaf(in features, ref path);
        }

        private sealed class Tree : ITree<VBuffer<float>>
        {
            private readonly InternalRegressionTree _regTree;

            public Tree(InternalRegressionTree regTree)
            {
                _regTree = regTree;
            }

            public int[] GtChild => _regTree.GtChild;

            public int[] LteChild => _regTree.LteChild;

            public int NumNodes => _regTree.NumNodes;

            public int NumLeaves => _regTree.NumLeaves;

            public int GetLeaf(in VBuffer<float> feat)
            {
                return _regTree.GetLeaf(in feat);
            }

            public INode GetNode(int nodeId, bool isLeaf, IEnumerable<string> featuresNames = null)
            {
                var keyValues = new Dictionary<string, object>();
                if (isLeaf)
                {
                    keyValues.Add(NodeKeys.LeafValue, _regTree.LeafValue(nodeId));
                }
                else
                {
                    if (featuresNames != null)
                    {
                        if (featuresNames is FeatureNameCollection features)
                        {
                            if (_regTree.CategoricalSplit[nodeId])
                            {
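                                // A categorical split tests membership in a set of one-hot feature
                                // slots; display it as the OR of the member feature names.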
                                string featureList = string.Join(" OR \n",
                                    _regTree.CategoricalSplitFeatures[nodeId].Select(feature => features[feature]));

                                keyValues.Add(NodeKeys.SplitName, featureList);
                            }
                            else
                                keyValues.Add(NodeKeys.SplitName, features[_regTree.SplitFeature(nodeId)]);
                        }
                    }
                    keyValues.Add(NodeKeys.Threshold, string.Format("<= {0}", _regTree.RawThreshold(nodeId)));
                    if (_regTree.SplitGains != null)
                        keyValues.Add(NodeKeys.SplitGain, _regTree.SplitGains[nodeId]);
                    if (_regTree.GainPValues != null)
                        keyValues.Add(NodeKeys.GainValue, _regTree.GainPValues[nodeId]);
                    if (_regTree.PreviousLeafValues != null)
                        keyValues.Add(NodeKeys.PreviousLeafValue, _regTree.PreviousLeafValues[nodeId]);
                }

                return new TreeNode(keyValues);
            }

            public double GetLeafValue(int leafId)
            {
                return _regTree.LeafValue(leafId);
            }
        }

        private sealed class TreeNode : INode
        {
            public TreeNode(Dictionary<string, object> keyValues)
            {
                KeyValues = keyValues;
            }

            public Dictionary<string, object> KeyValues { get; }
        }
    }

    /// <summary>
    /// <see cref="TreeEnsembleModelParametersBasedOnRegressionTree"/> is derived from
    /// <see cref="TreeEnsembleModelParameters"/> plus a strongly-typed public property,
    /// <see cref="TrainedTreeEnsemble"/>, for exposing the trained model's details to users.
    /// Its function, <see cref="CreateTreeEnsembleFromInternalDataStructure"/>, is
    /// called to create <see cref="TrainedTreeEnsemble"/> inside <see cref="TreeEnsembleModelParameters"/>.
    /// Note that the major difference between <see cref="TreeEnsembleModelParametersBasedOnQuantileRegressionTree"/>
    /// and <see cref="TreeEnsembleModelParametersBasedOnRegressionTree"/> is the type of
    /// <see cref="TrainedTreeEnsemble"/>.
    /// </summary>
    public abstract class TreeEnsembleModelParametersBasedOnRegressionTree : TreeEnsembleModelParameters, ICanGetSummaryAsIDataView
    {
        /// <summary>
        /// An ensemble of trees exposed to users. It is a wrapper on the <see langword="internal"/>
        /// <see cref="InternalTreeEnsemble"/> in <see cref="ML.Trainers.FastTree.TreeEnsemble{T}"/>.
        /// </summary>
        public RegressionTreeEnsemble TrainedTreeEnsemble { get; }

        [BestFriend]
        private protected TreeEnsembleModelParametersBasedOnRegressionTree(IHostEnvironment env, string name, InternalTreeEnsemble trainedEnsemble, int numFeatures, string innerArgs)
            : base(env, name, trainedEnsemble, numFeatures, innerArgs)
        {
            TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
        }

        [BestFriend]
        private protected TreeEnsembleModelParametersBasedOnRegressionTree(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver)
            : base(env, name, ctx, ver)
        {
            TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
        }

        private RegressionTreeEnsemble CreateTreeEnsembleFromInternalDataStructure()
        {
            var trees = TrainedEnsemble.Trees.Select(tree => new RegressionTree(tree));
            var treeWeights = TrainedEnsemble.Trees.Select(tree => tree.Weight);
            return new RegressionTreeEnsemble(trees, treeWeights, TrainedEnsemble.Bias);
        }

        /// <summary>
        /// Used for the Summarize entrypoint.
        /// </summary>
        IDataView ICanGetSummaryAsIDataView.GetSummaryDataView(RoleMappedSchema schema)
            => RegressionTreeBaseUtils.RegressionTreeEnsembleAsIDataView(Host, TrainedTreeEnsemble.Bias, TrainedTreeEnsemble.TreeWeights, TrainedTreeEnsemble.Trees);
    }

    /// <summary>
    /// <see cref="TreeEnsembleModelParametersBasedOnQuantileRegressionTree"/> is derived from
    /// <see cref="TreeEnsembleModelParameters"/> plus a strongly-typed public property,
    /// <see cref="TrainedTreeEnsemble"/>, for exposing the trained model's details to users.
    /// Its function, <see cref="CreateTreeEnsembleFromInternalDataStructure"/>, is
    /// called to create <see cref="TrainedTreeEnsemble"/> inside <see cref="TreeEnsembleModelParameters"/>.
    /// Note that the major difference between <see cref="TreeEnsembleModelParametersBasedOnQuantileRegressionTree"/>
    /// and <see cref="TreeEnsembleModelParametersBasedOnRegressionTree"/> is the type of
    /// <see cref="TrainedTreeEnsemble"/>.
    /// </summary>
    public abstract class TreeEnsembleModelParametersBasedOnQuantileRegressionTree : TreeEnsembleModelParameters, ICanGetSummaryAsIDataView
    {
        /// <summary>
        /// An ensemble of trees exposed to users. It is a wrapper on the <see langword="internal"/>
        /// <see cref="InternalTreeEnsemble"/> in <see cref="ML.Trainers.FastTree.TreeEnsemble{T}"/>.
        /// </summary>
        public QuantileRegressionTreeEnsemble TrainedTreeEnsemble { get; }

        [BestFriend]
        private protected TreeEnsembleModelParametersBasedOnQuantileRegressionTree(IHostEnvironment env, string name, InternalTreeEnsemble trainedEnsemble, int numFeatures, string innerArgs)
            : base(env, name, trainedEnsemble, numFeatures, innerArgs)
        {
            TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
        }

        [BestFriend]
        private protected TreeEnsembleModelParametersBasedOnQuantileRegressionTree(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver)
            : base(env, name, ctx, ver)
        {
            TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
        }

        private QuantileRegressionTreeEnsemble CreateTreeEnsembleFromInternalDataStructure()
        {
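            // The quantile-regression trainers only ever put InternalQuantileRegressionTree instances
            // into this ensemble (an assumption this class relies on), so the downcast below is
            // expected to be safe.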
            var trees = TrainedEnsemble.Trees.Select(tree => new QuantileRegressionTree((InternalQuantileRegressionTree)tree));
            var treeWeights = TrainedEnsemble.Trees.Select(tree => tree.Weight);
            return new QuantileRegressionTreeEnsemble(trees, treeWeights, TrainedEnsemble.Bias);
        }

        /// <summary>
        /// Used for the Summarize entrypoint.
        /// </summary>
        IDataView ICanGetSummaryAsIDataView.GetSummaryDataView(RoleMappedSchema schema)
            => RegressionTreeBaseUtils.RegressionTreeEnsembleAsIDataView(Host, TrainedTreeEnsemble.Bias, TrainedTreeEnsemble.TreeWeights, TrainedTreeEnsemble.Trees);
    }
}