// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.ComponentModel;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using Microsoft.ML.Calibrators;
using Microsoft.ML.CommandLine;
using Microsoft.ML.Data;
using Microsoft.ML.Data.Conversion;
using Microsoft.ML.FastTree.Utils;
using Microsoft.ML.Internal.Internallearn;
using Microsoft.ML.Internal.Utilities;
using Microsoft.ML.Model;
using Microsoft.ML.Model.OnnxConverter;
using Microsoft.ML.Model.Pfa;
using Microsoft.ML.Runtime;
using Microsoft.ML.Transforms;
using Microsoft.ML.TreePredictor;
using Newtonsoft.Json.Linq;
// All of these reviews apply in general to fast tree and random forest implementations.
//REVIEW: Decouple train method in Application.cs to have boosting and random forest logic separate.
//REVIEW: Do we need to keep all the fast tree based testers?
namespace Microsoft.ML.Trainers.FastTree
{
[BestFriend]
internal delegate void SignatureTreeEnsembleTrainer();
/// <summary>
/// FastTreeTrainerBase is a generic class, so its closed generic types cannot share a single static object.
/// This class provides a common object, shared by all of them, which we can use for locking purposes.
/// </summary>
internal static class FastTreeShared
{
public static readonly object TrainLock = new object();
}
public abstract class FastTreeTrainerBase<TOptions, TTransformer, TModel> :
TrainerEstimatorBaseWithGroupId<TTransformer, TModel>
where TTransformer : ISingleFeaturePredictionTransformer<TModel>
where TOptions : TreeOptions, new()
where TModel : class
{
private protected readonly TOptions FastTreeTrainerOptions;
private protected readonly bool AllowGC;
private protected int FeatureCount;
private protected InternalTreeEnsemble TrainedEnsemble;
private protected RoleMappedData ValidData;
/// <summary>
/// If not null, it's a test data set passed in from the training context. It will be converted to one element in
/// <see cref="Tests"/> by calling <see cref="ExamplesToFastTreeBins.GetCompatibleDataset"/> in <see cref="InitializeTests"/>.
/// </summary>
private protected RoleMappedData TestData;
private protected IParallelTraining ParallelTraining;
private protected OptimizationAlgorithm OptimizationAlgorithm;
private protected Dataset TrainSet;
private protected Dataset ValidSet;
/// <summary>
/// Data sets used to evaluate the prediction scores produced by the trained model during the training process.
/// </summary>
private protected Dataset[] TestSets;
private protected int[] FeatureMap;
/// <summary>
/// In the training process, <see cref="TrainSet"/>, <see cref="ValidSet"/>, <see cref="TestSets"/> would be
/// converted into <see cref="Tests"/> for efficient model evaluation.
/// </summary>
private protected List<Test> Tests;
private protected TestHistory PruningTest;
private protected int[] CategoricalFeatures;
// Test for early stopping.
private protected Test TrainTest;
private protected Test ValidTest;
private protected double[] InitTrainScores;
private protected double[] InitValidScores;
private protected double[][] InitTestScores;
private protected InternalTreeEnsemble Ensemble;
private protected bool HasValidSet => ValidSet != null;
private const string RegisterName = "FastTreeTraining";
// random for active features selection
private Random _featureSelectionRandom;
private protected string InnerOptions => CmdParser.GetSettings(Host, FastTreeTrainerOptions, new TOptions());
public override TrainerInfo Info { get; }
private protected virtual bool NeedCalibration => false;
/// <summary>
/// Constructor to use when instantiating the classes deriving from here through the API.
/// </summary>
private protected FastTreeTrainerBase(IHostEnvironment env,
SchemaShape.Column label,
string featureColumnName,
string exampleWeightColumnName,
string rowGroupColumnName,
int numberOfLeaves,
int numberOfTrees,
int minimumExampleCountPerLeaf)
: base(Contracts.CheckRef(env, nameof(env)).Register(RegisterName), TrainerUtils.MakeR4VecFeature(featureColumnName), label, TrainerUtils.MakeR4ScalarWeightColumn(exampleWeightColumnName), TrainerUtils.MakeU4ScalarColumn(rowGroupColumnName))
{
FastTreeTrainerOptions = new TOptions();
// Override the option defaults with the directly provided values.
FastTreeTrainerOptions.NumberOfLeaves = numberOfLeaves;
FastTreeTrainerOptions.NumberOfTrees = numberOfTrees;
FastTreeTrainerOptions.MinimumExampleCountPerLeaf = minimumExampleCountPerLeaf;
FastTreeTrainerOptions.LabelColumnName = label.Name;
FastTreeTrainerOptions.FeatureColumnName = featureColumnName;
FastTreeTrainerOptions.ExampleWeightColumnName = exampleWeightColumnName;
FastTreeTrainerOptions.RowGroupColumnName = rowGroupColumnName;
// The discretization step renders this trainer non-parametric, and therefore it does not need normalization.
// Also since it builds its own internal discretized columnar structures, it cannot benefit from caching.
// Finally, even the binary classifiers, being logitboost, tend to not benefit from external calibration.
Info = new TrainerInfo(normalization: false, caching: false, calibration: NeedCalibration, supportValid: true, supportTest: true);
// REVIEW: CLR 4.6 has a bug that is only exposed in Scope, and if we trigger GC.Collect in scope environment
// with memory consumption of more than 5GB, the GC gets stuck in an infinite loop.
// Before, we could check a specific type of the environment here, but now it is internal, so we will need another
// mechanism to detect that we are running in Scope.
AllowGC = true;
Initialize(env);
}
/// <summary>
/// Constructor that is used when invoking the classes deriving from this, through maml.
/// </summary>
private protected FastTreeTrainerBase(IHostEnvironment env, TOptions options, SchemaShape.Column label)
: base(Contracts.CheckRef(env, nameof(env)).Register(RegisterName), TrainerUtils.MakeR4VecFeature(options.FeatureColumnName), label, TrainerUtils.MakeR4ScalarWeightColumn(options.ExampleWeightColumnName),
TrainerUtils.MakeU4ScalarColumn(options.RowGroupColumnName))
{
Host.CheckValue(options, nameof(options));
FastTreeTrainerOptions = options;
// The discretization step renders this trainer non-parametric, and therefore it does not need normalization.
// Also since it builds its own internal discretized columnar structures, it cannot benefit from caching.
// Finally, even the binary classifiers, being logitboost, tend to not benefit from external calibration.
Info = new TrainerInfo(normalization: false, caching: false, calibration: NeedCalibration, supportValid: true, supportTest: true);
// REVIEW: CLR 4.6 has a bug that is only exposed in Scope, and if we trigger GC.Collect in scope environment
// with memory consumption of more than 5GB, the GC gets stuck in an infinite loop.
// Before, we could check a specific type of the environment here, but now it is internal, so we will need another
// mechanism to detect that we are running in Scope.
AllowGC = true;
Initialize(env);
}
private protected abstract void PrepareLabels(IChannel ch);
private protected abstract void InitializeTests();
private protected abstract Test ConstructTestForTrainingData();
private protected abstract OptimizationAlgorithm ConstructOptimizationAlgorithm(IChannel ch);
private protected abstract TreeLearner ConstructTreeLearner(IChannel ch);
private protected abstract ObjectiveFunctionBase ConstructObjFunc(IChannel ch);
private protected virtual float GetMaxLabel()
{
return float.PositiveInfinity;
}
private void Initialize(IHostEnvironment env)
{
ParallelTraining = FastTreeTrainerOptions.ParallelTrainer != null ? FastTreeTrainerOptions.ParallelTrainer.CreateComponent(env) : new SingleTrainer();
ParallelTraining.InitEnvironment();
Tests = new List<Test>();
InitializeThreads(FastTreeTrainerOptions.NumberOfThreads ?? Environment.ProcessorCount);
}
private protected void ConvertData(RoleMappedData trainData)
{
AnnotationUtils.TryGetCategoricalFeatureIndices(trainData.Schema.Schema, trainData.Schema.Feature.Value.Index, out CategoricalFeatures);
var useTranspose = UseTranspose(FastTreeTrainerOptions.DiskTranspose, trainData) && (ValidData == null || UseTranspose(FastTreeTrainerOptions.DiskTranspose, ValidData));
var instanceConverter = new ExamplesToFastTreeBins(Host, FastTreeTrainerOptions.MaximumBinCountPerFeature, useTranspose, !FastTreeTrainerOptions.FeatureFlocks, FastTreeTrainerOptions.MinimumExampleCountPerLeaf, GetMaxLabel());
TrainSet = instanceConverter.FindBinsAndReturnDataset(trainData, PredictionKind, ParallelTraining, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit);
FeatureMap = instanceConverter.FeatureMap;
if (ValidData != null)
ValidSet = instanceConverter.GetCompatibleDataset(ValidData, PredictionKind, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit);
if (TestData != null)
TestSets = new[] { instanceConverter.GetCompatibleDataset(TestData, PredictionKind, CategoricalFeatures, FastTreeTrainerOptions.CategoricalSplit) };
}
private bool UseTranspose(bool? useTranspose, RoleMappedData data)
{
Host.AssertValue(data);
Host.Assert(data.Schema.Feature.HasValue);
if (useTranspose.HasValue)
return useTranspose.Value;
var itdv = data.Data as ITransposeDataView;
return itdv?.GetSlotType(data.Schema.Feature.Value.Index) != null;
}
private protected void TrainCore(IChannel ch)
{
Contracts.CheckValue(ch, nameof(ch));
// REVIEW: Get rid of this lock once we completely remove all static classes from FastTree, such as BlockingThreadPool.
lock (FastTreeShared.TrainLock)
{
using (Timer.Time(TimerEvent.TotalInitialization))
{
CheckOptions(ch);
PrintPrologInfo(ch);
Initialize(ch);
if (FastTreeTrainerOptions.MemoryStatistics)
PrintMemoryStats(ch);
}
using (Timer.Time(TimerEvent.TotalTrain))
Train(ch);
if (FastTreeTrainerOptions.ExecutionTime)
PrintExecutionTime(ch);
TrainedEnsemble = Ensemble;
if (FeatureMap != null)
TrainedEnsemble.RemapFeatures(FeatureMap);
ParallelTraining.FinalizeEnvironment();
}
}
private protected virtual bool ShouldStop(IChannel ch, ref EarlyStoppingRuleBase earlyStopping, ref int bestIteration)
{
bestIteration = Ensemble.NumTrees;
return false;
}
private protected virtual int GetBestIteration(IChannel ch) => Ensemble.NumTrees;
private protected virtual void InitializeThreads(int numThreads)
{
ThreadTaskManager.Initialize(numThreads);
}
private protected virtual void PrintExecutionTime(IChannel ch)
{
ch.Info("Execution time breakdown:\n{0}", Timer.GetString());
}
private protected virtual void CheckOptions(IChannel ch)
{
FastTreeTrainerOptions.Check(ch);
IntArray.CompatibilityLevel = FastTreeTrainerOptions.FeatureCompressionLevel;
// change arguments
if (FastTreeTrainerOptions.HistogramPoolSize < 2)
FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumberOfLeaves * 2 / 3;
if (FastTreeTrainerOptions.HistogramPoolSize > FastTreeTrainerOptions.NumberOfLeaves - 1)
FastTreeTrainerOptions.HistogramPoolSize = FastTreeTrainerOptions.NumberOfLeaves - 1;
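// For example, with NumberOfLeaves = 20, a HistogramPoolSize below 2 is reset to 20 * 2 / 3 = 13,
// and any value above 19 is clamped down to NumberOfLeaves - 1 = 19.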
if (FastTreeTrainerOptions.BaggingSize > 0)
{
int bagCount = FastTreeTrainerOptions.NumberOfTrees / FastTreeTrainerOptions.BaggingSize;
if (bagCount * FastTreeTrainerOptions.BaggingSize != FastTreeTrainerOptions.NumberOfTrees)
throw ch.Except("Number of trees should be a multiple of number bag size");
}
if (!(0 <= FastTreeTrainerOptions.GainConfidenceLevel && FastTreeTrainerOptions.GainConfidenceLevel < 1))
throw ch.Except("Gain confidence level must be in the range [0,1)");
#if OLD_DATALOAD
#if !NO_STORE
if (_args.offloadBinsToFileStore)
{
if (!string.IsNullOrEmpty(_args.offloadBinsDirectory) && !Directory.Exists(_args.offloadBinsDirectory))
{
try
{
Directory.CreateDirectory(_args.offloadBinsDirectory);
}
catch (Exception e)
{
throw ch.Except(e, "Failure creating bins offload directory {0} - Exception {1}", _args.offloadBinsDirectory, e.Message);
}
}
}
#endif
#endif
}
/// <summary>
/// A virtual method used to print the header of the test graph.
/// Applications that need to print a test graph are supposed to override
/// it to print their specific test graph header.
/// </summary>
/// <returns>String representation of the test graph header.</returns>
private protected virtual string GetTestGraphHeader() => string.Empty;
/// <summary>
/// A virtual method used to print a single line of the test graph.
/// Applications that need to print a test graph are supposed to override
/// it to print a specific test graph line after each new iteration finishes.
/// </summary>
/// <returns>String representation of one line of the test graph.</returns>
private protected virtual string GetTestGraphLine() => string.Empty;
/// <summary>
/// A virtual method that is used to compute test results after each iteration is finished.
/// </summary>
private protected virtual void ComputeTests()
{
}
private protected void PrintTestGraph(IChannel ch)
{
// Compute the tests regardless of whether the test graph needs to be printed.
ComputeTests();
if (!FastTreeTrainerOptions.PrintTestGraph)
return;
if (Ensemble.NumTrees == 0)
ch.Info(GetTestGraphHeader());
else
ch.Info(GetTestGraphLine());
}
private protected virtual void Initialize(IChannel ch)
{
#region Load/Initialize State
using (Timer.Time(TimerEvent.InitializeLabels))
PrepareLabels(ch);
using (Timer.Time(TimerEvent.InitializeTraining))
{
InitializeEnsemble();
OptimizationAlgorithm = ConstructOptimizationAlgorithm(ch);
}
using (Timer.Time(TimerEvent.InitializeTests))
InitializeTests();
if (AllowGC)
{
GC.Collect(2, GCCollectionMode.Forced);
GC.Collect(2, GCCollectionMode.Forced);
}
#endregion
}
#if !NO_STORE
/// <summary>
/// Calculates the percentage of feature bins that will fit into memory based on current available memory in the machine.
/// </summary>
/// <returns>A float number between 0 and 1 indicating the percentage of features to load.
/// The number will not be smaller than the feature fraction value (or twice that value when feature bins are preloaded before training).</returns>
private float GetFeaturePercentInMemory(IChannel ch)
{
const float maxFeaturePercentValue = 1.0f;
float availableMemory = GetMachineAvailableBytes();
ch.Info("Available memory in the machine is = {0} bytes", availableMemory.ToString("N", CultureInfo.InvariantCulture));
float minFeaturePercentThreshold = _args.preloadFeatureBinsBeforeTraining ? (float)_args.featureFraction * 2 : (float)_args.featureFraction;
if (minFeaturePercentThreshold >= maxFeaturePercentValue)
{
return maxFeaturePercentValue;
}
// Initial free memory allowance in bytes for single and parallel fastrank modes
float freeMemoryAllowance = 1024 * 1024 * 512;
if (_optimizationAlgorithm.TreeLearner != null)
{
// Get the size of memory in bytes needed by the tree learner internal data structures
freeMemoryAllowance += _optimizationAlgorithm.TreeLearner.GetSizeOfReservedMemory();
}
availableMemory = (availableMemory > freeMemoryAllowance) ? availableMemory - freeMemoryAllowance : 0;
long featureSize = TrainSet.FeatureSetSize;
if (ValidSet != null)
{
featureSize += ValidSet.FeatureSetSize;
}
if (TestSets != null)
{
foreach (var item in TestSets)
{
featureSize += item.FeatureSetSize;
}
}
ch.Info("Total Feature bins size is = {0} bytes", featureSize.ToString("N", CultureInfo.InvariantCulture));
return Math.Min(Math.Max(minFeaturePercentThreshold, availableMemory / featureSize), maxFeaturePercentValue);
}
#endif
private protected bool[] GetActiveFeatures()
{
var activeFeatures = Utils.CreateArray(TrainSet.NumFeatures, true);
if (FastTreeTrainerOptions.FeatureFraction < 1.0)
{
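// Each feature stays active independently with probability FeatureFraction; for instance,
// with FeatureFraction = 0.7, roughly 70% of the features (in expectation) are considered
// for splits on this iteration, drawn with the fixed FeatureSelectionSeed for reproducibility.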
if (_featureSelectionRandom == null)
_featureSelectionRandom = new Random(FastTreeTrainerOptions.FeatureSelectionSeed);
for (int i = 0; i < TrainSet.NumFeatures; ++i)
{
if (activeFeatures[i])
activeFeatures[i] = _featureSelectionRandom.NextDouble() <= FastTreeTrainerOptions.FeatureFraction;
}
}
return activeFeatures;
}
private string GetDatasetStatistics(Dataset set)
{
long datasetSize = set.SizeInBytes();
int skeletonSize = set.Skeleton.SizeInBytes();
return string.Format("set contains {0} query-doc pairs in {1} queries with {2} features and uses {3} MB ({4} MB for features)",
set.NumDocs, set.NumQueries, set.NumFeatures, datasetSize / 1024 / 1024, (datasetSize - skeletonSize) / 1024 / 1024);
}
private protected virtual void PrintMemoryStats(IChannel ch)
{
Contracts.AssertValue(ch);
ch.Trace("Training {0}", GetDatasetStatistics(TrainSet));
if (ValidSet != null)
ch.Trace("Validation {0}", GetDatasetStatistics(ValidSet));
if (TestSets != null)
{
for (int i = 0; i < TestSets.Length; ++i)
ch.Trace("ComputeTests[{1}] {0}",
GetDatasetStatistics(TestSets[i]), i);
}
if (AllowGC)
ch.Trace("GC Total Memory = {0} MB", GC.GetTotalMemory(true) / 1024 / 1024);
Process currentProcess = Process.GetCurrentProcess();
ch.Trace("Working Set = {0} MB", currentProcess.WorkingSet64 / 1024 / 1024);
ch.Trace("Virtual Memory = {0} MB",
currentProcess.VirtualMemorySize64 / 1024 / 1024);
ch.Trace("Private Memory = {0} MB",
currentProcess.PrivateMemorySize64 / 1024 / 1024);
ch.Trace("Peak Working Set = {0} MB", currentProcess.PeakWorkingSet64 / 1024 / 1024);
ch.Trace("Peak Virtual Memory = {0} MB",
currentProcess.PeakVirtualMemorySize64 / 1024 / 1024);
}
private protected bool AreSamplesWeighted(IChannel ch)
{
return TrainSet.SampleWeights != null;
}
private void InitializeEnsemble()
{
Ensemble = new InternalTreeEnsemble();
}
/// <summary>
/// Creates a weights wrapper (possibly trivial) for gradient target values.
/// </summary>
private protected virtual IGradientAdjuster MakeGradientWrapper(IChannel ch)
{
if (AreSamplesWeighted(ch))
return new QueryWeightsGradientWrapper();
else
return new TrivialGradientWrapper();
}
#if !NO_STORE
/// <summary>
/// Unloads feature bins being used in the current iteration.
/// </summary>
/// <param name="featureToUnload">Boolean array indicating the features to unload</param>
private void UnloadFeatureBins(bool[] featureToUnload)
{
foreach (ScoreTracker scoreTracker in this._optimizationAlgorithm.TrackedScores)
{
for (int i = 0; i < scoreTracker.Dataset.Features.Length; i++)
{
if (featureToUnload[i])
{
// Only return buffers to the pool that were allocated using the pool
// So far only this type of IntArrays below have buffer pool support.
// This is to avoid unexpected leaks in case a new IntArray is added but we are not allocating it from the pool.
if (scoreTracker.Dataset.Features[i].Bins is DenseIntArray ||
scoreTracker.Dataset.Features[i].Bins is DeltaSparseIntArray ||
scoreTracker.Dataset.Features[i].Bins is DeltaRepeatIntArray)
{
scoreTracker.Dataset.Features[i].Bins.ReturnBuffer();
scoreTracker.Dataset.Features[i].Bins = null;
}
}
}
}
}
/// <summary>
/// Worker thread delegate that loads features for the next training iteration
/// </summary>
/// <param name="state">thread state object</param>
private void LazyFeatureLoad(object state)
{
bool[] featuresToLoad = (bool[])state;
foreach (ScoreTracker scoreTracker in this._optimizationAlgorithm.TrackedScores)
{
for (int i = 0; i < scoreTracker.Dataset.Features.Length; i++)
{
if (featuresToLoad[i])
{
// just using the Bins property so feature bins are loaded into memory
IntArray bins = scoreTracker.Dataset.Features[i].Bins;
}
}
}
}
/// <summary>
/// Iterates through the feature sets needed in future tree training iterations (i.e. in ActiveFeatureSetQueue),
/// in the same order they were enqueued, and returns the initial active features based on the percentage parameter.
/// </summary>
/// <param name="pctFeatureThreshold">A float value between 0 and 1 indicating maximum percentage of features to return</param>
/// <returns>Array indicating calculated feature list</returns>
private bool[] GetNextFeaturesByThreshold(float pctFeatureThreshold)
{
int totalUniqueFeatureCount = 0;
bool[] nextActiveFeatures = new bool[TrainSet.NumFeatures];
if (pctFeatureThreshold == 1.0f)
{
// return all features to load
return Utils.CreateArray(TrainSet.NumFeatures, true);
}
int maxNumberOfFeatures = (int)(pctFeatureThreshold * TrainSet.NumFeatures);
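// For instance, with pctFeatureThreshold = 0.5 and 100 features, up to ~50 distinct features
// are collected by walking the queued active-feature sets in FIFO order and unioning them
// until that budget is reached.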
for (int i = 0; i < _activeFeatureSetQueue.Count; i++)
{
bool[] tempActiveFeatures = _activeFeatureSetQueue.ElementAt(i);
for (int j = 0; j < tempActiveFeatures.Length; j++)
{
if (tempActiveFeatures[j] && !nextActiveFeatures[j])
{
nextActiveFeatures[j] = true;
if (totalUniqueFeatureCount++ > maxNumberOfFeatures)
return nextActiveFeatures;
}
}
}
return nextActiveFeatures;
}
/// <summary>
/// Adds several items to the ActiveFeature queue.
/// </summary>
/// <param name="numberOfItems">Number of items to add</param>
private void GenerateActiveFeatureLists(int numberOfItems)
{
for (int i = 0; i < numberOfItems; i++)
{
_activeFeatureSetQueue.Enqueue(GetActiveFeatures());
}
}
#endif
private protected virtual BaggingProvider CreateBaggingProvider()
{
Contracts.Assert(FastTreeTrainerOptions.BaggingSize > 0);
return new BaggingProvider(TrainSet, FastTreeTrainerOptions.NumberOfLeaves, FastTreeTrainerOptions.Seed, FastTreeTrainerOptions.BaggingExampleFraction);
}
private protected virtual bool ShouldRandomStartOptimizer()
{
return false;
}
private protected virtual void Train(IChannel ch)
{
Contracts.AssertValue(ch);
int numTotalTrees = FastTreeTrainerOptions.NumberOfTrees;
ch.Info(
"Reserved memory for tree learner: {0} bytes",
OptimizationAlgorithm.TreeLearner.GetSizeOfReservedMemory());
#if !NO_STORE
if (_args.offloadBinsToFileStore)
{
// Initialize feature percent to load before loading any features
_featurePercentToLoad = GetFeaturePercentInMemory(ch);
ch.Info("Using featurePercentToLoad = {0} ", _featurePercentToLoad);
}
#endif
// random starting point
bool revertRandomStart = false;
if (Ensemble.NumTrees < numTotalTrees && ShouldRandomStartOptimizer())
{
ch.Info("Randomizing start point");
OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.Seed, false);
revertRandomStart = true;
}
ch.Info("Starting to train ...");
BaggingProvider baggingProvider = FastTreeTrainerOptions.BaggingSize > 0 ? CreateBaggingProvider() : null;
#if OLD_DATALOAD
#if !NO_STORE
// Preload
GenerateActiveFeatureLists(_args.numTrees);
Thread featureLoadThread = null;
// Initial feature load
if (_args.offloadBinsToFileStore)
{
FileObjectStore<IntArrayFormatter>.GetDefaultInstance().SealObjectStore();
if (_args.preloadFeatureBinsBeforeTraining)
{
StartFeatureLoadThread(GetNextFeaturesByThreshold(_featurePercentToLoad)).Join();
}
}
#endif
#endif
EarlyStoppingRuleBase earlyStoppingRule = null;
int bestIteration = 0;
int emptyTrees = 0;
using (var pch = Host.StartProgressChannel("FastTree training"))
{
pch.SetHeader(new ProgressHeader("trees"), e => e.SetProgress(0, Ensemble.NumTrees, numTotalTrees));
while (Ensemble.NumTrees < numTotalTrees)
{
ch.Trace($"numTotalTrees left: {numTotalTrees}");
Host.CheckAlive();
using (Timer.Time(TimerEvent.Iteration))
{
#if NO_STORE
bool[] activeFeatures = GetActiveFeatures();
#else
bool[] activeFeatures = _activeFeatureSetQueue.Dequeue();
#endif
if (FastTreeTrainerOptions.BaggingSize > 0 && Ensemble.NumTrees % FastTreeTrainerOptions.BaggingSize == 0)
{
baggingProvider.GenerateNewBag();
OptimizationAlgorithm.TreeLearner.Partitioning =
baggingProvider.GetCurrentTrainingPartition();
}
#if !NO_STORE
if (_args.offloadBinsToFileStore)
{
featureLoadThread = StartFeatureLoadThread(GetNextFeaturesByThreshold(_featurePercentToLoad));
if (!_args.preloadFeatureBinsBeforeTraining)
featureLoadThread.Join();
}
#endif
// call the weak learner
var tree = OptimizationAlgorithm.TrainingIteration(ch, activeFeatures);
if (tree == null)
{
emptyTrees++;
numTotalTrees--;
}
else if (FastTreeTrainerOptions.BaggingSize > 0 && Ensemble.Trees.Count() > 0)
{
ch.Assert(Ensemble.Trees.Last() == tree);
Ensemble.Trees.Last()
.AddOutputsToScores(OptimizationAlgorithm.TrainingScores.Dataset,
OptimizationAlgorithm.TrainingScores.Scores,
baggingProvider.GetCurrentOutOfBagPartition().Documents);
}
Host.CheckAlive();
CustomizedTrainingIteration(tree);
using (Timer.Time(TimerEvent.Test))
{
PrintIterationMessage(ch, pch);
PrintTestResults(ch);
}
// revert randomized start
if (revertRandomStart)
{
revertRandomStart = false;
ch.Info("Reverting random score assignment");
OptimizationAlgorithm.TrainingScores.RandomizeScores(FastTreeTrainerOptions.Seed, true);
}
#if !NO_STORE
if (_args.offloadBinsToFileStore)
{
// Unload only features that are not needed for the next iteration
bool[] featuresToUnload = activeFeatures;
if (_args.preloadFeatureBinsBeforeTraining)
{
featuresToUnload =
activeFeatures.Zip(GetNextFeaturesByThreshold(_featurePercentToLoad),
(current, next) => current && !next).ToArray();
}
UnloadFeatureBins(featuresToUnload);
if (featureLoadThread != null &&
_args.preloadFeatureBinsBeforeTraining)
{
// wait for loading the features needed for the next iteration
featureLoadThread.Join();
}
}
#endif
if (ShouldStop(ch, ref earlyStoppingRule, ref bestIteration))
break;
}
}
if (emptyTrees > 0)
{
ch.Warning("{0} of the boosting iterations failed to grow a tree. This is commonly because the " +
"minimum documents in leaf hyperparameter was set too high for this dataset.", emptyTrees);
}
}
Host.CheckAlive();
if (earlyStoppingRule != null)
{
Contracts.Assert(numTotalTrees == 0 || bestIteration > 0);
// REVIEW: Need to reconcile with future progress reporting changes.
ch.Info("The training is stopped at {0} and iteration {1} is picked",
Ensemble.NumTrees, bestIteration);
}
else
{
bestIteration = GetBestIteration(ch);
}
Host.CheckAlive();
OptimizationAlgorithm.FinalizeLearning(bestIteration);
Host.CheckAlive();
Ensemble.PopulateRawThresholds(TrainSet);
Host.CheckAlive();
ParallelTraining.FinalizeTreeLearner();
}
#if !NO_STORE
/// <summary>
/// Gets the value of the "Available Bytes" memory performance counter on the local machine.
/// </summary>
/// <returns>The number of available bytes.</returns>
private float GetMachineAvailableBytes()
{
using (var availableBytes = new System.Diagnostics.PerformanceCounter("Memory", "Available Bytes", true))
{
return availableBytes.NextValue();
}
}
#endif
// This method is called at the end of each training iteration, with the tree that was learnt on that iteration.
// Note that this tree can be null if no tree was learnt this iteration.
private protected virtual void CustomizedTrainingIteration(InternalRegressionTree tree)
{
}
private protected virtual void PrintIterationMessage(IChannel ch, IProgressChannel pch)
{
// REVIEW: report some metrics, not just number of trees?
int iteration = Ensemble.NumTrees;
if (iteration % 50 == 49)
pch.Checkpoint(iteration + 1);
}
private protected virtual void PrintTestResults(IChannel ch)
{
if (FastTreeTrainerOptions.TestFrequency != int.MaxValue && (Ensemble.NumTrees % FastTreeTrainerOptions.TestFrequency == 0 || Ensemble.NumTrees == FastTreeTrainerOptions.NumberOfTrees))
{
var sb = new StringBuilder();
using (var sw = new StringWriter(sb))
{
foreach (var t in Tests)
{
var results = t.ComputeTests();
sw.Write(t.FormatInfoString());
}
}
if (sb.Length > 0)
ch.Info(sb.ToString());
}
}
private protected virtual void PrintPrologInfo(IChannel ch)
{
Contracts.AssertValue(ch);
ch.Trace("Host = {0}", Environment.MachineName);
ch.Trace("CommandLine = {0}", CmdParser.GetSettings(Host, FastTreeTrainerOptions, new TOptions()));
ch.Trace("GCSettings.IsServerGC = {0}", System.Runtime.GCSettings.IsServerGC);
ch.Trace("{0}", FastTreeTrainerOptions);
}
private protected ScoreTracker ConstructScoreTracker(Dataset set)
{
// If not found construct one
ScoreTracker st = null;
if (set == TrainSet)
st = OptimizationAlgorithm.GetScoreTracker("train", TrainSet, InitTrainScores);
else if (set == ValidSet)
st = OptimizationAlgorithm.GetScoreTracker("valid", ValidSet, InitValidScores);
else
{
for (int t = 0; t < TestSets.Length; ++t)
{
if (set == TestSets[t])
{
double[] initTestScores = InitTestScores?[t];
st = OptimizationAlgorithm.GetScoreTracker(string.Format("test[{0}]", t), TestSets[t], initTestScores);
}
}
}
Contracts.Check(st != null, "unknown dataset passed to ConstructScoreTracker");
return st;
}
private double[] ComputeScoresSmart(IChannel ch, Dataset set)
{
if (!FastTreeTrainerOptions.CompressEnsemble)
{
foreach (var st in OptimizationAlgorithm.TrackedScores)
if (st.Dataset == set)
{
ch.Trace("Computing scores fast");
return st.Scores;
}
}
return ComputeScoresSlow(ch, set);
}
private double[] ComputeScoresSlow(IChannel ch, Dataset set)
{
ch.Trace("Computing scores slow");
double[] scores = new double[set.NumDocs];
Ensemble.GetOutputs(set, scores);
double[] initScores = GetInitScores(set);
if (initScores != null)
{
Contracts.Check(scores.Length == initScores.Length, "Length of initscores and scores mismatch");
for (int i = 0; i < scores.Length; i++)
scores[i] += initScores[i];
}
return scores;
}
private double[] GetInitScores(Dataset set)
{
if (set == TrainSet)
return InitTrainScores;
if (set == ValidSet)
return InitValidScores;
for (int i = 0; TestSets != null && i < TestSets.Length; i++)
{
if (set == TestSets[i])
return InitTestScores?[i];
}
throw Contracts.Except("Queried for unknown set");
}
}
internal abstract class DataConverter
{
private protected readonly int NumFeatures;
public abstract int NumExamples { get; }
private protected readonly float MaxLabel;
private protected readonly PredictionKind PredictionKind;
/// <summary>
/// The per-feature bin upper bounds. Implementations may differ on when all of the items
/// in this array are initialized to non-null values, but it must happen no later
/// than immediately after we return from <see cref="GetDataset"/>.
/// </summary>
public readonly double[][] BinUpperBounds;
/// <summary>
/// In the event that any features are filtered, this will contain the feature map, where
/// the indices are the indices of features within the dataset, and the tree as we are
/// learning, and the values are the indices of the features within the original input
/// data. This array is used to "rehydrate" the tree once we finish training, so that the
/// feature indices are once again over the full set of features, as opposed to the subset
/// of features we actually trained on. This can be null in the event that no filtering
/// occurred.
/// </summary>
/// <seealso cref="InternalTreeEnsemble.RemapFeatures"/>
public int[] FeatureMap;
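// For example, FeatureMap = { 0, 2, 5 } means that dataset/tree feature 1 corresponds to
// feature 2 of the original input data, the features in between having been filtered out.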
private protected readonly IHost Host;
private protected readonly int[] CategoricalFeatureIndices;
private protected readonly bool CategoricalSplit;
private protected bool UsingMaxLabel
{
get { return MaxLabel != float.PositiveInfinity; }
}
private DataConverter(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel,
PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
{
Contracts.AssertValue(host, "host");
Host = host;
Host.CheckValue(data, nameof(data));
data.CheckFeatureFloatVector(out int featLen);
data.CheckOptFloatWeight();
data.CheckOptGroup();
NumFeatures = featLen;
if (binUpperBounds != null)
{
Host.AssertValue(binUpperBounds);
Host.Assert(Utils.Size(binUpperBounds) == NumFeatures);
Host.Assert(binUpperBounds.All(b => b != null));
BinUpperBounds = binUpperBounds;
}
else
BinUpperBounds = new double[NumFeatures][];
MaxLabel = maxLabel;
PredictionKind = kind;
CategoricalSplit = categoricalSplit;
CategoricalFeatureIndices = categoricalFeatureIndices;
}
public static DataConverter Create(RoleMappedData data, IHost host, int maxBins,
float maxLabel, bool diskTranspose, bool noFlocks, int minDocsPerLeaf, PredictionKind kind,
IParallelTraining parallelTraining, int[] categoricalFeatureIndices, bool categoricalSplit)
{
Contracts.AssertValue(host, "host");
host.AssertValue(data);
host.Assert(maxBins > 0);
DataConverter conv;
using (var ch = host.Start("CreateConverter"))
{
if (!diskTranspose)
conv = new MemImpl(data, host, maxBins, maxLabel, noFlocks, minDocsPerLeaf, kind,
parallelTraining, categoricalFeatureIndices, categoricalSplit);
else
conv = new DiskImpl(data, host, maxBins, maxLabel, kind, parallelTraining, categoricalFeatureIndices, categoricalSplit);
}
return conv;
}
public static DataConverter Create(RoleMappedData data, IHost host, double[][] binUpperBounds,
float maxLabel, bool diskTranspose, bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
{
Contracts.AssertValue(host, "host");
host.AssertValue(data);
DataConverter conv;
using (var ch = host.Start("CreateConverter"))
{
if (!diskTranspose)
conv = new MemImpl(data, host, binUpperBounds, maxLabel, noFlocks, kind, categoricalFeatureIndices, categoricalSplit);
else
conv = new DiskImpl(data, host, binUpperBounds, maxLabel, kind, categoricalFeatureIndices, categoricalSplit);
}
return conv;
}
public abstract Dataset GetDataset();
/// <summary>
/// Bins an input vector of feature values.
/// </summary>
/// <param name="binFinder">The instead of the bin finder to use</param>
/// <param name="values">The values for one particular feature value across all examples</param>
/// <param name="maxBins">The maximum number of bins to find</param>
/// <param name="minDocsPerLeaf"></param>
/// <param name="upperBounds">The bin upper bounds, maximum length will be <paramref name="maxBins"/></param>
/// <returns>Whether finding the bins was successful or not. It will be unsuccessful iff <paramref name="values"/>
/// has any missing values. In that event, the out parameters will be left as null.</returns>
private protected static bool CalculateBins(BinFinder binFinder, in VBuffer<double> values, int maxBins, int minDocsPerLeaf,
out double[] upperBounds)
{
return binFinder.FindBins(in values, maxBins, minDocsPerLeaf, out upperBounds);
}
private static IEnumerable<KeyValuePair<int, int>> NonZeroBinnedValuesForSparse(ReadOnlySpan<double> values, ReadOnlySpan<int> indices, double[] binUpperBounds)
{
Contracts.Assert(values.Length == indices.Length);
Contracts.Assert(Algorithms.FindFirstGE(binUpperBounds, 0) == 0);
var result = new List<KeyValuePair<int, int>>();
for (int i = 0; i < values.Length; ++i)
{
int ge = Algorithms.FindFirstGE(binUpperBounds, values[i]);
if (ge != 0)
result.Add(new KeyValuePair<int, int>(indices[i], ge));
}
return result;
}
private FeatureFlockBase CreateOneHotFlock(IChannel ch,
List<int> features, int[] binnedValues, int[] lastOn, ValuesList[] instanceList,
ref int[] forwardIndexerWork, ref VBuffer<double> temp, bool categorical)
{
Contracts.AssertValue(ch);
ch.Assert(0 <= features.Min() && features.Max() < NumFeatures);
ch.Assert(features.Count > 0);
if (features.Count == 1)
{
// Singleton.
int fi = features[0];
var values = instanceList[fi];
values.CopyTo(NumExamples, ref temp);
return CreateSingletonFlock(ch, in temp, binnedValues, BinUpperBounds[fi]);
}
// Multiple, one hot.
int[] hotFeatureStarts = new int[features.Count + 1];
// The position 0 is reserved as the "cold" position for all features in the slot.
// This corresponds to all features being in their first bin (for example, cold). So the
// first feature's "hotness" starts at 1. HOWEVER, for the purpose of defining the
// bins, we start with this array computed off by one. Once we define the bins, we
// will correct it.
hotFeatureStarts[0] = 0;
// There are as many hot positions per feature as there are number of bin upper
// bounds, minus 1. (The first bin is the "cold" position.)
for (int i = 1; i < hotFeatureStarts.Length; ++i)
hotFeatureStarts[i] = hotFeatureStarts[i - 1] + BinUpperBounds[features[i - 1]].Length - 1;
IntArrayBits flockBits = IntArray.NumBitsNeeded(hotFeatureStarts[hotFeatureStarts.Length - 1] + 1);
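// Worked example: for a flock of two features whose BinUpperBounds have lengths 3 and 4,
// hotFeatureStarts is first computed (off by one) as { 0, 2, 5 }, so binned values land at
// 0 (the shared cold position), 1-2 (feature 0's hot bins) and 3-5 (feature 1's hot bins),
// and flockBits is sized for those 6 distinct positions. The post-binning correction below
// turns hotFeatureStarts into { 1, 3, 6 }, the form the flock itself expects.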
int min = features[0];
int lim = features[features.Count - 1] + 1;
var ind = new ValuesList.ForwardIndexer(instanceList, features.ToArray(), ref forwardIndexerWork);
int[] f2sf = Utils.CreateArray(lim - min, -1);
for (int i = 0; i < features.Count; ++i)
f2sf[features[i] - min] = i;
int hotCount = 0;
for (int i = 0; i < lastOn.Length; ++i)
{
int fi = lastOn[i];
if (fi < min || fi >= lim)
{
// All of the features would bin to 0, so we're in the "cold" position.
binnedValues[i] = 0;
#if false // This would be a very nice test to have, but for some situations it's too slow, even for debug builds. Consider reactivating temporarily if actively working on flocks.
// Assert that all the features really would be cold for this position.
Contracts.Assert(Enumerable.Range(min, lim - min).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
continue;
}
ch.Assert(min <= fi && fi < lim);
int subfeature = f2sf[fi - min];
ch.Assert(subfeature >= 0);
double val = ind[subfeature, i];
#if false // Same note, too slow even for debug builds.
// Assert that all the other features really would be cold for this position.
Contracts.Assert(Enumerable.Range(min, fi - min).Concat(Enumerable.Range(fi + 1, lim - (fi + 1))).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
double[] bub = BinUpperBounds[fi];
ch.Assert(bub.Length > 1);
int bin = Algorithms.FindFirstGE(bub, val);
ch.Assert(0 < bin && bin < bub.Length); // If 0, should not have been considered "on", so what the heck?
binnedValues[i] = hotFeatureStarts[subfeature] + bin;
hotCount++;
}
#if DEBUG
int limBin = (1 << (int)flockBits);
Contracts.Assert(flockBits == IntArrayBits.Bits32 || binnedValues.All(b => b < limBin));
#endif
// Correct the hot feature starts now that we're done binning.
for (int f = 0; f < hotFeatureStarts.Length; ++f)
hotFeatureStarts[f]++;
// Construct the int array of binned values.
const double sparsifyThreshold = 0.7;
IntArrayType type = hotCount < (1 - sparsifyThreshold) * NumExamples
? IntArrayType.Sparse
: IntArrayType.Dense;
IntArray bins = IntArray.New(NumExamples, type, flockBits, binnedValues);
var bups = features.Select(fi => BinUpperBounds[fi]).ToArray(features.Count);
return new OneHotFeatureFlock(bins, hotFeatureStarts, bups, categorical);
}
private FeatureFlockBase CreateOneHotFlockCategorical(IChannel ch,
List<int> features, int[] binnedValues, int[] lastOn, bool categorical)
{
Contracts.AssertValue(ch);
ch.Assert(0 <= features.Min() && features.Max() < NumFeatures);
ch.Assert(features.Count > 1);
// Multiple, one hot.
int[] hotFeatureStarts = new int[features.Count + 1];
// The position 0 is reserved as the "cold" position for all features in the slot.
// This corresponds to all features being in their first bin (for example, cold). So the
// first feature's "hotness" starts at 1. HOWEVER, for the purpose of defining the
// bins, we start with this array computed off by one. Once we define the bins, we
// will correct it.
hotFeatureStarts[0] = 0;
// There are as many hot positions per feature as there are number of bin upper
// bounds, minus 1. (The first bin is the "cold" position.)
for (int i = 1; i < hotFeatureStarts.Length; ++i)
hotFeatureStarts[i] = hotFeatureStarts[i - 1] + BinUpperBounds[features[i - 1]].Length - 1;
IntArrayBits flockBits = IntArray.NumBitsNeeded(hotFeatureStarts[hotFeatureStarts.Length - 1] + 1);
int min = features[0];
int lim = features[features.Count - 1] + 1;
int[] f2sf = Utils.CreateArray(lim - min, -1);
for (int i = 0; i < features.Count; ++i)
f2sf[features[i] - min] = i;
int hotCount = 0;
for (int i = 0; i < lastOn.Length; ++i)
{
int fi = lastOn[i];
if (fi < min || fi >= lim)
{
// All of the features would bin to 0, so we're in the "cold" position.
binnedValues[i] = 0;
#if false // This would be a very nice test to have, but for some situations it's too slow, even for debug builds. Consider reactivating temporarily if actively working on flocks.
// Assert that all the features really would be cold for this position.
Contracts.Assert(Enumerable.Range(min, lim - min).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
continue;
}
ch.Assert(min <= fi && fi < lim);
int subfeature = f2sf[fi - min];
ch.Assert(subfeature >= 0);
#if false // Same note, too slow even for debug builds.
// Assert that all the other features really would be cold for this position.
Contracts.Assert(Enumerable.Range(min, fi - min).Concat(Enumerable.Range(fi + 1, lim - (fi + 1))).All(f => ind[f, i] < BinUpperBounds[f][0]));
#endif
double[] bub = BinUpperBounds[fi];
ch.Assert(bub.Length == 2);
//REVIEW: Leaving out the check for the value to reduce memory consumption and going with a
//leap of faith based on what the user specified.
binnedValues[i] = hotFeatureStarts[subfeature] + 1;
hotCount++;
}
#if DEBUG
int limBin = (1 << (int)flockBits);
Contracts.Assert(flockBits == IntArrayBits.Bits32 || binnedValues.All(b => b < limBin));
#endif
// Correct the hot feature starts now that we're done binning.
for (int f = 0; f < hotFeatureStarts.Length; ++f)
hotFeatureStarts[f]++;
// Construct the int array of binned values.
const double sparsifyThreshold = 0.7;
IntArrayType type = hotCount < (1 - sparsifyThreshold) * NumExamples
? IntArrayType.Sparse
: IntArrayType.Dense;
IntArray bins = IntArray.New(NumExamples, type, flockBits, binnedValues);
var bups = features.Select(fi => BinUpperBounds[fi]).ToArray(features.Count);
return new OneHotFeatureFlock(bins, hotFeatureStarts, bups, categorical);
}
/// <summary>
/// Create a new singleton feature flock from the given values and the specified bin upper bounds.
/// </summary>
/// <param name="ch"></param>
/// <param name="values">The values for this feature, that will be binned.</param>
/// <param name="binnedValues">A working array of length equal to the length of the input feature vector</param>
/// <param name="binUpperBounds">The upper bounds of the binning of this feature.</param>
/// <returns>A binned derived feature vector, as a singleton flock.</returns>
private protected static SingletonFeatureFlock CreateSingletonFlock(IChannel ch, in VBuffer<double> values, int[] binnedValues,
double[] binUpperBounds)
{
Contracts.AssertValue(ch);
ch.Assert(Utils.Size(binUpperBounds) > 0);
ch.AssertValue(binnedValues);
ch.Assert(binnedValues.Length == values.Length);
// TODO: Consider trying to speed up FindFirstGE by making a "map" like is done in the fastrank code
// TODO: Cache binnedValues
int zeroBin = Algorithms.FindFirstGE(binUpperBounds, 0);
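// zeroBin is the bin index that implicit (sparse) zeros fall into. For example, with
// binUpperBounds = { 0.5, 2.5, double.PositiveInfinity } a zero value bins to index 0,
// so a sufficiently sparse input can be binned by touching only its explicit values; with
// binUpperBounds = { -0.5, 2.5, double.PositiveInfinity } zeros bin to index 1 and the
// fill path below is used instead.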
// TODO: Make this a settable parameter / use the sparsifyThreshold already in the parameters
const double sparsifyThreshold = 0.7;
IntArray bins = null;
var valuesValues = values.GetValues();
var numBitsNeeded = IntArray.NumBitsNeeded(binUpperBounds.Length);
if (numBitsNeeded == IntArrayBits.Bits0)
bins = new Dense0BitIntArray(values.Length);
else if (!values.IsDense && zeroBin == 0 && valuesValues.Length < (1 - sparsifyThreshold) * values.Length)
{
// Special code to go straight from our own sparse format to a sparse IntArray.
// Note: requires zeroBin to be 0 because that's what's assumed in FastTree code
var nonZeroValues = NonZeroBinnedValuesForSparse(valuesValues, values.GetIndices(), binUpperBounds);
bins = new DeltaSparseIntArray(values.Length, numBitsNeeded, nonZeroValues);
}
else
{
// Fill the binnedValues array and convert using normal IntArray code
int firstBinCount = 0;
if (!values.IsDense)
{
if (zeroBin != 0)
{
for (int i = 0; i < values.Length; i++)
binnedValues[i] = zeroBin;
}
else
Array.Clear(binnedValues, 0, values.Length);
var valuesIndices = values.GetIndices();
for (int i = 0; i < valuesValues.Length; ++i)
{
if ((binnedValues[valuesIndices[i]] = Algorithms.FindFirstGE(binUpperBounds, valuesValues[i])) == 0)
firstBinCount++;
}
if (zeroBin == 0)
firstBinCount += values.Length - valuesValues.Length;
}
else
{
for (int i = 0; i < valuesValues.Length; i++)
{
if (valuesValues[i] == 0)
binnedValues[i] = zeroBin;
else
binnedValues[i] = Algorithms.FindFirstGE(binUpperBounds, valuesValues[i]);
if (binnedValues[i] == 0)
firstBinCount++;
}
}
// This sparsity check came from the FastRank code.
double firstBinFrac = (double)firstBinCount / binnedValues.Length;
IntArrayType arrayType = firstBinFrac > sparsifyThreshold ? IntArrayType.Sparse : IntArrayType.Dense;
bins = IntArray.New(values.Length, arrayType, IntArray.NumBitsNeeded(binUpperBounds.Length), binnedValues);
}
return new SingletonFeatureFlock(bins, binUpperBounds);
}
private sealed class DiskImpl : DataConverter
{
private readonly int _numExamples;
private readonly Dataset _dataset;
public override int NumExamples { get { return _numExamples; } }
public DiskImpl(RoleMappedData data, IHost host, int maxBins, float maxLabel, PredictionKind kind,
IParallelTraining parallelTraining, int[] categoricalFeatureIndices, bool categoricalSplit)
: base(data, host, null, maxLabel, kind, categoricalFeatureIndices, categoricalSplit)
{
// use parallel training for training data
Host.AssertValue(parallelTraining);
_dataset = Construct(data, ref _numExamples, maxBins, parallelTraining);
}
public DiskImpl(RoleMappedData data, IHost host,
double[][] binUpperBounds, float maxLabel, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
: base(data, host, binUpperBounds, maxLabel, kind, categoricalFeatureIndices, categoricalSplit)
{
_dataset = Construct(data, ref _numExamples, -1, null);
}
public override Dataset GetDataset()
{
return _dataset;
}
private static int AddColumnIfNeeded(DataViewSchema.Column? info, List<int> toTranspose)
{
if (!info.HasValue)
return -1;
// It is entirely possible that a single column could have two roles,
// and so be added twice, but this case is handled by the transposer.
var idx = info.Value.Index;
toTranspose.Add(idx);
return idx;
}
private ValueMapper<VBuffer<T1>, VBuffer<T2>> GetCopier<T1, T2>(DataViewType itemType1, DataViewType itemType2)
{
var conv = Conversions.DefaultInstance.GetStandardConversion<T1, T2>(itemType1, itemType2, out bool identity);
if (identity)
{
ValueMapper<VBuffer<T1>, VBuffer<T1>> identityResult =
(in VBuffer<T1> src, ref VBuffer<T1> dst) => src.CopyTo(ref dst);
return (ValueMapper<VBuffer<T1>, VBuffer<T2>>)(object)identityResult;
}
return
(in VBuffer<T1> src, ref VBuffer<T2> dst) =>
{
var srcValues = src.GetValues();
var editor = VBufferEditor.Create(ref dst, src.Length, srcValues.Length);
if (srcValues.Length > 0)
{
if (!src.IsDense)
{
src.GetIndices().CopyTo(editor.Indices);
}
for (int i = 0; i < srcValues.Length; ++i)
conv(in srcValues[i], ref editor.Values[i]);
}
dst = editor.Commit();
};
}
private Dataset Construct(RoleMappedData examples, ref int numExamples, int maxBins, IParallelTraining parallelTraining)
{
Host.CheckAlive();
Host.AssertValue(examples);
Host.Assert(examples.Schema.Feature.HasValue);
if (parallelTraining == null)
Host.AssertValue(BinUpperBounds);
Dataset result;
using (var ch = Host.Start("Conversion"))
{
// Add a missing value filter on the features.
// REVIEW: Possibly filter out missing labels, but we don't do this in current FastTree conversion.
//var missingArgs = new MissingValueFilter.Arguments();
//missingArgs.column = new string[] { examples.Schema.Feature.Name };
//IDataView data = new MissingValueFilter(missingArgs, Host, examples.Data);
IDataView data = examples.Data;
// Convert the label column, if one exists.
var labelName = examples.Schema.Label?.Name;
if (labelName != null)
{
var convArgs = new LabelConvertTransform.Arguments();
var convCol = new LabelConvertTransform.Column() { Name = labelName, Source = labelName };
convArgs.Columns = new LabelConvertTransform.Column[] { convCol };
data = new LabelConvertTransform(Host, convArgs, data);
}
// Convert the group column, if one exists.
if (examples.Schema.Group?.Name is string groupName)
data = new TypeConvertingTransformer(Host, new TypeConvertingEstimator.ColumnOptions(groupName, DataKind.UInt64, groupName)).Transform(data);
// Since we've passed it through a few transforms, reconstitute the mapping on the
// newly transformed data.
examples = new RoleMappedData(data, examples.Schema.GetColumnRoleNames());
// Get the index of the columns in the transposed view, while we're at it composing
// the list of the columns we want to transpose.
var toTranspose = new List<int>();
int featIdx = AddColumnIfNeeded(examples.Schema.Feature, toTranspose);
int labelIdx = AddColumnIfNeeded(examples.Schema.Label, toTranspose);
int groupIdx = AddColumnIfNeeded(examples.Schema.Group, toTranspose);
int weightIdx = AddColumnIfNeeded(examples.Schema.Weight, toTranspose);
Host.Assert(1 <= toTranspose.Count && toTranspose.Count <= 4);
ch.Info("Changing data from row-wise to column-wise on disk");
// Note that if these columns are already transposed, then this will be a no-op.
using (Transposer trans = Transposer.Create(Host, data, false, toTranspose.ToArray()))
{
VBuffer<float> temp = default(VBuffer<float>);
// Construct the derived features.
var features = new FeatureFlockBase[NumFeatures];
BinFinder finder = new BinFinder();
FeaturesToContentMap fmap = new FeaturesToContentMap(examples.Schema);
var hasMissingPred = Conversions.DefaultInstance.GetHasMissingPredicate<float>(((ITransposeDataView)trans).GetSlotType(featIdx));
// There is no good mechanism to filter out rows with missing feature values on transposed data.
// So, we instead perform one featurization pass which, if successful, will remain one pass but,
// if we ever encounter missing values, will become a "detect missing features" pass, which will
// in turn inform a necessary secondary featurization pass.
SlotDropper slotDropper = null;
bool[] localConstructBinFeatures = Utils.CreateArray<bool>(NumFeatures, true);
if (parallelTraining != null)
localConstructBinFeatures = parallelTraining.GetLocalBinConstructionFeatures(NumFeatures);
using (var pch = Host.StartProgressChannel("FastTree disk-based bins initialization"))
{
for (; ; )
{
bool hasMissing = false;
using (var cursor = trans.GetSlotCursor(featIdx))
{
HashSet<int> constructed = new HashSet<int>();
var getter = SubsetGetter(cursor.GetGetter<float>(), slotDropper);
numExamples = slotDropper?.DstLength ?? trans.RowCount;
// Perhaps we should change the binning to just work over singles.
VBuffer<double> doubleTemp = default(VBuffer<double>);
var copier = GetCopier<float, double>(NumberDataViewType.Single, NumberDataViewType.Double);
int iFeature = 0;
pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, features.Length));
while (cursor.MoveNext())
{
Host.CheckAlive();
iFeature = cursor.SlotIndex;
if (!localConstructBinFeatures[iFeature])
continue;
Host.Assert(iFeature < features.Length);
Host.Assert(features[iFeature] == null);
getter(ref temp);
Host.Assert(temp.Length == numExamples);
// First get the bin bounds, constructing them if they do not exist.
if (BinUpperBounds[iFeature] == null)
{
constructed.Add(iFeature);
ch.Assert(maxBins > 0);
finder = finder ?? new BinFinder();
// Must copy over, as bin calculation is potentially destructive.
copier(in temp, ref doubleTemp);
hasMissing = !CalculateBins(finder, in doubleTemp, maxBins, 0,
out BinUpperBounds[iFeature]);
}
else
hasMissing = hasMissingPred(in temp);
if (hasMissing)
{
// Let's just be a little extra safe, since it's so easy to check and the results if there
// is a bug in the upstream pipeline would be very severe.
ch.Check(slotDropper == null,
"Multiple passes over the data seem to be producing different data. There is a bug in the upstream pipeline.");
// Destroy any constructed bin upper bounds. We'll calculate them over the next pass.
foreach (var i in constructed)
BinUpperBounds[i] = null;
// Determine what rows have missing values.
slotDropper = ConstructDropSlotRanges(cursor, getter, ref temp);
ch.Assert(slotDropper.DstLength < temp.Length);
ch.Warning("{0} of {1} examples will be skipped due to missing feature values",
temp.Length - slotDropper.DstLength, temp.Length);
break;
}
Host.AssertValue(BinUpperBounds[iFeature]);
}
}
if (hasMissing == false)
break;
}
// Sync up global boundaries.
if (parallelTraining != null)
parallelTraining.SyncGlobalBoundary(NumFeatures, maxBins, BinUpperBounds);
List<FeatureFlockBase> flocks = new List<FeatureFlockBase>();
using (var cursor = trans.GetSlotCursor(featIdx))
using (var catCursor = trans.GetSlotCursor(featIdx))
{
var getter = SubsetGetter(cursor.GetGetter<float>(), slotDropper);
var catGetter = SubsetGetter(catCursor.GetGetter<float>(), slotDropper);
numExamples = slotDropper?.DstLength ?? trans.RowCount;
// Perhaps we should change the binning to just work over singles.
VBuffer<double> doubleTemp = default(VBuffer<double>);
int[] binnedValues = new int[numExamples];
var copier = GetCopier<float, double>(NumberDataViewType.Single, NumberDataViewType.Double);
int iFeature = 0;
if (CategoricalSplit && CategoricalFeatureIndices != null)
{
int[] lastOn = new int[NumExamples];
for (int i = 0; i < lastOn.Length; ++i)
lastOn[i] = -1;
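// lastOn[row] tracks the index of the last categorical feature seen "hot" (non-cold) for
// that row. Within a declared categorical range the features are expected to form a one-hot
// encoding, so if a second feature of the same range turns out hot for the same row, or a
// hot value is not exactly 1, the range is abandoned as one-hot and its features are
// treated as ordinary singleton features instead.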
List<int> pending = new List<int>();
int catRangeIndex = 0;
for (iFeature = 0; iFeature < NumFeatures;)
{
Host.CheckAlive();
if (catRangeIndex < CategoricalFeatureIndices.Length &&
CategoricalFeatureIndices[catRangeIndex] == iFeature)
{
pending.Clear();
bool oneHot = true;
for (int iFeatureLocal = iFeature;
iFeatureLocal <= CategoricalFeatureIndices[catRangeIndex + 1];
++iFeatureLocal)
{
double[] bup = BinUpperBounds[iFeatureLocal];
if (bup.Length == 1)
{
// This is a trivial feature. Skip it.
continue;
}
Contracts.Assert(Utils.Size(bup) > 0);
double firstBin = bup[0];
GetFeatureValues(catCursor, iFeatureLocal, catGetter, ref temp, ref doubleTemp, copier);
bool add = false;
var doubleTempValues = doubleTemp.GetValues();
var doubleTempIndices = doubleTemp.GetIndices();
for (int index = 0; index < doubleTempValues.Length; ++index)
{
if (doubleTempValues[index] <= firstBin)
continue;
int iindex = doubleTemp.IsDense ? index : doubleTempIndices[index];
int last = lastOn[iindex];
if (doubleTempValues[index] != 1 || (last != -1 && last >= iFeature))
{
catRangeIndex += 2;
pending.Clear();
oneHot = false;
break;
}
lastOn[iindex] = iFeatureLocal;
add = true;
}
if (!oneHot)
break;
if (add)
pending.Add(iFeatureLocal);
}
if (!oneHot)
continue;
if (pending.Count > 0)
{
flocks.Add(CreateOneHotFlockCategorical(ch, pending, binnedValues,
lastOn, true));
}
iFeature = CategoricalFeatureIndices[catRangeIndex + 1] + 1;
catRangeIndex += 2;
}
else
{
GetFeatureValues(cursor, iFeature, getter, ref temp, ref doubleTemp, copier);
double[] upperBounds = BinUpperBounds[iFeature++];
Host.AssertValue(upperBounds);
if (upperBounds.Length == 1)
continue; //trivial feature, skip it.
flocks.Add(CreateSingletonFlock(ch, in doubleTemp, binnedValues, upperBounds));
}
}
}
else
{
for (int i = 0; i < NumFeatures; i++)
{
Host.CheckAlive();
GetFeatureValues(cursor, i, getter, ref temp, ref doubleTemp, copier);
double[] upperBounds = BinUpperBounds[i];
Host.AssertValue(upperBounds);
if (upperBounds.Length == 1)
continue; //trivial feature, skip it.
flocks.Add(CreateSingletonFlock(ch, in doubleTemp, binnedValues, upperBounds));
}
}
Contracts.Assert(FeatureMap == null);
FeatureMap = Enumerable.Range(0, NumFeatures).Where(f => BinUpperBounds[f].Length > 1).ToArray();
features = flocks.ToArray();
}
}
// Construct the labels.
short[] ratings = new short[numExamples];
double[] actualLabels = new double[numExamples];
if (labelIdx >= 0)
{
trans.GetSingleSlotValue<float>(labelIdx, ref temp);
slotDropper?.DropSlots(ref temp, ref temp);
var tempValues = temp.GetValues();
var tempIndices = temp.GetIndices();
for (int i = 0; i < tempValues.Length; ++i)
{
int ii = temp.IsDense ? i : tempIndices[i];
var label = tempValues[i];
if (UsingMaxLabel && !(0 <= label && label <= MaxLabel))
throw Host.Except("Found invalid label {0}. Value should be between 0 and {1}, inclusive.", label, MaxLabel);
ratings[ii] = (short)label;
actualLabels[ii] = (double)label;
}
}
// Construct the boundaries and query IDs.
int[] boundaries;
ulong[] qids;
if (PredictionKind == PredictionKind.Ranking)
{
if (groupIdx < 0)
throw ch.Except("You need to provide {0} column for Ranking problem", DefaultColumnNames.GroupId);
VBuffer<ulong> groupIds = default(VBuffer<ulong>);
trans.GetSingleSlotValue<ulong>(groupIdx, ref groupIds);
slotDropper?.DropSlots(ref groupIds, ref groupIds);
ConstructBoundariesAndQueryIds(in groupIds, out boundaries, out qids);
}
else
{
if (groupIdx >= 0)
ch.Warning("This is not ranking problem, Group Id '{0}' column will be ignored", examples.Schema.Group.Value.Name);
const int queryChunkSize = 100;
qids = new ulong[(numExamples - 1) / queryChunkSize + 1];
boundaries = new int[qids.Length + 1];
for (int i = 0; i < qids.Length; ++i)
{
qids[i] = (ulong)i;
boundaries[i + 1] = boundaries[i] + queryChunkSize;
}
boundaries[boundaries.Length - 1] = numExamples;
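// For example, with numExamples = 250 this produces three synthetic queries,
// qids = { 0, 1, 2 } and boundaries = { 0, 100, 200, 250 }, i.e. documents are grouped
// into chunks of at most queryChunkSize for the non-ranking case.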
}
// Construct the doc IDs. Doesn't really matter what these are.
ulong[] dids = Enumerable.Range(0, numExamples).Select(d => (ulong)d).ToArray(numExamples);
var skeleton = new Dataset.DatasetSkeleton(ratings, boundaries, qids, dids, new double[0][], actualLabels);
Host.Assert(features.All(f => f != null));
result = new Dataset(skeleton, features);
}
}
return result;
}
private void GetFeatureValues(SlotCursor cursor, int iFeature, ValueGetter<VBuffer<float>> getter,
ref VBuffer<float> temp, ref VBuffer<double> doubleTemp, ValueMapper<VBuffer<float>, VBuffer<double>> copier)
{
while (cursor.MoveNext())
{
Contracts.Assert(iFeature >= cursor.SlotIndex);
if (iFeature == cursor.SlotIndex)
break;
}
Contracts.Assert(cursor.SlotIndex == iFeature);
getter(ref temp);
copier(in temp, ref doubleTemp);
}
private static ValueGetter<VBuffer<T>> SubsetGetter<T>(ValueGetter<VBuffer<T>> getter, SlotDropper slotDropper)
{
if (slotDropper == null)
return getter;
return slotDropper.SubsetGetter(getter);
}
/// <summary>
/// Returns a slot dropper object that has ranges of slots to be dropped,
/// based on an examination of the feature values.
/// </summary>
private static SlotDropper ConstructDropSlotRanges(SlotCursor cursor,
ValueGetter<VBuffer<float>> getter, ref VBuffer<float> temp)
{
// The iteration here is slightly different from a usual cursor iteration. Here, temp
// already holds the value at the cursor's current position, we don't really want
// to re-fetch it, and the cursor has necessarily already been advanced.
Contracts.Assert(cursor.SlotIndex >= 0);
BitArray rowHasMissing = new BitArray(temp.Length);
for (; ; )
{
foreach (var kv in temp.Items())
{
if (float.IsNaN(kv.Value))
rowHasMissing.Set(kv.Key, true);
}
if (!cursor.MoveNext())
break;
getter(ref temp);
}
List<int> minSlots = new List<int>();
List<int> maxSlots = new List<int>();
bool previousBit = false;
for (int i = 0; i < rowHasMissing.Length; i++)
{
bool currentBit = rowHasMissing.Get(i);
if (currentBit && !previousBit)
{
minSlots.Add(i);
maxSlots.Add(i);
}
else if (currentBit)
maxSlots[maxSlots.Count - 1] = i;
previousBit = currentBit;
}
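// At this point minSlots/maxSlots hold the inclusive ranges of slots that contained any NaN.
// For example, if slots 1, 2 and 4 had missing values, then minSlots = {1, 4} and maxSlots = {2, 4}.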
Contracts.Assert(maxSlots.Count == minSlots.Count);
return new SlotDropper(temp.Length, minSlots.ToArray(), maxSlots.ToArray());
}
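/// <summary>
/// Converts the per-row group ID column into query boundaries: each run of identical consecutive
/// group IDs becomes one query. For example, group IDs {5, 5, 7, 7, 7, 9} produce
/// qids = {5, 7, 9} and boundaries = {0, 2, 5, 6}.
/// </summary>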
private static void ConstructBoundariesAndQueryIds(in VBuffer<ulong> groupIds, out int[] boundariesArray, out ulong[] qidsArray)
{
List<ulong> qids = new List<ulong>();
List<int> boundaries = new List<int>();
ulong last = 0;
if (groupIds.Length > 0)
groupIds.GetItemOrDefault(0, ref last);
int count = 0;
foreach (ulong groupId in groupIds.DenseValues())
{
if (count == 0 || last != groupId)
{
qids.Add(last = groupId);
boundaries.Add(count);
}
count++;
}
boundaries.Add(count);
qidsArray = qids.ToArray();
boundariesArray = boundaries.ToArray();
}
}
// REVIEW: Our data conversion is extremely inefficient. Fix it!
private sealed class MemImpl : DataConverter
{
private readonly RoleMappedData _data;
// instanceList[feature] is the vector of values for the given feature
private readonly ValuesList[] _instanceList;
private readonly List<short> _targetsList;
private readonly List<double> _actualTargets;
private readonly List<double> _weights;
private readonly List<int> _boundaries;
private readonly long _numMissingInstances;
private readonly int _numExamples;
private readonly bool _noFlocks;
private readonly int _minDocsPerLeaf;
public override int NumExamples
{
get { return _numExamples; }
}
private MemImpl(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel, bool dummy,
bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
: base(data, host, binUpperBounds, maxLabel, kind, categoricalFeatureIndices, categoricalSplit)
{
_data = data;
// Array of ValuesList objects, one per feature, containing the values of that feature over all rows
_instanceList = new ValuesList[NumFeatures];
for (int i = 0; i < _instanceList.Length; i++)
_instanceList[i] = new ValuesList();
// Labels.
_targetsList = new List<short>();
_actualTargets = new List<double>();
_weights = data.Schema.Weight != null ? new List<double>() : null;
_boundaries = new List<int>();
_noFlocks = noFlocks;
MakeBoundariesAndCheckLabels(out _numMissingInstances, out long numInstances);
if (numInstances > Utils.ArrayMaxSize)
throw Host.ExceptParam(nameof(data), "Input data had {0} rows, but can only accommodate {1}", numInstances, Utils.ArrayMaxSize);
_numExamples = (int)numInstances;
}
public MemImpl(RoleMappedData data, IHost host, int maxBins, float maxLabel, bool noFlocks, int minDocsPerLeaf,
PredictionKind kind, IParallelTraining parallelTraining, int[] categoricalFeatureIndices, bool categoricalSplit)
: this(data, host, null, maxLabel, dummy: true, noFlocks: noFlocks, kind: kind,
categoricalFeatureIndices: categoricalFeatureIndices, categoricalSplit: categoricalSplit)
{
// Convert features to binned values.
_minDocsPerLeaf = minDocsPerLeaf;
InitializeBins(maxBins, parallelTraining);
}
public MemImpl(RoleMappedData data, IHost host, double[][] binUpperBounds, float maxLabel,
bool noFlocks, PredictionKind kind, int[] categoricalFeatureIndices, bool categoricalSplit)
: this(data, host, binUpperBounds, maxLabel, dummy: true, noFlocks: noFlocks, kind: kind,
categoricalFeatureIndices: categoricalFeatureIndices, categoricalSplit: categoricalSplit)
{
Host.AssertValue(binUpperBounds);
}
private void MakeBoundariesAndCheckLabels(out long missingInstances, out long totalInstances)
{
using (var ch = Host.Start("InitBoundariesAndLabels"))
using (var pch = Host.StartProgressChannel("FastTree data preparation"))
{
long featureValues = 0;
// Warn at about 2 GB usage.
const long featureValuesWarnThreshold = (2L << 30) / sizeof(double);
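// That is, warn once roughly 2 GiB worth of doubles (2^31 / 8 = 268,435,456 values) have been stored.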
bool featureValuesWarned = false;
const string featureValuesWarning = "We seem to be processing a lot of data. Consider using the FastTree diskTranspose+ (or dt+) option, for slower but more memory efficient transposition.";
const int queryChunkSize = 100;
// Populate the feature values array and labels.
ch.Info("Changing data from row-wise to column-wise");
long pos = 0;
double rowCountDbl = (double?)_data.Data.GetRowCount() ?? double.NaN;
pch.SetHeader(new ProgressHeader("examples"),
e => e.SetProgress(0, pos, rowCountDbl));
// REVIEW: Should we ignore rows with bad label, weight, or group? The previous code seemed to let
// them through (but filtered out bad features).
CursOpt curOptions = CursOpt.Label | CursOpt.Features;
bool hasGroup = false;
if (PredictionKind == PredictionKind.Ranking)
{
hasGroup = _data.Schema.Group != null;
if (hasGroup)
curOptions |= CursOpt.Group;
}
else
{
if (_data.Schema.Group != null)
ch.Warning("This is not ranking problem, Group Id '{0}' column will be ignored", _data.Schema.Group.Value.Name);
}
if (_data.Schema.Weight.HasValue)
curOptions |= CursOpt.Weight;
using (var cursor = new FloatLabelCursor(_data, curOptions))
{
ulong groupPrev = 0;
while (cursor.MoveNext())
{
pos = cursor.KeptRowCount - 1;
int index = checked((int)pos);
ch.Assert(pos >= 0);
// If we have no group, then the group number should not change.
Host.Assert(hasGroup || cursor.Group == groupPrev);
if (hasGroup)
{
// If we are either at the start of iteration, or a new
// group has started, add the boundary and register the
// new group identifier.
if (pos == 0 || cursor.Group != groupPrev)
{
_boundaries.Add(index);
groupPrev = cursor.Group;
}
}
else if (pos % queryChunkSize == 0)
{
// If there are no groups, it is best to just put the
// boundaries at regular intervals.
_boundaries.Add(index);
}
if (UsingMaxLabel)
{
if (cursor.Label < 0 || cursor.Label > MaxLabel)
throw ch.Except("Found invalid label {0}. Value should be between 0 and {1}, inclusive.", cursor.Label, MaxLabel);
}
foreach (var kvp in cursor.Features.Items())
_instanceList[kvp.Key].Add(index, kvp.Value);
_actualTargets.Add(cursor.Label);
if (_weights != null)
_weights.Add(cursor.Weight);
_targetsList.Add((short)cursor.Label);
featureValues += cursor.Features.GetValues().Length;
if (featureValues > featureValuesWarnThreshold && !featureValuesWarned)
{
ch.Warning(featureValuesWarning);
featureValuesWarned = true;
}
}
_boundaries.Add(checked((int)cursor.KeptRowCount));
totalInstances = cursor.KeptRowCount;
missingInstances = cursor.BadFeaturesRowCount;
}
ch.Check(totalInstances > 0, "All instances skipped due to missing features.");
if (missingInstances > 0)
ch.Warning("Skipped {0} instances with missing features during training", missingInstances);
}
}
private void InitializeBins(int maxBins, IParallelTraining parallelTraining)
{
// Find upper bounds for each bin for each feature.
using (var ch = Host.Start("InitBins"))
using (var pch = Host.StartProgressChannel("FastTree in-memory bins initialization"))
{
BinFinder binFinder = new BinFinder();
VBuffer<double> temp = default(VBuffer<double>);
int len = _numExamples;
bool[] localConstructBinFeatures = parallelTraining.GetLocalBinConstructionFeatures(NumFeatures);
int iFeature = 0;
pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, NumFeatures));
List<int> trivialFeatures = new List<int>();
for (iFeature = 0; iFeature < NumFeatures; iFeature++)
{
Host.CheckAlive();
if (!localConstructBinFeatures[iFeature])
continue;
// The following strange call will actually sparsify.
_instanceList[iFeature].CopyTo(len, ref temp);
// REVIEW: In principle we could also put the min docs per leaf information
// into here, and collapse bins somehow as we determine the bins, so that "trivial"
// bins on the head or tail of the bin distribution are never actually considered.
CalculateBins(binFinder, in temp, maxBins, _minDocsPerLeaf,
out double[] binUpperBounds);
BinUpperBounds[iFeature] = binUpperBounds;
}
parallelTraining.SyncGlobalBoundary(NumFeatures, maxBins, BinUpperBounds);
}
}
public override Dataset GetDataset()
{
using (var ch = Host.Start("BinFeatures"))
using (var pch = Host.StartProgressChannel("FastTree feature conversion"))
{
FeatureFlockBase[] flocks = CreateFlocks(ch, pch).ToArray();
ch.Trace("{0} features stored in {1} flocks.", NumFeatures, flocks.Length);
return new Dataset(CreateDatasetSkeleton(), flocks);
}
}
private NHotFeatureFlock CreateNHotFlock(IChannel ch, List<int> features)
{
Contracts.AssertValue(ch);
ch.Assert(Utils.Size(features) > 1);
// Copy pasta from above.
int[] hotFeatureStarts = new int[features.Count + 1];
for (int i = 1; i < hotFeatureStarts.Length; ++i)
hotFeatureStarts[i] = hotFeatureStarts[i - 1] + BinUpperBounds[features[i - 1]].Length - 1;
IntArrayBits flockBits = IntArray.NumBitsNeeded(hotFeatureStarts[hotFeatureStarts.Length - 1] + 1);
var kvEnums = new IEnumerator<KeyValuePair<int, int>>[features.Count];
var delta = new List<byte>();
var values = new List<int>();
try
{
for (int i = 0; i < features.Count; ++i)
kvEnums[i] = _instanceList[features[i]].Binned(BinUpperBounds[features[i]], NumExamples).GetEnumerator();
Heap<int> heap = new Heap<int>(
(i, j) =>
{
ch.AssertValue(kvEnums[i]);
ch.AssertValue(kvEnums[j]);
int irow = kvEnums[i].Current.Key;
int jrow = kvEnums[j].Current.Key;
if (irow == jrow) // If we're on the same row, prefer the "smaller" feature.
return j < i;
// Earlier rows should go first.
return jrow < irow;
});
// Do the initial population of the heap.
for (int i = 0; i < kvEnums.Length; ++i)
{
if (kvEnums[i].MoveNext())
heap.Add(i);
else
{
kvEnums[i].Dispose();
kvEnums[i] = null;
}
}
// Iteratively build the delta-sparse and int arrays.
// REVIEW: Could be given an initial capacity equal to the hot count, but that may do more harm than good.
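// The encoding below is delta-sparse: each entry of 'delta' is the row gap since the previous hot
// entry (with filler pairs of delta 255 / value 0 inserted whenever a gap exceeds a byte), and each
// entry of 'values' is the binned value offset by its feature's start within the flock. For example,
// hot entries at rows {3, 3, 300} produce delta = {3, 0, 255, 42}.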
int last = 0;
while (heap.Count > 0)
{
int i = heap.Pop();
var kvEnum = kvEnums[i];
ch.AssertValue(kvEnum);
var kvp = kvEnum.Current;
ch.Assert(kvp.Key >= last);
ch.Assert(kvp.Value > 0);
while (kvp.Key - last > Byte.MaxValue)
{
delta.Add(Byte.MaxValue);
values.Add(0);
last += Byte.MaxValue;
}
ch.Assert(kvp.Key - last <= Byte.MaxValue);
// Note that kvp.Key - last might be zero, in the case where we are representing multiple
// values for a single row.
delta.Add((byte)(kvp.Key - last));
values.Add(kvp.Value + hotFeatureStarts[i]);
ch.Assert(kvp.Key > last || values.Count == 1 || values[values.Count - 1] > values[values.Count - 2]);
last = kvp.Key;
if (kvEnum.MoveNext())
heap.Add(i);
else
{
kvEnum.Dispose();
kvEnums[i] = null;
}
}
}
finally
{
// Need to dispose the enumerators.
foreach (var enumerator in kvEnums)
{
if (enumerator != null)
enumerator.Dispose();
}
}
// Correct the hot feature starts now that we're done binning.
for (int f = 0; f < hotFeatureStarts.Length; ++f)
hotFeatureStarts[f]++;
var denseBins = (DenseIntArray)IntArray.New(values.Count, IntArrayType.Dense, flockBits, values);
var bups = features.Select(fi => BinUpperBounds[fi]).ToArray(features.Count);
return new NHotFeatureFlock(denseBins, delta.ToArray(), NumExamples, hotFeatureStarts, bups);
}
private IEnumerable<FeatureFlockBase> CreateFlocks(IChannel ch, IProgressChannel pch)
{
int iFeature = 0;
FeatureMap = Enumerable.Range(0, NumFeatures).Where(f => BinUpperBounds[f].Length > 1).ToArray();
foreach (FeatureFlockBase flock in CreateFlocksCore(ch, pch))
{
Contracts.Assert(flock.Count > 0);
Contracts.Assert(iFeature + flock.Count <= FeatureMap.Length);
int min = FeatureMap[iFeature];
int lim = iFeature + flock.Count == FeatureMap.Length
? NumFeatures
: FeatureMap[iFeature + flock.Count];
for (int i = min; i < lim; ++i)
_instanceList[i] = null;
iFeature += flock.Count;
yield return flock;
}
ch.Assert(iFeature <= NumFeatures); // Some could have been filtered.
ch.Assert(iFeature == FeatureMap.Length);
if (iFeature == 0)
{
// It is possible to filter out all features. In that case we introduce a dummy
// "trivial" feature, so that the learning code downstream does not choke.
yield return new SingletonFeatureFlock(new Dense0BitIntArray(NumExamples), BinUpperBounds[0]);
FeatureMap = new[] { 0 };
}
}
private IEnumerable<FeatureFlockBase> CreateFlocksCore(IChannel ch, IProgressChannel pch)
{
int iFeature = 0;
pch.SetHeader(new ProgressHeader("features"), e => e.SetProgress(0, iFeature, NumFeatures));
VBuffer<double> temp = default(VBuffer<double>);
// Working array for bins.
int[] binnedValues = new int[NumExamples];
if (_noFlocks)
{
for (iFeature = 0; iFeature < NumFeatures; ++iFeature)
{
var bup = BinUpperBounds[iFeature];
ch.Assert(Utils.Size(bup) > 0);
if (bup.Length == 1) // Trivial.
continue;
var values = _instanceList[iFeature];
_instanceList[iFeature] = null;
values.CopyTo(NumExamples, ref temp);
yield return CreateSingletonFlock(ch, in temp, binnedValues, bup);
}
yield break;
}
List<int> pending = new List<int>();
int[] forwardIndexerWork = null;
if (CategoricalSplit && CategoricalFeatureIndices != null)
{
int[] lastOn = new int[NumExamples];
for (int i = 0; i < lastOn.Length; ++i)
lastOn[i] = -1;
int catRangeIndex = 0;
for (iFeature = 0; iFeature < NumFeatures;)
{
if (catRangeIndex < CategoricalFeatureIndices.Length)
{
if (CategoricalFeatureIndices[catRangeIndex] == iFeature)
{
bool isOneHot = true;
for (int iFeatureLocal = iFeature;
iFeatureLocal <= CategoricalFeatureIndices[catRangeIndex + 1];
++iFeatureLocal)
{
double[] bup = BinUpperBounds[iFeatureLocal];
if (bup.Length == 1)
{
// This is a trivial feature. Skip it.
continue;
}
Contracts.Assert(Utils.Size(bup) > 0);
double firstBin = bup[0];
using (IEnumerator<int> hotEnumerator = _instanceList[iFeatureLocal].AllIndicesGT(NumExamples, firstBin).GetEnumerator())
{
while (hotEnumerator.MoveNext())
{
int last = lastOn[hotEnumerator.Current];
//Not a one-hot flock, bail.
if (last >= iFeature)
{
isOneHot = false;
pending.Clear();
break;
}
lastOn[hotEnumerator.Current] = iFeatureLocal;
}
}
pending.Add(iFeatureLocal);
}
if (pending.Count > 0)
{
yield return CreateOneHotFlock(ch, pending, binnedValues, lastOn, _instanceList,
ref forwardIndexerWork, ref temp, true);
pending.Clear();
}
if (isOneHot)
iFeature = CategoricalFeatureIndices[catRangeIndex + 1] + 1;
catRangeIndex += 2;
}
else
{
foreach (var flock in CreateFlocksCore(ch, pch, iFeature, CategoricalFeatureIndices[catRangeIndex]))
yield return flock;
iFeature = CategoricalFeatureIndices[catRangeIndex];
}
}
else
{
foreach (var flock in CreateFlocksCore(ch, pch, iFeature, NumFeatures))
yield return flock;
iFeature = NumFeatures;
}
}
}
else
{
foreach (var flock in CreateFlocksCore(ch, pch, 0, NumFeatures))
yield return flock;
}
}
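/// <summary>
/// Forms flocks over the feature range [<paramref name="startFeatureIndex"/>, <paramref name="featureLim"/>).
/// Features are accumulated into a pending one-hot flock for as long as no example has more than one of
/// them "hot"; once overlaps appear, either the pending features are emitted as a one-hot flock, or, if
/// the running count of hot rows stays below roughly 10% of the examples, accumulation continues as an
/// n-hot flock instead.
/// </summary>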
private IEnumerable<FeatureFlockBase> CreateFlocksCore(IChannel ch, IProgressChannel pch, int startFeatureIndex, int featureLim)
{
int iFeature = startFeatureIndex;
VBuffer<double> temp = default(VBuffer<double>);
// Working array for bins.
int[] binnedValues = new int[NumExamples];
// Holds, per example, the last feature that was "on", that is, had to be
// explicitly represented. This is the last feature for which AllIndicesGT
// returned that example's index.
int[] lastOn = new int[NumExamples];
for (int i = 0; i < lastOn.Length; ++i)
lastOn[i] = -1;
int[] forwardIndexerWork = null;
// What creations are pending?
List<int> pending = new List<int>();
Func<FeatureFlockBase> createOneHotFlock =
() => CreateOneHotFlock(ch, pending, binnedValues, lastOn, _instanceList,
ref forwardIndexerWork, ref temp, false);
Func<FeatureFlockBase> createNHotFlock =
() => CreateNHotFlock(ch, pending);
// The exclusive upper bound of what features have already been incorporated
// into a flock.
int limMade = startFeatureIndex;
int countBins = 1; // Count of bins we'll need to represent. Starts at 1, accumulates "hot" features.
// Tracking for n-hot flocks.
long countHotRows = 0; // The count of hot "rows"
long hotNThreshold = (long)(0.1 * NumExamples);
bool canBeOneHot = true;
Func<FeatureFlockBase> createFlock =
() =>
{
ch.Assert(pending.Count > 0);
FeatureFlockBase flock;
if (canBeOneHot)
flock = createOneHotFlock();
else
flock = createNHotFlock();
canBeOneHot = true;
limMade = iFeature;
pending.Clear();
countHotRows = 0;
countBins = 1;
return flock;
};
for (; iFeature < featureLim; ++iFeature)
{
Host.CheckAlive();
double[] bup = BinUpperBounds[iFeature];
Contracts.Assert(Utils.Size(bup) > 0);
if (bup.Length == 1)
{
// This is a trivial feature. Skip it.
continue;
}
ValuesList values = _instanceList[iFeature];
if (countBins > Utils.ArrayMaxSize - (bup.Length - 1))
{
// It can happen that a flock could be created with more than Utils.ArrayMaxSize
// bins, in the case where we bin over a training dataset with many features with
// many bins (for example, 1 million features with 10k bins each), and then in a subsequent
// validation dataset we have these features suddenly become one-hot. Practically
// this will never happen, of course, but it is still possible. If this ever happens,
// we create the flock before this becomes an issue.
ch.Assert(0 < countBins && countBins <= Utils.ArrayMaxSize);
ch.Assert(limMade < iFeature);
ch.Assert(pending.Count > 0);
yield return createFlock();
}
countBins += bup.Length - 1;
double firstBin = bup[0];
int localHotRows = 0;
// The number of bits we would use if we incorporated the current feature into the
// existing running flock.
IntArrayBits newBits = IntArray.NumBitsNeeded(countBins);
if (canBeOneHot)
{
using (IEnumerator<int> hotEnumerator = values.AllIndicesGT(NumExamples, firstBin).GetEnumerator())
{
if (pending.Count > 0)
{
// There are prior features we haven't yet flocked. So we are still contemplating
// "flocking" those prior features with this feature (and possibly features beyond).
// The enumeration will need to run the appropriate checks.
while (hotEnumerator.MoveNext())
{
int i = hotEnumerator.Current;
++localHotRows;
var last = lastOn[i];
Contracts.Assert(last < iFeature);
if (last >= limMade)
{
// We've encountered an overlapping feature. We now need to decide whether we want
// to continue accumulating into a flock and so make this n-hot flock, or cut it off
// now and create a one-hot flock.
if (countHotRows < hotNThreshold)
{
// We may want to create an N-hot flock.
int superLocalHot = values.CountIndicesGT(NumExamples, firstBin);
if (countHotRows + superLocalHot < hotNThreshold)
{
// If this succeeds, we want to create an N-hot flock including this.
canBeOneHot = false;
localHotRows = superLocalHot;
break; // Future iterations will create the n-hot.
}
// If the test above failed, then we want to create a one-hot of [limMade, iFeature),
// and keep going on this guy.
}
// We've decided to create a one-hot flock. Before continuing to fill in lastOn, use
// lastOn in its current state to create a flock from limMade inclusive, to iFeature
// exclusive, and make iFeature the new limMade. Note that we continue to fill in lastOn
// once we finish this.
ch.Assert(limMade < iFeature);
ch.Assert(canBeOneHot);
yield return createFlock();
lastOn[i] = iFeature;
// Now that we've made the flock there's no need to continually check against lastOn[i]'s
// prior values. Fall through to the limMade == iFeature case.
break;
}
lastOn[i] = iFeature;
}
}
if (canBeOneHot)
{
// In the event that hotEnumerator was exhausted in the above loop, the following is a no-op.
while (hotEnumerator.MoveNext())
{
// There is no prior feature to flock, so there's no need to track anything yet.
// Just populate lastOn appropriately.
++localHotRows;
lastOn[hotEnumerator.Current] = iFeature;
}
}
}
ch.Assert(values.CountIndicesGT(NumExamples, firstBin) == localHotRows);
pending.Add(iFeature); // Have not yet flocked this feature.
}
else
{
// No need to track in lastOn, since we're no longer contemplating this being one-hot.
ch.Assert(limMade < iFeature);
ch.Assert(countHotRows < hotNThreshold);
ch.Assert(!canBeOneHot);
localHotRows = values.CountIndicesGT(NumExamples, firstBin);
if (countHotRows + localHotRows >= hotNThreshold)
{
// Too dense if we add iFeature to the mix. Make an n-hot of [limMade, iFeature),
// then decrement iFeature so that we reconsider it in light of being a candidate
// for one-hot or singleton. Do not add to pending, as its status will be considered
// in the next pass.
yield return createFlock();
--iFeature;
}
else // Have not yet flocked this feature.
pending.Add(iFeature);
}
countHotRows += localHotRows;
}
Contracts.Assert(limMade < featureLim);
if (pending.Count > 0)
yield return createFlock();
}
/// <summary>
/// Create an artificial metadata object to pad the Dataset
/// </summary>
private Dataset.DatasetSkeleton CreateDatasetSkeleton()
{
ulong[] docIds = new ulong[_numExamples]; // All zeros is fine
ulong[] queryIds = new ulong[_boundaries.Count - 1]; // All zeros is fine
var ds = UsingMaxLabel
? new Dataset.DatasetSkeleton(_targetsList.ToArray(), _boundaries.ToArray(), queryIds, docIds, new double[0][])
: new Dataset.DatasetSkeleton(_targetsList.ToArray(), _boundaries.ToArray(), queryIds, docIds, new double[0][], _actualTargets.ToArray());
//AP TODO change it to have weights=null when dataset is unweighted in order to avoid potential long memory scan
if (_weights != null)
ds.SampleWeights = _weights.ToArray();
return ds;
}
}
// REVIEW: Change this, as well as the bin finding code and bin upper bounds, to be float instead of double.
/// <summary>
/// A mutable list of index,value that may be kept sparse or dense.
/// </summary>
private sealed class ValuesList
{
private bool _isSparse;
private List<double> _dense;
private int _nonZeroElements; // when dense, is the number of non-zero elements (for determining when to sparsify)
private List<KeyValuePair<int, double>> _sparse;
public ValuesList()
{
_dense = new List<double>();
}
public void Add(int index, double value)
{
if (!_isSparse)
{
// Check if adding this element will make the array sparse.
if (ShouldSparsify(_nonZeroElements + 1, index + 1))
Sparsify();
else
{
// Add zeros if needed.
while (_dense.Count < index)
_dense.Add(default(double));
// Add the value.
_dense.Add(value);
if (value != 0)
_nonZeroElements++;
return;
}
}
// Note this also may happen because we just sparsified.
Contracts.Assert(_isSparse);
if (value != 0)
_sparse.Add(new KeyValuePair<int, double>(index, value));
}
private bool ShouldSparsify(int nonZeroElements, int totalElements)
{
// TODO: We need a better solution here. Also, maybe should start sparse and become dense instead?
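// The representation switches to sparse once fewer than a quarter of the elements are non-zero and
// more than 10 elements have been seen; for example, 2 non-zero values out of 12 trips the switch,
// while 3 out of 12 does not.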
return (double)nonZeroElements / totalElements < 0.25 && totalElements > 10;
}
private void Sparsify()
{
_sparse = new List<KeyValuePair<int, double>>(_nonZeroElements);
for (int i = 0; i < _dense.Count; i++)
{
if (_dense[i] != 0)
_sparse.Add(new KeyValuePair<int, double>(i, _dense[i]));
}
_isSparse = true;
_dense = null;
}
/// <summary>
/// Returns the count of all positions whose values are greater than an indicated value.
/// </summary>
/// <param name="length">The limit of indices to check</param>
/// <param name="gtValue">The value against which the greater-than
/// comparison is made</param>
/// <returns>The count of all indices in the range of 0 to <paramref name="length"/>
/// exclusive whose values are greater than <paramref name="gtValue"/></returns>
public int CountIndicesGT(int length, double gtValue)
{
Contracts.Assert(0 <= length);
if (_isSparse)
{
Contracts.Assert(_sparse.Count == 0 || _sparse[_sparse.Count - 1].Key < length);
return _sparse.Count(kvp => kvp.Value > gtValue) + (0 > gtValue ? length - _sparse.Count : 0);
}
else
{
Contracts.Assert(_dense.Count <= length);
return _dense.Count(v => v > gtValue) + (0 > gtValue ? length - _dense.Count : 0);
}
}
/// <summary>
/// Return all indices that are greater than an indicated value.
/// </summary>
/// <param name="lim">The limit of indices to return</param>
/// <param name="gtValue">The value against which the greater-than
/// comparison is made</param>
/// <returns>All indices in the range of 0 to <paramref name="lim"/> exclusive
/// whose values are greater than <paramref name="gtValue"/>, in
/// increasing order</returns>
public IEnumerable<int> AllIndicesGT(int lim, double gtValue)
{
Contracts.Assert(0 <= lim);
if (_isSparse)
{
Contracts.Assert(_sparse.Count == 0 || _sparse[_sparse.Count - 1].Key < lim);
if (0 > gtValue)
{
// All implicitly defined sparse values will have to be returned.
int prev = -1;
foreach (var kvp in _sparse)
{
Contracts.Assert(prev < kvp.Key);
while (++prev < kvp.Key)
yield return prev;
if (kvp.Value > gtValue)
yield return kvp.Key;
}
// Return the "leftovers."
while (++prev < lim)
yield return prev;
}
else
{
// Only explicitly defined values have to be returned.
foreach (var kvp in _sparse)
{
if (kvp.Value > gtValue)
yield return kvp.Key;
}
}
}
else
{
Contracts.Assert(_dense.Count <= lim);
for (int i = 0; i < _dense.Count; ++i)
{
if (_dense[i] > gtValue)
yield return i;
}
if (0 > gtValue)
{
// All implicitly defined post-dense values will have to be returned,
// assuming there are any (this set is only non-empty when _dense.Count < lim).
for (int i = _dense.Count; i < lim; ++i)
yield return i;
}
}
}
public void CopyTo(int length, ref VBuffer<double> dst)
{
Contracts.Assert(0 <= length);
VBufferEditor<double> editor;
if (!_isSparse)
{
Contracts.Assert(_dense.Count <= length);
if (ShouldSparsify(_nonZeroElements, length))
Sparsify();
else
{
editor = VBufferEditor.Create(ref dst, length);
if (_dense.Count < length)
{
_dense.CopyTo(editor.Values);
editor.Values.Slice(_dense.Count, length - _dense.Count).Clear();
}
else
_dense.CopyTo(editor.Values, length);
dst = editor.Commit();
return;
}
}
int count = _sparse.Count;
Contracts.Assert(count <= length);
editor = VBufferEditor.Create(ref dst, length, count);
for (int i = 0; i < _sparse.Count; ++i)
{
editor.Indices[i] = _sparse[i].Key;
editor.Values[i] = _sparse[i].Value;
}
Contracts.Assert(Utils.IsIncreasing(0, editor.Indices, count, length));
dst = editor.Commit();
}
/// <summary>
/// An enumerable over the (row index, bin) pair of every row whose binned value is non-zero,
/// according to the binning passed into this method.
/// </summary>
/// <param name="binUpperBounds">The binning to use for the enumeration</param>
/// <param name="length">The number of rows in this feature</param>
/// <returns>An enumerable of row-index/binned-value pairs, where the row indices are
/// increasing and the binned values are positive</returns>
public IEnumerable<KeyValuePair<int, int>> Binned(double[] binUpperBounds, int length)
{
Contracts.Assert(Utils.Size(binUpperBounds) > 0);
Contracts.Assert(0 <= length);
int zeroBin = Algorithms.FindFirstGE(binUpperBounds, 0);
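// zeroBin is the bin that an implicit (zero) value falls into; when it is 0, implicitly
// represented zeros bin to 0 and can be skipped entirely below.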
IntArrayBits numBitsNeeded = IntArray.NumBitsNeeded(binUpperBounds.Length);
if (numBitsNeeded == IntArrayBits.Bits0)
yield break;
if (!_isSparse)
{
Contracts.Assert(_dense.Count <= length);
if (ShouldSparsify(_nonZeroElements, length))
Sparsify();
}
if (_isSparse)
{
Contracts.AssertValue(_sparse);
if (zeroBin == 0)
{
// We can skip all implicit values in sparse.
foreach (var kvp in _sparse)
{
Contracts.Assert(kvp.Key < length);
int binned = Algorithms.FindFirstGE(binUpperBounds, kvp.Value);
if (binned > 0)
yield return new KeyValuePair<int, int>(kvp.Key, binned);
}
yield break;
}
Contracts.Assert(zeroBin != 0);
int last = -1;
foreach (var kvp in _sparse)
{
Contracts.Assert(kvp.Key < length);
while (++last < kvp.Key)
yield return new KeyValuePair<int, int>(last, zeroBin);
int binned = Algorithms.FindFirstGE(binUpperBounds, kvp.Value);
if (binned > 0)
yield return new KeyValuePair<int, int>(kvp.Key, binned);
}
while (++last < length)
yield return new KeyValuePair<int, int>(last, zeroBin);
yield break;
}
Contracts.Assert(!_isSparse);
Contracts.AssertValue(_dense);
Contracts.Assert(_dense.Count <= length);
for (int i = 0; i < _dense.Count; ++i)
{
int binned = Algorithms.FindFirstGE(binUpperBounds, _dense[i]);
if (binned > 0)
yield return new KeyValuePair<int, int>(i, binned);
}
if (zeroBin > 0)
{
for (int i = _dense.Count; i < length; ++i)
yield return new KeyValuePair<int, int>(i, zeroBin);
}
}
public sealed class ForwardIndexer
{
// All of the values lists. Only the entries indexed by _featureIndices are addressed.
private readonly ValuesList[] _values;
// Parallel to _featureIndices, indicates the index where
// we should start to look for the next value, if the corresponding value list in
// _values is sparse. If the corresponding value list is dense the entry at this
// position is not used.
private readonly int[] _perFeaturePosition;
private readonly int[] _featureIndices;
#if DEBUG
// Holds for each feature the row index that it was previously accessed on.
// Purely for validation purposes.
private readonly int[] _lastRow;
#endif
/// <summary>
/// Access the value of a particular feature, at a particular row.
/// </summary>
/// <param name="featureIndex">A feature index, which indexes not the global feature indices,
/// but the index into the subset of features specified at the constructor time</param>
/// <param name="rowIndex">The row index to access, which must be non-decreasing, and must
/// indeed be actually increasing for access on the same feature (for example, if you have two features,
/// it is OK to access <c>[1, 5]</c>, then <c>[0, 5]</c>, but once this is done you cannot
/// access the same feature at the same position.</param>
/// <returns></returns>
public double this[int featureIndex, int rowIndex]
{
get
{
Contracts.Assert(0 <= featureIndex && featureIndex < _featureIndices.Length);
Contracts.Assert(rowIndex >= 0);
var values = _values[_featureIndices[featureIndex]];
#if DEBUG
int lastRow = _lastRow[featureIndex];
Contracts.Assert(rowIndex > lastRow);
_lastRow[featureIndex] = rowIndex;
#endif
if (!values._isSparse)
return rowIndex < values._dense.Count ? values._dense[rowIndex] : 0;
int last = _perFeaturePosition[featureIndex];
var sp = values._sparse;
#if DEBUG
// The next value of _sparse (assuming there is one) should be past the last access.
// That is, sp[last].Key, if it exists, must be greater than lastRow.
Contracts.Assert(last >= sp.Count || sp[last].Key > lastRow);
#endif
while (last < sp.Count)
{
var s = sp[last++];
if (s.Key < rowIndex)
continue;
if (s.Key > rowIndex)
{
// We'd previously put last past this element,
// have to put it back a bit.
last--;
break;
}
Contracts.Assert(s.Key == rowIndex);
_perFeaturePosition[featureIndex] = last;
return s.Value;
}
_perFeaturePosition[featureIndex] = last;
return 0;
}
}
/// <summary>
/// Initialize a forward indexer.
/// </summary>
/// <param name="values">Holds the values of the features</param>
/// <param name="features">The array of feature indices this will index</param>
/// <param name="workArray">A possibly shared working array, once used by this forward
/// indexer it should not be used in any previously created forward indexer</param>
public ForwardIndexer(ValuesList[] values, int[] features, ref int[] workArray)
{
Contracts.AssertValue(values);
Contracts.AssertValueOrNull(workArray);
Contracts.AssertValue(features);
Contracts.Assert(Utils.IsIncreasing(0, features, values.Length));
Contracts.Assert(features.All(i => values[i] != null));
_values = values;
_featureIndices = features;
Utils.EnsureSize(ref workArray, _featureIndices.Length, keepOld: false);
Contracts.AssertValue(workArray); // Should be initialized now.
_perFeaturePosition = workArray;
Array.Clear(_perFeaturePosition, 0, _featureIndices.Length);
#if DEBUG
_lastRow = new int[features.Length];
for (int i = 0; i < _lastRow.Length; ++i)
_lastRow[i] = -1;
#endif
}
}
}
}
internal sealed class ExamplesToFastTreeBins
{
private readonly int _maxBins;
private readonly float _maxLabel;
private readonly IHost _host;
private readonly bool _diskTranspose;
private readonly bool _noFlocks;
private readonly int _minDocsPerLeaf;
/// <summary> Bin boundaries </summary>
public double[][] BinUpperBounds
{
get;
private set;
}
public int[] FeatureMap { get; private set; }
public ExamplesToFastTreeBins(IHostEnvironment env, int maxBins, bool diskTranspose, bool noFlocks, int minDocsPerLeaf, float maxLabel)
{
Contracts.AssertValue(env);
_host = env.Register("Converter");
_maxBins = maxBins;
_maxLabel = maxLabel;
_diskTranspose = diskTranspose;
_noFlocks = noFlocks;
_minDocsPerLeaf = minDocsPerLeaf;
}
public Dataset FindBinsAndReturnDataset(RoleMappedData data, PredictionKind kind, IParallelTraining parallelTraining,
int[] categoricalFeatureIndices, bool categoricalSplit)
{
using (var ch = _host.Start("InitDataset"))
{
ch.Info("Making per-feature arrays");
var convData = DataConverter.Create(data, _host, _maxBins, _maxLabel, _diskTranspose, _noFlocks,
_minDocsPerLeaf, kind, parallelTraining, categoricalFeatureIndices, categoricalSplit);
ch.Info("Processed {0} instances", convData.NumExamples);
ch.Info("Binning and forming Feature objects");
Dataset d = convData.GetDataset();
BinUpperBounds = convData.BinUpperBounds;
FeatureMap = convData.FeatureMap;
return d;
}
}
public Dataset GetCompatibleDataset(RoleMappedData data, PredictionKind kind, int[] categoricalFeatures, bool categoricalSplit)
{
_host.AssertValue(BinUpperBounds);
var convData = DataConverter.Create(data, _host, BinUpperBounds, _maxLabel, _diskTranspose, _noFlocks, kind,
categoricalFeatures, categoricalSplit);
return convData.GetDataset();
}
}
public abstract class TreeEnsembleModelParameters :
ModelParametersBase<float>,
IValueMapper,
ICanSaveInTextFormat,
ICanSaveInIniFormat,
ICanSaveInSourceCode,
ICanSaveSummary,
ICanGetSummaryInKeyValuePairs,
ITreeEnsemble,
IPredictorWithFeatureWeights<float>,
IFeatureContributionMapper,
ICalculateFeatureContribution,
ISingleCanSavePfa,
ISingleCanSaveOnnx
{
// The two properties below are necessary for the tree Visualizer
[BestFriend]
internal InternalTreeEnsemble TrainedEnsemble { get; }
int ITreeEnsemble.NumTrees => TrainedEnsemble.NumTrees;
// Inner args is used only for documentation purposes when saving comments to INI files.
private protected readonly string InnerOptions;
// The total number of features used in training (takes the value of zero if the
// written version of the loaded model is less than VerNumFeaturesSerialized)
private protected readonly int NumFeatures;
// Maximum index of the split features of trainedEnsemble trees
private protected readonly int MaxSplitFeatIdx;
private protected abstract uint VerNumFeaturesSerialized { get; }
private protected abstract uint VerDefaultValueSerialized { get; }
private protected abstract uint VerCategoricalSplitSerialized { get; }
[BestFriend]
internal readonly DataViewType InputType;
DataViewType IValueMapper.InputType => InputType;
[BestFriend]
internal readonly DataViewType OutputType;
DataViewType IValueMapper.OutputType => OutputType;
bool ICanSavePfa.CanSavePfa => true;
bool ICanSaveOnnx.CanSaveOnnx(OnnxContext ctx) => true;
/// <summary>
/// Used to determine the contribution of each feature to the score of an example by <see cref="FeatureContributionCalculatingTransformer"/>.
/// The calculation of feature contribution essentially consists in determining which splits in the tree have the most impact
/// on the final score and assigning the value of the impact to the features determining the split. More precisely, the contribution of a feature
/// is equal to the change in score produced by exploring the opposite sub-tree every time a decision node for the given feature is encountered.
/// Consider a simple case with a single decision tree that has a decision node for the binary feature F1. Given an example that has feature F1
/// equal to true, we can calculate the score it would have obtained if we chose the subtree corresponding to the feature F1 being equal to false
/// while keeping the other features constant. The contribution of feature F1 for the given example is the difference between the original score
/// and the score obtained by taking the opposite decision at the node corresponding to feature F1. This algorithm extends naturally to models with
/// many decision trees.
/// </summary>
FeatureContributionCalculator ICalculateFeatureContribution.FeatureContributionCalculator => new FeatureContributionCalculator(this);
/// <summary>
/// The following function is used in both FastTree and LightGBM so <see cref="BestFriendAttribute"/> is required.
/// </summary>
[BestFriend]
private protected TreeEnsembleModelParameters(IHostEnvironment env, string name, InternalTreeEnsemble trainedEnsemble, int numFeatures, string innerArgs)
: base(env, name)
{
Host.CheckValue(trainedEnsemble, nameof(trainedEnsemble));
Host.CheckParam(numFeatures > 0, nameof(numFeatures), "must be positive");
Host.CheckValueOrNull(innerArgs);
// REVIEW: When we make the predictor wrapper, we may want to further "optimize"
// the trained ensemble to, for instance, resize arrays so that they are of the length
// the actual number of leaves/nodes, or remove unnecessary arrays, and so forth.
TrainedEnsemble = trainedEnsemble;
InnerOptions = innerArgs;
NumFeatures = numFeatures;
MaxSplitFeatIdx = trainedEnsemble.GetMaxFeatureIndex();
Contracts.Assert(NumFeatures > MaxSplitFeatIdx);
InputType = new VectorDataViewType(NumberDataViewType.Single, NumFeatures);
OutputType = NumberDataViewType.Single;
}
private protected TreeEnsembleModelParameters(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver)
: base(env, name, ctx)
{
// *** Binary format ***
// Ensemble
// int: Inner args string id
// int: Number of features (VerNumFeaturesSerialized)
// <PredictionKind> specific stuff
ctx.CheckVersionInfo(ver);
bool usingDefaultValues = false;
bool categoricalSplits = false;
if (ctx.Header.ModelVerWritten >= VerDefaultValueSerialized)
usingDefaultValues = true;
if (ctx.Header.ModelVerWritten >= VerCategoricalSplitSerialized)
categoricalSplits = true;
TrainedEnsemble = new InternalTreeEnsemble(ctx, usingDefaultValues, categoricalSplits);
MaxSplitFeatIdx = TrainedEnsemble.GetMaxFeatureIndex();
InnerOptions = ctx.LoadStringOrNull();
if (ctx.Header.ModelVerWritten >= VerNumFeaturesSerialized)
{
NumFeatures = ctx.Reader.ReadInt32();
// It is possible that the number of features is 0 when an old model is loaded and then saved with the new version.
Host.CheckDecode(NumFeatures >= 0);
}
// In the days of TLC <= 2.7, before we had a data pipeline, there was
// some auxiliary structure called the "ContentMap." This structure is
// no longer necessary or helpful, since the data pipeline in
// TLC >= 3.0 is supposed to be independent of any predictor-specific
// tricks.
InputType = new VectorDataViewType(NumberDataViewType.Single, NumFeatures);
OutputType = NumberDataViewType.Single;
}
[BestFriend]
private protected override void SaveCore(ModelSaveContext ctx)
{
base.SaveCore(ctx);
// *** Binary format ***
// Ensemble
// int: Inner args string id
// int: Number of features (VerNumFeaturesSerialized)
// <PredictionKind> specific stuff
TrainedEnsemble.Save(ctx);
ctx.SaveStringOrNull(InnerOptions);
Host.Assert(NumFeatures >= 0);
ctx.Writer.Write(NumFeatures);
}
ValueMapper<TIn, TOut> IValueMapper.GetMapper<TIn, TOut>()
{
Host.Check(typeof(TIn) == typeof(VBuffer<float>));
Host.Check(typeof(TOut) == typeof(float));
ValueMapper<VBuffer<float>, float> del = Map;
return (ValueMapper<TIn, TOut>)(Delegate)del;
}
private protected virtual void Map(in VBuffer<float> src, ref float dst)
{
int inputVectorSize = InputType.GetVectorSize();
if (inputVectorSize > 0)
Host.Check(src.Length == inputVectorSize);
else
Host.Check(src.Length > MaxSplitFeatIdx);
dst = (float)TrainedEnsemble.GetOutput(in src);
}
ValueMapper<TSrc, VBuffer<float>> IFeatureContributionMapper.GetFeatureContributionMapper<TSrc, TDst>(int top, int bottom, bool normalize)
{
Host.Check(typeof(TSrc) == typeof(VBuffer<float>));
Host.Check(typeof(TDst) == typeof(VBuffer<float>));
Host.Check(top >= 0, "top must be non-negative");
Host.Check(bottom >= 0, "bottom must be non-negative");
BufferBuilder<float> builder = null;
ValueMapper<VBuffer<float>, VBuffer<float>> del =
(in VBuffer<float> src, ref VBuffer<float> dst) =>
{
FeatureContributionMap(in src, ref dst, ref builder);
Numeric.VectorUtils.SparsifyNormalize(ref dst, top, bottom, normalize);
};
return (ValueMapper<TSrc, VBuffer<float>>)(Delegate)del;
}
private void FeatureContributionMap(in VBuffer<float> src, ref VBuffer<float> dst, ref BufferBuilder<float> builder)
{
int inputVectorSize = InputType.GetVectorSize();
if (inputVectorSize > 0)
Host.Check(src.Length == inputVectorSize);
else
Host.Check(src.Length > MaxSplitFeatIdx);
TrainedEnsemble.GetFeatureContributions(in src, ref dst, ref builder);
}
/// <summary>
/// write out a C# representation of the ensemble
/// </summary>
void ICanSaveInSourceCode.SaveAsCode(TextWriter writer, RoleMappedSchema schema)
{
Host.CheckValueOrNull(schema);
SaveEnsembleAsCode(writer, schema);
}
/// <summary>
/// Output the INI model to a given writer
/// </summary>
void ICanSaveInTextFormat.SaveAsText(TextWriter writer, RoleMappedSchema schema)
{
Host.CheckValue(writer, nameof(writer));
Host.CheckValueOrNull(schema);
((ICanSaveInIniFormat)this).SaveAsIni(writer, schema);
}
/// <summary>
/// Output the INI model to a given writer
/// </summary>
void ICanSaveInIniFormat.SaveAsIni(TextWriter writer, RoleMappedSchema schema, ICalibrator calibrator)
{
Host.CheckValue(writer, nameof(writer));
var ensembleIni = FastTreeIniFileUtils.TreeEnsembleToIni(Host, TrainedEnsemble, schema, calibrator,
InnerOptions, appendFeatureGain: true, includeZeroGainFeatures: false);
writer.WriteLine(ensembleIni);
}
JToken ISingleCanSavePfa.SaveAsPfa(BoundPfaContext ctx, JToken input)
{
Host.CheckValue(ctx, nameof(ctx));
Host.CheckValue(input, nameof(input));
return TrainedEnsemble.AsPfa(ctx, input);
}
private enum NodeMode
{
[Description("BRANCH_LEQ")]
BranchLEq,
[Description("BRANCH_LT")]
BranchLT,
[Description("BRANCH_GTE")]
BranchGte,
[Description("BRANCH_GT")]
BranchGT,
[Description("BRANCH_EQ")]
BranchEq,
[Description("BRANCH_LT")]
BranchNeq,
[Description("LEAF")]
Leaf
};
private enum PostTransform
{
[Description("NONE")]
None,
[Description("SOFTMAX")]
SoftMax,
[Description("LOGISTIC")]
Logistic,
[Description("SOFTMAX_ZERO")]
SoftMaxZero
}
private enum AggregateFunction
{
[Description("AVERAGE")]
Average,
[Description("SUM")]
Sum,
[Description("MIN")]
Min,
[Description("MAX")]
Max
}
private protected virtual bool SaveAsOnnx(OnnxContext ctx, string[] outputNames, string featureColumn)
{
Host.CheckValue(ctx, nameof(ctx));
Host.Check(Utils.Size(outputNames) >= 1);
const int minimumOpSetVersion = 9;
ctx.CheckOpSetVersion(minimumOpSetVersion, "TreeEnsembleModelParameters");
//Nodes.
var nodesTreeids = new List<long>();
var nodesIds = new List<long>();
var nodesFeatureIds = new List<long>();
var nodeModes = new List<string>();
var nodesValues = new List<double>();
var nodeHitrates = new List<long>();
var missingValueTracksTrue = new List<bool>();
var nodesTrueNodeIds = new List<long>();
var nodesFalseNodeIds = new List<long>();
var nodesBaseValues = new List<float>();
//Leafs.
var classTreeIds = new List<long>();
var classNodeIds = new List<long>();
var classIds = new List<long>();
var classWeights = new List<double>();
int treeIndex = -1;
foreach (var tree in TrainedEnsemble.Trees)
{
treeIndex++;
for (int nodeIndex = 0; nodeIndex < tree.NumNodes; nodeIndex++)
{
nodesTreeids.Add(treeIndex);
nodeModes.Add(NodeMode.BranchLEq.GetDescription());
nodesIds.Add(nodeIndex);
nodesFeatureIds.Add(tree.SplitFeature(nodeIndex));
nodesValues.Add(tree.RawThresholds[nodeIndex]);
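// A negative child index denotes a leaf (encoded as ~leafIndex). Leaves are appended after the
// internal nodes in the ONNX node list, so leaf i maps to node id tree.NumNodes + i.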
nodesTrueNodeIds.Add(tree.LteChild[nodeIndex] < 0 ? ~tree.LteChild[nodeIndex] + tree.NumNodes : tree.LteChild[nodeIndex]);
nodesFalseNodeIds.Add(tree.GtChild[nodeIndex] < 0 ? ~tree.GtChild[nodeIndex] + tree.NumNodes : tree.GtChild[nodeIndex]);
if (tree.DefaultValueForMissing?[nodeIndex] <= tree.RawThresholds[nodeIndex])
missingValueTracksTrue.Add(true);
else
missingValueTracksTrue.Add(false);
nodeHitrates.Add(0);
}
for (int leafIndex = 0; leafIndex < tree.NumLeaves; leafIndex++)
{
int nodeIndex = tree.NumNodes + leafIndex;
nodesTreeids.Add(treeIndex);
nodesBaseValues.Add(0);
nodeModes.Add(NodeMode.Leaf.GetDescription());
nodesIds.Add(nodeIndex);
nodesFeatureIds.Add(0);
nodesValues.Add(0);
nodesTrueNodeIds.Add(0);
nodesFalseNodeIds.Add(0);
missingValueTracksTrue.Add(false);
nodeHitrates.Add(0);
classTreeIds.Add(treeIndex);
classNodeIds.Add(nodeIndex);
classIds.Add(0);
classWeights.Add(tree.LeafValues[leafIndex]);
}
}
string opType = "TreeEnsembleRegressor";
string scoreVarName = (Utils.Size(outputNames) >= 2) ? outputNames[1] : outputNames[0]; // Get Score from PredictedLabel and/or Score columns
var node = ctx.CreateNode(opType, new[] { featureColumn }, new[] { scoreVarName }, ctx.GetNodeName(opType));
node.AddAttribute("post_transform", PostTransform.None.GetDescription());
node.AddAttribute("n_targets", 1);
node.AddAttribute("base_values", new List<float>() { 0 });
node.AddAttribute("aggregate_function", AggregateFunction.Sum.GetDescription());
node.AddAttribute("nodes_treeids", nodesTreeids);
node.AddAttribute("nodes_nodeids", nodesIds);
node.AddAttribute("nodes_featureids", nodesFeatureIds);
node.AddAttribute("nodes_modes", nodeModes);
node.AddAttribute("nodes_values", nodesValues);
node.AddAttribute("nodes_truenodeids", nodesTrueNodeIds);
node.AddAttribute("nodes_falsenodeids", nodesFalseNodeIds);
node.AddAttribute("nodes_missing_value_tracks_true", missingValueTracksTrue);
node.AddAttribute("target_treeids", classTreeIds);
node.AddAttribute("target_nodeids", classNodeIds);
node.AddAttribute("target_ids", classIds);
node.AddAttribute("target_weights", classWeights);
return true;
}
bool ISingleCanSaveOnnx.SaveAsOnnx(OnnxContext ctx, string[] outputNames, string featureColumn)
{
return SaveAsOnnx(ctx, outputNames, featureColumn);
}
void ICanSaveSummary.SaveSummary(TextWriter writer, RoleMappedSchema schema)
{
writer.WriteLine();
writer.WriteLine("Per-feature gain summary for the boosted tree ensemble:");
foreach (var pair in ((ICanGetSummaryInKeyValuePairs)this).GetSummaryInKeyValuePairs(schema))
{
Host.Assert(pair.Value is double);
writer.WriteLine("\t{0}\t{1}", pair.Key, (double)pair.Value);
}
}
private IEnumerable<KeyValuePair<string, double>> GetSortedFeatureGains(RoleMappedSchema schema)
{
var gainMap = new FeatureToGainMap(TrainedEnsemble.Trees.ToList(), normalize: true);
var names = default(VBuffer<ReadOnlyMemory<char>>);
AnnotationUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, NumFeatures, ref names);
var ordered = gainMap.OrderByDescending(pair => pair.Value);
double max = ordered.FirstOrDefault().Value;
double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max));
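// Gains are reported as sqrt(gain) / sqrt(maxGain), so the most influential feature maps to 1
// and every other feature falls between 0 and 1.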
foreach (var pair in ordered)
{
var name = names.GetItemOrDefault(pair.Key).ToString();
if (string.IsNullOrEmpty(name))
name = $"f{pair.Key}";
yield return new KeyValuePair<string, double>(name, Math.Sqrt(pair.Value) * normFactor);
}
}
///<inheritdoc/>
IList<KeyValuePair<string, object>> ICanGetSummaryInKeyValuePairs.GetSummaryInKeyValuePairs(RoleMappedSchema schema)
{
List<KeyValuePair<string, object>> results = new List<KeyValuePair<string, object>>();
var ordered = GetSortedFeatureGains(schema);
foreach (var pair in ordered)
results.Add(new KeyValuePair<string, object>(pair.Key, pair.Value));
return results;
}
/// <summary>
/// Writes a C# representation of the ensemble to the given writer
/// </summary>
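/// <remarks>
/// The generated code is one nested conditional expression per tree followed by their sum;
/// for example, a single-split tree over unnamed slot 2 would roughly produce
/// <c>double treeOutput0=((f2 > 0.5) ? 0.1 : -0.1);</c>.
/// </remarks>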
private void SaveEnsembleAsCode(TextWriter writer, RoleMappedSchema schema)
{
Host.AssertValueOrNull(schema);
var names = default(VBuffer<ReadOnlyMemory<char>>);
AnnotationUtils.GetSlotNames(schema, RoleMappedSchema.ColumnRole.Feature, NumFeatures, ref names);
int i = 0;
foreach (InternalRegressionTree tree in TrainedEnsemble.Trees)
{
writer.Write("double treeOutput{0}=", i);
SaveTreeAsCode(tree, writer, in names);
writer.Write(";\n");
i++;
}
writer.Write("double output = ");
for (int j = 0; j < i; j++)
writer.Write((j > 0 ? "+" : "") + "treeOutput" + j);
writer.Write(";");
}
/// <summary>
/// Convert a single tree to code, called recursively
/// </summary>
private void SaveTreeAsCode(InternalRegressionTree tree, TextWriter writer, in VBuffer<ReadOnlyMemory<char>> names)
{
ToCSharp(tree, writer, 0, in names);
}
// converts a subtree into a C# expression
private void ToCSharp(InternalRegressionTree tree, TextWriter writer, int node, in VBuffer<ReadOnlyMemory<char>> names)
{
if (node < 0)
{
writer.Write(FloatUtils.ToRoundTripString(tree.LeafValue(~node)));
//_output[~node].ToString());
}
else
{
var name = names.GetItemOrDefault(tree.SplitFeature(node)).ToString();
if (string.IsNullOrEmpty(name))
name = $"f{tree.SplitFeature(node)}";
writer.Write("(({0} > {1}) ? ", name, FloatUtils.ToRoundTripString(tree.RawThreshold(node)));
ToCSharp(tree, writer, tree.GetGtChildForNode(node), in names);
writer.Write(" : ");
ToCSharp(tree, writer, tree.GetLteChildForNode(node), in names);
writer.Write(")");
}
}
/// <summary>
/// Get the cumulative split gains for each feature across all trees.
/// </summary>
/// <param name="weights">A <see cref="VBuffer{T}"/> to hold the cumulative split gain value for each feature.
/// The i-th element in <paramref name="weights"/> stores the cumulative split gain of the i-th feature.</param>
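/// <example>
/// A minimal sketch of reading the gains, assuming <c>modelParameters</c> is an instance of a
/// derived model parameters class:
/// <code>
/// VBuffer&lt;float&gt; gains = default;
/// modelParameters.GetFeatureWeights(ref gains);
/// </code>
/// </example>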
public void GetFeatureWeights(ref VBuffer<float> weights)
{
var numFeatures = Math.Max(NumFeatures, MaxSplitFeatIdx + 1);
FeatureToGainMap gainMap = new FeatureToGainMap(TrainedEnsemble.Trees.ToList(), normalize: true);
// If there are no trees or no splits, there are no gains.
if (gainMap.Count == 0)
{
VBufferUtils.Resize(ref weights, numFeatures, 0);
return;
}
double max = gainMap.Values.Max();
double normFactor = max == 0 ? 1.0 : (1.0 / Math.Sqrt(max));
var bldr = new BufferBuilder<float>(R4Adder.Instance);
bldr.Reset(numFeatures, false);
foreach (var pair in gainMap)
bldr.AddFeature(pair.Key, (float)(Math.Sqrt(pair.Value) * normFactor));
bldr.GetResult(ref weights);
}
ITree[] ITreeEnsemble.GetTrees()
{
return TrainedEnsemble.Trees.Select(k => new Tree(k)).ToArray();
}
[BestFriend]
internal float GetLeafValue(int treeId, int leafId)
{
return (float)TrainedEnsemble.GetTreeAt(treeId).LeafValue(leafId);
}
/// <summary>
/// Returns the leaf node in the requested tree for the given feature vector, and populates 'path' with the list of
/// internal nodes in the path from the root to that leaf. If 'path' is null a new list is initialized. All elements
/// in 'path' are cleared before filling in the current path nodes.
/// </summary>
[BestFriend]
internal int GetLeaf(int treeId, in VBuffer<float> features, ref List<int> path)
{
return TrainedEnsemble.GetTreeAt(treeId).GetLeaf(in features, ref path);
}
private sealed class Tree : ITree<VBuffer<float>>
{
private readonly InternalRegressionTree _regTree;
public Tree(InternalRegressionTree regTree)
{
_regTree = regTree;
}
public int[] GtChild => _regTree.GtChild;
public int[] LteChild => _regTree.LteChild;
public int NumNodes => _regTree.NumNodes;
public int NumLeaves => _regTree.NumLeaves;
public int GetLeaf(in VBuffer<float> feat)
{
return _regTree.GetLeaf(in feat);
}
public INode GetNode(int nodeId, bool isLeaf, IEnumerable<string> featuresNames = null)
{
var keyValues = new Dictionary<string, object>();
if (isLeaf)
{
keyValues.Add(NodeKeys.LeafValue, _regTree.LeafValue(nodeId));
}
else
{
if (featuresNames != null)
{
if (featuresNames is FeatureNameCollection features)
{
if (_regTree.CategoricalSplit[nodeId])
{
string featureList = string.Join(" OR \n",
_regTree.CategoricalSplitFeatures[nodeId].Select(feature => features[feature]));
keyValues.Add(NodeKeys.SplitName, featureList);
}
else
keyValues.Add(NodeKeys.SplitName, features[_regTree.SplitFeature(nodeId)]);
}
}
keyValues.Add(NodeKeys.Threshold, string.Format("<= {0}", _regTree.RawThreshold(nodeId)));
if (_regTree.SplitGains != null)
keyValues.Add(NodeKeys.SplitGain, _regTree.SplitGains[nodeId]);
if (_regTree.GainPValues != null)
keyValues.Add(NodeKeys.GainValue, _regTree.GainPValues[nodeId]);
if (_regTree.PreviousLeafValues != null)
keyValues.Add(NodeKeys.PreviousLeafValue, _regTree.PreviousLeafValues[nodeId]);
}
return new TreeNode(keyValues);
}
public double GetLeafValue(int leafId)
{
return _regTree.LeafValue(leafId);
}
}
private sealed class TreeNode : INode
{
public TreeNode(Dictionary<string, object> keyValues)
{
KeyValues = keyValues;
}
public Dictionary<string, object> KeyValues { get; }
}
}
/// <summary>
/// <see cref="TreeEnsembleModelParametersBasedOnRegressionTree"/> is derived from
/// <see cref="TreeEnsembleModelParameters"/> plus a strongly-typed public attribute,
/// <see cref="TrainedTreeEnsemble"/>, for exposing trained model's details to users.
/// Its function, <see cref="CreateTreeEnsembleFromInternalDataStructure"/>, is
/// called to create <see cref="TrainedTreeEnsemble"/> inside <see cref="TreeEnsembleModelParameters"/>.
/// Note that the major difference between <see cref="TreeEnsembleModelParametersBasedOnQuantileRegressionTree"/>
/// and <see cref="TreeEnsembleModelParametersBasedOnRegressionTree"/> is the type of
/// <see cref="TrainedTreeEnsemble"/>.
/// </summary>
public abstract class TreeEnsembleModelParametersBasedOnRegressionTree : TreeEnsembleModelParameters, ICanGetSummaryAsIDataView
{
/// <summary>
/// An ensemble of trees exposed to users. It is a wrapper on the <see langword="internal"/>
/// <see cref="InternalTreeEnsemble"/> in <see cref="ML.Trainers.FastTree.TreeEnsemble{T}"/>.
/// </summary>
public RegressionTreeEnsemble TrainedTreeEnsemble { get; }
[BestFriend]
private protected TreeEnsembleModelParametersBasedOnRegressionTree(IHostEnvironment env, string name, InternalTreeEnsemble trainedEnsemble, int numFeatures, string innerArgs)
: base(env, name, trainedEnsemble, numFeatures, innerArgs)
{
TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
}
[BestFriend]
private protected TreeEnsembleModelParametersBasedOnRegressionTree(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver)
: base(env, name, ctx, ver)
{
TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
}
private RegressionTreeEnsemble CreateTreeEnsembleFromInternalDataStructure()
{
var trees = TrainedEnsemble.Trees.Select(tree => new RegressionTree(tree));
var treeWeights = TrainedEnsemble.Trees.Select(tree => tree.Weight);
return new RegressionTreeEnsemble(trees, treeWeights, TrainedEnsemble.Bias);
}
/// <summary>
/// Used for the Summarize entrypoint.
/// </summary>
IDataView ICanGetSummaryAsIDataView.GetSummaryDataView(RoleMappedSchema schema)
=> RegressionTreeBaseUtils.RegressionTreeEnsembleAsIDataView(Host, TrainedTreeEnsemble.Bias, TrainedTreeEnsemble.TreeWeights, TrainedTreeEnsemble.Trees);
}
/// <summary>
/// <see cref="TreeEnsembleModelParametersBasedOnQuantileRegressionTree"/> is derived from
/// <see cref="TreeEnsembleModelParameters"/> plus a strongly-typed public attribute,
/// <see cref="TrainedTreeEnsemble"/>, for exposing trained model's details to users.
/// Its function, <see cref="CreateTreeEnsembleFromInternalDataStructure"/>, is
/// called to create <see cref="TrainedTreeEnsemble"/> inside <see cref="TreeEnsembleModelParameters"/>.
/// Note that the major difference between <see cref="TreeEnsembleModelParametersBasedOnQuantileRegressionTree"/>
/// and <see cref="TreeEnsembleModelParametersBasedOnRegressionTree"/> is the type of
/// <see cref="TrainedTreeEnsemble"/>.
/// </summary>
public abstract class TreeEnsembleModelParametersBasedOnQuantileRegressionTree : TreeEnsembleModelParameters, ICanGetSummaryAsIDataView
{
/// <summary>
/// An ensemble of trees exposed to users. It is a wrapper on the <see langword="internal"/>
/// <see cref="InternalTreeEnsemble"/> in <see cref="ML.Trainers.FastTree.TreeEnsemble{T}"/>.
/// </summary>
public QuantileRegressionTreeEnsemble TrainedTreeEnsemble { get; }
[BestFriend]
private protected TreeEnsembleModelParametersBasedOnQuantileRegressionTree(IHostEnvironment env, string name, InternalTreeEnsemble trainedEnsemble, int numFeatures, string innerArgs)
: base(env, name, trainedEnsemble, numFeatures, innerArgs)
{
TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
}
[BestFriend]
private protected TreeEnsembleModelParametersBasedOnQuantileRegressionTree(IHostEnvironment env, string name, ModelLoadContext ctx, VersionInfo ver)
: base(env, name, ctx, ver)
{
TrainedTreeEnsemble = CreateTreeEnsembleFromInternalDataStructure();
}
private QuantileRegressionTreeEnsemble CreateTreeEnsembleFromInternalDataStructure()
{
var trees = TrainedEnsemble.Trees.Select(tree => new QuantileRegressionTree((InternalQuantileRegressionTree)tree));
var treeWeights = TrainedEnsemble.Trees.Select(tree => tree.Weight);
return new QuantileRegressionTreeEnsemble(trees, treeWeights, TrainedEnsemble.Bias);
}
/// <summary>
/// Used for the Summarize entrypoint.
/// </summary>
IDataView ICanGetSummaryAsIDataView.GetSummaryDataView(RoleMappedSchema schema)
=> RegressionTreeBaseUtils.RegressionTreeEnsembleAsIDataView(Host, TrainedTreeEnsemble.Bias, TrainedTreeEnsemble.TreeWeights, TrainedTreeEnsemble.Trees);
}
}