// machinelearning/test/Microsoft.ML.TestFrameworkC.../Datasets.cs
//
// 739 lines
// 30 KiB
// C#

// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.ML.Data;
namespace Microsoft.ML.TestFrameworkCommon
{
/// <summary>
/// Plain description of one test dataset: the files it consists of, how its text files
/// are delimited, and the settings strings that accompany it.
/// All members are public mutable fields; nothing is validated here.
/// </summary>
public class TestDataset
{
    /// <summary>Identifier of the dataset. Several predefined datasets share the same name.</summary>
    public string name;

    /// <summary>
    /// Optional location hint for the data files. The predefined datasets either leave it
    /// null or set it to "external"; how it is resolved is decided by the consumer.
    /// </summary>
    public string path;

    /// <summary>File name of the training data.</summary>
    public string trainFilename;

    /// <summary>File name of the test data; null when the dataset declares none.</summary>
    public string testFilename;

    /// <summary>File name of the validation data; null when the dataset declares none.</summary>
    public string validFilename;

    /// <summary>File name of the label data; null when the dataset declares none.</summary>
    public string labelFilename;

    /// <summary>Column separator of the data files. Stays at the default '\0' when not set.</summary>
    public char fileSeparator;

    /// <summary>Whether the data files start with a header row. Defaults to false.</summary>
    public bool fileHasHeader;

    /// <summary>
    /// Quoting flag for the data files. Defaults to false.
    /// NOTE(review): presumably forwarded to the text loader's quoting option; confirm against callers.
    /// </summary>
    public bool allowQuoting;

    // REVIEW: Replace these with appropriate SubComponents!
    /// <summary>Settings string for the dataset.</summary>
    public string settings;

    /// <summary>Settings string used for the test file, when it differs from <see cref="settings"/>.</summary>
    public string testSettings;

    // REVIEW: Remove the three above setting strings once conversion work is complete.
    /// <summary>Loader specification string, e.g. "loader=Text{...}". May be empty or null.</summary>
    public string loaderSettings;

    /// <summary>Additional MAML arguments (for example "xf=..." transforms or "cache-"). May be null.</summary>
    public string[] mamlExtraSettings;

    /// <summary>
    /// Optional factory returning the <see cref="TextLoader.Column"/> definitions for the
    /// dataset. Null when the dataset does not declare its columns this way.
    /// </summary>
    public Func<TextLoader.Column[]> GetLoaderColumns;

    /// <summary>
    /// Creates a new <see cref="TestDataset"/> carrying the same value in every field.
    /// </summary>
    /// <remarks>
    /// The copy is shallow: the <see cref="mamlExtraSettings"/> array and the
    /// <see cref="GetLoaderColumns"/> delegate are shared with the source instance,
    /// so replace the array rather than mutating its elements on a clone.
    /// </remarks>
    /// <returns>A new instance whose fields equal those of this instance.</returns>
    public TestDataset Clone()
    {
        // Every field is copied, including path, the file-format fields and the column
        // factory, so that a clone of a dataset relying on them (for example one located
        // under "external" or one with a header row) remains usable.
        return new TestDataset
        {
            name = name,
            path = path,
            trainFilename = trainFilename,
            testFilename = testFilename,
            validFilename = validFilename,
            labelFilename = labelFilename,
            fileSeparator = fileSeparator,
            fileHasHeader = fileHasHeader,
            allowQuoting = allowQuoting,
            settings = settings,
            testSettings = testSettings,
            loaderSettings = loaderSettings,
            mamlExtraSettings = mamlExtraSettings,
            GetLoaderColumns = GetLoaderColumns
        };
    }
}
/// <summary>
/// Catalog of predefined <see cref="TestDataset"/> descriptions shared by the tests.
/// Each entry only records file names and settings strings; no file is opened here.
/// </summary>
public static class TestDatasets
{
    public static TestDataset breastCancer = new TestDataset
    {
        name = "breast-cancer",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        // REVIEW: The explicit empty string is deliberate. For the TL->MAML test translation
        // work it marks this instance as reviewed: no specific MAML settings are necessary,
        // and none should be added by people doing other translation work, since there are
        // presumably tests already depending on the empty settings as written.
        loaderSettings = ""
    };

    public static TestDataset breastCancerBoolLabel = new TestDataset
    {
        name = "breast-cancer",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        loaderSettings = "loader=Text{col=Label:BL:0 col=Features:~}"
    };

    public static TestDataset breastCancerGroupId = new TestDataset
    {
        name = "breast-cancer-group",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        loaderSettings = "loader=Text{col=Label:0 col=GroupId:U4[0-10]:1 col=Features:1-*}"
    };

    public static TestDataset breastCancerConst = new TestDataset
    {
        name = "breast-cancer",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt"
    };

    /// <summary>
    /// Expected to behave exactly like <see cref="breastCancer"/>.
    /// </summary>
    public static TestDataset breastCancerPipe = new TestDataset
    {
        name = "breast-cancer",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        // Spelling the range as "col=Features:1-5,6,7-9" improves code coverage, as does "col=Attr:TX:6".
        loaderSettings = "loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9}",
        mamlExtraSettings = new[] { "cache-" }
    };

    /// <summary>
    /// Same as <see cref="breastCancerPipe"/>, but with no mamlExtraSettings.
    /// </summary>
    public static TestDataset breastCancerPipeWithoutMamlExtraSettings = new TestDataset
    {
        name = "breast-cancer",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        // Spelling the range as "col=Features:1-5,6,7-9" improves code coverage, as does "col=Attr:TX:6".
        loaderSettings = "loader=Text{sparse- col=Attr:TX:6 col=Label:0 col=Features:1-5,6,7-9}"
    };

    /// <summary>
    /// Variant that fixes missing values (NAHandle transform).
    /// </summary>
    public static TestDataset breastCancerPipeMissing = new TestDataset
    {
        name = "breast-cancer-missing",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        // More and More_Cleansed carry duplicate information and are not strictly needed,
        // but they improve code coverage.
        loaderSettings = "loader=Text{col=Label:0 col=Good:1-5,7-9 col=Mixed:6 col=More:4-6}",
        mamlExtraSettings = new[]
        {
            "cache-",
            "xf=NAHandle{col=Fixed:Mixed col=More}",
            "xf=Concat{col=Features:Good,Fixed,More}"
        }
    };

    /// <summary>
    /// Variant that filters out missing values (MissingFilter transform).
    /// </summary>
    public static TestDataset breastCancerPipeMissingFilter = new TestDataset
    {
        name = "breast-cancer-missing-filter",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        // More and More_Cleansed carry duplicate information and are not strictly needed,
        // but they improve code coverage.
        loaderSettings = "loader=Text{col=Label:0 col=Good:1-5,7-9 col=Mixed:6 col=More:4-6}",
        mamlExtraSettings = new[]
        {
            "cache-",
            "xf=MissingFilter{col=Mixed col=More}",
            "xf=Concat{col=Features:Good,Mixed,More}"
        }
    };

    public static TestDataset breastCancerOneClass = new TestDataset
    {
        name = "breast-cancer-one-class",
        trainFilename = "breast-cancer.oneclass.txt",
        testFilename = "breast-cancer.txt",
        loaderSettings = ""
    };

    public static TestDataset breastCancerSparseBinaryFeatures = new TestDataset
    {
        name = "breast-cancer-sparse",
        trainFilename = "breast-cancer.txt",
        testFilename = "breast-cancer.txt",
        loaderSettings = "xf=expr{col=Features expr=x:float(x>4?1:0)}"
    };

    // Images of hand-written digits. Each input is an 8x8 matrix whose
    // elements are integers in the range 0..16.
    public static TestDataset Digits = new TestDataset
    {
        name = "Digits",
        path = "external",
        trainFilename = "digits.csv"
    };

    public static TestDataset vw = new TestDataset
    {
        name = "vw",
        trainFilename = "vw.dat",
        testFilename = "vw.dat"
    };

    public static TestDataset housing = new TestDataset
    {
        name = "housing",
        trainFilename = "housing.txt",
        testFilename = "housing.txt",
        fileSeparator = '\t',
        fileHasHeader = true,
        loaderSettings = "loader=Text{col=Label:0 col=Features:~ header=+}",
        // One Single-typed column per source index 0..11.
        GetLoaderColumns = () => new[]
        {
            new TextLoader.Column("MedianHomeValue", DataKind.Single, 0),
            new TextLoader.Column("CrimesPerCapita", DataKind.Single, 1),
            new TextLoader.Column("PercentResidental", DataKind.Single, 2),
            new TextLoader.Column("PercentNonRetail", DataKind.Single, 3),
            new TextLoader.Column("CharlesRiver", DataKind.Single, 4),
            new TextLoader.Column("NitricOxides", DataKind.Single, 5),
            new TextLoader.Column("RoomsPerDwelling", DataKind.Single, 6),
            new TextLoader.Column("PercentPre40s", DataKind.Single, 7),
            new TextLoader.Column("EmploymentDistance", DataKind.Single, 8),
            new TextLoader.Column("HighwayDistance", DataKind.Single, 9),
            new TextLoader.Column("TaxRate", DataKind.Single, 10),
            new TextLoader.Column("TeacherRatio", DataKind.Single, 11)
        }
    };

    // Same files as generatedRegressionDataset; the loader arguments are given bare,
    // without the "loader=Text{...}" wrapper.
    public static TestDataset generatedRegressionDatasetmacro = new TestDataset
    {
        name = "generatedRegressionDataset",
        trainFilename = "generated_regression_dataset.csv",
        testFilename = "generated_regression_dataset.csv",
        loaderSettings = "col=Label:R4:11 col=Features:R4:0-10 sep=; header+"
    };

    public static TestDataset WikiDetox = new TestDataset
    {
        name = "WikiDetox",
        path = "external",
        trainFilename = "WikiDetoxAnnotated160kRows.tsv",
        testFilename = "WikiDetoxAnnotated160kRows.tsv"
    };

    public static TestDataset MSLRWeb = new TestDataset
    {
        name = "MSLRWeb",
        path = "external",
        trainFilename = "MSLRWeb10KTrain720kRows.tsv",
        validFilename = "MSLRWeb10KValidate240kRows.tsv",
        testFilename = "MSLRWeb10KTest240kRows.tsv"
    };

    public static TestDataset Sentiment = new TestDataset
    {
        name = "sentiment",
        trainFilename = "wikipedia-detox-250-line-data.tsv",
        testFilename = "wikipedia-detox-250-line-test.tsv",
        fileHasHeader = true,
        fileSeparator = '\t',
        allowQuoting = true,
        // Boolean label in column 0, raw text in column 1.
        GetLoaderColumns = () => new[]
        {
            new TextLoader.Column("Label", DataKind.Boolean, 0),
            new TextLoader.Column("SentimentText", DataKind.String, 1)
        }
    };

    public static TestDataset generatedRegressionDataset = new TestDataset
    {
        name = "generatedRegressionDataset",
        trainFilename = "generated_regression_dataset.csv",
        testFilename = "generated_regression_dataset.csv",
        loaderSettings = "loader=Text{col=Label:R4:11 col=Features:R4:0-10 sep=; header+}"
    };

    public static TestDataset msm = new TestDataset
    {
        // REVIEW: Why is the MSM train set smaller than the test set? Reverse these!
        name = "MSM-sparse-sample",
        trainFilename = "MSM-sparse-sample-train.txt",
        testFilename = "MSM-sparse-sample-test.txt",
        loaderSettings = "loader=Text{col=Name:TX:0 col=Label:Num:1 col=Features:Num:~}",
        mamlExtraSettings = new[] { "xf=Expr{col=Name expr={x=>right(x, 1)}}" }
    };

    public static TestDataset msmNamesHeader = new TestDataset
    {
        name = "MSM-names",
        trainFilename = @"..\SmartMatch\Instances-Relevance.txt",
        testFilename = @"..\SmartMatch\Instances-Relevance.txt",
        settings = "header+;name:0,1"
    };

    public static TestDataset msmNamesHeaderIps = new TestDataset
    {
        name = "MSM-names",
        trainFilename = @"..\SmartMatch\Instances-Relevance.txt",
        testFilename = @"..\SmartMatch\Instances-Relevance.txt",
        settings = "header+;name:0,1"
    };

    public static TestDataset extract1 = new TestDataset
    {
        name = "Extract1",
        trainFilename = "Extract1.txt",
        testFilename = "Extract1.txt"
    };

    public static TestDataset breastCancerBing = new TestDataset
    {
        name = "breast-cancer-bing",
        trainFilename = "breast-cancer-bing.txt",
        testFilename = "breast-cancer-bing.txt"
    };

    public static TestDataset adult = new TestDataset
    {
        name = "Census",
        trainFilename = "adult.tiny.with-schema.txt",
        testFilename = "adult.tiny.with-schema.txt",
        fileHasHeader = true,
        fileSeparator = '\t',
        loaderSettings = "loader=Text{header+ col=Label:0 col=Num:9-14 col=Cat:TX:1-8}",
        mamlExtraSettings = new[]
        {
            "xf=Cat{col=Cat}",
            "xf=Concat{col=Features:Num,Cat}"
        }
    };

    public static TestDataset adultOnlyCat = new TestDataset
    {
        name = "Census-Cat-Only",
        trainFilename = "adult.tiny.with-schema.txt",
        testFilename = "adult.tiny.with-schema.txt",
        loaderSettings = "loader=Text{header+ col=Label:0 col=Cat:TX:1-8}",
        mamlExtraSettings = new[]
        {
            "xf=Cat{col=Cat}",
            "xf=Concat{col=Features:Cat}"
        }
    };

    public static TestDataset adultHash = new TestDataset
    {
        name = "CensusHash",
        trainFilename = "adult.tiny.with-schema.txt",
        testFilename = "adult.tiny.with-schema.txt",
        loaderSettings = "loader=Text{header+ col=Label:0 col=Num:9-14 col=Cat:TX:1-8}",
        mamlExtraSettings = new[]
        {
            "xf=CatHash{col=Cat bits=5}",
            "xf=Concat{col=Features:Num,Cat}"
        }
    };

    public static TestDataset adultHashWithDataPipe = new TestDataset
    {
        name = "CensusHashWithPipe",
        trainFilename = "adult.tiny.with-schema.txt",
        testFilename = "adult.tiny.with-schema.txt",
        loaderSettings = "loader=Text{header+ col=Cat:TX:1-8 col=Label:0 col=Num:~}",
        mamlExtraSettings = new[]
        {
            "xf=CatHash{col=Hash:5:Cat}",
            "xf=Concat{col=Features:Num,Hash}"
        }
    };

    public static TestDataset adultText = new TestDataset
    {
        name = "CensusText",
        trainFilename = "adult.tiny.with-schema.txt",
        testFilename = "adult.tiny.with-schema.txt",
        loaderSettings = "loader=Text{header+ col=Label:0 col=Word:TX:1-8 col=Num:~}",
        mamlExtraSettings = new[]
        {
            "xf=WordBag{col=Word}",
            "xf=Concat{col=Features:Num,Word}"
        }
    };

    public static TestDataset adultTextHash = new TestDataset
    {
        name = "CensusTextHash",
        trainFilename = "adult.tiny.with-schema.txt",
        testFilename = "adult.tiny.with-schema.txt",
        loaderSettings = "loader=Text{header+ col=Label:0 col=Word:TX:1-8 col=Num:~}",
        mamlExtraSettings = new[]
        {
            "xf=WordHashBag{col=Word bits=8}",
            "xf=Concat{col=Features:Num,Word}"
        }
    };

    public static TestDataset adultRanking = new TestDataset
    {
        name = "adultRanking",
        trainFilename = "adult.tiny.with-schema.txt",
        loaderSettings = "loader=Text{header+ sep=tab, col=Label:R4:0 col=Workclass:TX:1 col=Categories:TX:2-8 col=NumericFeatures:R4:9-14}"
    };

    public static TestDataset displayPoisson = new TestDataset
    {
        name = "DisplayPoisson",
        trainFilename = @"..\synthetic\Poisson-display-train.txt",
        testFilename = @"..\synthetic\Poisson-display-test.txt",
        settings = "header+;cat:1,2,3;label:5;max:2000;attr:6,7",
        testSettings = "header+;cat:1,2,3;label:6;attr:4,7,8"
    };

    public static TestDataset displayPoissonWithInstanceParser = new TestDataset
    {
        name = "DisplayPoisson",
        trainFilename = @"..\synthetic\Poisson-display-train.txt",
        testFilename = @"..\synthetic\Poisson-display-test.txt",
        settings = "header+;cat:1,2,3;label:5;max:2000;attr:6,7",
        testSettings = "header+;cat:1,2,3;label:6;attr:4,7,8"
    };

    public static TestDataset childrenPoisson = new TestDataset
    {
        name = "ChildrenPoisson",
        trainFilename = @"..\children\children.txt",
        testFilename = @"..\children\children.txt",
        loaderSettings = "loader=Text{header+ sep=space col=Cat1:TX:1 col=Cat2:TX:2 col=Cat3:TX:3 col=Label:4 col=Ignore:TX:0,5-7 col=Features:8-*}",
        mamlExtraSettings = new[]
        {
            "xf=Cat{col=Cat1 col=Cat2 col=Cat3}",
            "xf=Concat{col=Features:Features,Cat1,Cat2,Cat3}"
        }
        // settings = "header+;sep:space;cat:1,2,3;label:4;attr:0,5,6,7",
    };

    public static TestDataset autosSample = new TestDataset
    {
        name = "AutosSample",
        trainFilename = "auto-sample.txt",
        testFilename = "auto-sample.txt",
        loaderSettings = "loader=Text{col=Label:0 col=Cat3:TX:3 col=Cat4:TX:4 col=Cat5:TX:5 col=Cat6:TX:6 col=Cat7:TX:7 col=Cat8:TX:8 col=Cat9:TX:9 col=Cat15:TX:15 col=Cat16:TX:16 col=Cat18:TX:18 col=Features:~}",
        mamlExtraSettings = new[]
        {
            "xf=Cat{col=Cat3 col=Cat4 col=Cat5 col=Cat6 col=Cat7 col=Cat8 col=Cat9 col=Cat15 col=Cat16 col=Cat18}",
            "xf=Concat{col=Features:Features,Cat3,Cat4,Cat5,Cat6,Cat7,Cat8,Cat9,Cat15,Cat16,Cat18}"
        }
    };

    public static TestDataset reutersMaxDim = new TestDataset
    {
        name = "reuters",
        trainFilename = @"RCV1\reuters-toy-test.txt",
        testFilename = @"RCV1\reuters-toy-test.txt",
        loaderSettings = "loader=Text{size=10000 col=Label:0 col=Features:1-*}",
        settings = "maxdim:10000"
    };

    public static TestDataset irisLoader = new TestDataset
    {
        name = "iris",
        trainFilename = "iris.txt",
        testFilename = "iris.txt",
        loaderSettings = "loader=Text{col=Label:TX:0 col=Features:1-*}",
        mamlExtraSettings = new[] { "xf=Term{col=Label}" }
    };

    public static TestDataset irisData = new TestDataset
    {
        name = "iris",
        trainFilename = "iris.data",
        loaderSettings = "loader=Text{col=Label:TX:4 col=Features:0-3}",
        // Four Single-typed measurement columns followed by a String label in column 4.
        GetLoaderColumns = () => new[]
        {
            new TextLoader.Column("SepalLength", DataKind.Single, 0),
            new TextLoader.Column("SepalWidth", DataKind.Single, 1),
            new TextLoader.Column("PetalLength", DataKind.Single, 2),
            new TextLoader.Column("PetalWidth", DataKind.Single, 3),
            new TextLoader.Column("Label", DataKind.String, 4)
        }
    };

    public static TestDataset irisLabelName = new TestDataset
    {
        name = "iris-label-name",
        trainFilename = "iris-label-name.txt",
        testFilename = "iris-label-name.txt",
        loaderSettings = "loader=Text{header+ col=Label:TX:0 col=Features:1-*}",
        mamlExtraSettings = new[] { "xf=Term{col=Label}" }
    };

    public static TestDataset irisTreeFeaturized = new TestDataset
    {
        name = "iris-tree-featurized",
        trainFilename = "iris.txt",
        testFilename = "iris.txt",
        loaderSettings = "loader=Text{col=Label:U4[0-2]:0 col=Features:1-*}",
        mamlExtraSettings = new[] { "xf=TreeFeat{lps=0 trainer=ftr{iter=3}} xf=copy{col=Features:Leaves}" }
    };

    // Differs from irisTreeFeaturized only in the name and in "lps=2" instead of "lps=0".
    public static TestDataset irisTreeFeaturizedPermuted = new TestDataset
    {
        name = "iris-tree-featurized-permuted",
        trainFilename = "iris.txt",
        testFilename = "iris.txt",
        loaderSettings = "loader=Text{col=Label:U4[0-2]:0 col=Features:1-*}",
        mamlExtraSettings = new[] { "xf=TreeFeat{lps=2 trainer=ftr{iter=3}} xf=copy{col=Features:Leaves}" }
    };

    public static TestDataset irisLoaderU404 = new TestDataset
    {
        name = "iris",
        trainFilename = "iris.txt",
        testFilename = "iris.txt",
        loaderSettings = "loader=Text{col=Label:U4[0-2]:0 col=Features:1-4}"
    };

    public static TestDataset iris = new TestDataset
    {
        name = "iris",
        trainFilename = "iris.txt",
        testFilename = "iris.txt",
        fileHasHeader = true,
        fileSeparator = '\t',
        mamlExtraSettings = new[] { "xf=Term{col=Label}" }
    };

    public static TestDataset irisDb = new TestDataset
    {
        name = "iris",
        trainFilename = "iris-train",
        testFilename = "iris-test"
    };

    // irisDbSQLite exists because SQLite table names containing '-' are not valid;
    // the underscore '_' is the only valid separator character in SQLite.
    public static TestDataset irisDbSQLite = new TestDataset
    {
        name = "iris",
        trainFilename = "iris_train",
        testFilename = "iris_test"
    };

    public static TestDataset irisMissing = new TestDataset
    {
        name = "irisMissing",
        trainFilename = "iris.txt",
        testFilename = "iris.txt",
        // Produce missing labels in iris: generate a number, then replace 5% of labels with NA.
        mamlExtraSettings = new[] { "xf=generateNumber{col=A} xf=expr{col=Label:Label,A expr={(lab,a):a<0.05?na(lab):lab}}" }
    };

    public static TestDataset LM = new TestDataset
    {
        name = "LM",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt",
        settings = "name:1,2;text:3,4;header+"
    };

    public static TestDataset LMWordHashBag = new TestDataset
    {
        name = "LM",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt"
    };

    public static TestDataset LMCharGrams = new TestDataset
    {
        name = "LMCharGrams",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt"
    };

    public static TestDataset LMBigrams = new TestDataset
    {
        name = "LMBigrams",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt"
    };

    public static TestDataset LMNgrams = new TestDataset
    {
        name = "LMNgrams",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt"
    };

    public static TestDataset LMSkipNgrams = new TestDataset
    {
        name = "LMSkipNgrams",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt"
    };

    public static TestDataset LMNgramsHashing = new TestDataset
    {
        name = "LMNgramsHashing",
        trainFilename = @"..\LM\Local.source_features.de-de.txt",
        testFilename = @"..\LM\Local.validate_features.de-de.txt",
        labelFilename = @"..\LM\Mapping.de-de.txt"
    };

    public static TestDataset rankingText = new TestDataset
    {
        name = "ranking",
        trainFilename = "ranking-sample.txt",
        testFilename = "ranking-sample.txt",
        labelFilename = "ranking-sample.txt.labels",
        loaderSettings = "loader=Text{header+ col=Label:TX:0 col=GroupId:U4[0-*]:1 col=Name:TX:1-2 col=Features:3-*}"
    };

    public static TestDataset rankingExtract = new TestDataset
    {
        name = "ranking",
        trainFilename = "ranking-sample.txt",
        testFilename = "ranking-sample.txt",
        labelFilename = "ranking-sample.txt.labels"
    };

    public static TestDataset breastCancerWeighted = new TestDataset
    {
        name = "breast-cancer-weighted",
        trainFilename = @"ArtificiallyWeighted\breast-cancer-weights-quarter.txt",
        testFilename = @"ArtificiallyWeighted\breast-cancer-weights-quarter.txt",
        loaderSettings = "loader=Text{col=Weight:0 col=Label:1 col=Features:~}",
        settings = "weight:0;label:1"
    };

    public static TestDataset breastCancerDifferentlyWeighted = new TestDataset
    {
        name = "breast-cancer-weighted",
        trainFilename = @"ArtificiallyWeighted\breast-cancer-weights-quarter.txt",
        testFilename = @"ArtificiallyWeighted\breast-cancer-weights-quarter.txt",
        loaderSettings = "loader=Text{col=Label:Num:1 col=Weight:Num:4 col=Features:Num:~}"
    };

    public static TestDataset housingWeightedRep = new TestDataset
    {
        name = "housing-weighted",
        trainFilename = @"ArtificiallyWeighted\housing-weights-quarter-rep.txt",
        testFilename = @"ArtificiallyWeighted\housing-weights-quarter-rep.txt",
        loaderSettings = "loader=Text{col=Weight:0 col=Label:1 col=Features:~}"
    };

    public static TestDataset housingDifferentlyWeightedRep = new TestDataset
    {
        name = "housing-weighted",
        trainFilename = @"ArtificiallyWeighted\housing-weights-quarter-rep.txt",
        testFilename = @"ArtificiallyWeighted\housing-weights-quarter-rep.txt",
        loaderSettings = "loader=Text{col=Label:1 col=Weight:10 col=Features:~}",
        settings = "weight:10;label:1"
    };

    public static TestDataset rankingWeighted = new TestDataset
    {
        name = "ranking-weighted",
        trainFilename = @"ArtificiallyWeighted\ranking-sample-weights-one.txt",
        testFilename = @"ArtificiallyWeighted\ranking-sample-weights-one.txt",
        labelFilename = "ranking-sample.txt.labels",
        loaderSettings = "loader=Text{header+ col=Weight:0 col=Label:TX:1 col=Name:TX:2-3 col=GroupId:U4[0-*]:2 col=Features:~}"
    };

    public static TestDataset adultSparseWithCategory = new TestDataset
    {
        name = "adult-sparseWithCat",
        trainFilename = "adult.SparseWithCat.txt",
        testFilename = "adult.SparseWithCat.txt",
        settings = "cat:0"
    };

    public static TestDataset adultSparseWithCategoryHash = new TestDataset
    {
        name = "adult-sparseWithCat",
        trainFilename = "adult.SparseWithCat.txt",
        testFilename = "adult.SparseWithCat.txt",
        settings = ""
    };

    public static TestDataset adultSparseWithCatAsAtt = new TestDataset
    {
        name = "adult-sparseWithCat",
        trainFilename = "adult.SparseWithCat.txt",
        testFilename = "adult.SparseWithCat.txt",
        settings = ""
    };

    public static TestDataset pClick = new TestDataset
    {
        name = "pClick",
        trainFilename = @"..\pclick\pclick-train.txt",
        testFilename = @"..\pclick\pclick-test.txt",
        loaderSettings = "loader=Text{header+ col=Label:0 col=Features:1-84 rows=3000}",
        settings = "header+;attr:85,86;max:3000"
    };

    public static TestDataset mnist28 = new TestDataset
    {
        name = "mnist28",
        trainFilename = "Train-28x28.txt",
        testFilename = "Test-28x28.txt"
    };

    public static TestDataset mnistTiny28 = new TestDataset
    {
        name = "mnistTiny28",
        trainFilename = "Train-Tiny-28x28.txt",
        testFilename = "Test-Tiny-28x28.txt"
    };

    public static TestDataset sampleBingRegression = new TestDataset
    {
        name = "sampleBingRegression",
        trainFilename = @"..\Bing\SampleInputExtraction.txt",
        testFilename = @"..\Bing\SampleInputExtraction.txt",
        labelFilename = @"..\Bing\labelmap.txt",
        settings = "header:+;attr:1-4;groupkey:a0"
    };

    public static TestDataset sampleBingBin = new TestDataset
    {
        name = "sampleBingBin",
        trainFilename = @"..\Bing\SampleInputExtraction.bin",
        testFilename = @"..\Bing\SampleInputExtraction.bin",
        labelFilename = @"..\Bing\labelmap.txt",
        loaderSettings = "loader=Text{header+ col=Label:0 col=Features:1-84 rows=3000}"
    };

    public static TestDataset mnistOneClass = new TestDataset
    {
        name = "mnistOneClass",
        trainFilename = "MNIST.Train.0-class.tiny.txt",
        testFilename = "MNIST.Test.tiny.txt",
        fileHasHeader = false,
        fileSeparator = '\t',
        settings = ""
    };

    public static TestDataset WebClicksSample = new TestDataset
    {
        name = "webClicksSample",
        trainFilename = @"..\AdSelection\webClicksSample.small.txt",
        testFilename = @"..\AdSelection\webClicksSample.small.txt",
        settings = ""
    };

    public static TestDataset AppFailure = new TestDataset
    {
        name = "AppFailure",
        trainFilename = "AppFailure.csv",
        settings = "/inst Text{sep=, name=0 label=2}"
    };

    public static TestDataset azureCounterUnlabeled = new TestDataset
    {
        name = "azureCounterUnlabeled",
        trainFilename = "azure-train-unlabeled.txt",
        testFilename = "azure-test-unlabeled.txt",
        loaderSettings = "loader=Text{sep=space col=Name:TX:0 col=Features:~}",
        mamlExtraSettings = new[] { "xf=Expr{col=Label:Name expr={x : na(float(0))}}" },
        settings = "/inst Text{name=0 sep=space nolabel=+}"
    };

    public static TestDataset MQ2008 = new TestDataset
    {
        name = "MQ2008",
        trainFilename = @"MQ2008\Train.idv.small.txt",
        testFilename = @"MQ2008\Test.idv.small.txt",
        loaderSettings = "loader=Text"
    };

    public static TestDataset SequenceDataset = new TestDataset
    {
        name = "sequenceDataset",
        trainFilename = @"..\V3\Data\OCR\train.tsv",
        testFilename = @"..\V3\Data\OCR\train.tsv",
        loaderSettings = "loader=Text{col=Label:U1[0-25]:1 col=GroupId:U4[1-*]:3 col=Features:Num:4-*}"
    };

    public static TestDataset trivialMatrixFactorization = new TestDataset
    {
        name = "trivialMatrixFactorization",
        trainFilename = "trivial-train.tsv",
        testFilename = "trivial-test.tsv",
        fileHasHeader = true,
        fileSeparator = '\t',
        loaderSettings = "loader=Text{col=Label:R4:0 col=User:U4[0-19]:1 col=Item:U4[0-39]:2 header+}"
    };
}
}