add hasHeader to ColumnInference (#4922)

This commit is contained in:
Xiaoyun Zhang 2020-03-06 10:46:07 -08:00 коммит произвёл GitHub
Родитель ae1b709947
Коммит e5a19af589
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файла: 56 добавлений и 3 удаления

Просмотреть файл

@ -36,12 +36,12 @@ namespace Microsoft.ML.AutoML
}
/// <summary>
/// Infers column information for the text file at <paramref name="path"/>: builds a
/// <c>TextFileSample</c> from the file, runs split inference and column-type inference on that
/// sample, then delegates to the overload that accepts the split and type inference results.
/// </summary>
/// <param name="hasHeader">
/// Forwarded to column-type inference and to the delegated overload. Defaults to <c>true</c>.
/// </param>
// NOTE(review): this hunk is rendered without +/- markers, so the pre-change and post-change
// versions of the signature continuation and of the last two statements both appear below.
// The lines that mention hasHeader look like the post-change versions - confirm against the real file.
public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo,
char? separatorChar, bool? allowQuotedStrings, bool? supportSparse, bool trimWhitespace, bool groupColumns)
char? separatorChar, bool? allowQuotedStrings, bool? supportSparse, bool trimWhitespace, bool groupColumns, bool hasHeader = true)
{
var sample = TextFileSample.CreateFromFullFile(path);
// Split inference receives the caller's separator / quoting / sparse settings.
var splitInference = InferSplit(context, sample, separatorChar, allowQuotedStrings, supportSparse);
var typeInference = InferColumnTypes(context, sample, splitInference, true, null, columnInfo.LabelColumnName);
return InferColumns(context, path, columnInfo, true, splitInference, typeInference, trimWhitespace, groupColumns);
// These two lines pass the caller-supplied hasHeader where the two lines above pass a literal true.
var typeInference = InferColumnTypes(context, sample, splitInference, hasHeader, null, columnInfo.LabelColumnName);
return InferColumns(context, path, columnInfo, hasHeader, splitInference, typeInference, trimWhitespace, groupColumns);
}
public static ColumnInferenceResults InferColumns(MLContext context, string path, ColumnInformation columnInfo, bool hasHeader,

Просмотреть файл

@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.ML.Data;
@ -91,6 +92,35 @@ namespace Microsoft.ML.AutoML.Test
Assert.Equal(DefaultColumnNames.Label, result.ColumnInformation.LabelColumnName);
}
[Fact]
public void InferDatasetWithoutHeader()
{
    // Arrange: a comma-separated file read with hasHeader: false. The test refers to its
    // columns by the names "col0", "col1", ... and assigns roles to some of them.
    var mlContext = new MLContext(1);
    var datasetPath = Path.Combine("TestData", "DatasetWithoutHeader.txt");
    var columnInformation = new ColumnInformation()
    {
        LabelColumnName = "col0",
        UserIdColumnName = "col1",
        ItemIdColumnName = "col2",
    };
    columnInformation.IgnoredColumnNames.Add("col4");

    // Act: run inference without treating the first row as a header.
    var inferenceResults = ColumnInferenceApi.InferColumns(
        mlContext,
        datasetPath,
        columnInformation,
        separatorChar: ',',
        allowQuotedStrings: null,
        supportSparse: null,
        trimWhitespace: false,
        groupColumns: false,
        hasHeader: false);

    // Assert: six loader columns are produced and each role column has the expected kind.
    var loaderColumns = inferenceResults.TextLoaderOptions.Columns;
    Assert.Equal(6, loaderColumns.Count());

    var labelKind = loaderColumns.First(column => column.Name == "col0").DataKind;
    var userIdKind = loaderColumns.First(column => column.Name == "col1").DataKind;
    var itemIdKind = loaderColumns.First(column => column.Name == "col2").DataKind;
    var ignoredKind = loaderColumns.First(column => column.Name == "col4").DataKind;

    Assert.Equal(DataKind.String, labelKind);
    Assert.Equal(DataKind.Single, userIdKind);
    Assert.Equal(DataKind.Single, itemIdKind);
    Assert.Equal(DataKind.Single, ignoredKind);

    // Exactly one column is reported as categorical and none as text.
    Assert.Single(inferenceResults.ColumnInformation.CategoricalColumnNames);
    Assert.Empty(inferenceResults.ColumnInformation.TextColumnNames);
}
[Fact]
public void WhereNameColumnIsOnlyFeature()
{

Просмотреть файл

@ -14,6 +14,9 @@
<None Update="TestData\DatasetWithDefaultColumnNames.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\DatasetWithoutHeader.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>
<None Update="TestData\NameColumnIsOnlyFeatureDataset.txt">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</None>

Просмотреть файл

@ -0,0 +1,20 @@
CMT,1271,3.8,CRD,17.5,T
CMT,474,1.5,CRD,8,T
CMT,637,1.4,CRD,8.5,T
CMT,181,0.6,CSH,4.5,T
CMT,661,1.1,CRD,8.5,T
CMT,935,9.6,CSH,27.5,T
CMT,869,2.3,CRD,11.5,T
CMT,454,1.4,CRD,7.5,T
CMT,366,1.5,CSH,7.5,T
VTS,1140,5.61,CSH,18.5,F
VTS,120,0.67,CSH,4,F
VTS,240,1.7,CRD,6.5,F
VTS,660,2.52,CRD,10.5,F
VTS,420,0.82,CSH,6,F
VTS,420,1.04,CRD,6.5,F
VTS,2280,18,CRD,52,F
VTS,360,1.2,CRD,6.5,F
VTS,660,2.22,CSH,10,F
VTS,840,3.29,CSH,12.5,F
VTS,540,1.85,CRD,8.5,F