* Fix https://github.com/dotnet/corefxlab/issues/2906

* Improvements and unit tests

* sq

* Better fix

* sq
This commit is contained in:
Prashanth Govindarajan 2020-04-30 11:51:00 -07:00 коммит произвёл GitHub
Родитель 32d36a1770
Коммит 300885c220
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 144 добавлений и 23 удалений

Просмотреть файл

@ -198,14 +198,23 @@ namespace Microsoft.Data.Analysis
Encoding encoding = null) Encoding encoding = null)
{ {
if (!csvStream.CanSeek) if (!csvStream.CanSeek)
{
throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream)); throw new ArgumentException(Strings.NonSeekableStream, nameof(csvStream));
}
if (dataTypes == null && guessRows <= 0)
{
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
}
var linesForGuessType = new List<string[]>(); var linesForGuessType = new List<string[]>();
long rowline = 0; long rowline = 0;
int numberOfColumns = dataTypes?.Length ?? 0; int numberOfColumns = dataTypes?.Length ?? 0;
if (header == true && numberOfRowsToRead != -1) if (header == true && numberOfRowsToRead != -1)
{
numberOfRowsToRead++; numberOfRowsToRead++;
}
List<DataFrameColumn> columns; List<DataFrameColumn> columns;
long streamStart = csvStream.Position; long streamStart = csvStream.Position;
@ -213,40 +222,39 @@ namespace Microsoft.Data.Analysis
using (var streamReader = new StreamReader(csvStream, encoding ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true)) using (var streamReader = new StreamReader(csvStream, encoding ?? Encoding.UTF8, detectEncodingFromByteOrderMarks: true, DefaultStreamReaderBufferSize, leaveOpen: true))
{ {
string line = null; string line = null;
if (dataTypes == null) line = streamReader.ReadLine();
while (line != null)
{ {
line = streamReader.ReadLine(); if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead)
while (line != null)
{ {
if ((numberOfRowsToRead == -1) || rowline < numberOfRowsToRead) if (linesForGuessType.Count < guessRows || (header && rowline == 0))
{ {
if (linesForGuessType.Count < guessRows) var spl = line.Split(separator);
if (header && rowline == 0)
{ {
var spl = line.Split(separator); if (columnNames == null)
if (header && rowline == 0)
{ {
if (columnNames == null) columnNames = spl;
columnNames = spl;
}
else
{
linesForGuessType.Add(spl);
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
} }
} }
else
{
linesForGuessType.Add(spl);
numberOfColumns = Math.Max(numberOfColumns, spl.Length);
}
} }
++rowline;
if (rowline == guessRows)
{
break;
}
line = streamReader.ReadLine();
} }
++rowline;
if (linesForGuessType.Count == 0) if (rowline == guessRows || guessRows == 0)
{ {
throw new FormatException(Strings.EmptyFile); break;
} }
line = streamReader.ReadLine();
}
if (rowline == 0)
{
throw new FormatException(Strings.EmptyFile);
} }
columns = new List<DataFrameColumn>(numberOfColumns); columns = new List<DataFrameColumn>(numberOfColumns);

9
src/Microsoft.Data.Analysis/strings.Designer.cs сгенерированный
Просмотреть файл

@ -132,6 +132,15 @@ namespace Microsoft.Data {
} }
} }
/// <summary>
///   Looks up a localized string similar to Expected either {0} or {1} to be provided.
///   The two placeholders are filled in by the code that formats this message.
/// </summary>
internal static string ExpectedEitherGuessRowsOrDataTypes =>
    ResourceManager.GetString("ExpectedEitherGuessRowsOrDataTypes", resourceCulture);
/// <summary> /// <summary>
/// Looks up a localized string similar to Column is immutable. /// Looks up a localized string similar to Column is immutable.
/// </summary> /// </summary>

Просмотреть файл

@ -141,6 +141,9 @@
<data name="ExceedsNumberOfColumns" xml:space="preserve"> <data name="ExceedsNumberOfColumns" xml:space="preserve">
<value>Parameter.Count exceeds the number of columns({0}) in the DataFrame </value> <value>Parameter.Count exceeds the number of columns({0}) in the DataFrame </value>
</data> </data>
<!-- Format placeholders: the throwing code passes nameof(guessRows) as {0} and nameof(dataTypes) as {1}. -->
<data name="ExpectedEitherGuessRowsOrDataTypes" xml:space="preserve">
<value>Expected either {0} or {1} to be provided</value>
</data>
<data name="ImmutableColumn" xml:space="preserve"> <data name="ImmutableColumn" xml:space="preserve">
<value>Column is immutable</value> <value>Column is immutable</value>
</data> </data>

Просмотреть файл

@ -151,6 +151,99 @@ CMT,1,1,181,0.6,CSH,4.5";
VerifyColumnTypes(df); VerifyColumnTypes(df);
} }
/// <summary>
/// Shared assertions for a loaded <see cref="DataFrame"/>: it must have 4 rows and 7 columns,
/// pass <c>VerifyColumnTypes</c>, and contain no null values in any column.
/// </summary>
/// <param name="df">The data frame to check.</param>
/// <param name="verifyColumnDataType">When true, also asserts the exact data type of each of the 7 columns.</param>
/// <param name="verifyNames">When true, also asserts the name of each of the 7 columns.</param>
void VerifyDataFrameWithNamedColumnsAndDataTypes(DataFrame df, bool verifyColumnDataType, bool verifyNames)
{
    Assert.Equal(4, df.Rows.Count);
    Assert.Equal(7, df.Columns.Count);

    if (verifyColumnDataType)
    {
        // Expected data type per column, in column order.
        Type[] expectedTypes =
        {
            typeof(string),
            typeof(short),
            typeof(int),
            typeof(long),
            typeof(float),
            typeof(string),
            typeof(double),
        };

        for (int index = 0; index < expectedTypes.Length; index++)
        {
            Assert.True(expectedTypes[index] == df.Columns[index].DataType);
        }
    }

    if (verifyNames)
    {
        // Expected column name per column, in column order.
        string[] expectedNames =
        {
            "vendor_id",
            "rate_code",
            "passenger_count",
            "trip_time_in_secs",
            "trip_distance",
            "payment_type",
            "fare_amount",
        };

        for (int index = 0; index < expectedNames.Length; index++)
        {
            Assert.Equal(expectedNames[index], df.Columns[index].Name);
        }
    }

    VerifyColumnTypes(df);

    foreach (var currentColumn in df.Columns)
    {
        Assert.Equal(0, currentColumn.NullCount);
    }
}
/// <summary>
/// Exercises <c>DataFrame.LoadCsv</c> across combinations of the <c>header</c>, <c>guessRows</c>
/// and <c>dataTypes</c> arguments. Loading with explicit data types must always succeed; loading
/// without data types must succeed only when <c>guessRows</c> is positive and otherwise throw an
/// <see cref="ArgumentException"/>.
/// </summary>
/// <param name="header">Whether the CSV text starts with a header line.</param>
/// <param name="guessRows">The number of rows passed to LoadCsv for type guessing.</param>
[Theory]
[InlineData(true, 0)]
[InlineData(false, 0)]
[InlineData(true, 10)]
[InlineData(false, 10)]
public void TestReadCsvWithTypesAndGuessRows(bool header, int guessRows)
{
    /* Tests this matrix
     *
        header      GuessRows     DataTypes
        True        0             NotNull
        False       0             NotNull
        True        10            NotNull
        False       10            NotNull
        True        0             Null      -----> Throws an exception
        False       0             Null      -----> Throws an exception
        True        10            Null
        False       10            Null
     *
     */
    string headerLine = @"vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount
";
    string dataLines =
@"CMT,1,1,1271,3.8,CRD,17.5
CMT,1,1,474,1.5,CRD,8
CMT,1,1,637,1.4,CRD,8.5
CMT,1,1,181,0.6,CSH,4.5";

    Stream GetStream(string streamData)
    {
        // Encode with UTF-8 rather than the platform-dependent Encoding.Default so the bytes
        // always match the UTF-8 fallback used when no encoding is passed to the loader.
        return new MemoryStream(Encoding.UTF8.GetBytes(streamData));
    }

    string data = header ? headerLine + dataLines : dataLines;

    // With explicit data types the load must succeed regardless of guessRows.
    DataFrame df = DataFrame.LoadCsv(GetStream(data),
        header: header,
        guessRows: guessRows,
        dataTypes: new Type[] { typeof(string), typeof(short), typeof(int), typeof(long), typeof(float), typeof(string), typeof(double) }
    );
    VerifyDataFrameWithNamedColumnsAndDataTypes(df, verifyColumnDataType: true, verifyNames: header);

    // Without data types the loader needs rows to guess from: any positive guessRows is valid,
    // while zero (or less) must be rejected with an ArgumentException.
    if (guessRows > 0)
    {
        df = DataFrame.LoadCsv(GetStream(data),
            header: header,
            guessRows: guessRows
        );
        VerifyDataFrameWithNamedColumnsAndDataTypes(df, verifyColumnDataType: false, verifyNames: header);
    }
    else
    {
        Assert.ThrowsAny<ArgumentException>(() => DataFrame.LoadCsv(GetStream(data),
            header: header,
            guessRows: guessRows
        ));
    }
}
[Fact] [Fact]
public void TestReadCsvWithTypes() public void TestReadCsvWithTypes()
{ {
@ -176,6 +269,14 @@ CMT,1,1,181,0.6,CSH,4.5";
Assert.True(typeof(float) == df.Columns[4].DataType); Assert.True(typeof(float) == df.Columns[4].DataType);
Assert.True(typeof(string) == df.Columns[5].DataType); Assert.True(typeof(string) == df.Columns[5].DataType);
Assert.True(typeof(double) == df.Columns[6].DataType); Assert.True(typeof(double) == df.Columns[6].DataType);
Assert.Equal("vendor_id", df.Columns[0].Name);
Assert.Equal("rate_code", df.Columns[1].Name);
Assert.Equal("passenger_count", df.Columns[2].Name);
Assert.Equal("trip_time_in_secs", df.Columns[3].Name);
Assert.Equal("trip_distance", df.Columns[4].Name);
Assert.Equal("payment_type", df.Columns[5].Name);
Assert.Equal("fare_amount", df.Columns[6].Name);
VerifyColumnTypes(df); VerifyColumnTypes(df);
foreach (var column in df.Columns) foreach (var column in df.Columns)