Allow specifying a CultureInfo for parsing values when reading a DataFrame from CSV (#6782)
* Use CultureInfo for parsing values in csv file * Fix merge issues
This commit is contained in:
Родитель
ccf34e370b
Коммит
d9dbf99d97
|
@ -98,18 +98,21 @@ namespace Microsoft.Data.Analysis
|
|||
/// <param name="guessRows">number of rows used to guess types</param>
|
||||
/// <param name="addIndexColumn">add one column with the row index</param>
|
||||
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
|
||||
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
|
||||
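/// <param name="cultureInfo">The culture used when converting values read from the CSV to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>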
|
||||
/// <returns>DataFrame</returns>
|
||||
public static DataFrame LoadCsv(string filename,
|
||||
char separator = ',', bool header = true,
|
||||
string[] columnNames = null, Type[] dataTypes = null,
|
||||
int numRows = -1, int guessRows = 10,
|
||||
bool addIndexColumn = false, Encoding encoding = null)
|
||||
bool addIndexColumn = false, Encoding encoding = null,
|
||||
bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
|
||||
{
|
||||
using (Stream fileStream = new FileStream(filename, FileMode.Open))
|
||||
{
|
||||
return LoadCsv(fileStream,
|
||||
separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows,
|
||||
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding);
|
||||
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -351,8 +354,14 @@ namespace Microsoft.Data.Analysis
|
|||
char separator = ',', bool header = true,
|
||||
string[] columnNames = null, Type[] dataTypes = null,
|
||||
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
|
||||
bool renameDuplicatedColumns = false)
|
||||
bool renameDuplicatedColumns = false,
|
||||
CultureInfo cultureInfo = null)
|
||||
{
|
||||
if (cultureInfo == null)
|
||||
{
|
||||
cultureInfo = CultureInfo.CurrentCulture;
|
||||
}
|
||||
|
||||
if (dataTypes == null && guessRows <= 0)
|
||||
{
|
||||
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
|
||||
|
@ -452,7 +461,7 @@ namespace Microsoft.Data.Analysis
|
|||
}
|
||||
else
|
||||
{
|
||||
ret.Append(fields, inPlace: true);
|
||||
ret.Append(fields, inPlace: true, cultureInfo: cultureInfo);
|
||||
}
|
||||
++rowline;
|
||||
}
|
||||
|
@ -508,7 +517,6 @@ namespace Microsoft.Data.Analysis
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -522,14 +530,18 @@ namespace Microsoft.Data.Analysis
|
|||
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
|
||||
/// <param name="guessRows">number of rows used to guess types</param>
|
||||
/// <param name="addIndexColumn">add one column with the row index</param>
|
||||
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
|
||||
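/// <param name="cultureInfo">The culture used when converting values read from the CSV to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>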
|
||||
/// <returns><see cref="DataFrame"/></returns>
|
||||
public static DataFrame LoadCsvFromString(string csvString,
|
||||
char separator = ',', bool header = true,
|
||||
string[] columnNames = null, Type[] dataTypes = null,
|
||||
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
|
||||
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
|
||||
bool renameDuplicatedColumns = false,
|
||||
CultureInfo cultureInfo = null)
|
||||
{
|
||||
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);
|
||||
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
|
||||
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -545,12 +557,13 @@ namespace Microsoft.Data.Analysis
|
|||
/// <param name="addIndexColumn">add one column with the row index</param>
|
||||
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
|
||||
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
|
||||
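/// <param name="cultureInfo">The culture used when converting values read from the CSV to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>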
|
||||
/// <returns><see cref="DataFrame"/></returns>
|
||||
public static DataFrame LoadCsv(Stream csvStream,
|
||||
char separator = ',', bool header = true,
|
||||
string[] columnNames = null, Type[] dataTypes = null,
|
||||
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
|
||||
Encoding encoding = null, bool renameDuplicatedColumns = false)
|
||||
Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
|
||||
{
|
||||
if (!csvStream.CanSeek)
|
||||
{
|
||||
|
@ -563,7 +576,7 @@ namespace Microsoft.Data.Analysis
|
|||
}
|
||||
|
||||
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
|
||||
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns);
|
||||
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.Globalization;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
|
||||
|
@ -484,12 +485,13 @@ namespace Microsoft.Data.Analysis
|
|||
/// <remarks>If a <seealso cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
|
||||
/// <param name="rows">The rows to be appended to this DataFrame </param>
|
||||
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
|
||||
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
|
||||
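/// <param name="cultureInfo">The culture used when converting row values to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>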
|
||||
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false, CultureInfo cultureInfo = null)
|
||||
{
|
||||
DataFrame ret = inPlace ? this : Clone();
|
||||
foreach (DataFrameRow row in rows)
|
||||
{
|
||||
ret.Append(row, inPlace: true);
|
||||
ret.Append(row, inPlace: true, cultureInfo: cultureInfo);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -501,8 +503,14 @@ namespace Microsoft.Data.Analysis
|
|||
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
|
||||
/// <param name="row">The values to append as a new row. If null, a null value is appended to each column</param>
|
||||
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
|
||||
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
|
||||
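/// <param name="cultureInfo">The culture used when converting row values to the column data types. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>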
|
||||
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false, CultureInfo cultureInfo = null)
|
||||
{
|
||||
if (cultureInfo == null)
|
||||
{
|
||||
cultureInfo = CultureInfo.CurrentCulture;
|
||||
}
|
||||
|
||||
DataFrame ret = inPlace ? this : Clone();
|
||||
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
|
||||
bool columnMoveNext = columnEnumerator.MoveNext();
|
||||
|
@ -530,7 +538,7 @@ namespace Microsoft.Data.Analysis
|
|||
}
|
||||
if (value != null)
|
||||
{
|
||||
value = Convert.ChangeType(value, column.DataType);
|
||||
value = Convert.ChangeType(value, column.DataType, cultureInfo);
|
||||
|
||||
if (value is null)
|
||||
{
|
||||
|
|
|
@ -14,6 +14,7 @@ using System.Data.SQLite;
|
|||
using System.Data.SQLite.EF6;
|
||||
using Xunit;
|
||||
using Microsoft.ML.TestFramework.Attributes;
|
||||
using System.Threading;
|
||||
|
||||
namespace Microsoft.Data.Analysis.Tests
|
||||
{
|
||||
|
@ -154,6 +155,42 @@ namespace Microsoft.Data.Analysis.Tests
|
|||
ReducedRowsTest(csvDf);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestReadCsvWithHeaderCultureInfoAndSeparator()
|
||||
{
|
||||
string data = @$"vendor_id;rate_code;passenger_count;trip_time_in_secs;trip_distance;payment_type;fare_amount
|
||||
CMT;1;1;1271;3,8;CRD;17,5
|
||||
CMT;1;1;474;1,5;CRD;8
|
||||
CMT;1;1;637;1,4;CRD;8,5
|
||||
CMT;1;1;181;0,6;CSH;4,5";
|
||||
|
||||
void RegularTest(DataFrame df)
|
||||
{
|
||||
Assert.Equal(4, df.Rows.Count);
|
||||
Assert.Equal(7, df.Columns.Count);
|
||||
|
||||
Assert.Equal(3.8f, (float)df["trip_distance"][0]);
|
||||
Assert.Equal(17.5f, (float)df["fare_amount"][0]);
|
||||
|
||||
Assert.Equal(1.5f, (float)df["trip_distance"][1]);
|
||||
Assert.Equal(8f, (float)df["fare_amount"][1]);
|
||||
|
||||
Assert.Equal(1.4f, (float)df["trip_distance"][2]);
|
||||
Assert.Equal(8.5f, (float)df["fare_amount"][2]);
|
||||
|
||||
VerifyColumnTypes(df);
|
||||
}
|
||||
|
||||
// de-DE has ',' as decimal separator
|
||||
var cultureInfo = new CultureInfo("de-DE");
|
||||
DataFrame df = DataFrame.LoadCsv(GetStream(data), separator: ';', cultureInfo: cultureInfo);
|
||||
|
||||
RegularTest(df);
|
||||
|
||||
DataFrame csvDf = DataFrame.LoadCsvFromString(data, separator: ';', cultureInfo: cultureInfo);
|
||||
RegularTest(csvDf);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestReadCsvWithHeaderAndDuplicatedColumns_WithoutRenaming()
|
||||
{
|
||||
|
|
Загрузка…
Ссылка в новой задаче