Allow specifying a CultureInfo for parsing values when reading a DataFrame from CSV (#6782)

* Use CultureInfo for parsing values in csv file

* Fix merge issues
This commit is contained in:
Aleksei Smirnov 2023-09-01 06:36:46 +03:00 коммит произвёл GitHub
Родитель ccf34e370b
Коммит d9dbf99d97
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
3 изменённых файлов: 71 добавлений и 13 удалений

Просмотреть файл

@ -98,18 +98,21 @@ namespace Microsoft.Data.Analysis
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">culture info used for parsing values</param>
/// <returns>DataFrame</returns>
public static DataFrame LoadCsv(string filename,
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
int numRows = -1, int guessRows = 10,
bool addIndexColumn = false, Encoding encoding = null)
bool addIndexColumn = false, Encoding encoding = null,
bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
{
// Open the file and delegate to the Stream overload; the using block disposes the stream once loading returns.
// NOTE(review): per the .NET docs, FileStream(path, FileMode) requests read/write access, so a read-only
// file may fail to open here — consider passing FileAccess.Read. Confirm before changing.
using (Stream fileStream = new FileStream(filename, FileMode.Open))
{
// numRows is forwarded as numberOfRowsToRead; every other argument is passed through under its own name.
return LoadCsv(fileStream,
separator: separator, header: header, columnNames: columnNames, dataTypes: dataTypes, numberOfRowsToRead: numRows,
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding);
guessRows: guessRows, addIndexColumn: addIndexColumn, encoding: encoding, renameDuplicatedColumns: renameDuplicatedColumns, cultureInfo: cultureInfo);
}
}
@ -351,8 +354,14 @@ namespace Microsoft.Data.Analysis
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
bool renameDuplicatedColumns = false)
bool renameDuplicatedColumns = false,
CultureInfo cultureInfo = null)
{
if (cultureInfo == null)
{
cultureInfo = CultureInfo.CurrentCulture;
}
if (dataTypes == null && guessRows <= 0)
{
throw new ArgumentException(string.Format(Strings.ExpectedEitherGuessRowsOrDataTypes, nameof(guessRows), nameof(dataTypes)));
@ -452,7 +461,7 @@ namespace Microsoft.Data.Analysis
}
else
{
ret.Append(fields, inPlace: true);
ret.Append(fields, inPlace: true, cultureInfo: cultureInfo);
}
++rowline;
}
@ -508,7 +517,6 @@ namespace Microsoft.Data.Analysis
}
}
}
/// <summary>
@ -522,14 +530,18 @@ namespace Microsoft.Data.Analysis
/// <param name="numberOfRowsToRead">number of rows to read not including the header(if present)</param>
/// <param name="guessRows">number of rows used to guess types</param>
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">culture info used for parsing values</param>
/// <returns><see cref="DataFrame"/></returns>
public static DataFrame LoadCsvFromString(string csvString,
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false)
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
bool renameDuplicatedColumns = false,
CultureInfo cultureInfo = null)
{
// Wrap the in-memory CSV text in the same reader wrapper type the Stream overload uses, then hand it to the line reader.
// cultureInfo is forwarded unchanged, so a null value reaches ReadCsvLinesIntoDataFrame as null.
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvString);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
}
/// <summary>
@ -545,12 +557,13 @@ namespace Microsoft.Data.Analysis
/// <param name="addIndexColumn">add one column with the row index</param>
/// <param name="encoding">The character encoding. Defaults to UTF8 if not specified</param>
/// <param name="renameDuplicatedColumns">If set to true, columns with repeated names are auto-renamed.</param>
/// <param name="cultureInfo">culture info used for parsing values</param>
/// <returns><see cref="DataFrame"/></returns>
public static DataFrame LoadCsv(Stream csvStream,
char separator = ',', bool header = true,
string[] columnNames = null, Type[] dataTypes = null,
long numberOfRowsToRead = -1, int guessRows = 10, bool addIndexColumn = false,
Encoding encoding = null, bool renameDuplicatedColumns = false)
Encoding encoding = null, bool renameDuplicatedColumns = false, CultureInfo cultureInfo = null)
{
if (!csvStream.CanSeek)
{
@ -563,7 +576,7 @@ namespace Microsoft.Data.Analysis
}
WrappedStreamReaderOrStringReader wrappedStreamReaderOrStringReader = new WrappedStreamReaderOrStringReader(csvStream, encoding ?? Encoding.UTF8);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns);
return ReadCsvLinesIntoDataFrame(wrappedStreamReaderOrStringReader, separator, header, columnNames, dataTypes, numberOfRowsToRead, guessRows, addIndexColumn, renameDuplicatedColumns, cultureInfo);
}
/// <summary>

Просмотреть файл

@ -5,6 +5,7 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Globalization;
using System.Linq;
using System.Text;
@ -484,12 +485,13 @@ namespace Microsoft.Data.Analysis
/// <remarks>If a <seealso cref="DataFrameRow"/> in <paramref name="rows"/> is null, a null value is appended to each column</remarks>
/// <param name="rows">The rows to be appended to this DataFrame </param>
/// <param name="inPlace">If set, appends <paramref name="rows"/> in place. Otherwise, a new DataFrame is returned with the <paramref name="rows"/> appended</param>
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false)
/// <param name="cultureInfo">culture info used when converting the appended values</param>
public DataFrame Append(IEnumerable<DataFrameRow> rows, bool inPlace = false, CultureInfo cultureInfo = null)
{
// Target this DataFrame when inPlace is set; otherwise target the result of Clone().
DataFrame ret = inPlace ? this : Clone();
foreach (DataFrameRow row in rows)
{
// Always append in place to ret: it is already either this DataFrame or the clone made above,
// so no further copy is needed per row.
ret.Append(row, inPlace: true);
ret.Append(row, inPlace: true, cultureInfo: cultureInfo);
}
return ret;
}
@ -501,8 +503,14 @@ namespace Microsoft.Data.Analysis
/// <remarks>If <paramref name="row"/> is null, a null value is appended to each column</remarks>
/// <param name="row"></param>
/// <param name="inPlace">If set, appends a <paramref name="row"/> in place. Otherwise, a new DataFrame is returned with an appended <paramref name="row"/> </param>
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false)
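/// <param name="cultureInfo">culture info used when converting values to the column data type. Defaults to <see cref="CultureInfo.CurrentCulture"/> if not specified</param>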
public DataFrame Append(IEnumerable<object> row = null, bool inPlace = false, CultureInfo cultureInfo = null)
{
if (cultureInfo == null)
{
cultureInfo = CultureInfo.CurrentCulture;
}
DataFrame ret = inPlace ? this : Clone();
IEnumerator<DataFrameColumn> columnEnumerator = ret.Columns.GetEnumerator();
bool columnMoveNext = columnEnumerator.MoveNext();
@ -530,7 +538,7 @@ namespace Microsoft.Data.Analysis
}
if (value != null)
{
value = Convert.ChangeType(value, column.DataType);
value = Convert.ChangeType(value, column.DataType, cultureInfo);
if (value is null)
{

Просмотреть файл

@ -14,6 +14,7 @@ using System.Data.SQLite;
using System.Data.SQLite.EF6;
using Xunit;
using Microsoft.ML.TestFramework.Attributes;
using System.Threading;
namespace Microsoft.Data.Analysis.Tests
{
@ -154,6 +155,42 @@ namespace Microsoft.Data.Analysis.Tests
ReducedRowsTest(csvDf);
}
[Fact]
public void TestReadCsvWithHeaderCultureInfoAndSeparator()
{
    // ';' separates the fields so that ',' is free to act as the decimal separator inside values.
    // The continuation lines of the literal are deliberately unindented: in a verbatim string any
    // leading whitespace would become part of the CSV data.
    string data = @"vendor_id;rate_code;passenger_count;trip_time_in_secs;trip_distance;payment_type;fare_amount
CMT;1;1;1271;3,8;CRD;17,5
CMT;1;1;474;1,5;CRD;8
CMT;1;1;637;1,4;CRD;8,5
CMT;1;1;181;0,6;CSH;4,5";

    // Verifies the frame shape and the comma-decimal values of both float columns on every data row.
    void RegularTest(DataFrame df)
    {
        Assert.Equal(4, df.Rows.Count);
        Assert.Equal(7, df.Columns.Count);

        Assert.Equal(3.8f, (float)df["trip_distance"][0]);
        Assert.Equal(17.5f, (float)df["fare_amount"][0]);

        Assert.Equal(1.5f, (float)df["trip_distance"][1]);
        Assert.Equal(8f, (float)df["fare_amount"][1]);

        Assert.Equal(1.4f, (float)df["trip_distance"][2]);
        Assert.Equal(8.5f, (float)df["fare_amount"][2]);

        Assert.Equal(0.6f, (float)df["trip_distance"][3]);
        Assert.Equal(4.5f, (float)df["fare_amount"][3]);

        VerifyColumnTypes(df);
    }

    // de-DE has ',' as decimal separator
    var cultureInfo = new CultureInfo("de-DE");

    // Exercise both entry points that accept a culture: the Stream overload and the string overload.
    DataFrame df = DataFrame.LoadCsv(GetStream(data), separator: ';', cultureInfo: cultureInfo);
    RegularTest(df);

    DataFrame csvDf = DataFrame.LoadCsvFromString(data, separator: ';', cultureInfo: cultureInfo);
    RegularTest(csvDf);
}
[Fact]
public void TestReadCsvWithHeaderAndDuplicatedColumns_WithoutRenaming()
{