Merge pull request #176 from jayjaywg/master

Refactor DataFrame schema, create data type classes, and add CreateDataFrame to SqlContext
This commit is contained in:
guwang 2015-12-20 22:20:41 -08:00
Parents 333c8a34b6 ebd55c3ce8
Commit 9b75c6fe1b
25 changed files with 751 additions and 460 deletions

View file

@@ -76,6 +76,7 @@
     <Compile Include="Core\StatCounter.cs" />
     <Compile Include="Core\StatusTracker.cs" />
     <Compile Include="Core\StorageLevel.cs" />
+    <Compile Include="Interop\Ipc\JsonSerDe.cs" />
     <Compile Include="Interop\SparkCLREnvironment.cs" />
     <Compile Include="Interop\Ipc\IJvmBridge.cs" />
     <Compile Include="Interop\Ipc\JvmBridge.cs" />
@@ -118,7 +119,7 @@
     <Compile Include="Sql\Functions.cs" />
     <Compile Include="Sql\SaveMode.cs" />
     <Compile Include="Sql\SqlContext.cs" />
-    <Compile Include="Sql\Struct.cs" />
+    <Compile Include="Sql\Types.cs" />
     <Compile Include="Sql\UserDefinedFunction.cs" />
     <Compile Include="Streaming\DStream.cs" />
     <Compile Include="Streaming\PairDStreamFunctions.cs" />

View file

@@ -283,7 +283,7 @@ namespace Microsoft.Spark.CSharp.Core
 /// <returns></returns>
 public RDD<T> Distinct(int numPartitions = 0)
 {
-    return Map(x => new KeyValuePair<T, int>(x, 0)).ReduceByKey((x,y) => x, numPartitions).Map<T>(x => x.Key);
+    return Map(x => new KeyValuePair<T, int>(x, 0)).ReduceByKey((x, y) => x, numPartitions).Map<T>(x => x.Key);
 }

 /// <summary>
@@ -417,7 +417,7 @@ namespace Microsoft.Spark.CSharp.Core
 else
 {
     const double delta = 0.00005;
-    var gamma = - Math.Log(delta) / total;
+    var gamma = -Math.Log(delta) / total;
     return Math.Min(1, fraction + gamma + Math.Sqrt(gamma * gamma + 2 * gamma * fraction));
 }
 }
@@ -811,6 +811,7 @@ namespace Microsoft.Spark.CSharp.Core
 int left = num - items.Count;
 IEnumerable<int> partitions = Enumerable.Range(partsScanned, Math.Min(numPartsToTry, totalParts - partsScanned));
 var mappedRDD = MapPartitionsWithIndex<T>(new TakeHelper<T>(left).Execute);
 int port = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, partitions);
@@ -867,7 +868,7 @@ namespace Microsoft.Spark.CSharp.Core
 {
     return Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))).SubtractByKey
     (
         other.Map<KeyValuePair<T, T>>(v => new KeyValuePair<T, T>(v, default(T))),
         numPartitions
     )
     .Keys();
@@ -1044,7 +1045,7 @@ namespace Microsoft.Spark.CSharp.Core
 /// <returns></returns>
 public IEnumerable<T> ToLocalIterator()
 {
-    foreach(int partition in Enumerable.Range(0, GetNumPartitions()))
+    foreach (int partition in Enumerable.Range(0, GetNumPartitions()))
     {
         var mappedRDD = MapPartitionsWithIndex<T>((pid, iter) => iter);
         int port = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, Enumerable.Range(partition, 1));
@@ -1382,7 +1383,7 @@ namespace Microsoft.Spark.CSharp.Core
 internal KeyValuePair<K, T> Execute(T input)
 {
-    return new KeyValuePair<K,T>(func(input), input);
+    return new KeyValuePair<K, T>(func(input), input);
 }
 }
 [Serializable]
@@ -1429,7 +1430,7 @@ namespace Microsoft.Spark.CSharp.Core
 else if (y.Value)
     return x;
 else
-    return new KeyValuePair<T,bool>(func(x.Key, y.Key), false);
+    return new KeyValuePair<T, bool>(func(x.Key, y.Key), false);
 }
 }
 [Serializable]
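The Distinct implementation touched above maps each element to a key with a dummy value, keeps one representative per key via ReduceByKey, and maps back. A minimal local sketch of the same idea, using plain LINQ rather than SparkCLR (names here are illustrative only, not part of the commit):

using System;
using System.Collections.Generic;
using System.Linq;

static class DistinctSketch
{
    // Emulates Map(x => (x, 0)).ReduceByKey((x, y) => x).Map(x => x.Key) on a local sequence:
    // group by key (the shuffle stand-in), keep one value per group, project the keys back out.
    public static IEnumerable<T> DistinctViaReduceByKey<T>(IEnumerable<T> source)
    {
        return source
            .Select(x => new KeyValuePair<T, int>(x, 0)) // Map
            .GroupBy(kv => kv.Key)                       // shuffle stand-in
            .Select(g => g.First())                      // ReduceByKey((x, y) => x)
            .Select(kv => kv.Key);                       // Map back to values
    }

    static void Main()
    {
        var result = DistinctViaReduceByKey(new[] { 1, 2, 2, 3, 1 });
        Console.WriteLine(string.Join(",", result)); // 1,2,3
    }
}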

View file

@@ -0,0 +1,68 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for full license information.
+
+using System.Linq;
+using Newtonsoft.Json.Linq;
+
+namespace Microsoft.Spark.CSharp.Interop.Ipc
+{
+    /// <summary>
+    /// Json.NET serialization/deserialization helper class.
+    /// </summary>
+    public static class JsonSerDe
+    {
+        // Note: the Scala side uses JSortedObject when parsing JSON, so the properties in a JObject need to be sorted
+
+        /// <summary>
+        /// Extension method to sort the items in a JSON object by key.
+        /// </summary>
+        /// <param name="jObject"></param>
+        /// <returns></returns>
+        public static JObject SortProperties(this JObject jObject)
+        {
+            JObject sortedJObject = new JObject();
+            foreach (var property in jObject.Properties().OrderBy(p => p.Name))
+            {
+                if (property.Value is JObject)
+                {
+                    var propJObject = property.Value as JObject;
+                    sortedJObject.Add(property.Name, propJObject.SortProperties());
+                }
+                else if (property.Value is JArray)
+                {
+                    var propJArray = property.Value as JArray;
+                    sortedJObject.Add(property.Name, propJArray.SortProperties());
+                }
+                else
+                {
+                    sortedJObject.Add(property.Name, property.Value);
+                }
+            }
+            return sortedJObject;
+        }
+
+        /// <summary>
+        /// Extension method to sort the items in a JSON array by key.
+        /// </summary>
+        public static JArray SortProperties(this JArray jArray)
+        {
+            JArray sortedJArray = new JArray();
+            if (jArray.Count == 0) return jArray;
+
+            foreach (var item in jArray)
+            {
+                if (item is JObject)
+                {
+                    var sortedItem = ((JObject)item).SortProperties();
+                    sortedJArray.Add(sortedItem);
+                }
+                else if (item is JArray)
+                {
+                    var sortedItem = ((JArray)item).SortProperties();
+                    sortedJArray.Add(sortedItem);
+                }
+            }
+            return sortedJArray;
+        }
+    }
+}
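To see what the new helper does, a small hedged sketch of a round trip through SortProperties (the input JSON is made up for illustration):

using System;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Newtonsoft.Json.Linq;

static class JsonSerDeSketch
{
    static void Main()
    {
        // Property order deliberately differs from key order, including in the nested object
        var jObject = JObject.Parse(@"{""b"": 1, ""a"": {""d"": 2, ""c"": 3}}");

        // SortProperties recurses into nested objects and arrays
        JObject sorted = jObject.SortProperties();

        Console.WriteLine(sorted.ToString(Newtonsoft.Json.Formatting.None));
        // {"a":{"c":3,"d":2},"b":1}
    }
}

This is what lets the C# side emit schema JSON in the canonical key order that Scala's JSortedObject-based parser expects.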

View file

@@ -17,8 +17,6 @@ namespace Microsoft.Spark.CSharp.Proxy
     ISparkContextProxy SparkContextProxy { get; }
     ISparkConfProxy CreateSparkConf(bool loadDefaults = true);
     ISparkContextProxy CreateSparkContext(ISparkConfProxy conf);
-    IStructFieldProxy CreateStructField(string name, string dataType, bool isNullable);
-    IStructTypeProxy CreateStructType(List<StructField> fields);
     IDStreamProxy CreateCSharpDStream(IDStreamProxy jdstream, byte[] func, string deserializer);
     IDStreamProxy CreateCSharpTransformed2DStream(IDStreamProxy jdstream, IDStreamProxy jother, byte[] func, string deserializer, string deserializerOther);
     IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string deserializer);

View file

@@ -13,6 +13,7 @@ namespace Microsoft.Spark.CSharp.Proxy
 {
     internal interface ISqlContextProxy
     {
+        IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy);
         IDataFrameProxy ReadDataFrame(string path, StructType schema, Dictionary<string, string> options);
         IDataFrameProxy JsonFile(string path);
         IDataFrameProxy TextFile(string path, StructType schema, string delimiter);

View file

@@ -6,7 +6,6 @@ using System.Collections.Generic;
 using System.Linq;
 using Microsoft.Spark.CSharp.Core;
 using Microsoft.Spark.CSharp.Interop.Ipc;
-using Microsoft.Spark.CSharp.Sql;

 namespace Microsoft.Spark.CSharp.Proxy.Ipc
 {
@@ -77,7 +76,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
     return
         SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
             jvmDataFrameReference, "showString",
-            new object[] { numberOfRows , truncate }).ToString();
+            new object[] { numberOfRows, truncate }).ToString();
 }

 public bool IsLocal()

View file

@@ -62,33 +62,6 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
     return sparkContextProxy;
 }

-public IStructFieldProxy CreateStructField(string name, string dataType, bool isNullable)
-{
-    return new StructFieldIpcProxy(
-        new JvmObjectReference(
-            JvmBridge.CallStaticJavaMethod(
-                "org.apache.spark.sql.api.csharp.SQLUtils", "createStructField",
-                new object[] { name, dataType, isNullable }).ToString()
-        )
-    );
-}
-
-public IStructTypeProxy CreateStructType(List<StructField> fields)
-{
-    var fieldsReference = fields.Select(s => (s.StructFieldProxy as StructFieldIpcProxy).JvmStructFieldReference).ToList().Cast<JvmObjectReference>();
-    var seq =
-        new JvmObjectReference(
-            JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
-                "toSeq", new object[] { fieldsReference }).ToString());
-    return new StructTypeIpcProxy(
-        new JvmObjectReference(
-            JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createStructType", new object[] { seq }).ToString()
-        )
-    );
-}
-
 public IDStreamProxy CreateCSharpDStream(IDStreamProxy jdstream, byte[] func, string deserializer)
 {
     var jvmDStreamReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.csharp.CSharpDStream",

View file

@@ -116,7 +116,6 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
     return new RDDIpcProxy(jvmRddReference);
 }

-//TODO - this implementation is slow. Replace with call to createRDDFromArray() in CSharpRDD
 public IRDDProxy Parallelize(IEnumerable<byte[]> values, int numSlices)
 {
     var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.csharp.CSharpRDD", "createRDDFromArray", new object[] { jvmSparkContextReference, values, numSlices }));

View file

@@ -21,6 +21,17 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
     this.jvmSqlContextReference = jvmSqlContextReference;
 }

+public IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy)
+{
+    var rdd = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "byteArrayRDDToAnyArrayRDD",
+        new object[] { (rddProxy as RDDIpcProxy).JvmRddReference }).ToString());
+
+    return new DataFrameIpcProxy(
+        new JvmObjectReference(
+            SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "applySchemaToPythonRDD",
+                new object[] { rdd, (structTypeProxy as StructTypeIpcProxy).JvmStructTypeReference }).ToString()), this);
+}
+
 public IDataFrameProxy ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
 {
     //TODO parameter Dictionary<string, string> options is not used right now - it is meant to be passed on to data sources
@@ -44,7 +55,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
     new JvmObjectReference(
         SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
             "org.apache.spark.sql.api.csharp.SQLUtils", "loadTextFile",
-            new object[] {jvmSqlContextReference, path, delimiter, (schema.StructTypeProxy as StructTypeIpcProxy).JvmStructTypeReference}).ToString()
+            new object[] { jvmSqlContextReference, path, delimiter, schema.Json }).ToString()
     ), this
 );
 }

View file

@@ -22,7 +22,7 @@ namespace Microsoft.Spark.CSharp.Sql
 private readonly IDataFrameProxy dataFrameProxy;
 [NonSerialized]
 private readonly SparkContext sparkContext;
-[NonSerialized]
 private StructType schema;
 [NonSerialized]
 private RDD<Row> rdd;
@@ -40,7 +40,7 @@ namespace Microsoft.Spark.CSharp.Sql
 if (rdd == null)
 {
     rddProxy = dataFrameProxy.JavaToCSharp();
     rdd = new RDD<Row>(rddProxy, sparkContext, SerializedMode.Row);
 }
 return rdd;
 }
@@ -137,7 +137,7 @@ namespace Microsoft.Spark.CSharp.Sql
 /// </summary>
 public void ShowSchema()
 {
-    List<string> nameTypeList = Schema.Fields.Select(structField => string.Format("{0}:{1}", structField.Name, structField.DataType.SimpleString())).ToList();
+    var nameTypeList = Schema.Fields.Select(structField => structField.SimpleString);
     Console.WriteLine(string.Join(", ", nameTypeList));
 }
@@ -145,18 +145,18 @@ namespace Microsoft.Spark.CSharp.Sql
 /// Returns all the Rows in this DataFrame
 /// </summary>
 public IEnumerable<Row> Collect()
 {
     int port = RddProxy.CollectAndServe();
     return Rdd.Collect(port).Cast<Row>();
 }

 /// <summary>
-/// Converts the DataFrame to RDD of byte[]
+/// Converts the DataFrame to RDD of Row
 /// </summary>
 /// <returns>resulting RDD</returns>
-public RDD<byte[]> ToRDD() //RDD created using byte representation of GenericRow objects
+public RDD<Row> ToRDD() //RDD created using byte representation of Row objects
 {
-    return new RDD<byte[]>(dataFrameProxy.ToRDD(), sparkContext);
+    return Rdd;
 }

 /// <summary>
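ToRDD now simply surfaces the lazily created Rdd property, so callers get deserialized Row objects instead of raw bytes. A hedged before/after sketch (df stands for any DataFrame instance; not code from the commit):

// Before this commit: RDD<byte[]> — callers dealt with the pickled byte representation.
// After this commit:  RDD<Row> — the same object as df.Rdd, read back in SerializedMode.Row.
RDD<Row> rows = df.ToRDD();
Console.WriteLine(rows.Count());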

View file

@@ -2,13 +2,9 @@
 // Licensed under the MIT license. See LICENSE file in the project root for full license information.

 using System;
+using System.Collections;
 using System.Collections.Generic;
 using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
-using Newtonsoft.Json;
-using Newtonsoft.Json.Linq;
 using Microsoft.Spark.CSharp.Services;

 namespace Microsoft.Spark.CSharp.Sql
@@ -31,24 +27,24 @@ namespace Microsoft.Spark.CSharp.Sql
 /// <summary>
 /// Schema for the row.
 /// </summary>
-public abstract RowSchema GetSchema();
+public abstract StructType GetSchema();

 /// <summary>
 /// Returns the value at position i.
 /// </summary>
-public abstract object Get(int i);
+public abstract dynamic Get(int i);

 /// <summary>
 /// Returns the value of a given columnName.
 /// </summary>
-public abstract object Get(string columnName);
+public abstract dynamic Get(string columnName);

 /// <summary>
 /// Returns the value at position i, the return value will be cast to type T.
 /// </summary>
 public T GetAs<T>(int i)
 {
-    object o = Get(i);
+    dynamic o = Get(i);
     try
     {
         T result = (T)o;
@@ -66,7 +62,7 @@ namespace Microsoft.Spark.CSharp.Sql
 /// </summary>
 public T GetAs<T>(string columnName)
 {
-    object o = Get(columnName);
+    dynamic o = Get(columnName);
     try
     {
         T result = (T)o;
-/// <summary>
-/// Schema of Row
-/// </summary>
-[Serializable]
-public class RowSchema
-{
-    public string type;
-    public List<ColumnSchema> columns;
-
-    private readonly Dictionary<string, int> columnName2Index = new Dictionary<string, int>();
-
-    public RowSchema(string type)
-    {
-        this.type = type;
-        this.columns = new List<ColumnSchema>();
-    }
-
-    public RowSchema(string type, List<ColumnSchema> cols)
-    {
-        int index = 0;
-        foreach (var col in cols)
-        {
-            string columnName = col.name;
-            //TODO - investigate the issue and uncomment the following checks
-            /*
-             * UDFs produce empty column name. Commenting out the following code at the time of upgrading to 1.5.2
-             */
-            //if (string.IsNullOrEmpty(columnName))
-            //{
-            //    throw new Exception(string.Format("Null column name at pos: {0}", index));
-            //}
-            //if (columnName2Index.ContainsKey(columnName))
-            //{
-            //    throw new Exception(string.Format("duplicate column name ({0}) in pos ({1}) and ({2})",
-            //        columnName, columnName2Index[columnName], index));
-            //}
-            columnName2Index[columnName] = index;
-            index++;
-        }
-        this.type = type;
-        this.columns = cols;
-    }
-
-    internal int GetIndexByColumnName(string ColumnName)
-    {
-        if (!columnName2Index.ContainsKey(ColumnName))
-        {
-            throw new Exception(string.Format("unknown ColumnName: {0}", ColumnName));
-        }
-        return columnName2Index[ColumnName];
-    }
-
-    public override string ToString()
-    {
-        string result;
-        if (columns.Any())
-        {
-            List<string> cols = new List<string>();
-            foreach (var col in columns)
-            {
-                cols.Add(col.ToString());
-            }
-            result = "{" +
-                string.Format("type: {0}, columns: [{1}]", type, string.Join(", ", cols.ToArray())) +
-                "}";
-        }
-        else
-        {
-            result = type;
-        }
-        return result;
-    }
-
-    internal static RowSchema ParseRowSchemaFromJson(string json)
-    {
-        JObject joType = JObject.Parse(json);
-        string type = joType["type"].ToString();
-
-        List<ColumnSchema> columns = new List<ColumnSchema>();
-        List<JToken> jtFields = joType["fields"].Children().ToList();
-        foreach (JToken jtField in jtFields)
-        {
-            ColumnSchema col = ColumnSchema.ParseColumnSchemaFromJson(jtField.ToString());
-            columns.Add(col);
-        }
-
-        return new RowSchema(type, columns);
-    }
-}
-
-/// <summary>
-/// Schema for column
-/// </summary>
-[Serializable]
-public class ColumnSchema
-{
-    public string name;
-    public RowSchema type;
-    public bool nullable;
-
-    public override string ToString()
-    {
-        string str = string.Format("name: {0}, type: {1}, nullable: {2}", name, type, nullable);
-        return "{" + str + "}";
-    }
-
-    internal static ColumnSchema ParseColumnSchemaFromJson(string json)
-    {
-        ColumnSchema col = new ColumnSchema();
-        JObject joField = JObject.Parse(json);
-        col.name = joField["name"].ToString();
-        col.nullable = (bool)(joField["nullable"]);
-
-        JToken jtType = joField["type"];
-        if (jtType.Type == JTokenType.String)
-        {
-            col.type = new RowSchema(joField["type"].ToString());
-        }
-        else
-        {
-            col.type = RowSchema.ParseRowSchemaFromJson(joField["type"].ToString());
-        }
-
-        return col;
-    }
-}
 [Serializable]
 internal class RowImpl : Row
 {
-    private readonly RowSchema schema;
-    private readonly object[] values;
+    private readonly StructType schema;
+
+    public dynamic[] Values { get { return values; } }
+    private readonly dynamic[] values;
     private readonly int columnCount;
@@ -229,17 +92,15 @@ namespace Microsoft.Spark.CSharp.Sql
         return Get(index);
     }
 }

-internal RowImpl(object data, RowSchema schema)
+internal RowImpl(dynamic data, StructType schema)
 {
-    if (data is object[])
+    if (data is dynamic[])
     {
-        values = data as object[];
+        values = data as dynamic[];
     }
-    else if (data is List<object>)
+    else if (data is List<dynamic>)
     {
-        values = (data as List<object>).ToArray();
+        values = (data as List<dynamic>).ToArray();
     }
     else
     {
@@ -249,7 +110,7 @@ namespace Microsoft.Spark.CSharp.Sql
     this.schema = schema;
     columnCount = values.Count();
-    int schemaColumnCount = this.schema.columns.Count();
+    int schemaColumnCount = this.schema.Fields.Count();
     if (columnCount != schemaColumnCount)
     {
         throw new Exception(string.Format("column count inferred from data ({0}) and schema ({1}) mismatch", columnCount, schemaColumnCount));
@@ -263,12 +124,12 @@ namespace Microsoft.Spark.CSharp.Sql
     return columnCount;
 }

-public override RowSchema GetSchema()
+public override StructType GetSchema()
 {
     return schema;
 }

-public override object Get(int i)
+public override dynamic Get(int i)
 {
     if (i >= columnCount)
     {
@@ -278,9 +139,9 @@ namespace Microsoft.Spark.CSharp.Sql
     return values[i];
 }

-public override object Get(string columnName)
+public override dynamic Get(string columnName)
 {
-    int index = schema.GetIndexByColumnName(columnName);
+    int index = schema.Fields.FindIndex(f => f.Name == columnName); // case sensitive
     return Get(index);
 }
@@ -305,21 +166,69 @@ namespace Microsoft.Spark.CSharp.Sql
 private void Initialize()
 {
     int index = 0;
-    foreach (var col in schema.columns)
+    foreach (var field in schema.Fields)
     {
-        if (col.type.columns.Any()) // this column itself is a sub-row
-        {
-            object value = values[index];
-            if (value != null)
-            {
-                RowImpl subRow = new RowImpl(values[index], col.type);
-                values[index] = subRow;
-            }
-        }
+        if (field.DataType is ArrayType)
+        {
+            Func<DataType, int, StructType> convertArrayTypeToStructTypeFunc = (dataType, length) =>
+            {
+                StructField[] fields = new StructField[length];
+                for (int i = 0; i < length; i++)
+                {
+                    fields[i] = new StructField(string.Format("_array_{0}", i), dataType);
+                }
+                return new StructType(fields);
+            };
+            var elementType = (field.DataType as ArrayType).ElementType;
+
+            // Note: when creating an object from JSON, PySpark converts a JSON array to a Python list (https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/types.py, _create_cls(dataType)),
+            // then the Pyrolite unpickler converts a Python list to a C# ArrayList (https://github.com/irmen/Pyrolite/blob/v4.10/README.txt), so values[index] should be of type ArrayList.
+            // In case Python changes its implementation (i.e. the value is not an ArrayList), try casting to object[], because the Pyrolite unpickler converts a Python tuple to a C# object[].
+            object[] valueOfArray = values[index] is ArrayList ? (values[index] as ArrayList).ToArray() : values[index] as object[];
+            if (valueOfArray == null)
+            {
+                throw new ArgumentException("Cannot parse data of ArrayType: " + field.Name);
+            }
+
+            values[index] = new RowImpl(valueOfArray, elementType as StructType ?? convertArrayTypeToStructTypeFunc(elementType, valueOfArray.Length)).values;
+        }
+        else if (field.DataType is MapType)
+        {
+            //TODO
+            throw new NotImplementedException();
+        }
+        else if (field.DataType is StructType)
+        {
+            dynamic value = values[index];
+            if (value != null)
+            {
+                var subRow = new RowImpl(values[index], field.DataType as StructType);
+                values[index] = subRow;
+            }
+        }
+        else if (field.DataType is DecimalType)
+        {
+            //TODO
+            throw new NotImplementedException();
+        }
+        else if (field.DataType is DateType)
+        {
+            //TODO
+            throw new NotImplementedException();
+        }
+        else if (field.DataType is StringType)
+        {
+            if (values[index] != null) values[index] = values[index].ToString();
+        }
+        else
+        {
+            values[index] = values[index];
+        }
         index++;
     }
 }
 }
 }
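With StructType now doubling as the row schema, field access on collected rows can be driven entirely by the schema. A small hedged sketch of how the reworked Row API reads (the row stands for any Row obtained from DataFrame.Collect(); the "name"/"age" columns are assumptions for illustration):

using System;
using Microsoft.Spark.CSharp.Sql;

static class RowAccessSketch
{
    // Prints every field of a row using its schema, then reads a typed value.
    static void Dump(Row row)
    {
        StructType schema = row.GetSchema();
        foreach (var field in schema.Fields)
        {
            // Get(...) now returns dynamic, so no explicit cast is needed here
            Console.WriteLine("{0} ({1}) = {2}", field.Name, field.DataType.SimpleString, row.Get(field.Name));
        }

        string name = row.GetAs<string>("name"); // the cast is performed inside GetAs<T>
        Console.WriteLine("name: {0}", name);
    }
}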

View file

@@ -1,11 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 // Licensed under the MIT license. See LICENSE file in the project root for full license information.

-using System;
-using System.Collections.Generic;
-using System.Linq;
-using System.Text;
-using System.Threading.Tasks;
 using Razorvine.Pickle;

 namespace Microsoft.Spark.CSharp.Sql
@@ -65,7 +60,8 @@ namespace Microsoft.Spark.CSharp.Sql
 /// <returns></returns>
 public Row GetRow()
 {
-    var row = new RowImpl(GetValues(Values), RowSchema.ParseRowSchemaFromJson(Schema));
+    var schema = DataType.ParseDataTypeFromJson(Schema) as StructType;
+    var row = new RowImpl(GetValues(Values), schema);

     //Resetting schema here so that rows from multiple DataFrames can be processed in the same AppDomain
     //next row will have schema - so resetting is fine
View file

@@ -8,6 +8,7 @@ using System.Text;
 using System.Threading.Tasks;
 using Microsoft.Spark.CSharp.Core;
 using Microsoft.Spark.CSharp.Interop;
+using Microsoft.Spark.CSharp.Interop.Ipc;
 using Microsoft.Spark.CSharp.Proxy;

 namespace Microsoft.Spark.CSharp.Sql
@@ -39,9 +40,16 @@ namespace Microsoft.Spark.CSharp.Sql
     return new DataFrame(sqlContextProxy.ReadDataFrame(path, schema, options), sparkContext);
 }

-public DataFrame CreateDataFrame(RDD<byte[]> rdd, StructType schema)
+public DataFrame CreateDataFrame(RDD<object[]> rdd, StructType schema)
 {
-    throw new NotImplementedException();
+    // Note: the RDD is mapped through an identity function so that it gets pickled to RDD<byte[]> in CSharpWorker.
+    // sqlContextProxy.CreateDataFrame() below calls byteArrayRDDToAnyArrayRDD() in SQLUtils.scala, which only accepts RDDs of type RDD[Array[Byte]].
+    // byteArrayRDDToAnyArrayRDD() in turn calls SerDeUtil.pythonToJava(), which wraps a mapPartitions and
+    // is not executed until CSharpWorker finishes pickling to RDD[Array[Byte]].
+    var rddRow = rdd.Map(r => r);
+    rddRow.serializedMode = SerializedMode.Row;
+
+    return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext);
 }

 /// <summary>
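The commit's own end-to-end usage appears in DFCreateDataFrameSample further down; as a minimal hedged sketch, the new API is called like this (sc and sqlContext stand for an existing SparkContext and SqlContext; the column names are illustrative):

// Minimal sketch, assuming an existing SparkContext `sc` and SqlContext `sqlContext`
var schema = new StructType(new List<StructField>
{
    new StructField("id", new StringType()),
    new StructField("age", new IntegerType())
});

// each object[] is one row, with values in schema field order
RDD<object[]> rdd = sc.Parallelize(new List<object[]>
{
    new object[] { "123", 43 },
    new object[] { "456", 34 }
});

DataFrame df = sqlContext.CreateDataFrame(rdd, schema);
df.ShowSchema(); // id:string, age:integer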

View file

@@ -1,135 +0,0 @@
-// Copyright (c) Microsoft. All rights reserved.
-// Licensed under the MIT license. See LICENSE file in the project root for full license information.
-
-using System.Collections.Generic;
-using System.Linq;
-using Microsoft.Spark.CSharp.Interop;
-using Microsoft.Spark.CSharp.Proxy;
-using Microsoft.Spark.CSharp.Proxy.Ipc;
-
-namespace Microsoft.Spark.CSharp.Sql
-{
-    /// <summary>
-    /// Schema of DataFrame
-    /// </summary>
-    public class StructType
-    {
-        private readonly IStructTypeProxy structTypeProxy;
-
-        internal IStructTypeProxy StructTypeProxy
-        {
-            get
-            {
-                return structTypeProxy;
-            }
-        }
-
-        public List<StructField> Fields //TODO - avoid calling method everytime
-        {
-            get
-            {
-                var structTypeFieldJvmObjectReferenceList =
-                    structTypeProxy.GetStructTypeFields();
-                var structFieldList = new List<StructField>(structTypeFieldJvmObjectReferenceList.Count);
-                structFieldList.AddRange(
-                    structTypeFieldJvmObjectReferenceList.Select(
-                        structTypeFieldJvmObjectReference => new StructField(structTypeFieldJvmObjectReference)));
-                return structFieldList;
-            }
-        }
-
-        public string ToJson()
-        {
-            return structTypeProxy.ToJson();
-        }
-
-        internal StructType(IStructTypeProxy structTypeProxy)
-        {
-            this.structTypeProxy = structTypeProxy;
-        }
-
-        public static StructType CreateStructType(List<StructField> structFields)
-        {
-            return new StructType(SparkCLREnvironment.SparkCLRProxy.CreateStructType(structFields));
-        }
-    }
-
-    /// <summary>
-    /// Schema for DataFrame column
-    /// </summary>
-    public class StructField
-    {
-        private readonly IStructFieldProxy structFieldProxy;
-
-        internal IStructFieldProxy StructFieldProxy
-        {
-            get
-            {
-                return structFieldProxy;
-            }
-        }
-
-        public string Name
-        {
-            get
-            {
-                return structFieldProxy.GetStructFieldName();
-            }
-        }
-
-        public DataType DataType
-        {
-            get
-            {
-                return new DataType(structFieldProxy.GetStructFieldDataType());
-            }
-        }
-
-        public bool IsNullable
-        {
-            get
-            {
-                return structFieldProxy.GetStructFieldIsNullable();
-            }
-        }
-
-        internal StructField(IStructFieldProxy strucFieldProxy)
-        {
-            structFieldProxy = strucFieldProxy;
-        }
-
-        public static StructField CreateStructField(string name, string dataType, bool isNullable)
-        {
-            return new StructField(SparkCLREnvironment.SparkCLRProxy.CreateStructField(name, dataType, isNullable));
-        }
-    }
-
-    public class DataType
-    {
-        private readonly IStructDataTypeProxy structDataTypeProxy;
-
-        internal IStructDataTypeProxy StructDataTypeProxy
-        {
-            get
-            {
-                return structDataTypeProxy;
-            }
-        }
-
-        internal DataType(IStructDataTypeProxy structDataTypeProxy)
-        {
-            this.structDataTypeProxy = structDataTypeProxy;
-        }
-
-        public override string ToString()
-        {
-            return structDataTypeProxy.GetDataTypeString();
-        }
-
-        public string SimpleString()
-        {
-            return structDataTypeProxy.GetDataTypeSimpleString();
-        }
-    }
-}

View file

@@ -0,0 +1,360 @@
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE file in the project root for full license information.
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Reflection;
+using System.Text.RegularExpressions;
+using Microsoft.Spark.CSharp.Interop.Ipc;
+using Microsoft.Spark.CSharp.Proxy;
+using Microsoft.Spark.CSharp.Proxy.Ipc;
+using Newtonsoft.Json;
+using Newtonsoft.Json.Linq;
+
+namespace Microsoft.Spark.CSharp.Sql
+{
+    [Serializable]
+    public abstract class DataType
+    {
+        /// <summary>
+        /// Trims the trailing "Type" from the class name and lower-cases it, to align with Scala.
+        /// </summary>
+        public string TypeName
+        {
+            get { return NormalizeTypeName(GetType().Name); }
+        }
+
+        /// <summary>
+        /// Returns TypeName by default; subclasses can override it.
+        /// </summary>
+        public virtual string SimpleString
+        {
+            get { return TypeName; }
+        }
+
+        /// <summary>
+        /// Returns only the type name by default; subclasses can override it.
+        /// </summary>
+        internal virtual object JsonValue { get { return TypeName; } }
+
+        public string Json
+        {
+            get
+            {
+                var jObject = JsonValue is JObject ? ((JObject)JsonValue).SortProperties() : JsonValue;
+                return JsonConvert.SerializeObject(jObject, Formatting.None);
+            }
+        }
+
+        public static DataType ParseDataTypeFromJson(string json)
+        {
+            return ParseDataTypeFromJson(JToken.Parse(json));
+        }
+
+        protected static DataType ParseDataTypeFromJson(JToken json)
+        {
+            if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...}
+            {
+                JToken type;
+                var typeJObject = (JObject)json;
+                if (typeJObject.TryGetValue("type", out type))
+                {
+                    Type complexType;
+                    if ((complexType = ComplexTypes.FirstOrDefault(ct => NormalizeTypeName(ct.Name) == type.ToString())) != default(Type))
+                    {
+                        return ((ComplexType)Activator.CreateInstance(complexType, BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance
+                            , null, new object[] { typeJObject }, null)); // create a new instance of the ComplexType
+                    }
+                    if (type.ToString() == "udt")
+                    {
+                        // TODO
+                        throw new NotImplementedException();
+                    }
+                }
+                throw new ArgumentException(string.Format("Could not parse data type: {0}", type));
+            }
+            else // {name: age, type: bigint,...} // TODO: validate JTokenTypes other than Object
+            {
+                return ParseAtomicType(json);
+            }
+
+            throw new ArgumentException(string.Format("Could not parse data type: {0}", json));
+        }
+
+        private static AtomicType ParseAtomicType(JToken type)
+        {
+            Type atomicType;
+            if ((atomicType = AtomicTypes.FirstOrDefault(at => NormalizeTypeName(at.Name) == type.ToString())) != default(Type))
+            {
+                return (AtomicType)Activator.CreateInstance(atomicType); // create a new instance of the AtomicType
+            }
+
+            Match fixedDecimal = DecimalType.FixedDecimal.Match(type.ToString());
+            if (fixedDecimal.Success)
+            {
+                return new DecimalType(int.Parse(fixedDecimal.Groups[1].Value), int.Parse(fixedDecimal.Groups[2].Value));
+            }
+
+            throw new ArgumentException(string.Format("Could not parse data type: {0}", type));
+        }
+
+        [NonSerialized]
+        private static readonly Type[] AtomicTypes = typeof(AtomicType).Assembly.GetTypes().Where(type =>
+            type.IsSubclassOf(typeof(AtomicType))).ToArray();
+
+        [NonSerialized]
+        private static readonly Type[] ComplexTypes = typeof(ComplexType).Assembly.GetTypes().Where(type =>
+            type.IsSubclassOf(typeof(ComplexType))).ToArray();
+
+        [NonSerialized]
+        private static readonly Func<string, string> NormalizeTypeName = s => s.Substring(0, s.Length - 4).ToLower(); // trim the "Type" at the end of the type name
+    }
+    [Serializable]
+    public class AtomicType : DataType
+    {
+    }
+
+    [Serializable]
+    public abstract class ComplexType : DataType
+    {
+        public abstract DataType FromJson(JObject json);
+        public DataType FromJson(string json)
+        {
+            return FromJson(JObject.Parse(json));
+        }
+    }
+
+    [Serializable]
+    public class NullType : AtomicType { }
+
+    [Serializable]
+    public class StringType : AtomicType { }
+
+    [Serializable]
+    public class BinaryType : AtomicType { }
+
+    [Serializable]
+    public class BooleanType : AtomicType { }
+
+    [Serializable]
+    public class DateType : AtomicType { }
+
+    [Serializable]
+    public class TimestampType : AtomicType { }
+
+    [Serializable]
+    public class DoubleType : AtomicType { }
+
+    [Serializable]
+    public class FloatType : AtomicType { }
+
+    [Serializable]
+    public class ByteType : AtomicType { }
+
+    [Serializable]
+    public class IntegerType : AtomicType { }
+
+    [Serializable]
+    public class LongType : AtomicType { }
+
+    [Serializable]
+    public class ShortType : AtomicType { }
+
+    [Serializable]
+    public class DecimalType : AtomicType
+    {
+        public static Regex FixedDecimal = new Regex(@"decimal\((\d+),\s(\d+)\)");
+        private int? precision, scale;
+
+        public DecimalType(int? precision = null, int? scale = null)
+        {
+            this.precision = precision;
+            this.scale = scale;
+        }
+
+        internal override object JsonValue
+        {
+            get { throw new NotImplementedException(); }
+        }
+
+        public DataType FromJson(JObject json)
+        {
+            throw new NotImplementedException();
+        }
+    }
+
+    [Serializable]
+    public class ArrayType : ComplexType
+    {
+        public DataType ElementType { get { return elementType; } }
+        public bool ContainsNull { get { return containsNull; } }
+
+        public ArrayType(DataType elementType, bool containsNull = true)
+        {
+            this.elementType = elementType;
+            this.containsNull = containsNull;
+        }
+
+        internal ArrayType(JObject json)
+        {
+            FromJson(json);
+        }
+
+        public override string SimpleString
+        {
+            get { return string.Format("array<{0}>", elementType.SimpleString); }
+        }
+
+        internal override object JsonValue
+        {
+            get
+            {
+                return new JObject(
+                    new JProperty("type", TypeName),
+                    new JProperty("elementType", elementType.JsonValue),
+                    new JProperty("containsNull", containsNull));
+            }
+        }
+
+        public override sealed DataType FromJson(JObject json)
+        {
+            elementType = ParseDataTypeFromJson(json["elementType"]);
+            containsNull = (bool)json["containsNull"];
+            return this;
+        }
+
+        private DataType elementType;
+        private bool containsNull;
+    }
+
+    [Serializable]
+    public class MapType : ComplexType
+    {
+        internal override object JsonValue
+        {
+            get { throw new NotImplementedException(); }
+        }
+
+        public override DataType FromJson(JObject json)
+        {
+            throw new NotImplementedException();
+        }
+    }
+
+    [Serializable]
+    public class StructField : ComplexType
+    {
+        public string Name { get { return name; } }
+        public DataType DataType { get { return dataType; } }
+        public bool IsNullable { get { return isNullable; } }
+        public JObject Metadata { get { return metadata; } }
+
+        public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null)
+        {
+            this.name = name;
+            this.dataType = dataType;
+            this.isNullable = isNullable;
+            this.metadata = metadata ?? new JObject();
+        }
+
+        internal StructField(JObject json)
+        {
+            FromJson(json);
+        }
+
+        public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } }
+
+        internal override object JsonValue
+        {
+            get
+            {
+                return new JObject(
+                    new JProperty("name", name),
+                    new JProperty("type", dataType.JsonValue),
+                    new JProperty("nullable", isNullable),
+                    new JProperty("metadata", metadata));
+            }
+        }
+
+        public override sealed DataType FromJson(JObject json)
+        {
+            name = json["name"].ToString();
+            dataType = ParseDataTypeFromJson(json["type"]);
+            isNullable = (bool)json["nullable"];
+            metadata = (JObject)json["metadata"];
+            return this;
+        }
+
+        private string name;
+        private DataType dataType;
+        private bool isNullable;
+        [NonSerialized]
+        private JObject metadata;
+    }
+
+    [Serializable]
+    public class StructType : ComplexType
+    {
+        public List<StructField> Fields { get { return fields; } }
+
+        internal IStructTypeProxy StructTypeProxy
+        {
+            get
+            {
+                return structTypeProxy ??
+                    new StructTypeIpcProxy(
+                        new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSchema",
+                            new object[] { Json }).ToString()));
+            }
+        }
+
+        public StructType(IEnumerable<StructField> fields)
+        {
+            this.fields = fields.ToList();
+        }
+
+        internal StructType(JObject json)
+        {
+            FromJson(json);
+        }
+
+        internal StructType(IStructTypeProxy structTypeProxy)
+        {
+            this.structTypeProxy = structTypeProxy;
+            var jsonSchema = (structTypeProxy as StructTypeIpcProxy).ToJson();
+            FromJson(jsonSchema);
+        }
+
+        public override string SimpleString
+        {
+            get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); }
+        }
+
+        internal override object JsonValue
+        {
+            get
+            {
+                return new JObject(
+                    new JProperty("type", TypeName),
+                    new JProperty("fields", fields.Select(f => f.JsonValue).ToArray()));
+            }
+        }
+
+        public override sealed DataType FromJson(JObject json)
+        {
+            var fieldsJObjects = json["fields"].Select(f => (JObject)f);
+            fields = fieldsJObjects.Select(fieldJObject => new StructField(fieldJObject)).ToList();
+            return this;
+        }
+
+        [NonSerialized]
+        private readonly IStructTypeProxy structTypeProxy;
+        private List<StructField> fields;
+    }
+}
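Since a schema now serializes to canonical JSON (the sorted form that SQLUtils.createSchema consumes on the JVM side) and parses back via DataType.ParseDataTypeFromJson, a round trip is an easy way to see the new classes at work. A small hedged sketch, with field names made up for illustration:

using System;
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Sql;

static class TypesSketch
{
    static void Main()
    {
        var schema = new StructType(new List<StructField>
        {
            new StructField("name", new StringType(), false),
            new StructField("scores", new ArrayType(new DoubleType()))
        });

        // SimpleString mirrors Scala's simpleString, e.g. struct<name:string,scores:array<double>>
        Console.WriteLine(schema.SimpleString);

        // Json emits the sorted-property JSON that the Scala side consumes
        string json = schema.Json;
        Console.WriteLine(json);

        // Parsing the JSON back yields an equivalent schema with a stable canonical form
        var parsed = DataType.ParseDataTypeFromJson(json) as StructType;
        Console.WriteLine(parsed.Json == json); // True
    }
}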

View file

@@ -40,6 +40,9 @@
     <HintPath>..\packages\Moq.4.2.1510.2205\lib\net40\Moq.dll</HintPath>
     <Private>True</Private>
 </Reference>
+<Reference Include="Newtonsoft.Json">
+    <HintPath>..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll</HintPath>
+</Reference>
 <Reference Include="nunit.framework, Version=3.0.5813.39031, Culture=neutral, PublicKeyToken=2638cd05610744eb, processorArchitecture=MSIL">
     <HintPath>..\packages\NUnit.3.0.1\lib\net45\nunit.framework.dll</HintPath>
     <Private>True</Private>
@@ -127,4 +130,4 @@
 <Target Name="AfterBuild">
 </Target>
 -->
 </Project>

View file

@@ -422,7 +422,7 @@ namespace AdapterTest
     "123",
     "Bill"
 },
-RowSchema.ParseRowSchemaFromJson(jsonSchema))
+DataType.ParseDataTypeFromJson(jsonSchema) as StructType)
 };

 mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(new MockRddProxy(rows));
@@ -435,14 +435,6 @@ namespace AdapterTest
     Assert.IsNotNull(rdd);
     mockDataFrameProxy.Verify(m => m.JavaToCSharp(), Times.Once);
-
-    mockDataFrameProxy.Reset();
-    mockStructTypeProxy.Reset();
-    rdd = dataFrame.Rdd;
-    Assert.IsNotNull(rdd);
-    mockDataFrameProxy.Verify(m => m.JavaToCSharp(), Times.Never);
-    mockStructTypeProxy.Verify(m => m.ToJson(), Times.Never);
 }

 [Test]

View file

@@ -14,7 +14,7 @@ namespace AdapterTest.Mocks
     throw new NotImplementedException();
 }

-public override RowSchema GetSchema()
+public override StructType GetSchema()
 {
     throw new NotImplementedException();
 }

View file

@@ -44,17 +44,6 @@ namespace AdapterTest.Mocks
     return new MockSparkContextProxy(conf);
 }

-public IStructFieldProxy CreateStructField(string name, string dataType, bool isNullable)
-{
-    throw new NotImplementedException();
-}
-
-public IStructTypeProxy CreateStructType(List<StructField> fields)
-{
-    throw new NotImplementedException();
-}
-
 public ISparkContextProxy SparkContextProxy
 {
     get { throw new NotImplementedException(); }

View file

@@ -27,6 +27,11 @@ namespace AdapterTest.Mocks
     mockSparkContextProxy = scProxy;
 }

+public IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy)
+{
+    throw new NotImplementedException();
+}
+
 public IDataFrameProxy ReadDataFrame(string path, StructType schema, System.Collections.Generic.Dictionary<string, string> options)
 {
     throw new NotImplementedException();

View file

@@ -1,6 +1,7 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
     <package id="Moq" version="4.2.1510.2205" targetFramework="net45" />
+    <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
     <package id="NUnit" version="3.0.1" targetFramework="net45" />
     <package id="Razorvine.Pyrolite" version="4.10.0.0" targetFramework="net45" />
     <package id="Razorvine.Serpent" version="1.12.0.0" targetFramework="net45" />

View file

@@ -6,6 +6,7 @@ using System.Collections.Generic;
 using System.Collections.ObjectModel;
 using System.IO;
 using System.Linq;
+using System.Text.RegularExpressions;
 using Microsoft.Spark.CSharp.Core;
 using Microsoft.Spark.CSharp.Sql;
 using NUnit.Framework;
@@ -27,6 +28,116 @@ namespace Microsoft.Spark.CSharp.Samples
     return sqlContext ?? (sqlContext = new SqlContext(SparkCLRSamples.SparkContext));
 }
+/// <summary>
+/// Sample to create a DataFrame. The RDD is generated via SparkContext.Parallelize; the schema is created by constructing type objects.
+/// </summary>
+[Sample]
+internal static void DFCreateDataFrameSample()
+{
+    var schemaPeople = new StructType(new List<StructField>
+    {
+        new StructField("id", new StringType()),
+        new StructField("name", new StringType()),
+        new StructField("age", new IntegerType()),
+        new StructField("address", new StructType(new List<StructField>
+        {
+            new StructField("city", new StringType()),
+            new StructField("state", new StringType())
+        })),
+        new StructField("phone numbers", new ArrayType(new StringType()))
+    });
+    var rddPeople = SparkCLRSamples.SparkContext.Parallelize(
+        new List<object[]>
+        {
+            new object[] { "123", "Bill", 43, new object[]{ "Columbus", "Ohio" }, new string[]{ "Tel1", "Tel2" } },
+            new object[] { "456", "Steve", 34, new object[]{ "Seattle", "Washington" }, new string[]{ "Tel3", "Tel4" } }
+        });
+
+    var dataFramePeople = GetSqlContext().CreateDataFrame(rddPeople, schemaPeople);
+    Console.WriteLine("------ Schema of People Data Frame:\r\n");
+    dataFramePeople.ShowSchema();
+    Console.WriteLine();
+    var collected = dataFramePeople.Collect().ToArray();
+    foreach (var people in collected)
+    {
+        string id = people.Get("id");
+        string name = people.Get("name");
+        int age = people.Get("age");
+        Row address = people.Get("address");
+        string city = address.Get("city");
+        string state = address.Get("state");
+        object[] phoneNumbers = people.Get("phone numbers");
+        Console.WriteLine("id:{0}, name:{1}, age:{2}, address:(city:{3},state:{4}), phoneNumbers:[{5},{6}]\r\n", id, name, age, city, state, phoneNumbers[0], phoneNumbers[1]);
+    }
+
+    if (SparkCLRSamples.Configuration.IsValidationEnabled)
+    {
+        Assert.AreEqual(2, dataFramePeople.Rdd.Count());
+        Assert.AreEqual(schemaPeople.Json, dataFramePeople.Schema.Json);
+    }
+}
+
+/// <summary>
+/// Sample to create a DataFrame. The RDD is generated via SparkContext.TextFile; the schema is parsed from JSON.
+/// </summary>
+[Sample]
+internal static void DFCreateDataFrameSample2()
+{
+    var rddRequestsLog = SparkCLRSamples.SparkContext.TextFile(SparkCLRSamples.Configuration.GetInputDataPath(RequestsLog), 1).Map(r => r.Split(',').Select(s => (object)s).ToArray());
+
+    const string schemaRequestsLogJson = @"{
+        ""fields"": [{
+            ""metadata"": {},
+            ""name"": ""guid"",
+            ""nullable"": false,
+            ""type"": ""string""
+        },
+        {
+            ""metadata"": {},
+            ""name"": ""datacenter"",
+            ""nullable"": false,
+            ""type"": ""string""
+        },
+        {
+            ""metadata"": {},
+            ""name"": ""abtestid"",
+            ""nullable"": false,
+            ""type"": ""string""
+        },
+        {
+            ""metadata"": {},
+            ""name"": ""traffictype"",
+            ""nullable"": false,
+            ""type"": ""string""
+        }],
+        ""type"": ""struct""
+    }";
+
+    // create the schema by parsing JSON
+    StructType requestsLogSchema = DataType.ParseDataTypeFromJson(schemaRequestsLogJson) as StructType;
+
+    var dataFrameRequestsLog = GetSqlContext().CreateDataFrame(rddRequestsLog, requestsLogSchema);
+    Console.WriteLine("------ Schema of RequestsLog Data Frame:");
+    dataFrameRequestsLog.ShowSchema();
+    Console.WriteLine();
+    var collected = dataFrameRequestsLog.Collect().ToArray();
+    foreach (var request in collected)
+    {
+        string guid = request.Get("guid");
+        string datacenter = request.Get("datacenter");
+        string abtestid = request.Get("abtestid");
+        string traffictype = request.Get("traffictype");
+        Console.WriteLine("guid:{0}, datacenter:{1}, abtestid:{2}, traffictype:{3}\r\n", guid, datacenter, abtestid, traffictype);
+    }
+
+    if (SparkCLRSamples.Configuration.IsValidationEnabled)
+    {
+        Assert.AreEqual(10, collected.Length);
+        Assert.AreEqual(Regex.Replace(schemaRequestsLogJson, @"\s", string.Empty), Regex.Replace(dataFrameRequestsLog.Schema.Json, @"\s", string.Empty));
+    }
+}
 /// <summary>
 /// Sample to show schema of DataFrame
 /// </summary>
@@ -42,10 +153,10 @@ namespace Microsoft.Spark.CSharp.Samples
 /// Sample to get schema of DataFrame in json format
 /// </summary>
 [Sample]
-internal static void DFGetSchemaToJsonSample()
+internal static void DFGetSchemaJsonSample()
 {
     var peopleDataFrame = GetSqlContext().JsonFile(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson));
-    string json = peopleDataFrame.Schema.ToJson();
+    string json = peopleDataFrame.Schema.Json;
     Console.WriteLine("schema in json format: {0}", json);
 }
@@ -56,7 +167,7 @@ namespace Microsoft.Spark.CSharp.Samples
 internal static void DFCollectSample()
 {
     var peopleDataFrame = GetSqlContext().JsonFile(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson));
-    IEnumerable<Row> rows = peopleDataFrame.Collect();
+    var rows = peopleDataFrame.Collect().ToArray();
     Console.WriteLine("peopleDataFrame:");
     int i = 0;
@@ -122,15 +233,13 @@ namespace Microsoft.Spark.CSharp.Samples
 [Sample]
 internal static void DFTextFileLoadDataFrameSample()
 {
-    var requestsSchema = StructType.CreateStructType(
-        new List<StructField>
-        {
-            StructField.CreateStructField("guid", "string", false),
-            StructField.CreateStructField("datacenter", "string", false),
-            StructField.CreateStructField("abtestid", "string", false),
-            StructField.CreateStructField("traffictype", "string", false),
-        }
-    );
+    var requestsSchema = new StructType(new List<StructField>
+    {
+        new StructField("guid", new StringType(), false),
+        new StructField("datacenter", new StringType(), false),
+        new StructField("abtestid", new StringType(), false),
+        new StructField("traffictype", new StringType(), false),
+    });

     var requestsDateFrame = GetSqlContext().TextFile(SparkCLRSamples.Configuration.GetInputDataPath(RequestsLog), requestsSchema);
     requestsDateFrame.RegisterTempTable("requests");
@@ -154,18 +263,17 @@ namespace Microsoft.Spark.CSharp.Samples
 private static DataFrame GetMetricsDataFrame()
 {
-    var metricsSchema = StructType.CreateStructType(
-        new List<StructField>
-        {
-            StructField.CreateStructField("unknown", "string", false),
-            StructField.CreateStructField("date", "string", false),
-            StructField.CreateStructField("time", "string", false),
-            StructField.CreateStructField("guid", "string", false),
-            StructField.CreateStructField("lang", "string", false),
-            StructField.CreateStructField("country", "string", false),
-            StructField.CreateStructField("latency", "integer", false)
-        }
-    );
+    var metricsSchema = new StructType(
+        new[]
+        {
+            new StructField("unknown", new StringType(), false),
+            new StructField("date", new StringType(), false),
+            new StructField("time", new StringType(), false),
+            new StructField("guid", new StringType(), false),
+            new StructField("lang", new StringType(), false),
+            new StructField("country", new StringType(), false),
+            new StructField("latency", new StringType(), false),
+        });

     return
         GetSqlContext()
@@ -236,8 +344,8 @@ namespace Microsoft.Spark.CSharp.Samples
 {
     var name = peopleDataFrameSchemaField.Name;
     var dataType = peopleDataFrameSchemaField.DataType;
-    var stringVal = dataType.ToString();
-    var simpleStringVal = dataType.SimpleString();
+    var stringVal = dataType.TypeName;
+    var simpleStringVal = dataType.SimpleString;
     var isNullable = peopleDataFrameSchemaField.IsNullable;
     Console.WriteLine("Name={0}, DT.string={1}, DT.simplestring={2}, DT.isNullable={3}", name, stringVal, simpleStringVal, isNullable);
 }
@@ -388,7 +496,7 @@ namespace Microsoft.Spark.CSharp.Samples
 var singleValueReplaced = peopleDataFrame.Replace("Bill", "Bill.G");
 singleValueReplaced.Show();

 var multiValueReplaced = peopleDataFrame.ReplaceAll(new List<int> { 14, 34 }, new List<int> { 44, 54 });
 multiValueReplaced.Show();
@@ -853,7 +961,7 @@ namespace Microsoft.Spark.CSharp.Samples
 Console.WriteLine("peopleDataFrame:");
 var count = 0;
-RowSchema schema = null;
+StructType schema = null;
 Row firstRow = null;
 foreach (var row in rows)
 {
@@ -939,43 +1047,43 @@ namespace Microsoft.Spark.CSharp.Samples
 /// Verify the schema of people dataframe.
 /// </summary>
 /// <param name="schema"> RowSchema of people DataFrame </param>
-internal static void VerifySchemaOfPeopleDataFrame(RowSchema schema)
+internal static void VerifySchemaOfPeopleDataFrame(StructType schema)
 {
     Assert.IsNotNull(schema);
-    Assert.AreEqual("struct", schema.type);
-    Assert.IsNotNull(schema.columns);
-    Assert.AreEqual(4, schema.columns.Count);
+    Assert.AreEqual("struct", schema.TypeName);
+    Assert.IsNotNull(schema.Fields);
+    Assert.AreEqual(4, schema.Fields.Count);

     // name
-    var nameColSchema = schema.columns.Find(c => c.name.Equals("name"));
+    var nameColSchema = schema.Fields.Find(c => c.Name.Equals("name"));
     Assert.IsNotNull(nameColSchema);
-    Assert.AreEqual("name", nameColSchema.name);
-    Assert.IsTrue(nameColSchema.nullable);
-    Assert.AreEqual("string", nameColSchema.type.ToString());
+    Assert.AreEqual("name", nameColSchema.Name);
+    Assert.IsTrue(nameColSchema.IsNullable);
+    Assert.AreEqual("string", nameColSchema.DataType.TypeName);

     // id
-    var idColSchema = schema.columns.Find(c => c.name.Equals("id"));
+    var idColSchema = schema.Fields.Find(c => c.Name.Equals("id"));
     Assert.IsNotNull(idColSchema);
-    Assert.AreEqual("id", idColSchema.name);
-    Assert.IsTrue(idColSchema.nullable);
-    Assert.AreEqual("string", nameColSchema.type.ToString());
+    Assert.AreEqual("id", idColSchema.Name);
+    Assert.IsTrue(idColSchema.IsNullable);
+    Assert.AreEqual("string", nameColSchema.DataType.TypeName);

     // age
-    var ageColSchema = schema.columns.Find(c => c.name.Equals("age"));
+    var ageColSchema = schema.Fields.Find(c => c.Name.Equals("age"));
     Assert.IsNotNull(ageColSchema);
-    Assert.AreEqual("age", ageColSchema.name);
-    Assert.IsTrue(ageColSchema.nullable);
-    Assert.AreEqual("long", ageColSchema.type.ToString());
+    Assert.AreEqual("age", ageColSchema.Name);
+    Assert.IsTrue(ageColSchema.IsNullable);
+    Assert.AreEqual("long", ageColSchema.DataType.TypeName);

     // address
-    var addressColSchema = schema.columns.Find(c => c.name.Equals("address"));
+    var addressColSchema = schema.Fields.Find(c => c.Name.Equals("address"));
     Assert.IsNotNull(addressColSchema);
-    Assert.AreEqual("address", addressColSchema.name);
-    Assert.IsTrue(addressColSchema.nullable);
-    Assert.IsNotNull(addressColSchema.type);
-    Assert.AreEqual("struct", addressColSchema.type.type);
-    Assert.IsNotNull(addressColSchema.type.columns.Find(c => c.name.Equals("state")));
-    Assert.IsNotNull(addressColSchema.type.columns.Find(c => c.name.Equals("city")));
+    Assert.AreEqual("address", addressColSchema.Name);
+    Assert.IsTrue(addressColSchema.IsNullable);
+    Assert.IsNotNull(addressColSchema.DataType);
+    Assert.AreEqual("struct", addressColSchema.DataType.TypeName);
+    Assert.IsNotNull(((StructType)addressColSchema.DataType).Fields.Find(c => c.Name.Equals("state")));
+    Assert.IsNotNull(((StructType)addressColSchema.DataType).Fields.Find(c => c.Name.Equals("city")));
 }

 /// <summary>
@@ -1128,25 +1236,25 @@ namespace Microsoft.Spark.CSharp.Samples
 if (x == null && y == null) return true;
 if (x == null && y != null || x != null && y == null) return false;

-foreach (var col in x.GetSchema().columns)
+foreach (var col in x.GetSchema().Fields)
 {
-    if (!y.GetSchema().columns.Any(c => c.ToString() == col.ToString())) return false;
-    if (col.type.columns.Any())
+    if (!y.GetSchema().Fields.Any(c => c.Name == col.Name)) return false;
+    if (col.DataType is StructType)
     {
-        if (!IsRowEqual(x.GetAs<Row>(col.name), y.GetAs<Row>(col.name), columnsComparer)) return false;
+        if (!IsRowEqual(x.GetAs<Row>(col.Name), y.GetAs<Row>(col.Name), columnsComparer)) return false;
     }
-    else if (x.Get(col.name) == null && y.Get(col.name) != null || x.Get(col.name) != null && y.Get(col.name) == null) return false;
-    else if (x.Get(col.name) != null && y.Get(col.name) != null)
+    else if (x.Get(col.Name) == null && y.Get(col.Name) != null || x.Get(col.Name) != null && y.Get(col.Name) == null) return false;
+    else if (x.Get(col.Name) != null && y.Get(col.Name) != null)
     {
         Func<object, object, bool> colComparer;
-        if (columnsComparer != null && columnsComparer.TryGetValue(col.name, out colComparer))
+        if (columnsComparer != null && columnsComparer.TryGetValue(col.Name, out colComparer))
         {
-            if (!colComparer(x.Get(col.name), y.Get(col.name))) return false;
+            if (!colComparer(x.Get(col.Name), y.Get(col.Name))) return false;
         }
         else
         {
-            if (x.Get(col.name).ToString() != y.Get(col.name).ToString()) return false;
+            if (x.Get(col.Name).ToString() != y.Get(col.Name).ToString()) return false;
         }
     }
 }

View file

@@ -33,6 +33,10 @@
     <WarningLevel>4</WarningLevel>
 </PropertyGroup>
 <ItemGroup>
+    <Reference Include="Newtonsoft.Json, Version=7.0.0.0, Culture=neutral, PublicKeyToken=30ad4fe6b2a6aeed, processorArchitecture=MSIL">
+        <HintPath>..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll</HintPath>
+        <Private>True</Private>
+    </Reference>
     <Reference Include="nunit.framework, Version=3.0.5813.39031, Culture=neutral, PublicKeyToken=2638cd05610744eb, processorArchitecture=MSIL">
         <HintPath>..\..\packages\NUnit.3.0.1\lib\net45\nunit.framework.dll</HintPath>
         <Private>True</Private>

View file

@@ -1,4 +1,5 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
+    <package id="Newtonsoft.Json" version="7.0.1" targetFramework="net45" />
     <package id="NUnit" version="3.0.1" targetFramework="net45" />
 </packages>

View file

@@ -3,14 +3,16 @@
 package org.apache.spark.sql.api.csharp

-import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}
+import java.io.{ByteArrayOutputStream, DataOutputStream}

 import org.apache.spark.SparkContext
 import org.apache.spark.api.csharp.SerDe
 import org.apache.spark.api.java.{JavaRDD, JavaSparkContext}
+import org.apache.spark.api.python.SerDeUtil
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.types.{DataType, FloatType, StructField, StructType}
+import org.apache.spark.sql.types.{DataType, FloatType, StructType}
 import org.apache.spark.sql._
+import java.util.{ArrayList => JArrayList}

 /**
  * Utility functions for DataFrame in SparkCLR
@@ -31,10 +33,6 @@ object SQLUtils {
     arr.toSeq
 }

-def createStructType(fields : Seq[StructField]): StructType = {
-    StructType(fields)
-}
-
 def getSQLDataType(dataType: String): DataType = {
     dataType match {
         case "byte" => org.apache.spark.sql.types.ByteType
@@ -54,17 +52,6 @@ object SQLUtils {
     }
 }

-def createStructField(name: String, dataType: String, nullable: Boolean): StructField = {
-    val dtObj = getSQLDataType(dataType)
-    StructField(name, dtObj, nullable)
-}
-
-def createDF(rdd: RDD[Array[Byte]], schema: StructType, sqlContext: SQLContext): DataFrame = {
-    val num = schema.fields.size
-    val rowRDD = rdd.map(bytesToRow(_, schema))
-    sqlContext.createDataFrame(rowRDD, schema)
-}
-
 def dfToRowRDD(df: DataFrame): RDD[Array[Byte]] = {
     df.map(r => rowToCSharpBytes(r))
 }
@@ -77,14 +64,6 @@ object SQLUtils {
     }
 }

-private[this] def bytesToRow(bytes: Array[Byte], schema: StructType): Row = {
-    val bis = new ByteArrayInputStream(bytes)
-    val dis = new DataInputStream(bis)
-    val num = SerDe.readInt(dis)
-    Row.fromSeq((0 until num).map { i =>
-        doConversion(SerDe.readObject(dis), schema.fields(i).dataType)
-    }.toSeq)
-}
-
 private[this] def rowToCSharpBytes(row: Row): Array[Byte] = {
     val bos = new ByteArrayOutputStream()
@@ -176,8 +155,11 @@ object SQLUtils {
     dfReader.load(path)
 }

-def loadTextFile(sqlContext: SQLContext, path: String, delimiter: String, schema: StructType) : DataFrame = {
+def loadTextFile(sqlContext: SQLContext, path: String, delimiter: String, schemaJson: String) : DataFrame = {
     val stringRdd = sqlContext.sparkContext.textFile(path)
+    val schema = createSchema(schemaJson)
     val rowRdd = stringRdd.map{s =>
         val columns = s.split(delimiter)
         columns.length match {
@@ -217,4 +199,21 @@ object SQLUtils {
     sqlContext.createDataFrame(rowRdd, schema)
 }

+def createSchema(schemaJson: String) : StructType = {
+    DataType.fromJson(schemaJson).asInstanceOf[StructType]
+}
+
+def byteArrayRDDToAnyArrayRDD(jrdd: JavaRDD[Array[Byte]]) : RDD[Array[_ >: AnyRef]] = {
+    // JavaRDD[Array[Byte]] -> JavaRDD[Any]
+    val jrddAny = SerDeUtil.pythonToJava(jrdd, true)
+
+    // JavaRDD[Any] -> RDD[Array[_]]
+    jrddAny.rdd.map {
+        case objs: JArrayList[_] =>
+            objs.toArray
+        case obj if obj.getClass.isArray =>
+            obj.asInstanceOf[Array[_]].toArray
+    }
+}
 }