From 9aa97b98c61d4ac0c54c1149125c378dd21fa182 Mon Sep 17 00:00:00 2001 From: sutyag <42341317+sutyag@users.noreply.github.com> Date: Wed, 21 Nov 2018 06:53:43 +0530 Subject: [PATCH] Dev/sutyag/upgrade mobius (#697) * basic * Add extractor and outputter * Add reducer not done * Add procedure * kill node, broadcast, upload executable error feed to cosmos, specify avro or parquet syntax * Add more functions to HDFS. Add submitter heartbeat Update doc * Redesign cosmos download, add replication setting for hdfs * Improve executable runner to deal with bad lines * MERGE MOBIUS * change dependency path * Add registration method to mobius * Major refactoring to add ISparkosmosModule to modularize everything Start supporting streaming Fixed a couple of Mobius bugs Added integration tests Reenabled unit tests Added DatedPath * Make sparkcontext settable, fix setjobgroup * Expose more interface from Mobius * Mobius change for Spark 2.3 * fix version conflict, remove unused files * Added support for multiple UDFs * Fixed non sql udf issue * 1. Upgrade mobius to spark 2.3.1 2. Fixed UDF bugs 3. Added support for multiple UDFs * 1. Added sample testcases 2. Updated reference for examples * Removed stashed files * Fixed review comments * Fixed review comments * Fixed failed unit test cases * Deleting all the things * Updated version in appveyor * Updated tartool download path * Fixed java process terminate issue * Revert access modifier to internal from public for JvmBridge --- README.md | 2 +- appveyor.yml | 2 +- build/Build.cmd | 2 + build/localmode/RunSamples.cmd | 4 +- build/localmode/downloadtools.ps1 | 2 +- build/localmode/run-samples.sh | 2 +- cpp/Riosock/Riosock.vcxproj | 6 +- .../Microsoft.Spark.CSharp/Adapter.csproj | 11 +- .../Core/IRDDCollector.cs | 3 +- .../Microsoft.Spark.CSharp/Core/RDD.cs | 24 +- .../Core/RDDCollector.cs | 22 +- .../Interop/Ipc/JvmBridge.cs | 2 +- .../Interop/Ipc/JvmObjectReference.cs | 9 +- .../Interop/SparkCLREnvironment.cs | 4 +- .../Microsoft.Spark.CSharp/Network/ByteBuf.cs | 2 +- .../Network/DefaultSocketWrapper.cs | 327 +++--- .../Network/ISocketWrapper.cs | 15 +- .../Network/RioSocketWrapper.cs | 2 +- .../Network/SaeaSocketWrapper.cs | 2 +- .../Network/SocketInfo.cs | 28 + .../Proxy/IDataFrameProxy.cs | 6 +- .../Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs | 3 +- .../Proxy/ISparkContextProxy.cs | 3 +- .../Proxy/Ipc/DataFrameIpcProxy.cs | 15 +- .../Proxy/Ipc/RDDIpcProxy.cs | 5 +- .../Proxy/Ipc/SparkContextIpcProxy.cs | 13 +- .../Proxy/Ipc/SparkSessionIpcProxy.cs | 4 +- .../Proxy/Ipc/SqlContextIpcProxy.cs | 2 +- .../Microsoft.Spark.CSharp/Sql/DataFrame.cs | 31 +- .../Sql/DataFrameReader.cs | 15 +- .../Sql/DataFrameWriter.cs | 13 +- .../Microsoft.Spark.CSharp/Sql/Dataset.cs | 3 +- .../Microsoft.Spark.CSharp/Sql/Functions.cs | 41 +- .../Adapter/Microsoft.Spark.CSharp/Sql/Row.cs | 132 +-- .../Sql/RowConstructor.cs | 4 +- .../Sql/SparkSession.cs | 38 +- .../Microsoft.Spark.CSharp/Sql/SqlContext.cs | 39 +- .../Microsoft.Spark.CSharp/Sql/Types.cs | 989 ++++++++++-------- .../Sql/UdfRegistration.cs | 16 +- .../Microsoft.Spark.CSharp/packages.config | 6 +- .../Microsoft.Spark.CSharp.Adapter.Doc.XML | 34 +- .../documentation/Mobius_API_Documentation.md | 6 +- csharp/AdapterTest/AccumulatorTest.cs | 2 +- csharp/AdapterTest/AdapterTest.csproj | 11 +- csharp/AdapterTest/DataFrameTest.cs | 21 +- csharp/AdapterTest/DatasetTest.cs | 4 +- .../AdapterTest/Mocks/MockDataFrameProxy.cs | 10 +- csharp/AdapterTest/Mocks/MockRDDCollector.cs | 3 +- csharp/AdapterTest/Mocks/MockRddProxy.cs | 3 +- 
csharp/AdapterTest/Mocks/MockRow.cs | 7 + .../Mocks/MockSparkContextProxy.cs | 8 +- csharp/AdapterTest/SocketWrapperTest.cs | 4 +- csharp/AdapterTest/TestWithMoqDemo.cs | 2 +- csharp/AdapterTest/packages.config | 5 +- csharp/Repl/Repl.csproj | 9 +- csharp/Repl/packages.config | 8 +- .../DataFrameSamples.cs | 67 ++ .../Samples/Microsoft.Spark.CSharp/Program.cs | 4 +- .../Microsoft.Spark.CSharp/Samples.csproj | 8 +- .../Microsoft.Spark.CSharp/packages.config | 3 +- csharp/Tests.Common/Tests.Common.csproj | 7 +- .../FileSystem/HdfsFileStatus.cs | 57 + .../FileSystem/HdfsFileSystemHelper.cs | 58 +- .../Utils/Microsoft.Spark.CSharp/Utils.csproj | 1 + .../MultiThreadWorker.cs | 5 +- .../Microsoft.Spark.CSharp/TaskRunner.cs | 204 ++-- .../Microsoft.Spark.CSharp/UDFCommand.cs | 391 +++++++ .../Worker/Microsoft.Spark.CSharp/Worker.cs | 449 +++----- .../Microsoft.Spark.CSharp/Worker.csproj | 2 + .../Microsoft.Spark.CSharp/WorkerFunc.cs | 25 + csharp/WorkerTest/MultiThreadWorkerTest.cs | 7 +- csharp/WorkerTest/WorkerTest.cs | 13 + csharp/WorkerTest/WorkerTest.csproj | 5 +- examples/Batch/WordCount/WordCount.csproj | 12 +- examples/Batch/pi/Pi.csproj | 12 +- examples/Examples.sln | 2 +- .../CassandraDataFrame.csproj | 17 +- .../Sql/HiveDataFrame/HiveDataFrame.csproj | 18 +- .../Sql/JdbcDataFrame/JdbcDataFrame.csproj | 12 +- examples/Sql/SparkXml/SparkXml.csproj | 12 +- examples/Streaming/EventHub/EventHub.csproj | 14 +- .../HdfsWordCount/HdfsWordCount.csproj | 26 +- examples/Streaming/Kafka/Kafka.csproj | 18 +- .../fsharp/JsonDataFrame/JsonDataFrame.fsproj | 8 +- .../fsharp/WordCount/WordCountFSharp.fsproj | 11 +- notes/running-mobius-app.md | 2 +- scala/pom.xml | 15 +- .../apache/spark/api/csharp/CSharpRDD.scala | 5 +- .../spark/sql/api/csharp/SQLUtils.scala | 3 +- .../org/apache/spark/util/csharp/Utils.scala | 8 +- scripts/sparkclr-submit.cmd | 4 +- scripts/sparkclr-submit.sh | 4 +- 92 files changed, 2117 insertions(+), 1378 deletions(-) create mode 100644 csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs create mode 100644 csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs create mode 100644 csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs create mode 100644 csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs diff --git a/README.md b/README.md index b007ce3..f230369 100644 --- a/README.md +++ b/README.md @@ -157,4 +157,4 @@ Mobius is licensed under the MIT license. See [LICENSE](LICENSE) file for full l * tweet [@MobiusForSpark](http://twitter.com/MobiusForSpark) ## Code of Conduct -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. +This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
\ No newline at end of file diff --git a/appveyor.yml b/appveyor.yml index b7a50ce..4eb2774 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 2.0.2-SNAPSHOT.{build} +version: 2.3.1-SNAPSHOT.{build} environment: securefile: diff --git a/build/Build.cmd b/build/Build.cmd index 05239ac..485aa3b 100644 --- a/build/Build.cmd +++ b/build/Build.cmd @@ -6,6 +6,8 @@ rem Copyright (c) Microsoft. All rights reserved. rem Licensed under the MIT license. See LICENSE file in the project root for full license information. rem +SET MAVEN_OPTS=-Dhttps.protocols=TLSv1,TLSv1.1,TLSv1.2 + if "%1" == "csharp" set buildCSharp=true SET CMDHOME=%~dp0 diff --git a/build/localmode/RunSamples.cmd b/build/localmode/RunSamples.cmd index b9690e3..57872f7 100644 --- a/build/localmode/RunSamples.cmd +++ b/build/localmode/RunSamples.cmd @@ -47,7 +47,7 @@ if "%precheck%" == "bad" (goto :EOF) @rem @rem setup Hadoop and Spark versions @rem -set SPARK_VERSION=2.0.2 +set SPARK_VERSION=2.3.1 set HADOOP_VERSION=2.6 set APACHE_DIST_SERVER=archive.apache.org @echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%, APACHE_DIST_SERVER=%APACHE_DIST_SERVER% @@ -100,7 +100,7 @@ if "!USER_EXE!"=="" ( call sparkclr-submit.cmd --conf spark.sql.warehouse.dir=%TEMP_DIR% %* ) -@if ERRORLEVEL 1 GOTO :ErrorStop +@if ERRORLEVEL 2 GOTO :ErrorStop @GOTO :EOF diff --git a/build/localmode/downloadtools.ps1 b/build/localmode/downloadtools.ps1 index c42ab8a..512a23f 100644 --- a/build/localmode/downloadtools.ps1 +++ b/build/localmode/downloadtools.ps1 @@ -20,7 +20,7 @@ if ($stage.ToLower() -eq "run") $hadoopVersion = if ($envValue -eq $null) { "2.6" } else { $envValue } $envValue = [Environment]::GetEnvironmentVariable("SPARK_VERSION") - $sparkVersion = if ($envValue -eq $null) { "2.0.2" } else { $envValue } + $sparkVersion = if ($envValue -eq $null) { "2.3.1" } else { $envValue } Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion, apacheDistServer=$apacheDistServer" } diff --git a/build/localmode/run-samples.sh b/build/localmode/run-samples.sh index 685507d..24d4f3d 100755 --- a/build/localmode/run-samples.sh +++ b/build/localmode/run-samples.sh @@ -16,7 +16,7 @@ do done # setup Hadoop and Spark versions -export SPARK_VERSION=2.0.2 +export SPARK_VERSION=2.3.1 export HADOOP_VERSION=2.6 export APACHE_DIST_SERVER=archive.apache.org echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION, APACHE_DIST_SERVER=$APACHE_DIST_SERVER" diff --git a/cpp/Riosock/Riosock.vcxproj b/cpp/Riosock/Riosock.vcxproj index d61d067..95b642d 100644 --- a/cpp/Riosock/Riosock.vcxproj +++ b/cpp/Riosock/Riosock.vcxproj @@ -1,5 +1,5 @@  - + Debug @@ -20,13 +20,13 @@ DynamicLibrary true - v120 + v140 Unicode DynamicLibrary false - v120 + v140 true Unicode diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj b/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj index d887daf..72341a3 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Adapter.csproj @@ -35,16 +35,17 @@ prompt 4 ..\documentation\Microsoft.Spark.CSharp.Adapter.Doc.XML + true - - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + + ..\..\packages\log4net.2.0.8\lib\net45-full\log4net.dll - - ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + + ..\..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll ..\..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll @@ -98,6 +99,7 @@ + @@ 
-184,6 +186,7 @@ + diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs index b8b078c..51250de 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/IRDDCollector.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Core { @@ -11,6 +12,6 @@ namespace Microsoft.Spark.CSharp.Core /// interface IRDDCollector { - IEnumerable Collect(int port, SerializedMode serializedMode, Type type); + IEnumerable Collect(SocketInfo info, SerializedMode serializedMode, Type type); } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs index bdfbd98..9dfd119 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDD.cs @@ -4,6 +4,7 @@ using System; using System.Collections.Generic; using System.Linq; +using Microsoft.Spark.CSharp.Network; using Microsoft.Spark.CSharp.Proxy; using Microsoft.Spark.CSharp.Services; @@ -60,6 +61,7 @@ namespace Microsoft.Spark.CSharp.Core { return sparkContext; } + set { sparkContext = value; } } /// @@ -592,13 +594,13 @@ namespace Microsoft.Spark.CSharp.Core /// public T[] Collect() { - int port = RddProxy.CollectAndServe(); - return Collect(port).Cast().ToArray(); + var info = RddProxy.CollectAndServe(); + return Collect(info).Cast().ToArray(); } - internal IEnumerable Collect(int port) + internal IEnumerable Collect(SocketInfo info) { - return RddProxy.RDDCollector.Collect(port, serializedMode, typeof(T)); + return RddProxy.RDDCollector.Collect(info, serializedMode, typeof(T)); } /// @@ -830,9 +832,9 @@ namespace Microsoft.Spark.CSharp.Core var mappedRDD = MapPartitionsWithIndex(new TakeHelper(left).Execute); - int port = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, partitions); + var info = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, partitions); - IEnumerable res = Collect(port).Cast(); + IEnumerable res = Collect(info).Cast(); items.AddRange(res); partsScanned += numPartsToTry; @@ -925,7 +927,7 @@ namespace Microsoft.Spark.CSharp.Core /// public RDD Repartition(int numPartitions) { - return new RDD(RddProxy.Repartition(numPartitions), sparkContext); + return new RDD(RddProxy.Repartition(numPartitions), sparkContext, serializedMode); } /// @@ -942,8 +944,8 @@ namespace Microsoft.Spark.CSharp.Core /// public RDD Coalesce(int numPartitions, bool shuffle = false) { - return new RDD(RddProxy.Coalesce(numPartitions, shuffle), sparkContext); - } + return new RDD(RddProxy.Coalesce(numPartitions, shuffle), sparkContext, serializedMode); + } /// /// Zips this RDD with another one, returning key-value pairs with the @@ -1065,8 +1067,8 @@ namespace Microsoft.Spark.CSharp.Core foreach (int partition in Enumerable.Range(0, GetNumPartitions())) { var mappedRDD = MapPartitionsWithIndex((pid, iter) => iter); - int port = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, Enumerable.Range(partition, 1)); - foreach (T row in Collect(port)) + var info = sparkContext.SparkContextProxy.RunJob(mappedRDD.RddProxy, Enumerable.Range(partition, 1)); + foreach (T row in Collect(info)) yield return row; } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs index 6d92ad2..0596395 100644 --- 
a/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Core/RDDCollector.cs @@ -11,6 +11,7 @@ using System.Runtime.Serialization.Formatters.Binary; using System.Text; using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Network; +using Microsoft.Spark.CSharp.Services; using Microsoft.Spark.CSharp.Sql; namespace Microsoft.Spark.CSharp.Core @@ -20,14 +21,31 @@ namespace Microsoft.Spark.CSharp.Core /// class RDDCollector : IRDDCollector { - public IEnumerable Collect(int port, SerializedMode serializedMode, Type type) + private static ILoggerService logger; + private static ILoggerService Logger + { + get + { + if (logger != null) return logger; + logger = LoggerServiceFactory.GetLogger(typeof(RDDCollector)); + return logger; + } + } + + public IEnumerable Collect(SocketInfo info, SerializedMode serializedMode, Type type) { IFormatter formatter = new BinaryFormatter(); var sock = SocketFactory.CreateSocket(); - sock.Connect(IPAddress.Loopback, port); + sock.Connect(IPAddress.Loopback, info.Port, null); using (var s = sock.GetStream()) { + if (info.Secret != null) + { + SerDe.Write(s, info.Secret); + var reply = SerDe.ReadString(s); + Logger.LogDebug("Connect back to JVM: " + reply); + } byte[] buffer; while ((buffer = SerDe.ReadBytes(s)) != null && buffer.Length > 0) { diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs index a3e6cd9..366ed96 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs @@ -36,7 +36,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc if (!sockets.TryDequeue(out socket)) { socket = SocketFactory.CreateSocket(); - socket.Connect(IPAddress.Loopback, portNumber); + socket.Connect(IPAddress.Loopback, portNumber, null); } return socket; } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs index 75c27e2..12cdd93 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs @@ -12,12 +12,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc /// Reference to object created in JVM /// [Serializable] - internal class JvmObjectReference + public class JvmObjectReference { public string Id { get; private set; } private DateTime creationTime; - public JvmObjectReference(string jvmReferenceId) + internal JvmObjectReference(string jvmReferenceId) { Id = jvmReferenceId; creationTime = DateTime.UtcNow; @@ -48,6 +48,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc return base.GetHashCode(); } + public string ObjectToString() + { + return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(this, "toString").ToString(); + } + public string GetDebugInfo() { var javaObjectReferenceForClassObject = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(this, "getClass").ToString()); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs index bee4625..befa7ee 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Interop/SparkCLREnvironment.cs @@ -31,7 +31,9 @@ namespace Microsoft.Spark.CSharp.Interop } } - internal static IConfigurationService 
configurationService; + internal static IJvmBridge JvmBridge => SparkCLRIpcProxy.JvmBridge; + + internal static IConfigurationService configurationService; internal static IConfigurationService ConfigurationService { diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs index 90a1179..57886d5 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ByteBuf.cs @@ -11,7 +11,7 @@ namespace Microsoft.Spark.CSharp.Network /// ByteBuf delimits a section of a ByteBufChunk. /// It is the smallest unit to be allocated. /// - internal class ByteBuf + public class ByteBuf { private int readerIndex; private int writerIndex; diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs index 3db32f5..8c96fcc 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/DefaultSocketWrapper.cs @@ -2,182 +2,203 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; +using System.Collections.Generic; using System.IO; using System.Net; using System.Net.Sockets; +using System.Text; +using System.Threading; using Microsoft.Spark.CSharp.Configuration; using Microsoft.Spark.CSharp.Services; namespace Microsoft.Spark.CSharp.Network { - /// - /// A simple wrapper of System.Net.Sockets.Socket class. - /// - internal class DefaultSocketWrapper : ISocketWrapper - { - private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DefaultSocketWrapper)); - private readonly Socket innerSocket; + /// + /// A simple wrapper of System.Net.Sockets.Socket class. + /// + internal class DefaultSocketWrapper : ISocketWrapper + { + private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DefaultSocketWrapper)); + private readonly Socket innerSocket; - /// - /// Default constructor that creates a new instance of DefaultSocket class which represents - /// a traditional socket (System.Net.Socket.Socket). - /// - /// This socket is bound to Loopback with port 0. - /// - public DefaultSocketWrapper() - { - innerSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); - var localEndPoint = new IPEndPoint(IPAddress.Loopback, 0); - innerSocket.Bind(localEndPoint); - } + /// + /// Default constructor that creates a new instance of DefaultSocket class which represents + /// a traditional socket (System.Net.Socket.Socket). + /// + /// This socket is bound to Loopback with port 0. + /// + public DefaultSocketWrapper() + { + innerSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp); + var localEndPoint = new IPEndPoint(IPAddress.Loopback, 0); + innerSocket.Bind(localEndPoint); + } - /// - /// Initializes a instance of DefaultSocket class using the specified System.Net.Socket.Socket object. - /// - /// The existing socket - private DefaultSocketWrapper(Socket socket) - { - innerSocket = socket; - } + /// + /// Initializes a instance of DefaultSocket class using the specified System.Net.Socket.Socket object. + /// + /// The existing socket + private DefaultSocketWrapper(Socket socket) + { + innerSocket = socket; + } - /// - /// Accepts a incoming connection request. 
- /// - /// A DefaultSocket instance used to send and receive data - public ISocketWrapper Accept() - { - var socket = innerSocket.Accept(); - return new DefaultSocketWrapper(socket); - } + /// + /// Accepts a incoming connection request. + /// + /// A DefaultSocket instance used to send and receive data + public ISocketWrapper Accept() + { + var socket = innerSocket.Accept(); + return new DefaultSocketWrapper(socket); + } - /// - /// Close the socket connections and releases all associated resources. - /// - public void Close() - { - innerSocket.Close(); - } + /// + /// Close the socket connections and releases all associated resources. + /// + public void Close() + { + innerSocket.Close(); + } - /// - /// Establishes a connection to a remote host that is specified by an IP address and a port number - /// - /// The IP address of the remote host - /// The port number of the remote host - public void Connect(IPAddress remoteaddr, int port) - { - var remoteEndPoint = new IPEndPoint(remoteaddr, port); - innerSocket.Connect(remoteEndPoint); - } + /// + /// Establishes a connection to a remote host that is specified by an IP address and a port number + /// + /// The IP address of the remote host + /// The port number of the remote host + public void Connect(IPAddress remoteaddr, int port, string secret) + { + var remoteEndPoint = new IPEndPoint(remoteaddr, port); + innerSocket.Connect(remoteEndPoint); + } - /// - /// Returns the NetworkStream used to send and receive data. - /// - /// The underlying Stream instance that be used to send and receive data - /// - /// GetStream returns a NetworkStream that you can use to send and receive data. You must close/dispose - /// the NetworkStream by yourself. Closing DefaultSocketWrapper does not release the NetworkStream - /// - public Stream GetStream() - { - return new NetworkStream(innerSocket); - } + private static byte[] ReceiveAll(Socket socket, int len) + { + var buffer = new List(); - /// - /// Returns a stream used to receive data only. - /// - /// The underlying Stream instance that be used to receive data - public Stream GetInputStream() - { - // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. - var readBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerReadBufferSizeEnvName) ?? "65536"); - logger.LogDebug("Input stream buffer size: [{0}]", readBufferSize); - return readBufferSize > 0 ? new BufferedStream(GetStream(), readBufferSize) : GetStream(); - } + while (socket.Available > 0 && buffer.Count < len) + { + var currByte = new Byte[1]; + var byteCounter = socket.Receive(currByte, currByte.Length, SocketFlags.None); - /// - /// Returns a stream used to send data only. - /// - /// The underlying Stream instance that be used to send data - public Stream GetOutputStream() - { - // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. - var writeBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerWriteBufferSizeEnvName) ?? "65536"); - logger.LogDebug("Output stream buffer size: [{0}]", writeBufferSize); - return writeBufferSize > 0 ? new BufferedStream(GetStream(), writeBufferSize) : GetStream(); - } + if (byteCounter.Equals(1)) + { + buffer.Add(currByte[0]); + } + } - /// - /// Starts listening for incoming connections requests - /// - /// The maximum length of the pending connections queue. 
- public void Listen(int backlog = 16) - { - innerSocket.Listen(backlog); - } + return buffer.ToArray(); + } - /// - /// Receives network data from this socket, and returns a ByteBuf that contains the received data. - /// - /// The DefaultSocketWrapper does not support this function. - /// - /// A ByteBuf object that contains received data. - public ByteBuf Receive() - { - throw new NotImplementedException(); - } + /// + /// Returns the NetworkStream used to send and receive data. + /// + /// The underlying Stream instance that be used to send and receive data + /// + /// GetStream returns a NetworkStream that you can use to send and receive data. You must close/dispose + /// the NetworkStream by yourself. Closing DefaultSocketWrapper does not release the NetworkStream + /// + public Stream GetStream() + { + return new NetworkStream(innerSocket); + } - /// - /// Sends data to this socket with a ByteBuf object that contains data to be sent. - /// - /// The DefaultSocketWrapper does not support this function. - /// - /// A ByteBuf object that contains data to be sent - public void Send(ByteBuf data) - { - throw new NotImplementedException(); - } + /// + /// Returns a stream used to receive data only. + /// + /// The underlying Stream instance that be used to receive data + public Stream GetInputStream() + { + // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. + var readBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerReadBufferSizeEnvName) ?? "65536"); + logger.LogDebug("Input stream buffer size: [{0}]", readBufferSize); + return readBufferSize > 0 ? new BufferedStream(GetStream(), readBufferSize) : GetStream(); + } - /// - /// Disposes the resources used by this instance of the DefaultSocket class. - /// - /// - protected virtual void Dispose(bool disposing) - { - if (disposing) - { - innerSocket.Dispose(); - } - } + /// + /// Returns a stream used to send data only. + /// + /// The underlying Stream instance that be used to send data + public Stream GetOutputStream() + { + // The default buffer size is 64K, PythonRDD also use 64K as default buffer size. + var writeBufferSize = int.Parse(Environment.GetEnvironmentVariable(ConfigurationService.CSharpWorkerWriteBufferSizeEnvName) ?? "65536"); + logger.LogDebug("Output stream buffer size: [{0}]", writeBufferSize); + return writeBufferSize > 0 ? new BufferedStream(GetStream(), writeBufferSize) : GetStream(); + } - /// - /// Releases all resources used by the current instance of the DefaultSocket class. - /// - public void Dispose() - { - Dispose(true); - } + /// + /// Starts listening for incoming connections requests + /// + /// The maximum length of the pending connections queue. + public void Listen(int backlog = 16) + { + innerSocket.Listen(backlog); + } - /// - /// Frees resources used by DefaultSocket class - /// - ~DefaultSocketWrapper() - { - Dispose(false); - } + /// + /// Receives network data from this socket, and returns a ByteBuf that contains the received data. + /// + /// The DefaultSocketWrapper does not support this function. + /// + /// A ByteBuf object that contains received data. + public ByteBuf Receive() + { + throw new NotImplementedException(); + } - /// - /// Indicates whether there are data that has been received from the network and is available to be read. - /// - public bool HasData { get { return innerSocket.Available > 0; } } + /// + /// Sends data to this socket with a ByteBuf object that contains data to be sent. 
+ /// + /// The DefaultSocketWrapper does not support this function. + /// + /// A ByteBuf object that contains data to be sent + public void Send(ByteBuf data) + { + throw new NotImplementedException(); + } - /// - /// Returns the local endpoint. - /// - public EndPoint LocalEndPoint { get { return innerSocket.LocalEndPoint; } } + /// + /// Disposes the resources used by this instance of the DefaultSocket class. + /// + /// + protected virtual void Dispose(bool disposing) + { + if (disposing) + { + innerSocket.Dispose(); + } + } - /// - /// Returns the remote endpoint if it has one. - /// - public EndPoint RemoteEndPoint { get { return innerSocket.RemoteEndPoint; } } - } + /// + /// Releases all resources used by the current instance of the DefaultSocket class. + /// + public void Dispose() + { + Dispose(true); + } + + /// + /// Frees resources used by DefaultSocket class + /// + ~DefaultSocketWrapper() + { + Dispose(false); + } + + /// + /// Indicates whether there are data that has been received from the network and is available to be read. + /// + public bool HasData { get { return innerSocket.Available > 0; } } + + /// + /// Returns the local endpoint. + /// + public EndPoint LocalEndPoint { get { return innerSocket.LocalEndPoint; } } + + /// + /// Returns the remote endpoint if it has one. + /// + public EndPoint RemoteEndPoint { get { return innerSocket.RemoteEndPoint; } } + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs index b08dcd6..45b61d2 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/ISocketWrapper.cs @@ -11,7 +11,7 @@ namespace Microsoft.Spark.CSharp.Network /// ISocketWrapper interface defines the common methods to operate a socket (traditional socket or /// Windows Registered IO socket) /// - internal interface ISocketWrapper : IDisposable + public interface ISocketWrapper : IDisposable { /// /// Accepts a incoming connection request. @@ -24,12 +24,13 @@ namespace Microsoft.Spark.CSharp.Network /// void Close(); - /// - /// Establishes a connection to a remote host that is specified by an IP address and a port number - /// - /// The IP address of the remote host - /// The port number of the remote host - void Connect(IPAddress remoteaddr, int port); + /// + /// Establishes a connection to a remote host that is specified by an IP address and a port number + /// + /// The IP address of the remote host + /// The port number of the remote host + /// The secret to connect, can be null + void Connect(IPAddress remoteaddr, int port, string secret); /// /// Returns a stream used to send and receive data. 
diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs index 740787f..54e73ed 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/RioSocketWrapper.cs @@ -151,7 +151,7 @@ namespace Microsoft.Spark.CSharp.Network /// /// The IP address of the remote host /// The port number of the remote host - public void Connect(IPAddress remoteaddr, int port) + public void Connect(IPAddress remoteaddr, int port, string secret) { EnsureAccessible(); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs index cb8ed0f..505bf96 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SaeaSocketWrapper.cs @@ -111,7 +111,7 @@ namespace Microsoft.Spark.CSharp.Network /// /// The IP address of the remote host /// The port number of the remote host - public void Connect(IPAddress remoteaddr, int port) + public void Connect(IPAddress remoteaddr, int port, string secret) { var remoteEndPoint = new IPEndPoint(remoteaddr, port); innerSocket.Connect(remoteEndPoint); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs new file mode 100644 index 0000000..d14e5cc --- /dev/null +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Network/SocketInfo.cs @@ -0,0 +1,28 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Interop.Ipc; + +namespace Microsoft.Spark.CSharp.Network +{ + public class SocketInfo + { + public readonly int Port; + public readonly string Secret; + + public SocketInfo(int port, string secret) + { + Port = port; + Secret = secret; + } + + public static SocketInfo Parse(object o) + { + var oo = o as List; + if (oo == null) throw new Exception(o.ToString() + " is not socket info "+typeof(List)+" "+o.GetType()); + return new SocketInfo(int.Parse(oo[0].ObjectToString()), oo[1].ObjectToString()); + } + } +} diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs index 9928523..87071d9 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IDataFrameProxy.cs @@ -13,7 +13,7 @@ namespace Microsoft.Spark.CSharp.Proxy IRDDProxy JavaToCSharp(); string GetQueryExecution(); string GetExecutedPlan(); - string GetShowString(int numberOfRows, bool truncate); + string GetShowString(int numberOfRows, int truncate, bool vertical); bool IsLocal(); IStructTypeProxy GetSchema(); IRDDProxy ToJSON(); @@ -59,7 +59,9 @@ namespace Microsoft.Spark.CSharp.Proxy IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns); IDataFrameProxy Repartition(IColumnProxy[] columns); IDataFrameProxy Sample(bool withReplacement, double fraction, long seed); - IDataFrameWriterProxy Write(); + IDataFrameProxy Broadcast(); + + IDataFrameWriterProxy Write(); } internal interface IUDFProxy diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs index e323cf4..24788c0 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/IRDDProxy.cs @@ -7,6 
+7,7 @@ using System.Linq; using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Proxy { @@ -41,6 +42,6 @@ namespace Microsoft.Spark.CSharp.Proxy void SaveAsSequenceFile(string path, string compressionCodecClass); void SaveAsTextFile(string path, string compressionCodecClass); long Count(); - int CollectAndServe(); + SocketInfo CollectAndServe(); } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs index f1a00ac..a53fdab 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/ISparkContextProxy.cs @@ -8,6 +8,7 @@ using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Proxy @@ -50,7 +51,7 @@ namespace Microsoft.Spark.CSharp.Proxy void CancelJobGroup(string groupId); void CancelAllJobs(); IStatusTrackerProxy StatusTracker { get; } - int RunJob(IRDDProxy rdd, IEnumerable partitions); + SocketInfo RunJob(IRDDProxy rdd, IEnumerable partitions); IBroadcastProxy ReadBroadcastFromFile(string path, out long broadcastId); IRDDProxy CreateCSharpRdd(IRDDProxy prefvJavaRddReference, byte[] command, Dictionary environmentVariables, List pythonIncludes, bool preservePartitioning, List broadcastVariables, List accumulator); IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions, long partitionFuncId); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs index 177d33c..85c1210 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/DataFrameIpcProxy.cs @@ -79,12 +79,12 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(executedPlanReference, "toString", new object[] { }).ToString(); } - public string GetShowString(int numberOfRows, bool truncate) + public string GetShowString(int numberOfRows, int truncate, bool vertical) { return SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod( jvmDataFrameReference, "showString", - new object[] { numberOfRows, truncate }).ToString(); + new object[] { numberOfRows, truncate, vertical}).ToString(); } public bool IsLocal() @@ -575,7 +575,16 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc new object[] { withReplacement, fraction, seed }).ToString()), sqlContextProxy); } - public IDataFrameWriterProxy Write() + public IDataFrameProxy Broadcast() + { + return + new DataFrameIpcProxy( + new JvmObjectReference( + SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.functions", "broadcast", + new object[] { jvmDataFrameReference }).ToString()), sqlContextProxy); + } + + public IDataFrameWriterProxy Write() { return new DataFrameWriterIpcProxy(new JvmObjectReference( SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDataFrameReference, "write").ToString())); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs index 9377c07..3ef6577 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/RDDIpcProxy.cs @@ -12,6 +12,7 @@ using 
System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Network; namespace Microsoft.Spark.CSharp.Proxy.Ipc { @@ -66,10 +67,10 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc return long.Parse(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(rdd, "count").ToString()); } - public int CollectAndServe() + public SocketInfo CollectAndServe() { var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "rdd")); - return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "collectAndServe", new object[] { rdd }).ToString()); + return SocketInfo.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "collectAndServe", new object[] { rdd })); } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs index 01290fd..f48aa52 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkContextIpcProxy.cs @@ -11,6 +11,7 @@ using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Network; using Microsoft.Spark.CSharp.Proxy.Ipc; namespace Microsoft.Spark.CSharp.Proxy.Ipc @@ -134,10 +135,8 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc public void Accumulator(int port) { - jvmAccumulatorReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "accumulator", - SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList"), - SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonAccumulatorParam", IPAddress.Loopback.ToString(), port) - )); + jvmAccumulatorReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonAccumulatorV2", IPAddress.Loopback.ToString(), port); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSparkContextReference, "register", new object[] { jvmAccumulatorReference }); } public void Stop() @@ -241,7 +240,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc public void SetJobGroup(string groupId, string description, bool interruptOnCancel) { - SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "setCheckpointDir", new object[] { groupId, description, interruptOnCancel }); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "setJobGroup", new object[] { groupId, description, interruptOnCancel }); } public void SetLocalProperty(string key, string value) @@ -344,10 +343,10 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc } - public int RunJob(IRDDProxy rdd, IEnumerable partitions) + public SocketInfo RunJob(IRDDProxy rdd, IEnumerable partitions) { var jpartitions = JvmBridgeUtils.GetJavaList(partitions); - return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "runJob", new object[] { jvmSparkContextReference, (rdd as RDDIpcProxy).JvmRddReference, jpartitions }).ToString()); + return SocketInfo.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "runJob", new object[] { jvmSparkContextReference, (rdd as RDDIpcProxy).JvmRddReference, jpartitions })); } public IBroadcastProxy 
ReadBroadcastFromFile(string path, out long broadcastId) diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs index febfd3b..bc6e5a1 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SparkSessionIpcProxy.cs @@ -27,7 +27,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc } } - public ISqlContextProxy SqlContextProxy + internal JvmObjectReference JvmReference => jvmSparkSessionReference; + + public ISqlContextProxy SqlContextProxy { get { return sqlContextProxy; } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs index 4bb930f..d6f0098 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Proxy/Ipc/SqlContextIpcProxy.cs @@ -106,7 +106,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc var udf = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.sql.execution.python.UserDefinedPythonFunction", new object[] { - name, function, dt + name, function, dt, 100 /*BatchUDF*/, true /*deterministic*/ }); SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(judf, "registerPython", new object[] { name, udf }); diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs index 66601ca..b288baa 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrame.cs @@ -6,7 +6,9 @@ using System.Collections.Generic; using System.Globalization; using System.Linq; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Proxy.Ipc; using Microsoft.Spark.CSharp.Services; namespace Microsoft.Spark.CSharp.Sql @@ -66,10 +68,12 @@ namespace Microsoft.Spark.CSharp.Sql } } - /// - /// Returns true if the collect and take methods can be run locally (without any Spark executors). - /// - public bool IsLocal + internal JvmObjectReference JvmReference => (dataFrameProxy as DataFrameIpcProxy)?.JvmDataFrameReference; + + /// + /// Returns true if the collect and take methods can be run locally (without any Spark executors). + /// + public bool IsLocal { get { @@ -145,10 +149,11 @@ namespace Microsoft.Spark.CSharp.Sql /// /// Number of rows to display - default 20 /// Indicates if strings more than 20 characters long will be truncated - public void Show(int numberOfRows = 20, bool truncate = true) + /// If set to True, print output rows vertically (one line per column value). 
+ public void Show(int numberOfRows = 20, int truncate = 20, bool vertical = false) { logger.LogInfo("Writing {0} rows in the DataFrame to Console output", numberOfRows); - Console.WriteLine(dataFrameProxy.GetShowString(numberOfRows, truncate)); + Console.WriteLine(dataFrameProxy.GetShowString(numberOfRows, truncate, vertical)); } /// @@ -166,8 +171,8 @@ namespace Microsoft.Spark.CSharp.Sql /// public IEnumerable Collect() { - int port = RddProxy.CollectAndServe(); - return Rdd.Collect(port).Cast(); + var info = RddProxy.CollectAndServe(); + return Rdd.Collect(info).Cast(); } //TODO - add this method if needed to convert Row to collection of T @@ -917,10 +922,11 @@ namespace Microsoft.Spark.CSharp.Sql /// /// Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`) /// + /// Persist storage type // Python API: https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py persist(self, storageLevel) - public DataFrame Persist() + public DataFrame Persist(StorageLevelType type= StorageLevelType.MEMORY_AND_DISK) { - dataFrameProxy.Persist(StorageLevelType.MEMORY_AND_DISK); + dataFrameProxy.Persist(type); return this; } @@ -944,6 +950,11 @@ namespace Microsoft.Spark.CSharp.Sql return Persist(); } + public DataFrame Broadcast() + { + return new DataFrame(dataFrameProxy.Broadcast(), sparkContext); + } + /// /// Returns a new DataFrame that has exactly `numPartitions` partitions. /// diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs index 04fcc90..c27700e 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameReader.cs @@ -159,5 +159,18 @@ namespace Microsoft.Spark.CSharp.Sql logger.LogInfo("Constructing DataFrame using Parquet source {0}", string.Join(";", path)); return new DataFrame(dataFrameReaderProxy.Parquet(path), sparkContext); } - } + + /// + /// Loads a AVRO file (one object per line) and returns the result as a DataFrame. + /// + /// This function goes through the input once to determine the input schema. If you know the + /// schema in advance, use the version that specifies the schema to avoid the extra scan. + /// + /// input path + public DataFrame Avro(string path) + { + logger.LogInfo("Constructing DataFrame using AVRO source {0}", path); + return Format("com.databricks.spark.avro").Load(path); + } + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs index a16478d..9fa9fdb 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/DataFrameWriter.cs @@ -170,5 +170,16 @@ namespace Microsoft.Spark.CSharp.Sql { Format("parquet").Save(path); } - } + + /// + /// Saves the content of the DataFrame in AVRO format at the specified path. 
+ /// This is equivalent to: + /// Format("com.databricks.spark.avro").Save(path) + /// + public void Avro(string path) + { + Format("com.databricks.spark.avro").Save(path); + } + + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs index b3a81cf..bc89168 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Dataset.cs @@ -92,7 +92,8 @@ namespace Microsoft.Spark.CSharp.Sql /// /// Number of rows - default is 20 /// Indicates if rows with more than 20 characters to be truncated - public void Show(int numberOfRows = 20, bool truncate = true) + /// If set to true, prints output rows vertically (one line per column value). + public void Show(int numberOfRows = 20, int truncate = 20, bool vertical = false) { ToDF().Show(numberOfRows, truncate); } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs index c9166fe..a23d91a 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Functions.cs @@ -4,6 +4,8 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; +using System.Runtime.Serialization; using System.Text; using System.Threading.Tasks; @@ -1119,5 +1121,42 @@ namespace Microsoft.Spark.CSharp.Sql return input.Select(a => func((A1)(a[0]), (A2)(a[1]), (A3)(a[2]), (A4)(a[3]), (A5)(a[4]), (A6)(a[5]), (A7)(a[6]), (A8)(a[7]), (A9)(a[8]), (A10)(a[9]))).Cast(); } } - #endregion + + [Serializable] + internal class UdfReflectionHelper + { + private readonly MethodInfo func; + + [NonSerialized] + private object[] _cache; + + internal UdfReflectionHelper(MethodInfo f) + { + func = f; + _cache = new object[func.GetParameters().Length]; + } + + public Type ReturnType => func.ReturnType; + + [OnDeserialized()] + public void Init(StreamingContext context) + { + _cache = new object[func.GetParameters().Length]; + } + + internal IEnumerable Execute(int pid, IEnumerable input) + { + return input.Select(Run).Cast(); + } + + private dynamic Run(dynamic input) + { + for (int i = 0; i < _cache.Length; ++i) + { + _cache[i] = input[i]; + } + return func.Invoke(null, _cache); + } + } + #endregion } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs index 77614a7..a299d1a 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Row.cs @@ -18,17 +18,24 @@ namespace Microsoft.Spark.CSharp.Sql [NonSerialized] private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(Row)); - /// - /// Number of elements in the Row. - /// - /// elements count in this row - public abstract int Size(); + public abstract dynamic[] Values { get; } + + /// + /// Number of elements in the Row. + /// + /// elements count in this row + public abstract int Size(); /// /// Schema for the row. /// public abstract StructType GetSchema(); + public virtual void ResetValues(dynamic[] values) + { + throw new NotImplementedException(); + } + /// /// Returns the value at position i. 
/// @@ -80,8 +87,22 @@ namespace Microsoft.Spark.CSharp.Sql internal class RowImpl : Row { private readonly StructType schema; - public dynamic[] Values { get { return values; } } - private readonly dynamic[] values; + + public override dynamic[] Values + { + get + { + if (!valuesConverted) + { + schema.ConvertPickleObjects(rawValues,rawValues); + valuesConverted = true; + } + return rawValues; + } + } + + private dynamic[] rawValues; + private bool valuesConverted = false; private readonly int columnCount; @@ -96,11 +117,11 @@ namespace Microsoft.Spark.CSharp.Sql { if (data is dynamic[]) { - values = data as dynamic[]; + rawValues = data as dynamic[]; } else if (data is List) { - values = (data as List).ToArray(); + rawValues = (data as List).ToArray(); } else { @@ -109,17 +130,25 @@ namespace Microsoft.Spark.CSharp.Sql this.schema = schema; - columnCount = values.Count(); - int schemaColumnCount = this.schema.Fields.Count(); + columnCount = rawValues.Length; + int schemaColumnCount = this.schema.Fields.Count; if (columnCount != schemaColumnCount) { throw new Exception(string.Format("column count inferred from data ({0}) and schema ({1}) mismatch", columnCount, schemaColumnCount)); } - - Initialize(); } - public override int Size() + public override void ResetValues(dynamic[] values) + { + if (columnCount != values.Length) + { + throw new ArgumentException("column count inferred from data and schema mismatch"); + } + rawValues = values; + valuesConverted = false; + } + + public override int Size() { return columnCount; } @@ -131,16 +160,15 @@ namespace Microsoft.Spark.CSharp.Sql public override dynamic Get(int i) { + if (i >= 0 && i < columnCount) return Values[i]; if (i >= columnCount) { throw new Exception(string.Format("i ({0}) >= columnCount ({1})", i, columnCount)); } - else if(i < 0) + else { throw new Exception(string.Format("i ({0}) < 0", i)); } - - return values[i]; } public override dynamic Get(string columnName) @@ -152,7 +180,7 @@ namespace Microsoft.Spark.CSharp.Sql public override string ToString() { List cols = new List(); - foreach (var item in values) + foreach (var item in Values) { if (item != null) { @@ -166,73 +194,7 @@ namespace Microsoft.Spark.CSharp.Sql return string.Format("[{0}]", string.Join(",", cols.ToArray())); } - - - private void Initialize() - { - - int index = 0; - foreach (var field in schema.Fields) - { - if (field.DataType is ArrayType) - { - Func convertArrayTypeToStructTypeFunc = (dataType, length) => - { - StructField[] fields = new StructField[length]; - for(int i = 0; i < length ; i++) - { - fields[i] = new StructField(string.Format("_array_{0}", i), dataType); - } - return new StructType(fields); - }; - var elementType = (field.DataType as ArrayType).ElementType; - - // Note: When creating object from json, PySpark converts Json array to Python List (https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/types.py, _create_cls(dataType)), - // then Pyrolite unpickler converts Python List to C# ArrayList (https://github.com/irmen/Pyrolite/blob/v4.10/README.txt). So values[index] should be of type ArrayList; - // In case Python changes its implementation, which means value is not of type ArrayList, try cast to object[] because Pyrolite unpickler convert Python Tuple to C# object[]. - object[] valueOfArray = values[index] is ArrayList ? 
(values[index] as ArrayList).ToArray() : values[index] as object[]; - if (valueOfArray == null) - { - throw new ArgumentException("Cannot parse data of ArrayType: " + field.Name); - } - - values[index] = new RowImpl(valueOfArray, elementType as StructType ?? convertArrayTypeToStructTypeFunc(elementType, valueOfArray.Length)).values; - } - else if (field.DataType is MapType) - { - //TODO - throw new NotImplementedException(); - } - else if (field.DataType is StructType) - { - dynamic value = values[index]; - if (value != null) - { - var subRow = new RowImpl(values[index], field.DataType as StructType); - values[index] = subRow; - } - } - else if (field.DataType is DecimalType) - { - //TODO - throw new NotImplementedException(); - } - else if (field.DataType is DateType) - { - //TODO - throw new NotImplementedException(); - } - else if (field.DataType is StringType) - { - if (values[index] != null) values[index] = values[index].ToString(); - } - else - { - values[index] = values[index]; - } - index++; - } - } + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs index 96b50c2..25726ad 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/RowConstructor.cs @@ -78,7 +78,7 @@ namespace Microsoft.Spark.CSharp.Sql currentSchema = null; return row; } - + //removes objects of type RowConstructor and replacing them with actual values private object[] GetValues(object[] arguments) { @@ -86,7 +86,7 @@ namespace Microsoft.Spark.CSharp.Sql int i = 0; foreach (var argument in arguments) { - if (argument != null && argument.GetType() == typeof(RowConstructor)) + if (argument is RowConstructor) { values[i++] = (argument as RowConstructor).Values; } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs index c4f7288..aa70216 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SparkSession.cs @@ -9,7 +9,9 @@ using System.Runtime.Remoting.Contexts; using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Proxy; +using Microsoft.Spark.CSharp.Proxy.Ipc; using Microsoft.Spark.CSharp.Services; using Microsoft.Spark.CSharp.Sql.Catalog; @@ -42,10 +44,12 @@ namespace Microsoft.Spark.CSharp.Sql get { return catalog ?? (catalog = new Catalog.Catalog(SparkSessionProxy.GetCatalog())); } } - /// - /// Interface through which the user may access the underlying SparkContext. - /// - public SparkContext SparkContext { get; private set; } + internal JvmObjectReference JvmReference => (sparkSessionProxy as SparkSessionIpcProxy)?.JvmReference; + + /// + /// Interface through which the user may access the underlying SparkContext. + /// + public SparkContext SparkContext { get; private set; } public UdfRegistration Udf { @@ -114,18 +118,30 @@ namespace Microsoft.Spark.CSharp.Sql // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. 
- var rddRow = rdd.Map(r => r); + var rddRow = rdd.MapPartitions(r => r.Select(rr => rr)); rddRow.serializedMode = SerializedMode.Row; return new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), SparkContext); } - /// - /// Returns the specified table as a - /// - /// - /// - public DataFrame Table(string tableName) + public DataFrame CreateDataFrame(RDD rdd, StructType schema) + { + // Note: This is for pickling RDD, convert to RDD which happens in CSharpWorker. + // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. + // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. + // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. + var rddRow = rdd.MapPartitions(rows => rows.Select(r => r.Values)); + rddRow.serializedMode = SerializedMode.Row; + + return new DataFrame(sparkSessionProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), SparkContext); + } + + /// + /// Returns the specified table as a + /// + /// + /// + public DataFrame Table(string tableName) { return new DataFrame(sparkSessionProxy.Table(tableName), SparkContext); } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs index 03e9fb2..c99e901 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/SqlContext.cs @@ -3,6 +3,7 @@ using System; using System.Collections.Generic; +using System.Reflection; using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Proxy; using Microsoft.Spark.CSharp.Services; @@ -150,13 +151,25 @@ namespace Microsoft.Spark.CSharp.Sql return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext); } - /// - /// Registers the given as a temporary table in the catalog. - /// Temporary tables exist only during the lifetime of this instance of SqlContext. - /// - /// - /// - public void RegisterDataFrameAsTable(DataFrame dataFrame, string tableName) + public DataFrame CreateDataFrame(RDD rdd, StructType schema) + { + // Note: This is for pickling RDD, convert to RDD which happens in CSharpWorker. + // The below sqlContextProxy.CreateDataFrame() will call byteArrayRDDToAnyArrayRDD() of SQLUtils.scala which only accept RDD of type RDD[Array[Byte]]. + // In byteArrayRDDToAnyArrayRDD() of SQLUtils.scala, the SerDeUtil.pythonToJava() will be called which is a mapPartitions inside. + // It will be executed until the CSharpWorker finishes Pickling to RDD[Array[Byte]]. + var rddRow = rdd.Map(r => r); + rddRow.serializedMode = SerializedMode.Row; + + return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext); + } + + /// + /// Registers the given as a temporary table in the catalog. + /// Temporary tables exist only during the lifetime of this instance of SqlContext. 
+ /// + /// + /// + public void RegisterDataFrameAsTable(DataFrame dataFrame, string tableName) { sqlContextProxy.RegisterDataFrameAsTable(dataFrame.DataFrameProxy, tableName); } @@ -527,6 +540,14 @@ namespace Microsoft.Spark.CSharp.Sql Func, IEnumerable> udfHelper = new UdfHelper(f).Execute; sqlContextProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT))); } - #endregion - } + + public void RegisterFunction(string name, MethodInfo f) + { + logger.LogInfo("Name of the function to register {0}, method info", name, f.DeclaringType?.FullName + "." + f.Name); + var helper = new UdfReflectionHelper(f); + Func, IEnumerable> udfHelper = helper.Execute; + sqlContextProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(helper.ReturnType)); + } + #endregion + } } \ No newline at end of file diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs index 2efcf20..ef945c3 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/Types.cs @@ -2,6 +2,7 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information. using System; +using System.Collections; using System.Collections.Generic; using System.Linq; using System.Reflection; @@ -14,512 +15,600 @@ using Newtonsoft.Json.Linq; namespace Microsoft.Spark.CSharp.Sql { - /// - /// The base type of all Spark SQL data types. - /// - [Serializable] - public abstract class DataType - { - /// - /// Trim "Type" in the end from class name, ToLower() to align with Scala. - /// - public string TypeName - { - get { return NormalizeTypeName(GetType().Name); } - } + /// + /// The base type of all Spark SQL data types. + /// + [Serializable] + public abstract class DataType + { + /// + /// Trim "Type" in the end from class name, ToLower() to align with Scala. + /// + public string TypeName + { + get { return NormalizeTypeName(GetType().Name); } + } - /// - /// return TypeName by default, subclass can override it - /// - public virtual string SimpleString - { - get { return TypeName; } - } + /// + /// return TypeName by default, subclass can override it + /// + public virtual string SimpleString + { + get { return TypeName; } + } - /// - /// return only type: TypeName by default, subclass can override it - /// - internal virtual object JsonValue { get { return TypeName; } } + /// + /// return only type: TypeName by default, subclass can override it + /// + internal virtual object JsonValue { get { return TypeName; } } - /// - /// The compact JSON representation of this data type. - /// - public string Json - { - get - { - var jObject = JsonValue is JObject ? ((JObject)JsonValue).SortProperties() : JsonValue; - return JsonConvert.SerializeObject(jObject, Formatting.None); - } - } + /// + /// The compact JSON representation of this data type. + /// + public string Json + { + get + { + var jObject = JsonValue is JObject ? ((JObject)JsonValue).SortProperties() : JsonValue; + return JsonConvert.SerializeObject(jObject, Formatting.None); + } + } - /// - /// Parses a Json string to construct a DataType. 
- /// - /// The Json string to be parsed - /// The new DataType instance from the Json string - public static DataType ParseDataTypeFromJson(string json) - { - return ParseDataTypeFromJson(JToken.Parse(json)); - } + /// + /// Parses a Json string to construct a DataType. + /// + /// The Json string to be parsed + /// The new DataType instance from the Json string + public static DataType ParseDataTypeFromJson(string json) + { + return ParseDataTypeFromJson(JToken.Parse(json)); + } - /// - /// Parse a JToken object to construct a DataType. - /// - /// The JToken object to be parsed - /// The new DataType instance from the Json string - /// Not implemented for "udt" type - /// - protected static DataType ParseDataTypeFromJson(JToken json) - { - if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...} - { - JToken type; - var typeJObject = (JObject)json; - if (typeJObject.TryGetValue("type", out type)) - { - Type complexType; - if ((complexType = ComplexTypes.FirstOrDefault(ct => NormalizeTypeName(ct.Name) == type.ToString())) != default(Type)) - { - return ((ComplexType)Activator.CreateInstance(complexType, BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance - , null, new object[] { typeJObject }, null)); // create new instance of ComplexType - } - if (type.ToString() == "udt") - { - // TODO - throw new NotImplementedException(); - } - } - throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); - } - else // {name: age, type: bigint,...} // TODO: validate more JTokenType other than Object - { - return ParseAtomicType(json); - } + /// + /// Parse a JToken object to construct a DataType. + /// + /// The JToken object to be parsed + /// The new DataType instance from the Json string + /// Not implemented for "udt" type + /// + protected static DataType ParseDataTypeFromJson(JToken json) + { + if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...} + { + JToken type; + var typeJObject = (JObject)json; + if (typeJObject.TryGetValue("type", out type)) + { + Type complexType; + if ((complexType = ComplexTypes.FirstOrDefault(ct => NormalizeTypeName(ct.Name) == type.ToString())) != default(Type)) + { + return ((ComplexType)Activator.CreateInstance(complexType, BindingFlags.Public | BindingFlags.NonPublic | BindingFlags.Instance + , null, new object[] { typeJObject }, null)); // create new instance of ComplexType + } + if (type.ToString() == "udt") + { + // TODO + throw new NotImplementedException(); + } + } + throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); + } + else // {name: age, type: bigint,...} // TODO: validate more JTokenType other than Object + { + return ParseAtomicType(json); + } - } + } - private static AtomicType ParseAtomicType(JToken type) - { - Type atomicType; - if ((atomicType = AtomicTypes.FirstOrDefault(at => NormalizeTypeName(at.Name) == type.ToString())) != default(Type)) - { - return (AtomicType)Activator.CreateInstance(atomicType); // create new instance of AtomicType - } + private static AtomicType ParseAtomicType(JToken type) + { + Type atomicType; + if ((atomicType = AtomicTypes.FirstOrDefault(at => NormalizeTypeName(at.Name) == type.ToString())) != default(Type)) + { + return (AtomicType)Activator.CreateInstance(atomicType); // create new instance of AtomicType + } - Match fixedDecimal = DecimalType.FixedDecimal.Match(type.ToString()); - if (fixedDecimal.Success) - { - return new DecimalType(int.Parse(fixedDecimal.Groups[1].Value), 
int.Parse(fixedDecimal.Groups[2].Value)); - } + Match fixedDecimal = DecimalType.FixedDecimal.Match(type.ToString()); + if (fixedDecimal.Success) + { + return new DecimalType(int.Parse(fixedDecimal.Groups[1].Value), int.Parse(fixedDecimal.Groups[2].Value)); + } - throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); - } + throw new ArgumentException(string.Format("Could not parse data type: {0}", type)); + } - [NonSerialized] - private static readonly Type[] AtomicTypes = typeof(AtomicType).Assembly.GetTypes().Where(type => - type.IsSubclassOf(typeof(AtomicType))).ToArray(); + [NonSerialized] + private static readonly Type[] AtomicTypes = typeof(AtomicType).Assembly.GetTypes().Where(type => + type.IsSubclassOf(typeof(AtomicType))).ToArray(); - [NonSerialized] - private static readonly Type[] ComplexTypes = typeof(ComplexType).Assembly.GetTypes().Where(type => - type.IsSubclassOf(typeof(ComplexType))).ToArray(); + [NonSerialized] + private static readonly Type[] ComplexTypes = typeof(ComplexType).Assembly.GetTypes().Where(type => + type.IsSubclassOf(typeof(ComplexType))).ToArray(); - [NonSerialized] - private static readonly Func NormalizeTypeName = s => s.Substring(0, s.Length - 4).ToLower(); // trim "Type" at the end of type name + [NonSerialized] + private static readonly Func NormalizeTypeName = s => s.Substring(0, s.Length - 4).ToLower(); // trim "Type" at the end of type name - } + } - /// - /// An internal type used to represent a simple type. - /// - [Serializable] - public class AtomicType : DataType - { - } + /// + /// An internal type used to represent a simple type. + /// + [Serializable] + public class AtomicType : DataType + { + } - /// - /// An internal type used to represent a complex type (such as arrays, structs, and maps). - /// - [Serializable] - public abstract class ComplexType : DataType - { - /// - /// Abstract method that constructs a complex type from a Json object - /// - /// The Json object to construct a complex type - /// A new constructed complex type - public abstract DataType FromJson(JObject json); - /// - /// Constructs a complex type from a Json string - /// - /// The string that represents a Json. - /// A new constructed complex type - public DataType FromJson(string json) - { - return FromJson(JObject.Parse(json)); - } - } + /// + /// An internal type used to represent a complex type (such as arrays, structs, and maps). + /// + [Serializable] + public abstract class ComplexType : DataType + { + /// + /// Abstract method that constructs a complex type from a Json object + /// + /// The Json object to construct a complex type + /// A new constructed complex type + public abstract DataType FromJson(JObject json); + /// + /// Constructs a complex type from a Json string + /// + /// The string that represents a Json. + /// A new constructed complex type + public DataType FromJson(string json) + { + return FromJson(JObject.Parse(json)); + } + } - /// - /// The data type representing NULL values. - /// - [Serializable] - public class NullType : AtomicType { } + /// + /// The data type representing NULL values. + /// + [Serializable] + public class NullType : AtomicType { } - /// - /// The data type representing String values. - /// - [Serializable] - public class StringType : AtomicType { } + /// + /// The data type representing String values. + /// + [Serializable] + public class StringType : AtomicType { } - /// - /// The data type representing binary values. 
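To illustrate the parsing path above, a hedged sketch; the decimal case relies on the relaxed DecimalType.FixedDecimal regex introduced in this patch, which no longer requires a space after the comma:

    // assumes: using Microsoft.Spark.CSharp.Sql;
    // Plain atomic type names resolve to the matching AtomicType subclass,
    // while fixed-precision decimals go through DecimalType.FixedDecimal.
    DataType intType = DataType.ParseDataTypeFromJson("\"integer\"");       // IntegerType
    DataType decType = DataType.ParseDataTypeFromJson("\"decimal(10,2)\""); // DecimalType(10, 2)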
- /// - [Serializable] - public class BinaryType : AtomicType { } + /// + /// The data type representing binary values. + /// + [Serializable] + public class BinaryType : AtomicType { } - /// - /// The data type representing Boolean values. - /// - [Serializable] - public class BooleanType : AtomicType { } + /// + /// The data type representing Boolean values. + /// + [Serializable] + public class BooleanType : AtomicType { } - /// - /// The data type representing Date values. - /// - [Serializable] - public class DateType : AtomicType { } + /// + /// The data type representing Date values. + /// + [Serializable] + public class DateType : AtomicType { } - /// - /// The data type representing Timestamp values. - /// - [Serializable] - public class TimestampType : AtomicType { } + /// + /// The data type representing Timestamp values. + /// + [Serializable] + public class TimestampType : AtomicType { } - /// - /// The data type representing Double values. - /// - [Serializable] - public class DoubleType : AtomicType { } + /// + /// The data type representing Double values. + /// + [Serializable] + public class DoubleType : AtomicType { } - /// - /// - /// - [Serializable] - public class FloatType : AtomicType { } + /// + /// + /// + [Serializable] + public class FloatType : AtomicType { } - /// - /// The data type representing Float values. - /// - [Serializable] - public class ByteType : AtomicType { } + /// + /// The data type representing Float values. + /// + [Serializable] + public class ByteType : AtomicType { } - /// - /// - /// - [Serializable] - public class IntegerType : AtomicType { } + /// + /// + /// + [Serializable] + public class IntegerType : AtomicType { } - /// - /// The data type representing Int values. - /// - [Serializable] - public class LongType : AtomicType { } + /// + /// The data type representing Int values. + /// + [Serializable] + public class LongType : AtomicType { } - /// - /// The data type representing Short values. - /// - [Serializable] - public class ShortType : AtomicType { } + /// + /// The data type representing Short values. + /// + [Serializable] + public class ShortType : AtomicType { } - /// - /// The data type representing Decimal values. - /// - [Serializable] - public class DecimalType : AtomicType - { - /// - /// Gets the regular expression that represents a fixed decimal. - /// - public static Regex FixedDecimal = new Regex(@"decimal\((\d+),\s(\d+)\)"); - private int? precision, scale; - /// - /// Initializes a new instance of DecimalType from parameters specifying its precision and scale. - /// - /// The precision of the type - /// The scale of the type - public DecimalType(int? precision = null, int? scale = null) - { - this.precision = precision; - this.scale = scale; - } + /// + /// The data type representing Decimal values. + /// + [Serializable] + public class DecimalType : AtomicType + { + /// + /// Gets the regular expression that represents a fixed decimal. + /// + public static Regex FixedDecimal = new Regex(@"decimal\s*\((\d+),\s*(\d+)\)"); + private int? precision, scale; + /// + /// Initializes a new instance of DecimalType from parameters specifying its precision and scale. + /// + /// The precision of the type + /// The scale of the type + public DecimalType(int? precision = null, int? 
scale = null) + { + this.precision = precision; + this.scale = scale; + } - internal override object JsonValue - { - get { throw new NotImplementedException(); } - } + internal override object JsonValue + { + get + { + if (precision == null && scale == null) return "decimal"; + return "decimal(" + precision + "," + scale + ")"; + } + } - /// - /// Constructs a DecimalType from a Json object - /// - /// The Json object used to construct a DecimalType - /// A new DecimalType instance - /// Not implemented yet. - public DataType FromJson(JObject json) - { - throw new NotImplementedException(); - } - } + /// + /// Constructs a DecimalType from a Json object + /// + /// The Json object used to construct a DecimalType + /// A new DecimalType instance + /// Not implemented yet. + public DataType FromJson(JObject json) + { + return ParseDataTypeFromJson(json); + } + } - /// - /// The data type for collections of multiple values. - /// - [Serializable] - public class ArrayType : ComplexType - { - /// - /// Gets the DataType of each element in the array - /// - public DataType ElementType { get { return elementType; } } - /// - /// Returns whether the array can contain null (None) values - /// - public bool ContainsNull { get { return containsNull; } } + /// + /// The data type for collections of multiple values. + /// + [Serializable] + public class ArrayType : ComplexType + { + /// + /// Gets the DataType of each element in the array + /// + public DataType ElementType { get { return elementType; } } + /// + /// Returns whether the array can contain null (None) values + /// + public bool ContainsNull { get { return containsNull; } } - /// - /// Initializes a ArrayType instance with a specific DataType and specifying if the array has null values. - /// - /// The data type of values - /// Indicates if values have null values - public ArrayType(DataType elementType, bool containsNull = true) - { - this.elementType = elementType; - this.containsNull = containsNull; - } + /// + /// Initializes a ArrayType instance with a specific DataType and specifying if the array has null values. + /// + /// The data type of values + /// Indicates if values have null values + public ArrayType(DataType elementType, bool containsNull = true) + { + this.elementType = elementType; + this.containsNull = containsNull; + } - internal ArrayType(JObject json) - { - FromJson(json); - } + internal ArrayType(JObject json) + { + FromJson(json); + } - /// - /// Readable string representation for the type. - /// - public override string SimpleString - { - get { return string.Format("array<{0}>", elementType.SimpleString); } - } + /// + /// Readable string representation for the type. 
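A brief illustration of the JsonValue behaviour added above (a sketch; variable names are illustrative):

    // assumes: using System; using Microsoft.Spark.CSharp.Sql;
    var unbounded = new DecimalType();     // JsonValue -> "decimal"
    var money = new DecimalType(10, 2);    // JsonValue -> "decimal(10,2)"
    Console.WriteLine(money.Json);         // prints the same value serialized as a JSON string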
+ /// + public override string SimpleString + { + get { return string.Format("array<{0}>", elementType.SimpleString); } + } - internal override object JsonValue - { - get - { - return new JObject( - new JProperty("type", TypeName), - new JProperty("elementType", elementType.JsonValue), - new JProperty("containsNull", containsNull)); - } - } + internal override object JsonValue + { + get + { + return new JObject( + new JProperty("type", TypeName), + new JProperty("elementType", elementType.JsonValue), + new JProperty("containsNull", containsNull)); + } + } - /// - /// Constructs a ArrayType from a Json object - /// - /// The Json object used to construct a ArrayType - /// A new ArrayType instance - public override sealed DataType FromJson(JObject json) - { - elementType = ParseDataTypeFromJson(json["elementType"]); - containsNull = (bool)json["containsNull"]; - return this; - } + /// + /// Constructs a ArrayType from a Json object + /// + /// The Json object used to construct a ArrayType + /// A new ArrayType instance + public override sealed DataType FromJson(JObject json) + { + elementType = ParseDataTypeFromJson(json["elementType"]); + containsNull = (bool)json["containsNull"]; + return this; + } - private DataType elementType; - private bool containsNull; - } + private DataType elementType; + private bool containsNull; + } - /// - /// The data type for Maps. Not implemented yet. - /// - [Serializable] - public class MapType : ComplexType - { - internal override object JsonValue - { - get { throw new NotImplementedException(); } - } + /// + /// The data type for Maps. Not implemented yet. + /// + [Serializable] + public class MapType : ComplexType + { + internal override object JsonValue + { + get { throw new NotImplementedException(); } + } - /// - /// Constructs a StructField from a Json object. Not implemented yet. - /// - /// The Json object used to construct a MapType - /// A new MapType instance - /// - public override DataType FromJson(JObject json) - { - throw new NotImplementedException(); - } - } + /// + /// Constructs a StructField from a Json object. Not implemented yet. + /// + /// The Json object used to construct a MapType + /// A new MapType instance + /// + public override DataType FromJson(JObject json) + { + throw new NotImplementedException(); + } + } - /// - /// A field inside a StructType. - /// - [Serializable] - public class StructField : ComplexType - { - /// - /// The name of this field. - /// - public string Name { get { return name; } } - /// - /// The data type of this field. - /// - public DataType DataType { get { return dataType; } } - /// - /// Indicates if values of this field can be null values. - /// - public bool IsNullable { get { return isNullable; } } - /// - /// The metadata of this field. The metadata should be preserved during transformation if the content of the column is not modified, e.g, in selection. - /// - public JObject Metadata { get { return metadata; } } + /// + /// A field inside a StructType. + /// + [Serializable] + public class StructField : ComplexType + { + /// + /// The name of this field. + /// + public string Name { get { return name; } } + /// + /// The data type of this field. + /// + public DataType DataType { get { return dataType; } } + /// + /// Indicates if values of this field can be null values. + /// + public bool IsNullable { get { return isNullable; } } + /// + /// The metadata of this field. The metadata should be preserved during transformation if the content of the column is not modified, e.g, in selection. 
+ /// + public JObject Metadata { get { return metadata; } } - /// - /// Initializes a StructField instance with a specific name, data type, nullable, and metadata - /// - /// The name of this field - /// The data type of this field - /// Indicates if values of this field can be null values - /// The metadata of this field - public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null) - { - this.name = name; - this.dataType = dataType; - this.isNullable = isNullable; - this.metadata = metadata ?? new JObject(); - } + /// + /// Initializes a StructField instance with a specific name, data type, nullable, and metadata + /// + /// The name of this field + /// The data type of this field + /// Indicates if values of this field can be null values + /// The metadata of this field + public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null) + { + this.name = name; + this.dataType = dataType; + this.isNullable = isNullable; + this.metadata = metadata ?? new JObject(); + } - internal StructField(JObject json) - { - FromJson(json); - } + internal StructField(JObject json) + { + FromJson(json); + } - /// - /// Returns a readable string that represents the type. - /// - public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } } + /// + /// Returns a readable string that represents the type. + /// + public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } } - internal override object JsonValue - { - get - { - return new JObject( - new JProperty("name", name), - new JProperty("type", dataType.JsonValue), - new JProperty("nullable", isNullable), - new JProperty("metadata", metadata)); - } - } + internal override object JsonValue + { + get + { + return new JObject( + new JProperty("name", name), + new JProperty("type", dataType.JsonValue), + new JProperty("nullable", isNullable), + new JProperty("metadata", metadata)); + } + } - /// - /// Constructs a StructField from a Json object - /// - /// The Json object used to construct a StructField - /// A new StructField instance - public override sealed DataType FromJson(JObject json) - { - name = json["name"].ToString(); - dataType = ParseDataTypeFromJson(json["type"]); - isNullable = (bool)json["nullable"]; - metadata = (JObject)json["metadata"]; - return this; - } + /// + /// Constructs a StructField from a Json object + /// + /// The Json object used to construct a StructField + /// A new StructField instance + public override sealed DataType FromJson(JObject json) + { + name = json["name"].ToString(); + dataType = ParseDataTypeFromJson(json["type"]); + isNullable = (bool)json["nullable"]; + metadata = (JObject)json["metadata"]; + return this; + } - private string name; - private DataType dataType; - private bool isNullable; - [NonSerialized] - private JObject metadata; - } + private string name; + private DataType dataType; + private bool isNullable; + [NonSerialized] + private JObject metadata; + } - /// - /// Struct type, consisting of a list of StructField - /// This is the data type representing a Row - /// - [Serializable] - public class StructType : ComplexType - { - /// - /// Gets a list of StructField. - /// - public List Fields { get { return fields; } } + /// + /// Struct type, consisting of a list of StructField + /// This is the data type representing a Row + /// + [Serializable] + public class StructType : ComplexType + { + /// + /// Gets a list of StructField. 
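As a quick sanity check on the string and JSON representations defined here, a sketch with illustrative field names:

    // assumes: using Microsoft.Spark.CSharp.Sql;
    var schema = new StructType(new[]
    {
        new StructField("id", new LongType()),
        new StructField("tags", new ArrayType(new StringType()))
    });
    // schema.SimpleString -> "struct<id:long,tags:array<string>>"
    // Round-tripping the compact JSON form rebuilds an equivalent schema:
    var restored = (StructType)DataType.ParseDataTypeFromJson(schema.Json);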
+ /// + public List Fields { get { return fields; } } - internal IStructTypeProxy StructTypeProxy - { - get - { - return structTypeProxy ?? - new StructTypeIpcProxy( - new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSchema", - new object[] { Json }).ToString())); - } - } - /// - /// Initializes a StructType instance with a specific collection of SructField object. - /// - /// The collection that holds StructField objects - public StructType(IEnumerable fields) - { - this.fields = fields.ToList(); - } + private Lazy[]> pickleConverters; - internal StructType(JObject json) - { - FromJson(json); - } + private Func[] ConstructPickleConverters() + { + var funcs = new Func[fields.Count]; + int index = 0; + foreach (var field in fields) + { + if (field.DataType is StringType) + { + funcs[index] = x => x?.ToString(); + } + /*else if (field.DataType is LongType) + { + funcs[index] = x => x==null?null:(dynamic)(long)x ; + }*/ + /*else if (field.DataType is DateType) + { + funcs[index] = x => x; + }*/ + else if (field.DataType is ArrayType) + { + Func convertArrayTypeToStructTypeFunc = (dataType, length) => + { + StructField[] f = new StructField[length]; + for (int i = 0; i < length; i++) + { + f[i] = new StructField(string.Format("_array_{0}", i), dataType); + } + return new StructType(f); + }; + var elementType = (field.DataType as ArrayType).ElementType; + funcs[index] = x => + { - internal StructType(IStructTypeProxy structTypeProxy) - { - this.structTypeProxy = structTypeProxy; - var jsonSchema = structTypeProxy.ToJson(); - FromJson(jsonSchema); - } + // Note: When creating object from json, PySpark converts Json array to Python List (https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/types.py, _create_cls(dataType)), + // then Pyrolite unpickler converts Python List to C# ArrayList (https://github.com/irmen/Pyrolite/blob/v4.10/README.txt). So values[index] should be of type ArrayList; + // In case Python changes its implementation, which means value is not of type ArrayList, try cast to object[] because Pyrolite unpickler convert Python Tuple to C# object[]. + object[] valueOfArray = (x as ArrayList)?.ToArray() ?? x as object[]; + if (valueOfArray == null) + { + throw new ArgumentException("Cannot parse data of ArrayType: " + field.Name); + } - /// - /// Returns a readable string that joins all s together. - /// - public override string SimpleString - { - get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); } - } + return new RowImpl(valueOfArray, + elementType as StructType ?? convertArrayTypeToStructTypeFunc(elementType, valueOfArray.Length)).Values; // TODO: this part may have some problems, not verified + }; + } + else if (field.DataType is MapType) + { + //TODO + throw new NotImplementedException(); + } + else if (field.DataType is StructType) + { + funcs[index] = x => x != null ? new RowImpl(x, field.DataType as StructType) : null; + } + else + { + funcs[index] = x => x; + } + index++; + } + return funcs; + } - internal override object JsonValue - { - get - { - return new JObject( - new JProperty("type", TypeName), - new JProperty("fields", fields.Select(f => f.JsonValue).ToArray())); - } - } + internal IStructTypeProxy StructTypeProxy + { + get + { + return structTypeProxy ?? 
+ new StructTypeIpcProxy( + new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSchema", + new object[] { Json }).ToString())); + } + } - /// - /// Constructs a StructType from a Json object - /// - /// The Json object used to construct a StructType - /// A new StructType instance - public override sealed DataType FromJson(JObject json) - { - var fieldsJObjects = json["fields"].Select(f => (JObject)f); - fields = fieldsJObjects.Select(fieldJObject => (new StructField(fieldJObject))).ToList(); - return this; - } + /// + /// Initializes a StructType instance with a specific collection of SructField object. + /// + /// The collection that holds StructField objects + public StructType(IEnumerable fields) + { + this.fields = fields.ToList(); + Initialize(); + } - [NonSerialized] - private readonly IStructTypeProxy structTypeProxy; + internal StructType(JObject json) + { + FromJson(json); + Initialize(); + } - private List fields; - } + internal StructType(IStructTypeProxy structTypeProxy) + { + this.structTypeProxy = structTypeProxy; + var jsonSchema = structTypeProxy.ToJson(); + FromJson(jsonSchema); + Initialize(); + } + + public void ConvertPickleObjects(dynamic[] input, dynamic[] output) + { + var c = pickleConverters.Value; + for (int i = 0; i < input.Length; ++i) + { + output[i] = c[i](input[i]); + } + } + + private void Initialize() + { + pickleConverters = new Lazy[]>(ConstructPickleConverters); + } + + /// + /// Returns a readable string that joins all s together. + /// + public override string SimpleString + { + get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); } + } + + internal override object JsonValue + { + get + { + return new JObject( + new JProperty("type", TypeName), + new JProperty("fields", fields.Select(f => f.JsonValue).ToArray())); + } + } + + /// + /// Constructs a StructType from a Json object + /// + /// The Json object used to construct a StructType + /// A new StructType instance + public override sealed DataType FromJson(JObject json) + { + var fieldsJObjects = json["fields"].Select(f => (JObject)f); + fields = fieldsJObjects.Select(fieldJObject => (new StructField(fieldJObject))).ToList(); + return this; + } + + [NonSerialized] + private readonly IStructTypeProxy structTypeProxy; + + private List fields; + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs index b9c5008..eaa602b 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs +++ b/csharp/Adapter/Microsoft.Spark.CSharp/Sql/UdfRegistration.cs @@ -5,6 +5,7 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; using System.Linq; +using System.Reflection; using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; @@ -249,6 +250,17 @@ namespace Microsoft.Spark.CSharp.Sql Func, IEnumerable> udfHelper = new UdfHelper(f).Execute; udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(typeof(RT))); } - #endregion - } + + public void RegisterFunction(string name, MethodInfo f) + { + if (!f.IsStatic) + throw new InvalidOperationException(f.DeclaringType?.FullName + "." 
+ f.Name + + " is not a static method, can't be registered"); + logger.LogInfo("Name of the function to register {0}, method info", name, f.DeclaringType?.FullName + "." + f.Name); + var helper = new UdfReflectionHelper(f); + Func, IEnumerable> udfHelper = helper.Execute; + udfRegistrationProxy.RegisterFunction(name, SparkContext.BuildCommand(new CSharpWorkerFunc(udfHelper), SerializedMode.Row, SerializedMode.Row), Functions.GetReturnType(helper.ReturnType)); + } + #endregion + } } diff --git a/csharp/Adapter/Microsoft.Spark.CSharp/packages.config b/csharp/Adapter/Microsoft.Spark.CSharp/packages.config index 8f5143e..d95f59d 100644 --- a/csharp/Adapter/Microsoft.Spark.CSharp/packages.config +++ b/csharp/Adapter/Microsoft.Spark.CSharp/packages.config @@ -1,7 +1,7 @@  - - + + - + \ No newline at end of file diff --git a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML index 0d192a5..f7d5b48 100644 --- a/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML +++ b/csharp/Adapter/documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML @@ -3513,7 +3513,7 @@ Close the socket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number @@ -3612,12 +3612,13 @@ Close the ISocket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number The IP address of the remote host The port number of the remote host + The secret to connect, can be null @@ -3770,7 +3771,7 @@ Close the ISocket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number @@ -3912,7 +3913,7 @@ Close the ISocket connections and releases all associated resources. - + Establishes a connection to a remote host that is specified by an IP address and a port number @@ -5190,12 +5191,13 @@ row count - + Displays rows of the DataFrame in tabular form Number of rows to display - default 20 Indicates if strings more than 20 characters long will be truncated + If set to True, print output rows vertically (one line per column value). @@ -5627,10 +5629,11 @@ the 100 new partitions will claim 10 of the current partitions. - + Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`) + Persist storage type @@ -6040,6 +6043,15 @@ DataFrame if no paths are passed in. + + + Loads a AVRO file (one object per line) and returns the result as a DataFrame. + + This function goes through the input once to determine the input schema. If you know the + schema in advance, use the version that specifies the schema to avoid the extra scan. + + input path + Interface used to write a DataFrame to external storage systems (e.g. file systems, @@ -6145,6 +6157,13 @@ Format("parquet").Save(path) + + + Saves the content of the DataFrame in AVRO format at the specified path. + This is equivalent to: + Format("com.databricks.spark.avro").Save(path) + + Dataset is a strongly typed collection of domain-specific objects that can be transformed @@ -6193,13 +6212,14 @@ Returns all column names as an array. - + Displays the top 20 rows of Dataset in a tabular form. Strings more than 20 characters will be truncated, and all cells will be aligned right. 
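For completeness, a hedged sketch of registering a UDF through the new MethodInfo overload; the method must be public static, and the class, names and the Sql(...) query are illustrative only:

    // assumes: using System.Reflection; using Microsoft.Spark.CSharp.Sql;
    public static class MyUdfs
    {
        public static int StrLen(string s) { return s == null ? 0 : s.Length; }
    }

    // elsewhere, with an existing SparkSession spark:
    MethodInfo strLen = typeof(MyUdfs).GetMethod("StrLen");
    spark.Udf.RegisterFunction("strLen", strLen);  // non-static methods throw InvalidOperationException
    DataFrame lengths = spark.Sql("SELECT strLen(name) FROM people");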
Number of rows - default is 20 Indicates if rows with more than 20 characters to be truncated + If set to true, prints output rows vertically (one line per column value). diff --git a/csharp/Adapter/documentation/Mobius_API_Documentation.md b/csharp/Adapter/documentation/Mobius_API_Documentation.md index c9e4065..7ee0e9e 100644 --- a/csharp/Adapter/documentation/Mobius_API_Documentation.md +++ b/csharp/Adapter/documentation/Mobius_API_Documentation.md @@ -638,7 +638,7 @@ ####Methods -
Name - Description
RegisterTempTable - Registers this DataFrame as a temporary table using the given name. The lifetime of this temporary table is tied to the SqlContext that was used to create this DataFrame.
Count - Number of rows in the DataFrame
Show - Displays rows of the DataFrame in tabular form
ShowSchema - Prints the schema information of the DataFrame
Collect - Returns all Rows in this DataFrame
ToRDD - Converts the DataFrame to RDD of Row
ToJSON - Returns the content of the DataFrame as RDD of JSON strings
Explain - Prints the plans (logical and physical) to the console for debugging purposes
Select - Selects a set of columns specified by column name or Column. df.Select("colA", df["colB"]) df.Select("*", df["colB"] + 10)
Select - Selects a set of columns. This is a variant of `select` that can only select existing columns using column names (i.e. cannot construct expressions). df.Select("colA", "colB")
SelectExpr - Selects a set of SQL expressions. This is a variant of `select` that accepts SQL expressions. df.SelectExpr("colA", "colB as newName", "abs(colC)")
Where - Filters rows using the given condition
Filter - Filters rows using the given condition
GroupBy - Groups the DataFrame using the specified columns, so we can run aggregation on them.
Rollup - Create a multi-dimensional rollup for the current DataFrame using the specified columns, so we can run aggregation on them.
Cube - Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregation on them.
Agg - Aggregates on the DataFrame for the given column-aggregate function mapping
Join - Join with another DataFrame - Cartesian join
Join - Join with another DataFrame - Inner equi-join using given column name
Join - Join with another DataFrame - Inner equi-join using given column name
Join - Join with another DataFrame, using the specified JoinType
Intersect - Intersect with another DataFrame. This is equivalent to `INTERSECT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, intersect(self, other)
UnionAll - Union with another DataFrame WITHOUT removing duplicated rows. This is equivalent to `UNION ALL` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, unionAll(self, other)
Subtract - Returns a new DataFrame containing rows in this frame but not in another frame. This is equivalent to `EXCEPT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, subtract(self, other)
Drop - Returns a new DataFrame with a column dropped. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, drop(self, col)
DropNa - Returns a new DataFrame omitting rows with null values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropna(self, how='any', thresh=None, subset=None)
Na - Returns a DataFrameNaFunctions for working with missing data.
FillNa - Replace null values, alias for `na.fill()`
DropDuplicates - Returns a new DataFrame with duplicate rows removed, considering only the subset of columns. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropDuplicates(self, subset=None)
Replace``1 - Returns a new DataFrame replacing a value with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1 - Returns a new DataFrame replacing values with other values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1 - Returns a new DataFrame replacing values with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
RandomSplit - Randomly splits this DataFrame with the provided weights. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, randomSplit(self, weights, seed=None)
Columns - Returns all column names as a list. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, columns(self)
DTypes - Returns all column names and their data types. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dtypes(self)
Sort - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
Sort - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
SortWithinPartitions - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
SortWithinPartition - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
Alias - Returns a new DataFrame with an alias set. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, alias(self, alias)
WithColumn - Returns a new DataFrame by adding a column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumn(self, colName, col)
WithColumnRenamed - Returns a new DataFrame by renaming an existing column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumnRenamed(self, existing, new)
Corr - Calculates the correlation of two columns of a DataFrame as a double value. Currently only supports the Pearson Correlation Coefficient. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, corr(self, col1, col2, method=None)
Cov - Calculate the sample covariance of two columns as a double value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, cov(self, col1, col2)
FreqItems - Finding frequent items for columns, possibly with false positives. Using the frequent element count algorithm described in "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, freqItems(self, cols, support=None) Note: This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting DataFrame.
Crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency table. The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero pair frequencies will be returned. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, crosstab(self, col1, col2)
Describe - Computes statistics for numeric columns. This includes count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical columns.
Limit - Returns a new DataFrame by taking the first `n` rows. The difference between this function and `head` is that `head` returns an array while `limit` returns a new DataFrame.
Head - Returns the first `n` rows.
First - Returns the first row.
Take - Returns the first `n` rows in the DataFrame.
Distinct - Returns a new DataFrame that contains only the unique rows from this DataFrame.
Coalesce - Returns a new DataFrame that has exactly `numPartitions` partitions. Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.
Persist - Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
Unpersist - Mark the DataFrame as non-persistent, and remove all blocks for it from memory and disk.
Cache - Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
Repartition - Returns a new DataFrame that has exactly `numPartitions` partitions.
Repartition - Returns a new [[DataFrame]] partitioned by the given partitioning columns into . The resulting DataFrame is hash partitioned. optional. If not specified, keep current partitions.
Repartition - Returns a new [[DataFrame]] partitioned by the given partitioning columns into . The resulting DataFrame is hash partitioned. optional. If not specified, keep current partitions.
Sample - Returns a new DataFrame by sampling a fraction of rows.
FlatMap``1 - Returns a new RDD by first applying a function to all rows of this DataFrame, and then flattening the results.
Map``1 - Returns a new RDD by applying a function to all rows of this DataFrame.
MapPartitions``1 - Returns a new RDD by applying a function to each partition of this DataFrame.
ForeachPartition - Applies a function f to each partition of this DataFrame.
Foreach - Applies a function f to all rows.
Write - Interface for saving the content of the DataFrame out into external storage.
SaveAsParquetFile - Saves the contents of this DataFrame as a parquet file, preserving the schema. Files that are written out using this method can be read back in as a DataFrame using the `parquetFile` function in SQLContext.
InsertInto - Adds the rows from this RDD to the specified table, optionally overwriting the existing data.
SaveAsTable - Creates a table from the contents of this DataFrame based on a given data source, SaveMode specified by mode, and a set of options. Note that this currently only works with DataFrames that are created from a HiveContext as there is no notion of a persisted catalog in a standard SQL context. Instead you can write an RDD out to a parquet file, and then register that file as a table. This "table" can then be the target of an `insertInto`. Also note that while this function can persist the table metadata into Hive's metastore, the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
Save - Saves the contents of this DataFrame based on the given data source, SaveMode specified by mode, and a set of options.
Returns a new DataFrame that drops rows containing any null values.
Returns a new DataFrame that drops rows containing null values. If `how` is "any", then drop rows containing any null values. If `how` is "all", then drop rows only if every column is null for that row.
Returns a new [[DataFrame]] that drops rows containing null values in the specified columns. If `how` is "any", then drop rows containing any null values in the specified columns. If `how` is "all", then drop rows only if every specified column is null for that row.
Returns a new DataFrame that drops rows containing any null values in the specified columns.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values in the specified columns.
Returns a new DataFrame that replaces null values in numeric columns with `value`.
Returns a new DataFrame that replaces null values in string columns with `value`.
Returns a new DataFrame that replaces null values in specified numeric columns. If a specified column is not a numeric column, it is ignored.
Returns a new DataFrame that replaces null values in specified string columns. If a specified column is not a numeric column, it is ignored.
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. The value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`. For example, the following replaces null values in column "A" with string "unknown", and null values in column "B" with numeric value 1.0. import com.google.common.collect.ImmutableMap; df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height". df.replace("height", ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". df.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. df.replace("*", ImmutableMap.of("UNKNOWN", "unnamed"));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". df.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". df.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed"));
Specifies the input data source format.
Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading.
Adds an input option for the underlying data source.
Adds input options for the underlying data source.
Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system).
Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores).
Construct a [[DataFrame]] representing the database table accessible via JDBC URL, url named table and connection properties.
Construct a DataFrame representing the database table accessible via JDBC URL url named table. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Construct a DataFrame representing the database table accessible via JDBC URL url named table using connection properties. The `predicates` parameter gives a list expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan.
Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the underlying output data source. Built-in options include "parquet", "json", etc.
Adds an output option for the underlying data source.
Adds output options for the underlying data source.
Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment.
Saves the content of the DataFrame at the specified path.
Saves the content of the DataFrame as the specified table.
Inserts the content of the DataFrame to the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data to an existing table, format or options will be ignored.
Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame need to be the same as that of the existing table, and format or options will be ignored.
Saves the content of the DataFrame to a external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path)
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("parquet").Save(path)
+
Name - Description
RegisterTempTable - Registers this DataFrame as a temporary table using the given name. The lifetime of this temporary table is tied to the SqlContext that was used to create this DataFrame.
Count - Number of rows in the DataFrame
Show - Displays rows of the DataFrame in tabular form
ShowSchema - Prints the schema information of the DataFrame
Collect - Returns all Rows in this DataFrame
ToRDD - Converts the DataFrame to RDD of Row
ToJSON - Returns the content of the DataFrame as RDD of JSON strings
Explain - Prints the plans (logical and physical) to the console for debugging purposes
Select - Selects a set of columns specified by column name or Column. df.Select("colA", df["colB"]) df.Select("*", df["colB"] + 10)
Select - Selects a set of columns. This is a variant of `select` that can only select existing columns using column names (i.e. cannot construct expressions). df.Select("colA", "colB")
SelectExpr - Selects a set of SQL expressions. This is a variant of `select` that accepts SQL expressions. df.SelectExpr("colA", "colB as newName", "abs(colC)")
Where - Filters rows using the given condition
Filter - Filters rows using the given condition
GroupBy - Groups the DataFrame using the specified columns, so we can run aggregation on them.
Rollup - Create a multi-dimensional rollup for the current DataFrame using the specified columns, so we can run aggregation on them.
Cube - Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregation on them.
Agg - Aggregates on the DataFrame for the given column-aggregate function mapping
Join - Join with another DataFrame - Cartesian join
Join - Join with another DataFrame - Inner equi-join using given column name
Join - Join with another DataFrame - Inner equi-join using given column name
Join - Join with another DataFrame, using the specified JoinType
Intersect - Intersect with another DataFrame. This is equivalent to `INTERSECT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, intersect(self, other)
UnionAll - Union with another DataFrame WITHOUT removing duplicated rows. This is equivalent to `UNION ALL` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, unionAll(self, other)
Subtract - Returns a new DataFrame containing rows in this frame but not in another frame. This is equivalent to `EXCEPT` in SQL. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, subtract(self, other)
Drop - Returns a new DataFrame with a column dropped. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, drop(self, col)
DropNa - Returns a new DataFrame omitting rows with null values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropna(self, how='any', thresh=None, subset=None)
Na - Returns a DataFrameNaFunctions for working with missing data.
FillNa - Replace null values, alias for `na.fill()`
DropDuplicates - Returns a new DataFrame with duplicate rows removed, considering only the subset of columns. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dropDuplicates(self, subset=None)
Replace``1 - Returns a new DataFrame replacing a value with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1 - Returns a new DataFrame replacing values with other values. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
ReplaceAll``1 - Returns a new DataFrame replacing values with another value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, replace(self, to_replace, value, subset=None)
RandomSplit - Randomly splits this DataFrame with the provided weights. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, randomSplit(self, weights, seed=None)
Columns - Returns all column names as a list. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, columns(self)
DTypes - Returns all column names and their data types. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, dtypes(self)
Sort - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
Sort - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, sort(self, *cols, **kwargs)
SortWithinPartitions - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
SortWithinPartition - Returns a new DataFrame sorted by the specified column(s). Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
Alias - Returns a new DataFrame with an alias set. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, alias(self, alias)
WithColumn - Returns a new DataFrame by adding a column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumn(self, colName, col)
WithColumnRenamed - Returns a new DataFrame by renaming an existing column. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, withColumnRenamed(self, existing, new)
Corr - Calculates the correlation of two columns of a DataFrame as a double value. Currently only supports the Pearson Correlation Coefficient. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, corr(self, col1, col2, method=None)
Cov - Calculate the sample covariance of two columns as a double value. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, cov(self, col1, col2)
FreqItems - Finding frequent items for columns, possibly with false positives. Using the frequent element count algorithm described in "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou". Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, freqItems(self, cols, support=None) Note: This function is meant for exploratory data analysis, as we make no guarantee about the backward compatibility of the schema of the resulting DataFrame.
Crosstab - Computes a pair-wise frequency table of the given columns. Also known as a contingency table. The number of distinct values for each column should be less than 1e4. At most 1e6 non-zero pair frequencies will be returned. Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, crosstab(self, col1, col2)
Describe - Computes statistics for numeric columns. This includes count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical columns.
Limit - Returns a new DataFrame by taking the first `n` rows. The difference between this function and `head` is that `head` returns an array while `limit` returns a new DataFrame.
Head - Returns the first `n` rows.
First - Returns the first row.
Take - Returns the first `n` rows in the DataFrame.
Distinct - Returns a new DataFrame that contains only the unique rows from this DataFrame.
Coalesce - Returns a new DataFrame that has exactly `numPartitions` partitions. Similar to coalesce defined on an RDD, this operation results in a narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new partitions will claim 10 of the current partitions.
Persist - Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
Unpersist - Mark the DataFrame as non-persistent, and remove all blocks for it from memory and disk.
Cache - Persist this DataFrame with the default storage level (`MEMORY_AND_DISK`)
Repartition - Returns a new DataFrame that has exactly `numPartitions` partitions.
Repartition - Returns a new [[DataFrame]] partitioned by the given partitioning columns into . The resulting DataFrame is hash partitioned. optional. If not specified, keep current partitions.
Repartition - Returns a new [[DataFrame]] partitioned by the given partitioning columns into . The resulting DataFrame is hash partitioned. optional. If not specified, keep current partitions.
Sample - Returns a new DataFrame by sampling a fraction of rows.
FlatMap``1 - Returns a new RDD by first applying a function to all rows of this DataFrame, and then flattening the results.
Map``1Returns a new RDD by applying a function to all rows of this DataFrame.
MapPartitions``1Returns a new RDD by applying a function to each partition of this DataFrame.
ForeachPartitionApplies a function f to each partition of this DataFrame.
ForeachApplies a function f to all rows.
WriteInterface for saving the content of the DataFrame out into external storage.
SaveAsParquetFileSaves the contents of this DataFrame as a parquet file, preserving the schema. Files that are written out using this method can be read back in as a DataFrame using the `parquetFile` function in SQLContext.
InsertIntoAdds the rows from this RDD to the specified table, optionally overwriting the existing data.
SaveAsTableCreates a table from the the contents of this DataFrame based on a given data source, SaveMode specified by mode, and a set of options. Note that this currently only works with DataFrames that are created from a HiveContext as there is no notion of a persisted catalog in a standard SQL context. Instead you can write an RDD out to a parquet file, and then register that file as a table. This "table" can then be the target of an `insertInto`. Also note that while this function can persist the table metadata into Hive's metastore, the table will NOT be accessible from Hive, until SPARK-7550 is resolved.
SaveSaves the contents of this DataFrame based on the given data source, SaveMode specified by mode, and a set of options.
Returns a new DataFrame that drops rows containing any null values.
Returns a new DataFrame that drops rows containing null values. If `how` is "any", then drop rows containing any null values. If `how` is "all", then drop rows only if every column is null for that row.
Returns a new [[DataFrame]] that drops rows containing null values in the specified columns. If `how` is "any", then drop rows containing any null values in the specified columns. If `how` is "all", then drop rows only if every specified column is null for that row.
Returns a new DataFrame that drops rows containing any null values in the specified columns.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values.
Returns a new DataFrame that drops rows containing less than `minNonNulls` non-null values in the specified columns.
Returns a new DataFrame that replaces null values in numeric columns with `value`.
Returns a new DataFrame that replaces null values in string columns with `value`.
Returns a new DataFrame that replaces null values in specified numeric columns. If a specified column is not a numeric column, it is ignored.
Returns a new DataFrame that replaces null values in specified string columns. If a specified column is not a string column, it is ignored.
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. The value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`. For example, the following replaces null values in column "A" with string "unknown", and null values in column "B" with numeric value 1.0. import com.google.common.collect.ImmutableMap; df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height". df.replace("height", ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". df.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. df.replace("*", ImmutableMap.of("UNKNOWN", "unnamed"));
Replaces values matching keys in `replacement` map with the corresponding values. Key and value of `replacement` map must have the same type, and can only be doubles or strings. If `col` is "*", then the replacement is applied on all string columns or numeric columns. Example: import com.google.common.collect.ImmutableMap; // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". df.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". df.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed"));
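A minimal C# sketch of the null-handling helpers described above. This is hedged: `DropNa` and `Na` appear in the method tables, but the exact overloads are assumed to mirror the pyspark `dropna(how, thresh, subset)` signature that the descriptions reference, and the column names are purely illustrative.

```csharp
// Hedged sketch only: overloads and column names are assumptions, not part of this change.
using Microsoft.Spark.CSharp.Sql;

internal static class NullHandlingSketch
{
    internal static DataFrame Clean(DataFrame df)
    {
        // Drop rows that contain any null value.
        var noNulls = df.DropNa();

        // Assumed overload: drop a row only when every listed column is null,
        // mirroring dropna(how='all', subset=[...]).
        var mostlyComplete = df.DropNa("all", null, new[] { "name", "age" });

        // Entry point to DataFrameNaFunctions; FillNa is documented above as an
        // alias for na.fill().
        var na = df.Na();

        return noNulls;
    }
}
```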
Specifies the input data source format.
Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading.
Adds an input option for the underlying data source.
Adds input options for the underlying data source.
Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system).
Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores).
Construct a [[DataFrame]] representing the database table accessible via JDBC URL, url named table and connection properties.
Construct a DataFrame representing the database table accessible via JDBC URL url named table. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Construct a DataFrame representing the database table accessible via JDBC URL url named table using connection properties. The `predicates` parameter gives a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan.
Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in.
Loads an AVRO file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime.
Specifies the underlying output data source. Built-in options include "parquet", "json", etc.
Adds an output option for the underlying data source.
Adds output options for the underlying data source.
Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment.
Saves the content of the DataFrame at the specified path.
Saves the content of the DataFrame as the specified table.
Inserts the content of the DataFrame into the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data into an existing table, format or options will be ignored.
Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame needs to be the same as that of the existing table, and format or options will be ignored.
Saves the content of the DataFrame to an external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems.
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path)
Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("parquet").Save(path)
Saves the content of the DataFrame in AVRO format at the specified path. This is equivalent to: Format("com.databricks.spark.avro").Save(path)
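Taken together, the read/transform/write surface above composes as in the following hedged C# sketch. The paths, column names, and join key are illustrative assumptions, while `Read().Json()`, `Show()`, and `Count()` follow the usage in the samples added elsewhere in this patch.

```csharp
// Illustrative composition of methods documented above; not part of this change.
using Microsoft.Spark.CSharp.Sql;

internal static class DataFrameOpsSketch
{
    internal static void Run(SqlContext sqlContext)
    {
        var people = sqlContext.Read().Json("hdfs:///tmp/people.json");
        var orders = sqlContext.Read().Json("hdfs:///tmp/orders.json");

        var joined = people.Join(orders, "id")   // inner equi-join using a shared column name
                           .DropDuplicates()     // remove duplicate rows
                           .Limit(100);          // still a DataFrame, unlike Head/Take

        joined.Show();                           // prints the first rows (20 by default)
        System.Console.WriteLine("row count: " + joined.Count());

        // Persist the result with its schema, as described for SaveAsParquetFile.
        joined.SaveAsParquetFile("hdfs:///tmp/joined.parquet");
    }
}
```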
--- @@ -688,7 +688,7 @@ ####Methods -
| Name | Description |
| ---- | ----------- |
| Format | Specifies the input data source format. |
| Schema | Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading. |
| Option | Adds an input option for the underlying data source. |
| Options | Adds input options for the underlying data source. |
| Load | Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system). |
| Load | Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores). |
| Jdbc | Construct a [[DataFrame]] representing the database table accessible via JDBC URL, url named table and connection properties. |
| Jdbc | Construct a DataFrame representing the database table accessible via JDBC URL url named table. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Jdbc | Construct a DataFrame representing the database table accessible via JDBC URL url named table using connection properties. The `predicates` parameter gives a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan. |
| Parquet | Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in. |
+
| Name | Description |
| ---- | ----------- |
| Format | Specifies the input data source format. |
| Schema | Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema automatically from data. By specifying the schema here, the underlying data source can skip the schema inference step, and thus speed up data loading. |
| Option | Adds an input option for the underlying data source. |
| Options | Adds input options for the underlying data source. |
| Load | Loads input in as a [[DataFrame]], for data sources that require a path (e.g. data backed by a local or distributed file system). |
| Load | Loads input in as a DataFrame, for data sources that don't require a path (e.g. external key-value stores). |
| Jdbc | Construct a [[DataFrame]] representing the database table accessible via JDBC URL, url named table and connection properties. |
| Jdbc | Construct a DataFrame representing the database table accessible via JDBC URL url named table. Partitions of the table will be retrieved in parallel based on the parameters passed to this function. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Jdbc | Construct a DataFrame representing the database table accessible via JDBC URL url named table using connection properties. The `predicates` parameter gives a list of expressions suitable for inclusion in WHERE clauses; each one defines one partition of the DataFrame. Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Loads a JSON file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan. |
| Parquet | Loads a Parquet file, returning the result as a [[DataFrame]]. This function returns an empty DataFrame if no paths are passed in. |
| Avro | Loads an AVRO file (one object per line) and returns the result as a DataFrame. This function goes through the input once to determine the input schema. If you know the schema in advance, use the version that specifies the schema to avoid the extra scan. |
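A hedged sketch of how the reader methods in the updated table are typically chained. The paths and the "samplingRatio" option are illustrative assumptions; `Avro` is the helper added by this change, backed by the `com.databricks.spark.avro` data source per the Avro descriptions in this patch.

```csharp
// Illustrative reader usage; option name and paths are assumptions.
using Microsoft.Spark.CSharp.Sql;

internal static class ReaderSketch
{
    internal static void Load(SqlContext sqlContext)
    {
        // Generic form: choose a format, add options, then Load() a path.
        DataFrame generic = sqlContext.Read()
                                      .Format("json")
                                      .Option("samplingRatio", "1.0")
                                      .Load("hdfs:///data/people.json");

        // Convenience helpers for common formats.
        DataFrame json = sqlContext.Read().Json("hdfs:///data/people.json");
        DataFrame parquet = sqlContext.Read().Parquet("hdfs:///data/people.parquet");

        // New in this change: Avro convenience helper.
        DataFrame avro = sqlContext.Read().Avro("hdfs:///data/people.avro");
    }
}
```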
--- @@ -705,7 +705,7 @@ ####Methods -
| Name | Description |
| ---- | ----------- |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Format | Specifies the underlying output data source. Built-in options include "parquet", "json", etc. |
| Option | Adds an output option for the underlying data source. |
| Options | Adds output options for the underlying data source. |
| PartitionBy | Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment. |
| Save | Saves the content of the DataFrame at the specified path. |
| Save | Saves the content of the DataFrame as the specified table. |
| InsertInto | Inserts the content of the DataFrame into the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data into an existing table, format or options will be ignored. |
| SaveAsTable | Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame needs to be the same as that of the existing table, and format or options will be ignored. |
| Jdbc | Saves the content of the DataFrame to an external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path) |
| Parquet | Saves the content of the DataFrame in Parquet format at the specified path. This is equivalent to: Format("parquet").Save(path) |
+
| Name | Description |
| ---- | ----------- |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Mode | Specifies the behavior when data or table already exists. Options include: - `SaveMode.Overwrite`: overwrite the existing data. - `SaveMode.Append`: append the data. - `SaveMode.Ignore`: ignore the operation (i.e. no-op). - `SaveMode.ErrorIfExists`: default option, throw an exception at runtime. |
| Format | Specifies the underlying output data source. Built-in options include "parquet", "json", etc. |
| Option | Adds an output option for the underlying data source. |
| Options | Adds output options for the underlying data source. |
| PartitionBy | Partitions the output by the given columns on the file system. If specified, the output is laid out on the file system similar to Hive's partitioning scheme. This is only applicable for Parquet at the moment. |
| Save | Saves the content of the DataFrame at the specified path. |
| Save | Saves the content of the DataFrame as the specified table. |
| InsertInto | Inserts the content of the DataFrame into the specified table. It requires that the schema of the DataFrame is the same as the schema of the table. Because it inserts data into an existing table, format or options will be ignored. |
| SaveAsTable | Saves the content of the DataFrame as the specified table. In the case the table already exists, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). When `mode` is `Overwrite`, the schema of the DataFrame does not need to be the same as that of the existing table. When `mode` is `Append`, the schema of the DataFrame needs to be the same as that of the existing table, and format or options will be ignored. |
| Jdbc | Saves the content of the DataFrame to an external database table via JDBC. In the case the table already exists in the external database, behavior of this function depends on the save mode, specified by the `mode` function (default to throwing an exception). Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash your external database systems. |
| Json | Saves the content of the DataFrame in JSON format at the specified path. This is equivalent to: Format("json").Save(path) |
| Parquet | Saves the content of the DataFrame in Parquet format at the specified path. This is equivalent to: Format("parquet").Save(path) |
| Avro | Saves the content of the DataFrame in AVRO format at the specified path. This is equivalent to: Format("com.databricks.spark.avro").Save(path) |
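And a matching hedged sketch of the writer side. Output paths, the table layout, and the partition column are illustrative, and the `SaveMode` overload of `Mode` is assumed from the save-mode options listed above.

```csharp
// Illustrative writer usage; paths and the partition column are assumptions.
using Microsoft.Spark.CSharp.Sql;

internal static class WriterSketch
{
    internal static void Persist(DataFrame df)
    {
        // Generic form: format + save mode + layout, then Save(path).
        df.Write()
          .Format("parquet")
          .Mode(SaveMode.Overwrite)    // overwrite any existing data at the target
          .PartitionBy("year")         // Hive-style layout; Parquet only, per the table
          .Save("hdfs:///output/events");

        // Convenience helpers; Avro is equivalent to Format("com.databricks.spark.avro").Save(path).
        df.Write().Json("hdfs:///output/events-json");
        df.Write().Avro("hdfs:///output/events-avro");
    }
}
```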
--- diff --git a/csharp/AdapterTest/AccumulatorTest.cs b/csharp/AdapterTest/AccumulatorTest.cs index 24ccfb5..75fb938 100644 --- a/csharp/AdapterTest/AccumulatorTest.cs +++ b/csharp/AdapterTest/AccumulatorTest.cs @@ -33,7 +33,7 @@ namespace AdapterTest // get accumulator server port and connect to accumuator server int serverPort = (sc.SparkContextProxy as MockSparkContextProxy).AccumulatorServerPort; sock = SocketFactory.CreateSocket(); - sock.Connect(IPAddress.Loopback, serverPort); + sock.Connect(IPAddress.Loopback, serverPort, null); } [TearDown] diff --git a/csharp/AdapterTest/AdapterTest.csproj b/csharp/AdapterTest/AdapterTest.csproj index c32ed7a..cbea547 100644 --- a/csharp/AdapterTest/AdapterTest.csproj +++ b/csharp/AdapterTest/AdapterTest.csproj @@ -35,22 +35,25 @@ 4 + + ..\packages\log4net.2.0.8\lib\net45-full\log4net.dll + ..\packages\Moq.4.2.1510.2205\lib\net40\Moq.dll True - - ..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll ..\packages\NUnit.3.0.1\lib\net45\nunit.framework.dll True - + ..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll - + ..\packages\Razorvine.Serpent.1.12.0.0\lib\net40\Razorvine.Serpent.dll diff --git a/csharp/AdapterTest/DataFrameTest.cs b/csharp/AdapterTest/DataFrameTest.cs index d54a9c3..34a6dfb 100644 --- a/csharp/AdapterTest/DataFrameTest.cs +++ b/csharp/AdapterTest/DataFrameTest.cs @@ -12,6 +12,7 @@ using Microsoft.Spark.CSharp.Sql; using Microsoft.Spark.CSharp.Proxy; using NUnit.Framework; using Moq; +using Microsoft.Spark.CSharp.Network; namespace AdapterTest { @@ -65,10 +66,10 @@ namespace AdapterTest [Test] public void TestShow() { - mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny())).Returns("Show"); + mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny(), It.IsAny())).Returns("Show"); var dataFrame = new DataFrame(mockDataFrameProxy.Object, null); dataFrame.Show(); - mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once); + mockDataFrameProxy.Verify(m => m.GetShowString(20, 20, false), Times.Once); } [Test] @@ -135,9 +136,9 @@ namespace AdapterTest var expectedRows = new Row[] {new MockRow(), new MockRow()}; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123,null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); var dataFrame = new DataFrame(mockDataFrameProxy.Object, null); @@ -838,9 +839,9 @@ namespace AdapterTest var expectedRows = new Row[] {new MockRow(), new MockRow(), new MockRow(), new MockRow(), new MockRow()}; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123, null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); 
mockDataFrameProxy.Setup(m => m.Limit(It.IsAny())).Returns(mockDataFrameProxy.Object); @@ -868,9 +869,9 @@ namespace AdapterTest var expectedRows = new Row[] { new MockRow(), new MockRow(), new MockRow(), new MockRow(), new MockRow() }; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123, null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); mockDataFrameProxy.Setup(m => m.Limit(It.IsAny())).Returns(mockDataFrameProxy.Object); @@ -892,9 +893,9 @@ namespace AdapterTest var expectedRows = new Row[] { new MockRow(), new MockRow(), new MockRow(), new MockRow(), new MockRow() }; var mockRddProxy = new Mock(); var mockRddCollector = new Mock(); - mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) + mockRddCollector.Setup(m => m.Collect(It.IsAny(), It.IsAny(), It.IsAny())) .Returns(expectedRows); - mockRddProxy.Setup(m => m.CollectAndServe()).Returns(123); + mockRddProxy.Setup(m => m.CollectAndServe()).Returns(new SocketInfo(123, null)); mockRddProxy.Setup(m => m.RDDCollector).Returns(mockRddCollector.Object); mockDataFrameProxy.Setup(m => m.JavaToCSharp()).Returns(mockRddProxy.Object); mockDataFrameProxy.Setup(m => m.Limit(It.IsAny())).Returns(mockDataFrameProxy.Object); diff --git a/csharp/AdapterTest/DatasetTest.cs b/csharp/AdapterTest/DatasetTest.cs index 7ee59db..b900041 100644 --- a/csharp/AdapterTest/DatasetTest.cs +++ b/csharp/AdapterTest/DatasetTest.cs @@ -38,12 +38,12 @@ namespace AdapterTest public void TestShow() { Mock mockDataFrameProxy = new Mock(); - mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny())).Returns("Show"); + mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny(), It.IsAny(), It.IsAny())).Returns("Show"); mockDatasetProxy.Setup(m => m.ToDF()).Returns(mockDataFrameProxy.Object); var dataset = new Dataset(mockDatasetProxy.Object); dataset.Show(); - mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once); + mockDataFrameProxy.Verify(m => m.GetShowString(20, 20, false), Times.Once); } [Test] diff --git a/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs b/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs index a68d408..60e84fb 100644 --- a/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs +++ b/csharp/AdapterTest/Mocks/MockDataFrameProxy.cs @@ -9,6 +9,7 @@ using System.Threading.Tasks; using System.Net; using System.Net.Sockets; using System.IO; +using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Sql; using Razorvine.Pickle; using Microsoft.Spark.CSharp.Proxy; @@ -64,7 +65,7 @@ namespace AdapterTest.Mocks throw new NotImplementedException(); } - public string GetShowString(int numberOfRows, bool truncate) + public string GetShowString(int numberOfRows, int truncate, bool vertical) { throw new NotImplementedException(); } @@ -240,7 +241,12 @@ namespace AdapterTest.Mocks throw new NotImplementedException(); } - public IDataFrameWriterProxy Write() + public IDataFrameProxy Broadcast() + { + throw new NotImplementedException(); + } + + public IDataFrameWriterProxy Write() { throw new NotImplementedException(); } diff --git a/csharp/AdapterTest/Mocks/MockRDDCollector.cs 
b/csharp/AdapterTest/Mocks/MockRDDCollector.cs index 2ec5c62..e9c8c5c 100644 --- a/csharp/AdapterTest/Mocks/MockRDDCollector.cs +++ b/csharp/AdapterTest/Mocks/MockRDDCollector.cs @@ -4,12 +4,13 @@ using System.Linq; using System.Text; using System.Threading.Tasks; using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Network; namespace AdapterTest.Mocks { class MockRDDCollector : IRDDCollector { - public IEnumerable Collect(int port, SerializedMode serializedMode, Type type) + public IEnumerable Collect(SocketInfo port, SerializedMode serializedMode, Type type) { throw new NotImplementedException(); } diff --git a/csharp/AdapterTest/Mocks/MockRddProxy.cs b/csharp/AdapterTest/Mocks/MockRddProxy.cs index 03b0142..9188ea4 100644 --- a/csharp/AdapterTest/Mocks/MockRddProxy.cs +++ b/csharp/AdapterTest/Mocks/MockRddProxy.cs @@ -15,6 +15,7 @@ using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Proxy; using Microsoft.Spark.CSharp.Interop.Ipc; using NUnit.Framework; +using Microsoft.Spark.CSharp.Network; namespace AdapterTest.Mocks { @@ -60,7 +61,7 @@ namespace AdapterTest.Mocks return union; } - public int CollectAndServe() + public SocketInfo CollectAndServe() { return MockSparkContextProxy.RunJob(this); } diff --git a/csharp/AdapterTest/Mocks/MockRow.cs b/csharp/AdapterTest/Mocks/MockRow.cs index bfa5b73..a6a9a86 100644 --- a/csharp/AdapterTest/Mocks/MockRow.cs +++ b/csharp/AdapterTest/Mocks/MockRow.cs @@ -8,6 +8,13 @@ namespace AdapterTest.Mocks { public class MockRow : Row { + public override dynamic[] Values + { + get + { + throw new NotImplementedException(); + } + } public override int Size() { diff --git a/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs b/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs index 609e591..da8b853 100644 --- a/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs +++ b/csharp/AdapterTest/Mocks/MockSparkContextProxy.cs @@ -195,7 +195,7 @@ namespace AdapterTest.Mocks throw new NotImplementedException(); } - internal static int RunJob(IRDDProxy rdd) + internal static SocketInfo RunJob(IRDDProxy rdd) { var mockRdd = (rdd as MockRddProxy); IEnumerable result = mockRdd.pickle ? 
mockRdd.result.Cast() : @@ -222,10 +222,12 @@ namespace AdapterTest.Mocks ns.Flush(); } }); - return (listener.LocalEndPoint as IPEndPoint).Port; + + SocketInfo socketInfo = new SocketInfo((listener.LocalEndPoint as IPEndPoint).Port, null); + return socketInfo; } - public int RunJob(IRDDProxy rdd, IEnumerable partitions) + public SocketInfo RunJob(IRDDProxy rdd, IEnumerable partitions) { return RunJob(rdd); } diff --git a/csharp/AdapterTest/SocketWrapperTest.cs b/csharp/AdapterTest/SocketWrapperTest.cs index 3c7fac3..63c2ef8 100644 --- a/csharp/AdapterTest/SocketWrapperTest.cs +++ b/csharp/AdapterTest/SocketWrapperTest.cs @@ -86,9 +86,9 @@ namespace AdapterTest Assert.Throws(() => clientSock.GetStream()); Assert.Throws(() => clientSock.Receive()); Assert.Throws(() => clientSock.Send(null)); - Assert.Throws(() => clientSock.Connect(IPAddress.Any, 1024)); + Assert.Throws(() => clientSock.Connect(IPAddress.Any, 1024, null)); - clientSock.Connect(IPAddress.Loopback, port); + clientSock.Connect(IPAddress.Loopback, port, null); // Valid invalid operation var byteBuf = ByteBufPool.Default.Allocate(); diff --git a/csharp/AdapterTest/TestWithMoqDemo.cs b/csharp/AdapterTest/TestWithMoqDemo.cs index 337794b..706413c 100644 --- a/csharp/AdapterTest/TestWithMoqDemo.cs +++ b/csharp/AdapterTest/TestWithMoqDemo.cs @@ -80,7 +80,7 @@ namespace AdapterTest ns.Flush(); } }); - return (listener.LocalEndPoint as IPEndPoint).Port; + return new SocketInfo((listener.LocalEndPoint as IPEndPoint).Port, null); }); _mockRddProxy.Setup(m => m.RDDCollector).Returns(new RDDCollector()); diff --git a/csharp/AdapterTest/packages.config b/csharp/AdapterTest/packages.config index c3a926b..c7cc11e 100644 --- a/csharp/AdapterTest/packages.config +++ b/csharp/AdapterTest/packages.config @@ -1,10 +1,11 @@  + - + + - \ No newline at end of file diff --git a/csharp/Repl/Repl.csproj b/csharp/Repl/Repl.csproj index 35d8bd6..faf98f4 100644 --- a/csharp/Repl/Repl.csproj +++ b/csharp/Repl/Repl.csproj @@ -34,6 +34,9 @@ false + + ..\packages\log4net.2.0.8\lib\net45-full\log4net.dll + False ..\packages\Microsoft.Net.Compilers.1.1.1\tools\Microsoft.CodeAnalysis.dll @@ -50,11 +53,13 @@ False ..\packages\Microsoft.Net.Compilers.1.1.1\tools\Microsoft.CodeAnalysis.Scripting.dll + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll + - False ..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll - + ..\packages\Razorvine.Serpent.1.12.0.0\lib\net40\Razorvine.Serpent.dll diff --git a/csharp/Repl/packages.config b/csharp/Repl/packages.config index 76ea838..7c1ac61 100644 --- a/csharp/Repl/packages.config +++ b/csharp/Repl/packages.config @@ -1,8 +1,8 @@  - + - - - + + + \ No newline at end of file diff --git a/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs b/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs index 5f4e5b4..cb6bac8 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/DataFrameSamples.cs @@ -1867,5 +1867,72 @@ namespace Microsoft.Spark.CSharp.Samples SparkCLRSamples.FileSystemHelper.DeleteDirectory(path, true); Console.WriteLine("Remove directory: {0}", path); } + + /// + /// Single UDF Sample + /// + [Sample] + internal static void SingleUDFSample() + { + var sqlContext = GetSqlContext(); + var peopleDataFrame = sqlContext.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson)); + peopleDataFrame.RegisterTempTable("peopleDataFrame"); + + sqlContext.RegisterFunction("UDF", (int x, int y) => { return x + y; 
}); + + var rowSet = sqlContext.Sql("SELECT * FROM peopleDataFrame where UDF(age, 20) > 60"); + + rowSet.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(rowSet.Count() ,2); + } + } + + /// + /// Single UDF Sample with duplicate values + /// + [Sample] + internal static void SingleUDFWithDupSample() + { + var sqlContext = GetSqlContext(); + var peopleDataFrame = sqlContext.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson)); + peopleDataFrame.RegisterTempTable("peopleDataFrame"); + + sqlContext.RegisterFunction("UDF", (int x, int y) => { return x + y; }); + + var rowSet = sqlContext.Sql("SELECT * FROM peopleDataFrame where UDF(age, age) < 50"); + + rowSet.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(rowSet.Count(), 1); + } + } + + /// + /// Multiple UDFs sample + /// + [Sample] + internal static void MultipleUDFSample() + { + var sqlContext = GetSqlContext(); + var peopleDataFrame = sqlContext.Read().Json(SparkCLRSamples.Configuration.GetInputDataPath(PeopleJson)); + peopleDataFrame.RegisterTempTable("peopleDataFrame"); + + sqlContext.RegisterFunction("UDF1", (int x, int y) => { return x + y; }); + sqlContext.RegisterFunction("UDF2", (string name, string id) => { return name + ":" + id; }); + + var rowSet = sqlContext.Sql("SELECT id, name, UDF1(age, 20) AS UDF1, UDF2(name, id) AS UDF2 FROM peopleDataFrame where UDF1(age, 20) > 60"); + + rowSet.Show(); + + if (SparkCLRSamples.Configuration.IsValidationEnabled) + { + Assert.AreEqual(rowSet.Count(), 2); + } + } } } diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Program.cs b/csharp/Samples/Microsoft.Spark.CSharp/Program.cs index 1f25fa2..f9b5af5 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Program.cs +++ b/csharp/Samples/Microsoft.Spark.CSharp/Program.cs @@ -66,8 +66,10 @@ namespace Microsoft.Spark.CSharp.Samples if (Configuration.IsValidationEnabled && !status) { - Environment.Exit(1); + Environment.Exit(2); } + + Environment.Exit(1); } // Creates and returns a context diff --git a/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj b/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj index 880feb2..d28e1d6 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj +++ b/csharp/Samples/Microsoft.Spark.CSharp/Samples.csproj @@ -33,9 +33,11 @@ 4 - - ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll - True + + ..\..\packages\log4net.2.0.8\lib\net45-full\log4net.dll + + + ..\..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll ..\..\packages\NUnit.3.0.1\lib\net45\nunit.framework.dll diff --git a/csharp/Samples/Microsoft.Spark.CSharp/packages.config b/csharp/Samples/Microsoft.Spark.CSharp/packages.config index 4abe7e9..fc5be33 100644 --- a/csharp/Samples/Microsoft.Spark.CSharp/packages.config +++ b/csharp/Samples/Microsoft.Spark.CSharp/packages.config @@ -1,5 +1,6 @@  - + + \ No newline at end of file diff --git a/csharp/Tests.Common/Tests.Common.csproj b/csharp/Tests.Common/Tests.Common.csproj index 361031e..a2ca2c9 100644 --- a/csharp/Tests.Common/Tests.Common.csproj +++ b/csharp/Tests.Common/Tests.Common.csproj @@ -36,11 +36,10 @@ 4 - - False - ..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll - + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll + ..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll diff --git a/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs 
new file mode 100644 index 0000000..0222849 --- /dev/null +++ b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileStatus.cs @@ -0,0 +1,57 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. + +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Proxy.Ipc; + +namespace Microsoft.Spark.CSharp.Utils.FileSystem +{ + /// + /// See https://hadoop.apache.org/docs/r2.6.1/api/org/apache/hadoop/fs/FileStatus.html + /// + public class HdfsFileStatus + { + public long Length => _status.Value.Length; + public long ModificationTime => _status.Value.Time; + public string Owner => _status.Value.Owner; + public string Path => _status.Value.Path; + public bool IsFile => _status.Value.IsFile; + public bool IsDirectory => _status.Value.IsDirectory; + public bool IsSymlink => _status.Value.IsSymlink; + + private Lazy _status; + + internal HdfsFileStatus(JvmObjectReference obj) + { + _status = new Lazy(()=>new Status(obj)); + } + + private class Status + { + public long Length; + public long Time; + public string Owner; + public string Path; + public bool IsFile; + public bool IsDirectory; + public bool IsSymlink; + + public Status(JvmObjectReference obj) + { + Length = (long) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getLen"); + Time = (long)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getModificationTime"); + Owner = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getOwner"); + IsFile = (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "isFile"); + IsDirectory = (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "isDirectory"); + IsSymlink = (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "isSymlink"); + var pr = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(obj, "getPath")); + Path = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(pr, "getName"); + } + } + } +} diff --git a/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs index 52d20c3..c88c93b 100644 --- a/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs +++ b/csharp/Utils/Microsoft.Spark.CSharp/FileSystem/HdfsFileSystemHelper.cs @@ -4,8 +4,11 @@ using System; using System.Collections.Generic; using System.Diagnostics.CodeAnalysis; +using System.Linq; +using Microsoft.Spark.CSharp.Interop; using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Proxy.Ipc; +using Microsoft.Spark.CSharp.Utils.FileSystem; namespace Microsoft.Spark.CSharp.Utils { @@ -18,7 +21,7 @@ namespace Microsoft.Spark.CSharp.Utils public HdfsFileSystemHelper() { - var jvmConfReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.conf.Configuration"); + var jvmConfReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.conf.Configuration"); jvmHdfsReference = new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.hadoop.fs.FileSystem", "get", jvmConfReference)); } @@ -39,16 +42,25 @@ namespace Microsoft.Spark.CSharp.Utils for (var i = 0; i < statusList.Count; i++) { var subPathJvmReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(statusList[i], "getPath")); - files[i] = 
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(subPathJvmReference, "getName"); + files[i] = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(subPathJvmReference, "getName"); } return files; } - /// - /// Build a temp file path under '/tmp' path on HDFS. - /// - public string GetTempFileName() + /// + /// List the names of all the files under the given path. + /// + public IEnumerable ListStatus(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return ((List)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "listStatus", pathJvmReference)).Select(r=>new HdfsFileStatus(r)); + } + + /// + /// Build a temp file path under '/tmp' path on HDFS. + /// + public string GetTempFileName() { return "/tmp/" + Guid.NewGuid().ToString("N"); } @@ -91,5 +103,37 @@ namespace Microsoft.Spark.CSharp.Utils var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "delete", pathJvmReference, recursive); } - } + + public bool IsFile(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "isFile", pathJvmReference); + } + + public bool IsDirectory(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "isDirectory", pathJvmReference); + } + + public bool Touch(string path) + { + var pathJvmReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", path); + return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "createNewFile", pathJvmReference); + } + + public void CopyFromLocalFile(string src, string dest) + { + var from = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", new Uri(src).AbsoluteUri); + var to = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", dest); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "copyFromLocalFile", from, to); + } + + public void CopyToLocalFile(string src, string dest) + { + var to = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", new Uri(dest).AbsoluteUri); + var from = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.hadoop.fs.Path", src); + SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmHdfsReference, "copyToLocalFile", from, to); + } + } } diff --git a/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj b/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj index 60657c7..d089d7d 100644 --- a/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj +++ b/csharp/Utils/Microsoft.Spark.CSharp/Utils.csproj @@ -40,6 +40,7 @@ + diff --git a/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs b/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs index f73e90b..6fb9e1a 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/MultiThreadWorker.cs @@ -111,7 +111,8 @@ namespace Microsoft.Spark.CSharp bool sparkReuseWorker = false; string envVar = Environment.GetEnvironmentVariable("SPARK_REUSE_WORKER"); // this envVar is set in JVM side - if ((envVar != null) && envVar.Equals("1")) + var secret = 
Environment.GetEnvironmentVariable("PYTHON_WORKER_FACTORY_SECRET"); + if ((envVar != null) && envVar.Equals("1")) { sparkReuseWorker = true; } @@ -130,7 +131,7 @@ namespace Microsoft.Spark.CSharp SerDe.Write(s, trId); // write taskRunnerId to JVM side s.Flush(); } - TaskRunner taskRunner = new TaskRunner(trId, socket, sparkReuseWorker); + TaskRunner taskRunner = new TaskRunner(trId, socket, sparkReuseWorker, secret); waitingTaskRunners.Add(taskRunner); taskRunnerRegistry[trId] = taskRunner; trId++; diff --git a/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs b/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs index fb88e43..fb39856 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/TaskRunner.cs @@ -3,7 +3,9 @@ using System; using System.IO; +using System.Net; using System.Runtime.CompilerServices; +using System.Text; using System.Threading; using Microsoft.Spark.CSharp.Configuration; using Microsoft.Spark.CSharp.Interop.Ipc; @@ -13,106 +15,116 @@ using Microsoft.Spark.CSharp.Services; [assembly: InternalsVisibleTo("WorkerTest")] namespace Microsoft.Spark.CSharp { - /// - /// TaskRunner is used to run Spark task assigned by JVM side. It uses a TCP socket to - /// communicate with JVM side. This socket may be reused to run multiple Spark tasks. - /// - internal class TaskRunner - { - private static ILoggerService logger; - private static ILoggerService Logger - { - get - { - if (logger != null) return logger; - logger = LoggerServiceFactory.GetLogger(typeof(TaskRunner)); - return logger; - } - } + /// + /// TaskRunner is used to run Spark task assigned by JVM side. It uses a TCP socket to + /// communicate with JVM side. This socket may be reused to run multiple Spark tasks. + /// + internal class TaskRunner + { + private static ILoggerService logger; + private static ILoggerService Logger + { + get + { + if (logger != null) return logger; + logger = LoggerServiceFactory.GetLogger(typeof(TaskRunner)); + return logger; + } + } - private readonly ISocketWrapper socket; // Socket to communicate with JVM - private volatile bool stop; - private readonly bool socketReuse; // whether the socket can be reused to run multiple Spark tasks + private readonly ISocketWrapper socket; // Socket to communicate with JVM + private volatile bool stop; + private readonly bool socketReuse; // whether the socket can be reused to run multiple Spark tasks + private string secret; - /// - /// Task runner Id - /// - public int TaskId { get; private set; } + /// + /// Task runner Id + /// + public int TaskId { get; private set; } - public TaskRunner(int trId, ISocketWrapper socket, bool socketReuse) - { - TaskId = trId; - this.socket = socket; - this.socketReuse = socketReuse; - } + public TaskRunner(int trId, ISocketWrapper socket, bool socketReuse, string secret) + { + TaskId = trId; + this.socket = socket; + this.socketReuse = socketReuse; + this.secret = secret; + } - public void Run() - { - Logger.LogInfo("TaskRunner [{0}] is running ...", TaskId); + public void Run() + { + Logger.LogInfo("TaskRunner [{0}] is running ...", TaskId); - try - { - while (!stop) - { - using (var inputStream = socket.GetInputStream()) - using (var outputStream = socket.GetOutputStream()) - { - byte[] bytes = SerDe.ReadBytes(inputStream, sizeof(int)); - if (bytes != null) - { - int splitIndex = SerDe.ToInt(bytes); - bool readComplete = Worker.ProcessStream(inputStream, outputStream, splitIndex); - outputStream.Flush(); - if (!readComplete) // if the socket is not read through 
completely, then it can't be reused - { - stop = true; - // wait for server to complete, otherwise server may get 'connection reset' exception - Logger.LogInfo("Sleep 500 millisecond to close socket ..."); - Thread.Sleep(500); - } - else if (!socketReuse) - { - stop = true; - // wait for server to complete, otherwise server gets 'connection reset' exception - // Use SerDe.ReadBytes() to detect java side has closed socket properly - // ReadBytes() will block until the socket is closed - Logger.LogInfo("waiting JVM side to close socket..."); - SerDe.ReadBytes(inputStream); - Logger.LogInfo("JVM side has closed socket"); - } - } - else - { - stop = true; - Logger.LogWarn("read null splitIndex, socket is closed by JVM"); - } - } - } - } - catch (Exception e) - { - stop = true; - Logger.LogError("TaskRunner [{0}] exeption, will dispose this TaskRunner", TaskId); - Logger.LogException(e); - } - finally - { - try - { - socket.Close(); - } - catch (Exception ex) - { - Logger.LogWarn("close socket exception: {0}", ex); - } - Logger.LogInfo("TaskRunner [{0}] finished", TaskId); - } - } + try + { + while (!stop) + { + using (var inputStream = socket.GetInputStream()) + using (var outputStream = socket.GetOutputStream()) + { + if (!string.IsNullOrEmpty(secret)) + { + SerDe.Write(outputStream, secret); + outputStream.Flush(); + var reply = SerDe.ReadString(inputStream); + Logger.LogDebug("Connect back to JVM: " + reply); + secret = null; + } + byte[] bytes = SerDe.ReadBytes(inputStream, sizeof(int)); + if (bytes != null) + { + int splitIndex = SerDe.ToInt(bytes); + bool readComplete = Worker.ProcessStream(inputStream, outputStream, splitIndex); + outputStream.Flush(); + if (!readComplete) // if the socket is not read through completely, then it can't be reused + { + stop = true; + // wait for server to complete, otherwise server may get 'connection reset' exception + Logger.LogInfo("Sleep 500 millisecond to close socket ..."); + Thread.Sleep(500); + } + else if (!socketReuse) + { + stop = true; + // wait for server to complete, otherwise server gets 'connection reset' exception + // Use SerDe.ReadBytes() to detect java side has closed socket properly + // ReadBytes() will block until the socket is closed + Logger.LogInfo("waiting JVM side to close socket..."); + SerDe.ReadBytes(inputStream); + Logger.LogInfo("JVM side has closed socket"); + } + } + else + { + stop = true; + Logger.LogWarn("read null splitIndex, socket is closed by JVM"); + } + } + } + } + catch (Exception e) + { + stop = true; + Logger.LogError("TaskRunner [{0}] exeption, will dispose this TaskRunner", TaskId); + Logger.LogException(e); + } + finally + { + try + { + socket.Close(); + } + catch (Exception ex) + { + Logger.LogWarn("close socket exception: {0}", ex); + } + Logger.LogInfo("TaskRunner [{0}] finished", TaskId); + } + } - public void Stop() - { - Logger.LogInfo("try to stop TaskRunner [{0}]", TaskId); - stop = true; - } - } + public void Stop() + { + Logger.LogInfo("try to stop TaskRunner [{0}]", TaskId); + stop = true; + } + } } diff --git a/csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs b/csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs new file mode 100644 index 0000000..43cf6b5 --- /dev/null +++ b/csharp/Worker/Microsoft.Spark.CSharp/UDFCommand.cs @@ -0,0 +1,391 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+ +using Microsoft.Spark.CSharp.Core; +using Microsoft.Spark.CSharp.Interop.Ipc; +using Microsoft.Spark.CSharp.Services; +using Microsoft.Spark.CSharp.Sql; +using Razorvine.Pickle; +using System; +using System.Collections; +using System.Collections.Generic; +using System.Diagnostics; +using System.IO; +using System.Linq; +using System.Runtime.Serialization; +using System.Runtime.Serialization.Formatters.Binary; + +namespace Microsoft.Spark.CSharp +{ + /// + /// This class execute user defined methods. + /// + + internal class UDFCommand + { + private readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); + private ILoggerService logger; + private Stream inputStream; + private Stream outputStream; + private int splitIndex; + private DateTime bootTime; + private string deserializerMode; + private string serializerMode; + private IFormatter formatter; + private Stopwatch commandProcessWatch; + private int isSqlUdf; + private List workerFuncList; + private int stageId; + + public UDFCommand(Stream inputStream, Stream outputStream, int splitIndex, DateTime bootTime, + string deserializerMode, string serializerMode, IFormatter formatter, + Stopwatch commandProcessWatch, int isSqlUdf, List workerFuncList, int stageId) + { + this.inputStream = inputStream; + this.outputStream = outputStream; + this.splitIndex = splitIndex; + this.bootTime = bootTime; + this.deserializerMode = deserializerMode; + this.serializerMode = serializerMode; + this.formatter = formatter; + this.commandProcessWatch = commandProcessWatch; + this.isSqlUdf = isSqlUdf; + this.workerFuncList = workerFuncList; + this.stageId = stageId; + + InitializeLogger(); + } + + private void InitializeLogger() + { + try + { + // if there exists exe.config file, then use log4net + if (File.Exists(AppDomain.CurrentDomain.SetupInformation.ConfigurationFile)) + { + LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance); + } + + logger = LoggerServiceFactory.GetLogger(typeof(UDFCommand)); + } + catch (Exception e) + { + Console.WriteLine("InitializeLogger exception {0}, will exit", e); + Environment.Exit(-1); + } + } + + internal void Execute() + { + if (isSqlUdf == 0) + { + ExecuteNonSqlUDF(); + } + else + { + ExecuteSqlUDF(); + } + } + + private void ExecuteNonSqlUDF() + { + int count = 0; + int nullMessageCount = 0; + logger.LogDebug("Beginning to execute non sql func"); + WorkerFunc workerFunc = workerFuncList[0]; + var func = workerFunc.CharpWorkerFunc.Func; + + var funcProcessWatch = Stopwatch.StartNew(); + DateTime initTime = DateTime.UtcNow; + foreach (var message in func(splitIndex, GetIterator(inputStream, deserializerMode, isSqlUdf))) + { + funcProcessWatch.Stop(); + + if (object.ReferenceEquals(null, message)) + { + nullMessageCount++; + continue; + } + + try + { + WriteOutput(outputStream, serializerMode, message, formatter); + } + catch (Exception ex) + { + logger.LogError("WriteOutput() failed at iteration {0}, execption {1}", count, ex); + throw; + } + + count++; + funcProcessWatch.Start(); + } + + logger.LogInfo("Output entries count: " + count); + logger.LogDebug("Null messages count: " + nullMessageCount); + + WriteDiagnosticsInfo(outputStream, bootTime, initTime); + + commandProcessWatch.Stop(); + + // log statistics + logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); + logger.LogInfo("stage {0}, command process time: {1}", stageId, commandProcessWatch.ElapsedMilliseconds); + } + + private void ExecuteSqlUDF() + { + int count = 0; + int 
nullMessageCount = 0; + logger.LogDebug("Beginning to execute sql func"); + + var funcProcessWatch = Stopwatch.StartNew(); + DateTime initTime = DateTime.UtcNow; + + foreach (var row in GetIterator(inputStream, deserializerMode, isSqlUdf)) + { + List messages = new List(); + + foreach (WorkerFunc workerFunc in workerFuncList) + { + List args = new List(); + foreach (int offset in workerFunc.ArgOffsets) + { + args.Add(row[offset]); + } + + foreach (var message in workerFunc.CharpWorkerFunc.Func(splitIndex, new[] { args.ToArray()})) + { + funcProcessWatch.Stop(); + + if (object.ReferenceEquals(null, message)) + { + nullMessageCount++; + continue; + } + + messages.Add(message); + } + } + + try + { + dynamic res = messages.ToArray(); + if (messages.Count == 1) + { + res = messages[0]; + } + + WriteOutput(outputStream, serializerMode, res, formatter); + } + catch (Exception ex) + { + logger.LogError("WriteOutput() failed at iteration {0}, exception error {1}", count, ex.Message); + throw; + } + + count++; + funcProcessWatch.Start(); + } + + logger.LogInfo("Output entries count: " + count); + logger.LogDebug("Null messages count: " + nullMessageCount); + + WriteDiagnosticsInfo(outputStream, bootTime, initTime); + + commandProcessWatch.Stop(); + + // log statistics + logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); + logger.LogInfo("stage {0}, command process time: {0}", stageId, commandProcessWatch.ElapsedMilliseconds); + } + + private void WriteOutput(Stream networkStream, string serializerMode, dynamic message, IFormatter formatter) + { + var buffer = GetSerializedMessage(serializerMode, message, formatter); + if (buffer == null) + { + logger.LogError("Buffer is null"); + } + + if (buffer.Length <= 0) + { + logger.LogError("Buffer length {0} cannot be <= 0", buffer.Length); + } + + SerDe.Write(networkStream, buffer.Length); + SerDe.Write(networkStream, buffer); + } + + private byte[] GetSerializedMessage(string serializerMode, dynamic message, IFormatter formatter) + { + byte[] buffer; + + switch ((SerializedMode)Enum.Parse(typeof(SerializedMode), serializerMode)) + { + case SerializedMode.None: + buffer = message as byte[]; + break; + + case SerializedMode.String: + buffer = SerDe.ToBytes(message as string); + break; + + case SerializedMode.Row: + var pickler = new Pickler(); + buffer = pickler.dumps(new ArrayList { message }); + break; + + default: + try + { + var ms = new MemoryStream(); + formatter.Serialize(ms, message); + buffer = ms.ToArray(); + } + catch (Exception ex) + { + logger.LogError("Exception serializing output: " + ex); + logger.LogError("{0} : {1}", message.GetType().Name, message.GetType().FullName); + throw; + } + break; + } + + return buffer; + } + + private void WriteDiagnosticsInfo(Stream networkStream, DateTime bootTime, DateTime initTime) + { + DateTime finishTime = DateTime.UtcNow; + const string format = "MM/dd/yyyy hh:mm:ss.fff tt"; + + logger.LogDebug("bootTime: {0}, initTime: {1}, finish_time: {2}", + bootTime.ToString(format), initTime.ToString(format), finishTime.ToString(format)); + + SerDe.Write(networkStream, (int)SpecialLengths.TIMING_DATA); + SerDe.Write(networkStream, ToUnixTime(bootTime)); + SerDe.Write(networkStream, ToUnixTime(initTime)); + SerDe.Write(networkStream, ToUnixTime(finishTime)); + + SerDe.Write(networkStream, 0L); //shuffle.MemoryBytesSpilled + SerDe.Write(networkStream, 0L); //shuffle.DiskBytesSpilled + } + + private long ToUnixTime(DateTime dt) + { + return (long)(dt - UnixTimeEpoch).TotalMilliseconds; + } 
+ + private IEnumerable<dynamic> GetIterator(Stream inputStream, string serializedMode, int isFuncSqlUdf) + { + logger.LogInfo("Serialized mode in GetIterator: " + serializedMode); + IFormatter formatter = new BinaryFormatter(); + var mode = (SerializedMode)Enum.Parse(typeof(SerializedMode), serializedMode); + int messageLength; + Stopwatch watch = Stopwatch.StartNew(); + Row tempRow = null; + + while ((messageLength = SerDe.ReadInt(inputStream)) != (int)SpecialLengths.END_OF_DATA_SECTION) + { + watch.Stop(); + if (messageLength > 0 || messageLength == (int)SpecialLengths.NULL) + { + watch.Start(); + byte[] buffer = messageLength > 0 ? SerDe.ReadBytes(inputStream, messageLength) : null; + watch.Stop(); + switch (mode) + { + case SerializedMode.String: + { + if (messageLength > 0) + { + if (buffer == null) + { + logger.LogDebug("Buffer is null. Message length is {0}", messageLength); + } + yield return SerDe.ToString(buffer); + } + else + { + yield return null; + } + break; + } + + case SerializedMode.Row: + { + Debug.Assert(messageLength > 0); + var unpickledObjects = PythonSerDe.GetUnpickledObjects(buffer); + + if (isFuncSqlUdf == 0) + { + foreach (var row in unpickledObjects.Select(item => (item as RowConstructor).GetRow())) + { + yield return row; + } + } + else + { + foreach (var row in unpickledObjects) + { + yield return row; + } + } + + break; + } + + case SerializedMode.Pair: + { + byte[] pairKey = buffer; + byte[] pairValue; + + watch.Start(); + int valueLength = SerDe.ReadInt(inputStream); + if (valueLength > 0) + { + pairValue = SerDe.ReadBytes(inputStream, valueLength); + } + else if (valueLength == (int)SpecialLengths.NULL) + { + pairValue = null; + } + else + { + throw new Exception(string.Format("unexpected valueLength: {0}", valueLength)); + } + watch.Stop(); + + yield return new Tuple<byte[], byte[]>(pairKey, pairValue); + break; + } + + case SerializedMode.None: //just return raw bytes + { + yield return buffer; + break; + } + + default: + { + if (buffer != null) + { + var ms = new MemoryStream(buffer); + yield return formatter.Deserialize(ms); + } + else + { + yield return null; + } + break; + } + } + } + watch.Start(); + } + + logger.LogInfo("total receive time: {0}", watch.ElapsedMilliseconds); + } + } +} diff --git a/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs b/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs index 486a1bc..c034ca6 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs +++ b/csharp/Worker/Microsoft.Spark.CSharp/Worker.cs @@ -2,7 +2,6 @@ // Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System; -using System.Collections; using System.Collections.Concurrent; using System.Collections.Generic; using System.IO; @@ -17,8 +16,6 @@ using Microsoft.Spark.CSharp.Core; using Microsoft.Spark.CSharp.Interop.Ipc; using Microsoft.Spark.CSharp.Network; using Microsoft.Spark.CSharp.Services; -using Microsoft.Spark.CSharp.Sql; -using Razorvine.Pickle; namespace Microsoft.Spark.CSharp { @@ -31,7 +28,6 @@ namespace Microsoft.Spark.CSharp /// public class Worker { - private static readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc); private static ILoggerService logger; private static SparkCLRAssemblyHandler assemblyHandler; @@ -81,11 +77,13 @@ namespace Microsoft.Spark.CSharp InitializeLogger(); logger.LogInfo("RunSimpleWorker ..."); PrintFiles(); - - int javaPort = int.Parse(Console.ReadLine()); //reading port number written from JVM - logger.LogDebug("Port number used to pipe in/out data between JVM and CLR {0}", javaPort); + //int javaPort = int.Parse(Console.ReadLine()); //reading port number written from JVM + var javaPort = int.Parse(Environment.GetEnvironmentVariable("PYTHON_WORKER_FACTORY_PORT")); + var secret = Environment.GetEnvironmentVariable("PYTHON_WORKER_FACTORY_SECRET"); + logger.LogDebug("Port and secret number used to pipe in/out data between JVM and CLR {0} {1}", javaPort, secret); var socket = InitializeSocket(javaPort); - TaskRunner taskRunner = new TaskRunner(0, socket, false); + //Microsoft.Spark.CSharp.Network.Utils.DoServerAuth(socket, secret); + TaskRunner taskRunner = new TaskRunner(0, socket, false, secret); taskRunner.Run(); } catch (Exception e) @@ -119,7 +117,7 @@ namespace Microsoft.Spark.CSharp private static ISocketWrapper InitializeSocket(int javaPort) { var socket = SocketFactory.CreateSocket(); - socket.Connect(IPAddress.Loopback, javaPort); + socket.Connect(IPAddress.Loopback, javaPort, null); return socket; } @@ -138,9 +136,13 @@ namespace Microsoft.Spark.CSharp //// initialize global state //shuffle.MemoryBytesSpilled = 0 //shuffle.DiskBytesSpilled = 0 + SerDe.ReadInt(inputStream); + SerDe.ReadInt(inputStream); + SerDe.ReadInt(inputStream); + SerDe.ReadLong(inputStream); - // fetch name of workdir - string sparkFilesDir = SerDe.ReadString(inputStream); + // fetch name of workdir + string sparkFilesDir = SerDe.ReadString(inputStream); logger.LogDebug("spark_files_dir: " + sparkFilesDir); //SparkFiles._root_directory = sparkFilesDir //SparkFiles._is_running_on_worker = True @@ -149,7 +151,7 @@ namespace Microsoft.Spark.CSharp ProcessBroadcastVariables(inputStream); - Accumulator.threadLocalAccumulatorRegistry = new Dictionary(); + Accumulator.threadLocalAccumulatorRegistry = new Dictionary(); var formatter = ProcessCommand(inputStream, outputStream, splitIndex, bootTime); @@ -255,96 +257,119 @@ namespace Microsoft.Spark.CSharp logger.LogDebug("Is func Sql UDF = {0}", isSqlUdf); IFormatter formatter = new BinaryFormatter(); + UDFCommand command = null; if (isSqlUdf == 0) { - logger.LogDebug("Processing non-UDF command"); - int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); - logger.LogDebug("Command length: " + lengthOfCommandByteArray); - - if (lengthOfCommandByteArray > 0) - { - var commandProcessWatch = new Stopwatch(); - commandProcessWatch.Start(); - - int stageId; - string deserializerMode; - string serializerMode; - CSharpWorkerFunc workerFunc; - ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, - out workerFunc); - - ExecuteCommand(inputStream, outputStream, 
splitIndex, bootTime, deserializerMode, workerFunc, serializerMode, - formatter, commandProcessWatch, stageId, isSqlUdf); - } - else - { - logger.LogWarn("lengthOfCommandByteArray = 0. Nothing to execute :-("); - } + command = ProcessNonUdfCommand(inputStream, outputStream, splitIndex, bootTime, formatter, isSqlUdf); } else { - logger.LogDebug("Processing UDF command"); - var udfCount = SerDe.ReadInt(inputStream); - logger.LogDebug("Count of UDFs = {0}", udfCount); + command = ProcessUdfCommand(inputStream, outputStream, splitIndex, bootTime, formatter, isSqlUdf); + } - if (udfCount == 1) - { - CSharpWorkerFunc func = null; - var argCount = SerDe.ReadInt(inputStream); - logger.LogDebug("Count of args = {0}", argCount); - - var argOffsets = new List<int>(); - - for (int argIndex = 0; argIndex < argCount; argIndex++) - { - var offset = SerDe.ReadInt(inputStream); - logger.LogDebug("UDF argIndex = {0}, Offset = {1}", argIndex, offset); - argOffsets.Add(offset); - } - var chainedFuncCount = SerDe.ReadInt(inputStream); - logger.LogDebug("Count of chained func = {0}", chainedFuncCount); - - var commandProcessWatch = new Stopwatch(); - int stageId = -1; - string deserializerMode = null; - string serializerMode = null; - for (int funcIndex = 0; funcIndex < chainedFuncCount; funcIndex++) - { - int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); - logger.LogDebug("UDF command length: " + lengthOfCommandByteArray) - ; - - if (lengthOfCommandByteArray > 0) - { - CSharpWorkerFunc workerFunc; - ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, - out workerFunc); - - func = func == null ? workerFunc : CSharpWorkerFunc.Chain(func, workerFunc); - } - else - { - logger.LogWarn("UDF lengthOfCommandByteArray = 0. Nothing to execute :-("); - } - } - - Debug.Assert(stageId != -1); - Debug.Assert(deserializerMode != null); - Debug.Assert(serializerMode != null); - Debug.Assert(func != null); - ExecuteCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, func, serializerMode, formatter, - commandProcessWatch, stageId, isSqlUdf); - } - else - { - throw new NotSupportedException(); //TODO - add support for multiple UDFs - } + if (command != null) + { + command.Execute(); } return formatter; } + private static UDFCommand ProcessNonUdfCommand(Stream inputStream, Stream outputStream, int splitIndex, + DateTime bootTime, IFormatter formatter, int isSqlUdf) + { + logger.LogDebug("Processing non-UDF command"); + int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); + logger.LogDebug("Command length: " + lengthOfCommandByteArray); + + UDFCommand command = null; + if (lengthOfCommandByteArray > 0) + { + var commandProcessWatch = new Stopwatch(); + commandProcessWatch.Start(); + + int stageId; + string deserializerMode; + string serializerMode; + CSharpWorkerFunc cSharpWorkerFunc; + ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, + out cSharpWorkerFunc); + + command = new UDFCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, + serializerMode, formatter, commandProcessWatch, isSqlUdf, + new List<WorkerFunc>() { new WorkerFunc(cSharpWorkerFunc, 0, null) }, stageId); + + } + else + { + logger.LogWarn("lengthOfCommandByteArray = 0.
Nothing to execute :-("); + } + + return command; + } + + private static UDFCommand ProcessUdfCommand(Stream inputStream, Stream outputStream, int splitIndex, + DateTime bootTime, IFormatter formatter, int isSqlUdf) + { + logger.LogDebug("Processing UDF command"); + var udfCount = SerDe.ReadInt(inputStream); + logger.LogDebug("Count of UDFs = {0}", udfCount); + + int stageId = -1; + string deserializerMode = null; + string serializerMode = null; + var commandProcessWatch = new Stopwatch(); + List<WorkerFunc> workerFuncList = new List<WorkerFunc>(); + + for(int udfIter = 0; udfIter < udfCount; udfIter++) + { + CSharpWorkerFunc func = null; + var argCount = SerDe.ReadInt(inputStream); + logger.LogDebug("Count of args = {0}", argCount); + + List<int> argOffsets = new List<int>(); + for (int argIndex = 0; argIndex < argCount; argIndex++) + { + var offset = SerDe.ReadInt(inputStream); + logger.LogDebug("UDF argIndex = {0}, Offset = {1}", argIndex, offset); + argOffsets.Add(offset); + } + + var chainedFuncCount = SerDe.ReadInt(inputStream); + logger.LogDebug("Count of chained func = {0}", chainedFuncCount); + + for (int funcIndex = 0; funcIndex < chainedFuncCount; funcIndex++) + { + int lengthOfCommandByteArray = SerDe.ReadInt(inputStream); + logger.LogDebug("UDF command length: " + lengthOfCommandByteArray); + + if (lengthOfCommandByteArray > 0) + { + CSharpWorkerFunc workerFunc; + ReadCommand(inputStream, formatter, out stageId, out deserializerMode, out serializerMode, + out workerFunc); + + func = func == null ? workerFunc : CSharpWorkerFunc.Chain(func, workerFunc); + } + else + { + logger.LogWarn("UDF lengthOfCommandByteArray = 0. Nothing to execute :-("); + } + } + + Debug.Assert(stageId != -1); + Debug.Assert(deserializerMode != null); + Debug.Assert(serializerMode != null); + Debug.Assert(func != null); + + workerFuncList.Add(new WorkerFunc(func, argCount, argOffsets)); + } + + return new UDFCommand(inputStream, outputStream, splitIndex, bootTime, deserializerMode, + serializerMode, formatter, commandProcessWatch, isSqlUdf, workerFuncList, stageId); + } + private static void ReadCommand(Stream networkStream, IFormatter formatter, out int stageId, out string deserializerMode, out string serializerMode, out CSharpWorkerFunc workerFunc) @@ -388,116 +413,7 @@ namespace Microsoft.Spark.CSharp "--------------------------------------------------------------------------------------------------------------"); logger.LogDebug(sb.ToString()); } - - private static void ExecuteCommand(Stream inputStream, Stream outputStream, int splitIndex, DateTime bootTime, - string deserializerMode, CSharpWorkerFunc workerFunc, string serializerMode, - IFormatter formatter, Stopwatch commandProcessWatch, int stageId, int isSqlUdf) - { - int count = 0; - int nullMessageCount = 0; - logger.LogDebug("Beginning to execute func"); - var func = workerFunc.Func; - - var funcProcessWatch = Stopwatch.StartNew(); - DateTime initTime = DateTime.UtcNow; - foreach (var message in func(splitIndex, GetIterator(inputStream, deserializerMode, isSqlUdf))) - { - funcProcessWatch.Stop(); - - if (object.ReferenceEquals(null, message)) - { - nullMessageCount++; - continue; - } - - try - { - WriteOutput(outputStream, serializerMode, message, formatter); - } - catch (Exception) - { - logger.LogError("WriteOutput() failed at iteration {0}", count); - throw; - } - - count++; - funcProcessWatch.Start(); - } - - logger.LogInfo("Output entries count: " + count); - logger.LogDebug("Null messages count: " + nullMessageCount); - - //if profiler: - // profiler.profile(process) - 
//else: - // process() - - WriteDiagnosticsInfo(outputStream, bootTime, initTime); - - commandProcessWatch.Stop(); - - // log statistics - logger.LogInfo("func process time: {0}", funcProcessWatch.ElapsedMilliseconds); - logger.LogInfo("stage {0}, command process time: {1}", stageId, commandProcessWatch.ElapsedMilliseconds); - } - - private static void WriteOutput(Stream networkStream, string serializerMode, dynamic message, IFormatter formatter) - { - var buffer = GetSerializedMessage(serializerMode, message, formatter); - if (buffer == null) - { - logger.LogError("Buffer is null"); - } - - if (buffer.Length <= 0) - { - logger.LogError("Buffer length {0} cannot be <= 0", buffer.Length); - } - - //Debug.Assert(buffer != null); - //Debug.Assert(buffer.Length > 0); - SerDe.Write(networkStream, buffer.Length); - SerDe.Write(networkStream, buffer); - } - - private static byte[] GetSerializedMessage(string serializerMode, dynamic message, IFormatter formatter) - { - byte[] buffer; - - switch ((SerializedMode)Enum.Parse(typeof(SerializedMode), serializerMode)) - { - case SerializedMode.None: - buffer = message as byte[]; - break; - - case SerializedMode.String: - buffer = SerDe.ToBytes(message as string); - break; - - case SerializedMode.Row: - var pickler = new Pickler(); - buffer = pickler.dumps(new ArrayList { message }); - break; - - default: - try - { - var ms = new MemoryStream(); - formatter.Serialize(ms, message); - buffer = ms.ToArray(); - } - catch (Exception) - { - logger.LogError("Exception serializing output"); - logger.LogError("{0} : {1}", message.GetType().Name, message.GetType().FullName); - throw; - } - break; - } - - return buffer; - } - + private static int ReadDiagnosticsInfo(Stream networkStream) { int rddId = SerDe.ReadInt(networkStream); @@ -505,22 +421,7 @@ namespace Microsoft.Spark.CSharp int partitionId = SerDe.ReadInt(networkStream); logger.LogInfo("rddInfo: rddId {0}, stageId {1}, partitionId {2}", rddId, stageId, partitionId); return stageId; - } - - private static void WriteDiagnosticsInfo(Stream networkStream, DateTime bootTime, DateTime initTime) - { - DateTime finishTime = DateTime.UtcNow; - const string format = "MM/dd/yyyy hh:mm:ss.fff tt"; - logger.LogDebug("bootTime: {0}, initTime: {1}, finish_time: {2}", - bootTime.ToString(format), initTime.ToString(format), finishTime.ToString(format)); - SerDe.Write(networkStream, (int)SpecialLengths.TIMING_DATA); - SerDe.Write(networkStream, ToUnixTime(bootTime)); - SerDe.Write(networkStream, ToUnixTime(initTime)); - SerDe.Write(networkStream, ToUnixTime(finishTime)); - - SerDe.Write(networkStream, 0L); //shuffle.MemoryBytesSpilled - SerDe.Write(networkStream, 0L); //shuffle.DiskBytesSpilled - } + } private static void WriteAccumulatorValues(Stream networkStream, IFormatter formatter) { @@ -564,121 +465,7 @@ namespace Microsoft.Spark.CSharp logger.LogDebug("Files available in executor"); logger.LogDebug("Location: {0}{1}{2}", folder, Environment.NewLine, outfiles.ToString()); - } - - private static long ToUnixTime(DateTime dt) - { - return (long)(dt - UnixTimeEpoch).TotalMilliseconds; - } - - private static IEnumerable GetIterator(Stream inputStream, string serializedMode, int isFuncSqlUdf) - { - logger.LogInfo("Serialized mode in GetIterator: " + serializedMode); - IFormatter formatter = new BinaryFormatter(); - var mode = (SerializedMode)Enum.Parse(typeof(SerializedMode), serializedMode); - int messageLength; - Stopwatch watch = Stopwatch.StartNew(); - while ((messageLength = SerDe.ReadInt(inputStream)) != 
(int)SpecialLengths.END_OF_DATA_SECTION) - { - watch.Stop(); - if (messageLength > 0 || messageLength == (int)SpecialLengths.NULL) - { - watch.Start(); - byte[] buffer = messageLength > 0 ? SerDe.ReadBytes(inputStream, messageLength) : null; - watch.Stop(); - switch (mode) - { - case SerializedMode.String: - { - if (messageLength > 0) - { - if (buffer == null) - { - logger.LogDebug("Buffer is null. Message length is {0}", messageLength); - } - yield return SerDe.ToString(buffer); - } - else - { - yield return null; - } - break; - } - - case SerializedMode.Row: - { - Debug.Assert(messageLength > 0); - var unpickledObjects = PythonSerDe.GetUnpickledObjects(buffer); - - if (isFuncSqlUdf == 0) - { - foreach (var row in unpickledObjects.Select(item => (item as RowConstructor).GetRow())) - { - yield return row; - } - } - else - { - foreach (var row in unpickledObjects) - { - yield return row; - } - } - - break; - } - - case SerializedMode.Pair: - { - byte[] pairKey = buffer; - byte[] pairValue; - - watch.Start(); - int valueLength = SerDe.ReadInt(inputStream); - if (valueLength > 0) - { - pairValue = SerDe.ReadBytes(inputStream, valueLength); - } - else if (valueLength == (int)SpecialLengths.NULL) - { - pairValue = null; - } - else - { - throw new Exception(string.Format("unexpected valueLength: {0}", valueLength)); - } - watch.Stop(); - - yield return new Tuple(pairKey, pairValue); - break; - } - - case SerializedMode.None: //just return raw bytes - { - yield return buffer; - break; - } - - default: - { - if (buffer != null) - { - var ms = new MemoryStream(buffer); - yield return formatter.Deserialize(ms); - } - else - { - yield return null; - } - break; - } - } - } - watch.Start(); - } - - logger.LogInfo("total receive time: {0}", watch.ElapsedMilliseconds); - } + } internal class SparkCLRAssemblyHandler { diff --git a/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj b/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj index 36c9c1f..2ba4552 100644 --- a/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj +++ b/csharp/Worker/Microsoft.Spark.CSharp/Worker.csproj @@ -46,6 +46,8 @@ + + diff --git a/csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs b/csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs new file mode 100644 index 0000000..0c6a638 --- /dev/null +++ b/csharp/Worker/Microsoft.Spark.CSharp/WorkerFunc.cs @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE file in the project root for full license information. 
+ +using System.Runtime.Serialization; +using Microsoft.Spark.CSharp.Core; +using System.Collections.Generic; + +namespace Microsoft.Spark.CSharp +{ + internal class WorkerFunc + { + internal CSharpWorkerFunc CharpWorkerFunc { get; } + + internal int ArgsCount { get; } + + internal List<int> ArgOffsets { get; } + + internal WorkerFunc(CSharpWorkerFunc func, int argsCount, List<int> argOffsets) + { + CharpWorkerFunc = func; + ArgsCount = argsCount; + ArgOffsets = argOffsets; + } + } +} diff --git a/csharp/WorkerTest/MultiThreadWorkerTest.cs b/csharp/WorkerTest/MultiThreadWorkerTest.cs index 0f0b307..6488ade 100644 --- a/csharp/WorkerTest/MultiThreadWorkerTest.cs +++ b/csharp/WorkerTest/MultiThreadWorkerTest.cs @@ -81,6 +81,7 @@ namespace WorkerTest worker.Start(); int serverPort = 0; serverPort = SerDe.ReadInt(worker.StandardOutput.BaseStream); + Environment.SetEnvironmentVariable("PYTHON_WORKER_FACTORY_PORT", serverPort.ToString()); StreamReader stdoutReader = worker.StandardOutput; Task.Run(() => { @@ -119,7 +120,7 @@ private ISocketWrapper CreateSocket(int serverPort) { var socket =SocketFactory.CreateSocket(); - socket.Connect(IPAddress.Loopback, serverPort); + socket.Connect(IPAddress.Loopback, serverPort, null); return socket; } @@ -131,6 +132,10 @@ { SerDe.Write(s, splitIndex); SerDe.Write(s, ver); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0L); SerDe.Write(s, sparkFilesDir); SerDe.Write(s, numberOfIncludesItems); SerDe.Write(s, numBroadcastVariables); diff --git a/csharp/WorkerTest/WorkerTest.cs b/csharp/WorkerTest/WorkerTest.cs index 1826437..1c0f6ea 100644 --- a/csharp/WorkerTest/WorkerTest.cs +++ b/csharp/WorkerTest/WorkerTest.cs @@ -93,6 +93,7 @@ namespace WorkerTest } }; + Environment.SetEnvironmentVariable("PYTHON_WORKER_FACTORY_PORT", port.ToString()); lock (syncLock) { output.Clear(); @@ -125,6 +126,10 @@ { SerDe.Write(s, splitIndex); SerDe.Write(s, ver); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0L); SerDe.Write(s, sparkFilesDir); SerDe.Write(s, numberOfIncludesItems); SerDe.Write(s, numBroadcastVariables); @@ -631,6 +636,10 @@ { SerDe.Write(s, splitIndex); SerDe.Write(s, ver); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0); + SerDe.Write(s, 0L); SerDe.Write(s, sparkFilesDir); SerDe.Write(s, numberOfIncludesItems); @@ -802,6 +811,10 @@ using (var inputStream = new MemoryStream(500)) { SerDe.Write(inputStream, "1.0"); //version + SerDe.Write(inputStream, 0); + SerDe.Write(inputStream, 0); + SerDe.Write(inputStream, 0); + SerDe.Write(inputStream, 0L); SerDe.Write(inputStream, ""); //includes directory SerDe.Write(inputStream, 0); //number of included items SerDe.Write(inputStream, 0); //number of broadcast variables diff --git a/csharp/WorkerTest/WorkerTest.csproj b/csharp/WorkerTest/WorkerTest.csproj index 76c9ba8..8fa76de 100644 --- a/csharp/WorkerTest/WorkerTest.csproj +++ b/csharp/WorkerTest/WorkerTest.csproj @@ -35,9 +35,8 @@ 4 - - False - ..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll + + ..\packages\Newtonsoft.Json.11.0.2\lib\net45\Newtonsoft.Json.dll False diff --git a/examples/Batch/WordCount/WordCount.csproj b/examples/Batch/WordCount/WordCount.csproj index b655eb8..1961a0b 100644 --- a/examples/Batch/WordCount/WordCount.csproj +++ b/examples/Batch/WordCount/WordCount.csproj @@ -32,17 +32,17 @@ 4 - + False - 
..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -84,4 +84,4 @@ --> - + \ No newline at end of file diff --git a/examples/Batch/pi/Pi.csproj b/examples/Batch/pi/Pi.csproj index df0916b..464f4b5 100644 --- a/examples/Batch/pi/Pi.csproj +++ b/examples/Batch/pi/Pi.csproj @@ -35,17 +35,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -84,4 +84,4 @@ --> - + \ No newline at end of file diff --git a/examples/Examples.sln b/examples/Examples.sln index 5ba0d23..3eaad7a 100644 --- a/examples/Examples.sln +++ b/examples/Examples.sln @@ -1,6 +1,6 @@ Microsoft Visual Studio Solution File, Format Version 12.00 # Visual Studio 14 -VisualStudioVersion = 14.0.25123.0 +VisualStudioVersion = 14.0.25420.1 MinimumVisualStudioVersion = 10.0.40219.1 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "HdfsWordCount", "Streaming\HdfsWordCount\HdfsWordCount.csproj", "{6A2C7CF9-D64E-490D-9841-269EE14F7932}" EndProject diff --git a/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj b/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj index 30fd07f..2f38f46 100644 --- a/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj +++ b/examples/Sql/CassandraDataFrame/CassandraDataFrame.csproj @@ -34,14 +34,17 @@ 4 - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -80,4 +83,4 @@ --> - + \ No newline at end of file diff --git a/examples/Sql/HiveDataFrame/HiveDataFrame.csproj b/examples/Sql/HiveDataFrame/HiveDataFrame.csproj index 0040a3e..c826a80 100644 --- a/examples/Sql/HiveDataFrame/HiveDataFrame.csproj +++ b/examples/Sql/HiveDataFrame/HiveDataFrame.csproj @@ -33,17 +33,17 @@ 4 + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe + False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe - True - - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll - True + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll @@ -75,4 +75,4 @@ --> - + \ No newline at end of file diff --git a/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj 
b/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj index 24ecf84..fb4fc63 100644 --- a/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj +++ b/examples/Sql/JdbcDataFrame/JdbcDataFrame.csproj @@ -34,17 +34,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -82,4 +82,4 @@ --> - + \ No newline at end of file diff --git a/examples/Sql/SparkXml/SparkXml.csproj b/examples/Sql/SparkXml/SparkXml.csproj index d770125..622b6a2 100644 --- a/examples/Sql/SparkXml/SparkXml.csproj +++ b/examples/Sql/SparkXml/SparkXml.csproj @@ -34,17 +34,17 @@ 4 - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - + False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -82,4 +82,4 @@ --> - + \ No newline at end of file diff --git a/examples/Streaming/EventHub/EventHub.csproj b/examples/Streaming/EventHub/EventHub.csproj index cc6d4e2..934eae5 100644 --- a/examples/Streaming/EventHub/EventHub.csproj +++ b/examples/Streaming/EventHub/EventHub.csproj @@ -34,16 +34,18 @@ 4 - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -85,4 +87,4 @@ --> - + \ No newline at end of file diff --git a/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj b/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj index 34facbb..c58ceae 100644 --- a/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj +++ b/examples/Streaming/HdfsWordCount/HdfsWordCount.csproj @@ -32,22 +32,22 @@ 4 + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe + + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll + + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll + False ..\..\packages\Newtonsoft.Json.7.0.1\lib\net45\Newtonsoft.Json.dll - - False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe - - - False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - - - False - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll - False ..\..\packages\Razorvine.Pyrolite.4.10.0.0\lib\net40\Razorvine.Pyrolite.dll @@ -80,4 +80,4 @@ --> - + \ No newline at end of file diff --git a/examples/Streaming/Kafka/Kafka.csproj b/examples/Streaming/Kafka/Kafka.csproj index 2bdaa81..68b15a7 100644 --- a/examples/Streaming/Kafka/Kafka.csproj +++ 
b/examples/Streaming/Kafka/Kafka.csproj @@ -32,15 +32,17 @@ 4 - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe - - + False - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll + + + False + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll False @@ -79,4 +81,4 @@ --> - + \ No newline at end of file diff --git a/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj b/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj index e91905a..81f5a19 100644 --- a/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj +++ b/examples/fsharp/JsonDataFrame/JsonDataFrame.fsproj @@ -66,13 +66,13 @@ - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll @@ -98,4 +98,4 @@ --> - + \ No newline at end of file diff --git a/examples/fsharp/WordCount/WordCountFSharp.fsproj b/examples/fsharp/WordCount/WordCountFSharp.fsproj index af96e49..86c3bda 100644 --- a/examples/fsharp/WordCount/WordCountFSharp.fsproj +++ b/examples/fsharp/WordCount/WordCountFSharp.fsproj @@ -71,20 +71,17 @@ - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\CSharpWorker.exe - True + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\CSharpWorker.exe ..\..\packages\FSharp.Core.4.0.0.1\lib\net40\FSharp.Core.dll True - ..\..\packages\log4net.2.0.5\lib\net45-full\log4net.dll - True + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\log4net.dll - ..\..\packages\Microsoft.SparkCLR.2.0.200\lib\net45\Microsoft.Spark.CSharp.Adapter.dll - True + ..\..\..\csharp\Worker\Microsoft.Spark.CSharp\bin\Debug\Microsoft.Spark.CSharp.Adapter.dll @@ -110,4 +107,4 @@ --> - + \ No newline at end of file diff --git a/notes/running-mobius-app.md b/notes/running-mobius-app.md index b430a0d..ea776a3 100644 --- a/notes/running-mobius-app.md +++ b/notes/running-mobius-app.md @@ -145,7 +145,7 @@ The following sample commands show how to run Mobius examples in local mode. Usi Computes the _approximate_ value of Pi using two appropaches and displays the value. 
### WordCount Example (Batch) -* Run `sparkclr-submit.cmd --exe SparkClrWordCount.exe C:\Git\Mobius\examples\Batch\WordCount\bin\Debug ` +* Run `sparkclr-submit.cmd --exe SparkClrPi.exe C:\Git\Mobius\examples\Batch\WordCount\bin\Debug ` `InputFilePath` should be in one of the following formats: * `hdfs://path/to/inputfile` diff --git a/scala/pom.xml b/scala/pom.xml index cb9ce90..ec526cd 100644 --- a/scala/pom.xml +++ b/scala/pom.xml @@ -2,7 +2,7 @@ 4.0.0 com.microsoft.sparkclr spark-clr_2.11 - 2.0.200-SNAPSHOT + 2.3.1-SNAPSHOT Mobius Project C# language binding and extensions to Apache Spark https://github.com/Microsoft/Mobius @@ -35,7 +35,7 @@ 1.5 UTF-8 2.11.8 - 2.0.2 + 2.3.1 2.11 @@ -106,14 +106,19 @@ org.apache.spark spark-hive_2.11 - 2.0.0 + ${spark.version} com.databricks - spark-csv_2.10 - 1.4.0 + spark-csv_2.11 + 1.5.0 + + + com.databricks + spark-avro_2.11 + 4.0.0 diff --git a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala index d48e9f3..57ca361 100644 --- a/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala +++ b/scala/src/main/org/apache/spark/api/csharp/CSharpRDD.scala @@ -12,6 +12,7 @@ import java.util.{List => JList, Map => JMap} import org.apache.hadoop.io.compress.CompressionCodec import org.apache.spark.api.python._ +import org.apache.spark.api.python.PythonAccumulatorV2 import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark._ @@ -34,7 +35,7 @@ class CSharpRDD( cSharpWorkerExecutable: String, unUsedVersionIdentifier: String, broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]]) + accumulator: PythonAccumulatorV2) extends PythonRDD ( parent, SQLUtils.createCSharpFunction(command, envVars, cSharpIncludes, cSharpWorkerExecutable, @@ -95,7 +96,7 @@ class CSharpRDD( logInfo("Env vars: " + envVars.asScala.mkString(", ")) val runner = new PythonRunner( - Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuse_worker, false, Array(Array(0))) + Seq(ChainedPythonFunctions(Seq(func))), bufferSize, reuseWorker) runner.compute(firstParent.iterator(split, context), split.index, context) } diff --git a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala index c01d76a..79af72c 100644 --- a/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala +++ b/scala/src/main/org/apache/spark/sql/api/csharp/SQLUtils.scala @@ -8,6 +8,7 @@ package org.apache.spark.sql.api.csharp import java.io.{ByteArrayOutputStream, DataOutputStream} import org.apache.spark.{Accumulator, SparkContext} +import org.apache.spark.api.python.PythonAccumulatorV2 import org.apache.spark.api.csharp.SerDe import org.apache.spark.api.java.{JavaRDD, JavaSparkContext} import org.apache.spark.api.python.{PythonBroadcast, PythonFunction, SerDeUtil} @@ -51,7 +52,7 @@ object SQLUtils { cSharpWorkerExecutable: String, unUsedVersionIdentifier: String, broadcastVars: JList[Broadcast[PythonBroadcast]], - accumulator: Accumulator[JList[Array[Byte]]]) : PythonFunction = { + accumulator: PythonAccumulatorV2) : PythonFunction = { PythonFunction(command, envVars, cSharpIncludes, cSharpWorkerExecutable, unUsedVersionIdentifier, broadcastVars, accumulator) } diff --git a/scala/src/main/org/apache/spark/util/csharp/Utils.scala b/scala/src/main/org/apache/spark/util/csharp/Utils.scala index 7bb7419..7294cae 100644 --- a/scala/src/main/org/apache/spark/util/csharp/Utils.scala +++ 
b/scala/src/main/org/apache/spark/util/csharp/Utils.scala @@ -127,17 +127,17 @@ object Utils extends Logging { timer.schedule(new TimerTask() { @Override def run() { - Runtime.getRuntime.halt(status) + if (status!=0) { Runtime.getRuntime.halt(status); } } }, maxDelayMillis) // try to exit nicely - System.exit(status); + if (status!=0) { System.exit(status); } } catch { // exit nastily if we have a problem case ex: Throwable => Runtime.getRuntime.halt(status) } finally { // should never get here - Runtime.getRuntime.halt(status) + if (status!=0) { Runtime.getRuntime.halt(status); } } } @@ -147,7 +147,7 @@ object Utils extends Logging { * @param status the exit status, zero for OK, non-zero for error */ def exit(status: Int): Unit = { - exit(status, 1000) + exit(status, 1000); } private[spark] def listZipFileEntries(file: File): Array[String] = { diff --git a/scripts/sparkclr-submit.cmd b/scripts/sparkclr-submit.cmd index c6e1d50..5f119c8 100644 --- a/scripts/sparkclr-submit.cmd +++ b/scripts/sparkclr-submit.cmd @@ -42,7 +42,7 @@ if not exist "%SPARK_JARS_DIR%" ( set SPARK_JARS_CLASSPATH=%SPARK_JARS_DIR%\* -if not defined SPARKCLR_JAR (set SPARKCLR_JAR=spark-clr_2.11-2.0.200-SNAPSHOT.jar) +if not defined SPARKCLR_JAR (set SPARKCLR_JAR=spark-clr_2.11-2.3.1-SNAPSHOT.jar) echo [sparkclr-submit.cmd] SPARKCLR_JAR=%SPARKCLR_JAR% set SPARKCLR_CLASSPATH=%SPARKCLR_HOME%\lib\%SPARKCLR_JAR% REM SPARKCLR_DEBUGMODE_EXT_JARS environment variable is used to specify external dependencies to use in debug mode @@ -105,4 +105,4 @@ goto :eof @echo Example 2: @echo sparkclr-submit.cmd [--verbose] [--master local] [--deploy-mode client] [--name testapp] --exe csdriver.exe c:\sparkclrapp\driver.zip arg1 arg2 arg3 @echo Example 3: - @echo sparkclr-submit.cmd [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar hdfs://path/to/spark-clr-1.6.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3 + @echo sparkclr-submit.cmd [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar hdfs://path/to/spark-clr_2.11-2.3.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3 diff --git a/scripts/sparkclr-submit.sh b/scripts/sparkclr-submit.sh index 5d94efa..e4ca34f 100755 --- a/scripts/sparkclr-submit.sh +++ b/scripts/sparkclr-submit.sh @@ -32,7 +32,7 @@ function usage() { echo "Example 2:" echo "sparkclr-submit.sh [--verbose] [--master local] [--deploy-mode client] [--name testapp] --exe csdriver.exe sparkclrapp/driver.zip arg1 arg2 arg3" echo "Example 3:" - echo "sparkclr-submit.sh [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar --remote-sparkclr-jar hdfs://path/to/spark-clr_2.10-1.6.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3" + echo "sparkclr-submit.sh [--verbose] --master spark://host:port --deploy-mode cluster [--name testapp] --exe csdriver.exe --remote-sparkclr-jar --remote-sparkclr-jar hdfs://path/to/spark-clr_2.11-2.3.1-SNAPSHOT.jar hdfs://path/to/driver.zip arg1 arg2 arg3" } [ "$SPARK_HOME" = "" ] && spark_home_error @@ -57,7 +57,7 @@ fi export SPARK_JARS_CLASSPATH="$SPARK_JARS_DIR/*" -export SPARKCLR_JAR=spark-clr_2.11-2.0.200-SNAPSHOT.jar +export SPARKCLR_JAR=spark-clr_2.11-2.3.1-SNAPSHOT.jar export SPARKCLR_CLASSPATH="$SPARKCLR_HOME/lib/$SPARKCLR_JAR" # SPARKCLR_DEBUGMODE_EXT_JARS environment variable is used to specify external dependencies to use in debug mode [ ! 
"$SPARKCLR_DEBUGMODE_EXT_JARS" = "" ] && export SPARKCLR_CLASSPATH="$SPARKCLR_CLASSPATH:$SPARKCLR_DEBUGMODE_EXT_JARS"