Merge branch 'master' of https://github.com/microsoft/Mobius into REPL
Commit 470bb411fe

@@ -30,6 +30,8 @@
scala/dependency-reduced-pom.xml
build/runtime/
build/tools/
build/examples/
build/dependencies/
*.log
lib/

@@ -6,9 +6,9 @@ before_install:
- sudo apt-get install xsltproc
- nuget install NUnit.Runners -Version 3.0.0 -OutputDirectory testrunner
# install maven 3.3.3
- wget http://archive.apache.org/dist/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.tar.gz
- tar zxf apache-maven-3.3.3-bin.tar.gz && rm apache-maven-3.3.3-bin.tar.gz
- export M2_HOME="$PWD/apache-maven-3.3.3"
- wget http://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
- tar zxf apache-maven-3.3.9-bin.tar.gz && rm apache-maven-3.3.9-bin.tar.gz
- export M2_HOME="$PWD/apache-maven-3.3.9"
- export M2="$M2_HOME/bin"
- export PATH="$M2:$PATH"
- hash -r

101 README.md

@@ -1,6 +1,7 @@
<h1><img src='/logo/spark-clr-clear-500x200.png' width='200px' alt='SparkCLR logo' /></h1>
<img src='logo/mobius-star-200.png' width='125px' alt='Mobius logo' />
# Mobius: C# API for Spark

[SparkCLR](https://github.com/Microsoft/SparkCLR) (pronounced Sparkler) adds C# language binding to [Apache Spark](https://spark.apache.org/), enabling the implementation of Spark driver code and data processing operations in C#.
[Mobius](https://github.com/Microsoft/Mobius) adds C# language binding to [Apache Spark](https://spark.apache.org/), enabling the implementation of Spark driver code and data processing operations in C#.

For example, the word count sample in Apache Spark can be implemented in C# as follows:
@@ -49,64 +50,108 @@ maxLatencyByDcDataFrame.ShowSchema();
maxLatencyByDcDataFrame.Show();
```

Refer to [SparkCLR\csharp\Samples](csharp/Samples) directory and [sample usage](csharp/Samples/Microsoft.Spark.CSharp/samplesusage.md) for complete samples.
A simple Spark Streaming application that processes messages from Kafka using C# may be implemented using the following code:

``` c#
StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
{
    var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
    ssc.Checkpoint(checkpointPath);
    var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);
    //message format: [timestamp],[loglevel],[logmessage]
    var countByLogLevelAndTime = stream
        .Map(kvp => Encoding.UTF8.GetString(kvp.Value))
        .Filter(line => line.Contains(","))
        .Map(line => line.Split(','))
        .Map(columns => new KeyValuePair<string, int>(
            string.Format("{0},{1}", columns[0], columns[1]), 1))
        .ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y,
            windowDurationInSecs, slideDurationInSecs, 3)
        .Map(logLevelCountPair => string.Format("{0},{1}",
            logLevelCountPair.Key, logLevelCountPair.Value));
    countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
    {
        foreach (var logCount in countByLogLevel.Collect())
            Console.WriteLine(logCount);
    });
    return ssc;
});
sparkStreamingContext.Start();
sparkStreamingContext.AwaitTermination();
```
Refer to [Mobius\csharp\Samples](csharp/Samples) directory and [sample usage](csharp/Samples/Microsoft.Spark.CSharp/samplesusage.md) for complete samples.

## API Documentation

Refer to [SparkCLR C# API documentation](csharp/Adapter/documentation/SparkCLR_API_Documentation.md) for the list of Spark's data processing operations supported in SparkCLR.
Refer to [Mobius C# API documentation](csharp/Adapter/documentation/Mobius_API_Documentation.md) for the list of Spark's data processing operations supported in Mobius.

## API Usage

SparkCLR API usage samples are available at:
Mobius API usage samples are available at:

* [Samples project](csharp/Samples/Microsoft.Spark.CSharp/) which uses a comprehensive set of SparkCLR APIs to implement samples that are also used for functional validation of APIs
* [Examples folder](./examples) which contains standalone [C# projects](/notes/running-mobius-app.md#running-mobius-examples-in-local-mode) that can be used as templates to start developing Mobius applications

* [Examples folder](./examples) which contains standalone SparkCLR projects that can be used as templates to start developing SparkCLR applications

* Performance test scenarios implemented in [C#](csharp/Perf/Microsoft.Spark.CSharp) and [Scala](scala/perf) for side by side comparison of Spark driver code
* [Samples project](csharp/Samples/Microsoft.Spark.CSharp/) which uses a comprehensive set of Mobius APIs to implement samples that are also used for functional validation of APIs

* Mobius performance test scenarios implemented in [C#](csharp/Perf/Microsoft.Spark.CSharp) and [Scala](scala/perf) for side by side comparison of Spark driver code

## Documents

Refer to the [docs folder](docs) for design overview and other info on SparkCLR
Refer to the [docs folder](docs) for design overview and other info on Mobius

## Build Status

|Ubuntu 14.04.3 LTS |Windows |Unit test coverage |
|-------------------|:------:|:-----------------:|
|[![Build status](https://travis-ci.org/Microsoft/SparkCLR.svg?branch=master)](https://travis-ci.org/Microsoft/SparkCLR) |[![Build status](https://ci.appveyor.com/api/projects/status/lflkua81gg0swv6i/branch/master?svg=true)](https://ci.appveyor.com/project/SparkCLR/sparkclr/branch/master) |[![codecov.io](https://codecov.io/github/Microsoft/SparkCLR/coverage.svg?branch=master)](https://codecov.io/github/Microsoft/SparkCLR?branch=master) |
|[![Build status](https://travis-ci.org/Microsoft/Mobius.svg?branch=master)](https://travis-ci.org/Microsoft/Mobius) |[![Build status](https://ci.appveyor.com/api/projects/status/lflkua81gg0swv6i/branch/master?svg=true)](https://ci.appveyor.com/project/SparkCLR/sparkclr/branch/master) |[![codecov.io](https://codecov.io/github/Microsoft/Mobius/coverage.svg?branch=master)](https://codecov.io/github/Microsoft/Mobius?branch=master) |

## Getting Started

| |Windows |Linux |
|---|:------:|:----:|
|Build & run unit tests |[windows-instructions.md](notes/windows-instructions.md#building-sparkclr) |[linux-instructions.md](notes/linux-instructions.md#building-sparkclr) |
|Run samples (functional tests) in local mode |[windows-instructions.md](notes/windows-instructions.md#running-samples) |[linux-instructions.md](notes/linux-instructions.md#running-samples) |
|Run standalone examples in Client mode |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#client-mode) |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#client-mode) |
|Run standalone examples in Cluster mode |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#cluster-mode) |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#cluster-mode) |
|---|:------|:----|
|Build & run unit tests |[Build in Windows](notes/windows-instructions.md#building-mobius) |[Build in Linux](notes/linux-instructions.md#building-mobius-in-linux) |
|Run samples (functional tests) in local mode |[Samples in Windows](notes/windows-instructions.md#running-samples) |[Samples in Linux](notes/linux-instructions.md#running-mobius-samples-in-linux) |
|Run examples in local mode |[Examples in Windows](/notes/running-mobius-app.md#running-mobius-examples-in-local-mode) |[Examples in Linux](notes/linux-instructions.md#running-mobius-examples-in-linux) |
|Run Mobius app |<ul><li>[Standalone cluster](notes/running-mobius-app.md#standalone-cluster)</li><li>[YARN cluster](notes/running-mobius-app.md#yarn-cluster)</li></ul> |<ul><li>[Linux cluster](notes/linux-instructions.md#running-mobius-applications-in-linux)</li><li>[Azure HDInsight Spark Cluster](/notes/linux-instructions.md#mobius-in-azure-hdinsight-spark-cluster)</li><li>[AWS EMR Spark Cluster](/notes/linux-instructions.md#mobius-in-amazon-web-services-emr-spark-cluster)</li></ul> |

Note: Refer to [linux-compatibility.md](notes/linux-compatibility.md) for using SparkCLR with Spark on Linux
### Useful Links
* [Configuration parameters in Mobius](/notes/configuration-mobius.md)
* [Troubleshoot errors in Mobius](/notes/troubleshooting-mobius.md)
* [Debug Mobius apps](/notes/running-mobius-app.md#debug-mode)

## Supported Spark Versions

SparkCLR is built and tested with [Spark 1.4.1](https://github.com/Microsoft/SparkCLR/tree/branch-1.4), [Spark 1.5.2](https://github.com/Microsoft/SparkCLR/tree/branch-1.5) and [Spark 1.6.0](https://github.com/Microsoft/SparkCLR/tree/master).
Mobius is built and tested with Apache Spark [1.4.1](https://github.com/Microsoft/Mobius/tree/branch-1.4), [1.5.2](https://github.com/Microsoft/Mobius/tree/branch-1.5) and [1.6.*](https://github.com/Microsoft/Mobius/tree/branch-1.6).

## Releases

Mobius releases are available at https://github.com/Microsoft/Mobius/releases. References needed to build a C# Spark driver application using Mobius are also available in [NuGet](https://www.nuget.org/packages/Microsoft.SparkCLR)

[![NuGet Badge](https://buildstats.info/nuget/Microsoft.SparkCLR)](https://www.nuget.org/packages/Microsoft.SparkCLR)

Refer to [mobius-release-info.md](notes/mobius-release-info.md) for the details on versioning policy and the contents of the release.

## License

[![License](https://img.shields.io/badge/license-MIT-blue.svg?style=plastic)](https://github.com/Microsoft/SparkCLR/blob/master/LICENSE)
[![License](https://img.shields.io/badge/license-MIT-blue.svg?style=plastic)](https://github.com/Microsoft/Mobius/blob/master/LICENSE)

SparkCLR is licensed under the MIT license. See [LICENSE](LICENSE) file for full license information.
Mobius is licensed under the MIT license. See [LICENSE](LICENSE) file for full license information.


## Community

[![Issue Stats](http://issuestats.com/github/Microsoft/SparkCLR/badge/pr)](http://issuestats.com/github/Microsoft/SparkCLR)
[![Issue Stats](http://issuestats.com/github/Microsoft/SparkCLR/badge/issue)](http://issuestats.com/github/Microsoft/SparkCLR)
[![Join the chat at https://gitter.im/Microsoft/SparkCLR](https://badges.gitter.im/Microsoft/SparkCLR.svg)](https://gitter.im/Microsoft/SparkCLR?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Issue Stats](http://issuestats.com/github/Microsoft/Mobius/badge/pr)](http://issuestats.com/github/Microsoft/Mobius)
[![Issue Stats](http://issuestats.com/github/Microsoft/Mobius/badge/issue)](http://issuestats.com/github/Microsoft/Mobius)
[![Join the chat at https://gitter.im/Microsoft/Mobius](https://badges.gitter.im/Microsoft/Mobius.svg)](https://gitter.im/Microsoft/Mobius?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Twitter](https://img.shields.io/twitter/url/http/twitter.com/MobiusForSpark.svg?style=social)](https://twitter.com/intent/tweet?text=@MobiusForSpark [your tweet] via @GitHub)

* SparkCLR project welcomes contributions. To contribute, follow the instructions in [CONTRIBUTING.md](notes/CONTRIBUTING.md)
* Mobius project welcomes contributions. To contribute, follow the instructions in [CONTRIBUTING.md](notes/CONTRIBUTING.md)

* Options to ask your question to the SparkCLR community
  * create issue on [GitHub](https://github.com/Microsoft/SparkCLR)
* Options to ask your question to the Mobius community
  * create issue on [GitHub](https://github.com/Microsoft/Mobius)
  * create post with "sparkclr" tag in [Stack Overflow](https://stackoverflow.com/questions/tagged/sparkclr)
  * send email to sparkclr-user@googlegroups.com
  * join chat at [SparkCLR room in Gitter](https://gitter.im/Microsoft/SparkCLR)
  * join chat at [Mobius room in Gitter](https://gitter.im/Microsoft/Mobius)
  * tweet [@MobiusForSpark](http://twitter.com/MobiusForSpark)

## Code of Conduct

This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

15 appveyor.yml

@@ -1,4 +1,4 @@
version: 1.6.0-SNAPSHOT.{build}
version: 1.6.2-SNAPSHOT.{build}

environment:
  securefile:

@@ -26,6 +26,7 @@ build_script:
- cmd: SET MAVEN_OPTS=-XX:MaxPermSize=2g -Xmx4g
- cmd: SET JAVA_OPTS=-XX:MaxPermSize=2g -Xmx4g
- cmd: SET MVN_QUIET=--quiet
- ps: if($env:APPVEYOR_REPO_TAG -eq $FALSE) {.\dev\scripts\SetSparkClrNugetPackageVersion.ps1 -nuspecDir .\csharp -version $env:APPVEYOR_BUILD_VERSION}
- cmd: cd .\build
- cmd: .\Build.cmd
- cmd: cd ..

@@ -45,7 +46,7 @@ after_test:
- pip install codecov
- codecov -f "SparkCLRCodeCoverage.xml"
- cmd: cd .\build\localmode
- cmd: .\Runsamples.cmd --validate
- cmd: if not defined ProjectVersion (.\Runsamples.cmd --validate)
- cmd: cd ..\..
- cmd: dir csharp\Microsoft*.nupkg
- cmd: dir scala\target\spark-clr*.jar

@@ -66,7 +67,15 @@ deploy:
- provider: NuGet # deploy to NuGet.org
  api_key:
    secure: TscZXMoOxrMfjR2TvGBns6b+IILWvo0WJpxikoGsMCqEcMj/x41Le1j8dHTCJMjI
  skip_symbols: false # push symbols to SymbolSource.org
  skip_symbols: false
  artifact: /Microsoft.*\.nupkg/
  on:
    appveyor_repo_tag: true # deploy on tag push only

- provider: NuGet # deploy to MyGet.org
  server: https://www.myget.org/F/mobiusforspark/api/v2/package
  api_key:
    secure: 1c6+PZ3zOdIgIy2y8rf1g/NfbcfoxwNcymNBUr1591mD3Ull2X32Qvw2QyCXqFka
  skip_symbols: false
  symbol_server: https://www.myget.org/F/mobiusforspark/api/v2/package
  artifact: /Microsoft.*\.nupkg/

@ -38,14 +38,14 @@ if NOT EXIST "%SPARKCLR_HOME%\lib" mkdir "%SPARKCLR_HOME%\lib"
|
|||
if NOT EXIST "%SPARKCLR_HOME%\samples" mkdir "%SPARKCLR_HOME%\samples"
|
||||
if NOT EXIST "%SPARKCLR_HOME%\repl" mkdir "%SPARKCLR_HOME%\repl"
|
||||
|
||||
@echo Assemble SparkCLR Scala components
|
||||
@echo Assemble Mobius Scala components
|
||||
pushd "%CMDHOME%\..\scala"
|
||||
|
||||
@rem clean the target directory first
|
||||
call mvn.cmd %MVN_QUIET% clean
|
||||
|
||||
@rem
|
||||
@rem Note: Shade-plugin helps create an uber-package to simplify SparkCLR job submission;
|
||||
@rem Note: Shade-plugin helps create an uber-package to simplify running samples during CI;
|
||||
@rem however, it breaks debug mode in IntelliJ. So enable shade-plugin
|
||||
@rem only in build.cmd to create the uber-package.
|
||||
@rem
|
||||
|
@ -80,19 +80,22 @@ IF "%APPVEYOR_REPO_TAG%" == "true" (goto :sign)
|
|||
|
||||
:mvndone
|
||||
|
||||
set MVN_ERRORLEVEL=%ERRORLEVEL%
|
||||
|
||||
@rem
|
||||
@rem After uber package is created, restore Pom.xml
|
||||
@rem
|
||||
copy /y %temp%\pom.xml.original pom.xml
|
||||
|
||||
if %ERRORLEVEL% NEQ 0 (
|
||||
@echo Build SparkCLR Scala components failed, stop building.
|
||||
if %MVN_ERRORLEVEL% NEQ 0 (
|
||||
@echo Build Mobius Scala components failed, stop building.
|
||||
popd
|
||||
goto :eof
|
||||
)
|
||||
|
||||
@echo SparkCLR Scala binaries
|
||||
copy /y target\spark*.jar "%SPARKCLR_HOME%\lib\"
|
||||
@echo Mobius Scala binaries
|
||||
@rem copy non-uber jar to runtime\lib folder
|
||||
powershell -f ..\build\copyjar.ps1
|
||||
popd
|
||||
|
||||
@REM Any .jar files under the lib directory will be copied to the staged runtime lib tree.
|
||||
|
@ -105,7 +108,7 @@ if EXIST "%CMDHOME%\lib" (
|
|||
)
|
||||
|
||||
:buildCSharp
|
||||
@echo Assemble SparkCLR C# components
|
||||
@echo Assemble Mobius C# components
|
||||
pushd "%CMDHOME%\..\csharp"
|
||||
|
||||
@rem clean any possible previous build first
|
||||
|
@ -113,20 +116,20 @@ call Clean.cmd
|
|||
call Build.cmd
|
||||
|
||||
if %ERRORLEVEL% NEQ 0 (
|
||||
@echo Build SparkCLR C# components failed, stop building.
|
||||
@echo Build Mobius C# components failed, stop building.
|
||||
popd
|
||||
goto :eof
|
||||
)
|
||||
|
||||
@echo SparkCLR C# binaries
|
||||
@echo Mobius C# binaries
|
||||
copy /y Worker\Microsoft.Spark.CSharp\bin\Release\* "%SPARKCLR_HOME%\bin\"
|
||||
|
||||
@echo SparkCLR C# Samples binaries
|
||||
@echo Mobius C# Samples binaries
|
||||
@rem need to include CSharpWorker.exe.config in samples folder
|
||||
copy /y Worker\Microsoft.Spark.CSharp\bin\Release\* "%SPARKCLR_HOME%\samples\"
|
||||
copy /y Samples\Microsoft.Spark.CSharp\bin\Release\* "%SPARKCLR_HOME%\samples\"
|
||||
|
||||
@echo SparkCLR Samples data
|
||||
@echo Mobius Samples data
|
||||
copy /y Samples\Microsoft.Spark.CSharp\data\* "%SPARKCLR_HOME%\data\"
|
||||
|
||||
@echo SparkCLR REPL
|
||||
|
@ -135,7 +138,59 @@ copy /y Repl\bin\Release\* "%SPARKCLR_HOME%\repl\"
|
|||
|
||||
popd
|
||||
|
||||
@echo Assemble SparkCLR script components
|
||||
@echo Download external dependencies
|
||||
pushd "%CMDHOME%"
|
||||
set DEPENDENCIES_DIR=dependencies
|
||||
if NOT EXIST "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%
|
||||
set DEPENDENCIES_HOME=%CMDHOME%\%DEPENDENCIES_DIR%
|
||||
powershell -f localmode\downloadtools.ps1 dependencies
|
||||
@echo Assemble dependencies
|
||||
xcopy /e /y "%DEPENDENCIES_HOME%" "%SPARKCLR_HOME%\dependencies\"
|
||||
|
||||
@echo Assemble Mobius examples
|
||||
pushd "%CMDHOME%\..\examples"
|
||||
call Clean.cmd
|
||||
call Build.cmd
|
||||
|
||||
if %ERRORLEVEL% NEQ 0 (
|
||||
@echo Build Mobius .NET examples failed, stop building.
|
||||
popd
|
||||
goto :eof
|
||||
)
|
||||
|
||||
set EXAMPLES_HOME=%CMDHOME%\examples
|
||||
@echo set EXAMPLES_HOME=%EXAMPLES_HOME%
|
||||
|
||||
if EXIST "%EXAMPLES_HOME%" (
|
||||
@echo Delete existing %EXAMPLES_HOME% ...
|
||||
rd /s /q "%EXAMPLES_HOME%"
|
||||
)
|
||||
if NOT EXIST "%EXAMPLES_HOME%" mkdir "%EXAMPLES_HOME%"
|
||||
|
||||
set CURRDIR=%cd%
|
||||
for /f "delims=" %%D in ('dir /b /s bin') do call :copyexamples %%D
|
||||
goto :copyscripts
|
||||
|
||||
:copyexamples
|
||||
set EXAMPLES_SRC=%1
|
||||
set EXAMPLES_TARGET=%1
|
||||
call set EXAMPLES_TARGET=%%EXAMPLES_TARGET:%CURRDIR%=%EXAMPLES_HOME%%%
|
||||
set EXAMPLES_TARGET=%EXAMPLES_TARGET:~0,-4%
|
||||
|
||||
@echo mkdir %EXAMPLES_TARGET%
|
||||
if NOT EXIST "%EXAMPLES_TARGET%" mkdir "%EXAMPLES_TARGET%"
|
||||
|
||||
REM 1. Copy dependencies from %SPARKCLR_HOME%\bin to use latest Mobius binaries
|
||||
xcopy /y "%SPARKCLR_HOME%\bin\*" "%EXAMPLES_TARGET%"
|
||||
REM 2. copy Examples APPs
|
||||
xcopy /d /y "%EXAMPLES_SRC%\Release" "%EXAMPLES_TARGET%"
|
||||
|
||||
goto :eof
|
||||
|
||||
:copyscripts
|
||||
popd
|
||||
|
||||
@echo Assemble Mobius script components
|
||||
xcopy /e /y "%CMDHOME%\..\scripts" "%SPARKCLR_HOME%\scripts\"
|
||||
|
||||
@echo Make distribution
|
||||
|
@ -148,10 +203,21 @@ if not defined ProjectVersion (
|
|||
)
|
||||
|
||||
set SPARKCLR_NAME=spark-clr_2.10-%ProjectVersion%
|
||||
@echo "%SPARKCLR_HOME%
|
||||
|
||||
@rem copy samples to top-level folder before zipping
|
||||
@echo move /Y "%SPARKCLR_HOME%\samples" "%CMDHOME%"
|
||||
move /Y %SPARKCLR_HOME%\samples %CMDHOME%
|
||||
@echo move /Y "%SPARKCLR_HOME%\data" "%CMDHOME%\samples"
|
||||
move /Y %SPARKCLR_HOME%\data %CMDHOME%\samples
|
||||
|
||||
@rem copy release info
|
||||
@echo copy /Y "%CMDHOME%\..\notes\mobius-release-info.md"
|
||||
copy /Y "%CMDHOME%\..\notes\mobius-release-info.md"
|
||||
|
||||
@rem Create the zip file
|
||||
@echo 7z a .\target\%SPARKCLR_NAME%.zip runtime localmode ..\examples
|
||||
7z a .\target\%SPARKCLR_NAME%.zip runtime localmode ..\examples
|
||||
@echo 7z a .\target\%SPARKCLR_NAME%.zip runtime examples samples mobius-release-info.md
|
||||
7z a .\target\%SPARKCLR_NAME%.zip runtime examples samples mobius-release-info.md
|
||||
|
||||
:distdone
|
||||
popd
|
||||
|
|
|
@ -18,14 +18,14 @@ fi
|
|||
[ ! -d "$SPARKCLR_HOME/samples" ] && mkdir "$SPARKCLR_HOME/samples"
|
||||
[ ! -d "$SPARKCLR_HOME/scripts" ] && mkdir "$SPARKCLR_HOME/scripts"
|
||||
|
||||
echo "Assemble SparkCLR Scala components"
|
||||
echo "Assemble Mobius Scala components"
|
||||
pushd "$FWDIR/../scala"
|
||||
|
||||
# clean the target directory first
|
||||
mvn clean -q
|
||||
[ $? -ne 0 ] && exit 1
|
||||
|
||||
# Note: Shade-plugin helps create an uber-package to simplify SparkCLR job submission;
|
||||
# Note: Shade-plugin helps create an uber-package to simplify running samples during CI;
|
||||
# however, it breaks debug mode in IntelliJ. So enable shade-plugin
|
||||
# only in build.cmd to create the uber-package.
|
||||
# build the package
|
||||
|
@ -33,11 +33,11 @@ mvn package -Puber-jar -q
|
|||
|
||||
if [ $? -ne 0 ]
|
||||
then
|
||||
echo "Build SparkCLR Scala components failed, stop building."
|
||||
echo "Build Mobius Scala components failed, stop building."
|
||||
popd
|
||||
exit 1
|
||||
fi
|
||||
echo "SparkCLR Scala binaries"
|
||||
echo "Mobius Scala binaries"
|
||||
cp target/spark*.jar "$SPARKCLR_HOME/lib/"
|
||||
popd
|
||||
|
||||
|
@ -52,7 +52,7 @@ then
|
|||
done
|
||||
fi
|
||||
|
||||
echo "Assemble SparkCLR C# components"
|
||||
echo "Assemble Mobius C# components"
|
||||
pushd "$FWDIR/../csharp"
|
||||
|
||||
# clean any possible previous build first
|
||||
|
@ -62,23 +62,37 @@ pushd "$FWDIR/../csharp"
|
|||
|
||||
if [ $? -ne 0 ];
|
||||
then
|
||||
echo "Build SparkCLR C# components failed, stop building."
|
||||
echo "Build Mobius C# components failed, stop building."
|
||||
popd
|
||||
exit 1
|
||||
fi
|
||||
echo "SparkCLR C# binaries"
|
||||
echo "Mobius C# binaries"
|
||||
cp Worker/Microsoft.Spark.CSharp/bin/Release/* "$SPARKCLR_HOME/bin/"
|
||||
|
||||
echo "SparkCLR C# Samples binaries"
|
||||
echo "Mobius C# Samples binaries"
|
||||
# need to include CSharpWorker.exe.config in samples folder
|
||||
cp Worker/Microsoft.Spark.CSharp/bin/Release/* "$SPARKCLR_HOME/samples/"
|
||||
cp Samples/Microsoft.Spark.CSharp/bin/Release/* "$SPARKCLR_HOME/samples/"
|
||||
|
||||
echo "SparkCLR Samples data"
|
||||
echo "Mobius Samples data"
|
||||
cp Samples/Microsoft.Spark.CSharp/data/* "$SPARKCLR_HOME/data/"
|
||||
popd
|
||||
|
||||
echo "Assemble SparkCLR script components"
|
||||
echo "Assemble Mobius examples"
|
||||
pushd "$FWDIR/../examples"
|
||||
# clean any possible previous build first
|
||||
./clean.sh
|
||||
./build.sh
|
||||
|
||||
if [ $? -ne 0 ];
|
||||
then
|
||||
echo "Build Mobius .NET Examples failed, stop building."
|
||||
popd
|
||||
exit 1
|
||||
fi
|
||||
popd
|
||||
|
||||
echo "Assemble Mobius script components"
|
||||
pushd "$FWDIR/../scripts"
|
||||
cp *.sh "$SPARKCLR_HOME/scripts/"
|
||||
popd
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
function Get-ScriptDirectory
|
||||
{
|
||||
$Invocation = (Get-Variable MyInvocation -Scope 1).Value;
|
||||
if($Invocation.PSScriptRoot)
|
||||
{
|
||||
$Invocation.PSScriptRoot;
|
||||
}
|
||||
Elseif($Invocation.MyCommand.Path)
|
||||
{
|
||||
Split-Path $Invocation.MyCommand.Path
|
||||
}
|
||||
else
|
||||
{
|
||||
$Invocation.InvocationName.Substring(0,$Invocation.InvocationName.LastIndexOf("\"));
|
||||
}
|
||||
}
|
||||
|
||||
#
|
||||
# main body of the script
|
||||
# this script copies jar file for the release
|
||||
#
|
||||
$scriptDir= Get-ScriptDirectory
|
||||
write-output "Script directory: $scriptDir"
|
||||
$destDir = "$scriptDir\runtime\lib"
|
||||
write-output "Directory to which file will be copied to: $destDir"
|
||||
pushd ..\scala\target
|
||||
|
||||
#non-uber jar has original prefix - this is the file that needs to be copied over
|
||||
$files = get-childitem $configPath -filter "original*"
|
||||
|
||||
#only one file in $files
|
||||
foreach($file in $files)
|
||||
{
|
||||
$sourceFileName = $file.Name
|
||||
write-output "Name of the file to copy: $sourceFileName"
|
||||
}
|
||||
|
||||
$pattern = "^original-(.*)"
|
||||
$destFileName = $sourceFileName -replace $pattern,'$1'
|
||||
write-output "Name of the file to use in destination: $destFileName"
|
||||
|
||||
copy-item $sourceFileName -Destination "$destDir\$destFileName"
|
||||
popd
|
|
@ -28,7 +28,7 @@ if "%1" == "" (
|
|||
@rem TODO: this check will fail if "--exe" only exists in the argument list of user application.
|
||||
if "%1" == "--exe" (
|
||||
set USER_EXE="true"
|
||||
@echo [RunSamples.cmd] Run user specified application, instead of SparkCLR samples.
|
||||
@echo [RunSamples.cmd] Run user specified application, instead of Mobius samples.
|
||||
)
|
||||
|
||||
rem - shift the arguments and examine %1 again
|
||||
|
@ -47,16 +47,14 @@ if "%precheck%" == "bad" (goto :EOF)
|
|||
@rem
|
||||
@rem setup Hadoop and Spark versions
|
||||
@rem
|
||||
set SPARK_VERSION=1.6.0
|
||||
set SPARK_VERSION=1.6.2
|
||||
set HADOOP_VERSION=2.6
|
||||
@echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%
|
||||
|
||||
@rem Windows 7/8/10 may not allow powershell scripts by default
|
||||
powershell -Command Set-ExecutionPolicy -Scope CurrentUser -ExecutionPolicy Unrestricted
|
||||
|
||||
@rem download runtime dependencies
|
||||
pushd "%CMDHOME%"
|
||||
powershell -f downloadtools.ps1 run !VERBOSE!
|
||||
@rem Windows 7/8/10 may not allow powershell scripts by default
|
||||
powershell -ExecutionPolicy Unrestricted -File downloadtools.ps1 run !VERBOSE!
|
||||
@echo [RunSamples.cmd] UpdateRuntime.cmd
|
||||
type ..\tools\updateruntime.cmd
|
||||
call ..\tools\updateruntime.cmd
|
||||
|
@ -67,7 +65,12 @@ if defined ProjectVersion (
|
|||
)
|
||||
|
||||
set SPARKCLR_HOME=%CMDHOME%\..\runtime
|
||||
set SPARKCSV_JARS=
|
||||
|
||||
@rem spark-csv package and its dependency are required for DataFrame operations in Mobius
|
||||
set SPARKCLR_EXT_PATH=%SPARKCLR_HOME%\dependencies
|
||||
set SPARKCSV_JAR1PATH=%SPARKCLR_EXT_PATH%\spark-csv_2.10-1.3.0.jar
|
||||
set SPARKCSV_JAR2PATH=%SPARKCLR_EXT_PATH%\commons-csv-1.1.jar
|
||||
set SPARKCLR_EXT_JARS=%SPARKCSV_JAR1PATH%,%SPARKCSV_JAR2PATH%
|
||||
|
||||
@rem RunSamples.cmd is in local mode, should not load Hadoop or Yarn cluster config. Disable Hadoop/Yarn conf dir.
|
||||
set HADOOP_CONF_DIR=
|
||||
|
@ -81,7 +84,7 @@ set SAMPLES_DIR=%SPARKCLR_HOME%\samples
|
|||
@echo [RunSamples.cmd] JAVA_HOME=%JAVA_HOME%
|
||||
@echo [RunSamples.cmd] SPARK_HOME=%SPARK_HOME%
|
||||
@echo [RunSamples.cmd] SPARKCLR_HOME=%SPARKCLR_HOME%
|
||||
@echo [RunSamples.cmd] SPARKCSV_JARS=%SPARKCSV_JARS%
|
||||
@echo [RunSamples.cmd] SPARKCLR_EXT_JARS=%SPARKCLR_EXT_JARS%
|
||||
|
||||
pushd "%SPARKCLR_HOME%\scripts"
|
||||
@echo [RunSamples.cmd] CWD=
|
||||
|
@ -93,8 +96,8 @@ if !INTERACTIVE! == "interactive" (
|
|||
call sparkclr-repl.cmd
|
||||
) else (
|
||||
if "!USER_EXE!"=="" (
|
||||
@echo [RunSamples.cmd] call sparkclr-submit.cmd --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
|
||||
call sparkclr-submit.cmd --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
|
||||
@echo [RunSamples.cmd] call sparkclr-submit.cmd --jars %SPARKCLR_EXT_JARS% -exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
|
||||
call sparkclr-submit.cmd --jars %SPARKCLR_EXT_JARS% --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
|
||||
) else (
|
||||
@echo [RunSamples.cmd] call sparkclr-submit.cmd %*
|
||||
call sparkclr-submit.cmd %*
|
||||
|
|
|
@ -12,7 +12,7 @@ if ($stage.ToLower() -eq "run")
|
|||
$hadoopVersion = if ($envValue -eq $null) { "2.6" } else { $envValue }
|
||||
|
||||
$envValue = [Environment]::GetEnvironmentVariable("SPARK_VERSION")
|
||||
$sparkVersion = if ($envValue -eq $null) { "1.6.0" } else { $envValue }
|
||||
$sparkVersion = if ($envValue -eq $null) { "1.6.1" } else { $envValue }
|
||||
|
||||
Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion"
|
||||
}
|
||||
|
@ -65,6 +65,7 @@ function Replace-VariableInFile($variable, $value, $sourceFile, $targetFile)
|
|||
|
||||
function Download-File($url, $output)
|
||||
{
|
||||
$output = [System.IO.Path]::GetFullPath($output)
|
||||
if (test-path $output)
|
||||
{
|
||||
Write-Output "[downloadtools.Download-File] $output exists. No need to download."
|
||||
|
@ -83,7 +84,13 @@ function Download-File($url, $output)
|
|||
-SourceIdentifier Web.DownloadProgressChanged -Action {
|
||||
$Global:Data = $event
|
||||
}
|
||||
$wc.DownloadFileAsync($url, $output)
|
||||
|
||||
$tmpOutput = $output + ".tmp.download"
|
||||
if (test-path $tmpOutput) {
|
||||
Remove-Item $tmpOutput
|
||||
}
|
||||
|
||||
$wc.DownloadFileAsync($url, $tmpOutput)
|
||||
While (!($Global:downloadComplete)) {
|
||||
$percent = $Global:Data.SourceArgs.ProgressPercentage
|
||||
$totalBytes = $Global:Data.SourceArgs.TotalBytesToReceive
|
||||
|
@ -92,6 +99,8 @@ function Download-File($url, $output)
|
|||
Write-Progress -Activity ("Downloading file to {0} from {1}" -f $output,$url) -Status ("{0} bytes \ {1} bytes" -f $receivedBytes,$totalBytes) -PercentComplete $percent
|
||||
}
|
||||
}
|
||||
|
||||
Rename-Item $tmpOutput -NewName $output
|
||||
Write-Progress -Activity ("Downloading file to {0} from {1}" -f $output, $url) -Status ("{0} bytes \ {1} bytes" -f $receivedBytes,$totalBytes) -Completed
|
||||
Unregister-Event -SourceIdentifier Web.DownloadFileCompleted
|
||||
Unregister-Event -SourceIdentifier Web.DownloadProgressChanged
|
||||
|
@ -213,11 +222,11 @@ function Download-BuildTools
|
|||
}
|
||||
|
||||
# Apache Maven
|
||||
$mvnVer = "apache-maven-3.3.3"
|
||||
$mvnVer = "apache-maven-3.3.9"
|
||||
$mvnCmd = "$toolsDir\$mvnVer\bin\mvn.cmd"
|
||||
if (!(test-path $mvnCmd))
|
||||
{
|
||||
$url = "http://www.us.apache.org/dist/maven/maven-3/3.3.3/binaries/$mvnVer-bin.tar.gz"
|
||||
$url = "http://www.us.apache.org/dist/maven/maven-3/3.3.9/binaries/$mvnVer-bin.tar.gz"
|
||||
$output="$toolsDir\$mvnVer-bin.tar.gz"
|
||||
Download-File $url $output
|
||||
Untar-File $output $toolsDir
|
||||
|
@ -257,7 +266,7 @@ function Download-BuildTools
|
|||
$gpgZip = "$toolsDir\gpg4win-vanilla-2.3.0.zip"
|
||||
if (!(test-path $gpgZip))
|
||||
{
|
||||
$url = "https://github.com/SparkCLR/build/blob/master/tools/gpg4win-vanilla-2.3.0.zip?raw=true"
|
||||
$url = "https://github.com/MobiusForSpark/build/blob/master/tools/gpg4win-vanilla-2.3.0.zip?raw=true"
|
||||
$output=$gpgZip
|
||||
Download-File $url $output
|
||||
# Unzip-File $output $toolsDir
|
||||
|
@ -280,6 +289,39 @@ function Download-BuildTools
|
|||
$envStream.close()
|
||||
}
|
||||
|
||||
function Download-ExternalDependencies
|
||||
{
|
||||
$readMeStream = [System.IO.StreamWriter] "$scriptDir\..\dependencies\ReadMe.txt"
|
||||
$readMeStream.WriteLine("The files in this folder are dependencies of Mobius Project")
|
||||
$readMeStream.WriteLine("Refer to the following download locations for details on the jars like POM file, license etc.")
|
||||
$readMeStream.WriteLine("")
|
||||
|
||||
$readMeStream.WriteLine("------------ Dependencies for CSV parsing in Mobius DataFrame API -----------------------------")
|
||||
# Downloading spark-csv package and its dependency. These packages are required for DataFrame operations in Mobius
|
||||
$url = "http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar"
|
||||
$output="$scriptDir\..\dependencies\spark-csv_2.10-1.3.0.jar"
|
||||
Download-File $url $output
|
||||
Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies"
|
||||
$readMeStream.WriteLine("$url")
|
||||
|
||||
$url = "http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.1/commons-csv-1.1.jar"
|
||||
$output="$scriptDir\..\dependencies\commons-csv-1.1.jar"
|
||||
Download-File $url $output
|
||||
Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies"
|
||||
$readMeStream.WriteLine("$url")
|
||||
$readMeStream.WriteLine("")
|
||||
$readMeStream.WriteLine("------------ Dependencies for Kafka-based processing in Mobius Streaming API -----------------------------")
|
||||
|
||||
$url = "http://search.maven.org/remotecontent?filepath=org/apache/spark/spark-streaming-kafka-assembly_2.10/1.6.1/spark-streaming-kafka-assembly_2.10-1.6.1.jar"
|
||||
$output="$scriptDir\..\dependencies\spark-streaming-kafka-assembly_2.10-1.6.1.jar"
|
||||
Download-File $url $output
|
||||
Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies"
|
||||
$readMeStream.WriteLine("$url")
|
||||
|
||||
$readMeStream.close()
|
||||
return
|
||||
}
|
||||
|
||||
function Download-RuntimeDependencies
|
||||
{
|
||||
# Create a cmd file to update environment variable
|
||||
|
@ -340,7 +382,7 @@ function Download-RuntimeDependencies
|
|||
$winutilsExe = "$winutilsBin\winutils.exe"
|
||||
if (!(test-path $winutilsExe))
|
||||
{
|
||||
$url = "http://public-repo-1.hortonworks.com/hdp-win-alpha/winutils.exe"
|
||||
$url = "https://github.com/MobiusForSpark/winutils/blob/master/hadoop-2.6.0/bin/winutils.exe?raw=true"
|
||||
$output=$winutilsExe
|
||||
Download-File $url $output
|
||||
}
|
||||
|
@ -480,8 +522,8 @@ function Print-Usage
|
|||
Write-Output ''
|
||||
Write-Output ' This script takes one input parameter ("stage"), which can be either [build | run].'
|
||||
Write-Output ''
|
||||
Write-Output ' Build: Download tools required in building SparkCLR;'
|
||||
Write-Output ' Run: Download Apache Spark and related binaries, required to run SparkCLR samples locally.'
|
||||
Write-Output ' Build: Download tools required in building Mobius;'
|
||||
Write-Output ' Run: Download Apache Spark and related binaries, required to run Mobius samples locally.'
|
||||
Write-Output ''
|
||||
Write-Output '====================================================================================================='
|
||||
}
|
||||
|
@ -513,6 +555,10 @@ elseif ($stage.ToLower() -eq "run")
|
|||
{
|
||||
Download-RuntimeDependencies
|
||||
}
|
||||
elseif ($stage.ToLower() -eq "dependencies")
|
||||
{
|
||||
Download-ExternalDependencies
|
||||
}
|
||||
else
|
||||
{
|
||||
Print-Usage
|
||||
|
|
|
@ -6,7 +6,7 @@ if not exist "%JAVA_HOME%\bin\java.exe" (
|
|||
@echo.
|
||||
@echo ============================================================================================
|
||||
@echo.
|
||||
@echo WARNING!!! %~nx0 detected JAVA_HOME is not set properly. SparkCLR requires JDK 7u85 and above,
|
||||
@echo WARNING!!! %~nx0 detected JAVA_HOME is not set properly. Mobius requires JDK 7u85 and above,
|
||||
@echo or JDK 8u60 and above. You can either download OpenJDK available at
|
||||
@echo http://www.azul.com/downloads/zulu/zulu-windows/, or use Oracle JDK.
|
||||
@echo.
|
||||
|
@ -33,7 +33,7 @@ goto :eof
|
|||
@echo ============================================================================================
|
||||
@echo.
|
||||
@echo WARNING!!! %~nx0 detected version of Visual Studio in current command prompt as %version%.
|
||||
@echo SparkCLR %~nx0 requires "Developer Command Prompt for VS2013" and above, or
|
||||
@echo Mobius %~nx0 requires "Developer Command Prompt for VS2013" and above, or
|
||||
@echo "MSBuild Command Prompt for VS2015" and above.
|
||||
@echo.
|
||||
@echo ============================================================================================
|
||||
|
|
|
@ -11,7 +11,7 @@ do
|
|||
done
|
||||
|
||||
# setup Hadoop and Spark versions
|
||||
export SPARK_VERSION=1.6.0
|
||||
export SPARK_VERSION=1.6.2
|
||||
export HADOOP_VERSION=2.6
|
||||
echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION"
|
||||
|
||||
|
@ -27,18 +27,6 @@ if [ ! -d "$SPARK_HOME" ];
|
|||
then
|
||||
wget "http://www.us.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK.tgz" -O "$TOOLS_DIR/$SPARK.tgz"
|
||||
tar xfz "$TOOLS_DIR/$SPARK.tgz" -C "$TOOLS_DIR"
|
||||
|
||||
# hack: use a customized spark
|
||||
# TODO: fix the C# Worker
|
||||
export SPARK_SRC="$TOOLS_DIR/spark-$SPARK_VERSION"
|
||||
wget "http://www.us.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION.tgz" -O "$SPARK_SRC.tgz"
|
||||
tar xfz "$SPARK_SRC.tgz" -C "$TOOLS_DIR"
|
||||
pushd "$SPARK_SRC"
|
||||
sed -i "s/val useDaemon = /val useDaemon = false \/\//g" "core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala"
|
||||
build/mvn -Pyarn -Phadoop-$HADOOP_VERSION -DskipTests package 2>&1 | grep warn
|
||||
[ $? -ne 0 ] && exit 1
|
||||
cp assembly/target/scala-2.10/spark-assembly*hadoop*.jar "$SPARK_HOME/lib/"
|
||||
popd
|
||||
fi
|
||||
export PATH="$SPARK_HOME/bin:$PATH"
|
||||
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
# Copyright (c) Microsoft. All rights reserved.
|
||||
# Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
#
|
||||
# This script takes in "dir" and "target" parameters, zips all files under dir to the target file
|
||||
#
|
||||
|
||||
Param([string]$dir, [string]$target)
|
||||
|
||||
function Get-ScriptDirectory
|
||||
|
|
|
@ -53,12 +53,7 @@
|
|||
<Reference Include="System" />
|
||||
<Reference Include="System.Configuration" />
|
||||
<Reference Include="System.Core" />
|
||||
<Reference Include="System.Runtime.Serialization" />
|
||||
<Reference Include="System.Xml.Linq" />
|
||||
<Reference Include="System.Data.DataSetExtensions" />
|
||||
<Reference Include="Microsoft.CSharp" />
|
||||
<Reference Include="System.Data" />
|
||||
<Reference Include="System.Xml" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Compile Include="Configuration\ConfigurationService.cs" />
|
||||
|
@ -66,6 +61,7 @@
|
|||
<Compile Include="Core\Accumulator.cs" />
|
||||
<Compile Include="Core\Broadcast.cs" />
|
||||
<Compile Include="Core\Option.cs" />
|
||||
<Compile Include="Core\Partitioner.cs" />
|
||||
<Compile Include="Core\RDDCollector.cs" />
|
||||
<Compile Include="Core\DoubleRDDFunctions.cs" />
|
||||
<Compile Include="Core\IRDDCollector.cs" />
|
||||
|
@ -80,12 +76,17 @@
|
|||
<Compile Include="Core\StatusTracker.cs" />
|
||||
<Compile Include="Core\StorageLevel.cs" />
|
||||
<Compile Include="Interop\Ipc\JsonSerDe.cs" />
|
||||
<Compile Include="Interop\Ipc\JvmBridgeUtils.cs" />
|
||||
<Compile Include="Interop\Ipc\WeakObjectManager.cs" />
|
||||
<Compile Include="Interop\SparkCLREnvironment.cs" />
|
||||
<Compile Include="Interop\Ipc\IJvmBridge.cs" />
|
||||
<Compile Include="Interop\Ipc\JvmBridge.cs" />
|
||||
<Compile Include="Interop\Ipc\JvmObjectReference.cs" />
|
||||
<Compile Include="Interop\Ipc\PayloadHelper.cs" />
|
||||
<Compile Include="Interop\Ipc\SerDe.cs" />
|
||||
<Compile Include="Network\DefaultSocketWrapper.cs" />
|
||||
<Compile Include="Network\ISocketWrapper.cs" />
|
||||
<Compile Include="Network\SocketFactory.cs" />
|
||||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
<Compile Include="Proxy\IDataFrameNaFunctionsProxy.cs" />
|
||||
<Compile Include="Proxy\IDataFrameProxy.cs" />
|
||||
|
@ -122,6 +123,7 @@
|
|||
<Compile Include="Sql\DataFrameNaFunctions.cs" />
|
||||
<Compile Include="Sql\DataFrameReader.cs" />
|
||||
<Compile Include="Sql\DataFrameWriter.cs" />
|
||||
<Compile Include="Sql\HiveContext.cs" />
|
||||
<Compile Include="Sql\PythonSerDe.cs" />
|
||||
<Compile Include="Sql\RowConstructor.cs" />
|
||||
<Compile Include="Sql\Row.cs" />
|
||||
|
@ -130,8 +132,11 @@
|
|||
<Compile Include="Sql\SqlContext.cs" />
|
||||
<Compile Include="Sql\Types.cs" />
|
||||
<Compile Include="Sql\UserDefinedFunction.cs" />
|
||||
<Compile Include="Streaming\ConstantInputDStream.cs" />
|
||||
<Compile Include="Streaming\DStream.cs" />
|
||||
<Compile Include="Streaming\EventHubsUtils.cs" />
|
||||
<Compile Include="Streaming\Kafka.cs" />
|
||||
<Compile Include="Streaming\MapWithStateDStream.cs" />
|
||||
<Compile Include="Streaming\PairDStreamFunctions.cs" />
|
||||
<Compile Include="Streaming\StreamingContext.cs" />
|
||||
<Compile Include="Streaming\TransformedDStream.cs" />
|
||||
|
@ -157,13 +162,7 @@
|
|||
</Target>
|
||||
-->
|
||||
<Target Name="AfterBuild">
|
||||
<XslTransformation
|
||||
XslInputPath="..\documentation\DocFormatter.xsl"
|
||||
XmlInputPaths="..\documentation\Microsoft.Spark.CSharp.Adapter.Doc.XML"
|
||||
OutputPaths="..\documentation\SparkCLR_API_Documentation.md"
|
||||
Condition="'$(OS)' == 'Windows_NT'" />
|
||||
<Exec
|
||||
Command="xsltproc -o ../documentation/SparkCLR_API_Documentation.md ../documentation/DocFormatter.xsl ../documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML"
|
||||
Condition="'$(OS)' != 'Windows_NT'" />
|
||||
<XslTransformation XslInputPath="..\documentation\DocFormatter.xsl" XmlInputPaths="..\documentation\Microsoft.Spark.CSharp.Adapter.Doc.XML" OutputPaths="..\documentation\Mobius_API_Documentation.md" Condition="'$(OS)' == 'Windows_NT'" />
|
||||
<Exec Command="xsltproc -o ../documentation/Mobius_API_Documentation.md ../documentation/DocFormatter.xsl ../documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML" Condition="'$(OS)' != 'Windows_NT'" />
|
||||
</Target>
|
||||
</Project>
|
|
@ -2,15 +2,11 @@
|
|||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Configuration;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Configuration
|
||||
|
@ -91,7 +87,7 @@ namespace Microsoft.Spark.CSharp.Configuration
|
|||
{
|
||||
protected readonly AppSettingsSection appSettings;
|
||||
protected readonly string sparkCLRHome = Environment.GetEnvironmentVariable(SPARKCLR_HOME); //set by sparkclr-submit.cmd
|
||||
protected readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRConfiguration));
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRConfiguration));
|
||||
|
||||
internal SparkCLRConfiguration(System.Configuration.Configuration configuration)
|
||||
{
|
||||
|
@ -109,14 +105,36 @@ namespace Microsoft.Spark.CSharp.Configuration
|
|||
throw new Exception("Environment variable " + CSHARPBACKEND_PORT + " not set");
|
||||
}
|
||||
|
||||
logger.LogInfo("CSharpBackend successfully read from environment variable " + CSHARPBACKEND_PORT);
|
||||
logger.LogInfo("CSharpBackend successfully read from environment variable {0}", CSHARPBACKEND_PORT);
|
||||
return portNo;
|
||||
}
|
||||
|
||||
private string workerPath;
|
||||
|
||||
/// <summary>
|
||||
/// The path of the CSharp external backend worker process.
|
||||
/// </summary>
|
||||
internal virtual string GetCSharpWorkerExePath()
|
||||
{
|
||||
// SparkCLR jar and driver, worker & dependencies are shipped using Spark file server.
|
||||
// These files are available in the Spark executing directory at executor node.
|
||||
if (workerPath != null) return workerPath; // Return cached value
|
||||
|
||||
var workerPathConfig = appSettings.Settings[CSharpWorkerPathSettingKey];
|
||||
if (workerPathConfig == null)
|
||||
{
|
||||
workerPath = GetCSharpProcFileName();
|
||||
}
|
||||
else
|
||||
{
|
||||
// Explicit path for the CSharpWorker.exe was listed in App.config
|
||||
workerPath = workerPathConfig.Value;
|
||||
logger.LogDebug("Using CSharpWorkerPath value from App.config : {0}", workerPath);
|
||||
}
|
||||
return workerPath;
|
||||
}
|
||||
|
||||
internal virtual string GetCSharpProcFileName()
|
||||
{
|
||||
return ProcFileName;
|
||||
}
|
||||
|
@ -124,50 +142,33 @@ namespace Microsoft.Spark.CSharp.Configuration
|
|||
|
||||
/// <summary>
|
||||
/// Configuration for SparkCLR jobs in ** Local ** mode
|
||||
/// Needs some investigation to find out why Local mode behaves
|
||||
/// differently from standalone cluster mode for the configuration values
|
||||
/// overridden here
|
||||
/// </summary>
|
||||
private class SparkCLRLocalConfiguration : SparkCLRConfiguration
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRLocalConfiguration));
|
||||
internal SparkCLRLocalConfiguration(System.Configuration.Configuration configuration)
|
||||
: base(configuration)
|
||||
{ }
|
||||
|
||||
private string workerPath;
|
||||
internal override string GetCSharpWorkerExePath()
|
||||
internal override string GetCSharpProcFileName()
|
||||
{
|
||||
// SparkCLR jar and driver, worker & dependencies are shipped using Spark file server.
|
||||
// These files are available in the Spark executing directory at executor node.
|
||||
|
||||
if (workerPath != null) return workerPath; // Return cached value
|
||||
|
||||
KeyValueConfigurationElement workerPathConfig = appSettings.Settings[CSharpWorkerPathSettingKey];
|
||||
if (workerPathConfig == null)
|
||||
{
|
||||
// Path for the CSharpWorker.exe was not specified in App.config
|
||||
// Try to work out where location relative to this class.
|
||||
// Construct path based on well-known file name + directory this class was loaded from.
|
||||
string procDir = Path.GetDirectoryName(GetType().Assembly.Location);
|
||||
workerPath = Path.Combine(procDir, ProcFileName);
|
||||
logger.LogDebug("Using synthesized value for CSharpWorkerPath : " + workerPath);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Explicit path for the CSharpWorker.exe was listed in App.config
|
||||
workerPath = workerPathConfig.Value;
|
||||
logger.LogDebug("Using CSharpWorkerPath value from App.config : " + workerPath);
|
||||
}
|
||||
return workerPath;
|
||||
// Path for the CSharpWorker.exe was not specified in App.config
|
||||
// Try to work out where location relative to this class.
|
||||
// Construct path based on well-known file name + directory this class was loaded from.
|
||||
string procDir = Path.GetDirectoryName(GetType().Assembly.Location);
|
||||
var procFilePath = Path.Combine(procDir, ProcFileName);
|
||||
logger.LogDebug("Using SparkCLR Adapter dll path to construct CSharpWorkerPath : {0}", procFilePath);
|
||||
return procFilePath;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Configuration mode for debug mode
|
||||
/// This configuration exists only to make SparkCLR development & debugging easier
|
||||
/// This configuration exists only to make SparkCLR development and debugging easier
|
||||
/// </summary>
|
||||
private class SparkCLRDebugConfiguration : SparkCLRLocalConfiguration
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRDebugConfiguration));
|
||||
internal SparkCLRDebugConfiguration(System.Configuration.Configuration configuration)
|
||||
: base(configuration)
|
||||
{}
|
||||
|
@ -192,9 +193,14 @@ namespace Microsoft.Spark.CSharp.Configuration
|
|||
KeyValueConfigurationElement workerPathConfig = appSettings.Settings[CSharpWorkerPathSettingKey];
|
||||
if (workerPathConfig != null)
|
||||
{
|
||||
logger.LogInfo("Worker path read from setting {0} in app config", CSharpWorkerPathSettingKey);
|
||||
return workerPathConfig.Value;
|
||||
}
|
||||
return GetSparkCLRArtifactsPath("bin", ProcFileName);
|
||||
|
||||
var path = GetSparkCLRArtifactsPath("bin", ProcFileName);
|
||||
logger.LogInfo("Worker path {0} constructed using {1} environment variable", path, SPARKCLR_HOME);
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
private string GetSparkCLRArtifactsPath(string sparkCLRSubFolderName, string fileName)
|
||||
|
@ -209,14 +215,31 @@ namespace Microsoft.Spark.CSharp.Configuration
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The running mode used by Configuration Service
|
||||
/// </summary>
|
||||
public enum RunMode
|
||||
{
|
||||
/// <summary>
|
||||
/// Unknown running mode
|
||||
/// </summary>
|
||||
UNKNOWN,
|
||||
DEBUG, //not a Spark mode but exists for dev debugging purpose
|
||||
/// <summary>
|
||||
/// Debug mode, not a Spark mode but exists for develop debugging purpose
|
||||
/// </summary>
|
||||
DEBUG,
|
||||
/// <summary>
|
||||
/// Indicates service is running in local
|
||||
/// </summary>
|
||||
LOCAL,
|
||||
/// <summary>
|
||||
/// Indicates service is running in cluster
|
||||
/// </summary>
|
||||
CLUSTER,
|
||||
YARN,
|
||||
//following are not currently supported
|
||||
MESOS
|
||||
/// <summary>
|
||||
/// Indicates service is running in Yarn
|
||||
/// </summary>
|
||||
YARN
|
||||
//MESOS //not currently supported
|
||||
}
|
||||
}
|
||||
|
|
|
@ -12,6 +12,7 @@ using System.Runtime.Serialization;
|
|||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
using Microsoft.Spark.CSharp.Network;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
[assembly: InternalsVisibleTo("CSharpWorker")]
|
||||
|
@ -35,10 +36,26 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
internal static Dictionary<int, Accumulator> accumulatorRegistry = new Dictionary<int, Accumulator>();
|
||||
|
||||
[ThreadStatic] // Thread safe is needed when running in C# worker
|
||||
internal static Dictionary<int, Accumulator> threadLocalAccumulatorRegistry = new Dictionary<int, Accumulator>();
|
||||
|
||||
/// <summary>
|
||||
/// The identity of the accumulator
|
||||
/// </summary>
|
||||
protected int accumulatorId;
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether the accumulator is on driver side.
|
||||
/// When deserialized on worker side, isDriver is false by default.
|
||||
/// </summary>
|
||||
[NonSerialized]
|
||||
protected bool deserialized = true;
|
||||
protected bool isDriver = false;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A generic version of <see cref="Accumulator"/> where the element type is specified by the driver program.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">The type of element in the accumulator.</typeparam>
|
||||
[Serializable]
|
||||
public class Accumulator<T> : Accumulator
|
||||
{
|
||||
|
@ -46,20 +63,42 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
internal T value;
|
||||
private readonly AccumulatorParam<T> accumulatorParam = new AccumulatorParam<T>();
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of the Accumulator class with a specified identity and a value.
|
||||
/// </summary>
|
||||
/// <param name="accumulatorId">The Identity of the accumulator</param>
|
||||
/// <param name="value">The value of the accumulator</param>
|
||||
public Accumulator(int accumulatorId, T value)
|
||||
{
|
||||
this.accumulatorId = accumulatorId;
|
||||
this.value = value;
|
||||
deserialized = false;
|
||||
isDriver = true;
|
||||
accumulatorRegistry[accumulatorId] = this;
|
||||
}
|
||||
|
||||
[OnDeserialized()]
|
||||
internal void OnDeserializedMethod(System.Runtime.Serialization.StreamingContext context)
|
||||
{
|
||||
if (threadLocalAccumulatorRegistry == null)
|
||||
{
|
||||
threadLocalAccumulatorRegistry = new Dictionary<int, Accumulator>();
|
||||
}
|
||||
if (!threadLocalAccumulatorRegistry.ContainsKey(accumulatorId))
|
||||
{
|
||||
threadLocalAccumulatorRegistry[accumulatorId] = this;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets or sets the value of the accumulator; only usable in driver program
|
||||
/// </summary>
|
||||
/// <exception cref="ArgumentException"></exception>
|
||||
public T Value
|
||||
{
|
||||
// Get the accumulator's value; only usable in driver program
|
||||
get
|
||||
{
|
||||
if (deserialized)
|
||||
if (!isDriver)
|
||||
{
|
||||
throw new ArgumentException("Accumulator.value cannot be accessed inside tasks");
|
||||
}
|
||||
|
@ -68,7 +107,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
// Sets the accumulator's value; only usable in driver program
|
||||
set
|
||||
{
|
||||
if (deserialized)
|
||||
if (!isDriver)
|
||||
{
|
||||
throw new ArgumentException("Accumulator.value cannot be accessed inside tasks");
|
||||
}
|
||||
|
@ -94,14 +133,14 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <returns></returns>
|
||||
public static Accumulator<T> operator +(Accumulator<T> self, T term)
|
||||
{
|
||||
if (!accumulatorRegistry.ContainsKey(self.accumulatorId))
|
||||
{
|
||||
accumulatorRegistry[self.accumulatorId] = self;
|
||||
}
|
||||
self.Add(term);
|
||||
return self;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates and returns a string representation of the current accumulator
|
||||
/// </summary>
|
||||
/// <returns>A string representation of the current accumulator</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
return string.Format("Accumulator<id={0}, value={1}>", accumulatorId, value);
|
||||
|
@ -143,33 +182,33 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// A simple TCP server that intercepts shutdown() in order to interrupt
|
||||
/// our continuous polling on the handler.
|
||||
/// </summary>
|
||||
internal class AccumulatorServer : System.Net.Sockets.TcpListener
|
||||
internal class AccumulatorServer
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(AccumulatorServer));
|
||||
private volatile bool serverShutdown;
|
||||
private ISocketWrapper innerSocket;
|
||||
|
||||
internal AccumulatorServer()
|
||||
: base(IPAddress.Loopback, 0)
|
||||
{
|
||||
|
||||
innerSocket = SocketFactory.CreateSocket();
|
||||
}
|
||||
|
||||
internal void Shutdown()
|
||||
{
|
||||
serverShutdown = true;
|
||||
base.Stop();
|
||||
innerSocket.Close();
|
||||
}
|
||||
|
||||
internal int StartUpdateServer()
|
||||
{
|
||||
base.Start();
|
||||
innerSocket.Listen();
|
||||
Task.Run(() =>
|
||||
{
|
||||
try
|
||||
{
|
||||
IFormatter formatter = new BinaryFormatter();
|
||||
using (Socket s = AcceptSocket())
|
||||
using (var ns = new NetworkStream(s))
|
||||
using (var s = innerSocket.Accept())
|
||||
using (var ns = s.GetStream())
|
||||
{
|
||||
while (!serverShutdown)
|
||||
{
|
||||
|
@ -199,7 +238,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
catch (SocketException e)
|
||||
{
|
||||
if (e.ErrorCode != 10004) // A blocking operation was interrupted by a call to WSACancelBlockingCall - TcpListener.Stop cancelled AcceptSocket as expected
|
||||
if (e.ErrorCode != 10004) // A blocking operation was interrupted by a call to WSACancelBlockingCall - ISocketWrapper.Close canceled Accept() as expected
|
||||
throw e;
|
||||
}
|
||||
catch (Exception e)
|
||||
|
@ -209,7 +248,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
});
|
||||
|
||||
return (base.LocalEndpoint as IPEndPoint).Port;
|
||||
return (innerSocket.LocalEndPoint as IPEndPoint).Port;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
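As a side note on the Accumulator changes above, a brief illustrative sketch (not part of this commit) exercising the members visible in this hunk — the (accumulatorId, value) constructor, operator+, the driver-side Value property and ToString — under the assumption that Add accumulates the term into the stored value:

``` c#
// Illustrative only; in a real Mobius driver the accumulator would typically
// be obtained from the SparkContext rather than constructed directly.
var accumulator = new Accumulator<int>(1, 0); // id = 1, initial value = 0
accumulator += 5;                             // operator+ delegates to Add(term)
Console.WriteLine(accumulator.Value);         // expected: 5 (Value is usable on the driver side only)
Console.WriteLine(accumulator);               // expected: "Accumulator<id=1, value=5>"
```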
@ -8,6 +8,7 @@ using System.Text;
|
|||
using System.Threading.Tasks;
|
||||
using System.IO;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.Collections.Concurrent;
|
||||
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
|
||||
|
@ -30,13 +31,21 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
[Serializable]
|
||||
public class Broadcast
|
||||
{
|
||||
/// <summary>
|
||||
/// A thread-safe static collection that is used to store registered broadcast objects.
|
||||
/// </summary>
|
||||
[NonSerialized]
|
||||
public static Dictionary<long, Broadcast> broadcastRegistry = new Dictionary<long, Broadcast>();
|
||||
public static ConcurrentDictionary<long, Broadcast> broadcastRegistry = new ConcurrentDictionary<long, Broadcast>();
|
||||
[NonSerialized]
|
||||
internal string path;
|
||||
|
||||
internal long broadcastId;
|
||||
internal Broadcast() { }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a new instance of Broadcast class with a specified path.
|
||||
/// </summary>
|
||||
/// <param name="path">The path that to be set.</param>
|
||||
public Broadcast(string path)
|
||||
{
|
||||
this.path = path;
|
||||
|
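The registry switch from Dictionary to ConcurrentDictionary matters because broadcast objects can be registered from multiple threads. A small standalone sketch of the thread-safe pattern this enables (plain BCL usage, not Mobius-specific):

```csharp
using System.Collections.Concurrent;
using System.Threading.Tasks;

var registry = new ConcurrentDictionary<long, string>();

// Concurrent registrations need no external locking.
Parallel.For(0, 100, i =>
{
    registry.TryAdd(i, "broadcast-" + i);
});

// GetOrAdd is atomic with respect to the dictionary: only one value is stored per key.
var value = registry.GetOrAdd(42, id => "broadcast-" + id);
```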
@ -59,6 +68,11 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A generic version of <see cref="Broadcast"/> where the element can be specified.
|
||||
/// </summary>
|
||||
/// <typeparam name="T">The type of element in Broadcast</typeparam>
|
||||
[Serializable]
|
||||
public class Broadcast<T> : Broadcast
|
||||
{
|
||||
|
|
|
@ -9,6 +9,9 @@ using System.Threading.Tasks;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// Extra functions available on RDDs of Doubles through an implicit conversion.
|
||||
/// </summary>
|
||||
public static class DoubleRDDFunctions
|
||||
{
|
||||
/// <summary>
|
||||
|
|
|
@ -16,17 +16,32 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
private bool isDefined = false;
|
||||
private T value;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes an instance of the Option class without any value.
|
||||
/// </summary>
|
||||
public Option()
|
||||
{ }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes an instance of the Option class with a specific value.
|
||||
/// </summary>
|
||||
/// <param name="value">The value to be associated with the new instance.</param>
|
||||
public Option(T value)
|
||||
{
|
||||
isDefined = true;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether the option value is defined.
|
||||
/// </summary>
|
||||
public bool IsDefined { get { return isDefined; } }
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value of the option if Option.IsDefined is TRUE;
|
||||
/// otherwise, throws an <see cref="ArgumentException"/>.
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public T GetValue()
|
||||
{
|
||||
if (isDefined) return value;
|
||||
|
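The Option type above is what outer-join results carry in place of nullable values. A brief usage sketch based only on the members shown here (the constructors, IsDefined, and GetValue):

```csharp
var some = new Option<int>(5);
var none = new Option<int>();

if (some.IsDefined)
    Console.WriteLine(some.GetValue());   // prints 5

// Per the doc comment above, GetValue() on an undefined Option throws,
// so check IsDefined before calling it.
Console.WriteLine(none.IsDefined);        // prints False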
|
|
@ -1,6 +1,7 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
|
@ -9,26 +10,15 @@ using System.Threading.Tasks;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// Extra functions available on RDDs of (key, value) pairs where the key is sortable through
|
||||
/// a function to sort the key.
|
||||
/// </summary>
|
||||
public static class OrderedRDDFunctions
|
||||
{
|
||||
|
||||
/// <summary>
|
||||
/// Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
|
||||
/// `collect` or `save` on the resulting RDD will return or output an ordered list of records
|
||||
/// (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
|
||||
/// order of the keys).
|
||||
///
|
||||
/// >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
|
||||
/// >>> sc.parallelize(tmp).sortByKey().first()
|
||||
/// ('1', 3)
|
||||
/// >>> sc.parallelize(tmp).sortByKey(True, 1).collect()
|
||||
/// [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
|
||||
/// >>> sc.parallelize(tmp).sortByKey(True, 2).collect()
|
||||
/// [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
|
||||
/// >>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]
|
||||
/// >>> tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])
|
||||
/// >>> sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()
|
||||
/// [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5),...('white', 9), ('whose', 6)]
|
||||
///
|
||||
/// Sorts this RDD, which is assumed to consist of KeyValuePair pairs.
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
/// <typeparam name="V"></typeparam>
|
||||
|
@ -36,26 +26,75 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <param name="ascending"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> SortByKey<K, V>(
|
||||
this RDD<KeyValuePair<K, V>> self,
|
||||
bool ascending = true,
|
||||
int? numPartitions = null)
|
||||
public static RDD<KeyValuePair<K, V>> SortByKey<K, V>(this RDD<KeyValuePair<K, V>> self,
|
||||
bool ascending = true, int? numPartitions = null)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
return SortByKey<K, V, K>(self, ascending, numPartitions, new DefaultSortKeyFuncHelper<K>().Execute);
|
||||
}
|
||||
/// <summary>
|
||||
/// Sorts this RDD, which is assumed to consist of KeyValuePairs. If the key is of type string, the sort is case-sensitive.
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
/// <typeparam name="V"></typeparam>
|
||||
/// <typeparam name="U"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="ascending"></param>
|
||||
/// <param name="numPartitions">Number of partitions. Each partition of the sorted RDD contains a sorted range of the elements.</param>
|
||||
/// <param name="keyFunc">RDD will sort by keyFunc(key) for every key in KeyValuePair. Must not be null.</param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> SortByKey<K, V, U>(this RDD<KeyValuePair<K, V>> self,
|
||||
bool ascending, int? numPartitions, Func<K, U> keyFunc)
|
||||
{
|
||||
if (keyFunc == null)
|
||||
{
|
||||
throw new ArgumentNullException("keyFunc", "keyFunc cannot be null.");
|
||||
}
|
||||
|
||||
if (numPartitions == null)
|
||||
{
|
||||
numPartitions = self.GetDefaultPartitionNum();
|
||||
}
|
||||
|
||||
if (numPartitions == 1)
|
||||
{
|
||||
if (self.GetNumPartitions() > 1)
|
||||
{
|
||||
self = self.Coalesce(1);
|
||||
}
|
||||
return self.MapPartitionsWithIndex(new SortByKeyHelper<K, V, U>(keyFunc, ascending).Execute, true);
|
||||
}
|
||||
|
||||
var rddSize = self.Count();
|
||||
if (rddSize == 0) return self; // empty RDD
|
||||
|
||||
var maxSampleSize = numPartitions.Value * 20; // constant from Spark's RangePartitioner
|
||||
double fraction = Math.Min((double)maxSampleSize / Math.Max(rddSize, 1), 1.0);
|
||||
|
||||
/* first compute the boundary of each part via sampling: we want to partition
|
||||
* the key-space into bins such that the bins have roughly the same
|
||||
* number of (key, value) pairs falling into them */
|
||||
U[] samples = self.Sample(false, fraction, 1).Map(kv => kv.Key).Collect().Select(k => keyFunc(k)).ToArray();
|
||||
Array.Sort(samples, StringComparer.Ordinal); // case sensitive if key type is string
|
||||
|
||||
List<U> bounds = new List<U>();
|
||||
for (int i = 0; i < numPartitions - 1; i++)
|
||||
{
|
||||
bounds.Add(samples[(int)(samples.Length * (i + 1) / numPartitions)]);
|
||||
}
|
||||
|
||||
return self.PartitionBy(numPartitions.Value,
|
||||
new PairRDDFunctions.PartitionFuncDynamicTypeHelper<K>(
|
||||
new RangePartitionerHelper<K, U>(numPartitions.Value, keyFunc, bounds, ascending).Execute)
|
||||
.Execute)
|
||||
.MapPartitionsWithIndex(new SortByKeyHelper<K, V, U>(keyFunc, ascending).Execute, true);
|
||||
}
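Putting the overloads above together, a driver can sort by a derived key (for example, case-insensitively) by supplying a key function. A short sketch against the three-type-parameter overload shown here:

```csharp
var pairs = sparkContext.Parallelize(new[]
{
    new KeyValuePair<string, int>("Mary", 1),
    new KeyValuePair<string, int>("had", 2),
    new KeyValuePair<string, int>("a", 3),
    new KeyValuePair<string, int>("lamb", 5)
}, 3);

// Sort by the lower-cased key across 2 partitions. Ordering of the sampled
// bounds is ordinal (StringComparer.Ordinal), as in the implementation above.
var sorted = pairs.SortByKey(true, 2, k => k.ToLower()).Collect();
```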
|
||||
|
||||
/// <summary>
|
||||
/// Repartition the RDD according to the given partitioner and, within each resulting partition,
|
||||
/// sort records by their keys.
|
||||
///
|
||||
/// This is more efficient than calling `repartition` and then sorting within each partition
|
||||
/// because it can push the sorting down into the shuffle machinery.
|
||||
///
|
||||
/// >>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])
|
||||
/// >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, 2)
|
||||
/// >>> rdd2.glom().collect()
|
||||
/// [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]
|
||||
///
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
/// <typeparam name="V"></typeparam>
|
||||
|
@ -72,5 +111,69 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
return self.MapPartitionsWithIndex<KeyValuePair<K, V>>((pid, iter) => ascending ? iter.OrderBy(kv => kv.Key) : iter.OrderByDescending(kv => kv.Key));
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class SortByKeyHelper<K, V, U>
|
||||
{
|
||||
private readonly Func<K, U> func;
|
||||
private readonly bool ascending;
|
||||
public SortByKeyHelper(Func<K, U> f, bool ascending = true)
|
||||
{
|
||||
func = f;
|
||||
this.ascending = ascending;
|
||||
}
|
||||
|
||||
public IEnumerable<KeyValuePair<K, V>> Execute(int pid, IEnumerable<KeyValuePair<K, V>> kvs)
|
||||
{
|
||||
IEnumerable<KeyValuePair<K, V>> ordered;
|
||||
if (ascending)
|
||||
{
|
||||
if (typeof(K) == typeof(string))
|
||||
ordered = kvs.OrderBy(k => func(k.Key).ToString(), StringComparer.Ordinal);
|
||||
else
|
||||
ordered = kvs.OrderBy(k => func(k.Key));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (typeof(K) == typeof(string))
|
||||
ordered = kvs.OrderByDescending(k => func(k.Key).ToString(), StringComparer.Ordinal);
|
||||
else
|
||||
ordered = kvs.OrderByDescending(k => func(k.Key));
|
||||
}
|
||||
return ordered;
|
||||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class DefaultSortKeyFuncHelper<K>
|
||||
{
|
||||
public K Execute(K key) { return key; }
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class RangePartitionerHelper<K, U>
|
||||
{
|
||||
private readonly int numPartitions;
|
||||
private readonly Func<K, U> keyFunc;
|
||||
private readonly List<U> bounds;
|
||||
private readonly bool ascending;
|
||||
public RangePartitionerHelper(int numPartitions, Func<K, U> keyFunc, List<U> bounds, bool ascending)
|
||||
{
|
||||
this.numPartitions = numPartitions;
|
||||
this.bounds = bounds;
|
||||
this.keyFunc = keyFunc;
|
||||
this.ascending = ascending;
|
||||
}
|
||||
|
||||
public int Execute(K key)
|
||||
{
|
||||
// Binary search the insert position in the bounds. If key found, return the insert position; if not, a negative
|
||||
// number that is the bitwise complement of the insert position is returned, so take its bitwise complement to recover the position.
|
||||
var pos = bounds.BinarySearch(keyFunc(key));
|
||||
if (pos < 0) pos = ~pos;
|
||||
|
||||
return ascending ? pos : numPartitions - 1 - pos;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
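The partition lookup in RangePartitionerHelper relies on List&lt;T&gt;.BinarySearch returning the bitwise complement of the insertion point when a key is not found. A tiny standalone illustration of that behavior:

```csharp
using System;
using System.Collections.Generic;

var bounds = new List<string> { "g", "p" };   // 3 partitions => 2 boundary keys

int pos = bounds.BinarySearch("k");           // not found: returns the complement of 1
if (pos < 0) pos = ~pos;                      // insertion point = 1

// "k" sorts between "g" and "p", so it belongs to partition 1 when ascending.
Console.WriteLine(pos);                       // prints 1
```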
|
||||
|
|
|
@ -8,6 +8,7 @@ using System.Runtime.Serialization;
|
|||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.IO;
|
||||
using System.Security.Cryptography;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
|
@ -21,7 +22,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return the key-value pairs in this RDD to the master as a dictionary.
|
||||
///
|
||||
/// var m = sc.Parallelize(new[] { new <see cref="KeyValuePair{int, int}"/>(1, 2), new <see cref="KeyValuePair{int, int}"/>(3, 4) }, 1).CollectAsMap()
|
||||
/// var m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).CollectAsMap()
|
||||
/// m[1]
|
||||
/// 2
|
||||
/// m[3]
|
||||
|
@ -40,7 +41,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return an RDD with the keys of each tuple.
|
||||
///
|
||||
/// >>> m = sc.Parallelize(new[] { new <see cref="KeyValuePair{int, int}"/>(1, 2), new <see cref="KeyValuePair{int, int}"/>(3, 4) }, 1).Keys().Collect()
|
||||
/// >>> m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Keys().Collect()
|
||||
/// [1, 3]
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -55,7 +56,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return an RDD with the values of each tuple.
|
||||
///
|
||||
/// >>> m = sc.Parallelize(new[] { new <see cref="KeyValuePair{int, int}"/>(1, 2), new <see cref="KeyValuePair{int, int}"/>(3, 4) }, 1).Values().Collect()
|
||||
/// >>> m = sc.Parallelize(new[] { new KeyValuePair<int, int>(1, 2), new KeyValuePair<int, int>(3, 4) }, 1).Values().Collect()
|
||||
/// [2, 4]
|
||||
///
|
||||
/// </summary>
|
||||
|
@ -79,9 +80,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// sc.Parallelize(new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .ReduceByKey((x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -108,9 +109,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// sc.Parallelize(new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .ReduceByKeyLocally((x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -132,9 +133,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// sc.Parallelize(new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CountByKey((x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -158,9 +159,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Performs a hash join across the cluster.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2), new <see cref="KeyValuePair{string, int}"/>("a", 3) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2), new KeyValuePair<string, int>("a", 3) }, 1);
|
||||
/// var m = l.Join(r, 2).Collect();
|
||||
///
|
||||
/// [('a', (1, 2)), ('a', (1, 3))]
|
||||
|
@ -193,9 +194,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Hash-partitions the resulting RDD into the given number of partitions.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// var m = l.LeftOuterJoin(r).Collect();
|
||||
///
|
||||
/// [('a', (1, 2)), ('b', (4, Option))]
|
||||
|
@ -227,9 +228,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Hash-partitions the resulting RDD into the given number of partitions.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// var m = l.RightOuterJoin(r).Collect();
|
||||
///
|
||||
/// [('a', (2, 1)), ('b', (Option, 4))]
|
||||
|
@ -266,9 +267,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Hash-partitions the resulting RDD into the given number of partitions.
|
||||
///
|
||||
/// var l = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 1), KeyValuePair<string, int>("b", 4) }, 1);
|
||||
/// var r = sc.Parallelize(
|
||||
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2), new <see cref="KeyValuePair{string, int}"/>("c", 8) }, 1);
|
||||
/// new[] { new KeyValuePair<string, int>("a", 2), new KeyValuePair<string, int>("c", 8) }, 1);
|
||||
/// var m = l.FullOuterJoin(r).Collect();
|
||||
///
|
||||
/// [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
|
||||
|
@ -294,30 +295,31 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return a copy of the RDD partitioned using the specified partitioner.
|
||||
///
|
||||
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new <see cref="KeyValuePair{int, int}"/>(x, x)).PartitionBy(3).Glom().Collect()
|
||||
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new KeyValuePair<int, int>(x, x)).PartitionBy(3).Glom().Collect()
|
||||
/// </summary>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <param name="partitionFunc"></param>
|
||||
/// <returns></returns>
|
||||
public static RDD<KeyValuePair<K, V>> PartitionBy<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0)
|
||||
public static RDD<KeyValuePair<K, V>> PartitionBy<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0,
|
||||
Func<dynamic, int> partitionFunc = null)
|
||||
{
|
||||
if (numPartitions == 0)
|
||||
{
|
||||
numPartitions = self.sparkContext.SparkConf.SparkConfProxy.GetInt("spark.default.parallelism", 0);
|
||||
if (numPartitions == 0 && self.previousRddProxy != null)
|
||||
numPartitions = self.previousRddProxy.PartitionLength();
|
||||
numPartitions = self.GetDefaultPartitionNum();
|
||||
}
|
||||
|
||||
int? partitioner = numPartitions;
|
||||
if (self.partitioner == partitioner)
|
||||
|
||||
var partitioner = new Partitioner(numPartitions, partitionFunc);
|
||||
if (self.partitioner != null && self.partitioner.Equals(partitioner))
|
||||
return self;
|
||||
|
||||
var keyed = self.MapPartitionsWithIndex(new AddShuffleKeyHelper<K, V>().Execute, true);
|
||||
var keyed = self.MapPartitionsWithIndex(new AddShuffleKeyHelper<K, V>(numPartitions, partitionFunc).Execute, true);
|
||||
keyed.bypassSerializer = true;
|
||||
// convert shuffling version of RDD[(Long, Array[Byte])] back to normal RDD[Array[Byte]]
|
||||
// invoking property keyed.RddProxy marks the end of current pipeline RDD after shuffling
|
||||
// and potentially starts the next pipeline RDD with the default SerializedMode.Byte
|
||||
var rdd = new RDD<KeyValuePair<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions), self.sparkContext);
|
||||
var rdd = new RDD<KeyValuePair<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions,
|
||||
GenerateObjectId(partitionFunc)), self.sparkContext);
|
||||
rdd.partitioner = partitioner;
|
||||
|
||||
return rdd;
|
||||
|
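With the new optional partitionFunc parameter, callers can route keys explicitly instead of relying on the MD5-based default. A usage sketch against the signature above:

```csharp
var kvRdd = sparkContext.Parallelize(new[]
{
    new KeyValuePair<int, int>(0, 5),
    new KeyValuePair<int, int>(3, 8),
    new KeyValuePair<int, int>(2, 6),
    new KeyValuePair<int, int>(1, 3)
}, 1);

// Route even keys to partition 0 and odd keys to partition 1. Per the Partitioner
// warning below, reuse the same Func instance if a later PartitionBy call should
// recognize the RDD as already partitioned.
Func<dynamic, int> byParity = key => ((int)key) % 2;
var partitioned = kvRdd.PartitionBy(2, byParity);
```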
@ -344,9 +346,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -387,9 +389,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -423,9 +425,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
|
||||
///
|
||||
|
@ -458,9 +460,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
|
||||
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
|
||||
/// new KeyValuePair<string, int>("a", 1),
|
||||
/// new KeyValuePair<string, int>("b", 1),
|
||||
/// new KeyValuePair<string, int>("a", 1)
|
||||
/// }, 2)
|
||||
/// .GroupByKey().MapValues(l => string.Join(" ", l)).Collect()
|
||||
///
|
||||
|
@ -488,8 +490,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, string[]}"/>("a", new[]{"apple", "banana", "lemon"}),
|
||||
/// new <see cref="KeyValuePair{string, string[]}"/>("b", new[]{"grapes"})
|
||||
/// new KeyValuePair<string, string[]>("a", new[]{"apple", "banana", "lemon"}),
|
||||
/// new KeyValuePair<string, string[]>("b", new[]{"grapes"})
|
||||
/// }, 2)
|
||||
/// .MapValues(x => x.Length).Collect()
|
||||
///
|
||||
|
@ -514,8 +516,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// x = sc.Parallelize(
|
||||
/// new[]
|
||||
/// {
|
||||
/// new <see cref="KeyValuePair{string, string[]}"/>("a", new[]{"x", "y", "z"}),
|
||||
/// new <see cref="KeyValuePair{string, string[]}"/>("b", new[]{"p", "r"})
|
||||
/// new KeyValuePair<string, string[]>("a", new[]{"x", "y", "z"}),
|
||||
/// new KeyValuePair<string, string[]>("b", new[]{"p", "r"})
|
||||
/// }, 2)
|
||||
/// .FlatMapValues(x => x).Collect()
|
||||
///
|
||||
|
@ -534,7 +536,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// explicitly convert KeyValuePair<K, V> to KeyValuePair<K, dynamic>
|
||||
/// explicitly convert KeyValuePair<K, V> to KeyValuePair<K, dynamic>
|
||||
/// since they are incompatible types, unlike V to dynamic
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -566,8 +568,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// For each key k in this RDD or <paramref name="other"/>, return a resulting RDD that
|
||||
/// contains a tuple with the list of values for that key in this RDD as well as <paramref name="other"/>.
|
||||
///
|
||||
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// x.GroupWith(y).Collect();
|
||||
///
|
||||
/// [('a', ([1], [2])), ('b', ([4], []))]
|
||||
|
@ -608,9 +610,9 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 5), new <see cref="KeyValuePair{string, int}"/>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// x.GroupWith(y, z).Collect();
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -653,10 +655,10 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
|
||||
/// <summary>
|
||||
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 5), new <see cref="KeyValuePair{string, int}"/>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
|
||||
/// var w = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("b", 42) }, 1);
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 5), new KeyValuePair<string, int>("b", 6) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 1), new KeyValuePair<string, int>("b", 4) }, 2);
|
||||
/// var z = sc.Parallelize(new[] { new KeyValuePair<string, int>("a", 2) }, 1);
|
||||
/// var w = sc.Parallelize(new[] { new KeyValuePair<string, int>("b", 42) }, 1);
|
||||
/// var m = x.GroupWith(y, z, w).MapValues(l => string.Join(" ", l.Item1) + " : " + string.Join(" ", l.Item2) + " : " + string.Join(" ", l.Item3) + " : " + string.Join(" ", l.Item4)).Collect();
|
||||
/// </summary>
|
||||
/// <typeparam name="K"></typeparam>
|
||||
|
@ -711,7 +713,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
// ///
|
||||
// /// var fractions = new <see cref="Dictionary{string, double}"/> { { "a", 0.2 }, { "b", 0.1 } };
|
||||
// /// var rdd = sc.Parallelize(fractions.Keys.ToArray(), 2).Cartesian(sc.Parallelize(Enumerable.Range(0, 1000), 2));
|
||||
// /// var sample = rdd.Map(t => new <see cref="KeyValuePair{string, int}"/>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
|
||||
// /// var sample = rdd.Map(t => new KeyValuePair<string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
|
||||
// ///
|
||||
// /// 100 < sample["a"].Length < 300 and 50 < sample["b"].Length < 150
|
||||
// /// true
|
||||
|
@ -743,8 +745,8 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return each (key, value) pair in this RDD that has no pair with matching key in <paramref name="other"/>.
|
||||
///
|
||||
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int?}"/>("a", 1), new <see cref="KeyValuePair{string, int?}"/>("b", 4), new <see cref="KeyValuePair{string, int?}"/>("b", 5), new <see cref="KeyValuePair{string, int?}"/>("a", 2) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int?}"/>("a", 3), new <see cref="KeyValuePair{string, int?}"/>("c", null) }, 2);
|
||||
/// var x = sc.Parallelize(new[] { new KeyValuePair<string, int?>("a", 1), new KeyValuePair<string, int?>("b", 4), new KeyValuePair<string, int?>("b", 5), new KeyValuePair<string, int?>("a", 2) }, 2);
|
||||
/// var y = sc.Parallelize(new[] { new KeyValuePair<string, int?>("a", 3), new KeyValuePair<string, int?>("c", null) }, 2);
|
||||
/// x.SubtractByKey(y).Collect();
|
||||
///
|
||||
/// [('b', 4), ('b', 5)]
|
||||
|
@ -768,7 +770,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// searching the partition that the key maps to.
|
||||
///
|
||||
/// >>> l = range(1000)
|
||||
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new <see cref="KeyValuePair{int, int}"/>(x, y)), 10)
|
||||
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair<int, int>(x, y)), 10)
|
||||
/// >>> rdd.lookup(42)
|
||||
/// [42]
|
||||
///
|
||||
|
@ -917,20 +919,42 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
|
||||
[Serializable]
|
||||
private class AddShuffleKeyHelper<K1, V1>
|
||||
internal class AddShuffleKeyHelper<K, V>
|
||||
{
|
||||
[NonSerialized]
|
||||
private static MD5 md5 = MD5.Create();
|
||||
public IEnumerable<byte[]> Execute(int split, IEnumerable<KeyValuePair<K1, V1>> input)
|
||||
private MD5 md5 = MD5.Create();
|
||||
private readonly int numPartitions;
|
||||
private readonly Func<dynamic, int> partitionFunc = null;
|
||||
|
||||
public AddShuffleKeyHelper(int numPartitions, Func<dynamic, int> partitionFunc = null)
|
||||
{
|
||||
this.numPartitions = numPartitions;
|
||||
this.partitionFunc = partitionFunc;
|
||||
}
|
||||
|
||||
public IEnumerable<byte[]> Execute(int split, IEnumerable<KeyValuePair<K, V>> input)
|
||||
{
|
||||
// make sure that md5 is not null even after it is deserialized in the C# worker
|
||||
if (md5 == null)
|
||||
{
|
||||
md5 = MD5.Create();
|
||||
}
|
||||
IFormatter formatter = new BinaryFormatter();
|
||||
foreach (var kvp in input)
|
||||
foreach (var kv in input)
|
||||
{
|
||||
var ms = new MemoryStream();
|
||||
formatter.Serialize(ms, kvp.Key);
|
||||
yield return md5.ComputeHash(ms.ToArray()).Take(8).ToArray();
|
||||
if (partitionFunc == null)
|
||||
{
|
||||
formatter.Serialize(ms, kv.Key);
|
||||
yield return md5.ComputeHash(ms.ToArray()).Take(8).ToArray();
|
||||
}
|
||||
else
|
||||
{
|
||||
long pid = (long)(partitionFunc(kv.Key) % numPartitions);
|
||||
yield return SerDe.ToBytes(pid);
|
||||
}
|
||||
ms = new MemoryStream();
|
||||
formatter.Serialize(ms, kvp);
|
||||
formatter.Serialize(ms, kv);
|
||||
yield return ms.ToArray();
|
||||
}
|
||||
}
|
||||
|
@ -983,9 +1007,43 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class PartitionFuncDynamicTypeHelper<K>
|
||||
{
|
||||
private readonly Func<K, int> func;
|
||||
internal PartitionFuncDynamicTypeHelper(Func<K, int> f)
|
||||
{
|
||||
this.func = f;
|
||||
}
|
||||
internal int Execute(dynamic input)
|
||||
{
|
||||
return func((K)input);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a collection to a list where the element type is Option(T) type.
|
||||
/// If the collection is empty, returns a list containing a single empty Option(T).
|
||||
/// </summary>
|
||||
/// <param name="list">The collection that be inputted to convert</param>
|
||||
/// <typeparam name="T">The element type in the collection</typeparam>
|
||||
/// <returns>A list that use Option(T) as element type</returns>
|
||||
public static List<Option<T>> NullIfEmpty<T>(this IEnumerable<T> list)
|
||||
{
|
||||
return list.Any() ? list.Select(v => new Option<T>(v)).ToList() : new List<Option<T>>() { new Option<T>() };
|
||||
}
|
||||
|
||||
private static long GenerateObjectId(object obj)
|
||||
{
|
||||
if (obj == null)
|
||||
return 0;
|
||||
|
||||
MD5 md5 = MD5.Create();
|
||||
IFormatter formatter = new BinaryFormatter();
|
||||
var ms = new MemoryStream();
|
||||
formatter.Serialize(ms, obj);
|
||||
var hash = md5.ComputeHash(ms.ToArray());
|
||||
return BitConverter.ToInt64(hash.Take(8).ToArray(), 0);
|
||||
}
|
||||
}
|
||||
}
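NullIfEmpty is what turns a missing side of an outer join into a single undefined Option rather than an empty sequence. A small sketch of its effect, based on the implementation above:

```csharp
var values = new List<int> { 7, 9 };
var empty = new List<int>();

var some = values.NullIfEmpty();   // [Option(7), Option(9)], both IsDefined == true
var none = empty.NullIfEmpty();    // a single-element list holding an undefined Option

Console.WriteLine(some.Count);         // 2
Console.WriteLine(none[0].IsDefined);  // False
```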
|
||||
|
|
|
@ -0,0 +1,62 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// An object that defines how the elements in a key-value pair RDD are partitioned by key.
|
||||
/// Maps each key to a partition ID, from 0 to "numPartitions - 1".
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class Partitioner
|
||||
{
|
||||
private readonly int numPartitions;
|
||||
private readonly Func<dynamic, int> partitionFunc;
|
||||
|
||||
/// <summary>
|
||||
/// Create a <seealso cref="Partitioner"/> instance.
|
||||
/// </summary>
|
||||
/// <param name="numPartitions">Number of partitions.</param>
|
||||
/// <param name="partitionFunc">Defines how the elements in a key-value pair RDD are partitioned by key. Input of Func is key, output is partition index.
|
||||
/// Warning: different Func instances are considered different partitioners, which will cause repartitioning.</param>
|
||||
public Partitioner(int numPartitions, Func<dynamic, int> partitionFunc)
|
||||
{
|
||||
this.numPartitions = numPartitions;
|
||||
this.partitionFunc = partitionFunc;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Determines whether the specified object is equal to the current object.
|
||||
/// </summary>
|
||||
/// <returns>
|
||||
/// true if the specified object is equal to the current object; otherwise, false.
|
||||
/// </returns>
|
||||
/// <param name="obj">The object to compare with the current object. </param>
|
||||
public override bool Equals(object obj)
|
||||
{
|
||||
if (ReferenceEquals(null, obj)) return false;
|
||||
if (ReferenceEquals(this, obj)) return true;
|
||||
|
||||
var otherPartitioner = obj as Partitioner;
|
||||
if (otherPartitioner != null)
|
||||
{
|
||||
return otherPartitioner.numPartitions == numPartitions && otherPartitioner.partitionFunc == partitionFunc;
|
||||
}
|
||||
|
||||
return base.Equals(obj);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serves as the default hash function.
|
||||
/// </summary>
|
||||
/// <returns>
|
||||
/// A hash code for the current object.
|
||||
/// </returns>
|
||||
public override int GetHashCode()
|
||||
{
|
||||
return base.GetHashCode();
|
||||
}
|
||||
}
|
||||
}
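Because Equals compares partitionFunc by delegate reference, two Partitioners built from textually identical lambdas are not equal. A short sketch of the distinction, to go with the warning in the constructor doc:

```csharp
Func<dynamic, int> f = key => ((int)key) % 4;

var a = new Partitioner(4, f);
var b = new Partitioner(4, f);                       // same Func instance
var c = new Partitioner(4, key => ((int)key) % 4);   // different Func instance

Console.WriteLine(a.Equals(b));   // True  - same numPartitions and same delegate
Console.WriteLine(a.Equals(c));   // False - different delegate, would trigger a reshuffle
```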
|
|
@ -19,7 +19,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
|
||||
/// <summary>
|
||||
/// Wraps C#-based transformations that can be executed within a stage. It helps avoid unnecessary Ser/De of data between
|
||||
/// JVM & CLR to execute C# transformations and pipelines them
|
||||
/// JVM and CLR to execute C# transformations and pipelines them
|
||||
/// </summary>
|
||||
/// <typeparam name="U"></typeparam>
|
||||
[Serializable]
|
||||
|
@ -29,6 +29,14 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
internal bool preservesPartitioning;
|
||||
|
||||
//TODO - give generic types a better id
|
||||
/// <summary>
|
||||
/// Return a new RDD by applying a function to each partition of this RDD,
|
||||
/// while tracking the index of the original partition.
|
||||
/// </summary>
|
||||
/// <typeparam name="U1">The element type</typeparam>
|
||||
/// <param name="newFunc">The function to be applied to each partition</param>
|
||||
/// <param name="preservesPartitioningParam">Indicates if it preserves partition parameters</param>
|
||||
/// <returns>A new RDD</returns>
|
||||
public override RDD<U1> MapPartitionsWithIndex<U1>(Func<int, IEnumerable<U>, IEnumerable<U1>> newFunc, bool preservesPartitioningParam = false)
|
||||
{
|
||||
if (IsPipelinable())
|
||||
|
|
|
@ -10,6 +10,9 @@ using System.Threading.Tasks;
|
|||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
//TODO - complete the impl
|
||||
/// <summary>
|
||||
/// A class that represents a profiler
|
||||
/// </summary>
|
||||
public class Profiler
|
||||
{
|
||||
}
|
||||
|
|
|
@ -5,6 +5,7 @@ using System;
|
|||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
|
@ -18,16 +19,29 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
[Serializable]
|
||||
public class RDD<T>
|
||||
{
|
||||
[NonSerialized]
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(RDD<T>));
|
||||
|
||||
internal IRDDProxy rddProxy;
|
||||
internal IRDDProxy previousRddProxy;
|
||||
// There should be only one SparkContext instance per application; mark it as NonSerialized to avoid creating more than one SparkContext instance.
|
||||
// Need to set this field with a valid SparkContext instance after deserialization.
|
||||
[NonSerialized]
|
||||
internal SparkContext sparkContext;
|
||||
internal SerializedMode serializedMode; //used for deserializing data before processing in C# worker
|
||||
internal SerializedMode prevSerializedMode;
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether the RDD is cached.
|
||||
/// </summary>
|
||||
protected bool isCached;
|
||||
|
||||
/// <summary>
|
||||
/// Indicates whether the RDD is checkpointed.
|
||||
/// </summary>
|
||||
protected bool isCheckpointed;
|
||||
internal bool bypassSerializer;
|
||||
internal int? partitioner;
|
||||
internal Partitioner partitioner;
|
||||
|
||||
internal virtual IRDDProxy RddProxy
|
||||
{
|
||||
|
@ -108,6 +122,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public RDD<T> Cache()
|
||||
{
|
||||
isCached = true;
|
||||
logger.LogInfo("Persisting RDD to default storage cache");
|
||||
RddProxy.Cache();
|
||||
return this;
|
||||
}
|
||||
|
@ -127,6 +142,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public RDD<T> Persist(StorageLevelType storageLevelType)
|
||||
{
|
||||
isCached = true;
|
||||
logger.LogInfo("Persisting RDD to storage level type {0}", storageLevelType);
|
||||
RddProxy.Persist(storageLevelType);
|
||||
return this;
|
||||
}
|
||||
|
@ -140,6 +156,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
if (isCached)
|
||||
{
|
||||
isCached = false;
|
||||
logger.LogInfo("Unpersisting RDD from the cache");
|
||||
RddProxy.Unpersist();
|
||||
}
|
||||
return this;
|
||||
|
@ -156,10 +173,15 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public void Checkpoint()
|
||||
{
|
||||
isCheckpointed = true;
|
||||
logger.LogInfo("Checkpointing RDD to SparkContext.SetCheckpointDir");
|
||||
RddProxy.Checkpoint();
|
||||
}
|
||||
|
||||
internal int GetNumPartitions()
|
||||
/// <summary>
|
||||
/// Returns the number of partitions of this RDD.
|
||||
/// </summary>
|
||||
/// <returns>The number of partitions of this RDD</returns>
|
||||
public int GetNumPartitions()
|
||||
{
|
||||
return RddProxy.GetNumPartitions();
|
||||
}
|
||||
|
@ -167,7 +189,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <summary>
|
||||
/// Return a new RDD by applying a function to each element of this RDD.
|
||||
///
|
||||
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new <see cref="KeyValuePair{string, int}"/>(x, 1)).Collect()
|
||||
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new KeyValuePair<string, int>(x, 1)).Collect()
|
||||
/// [('a', 1), ('b', 1), ('c', 1)]
|
||||
///
|
||||
/// </summary>
|
||||
|
@ -177,6 +199,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <returns></returns>
|
||||
public RDD<U> Map<U>(Func<T, U> f, bool preservesPartitioning = false)
|
||||
{
|
||||
logger.LogInfo("Executing Map operation on RDD (preservesPartitioning={0})", preservesPartitioning);
|
||||
return MapPartitionsWithIndex(new MapHelper<T, U>(f).Execute, preservesPartitioning);
|
||||
}
|
||||
|
||||
|
@ -217,7 +240,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Return a new RDD by applying a function to each partition of this RDD,
|
||||
/// while tracking the index of the original partition.
|
||||
///
|
||||
/// <see cref="sc.Parallelize(new int[]{1, 2, 3, 4}, 4).MapPartitionsWithIndex{double}"/>((pid, iter) => (double)pid).Sum()
|
||||
/// sc.Parallelize(new int[]{1, 2, 3, 4}, 4).MapPartitionsWithIndex<double>((pid, iter) => (double)pid).Sum()
|
||||
/// 6
|
||||
/// </summary>
|
||||
/// <typeparam name="U"></typeparam>
|
||||
|
@ -417,7 +440,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public RDD<T> Union(RDD<T> other)
|
||||
{
|
||||
var rdd = new RDD<T>(RddProxy.Union(other.RddProxy), sparkContext);
|
||||
if (partitioner == other.partitioner && RddProxy.PartitionLength() == rdd.RddProxy.PartitionLength())
|
||||
if (partitioner == other.partitioner && RddProxy.GetNumPartitions() == rdd.RddProxy.GetNumPartitions())
|
||||
rdd.partitioner = partitioner;
|
||||
return rdd;
|
||||
}
|
||||
|
@ -579,6 +602,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <returns></returns>
|
||||
public T Reduce(Func<T, T, T> f)
|
||||
{
|
||||
logger.LogInfo("Executing Reduce operation on RDD");
|
||||
Func<int, IEnumerable<T>, IEnumerable<T>> func = new ReduceHelper<T>(f).Execute;
|
||||
var vals = MapPartitionsWithIndex(func, true).Collect();
|
||||
|
||||
|
@ -1047,6 +1071,14 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
{
|
||||
return new RDD<T>(RddProxy.RandomSampleWithRange(lb, ub, seed), sparkContext);
|
||||
}
|
||||
|
||||
internal int GetDefaultPartitionNum()
|
||||
{
|
||||
var numPartitions = sparkContext.SparkConf.SparkConfProxy.GetInt("spark.default.parallelism", 0);
|
||||
if (numPartitions == 0 && previousRddProxy != null)
|
||||
numPartitions = previousRddProxy.GetNumPartitions();
|
||||
return numPartitions;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -1115,10 +1147,12 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// <typeparam name="T"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="num"></param>
|
||||
/// <param name="keyFunc"></param>
|
||||
/// <returns></returns>
|
||||
public static T[] TakeOrdered<T>(this RDD<T> self, int num) where T : IComparable<T>
|
||||
public static T[] TakeOrdered<T>(this RDD<T> self, int num, Func<T, dynamic> keyFunc = null) where T : IComparable<T>
|
||||
{
|
||||
return self.MapPartitionsWithIndex<T>(new TakeOrderedHelper<T>(num).Execute).Collect().OrderBy(x => x).Take(num).ToArray();
|
||||
return self.MapPartitionsWithIndex<T>(new TakeOrderedHelper<T>(num, keyFunc).Execute).Collect()
|
||||
.OrderBy(x => keyFunc == null ? x : keyFunc(x)).Take(num).ToArray();
|
||||
}
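The new optional keyFunc turns TakeOrdered into a generic top-N by an arbitrary key; for example, passing a negated key yields the largest elements first. A sketch against the signature above:

```csharp
var numbers = sparkContext.Parallelize(new[] { 10, 1, 2, 9, 3, 4, 5, 6, 7, 8 }, 2);

var smallest = numbers.TakeOrdered(3);           // [1, 2, 3]  - natural ordering
var largest  = numbers.TakeOrdered(3, x => -x);  // [10, 9, 8] - ordered by negated key
```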
|
||||
|
||||
/// <summary>
|
||||
|
@ -1432,13 +1466,15 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
internal class TakeOrderedHelper<T>
|
||||
{
|
||||
private readonly int num;
|
||||
internal TakeOrderedHelper(int num)
|
||||
private readonly Func<T, dynamic> keyFunc;
|
||||
internal TakeOrderedHelper(int num, Func<T, dynamic> keyFunc)
|
||||
{
|
||||
this.num = num;
|
||||
this.keyFunc = keyFunc;
|
||||
}
|
||||
internal IEnumerable<T> Execute(int pid, IEnumerable<T> input)
|
||||
{
|
||||
return input.OrderBy(x => x).Take(num);
|
||||
return input.OrderBy(x => keyFunc == null ? x : keyFunc(x)).Take(num);
|
||||
}
|
||||
}
|
||||
[Serializable]
|
||||
|
|
|
@ -5,12 +5,12 @@ using System;
|
|||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
using System.Reflection;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.Text;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
using Microsoft.Spark.CSharp.Network;
|
||||
using Microsoft.Spark.CSharp.Sql;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
|
@ -23,10 +23,10 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public IEnumerable<dynamic> Collect(int port, SerializedMode serializedMode, Type type)
|
||||
{
|
||||
IFormatter formatter = new BinaryFormatter();
|
||||
Socket sock = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
|
||||
var sock = SocketFactory.CreateSocket();
|
||||
sock.Connect(IPAddress.Loopback, port);
|
||||
|
||||
using (NetworkStream s = new NetworkStream(sock))
|
||||
using (var s = sock.GetStream())
|
||||
{
|
||||
byte[] buffer;
|
||||
while ((buffer = SerDe.ReadBytes(s)) != null && buffer.Length > 0)
|
||||
|
|
|
@ -63,6 +63,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public SparkConf SetMaster(string master)
|
||||
{
|
||||
sparkConfProxy.SetMaster(master);
|
||||
logger.LogInfo("Spark master set to {0}", master);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -73,6 +74,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public SparkConf SetAppName(string appName)
|
||||
{
|
||||
sparkConfProxy.SetAppName(appName);
|
||||
logger.LogInfo("Spark app name set to {0}", appName);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -84,6 +86,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public SparkConf SetSparkHome(string sparkHome)
|
||||
{
|
||||
sparkConfProxy.SetSparkHome(sparkHome);
|
||||
logger.LogInfo("Spark home set to {0}", sparkHome);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -95,6 +98,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
public SparkConf Set(string key, string value)
|
||||
{
|
||||
sparkConfProxy.Set(key, value);
|
||||
logger.LogInfo("Spark configuration key-value set to {0}={1}", key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
|
|
@ -10,14 +10,31 @@ using System.Text;
|
|||
|
||||
using Microsoft.Spark.CSharp.Interop;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// Main entry point for Spark functionality. A SparkContext represents the
|
||||
/// connection to a Spark cluster, and can be used to create RDDs, accumulators
|
||||
/// and broadcast variables on that cluster.
|
||||
/// </summary>
|
||||
public class SparkContext
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkContext));
|
||||
internal ISparkContextProxy SparkContextProxy { get; private set; }
|
||||
internal SparkConf SparkConf { get; private set; }
|
||||
|
||||
private static SparkContext _activeSparkContext = null;
|
||||
|
||||
/// <summary>
|
||||
/// Get existing SparkContext
|
||||
/// </summary>
|
||||
internal static SparkContext GetActiveSparkContext()
|
||||
{
|
||||
return _activeSparkContext;
|
||||
}
|
||||
|
||||
private AccumulatorServer accumulatorServer;
|
||||
private int nextAccumulatorId;
|
||||
|
||||
|
@ -63,20 +80,32 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// </summary>
|
||||
public StatusTracker StatusTracker { get { return new StatusTracker(SparkContextProxy.StatusTracker); } }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a SparkContext instance with a specific master, application name, and spark home
|
||||
/// </summary>
|
||||
/// <param name="master">Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local)</param>
|
||||
/// <param name="appName">A name for your application, to display on the cluster web UI</param>
|
||||
/// <param name="sparkHome">the path that holds spark bits</param>
|
||||
public SparkContext(string master, string appName, string sparkHome)
|
||||
: this(master, appName, sparkHome, null)
|
||||
{
|
||||
}
|
||||
{}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a SparkContext instance with a specific master and application name.
|
||||
/// </summary>
|
||||
/// <param name="master"></param>
|
||||
/// <param name="appName"></param>
|
||||
public SparkContext(string master, string appName)
|
||||
: this(master, appName, null, null)
|
||||
{
|
||||
}
|
||||
{}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a SparkContext instance with a specific spark config.
|
||||
/// </summary>
|
||||
/// <param name="conf">A SparkConf object that represents the settings for spark</param>
|
||||
public SparkContext(SparkConf conf)
|
||||
: this(null, null, null, conf)
|
||||
{
|
||||
}
|
||||
{}
|
||||
|
||||
/// <summary>
|
||||
/// when created from checkpoint
|
||||
|
@ -100,6 +129,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
SparkConf.SetSparkHome(sparkHome);
|
||||
|
||||
SparkContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateSparkContext(SparkConf.SparkConfProxy);
|
||||
_activeSparkContext = this;
|
||||
}
|
||||
|
||||
internal void StartAccumulatorServer()
|
||||
|
@ -112,8 +142,15 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.
|
||||
/// </summary>
|
||||
/// <param name="filePath">The path of file to be read</param>
|
||||
/// <param name="minPartitions">A suggestion value of the minimal splitting number for input data</param>
|
||||
/// <returns>an RDD of Strings</returns>
|
||||
public RDD<string> TextFile(string filePath, int minPartitions = 0)
|
||||
{
|
||||
logger.LogInfo("Reading text file {0} as RDD<string> with {1} partitions", filePath, minPartitions);
|
||||
return new RDD<string>(SparkContextProxy.TextFile(filePath, minPartitions), this, SerializedMode.String);
|
||||
}
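TextFile is the usual entry point for driver programs; a minimal sketch using the signature above, assuming SparkConf has a default constructor (the file path is a placeholder):

```csharp
var conf = new SparkConf().SetAppName("LineCount");   // default SparkConf ctor assumed
var sc = new SparkContext(conf);

// Read the file as RDD<string>, hinting at 4 input splits.
var lines = sc.TextFile(@"hdfs://example/path/input.txt", 4);
Console.WriteLine("line count: {0}", lines.Count());

sc.Stop();
```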
|
||||
|
||||
|
@ -142,6 +179,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
if (numSlices < 1)
|
||||
numSlices = 1;
|
||||
|
||||
logger.LogInfo("Parallelizing {0} items to form RDD in the cluster with {1} partitions", collectionOfByteRepresentationOfObjects.Count, numSlices);
|
||||
return new RDD<T>(SparkContextProxy.Parallelize(collectionOfByteRepresentationOfObjects, numSlices), this);
|
||||
}
|
||||
|
||||
|
@ -170,7 +208,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
///
|
||||
/// Do
|
||||
/// {{{
|
||||
/// <see cref="RDD{KeyValuePair{string, string}}"/> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
|
||||
/// RDD<KeyValuePair<string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
|
||||
/// }}}
|
||||
///
|
||||
/// then `rdd` contains
|
||||
|
@ -208,7 +246,7 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// }}}
|
||||
///
|
||||
/// Do
|
||||
/// <see cref="RDD{KeyValuePair{string, byte[]}}"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
|
||||
/// RDD<KeyValuePair<string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
|
||||
///
|
||||
/// then `rdd` contains
|
||||
/// {{{
|
||||
|
@ -401,9 +439,16 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// </summary>
|
||||
public void Stop()
|
||||
{
|
||||
logger.LogInfo("Stopping SparkContext");
|
||||
logger.LogInfo("Note that there might be error in Spark logs on the failure to delete userFiles directory " +
|
||||
"under Spark temp directory (spark.local.dir config value in local mode)");
|
||||
logger.LogInfo("This error may be ignored for now. See https://issues.apache.org/jira/browse/SPARK-8333 for details");
|
||||
|
||||
if (accumulatorServer != null)
|
||||
accumulatorServer.Shutdown();
|
||||
|
||||
SparkContextProxy.Stop();
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
|
|
@ -9,6 +9,11 @@ using System.Threading.Tasks;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// A class for tracking the statistics of a set of numbers (count, mean and variance) in a numerically
|
||||
/// robust way. Includes support for merging two StatCounters. Based on Welford and Chan's algorithms
|
||||
/// for running variance.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class StatCounter
|
||||
{
|
||||
|
@ -18,9 +23,16 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
private double maxValue = double.MinValue; // Running max of our values
|
||||
private double minValue = double.MaxValue; // Running min of our values
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the StatCounter with no values.
|
||||
/// </summary>
|
||||
public StatCounter()
|
||||
{ }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes the StatCounter with the given values.
|
||||
/// </summary>
|
||||
/// <param name="values"></param>
|
||||
public StatCounter(IEnumerable<double> values)
|
||||
{
|
||||
Merge(values);
|
||||
|
@ -114,10 +126,30 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
other.minValue = minValue;
|
||||
return other;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of values in this StatCounter
|
||||
/// </summary>
|
||||
public long Count { get { return n; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the mean of the values in this StatCounter
|
||||
/// </summary>
|
||||
public double Mean { get { return mu; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the sum of the values in this StatCounter
|
||||
/// </summary>
|
||||
public double Sum { get { return n * mu; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the maximum of the values in this StatCounter
|
||||
/// </summary>
|
||||
public double Max { get { return maxValue; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the minimum of the values in this StatCounter
|
||||
/// </summary>
|
||||
public double Min { get { return minValue; } }
|
||||
|
||||
/// <summary>
|
||||
|
@ -139,6 +171,13 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
/// Return the sample standard deviation of the values, which corrects for bias in estimating the variance by dividing by N-1 instead of N.
|
||||
/// </summary>
|
||||
public double SampleStdev { get { return Math.Sqrt(SampleVariance); } }
|
||||
|
||||
/// <summary>
|
||||
/// Returns a string that represents this StatCounter.
|
||||
/// </summary>
|
||||
/// <returns>
|
||||
/// A string that represents this StatCounter.
|
||||
/// </returns>
|
||||
public override string ToString()
|
||||
{
|
||||
return string.Format("(count: {0}, mean: {1}, stdev: {2}, max: {3}, min: {4})", Count, Mean, Stdev, Max, Min);
|
||||
|
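For reference, the numerically robust merge the class summary refers to (Welford/Chan) combines two partial (count, mean, M2) summaries without ever summing raw squares. A minimal standalone sketch of that rule, not the class's exact internals:

```csharp
// Merge two running summaries: n = count, mu = mean, m2 = sum of squared deviations.
static (long n, double mu, double m2) Merge(
    (long n, double mu, double m2) a, (long n, double mu, double m2) b)
{
    if (a.n == 0) return b;
    if (b.n == 0) return a;

    long n = a.n + b.n;
    double delta = b.mu - a.mu;
    double mu = a.mu + delta * b.n / n;
    double m2 = a.m2 + b.m2 + delta * delta * a.n * b.n / (double)n;
    return (n, mu, m2);
}

// Variance of the merged stream is m2 / n, or m2 / (n - 1) for the sample variance.
```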
|
|
@ -11,6 +11,9 @@ using Microsoft.Spark.CSharp.Proxy;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// Low-level status reporting APIs for monitoring job and stage progress.
|
||||
/// </summary>
|
||||
public class StatusTracker
|
||||
{
|
||||
private readonly IStatusTrackerProxy statusTrackerProxy;
|
||||
|
@ -76,11 +79,21 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SparkJobInfo represents information about a Spark job
|
||||
/// </summary>
|
||||
public class SparkJobInfo
|
||||
{
|
||||
readonly int jobId;
|
||||
readonly int[] stageIds;
|
||||
readonly string status;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a SparkJobInfo instance with a given job Id, stage Ids, and status
|
||||
/// </summary>
|
||||
/// <param name="jobId"></param>
|
||||
/// <param name="stageIds"></param>
|
||||
/// <param name="status"></param>
|
||||
public SparkJobInfo(int jobId, int[] stageIds, string status)
|
||||
{
|
||||
this.jobId = jobId;
|
||||
|
@ -88,12 +101,26 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
this.status = status;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the Id of this Spark job
|
||||
/// </summary>
|
||||
public int JobId { get { return jobId; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the stage Ids of this Spark job
|
||||
/// </summary>
|
||||
public int[] StageIds { get { return stageIds; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the status of this Spark job
|
||||
/// </summary>
|
||||
public string Status { get { return status; } }
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SparkStageInfo represents information about a Spark stage
|
||||
/// </summary>
|
||||
public class SparkStageInfo
|
||||
{
|
||||
readonly int stageId;
|
||||
|
@ -104,6 +131,18 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
readonly int numActiveTasks;
|
||||
readonly int numCompletedTasks;
|
||||
readonly int numFailedTasks;
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a SparkStageInfo instance with given values
|
||||
/// </summary>
|
||||
/// <param name="stageId">The stage Id</param>
|
||||
/// <param name="currentAttemptId">The current attempt Id</param>
|
||||
/// <param name="submissionTime">The submission time</param>
|
||||
/// <param name="name">The name of this stage</param>
|
||||
/// <param name="numTasks">The number of tasks</param>
|
||||
/// <param name="numActiveTasks">The number of active tasks</param>
|
||||
/// <param name="numCompletedTasks">The number of completed tasks</param>
|
||||
/// <param name="numFailedTasks">The number of failed tasks</param>
|
||||
public SparkStageInfo(int stageId, int currentAttemptId, long submissionTime, string name, int numTasks, int numActiveTasks, int numCompletedTasks, int numFailedTasks)
|
||||
{
|
||||
this.stageId = stageId;
|
||||
|
@ -116,13 +155,44 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
this.numFailedTasks = numFailedTasks;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the stage Id of this SparkStageInfo
|
||||
/// </summary>
|
||||
public int StageId { get { return stageId; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the current attempt Id of this SparkStageInfo
|
||||
/// </summary>
|
||||
public int CurrentAttemptId { get { return currentAttemptId; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the submission time of this SparkStageInfo
|
||||
/// </summary>
|
||||
public long SubmissionTime { get { return submissionTime; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the name of this SparkStageInfo
|
||||
/// </summary>
|
||||
public string Name { get { return name; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of tasks of this SparkStageInfo
|
||||
/// </summary>
|
||||
public int NumTasks { get { return numTasks; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of active tasks of this SparkStageInfo
|
||||
/// </summary>
|
||||
public int NumActiveTasks { get { return numActiveTasks; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of completed tasks of this SparkStageInfo
|
||||
/// </summary>
|
||||
public int NumCompletedTasks { get { return numCompletedTasks; } }
|
||||
|
||||
/// <summary>
|
||||
/// Gets the number of failed tasks of this SparkStageInfo
|
||||
/// </summary>
|
||||
public int NumFailedTasks { get { return numFailedTasks; } }
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,21 +9,67 @@ using System.Threading.Tasks;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Core
|
||||
{
|
||||
/// <summary>
|
||||
/// Defines the type of storage levels
|
||||
/// </summary>
|
||||
public enum StorageLevelType
|
||||
{
|
||||
/// <summary>
|
||||
/// Specifies not to use any storage
|
||||
/// </summary>
|
||||
NONE,
|
||||
/// <summary>
|
||||
/// Specifies to use disk only
|
||||
/// </summary>
|
||||
DISK_ONLY,
|
||||
/// <summary>
|
||||
/// Specifies to use disk only with 2 replicas
|
||||
/// </summary>
|
||||
DISK_ONLY_2,
|
||||
/// <summary>
|
||||
/// Specifies to use memory only
|
||||
/// </summary>
|
||||
MEMORY_ONLY,
|
||||
/// <summary>
|
||||
/// Specifies to use memory only with 2 replicas
|
||||
/// </summary>
|
||||
MEMORY_ONLY_2,
|
||||
/// <summary>
|
||||
/// Specifies to use memory only in a serialized format
|
||||
/// </summary>
|
||||
MEMORY_ONLY_SER,
|
||||
/// <summary>
|
||||
/// Specifies to use memory only in a serialized format with 2 replicas
|
||||
/// </summary>
|
||||
MEMORY_ONLY_SER_2,
|
||||
/// <summary>
|
||||
/// Specifies to use disk and memory
|
||||
/// </summary>
|
||||
MEMORY_AND_DISK,
|
||||
/// <summary>
|
||||
/// Specifies to use disk and memory with 2 replicas
|
||||
/// </summary>
|
||||
MEMORY_AND_DISK_2,
|
||||
/// <summary>
|
||||
/// Specifies to use disk and memory in a serialized format
|
||||
/// </summary>
|
||||
MEMORY_AND_DISK_SER,
|
||||
/// <summary>
|
||||
/// Specifies to use disk and memory in a serialized format with 2 replicas
|
||||
/// </summary>
|
||||
MEMORY_AND_DISK_SER_2,
|
||||
/// <summary>
|
||||
/// Specifies to use off heap
|
||||
/// </summary>
|
||||
OFF_HEAP
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Flags for controlling the storage of an RDD. Each StorageLevel records whether to use
|
||||
/// memory, whether to drop the RDD to disk if it falls out of memory, whether to keep the
|
||||
/// data in memory in a serialized format, and whether to replicate the RDD partitions
|
||||
/// on multiple nodes.
|
||||
/// </summary>
|
||||
public class StorageLevel
|
||||
{
|
||||
internal static Dictionary<StorageLevelType, StorageLevel> storageLevel = new Dictionary<StorageLevelType, StorageLevel>
|
||||
|
@ -56,6 +102,10 @@ namespace Microsoft.Spark.CSharp.Core
|
|||
this.replication = replication;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a readable string that represents the type
|
||||
/// </summary>
|
||||
/// <returns>A readable string</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
return string.Format("{0}{1}{2}{3}{4} Replicated",
|
||||
|
|
|
@ -10,7 +10,7 @@ using System.Threading.Tasks;
|
|||
namespace Microsoft.Spark.CSharp.Interop.Ipc
|
||||
{
|
||||
/// <summary>
|
||||
/// Behavior of the bridge used for the IPC interop between JVM & CLR
|
||||
/// Behavior of the bridge used for the IPC interop between JVM and CLR
|
||||
/// </summary>
|
||||
internal interface IJvmBridge : IDisposable
|
||||
{
|
||||
|
|
|
@ -5,23 +5,24 @@ using System;
|
|||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.IO;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
using System.Text;
|
||||
using Microsoft.Spark.CSharp.Network;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Interop.Ipc
|
||||
{
|
||||
/// <summary>
|
||||
/// Implementation of thread safe IPC bridge between JVM & CLR
|
||||
/// throught a concourrent socket connection queue (lightweight synchronisation mechanism)
|
||||
/// Implementation of thread safe IPC bridge between JVM and CLR
|
||||
/// Using a concurrent socket connection queue (lightweight synchronization mechanism)
|
||||
/// supporting async JVM calls like StreamingContext.AwaitTermination()
|
||||
/// </summary>
|
||||
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
|
||||
internal class JvmBridge : IJvmBridge
|
||||
{
|
||||
private int portNumber;
|
||||
private readonly ConcurrentQueue<Socket> sockets = new ConcurrentQueue<Socket>();
|
||||
private readonly ConcurrentQueue<ISocketWrapper> sockets = new ConcurrentQueue<ISocketWrapper>();
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(JvmBridge));
|
||||
|
||||
public void Initialize(int portNumber)
|
||||
|
@ -29,12 +30,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
this.portNumber = portNumber;
|
||||
}
|
||||
|
||||
private Socket GetConnection()
|
||||
private ISocketWrapper GetConnection()
|
||||
{
|
||||
Socket socket;
|
||||
ISocketWrapper socket;
|
||||
if (!sockets.TryDequeue(out socket))
|
||||
{
|
||||
socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
|
||||
socket = SocketFactory.CreateSocket();
|
||||
socket.Connect(IPAddress.Loopback, portNumber);
|
||||
}
|
||||
return socket;
|
||||
|
@ -72,8 +73,8 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
{
|
||||
var overallPayload = PayloadHelper.BuildPayload(isStatic, classNameOrJvmObjectReference, methodName, parameters);
|
||||
|
||||
Socket socket = GetConnection();
|
||||
using (NetworkStream s = new NetworkStream(socket))
|
||||
var socket = GetConnection();
|
||||
using (var s = socket.GetStream())
|
||||
{
|
||||
SerDe.Write(s, overallPayload);
|
||||
|
||||
|
@ -115,7 +116,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
break;
|
||||
|
||||
case 'l':
|
||||
returnValue = ReadJvmObjectReferenceCollection(s);
|
||||
returnValue = ReadCollection(s);
|
||||
|
||||
break;
|
||||
|
||||
|
@ -207,15 +208,56 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return paramsString.ToString();
|
||||
}
|
||||
|
||||
private object ReadJvmObjectReferenceCollection(NetworkStream s)
|
||||
private object ReadCollection(Stream s)
|
||||
{
|
||||
object returnValue;
|
||||
var listItemTypeAsChar = Convert.ToChar(s.ReadByte());
|
||||
int numOfItemsInList = SerDe.ReadInt(s);
|
||||
switch (listItemTypeAsChar)
|
||||
{
|
||||
case 'c':
|
||||
var strList = new List<string>();
|
||||
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
|
||||
{
|
||||
strList.Add(SerDe.ReadString(s));
|
||||
}
|
||||
returnValue = strList;
|
||||
break;
|
||||
case 'i':
|
||||
var intList = new List<int>();
|
||||
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
|
||||
{
|
||||
intList.Add(SerDe.ReadInt(s));
|
||||
}
|
||||
returnValue = intList;
|
||||
break;
|
||||
case 'd':
|
||||
var doubleList = new List<double>();
|
||||
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
|
||||
{
|
||||
doubleList.Add(SerDe.ReadDouble(s));
|
||||
}
|
||||
returnValue = doubleList;
|
||||
break;
|
||||
case 'b':
|
||||
var boolList = new List<bool>();
|
||||
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
|
||||
{
|
||||
boolList.Add(Convert.ToBoolean(s.ReadByte()));
|
||||
}
|
||||
returnValue = boolList;
|
||||
break;
|
||||
case 'r':
|
||||
var byteArrayList = new List<byte[]>();
|
||||
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
|
||||
{
|
||||
var byteArrayLen = SerDe.ReadInt(s);
|
||||
byteArrayList.Add(SerDe.ReadBytes(s, byteArrayLen));
|
||||
}
|
||||
returnValue = byteArrayList;
|
||||
break;
|
||||
case 'j':
|
||||
var jvmObjectReferenceList = new List<JvmObjectReference>();
|
||||
var numOfItemsInList = SerDe.ReadInt(s);
|
||||
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
|
||||
{
|
||||
var itemIdentifier = SerDe.ReadString(s);
|
||||
|
@ -223,7 +265,6 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
}
|
||||
returnValue = jvmObjectReferenceList;
|
||||
break;
|
||||
|
||||
default:
|
||||
// convert listItemTypeAsChar to UInt32 because the char may be non-printable
|
||||
throw new NotSupportedException(
|
||||
|
@ -235,13 +276,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
|
||||
public void Dispose()
|
||||
{
|
||||
Socket socket;
|
||||
ISocketWrapper socket;
|
||||
while (sockets.TryDequeue(out socket))
|
||||
{
|
||||
if (socket != null)
|
||||
{
|
||||
socket.Dispose();
|
||||
socket = null;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
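The collection replies handled by ReadCollection above appear to be framed as a one-byte type tag ('c', 'i', 'd', 'b', 'r' or 'j'), a 4-byte big-endian element count, and then the elements themselves. The snippet below is a minimal, self-contained sketch (not Mobius code) that hand-builds such a payload for an int list and decodes it the way the 'i' branch does; the byte layout is inferred from the SerDe methods that appear later in this commit.

``` c#
using System;
using System.Collections.Generic;
using System.IO;

static class CollectionFramingSketch
{
    // Writes a 32-bit value in big-endian (network) order, matching SerDe.ToBytes(int).
    static void WriteIntBigEndian(Stream s, int value)
    {
        var bytes = BitConverter.GetBytes(value);
        if (BitConverter.IsLittleEndian) Array.Reverse(bytes);
        s.Write(bytes, 0, bytes.Length);
    }

    // Reads a 32-bit big-endian value, mirroring SerDe.ReadInt.
    static int ReadIntBigEndian(Stream s)
    {
        var buffer = new byte[4];
        s.Read(buffer, 0, 4);
        if (BitConverter.IsLittleEndian) Array.Reverse(buffer);
        return BitConverter.ToInt32(buffer, 0);
    }

    static void Main()
    {
        // Encode: one-byte type tag 'i', element count, then each element.
        var payload = new MemoryStream();
        payload.WriteByte((byte)'i');
        var values = new[] { 10, 20, 30 };
        WriteIntBigEndian(payload, values.Length);
        foreach (var v in values) WriteIntBigEndian(payload, v);

        // Decode the same way the 'i' branch of ReadCollection does.
        payload.Position = 0;
        var typeTag = Convert.ToChar(payload.ReadByte());
        var count = ReadIntBigEndian(payload);
        var intList = new List<int>();
        for (var i = 0; i < count; i++) intList.Add(ReadIntBigEndian(payload));

        Console.WriteLine("tag={0}, items={1}", typeTag, string.Join(",", intList));
    }
}
```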
@ -0,0 +1,80 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics.CodeAnalysis;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Interop.Ipc
|
||||
{
|
||||
/// <summary>
|
||||
/// Utility methods for C#-JVM interaction
|
||||
/// </summary>
|
||||
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
|
||||
internal static class JvmBridgeUtils
|
||||
{
|
||||
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
|
||||
{
|
||||
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
|
||||
}
|
||||
return jmap;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
|
||||
{
|
||||
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashMap", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
|
||||
}
|
||||
return jmap;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetScalaMutableMap<K, V>(Dictionary<K, V> mapValues)
|
||||
{
|
||||
var hashMapReference = GetJavaHashMap(mapValues.Select(kvp => kvp));
|
||||
return new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.JvmBridgeUtils", "toMutableMap", new object[] { hashMapReference }).ToString());
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaSet<T>(IEnumerable<T> enumerable)
|
||||
{
|
||||
var jset = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashSet", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jset, "add", new object[] { item });
|
||||
}
|
||||
return jset;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaList<T>(IEnumerable<T> enumerable)
|
||||
{
|
||||
var jlist = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jlist, "add", new object[] { item });
|
||||
}
|
||||
return jlist;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaSeq<T>(IEnumerable<T> enumerable)
|
||||
{
|
||||
return new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "toSeq", GetJavaList<T>(enumerable)));
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaDuration(int durationSeconds)
|
||||
{
|
||||
// Java expects Duration in milliseconds and the value must be of long type
|
||||
return SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { (long)durationSeconds * 1000 });
|
||||
}
|
||||
}
|
||||
}
|
|
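The JvmBridgeUtils helpers above centralize construction of Java-side collections and Durations so that proxy classes do not repeat raw CallConstructor calls (the DStreamIpcProxy changes later in this commit switch to GetJavaDuration, for example). The fragment below is a hypothetical sketch of that usage pattern, not code from this commit; the class, the Kafka settings and the windowSeconds parameter are assumptions, and it only compiles inside the adapter assembly and runs against a live Mobius JVM bridge.

``` c#
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Interop.Ipc;

// Hypothetical proxy-side helper showing the intended JvmBridgeUtils usage pattern.
internal static class JvmArgumentSketch
{
    internal static JvmObjectReference BuildKafkaParams()
    {
        // Placeholder Kafka settings, assumptions for this example only
        var kafkaParams = new Dictionary<string, string>
        {
            { "metadata.broker.list", "localhost:9092" },
            { "auto.offset.reset", "smallest" }
        };
        // java.util.HashMap reference populated through the bridge
        return JvmBridgeUtils.GetJavaHashMap(kafkaParams);
    }

    internal static JvmObjectReference BuildWindowDuration(int windowSeconds)
    {
        // org.apache.spark.streaming.Duration expects milliseconds;
        // GetJavaDuration performs the seconds-to-milliseconds conversion.
        return JvmBridgeUtils.GetJavaDuration(windowSeconds);
    }
}
```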
@ -2,8 +2,10 @@
|
|||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Runtime.CompilerServices;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
|
||||
[assembly: InternalsVisibleTo("Microsoft.Spark.CSharp.Utils")]
|
||||
namespace Microsoft.Spark.CSharp.Interop.Ipc
|
||||
{
|
||||
/// <summary>
|
||||
|
@ -19,6 +21,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
{
|
||||
Id = jvmReferenceId;
|
||||
creationTime = DateTime.UtcNow;
|
||||
SparkCLREnvironment.WeakObjectManager.AddWeakRefereceObject(this);
|
||||
}
|
||||
|
||||
public override string ToString()
|
||||
|
@ -40,6 +43,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return base.Equals(obj);
|
||||
}
|
||||
|
||||
public override int GetHashCode()
|
||||
{
|
||||
return base.GetHashCode();
|
||||
}
|
||||
|
||||
public string GetDebugInfo()
|
||||
{
|
||||
var javaObjectReferenceForClassObject = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(this, "getClass").ToString());
|
||||
|
|
|
@ -12,37 +12,84 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
/// </summary>
|
||||
public enum SpecialLengths : int
|
||||
{
|
||||
/// <summary>
|
||||
/// Flag to indicate the end of data section
|
||||
/// </summary>
|
||||
END_OF_DATA_SECTION = -1,
|
||||
|
||||
/// <summary>
|
||||
/// Flag to indicate an exception thrown from .NET side
|
||||
/// </summary>
|
||||
DOTNET_EXCEPTION_THROWN = -2,
|
||||
|
||||
/// <summary>
|
||||
/// Flag to indicate timing data
|
||||
/// </summary>
|
||||
TIMING_DATA = -3,
|
||||
|
||||
/// <summary>
|
||||
/// Flag to indicate the end of stream
|
||||
/// </summary>
|
||||
END_OF_STREAM = -4,
|
||||
|
||||
/// <summary>
|
||||
/// Flag to indicate a non-defined (null) type
|
||||
/// </summary>
|
||||
NULL = -5,
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serialization and Deserialization of data types between JVM & CLR
|
||||
/// Serialization and Deserialization of data types between JVM and CLR
|
||||
/// </summary>
|
||||
public class SerDe //TODO - add ToBytes() for other types
|
||||
{
|
||||
/// <summary>
|
||||
/// The total number of bytes read
|
||||
/// </summary>
|
||||
public static long totalReadNum = 0;
|
||||
|
||||
/// <summary>
|
||||
/// The total number of bytes written
|
||||
/// </summary>
|
||||
public static long totalWriteNum = 0;
|
||||
|
||||
/// <summary>
|
||||
/// Converts a boolean to a byte array
|
||||
/// </summary>
|
||||
/// <param name="value">The boolean to be converted</param>
|
||||
/// <returns>The byte array converted from a boolean</returns>
|
||||
public static byte[] ToBytes(bool value)
|
||||
{
|
||||
return new[] { System.Convert.ToByte(value) };
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Converts a string to a byte array.
|
||||
/// </summary>
|
||||
/// <param name="value">The string to be converted</param>
|
||||
/// <returns>The byte array converted from a string</returns>
|
||||
public static byte[] ToBytes(string value)
|
||||
{
|
||||
return Encoding.UTF8.GetBytes(value);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts an integer to a byte array
|
||||
/// </summary>
|
||||
/// <param name="value">The intger to be converted</param>
|
||||
/// <returns>The byte array converted from an integer</returns>
|
||||
public static byte[] ToBytes(int value)
|
||||
{
|
||||
var byteRepresentationofInputLength = BitConverter.GetBytes(value);
|
||||
Array.Reverse(byteRepresentationofInputLength);
|
||||
return byteRepresentationofInputLength;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a long integer to a byte array
|
||||
/// </summary>
|
||||
/// <param name="value">The long intger to be converted</param>
|
||||
/// <returns>The byte array converted from a long integer</returns>
|
||||
public static byte[] ToBytes(long value)
|
||||
{
|
||||
var byteRepresentationofInputLength = BitConverter.GetBytes(value);
|
||||
|
@ -50,6 +97,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return byteRepresentationofInputLength;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a double to a byte array
|
||||
/// </summary>
|
||||
/// <param name="value">The double to be converted</param>
|
||||
/// <returns>The byte array converted from a double</returns>
|
||||
public static byte[] ToBytes(double value)
|
||||
{
|
||||
var byteRepresentationofInputLength = BitConverter.GetBytes(value);
|
||||
|
@ -57,16 +109,31 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return byteRepresentationofInputLength;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a byte to a character
|
||||
/// </summary>
|
||||
/// <param name="value">The byte to be converted</param>
|
||||
/// <returns>The char converted from a byte</returns>
|
||||
public static char ToChar(byte value)
|
||||
{
|
||||
return System.Convert.ToChar(value);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a byte array to a string
|
||||
/// </summary>
|
||||
/// <param name="value">The byte array to be converted</param>
|
||||
/// <returns>The string converted from a byte array</returns>
|
||||
public static string ToString(byte[] value)
|
||||
{
|
||||
return Encoding.UTF8.GetString(value);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a byte array to an integer
|
||||
/// </summary>
|
||||
/// <param name="value">The byte array to be converted</param>
|
||||
/// <returns>The integer converted from a byte array</returns>
|
||||
public static int ToInt(byte[] value)
|
||||
{
|
||||
return //Netty byte order is BigEndian
|
||||
|
@ -76,11 +143,21 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
(int)value[0] << 24;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reads an integer from a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <returns>The integer read from stream</returns>
|
||||
public static int ReadInt(Stream s)
|
||||
{
|
||||
return ToInt(ReadBytes(s, 4));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reads a long integer from a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <returns>The long integer read from stream</returns>
|
||||
public static long ReadLong(Stream s)
|
||||
{
|
||||
byte[] buffer = ReadBytes(s, 8);
|
||||
|
@ -94,7 +171,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
(long)buffer[1] << 48 |
|
||||
(long)buffer[0] << 56;
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Reads a double from a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <returns>The double read from stream</returns>
|
||||
public static double ReadDouble(Stream s)
|
||||
{
|
||||
byte[] buffer = ReadBytes(s, 8);
|
||||
|
@ -102,11 +184,24 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return BitConverter.ToDouble(buffer, 0);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reads a string from a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <returns>The string read from stream</returns>
|
||||
public static string ReadString(Stream s)
|
||||
{
|
||||
return ToString(ReadBytes(s));
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Reads a byte array with a given length from a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <param name="length">The length to be read</param>
|
||||
/// <returns>The byte array read from the stream</returns>
|
||||
/// <exception cref="ArgumentOutOfRangeException">An ArgumentOutOfRangeException thrown if the given length is negative</exception>
|
||||
/// <exception cref="ArgumentException">An ArgumentException if the actual read length is less than the given length</exception>
|
||||
public static byte[] ReadBytes(Stream s, int length)
|
||||
{
|
||||
if (length < 0)
|
||||
|
@ -139,6 +234,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return buffer;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reads a byte array from a stream. The first 4 bytes indicate the length of a byte array.
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <returns>The byte array read from stream</returns>
|
||||
public static byte[] ReadBytes(Stream s)
|
||||
{
|
||||
var lengthBuffer = ReadBytes(s, 4);
|
||||
|
@ -152,6 +252,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return ReadBytes(s, length);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Reads an object Id from a stream.
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to be read</param>
|
||||
/// <returns>The object Id read from stream</returns>
|
||||
public static string ReadObjectId(Stream s)
|
||||
{
|
||||
var type = s.ReadByte();
|
||||
|
@ -168,18 +273,33 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
return ReadString(s);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes a byte to a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to write</param>
|
||||
/// <param name="value">The byte to write</param>
|
||||
public static void Write(Stream s, byte value)
|
||||
{
|
||||
s.WriteByte(value);
|
||||
totalWriteNum += 1;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes a byte array to a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to write</param>
|
||||
/// <param name="value">The byte array to write</param>
|
||||
public static void Write(Stream s, byte[] value)
|
||||
{
|
||||
s.Write(value, 0, value.Length);
|
||||
totalWriteNum += value.Length;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes an integer to a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to write</param>
|
||||
/// <param name="value">The integer to write</param>
|
||||
public static void Write(Stream s, int value)
|
||||
{
|
||||
Write(s, new byte[] {
|
||||
|
@ -190,6 +310,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes a long integer to a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to write</param>
|
||||
/// <param name="value">The long integer to write</param>
|
||||
public static void Write(Stream s, long value)
|
||||
{
|
||||
Write(s, new byte[] {
|
||||
|
@ -204,6 +329,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
});
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes a double to a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to write</param>
|
||||
/// <param name="value">The double to write</param>
|
||||
public static void Write(Stream s, double value)
|
||||
{
|
||||
byte[] buffer = BitConverter.GetBytes(value);
|
||||
|
@ -211,6 +341,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
|
|||
Write(s, buffer);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes a string to a stream
|
||||
/// </summary>
|
||||
/// <param name="s">The stream to write</param>
|
||||
/// <param name="value">The string to write</param>
|
||||
public static void Write(Stream s, string value)
|
||||
{
|
||||
byte[] buffer = Encoding.UTF8.GetBytes(value);
|
||||
|
|
|
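SerDe pairs each writer with a reader that consumes the same layout: Write(Stream, int) emits a 4-byte big-endian integer, and Write(Stream, string) is expected to length-prefix the UTF-8 payload so that ReadString can consume it. A minimal round-trip sketch follows, assuming a reference to the Microsoft.Spark.CSharp adapter assembly and that the string writer length-prefixes its payload as just described.

``` c#
using System;
using System.IO;
using Microsoft.Spark.CSharp.Interop.Ipc;

// Round-trip sketch for the SerDe framing over an in-memory stream.
class SerDeRoundTripSketch
{
    static void Main()
    {
        using (var stream = new MemoryStream())
        {
            SerDe.Write(stream, 42);           // 4-byte big-endian int
            SerDe.Write(stream, "spark-clr");  // length-prefixed UTF-8 string

            stream.Position = 0;
            Console.WriteLine(SerDe.ReadInt(stream));    // 42
            Console.WriteLine(SerDe.ReadString(stream)); // spark-clr
        }
    }
}
```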
@ -0,0 +1,261 @@
|
|||
using System;
|
||||
using System.Collections.Concurrent;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Interop.Ipc
|
||||
{
|
||||
using WeakReferenceObjectIdPair = KeyValuePair<WeakReference, string>;
|
||||
|
||||
/// <summary>
|
||||
/// Releases JVMObjectTracker object references.
|
||||
/// The reason lies in how the interop from C# to Java works:
|
||||
/// 1.Java-side: https://github.com/Microsoft/Mobius/blob/master/scala/src/main/org/apache/spark/api/csharp/CSharpBackendHandler.scala#L269
|
||||
/// JVMObjectTracker keeps a HashMap[String, Object] which maps [id, Java-object]
|
||||
/// 2.CSharp-side :
|
||||
/// 1) JvmObjectReference remembers the id : https://github.com/Microsoft/Mobius/blob/master/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs#L20
|
||||
/// 2) So JvmBridge can call the Java object's methods https://github.com/Microsoft/Mobius/blob/master/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs#L69
|
||||
///
|
||||
/// So a potential memory leak can build up in JVMObjectTracker.
|
||||
/// To solve this, track garbage collection on the C# side, get the id, and release the corresponding entry from JVMObjectTracker's HashMap.
|
||||
/// </summary>
|
||||
internal interface IWeakObjectManager : IDisposable
|
||||
{
|
||||
TimeSpan CheckInterval { get; set; }
|
||||
|
||||
void AddWeakRefereceObject(JvmObjectReference obj);
|
||||
|
||||
/// <summary>
|
||||
/// Gets the count of all weak objects, including non-alive objects waiting to be released.
|
||||
/// </summary>
|
||||
int GetReferencesCount();
|
||||
|
||||
/// <summary>
|
||||
/// Gets alive weak object count
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
int GetAliveCount();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adaptively controls the number of weak objects that should be checked in each interval
|
||||
/// </summary>
|
||||
internal class WeakReferenceCheckCountController
|
||||
{
|
||||
private static readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(WeakReferenceCheckCountController));
|
||||
|
||||
private int checkCount;
|
||||
private int referencesCountBenchmark;
|
||||
|
||||
public WeakReferenceCheckCountController(int initialCheckCount, int initialReferencesCountBenchmark)
|
||||
{
|
||||
checkCount = initialCheckCount;
|
||||
referencesCountBenchmark = initialReferencesCountBenchmark;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Adjust checkCount adaptively according to current weak reference objects count
|
||||
/// </summary>
|
||||
public int AdjustCheckCount(int currentReferenceCount)
|
||||
{
|
||||
if (currentReferenceCount > (referencesCountBenchmark + referencesCountBenchmark / 2))
|
||||
{
|
||||
int previousCheckCount = checkCount;
|
||||
int previousReferencesCountBenchmark = referencesCountBenchmark;
|
||||
checkCount *= 2;
|
||||
referencesCountBenchmark = referencesCountBenchmark + referencesCountBenchmark / 2;
|
||||
logger.LogInfo("Adjust checkCount from {0} to {1}, referencesCountBenchmark from {2} to {3}",
|
||||
previousCheckCount, checkCount, previousReferencesCountBenchmark, referencesCountBenchmark);
|
||||
}
|
||||
return checkCount;
|
||||
}
|
||||
}
|
||||
|
||||
internal class WeakObjectManagerImpl : IWeakObjectManager
|
||||
{
|
||||
private static readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(WeakObjectManagerImpl));
|
||||
|
||||
internal static TimeSpan DefaultCheckInterval = TimeSpan.FromSeconds(3);
|
||||
private TimeSpan checkInterval;
|
||||
|
||||
private WeakReferenceCheckCountController checkCountController = new WeakReferenceCheckCountController(10, 1000);
|
||||
|
||||
/// <summary>
|
||||
/// Sleep time for checking thread
|
||||
/// </summary>
|
||||
public TimeSpan CheckInterval
|
||||
{
|
||||
get
|
||||
{
|
||||
return checkInterval;
|
||||
}
|
||||
set
|
||||
{
|
||||
checkInterval = value;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Maximum running duration for checking thread each time
|
||||
/// </summary>
|
||||
private static readonly TimeSpan MaxReleasingDuration = TimeSpan.FromMilliseconds(100);
|
||||
|
||||
private readonly ConcurrentQueue<WeakReferenceObjectIdPair> weakReferences = new ConcurrentQueue<WeakReferenceObjectIdPair>();
|
||||
|
||||
private bool shouldKeepRunning = true;
|
||||
|
||||
private IObjectReleaser objectReleaser = new JvmObjectReleaser();
|
||||
|
||||
internal IObjectReleaser ObjectReleaser
|
||||
{
|
||||
set { objectReleaser = value; }
|
||||
}
|
||||
|
||||
private Thread releaserThread;
|
||||
|
||||
internal WeakObjectManagerImpl(TimeSpan checkIntervalTimeSpan)
|
||||
{
|
||||
checkInterval = checkIntervalTimeSpan;
|
||||
releaserThread = new Thread(RunReleaseObjectLoop) { IsBackground = true };
|
||||
releaserThread.Start();
|
||||
}
|
||||
|
||||
internal WeakObjectManagerImpl() : this(DefaultCheckInterval) { }
|
||||
|
||||
public int GetReferencesCount()
|
||||
{
|
||||
return weakReferences.Count;
|
||||
}
|
||||
|
||||
private void RunReleaseObjectLoop()
|
||||
{
|
||||
logger.LogInfo("Checking objects thread start ...");
|
||||
while (shouldKeepRunning)
|
||||
{
|
||||
ReleseGarbageCollectedObjects();
|
||||
Thread.Sleep(CheckInterval);
|
||||
}
|
||||
|
||||
logger.LogDebug("Checking objects thread stopped.");
|
||||
}
|
||||
|
||||
~WeakObjectManagerImpl()
|
||||
{
|
||||
Dispose();
|
||||
}
|
||||
|
||||
public void AddWeakRefereceObject(JvmObjectReference obj)
|
||||
{
|
||||
if (obj == null || string.IsNullOrEmpty(obj.Id))
|
||||
{
|
||||
logger.LogWarn("Not add null weak object or id : {0}", obj);
|
||||
return;
|
||||
}
|
||||
|
||||
weakReferences.Enqueue(new WeakReferenceObjectIdPair(new WeakReference(obj), obj.ToString()));
|
||||
}
|
||||
|
||||
private void ReleseGarbageCollectedObjects()
|
||||
{
|
||||
int referencesCount = weakReferences.Count;
|
||||
if (referencesCount == 0)
|
||||
{
|
||||
logger.LogDebug("check begin : quit as weakReferences.Count = 0");
|
||||
return;
|
||||
}
|
||||
|
||||
var beginTime = DateTime.Now;
|
||||
int checkCount = checkCountController.AdjustCheckCount(referencesCount);
|
||||
logger.LogDebug("check begin : weakReferences.Count = {0}, checkCount: {1}", referencesCount, checkCount);
|
||||
int garbageCount;
|
||||
var aliveList = ReleseGarbageCollectedObjects(checkCount, out garbageCount);
|
||||
|
||||
var timeReleaseGarbage = DateTime.Now;
|
||||
aliveList.ForEach(item => weakReferences.Enqueue(item));
|
||||
var timeStoreAlive = DateTime.Now;
|
||||
|
||||
logger.LogInfo("check end : released {0} garbage, remain {1} alive, used {2} ms : release garbage used {3} ms, store alive used {4} ms",
|
||||
garbageCount, weakReferences.Count, (DateTime.Now - beginTime).TotalMilliseconds,
|
||||
(timeReleaseGarbage - beginTime).TotalMilliseconds,
|
||||
(timeStoreAlive - timeReleaseGarbage).TotalMilliseconds
|
||||
);
|
||||
}
|
||||
|
||||
private List<WeakReferenceObjectIdPair> ReleseGarbageCollectedObjects(int checkCount, out int garbageCount)
|
||||
{
|
||||
var aliveList = new List<WeakReferenceObjectIdPair>();
|
||||
garbageCount = 0;
|
||||
int i = 0;
|
||||
WeakReferenceObjectIdPair weakReferenceObjectIdPair;
|
||||
while (weakReferences.TryDequeue(out weakReferenceObjectIdPair))
|
||||
{
|
||||
var weakRef = weakReferenceObjectIdPair.Key;
|
||||
if (weakRef.IsAlive)
|
||||
{
|
||||
aliveList.Add(weakReferenceObjectIdPair);
|
||||
}
|
||||
else
|
||||
{
|
||||
objectReleaser.ReleaseObject(weakReferenceObjectIdPair.Value);
|
||||
garbageCount++;
|
||||
}
|
||||
|
||||
i++;
|
||||
if (i >= checkCount)
|
||||
{
|
||||
logger.LogDebug("Stop releasing as exceeded allowed checkCount: {0}", checkCount);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return aliveList;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// It can be an expensive operation. ** Do not use ** unless there is a real need for this method
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public int GetAliveCount()
|
||||
{
|
||||
//copying to get alive count at the time of this method call
|
||||
var copiedList = new Queue<WeakReferenceObjectIdPair>(weakReferences);
|
||||
var count = 0;
|
||||
foreach (var weakReference in copiedList)
|
||||
{
|
||||
if (weakReference.Key.IsAlive)
|
||||
{
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
public virtual void Dispose()
|
||||
{
|
||||
logger.LogInfo("Dispose {0}", this.GetType());
|
||||
shouldKeepRunning = false;
|
||||
}
|
||||
}
|
||||
|
||||
internal interface IObjectReleaser
|
||||
{
|
||||
void ReleaseObject(string objId);
|
||||
}
|
||||
|
||||
internal class JvmObjectReleaser : IObjectReleaser
|
||||
{
|
||||
private const string ReleaseHandler = "SparkCLRHandler";
|
||||
private const string ReleaseMethod = "rm";
|
||||
|
||||
public void ReleaseObject(string objId)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(ReleaseHandler, ReleaseMethod, objId);
|
||||
}
|
||||
}
|
||||
}
|
|
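WeakObjectManagerImpl above is essentially a background loop over a queue of (WeakReference, id) pairs: references whose targets were garbage collected have their JVM-side counterpart released, while live ones are re-queued. The following is a minimal standalone sketch of that sweep pattern with a fake releaser, using only the BCL; it is illustrative, not the Mobius implementation.

``` c#
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;

// Standalone sketch: enqueue (WeakReference, id) pairs, periodically sweep them,
// release ids whose targets were collected, and re-queue the ones still alive.
class WeakSweepSketch
{
    static readonly ConcurrentQueue<KeyValuePair<WeakReference, string>> references =
        new ConcurrentQueue<KeyValuePair<WeakReference, string>>();

    static void Track(object obj, string id)
    {
        references.Enqueue(new KeyValuePair<WeakReference, string>(new WeakReference(obj), id));
    }

    static void Sweep(Action<string> releaseOnJvm)
    {
        var alive = new List<KeyValuePair<WeakReference, string>>();
        KeyValuePair<WeakReference, string> pair;
        while (references.TryDequeue(out pair))
        {
            if (pair.Key.IsAlive)
                alive.Add(pair);          // still referenced on the C# side, keep tracking
            else
                releaseOnJvm(pair.Value); // tell the JVM-side tracker to drop the object
        }
        alive.ForEach(p => references.Enqueue(p));
    }

    static void Main()
    {
        Track(new object(), "1");
        GC.Collect();
        GC.WaitForPendingFinalizers();
        Sweep(id => Console.WriteLine("released " + id));
    }
}
```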
@ -8,6 +8,7 @@ using Microsoft.Spark.CSharp.Interop.Ipc;
|
|||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
|
||||
[assembly: InternalsVisibleTo("Tests.Common")]
|
||||
[assembly: InternalsVisibleTo("AdapterTest")]
|
||||
[assembly: InternalsVisibleTo("WorkerTest")]
|
||||
// DynamicProxyGenAssembly2 is a temporary assembly built by mocking systems that use CastleProxy like Moq
|
||||
|
@ -39,5 +40,12 @@ namespace Microsoft.Spark.CSharp.Interop
|
|||
configurationService = value;
|
||||
}
|
||||
}
|
||||
|
||||
private static IWeakObjectManager weakObjectManager;
|
||||
internal static IWeakObjectManager WeakObjectManager
|
||||
{
|
||||
get { return weakObjectManager ?? (weakObjectManager = new WeakObjectManagerImpl()); }
|
||||
set { weakObjectManager = value; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,130 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Network
|
||||
{
|
||||
/// <summary>
|
||||
/// A simple wrapper of System.Net.Sockets.Socket class.
|
||||
/// </summary>
|
||||
public class DefaultSocketWrapper : ISocketWrapper
|
||||
{
|
||||
private readonly Socket innerSocket;
|
||||
|
||||
/// <summary>
|
||||
/// Default constructor that creates a new instance of the DefaultSocketWrapper class, which represents
|
||||
/// a traditional socket (System.Net.Sockets.Socket).
|
||||
///
|
||||
/// This socket is bound to Loopback with port 0.
|
||||
/// </summary>
|
||||
public DefaultSocketWrapper()
|
||||
{
|
||||
innerSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
|
||||
var localEndPoint = new IPEndPoint(IPAddress.Loopback, 0);
|
||||
innerSocket.Bind(localEndPoint);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes an instance of the DefaultSocketWrapper class using the specified System.Net.Sockets.Socket object.
|
||||
/// </summary>
|
||||
/// <param name="socket">The existing socket</param>
|
||||
private DefaultSocketWrapper(Socket socket)
|
||||
{
|
||||
innerSocket = socket;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Accepts an incoming connection request.
|
||||
/// </summary>
|
||||
/// <returns>A DefaultSocketWrapper instance used to send and receive data</returns>
|
||||
public ISocketWrapper Accept()
|
||||
{
|
||||
var socket = innerSocket.Accept();
|
||||
return new DefaultSocketWrapper(socket);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Closes the socket connection and releases all associated resources.
|
||||
/// </summary>
|
||||
public void Close()
|
||||
{
|
||||
innerSocket.Close();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Establishes a connection to a remote host that is specified by an IP address and a port number
|
||||
/// </summary>
|
||||
/// <param name="remoteaddr">The IP address of the remote host</param>
|
||||
/// <param name="port">The port number of the remote host</param>
|
||||
public void Connect(IPAddress remoteaddr, int port)
|
||||
{
|
||||
var remoteEndPoint = new IPEndPoint(remoteaddr, port);
|
||||
innerSocket.Connect(remoteEndPoint);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the NetworkStream used to send and receive data.
|
||||
/// </summary>
|
||||
/// <returns>The underlying Stream instance that can be used to send and receive data</returns>
|
||||
/// <remarks>
|
||||
/// GetStream returns a NetworkStream that you can use to send and receive data. You must close/dispose
|
||||
/// the NetworkStream by yourself. Closing DefaultSocketWrapper does not release the NetworkStream
|
||||
/// </remarks>
|
||||
public Stream GetStream()
|
||||
{
|
||||
return new NetworkStream(innerSocket);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Starts listening for incoming connection requests
|
||||
/// </summary>
|
||||
/// <param name="backlog">The maximum length of the pending connections queue. </param>
|
||||
public void Listen(int backlog = (int)SocketOptionName.MaxConnections)
|
||||
{
|
||||
innerSocket.Listen(backlog);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Disposes the resources used by this instance of the DefaultSocketWrapper class.
|
||||
/// </summary>
|
||||
/// <param name="disposing"></param>
|
||||
protected virtual void Dispose(bool disposing)
|
||||
{
|
||||
if (disposing)
|
||||
{
|
||||
innerSocket.Dispose();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Releases all resources used by the current instance of the DefaultSocketWrapper class.
|
||||
/// </summary>
|
||||
public void Dispose()
|
||||
{
|
||||
Dispose(true);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Frees resources used by the DefaultSocketWrapper class
|
||||
/// </summary>
|
||||
~DefaultSocketWrapper()
|
||||
{
|
||||
Dispose(false);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the local endpoint.
|
||||
/// </summary>
|
||||
public EndPoint LocalEndPoint
|
||||
{
|
||||
get
|
||||
{
|
||||
return innerSocket.LocalEndPoint;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Network
|
||||
{
|
||||
/// <summary>
|
||||
/// ISocketWrapper interface defines the common methods to operate a socket (traditional socket or
|
||||
/// Windows Registered IO socket)
|
||||
/// </summary>
|
||||
public interface ISocketWrapper : IDisposable
|
||||
{
|
||||
/// <summary>
|
||||
/// Accepts an incoming connection request.
|
||||
/// </summary>
|
||||
/// <returns>An ISocketWrapper instance used to send and receive data</returns>
|
||||
ISocketWrapper Accept();
|
||||
|
||||
/// <summary>
|
||||
/// Closes the ISocketWrapper connection and releases all associated resources.
|
||||
/// </summary>
|
||||
void Close();
|
||||
|
||||
/// <summary>
|
||||
/// Establishes a connection to a remote host that is specified by an IP address and a port number
|
||||
/// </summary>
|
||||
/// <param name="remoteaddr">The IP address of the remote host</param>
|
||||
/// <param name="port">The port number of the remote host</param>
|
||||
void Connect(IPAddress remoteaddr, int port);
|
||||
|
||||
/// <summary>
|
||||
/// Returns a stream used to send and receive data.
|
||||
/// </summary>
|
||||
/// <returns>The underlying Stream instance that can be used to send and receive data</returns>
|
||||
Stream GetStream();
|
||||
|
||||
/// <summary>
|
||||
/// Starts listening for incoming connection requests
|
||||
/// </summary>
|
||||
/// <param name="backlog">The maximum length of the pending connections queue. </param>
|
||||
void Listen(int backlog = (int)SocketOptionName.MaxConnections);
|
||||
|
||||
/// <summary>
|
||||
/// Returns the local endpoint.
|
||||
/// </summary>
|
||||
EndPoint LocalEndPoint { get; }
|
||||
}
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Network
|
||||
{
|
||||
/// <summary>
|
||||
/// SocketFactory is used to create an ISocketWrapper instance based on the configuration and OS version.
|
||||
///
|
||||
/// The ISocketWrapper instance can be a RioSocket object if the configuration is set to RioSocket and
|
||||
/// the application is running on a Windows OS that supports Registered IO sockets.
|
||||
/// </summary>
|
||||
public static class SocketFactory
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates an ISocketWrapper instance based on the configuration and OS version.
|
||||
/// </summary>
|
||||
/// <returns>
|
||||
/// A RioSocket instance if the configuration is set to RioSocket and the application
|
||||
/// is running on a Windows OS that supports Registered IO sockets. By default, it returns a
|
||||
/// DefaultSocketWrapper instance, which wraps System.Net.Sockets.Socket.
|
||||
/// </returns>
|
||||
public static ISocketWrapper CreateSocket()
|
||||
{
|
||||
return new DefaultSocketWrapper();
|
||||
}
|
||||
}
|
||||
}
|
|
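The ISocketWrapper abstraction above is what JvmBridge.GetConnection now uses instead of System.Net.Sockets.Socket directly. A minimal sketch of the client-side pattern follows; the port number is a placeholder assumption, and a backend listener must already be accepting on it for the sketch to run.

``` c#
using System.IO;
using System.Net;
using Microsoft.Spark.CSharp.Network;

// Client-side sketch of the wrapper usage pattern that JvmBridge.GetConnection follows:
// create a socket via the factory, connect to the loopback backend port, then use the
// wrapped stream for I/O. Port 12345 is a placeholder.
class SocketWrapperSketch
{
    static void Main()
    {
        ISocketWrapper socket = SocketFactory.CreateSocket();
        socket.Connect(IPAddress.Loopback, 12345);
        using (Stream stream = socket.GetStream())
        {
            stream.WriteByte(0); // real traffic would be framed with SerDe
            stream.Flush();
        }
        socket.Close();
        socket.Dispose();
    }
}
```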
@ -30,5 +30,5 @@ using System.Runtime.InteropServices;
|
|||
// Build Number
|
||||
// Revision
|
||||
//
|
||||
[assembly: AssemblyVersion("1.6.0.0")]
|
||||
[assembly: AssemblyFileVersion("1.6.0.0")]
|
||||
[assembly: AssemblyVersion("1.6.1.0")]
|
||||
[assembly: AssemblyFileVersion("1.6.1.0")]
|
||||
|
|
|
@ -19,7 +19,7 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
void CallForeachRDD(byte[] func, string serializedMode);
|
||||
void Print(int num = 10);
|
||||
void Persist(StorageLevelType storageLevelType);
|
||||
void Checkpoint(long intervalMs);
|
||||
void Checkpoint(int intervalSeconds);
|
||||
IRDDProxy[] Slice(long fromUnixTime, long toUnixTime);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,6 +42,7 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
IDataFrameProxy Replace<T>(object subset, Dictionary<T, T> toReplaceAndValueDict);
|
||||
IEnumerable<IDataFrameProxy> RandomSplit(IEnumerable<double> weights, long? seed);
|
||||
IDataFrameProxy Sort(IColumnProxy[] columns);
|
||||
IDataFrameProxy SortWithinPartitions(IColumnProxy[] columns);
|
||||
IDataFrameProxy Alias(string alias);
|
||||
double Corr(string column1, string column2, string method);
|
||||
double Cov(string column1, string column2);
|
||||
|
@ -55,6 +56,8 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
void Persist(StorageLevelType storageLevelType);
|
||||
void Unpersist(bool blocking = true);
|
||||
IDataFrameProxy Repartition(int numPartitions);
|
||||
IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns);
|
||||
IDataFrameProxy Repartition(IColumnProxy[] columns);
|
||||
IDataFrameProxy Sample(bool withReplacement, double fraction, long seed);
|
||||
IDataFrameWriterProxy Write();
|
||||
}
|
||||
|
|
|
@ -3,7 +3,6 @@
|
|||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Data;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
|
|
@ -42,6 +42,5 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
void SaveAsTextFile(string path, string compressionCodecClass);
|
||||
long Count();
|
||||
int CollectAndServe();
|
||||
int PartitionLength();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,7 +16,7 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
// or restore it from checkpoint. Thus this function is called before IStreamingContextProxy is initialized. So CheckpointExists()
|
||||
// should not be put to IStreamingContextProxy.
|
||||
bool CheckpointExists(string checkpointPath);
|
||||
IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, long durationMs);
|
||||
IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, int durationSeconds);
|
||||
IStreamingContextProxy CreateStreamingContext(string checkpointPath);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
internal interface ISparkContextProxy
|
||||
{
|
||||
ISqlContextProxy CreateSqlContext();
|
||||
ISqlContextProxy CreateHiveContext();
|
||||
IColumnProxy CreateColumnFromName(string name);
|
||||
IColumnProxy CreateFunction(string name, object self);
|
||||
IColumnProxy CreateBinaryMathFunction(string name, object self, object other);
|
||||
|
@ -50,7 +51,7 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
int RunJob(IRDDProxy rdd, IEnumerable<int> partitions);
|
||||
IBroadcastProxy ReadBroadcastFromFile(string path, out long broadcastId);
|
||||
IRDDProxy CreateCSharpRdd(IRDDProxy prefvJavaRddReference, byte[] command, Dictionary<string, string> environmentVariables, List<string> pythonIncludes, bool preservePartitioning, List<Broadcast> broadcastVariables, List<byte[]> accumulator);
|
||||
IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions);
|
||||
IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions, long partitionFuncId);
|
||||
IUDFProxy CreateUserDefinedCSharpFunction(string name, byte[] command, string returnType);
|
||||
}
|
||||
internal interface IBroadcastProxy
|
||||
|
|
|
@ -14,12 +14,31 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
internal interface ISqlContextProxy
|
||||
{
|
||||
IDataFrameReaderProxy Read();
|
||||
ISqlContextProxy NewSession();
|
||||
string GetConf(string key, string defaultValue);
|
||||
void SetConf(string key, string value);
|
||||
IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy);
|
||||
void RegisterDataFrameAsTable(IDataFrameProxy dataFrameProxy, string tableName);
|
||||
void DropTempTable(string tableName);
|
||||
IDataFrameProxy Table(string tableName);
|
||||
IDataFrameProxy Tables();
|
||||
IDataFrameProxy Tables(string databaseName);
|
||||
IEnumerable<string> TableNames();
|
||||
IEnumerable<string> TableNames(string databaseName);
|
||||
void CacheTable(string tableName);
|
||||
void UncacheTable(string tableName);
|
||||
void ClearCache();
|
||||
bool IsCached(string tableName);
|
||||
IDataFrameProxy ReadDataFrame(string path, StructType schema, Dictionary<string, string> options);
|
||||
IDataFrameProxy JsonFile(string path);
|
||||
IDataFrameProxy TextFile(string path, StructType schema, string delimiter);
|
||||
IDataFrameProxy TextFile(string path, string delimiter, bool hasHeader, bool inferSchema);
|
||||
IDataFrameProxy Sql(string query);
|
||||
void RegisterFunction(string name, byte[] command, string returnType);
|
||||
|
||||
#region HiveContext
|
||||
void RefreshTable(string tableName);
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
|
@ -16,18 +16,23 @@ namespace Microsoft.Spark.CSharp.Proxy
|
|||
SparkContext SparkContext { get; }
|
||||
void Start();
|
||||
void Stop();
|
||||
void Remember(long durationMs);
|
||||
void Remember(int durationSeconds);
|
||||
void Checkpoint(string directory);
|
||||
IDStreamProxy TextFileStream(string directory);
|
||||
IDStreamProxy SocketTextStream(string hostname, int port, StorageLevelType storageLevelType);
|
||||
IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType);
|
||||
IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets);
|
||||
IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
|
||||
int numPartitions, byte[] readFunc, string serializationMode);
|
||||
IDStreamProxy Union(IDStreamProxy firstDStreams, IDStreamProxy[] otherDStreams);
|
||||
void AwaitTermination();
|
||||
void AwaitTermination(int timeout);
|
||||
void AwaitTerminationOrTimeout(long timeout);
|
||||
IDStreamProxy CreateCSharpDStream(IDStreamProxy jdstream, byte[] func, string serializationMode);
|
||||
IDStreamProxy CreateCSharpTransformed2DStream(IDStreamProxy jdstream, IDStreamProxy jother, byte[] func, string serializationMode, string serializationModeOther);
|
||||
IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string serializationMode);
|
||||
IDStreamProxy CreateCSharpStateDStream(IDStreamProxy jdstream, byte[] func, string className, string serializationMode, string serializationMode2);
|
||||
IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy);
|
||||
IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,7 +40,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
public IDStreamProxy Window(int windowSeconds, int slideSeconds = 0)
|
||||
{
|
||||
string windowId = null;
|
||||
var windowDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { windowSeconds * 1000 });
|
||||
var windowDurationReference = JvmBridgeUtils.GetJavaDuration(windowSeconds);
|
||||
|
||||
if (slideSeconds <= 0)
|
||||
{
|
||||
|
@ -48,7 +48,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new DStreamIpcProxy(new JvmObjectReference(windowId));
|
||||
}
|
||||
|
||||
var slideDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { slideSeconds * 1000 });
|
||||
var slideDurationReference = JvmBridgeUtils.GetJavaDuration(slideSeconds);
|
||||
windowId = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(javaDStreamReference, "window", new object[] { windowDurationReference, slideDurationReference });
|
||||
|
||||
return new DStreamIpcProxy(new JvmObjectReference(windowId));
|
||||
|
@ -77,9 +77,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDStreamReference, "persist", new object[] { jstorageLevel });
|
||||
}
|
||||
|
||||
public void Checkpoint(long intervalMs)
|
||||
public void Checkpoint(int intervalSeconds)
|
||||
{
|
||||
var jinterval = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { intervalMs });
|
||||
var jinterval = JvmBridgeUtils.GetJavaDuration(intervalSeconds);
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDStreamReference, "checkpoint", new object[] { jinterval });
|
||||
}
|
||||
|
||||
|
|
|
@ -14,6 +14,12 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
internal class DataFrameIpcProxy : IDataFrameProxy
|
||||
{
|
||||
private readonly JvmObjectReference jvmDataFrameReference;
|
||||
|
||||
internal JvmObjectReference JvmDataFrameReference
|
||||
{
|
||||
get { return jvmDataFrameReference; }
|
||||
}
|
||||
|
||||
private readonly ISqlContextProxy sqlContextProxy;
|
||||
|
||||
private readonly DataFrameNaFunctions na;
|
||||
|
@ -405,6 +411,20 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
jvmDataFrameReference, "sort", columnsSeq).ToString()), sqlContextProxy);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Call https://github.com/apache/spark/blob/branch-1.6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, sortWithinPartitions(sortExprs: Column*): DataFrame
|
||||
/// </summary>
|
||||
/// <param name="columns"></param>
|
||||
/// <returns></returns>
|
||||
public IDataFrameProxy SortWithinPartitions(IColumnProxy[] columns)
|
||||
{
|
||||
var columnsSeq = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
|
||||
"toSeq", new object[] { columns.Select(c => (c as ColumnIpcProxy).ScalaColumnReference).ToArray() }));
|
||||
|
||||
return new DataFrameIpcProxy(new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
|
||||
jvmDataFrameReference, "sortWithinPartitions", columnsSeq).ToString()), sqlContextProxy);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Call https://github.com/apache/spark/blob/branch-1.4/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, as(alias: String): DataFrame
|
||||
/// </summary>
|
||||
|
@ -517,6 +537,35 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
new object[] { numPartitions }).ToString()), sqlContextProxy);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Call https://github.com/apache/spark/blob/branch-1.6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, repartition(numPartitions: Int, partitionExprs: Column*): DataFrame
|
||||
/// </summary>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <param name="columns"></param>
|
||||
/// <returns></returns>
|
||||
public IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns)
|
||||
{
|
||||
var columnsSeq = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
|
||||
"toSeq", new object[] { columns.Select(c => (c as ColumnIpcProxy).ScalaColumnReference).ToArray() }));
|
||||
|
||||
return new DataFrameIpcProxy(new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
|
||||
jvmDataFrameReference, "repartition", new object[] { numPartitions, columnsSeq }).ToString()), sqlContextProxy);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Call https://github.com/apache/spark/blob/branch-1.6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, repartition(partitionExprs: Column*): DataFrame
|
||||
/// </summary>
|
||||
/// <param name="columns"></param>
|
||||
/// <returns></returns>
|
||||
public IDataFrameProxy Repartition(IColumnProxy[] columns)
|
||||
{
|
||||
var columnsSeq = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
|
||||
"toSeq", new object[] { columns.Select(c => (c as ColumnIpcProxy).ScalaColumnReference).ToArray() }));
|
||||
|
||||
return new DataFrameIpcProxy(new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
|
||||
jvmDataFrameReference, "repartition", new object[] { columnsSeq }).ToString()), sqlContextProxy);
|
||||
}
|
||||
|
||||
public IDataFrameProxy Sample(bool withReplacement, double fraction, long seed)
|
||||
{
|
||||
return
|
||||
|
|
|
@ -16,6 +16,7 @@ using Microsoft.Spark.CSharp.Interop.Ipc;
|
|||
namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
||||
{
|
||||
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
|
||||
[Serializable]
|
||||
internal class RDDIpcProxy : IRDDProxy
|
||||
{
|
||||
private readonly JvmObjectReference jvmRddReference;
|
||||
|
@ -78,13 +79,6 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new RDDIpcProxy(jref);
|
||||
}
|
||||
|
||||
public int PartitionLength()
|
||||
{
|
||||
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "rdd"));
|
||||
var partitions = SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(rdd, "partitions", new object[] { });
|
||||
return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("java.lang.reflect.Array", "getLength", new object[] { partitions }).ToString());
|
||||
}
|
||||
|
||||
public IRDDProxy Coalesce(int numPartitions, bool shuffle)
|
||||
{
|
||||
return new RDDIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "coalesce", new object[] { numPartitions, shuffle })));
|
||||
|
@ -166,7 +160,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed)
|
||||
{
|
||||
var jfractions = SparkContextIpcProxy.GetJavaMap(fractions) as JvmObjectReference;
|
||||
var jfractions = JvmBridgeUtils.GetJavaMap(fractions) as JvmObjectReference;
|
||||
return new RDDIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "sampleByKey", new object[] { withReplacement, jfractions, seed })));
|
||||
}
|
||||
|
||||
|
@ -184,25 +178,25 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
|
||||
{
|
||||
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, true });
|
||||
}
|
||||
|
||||
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
|
||||
{
|
||||
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsNewAPIHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf });
|
||||
}
|
||||
|
||||
public void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
|
||||
{
|
||||
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, false });
|
||||
}
|
||||
|
||||
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
|
||||
{
|
||||
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf, compressionCodecClass });
|
||||
}
|
||||
|
||||
|
@ -211,17 +205,18 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "SaveAsSequenceFile", new object[] { jvmRddReference, false, path, compressionCodecClass });
|
||||
}
|
||||
|
||||
//this method is called by RDD<string> (implementation is at StringRDDFunctions.SaveAsTextFile)
|
||||
//calling saveAsTextFile() on CSharpRDDs result in bytes written to text file - so calling saveStringRddAsTextFile() which converts bytes to string before writing to file
|
||||
public void SaveAsTextFile(string path, string compressionCodecClass)
|
||||
{
|
||||
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "rdd"));
|
||||
if (!string.IsNullOrEmpty(compressionCodecClass))
|
||||
{
|
||||
var codec = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("java.lang.Class", "forName", new object[] { compressionCodecClass }));
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "saveAsTextFile", new object[] { path, codec });
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.csharp.CSharpRDD", "saveStringRddAsTextFile", new object[] { jvmRddReference, path, codec });
|
||||
}
|
||||
else
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "saveAsTextFile", new object[] { path });
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.csharp.CSharpRDD", "saveStringRddAsTextFile", new object[] { jvmRddReference, path });
|
||||
}
|
||||
}
|
||||
public StorageLevel GetStorageLevel()
|
||||
|
|
|
@ -80,9 +80,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(fs, "listStatus", path) != null;
|
||||
}
|
||||
|
||||
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, long durationMs)
|
||||
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, int durationSeconds)
|
||||
{
|
||||
streamingContextIpcProxy = new StreamingContextIpcProxy(sparkContext, durationMs);
|
||||
streamingContextIpcProxy = new StreamingContextIpcProxy(sparkContext, durationSeconds);
|
||||
return streamingContextIpcProxy;
|
||||
}
|
||||
|
||||
|
|
|
@ -39,6 +39,13 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new SqlContextIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSQLContext", new object[] { jvmSparkContextReference })));
|
||||
}
|
||||
|
||||
public ISqlContextProxy CreateHiveContext()
|
||||
{
|
||||
return new SqlContextIpcProxy(new JvmObjectReference(
|
||||
(string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
|
||||
"org.apache.spark.sql.api.csharp.SQLUtils", "createHiveContext", new object[] { jvmSparkContextReference })));
|
||||
}
|
||||
|
||||
public void CreateSparkContext(string master, string appName, string sparkHome, ISparkConfProxy conf)
|
||||
{
|
||||
object[] args = (new object[] { master, appName, sparkHome, (conf == null ? null : (conf as SparkConfIpcProxy).JvmSparkConfReference) }).Where(x => x != null).ToArray();
|
||||
|
@ -152,7 +159,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = GetJavaHashMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopFile",
|
||||
new object[] { jvmJavaContextReference, filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
|
||||
return new RDDIpcProxy(jvmRddReference);
|
||||
|
@ -160,7 +167,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = GetJavaHashMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopRDD",
|
||||
new object[] { jvmJavaContextReference, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
|
||||
return new RDDIpcProxy(jvmRddReference);
|
||||
|
@ -168,7 +175,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = GetJavaHashMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopFile",
|
||||
new object[] { jvmJavaContextReference, filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
|
||||
return new RDDIpcProxy(jvmRddReference);
|
||||
|
@ -176,7 +183,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
|
||||
{
|
||||
var jconf = GetJavaHashMap<string, string>(conf);
|
||||
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopRDD",
|
||||
new object[] { jvmJavaContextReference, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
|
||||
return new RDDIpcProxy(jvmRddReference);
|
||||
|
@ -191,7 +198,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
public IRDDProxy Union(IEnumerable<IRDDProxy> rdds)
|
||||
{
|
||||
var jfirst = (rdds.First() as RDDIpcProxy).JvmRddReference;
|
||||
var jrest = GetJavaList<JvmObjectReference>(rdds.Skip(1).Select(r => (r as RDDIpcProxy).JvmRddReference));
|
||||
var jrest = JvmBridgeUtils.GetJavaList<JvmObjectReference>(rdds.Skip(1).Select(r => (r as RDDIpcProxy).JvmRddReference));
|
||||
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "union", new object[] { jfirst, jrest }));
|
||||
return new RDDIpcProxy(jvmRddReference);
|
||||
}
|
||||
|
@ -250,13 +257,20 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
}
|
||||
}
|
||||
|
||||
public IRDDProxy CreatePairwiseRDD(IRDDProxy jvmReferenceOfByteArrayRdd, int numPartitions)
|
||||
/// <summary>
|
||||
/// Create a PairwiseRDD.
|
||||
/// </summary>
|
||||
/// <param name="jvmReferenceOfByteArrayRdd"></param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <param name="partitionFuncId">Global unique id of partitioner which is used for comparison PythonPartitioners in JVM.</param>
|
||||
/// <returns></returns>
|
||||
public IRDDProxy CreatePairwiseRDD(IRDDProxy jvmReferenceOfByteArrayRdd, int numPartitions, long partitionFuncId)
|
||||
{
|
||||
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod((jvmReferenceOfByteArrayRdd as RDDIpcProxy).JvmRddReference, "rdd"));
|
||||
var pairwiseRdd = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PairwiseRDD", rdd);
|
||||
var pairRddJvmReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(pairwiseRdd, "asJavaPairRDD", new object[] { }).ToString());
|
||||
|
||||
var jpartitionerJavaReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonPartitioner", new object[] { numPartitions, (long)0 });
|
||||
var jpartitionerJavaReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonPartitioner", new object[] { numPartitions, partitionFuncId });
|
||||
var partitionedPairRddJvmReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(pairRddJvmReference, "partitionBy", new object[] { jpartitionerJavaReference }).ToString());
|
||||
var jvmRddReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "valueOfPair", new object[] { partitionedPairRddJvmReference }).ToString());
|
||||
//var jvmRddReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(partitionedRddJvmReference, "rdd", new object[] { }).ToString());
|
||||
|
@ -267,7 +281,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
{
|
||||
var hashTableReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
|
||||
var arrayListReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
|
||||
var jbroadcastVariables = GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
|
||||
var jbroadcastVariables = JvmBridgeUtils.GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
|
||||
|
||||
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod((prevJvmRddReference as RDDIpcProxy).JvmRddReference, "rdd"));
|
||||
|
||||
|
@ -288,7 +302,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
{
|
||||
var jSqlContext = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.sql.SQLContext", new object[] { jvmSparkContextReference });
|
||||
var jDataType = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jSqlContext, "parseDataType", new object[] { "\"" + returnType + "\"" }));
|
||||
var jbroadcastVariables = GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
|
||||
var jbroadcastVariables = JvmBridgeUtils.GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
|
||||
|
||||
var hashTableReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
|
||||
var arrayListReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
|
||||
|
@ -306,7 +320,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public int RunJob(IRDDProxy rdd, IEnumerable<int> partitions)
|
||||
{
|
||||
var jpartitions = GetJavaList<int>(partitions);
|
||||
var jpartitions = JvmBridgeUtils.GetJavaList<int>(partitions);
|
||||
return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "runJob", new object[] { jvmSparkContextReference, (rdd as RDDIpcProxy).JvmRddReference, jpartitions }).ToString());
|
||||
}
|
||||
|
||||
|
@ -333,7 +347,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
if (self is ColumnIpcProxy)
|
||||
self = (self as ColumnIpcProxy).ScalaColumnReference;
|
||||
else if (self is IColumnProxy[])
|
||||
self = GetJavaSeq<JvmObjectReference>((self as IColumnProxy[]).Select(x => (x as ColumnIpcProxy).ScalaColumnReference));
|
||||
self = JvmBridgeUtils.GetJavaSeq<JvmObjectReference>((self as IColumnProxy[]).Select(x => (x as ColumnIpcProxy).ScalaColumnReference));
|
||||
return new ColumnIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.functions", name, self)));
|
||||
}
|
||||
|
||||
|
@ -351,52 +365,6 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new ColumnIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.functions", name)));
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
|
||||
{
|
||||
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
|
||||
}
|
||||
return jmap;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
|
||||
{
|
||||
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashMap", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
|
||||
}
|
||||
return jmap;
|
||||
}
|
||||
|
||||
public static JvmObjectReference GetJavaSet<T>(IEnumerable<T> enumerable)
|
||||
{
|
||||
var jset = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashSet", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jset, "add", new object[] { item });
|
||||
}
|
||||
return jset;
|
||||
}
|
||||
public static JvmObjectReference GetJavaList<T>(IEnumerable<T> enumerable)
|
||||
{
|
||||
var jlist = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
|
||||
if (enumerable != null)
|
||||
{
|
||||
foreach (var item in enumerable)
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jlist, "add", new object[] { item });
|
||||
}
|
||||
return jlist;
|
||||
}
|
||||
public static JvmObjectReference GetJavaSeq<T>(IEnumerable<T> enumerable)
|
||||
{
|
||||
return new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "toSeq", GetJavaList<T>(enumerable)));
|
||||
}
|
||||
public static JvmObjectReference GetJavaStorageLevel(StorageLevelType storageLevelType)
|
||||
{
|
||||
return new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.java.StorageLevels", "create",
|
||||
|
|
|
@ -105,5 +105,100 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(judf, "registerPython", new object[] {name, udf});
|
||||
}
|
||||
|
||||
public ISqlContextProxy NewSession()
|
||||
{
|
||||
return new SqlContextIpcProxy(
|
||||
new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "newSession")));
|
||||
}
|
||||
|
||||
public string GetConf(string key, string defaultValue)
|
||||
{
|
||||
return (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "getConf", new object[] { key, defaultValue });
|
||||
}
|
||||
|
||||
public void SetConf(string key, string value)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "setConf", new object[] { key, value });
|
||||
}
|
||||
|
||||
public void RegisterDataFrameAsTable(IDataFrameProxy dataFrameProxy, string tableName)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
|
||||
jvmSqlContextReference, "registerDataFrameAsTable",
|
||||
new object[] { (dataFrameProxy as DataFrameIpcProxy).JvmDataFrameReference, tableName });
|
||||
}
|
||||
|
||||
public void DropTempTable(string tableName)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
|
||||
jvmSqlContextReference, "dropTempTable", new object[] { tableName });
|
||||
}
|
||||
|
||||
public IDataFrameProxy Table(string tableName)
|
||||
{
|
||||
return new DataFrameIpcProxy(
|
||||
new JvmObjectReference(
|
||||
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "table",
|
||||
new object[] { tableName })), this);
|
||||
}
|
||||
|
||||
public IDataFrameProxy Tables()
|
||||
{
|
||||
return new DataFrameIpcProxy(
|
||||
new JvmObjectReference(
|
||||
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tables")), this);
|
||||
}
|
||||
|
||||
public IDataFrameProxy Tables(string databaseName)
|
||||
{
|
||||
return new DataFrameIpcProxy(
|
||||
new JvmObjectReference(
|
||||
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tables",
|
||||
new object[] { databaseName })), this);
|
||||
}
|
||||
|
||||
public IEnumerable<string> TableNames()
|
||||
{
|
||||
var tableNames = SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tableNames");
|
||||
return (List<string>) tableNames;
|
||||
}
|
||||
|
||||
public IEnumerable<string> TableNames(string databaseName)
|
||||
{
|
||||
return (List<string>)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tableNames",
|
||||
new object[] { databaseName });
|
||||
}
|
||||
|
||||
public void CacheTable(string tableName)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "cacheTable",
|
||||
new object[] { tableName });
|
||||
}
|
||||
|
||||
public void UncacheTable(string tableName)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "uncacheTable",
|
||||
new object[] { tableName });
|
||||
}
|
||||
|
||||
public void ClearCache()
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "clearCache");
|
||||
}
|
||||
|
||||
public bool IsCached(string tableName)
|
||||
{
|
||||
return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "isCached",
|
||||
new object[] { tableName });
|
||||
}
|
||||
|
||||
#region HiveContext
|
||||
public void RefreshTable(string tableName)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "refreshTable",
|
||||
new object[] { tableName });
|
||||
}
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,12 +10,12 @@ using System.Net;
|
|||
using System.Net.Sockets;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.Text;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
using Microsoft.Spark.CSharp.Network;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
||||
|
@ -26,7 +26,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not reqiured
|
||||
internal class StreamingContextIpcProxy : IStreamingContextProxy
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkConf));
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(StreamingContextIpcProxy));
|
||||
internal readonly JvmObjectReference jvmStreamingContextReference;
|
||||
private readonly JvmObjectReference jvmJavaStreamingReference;
|
||||
private readonly ISparkContextProxy sparkContextProxy;
|
||||
|
@ -43,36 +43,51 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
}
|
||||
}
|
||||
|
||||
public StreamingContextIpcProxy(SparkContext sparkContext, long durationMs)
|
||||
public StreamingContextIpcProxy(SparkContext sparkContext, int durationSeconds)
|
||||
{
|
||||
this.sparkContext = sparkContext;
|
||||
sparkContextProxy = sparkContext.SparkContextProxy;
|
||||
var jduration = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { durationMs });
|
||||
var jduration = JvmBridgeUtils.GetJavaDuration(durationSeconds);
|
||||
|
||||
JvmObjectReference jvmSparkContextReference = (sparkContextProxy as SparkContextIpcProxy).JvmSparkContextReference;
|
||||
jvmStreamingContextReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.StreamingContext", new object[] { jvmSparkContextReference, jduration });
|
||||
jvmJavaStreamingReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.java.JavaStreamingContext", new object[] { jvmStreamingContextReference });
|
||||
|
||||
StartAccumulatorServer(sparkContext);
|
||||
StartCallbackServer();
|
||||
}
|
||||
|
||||
|
||||
public StreamingContextIpcProxy(string checkpointPath)
|
||||
{
|
||||
sparkContext = SparkContext.GetActiveSparkContext();
|
||||
StartCallbackServer();
|
||||
|
||||
jvmJavaStreamingReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.java.JavaStreamingContext", new object[] { checkpointPath });
|
||||
jvmStreamingContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "ssc"));
|
||||
JvmObjectReference jvmSparkContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "sc"));
|
||||
JvmObjectReference jvmSparkConfReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "conf"));
|
||||
JvmObjectReference jvmJavaContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "sparkContext"));
|
||||
sparkContextProxy = new SparkContextIpcProxy(jvmSparkContextReference, jvmJavaContextReference);
|
||||
var sparkConfProxy = new SparkConfIpcProxy(jvmSparkConfReference);
|
||||
sparkContext = new SparkContext(sparkContextProxy, new SparkConf(sparkConfProxy));
|
||||
if (sparkContext == null)
|
||||
{
|
||||
JvmObjectReference jvmSparkContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "sc"));
|
||||
JvmObjectReference jvmSparkConfReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "conf"));
|
||||
JvmObjectReference jvmJavaContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "sparkContext"));
|
||||
sparkContextProxy = new SparkContextIpcProxy(jvmSparkContextReference, jvmJavaContextReference);
|
||||
var sparkConfProxy = new SparkConfIpcProxy(jvmSparkConfReference);
|
||||
sparkContext = new SparkContext(sparkContextProxy, new SparkConf(sparkConfProxy));
|
||||
}
|
||||
else
|
||||
{
|
||||
sparkContextProxy = sparkContext.SparkContextProxy;
|
||||
}
|
||||
StartAccumulatorServer(sparkContext);
|
||||
}
|
||||
|
||||
private void StartAccumulatorServer(SparkContext sparkContext)
|
||||
{
|
||||
// TODO: We don't know whether accumulator variable is used before restart. We just start accumuator server for safety.
|
||||
sparkContext.StartAccumulatorServer();
|
||||
}
|
||||
|
||||
public void Start()
|
||||
{
|
||||
int port = StartCallback();
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("SparkCLRHandler", "connectCallback", port); //className and methodName hardcoded in CSharpBackendHandler
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "start");
|
||||
}
|
||||
|
||||
|
@ -84,9 +99,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("SparkCLRHandler", "closeCallback");
|
||||
}
|
||||
|
||||
public void Remember(long durationMs)
|
||||
public void Remember(int durationSeconds)
|
||||
{
|
||||
var jduration = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { (int)durationMs });
|
||||
var jduration = JvmBridgeUtils.GetJavaDuration(durationSeconds);
|
||||
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "remember", new object[] { jduration });
|
||||
}
|
||||
|
@ -119,8 +134,8 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string serializationMode)
|
||||
{
|
||||
var windowDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { windowSeconds * 1000 });
|
||||
var slideDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { slideSeconds * 1000 });
|
||||
var windowDurationReference = JvmBridgeUtils.GetJavaDuration(windowSeconds);
|
||||
var slideDurationReference = JvmBridgeUtils.GetJavaDuration(slideSeconds);
|
||||
|
||||
var jvmDStreamReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.csharp.CSharpReducedWindowedDStream",
|
||||
new object[] { (jdstream as DStreamIpcProxy).jvmDStreamReference, func, invFunc, windowDurationReference, slideDurationReference, serializationMode });
|
||||
|
@ -138,6 +153,21 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
return new DStreamIpcProxy(javaDStreamReference, jvmDStreamReference);
|
||||
}
|
||||
|
||||
public IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy)
|
||||
{
|
||||
var rddReference =
|
||||
new JvmObjectReference(
|
||||
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(((RDDIpcProxy)rddProxy).JvmRddReference, "rdd"));
|
||||
|
||||
var jvmDStreamReference = SparkCLRIpcProxy.JvmBridge.CallConstructor(
|
||||
"org.apache.spark.streaming.api.csharp.CSharpConstantInputDStream", jvmStreamingContextReference, rddReference);
|
||||
|
||||
var javaDStreamReference =
|
||||
new JvmObjectReference((String)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDStreamReference, "asJavaDStream"));
|
||||
|
||||
return new DStreamIpcProxy(javaDStreamReference, jvmDStreamReference);
|
||||
}
|
||||
|
||||
public IDStreamProxy TextFileStream(string directory)
|
||||
{
|
||||
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "textFileStream", new object[] { directory }).ToString());
|
||||
|
@ -153,19 +183,19 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
|
||||
public IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
|
||||
{
|
||||
JvmObjectReference jtopics = SparkContextIpcProxy.GetJavaMap<string, int>(topics);
|
||||
JvmObjectReference jkafkaParams = SparkContextIpcProxy.GetJavaMap<string, string>(kafkaParams);
|
||||
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaMap<string, int>(topics);
|
||||
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
|
||||
JvmObjectReference jlevel = SparkContextIpcProxy.GetJavaStorageLevel(storageLevelType);
|
||||
// KafkaUtilsPythonHelper: external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
|
||||
JvmObjectReference jhelper = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper", new object[] { });
|
||||
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jhelper, "createStream", new object[] { jvmJavaStreamingReference, jkafkaParams, jtopics, jlevel }).ToString());
|
||||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
|
||||
public IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
|
||||
{
|
||||
JvmObjectReference jtopics = SparkContextIpcProxy.GetJavaSet<string>(topics);
|
||||
JvmObjectReference jkafkaParams = SparkContextIpcProxy.GetJavaMap<string, string>(kafkaParams);
|
||||
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
|
||||
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
|
||||
|
||||
var jTopicAndPartitions = fromOffsets.Select(x =>
|
||||
new KeyValuePair<JvmObjectReference, long>
|
||||
|
@ -175,13 +205,48 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
)
|
||||
);
|
||||
|
||||
JvmObjectReference jfromOffsets = SparkContextIpcProxy.GetJavaMap<JvmObjectReference, long>(jTopicAndPartitions);
|
||||
JvmObjectReference jfromOffsets = JvmBridgeUtils.GetJavaMap<JvmObjectReference, long>(jTopicAndPartitions);
|
||||
// KafkaUtilsPythonHelper: external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
|
||||
JvmObjectReference jhelper = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper", new object[] { });
|
||||
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jhelper, "createDirectStreamWithoutMessageHandler", new object[] { jvmJavaStreamingReference, jkafkaParams, jtopics, jfromOffsets }).ToString());
|
||||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
|
||||
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams,
|
||||
Dictionary<string, long> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode)
|
||||
{
|
||||
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
|
||||
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
|
||||
|
||||
var jTopicAndPartitions = fromOffsets.Select(x =>
|
||||
new KeyValuePair<JvmObjectReference, long>
|
||||
(
|
||||
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Key.Split(':')[0], int.Parse(x.Key.Split(':')[1]) }),
|
||||
x.Value
|
||||
)
|
||||
);
|
||||
|
||||
JvmObjectReference jfromOffsets = JvmBridgeUtils.GetJavaMap<JvmObjectReference, long>(jTopicAndPartitions);
|
||||
// SparkCLR\scala\src\main\org\apache\spark\streaming\api\kafka\KafkaUtilsCSharpHelper.scala
|
||||
JvmObjectReference jhelper = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.kafka.KafkaUtilsCSharpHelper", new object[] { });
|
||||
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jhelper, "createDirectStreamWithoutMessageHandler",
|
||||
new object[] { jvmJavaStreamingReference, jkafkaParams, jtopics, jfromOffsets, (int)numPartitions, readFunc, serializationMode }).ToString());
|
||||
return new DStreamIpcProxy(jstream);
|
||||
}
|
||||
|
||||
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
|
||||
{
|
||||
JvmObjectReference eventHubsParamsReference = JvmBridgeUtils.GetScalaMutableMap<string, string>(eventHubsParams);
|
||||
JvmObjectReference storageLevelTypeReference = SparkContextIpcProxy.GetJavaStorageLevel(storageLevelType);
|
||||
return
|
||||
new DStreamIpcProxy(
|
||||
new JvmObjectReference(
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
|
||||
"org.apache.spark.streaming.api.csharp.EventHubsUtils", "createUnionStream",
|
||||
new object[] { jvmJavaStreamingReference, eventHubsParamsReference, storageLevelTypeReference })
|
||||
.ToString()));
|
||||
}
|
||||
|
||||
public IDStreamProxy Union(IDStreamProxy firstDStream, IDStreamProxy[] otherDStreams)
|
||||
{
|
||||
return new DStreamIpcProxy(
|
||||
|
@ -190,7 +255,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
new object[]
|
||||
{
|
||||
(firstDStream as DStreamIpcProxy).javaDStreamReference,
|
||||
SparkContextIpcProxy.GetJavaList<JvmObjectReference>(otherDStreams.Select(x => (x as DStreamIpcProxy).javaDStreamReference))
|
||||
JvmBridgeUtils.GetJavaList<JvmObjectReference>(otherDStreams.Select(x => (x as DStreamIpcProxy).javaDStreamReference))
|
||||
}
|
||||
)));
|
||||
}
|
||||
|
@ -200,19 +265,19 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "awaitTermination");
|
||||
}
|
||||
|
||||
public void AwaitTermination(int timeout)
|
||||
public void AwaitTerminationOrTimeout(long timeout)
|
||||
{
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "awaitTermination", new object[] { timeout });
|
||||
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "awaitTerminationOrTimeout", new object[] { timeout });
|
||||
}
|
||||
|
||||
private void ProcessCallbackRequest(object socket)
|
||||
{
|
||||
logger.LogDebug("new thread created to process callback request");
|
||||
logger.LogDebug("New thread (id={0}) created to process callback request", Thread.CurrentThread.ManagedThreadId);
|
||||
|
||||
try
|
||||
{
|
||||
using (Socket sock = (Socket)socket)
|
||||
using (var s = new NetworkStream(sock))
|
||||
using (var sock = (ISocketWrapper)socket)
|
||||
using (var s = sock.GetStream())
|
||||
{
|
||||
while (true)
|
||||
{
|
||||
|
@ -268,6 +333,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
//log exception only when callback socket is not shutdown explicitly
|
||||
if (!callbackSocketShutdown)
|
||||
{
|
||||
logger.LogError("Exception processing call back request. Thread id {0}", Thread.CurrentThread.ManagedThreadId);
|
||||
logger.LogException(e);
|
||||
|
||||
// exit when exception happens
|
||||
|
@ -281,16 +347,17 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.LogError("Exception in callback. Thread id {0}", Thread.CurrentThread.ManagedThreadId);
|
||||
logger.LogException(e);
|
||||
}
|
||||
|
||||
logger.LogDebug("thread to process callback request exit");
|
||||
logger.LogDebug("Thread (id={0}) to process callback request exiting", Thread.CurrentThread.ManagedThreadId);
|
||||
}
|
||||
|
||||
public int StartCallback()
|
||||
private int StartCallbackServer()
|
||||
{
|
||||
TcpListener callbackServer = new TcpListener(IPAddress.Loopback, 0);
|
||||
callbackServer.Start();
|
||||
var callbackServer = SocketFactory.CreateSocket();
|
||||
callbackServer.Listen();
|
||||
|
||||
Task.Run(() =>
|
||||
{
|
||||
|
@ -299,23 +366,28 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
|
|||
ThreadPool.SetMaxThreads(10, 10);
|
||||
while (!callbackSocketShutdown)
|
||||
{
|
||||
Socket sock = callbackServer.AcceptSocket();
|
||||
ThreadPool.QueueUserWorkItem(new WaitCallback(ProcessCallbackRequest), sock);
|
||||
var sock = callbackServer.Accept();
|
||||
ThreadPool.QueueUserWorkItem(ProcessCallbackRequest, sock);
|
||||
}
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.LogError("Exception starting callback server");
|
||||
logger.LogException(e);
|
||||
throw;
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (callbackServer != null)
|
||||
callbackServer.Stop();
|
||||
callbackServer.Close();
|
||||
}
|
||||
});
|
||||
|
||||
return (callbackServer.LocalEndpoint as IPEndPoint).Port;
|
||||
int port = (callbackServer.LocalEndPoint as IPEndPoint).Port;
|
||||
logger.LogInfo("Callback server port number is {0}", port);
|
||||
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("SparkCLRHandler", "connectCallback", port); //className and methodName hard coded in CSharpBackendHandler
|
||||
|
||||
return port;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -13,6 +13,11 @@ namespace Microsoft.Spark.CSharp.Services
|
|||
public class DefaultLoggerService : ILoggerService
|
||||
{
|
||||
internal readonly static DefaultLoggerService Instance = new DefaultLoggerService(typeof (Type));
|
||||
/// <summary>
|
||||
/// Get an instance of ILoggerService by a given type of logger
|
||||
/// </summary>
|
||||
/// <param name="type">The type of a logger to return</param>
|
||||
/// <returns>An instance of ILoggerService</returns>
|
||||
public ILoggerService GetLoggerInstance(Type type)
|
||||
{
|
||||
return new DefaultLoggerService(type);
|
||||
|
@ -24,31 +29,105 @@ namespace Microsoft.Spark.CSharp.Services
|
|||
type = t;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at debug level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogDebug(string message)
|
||||
{
|
||||
Log("Debug", message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at debug level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogDebug(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
Log("Debug", string.Format(messageFormat, messageParameters));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at info level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogInfo(string message)
|
||||
{
|
||||
Log("Info", message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at info level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogInfo(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
Log("Info", string.Format(messageFormat, messageParameters));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at warning level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogWarn(string message)
|
||||
{
|
||||
Log("Warn", message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at warning level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogWarn(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
Log("Warn", string.Format(messageFormat, messageParameters));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a fatal message.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogFatal(string message)
|
||||
{
|
||||
Log("Fatal", message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a fatal message with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogFatal(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
Log("Fatal", string.Format(messageFormat, messageParameters));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a error message.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogError(string message)
|
||||
{
|
||||
Log("Error", message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a error message with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogError(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
Log("Error", string.Format(messageFormat, messageParameters));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs an exception
|
||||
/// </summary>
|
||||
/// <param name="e">The exception to be logged</param>
|
||||
public void LogException(Exception e)
|
||||
{
|
||||
Log("Exception", string.Format("{0}{1}{2}", e.Message, Environment.NewLine, e.StackTrace));
|
||||
|
|
|
@ -1,19 +1,77 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Services
|
||||
{
|
||||
/// <summary>
|
||||
/// Defines a logger what be used in service
|
||||
/// </summary>
|
||||
public interface ILoggerService
|
||||
{
|
||||
/// <summary>
|
||||
/// Get an instance of ILoggerService by a given type of logger
|
||||
/// </summary>
|
||||
/// <param name="type">The type of a logger to return</param>
|
||||
/// <returns>An instance of ILoggerService</returns>
|
||||
ILoggerService GetLoggerInstance(Type type);
|
||||
/// <summary>
|
||||
/// Logs a message at debug level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
void LogDebug(string message);
|
||||
/// <summary>
|
||||
/// Logs a message at debug level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
void LogDebug(string messageFormat, params object[] messageParameters);
|
||||
/// <summary>
|
||||
/// Logs a message at info level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
void LogInfo(string message);
|
||||
/// <summary>
|
||||
/// Logs a message at info level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
void LogInfo(string messageFormat, params object[] messageParameters);
|
||||
/// <summary>
|
||||
/// Logs a message at warning level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
void LogWarn(string message);
|
||||
/// <summary>
|
||||
/// Logs a message at warning level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
void LogWarn(string messageFormat, params object[] messageParameters);
|
||||
/// <summary>
|
||||
/// Logs a fatal message.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
void LogFatal(string message);
|
||||
/// <summary>
|
||||
/// Logs a fatal message with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
void LogFatal(string messageFormat, params object[] messageParameters);
|
||||
/// <summary>
|
||||
/// Logs a error message.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
void LogError(string message);
|
||||
/// <summary>
|
||||
/// Logs a error message with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
void LogError(string messageFormat, params object[] messageParameters);
|
||||
/// <summary>
|
||||
/// Logs an exception
|
||||
/// </summary>
|
||||
/// <param name="e">The exception to be logged</param>
|
||||
void LogException(Exception e);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,11 +10,17 @@ using log4net.Config;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Services
|
||||
{
|
||||
[ExcludeFromCodeCoverage] //unit test coverage not reqiured for logger service
|
||||
/// <summary>
|
||||
/// Represents a Log4Net logger.
|
||||
/// </summary>
|
||||
[ExcludeFromCodeCoverage] //unit test coverage not required for logger service
|
||||
public class Log4NetLoggerService : ILoggerService
|
||||
{
|
||||
private readonly ILog logger;
|
||||
private const string exceptionLogDelimiter = "*******************************************************************************************************************************";
|
||||
/// <summary>
|
||||
/// Gets a instance of Log4Net logger
|
||||
/// </summary>
|
||||
public static Log4NetLoggerService Instance = new Log4NetLoggerService(typeof(Type));
|
||||
|
||||
static Log4NetLoggerService()
|
||||
|
@ -22,37 +28,115 @@ namespace Microsoft.Spark.CSharp.Services
|
|||
XmlConfigurator.Configure();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a instance of Log4NetLoggerService with a specific type.
|
||||
/// </summary>
|
||||
/// <param name="type">The type of the logger</param>
|
||||
public Log4NetLoggerService(Type type)
|
||||
{
|
||||
logger = LogManager.GetLogger(type);
|
||||
log4net.GlobalContext.Properties["pid"] = Process.GetCurrentProcess().Id;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at debug level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogDebug(string message)
|
||||
{
|
||||
logger.Debug(message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at debug level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogDebug(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
logger.DebugFormat(messageFormat, messageParameters);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at info level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogInfo(string message)
|
||||
{
|
||||
logger.Info(message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at info level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogInfo(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
logger.InfoFormat(messageFormat, messageParameters);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at warning level.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogWarn(string message)
|
||||
{
|
||||
logger.Warn(message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a message at warning level with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogWarn(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
logger.WarnFormat(messageFormat, messageParameters);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a fatal message.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogFatal(string message)
|
||||
{
|
||||
logger.Fatal(message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a fatal message with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogFatal(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
logger.FatalFormat(messageFormat, messageParameters);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a error message.
|
||||
/// </summary>
|
||||
/// <param name="message">The message to be logged</param>
|
||||
public void LogError(string message)
|
||||
{
|
||||
logger.Error(message);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs a error message with a format string.
|
||||
/// </summary>
|
||||
/// <param name="messageFormat">The format string</param>
|
||||
/// <param name="messageParameters">The array of arguments</param>
|
||||
public void LogError(string messageFormat, params object[] messageParameters)
|
||||
{
|
||||
logger.ErrorFormat(messageFormat, messageParameters);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Logs an exception
|
||||
/// </summary>
|
||||
/// <param name="e">The exception to be logged</param>
|
||||
public void LogException(Exception e)
|
||||
{
|
||||
|
||||
|
@ -92,7 +176,12 @@ namespace Microsoft.Spark.CSharp.Services
|
|||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// Get an instance of ILoggerService by a given type of logger
|
||||
/// </summary>
|
||||
/// <param name="type">The type of a logger to return</param>
|
||||
/// <returns>An instance of ILoggerService</returns>
|
||||
public ILoggerService GetLoggerInstance(Type type)
|
||||
{
|
||||
return new Log4NetLoggerService(type);
|
||||
|
|
|
@ -12,11 +12,23 @@ namespace Microsoft.Spark.CSharp.Services
|
|||
public class LoggerServiceFactory
|
||||
{
|
||||
private static ILoggerService loggerService = DefaultLoggerService.Instance;
|
||||
|
||||
/// <summary>
|
||||
/// Overrides an existing logger by a given logger service instance
|
||||
/// </summary>
|
||||
/// <param name="loggerServiceOverride">The logger service instance used to overrides</param>
|
||||
public static void SetLoggerService(ILoggerService loggerServiceOverride)
|
||||
{
|
||||
loggerService = loggerServiceOverride;
|
||||
var logger = GetLogger(typeof(LoggerServiceFactory));
|
||||
logger.LogInfo("Logger service configured to use {0}", logger.GetType().Name);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets an instance of logger service for a given type.
|
||||
/// </summary>
|
||||
/// <param name="type">The type of logger service to get</param>
|
||||
/// <returns>An instance of logger service</returns>
|
||||
public static ILoggerService GetLogger(Type type)
|
||||
{
|
||||
return loggerService.GetLoggerInstance(type);
|
||||
|
|
|
@ -10,6 +10,9 @@ using Microsoft.Spark.CSharp.Interop;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
/// <summary>
|
||||
/// A column that will be computed based on the data in a DataFrame.
|
||||
/// </summary>
|
||||
public class Column
|
||||
{
|
||||
private readonly IColumnProxy columnProxy;
|
||||
|
@ -27,81 +30,179 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
this.columnProxy = columnProxy;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The logical negation operator that negates its operand.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <returns>true if and only if its operand is false</returns>
|
||||
public static Column operator !(Column self)
|
||||
{
|
||||
return new Column(self.columnProxy.FuncOp("not"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Negation of itself.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <returns>The nagation of itself</returns>
|
||||
public static Column operator -(Column self)
|
||||
{
|
||||
return new Column(self.columnProxy.FuncOp("negate"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Sum of this expression and another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>The result of sum</returns>
|
||||
public static Column operator +(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("plus", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Subtraction of this expression and another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>The result of subtraction</returns>
|
||||
public static Column operator -(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("minus", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Multiplication of this expression and another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>The result of multiplication</returns>
|
||||
public static Column operator *(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("multiply", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Division this expression by another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>The result of division</returns>
|
||||
public static Column operator /(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("divide", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Modulo (a.k.a. remainder) expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>The remainder after dividing column self by other</returns>
|
||||
public static Column operator %(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("mod", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The equality operator returns true if the values of its operands are equal, false otherwise.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compare</param>
|
||||
/// <param name="other">The other object to compare</param>
|
||||
/// <returns>true if the value of self is the same as the value of other; otherwise, false.</returns>
|
||||
public static Column operator ==(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("equalTo", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The inequality operator returns false if its operands are equal, true otherwise.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compare</param>
|
||||
/// <param name="other">The other object to compare</param>
|
||||
/// <returns>true if the value of self is different from the value of other; otherwise, false.</returns>
|
||||
public static Column operator !=(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("notEqual", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The "less than" relational operator that returns true if the first operand
|
||||
/// is less than the second, false otherwise.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compare</param>
|
||||
/// <param name="other">The other object to compare</param>
|
||||
/// <returns>true if the value of self is less than the value of other; otherwise, false.</returns>
|
||||
public static Column operator <(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("lt", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The "less than or equal" relational operator that returns true if the first operand
|
||||
/// is less than or equal to the second, false otherwise.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compare</param>
|
||||
/// <param name="other">The other object to compare</param>
|
||||
/// <returns>true if the value of self is less than or equal to the value of other; otherwise, false.</returns>
|
||||
public static Column operator <=(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("leq", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The "greater than or equal" relational operator that returns true if the first operand
|
||||
/// is greater than or equal to the second, false otherwise.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compare</param>
|
||||
/// <param name="other">The other object to compare</param>
|
||||
/// <returns>true if the value of self is greater than or equal to the value of other; otherwise, false.</returns>
|
||||
public static Column operator >=(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("geq", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The "greater than" relational operator that returns true if the first operand
|
||||
/// is greater than the second, false otherwise.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compare</param>
|
||||
/// <param name="other">The other object to compare</param>
|
||||
/// <returns>true if the value of self is greater than the value of other; otherwise, false.</returns>
|
||||
public static Column operator >(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("gt", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute bitwise OR of this expression with another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>A column representing the bitwise OR of the two operands</returns>
|
||||
public static Column operator |(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("bitwiseOR", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute bitwise AND of this expression with another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>A column representing the bitwise AND of the two operands</returns>
|
||||
public static Column operator &(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("bitwiseAND", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute bitwise XOR of this expression with another expression.
|
||||
/// </summary>
|
||||
/// <param name="self">The column self to compute</param>
|
||||
/// <param name="other">The other object to compute</param>
|
||||
/// <returns>A column representing the bitwise XOR of the two operands</returns>
|
||||
public static Column operator ^(Column self, object other)
|
||||
{
|
||||
return new Column(self.columnProxy.BinOp("bitwiseXOR", (other is Column) ? ((Column)other).columnProxy : other));
|
||||
|
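The overloaded operators above all funnel into `BinOp`, so arithmetic, comparison, and bitwise expressions compose the same way. A minimal sketch, assuming a DataFrame `df` with `latency` and `flags` columns (the variable and column names are illustrative only):

``` c#
// Column expressions built from overloaded operators; nothing executes until the
// expression is used in a DataFrame transformation.
Column doubledLatency = df["latency"] * 2;   // maps to BinOp("multiply", ...)
Column isSlow = df["latency"] >= 1000;       // comparison operators also yield Columns
Column lowBits = df["flags"] & 0xFF;         // bitwise operators map to bitwiseAND/OR/XOR

// Expressions like these can be fed to DataFrame methods that accept Column values,
// for example the Sort overload touched elsewhere in this commit:
DataFrame slowestFirst = df.Sort(new[] { doubledLatency }, new[] { false });
```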
@ -167,20 +268,39 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
return new Column(columnProxy.BinOp("endsWith", other.columnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a sort expression based on the ascending order.
|
||||
/// </summary>
|
||||
/// <returns>A column with ascending order</returns>
|
||||
public Column Asc()
|
||||
{
|
||||
return new Column(columnProxy.UnaryOp("asc"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a sort expression based on the descending order.
|
||||
/// </summary>
|
||||
/// <returns>A column with descending order</returns>
|
||||
public Column Desc()
|
||||
{
|
||||
return new Column(columnProxy.UnaryOp("desc"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns this column aliased with a new name.
|
||||
/// </summary>
|
||||
/// <param name="alias">The name of alias</param>
|
||||
/// <returns>A column aliased with the given name</returns>
|
||||
public Column Alias(string alias)
|
||||
{
|
||||
return new Column(columnProxy.InvokeMethod("as", alias));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns this column aliased with new names
|
||||
/// </summary>
|
||||
/// <param name="aliases">The array of names for aliases</param>
|
||||
/// <returns>A column aliased with the given names</returns>
|
||||
public Column Alias(string[] aliases)
|
||||
{
|
||||
return new Column(columnProxy.InvokeMethod("as", new object[] { aliases }));
|
||||
|
|
|
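Asc, Desc, and Alias each wrap a single proxy call, so they chain naturally off the column indexer. A small sketch, assuming a DataFrame `people` with `name` and `age` columns (hypothetical names):

``` c#
Column newestFirst = people["age"].Desc();            // sort expression via UnaryOp("desc")
Column renamed = people["age"].Alias("age_in_years"); // single alias via InvokeMethod("as")

// Asc()/Desc() results are intended for the Sort and SortWithinPartitions overloads
// that take Column[] later in this commit.
```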
@ -7,6 +7,7 @@ using System.Globalization;
|
|||
using System.Linq;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
|
@ -18,6 +19,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
[Serializable]
|
||||
public class DataFrame
|
||||
{
|
||||
[NonSerialized]
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DataFrame));
|
||||
|
||||
[NonSerialized]
|
||||
private readonly IDataFrameProxy dataFrameProxy;
|
||||
[NonSerialized]
|
||||
|
@ -33,6 +37,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
[NonSerialized]
|
||||
private readonly Random random = new Random();
|
||||
|
||||
/// <summary>
|
||||
/// Represents the content of the DataFrame as an RDD of Rows.
|
||||
/// </summary>
|
||||
public RDD<Row> Rdd
|
||||
{
|
||||
get
|
||||
|
@ -59,6 +66,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns true if the collect and take methods can be run locally (without any Spark executors).
|
||||
/// </summary>
|
||||
public bool IsLocal
|
||||
{
|
||||
get
|
||||
|
@ -84,11 +94,18 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
get { return dataFrameProxy; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the schema of this DataFrame.
|
||||
/// </summary>
|
||||
public StructType Schema
|
||||
{
|
||||
get { return schema ?? (schema = new StructType(dataFrameProxy.GetSchema())); }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a column for a given column name.
|
||||
/// </summary>
|
||||
/// <param name="columnName">The name of column</param>
|
||||
public Column this[string columnName]
|
||||
{
|
||||
get
|
||||
|
@ -119,6 +136,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// <returns>row count</returns>
|
||||
public long Count()
|
||||
{
|
||||
logger.LogInfo("Calculating the number of rows in the dataframe");
|
||||
return dataFrameProxy.Count();
|
||||
}
|
||||
|
||||
|
@ -129,6 +147,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// <param name="truncate">Indicates if strings more than 20 characters long will be truncated</param>
|
||||
public void Show(int numberOfRows = 20, bool truncate = true)
|
||||
{
|
||||
logger.LogInfo("Writing {0} rows in the DataFrame to Console output", numberOfRows);
|
||||
Console.WriteLine(dataFrameProxy.GetShowString(numberOfRows, truncate));
|
||||
}
|
||||
|
||||
|
@ -138,6 +157,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
public void ShowSchema()
|
||||
{
|
||||
var nameTypeList = Schema.Fields.Select(structField => structField.SimpleString);
|
||||
logger.LogInfo("Writing Schema to Console output");
|
||||
Console.WriteLine(string.Join(", ", nameTypeList));
|
||||
}
|
||||
|
||||
|
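Taken together, the members above give the usual entry points for inspecting a DataFrame. A quick sketch, assuming `df` is an existing DataFrame (variable and column names are illustrative):

``` c#
StructType schema = df.Schema;    // lazily built from dataFrameProxy.GetSchema(), then cached
Column age = df["age"];           // column lookup through the indexer
long rowCount = df.Count();       // triggers a Spark job
df.Show(numberOfRows: 5);         // prints at most 5 rows, strings truncated by default
```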
@ -641,19 +661,69 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
if (ascending != null)
|
||||
{
|
||||
if(columns.Length != ascending.Length)
|
||||
throw new ArgumentException("ascending should have the same length with columns");
|
||||
|
||||
var columnsWithOrder = new Column[columns.Length];
|
||||
for (var i = 0; i < columns.Length; i++)
|
||||
{
|
||||
columnsWithOrder[i] = ascending[i] ? columns[i].Asc() : columns[i].Desc();
|
||||
}
|
||||
return new DataFrame(dataFrameProxy.Sort(columnsWithOrder.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
var sortedColumns = SortColumns(columns, ascending);
|
||||
return new DataFrame(dataFrameProxy.Sort(sortedColumns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
return new DataFrame(dataFrameProxy.Sort(columns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new DataFrame with each partition sorted by the specified column(s).
/// Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
/// </summary>
/// <param name="columns">List of column names to sort by</param>
/// <param name="ascending">List of booleans specifying the sort order for <paramref name="columns"/>, TRUE for ascending, FALSE for descending.
/// If not null, it overrides the order specified by Column.Asc() or Column.Desc() in <paramref name="columns"/>.</param>
/// <returns>A new DataFrame sorted within each partition by the specified column(s)</returns>
|
||||
public DataFrame SortWithinPartitions(string[] columns, bool[] ascending = null)
|
||||
{
|
||||
if (columns == null || columns.Length == 0)
|
||||
{
|
||||
throw new ArgumentException("should sort by at least one column.");
|
||||
}
|
||||
if (ascending != null)
|
||||
{
|
||||
var sortedColumns = SortColumns(columns.Select(c => this[c]).ToArray(), ascending);
|
||||
return new DataFrame(dataFrameProxy.SortWithinPartitions(sortedColumns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
return new DataFrame(dataFrameProxy.SortWithinPartitions(columns.Select(c => this[c].ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new DataFrame with each partition sorted by the specified column(s).
/// Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
/// </summary>
/// <param name="columns">List of Columns to sort by</param>
/// <param name="ascending">List of booleans specifying the sort order for <paramref name="columns"/>, TRUE for ascending, FALSE for descending.
/// If not null, it overrides the order specified by Column.Asc() or Column.Desc() in <paramref name="columns"/>.</param>
/// <returns>A new DataFrame sorted within each partition by the specified column(s)</returns>
|
||||
public DataFrame SortWithinPartition(Column[] columns, bool[] ascending = null)
|
||||
{
|
||||
if (columns == null || columns.Length == 0)
|
||||
{
|
||||
throw new ArgumentException("should sort by at least one column.");
|
||||
}
|
||||
if (ascending != null)
|
||||
{
|
||||
var sortedColumns = SortColumns(columns, ascending);
|
||||
return new DataFrame(dataFrameProxy.SortWithinPartitions(sortedColumns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
return new DataFrame(dataFrameProxy.SortWithinPartitions(columns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
|
||||
private Column[] SortColumns(Column[] columns, bool[] ascending)
|
||||
{
|
||||
if (columns.Length != ascending.Length)
|
||||
throw new ArgumentException("ascending should have the same length with columns");
|
||||
|
||||
var columnsWithOrder = new Column[columns.Length];
|
||||
for (var i = 0; i < columns.Length; i++)
|
||||
{
|
||||
columnsWithOrder[i] = ascending[i] ? columns[i].Asc() : columns[i].Desc();
|
||||
}
|
||||
return columnsWithOrder;
|
||||
}
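The `ascending` array and the shared `SortColumns` helper work the same way for global sorts and per-partition sorts. A sketch, assuming a DataFrame `logs` with `level` and `timestamp` columns (illustrative names):

``` c#
// Global sort: ascending by level, then descending by timestamp.
DataFrame ordered = logs.Sort(
    new[] { logs["level"], logs["timestamp"] },
    new[] { true, false });

// Per-partition sort: rows are ordered inside each partition without a total ordering.
DataFrame partitionOrdered = logs.SortWithinPartitions(new[] { "timestamp" });
```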
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new DataFrame with an alias set.
|
||||
/// Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, alias(self, alias)
|
||||
|
@ -877,6 +947,32 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
return new DataFrame(dataFrameProxy.Repartition(numPartitions), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new DataFrame partitioned by the given partitioning columns, optionally into <paramref name="numPartitions"/> partitions. The resulting DataFrame is hash partitioned.
/// </summary>
/// <param name="columns">The names of the columns to partition by</param>
/// <param name="numPartitions">Optional. If not specified, the current number of partitions is kept.</param>
|
||||
// Python API: https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py repartition(self, numPartitions)
|
||||
public DataFrame Repartition(string[] columns, int numPartitions = 0)
|
||||
{
|
||||
return numPartitions == 0 ?
|
||||
new DataFrame(dataFrameProxy.Repartition(columns.Select(c => this[c].ColumnProxy).ToArray()), sparkContext) :
|
||||
new DataFrame(dataFrameProxy.Repartition(numPartitions, columns.Select(c => this[c].ColumnProxy).ToArray()), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new DataFrame partitioned by the given partitioning columns, optionally into <paramref name="numPartitions"/> partitions. The resulting DataFrame is hash partitioned.
/// </summary>
/// <param name="columns">The columns to partition by</param>
/// <param name="numPartitions">Optional. If not specified, the current number of partitions is kept.</param>
|
||||
// Python API: https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py repartition(self, numPartitions)
|
||||
public DataFrame Repartition(Column[] columns, int numPartitions = 0)
|
||||
{
|
||||
return numPartitions == 0 ?
|
||||
new DataFrame(dataFrameProxy.Repartition(columns.Select(c => c.ColumnProxy).ToArray()), sparkContext) :
|
||||
new DataFrame(dataFrameProxy.Repartition(numPartitions, columns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
|
||||
}
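Both Repartition overloads treat `numPartitions = 0` as "keep the current partition count" and otherwise hash-partition into the requested number. A sketch, assuming an `events` DataFrame with a `userId` column (illustrative names):

``` c#
DataFrame byUser = events.Repartition(new[] { "userId" });               // keep partition count
DataFrame byUser16 = events.Repartition(new[] { events["userId"] }, 16); // 16 hash partitions
```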
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new DataFrame by sampling a fraction of rows.
|
||||
/// </summary>
|
||||
|
@ -954,6 +1050,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
// write(self)
|
||||
public DataFrameWriter Write()
|
||||
{
|
||||
logger.LogInfo("Using DataFrameWriter to write output data to external data storage");
|
||||
return new DataFrameWriter(dataFrameProxy.Write());
|
||||
}
|
||||
|
||||
|
@ -1059,8 +1156,14 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The type of join operation for DataFrame
|
||||
/// </summary>
|
||||
public class JoinType
|
||||
{
|
||||
/// <summary>
|
||||
/// Get the string that represents a join type
|
||||
/// </summary>
|
||||
public string Value { get; private set; }
|
||||
private JoinType(string value)
|
||||
{
|
||||
|
@ -1073,6 +1176,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
private static readonly JoinType RightOuterJoinType = new JoinType("right_outer");
|
||||
private static readonly JoinType LeftSemiJoinType = new JoinType("leftsemi");
|
||||
|
||||
/// <summary>
|
||||
/// Inner join
|
||||
/// </summary>
|
||||
public static JoinType Inner
|
||||
{
|
||||
get
|
||||
|
@ -1081,6 +1187,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Outer join
|
||||
/// </summary>
|
||||
public static JoinType Outer
|
||||
{
|
||||
get
|
||||
|
@ -1089,6 +1198,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Left outer join
|
||||
/// </summary>
|
||||
public static JoinType LeftOuter
|
||||
{
|
||||
get
|
||||
|
@ -1097,6 +1209,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Right outer join
|
||||
/// </summary>
|
||||
public static JoinType RightOuter
|
||||
{
|
||||
get
|
||||
|
@ -1105,6 +1220,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Left semi join
|
||||
/// </summary>
|
||||
public static JoinType LeftSemi
|
||||
{
|
||||
get
|
||||
|
@ -1114,6 +1232,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A set of methods for aggregations on a DataFrame, created by DataFrame.groupBy.
|
||||
/// </summary>
|
||||
public class GroupedData
|
||||
{
|
||||
internal IGroupedDataProxy GroupedDataProxy
|
||||
|
@ -1130,36 +1251,79 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
this.dataFrame = dataFrame;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute aggregates by specifying a dictionary from column name to aggregate methods.
|
||||
/// The available aggregate methods are avg, max, min, sum, count.
|
||||
/// </summary>
|
||||
/// <param name="columnNameAggFunctionDictionary">The dictionary of column name to aggregate method</param>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Agg(Dictionary<string, string> columnNameAggFunctionDictionary)
|
||||
{
|
||||
return new DataFrame(dataFrame.DataFrameProxy.Agg(groupedDataProxy, columnNameAggFunctionDictionary), dataFrame.SparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Count the number of rows for each group.
|
||||
/// </summary>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Count()
|
||||
{
|
||||
return new DataFrame(groupedDataProxy.Count(), dataFrame.SparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute the average value for each numeric column for each group.
|
||||
/// This is an alias for avg.
|
||||
/// When specified columns are given, only compute the average values for them.
|
||||
/// </summary>
|
||||
/// <param name="columns">The name of columns to be computed.</param>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Mean(params string[] columns)
|
||||
{
|
||||
return new DataFrame(groupedDataProxy.Mean(columns), dataFrame.SparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute the max value for each numeric column for each group.
|
||||
/// When specified columns are given, only compute the max values for them.
|
||||
/// </summary>
|
||||
/// <param name="columns"> The name of columns to be computed.</param>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Max(params string[] columns)
|
||||
{
|
||||
return new DataFrame(groupedDataProxy.Max(columns), dataFrame.SparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute the min value for each numeric column for each group.
|
||||
/// </summary>
|
||||
/// <param name="columns">
|
||||
/// The name of columns to be computed. When specified columns are
|
||||
/// given, only compute the min values for them.
|
||||
/// </param>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Min(params string[] columns)
|
||||
{
|
||||
return new DataFrame(groupedDataProxy.Min(columns), dataFrame.SparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute the mean value for each numeric column for each group.
|
||||
/// When specified columns are given, only compute the mean values for them.
|
||||
/// </summary>
|
||||
/// <param name="columns">The name of columns to be computed</param>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Avg(params string[] columns)
|
||||
{
|
||||
return new DataFrame(groupedDataProxy.Avg(columns), dataFrame.SparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Compute the sum for each numeric column for each group.
|
||||
/// When specified columns are given, only compute the sum for them.
|
||||
/// </summary>
|
||||
/// <param name="columns">The name of columns to be computed</param>
|
||||
/// <returns>The DataFrame object that contains the grouping columns.</returns>
|
||||
public DataFrame Sum(params string[] columns)
|
||||
{
|
||||
return new DataFrame(groupedDataProxy.Sum(columns), dataFrame.SparkContext);
|
||||
|
|
|
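The aggregation helpers above are all reached through `DataFrame.GroupBy`, which is not part of this hunk but is assumed to take column names and return the `GroupedData` wrapper. A sketch, assuming an `orders` DataFrame with `region`, `amount`, and `quantity` columns (illustrative names):

``` c#
GroupedData byRegion = orders.GroupBy("region");

DataFrame counts = byRegion.Count();          // row count per region
DataFrame averages = byRegion.Avg("amount");  // average amount per region

// Mixed aggregates via the dictionary overload (column name -> aggregate method).
DataFrame mixed = byRegion.Agg(new Dictionary<string, string>
{
    { "amount", "max" },
    { "quantity", "sum" }
});
```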
@ -5,6 +5,7 @@ using System;
|
|||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
|
@ -14,6 +15,8 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public class DataFrameReader
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DataFrameReader));
|
||||
|
||||
private readonly IDataFrameReaderProxy dataFrameReaderProxy;
|
||||
private readonly SparkContext sparkContext;
|
||||
|
||||
|
@ -27,6 +30,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public DataFrameReader Format(string source)
|
||||
{
|
||||
logger.LogInfo("Input data source format for the reader is '{0}'", source);
|
||||
dataFrameReaderProxy.Format(source);
|
||||
return this;
|
||||
}
|
||||
|
@ -48,6 +52,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
public DataFrameReader Option(string key, string value)
|
||||
{
|
||||
dataFrameReaderProxy.Options(new Dictionary<string, string>(){{key, value}});
|
||||
logger.LogInfo("Input key-vaue option for the data source is {0}={1}", key, value);
|
||||
return this;
|
||||
}
|
||||
|
||||
|
@ -75,6 +80,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public DataFrame Load()
|
||||
{
|
||||
logger.LogInfo("Loading DataFrame using the reader");
|
||||
return new DataFrame(dataFrameReaderProxy.Load(), sparkContext);
|
||||
}
|
||||
|
||||
|
@ -84,6 +90,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public DataFrame Jdbc(string url, string table, Dictionary<String, String> properties)
|
||||
{
|
||||
logger.LogInfo("Constructing DataFrame using JDBC source. Url={0}, tableName={1}", url, table);
|
||||
return new DataFrame(dataFrameReaderProxy.Jdbc(url, table, properties), sparkContext);
|
||||
}
|
||||
|
||||
|
@ -106,6 +113,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
public DataFrame Jdbc(string url, string table, string columnName, string lowerBound, string upperBound,
|
||||
int numPartitions, Dictionary<String, String> connectionProperties)
|
||||
{
|
||||
logger.LogInfo("Constructing DataFrame using JDBC source. Url={0}, tableName={1}, columnName={2}", url, table, columnName);
|
||||
return new DataFrame(dataFrameReaderProxy.Jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, connectionProperties), sparkContext);
|
||||
}
|
||||
|
||||
|
@ -125,6 +133,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// Normally at least a "user" and "password" property should be included.</param>
|
||||
public DataFrame Jdbc(string url, string table, string[] predicates, Dictionary<String, String> connectionProperties)
|
||||
{
|
||||
logger.LogInfo("Constructing DataFrame using JDBC source. Url={0}, table={1}", url, table);
|
||||
return new DataFrame(dataFrameReaderProxy.Jdbc(url, table, predicates, connectionProperties), sparkContext);
|
||||
}
|
||||
|
||||
|
@ -137,6 +146,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// <param name="path">input path</param>
|
||||
public DataFrame Json(string path)
|
||||
{
|
||||
logger.LogInfo("Constructing DataFrame using JSON source {0}", path);
|
||||
return Format("json").Load(path);
|
||||
}
|
||||
|
||||
|
@ -146,6 +156,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public DataFrame Parquet(params string[] path)
|
||||
{
|
||||
logger.LogInfo("Constructing DataFrame using Parquet source {0}", string.Join(";", path));
|
||||
return new DataFrame(dataFrameReaderProxy.Parquet(path), sparkContext);
|
||||
}
|
||||
}
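A reader sketch tying the builder methods together; `sqlContext` is an assumed `SqlContext` whose `Read()` returns the `DataFrameReader` above, and the paths and external data source name are placeholders:

``` c#
// JSON input via the Json convenience method (Format("json") + Load(path) underneath).
DataFrame people = sqlContext.Read().Json(@"hdfs:///data/people.json");

// Generic source with an option; Load(path) is the overload used internally by Json().
DataFrame csvLike = sqlContext.Read()
    .Format("com.databricks.spark.csv")   // assumed external data source package
    .Option("header", "true")
    .Load(@"hdfs:///data/people.csv");
```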
@ -3,6 +3,7 @@
|
|||
|
||||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
|
@ -14,6 +15,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public class DataFrameWriter
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DataFrameWriter));
|
||||
internal IDataFrameWriterProxy DataFrameWriterProxy
|
||||
{
|
||||
get { return dataFrameWriterProxy; }
|
||||
|
@ -56,6 +58,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public DataFrameWriter Format(string source)
|
||||
{
|
||||
logger.LogInfo("Output data storage format for the writer is '{0}'", source);
|
||||
dataFrameWriterProxy.Format(source);
|
||||
return this;
|
||||
}
|
||||
|
@ -66,6 +69,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
public DataFrameWriter Option(string key, string value)
|
||||
{
|
||||
var options = new Dictionary<string, string>() { { key, value } };
|
||||
logger.LogInfo("Output key-vaue option for the external data stroage is {0}={1}", key, value);
|
||||
return Options(options);
|
||||
}
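A writer sketch for the builder above; `df` is an assumed DataFrame, the output path is a placeholder, and `Save(path)` is assumed to exist on `DataFrameWriter` as in the underlying Spark writer API (it is not shown in this hunk):

``` c#
df.Write()
  .Format("parquet")
  .Option("compression", "snappy")         // illustrative key/value, not validated here
  .Save(@"hdfs:///output/people.parquet"); // assumed Save(path) terminal action
```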
@ -19,110 +19,272 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
public static class Functions
|
||||
{
|
||||
#region functions
|
||||
/// <summary>
|
||||
/// Creates a Column of any literal value.
|
||||
/// </summary>
|
||||
/// <param name="column">The given literal value</param>
|
||||
/// <returns>A new Column is created to represent the literal value</returns>
|
||||
public static Column Lit(object column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("lit", column));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a Column based on the given column name.
|
||||
/// </summary>
|
||||
/// <param name="colName">The name of column specified</param>
|
||||
/// <returns>The column for the given name</returns>
|
||||
public static Column Col(string colName)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("col", colName));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a Column based on the given column name.
|
||||
/// </summary>
|
||||
/// <param name="colName">The name of column specified</param>
|
||||
/// <returns>The column for the given name</returns>
|
||||
public static Column Column(string colName)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("column", colName));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a sort expression based on ascending order of the column.
|
||||
/// </summary>
|
||||
/// <param name="columnName">The name of column specified</param>
|
||||
/// <returns>The column with ascending order</returns>
|
||||
public static Column Asc(string columnName)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("asc", columnName));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a sort expression based on the descending order of the column.
|
||||
/// </summary>
|
||||
/// <param name="columnName">The name of column specified</param>
|
||||
/// <returns>the column with descending order</returns>
|
||||
public static Column Desc(string columnName)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("desc", columnName));
|
||||
}
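These factory functions build Column expressions without starting from a DataFrame instance. A brief sketch (the column names are illustrative):

``` c#
Column one = Functions.Lit(1);                    // literal column
Column name = Functions.Col("name");              // column reference by name
Column newestFirst = Functions.Desc("timestamp"); // sort expression usable in Sort overloads
```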
|
||||
|
||||
/// <summary>
|
||||
/// Converts a string column to upper case.
|
||||
/// </summary>
|
||||
/// <param name="column">The string column specified</param>
|
||||
/// <returns>The string column in upper case</returns>
|
||||
public static Column Upper(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("upper", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts a string column to lower case.
|
||||
/// </summary>
|
||||
/// <param name="column">The string column specified</param>
|
||||
/// <returns>The string column in lower case</returns>
|
||||
public static Column Lower(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("lower", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the square root of the specified float column.
|
||||
/// </summary>
|
||||
/// <param name="column">The float column</param>
|
||||
/// <returns>The square root of the specified float column.</returns>
|
||||
public static Column Sqrt(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sqrt", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the absolute value.
|
||||
/// </summary>
|
||||
/// <param name="column">The column to compute</param>
|
||||
/// <returns>The new column represents the absolute value of the given column</returns>
|
||||
public static Column Abs(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("abs", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the maximum value of the expression in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the maximum value</returns>
|
||||
public static Column Max(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("max", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the minimum value of the expression in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the minimum value</returns>
|
||||
public static Column Min(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("min", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the first value in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the first value</returns>
|
||||
public static Column First(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("first", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the last value in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the last value</returns>
|
||||
public static Column Last(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("last", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the number of items in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the count value</returns>
|
||||
public static Column Count(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("count", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the sum of all values in the expression.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the sum</returns>
|
||||
public static Column Sum(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sum", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the average of the values in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the average</returns>
|
||||
public static Column Avg(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("avg", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the average of the values in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the average</returns>
|
||||
public static Column Mean(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("mean", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the sum of distinct values in the expression.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column represents the sum of distinct values </returns>
|
||||
public static Column SumDistinct(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sumDistinct", column.ColumnProxy));
|
||||
}
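The aggregate helpers above each wrap a JVM-side function and return a Column, so they compose freely. A sketch, assuming a `sales` DataFrame with `amount`, `buyerId`, `region`, `nickname`, and `name` columns (illustrative names):

``` c#
Column total = Functions.Sum(sales["amount"]);
Column average = Functions.Avg(sales["amount"]);
Column buyers = Functions.CountDistinct(sales["buyerId"], sales["region"]);
Column firstNonNull = Functions.Coalesce(sales["nickname"], sales["name"]);
```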
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new array column. The input columns must all have the same data type.
|
||||
/// </summary>
|
||||
/// <param name="columns">The given columns</param>
|
||||
/// <returns>The new array column</returns>
|
||||
public static Column Array(params Column[] columns)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("array", columns.Select(x => x.ColumnProxy)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the first column that is not null, or null if all inputs are null.
|
||||
/// </summary>
|
||||
/// <param name="columns">The given columns</param>
|
||||
/// <returns>The first column that is not null</returns>
|
||||
public static Column Coalesce(params Column[] columns)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("coalesce", columns.Select(x => x.ColumnProxy)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the number of distinct items in a group.
|
||||
/// </summary>
|
||||
/// <param name="columns">The given columns</param>
|
||||
/// <returns>The new column represents the number of distinct items</returns>
|
||||
public static Column CountDistinct(params Column[] columns)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("countDistinct", columns.Select(x => x.ColumnProxy)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new struct column.
|
||||
/// </summary>
|
||||
/// <param name="columns">The given columns</param>
|
||||
/// <returns>The new struct column</returns>
|
||||
public static Column Struct(params Column[] columns)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("struct", columns.Select(x => x.ColumnProxy)));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the approximate number of distinct items in a group
|
||||
/// </summary>
|
||||
/// <param name="column">The given columns</param>
|
||||
/// <returns>The column represents the approximate number of distinct items</returns>
|
||||
public static Column ApproxCountDistinct(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("approxCountDistinct", column));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a new row for each element in the given array or map column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The new column for each element in the given array or map column</returns>
|
||||
public static Column Explode(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("explode", column));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generate a random column with i.i.d. samples from U[0.0, 1.0].
|
||||
/// </summary>
|
||||
/// <param name="seed">The long integer as seed</param>
|
||||
/// <returns>A random column with i.i.d. samples from U[0.0, 1.0]. </returns>
|
||||
public static Column Rand(long seed)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("rand", seed));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generate a column with i.i.d. samples from the standard normal distribution.
|
||||
/// </summary>
|
||||
/// <param name="seed">The long integer as seed</param>
|
||||
/// <returns>A column with i.i.d. samples from the standard normal distribution</returns>
|
||||
public static Column Randn(long seed)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("randn", seed));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the ntile group id (from 1 to n inclusive) in an ordered window partition.
|
||||
/// This is equivalent to the NTILE function in SQL.
|
||||
/// </summary>
|
||||
/// <param name="n">The given number</param>
|
||||
/// <returns>The ntile group id</returns>
|
||||
public static Column Ntile(int n)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("ntile", n));
|
||||
|
@ -130,100 +292,221 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
#endregion
|
||||
|
||||
#region unary math functions
|
||||
/// <summary>
|
||||
/// Computes the cosine inverse of the given column; the returned angle is in the range 0.0 through pi.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the cosine inverse</returns>
|
||||
public static Column Acos(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("acos", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the sine inverse of the given column; the returned angle is in the range -pi/2 through pi/2.
|
||||
/// </summary>
|
||||
/// <param name="column"></param>
|
||||
/// <returns>The column represents the sine inverse</returns>
|
||||
public static Column Asin(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("asin", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the tangent inverse of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the tangent inverse</returns>
|
||||
public static Column Atan(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("atan", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the cube-root of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the cube-root</returns>
|
||||
public static Column Cbrt(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("cbrt", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the ceiling of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the ceiling</returns>
|
||||
public static Column Ceil(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("ceil", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the cosine of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the cosine</returns>
|
||||
public static Column Cos(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("cos", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the hyperbolic cosine of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the hyperbolic cosine</returns>
|
||||
public static Column Cosh(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("cosh", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the exponential of the given value.
|
||||
/// Computes the exponential of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column"></param>
|
||||
/// <returns></returns>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the exponential</returns>
|
||||
public static Column Exp(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("exp", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the exponential of the given value minus one.
|
||||
/// Computes the exponential of the given column minus one.
|
||||
/// </summary>
|
||||
/// <param name="column"></param>
|
||||
/// <returns></returns>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the exponential</returns>
|
||||
public static Column Expm1(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("expm1", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the floor of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the floor</returns>
|
||||
public static Column Floor(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("floor", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the natural logarithm of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the natural logarithm</returns>
|
||||
public static Column Log(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("log", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the logarithm of the given column in base 10.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the logarithm</returns>
|
||||
public static Column Log10(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("log10", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the natural logarithm of the given column plus one.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the logarithm</returns>
|
||||
public static Column Log1p(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("log1p", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the double value that is closest in value to the argument and is equal to a mathematical integer.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the double value</returns>
|
||||
public static Column Rint(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("rint", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the signum of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the signum</returns>
|
||||
public static Column Signum(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("signum", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the sine of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the sine</returns>
|
||||
public static Column Sin(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sin", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the hyperbolic sine of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the hyperbolic sine</returns>
|
||||
public static Column Sinh(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sinh", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the tangent of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the tangent</returns>
|
||||
public static Column Tan(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("tan", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes the hyperbolic tangent of the given column.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the hyperbolic tangent</returns>
|
||||
public static Column Tanh(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("tanh", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts an angle measured in radians to an approximately equivalent angle measured in degrees.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the degrees</returns>
|
||||
public static Column ToDegrees(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("toDegrees", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Converts an angle measured in degrees to an approximately equivalent angle measured in radians.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column represents the radians</returns>
|
||||
public static Column ToRadians(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("toRadians", column.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes bitwise NOT.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <returns>The column of bitwise NOT result</returns>
|
||||
public static Column BitwiseNOT(Column column)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("bitwiseNOT", column.ColumnProxy));
|
||||
|
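Each unary math helper follows the same pattern, one proxy call per JVM function. A short sketch over assumed numeric columns of a DataFrame `df`:

``` c#
Column radians = Functions.ToRadians(df["angleDegrees"]);
Column magnitude = Functions.Abs(df["delta"]);
Column logScale = Functions.Log10(df["requestCount"]);
Column nearestInt = Functions.Rint(df["price"]);
```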
@ -231,46 +514,122 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
#endregion
|
||||
|
||||
#region binary math functions
|
||||
/// <summary>
|
||||
/// Returns the angle theta from the conversion of rectangular coordinates (x, y) to polar coordinates (r, theta).
|
||||
/// </summary>
|
||||
/// <param name="leftColumn">The left column</param>
|
||||
/// <param name="rightColumn">The right column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Atan2(Column leftColumn, Column rightColumn)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("atan2", leftColumn.ColumnProxy, rightColumn.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes sqrt(a^2 + b^2) without intermediate overflow or underflow.
|
||||
/// </summary>
|
||||
/// <param name="leftColumn">The left column</param>
|
||||
/// <param name="rightColumn">The right column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Hypot(Column leftColumn, Column rightColumn)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("hypot", leftColumn.ColumnProxy, rightColumn.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes sqrt(a^2 + b^2) without intermediate overflow or underflow.
|
||||
/// </summary>
|
||||
/// <param name="leftColumn">The left column</param>
|
||||
/// <param name="rightValue">The right column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Hypot(Column leftColumn, double rightValue)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("hypot", leftColumn.ColumnProxy, rightValue));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Computes sqrt(a^2 + b^2) without intermediate overflow or underflow.
|
||||
/// </summary>
|
||||
/// <param name="leftValue">The left value</param>
|
||||
/// <param name="rightColumn">The right column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Hypot(double leftValue, Column rightColumn)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("hypot", leftValue, rightColumn.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value of the first argument raised to the power of the second argument.
|
||||
/// </summary>
|
||||
/// <param name="leftColumn">The left column</param>
|
||||
/// <param name="rightColumn">The right column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Pow(Column leftColumn, Column rightColumn)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("pow", leftColumn.ColumnProxy, rightColumn.ColumnProxy));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value of the first argument raised to the power of the second argument.
|
||||
/// </summary>
|
||||
/// <param name="leftColumn">The left column</param>
|
||||
/// <param name="rightValue">The right value</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Pow(Column leftColumn, double rightValue)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("pow", leftColumn.ColumnProxy, rightValue));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value of the first argument raised to the power of the second argument.
|
||||
/// </summary>
|
||||
/// <param name="leftValue">The left value</param>
|
||||
/// <param name="rightColumn">The right column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Pow(double leftValue, Column rightColumn)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("pow", leftValue, rightColumn.ColumnProxy));
|
||||
}
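The binary math helpers accept either two Columns or a Column and a double. A sketch, assuming a `points` DataFrame with `x` and `y` columns:

``` c#
Column angle = Functions.Atan2(points["y"], points["x"]);
Column distance = Functions.Hypot(points["x"], points["y"]); // sqrt(x^2 + y^2) without overflow
Column squared = Functions.Pow(points["x"], 2.0);
```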
|
||||
|
||||
/// <summary>
|
||||
/// Returns the approximate number of distinct items in a group.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <param name="rsd">The rsd</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column ApproxCountDistinct(Column column, double rsd)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("approxCountDistinct", column, rsd));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Evaluates a list of conditions and returns one of multiple possible result expressions.
|
||||
/// </summary>
|
||||
/// <param name="condition">The given column of condition</param>
|
||||
/// <param name="value">The value of condition</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column When(Column condition, object value)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("when", condition, value));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value that is offset rows before the current row, and null if there are fewer than offset rows before the current row.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <param name="offset">The offset of the given column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Lag(Column column, int offset)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("lag", column, offset));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value that is offset rows after the current row, and null if there are fewer than offset rows after the current row.
|
||||
/// </summary>
|
||||
/// <param name="column">The given column</param>
|
||||
/// <param name="offset">The offset of the given column</param>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Lead(Column column, int offset)
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("lead", column, offset));
|
||||
|
@ -278,38 +637,83 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
#endregion
|
||||
|
||||
#region window functions
|
||||
/// <summary>
|
||||
/// Returns a sequential number starting at 1 within a window partition.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column RowNumber()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("rowNumber"));
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("row_number"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the rank of rows within a window partition, without any gaps.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column DenseRank()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("denseRank"));
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("dense_rank"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the rank of rows within a window partition.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Rank()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("rank"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the cumulative distribution of values within a window partition
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column CumeDist()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("cumeDist"));
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("cume_dist"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the relative rank (i.e. percentile) of rows within a window partition.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column PercentRank()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("percentRank"));
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("percent_rank"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A column expression that generates monotonically increasing 64-bit integers.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column MonotonicallyIncreasingId()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("monotonicallyIncreasingId"));
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("monotonically_increasing_id"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Partition ID of the Spark task.
|
||||
/// Note that this is indeterministic because it depends on data partitioning and task scheduling.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column SparkPartitionId()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("sparkPartitionId"));
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("spark_partition_id"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generate a random column with i.i.d. samples from U[0.0, 1.0].
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Rand()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("rand"));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Generate a column with i.i.d. samples from the standard normal distribution.
|
||||
/// </summary>
|
||||
/// <returns>The column of the result</returns>
|
||||
public static Column Randn()
|
||||
{
|
||||
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("randn"));
|
||||
|
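The renamed window functions above return Columns that are normally combined with a window specification (an Over-style API that is not part of this change). A minimal sketch of constructing them:

``` c#
Column rowNumber = Functions.RowNumber();          // backed by Spark's row_number
Column denseRank = Functions.DenseRank();          // backed by dense_rank
Column partitionId = Functions.SparkPartitionId(); // task partition id, nondeterministic
Column noise = Functions.Rand();                   // uniform samples in [0.0, 1.0]
```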
@ -317,46 +721,188 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
#endregion
|
||||
|
||||
#region udf
|
||||
/// <summary>
|
||||
/// Defines a user-defined function (UDF) of 0 arguments.
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column> Udf<RT>(Func<RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT>(f).Execute).Execute0;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 1 argument as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column> Udf<RT, A1>(Func<A1, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1>(f).Execute).Execute1;
|
||||
}
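As a hedged illustration of how these factory methods are meant to be used (the DataFrame `df`, its string column `name`, and the `Select`/indexer calls are assumptions for the sketch, not part of this change):

``` c#
// Sketch only: create a one-argument UDF and apply it to an assumed DataFrame column.
var toUpper = Functions.Udf<string, string>(s => s == null ? null : s.ToUpper());
var upperNames = df.Select(toUpper(df["name"]));
upperNames.Show();
```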
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 2 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column> Udf<RT, A1, A2>(Func<A1, A2, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2>(f).Execute).Execute2;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 3 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column> Udf<RT, A1, A2, A3>(Func<A1, A2, A3, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3>(f).Execute).Execute3;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 4 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4>(Func<A1, A2, A3, A4, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4>(f).Execute).Execute4;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 5 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5>(Func<A1, A2, A3, A4, A5, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5>(f).Execute).Execute5;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 6 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6>(Func<A1, A2, A3, A4, A5, A6, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6>(f).Execute).Execute6;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 7 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7>(Func<A1, A2, A3, A4, A5, A6, A7, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7>(f).Execute).Execute7;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 8 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A8">The 8th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7, A8>(Func<A1, A2, A3, A4, A5, A6, A7, A8, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8>(f).Execute).Execute8;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 9 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A8">The 8th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A9">The 9th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(f).Execute).Execute9;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Defines a user-defined function of 10 arguments as user-defined function (UDF).
|
||||
/// The data types are automatically inferred based on the function's signature.
|
||||
/// </summary>
|
||||
/// <param name="f">The given function</param>
|
||||
/// <typeparam name="RT">The return type of the given function</typeparam>
|
||||
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
|
||||
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
|
||||
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A8">The 8th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A9">The 9th arguement of the given function</typeparam>
|
||||
/// <typeparam name="A10">The 10th arguement of the given function</typeparam>
|
||||
/// <returns>The new user-defined function</returns>
|
||||
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT> f)
|
||||
{
|
||||
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(f).Execute).Execute10;
|
||||
|
|
|
@ -0,0 +1,42 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
/// <summary>
|
||||
/// A variant of Spark SQL that integrates with data stored in Hive.
|
||||
/// Configuration for Hive is read from hive-site.xml on the classpath.
|
||||
/// It supports running both SQL and HiveQL commands.
|
||||
/// </summary>
|
||||
public class HiveContext : SqlContext
|
||||
{
|
||||
/// <summary>
|
||||
/// Creates a HiveContext
|
||||
/// </summary>
|
||||
/// <param name="sparkContext"></param>
|
||||
public HiveContext(SparkContext sparkContext)
|
||||
: base(sparkContext, sparkContext.SparkContextProxy.CreateHiveContext())
|
||||
{
|
||||
}
|
||||
|
||||
internal HiveContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy)
|
||||
: base(sparkContext, sqlContextProxy)
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Invalidate and refresh all the cached metadata of the given table.
|
||||
/// For performance reasons, Spark SQL or the external data source library it uses
|
||||
/// might cache certain metadata about a table, such as the location of blocks.
|
||||
/// When those change outside of Spark SQL, users should call this function to invalidate the cache.
|
||||
/// </summary>
|
||||
/// <param name="tableName"></param>
|
||||
public void RefreshTable(string tableName)
|
||||
{
|
||||
SqlContextProxy.RefreshTable(tableName);
|
||||
}
|
||||
}
|
||||
}
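A minimal usage sketch of the class above (the existing SparkContext `sparkContext` and the Hive table `logs` are assumptions, not part of this change):

``` c#
// Sketch: run HiveQL from C# and refresh cached metadata after external changes (names assumed)
var hiveContext = new HiveContext(sparkContext);
var levelCounts = hiveContext.Sql("SELECT level, COUNT(*) AS cnt FROM logs GROUP BY level");
levelCounts.Show();
hiveContext.RefreshTable("logs");
```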
|
|
@ -1,13 +1,15 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Threading;
|
||||
|
||||
using Razorvine.Pickle;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
/// <summary>
|
||||
/// Used by Unpickler to unpickle pickled objects. It is also used to construct a Row (C# representation of pickled objects).
|
||||
/// Note this implementation is not ThreadSafe. Collect or RDD conversion where unpickling is done is not expected to be multithreaded.
|
||||
/// </summary>
|
||||
public class RowConstructor : IObjectConstructor
|
||||
{
|
||||
|
@ -16,11 +18,13 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// <summary>
|
||||
/// Schema of the DataFrame currently being processed
|
||||
/// </summary>
|
||||
[ThreadStatic] // thread safety is needed when running in the C# worker process
|
||||
private static string currentSchema;
|
||||
|
||||
/// <summary>
|
||||
/// Indicates if Schema is already set during construction of this type
|
||||
/// </summary>
|
||||
[ThreadStatic] // thread safety is needed when running in the C# worker process
|
||||
private static bool isCurrentSchemaSet;
|
||||
|
||||
/// <summary>
|
||||
|
@ -33,6 +37,10 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
internal string Schema;
|
||||
|
||||
/// <summary>
|
||||
/// Returns a string that represents the current object.
|
||||
/// </summary>
|
||||
/// <returns>A string that represents the current object.</returns>
|
||||
public override string ToString()
|
||||
{
|
||||
return string.Format("{{{0}}}", string.Join(",", Values));
|
||||
|
|
|
@ -40,6 +40,11 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public static class SaveModeExtensions
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the string for the value of SaveMode
|
||||
/// </summary>
|
||||
/// <param name="mode">The given SaveMode</param>
|
||||
/// <returns>The string that represents the given SaveMode</returns>
|
||||
public static string GetStringValue(this SaveMode mode)
|
||||
{
|
||||
switch (mode)
|
||||
|
|
|
@ -5,6 +5,7 @@ using System;
|
|||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
|
@ -14,13 +15,77 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public class SqlContext
|
||||
{
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SqlContext));
|
||||
|
||||
private readonly ISqlContextProxy sqlContextProxy;
|
||||
private readonly SparkContext sparkContext;
|
||||
internal ISqlContextProxy SqlContextProxy { get { return sqlContextProxy; } }
|
||||
|
||||
private static SqlContext instance;
|
||||
|
||||
/// <summary>
|
||||
/// Creates a SqlContext
|
||||
/// </summary>
|
||||
/// <param name="sparkContext"></param>
|
||||
public SqlContext(SparkContext sparkContext)
|
||||
{
|
||||
this.sparkContext = sparkContext;
|
||||
sqlContextProxy = sparkContext.SparkContextProxy.CreateSqlContext();
|
||||
sqlContextProxy = sparkContext.SparkContextProxy.CreateSqlContext();
|
||||
if (instance == null) instance = this;
|
||||
}
|
||||
|
||||
internal SqlContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy)
|
||||
{
|
||||
this.sparkContext = sparkContext;
|
||||
this.sqlContextProxy = sqlContextProxy;
|
||||
if (instance == null) instance = this;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Get the existing SQLContext or create a new one with given SparkContext.
|
||||
/// </summary>
|
||||
/// <param name="sparkContext"></param>
|
||||
/// <returns></returns>
|
||||
public static SqlContext GetOrCreate(SparkContext sparkContext)
|
||||
{
|
||||
if (instance == null)
|
||||
{
|
||||
return new SqlContext(sparkContext);
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a new SQLContext as new session, that has separate SQLConf,
|
||||
/// registered temporary tables and UDFs, but shared SparkContext and table cache.
|
||||
/// </summary>
|
||||
/// <returns></returns>
|
||||
public SqlContext NewSession()
|
||||
{
|
||||
var newSessionProxy = sqlContextProxy.NewSession();
|
||||
return new SqlContext(this.sparkContext, newSessionProxy);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the value of Spark SQL configuration property for the given key.
|
||||
/// If the key is not set, returns defaultValue.
|
||||
/// </summary>
|
||||
/// <param name="key"></param>
|
||||
/// <param name="defaultValue"></param>
|
||||
/// <returns></returns>
|
||||
public string GetConf(string key, string defaultValue)
|
||||
{
|
||||
return sqlContextProxy.GetConf(key, defaultValue);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Sets the given Spark SQL configuration property.
|
||||
/// </summary>
|
||||
/// <param name="key"></param>
|
||||
/// <param name="value"></param>
|
||||
public void SetConf(string key, string value)
|
||||
{
|
||||
sqlContextProxy.SetConf(key, value);
|
||||
}
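For illustration, a hedged sketch of how GetOrCreate, SetConf and GetConf fit together (the configuration key and values are examples only, and `sparkContext` is assumed to exist):

``` c#
// Sketch: reuse a single SqlContext and adjust a Spark SQL setting (values are placeholders)
var sqlContext = SqlContext.GetOrCreate(sparkContext);
sqlContext.SetConf("spark.sql.shuffle.partitions", "8");
var shufflePartitions = sqlContext.GetConf("spark.sql.shuffle.partitions", "200");
```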
|
||||
|
||||
/// <summary>
|
||||
|
@ -28,6 +93,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
public DataFrameReader Read()
|
||||
{
|
||||
logger.LogInfo("Using DataFrameReader to read input data from external data source");
|
||||
return new DataFrameReader(sqlContextProxy.Read(), sparkContext);
|
||||
}
|
||||
|
||||
|
@ -40,9 +106,16 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// <returns></returns>
|
||||
public DataFrame ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
|
||||
{
|
||||
logger.LogInfo("Reading DataFrame from file {0}", path);
|
||||
return new DataFrame(sqlContextProxy.ReadDataFrame(path, schema, options), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates a <see cref="DataFrame"/> from a RDD containing array of object using the given schema.
|
||||
/// </summary>
|
||||
/// <param name="rdd">RDD containing array of object. The array acts as a row and items within the array act as columns which the schema is specified in <paramref name="schema"/>. </param>
|
||||
/// <param name="schema">The schema of DataFrame.</param>
|
||||
/// <returns></returns>
|
||||
public DataFrame CreateDataFrame(RDD<object[]> rdd, StructType schema)
|
||||
{
|
||||
// Note: This is for pickling RDD, convert to RDD<byte[]> which happens in CSharpWorker.
|
||||
|
@ -55,6 +128,100 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext);
|
||||
}
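A hedged sketch of creating a DataFrame from an RDD of object arrays (the sample data, the `sparkContext` and `sqlContext` variables, and the two-column schema are assumptions):

``` c#
// Sketch: build a schema and turn an RDD of object[] rows into a DataFrame (data assumed)
var schema = new StructType(new[]
{
    new StructField("name", new StringType()),
    new StructField("age", new IntegerType())
});
var rows = sparkContext.Parallelize(new[]
{
    new object[] { "Alice", 30 },
    new object[] { "Bob", 25 }
}, 2);
var people = sqlContext.CreateDataFrame(rows, schema);
people.ShowSchema();
people.Show();
```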
|
||||
|
||||
/// <summary>
|
||||
/// Registers the given <see cref="DataFrame"/> as a temporary table in the catalog.
|
||||
/// Temporary tables exist only during the lifetime of this instance of SqlContext.
|
||||
/// </summary>
|
||||
/// <param name="dataFrame"></param>
|
||||
/// <param name="tableName"></param>
|
||||
public void RegisterDataFrameAsTable(DataFrame dataFrame, string tableName)
|
||||
{
|
||||
sqlContextProxy.RegisterDataFrameAsTable(dataFrame.DataFrameProxy, tableName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Remove the temp table from catalog.
|
||||
/// </summary>
|
||||
/// <param name="tableName"></param>
|
||||
public void DropTempTable(string tableName)
|
||||
{
|
||||
sqlContextProxy.DropTempTable(tableName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns the specified table as a <see cref="DataFrame"/>
|
||||
/// </summary>
|
||||
/// <param name="tableName"></param>
|
||||
/// <returns></returns>
|
||||
public DataFrame Table(string tableName)
|
||||
{
|
||||
return new DataFrame(sqlContextProxy.Table(tableName), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a <see cref="DataFrame"/> containing names of tables in the given database.
|
||||
/// If <paramref name="databaseName"/> is not specified, the current database will be used.
|
||||
/// The returned DataFrame has two columns: 'tableName' and 'isTemporary' (a column with bool
|
||||
/// type indicating if a table is a temporary one or not).
|
||||
/// </summary>
|
||||
/// <param name="databaseName">Name of the database to use. Default to the current database.
|
||||
/// Note: This is only applicable to HiveContext.</param>
|
||||
/// <returns></returns>
|
||||
public DataFrame Tables(string databaseName = null)
|
||||
{
|
||||
return databaseName == null ?
|
||||
new DataFrame(sqlContextProxy.Tables(), sparkContext) :
|
||||
new DataFrame(sqlContextProxy.Tables(databaseName), sparkContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a list of names of tables in the database <paramref name="databaseName"/>
|
||||
/// </summary>
|
||||
/// <param name="databaseName">Name of the database to use. Default to the current database.
|
||||
/// Note: This is only applicable to HiveContext.</param>
|
||||
/// <returns></returns>
|
||||
public IEnumerable<string> TableNames(string databaseName = null)
|
||||
{
|
||||
return databaseName == null ?
|
||||
sqlContextProxy.TableNames() : sqlContextProxy.TableNames(databaseName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Caches the specified table in-memory.
|
||||
/// </summary>
|
||||
/// <param name="tableName"></param>
|
||||
public void CacheTable(string tableName)
|
||||
{
|
||||
sqlContextProxy.CacheTable(tableName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes the specified table from the in-memory cache.
|
||||
/// </summary>
|
||||
/// <param name="tableName"></param>
|
||||
public void UncacheTable(string tableName)
|
||||
{
|
||||
sqlContextProxy.UncacheTable(tableName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes all cached tables from the in-memory cache.
|
||||
/// </summary>
|
||||
public void ClearCache()
|
||||
{
|
||||
sqlContextProxy.ClearCache();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns true if the table is currently cached in-memory.
|
||||
/// </summary>
|
||||
/// <param name="tableName"></param>
|
||||
/// <returns></returns>
|
||||
public bool IsCached(string tableName)
|
||||
{
|
||||
return sqlContextProxy.IsCached(tableName);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Executes a SQL query using Spark, returning the result as a DataFrame. The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect'
|
||||
/// </summary>
|
||||
|
@ -62,6 +229,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// <returns></returns>
|
||||
public DataFrame Sql(string sqlQuery)
|
||||
{
|
||||
logger.LogInfo("SQL query to execute on the dataframe is {0}", sqlQuery);
|
||||
return new DataFrame(sqlContextProxy.Sql(sqlQuery), sparkContext);
|
||||
}
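As a hedged sketch of how Sql pairs with temporary-table registration (the DataFrame `people` is assumed, for example from the CreateDataFrame sketch above):

``` c#
// Sketch: register a DataFrame as a temp table, query it with SQL, then drop it (names assumed)
sqlContext.RegisterDataFrameAsTable(people, "people");
var adults = sqlContext.Sql("SELECT name FROM people WHERE age >= 18");
adults.Show();
sqlContext.DropTempTable("people");
```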
|
||||
|
||||
|
@ -117,7 +285,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
#region UDF Registration
|
||||
/// <summary>
|
||||
/// Register UDF with no input argument, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool}"/>("MyFilter", () => true);
|
||||
/// SqlContext.RegisterFunction<bool>("MyFilter", () => true);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter()");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -131,7 +299,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 1 input argument, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string}"/>("MyFilter", (arg1) => arg1 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string>("MyFilter", (arg1) => arg1 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -146,7 +314,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 2 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string}"/>("MyFilter", (arg1, arg2) => arg1 != null && arg2 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string>("MyFilter", (arg1, arg2) => arg1 != null && arg2 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -162,7 +330,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 3 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, string}"/>("MyFilter", (arg1, arg2, arg3) => arg1 != null && arg2 != null && arg3 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, string>("MyFilter", (arg1, arg2, arg3) => arg1 != null && arg2 != null && arg3 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, columnName3)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -179,7 +347,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 4 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null && arg2 != null && ... && arg3 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null && arg2 != null && ... && arg3 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName4)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -197,7 +365,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 5 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null && arg2 != null && ... && arg5 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null && arg2 != null && ... && arg5 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName5)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -216,7 +384,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 6 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null && arg2 != null && ... && arg6 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null && arg2 != null && ... && arg6 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName6)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -236,7 +404,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 7 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null && arg2 != null && ... && arg7 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null && arg2 != null && ... && arg7 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName7)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -257,7 +425,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 8 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null && arg2 != null && ... && arg8 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null && arg2 != null && ... && arg8 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName8)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -279,7 +447,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 9 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null && arg2 != null && ... && arg9 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null && arg2 != null && ... && arg9 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName9)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
@ -302,7 +470,7 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
/// <summary>
|
||||
/// Register UDF with 10 input arguments, e.g:
|
||||
/// <see cref="SqlContext.RegisterFunction{bool, string, string, ..., string}"/>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null && arg2 != null && ... && arg10 != null);
|
||||
/// SqlContext.RegisterFunction<bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null && arg2 != null && ... && arg10 != null);
|
||||
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName10)");
|
||||
/// </summary>
|
||||
/// <typeparam name="RT"></typeparam>
|
||||
|
|
|
@ -14,6 +14,9 @@ using Newtonsoft.Json.Linq;
|
|||
|
||||
namespace Microsoft.Spark.CSharp.Sql
|
||||
{
|
||||
/// <summary>
|
||||
/// The base type of all Spark SQL data types.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public abstract class DataType
|
||||
{
|
||||
|
@ -38,6 +41,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
/// </summary>
|
||||
internal virtual object JsonValue { get { return TypeName; } }
|
||||
|
||||
/// <summary>
|
||||
/// The compact JSON representation of this data type.
|
||||
/// </summary>
|
||||
public string Json
|
||||
{
|
||||
get
|
||||
|
@ -47,11 +53,23 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Parses a Json string to construct a DataType.
|
||||
/// </summary>
|
||||
/// <param name="json">The Json string to be parsed</param>
|
||||
/// <returns>The new DataType instance from the Json string</returns>
|
||||
public static DataType ParseDataTypeFromJson(string json)
|
||||
{
|
||||
return ParseDataTypeFromJson(JToken.Parse(json));
|
||||
}
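For illustration, a hedged sketch of parsing a Spark SQL schema JSON string back into a DataType (the JSON literal is an example of the schema JSON format, not taken from this change):

``` c#
// Sketch: parse a struct type from its Spark SQL JSON representation (JSON literal is illustrative)
DataType parsed = DataType.ParseDataTypeFromJson(
    "{\"type\":\"struct\",\"fields\":[" +
    "{\"name\":\"age\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}}]}");
```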
|
||||
|
||||
/// <summary>
|
||||
/// Parse a JToken object to construct a DataType.
|
||||
/// </summary>
|
||||
/// <param name="json">The JToken object to be parsed</param>
|
||||
/// <returns>The new DataType instance from the Json string</returns>
|
||||
/// <exception cref="NotImplementedException">Not implemented for "udt" type</exception>
|
||||
/// <exception cref="ArgumentException"></exception>
|
||||
protected static DataType ParseDataTypeFromJson(JToken json)
|
||||
{
|
||||
if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...}
|
||||
|
@ -112,63 +130,125 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// An internal type used to represent a simple type.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class AtomicType : DataType
|
||||
{
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// An internal type used to represent a complex type (such as arrays, structs, and maps).
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public abstract class ComplexType : DataType
|
||||
{
|
||||
/// <summary>
|
||||
/// Abstract method that constructs a complex type from a Json object
|
||||
/// </summary>
|
||||
/// <param name="json">The Json object to construct a complex type</param>
|
||||
/// <returns>A new constructed complex type</returns>
|
||||
public abstract DataType FromJson(JObject json);
|
||||
/// <summary>
|
||||
/// Constructs a complex type from a Json string
|
||||
/// </summary>
|
||||
/// <param name="json">The string that represents a Json.</param>
|
||||
/// <returns>A new constructed complex type</returns>
|
||||
public DataType FromJson(string json)
|
||||
{
|
||||
return FromJson(JObject.Parse(json));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing NULL values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class NullType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing String values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class StringType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing binary values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class BinaryType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Boolean values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class BooleanType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Date values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class DateType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Timestamp values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class TimestampType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Double values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class DoubleType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Float values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class FloatType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Byte values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class ByteType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Int values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class IntegerType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Long values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class LongType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Short values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class ShortType : AtomicType { }
|
||||
|
||||
/// <summary>
|
||||
/// The data type representing Decimal values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class DecimalType : AtomicType
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the regular expression that represents a fixed decimal.
|
||||
/// </summary>
|
||||
public static Regex FixedDecimal = new Regex(@"decimal\((\d+),\s(\d+)\)");
|
||||
private int? precision, scale;
|
||||
/// <summary>
|
||||
/// Initializes a new instance of DecimalType from parameters specifying its precision and scale.
|
||||
/// </summary>
|
||||
/// <param name="precision">The precision of the type</param>
|
||||
/// <param name="scale">The scale of the type</param>
|
||||
public DecimalType(int? precision = null, int? scale = null)
|
||||
{
|
||||
this.precision = precision;
|
||||
|
@ -180,18 +260,38 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
get { throw new NotImplementedException(); }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs a DecimalType from a Json object
|
||||
/// </summary>
|
||||
/// <param name="json">The Json object used to construct a DecimalType</param>
|
||||
/// <returns>A new DecimalType instance</returns>
|
||||
/// <exception cref="NotImplementedException">Not implemented yet.</exception>
|
||||
public DataType FromJson(JObject json)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// The data type for collections of multiple values.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class ArrayType : ComplexType
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets the DataType of each element in the array
|
||||
/// </summary>
|
||||
public DataType ElementType { get { return elementType; } }
|
||||
/// <summary>
|
||||
/// Returns whether the array can contain null values
|
||||
/// </summary>
|
||||
public bool ContainsNull { get { return containsNull; } }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes an ArrayType instance with a specific element DataType, specifying whether the array can contain null values.
|
||||
/// </summary>
|
||||
/// <param name="elementType">The data type of values</param>
|
||||
/// <param name="containsNull">Indicates if values have null values</param>
|
||||
public ArrayType(DataType elementType, bool containsNull = true)
|
||||
{
|
||||
this.elementType = elementType;
|
||||
|
@ -203,6 +303,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
FromJson(json);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Readable string representation for the type.
|
||||
/// </summary>
|
||||
public override string SimpleString
|
||||
{
|
||||
get { return string.Format("array<{0}>", elementType.SimpleString); }
|
||||
|
@ -219,6 +322,11 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs an ArrayType from a Json object
|
||||
/// </summary>
|
||||
/// <param name="json">The Json object used to construct an ArrayType</param>
|
||||
/// <returns>A new ArrayType instance</returns>
|
||||
public override sealed DataType FromJson(JObject json)
|
||||
{
|
||||
elementType = ParseDataTypeFromJson(json["elementType"]);
|
||||
|
@ -230,6 +338,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
private bool containsNull;
|
||||
}
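For illustration, a hedged sketch of declaring an array-valued schema field with this type (the field name is an example):

``` c#
// Sketch: a schema field whose values are arrays of non-null strings (field name assumed)
var tagsField = new StructField("tags", new ArrayType(new StringType(), containsNull: false));
```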
|
||||
|
||||
/// <summary>
|
||||
/// The data type for Maps. Not implemented yet.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class MapType : ComplexType
|
||||
{
|
||||
|
@ -238,20 +349,48 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
get { throw new NotImplementedException(); }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs a StructField from a Json object. Not implemented yet.
|
||||
/// </summary>
|
||||
/// <param name="json">The Json object used to construct a MapType</param>
|
||||
/// <returns>A new MapType instance</returns>
|
||||
/// <exception cref="NotImplementedException"></exception>
|
||||
public override DataType FromJson(JObject json)
|
||||
{
|
||||
throw new NotImplementedException();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A field inside a StructType.
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class StructField : ComplexType
|
||||
{
|
||||
/// <summary>
|
||||
/// The name of this field.
|
||||
/// </summary>
|
||||
public string Name { get { return name; } }
|
||||
/// <summary>
|
||||
/// The data type of this field.
|
||||
/// </summary>
|
||||
public DataType DataType { get { return dataType; } }
|
||||
/// <summary>
|
||||
/// Indicates if values of this field can be null values.
|
||||
/// </summary>
|
||||
public bool IsNullable { get { return isNullable; } }
|
||||
/// <summary>
|
||||
/// The metadata of this field. The metadata should be preserved during transformation if the content of the column is not modified, e.g, in selection.
|
||||
/// </summary>
|
||||
public JObject Metadata { get { return metadata; } }
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a StructField instance with a specific name, data type, nullable, and metadata
|
||||
/// </summary>
|
||||
/// <param name="name">The name of this field</param>
|
||||
/// <param name="dataType">The data type of this field</param>
|
||||
/// <param name="isNullable">Indicates if values of this field can be null values</param>
|
||||
/// <param name="metadata">The metadata of this field</param>
|
||||
public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null)
|
||||
{
|
||||
this.name = name;
|
||||
|
@ -265,6 +404,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
FromJson(json);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a readable string that represents the type.
|
||||
/// </summary>
|
||||
public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } }
|
||||
|
||||
internal override object JsonValue
|
||||
|
@ -279,6 +421,11 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs a StructField from a Json object
|
||||
/// </summary>
|
||||
/// <param name="json">The Json object used to construct a StructField</param>
|
||||
/// <returns>A new StructField instance</returns>
|
||||
public override sealed DataType FromJson(JObject json)
|
||||
{
|
||||
name = json["name"].ToString();
|
||||
|
@ -295,9 +442,16 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
private JObject metadata;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Struct type, consisting of a list of StructField
|
||||
/// This is the data type representing a Row
|
||||
/// </summary>
|
||||
[Serializable]
|
||||
public class StructType : ComplexType
|
||||
{
|
||||
/// <summary>
|
||||
/// Gets a list of StructField.
|
||||
/// </summary>
|
||||
public List<StructField> Fields { get { return fields; } }
|
||||
|
||||
internal IStructTypeProxy StructTypeProxy
|
||||
|
@ -311,6 +465,10 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Initializes a StructType instance with a specific collection of StructField objects.
|
||||
/// </summary>
|
||||
/// <param name="fields">The collection that holds StructField objects</param>
|
||||
public StructType(IEnumerable<StructField> fields)
|
||||
{
|
||||
this.fields = fields.ToList();
|
||||
|
@ -328,6 +486,9 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
FromJson(jsonSchema);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns a readable string that joins all <see cref="StructField"/>s together.
|
||||
/// </summary>
|
||||
public override string SimpleString
|
||||
{
|
||||
get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); }
|
||||
|
@ -343,6 +504,11 @@ namespace Microsoft.Spark.CSharp.Sql
|
|||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Constructs a StructType from a Json object
|
||||
/// </summary>
|
||||
/// <param name="json">The Json object used to construct a StructType</param>
|
||||
/// <returns>A new StructType instance</returns>
|
||||
public override sealed DataType FromJson(JObject json)
|
||||
{
|
||||
var fieldsJObjects = json["fields"].Select(f => (JObject)f);
|
||||
|
|
|
@ -0,0 +1,31 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Streaming
|
||||
{
|
||||
/// <summary>
|
||||
/// An input stream that always returns the same RDD on each timestep. Useful for testing.
|
||||
/// </summary>
|
||||
public class ConstantInputDStream<T> : DStream<T>
|
||||
{
|
||||
/// <summary>
|
||||
/// Construct a ConstantInputDStream instance.
|
||||
/// </summary>
|
||||
public ConstantInputDStream(RDD<T> rdd, StreamingContext ssc)
|
||||
{
|
||||
if (rdd == null)
|
||||
{
|
||||
throw new ArgumentNullException("rdd", "rdd is null, which would lead to a NullReferenceException in subsequent transformations");
|
||||
}
|
||||
|
||||
dstreamProxy = ssc.streamingContextProxy.CreateConstantInputDStream(rdd.RddProxy);
|
||||
streamingContext = ssc;
|
||||
serializedMode = SerializedMode.Byte;
|
||||
}
|
||||
}
|
||||
}
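A hedged usage sketch of the stream above (the `sparkContext` and `streamingContext` variables are assumed to exist; this stream is mainly useful in tests):

``` c#
// Sketch: emit the same small RDD on every batch interval (contexts assumed to exist)
var constantRdd = sparkContext.Parallelize(new[] { 1, 2, 3 }, 1);
var constantStream = new ConstantInputDStream<int>(constantRdd, streamingContext);
constantStream.ForeachRDD(rdd => Console.WriteLine("count: " + rdd.Count()));
```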
|
|
@ -221,12 +221,12 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <summary>
|
||||
/// Enable periodic checkpointing of RDDs of this DStream
|
||||
/// </summary>
|
||||
/// <param name="intervalMs">time in seconds, after each period of that, generated RDD will be checkpointed</param>
|
||||
/// <param name="intervalSeconds">time in seconds, after each period of that, generated RDD will be checkpointed</param>
|
||||
/// <returns></returns>
|
||||
public DStream<T> Checkpoint(long intervalMs)
|
||||
public DStream<T> Checkpoint(int intervalSeconds)
|
||||
{
|
||||
isCheckpointed = true;
|
||||
DStreamProxy.Checkpoint(intervalMs);
|
||||
DStreamProxy.Checkpoint(intervalSeconds);
|
||||
return this;
|
||||
}
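A hedged sketch of periodic checkpointing (the DStream `lines`, the `streamingContext`, and `checkpointDirectory` are assumptions; the StreamingContext also needs a checkpoint directory set):

``` c#
// Sketch: checkpoint the RDDs generated by an assumed DStream every 30 seconds
streamingContext.Checkpoint(checkpointDirectory);
lines.Checkpoint(30);
```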
|
||||
|
||||
|
@ -373,7 +373,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
return DStreamProxy.Slice(fromUnixTime, toUnixTime).Select(r => new RDD<T>(r, streamingContext.SparkContext, serializedMode)).ToArray();
|
||||
}
|
||||
|
||||
internal void ValidatWindowParam(int windowSeconds, int slideSeconds)
|
||||
internal void ValidateWindowParam(int windowSeconds, int slideSeconds)
|
||||
{
|
||||
int duration = SlideDuration;
|
||||
|
||||
|
@ -403,7 +403,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <returns></returns>
|
||||
public DStream<T> Window(int windowSeconds, int slideSeconds)
|
||||
{
|
||||
ValidatWindowParam(windowSeconds, slideSeconds);
|
||||
ValidateWindowParam(windowSeconds, slideSeconds);
|
||||
return new DStream<T>(DStreamProxy.Window(windowSeconds, slideSeconds), streamingContext, serializedMode);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Streaming
|
||||
{
|
||||
/// <summary>
|
||||
/// Utility for creating streams from Microsoft Azure Event Hubs
|
||||
/// </summary>
|
||||
public class EventHubsUtils
|
||||
{
|
||||
/// <summary>
|
||||
/// Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
|
||||
/// The unioned stream will receive messages from all partitions of the EventHubs
|
||||
/// </summary>
|
||||
/// <param name="ssc">Streaming context</param>
|
||||
/// <param name="eventhubsParams"> Parameters for EventHubs.
|
||||
/// Required parameters are:
|
||||
/// "eventhubs.policyname": EventHubs policy name
|
||||
/// "eventhubs.policykey": EventHubs policy key
|
||||
/// "eventhubs.namespace": EventHubs namespace
|
||||
/// "eventhubs.name": EventHubs name
|
||||
/// "eventhubs.partition.count": Number of partitions
|
||||
/// "eventhubs.checkpoint.dir": checkpoint directory on HDFS
|
||||
///
|
||||
/// Optional parameters are:
|
||||
/// "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
|
||||
/// "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
|
||||
/// "eventhubs.filter.enqueuetime": Unix time, millisecond since epoch, default to "0"
|
||||
/// "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
|
||||
/// "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
|
||||
/// </param>
|
||||
/// <param name="storageLevelType">Storage level, by default it is MEMORY_ONLY</param>
|
||||
/// <returns>DStream with byte[] representing events from EventHub</returns>
|
||||
public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, Dictionary<string, string> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
|
||||
{
|
||||
return new DStream<byte[]>(ssc.streamingContextProxy.EventHubsUnionStream(eventhubsParams, storageLevelType), ssc, SerializedMode.None);
|
||||
}
|
||||
}
|
||||
}
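A hedged sketch showing the required Event Hubs parameters wired up (every value below is a placeholder, and `streamingContext` is assumed to exist):

``` c#
// Sketch: create a unioned Event Hubs stream; all parameter values are placeholders
var eventhubsParams = new Dictionary<string, string>
{
    { "eventhubs.policyname", "<policy-name>" },
    { "eventhubs.policykey", "<policy-key>" },
    { "eventhubs.namespace", "<namespace>" },
    { "eventhubs.name", "<eventhub-name>" },
    { "eventhubs.partition.count", "4" },
    { "eventhubs.checkpoint.dir", "/checkpoints/eventhubs" }
};
var events = EventHubsUtils.CreateUnionStream(streamingContext, eventhubsParams);
events.ForeachRDD(rdd => Console.WriteLine("events in batch: " + rdd.Count()));
```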
|
|
@ -6,11 +6,16 @@ using System.Collections.Generic;
|
|||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.IO;
|
||||
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Streaming
|
||||
{
|
||||
/// <summary>
|
||||
/// Utils for Kafka input stream.
|
||||
/// </summary>
|
||||
public class KafkaUtils
|
||||
{
|
||||
/// <summary>
|
||||
|
@ -30,6 +35,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <summary>
|
||||
/// Create an input stream that pulls messages from a Kafka Broker.
|
||||
/// </summary>
|
||||
/// <param name="ssc">Spark Streaming Context</param>
|
||||
/// <param name="zkQuorum">Zookeeper quorum (hostname:port,hostname:port,..).</param>
|
||||
/// <param name="groupId">The group id for this consumer.</param>
|
||||
/// <param name="topics">Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.</param>
|
||||
|
@ -79,5 +85,90 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
{
|
||||
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create an input stream that directly pulls messages from a Kafka broker, starting from specific offsets.
|
||||
///
|
||||
/// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
|
||||
/// in each batch duration and processes them without storing them.
|
||||
///
|
||||
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
|
||||
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
|
||||
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
|
||||
/// You can access the offsets used in each batch from the generated RDDs (see
|
||||
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
|
||||
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
|
||||
/// The information on consumed offset can be recovered from the checkpoint.
|
||||
/// See the programming guide for details (constraints, etc.).
|
||||
///
|
||||
/// </summary>
|
||||
/// <param name="ssc">Spark Streaming Context</param>
|
||||
/// <param name="topics">list of topic_name to consume.</param>
|
||||
/// <param name="kafkaParams">
|
||||
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
|
||||
/// with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
|
||||
/// </param>
|
||||
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
|
||||
/// <param name="numPartitions">
|
||||
/// user hint on how many kafka RDD partitions to create instead of aligning with kafka partitions,
|
||||
/// unbalanced kafka partitions and/or under-distributed data will be redistributed evenly across
|
||||
/// a probably larger number of RDD partitions
|
||||
/// If numPartitions = -1, either repartition based on spark.streaming.kafka.maxRatePerTask or do nothing if config not defined
|
||||
/// If numPartitions = 0, repartition using original kafka partition count
|
||||
/// If numPartitions > 0, repartition using this parameter
|
||||
/// </param>
|
||||
/// <returns>A DStream object</returns>
|
||||
public static DStream<KeyValuePair<byte[], byte[]>> CreateDirectStreamWithRepartition(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets, int numPartitions = -1)
|
||||
{
|
||||
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
|
||||
}
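A hedged usage sketch of the method above (the broker address, topic name, partition hint, and the `ssc` variable are placeholders):

``` c#
// Sketch: direct Kafka stream with a partition-count hint (broker, topic and hint are placeholders)
var kafkaParams = new Dictionary<string, string> { { "metadata.broker.list", "broker1:9092" } };
var fromOffsets = new Dictionary<string, long>();
var kafkaStream = KafkaUtils.CreateDirectStreamWithRepartition(
    ssc, new List<string> { "mytopic" }, kafkaParams, fromOffsets, numPartitions: 8);
```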
|
||||
|
||||
/// <summary>
|
||||
/// Create an input stream that directly pulls messages from a Kafka broker, starting from specific offsets.
|
||||
///
|
||||
/// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
|
||||
/// in each batch duration and processes them without storing them.
|
||||
///
|
||||
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
|
||||
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
|
||||
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
|
||||
/// You can access the offsets used in each batch from the generated RDDs (see
|
||||
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
|
||||
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
|
||||
/// The information on consumed offset can be recovered from the checkpoint.
|
||||
/// See the programming guide for details (constraints, etc.).
|
||||
///
|
||||
/// </summary>
|
||||
/// <param name="ssc">Spark Streaming Context</param>
|
||||
/// <param name="topics">list of topic_name to consume.</param>
|
||||
/// <param name="kafkaParams">
|
||||
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
|
||||
/// with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
|
||||
/// </param>
|
||||
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
|
||||
/// <param name="numPartitions">
|
||||
/// user hint on how many kafka RDD partitions to create instead of aligning with kafka partitions,
|
||||
/// unbalanced kafka partitions and/or under-distributed data will be redistributed evenly across
|
||||
/// a probably larger number of RDD partitions
|
||||
/// If numPartitions = -1, either repartition based on spark.streaming.kafka.maxRatePerTask or do nothing if config not defined
|
||||
/// If numPartitions = 0, repartition using original kafka partition count
|
||||
/// If numPartitions > 0, repartition using this parameter
|
||||
/// </param>
|
||||
/// <param name="readFunc">user function to process the kafka data.</param>
|
||||
/// <returns>A DStream object</returns>
|
||||
public static DStream<T> CreateDirectStreamWithRepartitionAndReadFunc<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
|
||||
int numPartitions, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
|
||||
{
|
||||
var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
|
||||
var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
|
||||
var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;
|
||||
var formatter = new BinaryFormatter();
|
||||
var stream = new MemoryStream();
|
||||
formatter.Serialize(stream, func);
|
||||
byte[] readFuncBytes = stream.ToArray();
|
||||
string serializationMode = SerializedMode.Pair.ToString();
|
||||
return new DStream<T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,433 @@
|
|||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
|
||||
|
||||
using System;
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Runtime.Serialization;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Interop;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Proxy.Ipc;
|
||||
using Microsoft.Spark.CSharp.Services;
|
||||
|
||||
namespace Microsoft.Spark.CSharp.Streaming
|
||||
{
|
||||
/// <summary>
|
||||
/// DStream representing the stream of data generated by `mapWithState` operation on a pair DStream.
|
||||
/// Additionally, it also gives access to the stream of state snapshots, that is, the state data of all keys after a batch has updated them.
|
||||
/// </summary>
|
||||
/// <typeparam name="K">Type of the key</typeparam>
|
||||
/// <typeparam name="V">Type of the value</typeparam>
|
||||
/// <typeparam name="S">Type of the state data</typeparam>
|
||||
/// <typeparam name="M">Type of the mapped data</typeparam>
|
||||
[Serializable]
|
||||
public class MapWithStateDStream<K, V, S, M> : DStream<M>
|
||||
{
|
||||
internal DStream<KeyValuePair<K, S>> snapshotsDStream;
|
||||
|
||||
internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<KeyValuePair<K, S>> snapshotsDStream)
|
||||
: base(mappedDataDStream.DStreamProxy, mappedDataDStream.streamingContext)
|
||||
{
|
||||
this.snapshotsDStream = snapshotsDStream;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Return a pair DStream where each RDD is the snapshot of the state of all the keys.
|
||||
/// </summary>
|
||||
public DStream<KeyValuePair<K, S>> StateSnapshots()
|
||||
{
|
||||
return snapshotsDStream;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Class to hold a state instance and the timestamp when the state is updated or created.
|
||||
/// No need to explicitly make this class clonable, since the serialization and deserialization in Worker is already a kind of clone mechanism.
|
||||
/// </summary>
|
||||
/// <typeparam name="S">Type of the state data</typeparam>
|
||||
[Serializable]
|
||||
internal class KeyedState<S>
|
||||
{
|
||||
internal S state;
|
||||
internal long ticks;
|
||||
|
||||
internal KeyedState()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
internal KeyedState(S state, long ticks)
|
||||
{
|
||||
this.state = state;
|
||||
this.ticks = ticks;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Record storing the keyed-state MapWithStateRDD.
|
||||
/// Each record contains a stateMap and a sequence of records returned by the mapping function of MapWithState.
|
||||
/// Note: don't need to explicitly make this class clonable, since the serialization and deserialization in Worker is already a kind of clone.
|
||||
/// </summary>
|
||||
/// <typeparam name="K">Type of the key</typeparam>
|
||||
/// <typeparam name="S">Type of the state data</typeparam>
|
||||
/// <typeparam name="M">Type of the mapped data</typeparam>
|
||||
[Serializable]
|
||||
internal class MapWithStateRDDRecord<K, S, M>
|
||||
{
|
||||
internal Dictionary<K, KeyedState<S>> stateMap = new Dictionary<K, KeyedState<S>>();
|
||||
internal List<M> mappedData = new List<M>();
|
||||
|
||||
public MapWithStateRDDRecord()
|
||||
{
|
||||
}
|
||||
|
||||
public MapWithStateRDDRecord(long t, IEnumerable<KeyValuePair<K, S>> iter)
|
||||
{
|
||||
foreach (var p in iter)
|
||||
{
|
||||
stateMap[p.Key] = new KeyedState<S>(p.Value, t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Helper class to update states for a RDD partition.
|
||||
/// Reference: https://github.com/apache/spark/blob/master/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala
|
||||
/// </summary>
|
||||
/// <typeparam name="K">Type of the key</typeparam>
|
||||
/// <typeparam name="V">Type of the value</typeparam>
|
||||
/// <typeparam name="S">Type of the state data</typeparam>
|
||||
/// <typeparam name="M">Type of the mapped data</typeparam>
|
||||
[Serializable]
|
||||
internal class UpdateStateHelper<K, V, S, M>
|
||||
{
|
||||
[NonSerialized]
|
||||
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(UpdateStateHelper<K, V, S, M>));
|
||||
|
||||
private readonly Func<K, V, State<S>, M> f;
|
||||
private readonly long ticks;
|
||||
private readonly bool removeTimedoutData;
|
||||
private readonly TimeSpan idleDuration;
|
||||
|
||||
internal UpdateStateHelper(Func<K, V, State<S>, M> f, long ticks, bool removeTimedoutData, TimeSpan idleDuration)
|
||||
{
|
||||
this.f = f;
|
||||
this.ticks = ticks;
|
||||
this.removeTimedoutData = removeTimedoutData;
|
||||
this.idleDuration = idleDuration;
|
||||
}
|
||||
|
||||
internal IEnumerable<dynamic> Execute(int pid, IEnumerable<dynamic> iter)
|
||||
{
|
||||
var enumerator = iter.GetEnumerator();
|
||||
var preStateRddRecord = GetStateRecord(enumerator);
|
||||
var stateRddRecord = preStateRddRecord;
|
||||
|
||||
while (enumerator.MoveNext())
|
||||
{
|
||||
KeyValuePair<K, V> kv = enumerator.Current;
|
||||
KeyedState<S> keyedState;
|
||||
State<S> wrappedState = stateRddRecord.stateMap.TryGetValue(kv.Key, out keyedState) ? new State<S>(keyedState.state) : new State<S>(default(S));
|
||||
|
||||
var mappedData = default(M);
|
||||
try
|
||||
{
|
||||
mappedData = f(kv.Key, kv.Value, wrappedState);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.LogException(e);
|
||||
}
|
||||
|
||||
stateRddRecord.mappedData.Add(mappedData);
|
||||
|
||||
if (wrappedState.removed)
|
||||
{
|
||||
stateRddRecord.stateMap.Remove(kv.Key);
|
||||
}
|
||||
else if (wrappedState.updated || wrappedState.defined)
|
||||
{
|
||||
stateRddRecord.stateMap[kv.Key] = new KeyedState<S>(wrappedState.state, ticks);
|
||||
}
|
||||
}
|
||||
|
||||
// Get the timed out state records, call the mapping function on each and collect the data returned
|
||||
if (removeTimedoutData)
|
||||
{
|
||||
long timeoutThresholdInTicks = ticks - idleDuration.Ticks;
|
||||
var toBeRemovedKeys = new List<K>();
|
||||
foreach (KeyValuePair<K, KeyedState<S>> entry in stateRddRecord.stateMap)
|
||||
{
|
||||
if (entry.Value.ticks >= timeoutThresholdInTicks) continue;
|
||||
|
||||
var timingOutstate = new State<S>(entry.Value.state, true);
|
||||
var mappedData = default(M);
|
||||
try
|
||||
{
|
||||
mappedData = f(entry.Key, default(V), timingOutstate);
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
logger.LogException(e);
|
||||
}
|
||||
|
||||
stateRddRecord.mappedData.Add(mappedData);
|
||||
toBeRemovedKeys.Add(entry.Key);
|
||||
}
|
||||
|
||||
foreach (var k in toBeRemovedKeys)
|
||||
{
|
||||
stateRddRecord.stateMap.Remove(k);
|
||||
}
|
||||
}
|
||||
|
||||
return new []{stateRddRecord};
|
||||
}
|
||||
|
||||
internal MapWithStateRDDRecord<K, S, M> GetStateRecord(IEnumerator<dynamic> enumerator)
|
||||
{
|
||||
if (enumerator.MoveNext())
|
||||
{
|
||||
return enumerator.Current;
|
||||
}
|
||||
|
||||
throw new InvalidOperationException("MapWithStateRDDRecord is missing.");
|
||||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class MapWithStateHelper<K, V, S, M>
|
||||
{
|
||||
private static readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
|
||||
private readonly Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc;
|
||||
private readonly StateSpec<K, V, S, M> stateSpec;
|
||||
|
||||
internal MapWithStateHelper(Func<double, RDD<dynamic>, RDD<dynamic>> prevF, StateSpec<K, V, S, M> stateSpec)
|
||||
{
|
||||
prevFunc = prevF;
|
||||
this.stateSpec = stateSpec;
|
||||
}
|
||||
|
||||
internal RDD<dynamic> Execute(double t, RDD<dynamic> stateRDD, RDD<dynamic> valuesRDD)
|
||||
{
|
||||
long ticks = UnixTimeEpoch.AddMilliseconds(t).Ticks;
|
||||
|
||||
if (prevFunc != null)
|
||||
{
|
||||
valuesRDD = prevFunc(t, valuesRDD);
|
||||
}
|
||||
|
||||
var values = valuesRDD.ConvertTo<KeyValuePair<K, V>>().PartitionBy(stateSpec.numPartitions);
|
||||
|
||||
if (stateRDD == null)
|
||||
{
|
||||
if (stateSpec.initialState != null)
|
||||
{
|
||||
if (stateSpec.initialState.sparkContext == null)
|
||||
{
|
||||
stateSpec.initialState.sparkContext = valuesRDD.sparkContext;
|
||||
}
|
||||
var partitionedInitialState = stateSpec.initialState.PartitionBy(stateSpec.numPartitions);
|
||||
stateRDD = partitionedInitialState.MapPartitions(new MapWithStateMapPartitionHelper<K, V, S, M>(ticks).Execute, true).ConvertTo<dynamic>();
|
||||
}
|
||||
else
|
||||
{
|
||||
stateRDD = values.PartitionBy(stateSpec.numPartitions).MapPartitions(new MapWithStateMapPartitionHelper<K, V, S, M>(ticks).ExecuteWithoutInitialState, true).ConvertTo<dynamic>();
|
||||
}
|
||||
}
|
||||
|
||||
bool removeTimedoutData = stateSpec.idleDuration.Ticks != 0 && stateRDD.IsCheckpointed;
|
||||
stateRDD.partitioner = values.partitioner;
|
||||
RDD<dynamic> union = stateRDD.Union(values.ConvertTo<dynamic>());
|
||||
|
||||
return union.MapPartitionsWithIndex(new UpdateStateHelper<K, V, S, M>(stateSpec.mappingFunction, ticks, removeTimedoutData, stateSpec.idleDuration).Execute, true);
|
||||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class MapWithStateMapPartitionHelper<K, V, S, M>
|
||||
{
|
||||
internal long ticks;
|
||||
internal MapWithStateMapPartitionHelper(long ticks)
|
||||
{
|
||||
this.ticks = ticks;
|
||||
}
|
||||
|
||||
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> Execute(IEnumerable<KeyValuePair<K, S>> iter)
|
||||
{
|
||||
return new[] {new MapWithStateRDDRecord<K, S, M>(ticks, iter)};
|
||||
}
|
||||
|
||||
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> ExecuteWithoutInitialState(IEnumerable<KeyValuePair<K, V>> iter)
|
||||
{
|
||||
return new[] { new MapWithStateRDDRecord<K, S, M>() };
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Representing all the specifications of the DStream transformation `mapWithState` operation.
|
||||
/// </summary>
|
||||
/// <typeparam name="K">Type of the key</typeparam>
|
||||
/// <typeparam name="V">Type of the value</typeparam>
|
||||
/// <typeparam name="S">Type of the state data</typeparam>
|
||||
/// <typeparam name="M">Type of the mapped data</typeparam>
|
||||
[Serializable]
|
||||
public class StateSpec<K, V, S, M>
|
||||
{
|
||||
internal Func<K, V, State<S>, M> mappingFunction;
|
||||
internal int numPartitions;
|
||||
internal TimeSpan idleDuration = TimeSpan.FromTicks(0);
|
||||
internal RDD<KeyValuePair<K, S>> initialState = null;
|
||||
|
||||
/// <summary>
|
||||
/// Create a StateSpec for setting all the specifications of the `mapWithState` operation on a pair DStream.
|
||||
/// </summary>
|
||||
/// <param name="mappingFunction">The function applied on every data item to manage the associated state and generate the mapped data</param>
|
||||
public StateSpec(Func<K, V, State<S>, M> mappingFunction)
|
||||
{
|
||||
this.mappingFunction = mappingFunction;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Set the number of partitions by which the state RDDs generated by `mapWithState` will be partitioned.
|
||||
/// Hash partitioning will be used.
|
||||
/// </summary>
|
||||
/// <param name="numPartitions">The number of partitions</param>
|
||||
/// <returns>The new StateSpec object</returns>
|
||||
public StateSpec<K, V, S, M> NumPartitions(int numPartitions)
|
||||
{
|
||||
this.numPartitions = numPartitions;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Set the duration after which the state of an idle key will be removed. A key and its state is
|
||||
/// considered idle if it has not received any data for at least the given duration. The
|
||||
/// mapping function will be called one final time on the idle states that are going to be
|
||||
/// removed; [[org.apache.spark.streaming.State State.isTimingOut()]] set to `true` in that call.
|
||||
/// </summary>
|
||||
/// <param name="idleDuration">The idle time of duration</param>
|
||||
/// <returns>The new StateSpec object</returns>
|
||||
public StateSpec<K, V, S, M> Timeout(TimeSpan idleDuration)
|
||||
{
|
||||
this.idleDuration = idleDuration;
|
||||
return this;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Set the RDD containing the initial states that will be used by mapWithState
|
||||
/// </summary>
|
||||
/// <param name="initialState">The given initial state</param>
|
||||
/// <returns>The new StateSpec object</returns>
|
||||
public StateSpec<K, V, S, M> InitialState(RDD<KeyValuePair<K, S>> initialState)
|
||||
{
|
||||
this.initialState = initialState;
|
||||
return this;
|
||||
}
|
||||
}
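To make the builder above concrete, a hedged sketch of chaining a StateSpec into MapWithState; the identity mapping function, the partition count and the 30-minute timeout are assumptions chosen for illustration, and `pairs` stands for an existing pair DStream:

``` c#
// Hypothetical sketch: pairs is an assumed DStream<KeyValuePair<string, int>>.
var spec = new StateSpec<string, int, int, int>((k, v, state) => v)
    .NumPartitions(4)                    // hash-partition the state RDDs into 4 partitions
    .Timeout(TimeSpan.FromMinutes(30));  // evict keys that receive no data for 30 minutes

MapWithStateDStream<string, int, int, int> mapped = pairs.MapWithState(spec);
// StateSnapshots() exposes the state of all keys after each batch, not just the keys updated in it.
DStream<KeyValuePair<string, int>> snapshots = mapped.StateSnapshots();
```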
|
||||
|
||||
/// <summary>
|
||||
/// class for getting and updating the state in mapping function used in the `mapWithState` operation
|
||||
/// </summary>
|
||||
/// <typeparam name="S">Type of the state</typeparam>
|
||||
[Serializable]
|
||||
public class State<S>
|
||||
{
|
||||
internal S state = default(S);
|
||||
|
||||
[NonSerialized]
|
||||
internal bool defined = false;
|
||||
[NonSerialized]
|
||||
internal bool timingOut = false; // FIXME: set timingOut to true for those timed-out keys
|
||||
[NonSerialized]
|
||||
internal bool updated = false;
|
||||
[NonSerialized]
|
||||
internal bool removed = false;
|
||||
|
||||
internal State(S state, bool timingOut = false)
|
||||
{
|
||||
this.state = state;
|
||||
this.timingOut = timingOut;
|
||||
removed = false;
|
||||
updated = false;
|
||||
|
||||
if (!timingOut)
|
||||
{
|
||||
defined = !ReferenceEquals(null, state);
|
||||
}
|
||||
else
|
||||
{
|
||||
defined = true;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns whether the state already exists
|
||||
/// </summary>
|
||||
/// <returns>true, if the state already exists; otherwise, false.</returns>
|
||||
public bool Exists()
|
||||
{
|
||||
return defined;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Gets the state if it exists, otherwise it will throw ArgumentException.
|
||||
/// </summary>
|
||||
/// <returns>The state</returns>
|
||||
/// <exception cref="ArgumentException">ArgumentException if it does not exist.</exception>
|
||||
public S Get()
|
||||
{
|
||||
if (defined)
|
||||
{
|
||||
return state;
|
||||
}
|
||||
throw new ArgumentException("State is not set");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Updates the state with a new value.
|
||||
/// </summary>
|
||||
/// <param name="newState">The new state</param>
|
||||
/// <exception cref="ArgumentException">ArgumentException if the state already be removed or timing out</exception>
|
||||
public void Update(S newState)
|
||||
{
|
||||
if (removed || timingOut)
|
||||
{
|
||||
throw new ArgumentException("Cannot update the state that is timing out or has been removed.");
|
||||
}
|
||||
state = newState;
|
||||
defined = true;
|
||||
updated = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Removes the state if it exists.
|
||||
/// </summary>
|
||||
/// <exception cref="ArgumentException">ArgumentException if the state already be removed or timing out</exception>
|
||||
public void Remove()
|
||||
{
|
||||
if (removed || timingOut)
|
||||
{
|
||||
throw new ArgumentException("Cannot update the state that is timing out or has already been removed.");
|
||||
}
|
||||
defined = false;
|
||||
updated = false;
|
||||
removed = true;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Returns whether the state is timing out and going to be removed by the system after the current batch.
|
||||
/// </summary>
|
||||
/// <returns>true, if it is timing out; otherwise, false.</returns>
|
||||
public bool IsTimingOut()
|
||||
{
|
||||
return timingOut;
|
||||
}
|
||||
}
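To show the Exists/Get/Update/Remove/IsTimingOut contract in one place, here is a sketch of a mapping function that could be handed to a StateSpec; the running-count and reset-on-negative semantics are assumptions made for the example:

``` c#
// Hypothetical mapping function: keep a running count per key while respecting the State<S> rules above.
static int CountPerKey(string key, int value, State<int> state)
{
    if (state.IsTimingOut())
    {
        // The key has been idle past the configured timeout; Update/Remove would throw here.
        return state.Get();
    }
    if (value < 0 && state.Exists())
    {
        state.Remove();  // a negative value is used as an assumed "reset" signal
        return 0;
    }
    int count = (state.Exists() ? state.Get() : 0) + value;
    state.Update(count); // allowed because the state is neither removed nor timing out
    return count;
}
// Could then be passed to a StateSpec: new StateSpec<string, int, int, int>(CountPerKey)
```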
|
||||
}
|
|
@ -267,17 +267,16 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
int numPartitions = 0,
|
||||
Func<KeyValuePair<K, V>, bool> filterFunc = null)
|
||||
{
|
||||
self.ValidatWindowParam(windowSeconds, slideSeconds);
|
||||
self.ValidateWindowParam(windowSeconds, slideSeconds);
|
||||
|
||||
if (slideSeconds <= 0)
|
||||
slideSeconds = self.SlideDuration;
|
||||
|
||||
// dstream to be transformed by subtracting old RDDs and adding new RDDs based on the window
|
||||
var reduced = self.ReduceByKey(reduceFunc, numPartitions);
|
||||
reduced.Cache();
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = reduced.Piplinable ? (reduced as TransformedDStream<KeyValuePair<K, V>>).func : null;
|
||||
|
||||
var helper = new ReduceByKeyAndWindowHelper<K, V>(reduceFunc, invReduceFunc, numPartitions, filterFunc, prevFunc);
|
||||
var helper = new ReduceByKeyAndWindowHelper<K, V>(reduceFunc, invReduceFunc, numPartitions, filterFunc);
|
||||
// function to reduce the new values that entered the window (e.g., adding new counts)
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> reduceF = helper.Reduce;
|
||||
|
||||
|
@ -292,17 +291,17 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> invReduceF = helper.InvReduce;
|
||||
|
||||
invStream = new MemoryStream();
|
||||
formatter.Serialize(stream, invReduceF);
|
||||
formatter.Serialize(invStream, invReduceF);
|
||||
}
|
||||
|
||||
return new DStream<KeyValuePair<K, V>>(
|
||||
SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpReducedWindowedDStream(
|
||||
reduced.Piplinable ? reduced.prevDStreamProxy : reduced.DStreamProxy,
|
||||
reduced.DStreamProxy,
|
||||
stream.ToArray(),
|
||||
invStream == null ? null : invStream.ToArray(),
|
||||
windowSeconds,
|
||||
slideSeconds,
|
||||
(reduced.Piplinable ? reduced.prevSerializedMode : reduced.serializedMode).ToString()),
|
||||
reduced.serializedMode.ToString()),
|
||||
self.streamingContext
|
||||
);
|
||||
}
|
||||
|
@ -319,13 +318,14 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// State update function - (newValues, oldState) => newState
|
||||
/// If this function returns None, then corresponding state key-value pair will be eliminated.
|
||||
/// </param>
|
||||
/// <param name="initialState">Initial state value of each key</param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
|
||||
Func<IEnumerable<V>, S, S> updateFunc,
|
||||
Func<IEnumerable<V>, S, S> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, numPartitions);
|
||||
return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, initialState, numPartitions);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -337,13 +337,14 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <typeparam name="S"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="updateFunc">State update function - IEnumerable[K, [newValues, oldState]] => IEnumerable[K, newState]</param>
|
||||
/// <param name="initialState">Initial state value of each key</param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
|
||||
Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
|
||||
Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
|
||||
int numPartitions = 0)
|
||||
{
|
||||
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<KeyValuePair<K, Tuple<IEnumerable<V>, S>>, KeyValuePair<K, S>>(updateFunc).Execute, numPartitions);
|
||||
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<KeyValuePair<K, Tuple<IEnumerable<V>, S>>, KeyValuePair<K, S>>(updateFunc).Execute, initialState, numPartitions);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -355,30 +356,68 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <typeparam name="S"></typeparam>
|
||||
/// <param name="self"></param>
|
||||
/// <param name="updateFunc">State update function - (pid, IEnumerable[K, [newValues, oldState]]) => IEnumerable[K, newState]</param>
|
||||
/// <param name="initialState">Initial state value of each key</param>
|
||||
/// <param name="numPartitions"></param>
|
||||
/// <returns></returns>
|
||||
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
|
||||
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
|
||||
int numPartitions = 0)
|
||||
RDD<KeyValuePair<K, S>> initialState = null, int numPartitions = 0)
|
||||
{
|
||||
if (numPartitions <= 0)
|
||||
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;
|
||||
// completes pipelinable dstream by adding the last pipelinable operation
|
||||
// before transforming to CSharpStateDStream so that UpdateStateByKey's
|
||||
// parallel job covers all pipelinable operations before shuffling
|
||||
var ds = self.Transform(new AddShuffleKeyHelper<K, V>(numPartitions).Execute);
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new UpdateStateByKeysHelper<K, V, S>(updateFunc, prevFunc, numPartitions).Execute;
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new UpdateStateByKeysHelper<K, V, S>(updateFunc, initialState, numPartitions).Execute;
|
||||
|
||||
var formatter = new BinaryFormatter();
|
||||
var stream = new MemoryStream();
|
||||
formatter.Serialize(stream, func);
|
||||
|
||||
return new DStream<KeyValuePair<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
|
||||
ds.DStreamProxy,
|
||||
stream.ToArray(),
|
||||
"CSharpStateDStream",
|
||||
ds.serializedMode.ToString(),
|
||||
ds.serializedMode.ToString()),
|
||||
self.streamingContext);
|
||||
}
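As a rough illustration of the new `initialState` parameter, using the simpler overload shown earlier; `pairs` and the seed value are assumptions for the example:

``` c#
// Hypothetical sketch: pairs is an assumed DStream<KeyValuePair<string, int>>;
// seeding the key "error" with 100 before the first batch is illustrative only.
var seeds = ssc.SparkContext.Parallelize(new[] { new KeyValuePair<string, int>("error", 100) });
DStream<KeyValuePair<string, int>> counts = pairs.UpdateStateByKey<string, int, int>(
    (newValues, oldCount) => oldCount + newValues.Count(),  // fold this batch into the previous state
    seeds);                                                  // keys absent from seeds start from default(int)
```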
|
||||
|
||||
/// <summary>
|
||||
/// Return a new "state" DStream where the state for each key is updated by applying
|
||||
/// the given function on the previous state of the key and the new values of the key.
|
||||
/// </summary>
|
||||
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<KeyValuePair<K, V>> self, StateSpec<K, V, S, M> stateSpec)
|
||||
{
|
||||
if (stateSpec.numPartitions <= 0)
|
||||
{
|
||||
stateSpec = stateSpec.NumPartitions(self.streamingContext.SparkContext.DefaultParallelism);
|
||||
}
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;
|
||||
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new MapWithStateHelper<K, V, S, M>(prevFunc, stateSpec).Execute;
|
||||
|
||||
var formatter = new BinaryFormatter();
|
||||
var stream = new MemoryStream();
|
||||
formatter.Serialize(stream, func);
|
||||
|
||||
var mapWithStateDStream = new DStream<MapWithStateRDDRecord<K, S, M>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
|
||||
self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
|
||||
stream.ToArray(),
|
||||
"CSharpStateDStream",
|
||||
self.serializedMode.ToString(),
|
||||
(self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
|
||||
self.streamingContext);
|
||||
|
||||
DStream<M> mappedDataDStream = mapWithStateDStream.FlatMap(r => r.mappedData);
|
||||
DStream<KeyValuePair<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
|
||||
r => r.stateMap.Select(entry => new KeyValuePair<K, S>(entry.Key, entry.Value.state)));
|
||||
|
||||
return new MapWithStateDStream<K, V, S, M>(mappedDataDStream, snapshotsDStream);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -425,6 +464,25 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class AddShuffleKeyHelper<K, V>
|
||||
{
|
||||
private readonly int numPartitions;
|
||||
internal AddShuffleKeyHelper(int numPartitions)
|
||||
{
|
||||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<byte[]> Execute(RDD<KeyValuePair<K, V>> rdd)
|
||||
{
|
||||
var keyed = rdd.MapPartitionsWithIndex(new PairRDDFunctions.AddShuffleKeyHelper<K, V>(numPartitions).Execute, true);
|
||||
keyed.bypassSerializer = true;
|
||||
keyed.rddProxy = keyed.RddProxy;
|
||||
|
||||
return keyed;
|
||||
}
|
||||
}
|
||||
|
||||
[Serializable]
|
||||
internal class MapValuesHelper<K, V, U>
|
||||
{
|
||||
|
@ -509,7 +567,7 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
internal RDD<KeyValuePair<K, Tuple<V, Option<W>>>> Execute<K,V,W>(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
internal RDD<KeyValuePair<K, Tuple<V, Option<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
|
||||
{
|
||||
return l.LeftOuterJoin<K, V, W>(r, numPartitions);
|
||||
}
|
||||
|
@ -552,34 +610,28 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
private readonly Func<V, V, V> invReduceFunc;
|
||||
private readonly int numPartitions;
|
||||
private readonly Func<KeyValuePair<K, V>, bool> filterFunc;
|
||||
private readonly Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc;
|
||||
|
||||
internal ReduceByKeyAndWindowHelper(Func<V, V, V> reduceF,
|
||||
Func<V, V, V> invReduceF,
|
||||
int numPartitions,
|
||||
Func<KeyValuePair<K, V>, bool> filterF,
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevF)
|
||||
Func<KeyValuePair<K, V>, bool> filterF)
|
||||
{
|
||||
reduceFunc = reduceF;
|
||||
invReduceFunc = invReduceF;
|
||||
this.numPartitions = numPartitions;
|
||||
filterFunc = filterF;
|
||||
prevFunc = prevF;
|
||||
}
|
||||
|
||||
internal RDD<dynamic> Reduce(double t, RDD<dynamic> a, RDD<dynamic> b)
|
||||
{
|
||||
if (prevFunc != null)
|
||||
b = prevFunc(t, b);
|
||||
|
||||
var r = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc);
|
||||
b.partitioner = new Partitioner(numPartitions, null);
|
||||
var r = b.ConvertTo<KeyValuePair<K, V>>();
|
||||
if (a != null)
|
||||
{
|
||||
if (prevFunc != null)
|
||||
a = prevFunc(t, a);
|
||||
|
||||
r = a.ConvertTo<KeyValuePair<K, V>>().Union(r).ReduceByKey<K, V>(reduceFunc);
|
||||
a.partitioner = b.partitioner;
|
||||
r = a.ConvertTo<KeyValuePair<K, V>>().Union(r);
|
||||
}
|
||||
r = r.ReduceByKey<K, V>(reduceFunc, numPartitions);
|
||||
if (filterFunc != null)
|
||||
r.Filter(filterFunc);
|
||||
return r.ConvertTo<dynamic>();
|
||||
|
@ -587,13 +639,8 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
|
||||
internal RDD<dynamic> InvReduce(double t, RDD<dynamic> a, RDD<dynamic> b)
|
||||
{
|
||||
if (prevFunc != null)
|
||||
{
|
||||
a = prevFunc(t, a);
|
||||
b = prevFunc(t, b);
|
||||
}
|
||||
|
||||
var rddb = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc);
|
||||
a.partitioner = b.partitioner = new Partitioner(numPartitions, null);
|
||||
var rddb = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc, numPartitions);
|
||||
var rdda = a.ConvertTo<KeyValuePair<K, V>>();
|
||||
var joined = rdda.Join<K, V, V>(rddb, numPartitions);
|
||||
var r = joined.MapValues<K, Tuple<V, V>, V>(kv => kv.Item2 != null ? invReduceFunc(kv.Item1, kv.Item2) : kv.Item1);
|
||||
|
@ -621,14 +668,14 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
internal class UpdateStateByKeysHelper<K, V, S>
|
||||
{
|
||||
private readonly Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> func;
|
||||
private readonly Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc;
|
||||
private readonly RDD<KeyValuePair<K, S>> initialState;
|
||||
private readonly int numPartitions;
|
||||
internal UpdateStateByKeysHelper(
|
||||
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> f,
|
||||
Func<double, RDD<dynamic>, RDD<dynamic>> prevF, int numPartitions)
|
||||
RDD<KeyValuePair<K, S>> initialState, int numPartitions)
|
||||
{
|
||||
func = f;
|
||||
prevFunc = prevF;
|
||||
this.initialState = initialState;
|
||||
this.numPartitions = numPartitions;
|
||||
}
|
||||
|
||||
|
@ -637,10 +684,21 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
RDD<KeyValuePair<K, S>> state = null;
|
||||
RDD<KeyValuePair<K, Tuple<IEnumerable<V>, S>>> g = null;
|
||||
|
||||
if (prevFunc != null)
|
||||
valuesRDD = prevFunc(t, valuesRDD);
|
||||
// call into scala side partitionBy directly since AddShuffleKey already applied
|
||||
var values = new RDD<KeyValuePair<K, V>>(valuesRDD.sparkContext.SparkContextProxy.CreatePairwiseRDD(valuesRDD.rddProxy, numPartitions, 0), valuesRDD.sparkContext);
|
||||
values.partitioner = new Partitioner(numPartitions, null);
|
||||
|
||||
var values = valuesRDD.ConvertTo<KeyValuePair<K, V>>();
|
||||
if (stateRDD == null)
|
||||
{
|
||||
if (initialState != null)
|
||||
{
|
||||
if (initialState.sparkContext == null)
|
||||
{
|
||||
initialState.sparkContext = valuesRDD.sparkContext;
|
||||
}
|
||||
stateRDD = initialState.ConvertTo<dynamic>();
|
||||
}
|
||||
}
|
||||
|
||||
if (stateRDD == null)
|
||||
{
|
||||
|
@ -649,7 +707,6 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
else
|
||||
{
|
||||
state = stateRDD.ConvertTo<KeyValuePair<K, S>>();
|
||||
values = values.PartitionBy(numPartitions);
|
||||
state.partitioner = values.partitioner;
|
||||
g = state.GroupWith(values, numPartitions).MapValues(x => new Tuple<IEnumerable<V>, S>(new List<V>(x.Item2), x.Item1.Count > 0 ? x.Item1[0] : default(S)));
|
||||
}
|
||||
|
|
|
@ -52,10 +52,15 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
this.streamingContextProxy = streamingContextProxy;
|
||||
}
|
||||
|
||||
public StreamingContext(SparkContext sparkContext, long durationMs)
|
||||
/// <summary>
|
||||
/// Initializes a new instance of StreamingContext with an existing SparkContext
|
||||
/// </summary>
|
||||
/// <param name="sparkContext">An existing SparkContext</param>
|
||||
/// <param name="durationSeconds">the time interval at which streaming data will be divided into batches</param>
|
||||
public StreamingContext(SparkContext sparkContext, int durationSeconds)
|
||||
{
|
||||
this.sparkContext = sparkContext;
|
||||
streamingContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateStreamingContext(sparkContext, durationMs);
|
||||
streamingContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateStreamingContext(sparkContext, durationSeconds);
|
||||
}
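A minimal sketch of the revised constructor, which now takes the batch interval in whole seconds; the 2-second interval is just an example value:

``` c#
// Hypothetical usage: 2-second batches on an existing SparkContext.
var ssc = new StreamingContext(sparkContext, 2);
```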
|
||||
|
||||
/// <summary>
|
||||
|
@ -79,11 +84,17 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
return new StreamingContext(SparkCLREnvironment.SparkCLRProxy.CreateStreamingContext(checkpointPath));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Start the execution of the streams.
|
||||
/// </summary>
|
||||
public void Start()
|
||||
{
|
||||
streamingContextProxy.Start();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Stop the execution of the streams.
|
||||
/// </summary>
|
||||
public void Stop()
|
||||
{
|
||||
streamingContextProxy.Stop();
|
||||
|
@ -95,10 +106,10 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// collection. This method allows the developer to specify how long to remember the RDDs (
|
||||
/// if the developer wishes to query old data outside the DStream computation).
|
||||
/// </summary>
|
||||
/// <param name="durationMs">Minimum duration that each DStream should remember its RDDs</param>
|
||||
public void Remember(long durationMs)
|
||||
/// <param name="durationSeconds">Minimum duration that each DStream should remember its RDDs</param>
|
||||
public void Remember(int durationSeconds)
|
||||
{
|
||||
streamingContextProxy.Remember(durationMs);
|
||||
streamingContextProxy.Remember(durationSeconds);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
|
@ -152,10 +163,10 @@ namespace Microsoft.Spark.CSharp.Streaming
|
|||
/// <summary>
|
||||
/// Wait for the execution to stop.
|
||||
/// </summary>
|
||||
/// <param name="timeout">time to wait in seconds</param>
|
||||
public void AwaitTerminationOrTimeout(int timeout)
|
||||
/// <param name="timeout">time to wait in milliseconds</param>
|
||||
public void AwaitTerminationOrTimeout(long timeout)
|
||||
{
|
||||
streamingContextProxy.AwaitTermination(timeout);
|
||||
streamingContextProxy.AwaitTerminationOrTimeout(timeout);
|
||||
}
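The timeout argument is now in milliseconds rather than seconds; a minimal sketch, with the 30-second wait chosen purely for illustration:

``` c#
// Hypothetical usage: wait up to 30 seconds (30000 ms) for the streams to stop.
ssc.AwaitTerminationOrTimeout(30000);
```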
|
||||
|
||||
/// <summary>
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
|
||||
<xsl:template match="/">
|
||||
##<center><H1><font color="darkorchid4">SparkCLR API Documentation<!--xsl:value-of select="$AssemblyName"/--></font></H1></center>
|
||||
##<center><H1><font color="darkorchid4">Mobius API Documentation<!--xsl:value-of select="$AssemblyName"/--></font></H1></center>
|
||||
<xsl:apply-templates select="//member[contains(@name,'T:') and not(contains(@name,'Helper')) and not(contains(@name,'Wrapper')) and not(contains(@name,'Configuration')) and not(contains(@name,'Proxy')) and not(contains(@name,'Interop')) and not(contains(@name,'Services'))]"/>
|
||||
</xsl:template>
|
||||
|
||||
|
|
(Diff not shown because the file is too large)
(File diffs hidden because one or more lines are too long)
|
@ -1,21 +1,14 @@
|
|||
using System;
|
||||
using System.IO;
|
||||
using System.IO;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
using System.Threading.Tasks;
|
||||
using System.Net;
|
||||
using System.Net.Sockets;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Interop;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
|
||||
using NUnit.Framework;
|
||||
using Moq;
|
||||
using AdapterTest.Mocks;
|
||||
using Microsoft.Spark.CSharp.Network;
|
||||
|
||||
namespace AdapterTest
|
||||
{
|
||||
|
@ -27,7 +20,7 @@ namespace AdapterTest
|
|||
public class AccumulatorTest
|
||||
{
|
||||
private SparkContext sc;
|
||||
private Socket sock;
|
||||
private ISocketWrapper sock;
|
||||
|
||||
|
||||
[SetUp]
|
||||
|
@ -38,7 +31,7 @@ namespace AdapterTest
|
|||
|
||||
// get accumulator server port and connect to accumuator server
|
||||
int serverPort = (sc.SparkContextProxy as MockSparkContextProxy).AccumulatorServerPort;
|
||||
sock = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
|
||||
sock = SocketFactory.CreateSocket();
|
||||
sock.Connect(IPAddress.Loopback, serverPort);
|
||||
}
|
||||
|
||||
|
@ -49,29 +42,31 @@ namespace AdapterTest
|
|||
|
||||
try
|
||||
{
|
||||
using (var s = new NetworkStream(sock))
|
||||
using (var s = sock.GetStream())
|
||||
{
|
||||
int numUpdates = 0;
|
||||
SerDe.Write(s, numUpdates);
|
||||
}
|
||||
|
||||
sock.Close();
|
||||
}
|
||||
catch
|
||||
{
|
||||
// do nothing here
|
||||
}
|
||||
finally
|
||||
{
|
||||
sock.Close();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// test when no errors, accumuator server receives data as expected and exit with 0
|
||||
/// test when no errors, accumulator server receives data as expected and exit with 0
|
||||
/// </summary>
|
||||
[Test]
|
||||
public void TestAccumuatorSuccess()
|
||||
{
|
||||
Accumulator<int> accumulator = sc.Accumulator<int>(0);
|
||||
|
||||
using (var s = new NetworkStream(sock))
|
||||
using (var s = sock.GetStream())
|
||||
{
|
||||
// write numUpdates
|
||||
int numUpdates = 1;
|
||||
|
@ -102,7 +97,7 @@ namespace AdapterTest
|
|||
[Test]
|
||||
public void TestUndefinedAccumuator()
|
||||
{
|
||||
using (var s = new NetworkStream(sock))
|
||||
using (var s = sock.GetStream())
|
||||
{
|
||||
// write numUpdates
|
||||
int numUpdates = 1;
|
||||
|
|
|
@ -72,6 +72,7 @@
|
|||
<Compile Include="DataFrameNaFunctionsTest.cs" />
|
||||
<Compile Include="DataFrameReaderTest.cs" />
|
||||
<Compile Include="DataFrameWriterTest.cs" />
|
||||
<Compile Include="EventHubsUtilsTest.cs" />
|
||||
<Compile Include="JsonSerDeTest.cs" />
|
||||
<Compile Include="FunctionsTest.cs" />
|
||||
<Compile Include="Mocks\MockDataFrameReaderProxy.cs" />
|
||||
|
@ -81,6 +82,7 @@
|
|||
<Compile Include="Properties\AssemblyInfo.cs" />
|
||||
<Compile Include="RowTest.cs" />
|
||||
<Compile Include="SerDeTest.cs" />
|
||||
<Compile Include="HiveContextTest.cs" />
|
||||
<Compile Include="StatusTrackerTest.cs" />
|
||||
<Compile Include="TestWithMoqDemo.cs" />
|
||||
<Compile Include="Mocks\MockStructTypeProxy.cs" />
|
||||
|
@ -107,12 +109,17 @@
|
|||
<Compile Include="ComparableRDDTest.cs" />
|
||||
<Compile Include="DoubleRDDTest.cs" />
|
||||
<Compile Include="UserDefinedFunctionTest.cs" />
|
||||
<Compile Include="WeakObjectManagerTest.cs" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Adapter\Microsoft.Spark.CSharp\Adapter.csproj">
|
||||
<Project>{ce999a96-f42b-4e80-b208-709d7f49a77c}</Project>
|
||||
<Name>Adapter</Name>
|
||||
</ProjectReference>
|
||||
<ProjectReference Include="..\Tests.Common\Tests.Common.csproj">
|
||||
<Project>{e4479c4c-e106-4b90-bf0c-319561cea9c4}</Project>
|
||||
<Name>Tests.Common</Name>
|
||||
</ProjectReference>
|
||||
</ItemGroup>
|
||||
<ItemGroup />
|
||||
<ItemGroup>
|
||||
|
|
|
@ -108,7 +108,8 @@ namespace AdapterTest
|
|||
|
||||
// worker side operations
|
||||
Broadcast<int> broadcastVarInWorker = CreateBroadcastVarInWorker(expectedValue, out bid, out dumpPath);
|
||||
Broadcast.broadcastRegistry.Remove(bid);
|
||||
Broadcast bc;
|
||||
Broadcast.broadcastRegistry.TryRemove(bid, out bc);
|
||||
|
||||
// assert
|
||||
Assert.Throws<ArgumentException>(() => { var broadcastValueInWorker = broadcastVarInWorker.Value; });
|
||||
|
|
|
@ -195,6 +195,24 @@ namespace AdapterTest
|
|||
mockColumnProxy.Verify(m => m.BinOp("bitwiseXOR", column2.ColumnProxy), Times.Once);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestColumnGetHashCode()
|
||||
{
|
||||
var column1 = new Column(null);
|
||||
Assert.AreEqual(0, column1.GetHashCode());
|
||||
|
||||
var column2 = new Column(mockColumnProxy.Object);
|
||||
Assert.AreNotEqual(0, column2.GetHashCode());
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestColumnEquals()
|
||||
{
|
||||
var column1 = new Column(mockColumnProxy.Object);
|
||||
var column2 = new Column(mockColumnProxy.Object);
|
||||
Assert.IsTrue(column1.Equals(column2));
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestColumnLike()
|
||||
{
|
||||
|
|
|
@ -4,6 +4,7 @@ using AdapterTest.Mocks;
|
|||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Interop.Ipc;
|
||||
using NUnit.Framework;
|
||||
using System.Linq;
|
||||
|
||||
namespace AdapterTest
|
||||
{
|
||||
|
@ -39,6 +40,12 @@ namespace AdapterTest
|
|||
Assert.AreEqual(2, taken.Length);
|
||||
Assert.AreEqual("brown", taken[0]);
|
||||
Assert.AreEqual("dog", taken[1]);
|
||||
|
||||
taken = words.Distinct().TakeOrdered(2, x => new string(x.ToCharArray().Reverse().ToArray()));
|
||||
Array.Sort(taken, StringComparer.Ordinal);
|
||||
Assert.AreEqual(2, taken.Length);
|
||||
Assert.AreEqual("The", taken[0]);
|
||||
Assert.AreEqual("the", taken[1]);
|
||||
}
|
||||
|
||||
[Test]
|
||||
|
|
|
@ -4,9 +4,13 @@
|
|||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using AdapterTest.Mocks;
|
||||
using Microsoft.Spark.CSharp.Core;
|
||||
using Microsoft.Spark.CSharp.Interop;
|
||||
using Microsoft.Spark.CSharp.Proxy;
|
||||
using Microsoft.Spark.CSharp.Streaming;
|
||||
using Moq;
|
||||
using NUnit.Framework;
|
||||
|
||||
namespace AdapterTest
|
||||
|
@ -17,7 +21,7 @@ namespace AdapterTest
|
|||
[Test]
|
||||
public void TestDStreamMapReduce()
|
||||
{
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1);
|
||||
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
|
||||
|
||||
var lines = ssc.TextFileStream(Path.GetTempPath());
|
||||
|
@ -27,7 +31,8 @@ namespace AdapterTest
|
|||
|
||||
words.Slice(DateTime.MinValue, DateTime.MaxValue);
|
||||
words.Cache();
|
||||
words.Checkpoint(1000);
|
||||
words.Checkpoint(1);
|
||||
words.Window(1, 1);
|
||||
|
||||
words.Count().ForeachRDD((time, rdd) =>
|
||||
{
|
||||
|
@ -78,7 +83,7 @@ namespace AdapterTest
|
|||
[Test]
|
||||
public void TestDStreamTransform()
|
||||
{
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1);
|
||||
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
|
||||
|
||||
var lines = ssc.TextFileStream(Path.GetTempPath());
|
||||
|
@ -134,7 +139,7 @@ namespace AdapterTest
|
|||
[Test]
|
||||
public void TestDStreamJoin()
|
||||
{
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1);
|
||||
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
|
||||
|
||||
var lines = ssc.TextFileStream(Path.GetTempPath());
|
||||
|
@ -241,7 +246,7 @@ namespace AdapterTest
|
|||
[Test]
|
||||
public void TestDStreamUpdateStateByKey()
|
||||
{
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
|
||||
var ssc = new StreamingContext(new SparkContext("", ""), 1);
|
||||
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
|
||||
|
||||
var lines = ssc.TextFileStream(Path.GetTempPath());
|
||||
|
@ -267,8 +272,23 @@ namespace AdapterTest
|
|||
// disable pipelining to UpdateStateByKey, which relies on checkpointing that the mock proxy doesn't support
|
||||
pairs.Cache();
|
||||
|
||||
var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count);
|
||||
var initialStateRdd = ssc.SparkContext.Parallelize(new[] { "AAA" }).Map( w => new KeyValuePair<string, int>("AAA", 22));
|
||||
var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count, initialStateRdd);
|
||||
state.ForeachRDD((time, rdd) =>
|
||||
{
|
||||
var taken = rdd.Collect();
|
||||
Assert.AreEqual(taken.Length, 10);
|
||||
|
||||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
|
||||
}
|
||||
});
|
||||
|
||||
// test when initialStateRdd is not provided
|
||||
var state2 = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count);
|
||||
state2.ForeachRDD((time, rdd) =>
|
||||
{
|
||||
var taken = rdd.Collect();
|
||||
Assert.AreEqual(taken.Length, 9);
|
||||
|
@ -276,9 +296,146 @@ namespace AdapterTest
|
|||
foreach (object record in taken)
|
||||
{
|
||||
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
|
||||
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 24 : 23);
|
||||
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestDStreamMapWithState()
|
||||
{
|
||||
var mapwithStateDStreamProxy = new Mock<IDStreamProxy>();
|
||||
var streamingContextProxy = new Mock<IStreamingContextProxy>();
|
||||
streamingContextProxy.Setup(p =>
|
||||
p.CreateCSharpStateDStream(It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
|
||||
.Returns(mapwithStateDStreamProxy.Object);
|
||||
|
||||
var sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
|
||||
var sparkConfProxy = new Mock<ISparkConfProxy>();
|
||||
|
||||
var sparkClrProxy = new Mock<ISparkCLRProxy>();
|
||||
sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
|
||||
sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
|
||||
sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(sparkContextProxy.Object);
|
||||
sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(sparkConfProxy.Object);
|
||||
|
||||
// reset sparkCLRProxy for after test completes
|
||||
var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;
|
||||
try
|
||||
{
|
||||
SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;
|
||||
|
||||
var sparkConf = new SparkConf(false);
|
||||
var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);
|
||||
|
||||
var dstreamProxy = new Mock<IDStreamProxy>();
|
||||
var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);
|
||||
|
||||
var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
|
||||
var stateDStream = pairDStream.MapWithState(stateSpec);
|
||||
var snapshotDStream = stateDStream.StateSnapshots();
|
||||
|
||||
Assert.IsNotNull(stateDStream);
|
||||
Assert.IsNotNull(snapshotDStream);
|
||||
}
|
||||
finally
|
||||
{
|
||||
SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
|
||||
}
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestDStreamMapWithStateMapWithStateHelper()
|
||||
{
|
||||
// test when initialStateRdd is null
|
||||
var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v).NumPartitions(2).Timeout(TimeSpan.FromSeconds(100));
|
||||
var helper = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec);
|
||||
|
||||
var sparkContextProxy = new Mock<ISparkContextProxy>();
|
||||
var sc = new SparkContext(sparkContextProxy.Object, null);
|
||||
|
||||
var pairwiseRddProxy = new Mock<IRDDProxy>();
|
||||
sparkContextProxy.Setup(p => p.CreatePairwiseRDD(It.IsAny<IRDDProxy>(), It.IsAny<int>(), It.IsAny<long>())).Returns(pairwiseRddProxy.Object);
|
||||
|
||||
var pipelinedRddProxy = new Mock<IRDDProxy>();
|
||||
pipelinedRddProxy.Setup(p => p.Union(It.IsAny<IRDDProxy>())).Returns(new Mock<IRDDProxy>().Object);
|
||||
|
||||
sparkContextProxy.Setup(p =>
|
||||
p.CreateCSharpRdd(It.IsAny<IRDDProxy>(), It.IsAny<byte[]>(), It.IsAny<Dictionary<string, string>>(), It.IsAny<List<string>>(), It.IsAny<bool>(), It.IsAny<List<Broadcast>>(), It.IsAny<List<byte[]>>()))
|
||||
.Returns(pipelinedRddProxy.Object);
|
||||
|
||||
var valueRddProxy = new Mock<IRDDProxy>();
|
||||
var valuesRdd = new RDD<dynamic>(valueRddProxy.Object, sc);
|
||||
|
||||
var resultRdd = helper.Execute(DateTime.UtcNow.Millisecond, null, valuesRdd);
|
||||
|
||||
Assert.IsNotNull(resultRdd);
|
||||
|
||||
// test when initialStateRdd is not null
|
||||
var initialStateRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
|
||||
var stateSpec2 = new StateSpec<string, int, int, int>((k, v, s) => v).InitialState(initialStateRdd).NumPartitions(2);
|
||||
var helper2 = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec2);
|
||||
|
||||
var resultRdd2 = helper2.Execute(DateTime.UtcNow.Millisecond, null, valuesRdd);
|
||||
|
||||
Assert.IsNotNull(resultRdd2);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestDStreamMapWithStateUpdateStateHelper()
|
||||
{
|
||||
var ticks = DateTime.UtcNow.Ticks;
|
||||
var helper = new UpdateStateHelper<string, int, int, int>(
|
||||
(k, v, state) =>
|
||||
{
|
||||
if (v < 0 && state.Exists())
|
||||
{
|
||||
state.Remove();
|
||||
}
|
||||
else if(!state.IsTimingOut())
|
||||
{
|
||||
state.Update(v + state.Get());
|
||||
}
|
||||
|
||||
return v;
|
||||
},
|
||||
ticks, true, TimeSpan.FromSeconds(10));
|
||||
|
||||
var input = new dynamic[4];
|
||||
|
||||
var preStateRddRecord = new MapWithStateRDDRecord<string, int, int>(ticks - TimeSpan.FromSeconds(2).Ticks, new [] { new KeyValuePair<string, int>("1", 1), new KeyValuePair<string, int>("2", 2)});
|
||||
preStateRddRecord.stateMap.Add("expired", new KeyedState<int>(0, ticks - TimeSpan.FromSeconds(60).Ticks));
|
||||
|
||||
input[0] = preStateRddRecord;
|
||||
input[1] = new KeyValuePair<string, int>("1", -1);
|
||||
input[2] = new KeyValuePair<string, int>("2", 2);
|
||||
input[3] = new KeyValuePair<string, int>("3", 3);
|
||||
|
||||
var result = helper.Execute(1, input).GetEnumerator();
|
||||
Assert.IsNotNull(result);
|
||||
Assert.IsTrue(result.MoveNext());
|
||||
|
||||
MapWithStateRDDRecord<string, int, int> stateRddRecord = result.Current;
|
||||
|
||||
Assert.IsNotNull(stateRddRecord);
|
||||
Assert.AreEqual(stateRddRecord.mappedData.Count, 4); // timedout record also appears in return results
|
||||
Assert.AreEqual(stateRddRecord.stateMap.Count, 2);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestConstantInputDStream()
|
||||
{
|
||||
var sc = new SparkContext("", "");
|
||||
var rdd = sc.Parallelize(Enumerable.Range(0, 10), 1);
|
||||
var ssc = new StreamingContext(sc, 1);
|
||||
|
||||
// test when rdd is null
|
||||
Assert.Throws<ArgumentNullException>(() => new ConstantInputDStream<int>(null, ssc));
|
||||
|
||||
var constantInputDStream = new ConstantInputDStream<int>(rdd, ssc);
|
||||
Assert.IsNotNull(constantInputDStream);
|
||||
Assert.AreEqual(ssc, constantInputDStream.streamingContext);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,6 +42,13 @@ namespace AdapterTest
|
|||
public void TestDropWithAny()
|
||||
{
|
||||
// arrange
|
||||
const string columnName = "column1";
|
||||
var mockSchemaProxy = new Mock<IStructTypeProxy>();
|
||||
var mockFieldProxy = new Mock<IStructFieldProxy>();
|
||||
mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
|
||||
mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
|
||||
mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
|
||||
|
||||
var sparkContext = new SparkContext("", "");
|
||||
mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
|
||||
|
||||
|
@ -50,12 +57,21 @@ namespace AdapterTest
|
|||
|
||||
// act
|
||||
var cols = new[] { "col1", "col2" };
|
||||
var df = f.Drop("any", cols);
|
||||
var df1 = f.Drop("any", cols);
|
||||
var df2 = f.Drop();
|
||||
var df3 = f.Drop("any");
|
||||
|
||||
// verify
|
||||
Assert.IsNotNull(df);
|
||||
Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
|
||||
Assert.IsNotNull(df1);
|
||||
Assert.AreEqual(df1.DataFrameProxy, dataFrame.DataFrameProxy);
|
||||
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once);
|
||||
|
||||
Assert.IsNotNull(df2);
|
||||
Assert.AreEqual(df2.DataFrameProxy, dataFrame.DataFrameProxy);
|
||||
|
||||
Assert.IsNotNull(df3);
|
||||
Assert.AreEqual(df3.DataFrameProxy, dataFrame.DataFrameProxy);
|
||||
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
|
||||
}
|
||||
|
||||
[Test]
|
||||
|
@ -106,6 +122,29 @@ namespace AdapterTest
|
|||
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()), Times.Never);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestDropWithMinNonNulls()
|
||||
{
|
||||
const string columnName = "column1";
|
||||
var mockSchemaProxy = new Mock<IStructTypeProxy>();
|
||||
var mockFieldProxy = new Mock<IStructFieldProxy>();
|
||||
mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
|
||||
mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
|
||||
mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
|
||||
|
||||
var sparkContext = new SparkContext("", "");
|
||||
mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
|
||||
|
||||
var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
|
||||
var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
|
||||
|
||||
var df = f.Drop(20);
|
||||
Assert.IsNotNull(df);
|
||||
Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
|
||||
Assert.AreNotSame(dataFrame, df);
|
||||
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { columnName }), Times.Once);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestFill()
|
||||
{
|
||||
|
|
|
@ -44,6 +44,33 @@ namespace AdapterTest
|
|||
SparkCLREnvironment.SparkCLRProxy = new MockSparkCLRProxy();
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestRegisterTempTable()
|
||||
{
|
||||
mockDataFrameProxy.Setup(m => m.RegisterTempTable(It.IsAny<string>()));
|
||||
var dataFrame = new DataFrame(mockDataFrameProxy.Object, null);
|
||||
dataFrame.RegisterTempTable("TestTable");
|
||||
mockDataFrameProxy.Verify(m => m.RegisterTempTable("TestTable"), Times.Once);
|
||||
}
|
||||
|
||||
[Test]
|
||||
public void TestDataFrameCount()
|
||||
{
|
||||
mockDataFrameProxy.Setup(m => m.Count()).Returns(1);
var dataFrame = new DataFrame(mockDataFrameProxy.Object, null);
Assert.AreEqual(1, dataFrame.Count());
mockDataFrameProxy.Verify(m => m.Count(), Times.Once);
}

[Test]
public void TestShow()
{
mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny<int>(), It.IsAny<bool>())).Returns("Show");
var dataFrame = new DataFrame(mockDataFrameProxy.Object, null);
dataFrame.Show();
mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once);
}

[Test]
public void TestDataFrameJoin()
{

@@ -51,10 +78,54 @@ namespace AdapterTest
var dataFrame = sqlContext.Read().Json(@"c:\path\to\input.json");
var dataFrame2 = sqlContext.Read().Json(@"c:\path\to\input2.json");
var joinedDataFrame = dataFrame.Join(dataFrame2, "JoinCol");
var paramValuesToJoinMethod = (joinedDataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference as object[];
var paramValuesToSecondDataFrameJsonFileMethod = ((paramValuesToJoinMethod[0] as MockDataFrameProxy).mockDataFrameReference as object[]);
var paramValuesToJoinMethod = (joinedDataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod = (paramValuesToJoinMethod[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod[0]);
Assert.AreEqual("JoinCol", paramValuesToJoinMethod[1]);

var joinedDataFrame2 = dataFrame.Join(dataFrame2, new[] {"JoinCol1", "JoinCol2"});
var paramValuesToJoinMethod2 = (joinedDataFrame2.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod2 = (paramValuesToJoinMethod2[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod2[0]);
Assert.AreEqual("JoinCol1", (paramValuesToJoinMethod2[1] as string[])[0]);
Assert.AreEqual("JoinCol2", (paramValuesToJoinMethod2[1] as string[])[1]);

var mockColumnProxy = new Mock<IColumnProxy>().Object;
var mockColumn = new Column(mockColumnProxy);
var joinedDataFrame3 = dataFrame.Join(dataFrame2, mockColumn);
var paramValuesToJoinMethod3 = (joinedDataFrame3.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod3 = (paramValuesToJoinMethod3[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod3[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod3[1]);
Assert.AreEqual(JoinType.Inner.Value, paramValuesToJoinMethod3[2]);

var joinedDataFrame4 = dataFrame.Join(dataFrame2, mockColumn, JoinType.Outer);
var paramValuesToJoinMethod4 = (joinedDataFrame4.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod4 = (paramValuesToJoinMethod4[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod4[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod4[1]);
Assert.AreEqual(JoinType.Outer.Value, paramValuesToJoinMethod4[2]);

var joinedDataFrame5 = dataFrame.Join(dataFrame2, mockColumn, JoinType.LeftOuter);
var paramValuesToJoinMethod5 = (joinedDataFrame5.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod5 = (paramValuesToJoinMethod5[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod5[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod5[1]);
Assert.AreEqual(JoinType.LeftOuter.Value, paramValuesToJoinMethod5[2]);

var joinedDataFrame6 = dataFrame.Join(dataFrame2, mockColumn, JoinType.RightOuter);
var paramValuesToJoinMethod6 = (joinedDataFrame6.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod6 = (paramValuesToJoinMethod6[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod6[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod6[1]);
Assert.AreEqual(JoinType.RightOuter.Value, paramValuesToJoinMethod6[2]);

var joinedDataFrame7 = dataFrame.Join(dataFrame2, mockColumn, JoinType.LeftSemi);
var paramValuesToJoinMethod7 = (joinedDataFrame7.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod7 = (paramValuesToJoinMethod7[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod7[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod7[1]);
Assert.AreEqual(JoinType.LeftSemi.Value, paramValuesToJoinMethod7[2]);
}

@@ -187,6 +258,48 @@ namespace AdapterTest
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}

[Test]
public void TestFillNa()
{
// Arrange
const string columnName = "column1";
var mockSchemaProxy = new Mock<IStructTypeProxy>();
var mockFieldProxy = new Mock<IStructFieldProxy>();
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);

// dataframeNaFunctionsProxy
var dataFrameNaFunctionsProxy = new Mock<IDataFrameNaFunctionsProxy>();
dataFrameNaFunctionsProxy.Setup(d => d.Fill(It.IsAny<double>(), It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
dataFrameNaFunctionsProxy.Setup(d => d.Fill(It.IsAny<string>(), It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
dataFrameNaFunctionsProxy.Setup(d => d.Fill(It.IsAny<Dictionary<string, object>>())).Returns(expectedResultDataFrameProxy);

mockDataFrameProxy.Setup(m => m.Na()).Returns(dataFrameNaFunctionsProxy.Object);

mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
var sc = new SparkContext(null);

var dict = new Dictionary<string, object> {{columnName, 1}};

// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame1 = originalDataFrame.FillNa(1);
var actualResultDataFrame2 = originalDataFrame.FillNa("1", new[] {columnName});
var actualResultDataFrame3 = originalDataFrame.FillNa(dict);

// Assert
// assert DropNa of Proxy was invoked with correct parameters
dataFrameNaFunctionsProxy.Verify(m => m.Fill(1, It.Is<string[]>(subset => subset.Length == 1 &&
subset.Contains(columnName))));
dataFrameNaFunctionsProxy.Verify(m => m.Fill("1", It.Is<string[]>(subset => subset.Length == 1 &&
subset.Contains(columnName))));
dataFrameNaFunctionsProxy.Verify(m => m.Fill(dict));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame1.DataFrameProxy);
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame2.DataFrameProxy);
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame3.DataFrameProxy);
}

[Test]
public void TestDropDuplicates()
{

@@ -352,7 +465,7 @@ namespace AdapterTest
}

[Test]
public void TestSort_ColumnNames()
public void TestSort()
{
// Arrange
const string columnName = "column1";

@@ -374,6 +487,28 @@ namespace AdapterTest
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrameProxy.DataFrameProxy);
}

[Test]
public void TestSortWithinPartitions()
{
// Arrange
const string columnName = "column1";
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
var mockColumnProxy = new Mock<IColumnProxy>();
var mockSortedColumnProxy = new Mock<IColumnProxy>();
mockColumnProxy.Setup(m => m.UnaryOp(It.IsAny<string>())).Returns(mockSortedColumnProxy.Object);
mockDataFrameProxy.Setup(m => m.GetColumn(It.IsAny<string>())).Returns(mockColumnProxy.Object);
mockDataFrameProxy.Setup(m => m.SortWithinPartitions(It.IsAny<IColumnProxy[]>())).Returns(expectedResultDataFrameProxy);

var sc = new SparkContext(null);

// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrameProxy = originalDataFrame.SortWithinPartitions(new[] { columnName });

// Assert
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrameProxy.DataFrameProxy);
}

[Test]
public void TestAlias()
{

@@ -544,6 +679,30 @@ namespace AdapterTest
mockDataFrameProxy.Verify(m => m.Repartition(numPartitions), Times.Once());
}

[Test]
public void TestRepartition2()
{
// arrange
mockDataFrameProxy.Setup(m => m.Repartition(It.IsAny<int>(), It.IsAny<IColumnProxy[]>()));

var sc = new SparkContext(null);
var dataFrame = new DataFrame(mockDataFrameProxy.Object, sc);

const int numPartitions = 5;
IColumnProxy mockColumn1Proxy = new Mock<IColumnProxy>().Object;
Column mockColumn = new Column(mockColumn1Proxy);

// act
dataFrame.Repartition(new[] { mockColumn }, numPartitions);
// assert
mockDataFrameProxy.Verify(m => m.Repartition(numPartitions, new[] { mockColumn1Proxy }), Times.Once());

// act
dataFrame.Repartition(new[] { mockColumn });
// assert
mockDataFrameProxy.Verify(m => m.Repartition(new[] { mockColumn1Proxy }), Times.Once());
}

[Test]
public void TestSample()
{

@@ -968,6 +1127,60 @@ namespace AdapterTest
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}

[Test]
public void TestSelect_ColumnName()
{
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.Select(It.IsAny<string>(), It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);

const string column1Name = "colName1";
const string column2Name = "colName2";

// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame = originalDataFrame.Select(column1Name, column2Name);

// Assert
mockDataFrameProxy.Verify(m => m.Select(column1Name, new [] { column2Name } ));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}

[Test]
public void TestSelectExpr()
{
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.SelectExpr(It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);

const string columnExpr = "colB as newName";

// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame = originalDataFrame.SelectExpr(columnExpr);

// Assert
mockDataFrameProxy.Verify(m => m.SelectExpr(new[] { columnExpr }));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}

[Test]
public void TestWhere()
{
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.Filter(It.IsAny<string>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);

const string condition = "Filter Condition";
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame = originalDataFrame.Where(condition);

// Assert
mockDataFrameProxy.Verify(m => m.Filter(condition));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}

[Test]
public void TestWithColumn()
{

@@ -1186,6 +1399,26 @@ namespace AdapterTest

#region GroupedDataTest

[Test]
public void TestAgg()
{
// Arrange
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
var mockGroupedDataProxy = new Mock<IGroupedDataProxy>();
mockDataFrameProxy.Setup(m => m.GroupBy()).Returns(mockGroupedDataProxy.Object);
mockDataFrameProxy.Setup(m => m.Agg(It.IsAny<IGroupedDataProxy>(), It.IsAny<Dictionary<string, string>>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);

var columnNameAggFuncDic = new Dictionary<string, string> {{"name", "count"}};
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResult = originalDataFrame.Agg(columnNameAggFuncDic);

// Assert
mockDataFrameProxy.Verify(m => m.Agg(mockGroupedDataProxy.Object, columnNameAggFuncDic)); // assert Agg was invoked with correct parameters
Assert.AreEqual(expectedResultDataFrameProxy, actualResult.DataFrameProxy);
}

[Test]
public void TestCount()
{

@@ -0,0 +1,39 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using System.Collections.Generic;
using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Streaming;
using Moq;
using NUnit.Framework;

namespace AdapterTest
{
[TestFixture]
public class EventHubsUtilsTest
{
[Test]
public void TestCreateUnionStream()
{
var streamingContextProxy = new Mock<IStreamingContextProxy>();
var mockDstreamProxy = new Mock<IDStreamProxy>().Object;
streamingContextProxy.Setup(
m => m.EventHubsUnionStream(It.IsAny<Dictionary<string, string>>(), It.IsAny<StorageLevelType>()))
.Returns(mockDstreamProxy);

var mockSparkClrProxy = new Mock<ISparkCLRProxy>();
mockSparkClrProxy.Setup(m => m.CreateStreamingContext(It.IsAny<SparkContext>(), It.IsAny<int>()))
.Returns(streamingContextProxy.Object);
SparkCLREnvironment.SparkCLRProxy = mockSparkClrProxy.Object;

var sparkContext = new SparkContext(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy, new SparkConf(new Mock<ISparkConfProxy>().Object));
var streamingContext = new StreamingContext(sparkContext, 123);
var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new Dictionary<string, string>());
Assert.AreEqual(mockDstreamProxy, dstream.DStreamProxy);
}
}
}

@@ -565,25 +565,25 @@ namespace AdapterTest
{
mockSparkContextProxy.Setup(m => m.CreateWindowFunction(It.IsAny<string>()));
Functions.RowNumber();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("rowNumber"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("row_number"), Times.Once);

Functions.DenseRank();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("denseRank"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("dense_rank"), Times.Once);

Functions.Rank();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("rank"), Times.Once);

Functions.CumeDist();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("cumeDist"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("cume_dist"), Times.Once);

Functions.PercentRank();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("percentRank"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("percent_rank"), Times.Once);

Functions.MonotonicallyIncreasingId();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("monotonicallyIncreasingId"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("monotonically_increasing_id"), Times.Once);

Functions.SparkPartitionId();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("sparkPartitionId"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("spark_partition_id"), Times.Once);

Functions.Rand();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("rand"), Times.Once);

@@ -594,6 +594,60 @@ namespace AdapterTest

#endregion

#region udf functions

[Test]
public void TestUdfFunction()
{
var mockUdfProxy = new Mock<IUDFProxy>();
mockUdfProxy.Setup(m => m.Apply(It.IsAny<IColumnProxy[]>()));
mockSparkContextProxy.Setup(m => m.CreateUserDefinedCSharpFunction(It.IsAny<string>(), It.IsAny<byte[]>(), It.IsAny<string>())).Returns(mockUdfProxy.Object);

Functions.Udf(() => 0).Invoke();
mockUdfProxy.Verify(m => m.Apply(new IColumnProxy[] { }), Times.Once);

var column1 = GeneratorColum();
Functions.Udf<int, int>(i => 1).Invoke(column1);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy }), Times.Once);

var column2 = GeneratorColum();
Functions.Udf<int, int, int>( (i1, i2) => 2).Invoke(column1, column2);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy }), Times.Once);

var column3 = GeneratorColum();
Functions.Udf<int, int, int, int>((i1, i2, i3) => 3).Invoke(column1, column2, column3);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy }), Times.Once);

var column4 = GeneratorColum();
Functions.Udf<int, int, int, int, int>((i1, i2, i3, i4) => 4).Invoke(column1, column2, column3, column4);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy }), Times.Once);

var column5 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int>((i1, i2, i3, i4, i5) => 5).Invoke(column1, column2, column3, column4, column5);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy }), Times.Once);

var column6 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6) => 6).Invoke(column1, column2, column3, column4, column5, column6);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy }), Times.Once);

var column7 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7) => 7).Invoke(column1, column2, column3, column4, column5, column6, column7);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy }), Times.Once);

var column8 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7, i8) => 8).Invoke(column1, column2, column3, column4, column5, column6, column7, column8);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy, column8.ColumnProxy }), Times.Once);

var column9 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7, i8, i9) => 9).Invoke(column1, column2, column3, column4, column5, column6, column7, column8, column9);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy, column8.ColumnProxy, column9.ColumnProxy }), Times.Once);

var column10 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7, i8, i9, i10) => 10).Invoke(column1, column2, column3, column4, column5, column6, column7, column8, column9, column10);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy, column8.ColumnProxy, column9.ColumnProxy, column10.ColumnProxy }), Times.Once);
}
#endregion

private Column GeneratorColum()
{
Mock<IColumnProxy> mockColumnProxy = new Mock<IColumnProxy>();

@@ -0,0 +1,67 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.

using System;
using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Sql;
using NUnit.Framework;
using Moq;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using System.Collections.Generic;

namespace AdapterTest
{
/// <summary>
/// Validates interaction between SqlContext and its proxies
/// </summary>
[TestFixture]
public class HiveContextTest
{
private static Mock<ISqlContextProxy> mockSqlContextProxy;

[OneTimeSetUp]
public static void ClassInitialize()
{
mockSqlContextProxy = new Mock<ISqlContextProxy>();
}

[SetUp]
public void TestInitialize()
{
mockSqlContextProxy.Reset();
}

[TearDown]
public void TestCleanUp()
{
// Revert to use Static mock class to prevent blocking other test methods which uses static mock class
SparkCLREnvironment.SparkCLRProxy = new MockSparkCLRProxy();
}

[Test]
public void TestHiveContextConstructor()
{
var hiveContext = new HiveContext(new SparkContext("", ""));
Assert.IsNotNull((hiveContext.SqlContextProxy as MockSqlContextProxy).mockSqlContextReference);
}

[Test]
public void TestHiveContextRefreshTable()
{
// arrange
var mockSparkContextProxy = new Mock<ISparkContextProxy>();
mockSqlContextProxy.Setup(m => m.RefreshTable(It.IsAny<string>()));
var hiveContext = new HiveContext(new SparkContext("", ""), mockSqlContextProxy.Object);

// act
hiveContext.RefreshTable("table");

// assert
mockSqlContextProxy.Verify(m => m.RefreshTable("table"));
}
}
}

@@ -57,7 +57,7 @@ namespace AdapterTest.Mocks
{
}

public void Checkpoint(long intervalMs)
public void Checkpoint(int intervalSeconds)
{
}

@@ -146,12 +146,12 @@ namespace AdapterTest.Mocks

public IDataFrameProxy Join(IDataFrameProxy otherScalaDataFrameReference, string[] joinColumnNames)
{
throw new NotImplementedException();
return new MockDataFrameProxy(new object[] { otherScalaDataFrameReference, joinColumnNames }, SqlContextProxy);
}

public IDataFrameProxy Join(IDataFrameProxy otherScalaDataFrameReference, IColumnProxy scalaColumnReference, string joinType)
{
throw new NotImplementedException();
return new MockDataFrameProxy(new object[] { otherScalaDataFrameReference, scalaColumnReference, joinType }, SqlContextProxy);
}

public bool IsLocal

@@ -329,5 +329,20 @@ namespace AdapterTest.Mocks
{
throw new NotImplementedException();
}

public IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns)
{
throw new NotImplementedException();
}

public IDataFrameProxy Repartition(IColumnProxy[] columns)
{
throw new NotImplementedException();
}

public IDataFrameProxy SortWithinPartitions(IColumnProxy[] columns)
{
throw new NotImplementedException();
}
}
}

@@ -18,6 +18,7 @@ using NUnit.Framework;

namespace AdapterTest.Mocks
{
[Serializable]
internal class MockRddProxy : IRDDProxy
{
internal IEnumerable<dynamic> result;

@@ -64,11 +65,6 @@ namespace AdapterTest.Mocks
return MockSparkContextProxy.RunJob(this);
}

public int PartitionLength()
{
return 1;
}

public void Cache()
{ }

@@ -58,7 +58,7 @@ namespace AdapterTest.Mocks
return false;
}

public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, long durationMs)
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, int durationSeconds)
{
streamingContextProxy = new MockStreamingContextProxy();
return streamingContextProxy;

@@ -3,21 +3,16 @@

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Sockets;
using System.Runtime.CompilerServices;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using Microsoft.Spark.CSharp.Interop.Ipc;
using NUnit.Framework;
using Microsoft.Spark.CSharp.Network;

namespace AdapterTest.Mocks
{

@@ -33,7 +28,7 @@ namespace AdapterTest.Mocks
}

public void AddFile(string filePath)
{}
{ }

public IRDDProxy TextFile(string filePath, int minPartitions)
{

@@ -84,14 +79,14 @@ namespace AdapterTest.Mocks
}
}

public IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions)
public IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions, long partitionFuncId)
{
return javaReferenceInByteArrayRdd;
}

public void SetLogLevel(string logLevel)
{}
{ }

public string Version
{

@@ -204,13 +199,13 @@ namespace AdapterTest.Mocks
return ms.ToArray();
});

TcpListener listener = new TcpListener(IPAddress.Loopback, 0);
listener.Start();
var listener = SocketFactory.CreateSocket();
listener.Listen();

Task.Run(() =>
{
using (Socket socket = listener.AcceptSocket())
using (Stream ns = new NetworkStream(socket))
using (var socket = listener.Accept())
using (var ns = socket.GetStream())
{
foreach (var item in result)
{

@@ -219,7 +214,7 @@ namespace AdapterTest.Mocks
}
}
});
return (listener.LocalEndpoint as IPEndPoint).Port;
return (listener.LocalEndPoint as IPEndPoint).Port;
}

public int RunJob(IRDDProxy rdd, IEnumerable<int> partitions)

@@ -282,6 +277,11 @@ namespace AdapterTest.Mocks
return new MockSqlContextProxy(this);
}

public ISqlContextProxy CreateHiveContext()
{
return new MockSqlContextProxy(this);
}

public IRDDProxy Parallelize(IEnumerable<byte[]> values, int numSlices)
{
return new MockRddProxy(null);

@@ -68,5 +68,80 @@ namespace AdapterTest.Mocks
{
throw new NotImplementedException();
}

public ISqlContextProxy NewSession()
{
throw new NotImplementedException();
}

public string GetConf(string key, string defaultValue)
{
throw new NotImplementedException();
}

public void SetConf(string key, string value)
{
throw new NotImplementedException();
}

public void RegisterDataFrameAsTable(IDataFrameProxy dataFrameProxy, string tableName)
{
throw new NotImplementedException();
}

public void DropTempTable(string tableName)
{
throw new NotImplementedException();
}

public IDataFrameProxy Table(string tableName)
{
throw new NotImplementedException();
}

public IDataFrameProxy Tables()
{
throw new NotImplementedException();
}

public IDataFrameProxy Tables(string databaseName)
{
throw new NotImplementedException();
}

public IEnumerable<string> TableNames()
{
throw new NotImplementedException();
}

public void CacheTable(string tableName)
{
throw new NotImplementedException();
}

public void UncacheTable(string tableName)
{
throw new NotImplementedException();
}

public void ClearCache()
{
throw new NotImplementedException();
}

public IEnumerable<string> TableNames(string databaseName)
{
throw new NotImplementedException();
}

public bool IsCached(string tableName)
{
throw new NotImplementedException();
}

public void RefreshTable(string tableName)
{
throw new NotImplementedException();
}
}
}

@@ -18,20 +18,16 @@ namespace AdapterTest.Mocks
{
private IFormatter formatter = new BinaryFormatter();
public void Start()
{
}
{}

public void Stop()
{
}
{}

public void Remember(long durationMs)
{
}
public void Remember(int durationSeconds)
{}

public void Checkpoint(string directory)
{
}
{}

public IDStreamProxy TextFileStream(string directory)
{

@@ -53,6 +49,12 @@ namespace AdapterTest.Mocks
return new MockDStreamProxy();
}

public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
int numPartitions, byte[] readFunc, string serializationMode)
{
return new MockDStreamProxy();
}

public IDStreamProxy Union(IDStreamProxy firstDStreams, IDStreamProxy[] otherDStreams)
{
return new MockDStreamProxy();

@@ -62,7 +64,7 @@ namespace AdapterTest.Mocks
{
}

public void AwaitTermination(int timeout)
public void AwaitTerminationOrTimeout(long timeout)
{
}

@@ -102,10 +104,24 @@ namespace AdapterTest.Mocks
{
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> f = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>)formatter.Deserialize(new MemoryStream(func));
RDD<dynamic> rdd = f(DateTime.UtcNow.Ticks,
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")),
null,
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")));
return new MockDStreamProxy(rdd.RddProxy);
}

public IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy)
{
return new MockDStreamProxy();
}

public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
{
throw new NotImplementedException();
}

public IDStreamProxy KafkaMetaStream(byte[] metaParams, uint numPartitions)
{
throw new NotImplementedException();
}
}
}

@@ -1,9 +1,8 @@
using System;
using System.Collections.Generic;
using System.IO;
using AdapterTest.Mocks;
using System.Linq;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using NUnit.Framework;

namespace AdapterTest

@@ -155,6 +154,30 @@ namespace AdapterTest
Assert.AreEqual(9, records.Length);
}

[Test]
public void TestPairRddPartitionBy()
{
Func<dynamic, int> partitionFunc = key => 1;
var rddPartitionBy = pairs.PartitionBy(3, partitionFunc);
Assert.AreEqual(new Partitioner(3, partitionFunc), rddPartitionBy.partitioner);
}

[Test]
public void TestPairRddSortByKey()
{
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(true, null, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}

[Test]
public void TestPairRddSortByKey2()
{
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(true, 1, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}

[Test]
public void TestPairRddProxy()
{