Tao Wang 2016-07-09 18:08:17 +08:00
Parents 5ec73d419b e75eb189e0
Commit 470bb411fe
285 changed files with 17783 additions and 2917 deletions

.gitignore (vendored)

@ -30,6 +30,8 @@
scala/dependency-reduced-pom.xml
build/runtime/
build/tools/
build/examples/
build/dependencies/
*.log
lib/


@ -6,9 +6,9 @@ before_install:
- sudo apt-get install xsltproc
- nuget install NUnit.Runners -Version 3.0.0 -OutputDirectory testrunner
# install maven 3.3.3
- wget http://archive.apache.org/dist/maven/maven-3/3.3.3/binaries/apache-maven-3.3.3-bin.tar.gz
- tar zxf apache-maven-3.3.3-bin.tar.gz && rm apache-maven-3.3.3-bin.tar.gz
- export M2_HOME="$PWD/apache-maven-3.3.3"
- wget http://archive.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz
- tar zxf apache-maven-3.3.9-bin.tar.gz && rm apache-maven-3.3.9-bin.tar.gz
- export M2_HOME="$PWD/apache-maven-3.3.9"
- export M2="$M2_HOME/bin"
- export PATH="$M2:$PATH"
- hash -r

README.md

@ -1,6 +1,7 @@
<h1><img src='/logo/spark-clr-clear-500x200.png' width='200px' alt='SparkCLR logo' /></h1>
<img src='logo/mobius-star-200.png' width='125px' alt='Mobius logo' />
# Mobius: C# API for Spark
[SparkCLR](https://github.com/Microsoft/SparkCLR) (pronounced Sparkler) adds C# language binding to [Apache Spark](https://spark.apache.org/), enabling the implementation of Spark driver code and data processing operations in C#.
[Mobius](https://github.com/Microsoft/Mobius) adds C# language binding to [Apache Spark](https://spark.apache.org/), enabling the implementation of Spark driver code and data processing operations in C#.
For example, the word count sample in Apache Spark can be implemented in C# as follows:
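The word-count listing itself falls outside this diff hunk. As a minimal sketch of what such a program looks like with the Mobius RDD API (the context setup and the input path are assumptions; the method names follow the samples elsewhere in this README):

``` c#
// Illustrative word count (assumes an existing SparkContext named sparkContext)
var lines = sparkContext.TextFile(@"hdfs:///path/to/input.txt");

var wordCounts = lines
    .FlatMap(line => line.Split(' '))                            // split each line into words
    .Map(word => new KeyValuePair<string, int>(word.Trim(), 1))  // pair each word with a count of 1
    .ReduceByKey((x, y) => x + y);                                // sum the counts per word

foreach (var wordCount in wordCounts.Collect())
    Console.WriteLine("{0}: {1}", wordCount.Key, wordCount.Value);
```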
@ -49,64 +50,108 @@ maxLatencyByDcDataFrame.ShowSchema();
maxLatencyByDcDataFrame.Show();
```
Refer to [SparkCLR\csharp\Samples](csharp/Samples) directory and [sample usage](csharp/Samples/Microsoft.Spark.CSharp/samplesusage.md) for complete samples.
A simple Spark Streaming application that processes messages from Kafka using C# may be implemented using the following code:
``` c#
StreamingContext sparkStreamingContext = StreamingContext.GetOrCreate(checkpointPath, () =>
{
var ssc = new StreamingContext(sparkContext, slideDurationInMillis);
ssc.Checkpoint(checkpointPath);
var stream = KafkaUtils.CreateDirectStream(ssc, topicList, kafkaParams, perTopicPartitionKafkaOffsets);
//message format: [timestamp],[loglevel],[logmessage]
var countByLogLevelAndTime = stream
.Map(kvp => Encoding.UTF8.GetString(kvp.Value))
.Filter(line => line.Contains(","))
.Map(line => line.Split(','))
.Map(columns => new KeyValuePair<string, int>(
string.Format("{0},{1}", columns[0], columns[1]), 1))
.ReduceByKeyAndWindow((x, y) => x + y, (x, y) => x - y,
windowDurationInSecs, slideDurationInSecs, 3)
.Map(logLevelCountPair => string.Format("{0},{1}",
logLevelCountPair.Key, logLevelCountPair.Value));
countByLogLevelAndTime.ForeachRDD(countByLogLevel =>
{
foreach (var logCount in countByLogLevel.Collect())
Console.WriteLine(logCount);
});
return ssc;
});
sparkStreamingContext.Start();
sparkStreamingContext.AwaitTermination();
```
Refer to [Mobius\csharp\Samples](csharp/Samples) directory and [sample usage](csharp/Samples/Microsoft.Spark.CSharp/samplesusage.md) for complete samples.
## API Documentation
Refer to [SparkCLR C# API documentation](csharp/Adapter/documentation/SparkCLR_API_Documentation.md) for the list of Spark's data processing operations supported in SparkCLR.
Refer to [Mobius C# API documentation](csharp/Adapter/documentation/Mobius_API_Documentation.md) for the list of Spark's data processing operations supported in Mobius.
## API Usage
SparkCLR API usage samples are available at:
Mobius API usage samples are available at:
* [Samples project](csharp/Samples/Microsoft.Spark.CSharp/) which uses a comprehensive set of SparkCLR APIs to implement samples that are also used for functional validation of APIs
* [Examples folder](./examples) which contains standalone [C# projects](/notes/running-mobius-app.md#running-mobius-examples-in-local-mode) that can be used as templates to start developing Mobius applications
* [Examples folder](./examples) which contains standalone SparkCLR projects that can be used as templates to start developing SparkCLR applications
* Performance test scenarios implemented in [C#](csharp/Perf/Microsoft.Spark.CSharp) and [Scala](scala/perf) for side by side comparison of Spark driver code
* [Samples project](csharp/Samples/Microsoft.Spark.CSharp/) which uses a comprehensive set of Mobius APIs to implement samples that are also used for functional validation of APIs
* Mobius performance test scenarios implemented in [C#](csharp/Perf/Microsoft.Spark.CSharp) and [Scala](scala/perf) for side by side comparison of Spark driver code
## Documents
Refer to the [docs folder](docs) for design overview and other info on SparkCLR
Refer to the [docs folder](docs) for design overview and other info on Mobius
## Build Status
|Ubuntu 14.04.3 LTS |Windows |Unit test coverage |
|-------------------|:------:|:-----------------:|
|[![Build status](https://travis-ci.org/Microsoft/SparkCLR.svg?branch=master)](https://travis-ci.org/Microsoft/SparkCLR) |[![Build status](https://ci.appveyor.com/api/projects/status/lflkua81gg0swv6i/branch/master?svg=true)](https://ci.appveyor.com/project/SparkCLR/sparkclr/branch/master) |[![codecov.io](https://codecov.io/github/Microsoft/SparkCLR/coverage.svg?branch=master)](https://codecov.io/github/Microsoft/SparkCLR?branch=master) |
|[![Build status](https://travis-ci.org/Microsoft/Mobius.svg?branch=master)](https://travis-ci.org/Microsoft/Mobius) |[![Build status](https://ci.appveyor.com/api/projects/status/lflkua81gg0swv6i/branch/master?svg=true)](https://ci.appveyor.com/project/SparkCLR/sparkclr/branch/master) |[![codecov.io](https://codecov.io/github/Microsoft/Mobius/coverage.svg?branch=master)](https://codecov.io/github/Microsoft/Mobius?branch=master)
## Getting Started
| |Windows |Linux |
|---|:------:|:----:|
|Build & run unit tests |[windows-instructions.md](notes/windows-instructions.md#building-sparkclr) |[linux-instructions.md](notes/linux-instructions.md#building-sparkclr) |
|Run samples (functional tests) in local mode |[windows-instructions.md](notes/windows-instructions.md#running-samples) |[linux-instructions.md](notes/linux-instructions.md#running-samples) |
|Run standalone examples in Client mode |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#client-mode) |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#client-mode) |
|Run standalone examples in Cluster mode |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#cluster-mode) |[Quick-start wiki](https://github.com/Microsoft/SparkCLR/wiki/Quick-Start#cluster-mode) |
|---|:------|:----|
|Build & run unit tests |[Build in Windows](notes/windows-instructions.md#building-mobius) |[Build in Linux](notes/linux-instructions.md#building-mobius-in-linux) |
|Run samples (functional tests) in local mode |[Samples in Windows](notes/windows-instructions.md#running-samples) |[Samples in Linux](notes/linux-instructions.md#running-mobius-samples-in-linux) |
|Run examples in local mode |[Examples in Windows](/notes/running-mobius-app.md#running-mobius-examples-in-local-mode) |[Examples in Linux](notes/linux-instructions.md#running-mobius-examples-in-linux) |
|Run Mobius app |<ul><li>[Standalone cluster](notes/running-mobius-app.md#standalone-cluster)</li><li>[YARN cluster](notes/running-mobius-app.md#yarn-cluster)</li></ul> |<ul><li>[Linux cluster](notes/linux-instructions.md#running-mobius-applications-in-linux)</li><li>[Azure HDInsight Spark Cluster](/notes/linux-instructions.md#mobius-in-azure-hdinsight-spark-cluster)</li><li>[AWS EMR Spark Cluster](/notes/linux-instructions.md#mobius-in-amazon-web-services-emr-spark-cluster)</li></ul> |
Note: Refer to [linux-compatibility.md](notes/linux-compatibility.md) for using SparkCLR with Spark on Linux
### Useful Links
* [Configuration parameters in Mobius](/notes/configuration-mobius.md)
* [Troubleshoot errors in Mobius](/notes/troubleshooting-mobius.md)
* [Debug Mobius apps](/notes/running-mobius-app.md#debug-mode)
## Supported Spark Versions
SparkCLR is built and tested with [Spark 1.4.1](https://github.com/Microsoft/SparkCLR/tree/branch-1.4), [Spark 1.5.2](https://github.com/Microsoft/SparkCLR/tree/branch-1.5) and [Spark 1.6.0](https://github.com/Microsoft/SparkCLR/tree/master).
Mobius is built and tested with Apache Spark [1.4.1](https://github.com/Microsoft/Mobius/tree/branch-1.4), [1.5.2](https://github.com/Microsoft/Mobius/tree/branch-1.5) and [1.6.*](https://github.com/Microsoft/Mobius/tree/branch-1.6).
## Releases
Mobius releases are available at https://github.com/Microsoft/Mobius/releases. References needed to build a C# Spark driver application using Mobius are also available on [NuGet](https://www.nuget.org/packages/Microsoft.SparkCLR)
[![NuGet Badge](https://buildstats.info/nuget/Microsoft.SparkCLR)](https://www.nuget.org/packages/Microsoft.SparkCLR)
Refer to [mobius-release-info.md](notes/mobius-release-info.md) for the details on versioning policy and the contents of the release.
## License
[![License](https://img.shields.io/badge/license-MIT-blue.svg?style=plastic)](https://github.com/Microsoft/SparkCLR/blob/master/LICENSE)
[![License](https://img.shields.io/badge/license-MIT-blue.svg?style=plastic)](https://github.com/Microsoft/Mobius/blob/master/LICENSE)
SparkCLR is licensed under the MIT license. See [LICENSE](LICENSE) file for full license information.
Mobius is licensed under the MIT license. See [LICENSE](LICENSE) file for full license information.
## Community
[![Issue Stats](http://issuestats.com/github/Microsoft/SparkCLR/badge/pr)](http://issuestats.com/github/Microsoft/SparkCLR)
[![Issue Stats](http://issuestats.com/github/Microsoft/SparkCLR/badge/issue)](http://issuestats.com/github/Microsoft/SparkCLR)
[![Join the chat at https://gitter.im/Microsoft/SparkCLR](https://badges.gitter.im/Microsoft/SparkCLR.svg)](https://gitter.im/Microsoft/SparkCLR?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Issue Stats](http://issuestats.com/github/Microsoft/Mobius/badge/pr)](http://issuestats.com/github/Microsoft/Mobius)
[![Issue Stats](http://issuestats.com/github/Microsoft/Mobius/badge/issue)](http://issuestats.com/github/Microsoft/Mobius)
[![Join the chat at https://gitter.im/Microsoft/Mobius](https://badges.gitter.im/Microsoft/Mobius.svg)](https://gitter.im/Microsoft/Mobius?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![Twitter](https://img.shields.io/twitter/url/http/twitter.com/MobiusForSpark.svg?style=social)](https://twitter.com/intent/tweet?text=@MobiusForSpark [your tweet] via @GitHub)
* SparkCLR project welcomes contributions. To contribute, follow the instructions in [CONTRIBUTING.md](notes/CONTRIBUTING.md)
* Mobius project welcomes contributions. To contribute, follow the instructions in [CONTRIBUTING.md](notes/CONTRIBUTING.md)
* Options to ask your question to the SparkCLR community
* create issue on [GitHub](https://github.com/Microsoft/SparkCLR)
* Options for asking questions to the Mobius community
* create issue on [GitHub](https://github.com/Microsoft/Mobius)
* create post with "sparkclr" tag in [Stack Overflow](https://stackoverflow.com/questions/tagged/sparkclr)
* send email to sparkclr-user@googlegroups.com
* join chat at [SparkCLR room in Gitter](https://gitter.im/Microsoft/SparkCLR)
* join chat at [Mobius room in Gitter](https://gitter.im/Microsoft/Mobius)
* tweet [@MobiusForSpark](http://twitter.com/MobiusForSpark)
## Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.


@ -1,4 +1,4 @@
version: 1.6.0-SNAPSHOT.{build}
version: 1.6.2-SNAPSHOT.{build}
environment:
securefile:
@ -26,6 +26,7 @@ build_script:
- cmd: SET MAVEN_OPTS=-XX:MaxPermSize=2g -Xmx4g
- cmd: SET JAVA_OPTS=-XX:MaxPermSize=2g -Xmx4g
- cmd: SET MVN_QUIET=--quiet
- ps: if($env:APPVEYOR_REPO_TAG -eq $FALSE) {.\dev\scripts\SetSparkClrNugetPackageVersion.ps1 -nuspecDir .\csharp -version $env:APPVEYOR_BUILD_VERSION}
- cmd: cd .\build
- cmd: .\Build.cmd
- cmd: cd ..
@ -45,7 +46,7 @@ after_test:
- pip install codecov
- codecov -f "SparkCLRCodeCoverage.xml"
- cmd: cd .\build\localmode
- cmd: .\Runsamples.cmd --validate
- cmd: if not defined ProjectVersion (.\Runsamples.cmd --validate)
- cmd: cd ..\..
- cmd: dir csharp\Microsoft*.nupkg
- cmd: dir scala\target\spark-clr*.jar
@ -66,7 +67,15 @@ deploy:
- provider: NuGet # deploy to NuGet.org
api_key:
secure: TscZXMoOxrMfjR2TvGBns6b+IILWvo0WJpxikoGsMCqEcMj/x41Le1j8dHTCJMjI
skip_symbols: false # push symbols to SymbolSource.org
skip_symbols: false
artifact: /Microsoft.*\.nupkg/
on:
appveyor_repo_tag: true # deploy on tag push only
- provider: NuGet # deploy to MyGet.org
server: https://www.myget.org/F/mobiusforspark/api/v2/package
api_key:
secure: 1c6+PZ3zOdIgIy2y8rf1g/NfbcfoxwNcymNBUr1591mD3Ull2X32Qvw2QyCXqFka
skip_symbols: false
symbol_server: https://www.myget.org/F/mobiusforspark/api/v2/package
artifact: /Microsoft.*\.nupkg/


@ -38,14 +38,14 @@ if NOT EXIST "%SPARKCLR_HOME%\lib" mkdir "%SPARKCLR_HOME%\lib"
if NOT EXIST "%SPARKCLR_HOME%\samples" mkdir "%SPARKCLR_HOME%\samples"
if NOT EXIST "%SPARKCLR_HOME%\repl" mkdir "%SPARKCLR_HOME%\repl"
@echo Assemble SparkCLR Scala components
@echo Assemble Mobius Scala components
pushd "%CMDHOME%\..\scala"
@rem clean the target directory first
call mvn.cmd %MVN_QUIET% clean
@rem
@rem Note: Shade-plugin helps create an uber-package to simplify SparkCLR job submission;
@rem Note: Shade-plugin helps create an uber-package to simplify running samples during CI;
@rem however, it breaks debug mode in IntelliJ. So enable shade-plugin
@rem only in build.cmd to create the uber-package.
@rem
@ -80,19 +80,22 @@ IF "%APPVEYOR_REPO_TAG%" == "true" (goto :sign)
:mvndone
set MVN_ERRORLEVEL=%ERRORLEVEL%
@rem
@rem After uber package is created, restore Pom.xml
@rem
copy /y %temp%\pom.xml.original pom.xml
if %ERRORLEVEL% NEQ 0 (
@echo Build SparkCLR Scala components failed, stop building.
if %MVN_ERRORLEVEL% NEQ 0 (
@echo Build Mobius Scala components failed, stop building.
popd
goto :eof
)
@echo SparkCLR Scala binaries
copy /y target\spark*.jar "%SPARKCLR_HOME%\lib\"
@echo Mobius Scala binaries
@rem copy non-uber jar to runtime\lib folder
powershell -f ..\build\copyjar.ps1
popd
@REM Any .jar files under the lib directory will be copied to the staged runtime lib tree.
@ -105,7 +108,7 @@ if EXIST "%CMDHOME%\lib" (
)
:buildCSharp
@echo Assemble SparkCLR C# components
@echo Assemble Mobius C# components
pushd "%CMDHOME%\..\csharp"
@rem clean any possible previous build first
@ -113,20 +116,20 @@ call Clean.cmd
call Build.cmd
if %ERRORLEVEL% NEQ 0 (
@echo Build SparkCLR C# components failed, stop building.
@echo Build Mobius C# components failed, stop building.
popd
goto :eof
)
@echo SparkCLR C# binaries
@echo Mobius C# binaries
copy /y Worker\Microsoft.Spark.CSharp\bin\Release\* "%SPARKCLR_HOME%\bin\"
@echo SparkCLR C# Samples binaries
@echo Mobius C# Samples binaries
@rem need to include CSharpWorker.exe.config in samples folder
copy /y Worker\Microsoft.Spark.CSharp\bin\Release\* "%SPARKCLR_HOME%\samples\"
copy /y Samples\Microsoft.Spark.CSharp\bin\Release\* "%SPARKCLR_HOME%\samples\"
@echo SparkCLR Samples data
@echo Mobius Samples data
copy /y Samples\Microsoft.Spark.CSharp\data\* "%SPARKCLR_HOME%\data\"
@echo SparkCLR REPL
@ -135,7 +138,59 @@ copy /y Repl\bin\Release\* "%SPARKCLR_HOME%\repl\"
popd
@echo Assemble SparkCLR script components
@echo Download external dependencies
pushd "%CMDHOME%"
set DEPENDENCIES_DIR=dependencies
if NOT EXIST "%DEPENDENCIES_DIR%" mkdir %DEPENDENCIES_DIR%
set DEPENDENCIES_HOME=%CMDHOME%\%DEPENDENCIES_DIR%
powershell -f localmode\downloadtools.ps1 dependencies
@echo Assemble dependencies
xcopy /e /y "%DEPENDENCIES_HOME%" "%SPARKCLR_HOME%\dependencies\"
@echo Assemble Mobius examples
pushd "%CMDHOME%\..\examples"
call Clean.cmd
call Build.cmd
if %ERRORLEVEL% NEQ 0 (
@echo Build Mobius .NET examples failed, stop building.
popd
goto :eof
)
set EXAMPLES_HOME=%CMDHOME%\examples
@echo set EXAMPLES_HOME=%EXAMPLES_HOME%
if EXIST "%EXAMPLES_HOME%" (
@echo Delete existing %EXAMPLES_HOME% ...
rd /s /q "%EXAMPLES_HOME%"
)
if NOT EXIST "%EXAMPLES_HOME%" mkdir "%EXAMPLES_HOME%"
set CURRDIR=%cd%
for /f "delims=" %%D in ('dir /b /s bin') do call :copyexamples %%D
goto :copyscripts
:copyexamples
set EXAMPLES_SRC=%1
set EXAMPLES_TARGET=%1
call set EXAMPLES_TARGET=%%EXAMPLES_TARGET:%CURRDIR%=%EXAMPLES_HOME%%%
set EXAMPLES_TARGET=%EXAMPLES_TARGET:~0,-4%
@echo mkdir %EXAMPLES_TARGET%
if NOT EXIST "%EXAMPLES_TARGET%" mkdir "%EXAMPLES_TARGET%"
REM 1. Copy dependencies from %SPARKCLR_HOME%\bin to use latest Mobius binaries
xcopy /y "%SPARKCLR_HOME%\bin\*" "%EXAMPLES_TARGET%"
REM 2. copy Examples APPs
xcopy /d /y "%EXAMPLES_SRC%\Release" "%EXAMPLES_TARGET%"
goto :eof
:copyscripts
popd
@echo Assemble Mobius script components
xcopy /e /y "%CMDHOME%\..\scripts" "%SPARKCLR_HOME%\scripts\"
@echo Make distribution
@ -148,10 +203,21 @@ if not defined ProjectVersion (
)
set SPARKCLR_NAME=spark-clr_2.10-%ProjectVersion%
@echo "%SPARKCLR_HOME%
@rem copy samples to top-level folder before zipping
@echo move /Y "%SPARKCLR_HOME%\samples" "%CMDHOME%"
move /Y %SPARKCLR_HOME%\samples %CMDHOME%
@echo move /Y "%SPARKCLR_HOME%\data" "%CMDHOME%\samples"
move /Y %SPARKCLR_HOME%\data %CMDHOME%\samples
@rem copy release info
@echo copy /Y "%CMDHOME%\..\notes\mobius-release-info.md"
copy /Y "%CMDHOME%\..\notes\mobius-release-info.md"
@rem Create the zip file
@echo 7z a .\target\%SPARKCLR_NAME%.zip runtime localmode ..\examples
7z a .\target\%SPARKCLR_NAME%.zip runtime localmode ..\examples
@echo 7z a .\target\%SPARKCLR_NAME%.zip runtime examples samples mobius-release-info.md
7z a .\target\%SPARKCLR_NAME%.zip runtime examples samples mobius-release-info.md
:distdone
popd


@ -18,14 +18,14 @@ fi
[ ! -d "$SPARKCLR_HOME/samples" ] && mkdir "$SPARKCLR_HOME/samples"
[ ! -d "$SPARKCLR_HOME/scripts" ] && mkdir "$SPARKCLR_HOME/scripts"
echo "Assemble SparkCLR Scala components"
echo "Assemble Mobius Scala components"
pushd "$FWDIR/../scala"
# clean the target directory first
mvn clean -q
[ $? -ne 0 ] && exit 1
# Note: Shade-plugin helps create an uber-package to simplify SparkCLR job submission;
# Note: Shade-plugin helps create an uber-package to simplify running samples during CI;
# however, it breaks debug mode in IntelliJ. So enable shade-plugin
# only in build.cmd to create the uber-package.
# build the package
@ -33,11 +33,11 @@ mvn package -Puber-jar -q
if [ $? -ne 0 ]
then
echo "Build SparkCLR Scala components failed, stop building."
echo "Build Mobius Scala components failed, stop building."
popd
exit 1
fi
echo "SparkCLR Scala binaries"
echo "Mobius Scala binaries"
cp target/spark*.jar "$SPARKCLR_HOME/lib/"
popd
@ -52,7 +52,7 @@ then
done
fi
echo "Assemble SparkCLR C# components"
echo "Assemble Mobius C# components"
pushd "$FWDIR/../csharp"
# clean any possible previous build first
@ -62,23 +62,37 @@ pushd "$FWDIR/../csharp"
if [ $? -ne 0 ];
then
echo "Build SparkCLR C# components failed, stop building."
echo "Build Mobius C# components failed, stop building."
popd
exit 1
fi
echo "SparkCLR C# binaries"
echo "Mobius C# binaries"
cp Worker/Microsoft.Spark.CSharp/bin/Release/* "$SPARKCLR_HOME/bin/"
echo "SparkCLR C# Samples binaries"
echo "Mobius C# Samples binaries"
# need to include CSharpWorker.exe.config in samples folder
cp Worker/Microsoft.Spark.CSharp/bin/Release/* "$SPARKCLR_HOME/samples/"
cp Samples/Microsoft.Spark.CSharp/bin/Release/* "$SPARKCLR_HOME/samples/"
echo "SparkCLR Samples data"
echo "Mobius Samples data"
cp Samples/Microsoft.Spark.CSharp/data/* "$SPARKCLR_HOME/data/"
popd
echo "Assemble SparkCLR script components"
echo "Assemble Mobius examples"
pushd "$FWDIR/../examples"
# clean any possible previous build first
./clean.sh
./build.sh
if [ $? -ne 0 ];
then
echo "Build Mobius .NET Examples failed, stop building."
popd
exit 1
fi
popd
echo "Assemble Mobius script components"
pushd "$FWDIR/../scripts"
cp *.sh "$SPARKCLR_HOME/scripts/"
popd

build/copyjar.ps1 Executable file

@ -0,0 +1,43 @@
function Get-ScriptDirectory
{
$Invocation = (Get-Variable MyInvocation -Scope 1).Value;
if($Invocation.PSScriptRoot)
{
$Invocation.PSScriptRoot;
}
Elseif($Invocation.MyCommand.Path)
{
Split-Path $Invocation.MyCommand.Path
}
else
{
$Invocation.InvocationName.Substring(0,$Invocation.InvocationName.LastIndexOf("\"));
}
}
#
# main body of the script
# this script copies the jar file for the release
#
$scriptDir= Get-ScriptDirectory
write-output "Script directory: $scriptDir"
$destDir = "$scriptDir\runtime\lib"
write-output "Directory to which file will be copied to: $destDir"
pushd ..\scala\target
#non-uber jar has original prefix - this is the file that needs to be copied over
$files = get-childitem $configPath -filter "original*"
#only one file in $files
foreach($file in $files)
{
$sourceFileName = $file.Name
write-output "Name of the file to copy: $sourceFileName"
}
$pattern = "^original-(.*)"
$destFileName = $sourceFileName -replace $pattern,'$1'
write-output "Name of the file to use in destination: $destFileName"
copy-item $sourceFileName -Destination "$destDir\$destFileName"
popd


@ -28,7 +28,7 @@ if "%1" == "" (
@rem TODO: this check will fail if "--exe" only exists in the argument list of user application.
if "%1" == "--exe" (
set USER_EXE="true"
@echo [RunSamples.cmd] Run user specified application, instead of SparkCLR samples.
@echo [RunSamples.cmd] Run user specified application, instead of Mobius samples.
)
rem - shift the arguments and examine %1 again
@ -47,16 +47,14 @@ if "%precheck%" == "bad" (goto :EOF)
@rem
@rem setup Hadoop and Spark versions
@rem
set SPARK_VERSION=1.6.0
set SPARK_VERSION=1.6.2
set HADOOP_VERSION=2.6
@echo [RunSamples.cmd] SPARK_VERSION=%SPARK_VERSION%, HADOOP_VERSION=%HADOOP_VERSION%
@rem Windows 7/8/10 may not allow powershell scripts by default
powershell -Command Set-ExecutionPolicy -Scope CurrentUser -ExecutionPolicy Unrestricted
@rem download runtime dependencies
pushd "%CMDHOME%"
powershell -f downloadtools.ps1 run !VERBOSE!
@rem Windows 7/8/10 may not allow powershell scripts by default
powershell -ExecutionPolicy Unrestricted -File downloadtools.ps1 run !VERBOSE!
@echo [RunSamples.cmd] UpdateRuntime.cmd
type ..\tools\updateruntime.cmd
call ..\tools\updateruntime.cmd
@ -67,7 +65,12 @@ if defined ProjectVersion (
)
set SPARKCLR_HOME=%CMDHOME%\..\runtime
set SPARKCSV_JARS=
@rem spark-csv package and its dependency are required for DataFrame operations in Mobius
set SPARKCLR_EXT_PATH=%SPARKCLR_HOME%\dependencies
set SPARKCSV_JAR1PATH=%SPARKCLR_EXT_PATH%\spark-csv_2.10-1.3.0.jar
set SPARKCSV_JAR2PATH=%SPARKCLR_EXT_PATH%\commons-csv-1.1.jar
set SPARKCLR_EXT_JARS=%SPARKCSV_JAR1PATH%,%SPARKCSV_JAR2PATH%
@rem RunSamples.cmd runs in local mode and should not load Hadoop or Yarn cluster config. Disable Hadoop/Yarn conf dir.
set HADOOP_CONF_DIR=
@ -81,7 +84,7 @@ set SAMPLES_DIR=%SPARKCLR_HOME%\samples
@echo [RunSamples.cmd] JAVA_HOME=%JAVA_HOME%
@echo [RunSamples.cmd] SPARK_HOME=%SPARK_HOME%
@echo [RunSamples.cmd] SPARKCLR_HOME=%SPARKCLR_HOME%
@echo [RunSamples.cmd] SPARKCSV_JARS=%SPARKCSV_JARS%
@echo [RunSamples.cmd] SPARKCLR_EXT_JARS=%SPARKCLR_EXT_JARS%
pushd "%SPARKCLR_HOME%\scripts"
@echo [RunSamples.cmd] CWD=
@ -93,8 +96,8 @@ if !INTERACTIVE! == "interactive" (
call sparkclr-repl.cmd
) else (
if "!USER_EXE!"=="" (
@echo [RunSamples.cmd] call sparkclr-submit.cmd --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
call sparkclr-submit.cmd --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
@echo [RunSamples.cmd] call sparkclr-submit.cmd --jars %SPARKCLR_EXT_JARS% --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
call sparkclr-submit.cmd --jars %SPARKCLR_EXT_JARS% --exe SparkCLRSamples.exe %SAMPLES_DIR% spark.local.dir %TEMP_DIR% sparkclr.sampledata.loc %SPARKCLR_HOME%\data %*
) else (
@echo [RunSamples.cmd] call sparkclr-submit.cmd %*
call sparkclr-submit.cmd %*


@ -12,7 +12,7 @@ if ($stage.ToLower() -eq "run")
$hadoopVersion = if ($envValue -eq $null) { "2.6" } else { $envValue }
$envValue = [Environment]::GetEnvironmentVariable("SPARK_VERSION")
$sparkVersion = if ($envValue -eq $null) { "1.6.0" } else { $envValue }
$sparkVersion = if ($envValue -eq $null) { "1.6.1" } else { $envValue }
Write-Output "[downloadtools] hadoopVersion=$hadoopVersion, sparkVersion=$sparkVersion"
}
@ -65,6 +65,7 @@ function Replace-VariableInFile($variable, $value, $sourceFile, $targetFile)
function Download-File($url, $output)
{
$output = [System.IO.Path]::GetFullPath($output)
if (test-path $output)
{
Write-Output "[downloadtools.Download-File] $output exists. No need to download."
@ -83,7 +84,13 @@ function Download-File($url, $output)
-SourceIdentifier Web.DownloadProgressChanged -Action {
$Global:Data = $event
}
$wc.DownloadFileAsync($url, $output)
$tmpOutput = $output + ".tmp.download"
if (test-path $tmpOutput) {
Remove-Item $tmpOutput
}
$wc.DownloadFileAsync($url, $tmpOutput)
While (!($Global:downloadComplete)) {
$percent = $Global:Data.SourceArgs.ProgressPercentage
$totalBytes = $Global:Data.SourceArgs.TotalBytesToReceive
@ -92,6 +99,8 @@ function Download-File($url, $output)
Write-Progress -Activity ("Downloading file to {0} from {1}" -f $output,$url) -Status ("{0} bytes \ {1} bytes" -f $receivedBytes,$totalBytes) -PercentComplete $percent
}
}
Rename-Item $tmpOutput -NewName $output
Write-Progress -Activity ("Downloading file to {0} from {1}" -f $output, $url) -Status ("{0} bytes \ {1} bytes" -f $receivedBytes,$totalBytes) -Completed
Unregister-Event -SourceIdentifier Web.DownloadFileCompleted
Unregister-Event -SourceIdentifier Web.DownloadProgressChanged
@ -213,11 +222,11 @@ function Download-BuildTools
}
# Apache Maven
$mvnVer = "apache-maven-3.3.3"
$mvnVer = "apache-maven-3.3.9"
$mvnCmd = "$toolsDir\$mvnVer\bin\mvn.cmd"
if (!(test-path $mvnCmd))
{
$url = "http://www.us.apache.org/dist/maven/maven-3/3.3.3/binaries/$mvnVer-bin.tar.gz"
$url = "http://www.us.apache.org/dist/maven/maven-3/3.3.9/binaries/$mvnVer-bin.tar.gz"
$output="$toolsDir\$mvnVer-bin.tar.gz"
Download-File $url $output
Untar-File $output $toolsDir
@ -257,7 +266,7 @@ function Download-BuildTools
$gpgZip = "$toolsDir\gpg4win-vanilla-2.3.0.zip"
if (!(test-path $gpgZip))
{
$url = "https://github.com/SparkCLR/build/blob/master/tools/gpg4win-vanilla-2.3.0.zip?raw=true"
$url = "https://github.com/MobiusForSpark/build/blob/master/tools/gpg4win-vanilla-2.3.0.zip?raw=true"
$output=$gpgZip
Download-File $url $output
# Unzip-File $output $toolsDir
@ -280,6 +289,39 @@ function Download-BuildTools
$envStream.close()
}
function Download-ExternalDependencies
{
$readMeStream = [System.IO.StreamWriter] "$scriptDir\..\dependencies\ReadMe.txt"
$readMeStream.WriteLine("The files in this folder are dependencies of Mobius Project")
$readMeStream.WriteLine("Refer to the following download locations for details on the jars like POM file, license etc.")
$readMeStream.WriteLine("")
$readMeStream.WriteLine("------------ Dependencies for CSV parsing in Mobius DataFrame API -----------------------------")
# Downloading spark-csv package and its dependency. These packages are required for DataFrame operations in Mobius
$url = "http://search.maven.org/remotecontent?filepath=com/databricks/spark-csv_2.10/1.3.0/spark-csv_2.10-1.3.0.jar"
$output="$scriptDir\..\dependencies\spark-csv_2.10-1.3.0.jar"
Download-File $url $output
Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies"
$readMeStream.WriteLine("$url")
$url = "http://search.maven.org/remotecontent?filepath=org/apache/commons/commons-csv/1.1/commons-csv-1.1.jar"
$output="$scriptDir\..\dependencies\commons-csv-1.1.jar"
Download-File $url $output
Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies"
$readMeStream.WriteLine("$url")
$readMeStream.WriteLine("")
$readMeStream.WriteLine("------------ Dependencies for Kafka-based processing in Mobius Streaming API -----------------------------")
$url = "http://search.maven.org/remotecontent?filepath=org/apache/spark/spark-streaming-kafka-assembly_2.10/1.6.1/spark-streaming-kafka-assembly_2.10-1.6.1.jar"
$output="$scriptDir\..\dependencies\spark-streaming-kafka-assembly_2.10-1.6.1.jar"
Download-File $url $output
Write-Output "[downloadtools.Download-ExternalDependencies] Downloading $url to $scriptDir\..\dependencies"
$readMeStream.WriteLine("$url")
$readMeStream.close()
return
}
function Download-RuntimeDependencies
{
# Create a cmd file to update environment variable
@ -340,7 +382,7 @@ function Download-RuntimeDependencies
$winutilsExe = "$winutilsBin\winutils.exe"
if (!(test-path $winutilsExe))
{
$url = "http://public-repo-1.hortonworks.com/hdp-win-alpha/winutils.exe"
$url = "https://github.com/MobiusForSpark/winutils/blob/master/hadoop-2.6.0/bin/winutils.exe?raw=true"
$output=$winutilsExe
Download-File $url $output
}
@ -480,8 +522,8 @@ function Print-Usage
Write-Output ''
Write-Output ' This script takes one input parameter ("stage"), which can be either [build | run].'
Write-Output ''
Write-Output ' Build: Download tools required in building SparkCLR;'
Write-Output ' Run: Download Apache Spark and related binaries, required to run SparkCLR samples locally.'
Write-Output ' Build: Download tools required in building Mobius;'
Write-Output ' Run: Download Apache Spark and related binaries, required to run Mobius samples locally.'
Write-Output ''
Write-Output '====================================================================================================='
}
@ -513,6 +555,10 @@ elseif ($stage.ToLower() -eq "run")
{
Download-RuntimeDependencies
}
elseif ($stage.ToLower() -eq "dependencies")
{
Download-ExternalDependencies
}
else
{
Print-Usage


@ -6,7 +6,7 @@ if not exist "%JAVA_HOME%\bin\java.exe" (
@echo.
@echo ============================================================================================
@echo.
@echo WARNING!!! %~nx0 detected JAVA_HOME is not set properly. SparkCLR requires JDK 7u85 and above,
@echo WARNING!!! %~nx0 detected JAVA_HOME is not set properly. Mobius requires JDK 7u85 and above,
@echo or JDK 8u60 and above. You can either download OpenJDK available at
@echo http://www.azul.com/downloads/zulu/zulu-windows/, or use Oracle JDK.
@echo.
@ -33,7 +33,7 @@ goto :eof
@echo ============================================================================================
@echo.
@echo WARNING!!! %~nx0 detected version of Visual Studio in current command prompt as %version%.
@echo SparkCLR %~nx0 requires "Developer Command Prompt for VS2013" and above, or
@echo Mobius %~nx0 requires "Developer Command Prompt for VS2013" and above, or
@echo "MSBuild Command Prompt for VS2015" and above.
@echo.
@echo ============================================================================================


@ -11,7 +11,7 @@ do
done
# setup Hadoop and Spark versions
export SPARK_VERSION=1.6.0
export SPARK_VERSION=1.6.2
export HADOOP_VERSION=2.6
echo "[run-samples.sh] SPARK_VERSION=$SPARK_VERSION, HADOOP_VERSION=$HADOOP_VERSION"
@ -27,18 +27,6 @@ if [ ! -d "$SPARK_HOME" ];
then
wget "http://www.us.apache.org/dist/spark/spark-$SPARK_VERSION/$SPARK.tgz" -O "$TOOLS_DIR/$SPARK.tgz"
tar xfz "$TOOLS_DIR/$SPARK.tgz" -C "$TOOLS_DIR"
# hack: use a customized spark
# TODO: fix the C# Worker
export SPARK_SRC="$TOOLS_DIR/spark-$SPARK_VERSION"
wget "http://www.us.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION.tgz" -O "$SPARK_SRC.tgz"
tar xfz "$SPARK_SRC.tgz" -C "$TOOLS_DIR"
pushd "$SPARK_SRC"
sed -i "s/val useDaemon = /val useDaemon = false \/\//g" "core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala"
build/mvn -Pyarn -Phadoop-$HADOOP_VERSION -DskipTests package 2>&1 | grep warn
[ $? -ne 0 ] && exit 1
cp assembly/target/scala-2.10/spark-assembly*hadoop*.jar "$SPARK_HOME/lib/"
popd
fi
export PATH="$SPARK_HOME/bin:$PATH"


@ -1,6 +1,9 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project root for full license information.
#
# This script takes in "dir" and "target" parameters, zips all files under dir to the target file
#
Param([string]$dir, [string]$target)
function Get-ScriptDirectory


@ -53,12 +53,7 @@
<Reference Include="System" />
<Reference Include="System.Configuration" />
<Reference Include="System.Core" />
<Reference Include="System.Runtime.Serialization" />
<Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Data" />
<Reference Include="System.Xml" />
</ItemGroup>
<ItemGroup>
<Compile Include="Configuration\ConfigurationService.cs" />
@ -66,6 +61,7 @@
<Compile Include="Core\Accumulator.cs" />
<Compile Include="Core\Broadcast.cs" />
<Compile Include="Core\Option.cs" />
<Compile Include="Core\Partitioner.cs" />
<Compile Include="Core\RDDCollector.cs" />
<Compile Include="Core\DoubleRDDFunctions.cs" />
<Compile Include="Core\IRDDCollector.cs" />
@ -80,12 +76,17 @@
<Compile Include="Core\StatusTracker.cs" />
<Compile Include="Core\StorageLevel.cs" />
<Compile Include="Interop\Ipc\JsonSerDe.cs" />
<Compile Include="Interop\Ipc\JvmBridgeUtils.cs" />
<Compile Include="Interop\Ipc\WeakObjectManager.cs" />
<Compile Include="Interop\SparkCLREnvironment.cs" />
<Compile Include="Interop\Ipc\IJvmBridge.cs" />
<Compile Include="Interop\Ipc\JvmBridge.cs" />
<Compile Include="Interop\Ipc\JvmObjectReference.cs" />
<Compile Include="Interop\Ipc\PayloadHelper.cs" />
<Compile Include="Interop\Ipc\SerDe.cs" />
<Compile Include="Network\DefaultSocketWrapper.cs" />
<Compile Include="Network\ISocketWrapper.cs" />
<Compile Include="Network\SocketFactory.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Proxy\IDataFrameNaFunctionsProxy.cs" />
<Compile Include="Proxy\IDataFrameProxy.cs" />
@ -122,6 +123,7 @@
<Compile Include="Sql\DataFrameNaFunctions.cs" />
<Compile Include="Sql\DataFrameReader.cs" />
<Compile Include="Sql\DataFrameWriter.cs" />
<Compile Include="Sql\HiveContext.cs" />
<Compile Include="Sql\PythonSerDe.cs" />
<Compile Include="Sql\RowConstructor.cs" />
<Compile Include="Sql\Row.cs" />
@ -130,8 +132,11 @@
<Compile Include="Sql\SqlContext.cs" />
<Compile Include="Sql\Types.cs" />
<Compile Include="Sql\UserDefinedFunction.cs" />
<Compile Include="Streaming\ConstantInputDStream.cs" />
<Compile Include="Streaming\DStream.cs" />
<Compile Include="Streaming\EventHubsUtils.cs" />
<Compile Include="Streaming\Kafka.cs" />
<Compile Include="Streaming\MapWithStateDStream.cs" />
<Compile Include="Streaming\PairDStreamFunctions.cs" />
<Compile Include="Streaming\StreamingContext.cs" />
<Compile Include="Streaming\TransformedDStream.cs" />
@ -157,13 +162,7 @@
</Target>
-->
<Target Name="AfterBuild">
<XslTransformation
XslInputPath="..\documentation\DocFormatter.xsl"
XmlInputPaths="..\documentation\Microsoft.Spark.CSharp.Adapter.Doc.XML"
OutputPaths="..\documentation\SparkCLR_API_Documentation.md"
Condition="'$(OS)' == 'Windows_NT'" />
<Exec
Command="xsltproc -o ../documentation/SparkCLR_API_Documentation.md ../documentation/DocFormatter.xsl ../documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML"
Condition="'$(OS)' != 'Windows_NT'" />
<XslTransformation XslInputPath="..\documentation\DocFormatter.xsl" XmlInputPaths="..\documentation\Microsoft.Spark.CSharp.Adapter.Doc.XML" OutputPaths="..\documentation\Mobius_API_Documentation.md" Condition="'$(OS)' == 'Windows_NT'" />
<Exec Command="xsltproc -o ../documentation/Mobius_API_Documentation.md ../documentation/DocFormatter.xsl ../documentation/Microsoft.Spark.CSharp.Adapter.Doc.XML" Condition="'$(OS)' != 'Windows_NT'" />
</Target>
</Project>


@ -2,15 +2,11 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Configuration
@ -91,7 +87,7 @@ namespace Microsoft.Spark.CSharp.Configuration
{
protected readonly AppSettingsSection appSettings;
protected readonly string sparkCLRHome = Environment.GetEnvironmentVariable(SPARKCLR_HOME); //set by sparkclr-submit.cmd
protected readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRConfiguration));
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRConfiguration));
internal SparkCLRConfiguration(System.Configuration.Configuration configuration)
{
@ -109,14 +105,36 @@ namespace Microsoft.Spark.CSharp.Configuration
throw new Exception("Environment variable " + CSHARPBACKEND_PORT + " not set");
}
logger.LogInfo("CSharpBackend successfully read from environment variable " + CSHARPBACKEND_PORT);
logger.LogInfo("CSharpBackend successfully read from environment variable {0}", CSHARPBACKEND_PORT);
return portNo;
}
private string workerPath;
/// <summary>
/// The path of the CSharp external backend worker process.
/// </summary>
internal virtual string GetCSharpWorkerExePath()
{
// SparkCLR jar and driver, worker & dependencies are shipped using Spark file server.
// These files are available in the Spark execution directory on the executor node.
if (workerPath != null) return workerPath; // Return cached value
var workerPathConfig = appSettings.Settings[CSharpWorkerPathSettingKey];
if (workerPathConfig == null)
{
workerPath = GetCSharpProcFileName();
}
else
{
// Explicit path for the CSharpWorker.exe was listed in App.config
workerPath = workerPathConfig.Value;
logger.LogDebug("Using CSharpWorkerPath value from App.config : {0}", workerPath);
}
return workerPath;
}
internal virtual string GetCSharpProcFileName()
{
return ProcFileName;
}
@ -124,50 +142,33 @@ namespace Microsoft.Spark.CSharp.Configuration
/// <summary>
/// Configuration for SparkCLR jobs in ** Local ** mode
/// Needs some investigation to find out why Local mode behaves
/// differently from standalone cluster mode for the configuration values
/// overridden here
/// </summary>
private class SparkCLRLocalConfiguration : SparkCLRConfiguration
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRLocalConfiguration));
internal SparkCLRLocalConfiguration(System.Configuration.Configuration configuration)
: base(configuration)
{ }
private string workerPath;
internal override string GetCSharpWorkerExePath()
internal override string GetCSharpProcFileName()
{
// SparkCLR jar and driver, worker & dependencies are shipped using Spark file server.
// These files are available in the Spark executing directory at executor node.
if (workerPath != null) return workerPath; // Return cached value
KeyValueConfigurationElement workerPathConfig = appSettings.Settings[CSharpWorkerPathSettingKey];
if (workerPathConfig == null)
{
// Path for the CSharpWorker.exe was not specified in App.config
// Try to work out where location relative to this class.
// Construct path based on well-known file name + directory this class was loaded from.
string procDir = Path.GetDirectoryName(GetType().Assembly.Location);
workerPath = Path.Combine(procDir, ProcFileName);
logger.LogDebug("Using synthesized value for CSharpWorkerPath : " + workerPath);
}
else
{
// Explicit path for the CSharpWorker.exe was listed in App.config
workerPath = workerPathConfig.Value;
logger.LogDebug("Using CSharpWorkerPath value from App.config : " + workerPath);
}
return workerPath;
// Path for the CSharpWorker.exe was not specified in App.config
// Try to work out its location relative to this class.
// Construct path based on well-known file name + directory this class was loaded from.
string procDir = Path.GetDirectoryName(GetType().Assembly.Location);
var procFilePath = Path.Combine(procDir, ProcFileName);
logger.LogDebug("Using SparkCLR Adapter dll path to construct CSharpWorkerPath : {0}", procFilePath);
return procFilePath;
}
}
/// <summary>
/// Configuration mode for debug mode
/// This configuration exists only to make SparkCLR development & debugging easier
/// This configuration exists only to make SparkCLR development and debugging easier
/// </summary>
private class SparkCLRDebugConfiguration : SparkCLRLocalConfiguration
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkCLRDebugConfiguration));
internal SparkCLRDebugConfiguration(System.Configuration.Configuration configuration)
: base(configuration)
{}
@ -192,9 +193,14 @@ namespace Microsoft.Spark.CSharp.Configuration
KeyValueConfigurationElement workerPathConfig = appSettings.Settings[CSharpWorkerPathSettingKey];
if (workerPathConfig != null)
{
logger.LogInfo("Worker path read from setting {0} in app config", CSharpWorkerPathSettingKey);
return workerPathConfig.Value;
}
return GetSparkCLRArtifactsPath("bin", ProcFileName);
var path = GetSparkCLRArtifactsPath("bin", ProcFileName);
logger.LogInfo("Worker path {0} constructed using {1} environment variable", path, SPARKCLR_HOME);
return path;
}
private string GetSparkCLRArtifactsPath(string sparkCLRSubFolderName, string fileName)
@ -209,14 +215,31 @@ namespace Microsoft.Spark.CSharp.Configuration
}
}
/// <summary>
/// The running mode used by Configuration Service
/// </summary>
public enum RunMode
{
/// <summary>
/// Unknown running mode
/// </summary>
UNKNOWN,
DEBUG, //not a Spark mode but exists for dev debugging purpose
/// <summary>
/// Debug mode; not a Spark mode but exists for development and debugging purposes
/// </summary>
DEBUG,
/// <summary>
/// Indicates the service is running in local mode
/// </summary>
LOCAL,
/// <summary>
/// Indicates the service is running in cluster mode
/// </summary>
CLUSTER,
YARN,
//following are not currently supported
MESOS
/// <summary>
/// Indicates the service is running in YARN mode
/// </summary>
YARN
//MESOS //not currently supported
}
}


@ -12,6 +12,7 @@ using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Network;
using Microsoft.Spark.CSharp.Services;
[assembly: InternalsVisibleTo("CSharpWorker")]
@ -35,10 +36,26 @@ namespace Microsoft.Spark.CSharp.Core
{
internal static Dictionary<int, Accumulator> accumulatorRegistry = new Dictionary<int, Accumulator>();
[ThreadStatic] // thread safety is needed when running in the C# worker
internal static Dictionary<int, Accumulator> threadLocalAccumulatorRegistry = new Dictionary<int, Accumulator>();
/// <summary>
/// The identity of the accumulator
/// </summary>
protected int accumulatorId;
/// <summary>
/// Indicates whether the accumulator is on driver side.
/// When deserialized on worker side, isDriver is false by default.
/// </summary>
[NonSerialized]
protected bool deserialized = true;
protected bool isDriver = false;
}
/// <summary>
/// A generic version of <see cref="Accumulator"/> where the element type is specified by the driver program.
/// </summary>
/// <typeparam name="T">The type of element in the accumulator.</typeparam>
[Serializable]
public class Accumulator<T> : Accumulator
{
@ -46,20 +63,42 @@ namespace Microsoft.Spark.CSharp.Core
internal T value;
private readonly AccumulatorParam<T> accumulatorParam = new AccumulatorParam<T>();
/// <summary>
/// Initializes a new instance of the Accumulator class with a specified identity and a value.
/// </summary>
/// <param name="accumulatorId">The Identity of the accumulator</param>
/// <param name="value">The value of the accumulator</param>
public Accumulator(int accumulatorId, T value)
{
this.accumulatorId = accumulatorId;
this.value = value;
deserialized = false;
isDriver = true;
accumulatorRegistry[accumulatorId] = this;
}
[OnDeserialized()]
internal void OnDeserializedMethod(System.Runtime.Serialization.StreamingContext context)
{
if (threadLocalAccumulatorRegistry == null)
{
threadLocalAccumulatorRegistry = new Dictionary<int, Accumulator>();
}
if (!threadLocalAccumulatorRegistry.ContainsKey(accumulatorId))
{
threadLocalAccumulatorRegistry[accumulatorId] = this;
}
}
/// <summary>
/// Gets or sets the value of the accumulator; only usable in driver program
/// </summary>
/// <exception cref="ArgumentException"></exception>
public T Value
{
// Get the accumulator's value; only usable in driver program
get
{
if (deserialized)
if (!isDriver)
{
throw new ArgumentException("Accumulator.value cannot be accessed inside tasks");
}
@ -68,7 +107,7 @@ namespace Microsoft.Spark.CSharp.Core
// Sets the accumulator's value; only usable in driver program
set
{
if (deserialized)
if (!isDriver)
{
throw new ArgumentException("Accumulator.value cannot be accessed inside tasks");
}
@ -94,14 +133,14 @@ namespace Microsoft.Spark.CSharp.Core
/// <returns></returns>
public static Accumulator<T> operator +(Accumulator<T> self, T term)
{
if (!accumulatorRegistry.ContainsKey(self.accumulatorId))
{
accumulatorRegistry[self.accumulatorId] = self;
}
self.Add(term);
return self;
}
/// <summary>
/// Creates and returns a string representation of the current accumulator
/// </summary>
/// <returns>A string representation of the current accumulator</returns>
public override string ToString()
{
return string.Format("Accumulator<id={0}, value={1}>", accumulatorId, value);
@ -143,33 +182,33 @@ namespace Microsoft.Spark.CSharp.Core
/// A simple TCP server that intercepts shutdown() in order to interrupt
/// our continuous polling on the handler.
/// </summary>
internal class AccumulatorServer : System.Net.Sockets.TcpListener
internal class AccumulatorServer
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(AccumulatorServer));
private volatile bool serverShutdown;
private ISocketWrapper innerSocket;
internal AccumulatorServer()
: base(IPAddress.Loopback, 0)
{
innerSocket = SocketFactory.CreateSocket();
}
internal void Shutdown()
{
serverShutdown = true;
base.Stop();
innerSocket.Close();
}
internal int StartUpdateServer()
{
base.Start();
innerSocket.Listen();
Task.Run(() =>
{
try
{
IFormatter formatter = new BinaryFormatter();
using (Socket s = AcceptSocket())
using (var ns = new NetworkStream(s))
using (var s = innerSocket.Accept())
using (var ns = s.GetStream())
{
while (!serverShutdown)
{
@ -199,7 +238,7 @@ namespace Microsoft.Spark.CSharp.Core
}
catch (SocketException e)
{
if (e.ErrorCode != 10004) // A blocking operation was interrupted by a call to WSACancelBlockingCall - TcpListener.Stop cancelled AccepSocket as expected
if (e.ErrorCode != 10004) // A blocking operation was interrupted by a call to WSACancelBlockingCall - ISocketWrapper.Close canceled Accept() as expected
throw e;
}
catch (Exception e)
@ -209,7 +248,7 @@ namespace Microsoft.Spark.CSharp.Core
}
});
return (base.LocalEndpoint as IPEndPoint).Port;
return (innerSocket.LocalEndPoint as IPEndPoint).Port;
}
}
}
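The changes above separate driver-side and worker-side accumulator state (the isDriver flag and the thread-local registry). Below is a minimal usage sketch built only from the members visible in this diff; in a real Mobius program the accumulator would normally be obtained through the SparkContext rather than constructed directly:

``` c#
// Driver side: constructing the accumulator registers it and sets isDriver = true
var acc = new Accumulator<int>(accumulatorId: 1, value: 0);

// The overloaded + operator forwards to Add(term); this is how tasks update the accumulator
acc += 5;
acc += 7;

// Value is only readable where isDriver == true; inside a task it throws ArgumentException
Console.WriteLine(acc.Value);
Console.WriteLine(acc); // formatted as "Accumulator<id=..., value=...>" per ToString()
```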


@ -8,6 +8,7 @@ using System.Text;
using System.Threading.Tasks;
using System.IO;
using System.Runtime.Serialization.Formatters.Binary;
using System.Collections.Concurrent;
using Microsoft.Spark.CSharp.Proxy;
@ -30,13 +31,21 @@ namespace Microsoft.Spark.CSharp.Core
[Serializable]
public class Broadcast
{
/// <summary>
/// A thread-safe static collection that is used to store registered broadcast objects.
/// </summary>
[NonSerialized]
public static Dictionary<long, Broadcast> broadcastRegistry = new Dictionary<long, Broadcast>();
public static ConcurrentDictionary<long, Broadcast> broadcastRegistry = new ConcurrentDictionary<long, Broadcast>();
[NonSerialized]
internal string path;
internal long broadcastId;
internal Broadcast() { }
/// <summary>
/// Initializes a new instance of the Broadcast class with a specified path.
/// </summary>
/// <param name="path">The path to be set.</param>
public Broadcast(string path)
{
this.path = path;
@ -59,6 +68,11 @@ namespace Microsoft.Spark.CSharp.Core
}
}
}
/// <summary>
/// A generic version of <see cref="Broadcast"/> where the element can be specified.
/// </summary>
/// <typeparam name="T">The type of element in Broadcast</typeparam>
[Serializable]
public class Broadcast<T> : Broadcast
{


@ -9,6 +9,9 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Extra functions available on RDDs of Doubles through an implicit conversion.
/// </summary>
public static class DoubleRDDFunctions
{
/// <summary>


@ -16,17 +16,32 @@ namespace Microsoft.Spark.CSharp.Core
private bool isDefined = false;
private T value;
/// <summary>
/// Initializes an instance of the Option class without any value.
/// </summary>
public Option()
{ }
/// <summary>
/// Initializes an instance of the Option class with a specific value.
/// </summary>
/// <param name="value">The value to be associated with the new instance.</param>
public Option(T value)
{
isDefined = true;
this.value = value;
}
/// <summary>
/// Indicates whether the option value is defined.
/// </summary>
public bool IsDefined { get { return isDefined; } }
/// <summary>
/// Returns the value of the option if Option.IsDefined is TRUE;
/// otherwise, throws an <see cref="ArgumentException"/>.
/// </summary>
/// <returns></returns>
public T GetValue()
{
if (isDefined) return value;

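A short usage sketch based on the Option members documented above (illustrative only):

``` c#
var some = new Option<int>(42);  // IsDefined == true
var none = new Option<int>();    // IsDefined == false

if (some.IsDefined)
{
    Console.WriteLine(some.GetValue()); // prints 42
}

// none.GetValue() would throw, since no value is defined (see the summary above)
```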

@ -1,6 +1,7 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using Microsoft.Spark.CSharp.Core;
using System;
using System.Collections.Generic;
using System.Linq;
@ -9,26 +10,15 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Extra functions available on RDDs of (key, value) pairs where the key is sortable through
/// a function to sort the key.
/// </summary>
public static class OrderedRDDFunctions
{
/// <summary>
/// Sort the RDD by key, so that each partition contains a sorted range of the elements. Calling
/// `collect` or `save` on the resulting RDD will return or output an ordered list of records
/// (in the `save` case, they will be written to multiple `part-X` files in the filesystem, in
/// order of the keys).
///
/// >>> tmp = [('a', 1), ('b', 2), ('1', 3), ('d', 4), ('2', 5)]
/// >>> sc.parallelize(tmp).sortByKey().first()
/// ('1', 3)
/// >>> sc.parallelize(tmp).sortByKey(True, 1).collect()
/// [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
/// >>> sc.parallelize(tmp).sortByKey(True, 2).collect()
/// [('1', 3), ('2', 5), ('a', 1), ('b', 2), ('d', 4)]
/// >>> tmp2 = [('Mary', 1), ('had', 2), ('a', 3), ('little', 4), ('lamb', 5)]
/// >>> tmp2.extend([('whose', 6), ('fleece', 7), ('was', 8), ('white', 9)])
/// >>> sc.parallelize(tmp2).sortByKey(True, 3, keyfunc=lambda k: k.lower()).collect()
/// [('a', 3), ('fleece', 7), ('had', 2), ('lamb', 5),...('white', 9), ('whose', 6)]
///
/// Sorts this RDD, which is assumed to consist of KeyValuePairs.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
@ -36,26 +26,75 @@ namespace Microsoft.Spark.CSharp.Core
/// <param name="ascending"></param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> SortByKey<K, V>(
this RDD<KeyValuePair<K, V>> self,
bool ascending = true,
int? numPartitions = null)
public static RDD<KeyValuePair<K, V>> SortByKey<K, V>(this RDD<KeyValuePair<K, V>> self,
bool ascending = true, int? numPartitions = null)
{
throw new NotImplementedException();
return SortByKey<K, V, K>(self, ascending, numPartitions, new DefaultSortKeyFuncHelper<K>().Execute);
}
/// <summary>
/// Sorts this RDD, which is assumed to consist of KeyValuePairs. If the key is of type string, the sort is case sensitive.
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
/// <typeparam name="U"></typeparam>
/// <param name="self"></param>
/// <param name="ascending"></param>
/// <param name="numPartitions">Number of partitions. Each partition of the sorted RDD contains a sorted range of the elements.</param>
/// <param name="keyFunc">RDD will sort by keyFunc(key) for every key in KeyValuePair. Must not be null.</param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> SortByKey<K, V, U>(this RDD<KeyValuePair<K, V>> self,
bool ascending, int? numPartitions, Func<K, U> keyFunc)
{
if (keyFunc == null)
{
throw new ArgumentNullException("keyFunc cannot be null.");
}
if (numPartitions == null)
{
numPartitions = self.GetDefaultPartitionNum();
}
if (numPartitions == 1)
{
if (self.GetNumPartitions() > 1)
{
self = self.Coalesce(1);
}
return self.MapPartitionsWithIndex(new SortByKeyHelper<K, V, U>(keyFunc, ascending).Execute, true);
}
var rddSize = self.Count();
if (rddSize == 0) return self; // empty RDD
var maxSampleSize = numPartitions.Value * 20; // constant from Spark's RangePartitioner
double fraction = Math.Min((double)maxSampleSize / Math.Max(rddSize, 1), 1.0);
/* first compute the boundary of each part via sampling: we want to partition
* the key-space into bins such that the bins have roughly the same
* number of (key, value) pairs falling into them */
U[] samples = self.Sample(false, fraction, 1).Map(kv => kv.Key).Collect().Select(k => keyFunc(k)).ToArray();
Array.Sort(samples, StringComparer.Ordinal); // case sensitive if key type is string
List<U> bounds = new List<U>();
for (int i = 0; i < numPartitions - 1; i++)
{
bounds.Add(samples[(int)(samples.Length * (i + 1) / numPartitions)]);
}
return self.PartitionBy(numPartitions.Value,
new PairRDDFunctions.PartitionFuncDynamicTypeHelper<K>(
new RangePartitionerHelper<K, U>(numPartitions.Value, keyFunc, bounds, ascending).Execute)
.Execute)
.MapPartitionsWithIndex(new SortByKeyHelper<K, V, U>(keyFunc, ascending).Execute, true);
}
/// <summary>
/// Repartition the RDD according to the given partitioner and, within each resulting partition,
/// sort records by their keys.
///
/// This is more efficient than calling `repartition` and then sorting within each partition
/// because it can push the sorting down into the shuffle machinery.
///
/// >>> rdd = sc.parallelize([(0, 5), (3, 8), (2, 6), (0, 8), (3, 8), (1, 3)])
/// >>> rdd2 = rdd.repartitionAndSortWithinPartitions(2, lambda x: x % 2, 2)
/// >>> rdd2.glom().collect()
/// [[(0, 5), (0, 8), (2, 6)], [(1, 3), (3, 8), (3, 8)]]
///
/// </summary>
/// <typeparam name="K"></typeparam>
/// <typeparam name="V"></typeparam>
@ -72,5 +111,69 @@ namespace Microsoft.Spark.CSharp.Core
{
return self.MapPartitionsWithIndex<KeyValuePair<K, V>>((pid, iter) => ascending ? iter.OrderBy(kv => kv.Key) : iter.OrderByDescending(kv => kv.Key));
}
[Serializable]
internal class SortByKeyHelper<K, V, U>
{
private readonly Func<K, U> func;
private readonly bool ascending;
public SortByKeyHelper(Func<K, U> f, bool ascending = true)
{
func = f;
this.ascending = ascending;
}
public IEnumerable<KeyValuePair<K, V>> Execute(int pid, IEnumerable<KeyValuePair<K, V>> kvs)
{
IEnumerable<KeyValuePair<K, V>> ordered;
if (ascending)
{
if (typeof(K) == typeof(string))
ordered = kvs.OrderBy(k => func(k.Key).ToString(), StringComparer.Ordinal);
else
ordered = kvs.OrderBy(k => func(k.Key));
}
else
{
if (typeof(K) == typeof(string))
ordered = kvs.OrderByDescending(k => func(k.Key).ToString(), StringComparer.Ordinal);
else
ordered = kvs.OrderByDescending(k => func(k.Key));
}
return ordered;
}
}
[Serializable]
internal class DefaultSortKeyFuncHelper<K>
{
public K Execute(K key) { return key; }
}
[Serializable]
internal class RangePartitionerHelper<K, U>
{
private readonly int numPartitions;
private readonly Func<K, U> keyFunc;
private readonly List<U> bounds;
private readonly bool ascending;
public RangePartitionerHelper(int numPartitions, Func<K, U> keyFunc, List<U> bounds, bool ascending)
{
this.numPartitions = numPartitions;
this.bounds = bounds;
this.keyFunc = keyFunc;
this.ascending = ascending;
}
public int Execute(K key)
{
// Binary search the insert position in the bounds. If the key is found, the insert position is returned; if not, a negative
// number that is the bitwise complement of the insert position is returned, so we take its bitwise complement.
var pos = bounds.BinarySearch(keyFunc(key));
if (pos < 0) pos = ~pos;
return ascending ? pos : numPartitions - 1 - pos;
}
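// Minimal sketch of the List<T>.BinarySearch contract relied on above (hypothetical values):
var sampleBounds = new List<int> { 10, 20, 30 };
int found = sampleBounds.BinarySearch(20);    // 1: the key exists, its index is returned
int missing = sampleBounds.BinarySearch(25);  // negative: bitwise complement of the insertion point
int insertionPoint = ~missing;                // 2: index of the first bound greater than 25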
}
}
}

View file

@ -8,6 +8,7 @@ using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.IO;
using System.Security.Cryptography;
using Microsoft.Spark.CSharp.Interop.Ipc;
namespace Microsoft.Spark.CSharp.Core
{
@ -21,7 +22,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return the key-value pairs in this RDD to the master as a dictionary.
///
/// var m = sc.Parallelize(new[] { new <see cref="KeyValuePair{int, int}"/>(1, 2), new <see cref="KeyValuePair{int, int}"/>(3, 4) }, 1).CollectAsMap()
/// var m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).CollectAsMap()
/// m[1]
/// 2
/// m[3]
@ -40,7 +41,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return an RDD with the keys of each tuple.
///
/// >>> m = sc.Parallelize(new[] { new <see cref="KeyValuePair{int, int}"/>(1, 2), new <see cref="KeyValuePair{int, int}"/>(3, 4) }, 1).Keys().Collect()
/// >>> m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).Keys().Collect()
/// [1, 3]
/// </summary>
/// <typeparam name="K"></typeparam>
@ -55,7 +56,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return an RDD with the values of each tuple.
///
/// >>> m = sc.Parallelize(new[] { new <see cref="KeyValuePair{int, int}"/>(1, 2), new <see cref="KeyValuePair{int, int}"/>(3, 4) }, 1).Values().Collect()
/// >>> m = sc.Parallelize(new[] { new KeyValuePair&lt;int, int>(1, 2), new KeyValuePair&lt;int, int>(3, 4) }, 1).Values().Collect()
/// [2, 4]
///
/// </summary>
@ -79,9 +80,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// sc.Parallelize(new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .ReduceByKey((x, y) => x + y).Collect()
///
@ -108,9 +109,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// sc.Parallelize(new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .ReduceByKeyLocally((x, y) => x + y).Collect()
///
@ -132,9 +133,9 @@ namespace Microsoft.Spark.CSharp.Core
///
/// sc.Parallelize(new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .CountByKey((x, y) => x + y).Collect()
///
@ -158,9 +159,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Performs a hash join across the cluster.
///
/// var l = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
/// var r = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2), new <see cref="KeyValuePair{string, int}"/>("a", 3) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 2), new KeyValuePair&lt;string, int>("a", 3) }, 1);
/// var m = l.Join(r, 2).Collect();
///
/// [('a', (1, 2)), ('a', (1, 3))]
@ -193,9 +194,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Hash-partitions the resulting RDD into the given number of partitions.
///
/// var l = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
/// var r = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// var m = l.LeftOuterJoin(r).Collect();
///
/// [('a', (1, 2)), ('b', (4, Option))]
@ -227,9 +228,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Hash-partitions the resulting RDD into the given number of partitions.
///
/// var l = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// var r = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 1);
/// var m = l.RightOuterJoin(r).Collect();
///
/// [('a', (2, 1)), ('b', (Option, 4))]
@ -266,9 +267,9 @@ namespace Microsoft.Spark.CSharp.Core
/// Hash-partitions the resulting RDD into the given number of partitions.
///
/// var l = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), <see cref="KeyValuePair{string, int}"/>("b", 4) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 1), KeyValuePair&lt;string, int>("b", 4) }, 1);
/// var r = sc.Parallelize(
/// new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2), new <see cref="KeyValuePair{string, int}"/>("c", 8) }, 1);
/// new[] { new KeyValuePair&lt;string, int>("a", 2), new KeyValuePair&lt;string, int>("c", 8) }, 1);
/// var m = l.FullOuterJoin(r).Collect();
///
/// [('a', (1, 2)), ('b', (4, None)), ('c', (None, 8))]
@ -294,30 +295,31 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return a copy of the RDD partitioned using the specified partitioner.
///
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new <see cref="KeyValuePair{int, int}"/>(x, x)).PartitionBy(3).Glom().Collect()
/// sc.Parallelize(new[] { 1, 2, 3, 4, 2, 4, 1 }, 1).Map(x => new KeyValuePair&lt;int, int>(x, x)).PartitionBy(3).Glom().Collect()
/// </summary>
/// <param name="self"></param>
/// <param name="numPartitions"></param>
/// <param name="partitionFunc"></param>
/// <returns></returns>
public static RDD<KeyValuePair<K, V>> PartitionBy<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0)
public static RDD<KeyValuePair<K, V>> PartitionBy<K, V>(this RDD<KeyValuePair<K, V>> self, int numPartitions = 0,
Func<dynamic, int> partitionFunc = null)
{
if (numPartitions == 0)
{
numPartitions = self.sparkContext.SparkConf.SparkConfProxy.GetInt("spark.default.parallelism", 0);
if (numPartitions == 0 && self.previousRddProxy != null)
numPartitions = self.previousRddProxy.PartitionLength();
numPartitions = self.GetDefaultPartitionNum();
}
int? partitioner = numPartitions;
if (self.partitioner == partitioner)
var partitioner = new Partitioner(numPartitions, partitionFunc);
if (self.partitioner != null && self.partitioner.Equals(partitioner))
return self;
var keyed = self.MapPartitionsWithIndex(new AddShuffleKeyHelper<K, V>().Execute, true);
var keyed = self.MapPartitionsWithIndex(new AddShuffleKeyHelper<K, V>(numPartitions, partitionFunc).Execute, true);
keyed.bypassSerializer = true;
// convert shuffling version of RDD[(Long, Array[Byte])] back to normal RDD[Array[Byte]]
// invoking property keyed.RddProxy marks the end of the current pipeline RDD after shuffling
// and potentially starts the next pipeline RDD with the default SerializedMode.Byte
var rdd = new RDD<KeyValuePair<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions), self.sparkContext);
var rdd = new RDD<KeyValuePair<K, V>>(self.sparkContext.SparkContextProxy.CreatePairwiseRDD(keyed.RddProxy, numPartitions,
GenerateObjectId(partitionFunc)), self.sparkContext);
rdd.partitioner = partitioner;
return rdd;
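// Illustrative usage sketch of the partitionFunc parameter introduced above (hypothetical
// driver-side snippet; assumes a SparkContext named sc as in the XML doc samples in this file):
// route even keys to partition 0 and odd keys to partition 1 instead of hashing the serialized key.
var customPartitioned = sc.Parallelize(new[]
{
    new KeyValuePair<int, string>(1, "a"),
    new KeyValuePair<int, string>(2, "b"),
    new KeyValuePair<int, string>(3, "c")
}, 1)
.PartitionBy(2, key => (int)key % 2);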
@ -344,9 +346,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
///
@ -387,9 +389,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
///
@ -423,9 +425,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .CombineByKey(() => string.Empty, (x, y) => x + y.ToString(), (x, y) => x + y).Collect()
///
@ -458,9 +460,9 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new <see cref="KeyValuePair{string, int}"/>("a", 1),
/// new <see cref="KeyValuePair{string, int}"/>("b", 1),
/// new <see cref="KeyValuePair{string, int}"/>("a", 1)
/// new KeyValuePair&lt;string, int>("a", 1),
/// new KeyValuePair&lt;string, int>("b", 1),
/// new KeyValuePair&lt;string, int>("a", 1)
/// }, 2)
/// .GroupByKey().MapValues(l => string.Join(" ", l)).Collect()
///
@ -488,8 +490,8 @@ namespace Microsoft.Spark.CSharp.Core
/// sc.Parallelize(
/// new[]
/// {
/// new <see cref="KeyValuePair{string, string[]}"/>("a", new[]{"apple", "banana", "lemon"}),
/// new <see cref="KeyValuePair{string, string[]}"/>("b", new[]{"grapes"})
/// new KeyValuePair&lt;string, string[]>("a", new[]{"apple", "banana", "lemon"}),
/// new KeyValuePair&lt;string, string[]>("b", new[]{"grapes"})
/// }, 2)
/// .MapValues(x => x.Length).Collect()
///
@ -514,8 +516,8 @@ namespace Microsoft.Spark.CSharp.Core
/// x = sc.Parallelize(
/// new[]
/// {
/// new <see cref="KeyValuePair{string, string[]}"/>("a", new[]{"x", "y", "z"}),
/// new <see cref="KeyValuePair{string, string[]}"/>("b", new[]{"p", "r"})
/// new KeyValuePair&lt;string, string[]>("a", new[]{"x", "y", "z"}),
/// new KeyValuePair&lt;string, string[]>("b", new[]{"p", "r"})
/// }, 2)
/// .FlatMapValues(x => x).Collect()
///
@ -534,7 +536,7 @@ namespace Microsoft.Spark.CSharp.Core
}
/// <summary>
/// explicitly convert KeyValuePair<K, V> to KeyValuePair<K, dynamic>
/// explicitly convert KeyValuePair&lt;K, V> to KeyValuePair&lt;K, dynamic>
/// since they are incompatible types, unlike V to dynamic
/// </summary>
/// <typeparam name="K"></typeparam>
@ -566,8 +568,8 @@ namespace Microsoft.Spark.CSharp.Core
/// For each key k in this RDD or <paramref name="other"/>, return a resulting RDD that
/// contains a tuple with the list of values for that key in this RDD as well as <paramref name="other"/>.
///
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 2);
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// x.GroupWith(y).Collect();
///
/// [('a', ([1], [2])), ('b', ([4], []))]
@ -608,9 +610,9 @@ namespace Microsoft.Spark.CSharp.Core
}
/// <summary>
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 5), new <see cref="KeyValuePair{string, int}"/>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 5), new KeyValuePair&lt;string, int>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// x.GroupWith(y, z).Collect();
/// </summary>
/// <typeparam name="K"></typeparam>
@ -653,10 +655,10 @@ namespace Microsoft.Spark.CSharp.Core
}
/// <summary>
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 5), new <see cref="KeyValuePair{string, int}"/>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 1), new <see cref="KeyValuePair{string, int}"/>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("a", 2) }, 1);
/// var w = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int}"/>("b", 42) }, 1);
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 5), new KeyValuePair&lt;string, int>("b", 6) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 1), new KeyValuePair&lt;string, int>("b", 4) }, 2);
/// var z = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("a", 2) }, 1);
/// var w = sc.Parallelize(new[] { new KeyValuePair&lt;string, int>("b", 42) }, 1);
/// var m = x.GroupWith(y, z, w).MapValues(l => string.Join(" ", l.Item1) + " : " + string.Join(" ", l.Item2) + " : " + string.Join(" ", l.Item3) + " : " + string.Join(" ", l.Item4)).Collect();
/// </summary>
/// <typeparam name="K"></typeparam>
@ -711,7 +713,7 @@ namespace Microsoft.Spark.CSharp.Core
// ///
// /// var fractions = new <see cref="Dictionary{string, double}"/> { { "a", 0.2 }, { "b", 0.1 } };
// /// var rdd = sc.Parallelize(fractions.Keys.ToArray(), 2).Cartesian(sc.Parallelize(Enumerable.Range(0, 1000), 2));
// /// var sample = rdd.Map(t => new <see cref="KeyValuePair{string, int}"/>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
// /// var sample = rdd.Map(t => new KeyValuePair&lt;string, int>(t.Item1, t.Item2)).SampleByKey(false, fractions, 2).GroupByKey().Collect();
// ///
// /// 100 &lt; sample["a"].Length &lt; 300 and 50 &lt; sample["b"].Length &lt; 150
// /// true
@ -743,8 +745,8 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return each (key, value) pair in this RDD that has no pair with matching key in <paramref name="other"/>.
///
/// var x = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int?}"/>("a", 1), new <see cref="KeyValuePair{string, int?}"/>("b", 4), new <see cref="KeyValuePair{string, int?}"/>("b", 5), new <see cref="KeyValuePair{string, int?}"/>("a", 2) }, 2);
/// var y = sc.Parallelize(new[] { new <see cref="KeyValuePair{string, int?}"/>("a", 3), new <see cref="KeyValuePair{string, int?}"/>("c", null) }, 2);
/// var x = sc.Parallelize(new[] { new KeyValuePair&lt;string, int?>("a", 1), new KeyValuePair&lt;string, int?>("b", 4), new KeyValuePair&lt;string, int?>("b", 5), new KeyValuePair&lt;string, int?>("a", 2) }, 2);
/// var y = sc.Parallelize(new[] { new KeyValuePair&lt;string, int?>("a", 3), new KeyValuePair&lt;string, int?>("c", null) }, 2);
/// x.SubtractByKey(y).Collect();
///
/// [('b', 4), ('b', 5)]
@ -768,7 +770,7 @@ namespace Microsoft.Spark.CSharp.Core
/// searching the partition that the key maps to.
///
/// >>> l = range(1000)
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new <see cref="KeyValuePair{int, int}"/>(x, y)), 10)
/// >>> rdd = sc.Parallelize(Enumerable.Range(0, 1000).Zip(Enumerable.Range(0, 1000), (x, y) => new KeyValuePair&lt;int, int>(x, y)), 10)
/// >>> rdd.lookup(42)
/// [42]
///
@ -917,20 +919,42 @@ namespace Microsoft.Spark.CSharp.Core
}
[Serializable]
private class AddShuffleKeyHelper<K1, V1>
internal class AddShuffleKeyHelper<K, V>
{
[NonSerialized]
private static MD5 md5 = MD5.Create();
public IEnumerable<byte[]> Execute(int split, IEnumerable<KeyValuePair<K1, V1>> input)
private MD5 md5 = MD5.Create();
private readonly int numPartitions;
private readonly Func<dynamic, int> partitionFunc = null;
public AddShuffleKeyHelper(int numPartitions, Func<dynamic, int> partitionFunc = null)
{
this.numPartitions = numPartitions;
this.partitionFunc = partitionFunc;
}
public IEnumerable<byte[]> Execute(int split, IEnumerable<KeyValuePair<K, V>> input)
{
// make sure that md5 is not null even if it is deserialized in the C# worker
if (md5 == null)
{
md5 = MD5.Create();
}
IFormatter formatter = new BinaryFormatter();
foreach (var kvp in input)
foreach (var kv in input)
{
var ms = new MemoryStream();
formatter.Serialize(ms, kvp.Key);
yield return md5.ComputeHash(ms.ToArray()).Take(8).ToArray();
if (partitionFunc == null)
{
formatter.Serialize(ms, kv.Key);
yield return md5.ComputeHash(ms.ToArray()).Take(8).ToArray();
}
else
{
long pid = (long)(partitionFunc(kv.Key) % numPartitions);
yield return SerDe.ToBytes(pid);
}
ms = new MemoryStream();
formatter.Serialize(ms, kvp);
formatter.Serialize(ms, kv);
yield return ms.ToArray();
}
}
@ -983,9 +1007,43 @@ namespace Microsoft.Spark.CSharp.Core
}
}
[Serializable]
internal class PartitionFuncDynamicTypeHelper<K>
{
private readonly Func<K, int> func;
internal PartitionFuncDynamicTypeHelper(Func<K, int> f)
{
this.func = f;
}
internal int Execute(dynamic input)
{
return func((K)input);
}
}
/// <summary>
/// Converts a collection to a list whose element type is Option(T).
/// If the collection is empty, returns a list containing a single empty Option(T).
/// </summary>
/// <param name="list">The collection to convert</param>
/// <typeparam name="T">The element type in the collection</typeparam>
/// <returns>A list with Option(T) as its element type</returns>
public static List<Option<T>> NullIfEmpty<T>(this IEnumerable<T> list)
{
return list.Any() ? list.Select(v => new Option<T>(v)).ToList() : new List<Option<T>>() { new Option<T>() };
}
private static long GenerateObjectId(object obj)
{
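// derive a stable 64-bit id from the MD5 hash of the binary-serialized object;
// the id is passed to CreatePairwiseRDD above to identify the partition function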
if (obj == null)
return 0;
MD5 md5 = MD5.Create();
IFormatter formatter = new BinaryFormatter();
var ms = new MemoryStream();
formatter.Serialize(ms, obj);
var hash = md5.ComputeHash(ms.ToArray());
return BitConverter.ToInt64(hash.Take(8).ToArray(), 0);
}
}
}

View file

@ -0,0 +1,62 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// An object that defines how the elements in a key-value pair RDD are partitioned by key.
/// Maps each key to a partition ID, from 0 to "numPartitions - 1".
/// </summary>
[Serializable]
public class Partitioner
{
private readonly int numPartitions;
private readonly Func<dynamic, int> partitionFunc;
/// <summary>
/// Create a <seealso cref="Partitioner"/> instance.
/// </summary>
/// <param name="numPartitions">Number of partitions.</param>
/// <param name="partitionFunc">Defines how the elements in a key-value pair RDD are partitioned by key. Input of Func is key, output is partition index.
/// Warning: different Func instances are considered different partitioners, which will cause a repartition.</param>
public Partitioner(int numPartitions, Func<dynamic, int> partitionFunc)
{
this.numPartitions = numPartitions;
this.partitionFunc = partitionFunc;
}
/// <summary>
/// Determines whether the specified object is equal to the current object.
/// </summary>
/// <returns>
/// true if the specified object is equal to the current object; otherwise, false.
/// </returns>
/// <param name="obj">The object to compare with the current object. </param>
public override bool Equals(object obj)
{
if (ReferenceEquals(null, obj)) return false;
if (ReferenceEquals(this, obj)) return true;
var otherPartitioner = obj as Partitioner;
if (otherPartitioner != null)
{
return otherPartitioner.numPartitions == numPartitions && otherPartitioner.partitionFunc == partitionFunc;
}
return base.Equals(obj);
}
/// <summary>
/// Serves as the default hash function.
/// </summary>
/// <returns>
/// A hash code for the current object.
/// </returns>
public override int GetHashCode()
{
return base.GetHashCode();
}
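// Illustrative sketch of the equality semantics above (hypothetical values): Equals compares
// numPartitions and the partitionFunc delegate, so only reusing the same Func instance keeps two
// Partitioner values equal; two separately written lambdas are treated as different partitioners.
Func<dynamic, int> byParity = key => (int)key % 2;
var p1 = new Partitioner(2, byParity);
var p2 = new Partitioner(2, byParity);            // p1.Equals(p2) == true
var p3 = new Partitioner(2, key => (int)key % 2); // p1.Equals(p3) == false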
}
}

View file

@ -19,7 +19,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Wraps C#-based transformations that can be executed within a stage. It helps avoid unnecessary Ser/De of data between
/// JVM & CLR to execute C# transformations and pipelines them
/// JVM and CLR to execute C# transformations and pipelines them
/// </summary>
/// <typeparam name="U"></typeparam>
[Serializable]
@ -29,6 +29,14 @@ namespace Microsoft.Spark.CSharp.Core
internal bool preservesPartitioning;
//TODO - give generic types a better id
/// <summary>
/// Return a new RDD by applying a function to each partition of this RDD,
/// while tracking the index of the original partition.
/// </summary>
/// <typeparam name="U1">The element type</typeparam>
/// <param name="newFunc">The function to be applied to each partition</param>
/// <param name="preservesPartitioningParam">Indicates if it preserves partition parameters</param>
/// <returns>A new RDD</returns>
public override RDD<U1> MapPartitionsWithIndex<U1>(Func<int, IEnumerable<U>, IEnumerable<U1>> newFunc, bool preservesPartitioningParam = false)
{
if (IsPipelinable())

View file

@ -10,6 +10,9 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
//TODO - complete the impl
/// <summary>
/// A class that represents a profiler
/// </summary>
public class Profiler
{
}

View file

@ -5,6 +5,7 @@ using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Core
{
@ -18,16 +19,29 @@ namespace Microsoft.Spark.CSharp.Core
[Serializable]
public class RDD<T>
{
[NonSerialized]
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(RDD<T>));
internal IRDDProxy rddProxy;
internal IRDDProxy previousRddProxy;
// There should be only one SparkContext instance per application; mark it as NonSerialized to avoid creating more than one SparkContext instance.
// Need to set this field with a valid SparkContext instance after deserialization.
[NonSerialized]
internal SparkContext sparkContext;
internal SerializedMode serializedMode; //used for deserializing data before processing in C# worker
internal SerializedMode prevSerializedMode;
/// <summary>
/// Indicates whether the RDD is cached.
/// </summary>
protected bool isCached;
/// <summary>
/// Indicates whether the RDD is checkpointed.
/// </summary>
protected bool isCheckpointed;
internal bool bypassSerializer;
internal int? partitioner;
internal Partitioner partitioner;
internal virtual IRDDProxy RddProxy
{
@ -108,6 +122,7 @@ namespace Microsoft.Spark.CSharp.Core
public RDD<T> Cache()
{
isCached = true;
logger.LogInfo("Persisting RDD to default storage cache");
RddProxy.Cache();
return this;
}
@ -127,6 +142,7 @@ namespace Microsoft.Spark.CSharp.Core
public RDD<T> Persist(StorageLevelType storageLevelType)
{
isCached = true;
logger.LogInfo("Persisting RDD to storage level type {0}", storageLevelType);
RddProxy.Persist(storageLevelType);
return this;
}
@ -140,6 +156,7 @@ namespace Microsoft.Spark.CSharp.Core
if (isCached)
{
isCached = false;
logger.LogInfo("Unpersisting RDD from the cache");
RddProxy.Unpersist();
}
return this;
@ -156,10 +173,15 @@ namespace Microsoft.Spark.CSharp.Core
public void Checkpoint()
{
isCheckpointed = true;
logger.LogInfo("Checkpointing RDD to SparkContext.SetCheckpointDir");
RddProxy.Checkpoint();
}
internal int GetNumPartitions()
/// <summary>
/// Returns the number of partitions of this RDD.
/// </summary>
/// <returns>The number of partitions of this RDD</returns>
public int GetNumPartitions()
{
return RddProxy.GetNumPartitions();
}
@ -167,7 +189,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <summary>
/// Return a new RDD by applying a function to each element of this RDD.
///
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new <see cref="KeyValuePair{string, int}"/>(x, 1)).Collect()
/// sc.Parallelize(new string[]{"b", "a", "c"}, 1).Map(x => new KeyValuePair&lt;string, int>(x, 1)).Collect()
/// [('a', 1), ('b', 1), ('c', 1)]
///
/// </summary>
@ -177,6 +199,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <returns></returns>
public RDD<U> Map<U>(Func<T, U> f, bool preservesPartitioning = false)
{
logger.LogInfo("Executing Map operation on RDD (preservesPartitioning={0})", preservesPartitioning);
return MapPartitionsWithIndex(new MapHelper<T, U>(f).Execute, preservesPartitioning);
}
@ -217,7 +240,7 @@ namespace Microsoft.Spark.CSharp.Core
/// Return a new RDD by applying a function to each partition of this RDD,
/// while tracking the index of the original partition.
///
/// <see cref="sc.Parallelize(new int[]{1, 2, 3, 4}, 4).MapPartitionsWithIndex{double}"/>((pid, iter) => (double)pid).Sum()
/// sc.Parallelize(new int[]{1, 2, 3, 4}, 4).MapPartitionsWithIndex&lt;double>((pid, iter) => (double)pid).Sum()
/// 6
/// </summary>
/// <typeparam name="U"></typeparam>
@ -417,7 +440,7 @@ namespace Microsoft.Spark.CSharp.Core
public RDD<T> Union(RDD<T> other)
{
var rdd = new RDD<T>(RddProxy.Union(other.RddProxy), sparkContext);
if (partitioner == other.partitioner && RddProxy.PartitionLength() == rdd.RddProxy.PartitionLength())
if (partitioner == other.partitioner && RddProxy.GetNumPartitions() == rdd.RddProxy.GetNumPartitions())
rdd.partitioner = partitioner;
return rdd;
}
@ -579,6 +602,7 @@ namespace Microsoft.Spark.CSharp.Core
/// <returns></returns>
public T Reduce(Func<T, T, T> f)
{
logger.LogInfo("Executing Reduce operation on RDD");
Func<int, IEnumerable<T>, IEnumerable<T>> func = new ReduceHelper<T>(f).Execute;
var vals = MapPartitionsWithIndex(func, true).Collect();
@ -1047,6 +1071,14 @@ namespace Microsoft.Spark.CSharp.Core
{
return new RDD<T>(RddProxy.RandomSampleWithRange(lb, ub, seed), sparkContext);
}
internal int GetDefaultPartitionNum()
{
var numPartitions = sparkContext.SparkConf.SparkConfProxy.GetInt("spark.default.parallelism", 0);
if (numPartitions == 0 && previousRddProxy != null)
numPartitions = previousRddProxy.GetNumPartitions();
return numPartitions;
}
}
/// <summary>
@ -1115,10 +1147,12 @@ namespace Microsoft.Spark.CSharp.Core
/// <typeparam name="T"></typeparam>
/// <param name="self"></param>
/// <param name="num"></param>
/// <param name="keyFunc"></param>
/// <returns></returns>
public static T[] TakeOrdered<T>(this RDD<T> self, int num) where T : IComparable<T>
public static T[] TakeOrdered<T>(this RDD<T> self, int num, Func<T, dynamic> keyFunc = null) where T : IComparable<T>
{
return self.MapPartitionsWithIndex<T>(new TakeOrderedHelper<T>(num).Execute).Collect().OrderBy(x => x).Take(num).ToArray();
return self.MapPartitionsWithIndex<T>(new TakeOrderedHelper<T>(num, keyFunc).Execute).Collect()
.OrderBy(x => keyFunc == null ? x : keyFunc(x)).Take(num).ToArray();
}
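// Illustrative usage sketch of the keyFunc parameter added above (hypothetical driver-side
// snippet; assumes a SparkContext named sc as in the XML doc samples in this file): take the two
// smallest elements by absolute value instead of by natural ordering.
var smallestByAbsoluteValue = sc.Parallelize(new[] { -7, 3, -1, 5 }, 2)
    .TakeOrdered(2, x => Math.Abs(x)); // [-1, 3]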
/// <summary>
@ -1432,13 +1466,15 @@ namespace Microsoft.Spark.CSharp.Core
internal class TakeOrderedHelper<T>
{
private readonly int num;
internal TakeOrderedHelper(int num)
private readonly Func<T, dynamic> keyFunc;
internal TakeOrderedHelper(int num, Func<T, dynamic> keyFunc)
{
this.num = num;
this.keyFunc = keyFunc;
}
internal IEnumerable<T> Execute(int pid, IEnumerable<T> input)
{
return input.OrderBy(x => x).Take(num);
return input.OrderBy(x => keyFunc == null ? x : keyFunc(x)).Take(num);
}
}
[Serializable]

View file

@ -5,12 +5,12 @@ using System;
using System.Collections.Generic;
using System.IO;
using System.Net;
using System.Net.Sockets;
using System.Reflection;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Network;
using Microsoft.Spark.CSharp.Sql;
namespace Microsoft.Spark.CSharp.Core
@ -23,10 +23,10 @@ namespace Microsoft.Spark.CSharp.Core
public IEnumerable<dynamic> Collect(int port, SerializedMode serializedMode, Type type)
{
IFormatter formatter = new BinaryFormatter();
Socket sock = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
var sock = SocketFactory.CreateSocket();
sock.Connect(IPAddress.Loopback, port);
using (NetworkStream s = new NetworkStream(sock))
using (var s = sock.GetStream())
{
byte[] buffer;
while ((buffer = SerDe.ReadBytes(s)) != null && buffer.Length > 0)

View file

@ -63,6 +63,7 @@ namespace Microsoft.Spark.CSharp.Core
public SparkConf SetMaster(string master)
{
sparkConfProxy.SetMaster(master);
logger.LogInfo("Spark master set to {0}", master);
return this;
}
@ -73,6 +74,7 @@ namespace Microsoft.Spark.CSharp.Core
public SparkConf SetAppName(string appName)
{
sparkConfProxy.SetAppName(appName);
logger.LogInfo("Spark app name set to {0}", appName);
return this;
}
@ -84,6 +86,7 @@ namespace Microsoft.Spark.CSharp.Core
public SparkConf SetSparkHome(string sparkHome)
{
sparkConfProxy.SetSparkHome(sparkHome);
logger.LogInfo("Spark home set to {0}", sparkHome);
return this;
}
@ -95,6 +98,7 @@ namespace Microsoft.Spark.CSharp.Core
public SparkConf Set(string key, string value)
{
sparkConfProxy.Set(key, value);
logger.LogInfo("Spark configuration key-value set to {0}={1}", key, value);
return this;
}

View file

@ -10,14 +10,31 @@ using System.Text;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Main entry point for Spark functionality. A SparkContext represents the
/// connection to a Spark cluster, and can be used to create RDDs, accumulators
/// and broadcast variables on that cluster.
/// </summary>
public class SparkContext
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkContext));
internal ISparkContextProxy SparkContextProxy { get; private set; }
internal SparkConf SparkConf { get; private set; }
private static SparkContext _activeSparkContext = null;
/// <summary>
/// Get existing SparkContext
/// </summary>
internal static SparkContext GetActiveSparkContext()
{
return _activeSparkContext;
}
private AccumulatorServer accumulatorServer;
private int nextAccumulatorId;
@ -63,20 +80,32 @@ namespace Microsoft.Spark.CSharp.Core
/// </summary>
public StatusTracker StatusTracker { get { return new StatusTracker(SparkContextProxy.StatusTracker); } }
/// <summary>
/// Initializes a SparkContext instance with a specific master, application name, and spark home
/// </summary>
/// <param name="master">Cluster URL to connect to (e.g. mesos://host:port, spark://host:port, local)</param>
/// <param name="appName">A name for your application, to display on the cluster web UI</param>
/// <param name="sparkHome">the path that holds spark bits</param>
public SparkContext(string master, string appName, string sparkHome)
: this(master, appName, sparkHome, null)
{
}
{}
/// <summary>
/// Initializes a SparkContext instance with a specific master and application name.
/// </summary>
/// <param name="master"></param>
/// <param name="appName"></param>
public SparkContext(string master, string appName)
: this(master, appName, null, null)
{
}
{}
/// <summary>
/// Initializes a SparkContext instance with a specific spark config.
/// </summary>
/// <param name="conf">A SparkConf object that represents the settings for spark</param>
public SparkContext(SparkConf conf)
: this(null, null, null, conf)
{
}
{}
/// <summary>
/// when created from checkpoint
@ -100,6 +129,7 @@ namespace Microsoft.Spark.CSharp.Core
SparkConf.SetSparkHome(sparkHome);
SparkContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateSparkContext(SparkConf.SparkConfProxy);
_activeSparkContext = this;
}
internal void StartAccumulatorServer()
@ -112,8 +142,15 @@ namespace Microsoft.Spark.CSharp.Core
}
}
/// <summary>
/// Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.
/// </summary>
/// <param name="filePath">The path of file to be read</param>
/// <param name="minPartitions">A suggestion value of the minimal splitting number for input data</param>
/// <returns>an RDD of Strings</returns>
public RDD<string> TextFile(string filePath, int minPartitions = 0)
{
logger.LogInfo("Reading text file {0} as RDD<string> with {1} partitions", filePath, minPartitions);
return new RDD<string>(SparkContextProxy.TextFile(filePath, minPartitions), this, SerializedMode.String);
}
@ -142,6 +179,7 @@ namespace Microsoft.Spark.CSharp.Core
if (numSlices < 1)
numSlices = 1;
logger.LogInfo("Parallelizing {0} items to form RDD in the cluster with {1} partitions", collectionOfByteRepresentationOfObjects.Count, numSlices);
return new RDD<T>(SparkContextProxy.Parallelize(collectionOfByteRepresentationOfObjects, numSlices), this);
}
@ -170,7 +208,7 @@ namespace Microsoft.Spark.CSharp.Core
///
/// Do
/// {{{
/// <see cref="RDD{KeyValuePair{string, string}}"/> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
/// RDD&lt;KeyValuePair&lt;string, string>> rdd = sparkContext.WholeTextFiles("hdfs://a-hdfs-path")
/// }}}
///
/// then `rdd` contains
@ -208,7 +246,7 @@ namespace Microsoft.Spark.CSharp.Core
/// }}}
///
/// Do
/// <see cref="RDD{KeyValuePair{string, byte[]}}"/> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
/// RDD&lt;KeyValuePair&lt;string, byte[]>> rdd = sparkContext.dataStreamFiles("hdfs://a-hdfs-path")`,
///
/// then `rdd` contains
/// {{{
@ -401,9 +439,16 @@ namespace Microsoft.Spark.CSharp.Core
/// </summary>
public void Stop()
{
logger.LogInfo("Stopping SparkContext");
logger.LogInfo("Note that there might be error in Spark logs on the failure to delete userFiles directory " +
"under Spark temp directory (spark.local.dir config value in local mode)");
logger.LogInfo("This error may be ignored for now. See https://issues.apache.org/jira/browse/SPARK-8333 for details");
if (accumulatorServer != null)
accumulatorServer.Shutdown();
SparkContextProxy.Stop();
}
/// <summary>

View file

@ -9,6 +9,11 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// A class for tracking the statistics of a set of numbers (count, mean and variance) in a numerically
/// robust way. Includes support for merging two StatCounters. Based on Welford and Chan's algorithms
/// for running variance.
/// </summary>
[Serializable]
public class StatCounter
{
@ -18,9 +23,16 @@ namespace Microsoft.Spark.CSharp.Core
private double maxValue = double.MinValue; // Running max of our values
private double minValue = double.MaxValue; // Running min of our values
/// <summary>
/// Initializes the StatCounter with no values.
/// </summary>
public StatCounter()
{ }
/// <summary>
/// Initializes the StatCounter with the given values.
/// </summary>
/// <param name="values"></param>
public StatCounter(IEnumerable<double> values)
{
Merge(values);
@ -114,10 +126,30 @@ namespace Microsoft.Spark.CSharp.Core
other.minValue = minValue;
return other;
}
/// <summary>
/// Gets the count of values in this StatCounter
/// </summary>
public long Count { get { return n; } }
/// <summary>
/// Gets the mean of the values in this StatCounter
/// </summary>
public double Mean { get { return mu; } }
/// <summary>
/// Gets the sum of the values in this StatCounter
/// </summary>
public double Sum { get { return n * mu; } }
/// <summary>
/// Gets the maximum of the values in this StatCounter
/// </summary>
public double Max { get { return maxValue; } }
/// <summary>
/// Gets the minimum of the values in this StatCounter
/// </summary>
public double Min { get { return minValue; } }
/// <summary>
@ -139,6 +171,13 @@ namespace Microsoft.Spark.CSharp.Core
/// Return the sample standard deviation of the values, which corrects for bias in estimating the variance by dividing by N-1 instead of N.
/// </summary>
public double SampleStdev { get { return Math.Sqrt(SampleVariance); } }
/// <summary>
/// Returns a string that represents this StatCounter.
/// </summary>
/// <returns>
/// A string that represents this StatCounter.
/// </returns>
public override string ToString()
{
return string.Format("(count: {0}, mean: {1}, stdev: {2}, max: {3}, min: {4})", Count, Mean, Stdev, Max, Min);
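// Illustrative usage sketch based on the members above (hypothetical values):
var stats = new StatCounter(new[] { 1.0, 2.0, 3.0, 4.0 });
// stats.Count == 4, stats.Mean == 2.5, stats.Sum == 10.0, stats.Min == 1.0, stats.Max == 4.0
// stats.Stdev is the population standard deviation (about 1.118); stats.SampleStdev divides by N-1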

View file

@ -11,6 +11,9 @@ using Microsoft.Spark.CSharp.Proxy;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Low-level status reporting APIs for monitoring job and stage progress.
/// </summary>
public class StatusTracker
{
private readonly IStatusTrackerProxy statusTrackerProxy;
@ -76,11 +79,21 @@ namespace Microsoft.Spark.CSharp.Core
}
}
/// <summary>
/// SparkJobInfo represents the information of a Spark job
/// </summary>
public class SparkJobInfo
{
readonly int jobId;
readonly int[] stageIds;
readonly string status;
/// <summary>
/// Initializes a SparkJobInfo instance with a given job Id, stage Ids, and status
/// </summary>
/// <param name="jobId"></param>
/// <param name="stageIds"></param>
/// <param name="status"></param>
public SparkJobInfo(int jobId, int[] stageIds, string status)
{
this.jobId = jobId;
@ -88,12 +101,26 @@ namespace Microsoft.Spark.CSharp.Core
this.status = status;
}
/// <summary>
/// Gets the Id of this Spark job
/// </summary>
public int JobId { get { return jobId; } }
/// <summary>
/// Gets the stage Ids of this Spark job
/// </summary>
public int[] StageIds { get { return stageIds; } }
/// <summary>
/// Gets the status of this Spark job
/// </summary>
public string Status { get { return status; } }
}
/// <summary>
/// SparkStageInfo represents the information of a Spark stage
/// </summary>
public class SparkStageInfo
{
readonly int stageId;
@ -104,6 +131,18 @@ namespace Microsoft.Spark.CSharp.Core
readonly int numActiveTasks;
readonly int numCompletedTasks;
readonly int numFailedTasks;
/// <summary>
/// Initializes a SparkStageInfo instance with given values
/// </summary>
/// <param name="stageId">The stage Id</param>
/// <param name="currentAttemptId">The current attempt Id</param>
/// <param name="submissionTime">The submission time</param>
/// <param name="name">The name of this stage</param>
/// <param name="numTasks">The number of tasks</param>
/// <param name="numActiveTasks">The number of active tasks</param>
/// <param name="numCompletedTasks">The number of completed tasks</param>
/// <param name="numFailedTasks">The number of failed tasks</param>
public SparkStageInfo(int stageId, int currentAttemptId, long submissionTime, string name, int numTasks, int numActiveTasks, int numCompletedTasks, int numFailedTasks)
{
this.stageId = stageId;
@ -116,13 +155,44 @@ namespace Microsoft.Spark.CSharp.Core
this.numFailedTasks = numFailedTasks;
}
/// <summary>
/// Gets the stage Id of this SparkStageInfo
/// </summary>
public int StageId { get { return stageId; } }
/// <summary>
/// Gets the current attempt Id of this SparkStageInfo
/// </summary>
public int CurrentAttemptId { get { return currentAttemptId; } }
/// <summary>
/// Gets the submission time of this SparkStageInfo
/// </summary>
public long SubmissionTime { get { return submissionTime; } }
/// <summary>
/// Gets the name of this SparkStageInfo
/// </summary>
public string Name { get { return name; } }
/// <summary>
/// Gets the number of tasks of this SparkStageInfo
/// </summary>
public int NumTasks { get { return numTasks; } }
/// <summary>
/// Gets the number of active tasks of this SparkStageInfo
/// </summary>
public int NumActiveTasks { get { return numActiveTasks; } }
/// <summary>
/// Gets the number of completed tasks of this SparkStageInfo
/// </summary>
public int NumCompletedTasks { get { return numCompletedTasks; } }
/// <summary>
/// Gets the number of failed tasks of this SparkStageInfo
/// </summary>
public int NumFailedTasks { get { return numFailedTasks; } }
}
}

View file

@ -9,21 +9,67 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Core
{
/// <summary>
/// Defines the type of storage levels
/// </summary>
public enum StorageLevelType
{
/// <summary>
/// Specifies not to use any storage
/// </summary>
NONE,
/// <summary>
/// Specifies to use disk only
/// </summary>
DISK_ONLY,
/// <summary>
/// Specifies to use disk only with 2 replicas
/// </summary>
DISK_ONLY_2,
/// <summary>
/// Specifies to use memory only
/// </summary>
MEMORY_ONLY,
/// <summary>
/// Specifies to use memory only with 2 replicas
/// </summary>
MEMORY_ONLY_2,
/// <summary>
/// Specifies to use memory only in a serialized format
/// </summary>
MEMORY_ONLY_SER,
/// <summary>
/// Specifies to use memory only in a serialized format with 2 replicas
/// </summary>
MEMORY_ONLY_SER_2,
/// <summary>
/// Specifies to use disk and memory
/// </summary>
MEMORY_AND_DISK,
/// <summary>
/// Specifies to use disk and memory with 2 replicas
/// </summary>
MEMORY_AND_DISK_2,
/// <summary>
/// Specifies to use disk and memory in a serialized format
/// </summary>
MEMORY_AND_DISK_SER,
/// <summary>
/// Specifies to use disk and memory in a serialized format with 2 replicas
/// </summary>
MEMORY_AND_DISK_SER_2,
/// <summary>
/// Specifies to use off-heap storage
/// </summary>
OFF_HEAP
}
/// <summary>
/// Flags for controlling the storage of an RDD. Each StorageLevel records whether to use
/// memory, whether to drop the RDD to disk if it falls out of memory, whether to keep the
/// data in memory in a serialized format, and whether to replicate the RDD partitions
/// on multiple nodes.
/// </summary>
public class StorageLevel
{
internal static Dictionary<StorageLevelType, StorageLevel> storageLevel = new Dictionary<StorageLevelType, StorageLevel>
@ -56,6 +102,10 @@ namespace Microsoft.Spark.CSharp.Core
this.replication = replication;
}
/// <summary>
/// Returns a readable string that represents the type
/// </summary>
/// <returns>A readable string</returns>
public override string ToString()
{
return string.Format("{0}{1}{2}{3}{4} Replicated",

View file

@ -10,7 +10,7 @@ using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Interop.Ipc
{
/// <summary>
/// Behavior of the bridge used for the IPC interop between JVM & CLR
/// Behavior of the bridge used for the IPC interop between JVM and CLR
/// </summary>
internal interface IJvmBridge : IDisposable
{

View file

@ -5,23 +5,24 @@ using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Net;
using System.Net.Sockets;
using System.Text;
using Microsoft.Spark.CSharp.Network;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Interop.Ipc
{
/// <summary>
/// Implementation of thread safe IPC bridge between JVM & CLR
/// throught a concourrent socket connection queue (lightweight synchronisation mechanism)
/// Implementation of thread safe IPC bridge between JVM and CLR
/// Using a concurrent socket connection queue (lightweight synchronization mechanism)
/// supporting async JVM calls like StreamingContext.AwaitTermination()
/// </summary>
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
internal class JvmBridge : IJvmBridge
{
private int portNumber;
private readonly ConcurrentQueue<Socket> sockets = new ConcurrentQueue<Socket>();
private readonly ConcurrentQueue<ISocketWrapper> sockets = new ConcurrentQueue<ISocketWrapper>();
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(JvmBridge));
public void Initialize(int portNumber)
@ -29,12 +30,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
this.portNumber = portNumber;
}
private Socket GetConnection()
private ISocketWrapper GetConnection()
{
Socket socket;
ISocketWrapper socket;
if (!sockets.TryDequeue(out socket))
{
socket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
socket = SocketFactory.CreateSocket();
socket.Connect(IPAddress.Loopback, portNumber);
}
return socket;
@ -72,8 +73,8 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
{
var overallPayload = PayloadHelper.BuildPayload(isStatic, classNameOrJvmObjectReference, methodName, parameters);
Socket socket = GetConnection();
using (NetworkStream s = new NetworkStream(socket))
var socket = GetConnection();
using (var s = socket.GetStream())
{
SerDe.Write(s, overallPayload);
@ -115,7 +116,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
break;
case 'l':
returnValue = ReadJvmObjectReferenceCollection(s);
returnValue = ReadCollection(s);
break;
@ -207,15 +208,56 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return paramsString.ToString();
}
private object ReadJvmObjectReferenceCollection(NetworkStream s)
private object ReadCollection(Stream s)
{
object returnValue;
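// wire format of a collection: one byte for the element type ('c' string, 'i' int, 'd' double,
// 'b' bool, 'r' byte[], 'j' JVM object reference), then an int count, then the elements themselves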
var listItemTypeAsChar = Convert.ToChar(s.ReadByte());
int numOfItemsInList = SerDe.ReadInt(s);
switch (listItemTypeAsChar)
{
case 'c':
var strList = new List<string>();
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
{
strList.Add(SerDe.ReadString(s));
}
returnValue = strList;
break;
case 'i':
var intList = new List<int>();
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
{
intList.Add(SerDe.ReadInt(s));
}
returnValue = intList;
break;
case 'd':
var doubleList = new List<double>();
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
{
doubleList.Add(SerDe.ReadDouble(s));
}
returnValue = doubleList;
break;
case 'b':
var boolList = new List<bool>();
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
{
boolList.Add(Convert.ToBoolean(s.ReadByte()));
}
returnValue = boolList;
break;
case 'r':
var byteArrayList = new List<byte[]>();
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
{
var byteArrayLen = SerDe.ReadInt(s);
byteArrayList.Add(SerDe.ReadBytes(s, byteArrayLen));
}
returnValue = byteArrayList;
break;
case 'j':
var jvmObjectReferenceList = new List<JvmObjectReference>();
var numOfItemsInList = SerDe.ReadInt(s);
for (int itemIndex = 0; itemIndex < numOfItemsInList; itemIndex++)
{
var itemIdentifier = SerDe.ReadString(s);
@ -223,7 +265,6 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
}
returnValue = jvmObjectReferenceList;
break;
default:
// convert listItemTypeAsChar to UInt32 because the char may be non-printable
throw new NotSupportedException(
@ -235,13 +276,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
public void Dispose()
{
Socket socket;
ISocketWrapper socket;
while (sockets.TryDequeue(out socket))
{
if (socket != null)
{
socket.Dispose();
socket = null;
}
}
}

View file

@ -0,0 +1,80 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Proxy.Ipc;
namespace Microsoft.Spark.CSharp.Interop.Ipc
{
/// <summary>
/// Utility methods for C#-JVM interaction
/// </summary>
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
internal static class JvmBridgeUtils
{
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
{
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
}
return jmap;
}
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
{
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashMap", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
}
return jmap;
}
public static JvmObjectReference GetScalaMutableMap<K, V>(Dictionary<K, V> mapValues)
{
var hashMapReference = GetJavaHashMap(mapValues.Select(kvp => kvp));
return new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.JvmBridgeUtils", "toMutableMap", new object[] { hashMapReference }).ToString());
}
public static JvmObjectReference GetJavaSet<T>(IEnumerable<T> enumerable)
{
var jset = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashSet", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jset, "add", new object[] { item });
}
return jset;
}
public static JvmObjectReference GetJavaList<T>(IEnumerable<T> enumerable)
{
var jlist = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jlist, "add", new object[] { item });
}
return jlist;
}
public static JvmObjectReference GetJavaSeq<T>(IEnumerable<T> enumerable)
{
return new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "toSeq", GetJavaList<T>(enumerable)));
}
public static JvmObjectReference GetJavaDuration(int durationSeconds)
{
// Java expects Duration in milliseconds and the value must be of long type
return SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { (long)durationSeconds * 1000 });
}
}
}

View file

@ -2,8 +2,10 @@
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Runtime.CompilerServices;
using Microsoft.Spark.CSharp.Proxy.Ipc;
[assembly: InternalsVisibleTo("Microsoft.Spark.CSharp.Utils")]
namespace Microsoft.Spark.CSharp.Interop.Ipc
{
/// <summary>
@ -19,6 +21,7 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
{
Id = jvmReferenceId;
creationTime = DateTime.UtcNow;
SparkCLREnvironment.WeakObjectManager.AddWeakRefereceObject(this);
}
public override string ToString()
@ -40,6 +43,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return base.Equals(obj);
}
public override int GetHashCode()
{
return base.GetHashCode();
}
public string GetDebugInfo()
{
var javaObjectReferenceForClassObject = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(this, "getClass").ToString());

View file

@ -12,37 +12,84 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
/// </summary>
public enum SpecialLengths : int
{
/// <summary>
/// Flag to indicate the end of data section
/// </summary>
END_OF_DATA_SECTION = -1,
/// <summary>
/// Flag to indicate an exception thrown from .NET side
/// </summary>
DOTNET_EXCEPTION_THROWN = -2,
/// <summary>
/// Flag to indicate timing data
/// </summary>
TIMING_DATA = -3,
/// <summary>
/// Flag to indicate the end of stream
/// </summary>
END_OF_STREAM = -4,
/// <summary>
/// Flag to indicate a null (undefined) value
/// </summary>
NULL = -5,
}
/// <summary>
/// Serialization and Deserialization of data types between JVM & CLR
/// Serialization and Deserialization of data types between JVM and CLR
/// </summary>
public class SerDe //TODO - add ToBytes() for other types
{
/// <summary>
/// The total number of bytes read
/// </summary>
public static long totalReadNum = 0;
/// <summary>
/// The total number of bytes written
/// </summary>
public static long totalWriteNum = 0;
/// <summary>
/// Converts a boolean to a byte array
/// </summary>
/// <param name="value">The boolean to be converted</param>
/// <returns>The byte array converted from a boolean</returns>
public static byte[] ToBytes(bool value)
{
return new[] { System.Convert.ToByte(value) };
}
/// <summary>
/// Converts a string to a byte array.
/// </summary>
/// <param name="value">The string to be converted</param>
/// <returns>The byte array converted from a string</returns>
public static byte[] ToBytes(string value)
{
return Encoding.UTF8.GetBytes(value);
}
/// <summary>
/// Converts an integer to a byte array
/// </summary>
/// <param name="value">The intger to be converted</param>
/// <returns>The byte array converted from an integer</returns>
public static byte[] ToBytes(int value)
{
var byteRepresentationofInputLength = BitConverter.GetBytes(value);
Array.Reverse(byteRepresentationofInputLength);
return byteRepresentationofInputLength;
}
/// <summary>
/// Converts a long integer to a byte array
/// </summary>
/// <param name="value">The long intger to be converted</param>
/// <returns>The byte array converted from a long integer</returns>
public static byte[] ToBytes(long value)
{
var byteRepresentationofInputLength = BitConverter.GetBytes(value);
@ -50,6 +97,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return byteRepresentationofInputLength;
}
/// <summary>
/// Converts a double to a byte array
/// </summary>
/// <param name="value">The double to be converted</param>
/// <returns>The byte array converted from a double</returns>
public static byte[] ToBytes(double value)
{
var byteRepresentationofInputLength = BitConverter.GetBytes(value);
@ -57,16 +109,31 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return byteRepresentationofInputLength;
}
/// <summary>
/// Converts a byte to a character
/// </summary>
/// <param name="value">The byte to be converted</param>
/// <returns>The char converted from a byte</returns>
public static char ToChar(byte value)
{
return System.Convert.ToChar(value);
}
/// <summary>
/// Converts a byte array to a string
/// </summary>
/// <param name="value">The byte array to be converted</param>
/// <returns>The string converted from a byte array</returns>
public static string ToString(byte[] value)
{
return Encoding.UTF8.GetString(value);
}
/// <summary>
/// Converts a byte array to an integer
/// </summary>
/// <param name="value">The byte array to be converted</param>
/// <returns>The integer converted from a byte array</returns>
public static int ToInt(byte[] value)
{
return //Netty byte order is BigEndian
@ -76,11 +143,21 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
(int)value[0] << 24;
}
/// <summary>
/// Reads an integer from a stream
/// </summary>
/// <param name="s">The stream to be read</param>
/// <returns>The integer read from stream</returns>
public static int ReadInt(Stream s)
{
return ToInt(ReadBytes(s, 4));
}
}
/// <summary>
/// Reads a long integer from a stream
/// </summary>
/// <param name="s">The stream to be read</param>
/// <returns>The long integer read from stream</returns>
public static long ReadLong(Stream s)
{
byte[] buffer = ReadBytes(s, 8);
@ -94,7 +171,12 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
(long)buffer[1] << 48 |
(long)buffer[0] << 56;
}
/// <summary>
/// Reads a double from a stream
/// </summary>
/// <param name="s">The stream to be read</param>
/// <returns>The double read from stream</returns>
public static double ReadDouble(Stream s)
{
byte[] buffer = ReadBytes(s, 8);
@ -102,11 +184,24 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return BitConverter.ToDouble(buffer, 0);
}
/// <summary>
/// Reads a string from a stream
/// </summary>
/// <param name="s">The stream to be read</param>
/// <returns>The string read from stream</returns>
public static string ReadString(Stream s)
{
return ToString(ReadBytes(s));
}
/// <summary>
/// Reads a byte array with a given length from a stream
/// </summary>
/// <param name="s">The stream to be read</param>
/// <param name="length">The length to be read</param>
/// <returns>The byte array read from the stream</returns>
/// <exception cref="ArgumentOutOfRangeException">An ArgumentOutOfRangeException thrown if the given length is negative</exception>
/// <exception cref="ArgumentException">An ArgumentException if the actual read length is less than the given length</exception>
public static byte[] ReadBytes(Stream s, int length)
{
if (length < 0)
@ -139,6 +234,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return buffer;
}
/// <summary>
/// Reads a byte array from a stream. The first 4 bytes indicate the length of a byte array.
/// </summary>
/// <param name="s">The stream to be read</param>
/// <returns>The byte array read from stream</returns>
public static byte[] ReadBytes(Stream s)
{
var lengthBuffer = ReadBytes(s, 4);
@ -152,6 +252,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return ReadBytes(s, length);
}
/// <summary>
/// Reads an object Id from a stream.
/// </summary>
/// <param name="s">The stream to be read</param>
/// <returns>The object Id read from stream</returns>
public static string ReadObjectId(Stream s)
{
var type = s.ReadByte();
@ -168,18 +273,33 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
return ReadString(s);
}
/// <summary>
/// Writes a byte to a stream
/// </summary>
/// <param name="s">The stream to write</param>
/// <param name="value">The byte to write</param>
public static void Write(Stream s, byte value)
{
s.WriteByte(value);
totalWriteNum += 1;
}
/// <summary>
/// Writes a byte array to a stream
/// </summary>
/// <param name="s">The stream to write</param>
/// <param name="value">The byte array to write</param>
public static void Write(Stream s, byte[] value)
{
s.Write(value, 0, value.Length);
totalWriteNum += value.Length;
}
/// <summary>
/// Writes an integer to a stream
/// </summary>
/// <param name="s">The stream to write</param>
/// <param name="value">The integer to write</param>
public static void Write(Stream s, int value)
{
Write(s, new byte[] {
@ -190,6 +310,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
});
}
/// <summary>
/// Writes a long integer to a stream
/// </summary>
/// <param name="s">The stream to write</param>
/// <param name="value">The long integer to write</param>
public static void Write(Stream s, long value)
{
Write(s, new byte[] {
@ -204,6 +329,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
});
}
/// <summary>
/// Writes a double to a stream
/// </summary>
/// <param name="s">The stream to write</param>
/// <param name="value">The double to write</param>
public static void Write(Stream s, double value)
{
byte[] buffer = BitConverter.GetBytes(value);
@ -211,6 +341,11 @@ namespace Microsoft.Spark.CSharp.Interop.Ipc
Write(s, buffer);
}
/// <summary>
/// Writes a string to a stream
/// </summary>
/// <param name="s">The stream to write</param>
/// <param name="value">The string to write</param>
public static void Write(Stream s, string value)
{
byte[] buffer = Encoding.UTF8.GetBytes(value);

Просмотреть файл

@ -0,0 +1,261 @@
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Interop.Ipc
{
using WeakReferenceObjectIdPair = KeyValuePair<WeakReference, string>;
/// <summary>
/// Releases JVMObjectTracker object references.
/// Background on the C# to Java interop:
/// 1. Java side: https://github.com/Microsoft/Mobius/blob/master/scala/src/main/org/apache/spark/api/csharp/CSharpBackendHandler.scala#L269
///    JVMObjectTracker keeps a HashMap[String, Object] of [id, Java object].
/// 2. C# side:
///    1) JvmObjectReference remembers the id: https://github.com/Microsoft/Mobius/blob/master/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmObjectReference.cs#L20
///    2) So JvmBridge can call the Java object's methods: https://github.com/Microsoft/Mobius/blob/master/csharp/Adapter/Microsoft.Spark.CSharp/Interop/Ipc/JvmBridge.cs#L69
///
/// A potential memory leak can therefore build up in JVMObjectTracker.
/// To address this, garbage collection is tracked on the C# side; once a reference is collected, its id is used to remove the corresponding entry from JVMObjectTracker's HashMap.
/// </summary>
internal interface IWeakObjectManager : IDisposable
{
TimeSpan CheckInterval { get; set; }
void AddWeakRefereceObject(JvmObjectReference obj);
/// <summary>
/// Gets the count of all weak objects, including dead objects that are waiting to be released.
/// </summary>
int GetReferencesCount();
/// <summary>
/// Gets the count of weak objects that are still alive
/// </summary>
/// <returns></returns>
int GetAliveCount();
}
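// Editorial note (illustrative sketch, not part of this commit): a JvmObjectReference is registered
// with the manager when it is created; after the CLR object is garbage collected, the background
// thread asks the JVM to drop the tracked id (via the SparkCLRHandler "rm" call shown below).
// Assuming the manager is exposed through the SparkCLREnvironment.WeakObjectManager property added
// in this commit, and "42" is an id handed out by the JVM-side JVMObjectTracker:
//
//     var objRef = new JvmObjectReference("42");
//     SparkCLREnvironment.WeakObjectManager.AddWeakRefereceObject(objRef);
//     // once objRef becomes unreachable and a check interval elapses,
//     // the JVM-side HashMap entry for "42" is released.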
/// <summary>
/// Adaptively controls the number of weak objects that should be checked in each interval
/// </summary>
internal class WeakReferenceCheckCountController
{
private static readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(WeakReferenceCheckCountController));
private int checkCount;
private int referencesCountBenchmark;
public WeakReferenceCheckCountController(int initialCheckCount, int initialReferencesCountBenchmark)
{
checkCount = initialCheckCount;
referencesCountBenchmark = initialReferencesCountBenchmark;
}
/// <summary>
/// Adjusts checkCount adaptively according to the current weak reference object count
/// </summary>
public int AdjustCheckCount(int currentReferenceCount)
{
if (currentReferenceCount > (referencesCountBenchmark + referencesCountBenchmark / 2))
{
int previousCheckCount = checkCount;
int previousReferencesCountBenchmark = referencesCountBenchmark;
checkCount *= 2;
referencesCountBenchmark = referencesCountBenchmark + referencesCountBenchmark / 2;
logger.LogInfo("Adjust checkCount from {0} to {1}, referencesCountBenchmark from {2} to {3}",
previousCheckCount, checkCount, previousReferencesCountBenchmark, referencesCountBenchmark);
}
return checkCount;
}
}
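// Editorial note (illustrative sketch, not part of this commit): with the initial values used
// elsewhere in this file (checkCount = 10, referencesCountBenchmark = 1000), the controller
// behaves as follows:
//
//     var controller = new WeakReferenceCheckCountController(10, 1000);
//     controller.AdjustCheckCount(1600);  // 1600 > 1500, so checkCount doubles to 20 and the benchmark moves to 1500
//     controller.AdjustCheckCount(1400);  // 1400 <= 2250, so checkCount stays at 20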
internal class WeakObjectManagerImpl : IWeakObjectManager
{
private static readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(WeakObjectManagerImpl));
internal static TimeSpan DefaultCheckInterval = TimeSpan.FromSeconds(3);
private TimeSpan checkInterval;
private WeakReferenceCheckCountController checkCountController = new WeakReferenceCheckCountController(10, 1000);
/// <summary>
/// Sleep time for checking thread
/// </summary>
public TimeSpan CheckInterval
{
get
{
return checkInterval;
}
set
{
checkInterval = value;
}
}
/// <summary>
/// Maximum running duration for checking thread each time
/// </summary>
private static readonly TimeSpan MaxReleasingDuration = TimeSpan.FromMilliseconds(100);
private readonly ConcurrentQueue<WeakReferenceObjectIdPair> weakReferences = new ConcurrentQueue<WeakReferenceObjectIdPair>();
private bool shouldKeepRunning = true;
private IObjectReleaser objectReleaser = new JvmObjectReleaser();
internal IObjectReleaser ObjectReleaser
{
set { objectReleaser = value; }
}
private Thread releaserThread;
internal WeakObjectManagerImpl(TimeSpan checkIntervalTimeSpan)
{
checkInterval = checkIntervalTimeSpan;
releaserThread = new Thread(RunReleaseObjectLoop) { IsBackground = true };
releaserThread.Start();
}
internal WeakObjectManagerImpl() : this(DefaultCheckInterval) { }
public int GetReferencesCount()
{
return weakReferences.Count;
}
private void RunReleaseObjectLoop()
{
logger.LogInfo("Checking objects thread start ...");
while (shouldKeepRunning)
{
ReleseGarbageCollectedObjects();
Thread.Sleep(CheckInterval);
}
logger.LogDebug("Checking objects thread stopped.");
}
~WeakObjectManagerImpl()
{
Dispose();
}
public void AddWeakRefereceObject(JvmObjectReference obj)
{
if (obj == null || string.IsNullOrEmpty(obj.Id))
{
logger.LogWarn("Not add null weak object or id : {0}", obj);
return;
}
weakReferences.Enqueue(new WeakReferenceObjectIdPair(new WeakReference(obj), obj.ToString()));
}
private void ReleseGarbageCollectedObjects()
{
int referencesCount = weakReferences.Count;
if (referencesCount == 0)
{
logger.LogDebug("check begin : quit as weakReferences.Count = 0");
return;
}
var beginTime = DateTime.Now;
int checkCount = checkCountController.AdjustCheckCount(referencesCount);
logger.LogDebug("check begin : weakReferences.Count = {0}, checkCount: {1}", referencesCount, checkCount);
int garbageCount;
var aliveList = ReleseGarbageCollectedObjects(checkCount, out garbageCount);
var timeReleaseGarbage = DateTime.Now;
aliveList.ForEach(item => weakReferences.Enqueue(item));
var timeStoreAlive = DateTime.Now;
logger.LogInfo("check end : released {0} garbage, remain {1} alive, used {2} ms : release garbage used {3} ms, store alive used {4} ms",
garbageCount, weakReferences.Count, (DateTime.Now - beginTime).TotalMilliseconds,
(timeReleaseGarbage - beginTime).TotalMilliseconds,
(timeStoreAlive - timeReleaseGarbage).TotalMilliseconds
);
}
private List<WeakReferenceObjectIdPair> ReleseGarbageCollectedObjects(int checkCount, out int garbageCount)
{
var aliveList = new List<WeakReferenceObjectIdPair>();
garbageCount = 0;
int i = 0;
WeakReferenceObjectIdPair weakReferenceObjectIdPair;
while (weakReferences.TryDequeue(out weakReferenceObjectIdPair))
{
var weakRef = weakReferenceObjectIdPair.Key;
if (weakRef.IsAlive)
{
aliveList.Add(weakReferenceObjectIdPair);
}
else
{
objectReleaser.ReleaseObject(weakReferenceObjectIdPair.Value);
garbageCount++;
}
i++;
if (i >= checkCount)
{
logger.LogDebug("Stop releasing as exceeded allowed checkCount: {0}", checkCount);
break;
}
}
return aliveList;
}
/// <summary>
/// It can be an expensive operation. ** Do not use ** unless there is a real need for this method
/// </summary>
/// <returns></returns>
public int GetAliveCount()
{
//copying to get alive count at the time of this method call
var copiedList = new Queue<WeakReferenceObjectIdPair>(weakReferences);
var count = 0;
foreach (var weakReference in copiedList)
{
if (weakReference.Key.IsAlive)
{
count++;
}
}
return count;
}
public virtual void Dispose()
{
logger.LogInfo("Dispose {0}", this.GetType());
shouldKeepRunning = false;
}
}
internal interface IObjectReleaser
{
void ReleaseObject(string objId);
}
internal class JvmObjectReleaser : IObjectReleaser
{
private const string ReleaseHandler = "SparkCLRHandler";
private const string ReleaseMethod = "rm";
public void ReleaseObject(string objId)
{
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(ReleaseHandler, ReleaseMethod, objId);
}
}
}

Просмотреть файл

@ -8,6 +8,7 @@ using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Proxy.Ipc;
[assembly: InternalsVisibleTo("Tests.Common")]
[assembly: InternalsVisibleTo("AdapterTest")]
[assembly: InternalsVisibleTo("WorkerTest")]
// DynamicProxyGenAssembly2 is a temporary assembly built by mocking systems that use CastleProxy like Moq
@ -39,5 +40,12 @@ namespace Microsoft.Spark.CSharp.Interop
configurationService = value;
}
}
private static IWeakObjectManager weakObjectManager;
internal static IWeakObjectManager WeakObjectManager
{
get { return weakObjectManager ?? (weakObjectManager = new WeakObjectManagerImpl()); }
set { weakObjectManager = value; }
}
}
}

Просмотреть файл

@ -0,0 +1,130 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.IO;
using System.Net;
using System.Net.Sockets;
namespace Microsoft.Spark.CSharp.Network
{
/// <summary>
/// A simple wrapper of System.Net.Sockets.Socket class.
/// </summary>
public class DefaultSocketWrapper : ISocketWrapper
{
private readonly Socket innerSocket;
/// <summary>
/// Default constructor that creates a new instance of the DefaultSocketWrapper class, which represents
/// a traditional socket (System.Net.Sockets.Socket).
///
/// This socket is bound to Loopback with port 0.
/// </summary>
public DefaultSocketWrapper()
{
innerSocket = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
var localEndPoint = new IPEndPoint(IPAddress.Loopback, 0);
innerSocket.Bind(localEndPoint);
}
/// <summary>
/// Initializes an instance of the DefaultSocketWrapper class using the specified System.Net.Sockets.Socket object.
/// </summary>
/// <param name="socket">The existing socket</param>
private DefaultSocketWrapper(Socket socket)
{
innerSocket = socket;
}
/// <summary>
/// Accepts an incoming connection request.
/// </summary>
/// <returns>A DefaultSocketWrapper instance used to send and receive data</returns>
public ISocketWrapper Accept()
{
var socket = innerSocket.Accept();
return new DefaultSocketWrapper(socket);
}
/// <summary>
/// Closes the socket connection and releases all associated resources.
/// </summary>
public void Close()
{
innerSocket.Close();
}
/// <summary>
/// Establishes a connection to a remote host that is specified by an IP address and a port number
/// </summary>
/// <param name="remoteaddr">The IP address of the remote host</param>
/// <param name="port">The port number of the remote host</param>
public void Connect(IPAddress remoteaddr, int port)
{
var remoteEndPoint = new IPEndPoint(remoteaddr, port);
innerSocket.Connect(remoteEndPoint);
}
/// <summary>
/// Returns the NetworkStream used to send and receive data.
/// </summary>
/// <returns>The underlying Stream instance that can be used to send and receive data</returns>
/// <remarks>
/// GetStream returns a NetworkStream that you can use to send and receive data. You must close/dispose
/// the NetworkStream yourself; disposing the DefaultSocketWrapper does not release the NetworkStream.
/// </remarks>
public Stream GetStream()
{
return new NetworkStream(innerSocket);
}
/// <summary>
/// Starts listening for incoming connection requests
/// </summary>
/// <param name="backlog">The maximum length of the pending connections queue. </param>
public void Listen(int backlog = (int)SocketOptionName.MaxConnections)
{
innerSocket.Listen(backlog);
}
/// <summary>
/// Disposes the resources used by this instance of the DefaultSocketWrapper class.
/// </summary>
/// <param name="disposing">true to release both managed and unmanaged resources; false to release only unmanaged resources</param>
protected virtual void Dispose(bool disposing)
{
if (disposing)
{
innerSocket.Dispose();
}
}
/// <summary>
/// Releases all resources used by the current instance of the DefaultSocketWrapper class.
/// </summary>
public void Dispose()
{
Dispose(true);
}
/// <summary>
/// Frees resources used by the DefaultSocketWrapper class
/// </summary>
~DefaultSocketWrapper()
{
Dispose(false);
}
/// <summary>
/// Returns the local endpoint.
/// </summary>
public EndPoint LocalEndPoint
{
get
{
return innerSocket.LocalEndPoint;
}
}
}
}
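// Editorial note (illustrative sketch, not part of this commit): typical client-side use of the
// wrapper; the port number below is hypothetical. As noted in the GetStream remarks, the
// NetworkStream must be disposed by the caller.
//
//     using (ISocketWrapper socket = new DefaultSocketWrapper())
//     {
//         socket.Connect(System.Net.IPAddress.Loopback, 8080);
//         using (var stream = socket.GetStream())
//         {
//             stream.Write(new byte[] { 1, 2, 3 }, 0, 3);
//         }
//     }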

Просмотреть файл

@ -0,0 +1,52 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.IO;
using System.Net;
using System.Net.Sockets;
namespace Microsoft.Spark.CSharp.Network
{
/// <summary>
/// ISocketWrapper interface defines the common methods to operate a socket (traditional socket or
/// Windows Registered IO socket)
/// </summary>
public interface ISocketWrapper : IDisposable
{
/// <summary>
/// Accepts an incoming connection request.
/// </summary>
/// <returns>An ISocketWrapper instance used to send and receive data</returns>
ISocketWrapper Accept();
/// <summary>
/// Closes the socket connection and releases all associated resources.
/// </summary>
void Close();
/// <summary>
/// Establishes a connection to a remote host that is specified by an IP address and a port number
/// </summary>
/// <param name="remoteaddr">The IP address of the remote host</param>
/// <param name="port">The port number of the remote host</param>
void Connect(IPAddress remoteaddr, int port);
/// <summary>
/// Returns a stream used to send and receive data.
/// </summary>
/// <returns>The underlying Stream instance that can be used to send and receive data</returns>
Stream GetStream();
/// <summary>
/// Starts listening for incoming connection requests
/// </summary>
/// <param name="backlog">The maximum length of the pending connections queue. </param>
void Listen(int backlog = (int)SocketOptionName.MaxConnections);
/// <summary>
/// Returns the local endpoint.
/// </summary>
EndPoint LocalEndPoint { get; }
}
}

Просмотреть файл

@ -0,0 +1,27 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
namespace Microsoft.Spark.CSharp.Network
{
/// <summary>
/// SocketFactory is used to create an ISocketWrapper instance based on the configuration and OS version.
///
/// The returned instance can be a RioSocket object if the configuration selects RioSocket and the
/// application is running on a Windows OS that supports Registered IO sockets.
/// </summary>
public static class SocketFactory
{
/// <summary>
/// Creates an ISocketWrapper instance based on the configuration and OS version.
/// </summary>
/// <returns>
/// A RioSocket instance if the configuration selects RioSocket and the application is running
/// on a Windows OS that supports Registered IO sockets. By default, it returns a
/// DefaultSocketWrapper instance, which wraps System.Net.Sockets.Socket.
/// </returns>
public static ISocketWrapper CreateSocket()
{
return new DefaultSocketWrapper();
}
}
}
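// Editorial note (illustrative sketch, not part of this commit): callers obtain sockets through the
// factory rather than instantiating DefaultSocketWrapper directly, so a RIO-based implementation
// can be substituted later without changing call sites:
//
//     ISocketWrapper listener = SocketFactory.CreateSocket();
//     listener.Listen();
//     // Accept() then returns a new ISocketWrapper per incoming connection.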

Просмотреть файл

@ -30,5 +30,5 @@ using System.Runtime.InteropServices;
// Build Number
// Revision
//
[assembly: AssemblyVersion("1.6.0.0")]
[assembly: AssemblyFileVersion("1.6.0.0")]
[assembly: AssemblyVersion("1.6.1.0")]
[assembly: AssemblyFileVersion("1.6.1.0")]

Просмотреть файл

@ -19,7 +19,7 @@ namespace Microsoft.Spark.CSharp.Proxy
void CallForeachRDD(byte[] func, string serializedMode);
void Print(int num = 10);
void Persist(StorageLevelType storageLevelType);
void Checkpoint(long intervalMs);
void Checkpoint(int intervalSeconds);
IRDDProxy[] Slice(long fromUnixTime, long toUnixTime);
}
}

Просмотреть файл

@ -42,6 +42,7 @@ namespace Microsoft.Spark.CSharp.Proxy
IDataFrameProxy Replace<T>(object subset, Dictionary<T, T> toReplaceAndValueDict);
IEnumerable<IDataFrameProxy> RandomSplit(IEnumerable<double> weights, long? seed);
IDataFrameProxy Sort(IColumnProxy[] columns);
IDataFrameProxy SortWithinPartitions(IColumnProxy[] columns);
IDataFrameProxy Alias(string alias);
double Corr(string column1, string column2, string method);
double Cov(string column1, string column2);
@ -55,6 +56,8 @@ namespace Microsoft.Spark.CSharp.Proxy
void Persist(StorageLevelType storageLevelType);
void Unpersist(bool blocking = true);
IDataFrameProxy Repartition(int numPartitions);
IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns);
IDataFrameProxy Repartition(IColumnProxy[] columns);
IDataFrameProxy Sample(bool withReplacement, double fraction, long seed);
IDataFrameWriterProxy Write();
}

Просмотреть файл

@ -3,7 +3,6 @@
using System;
using System.Collections.Generic;
using System.Data;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

Просмотреть файл

@ -42,6 +42,5 @@ namespace Microsoft.Spark.CSharp.Proxy
void SaveAsTextFile(string path, string compressionCodecClass);
long Count();
int CollectAndServe();
int PartitionLength();
}
}

Просмотреть файл

@ -16,7 +16,7 @@ namespace Microsoft.Spark.CSharp.Proxy
// or restore it from checkpoint. Thus this function is called before IStreamingContextProxy is initialized. So CheckpointExists()
// should not be put to IStreamingContextProxy.
bool CheckpointExists(string checkpointPath);
IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, long durationMs);
IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, int durationSeconds);
IStreamingContextProxy CreateStreamingContext(string checkpointPath);
}
}

Просмотреть файл

@ -15,6 +15,7 @@ namespace Microsoft.Spark.CSharp.Proxy
internal interface ISparkContextProxy
{
ISqlContextProxy CreateSqlContext();
ISqlContextProxy CreateHiveContext();
IColumnProxy CreateColumnFromName(string name);
IColumnProxy CreateFunction(string name, object self);
IColumnProxy CreateBinaryMathFunction(string name, object self, object other);
@ -50,7 +51,7 @@ namespace Microsoft.Spark.CSharp.Proxy
int RunJob(IRDDProxy rdd, IEnumerable<int> partitions);
IBroadcastProxy ReadBroadcastFromFile(string path, out long broadcastId);
IRDDProxy CreateCSharpRdd(IRDDProxy prefvJavaRddReference, byte[] command, Dictionary<string, string> environmentVariables, List<string> pythonIncludes, bool preservePartitioning, List<Broadcast> broadcastVariables, List<byte[]> accumulator);
IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions);
IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions, long partitionFuncId);
IUDFProxy CreateUserDefinedCSharpFunction(string name, byte[] command, string returnType);
}
internal interface IBroadcastProxy

Просмотреть файл

@ -14,12 +14,31 @@ namespace Microsoft.Spark.CSharp.Proxy
internal interface ISqlContextProxy
{
IDataFrameReaderProxy Read();
ISqlContextProxy NewSession();
string GetConf(string key, string defaultValue);
void SetConf(string key, string value);
IDataFrameProxy CreateDataFrame(IRDDProxy rddProxy, IStructTypeProxy structTypeProxy);
void RegisterDataFrameAsTable(IDataFrameProxy dataFrameProxy, string tableName);
void DropTempTable(string tableName);
IDataFrameProxy Table(string tableName);
IDataFrameProxy Tables();
IDataFrameProxy Tables(string databaseName);
IEnumerable<string> TableNames();
IEnumerable<string> TableNames(string databaseName);
void CacheTable(string tableName);
void UncacheTable(string tableName);
void ClearCache();
bool IsCached(string tableName);
IDataFrameProxy ReadDataFrame(string path, StructType schema, Dictionary<string, string> options);
IDataFrameProxy JsonFile(string path);
IDataFrameProxy TextFile(string path, StructType schema, string delimiter);
IDataFrameProxy TextFile(string path, string delimiter, bool hasHeader, bool inferSchema);
IDataFrameProxy Sql(string query);
void RegisterFunction(string name, byte[] command, string returnType);
#region HiveContext
void RefreshTable(string tableName);
#endregion
}
}

Просмотреть файл

@ -16,18 +16,23 @@ namespace Microsoft.Spark.CSharp.Proxy
SparkContext SparkContext { get; }
void Start();
void Stop();
void Remember(long durationMs);
void Remember(int durationSeconds);
void Checkpoint(string directory);
IDStreamProxy TextFileStream(string directory);
IDStreamProxy SocketTextStream(string hostname, int port, StorageLevelType storageLevelType);
IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType);
IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets);
IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
int numPartitions, byte[] readFunc, string serializationMode);
IDStreamProxy Union(IDStreamProxy firstDStreams, IDStreamProxy[] otherDStreams);
void AwaitTermination();
void AwaitTermination(int timeout);
void AwaitTerminationOrTimeout(long timeout);
IDStreamProxy CreateCSharpDStream(IDStreamProxy jdstream, byte[] func, string serializationMode);
IDStreamProxy CreateCSharpTransformed2DStream(IDStreamProxy jdstream, IDStreamProxy jother, byte[] func, string serializationMode, string serializationModeOther);
IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string serializationMode);
IDStreamProxy CreateCSharpStateDStream(IDStreamProxy jdstream, byte[] func, string className, string serializationMode, string serializationMode2);
IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy);
IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType);
}
}

Просмотреть файл

@ -40,7 +40,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IDStreamProxy Window(int windowSeconds, int slideSeconds = 0)
{
string windowId = null;
var windowDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { windowSeconds * 1000 });
var windowDurationReference = JvmBridgeUtils.GetJavaDuration(windowSeconds);
if (slideSeconds <= 0)
{
@ -48,7 +48,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new DStreamIpcProxy(new JvmObjectReference(windowId));
}
var slideDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { slideSeconds * 1000 });
var slideDurationReference = JvmBridgeUtils.GetJavaDuration(slideSeconds);
windowId = (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(javaDStreamReference, "window", new object[] { windowDurationReference, slideDurationReference });
return new DStreamIpcProxy(new JvmObjectReference(windowId));
@ -77,9 +77,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDStreamReference, "persist", new object[] { jstorageLevel });
}
public void Checkpoint(long intervalMs)
public void Checkpoint(int intervalSeconds)
{
var jinterval = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { intervalMs });
var jinterval = JvmBridgeUtils.GetJavaDuration(intervalSeconds);
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDStreamReference, "checkpoint", new object[] { jinterval });
}
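// Editorial note (illustrative sketch, not part of this commit): JvmBridgeUtils.GetJavaDuration
// replaces the direct Duration constructor calls seen in the old lines above; it is assumed to be
// roughly equivalent to:
//
//     public static JvmObjectReference GetJavaDuration(int durationSeconds)
//     {
//         return SparkCLRIpcProxy.JvmBridge.CallConstructor(
//             "org.apache.spark.streaming.Duration", new object[] { durationSeconds * 1000 });
//     }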

Просмотреть файл

@ -14,6 +14,12 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
internal class DataFrameIpcProxy : IDataFrameProxy
{
private readonly JvmObjectReference jvmDataFrameReference;
internal JvmObjectReference JvmDataFrameReference
{
get { return jvmDataFrameReference; }
}
private readonly ISqlContextProxy sqlContextProxy;
private readonly DataFrameNaFunctions na;
@ -405,6 +411,20 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
jvmDataFrameReference, "sort", columnsSeq).ToString()), sqlContextProxy);
}
/// <summary>
/// Call https://github.com/apache/spark/blob/branch-1.6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, sortWithinPartitions(sortExprs: Column*): DataFrame
/// </summary>
/// <param name="columns"></param>
/// <returns></returns>
public IDataFrameProxy SortWithinPartitions(IColumnProxy[] columns)
{
var columnsSeq = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
"toSeq", new object[] { columns.Select(c => (c as ColumnIpcProxy).ScalaColumnReference).ToArray() }));
return new DataFrameIpcProxy(new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
jvmDataFrameReference, "sortWithinPartitions", columnsSeq).ToString()), sqlContextProxy);
}
/// <summary>
/// Call https://github.com/apache/spark/blob/branch-1.4/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, as(alias: String): DataFrame
/// </summary>
@ -517,6 +537,35 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
new object[] { numPartitions }).ToString()), sqlContextProxy);
}
/// <summary>
/// Call https://github.com/apache/spark/blob/branch-1.6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, repartition(numPartitions: Int, partitionExprs: Column*): DataFrame
/// </summary>
/// <param name="numPartitions"></param>
/// <param name="columns"></param>
/// <returns></returns>
public IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns)
{
var columnsSeq = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
"toSeq", new object[] { columns.Select(c => (c as ColumnIpcProxy).ScalaColumnReference).ToArray() }));
return new DataFrameIpcProxy(new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
jvmDataFrameReference, "repartition", new object[] { numPartitions, columnsSeq }).ToString()), sqlContextProxy);
}
/// <summary>
/// Call https://github.com/apache/spark/blob/branch-1.6/sql/core/src/main/scala/org/apache/spark/sql/DataFrame.scala, repartition(partitionExprs: Column*): DataFrame
/// </summary>
/// <param name="columns"></param>
/// <returns></returns>
public IDataFrameProxy Repartition(IColumnProxy[] columns)
{
var columnsSeq = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils",
"toSeq", new object[] { columns.Select(c => (c as ColumnIpcProxy).ScalaColumnReference).ToArray() }));
return new DataFrameIpcProxy(new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
jvmDataFrameReference, "repartition", new object[] { columnsSeq }).ToString()), sqlContextProxy);
}
public IDataFrameProxy Sample(bool withReplacement, double fraction, long seed)
{
return

Просмотреть файл

@ -16,6 +16,7 @@ using Microsoft.Spark.CSharp.Interop.Ipc;
namespace Microsoft.Spark.CSharp.Proxy.Ipc
{
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
[Serializable]
internal class RDDIpcProxy : IRDDProxy
{
private readonly JvmObjectReference jvmRddReference;
@ -78,13 +79,6 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new RDDIpcProxy(jref);
}
public int PartitionLength()
{
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "rdd"));
var partitions = SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(rdd, "partitions", new object[] { });
return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("java.lang.reflect.Array", "getLength", new object[] { partitions }).ToString());
}
public IRDDProxy Coalesce(int numPartitions, bool shuffle)
{
return new RDDIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "coalesce", new object[] { numPartitions, shuffle })));
@ -166,7 +160,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IRDDProxy SampleByKey(bool withReplacement, Dictionary<string, double> fractions, long seed)
{
var jfractions = SparkContextIpcProxy.GetJavaMap(fractions) as JvmObjectReference;
var jfractions = JvmBridgeUtils.GetJavaMap(fractions) as JvmObjectReference;
return new RDDIpcProxy(new JvmObjectReference((string) SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "sampleByKey", new object[] { withReplacement, jfractions, seed })));
}
@ -184,25 +178,25 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public void SaveAsNewAPIHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
{
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, true });
}
public void SaveAsNewAPIHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf)
{
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsNewAPIHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf });
}
public void SaveAsHadoopDataset(IEnumerable<KeyValuePair<string, string>> conf)
{
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopDataset", new object[] { jvmRddReference, false, jconf, null, null, false });
}
public void SaveAsHadoopFile(string path, string outputFormatClass, string keyClass, string valueClass, IEnumerable<KeyValuePair<string, string>> conf, string compressionCodecClass)
{
var jconf = SparkContextIpcProxy.GetJavaMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaMap<string, string>(conf);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "saveAsHadoopFile", new object[] { jvmRddReference, false, path, outputFormatClass, keyClass, valueClass, null, null, jconf, compressionCodecClass });
}
@ -211,17 +205,18 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "SaveAsSequenceFile", new object[] { jvmRddReference, false, path, compressionCodecClass });
}
//this method is called by RDD<string> (implementation is at StringRDDFunctions.SaveAsTextFile)
//calling saveAsTextFile() on CSharpRDDs results in raw bytes being written to the text file - so saveStringRddAsTextFile() is called instead, which converts bytes to strings before writing to the file
public void SaveAsTextFile(string path, string compressionCodecClass)
{
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "rdd"));
if (!string.IsNullOrEmpty(compressionCodecClass))
{
var codec = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("java.lang.Class", "forName", new object[] { compressionCodecClass }));
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "saveAsTextFile", new object[] { path, codec });
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.csharp.CSharpRDD", "saveStringRddAsTextFile", new object[] { jvmRddReference, path, codec });
}
else
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmRddReference, "saveAsTextFile", new object[] { path });
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.csharp.CSharpRDD", "saveStringRddAsTextFile", new object[] { jvmRddReference, path });
}
}
public StorageLevel GetStorageLevel()

Просмотреть файл

@ -80,9 +80,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(fs, "listStatus", path) != null;
}
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, long durationMs)
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, int durationSeconds)
{
streamingContextIpcProxy = new StreamingContextIpcProxy(sparkContext, durationMs);
streamingContextIpcProxy = new StreamingContextIpcProxy(sparkContext, durationSeconds);
return streamingContextIpcProxy;
}

Просмотреть файл

@ -39,6 +39,13 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new SqlContextIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "createSQLContext", new object[] { jvmSparkContextReference })));
}
public ISqlContextProxy CreateHiveContext()
{
return new SqlContextIpcProxy(new JvmObjectReference(
(string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
"org.apache.spark.sql.api.csharp.SQLUtils", "createHiveContext", new object[] { jvmSparkContextReference })));
}
public void CreateSparkContext(string master, string appName, string sparkHome, ISparkConfProxy conf)
{
object[] args = (new object[] { master, appName, sparkHome, (conf == null ? null : (conf as SparkConfIpcProxy).JvmSparkConfReference) }).Where(x => x != null).ToArray();
@ -152,7 +159,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IRDDProxy NewAPIHadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
{
var jconf = GetJavaHashMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopFile",
new object[] { jvmJavaContextReference, filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
return new RDDIpcProxy(jvmRddReference);
@ -160,7 +167,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IRDDProxy NewAPIHadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
{
var jconf = GetJavaHashMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "newAPIHadoopRDD",
new object[] { jvmJavaContextReference, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
return new RDDIpcProxy(jvmRddReference);
@ -168,7 +175,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IRDDProxy HadoopFile(string filePath, string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
{
var jconf = GetJavaHashMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopFile",
new object[] { jvmJavaContextReference, filePath, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
return new RDDIpcProxy(jvmRddReference);
@ -176,7 +183,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IRDDProxy HadoopRDD(string inputFormatClass, string keyClass, string valueClass, string keyConverterClass, string valueConverterClass, IEnumerable<KeyValuePair<string, string>> conf, int batchSize)
{
var jconf = GetJavaHashMap<string, string>(conf);
var jconf = JvmBridgeUtils.GetJavaHashMap<string, string>(conf);
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "hadoopRDD",
new object[] { jvmJavaContextReference, inputFormatClass, keyClass, valueClass, keyConverterClass, valueConverterClass, jconf, batchSize }));
return new RDDIpcProxy(jvmRddReference);
@ -191,7 +198,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IRDDProxy Union(IEnumerable<IRDDProxy> rdds)
{
var jfirst = (rdds.First() as RDDIpcProxy).JvmRddReference;
var jrest = GetJavaList<JvmObjectReference>(rdds.Skip(1).Select(r => (r as RDDIpcProxy).JvmRddReference));
var jrest = JvmBridgeUtils.GetJavaList<JvmObjectReference>(rdds.Skip(1).Select(r => (r as RDDIpcProxy).JvmRddReference));
var jvmRddReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaContextReference, "union", new object[] { jfirst, jrest }));
return new RDDIpcProxy(jvmRddReference);
}
@ -250,13 +257,20 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
}
}
public IRDDProxy CreatePairwiseRDD(IRDDProxy jvmReferenceOfByteArrayRdd, int numPartitions)
/// <summary>
/// Creates a PairwiseRDD.
/// </summary>
/// <param name="jvmReferenceOfByteArrayRdd"></param>
/// <param name="numPartitions"></param>
/// <param name="partitionFuncId">Globally unique id of the partitioner, used for comparing PythonPartitioners in the JVM.</param>
/// <returns></returns>
public IRDDProxy CreatePairwiseRDD(IRDDProxy jvmReferenceOfByteArrayRdd, int numPartitions, long partitionFuncId)
{
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod((jvmReferenceOfByteArrayRdd as RDDIpcProxy).JvmRddReference, "rdd"));
var pairwiseRdd = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PairwiseRDD", rdd);
var pairRddJvmReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(pairwiseRdd, "asJavaPairRDD", new object[] { }).ToString());
var jpartitionerJavaReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonPartitioner", new object[] { numPartitions, (long)0 });
var jpartitionerJavaReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.api.python.PythonPartitioner", new object[] { numPartitions, partitionFuncId });
var partitionedPairRddJvmReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(pairRddJvmReference, "partitionBy", new object[] { jpartitionerJavaReference }).ToString());
var jvmRddReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "valueOfPair", new object[] { partitionedPairRddJvmReference }).ToString());
//var jvmRddReference = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(partitionedRddJvmReference, "rdd", new object[] { }).ToString());
@ -267,7 +281,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
{
var hashTableReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
var arrayListReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
var jbroadcastVariables = GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
var jbroadcastVariables = JvmBridgeUtils.GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
var rdd = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod((prevJvmRddReference as RDDIpcProxy).JvmRddReference, "rdd"));
@ -288,7 +302,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
{
var jSqlContext = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.sql.SQLContext", new object[] { jvmSparkContextReference });
var jDataType = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jSqlContext, "parseDataType", new object[] { "\"" + returnType + "\"" }));
var jbroadcastVariables = GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
var jbroadcastVariables = JvmBridgeUtils.GetJavaList<JvmObjectReference>(jvmBroadcastReferences);
var hashTableReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
var arrayListReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
@ -306,7 +320,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public int RunJob(IRDDProxy rdd, IEnumerable<int> partitions)
{
var jpartitions = GetJavaList<int>(partitions);
var jpartitions = JvmBridgeUtils.GetJavaList<int>(partitions);
return int.Parse(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.python.PythonRDD", "runJob", new object[] { jvmSparkContextReference, (rdd as RDDIpcProxy).JvmRddReference, jpartitions }).ToString());
}
@ -333,7 +347,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
if (self is ColumnIpcProxy)
self = (self as ColumnIpcProxy).ScalaColumnReference;
else if (self is IColumnProxy[])
self = GetJavaSeq<JvmObjectReference>((self as IColumnProxy[]).Select(x => (x as ColumnIpcProxy).ScalaColumnReference));
self = JvmBridgeUtils.GetJavaSeq<JvmObjectReference>((self as IColumnProxy[]).Select(x => (x as ColumnIpcProxy).ScalaColumnReference));
return new ColumnIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.functions", name, self)));
}
@ -351,52 +365,6 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new ColumnIpcProxy(new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.functions", name)));
}
public static JvmObjectReference GetJavaMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
{
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.Hashtable", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
}
return jmap;
}
public static JvmObjectReference GetJavaHashMap<K, V>(IEnumerable<KeyValuePair<K, V>> enumerable)
{
var jmap = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashMap", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jmap, "put", new object[] { item.Key, item.Value });
}
return jmap;
}
public static JvmObjectReference GetJavaSet<T>(IEnumerable<T> enumerable)
{
var jset = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.HashSet", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jset, "add", new object[] { item });
}
return jset;
}
public static JvmObjectReference GetJavaList<T>(IEnumerable<T> enumerable)
{
var jlist = SparkCLRIpcProxy.JvmBridge.CallConstructor("java.util.ArrayList", new object[] { });
if (enumerable != null)
{
foreach (var item in enumerable)
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jlist, "add", new object[] { item });
}
return jlist;
}
public static JvmObjectReference GetJavaSeq<T>(IEnumerable<T> enumerable)
{
return new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.sql.api.csharp.SQLUtils", "toSeq", GetJavaList<T>(enumerable)));
}
public static JvmObjectReference GetJavaStorageLevel(StorageLevelType storageLevelType)
{
return new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("org.apache.spark.api.java.StorageLevels", "create",

Просмотреть файл

@ -105,5 +105,100 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(judf, "registerPython", new object[] {name, udf});
}
public ISqlContextProxy NewSession()
{
return new SqlContextIpcProxy(
new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "newSession")));
}
public string GetConf(string key, string defaultValue)
{
return (string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "getConf", new object[] { key, defaultValue });
}
public void SetConf(string key, string value)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "setConf", new object[] { key, value });
}
public void RegisterDataFrameAsTable(IDataFrameProxy dataFrameProxy, string tableName)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
jvmSqlContextReference, "registerDataFrameAsTable",
new object[] { (dataFrameProxy as DataFrameIpcProxy).JvmDataFrameReference, tableName });
}
public void DropTempTable(string tableName)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(
jvmSqlContextReference, "dropTempTable", new object[] { tableName });
}
public IDataFrameProxy Table(string tableName)
{
return new DataFrameIpcProxy(
new JvmObjectReference(
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "table",
new object[] { tableName })), this);
}
public IDataFrameProxy Tables()
{
return new DataFrameIpcProxy(
new JvmObjectReference(
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tables")), this);
}
public IDataFrameProxy Tables(string databaseName)
{
return new DataFrameIpcProxy(
new JvmObjectReference(
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tables",
new object[] { databaseName })), this);
}
public IEnumerable<string> TableNames()
{
var tableNames = SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tableNames");
return (List<string>) tableNames;
}
public IEnumerable<string> TableNames(string databaseName)
{
return (List<string>)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "tableNames",
new object[] { databaseName });
}
public void CacheTable(string tableName)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "cacheTable",
new object[] { tableName });
}
public void UncacheTable(string tableName)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "uncacheTable",
new object[] { tableName });
}
public void ClearCache()
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "clearCache");
}
public bool IsCached(string tableName)
{
return (bool)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "isCached",
new object[] { tableName });
}
#region HiveContext
public void RefreshTable(string tableName)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmSqlContextReference, "refreshTable",
new object[] { tableName });
}
#endregion
}
}

Просмотреть файл

@ -10,12 +10,12 @@ using System.Net;
using System.Net.Sockets;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Network;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Proxy.Ipc
@ -26,7 +26,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
[ExcludeFromCodeCoverage] //IPC calls to JVM validated using validation-enabled samples - unit test coverage not required
internal class StreamingContextIpcProxy : IStreamingContextProxy
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SparkConf));
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(StreamingContextIpcProxy));
internal readonly JvmObjectReference jvmStreamingContextReference;
private readonly JvmObjectReference jvmJavaStreamingReference;
private readonly ISparkContextProxy sparkContextProxy;
@ -43,36 +43,51 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
}
}
public StreamingContextIpcProxy(SparkContext sparkContext, long durationMs)
public StreamingContextIpcProxy(SparkContext sparkContext, int durationSeconds)
{
this.sparkContext = sparkContext;
sparkContextProxy = sparkContext.SparkContextProxy;
var jduration = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { durationMs });
var jduration = JvmBridgeUtils.GetJavaDuration(durationSeconds);
JvmObjectReference jvmSparkContextReference = (sparkContextProxy as SparkContextIpcProxy).JvmSparkContextReference;
jvmStreamingContextReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.StreamingContext", new object[] { jvmSparkContextReference, jduration });
jvmJavaStreamingReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.java.JavaStreamingContext", new object[] { jvmStreamingContextReference });
StartAccumulatorServer(sparkContext);
StartCallbackServer();
}
public StreamingContextIpcProxy(string checkpointPath)
{
sparkContext = SparkContext.GetActiveSparkContext();
StartCallbackServer();
jvmJavaStreamingReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.java.JavaStreamingContext", new object[] { checkpointPath });
jvmStreamingContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "ssc"));
JvmObjectReference jvmSparkContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "sc"));
JvmObjectReference jvmSparkConfReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "conf"));
JvmObjectReference jvmJavaContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "sparkContext"));
sparkContextProxy = new SparkContextIpcProxy(jvmSparkContextReference, jvmJavaContextReference);
var sparkConfProxy = new SparkConfIpcProxy(jvmSparkConfReference);
sparkContext = new SparkContext(sparkContextProxy, new SparkConf(sparkConfProxy));
if (sparkContext == null)
{
JvmObjectReference jvmSparkContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "sc"));
JvmObjectReference jvmSparkConfReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "conf"));
JvmObjectReference jvmJavaContextReference = new JvmObjectReference((string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "sparkContext"));
sparkContextProxy = new SparkContextIpcProxy(jvmSparkContextReference, jvmJavaContextReference);
var sparkConfProxy = new SparkConfIpcProxy(jvmSparkConfReference);
sparkContext = new SparkContext(sparkContextProxy, new SparkConf(sparkConfProxy));
}
else
{
sparkContextProxy = sparkContext.SparkContextProxy;
}
StartAccumulatorServer(sparkContext);
}
private void StartAccumulatorServer(SparkContext sparkContext)
{
// TODO: We don't know whether the accumulator variable was used before restart, so we just start the accumulator server for safety.
sparkContext.StartAccumulatorServer();
}
public void Start()
{
int port = StartCallback();
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("SparkCLRHandler", "connectCallback", port); //className and methodName hardcoded in CSharpBackendHandler
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "start");
}
@ -84,9 +99,9 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("SparkCLRHandler", "closeCallback");
}
public void Remember(long durationMs)
public void Remember(int durationSeconds)
{
var jduration = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { (int)durationMs });
var jduration = JvmBridgeUtils.GetJavaDuration(durationSeconds);
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "remember", new object[] { jduration });
}
@ -119,8 +134,8 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IDStreamProxy CreateCSharpReducedWindowedDStream(IDStreamProxy jdstream, byte[] func, byte[] invFunc, int windowSeconds, int slideSeconds, string serializationMode)
{
var windowDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { windowSeconds * 1000 });
var slideDurationReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.Duration", new object[] { slideSeconds * 1000 });
var windowDurationReference = JvmBridgeUtils.GetJavaDuration(windowSeconds);
var slideDurationReference = JvmBridgeUtils.GetJavaDuration(slideSeconds);
var jvmDStreamReference = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.api.csharp.CSharpReducedWindowedDStream",
new object[] { (jdstream as DStreamIpcProxy).jvmDStreamReference, func, invFunc, windowDurationReference, slideDurationReference, serializationMode });
@ -138,6 +153,21 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
return new DStreamIpcProxy(javaDStreamReference, jvmDStreamReference);
}
public IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy)
{
var rddReference =
new JvmObjectReference(
(string)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(((RDDIpcProxy)rddProxy).JvmRddReference, "rdd"));
var jvmDStreamReference = SparkCLRIpcProxy.JvmBridge.CallConstructor(
"org.apache.spark.streaming.api.csharp.CSharpConstantInputDStream", jvmStreamingContextReference, rddReference);
var javaDStreamReference =
new JvmObjectReference((String)SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmDStreamReference, "asJavaDStream"));
return new DStreamIpcProxy(javaDStreamReference, jvmDStreamReference);
}
public IDStreamProxy TextFileStream(string directory)
{
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmJavaStreamingReference, "textFileStream", new object[] { directory }).ToString());
@ -153,19 +183,19 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
public IDStreamProxy KafkaStream(Dictionary<string, int> topics, Dictionary<string, string> kafkaParams, StorageLevelType storageLevelType)
{
JvmObjectReference jtopics = SparkContextIpcProxy.GetJavaMap<string, int>(topics);
JvmObjectReference jkafkaParams = SparkContextIpcProxy.GetJavaMap<string, string>(kafkaParams);
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaMap<string, int>(topics);
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
JvmObjectReference jlevel = SparkContextIpcProxy.GetJavaStorageLevel(storageLevelType);
// KafkaUtilsPythonHelper: external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
JvmObjectReference jhelper = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper", new object[] { });
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jhelper, "createStream", new object[] { jvmJavaStreamingReference, jkafkaParams, jtopics, jlevel }).ToString());
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy DirectKafkaStream(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets)
{
JvmObjectReference jtopics = SparkContextIpcProxy.GetJavaSet<string>(topics);
JvmObjectReference jkafkaParams = SparkContextIpcProxy.GetJavaMap<string, string>(kafkaParams);
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
var jTopicAndPartitions = fromOffsets.Select(x =>
new KeyValuePair<JvmObjectReference, long>
@ -175,13 +205,48 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
)
);
JvmObjectReference jfromOffsets = SparkContextIpcProxy.GetJavaMap<JvmObjectReference, long>(jTopicAndPartitions);
JvmObjectReference jfromOffsets = JvmBridgeUtils.GetJavaMap<JvmObjectReference, long>(jTopicAndPartitions);
// KafkaUtilsPythonHelper: external/kafka/src/main/scala/org/apache/spark/streaming/kafka/KafkaUtils.scala
JvmObjectReference jhelper = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.kafka.KafkaUtilsPythonHelper", new object[] { });
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jhelper, "createDirectStreamWithoutMessageHandler", new object[] { jvmJavaStreamingReference, jkafkaParams, jtopics, jfromOffsets }).ToString());
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams,
Dictionary<string, long> fromOffsets, int numPartitions, byte[] readFunc, string serializationMode)
{
JvmObjectReference jtopics = JvmBridgeUtils.GetJavaSet<string>(topics);
JvmObjectReference jkafkaParams = JvmBridgeUtils.GetJavaMap<string, string>(kafkaParams);
var jTopicAndPartitions = fromOffsets.Select(x =>
new KeyValuePair<JvmObjectReference, long>
(
SparkCLRIpcProxy.JvmBridge.CallConstructor("kafka.common.TopicAndPartition", new object[] { x.Key.Split(':')[0], int.Parse(x.Key.Split(':')[1]) }),
x.Value
)
);
JvmObjectReference jfromOffsets = JvmBridgeUtils.GetJavaMap<JvmObjectReference, long>(jTopicAndPartitions);
// SparkCLR\scala\src\main\org\apache\spark\streaming\api\kafka\KafkaUtilsCSharpHelper.scala
JvmObjectReference jhelper = SparkCLRIpcProxy.JvmBridge.CallConstructor("org.apache.spark.streaming.kafka.KafkaUtilsCSharpHelper", new object[] { });
var jstream = new JvmObjectReference(SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jhelper, "createDirectStreamWithoutMessageHandler",
new object[] { jvmJavaStreamingReference, jkafkaParams, jtopics, jfromOffsets, (int)numPartitions, readFunc, serializationMode }).ToString());
return new DStreamIpcProxy(jstream);
}
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
{
JvmObjectReference eventHubsParamsReference = JvmBridgeUtils.GetScalaMutableMap<string, string>(eventHubsParams);
JvmObjectReference storageLevelTypeReference = SparkContextIpcProxy.GetJavaStorageLevel(storageLevelType);
return
new DStreamIpcProxy(
new JvmObjectReference(
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod(
"org.apache.spark.streaming.api.csharp.EventHubsUtils", "createUnionStream",
new object[] { jvmJavaStreamingReference, eventHubsParamsReference, storageLevelTypeReference })
.ToString()));
}
public IDStreamProxy Union(IDStreamProxy firstDStream, IDStreamProxy[] otherDStreams)
{
return new DStreamIpcProxy(
@ -190,7 +255,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
new object[]
{
(firstDStream as DStreamIpcProxy).javaDStreamReference,
SparkContextIpcProxy.GetJavaList<JvmObjectReference>(otherDStreams.Select(x => (x as DStreamIpcProxy).javaDStreamReference))
JvmBridgeUtils.GetJavaList<JvmObjectReference>(otherDStreams.Select(x => (x as DStreamIpcProxy).javaDStreamReference))
}
)));
}
@ -200,19 +265,19 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "awaitTermination");
}
public void AwaitTermination(int timeout)
public void AwaitTerminationOrTimeout(long timeout)
{
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "awaitTermination", new object[] { timeout });
SparkCLRIpcProxy.JvmBridge.CallNonStaticJavaMethod(jvmStreamingContextReference, "awaitTerminationOrTimeout", new object[] { timeout });
}
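// Illustrative usage sketch (not part of this commit): at the public StreamingContext API level,
// which wraps this proxy, awaiting with a timeout would look roughly like the lines below. The
// assumption is that StreamingContext exposes an AwaitTerminationOrTimeout(long) wrapper that
// forwards to this proxy method; the 30000 ms value is arbitrary.
//   sparkStreamingContext.Start();
//   sparkStreamingContext.AwaitTerminationOrTimeout(30000);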
private void ProcessCallbackRequest(object socket)
{
logger.LogDebug("new thread created to process callback request");
logger.LogDebug("New thread (id={0}) created to process callback request", Thread.CurrentThread.ManagedThreadId);
try
{
using (Socket sock = (Socket)socket)
using (var s = new NetworkStream(sock))
using (var sock = (ISocketWrapper)socket)
using (var s = sock.GetStream())
{
while (true)
{
@ -268,6 +333,7 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
//log exception only when callback socket is not shutdown explicitly
if (!callbackSocketShutdown)
{
logger.LogError("Exception processing call back request. Thread id {0}", Thread.CurrentThread.ManagedThreadId);
logger.LogException(e);
// exit when exception happens
@ -281,16 +347,17 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
}
catch (Exception e)
{
logger.LogError("Exception in callback. Thread id {0}", Thread.CurrentThread.ManagedThreadId);
logger.LogException(e);
}
logger.LogDebug("thread to process callback request exit");
logger.LogDebug("Thread (id={0}) to process callback request exiting", Thread.CurrentThread.ManagedThreadId);
}
public int StartCallback()
private int StartCallbackServer()
{
TcpListener callbackServer = new TcpListener(IPAddress.Loopback, 0);
callbackServer.Start();
var callbackServer = SocketFactory.CreateSocket();
callbackServer.Listen();
Task.Run(() =>
{
@ -299,23 +366,28 @@ namespace Microsoft.Spark.CSharp.Proxy.Ipc
ThreadPool.SetMaxThreads(10, 10);
while (!callbackSocketShutdown)
{
Socket sock = callbackServer.AcceptSocket();
ThreadPool.QueueUserWorkItem(new WaitCallback(ProcessCallbackRequest), sock);
var sock = callbackServer.Accept();
ThreadPool.QueueUserWorkItem(ProcessCallbackRequest, sock);
}
}
catch (Exception e)
{
logger.LogError("Exception starting callback server");
logger.LogException(e);
throw;
}
finally
{
if (callbackServer != null)
callbackServer.Stop();
callbackServer.Close();
}
});
return (callbackServer.LocalEndpoint as IPEndPoint).Port;
int port = (callbackServer.LocalEndPoint as IPEndPoint).Port;
logger.LogInfo("Callback server port number is {0}", port);
SparkCLRIpcProxy.JvmBridge.CallStaticJavaMethod("SparkCLRHandler", "connectCallback", port); //className and methodName hard coded in CSharpBackendHandler
return port;
}
}
}

Просмотреть файл

@ -13,6 +13,11 @@ namespace Microsoft.Spark.CSharp.Services
public class DefaultLoggerService : ILoggerService
{
internal readonly static DefaultLoggerService Instance = new DefaultLoggerService(typeof (Type));
/// <summary>
/// Get an instance of ILoggerService by a given type of logger
/// </summary>
/// <param name="type">The type of a logger to return</param>
/// <returns>An instance of ILoggerService</returns>
public ILoggerService GetLoggerInstance(Type type)
{
return new DefaultLoggerService(type);
@ -24,31 +29,105 @@ namespace Microsoft.Spark.CSharp.Services
type = t;
}
/// <summary>
/// Logs a message at debug level.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogDebug(string message)
{
Log("Debug", message);
}
/// <summary>
/// Logs a message at debug level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogDebug(string messageFormat, params object[] messageParameters)
{
Log("Debug", string.Format(messageFormat, messageParameters));
}
/// <summary>
/// Logs a message at info level.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogInfo(string message)
{
Log("Info", message);
}
/// <summary>
/// Logs a message at info level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogInfo(string messageFormat, params object[] messageParameters)
{
Log("Info", string.Format(messageFormat, messageParameters));
}
/// <summary>
/// Logs a message at warning level.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogWarn(string message)
{
Log("Warn", message);
}
/// <summary>
/// Logs a message at warning level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogWarn(string messageFormat, params object[] messageParameters)
{
Log("Warn", string.Format(messageFormat, messageParameters));
}
/// <summary>
/// Logs a fatal message.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogFatal(string message)
{
Log("Fatal", message);
}
/// <summary>
/// Logs a fatal message with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogFatal(string messageFormat, params object[] messageParameters)
{
Log("Fatal", string.Format(messageFormat, messageParameters));
}
/// <summary>
/// Logs an error message.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogError(string message)
{
Log("Error", message);
}
/// <summary>
/// Logs an error message with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogError(string messageFormat, params object[] messageParameters)
{
Log("Error", string.Format(messageFormat, messageParameters));
}
/// <summary>
/// Logs an exception
/// </summary>
/// <param name="e">The exception to be logged</param>
public void LogException(Exception e)
{
Log("Exception", string.Format("{0}{1}{2}", e.Message, Environment.NewLine, e.StackTrace));

Просмотреть файл

@ -1,19 +1,77 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Microsoft.Spark.CSharp.Services
{
/// <summary>
/// Defines the logger interface used by the service
/// </summary>
public interface ILoggerService
{
/// <summary>
/// Get an instance of ILoggerService by a given type of logger
/// </summary>
/// <param name="type">The type of a logger to return</param>
/// <returns>An instance of ILoggerService</returns>
ILoggerService GetLoggerInstance(Type type);
/// <summary>
/// Logs a message at debug level.
/// </summary>
/// <param name="message">The message to be logged</param>
void LogDebug(string message);
/// <summary>
/// Logs a message at debug level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
void LogDebug(string messageFormat, params object[] messageParameters);
/// <summary>
/// Logs a message at info level.
/// </summary>
/// <param name="message">The message to be logged</param>
void LogInfo(string message);
/// <summary>
/// Logs a message at info level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
void LogInfo(string messageFormat, params object[] messageParameters);
/// <summary>
/// Logs a message at warning level.
/// </summary>
/// <param name="message">The message to be logged</param>
void LogWarn(string message);
/// <summary>
/// Logs a message at warning level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
void LogWarn(string messageFormat, params object[] messageParameters);
/// <summary>
/// Logs a fatal message.
/// </summary>
/// <param name="message">The message to be logged</param>
void LogFatal(string message);
/// <summary>
/// Logs a fatal message with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
void LogFatal(string messageFormat, params object[] messageParameters);
/// <summary>
/// Logs an error message.
/// </summary>
/// <param name="message">The message to be logged</param>
void LogError(string message);
/// <summary>
/// Logs an error message with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
void LogError(string messageFormat, params object[] messageParameters);
/// <summary>
/// Logs an exception
/// </summary>
/// <param name="e">The exception to be logged</param>
void LogException(Exception e);
}
}
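// Minimal sketch of a custom ILoggerService implementation (illustrative only; the class name
// ConsoleLoggerService is hypothetical and not part of this commit). It shows what a logger that
// satisfies the interface above and writes to the console could look like.
public class ConsoleLoggerService : ILoggerService
{
    private readonly Type type;

    public ConsoleLoggerService(Type type) { this.type = type; }

    // Returns a logger bound to the requesting type
    public ILoggerService GetLoggerInstance(Type type) { return new ConsoleLoggerService(type); }

    private void Write(string level, string message)
    {
        Console.WriteLine("[{0:u}] [{1}] [{2}] {3}", DateTime.UtcNow, level, type.Name, message);
    }

    public void LogDebug(string message) { Write("Debug", message); }
    public void LogDebug(string messageFormat, params object[] messageParameters) { Write("Debug", string.Format(messageFormat, messageParameters)); }
    public void LogInfo(string message) { Write("Info", message); }
    public void LogInfo(string messageFormat, params object[] messageParameters) { Write("Info", string.Format(messageFormat, messageParameters)); }
    public void LogWarn(string message) { Write("Warn", message); }
    public void LogWarn(string messageFormat, params object[] messageParameters) { Write("Warn", string.Format(messageFormat, messageParameters)); }
    public void LogFatal(string message) { Write("Fatal", message); }
    public void LogFatal(string messageFormat, params object[] messageParameters) { Write("Fatal", string.Format(messageFormat, messageParameters)); }
    public void LogError(string message) { Write("Error", message); }
    public void LogError(string messageFormat, params object[] messageParameters) { Write("Error", string.Format(messageFormat, messageParameters)); }
    public void LogException(Exception e) { Write("Exception", e.ToString()); }
}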

Просмотреть файл

@ -10,11 +10,17 @@ using log4net.Config;
namespace Microsoft.Spark.CSharp.Services
{
[ExcludeFromCodeCoverage] //unit test coverage not reqiured for logger service
/// <summary>
/// Represents a Log4Net logger.
/// </summary>
[ExcludeFromCodeCoverage] //unit test coverage not required for logger service
public class Log4NetLoggerService : ILoggerService
{
private readonly ILog logger;
private const string exceptionLogDelimiter = "*******************************************************************************************************************************";
/// <summary>
/// Gets an instance of the Log4Net logger
/// </summary>
public static Log4NetLoggerService Instance = new Log4NetLoggerService(typeof(Type));
static Log4NetLoggerService()
@ -22,37 +28,115 @@ namespace Microsoft.Spark.CSharp.Services
XmlConfigurator.Configure();
}
/// <summary>
/// Initializes an instance of Log4NetLoggerService with a specific type.
/// </summary>
/// <param name="type">The type of the logger</param>
public Log4NetLoggerService(Type type)
{
logger = LogManager.GetLogger(type);
log4net.GlobalContext.Properties["pid"] = Process.GetCurrentProcess().Id;
}
/// <summary>
/// Logs a message at debug level.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogDebug(string message)
{
logger.Debug(message);
}
/// <summary>
/// Logs a message at debug level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogDebug(string messageFormat, params object[] messageParameters)
{
logger.DebugFormat(messageFormat, messageParameters);
}
/// <summary>
/// Logs a message at info level.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogInfo(string message)
{
logger.Info(message);
}
/// <summary>
/// Logs a message at info level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogInfo(string messageFormat, params object[] messageParameters)
{
logger.InfoFormat(messageFormat, messageParameters);
}
/// <summary>
/// Logs a message at warning level.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogWarn(string message)
{
logger.Warn(message);
}
/// <summary>
/// Logs a message at warning level with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogWarn(string messageFormat, params object[] messageParameters)
{
logger.WarnFormat(messageFormat, messageParameters);
}
/// <summary>
/// Logs a fatal message.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogFatal(string message)
{
logger.Fatal(message);
}
/// <summary>
/// Logs a fatal message with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogFatal(string messageFormat, params object[] messageParameters)
{
logger.FatalFormat(messageFormat, messageParameters);
}
/// <summary>
/// Logs an error message.
/// </summary>
/// <param name="message">The message to be logged</param>
public void LogError(string message)
{
logger.Error(message);
}
/// <summary>
/// Logs an error message with a format string.
/// </summary>
/// <param name="messageFormat">The format string</param>
/// <param name="messageParameters">The array of arguments</param>
public void LogError(string messageFormat, params object[] messageParameters)
{
logger.ErrorFormat(messageFormat, messageParameters);
}
/// <summary>
/// Logs an exception
/// </summary>
/// <param name="e">The exception to be logged</param>
public void LogException(Exception e)
{
@ -92,7 +176,12 @@ namespace Microsoft.Spark.CSharp.Services
}
}
/// <summary>
/// Get an instance of ILoggerService by a given type of logger
/// </summary>
/// <param name="type">The type of a logger to return</param>
/// <returns>An instance of ILoggerService</returns>
public ILoggerService GetLoggerInstance(Type type)
{
return new Log4NetLoggerService(type);

Просмотреть файл

@ -12,11 +12,23 @@ namespace Microsoft.Spark.CSharp.Services
public class LoggerServiceFactory
{
private static ILoggerService loggerService = DefaultLoggerService.Instance;
/// <summary>
/// Overrides the existing logger with a given logger service instance
/// </summary>
/// <param name="loggerServiceOverride">The logger service instance used as the override</param>
public static void SetLoggerService(ILoggerService loggerServiceOverride)
{
loggerService = loggerServiceOverride;
var logger = GetLogger(typeof(LoggerServiceFactory));
logger.LogInfo("Logger service configured to use {0}", logger.GetType().Name);
}
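// Illustrative usage sketch (not part of this commit): switching the logger implementation at
// driver startup, before any Spark operations run. Program is a hypothetical driver class.
//   LoggerServiceFactory.SetLoggerService(Log4NetLoggerService.Instance);
//   var logger = LoggerServiceFactory.GetLogger(typeof(Program));
//   logger.LogInfo("Driver started");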
/// <summary>
/// Gets an instance of logger service for a given type.
/// </summary>
/// <param name="type">The type for which the logger is requested</param>
/// <returns>An instance of logger service</returns>
public static ILoggerService GetLogger(Type type)
{
return loggerService.GetLoggerInstance(type);

Просмотреть файл

@ -10,6 +10,9 @@ using Microsoft.Spark.CSharp.Interop;
namespace Microsoft.Spark.CSharp.Sql
{
/// <summary>
/// A column that will be computed based on the data in a DataFrame.
/// </summary>
public class Column
{
private readonly IColumnProxy columnProxy;
@ -27,81 +30,179 @@ namespace Microsoft.Spark.CSharp.Sql
this.columnProxy = columnProxy;
}
/// <summary>
/// The logical negation operator that negates its operand.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <returns>true if and only if its operand is false</returns>
public static Column operator !(Column self)
{
return new Column(self.columnProxy.FuncOp("not"));
}
/// <summary>
/// Negation of itself.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <returns>The negation of itself</returns>
public static Column operator -(Column self)
{
return new Column(self.columnProxy.FuncOp("negate"));
}
/// <summary>
/// Sum of this expression and another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>The result of sum</returns>
public static Column operator +(Column self, object other)
{
return new Column(self.columnProxy.BinOp("plus", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// Subtraction of this expression and another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>The result of subtraction</returns>
public static Column operator -(Column self, object other)
{
return new Column(self.columnProxy.BinOp("minus", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// Multiplication of this expression and another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>The result of multiplication</returns>
public static Column operator *(Column self, object other)
{
return new Column(self.columnProxy.BinOp("multiply", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// Division of this expression by another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>The result of division</returns>
public static Column operator /(Column self, object other)
{
return new Column(self.columnProxy.BinOp("divide", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// Modulo (a.k.a. remainder) expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>The remainder after dividing column self by other</returns>
public static Column operator %(Column self, object other)
{
return new Column(self.columnProxy.BinOp("mod", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// The equality operator returns true if the values of its operands are equal, false otherwise.
/// </summary>
/// <param name="self">The column self to compare</param>
/// <param name="other">The other object to compare</param>
/// <returns>true if the value of self is the same as the value of other; otherwise, false.</returns>
public static Column operator ==(Column self, object other)
{
return new Column(self.columnProxy.BinOp("equalTo", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// The inequality operator returns false if its operands are equal, true otherwise.
/// </summary>
/// <param name="self">The column self to compare</param>
/// <param name="other">The other object to compare</param>
/// <returns>true if the value of self is different from the value of other; otherwise, false.</returns>
public static Column operator !=(Column self, object other)
{
return new Column(self.columnProxy.BinOp("notEqual", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// The "less than" relational operator that returns true if the first operand
/// is less than the second, false otherwise.
/// </summary>
/// <param name="self">The column self to compare</param>
/// <param name="other">The other object to compare</param>
/// <returns>true if the value of self is less than the value of other; otherwise, false.</returns>
public static Column operator <(Column self, object other)
{
return new Column(self.columnProxy.BinOp("lt", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// The "less than or equal" relational operator that returns true if the first operand
/// is less than or equal to the second, false otherwise.
/// </summary>
/// <param name="self">The column self to compare</param>
/// <param name="other">The other object to compare</param>
/// <returns>true if the value of self is less than or equal to the value of other; otherwise, false.</returns>
public static Column operator <=(Column self, object other)
{
return new Column(self.columnProxy.BinOp("leq", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// The "greater than or equal" relational operator that returns true if the first operand
/// is greater than or equal to the second, false otherwise.
/// </summary>
/// <param name="self">The column self to compare</param>
/// <param name="other">The other object to compare</param>
/// <returns>true if the value of self is greater than or equal to the value of other; otherwise, false.</returns>
public static Column operator >=(Column self, object other)
{
return new Column(self.columnProxy.BinOp("geq", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// The "greater than" relational operator that returns true if the first operand
/// is greater than the second, false otherwise.
/// </summary>
/// <param name="self">The column self to compare</param>
/// <param name="other">The other object to compare</param>
/// <returns>true if the value of self is greater than the value of other; otherwise, false.</returns>
public static Column operator >(Column self, object other)
{
return new Column(self.columnProxy.BinOp("gt", (other is Column) ? ((Column)other).columnProxy : other));
}
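// Illustrative usage sketch (not part of this commit): the overloaded operators above let filter
// and sort expressions be written directly against columns. 'df' is a hypothetical DataFrame with
// an "age" column, and a Filter overload accepting a Column expression is assumed.
//   var adults = df.Filter(df["age"] >= 18);
//   var oldestFirst = adults.Sort(new[] { df["age"].Desc() });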
/// <summary>
/// Compute bitwise OR of this expression with another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>false if and only if both its operands are false; otherwise, true</returns>
public static Column operator |(Column self, object other)
{
return new Column(self.columnProxy.BinOp("bitwiseOR", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// Compute bitwise AND of this expression with another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>true if and only if both its operands are true; otherwise, false</returns>
public static Column operator &(Column self, object other)
{
return new Column(self.columnProxy.BinOp("bitwiseAND", (other is Column) ? ((Column)other).columnProxy : other));
}
/// <summary>
/// Compute bitwise XOR of this expression with another expression.
/// </summary>
/// <param name="self">The column self to compute</param>
/// <param name="other">The other object to compute</param>
/// <returns>true if and only if exactly one of its operands is true; otherwise, false</returns>
public static Column operator ^(Column self, object other)
{
return new Column(self.columnProxy.BinOp("bitwiseXOR", (other is Column) ? ((Column)other).columnProxy : other));
@ -167,20 +268,39 @@ namespace Microsoft.Spark.CSharp.Sql
return new Column(columnProxy.BinOp("endsWith", other.columnProxy));
}
/// <summary>
/// Returns a sort expression based on the ascending order.
/// </summary>
/// <returns>A column with ascending order</returns>
public Column Asc()
{
return new Column(columnProxy.UnaryOp("asc"));
}
/// <summary>
/// Returns a sort expression based on the descending order.
/// </summary>
/// <returns>A column with descending order</returns>
public Column Desc()
{
return new Column(columnProxy.UnaryOp("desc"));
}
/// <summary>
/// Returns this column aliased with a new name.
/// </summary>
/// <param name="alias">The name of alias</param>
/// <returns>A column aliased with the given name</returns>
public Column Alias(string alias)
{
return new Column(columnProxy.InvokeMethod("as", alias));
}
/// <summary>
/// Returns this column aliased with new names
/// </summary>
/// <param name="aliases">The array of names for aliases</param>
/// <returns>A column aliased with the given names</returns>
public Column Alias(string[] aliases)
{
return new Column(columnProxy.InvokeMethod("as", new object[] { aliases }));

Просмотреть файл

@ -7,6 +7,7 @@ using System.Globalization;
using System.Linq;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Sql
{
@ -18,6 +19,9 @@ namespace Microsoft.Spark.CSharp.Sql
[Serializable]
public class DataFrame
{
[NonSerialized]
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DataFrame));
[NonSerialized]
private readonly IDataFrameProxy dataFrameProxy;
[NonSerialized]
@ -33,6 +37,9 @@ namespace Microsoft.Spark.CSharp.Sql
[NonSerialized]
private readonly Random random = new Random();
/// <summary>
/// Represents the content of the DataFrame as an RDD of Rows.
/// </summary>
public RDD<Row> Rdd
{
get
@ -59,6 +66,9 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Returns true if the collect and take methods can be run locally (without any Spark executors).
/// </summary>
public bool IsLocal
{
get
@ -84,11 +94,18 @@ namespace Microsoft.Spark.CSharp.Sql
get { return dataFrameProxy; }
}
/// <summary>
/// Returns the schema of this DataFrame.
/// </summary>
public StructType Schema
{
get { return schema ?? (schema = new StructType(dataFrameProxy.GetSchema())); }
}
/// <summary>
/// Returns a column for a given column name.
/// </summary>
/// <param name="columnName">The name of column</param>
public Column this[string columnName]
{
get
@ -119,6 +136,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <returns>row count</returns>
public long Count()
{
logger.LogInfo("Calculating the number of rows in the dataframe");
return dataFrameProxy.Count();
}
@ -129,6 +147,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <param name="truncate">Indicates if strings more than 20 characters long will be truncated</param>
public void Show(int numberOfRows = 20, bool truncate = true)
{
logger.LogInfo("Writing {0} rows in the DataFrame to Console output", numberOfRows);
Console.WriteLine(dataFrameProxy.GetShowString(numberOfRows, truncate));
}
@ -138,6 +157,7 @@ namespace Microsoft.Spark.CSharp.Sql
public void ShowSchema()
{
var nameTypeList = Schema.Fields.Select(structField => structField.SimpleString);
logger.LogInfo("Writing Schema to Console output");
Console.WriteLine(string.Join(", ", nameTypeList));
}
@ -641,19 +661,69 @@ namespace Microsoft.Spark.CSharp.Sql
}
if (ascending != null)
{
if(columns.Length != ascending.Length)
throw new ArgumentException("ascending should have the same length with columns");
var columnsWithOrder = new Column[columns.Length];
for (var i = 0; i < columns.Length; i++)
{
columnsWithOrder[i] = ascending[i] ? columns[i].Asc() : columns[i].Desc();
}
return new DataFrame(dataFrameProxy.Sort(columnsWithOrder.Select(c => c.ColumnProxy).ToArray()), sparkContext);
var sortedColumns = SortColumns(columns, ascending);
return new DataFrame(dataFrameProxy.Sort(sortedColumns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
}
return new DataFrame(dataFrameProxy.Sort(columns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
}
/// <summary>
/// Returns a new DataFrame with each partition sorted by the specified column(s).
/// Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
/// </summary>
/// <param name="columns">List of column names to sort by</param>
/// <param name="ascending">List of boolean to specify multiple sort orders for <paramref name="columns"/>, TRUE for ascending, FALSE for descending.
/// if not null, it will override the order specified by Column.Asc() or Column.Desc() in <paramref name="columns"/>.</param>
/// <returns>A new DataFrame sorted by the specified column(s)</returns>
public DataFrame SortWithinPartitions(string[] columns, bool[] ascending = null)
{
if (columns == null || columns.Length == 0)
{
throw new ArgumentException("should sort by at least one column.");
}
if (ascending != null)
{
var sortedColumns = SortColumns(columns.Select(c => this[c]).ToArray(), ascending);
return new DataFrame(dataFrameProxy.SortWithinPartitions(sortedColumns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
}
return new DataFrame(dataFrameProxy.SortWithinPartitions(columns.Select(c => this[c].ColumnProxy).ToArray()), sparkContext);
}
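// Illustrative usage sketch (not part of this commit): per-partition sort by column names with an
// explicit sort order per column. 'logsDataFrame' is a hypothetical DataFrame with "loglevel" and
// "timestamp" columns.
//   var sorted = logsDataFrame.SortWithinPartitions(
//       new[] { "loglevel", "timestamp" }, new[] { true, false });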
/// <summary>
/// Returns a new DataFrame with each partition sorted by the specified column(s).
/// Reference to https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py, sortWithinPartitions(self, *cols, **kwargs)
/// </summary>
/// <param name="columns">List of Columns to sort by</param>
/// <param name="ascending">List of boolean to specify multiple sort orders for <paramref name="columns"/>, TRUE for ascending, FALSE for descending.
/// if not null, it will override the order specified by Column.Asc() or Column.Desc() in <paramref name="columns"/>.</param>
/// <returns>A new DataFrame sorted by the specified column(s)</returns>
public DataFrame SortWithinPartition(Column[] columns, bool[] ascending = null)
{
if (columns == null || columns.Length == 0)
{
throw new ArgumentException("should sort by at least one column.");
}
if (ascending != null)
{
var sortedColumns = SortColumns(columns, ascending);
return new DataFrame(dataFrameProxy.SortWithinPartitions(sortedColumns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
}
return new DataFrame(dataFrameProxy.SortWithinPartitions(columns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
}
private Column[] SortColumns(Column[] columns, bool[] ascending)
{
if (columns.Length != ascending.Length)
throw new ArgumentException("ascending should have the same length as columns");
var columnsWithOrder = new Column[columns.Length];
for (var i = 0; i < columns.Length; i++)
{
columnsWithOrder[i] = ascending[i] ? columns[i].Asc() : columns[i].Desc();
}
return columnsWithOrder;
}
/// <summary>
/// Returns a new DataFrame with an alias set.
/// Reference to https://github.com/apache/spark/blob/branch-1.4/python/pyspark/sql/dataframe.py, alias(self, alias)
@ -877,6 +947,32 @@ namespace Microsoft.Spark.CSharp.Sql
return new DataFrame(dataFrameProxy.Repartition(numPartitions), sparkContext);
}
/// <summary>
/// Returns a new DataFrame partitioned by the given partitioning columns into <paramref name="numPartitions"/> partitions. The resulting DataFrame is hash partitioned.
/// <param name="columns">The columns to partition by</param>
/// <param name="numPartitions">optional. If not specified, keep current partitions.</param>
/// </summary>
// Python API: https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py repartition(self, numPartitions)
public DataFrame Repartition(string[] columns, int numPartitions = 0)
{
return numPartitions == 0 ?
new DataFrame(dataFrameProxy.Repartition(columns.Select(c => this[c].ColumnProxy).ToArray()), sparkContext) :
new DataFrame(dataFrameProxy.Repartition(numPartitions, columns.Select(c => this[c].ColumnProxy).ToArray()), sparkContext);
}
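// Illustrative usage sketch (not part of this commit): hash partitioning by column, optionally
// changing the partition count. 'df' is a hypothetical DataFrame with a "datacenter" column.
//   var byDc = df.Repartition(new[] { "datacenter" });        // keeps the current partition count
//   var byDc10 = df.Repartition(new[] { "datacenter" }, 10);  // repartitions into 10 partitions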
/// <summary>
/// Returns a new DataFrame partitioned by the given partitioning columns into <paramref name="numPartitions"/> partitions. The resulting DataFrame is hash partitioned.
/// <param name="columns">The columns to partition by</param>
/// <param name="numPartitions">optional. If not specified, keep current partitions.</param>
/// </summary>
// Python API: https://github.com/apache/spark/blob/branch-1.6/python/pyspark/sql/dataframe.py repartition(self, numPartitions)
public DataFrame Repartition(Column[] columns, int numPartitions = 0)
{
return numPartitions == 0 ?
new DataFrame(dataFrameProxy.Repartition(columns.Select(c => c.ColumnProxy).ToArray()), sparkContext) :
new DataFrame(dataFrameProxy.Repartition(numPartitions, columns.Select(c => c.ColumnProxy).ToArray()), sparkContext);
}
/// <summary>
/// Returns a new DataFrame by sampling a fraction of rows.
/// </summary>
@ -954,6 +1050,7 @@ namespace Microsoft.Spark.CSharp.Sql
// write(self)
public DataFrameWriter Write()
{
logger.LogInfo("Using DataFrameWriter to write output data to external data storage");
return new DataFrameWriter(dataFrameProxy.Write());
}
@ -1059,8 +1156,14 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// The type of join operation for DataFrame
/// </summary>
public class JoinType
{
/// <summary>
/// Get the string that represents a join type
/// </summary>
public string Value { get; private set; }
private JoinType(string value)
{
@ -1073,6 +1176,9 @@ namespace Microsoft.Spark.CSharp.Sql
private static readonly JoinType RightOuterJoinType = new JoinType("right_outer");
private static readonly JoinType LeftSemiJoinType = new JoinType("leftsemi");
/// <summary>
/// Inner join
/// </summary>
public static JoinType Inner
{
get
@ -1081,6 +1187,9 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Outer join
/// </summary>
public static JoinType Outer
{
get
@ -1089,6 +1198,9 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Left outer join
/// </summary>
public static JoinType LeftOuter
{
get
@ -1097,6 +1209,9 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Right outer join
/// </summary>
public static JoinType RightOuter
{
get
@ -1105,6 +1220,9 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Left semi join
/// </summary>
public static JoinType LeftSemi
{
get
@ -1114,6 +1232,9 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// A set of methods for aggregations on a DataFrame, created by DataFrame.groupBy.
/// </summary>
public class GroupedData
{
internal IGroupedDataProxy GroupedDataProxy
@ -1130,36 +1251,79 @@ namespace Microsoft.Spark.CSharp.Sql
this.dataFrame = dataFrame;
}
/// <summary>
/// Compute aggregates by specifying a dictionary from column name to aggregate methods.
/// The available aggregate methods are avg, max, min, sum, count.
/// </summary>
/// <param name="columnNameAggFunctionDictionary">The dictionary of column name to aggregate method</param>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Agg(Dictionary<string, string> columnNameAggFunctionDictionary)
{
return new DataFrame(dataFrame.DataFrameProxy.Agg(groupedDataProxy, columnNameAggFunctionDictionary), dataFrame.SparkContext);
}
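// Illustrative usage sketch (not part of this commit): dictionary-based aggregation on grouped
// data. 'df' is a hypothetical DataFrame with "datacenter" and "latency" columns.
//   var maxLatencyByDc = df.GroupBy("datacenter")
//       .Agg(new Dictionary<string, string> { { "latency", "max" } });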
/// <summary>
/// Count the number of rows for each group.
/// </summary>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Count()
{
return new DataFrame(groupedDataProxy.Count(), dataFrame.SparkContext);
}
/// <summary>
/// Compute the average value for each numeric column for each group.
/// This is an alias for avg.
/// When specified columns are given, only compute the average values for them.
/// </summary>
/// <param name="columns">The name of columns to be computed.</param>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Mean(params string[] columns)
{
return new DataFrame(groupedDataProxy.Mean(columns), dataFrame.SparkContext);
}
/// <summary>
/// Compute the max value for each numeric column for each group.
/// When specified columns are given, only compute the max values for them.
/// </summary>
/// <param name="columns"> The name of columns to be computed.</param>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Max(params string[] columns)
{
return new DataFrame(groupedDataProxy.Max(columns), dataFrame.SparkContext);
}
/// <summary>
/// Compute the min value for each numeric column for each group.
/// </summary>
/// <param name="columns">
/// The name of columns to be computed. When specified columns are
/// given, only compute the min values for them.
/// </param>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Min(params string[] columns)
{
return new DataFrame(groupedDataProxy.Min(columns), dataFrame.SparkContext);
}
/// <summary>
/// Compute the mean value for each numeric column for each group.
/// When specified columns are given, only compute the mean values for them.
/// </summary>
/// <param name="columns">The name of columns to be computed</param>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Avg(params string[] columns)
{
return new DataFrame(groupedDataProxy.Avg(columns), dataFrame.SparkContext);
}
/// <summary>
/// Compute the sum for each numeric column for each group.
/// When specified columns are given, only compute the sum for them.
/// </summary>
/// <param name="columns">The name of columns to be computed</param>
/// <returns>The DataFrame object that contains the grouping columns.</returns>
public DataFrame Sum(params string[] columns)
{
return new DataFrame(groupedDataProxy.Sum(columns), dataFrame.SparkContext);

Просмотреть файл

@ -5,6 +5,7 @@ using System;
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Sql
{
@ -14,6 +15,8 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public class DataFrameReader
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DataFrameReader));
private readonly IDataFrameReaderProxy dataFrameReaderProxy;
private readonly SparkContext sparkContext;
@ -27,6 +30,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public DataFrameReader Format(string source)
{
logger.LogInfo("Input data source format for the reader is '{0}'", source);
dataFrameReaderProxy.Format(source);
return this;
}
@ -48,6 +52,7 @@ namespace Microsoft.Spark.CSharp.Sql
public DataFrameReader Option(string key, string value)
{
dataFrameReaderProxy.Options(new Dictionary<string, string>(){{key, value}});
logger.LogInfo("Input key-value option for the data source is {0}={1}", key, value);
return this;
}
@ -75,6 +80,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public DataFrame Load()
{
logger.LogInfo("Loading DataFrame using the reader");
return new DataFrame(dataFrameReaderProxy.Load(), sparkContext);
}
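// Illustrative usage sketch (not part of this commit): building a reader fluently and loading.
// 'sqlContext' is a hypothetical SqlContext instance; the spark-csv format name and the "header"
// option are assumptions used only for illustration.
//   var df = sqlContext.Read()
//       .Format("com.databricks.spark.csv")
//       .Option("header", "true")
//       .Load("hdfs:///data/input.csv");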
@ -84,6 +90,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public DataFrame Jdbc(string url, string table, Dictionary<String, String> properties)
{
logger.LogInfo("Constructing DataFrame using JDBC source. Url={0}, tableName={1}", url, table);
return new DataFrame(dataFrameReaderProxy.Jdbc(url, table, properties), sparkContext);
}
@ -106,6 +113,7 @@ namespace Microsoft.Spark.CSharp.Sql
public DataFrame Jdbc(string url, string table, string columnName, string lowerBound, string upperBound,
int numPartitions, Dictionary<String, String> connectionProperties)
{
logger.LogInfo("Constructing DataFrame using JDBC source. Url={0}, tableName={1}, columnName={2}", url, table, columnName);
return new DataFrame(dataFrameReaderProxy.Jdbc(url, table, columnName, lowerBound, upperBound, numPartitions, connectionProperties), sparkContext);
}
@ -125,6 +133,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// Normally at least a "user" and "password" property should be included.</param>
public DataFrame Jdbc(string url, string table, string[] predicates, Dictionary<String, String> connectionProperties)
{
logger.LogInfo("Constructing DataFrame using JDBC source. Url={0}, table={1}", url, table);
return new DataFrame(dataFrameReaderProxy.Jdbc(url, table, predicates, connectionProperties), sparkContext);
}
@ -137,6 +146,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <param name="path">input path</param>
public DataFrame Json(string path)
{
logger.LogInfo("Constructing DataFrame using JSON source {0}", path);
return Format("json").Load(path);
}
@ -146,6 +156,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public DataFrame Parquet(params string[] path)
{
logger.LogInfo("Constructing DataFrame using Parquet source {0}", string.Join(";", path));
return new DataFrame(dataFrameReaderProxy.Parquet(path), sparkContext);
}
}

Просмотреть файл

@ -3,6 +3,7 @@
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Sql
{
@ -14,6 +15,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public class DataFrameWriter
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(DataFrameWriter));
internal IDataFrameWriterProxy DataFrameWriterProxy
{
get { return dataFrameWriterProxy; }
@ -56,6 +58,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public DataFrameWriter Format(string source)
{
logger.LogInfo("Output data storage format for the writer is '{0}'", source);
dataFrameWriterProxy.Format(source);
return this;
}
@ -66,6 +69,7 @@ namespace Microsoft.Spark.CSharp.Sql
public DataFrameWriter Option(string key, string value)
{
var options = new Dictionary<string, string>() { { key, value } };
logger.LogInfo("Output key-value option for the external data storage is {0}={1}", key, value);
return Options(options);
}
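// Illustrative usage sketch (not part of this commit): configuring the writer fluently before
// saving. The Save method and the "compression" option name are assumptions for illustration only.
//   df.Write()
//     .Format("parquet")
//     .Option("compression", "snappy")
//     .Save("hdfs:///data/output");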

Просмотреть файл

@ -19,110 +19,272 @@ namespace Microsoft.Spark.CSharp.Sql
public static class Functions
{
#region functions
/// <summary>
/// Creates a Column of any literal value.
/// </summary>
/// <param name="column">The given literal value</param>
/// <returns>A new Column is created to represent the literal value</returns>
public static Column Lit(object column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("lit", column));
}
/// <summary>
/// Returns a Column based on the given column name.
/// </summary>
/// <param name="colName">The name of column specified</param>
/// <returns>The column for the given name</returns>
public static Column Col(string colName)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("col", colName));
}
/// <summary>
/// Returns a Column based on the given column name.
/// </summary>
/// <param name="colName">The name of column specified</param>
/// <returns>The column for the given name</returns>
public static Column Column(string colName)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("column", colName));
}
/// <summary>
/// Returns a sort expression based on ascending order of the column.
/// </summary>
/// <param name="columnName">The name of column specified</param>
/// <returns>The column with ascending order</returns>
public static Column Asc(string columnName)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("asc", columnName));
}
/// <summary>
/// Returns a sort expression based on the descending order of the column.
/// </summary>
/// <param name="columnName">The name of column specified</param>
/// <returns>the column with descending order</returns>
public static Column Desc(string columnName)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("desc", columnName));
}
/// <summary>
/// Converts a string column to upper case.
/// </summary>
/// <param name="column">The string column specified</param>
/// <returns>The string column in upper case</returns>
public static Column Upper(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("upper", column.ColumnProxy));
}
/// <summary>
/// Converts a string column to lower case.
/// </summary>
/// <param name="column">The string column specified</param>
/// <returns>The string column in lower case</returns>
public static Column Lower(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("lower", column.ColumnProxy));
}
/// <summary>
/// Computes the square root of the specified float column.
/// </summary>
/// <param name="column">The float column</param>
/// <returns>The square root of the specified float column.</returns>
public static Column Sqrt(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sqrt", column.ColumnProxy));
}
/// <summary>
/// Computes the absolute value.
/// </summary>
/// <param name="column">The column to compute</param>
/// <returns>The new column represents the absolute value of the given column</returns>
public static Column Abs(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("abs", column.ColumnProxy));
}
/// <summary>
/// Returns the maximum value of the expression in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the maximum value</returns>
public static Column Max(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("max", column.ColumnProxy));
}
/// <summary>
/// Returns the minimum value of the expression in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the minimum value</returns>
public static Column Min(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("min", column.ColumnProxy));
}
/// <summary>
/// Returns the first value in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the first value</returns>
public static Column First(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("first", column.ColumnProxy));
}
/// <summary>
/// Returns the last value in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the last value</returns>
public static Column Last(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("last", column.ColumnProxy));
}
/// <summary>
/// Returns the number of items in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the count value</returns>
public static Column Count(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("count", column.ColumnProxy));
}
/// <summary>
/// Returns the sum of all values in the expression.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the sum</returns>
public static Column Sum(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sum", column.ColumnProxy));
}
/// <summary>
/// Returns the average of the values in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the average</returns>
public static Column Avg(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("avg", column.ColumnProxy));
}
/// <summary>
/// Returns the average of the values in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the average</returns>
public static Column Mean(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("mean", column.ColumnProxy));
}
/// <summary>
/// Returns the sum of distinct values in the expression.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column represents the sum of distinct values </returns>
public static Column SumDistinct(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sumDistinct", column.ColumnProxy));
}
/// <summary>
/// Creates a new array column. The input columns must all have the same data type.
/// </summary>
/// <param name="columns">The given columns</param>
/// <returns>The new array column</returns>
public static Column Array(params Column[] columns)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("array", columns.Select(x => x.ColumnProxy)));
}
/// <summary>
/// Returns the first column that is not null, or null if all inputs are null.
/// </summary>
/// <param name="columns">The given columns</param>
/// <returns>The first column that is not null</returns>
public static Column Coalesce(params Column[] columns)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("coalesce", columns.Select(x => x.ColumnProxy)));
}
/// <summary>
/// Returns the number of distinct items in a group.
/// </summary>
/// <param name="columns">The given columns</param>
/// <returns>The new column represents the number of distinct items</returns>
public static Column CountDistinct(params Column[] columns)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("countDistinct", columns.Select(x => x.ColumnProxy)));
}
/// <summary>
/// Creates a new struct column.
/// </summary>
/// <param name="columns">The given columns</param>
/// <returns>The new struct column</returns>
public static Column Struct(params Column[] columns)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("struct", columns.Select(x => x.ColumnProxy)));
}
/// <summary>
/// Returns the approximate number of distinct items in a group
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the approximate number of distinct items</returns>
public static Column ApproxCountDistinct(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("approxCountDistinct", column.ColumnProxy));
}
/// <summary>
/// Creates a new row for each element in the given array or map column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The new column for each element in the given array or map column</returns>
public static Column Explode(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("explode", column.ColumnProxy));
}
/// <summary>
/// Generate a random column with i.i.d. samples from U[0.0, 1.0].
/// </summary>
/// <param name="seed">The long integer as seed</param>
/// <returns>A random column with i.i.d. samples from U[0.0, 1.0]. </returns>
public static Column Rand(long seed)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("rand", seed));
}
/// <summary>
/// Generate a column with i.i.d. samples from the standard normal distribution.
/// </summary>
/// <param name="seed">The long integer as seed</param>
/// <returns>A column with i.i.d. samples from the standard normal distribution</returns>
public static Column Randn(long seed)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("randn", seed));
}
/// <summary>
/// Returns the ntile group id (from 1 to n inclusive) in an ordered window partition.
/// This is equivalent to the NTILE function in SQL.
/// </summary>
/// <param name="n">The given number</param>
/// <returns>The ntile group id</returns>
public static Column Ntile(int n)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("ntile", n));
@ -130,100 +292,221 @@ namespace Microsoft.Spark.CSharp.Sql
#endregion
#region unary math functions
/// <summary>
/// Computes the cosine inverse of the given column; the returned angle is in the range 0.0 through pi.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the cosine inverse</returns>
public static Column Acos(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("acos", column.ColumnProxy));
}
/// <summary>
/// Computes the sine inverse of the given column; the returned angle is in the range -pi/2 through pi/2.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the sine inverse</returns>
public static Column Asin(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("asin", column.ColumnProxy));
}
/// <summary>
/// Computes the tangent inverse of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the tangent inverse</returns>
public static Column Atan(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("atan", column.ColumnProxy));
}
/// <summary>
/// Computes the cube-root of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the cube-root</returns>
public static Column Cbrt(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("cbrt", column.ColumnProxy));
}
/// <summary>
/// Computes the ceiling of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the ceiling</returns>
public static Column Ceil(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("ceil", column.ColumnProxy));
}
/// <summary>
/// Computes the cosine of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the cosine</returns>
public static Column Cos(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("cos", column.ColumnProxy));
}
/// <summary>
/// Computes the hyperbolic cosine of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the hyperbolic cosine</returns>
public static Column Cosh(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("cosh", column.ColumnProxy));
}
/// <summary>
/// Computes the exponential of the given value.
/// Computes the exponential of the given column.
/// </summary>
/// <param name="column"></param>
/// <returns></returns>
/// <param name="column">The given column</param>
/// <returns>The column represents the exponential</returns>
public static Column Exp(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("exp", column.ColumnProxy));
}
/// <summary>
/// Computes the exponential of the given value minus one.
/// Computes the exponential of the given column minus one.
/// </summary>
/// <param name="column"></param>
/// <returns></returns>
/// <param name="column">The given column</param>
/// <returns>The column represents the exponential</returns>
public static Column Expm1(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("expm1", column.ColumnProxy));
}
/// <summary>
/// Computes the floor of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the floor</returns>
public static Column Floor(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("floor", column.ColumnProxy));
}
/// <summary>
/// Computes the natural logarithm of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the natural logarithm</returns>
public static Column Log(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("log", column.ColumnProxy));
}
/// <summary>
/// Computes the logarithm of the given column in base 10.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the logarithm</returns>
public static Column Log10(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("log10", column.ColumnProxy));
}
/// <summary>
/// Computes the natural logarithm of the given column plus one.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the logarithm</returns>
public static Column Log1p(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("log1p", column.ColumnProxy));
}
/// <summary>
/// Returns the double value that is closest in value to the argument and is equal to a mathematical integer.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the double value</returns>
public static Column Rint(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("rint", column.ColumnProxy));
}
/// <summary>
/// Computes the signum of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the signum</returns>
public static Column Signum(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("signum", column.ColumnProxy));
}
/// <summary>
/// Computes the sine of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the sine</returns>
public static Column Sin(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sin", column.ColumnProxy));
}
/// <summary>
/// Computes the hyperbolic sine of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the hyperbolic sine</returns>
public static Column Sinh(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("sinh", column.ColumnProxy));
}
/// <summary>
/// Computes the tangent of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the tangent</returns>
public static Column Tan(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("tan", column.ColumnProxy));
}
/// <summary>
/// Computes the hyperbolic tangent of the given column.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the hyperbolic tangent</returns>
public static Column Tanh(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("tanh", column.ColumnProxy));
}
/// <summary>
/// Converts an angle measured in radians to an approximately equivalent angle measured in degrees.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the degrees</returns>
public static Column ToDegrees(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("toDegrees", column.ColumnProxy));
}
/// <summary>
/// Converts an angle measured in degrees to an approximately equivalent angle measured in radians.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column represents the radians</returns>
public static Column ToRadians(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("toRadians", column.ColumnProxy));
}
/// <summary>
/// Computes bitwise NOT.
/// </summary>
/// <param name="column">The given column</param>
/// <returns>The column of bitwise NOT result</returns>
public static Column BitwiseNOT(Column column)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateFunction("bitwiseNOT", column.ColumnProxy));
@ -231,46 +514,122 @@ namespace Microsoft.Spark.CSharp.Sql
#endregion
#region binary math functions
/// <summary>
/// Returns the angle theta from the conversion of rectangular coordinates (x, y) to polar coordinates (r, theta).
/// </summary>
/// <param name="leftColumn">The left column</param>
/// <param name="rightColumn">The right column</param>
/// <returns>The column of the result</returns>
public static Column Atan2(Column leftColumn, Column rightColumn)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("atan2", leftColumn.ColumnProxy, rightColumn.ColumnProxy));
}
/// <summary>
/// Computes sqrt(a^2 + b^2) without intermediate overflow or underflow.
/// </summary>
/// <param name="leftColumn">The left column</param>
/// <param name="rightColumn">The right column</param>
/// <returns>The column of the result</returns>
public static Column Hypot(Column leftColumn, Column rightColumn)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("hypot", leftColumn.ColumnProxy, rightColumn.ColumnProxy));
}
/// <summary>
/// Computes sqrt(a^2 + b^2) without intermediate overflow or underflow.
/// </summary>
/// <param name="leftColumn">The left column</param>
/// <param name="rightValue">The right column</param>
/// <returns>The column of the result</returns>
public static Column Hypot(Column leftColumn, double rightValue)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("hypot", leftColumn.ColumnProxy, rightValue));
}
/// <summary>
/// Computes sqrt(a^2 + b^2) without intermediate overflow or underflow.
/// </summary>
/// <param name="leftValue">The left value</param>
/// <param name="rightColumn">The right column</param>
/// <returns>The column of the result</returns>
public static Column Hypot(double leftValue, Column rightColumn)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("hypot", leftValue, rightColumn.ColumnProxy));
}
/// <summary>
/// Returns the value of the first argument raised to the power of the second argument.
/// </summary>
/// <param name="leftColumn">The left column</param>
/// <param name="rightColumn">The right column</param>
/// <returns>The column of the result</returns>
public static Column Pow(Column leftColumn, Column rightColumn)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("pow", leftColumn.ColumnProxy, rightColumn.ColumnProxy));
}
/// <summary>
/// Returns the value of the first argument raised to the power of the second argument.
/// </summary>
/// <param name="leftColumn">The left column</param>
/// <param name="rightValue">The right value</param>
/// <returns>The column of the result</returns>
public static Column Pow(Column leftColumn, double rightValue)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("pow", leftColumn.ColumnProxy, rightValue));
}
/// <summary>
/// Returns the value of the first argument raised to the power of the second argument.
/// </summary>
/// <param name="leftValue">The left value</param>
/// <param name="rightColumn">The right column</param>
/// <returns>The column of the result</returns>
public static Column Pow(double leftValue, Column rightColumn)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("pow", leftValue, rightColumn.ColumnProxy));
}
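// Illustrative usage sketch (an assumption, not part of the original source): combining the binary
// math functions above. Assumes an existing DataFrame 'df' with numeric columns "x" and "y", and that
// the DataFrame indexer and a Select overload accept Column expressions:
//   var polar = df.Select(Functions.Hypot(df["x"], df["y"]), Functions.Atan2(df["y"], df["x"]));
//   var squared = df.Select(Functions.Pow(df["x"], 2.0));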
/// <summary>
/// Returns the approximate number of distinct items in a group.
/// </summary>
/// <param name="column">The given column</param>
/// <param name="rsd">The rsd</param>
/// <returns>The column of the result</returns>
public static Column ApproxCountDistinct(Column column, double rsd)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("approxCountDistinct", column, rsd));
}
/// <summary>
/// Evaluates a list of conditions and returns one of multiple possible result expressions.
/// </summary>
/// <param name="condition">The given column of condition</param>
/// <param name="value">The value of condition</param>
/// <returns>The column of the result</returns>
public static Column When(Column condition, object value)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("when", condition, value));
}
/// <summary>
/// Returns the value that is offset rows before the current row, and null if there are fewer than offset rows before the current row.
/// </summary>
/// <param name="column">The given column</param>
/// <param name="offset">The offset of the given column</param>
/// <returns>The column of the result</returns>
public static Column Lag(Column column, int offset)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("lag", column, offset));
}
/// <summary>
/// Returns the value that is offset rows after the current row, and null if there are fewer than offset rows after the current row.
/// </summary>
/// <param name="column">The given column</param>
/// <param name="offset">The offset of the given column</param>
/// <returns>The column of the result</returns>
public static Column Lead(Column column, int offset)
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateBinaryMathFunction("lead", column, offset));
@ -278,38 +637,83 @@ namespace Microsoft.Spark.CSharp.Sql
#endregion
#region window functions
/// <summary>
/// Returns a sequential number starting at 1 within a window partition.
/// </summary>
/// <returns>The column of the result</returns>
public static Column RowNumber()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("row_number"));
}
/// <summary>
/// Returns the rank of rows within a window partition, without any gaps.
/// </summary>
/// <returns>The column of the result</returns>
public static Column DenseRank()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("dense_rank"));
}
/// <summary>
/// Returns the rank of rows within a window partition.
/// </summary>
/// <returns>The column of the result</returns>
public static Column Rank()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("rank"));
}
/// <summary>
/// Returns the cumulative distribution of values within a window partition
/// </summary>
/// <returns>The column of the result</returns>
public static Column CumeDist()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("cume_dist"));
}
/// <summary>
/// Returns the relative rank (i.e. percentile) of rows within a window partition.
/// </summary>
/// <returns>The column of the result</returns>
public static Column PercentRank()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("percent_rank"));
}
/// <summary>
/// A column expression that generates monotonically increasing 64-bit integers.
/// </summary>
/// <returns>The column of the result</returns>
public static Column MonotonicallyIncreasingId()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("monotonically_increasing_id"));
}
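// Illustrative usage sketch (an assumption, not part of the original source): tagging each row with a
// unique 64-bit id. Assumes a DataFrame 'df' and a Select overload that accepts Column expressions:
//   var withId = df.Select(df["name"], Functions.MonotonicallyIncreasingId());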
/// <summary>
/// Partition ID of the Spark task.
/// Note that this is nondeterministic because it depends on data partitioning and task scheduling.
/// </summary>
/// <returns>The column of the result</returns>
public static Column SparkPartitionId()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("spark_partition_id"));
}
/// <summary>
/// Generate a random column with i.i.d. samples from U[0.0, 1.0].
/// </summary>
/// <returns>The column of the result</returns>
public static Column Rand()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("rand"));
}
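// Illustrative usage sketch (an assumption, not part of the original source): adding a random column,
// e.g. as a basis for sampling. Assumes a DataFrame 'df' and a Select overload accepting Column expressions:
//   var withRand = df.Select(df["id"], Functions.Rand());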
/// <summary>
/// Generate a column with i.i.d. samples from the standard normal distribution.
/// </summary>
/// <returns>The column of the result</returns>
public static Column Randn()
{
return new Column(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy.CreateWindowFunction("randn"));
@ -317,46 +721,188 @@ namespace Microsoft.Spark.CSharp.Sql
#endregion
#region udf
/// <summary>
/// Defines a function of 0 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column> Udf<RT>(Func<RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT>(f).Execute).Execute0;
}
/// <summary>
/// Defines a function of 1 argument as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column> Udf<RT, A1>(Func<A1, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1>(f).Execute).Execute1;
}
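// Illustrative usage sketch (an assumption, not part of the original source): a one-argument UDF applied
// in a projection. Assumes a DataFrame 'df' with a string column "name" and a Select overload that
// accepts Column expressions:
//   var strLen = Functions.Udf<int, string>(s => s.Length);
//   var nameLengths = df.Select(strLen(df["name"]));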
/// <summary>
/// Defines a function of 2 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column> Udf<RT, A1, A2>(Func<A1, A2, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2>(f).Execute).Execute2;
}
/// <summary>
/// Defines a function of 3 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column> Udf<RT, A1, A2, A3>(Func<A1, A2, A3, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3>(f).Execute).Execute3;
}
/// <summary>
/// Defines a function of 4 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4>(Func<A1, A2, A3, A4, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4>(f).Execute).Execute4;
}
/// <summary>
/// Defines a function of 5 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5>(Func<A1, A2, A3, A4, A5, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5>(f).Execute).Execute5;
}
/// <summary>
/// Defines a function of 6 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6>(Func<A1, A2, A3, A4, A5, A6, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6>(f).Execute).Execute6;
}
/// <summary>
/// Defines a function of 7 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7>(Func<A1, A2, A3, A4, A5, A6, A7, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7>(f).Execute).Execute7;
}
/// <summary>
/// Defines a function of 8 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
/// <typeparam name="A8">The 8th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7, A8>(Func<A1, A2, A3, A4, A5, A6, A7, A8, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8>(f).Execute).Execute8;
}
/// <summary>
/// Defines a function of 9 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
/// <typeparam name="A8">The 8th arguement of the given function</typeparam>
/// <typeparam name="A9">The 9th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9>(f).Execute).Execute9;
}
/// <summary>
/// Defines a function of 10 arguments as a user-defined function (UDF).
/// The data types are automatically inferred based on the function's signature.
/// </summary>
/// <param name="f">The given function</param>
/// <typeparam name="RT">The return type of the given function</typeparam>
/// <typeparam name="A1">The 1st arguement of the given function</typeparam>
/// <typeparam name="A2">The 2nd arguement of the given function</typeparam>
/// <typeparam name="A3">The 3rd arguement of the given function</typeparam>
/// <typeparam name="A4">The 4th arguement of the given function</typeparam>
/// <typeparam name="A5">The 5th arguement of the given function</typeparam>
/// <typeparam name="A6">The 6th arguement of the given function</typeparam>
/// <typeparam name="A7">The 7th arguement of the given function</typeparam>
/// <typeparam name="A8">The 8th arguement of the given function</typeparam>
/// <typeparam name="A9">The 9th arguement of the given function</typeparam>
/// <typeparam name="A10">The 10th arguement of the given function</typeparam>
/// <returns>The new user-defined function</returns>
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT> f)
{
return new UserDefinedFunction<RT>(new UdfHelper<RT, A1, A2, A3, A4, A5, A6, A7, A8, A9, A10>(f).Execute).Execute10;

View file

@ -0,0 +1,42 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
namespace Microsoft.Spark.CSharp.Sql
{
/// <summary>
/// A variant of Spark SQL that integrates with data stored in Hive.
/// Configuration for Hive is read from hive-site.xml on the classpath.
/// It supports running both SQL and HiveQL commands.
/// </summary>
public class HiveContext : SqlContext
{
/// <summary>
/// Creates a HiveContext
/// </summary>
/// <param name="sparkContext"></param>
public HiveContext(SparkContext sparkContext)
: base(sparkContext, sparkContext.SparkContextProxy.CreateHiveContext())
{
}
internal HiveContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy)
: base(sparkContext, sqlContextProxy)
{
}
/// <summary>
/// Invalidate and refresh all the cached metadata of the given table.
/// For performance reasons, Spark SQL or the external data source library it uses
/// might cache certain metadata about a table, such as the location of blocks.
/// When those change outside of Spark SQL, users should call this function to invalidate the cache.
/// </summary>
/// <param name="tableName"></param>
public void RefreshTable(string tableName)
{
SqlContextProxy.RefreshTable(tableName);
}
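// Illustrative usage sketch (an assumption, not part of the original source): querying a hypothetical
// Hive table 'logs' through the inherited Sql() method and refreshing its cached metadata after the
// underlying files change outside of Spark SQL:
//   var hiveContext = new HiveContext(sparkContext);
//   var recentLogs = hiveContext.Sql("SELECT * FROM logs LIMIT 10");
//   hiveContext.RefreshTable("logs");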
}
}

View file

@ -1,13 +1,15 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Threading;
using Razorvine.Pickle;
namespace Microsoft.Spark.CSharp.Sql
{
/// <summary>
/// Used by Unpickler to unpickle pickled objects. It is also used to construct a Row (C# representation of pickled objects).
/// Note this implementation is not ThreadSafe. Collect or RDD conversion where unpickling is done is not expected to be multithreaded.
/// </summary>
public class RowConstructor : IObjectConstructor
{
@ -16,11 +18,13 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Schema of the DataFrame currently being processed
/// </summary>
[ThreadStatic] // thread safety is needed when running in the C# worker process
private static string currentSchema;
/// <summary>
/// Indicates if Schema is already set during construction of this type
/// </summary>
[ThreadStatic] // thread safety is needed when running in the C# worker process
private static bool isCurrentSchemaSet;
/// <summary>
@ -33,6 +37,10 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
internal string Schema;
/// <summary>
/// Returns a string that represents the current object.
/// </summary>
/// <returns>A string that represents the current object.</returns>
public override string ToString()
{
return string.Format("{{{0}}}", string.Join(",", Values));

View file

@ -40,6 +40,11 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public static class SaveModeExtensions
{
/// <summary>
/// Gets the string for the value of SaveMode
/// </summary>
/// <param name="mode">The given SaveMode</param>
/// <returns>The string that represents the given SaveMode</returns>
public static string GetStringValue(this SaveMode mode)
{
switch (mode)

View file

@ -5,6 +5,7 @@ using System;
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Sql
{
@ -14,13 +15,77 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public class SqlContext
{
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(SqlContext));
private readonly ISqlContextProxy sqlContextProxy;
private readonly SparkContext sparkContext;
internal ISqlContextProxy SqlContextProxy { get { return sqlContextProxy; } }
private static SqlContext instance;
/// <summary>
/// Creates a SqlContext
/// </summary>
/// <param name="sparkContext"></param>
public SqlContext(SparkContext sparkContext)
{
this.sparkContext = sparkContext;
sqlContextProxy = sparkContext.SparkContextProxy.CreateSqlContext();
if (instance == null) instance = this;
}
internal SqlContext(SparkContext sparkContext, ISqlContextProxy sqlContextProxy)
{
this.sparkContext = sparkContext;
this.sqlContextProxy = sqlContextProxy;
if (instance == null) instance = this;
}
/// <summary>
/// Get the existing SQLContext or create a new one with the given SparkContext.
/// </summary>
/// <param name="sparkContext"></param>
/// <returns></returns>
public static SqlContext GetOrCreate(SparkContext sparkContext)
{
if (instance == null)
{
return new SqlContext(sparkContext);
}
return instance;
}
/// <summary>
/// Returns a new SQLContext as a new session, which has separate SQLConf,
/// registered temporary tables and UDFs, but shares the SparkContext and table cache.
/// </summary>
/// <returns></returns>
public SqlContext NewSession()
{
var newSessionProxy = sqlContextProxy.NewSession();
return new SqlContext(this.sparkContext, newSessionProxy);
}
/// <summary>
/// Returns the value of Spark SQL configuration property for the given key.
/// If the key is not set, returns defaultValue.
/// </summary>
/// <param name="key"></param>
/// <param name="defaultValue"></param>
/// <returns></returns>
public string GetConf(string key, string defaultValue)
{
return sqlContextProxy.GetConf(key, defaultValue);
}
/// <summary>
/// Sets the given Spark SQL configuration property.
/// </summary>
/// <param name="key"></param>
/// <param name="value"></param>
public void SetConf(string key, string value)
{
sqlContextProxy.SetConf(key, value);
}
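// Illustrative usage sketch (an assumption, not part of the original source): tuning a Spark SQL
// setting through the configuration accessors above, using the standard shuffle-partition key:
//   sqlContext.SetConf("spark.sql.shuffle.partitions", "8");
//   var partitions = sqlContext.GetConf("spark.sql.shuffle.partitions", "200");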
/// <summary>
@ -28,6 +93,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
public DataFrameReader Read()
{
logger.LogInfo("Using DataFrameReader to read input data from external data source");
return new DataFrameReader(sqlContextProxy.Read(), sparkContext);
}
@ -40,9 +106,16 @@ namespace Microsoft.Spark.CSharp.Sql
/// <returns></returns>
public DataFrame ReadDataFrame(string path, StructType schema, Dictionary<string, string> options)
{
logger.LogInfo("Reading DataFrame from file {0}", path);
return new DataFrame(sqlContextProxy.ReadDataFrame(path, schema, options), sparkContext);
}
/// <summary>
/// Creates a <see cref="DataFrame"/> from a RDD containing array of object using the given schema.
/// </summary>
/// <param name="rdd">RDD containing array of object. The array acts as a row and items within the array act as columns which the schema is specified in <paramref name="schema"/>. </param>
/// <param name="schema">The schema of DataFrame.</param>
/// <returns></returns>
public DataFrame CreateDataFrame(RDD<object[]> rdd, StructType schema)
{
// Note: This is for pickling RDD, convert to RDD<byte[]> which happens in CSharpWorker.
@ -55,6 +128,100 @@ namespace Microsoft.Spark.CSharp.Sql
return new DataFrame(sqlContextProxy.CreateDataFrame(rddRow.RddProxy, schema.StructTypeProxy), sparkContext);
}
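// Illustrative usage sketch (an assumption, not part of the original source): building a schema with
// the StructType/StructField types introduced elsewhere in this change and creating a DataFrame from a
// hypothetical RDD<object[]> named 'rowsRdd':
//   var schema = new StructType(new[]
//   {
//       new StructField("name", new StringType()),
//       new StructField("age", new IntegerType())
//   });
//   var people = sqlContext.CreateDataFrame(rowsRdd, schema);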
/// <summary>
/// Registers the given <see cref="DataFrame"/> as a temporary table in the catalog.
/// Temporary tables exist only during the lifetime of this instance of SqlContext.
/// </summary>
/// <param name="dataFrame"></param>
/// <param name="tableName"></param>
public void RegisterDataFrameAsTable(DataFrame dataFrame, string tableName)
{
sqlContextProxy.RegisterDataFrameAsTable(dataFrame.DataFrameProxy, tableName);
}
/// <summary>
/// Remove the temp table from catalog.
/// </summary>
/// <param name="tableName"></param>
public void DropTempTable(string tableName)
{
sqlContextProxy.DropTempTable(tableName);
}
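// Illustrative usage sketch (an assumption, not part of the original source): registering a DataFrame
// as a temp table, querying it with Sql(), and dropping it. Assumes an existing DataFrame 'people':
//   sqlContext.RegisterDataFrameAsTable(people, "people");
//   var adults = sqlContext.Sql("SELECT name FROM people WHERE age >= 18");
//   sqlContext.DropTempTable("people");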
/// <summary>
/// Returns the specified table as a <see cref="DataFrame"/>
/// </summary>
/// <param name="tableName"></param>
/// <returns></returns>
public DataFrame Table(string tableName)
{
return new DataFrame(sqlContextProxy.Table(tableName), sparkContext);
}
/// <summary>
/// Returns a <see cref="DataFrame"/> containing names of tables in the given database.
/// If <paramref name="databaseName"/> is not specified, the current database will be used.
/// The returned DataFrame has two columns: 'tableName' and 'isTemporary' (a column with bool
/// type indicating if a table is a temporary one or not).
/// </summary>
/// <param name="databaseName">Name of the database to use. Default to the current database.
/// Note: This is only applicable to HiveContext.</param>
/// <returns></returns>
public DataFrame Tables(string databaseName = null)
{
return databaseName == null ?
new DataFrame(sqlContextProxy.Tables(), sparkContext) :
new DataFrame(sqlContextProxy.Tables(databaseName), sparkContext);
}
/// <summary>
/// Returns a list of names of tables in the database <paramref name="databaseName"/>
/// </summary>
/// <param name="databaseName">Name of the database to use. Default to the current database.
/// Note: This is only applicable to HiveContext.</param>
/// <returns></returns>
public IEnumerable<string> TableNames(string databaseName = null)
{
return databaseName == null ?
sqlContextProxy.TableNames() : sqlContextProxy.TableNames(databaseName);
}
/// <summary>
/// Caches the specified table in-memory.
/// </summary>
/// <param name="tableName"></param>
public void CacheTable(string tableName)
{
sqlContextProxy.CacheTable(tableName);
}
/// <summary>
/// Removes the specified table from the in-memory cache.
/// </summary>
/// <param name="tableName"></param>
public void UncacheTable(string tableName)
{
sqlContextProxy.UncacheTable(tableName);
}
/// <summary>
/// Removes all cached tables from the in-memory cache.
/// </summary>
public void ClearCache()
{
sqlContextProxy.ClearCache();
}
/// <summary>
/// Returns true if the table is currently cached in-memory.
/// </summary>
/// <param name="tableName"></param>
/// <returns></returns>
public bool IsCached(string tableName)
{
return sqlContextProxy.IsCached(tableName);
}
/// <summary>
/// Executes a SQL query using Spark, returning the result as a DataFrame. The dialect that is used for SQL parsing can be configured with 'spark.sql.dialect'
/// </summary>
@ -62,6 +229,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <returns></returns>
public DataFrame Sql(string sqlQuery)
{
logger.LogInfo("SQL query to execute on the dataframe is {0}", sqlQuery);
return new DataFrame(sqlContextProxy.Sql(sqlQuery), sparkContext);
}
@ -117,7 +285,7 @@ namespace Microsoft.Spark.CSharp.Sql
#region UDF Registration
/// <summary>
/// Register UDF with no input argument, e.g.:
/// SqlContext.RegisterFunction&lt;bool>("MyFilter", () => true);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter()");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -131,7 +299,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 1 input argument, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string>("MyFilter", (arg1) => arg1 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -146,7 +314,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 2 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string>("MyFilter", (arg1, arg2) => arg1 != null &amp;&amp; arg2 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -162,7 +330,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 3 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, string>("MyFilter", (arg1, arg2, arg3) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; arg3 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, columnName3)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -179,7 +347,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 4 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg4) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg4 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName4)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -197,7 +365,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 5 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg5) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg5 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName5)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -216,7 +384,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 6 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg6) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg6 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName6)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -236,7 +404,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 7 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg7) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg7 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName7)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -257,7 +425,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 8 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg8) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg8 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName8)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -279,7 +447,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 9 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg9) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg9 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName9)");
/// </summary>
/// <typeparam name="RT"></typeparam>
@ -302,7 +470,7 @@ namespace Microsoft.Spark.CSharp.Sql
/// <summary>
/// Register UDF with 10 input arguments, e.g.:
/// SqlContext.RegisterFunction&lt;bool, string, string, ..., string>("MyFilter", (arg1, arg2, ..., arg10) => arg1 != null &amp;&amp; arg2 != null &amp;&amp; ... &amp;&amp; arg10 != null);
/// sqlContext.Sql("SELECT * FROM MyTable where MyFilter(columnName1, columnName2, ..., columnName10)");
/// </summary>
/// <typeparam name="RT"></typeparam>

View file

@ -14,6 +14,9 @@ using Newtonsoft.Json.Linq;
namespace Microsoft.Spark.CSharp.Sql
{
/// <summary>
/// The base type of all Spark SQL data types.
/// </summary>
[Serializable]
public abstract class DataType
{
@ -38,6 +41,9 @@ namespace Microsoft.Spark.CSharp.Sql
/// </summary>
internal virtual object JsonValue { get { return TypeName; } }
/// <summary>
/// The compact JSON representation of this data type.
/// </summary>
public string Json
{
get
@ -47,11 +53,23 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Parses a Json string to construct a DataType.
/// </summary>
/// <param name="json">The Json string to be parsed</param>
/// <returns>The new DataType instance from the Json string</returns>
public static DataType ParseDataTypeFromJson(string json)
{
return ParseDataTypeFromJson(JToken.Parse(json));
}
/// <summary>
/// Parse a JToken object to construct a DataType.
/// </summary>
/// <param name="json">The JToken object to be parsed</param>
/// <returns>The new DataType instance from the Json string</returns>
/// <exception cref="NotImplementedException">Not implemented for "udt" type</exception>
/// <exception cref="ArgumentException"></exception>
protected static DataType ParseDataTypeFromJson(JToken json)
{
if (json.Type == JTokenType.Object) // {name: address, type: {type: struct,...},...}
@ -112,63 +130,125 @@ namespace Microsoft.Spark.CSharp.Sql
}
/// <summary>
/// An internal type used to represent a simple type.
/// </summary>
[Serializable]
public class AtomicType : DataType
{
}
/// <summary>
/// An internal type used to represent a complex type (such as arrays, structs, and maps).
/// </summary>
[Serializable]
public abstract class ComplexType : DataType
{
/// <summary>
/// Abstract method that constructs a complex type from a Json object
/// </summary>
/// <param name="json">The Json object to construct a complex type</param>
/// <returns>A new constructed complex type</returns>
public abstract DataType FromJson(JObject json);
/// <summary>
/// Constructs a complex type from a Json string
/// </summary>
/// <param name="json">The string that represents a Json.</param>
/// <returns>A new constructed complex type</returns>
public DataType FromJson(string json)
{
return FromJson(JObject.Parse(json));
}
}
/// <summary>
/// The data type representing NULL values.
/// </summary>
[Serializable]
public class NullType : AtomicType { }
/// <summary>
/// The data type representing String values.
/// </summary>
[Serializable]
public class StringType : AtomicType { }
/// <summary>
/// The data type representing binary values.
/// </summary>
[Serializable]
public class BinaryType : AtomicType { }
/// <summary>
/// The data type representing Boolean values.
/// </summary>
[Serializable]
public class BooleanType : AtomicType { }
/// <summary>
/// The data type representing Date values.
/// </summary>
[Serializable]
public class DateType : AtomicType { }
/// <summary>
/// The data type representing Timestamp values.
/// </summary>
[Serializable]
public class TimestampType : AtomicType { }
/// <summary>
/// The data type representing Double values.
/// </summary>
[Serializable]
public class DoubleType : AtomicType { }
/// <summary>
/// The data type representing Float values.
/// </summary>
[Serializable]
public class FloatType : AtomicType { }
/// <summary>
/// The data type representing Byte values.
/// </summary>
[Serializable]
public class ByteType : AtomicType { }
/// <summary>
/// The data type representing Int values.
/// </summary>
[Serializable]
public class IntegerType : AtomicType { }
/// <summary>
/// The data type representing Long values.
/// </summary>
[Serializable]
public class LongType : AtomicType { }
/// <summary>
/// The data type representing Short values.
/// </summary>
[Serializable]
public class ShortType : AtomicType { }
/// <summary>
/// The data type representing Decimal values.
/// </summary>
[Serializable]
public class DecimalType : AtomicType
{
/// <summary>
/// Gets the regular expression that represents a fixed decimal.
/// </summary>
public static Regex FixedDecimal = new Regex(@"decimal\((\d+),\s(\d+)\)");
private int? precision, scale;
/// <summary>
/// Initializes a new instance of DecimalType from parameters specifying its precision and scale.
/// </summary>
/// <param name="precision">The precision of the type</param>
/// <param name="scale">The scale of the type</param>
public DecimalType(int? precision = null, int? scale = null)
{
this.precision = precision;
@ -180,18 +260,38 @@ namespace Microsoft.Spark.CSharp.Sql
get { throw new NotImplementedException(); }
}
/// <summary>
/// Constructs a DecimalType from a Json object
/// </summary>
/// <param name="json">The Json object used to construct a DecimalType</param>
/// <returns>A new DecimalType instance</returns>
/// <exception cref="NotImplementedException">Not implemented yet.</exception>
public DataType FromJson(JObject json)
{
throw new NotImplementedException();
}
}
/// <summary>
/// The data type for collections of multiple values.
/// </summary>
[Serializable]
public class ArrayType : ComplexType
{
/// <summary>
/// Gets the DataType of each element in the array
/// </summary>
public DataType ElementType { get { return elementType; } }
/// <summary>
/// Returns whether the array can contain null (None) values
/// </summary>
public bool ContainsNull { get { return containsNull; } }
/// <summary>
/// Initializes an ArrayType instance with a specific element DataType, specifying whether the array can contain null values.
/// </summary>
/// <param name="elementType">The data type of values</param>
/// <param name="containsNull">Indicates if values have null values</param>
public ArrayType(DataType elementType, bool containsNull = true)
{
this.elementType = elementType;
@ -203,6 +303,9 @@ namespace Microsoft.Spark.CSharp.Sql
FromJson(json);
}
/// <summary>
/// Readable string representation for the type.
/// </summary>
public override string SimpleString
{
get { return string.Format("array<{0}>", elementType.SimpleString); }
@ -219,6 +322,11 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Constructs an ArrayType from a Json object
/// </summary>
/// <param name="json">The Json object used to construct an ArrayType</param>
/// <returns>A new ArrayType instance</returns>
public override sealed DataType FromJson(JObject json)
{
elementType = ParseDataTypeFromJson(json["elementType"]);
@ -230,6 +338,9 @@ namespace Microsoft.Spark.CSharp.Sql
private bool containsNull;
}
/// <summary>
/// The data type for Maps. Not implemented yet.
/// </summary>
[Serializable]
public class MapType : ComplexType
{
@ -238,20 +349,48 @@ namespace Microsoft.Spark.CSharp.Sql
get { throw new NotImplementedException(); }
}
/// <summary>
/// Constructs a MapType from a Json object. Not implemented yet.
/// </summary>
/// <param name="json">The Json object used to construct a MapType</param>
/// <returns>A new MapType instance</returns>
/// <exception cref="NotImplementedException"></exception>
public override DataType FromJson(JObject json)
{
throw new NotImplementedException();
}
}
/// <summary>
/// A field inside a StructType.
/// </summary>
[Serializable]
public class StructField : ComplexType
{
/// <summary>
/// The name of this field.
/// </summary>
public string Name { get { return name; } }
/// <summary>
/// The data type of this field.
/// </summary>
public DataType DataType { get { return dataType; } }
/// <summary>
/// Indicates if values of this field can be null values.
/// </summary>
public bool IsNullable { get { return isNullable; } }
/// <summary>
/// The metadata of this field. The metadata should be preserved during transformation if the content of the column is not modified, e.g., in selection.
/// </summary>
public JObject Metadata { get { return metadata; } }
/// <summary>
/// Initializes a StructField instance with a specific name, data type, nullable, and metadata
/// </summary>
/// <param name="name">The name of this field</param>
/// <param name="dataType">The data type of this field</param>
/// <param name="isNullable">Indicates if values of this field can be null values</param>
/// <param name="metadata">The metadata of this field</param>
public StructField(string name, DataType dataType, bool isNullable = true, JObject metadata = null)
{
this.name = name;
@ -265,6 +404,9 @@ namespace Microsoft.Spark.CSharp.Sql
FromJson(json);
}
/// <summary>
/// Returns a readable string that represents the type.
/// </summary>
public override string SimpleString { get { return string.Format(@"{0}:{1}", name, dataType.SimpleString); } }
internal override object JsonValue
@ -279,6 +421,11 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Constructs a StructField from a Json object
/// </summary>
/// <param name="json">The Json object used to construct a StructField</param>
/// <returns>A new StructField instance</returns>
public override sealed DataType FromJson(JObject json)
{
name = json["name"].ToString();
@ -295,9 +442,16 @@ namespace Microsoft.Spark.CSharp.Sql
private JObject metadata;
}
/// <summary>
/// Struct type, consisting of a list of StructFields.
/// This is the data type representing a Row.
/// </summary>
[Serializable]
public class StructType : ComplexType
{
/// <summary>
/// Gets a list of StructField.
/// </summary>
public List<StructField> Fields { get { return fields; } }
internal IStructTypeProxy StructTypeProxy
@ -311,6 +465,10 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Initializes a StructType instance with a specific collection of StructField objects.
/// </summary>
/// <param name="fields">The collection that holds StructField objects</param>
public StructType(IEnumerable<StructField> fields)
{
this.fields = fields.ToList();
@ -328,6 +486,9 @@ namespace Microsoft.Spark.CSharp.Sql
FromJson(jsonSchema);
}
/// <summary>
/// Returns a readable string that joins all <see cref="StructField"/>s together.
/// </summary>
public override string SimpleString
{
get { return string.Format(@"struct<{0}>", string.Join(",", fields.Select(f => f.SimpleString))); }
@ -343,6 +504,11 @@ namespace Microsoft.Spark.CSharp.Sql
}
}
/// <summary>
/// Constructs a StructType from a Json object
/// </summary>
/// <param name="json">The Json object used to construct a StructType</param>
/// <returns>A new StructType instance</returns>
public override sealed DataType FromJson(JObject json)
{
var fieldsJObjects = json["fields"].Select(f => (JObject)f);

View file

@ -0,0 +1,31 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Proxy.Ipc;
namespace Microsoft.Spark.CSharp.Streaming
{
/// <summary>
/// An input stream that always returns the same RDD on each timestep. Useful for testing.
/// </summary>
public class ConstantInputDStream<T> : DStream<T>
{
/// <summary>
/// Construct a ConstantInputDStream instance.
/// </summary>
public ConstantInputDStream(RDD<T> rdd, StreamingContext ssc)
{
if (rdd == null)
{
throw new ArgumentNullException("rdd", "rdd cannot be null, as it would cause a NullReferenceException in subsequent transformations");
}
dstreamProxy = ssc.streamingContextProxy.CreateConstantInputDStream(rdd.RddProxy);
streamingContext = ssc;
serializedMode = SerializedMode.Byte;
}
}
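// Illustrative usage sketch (an assumption, not part of the original source): replaying the same RDD on
// every batch interval, which is handy for testing DStream transformations. Assumes an existing
// RDD<string> 'testRdd' and StreamingContext 'ssc':
//   var constantStream = new ConstantInputDStream<string>(testRdd, ssc);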
}

View file

@ -221,12 +221,12 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <summary>
/// Enable periodic checkpointing of RDDs of this DStream
/// </summary>
/// <param name="intervalMs">time in seconds, after each period of that, generated RDD will be checkpointed</param>
/// <param name="intervalSeconds">time in seconds, after each period of that, generated RDD will be checkpointed</param>
/// <returns></returns>
public DStream<T> Checkpoint(int intervalSeconds)
{
isCheckpointed = true;
DStreamProxy.Checkpoint(intervalSeconds);
return this;
}
@ -373,7 +373,7 @@ namespace Microsoft.Spark.CSharp.Streaming
return DStreamProxy.Slice(fromUnixTime, toUnixTime).Select(r => new RDD<T>(r, streamingContext.SparkContext, serializedMode)).ToArray();
}
internal void ValidateWindowParam(int windowSeconds, int slideSeconds)
{
int duration = SlideDuration;
@ -403,7 +403,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <returns></returns>
public DStream<T> Window(int windowSeconds, int slideSeconds)
{
ValidateWindowParam(windowSeconds, slideSeconds);
return new DStream<T>(DStreamProxy.Window(windowSeconds, slideSeconds), streamingContext, serializedMode);
}
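// Illustrative usage sketch (an assumption, not part of the original source): a 30-second window sliding
// every 10 seconds over an existing DStream 'stream', with periodic checkpointing of the windowed stream:
//   var windowed = stream.Window(30, 10);
//   windowed.Checkpoint(60);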

View file

@ -0,0 +1,43 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using Microsoft.Spark.CSharp.Core;
namespace Microsoft.Spark.CSharp.Streaming
{
/// <summary>
/// Utility for creating streams from Microsoft Azure EventHubs.
/// </summary>
public class EventHubsUtils
{
/// <summary>
/// Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
/// The unioned stream will receive message from all partitions of the EventHubs
/// </summary>
/// <param name="ssc">Streaming context</param>
/// <param name="eventhubsParams"> Parameters for EventHubs.
/// Required parameters are:
/// "eventhubs.policyname": EventHubs policy name
/// "eventhubs.policykey": EventHubs policy key
/// "eventhubs.namespace": EventHubs namespace
/// "eventhubs.name": EventHubs name
/// "eventhubs.partition.count": Number of partitions
/// "eventhubs.checkpoint.dir": checkpoint directory on HDFS
///
/// Optional parameters are:
/// "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
/// "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
/// "eventhubs.filter.enqueuetime": Unix time, millisecond since epoch, default to "0"
/// "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
/// "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
/// </param>
/// <param name="storageLevelType">Storage level, by default it is MEMORY_ONLY</param>
/// <returns>DStream with byte[] representing events from EventHub</returns>
public static DStream<byte[]> CreateUnionStream(StreamingContext ssc, Dictionary<string, string> eventhubsParams, StorageLevelType storageLevelType = StorageLevelType.MEMORY_ONLY)
{
return new DStream<byte[]>(ssc.streamingContextProxy.EventHubsUnionStream(eventhubsParams, storageLevelType), ssc, SerializedMode.None);
}
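// Illustrative usage sketch (an assumption, not part of the original source): wiring up the required
// EventHubs parameters documented above with placeholder values and creating the unioned stream from an
// existing StreamingContext 'ssc':
//   var eventhubsParams = new Dictionary<string, string>
//   {
//       { "eventhubs.policyname", "<policy-name>" },
//       { "eventhubs.policykey", "<policy-key>" },
//       { "eventhubs.namespace", "<namespace>" },
//       { "eventhubs.name", "<eventhub-name>" },
//       { "eventhubs.partition.count", "4" },
//       { "eventhubs.checkpoint.dir", "/eventhubs/checkpoint" }
//   };
//   var eventHubStream = EventHubsUtils.CreateUnionStream(ssc, eventhubsParams);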
}
}

View file

@ -6,11 +6,16 @@ using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Runtime.Serialization.Formatters.Binary;
using System.IO;
using Microsoft.Spark.CSharp.Core;
namespace Microsoft.Spark.CSharp.Streaming
{
/// <summary>
/// Utils for Kafka input stream.
/// </summary>
public class KafkaUtils
{
/// <summary>
@ -30,6 +35,7 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <summary>
/// Create an input stream that pulls messages from a Kafka Broker.
/// </summary>
/// <param name="ssc">Spark Streaming Context</param>
/// <param name="zkQuorum">Zookeeper quorum (hostname:port,hostname:port,..).</param>
/// <param name="groupId">The group id for this consumer.</param>
/// <param name="topics">Dict of (topic_name -> numPartitions) to consume. Each partition is consumed in its own thread.</param>
@ -79,5 +85,90 @@ namespace Microsoft.Spark.CSharp.Streaming
{
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStream(topics, kafkaParams, fromOffsets), ssc, SerializedMode.Pair);
}
/// <summary>
/// Create an input stream that directly pulls messages from Kafka brokers, starting at the specified offsets.
///
/// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
/// in each batch duration and processes them without storing them.
///
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
/// You can access the offsets used in each batch from the generated RDDs (see
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
/// The information on consumed offset can be recovered from the checkpoint.
/// See the programming guide for details (constraints, etc.).
///
/// </summary>
/// <param name="ssc">Spark Streaming Context</param>
/// <param name="topics">list of topic_name to consume.</param>
/// <param name="kafkaParams">
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
/// with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="numPartitions">
/// User hint on how many Kafka RDD partitions to create instead of aligning with the Kafka partitions;
/// unbalanced Kafka partitions and/or under-distributed data will be redistributed evenly across
/// a (probably larger) number of RDD partitions.
/// If numPartitions = -1, either repartition based on spark.streaming.kafka.maxRatePerTask or do nothing if config not defined
/// If numPartitions = 0, repartition using original kafka partition count
/// If numPartitions > 0, repartition using this parameter
/// </param>
/// <returns>A DStream object</returns>
public static DStream<KeyValuePair<byte[], byte[]>> CreateDirectStreamWithRepartition(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets, int numPartitions = -1)
{
return new DStream<KeyValuePair<byte[], byte[]>>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, null, null), ssc, SerializedMode.Pair);
}
/// <summary>
/// Create an input stream that directly pulls messages from a Kafka Broker and specific offset.
///
/// This is not a receiver-based Kafka input stream; it directly pulls messages from Kafka
/// in each batch duration and processes them without storing them.
///
/// This does not use Zookeeper to store offsets. The consumed offsets are tracked
/// by the stream itself. For interoperability with Kafka monitoring tools that depend on
/// Zookeeper, you have to update Kafka/Zookeeper yourself from the streaming application.
/// You can access the offsets used in each batch from the generated RDDs (see
/// [[org.apache.spark.streaming.kafka.HasOffsetRanges]]).
/// To recover from driver failures, you have to enable checkpointing in the StreamingContext.
/// The information on consumed offset can be recovered from the checkpoint.
/// See the programming guide for details (constraints, etc.).
///
/// </summary>
/// <param name="ssc">Spark Streaming Context</param>
/// <param name="topics">list of topic_name to consume.</param>
/// <param name="kafkaParams">
/// Additional params for Kafka. Requires "metadata.broker.list" or "bootstrap.servers" to be set
/// with Kafka broker(s) (NOT zookeeper servers), specified in host1:port1,host2:port2 form.
/// </param>
/// <param name="fromOffsets">Per-topic/partition Kafka offsets defining the (inclusive) starting point of the stream.</param>
/// <param name="numPartitions">
/// User hint on how many Kafka RDD partitions to create instead of aligning with the Kafka partitions;
/// unbalanced Kafka partitions and/or under-distributed data will be redistributed evenly across
/// a (probably larger) number of RDD partitions.
/// If numPartitions = -1, either repartition based on spark.streaming.kafka.maxRatePerTask or do nothing if config not defined
/// If numPartitions = 0, repartition using original kafka partition count
/// If numPartitions > 0, repartition using this parameter
/// </param>
/// <param name="readFunc">user function to process the kafka data.</param>
/// <returns>A DStream object</returns>
public static DStream<T> CreateDirectStreamWithRepartitionAndReadFunc<T>(StreamingContext ssc, List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
int numPartitions, Func<int, IEnumerable<KeyValuePair<byte[], byte[]>>, IEnumerable<T>> readFunc)
{
var mapPartitionsWithIndexHelper = new MapPartitionsWithIndexHelper<KeyValuePair<byte[], byte[]>, T>(readFunc, true);
var transformHelper = new TransformHelper<KeyValuePair<byte[], byte[]>, T>(mapPartitionsWithIndexHelper.Execute);
var transformDynamicHelper = new TransformDynamicHelper<KeyValuePair<byte[], byte[]>, T>(transformHelper.Execute);
Func<double, RDD<dynamic>, RDD<dynamic>> func = transformDynamicHelper.Execute;
var formatter = new BinaryFormatter();
var stream = new MemoryStream();
formatter.Serialize(stream, func);
byte[] readFuncBytes = stream.ToArray();
string serializationMode = SerializedMode.Pair.ToString();
return new DStream<T>(ssc.streamingContextProxy.DirectKafkaStreamWithRepartition(topics, kafkaParams, fromOffsets, numPartitions, readFuncBytes, serializationMode), ssc);
}
}
}
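
A hedged sketch of the repartitioning direct-stream overload above; the broker list, topic and partition count are placeholders, `ssc` is assumed to be an existing StreamingContext, and `System.Text` is assumed for `Encoding`:

``` c#
var topics = new List<string> { "logs" };                      // placeholder topic
var kafkaParams = new Dictionary<string, string>
{
    { "metadata.broker.list", "broker1:9092,broker2:9092" }    // placeholder brokers
};
var fromOffsets = new Dictionary<string, long>();              // empty: start from default offsets

// Hint Mobius to spread each batch over 32 RDD partitions instead of
// mirroring the Kafka partition count.
var stream = KafkaUtils.CreateDirectStreamWithRepartition(
    ssc, topics, kafkaParams, fromOffsets, 32);

stream.Map(kvp => Encoding.UTF8.GetString(kvp.Value))
      .ForeachRDD(rdd =>
      {
          foreach (var line in rdd.Collect())
              Console.WriteLine(line);
      });
```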

View file

@ -0,0 +1,433 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using Microsoft.Spark.CSharp.Services;
namespace Microsoft.Spark.CSharp.Streaming
{
/// <summary>
/// DStream representing the stream of data generated by `mapWithState` operation on a pair DStream.
/// Additionally, it also gives access to the stream of state snapshots, that is, the state data of all keys after a batch has updated them.
/// </summary>
/// <typeparam name="K">Type of the key</typeparam>
/// <typeparam name="V">Type of the value</typeparam>
/// <typeparam name="S">Type of the state data</typeparam>
/// <typeparam name="M">Type of the mapped data</typeparam>
[Serializable]
public class MapWithStateDStream<K, V, S, M> : DStream<M>
{
internal DStream<KeyValuePair<K, S>> snapshotsDStream;
internal MapWithStateDStream(DStream<M> mappedDataDStream, DStream<KeyValuePair<K, S>> snapshotsDStream)
: base(mappedDataDStream.DStreamProxy, mappedDataDStream.streamingContext)
{
this.snapshotsDStream = snapshotsDStream;
}
/// <summary>
/// Return a pair DStream where each RDD is the snapshot of the state of all the keys.
/// </summary>
public DStream<KeyValuePair<K, S>> StateSnapshots()
{
return snapshotsDStream;
}
}
/// <summary>
/// Class to hold a state instance and the timestamp when the state is updated or created.
/// No need to explicitly make this class cloneable, since serialization and deserialization in the Worker already act as a kind of clone mechanism.
/// </summary>
/// <typeparam name="S">Type of the state data</typeparam>
[Serializable]
internal class KeyedState<S>
{
internal S state;
internal long ticks;
internal KeyedState()
{
}
internal KeyedState(S state, long ticks)
{
this.state = state;
this.ticks = ticks;
}
}
/// <summary>
/// Record storing the keyed state for MapWithStateRDD.
/// Each record contains a stateMap and a sequence of records returned by the mapping function of MapWithState.
/// Note: no need to explicitly make this class cloneable, since serialization and deserialization in the Worker already act as a kind of clone.
/// </summary>
/// <typeparam name="K">Type of the key</typeparam>
/// <typeparam name="S">Type of the state data</typeparam>
/// <typeparam name="M">Type of the mapped data</typeparam>
[Serializable]
internal class MapWithStateRDDRecord<K, S, M>
{
internal Dictionary<K, KeyedState<S>> stateMap = new Dictionary<K, KeyedState<S>>();
internal List<M> mappedData = new List<M>();
public MapWithStateRDDRecord()
{
}
public MapWithStateRDDRecord(long t, IEnumerable<KeyValuePair<K, S>> iter)
{
foreach (var p in iter)
{
stateMap[p.Key] = new KeyedState<S>(p.Value, t);
}
}
}
/// <summary>
/// Helper class to update states for an RDD partition.
/// Reference: https://github.com/apache/spark/blob/master/streaming/src/main/scala/org/apache/spark/streaming/rdd/MapWithStateRDD.scala
/// </summary>
/// <typeparam name="K">Type of the key</typeparam>
/// <typeparam name="V">Type of the value</typeparam>
/// <typeparam name="S">Type of the state data</typeparam>
/// <typeparam name="M">Type of the mapped data</typeparam>
[Serializable]
internal class UpdateStateHelper<K, V, S, M>
{
[NonSerialized]
private readonly ILoggerService logger = LoggerServiceFactory.GetLogger(typeof(UpdateStateHelper<K, V, S, M>));
private readonly Func<K, V, State<S>, M> f;
private readonly long ticks;
private readonly bool removeTimedoutData;
private readonly TimeSpan idleDuration;
internal UpdateStateHelper(Func<K, V, State<S>, M> f, long ticks, bool removeTimedoutData, TimeSpan idleDuration)
{
this.f = f;
this.ticks = ticks;
this.removeTimedoutData = removeTimedoutData;
this.idleDuration = idleDuration;
}
internal IEnumerable<dynamic> Execute(int pid, IEnumerable<dynamic> iter)
{
var enumerator = iter.GetEnumerator();
var preStateRddRecord = GetStateRecord(enumerator);
var stateRddRecord = preStateRddRecord;
while (enumerator.MoveNext())
{
KeyValuePair<K, V> kv = enumerator.Current;
KeyedState<S> keyedState;
State<S> wrappedState = stateRddRecord.stateMap.TryGetValue(kv.Key, out keyedState) ? new State<S>(keyedState.state) : new State<S>(default(S));
var mappedData = default(M);
try
{
mappedData = f(kv.Key, kv.Value, wrappedState);
}
catch (Exception e)
{
logger.LogException(e);
}
stateRddRecord.mappedData.Add(mappedData);
if (wrappedState.removed)
{
stateRddRecord.stateMap.Remove(kv.Key);
}
else if (wrappedState.updated || wrappedState.defined)
{
stateRddRecord.stateMap[kv.Key] = new KeyedState<S>(wrappedState.state, ticks);
}
}
// Get the timed out state records, call the mapping function on each and collect the data returned
if (removeTimedoutData)
{
long timeoutThresholdInTicks = ticks - idleDuration.Ticks;
var toBeRemovedKeys = new List<K>();
foreach (KeyValuePair<K, KeyedState<S>> entry in stateRddRecord.stateMap)
{
if (entry.Value.ticks >= timeoutThresholdInTicks) continue;
var timingOutstate = new State<S>(entry.Value.state, true);
var mappedData = default(M);
try
{
mappedData = f(entry.Key, default(V), timingOutstate);
}
catch (Exception e)
{
logger.LogException(e);
}
stateRddRecord.mappedData.Add(mappedData);
toBeRemovedKeys.Add(entry.Key);
}
foreach (var k in toBeRemovedKeys)
{
stateRddRecord.stateMap.Remove(k);
}
}
return new []{stateRddRecord};
}
internal MapWithStateRDDRecord<K, S, M> GetStateRecord(IEnumerator<dynamic> enumerator)
{
if (enumerator.MoveNext())
{
return enumerator.Current;
}
throw new InvalidOperationException("MapWithStateRDDRecord is missing.");
}
}
[Serializable]
internal class MapWithStateHelper<K, V, S, M>
{
private static readonly DateTime UnixTimeEpoch = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
private readonly Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc;
private readonly StateSpec<K, V, S, M> stateSpec;
internal MapWithStateHelper(Func<double, RDD<dynamic>, RDD<dynamic>> prevF, StateSpec<K, V, S, M> stateSpec)
{
prevFunc = prevF;
this.stateSpec = stateSpec;
}
internal RDD<dynamic> Execute(double t, RDD<dynamic> stateRDD, RDD<dynamic> valuesRDD)
{
long ticks = UnixTimeEpoch.AddMilliseconds(t).Ticks;
if (prevFunc != null)
{
valuesRDD = prevFunc(t, valuesRDD);
}
var values = valuesRDD.ConvertTo<KeyValuePair<K, V>>().PartitionBy(stateSpec.numPartitions);
if (stateRDD == null)
{
if (stateSpec.initialState != null)
{
if (stateSpec.initialState.sparkContext == null)
{
stateSpec.initialState.sparkContext = valuesRDD.sparkContext;
}
var partitionedInitialState = stateSpec.initialState.PartitionBy(stateSpec.numPartitions);
stateRDD = partitionedInitialState.MapPartitions(new MapWithStateMapPartitionHelper<K, V, S, M>(ticks).Execute, true).ConvertTo<dynamic>();
}
else
{
stateRDD = values.PartitionBy(stateSpec.numPartitions).MapPartitions(new MapWithStateMapPartitionHelper<K, V, S, M>(ticks).ExecuteWithoutInitialState, true).ConvertTo<dynamic>();
}
}
bool removeTimedoutData = stateSpec.idleDuration.Ticks != 0 && stateRDD.IsCheckpointed;
stateRDD.partitioner = values.partitioner;
RDD<dynamic> union = stateRDD.Union(values.ConvertTo<dynamic>());
return union.MapPartitionsWithIndex(new UpdateStateHelper<K, V, S, M>(stateSpec.mappingFunction, ticks, removeTimedoutData, stateSpec.idleDuration).Execute, true);
}
}
[Serializable]
internal class MapWithStateMapPartitionHelper<K, V, S, M>
{
internal long ticks;
internal MapWithStateMapPartitionHelper(long ticks)
{
this.ticks = ticks;
}
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> Execute(IEnumerable<KeyValuePair<K, S>> iter)
{
return new[] {new MapWithStateRDDRecord<K, S, M>(ticks, iter)};
}
internal IEnumerable<MapWithStateRDDRecord<K, S, M>> ExecuteWithoutInitialState(IEnumerable<KeyValuePair<K, V>> iter)
{
return new[] { new MapWithStateRDDRecord<K, S, M>() };
}
}
/// <summary>
/// Representing all the specifications of the DStream transformation `mapWithState` operation.
/// </summary>
/// <typeparam name="K">Type of the key</typeparam>
/// <typeparam name="V">Type of the value</typeparam>
/// <typeparam name="S">Type of the state data</typeparam>
/// <typeparam name="M">Type of the mapped data</typeparam>
[Serializable]
public class StateSpec<K, V, S, M>
{
internal Func<K, V, State<S>, M> mappingFunction;
internal int numPartitions;
internal TimeSpan idleDuration = TimeSpan.FromTicks(0);
internal RDD<KeyValuePair<K, S>> initialState = null;
/// <summary>
/// Create a StateSpec for setting all the specifications of the `mapWithState` operation on a pair DStream.
/// </summary>
/// <param name="mappingFunction">The function applied on every data item to manage the associated state and generate the mapped data</param>
public StateSpec(Func<K, V, State<S>, M> mappingFunction)
{
this.mappingFunction = mappingFunction;
}
/// <summary>
/// Set the number of partitions by which the state RDDs generated by `mapWithState` will be partitioned.
/// Hash partitioning will be used.
/// </summary>
/// <param name="numPartitions">The number of partitions</param>
/// <returns>The new StateSpec object</returns>
public StateSpec<K, V, S, M> NumPartitions(int numPartitions)
{
this.numPartitions = numPartitions;
return this;
}
/// <summary>
/// Set the duration after which the state of an idle key will be removed. A key and its state are
/// considered idle if the key has not received any data for at least the given duration. The
/// mapping function will be called one final time on the idle states that are going to be
/// removed, with [[org.apache.spark.streaming.State State.isTimingOut()]] set to `true` in that call.
/// </summary>
/// <param name="idleDuration">The idle time of duration</param>
/// <returns>The new StateSpec object</returns>
public StateSpec<K, V, S, M> Timeout(TimeSpan idleDuration)
{
this.idleDuration = idleDuration;
return this;
}
/// <summary>
/// Set the RDD containing the initial states that will be used by mapWithState
/// </summary>
/// <param name="initialState">The given initial state</param>
/// <returns>The new StateSpec object</returns>
public StateSpec<K, V, S, M> InitialState(RDD<KeyValuePair<K, S>> initialState)
{
this.initialState = initialState;
return this;
}
}
/// <summary>
/// Class for getting and updating the state in the mapping function used in the `mapWithState` operation
/// </summary>
/// <typeparam name="S">Type of the state</typeparam>
[Serializable]
public class State<S>
{
internal S state = default(S);
[NonSerialized]
internal bool defined = false;
[NonSerialized]
internal bool timingOut = false; // FIXME: set timingOut to true for timed-out keys
[NonSerialized]
internal bool updated = false;
[NonSerialized]
internal bool removed = false;
internal State(S state, bool timingOut = false)
{
this.state = state;
this.timingOut = timingOut;
removed = false;
updated = false;
if (!timingOut)
{
defined = !ReferenceEquals(null, state);
}
else
{
defined = true;
}
}
/// <summary>
/// Returns whether the state already exists
/// </summary>
/// <returns>true, if the state already exists; otherwise, false.</returns>
public bool Exists()
{
return defined;
}
/// <summary>
/// Gets the state if it exists, otherwise it will throw ArgumentException.
/// </summary>
/// <returns>The state</returns>
/// <exception cref="ArgumentException">ArgumentException if it does not exist.</exception>
public S Get()
{
if (defined)
{
return state;
}
throw new ArgumentException("State is not set");
}
/// <summary>
/// Updates the state with a new value.
/// </summary>
/// <param name="newState">The new state</param>
/// <exception cref="ArgumentException">ArgumentException if the state already be removed or timing out</exception>
public void Update(S newState)
{
if (removed || timingOut)
{
throw new ArgumentException("Cannot update the state that is timing out or has been removed.");
}
state = newState;
defined = true;
updated = true;
}
/// <summary>
/// Removes the state if it exists.
/// </summary>
/// <exception cref="ArgumentException">ArgumentException if the state already be removed or timing out</exception>
public void Remove()
{
if (removed || timingOut)
{
throw new ArgumentException("Cannot update the state that is timing out or has already been removed.");
}
defined = false;
updated = false;
removed = true;
}
/// <summary>
/// Returns whether the state is timing out and going to be removed by the system after the current batch.
/// </summary>
/// <returns>true, if it is timing out; otherwise, false.</returns>
public bool IsTimingOut()
{
return timingOut;
}
}
}
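
To show how the types above (StateSpec, State and MapWithStateDStream) fit together, here is a hedged sketch of a stateful running word count; `words` is assumed to be an existing `DStream<string>`, and the partition count and timeout are illustrative:

``` c#
var pairs = words.Map(w => new KeyValuePair<string, int>(w, 1));

var stateSpec = new StateSpec<string, int, int, int>((word, count, state) =>
    {
        var sum = count + (state.Exists() ? state.Get() : 0);
        if (!state.IsTimingOut())        // Update() throws for keys that are timing out
        {
            state.Update(sum);
        }
        return sum;                      // mapped value emitted for this record
    })
    .NumPartitions(4)
    .Timeout(TimeSpan.FromMinutes(30));  // drop keys that are idle for 30 minutes

MapWithStateDStream<string, int, int, int> counts = pairs.MapWithState(stateSpec);

// Per-batch mapped values
counts.ForeachRDD(rdd =>
{
    foreach (var c in rdd.Collect())
        Console.WriteLine(c);
});

// Snapshot of the full state after each batch
counts.StateSnapshots().ForeachRDD(rdd =>
{
    foreach (var kv in rdd.Collect())
        Console.WriteLine("{0} -> {1}", kv.Key, kv.Value);
});
```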

View file

@ -267,17 +267,16 @@ namespace Microsoft.Spark.CSharp.Streaming
int numPartitions = 0,
Func<KeyValuePair<K, V>, bool> filterFunc = null)
{
self.ValidatWindowParam(windowSeconds, slideSeconds);
self.ValidateWindowParam(windowSeconds, slideSeconds);
if (slideSeconds <= 0)
slideSeconds = self.SlideDuration;
// dstream to be transformed by subtracting old RDDs and adding new RDDs based on the window
var reduced = self.ReduceByKey(reduceFunc, numPartitions);
reduced.Cache();
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = reduced.Piplinable ? (reduced as TransformedDStream<KeyValuePair<K, V>>).func : null;
var helper = new ReduceByKeyAndWindowHelper<K, V>(reduceFunc, invReduceFunc, numPartitions, filterFunc, prevFunc);
var helper = new ReduceByKeyAndWindowHelper<K, V>(reduceFunc, invReduceFunc, numPartitions, filterFunc);
// function to reduce the new values that entered the window (e.g., adding new counts)
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> reduceF = helper.Reduce;
@ -292,17 +291,17 @@ namespace Microsoft.Spark.CSharp.Streaming
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> invReduceF = helper.InvReduce;
invStream = new MemoryStream();
formatter.Serialize(stream, invReduceF);
formatter.Serialize(invStream, invReduceF);
}
return new DStream<KeyValuePair<K, V>>(
SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpReducedWindowedDStream(
reduced.Piplinable ? reduced.prevDStreamProxy : reduced.DStreamProxy,
reduced.DStreamProxy,
stream.ToArray(),
invStream == null ? null : invStream.ToArray(),
windowSeconds,
slideSeconds,
(reduced.Piplinable ? reduced.prevSerializedMode : reduced.serializedMode).ToString()),
reduced.serializedMode.ToString()),
self.streamingContext
);
}
@ -319,13 +318,14 @@ namespace Microsoft.Spark.CSharp.Streaming
/// State update function - (newValues, oldState) => newState
/// If this function returns None, then corresponding state key-value pair will be eliminated.
/// </param>
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
Func<IEnumerable<V>, S, S> updateFunc,
Func<IEnumerable<V>, S, S> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
int numPartitions = 0)
{
return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, numPartitions);
return UpdateStateByKey<K, V, S>(self, new UpdateStateByKeyHelper<K, V, S>(updateFunc).Execute, initialState, numPartitions);
}
/// <summary>
@ -337,13 +337,14 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <typeparam name="S"></typeparam>
/// <param name="self"></param>
/// <param name="updateFunc">State update function - IEnumerable[K, [newValues, oldState]] => IEnumerable[K, newState]</param>
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
Func<IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc, RDD<KeyValuePair<K, S>> initialState = null,
int numPartitions = 0)
{
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<KeyValuePair<K, Tuple<IEnumerable<V>, S>>, KeyValuePair<K, S>>(updateFunc).Execute, numPartitions);
return UpdateStateByKey<K, V, S>(self, new MapPartitionsHelper<KeyValuePair<K, Tuple<IEnumerable<V>, S>>, KeyValuePair<K, S>>(updateFunc).Execute, initialState, numPartitions);
}
/// <summary>
@ -355,30 +356,68 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <typeparam name="S"></typeparam>
/// <param name="self"></param>
/// <param name="updateFunc">State update function - (pid, IEnumerable[K, [newValues, oldState]]) => IEnumerable[K, newState]</param>
/// <param name="initialState">Initial state value of each key</param>
/// <param name="numPartitions"></param>
/// <returns></returns>
public static DStream<KeyValuePair<K, S>> UpdateStateByKey<K, V, S>(this DStream<KeyValuePair<K, V>> self,
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> updateFunc,
int numPartitions = 0)
RDD<KeyValuePair<K, S>> initialState = null, int numPartitions = 0)
{
if (numPartitions <= 0)
numPartitions = self.streamingContext.SparkContext.DefaultParallelism;
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;
// completes pipelinable dstream by adding the last pipelinable operation
// before transforming to CSharpStateDStream so that UpdateStateByKey's
// parallel job covers all pipelinable operations before shuffling
var ds = self.Transform(new AddShuffleKeyHelper<K, V>(numPartitions).Execute);
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new UpdateStateByKeysHelper<K, V, S>(updateFunc, prevFunc, numPartitions).Execute;
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new UpdateStateByKeysHelper<K, V, S>(updateFunc, initialState, numPartitions).Execute;
var formatter = new BinaryFormatter();
var stream = new MemoryStream();
formatter.Serialize(stream, func);
return new DStream<KeyValuePair<K, S>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
ds.DStreamProxy,
stream.ToArray(),
"CSharpStateDStream",
ds.serializedMode.ToString(),
ds.serializedMode.ToString()),
self.streamingContext);
}
/// <summary>
/// Return a new "state" DStream where the state for each key is updated by applying
/// the given function on the previous state of the key and the new values of the key.
/// </summary>
public static MapWithStateDStream<K, V, S, M> MapWithState<K, V, S, M>(this DStream<KeyValuePair<K, V>> self, StateSpec<K, V, S, M> stateSpec)
{
if (stateSpec.numPartitions <= 0)
{
stateSpec = stateSpec.NumPartitions(self.streamingContext.SparkContext.DefaultParallelism);
}
Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc = self.Piplinable ? (self as TransformedDStream<KeyValuePair<K, V>>).func : null;
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> func = new MapWithStateHelper<K, V, S, M>(prevFunc, stateSpec).Execute;
var formatter = new BinaryFormatter();
var stream = new MemoryStream();
formatter.Serialize(stream, func);
var mapWithStateDStream = new DStream<MapWithStateRDDRecord<K, S, M>>(SparkCLREnvironment.SparkCLRProxy.StreamingContextProxy.CreateCSharpStateDStream(
self.Piplinable ? self.prevDStreamProxy : self.DStreamProxy,
stream.ToArray(),
"CSharpStateDStream",
self.serializedMode.ToString(),
(self.Piplinable ? self.prevSerializedMode : self.serializedMode).ToString()),
self.streamingContext);
DStream<M> mappedDataDStream = mapWithStateDStream.FlatMap(r => r.mappedData);
DStream<KeyValuePair<K, S>> snapshotsDStream = mapWithStateDStream.FlatMap(
r => r.stateMap.Select(entry => new KeyValuePair<K, S>(entry.Key, entry.Value.state)));
return new MapWithStateDStream<K, V, S, M>(mappedDataDStream, snapshotsDStream);
}
}
@ -425,6 +464,25 @@ namespace Microsoft.Spark.CSharp.Streaming
}
}
[Serializable]
internal class AddShuffleKeyHelper<K, V>
{
private readonly int numPartitions;
internal AddShuffleKeyHelper(int numPartitions)
{
this.numPartitions = numPartitions;
}
internal RDD<byte[]> Execute(RDD<KeyValuePair<K, V>> rdd)
{
var keyed = rdd.MapPartitionsWithIndex(new PairRDDFunctions.AddShuffleKeyHelper<K, V>(numPartitions).Execute, true);
keyed.bypassSerializer = true;
keyed.rddProxy = keyed.RddProxy;
return keyed;
}
}
[Serializable]
internal class MapValuesHelper<K, V, U>
{
@ -509,7 +567,7 @@ namespace Microsoft.Spark.CSharp.Streaming
this.numPartitions = numPartitions;
}
internal RDD<KeyValuePair<K, Tuple<V, Option<W>>>> Execute<K,V,W>(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
internal RDD<KeyValuePair<K, Tuple<V, Option<W>>>> Execute(RDD<KeyValuePair<K, V>> l, RDD<KeyValuePair<K, W>> r)
{
return l.LeftOuterJoin<K, V, W>(r, numPartitions);
}
@ -552,34 +610,28 @@ namespace Microsoft.Spark.CSharp.Streaming
private readonly Func<V, V, V> invReduceFunc;
private readonly int numPartitions;
private readonly Func<KeyValuePair<K, V>, bool> filterFunc;
private readonly Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc;
internal ReduceByKeyAndWindowHelper(Func<V, V, V> reduceF,
Func<V, V, V> invReduceF,
int numPartitions,
Func<KeyValuePair<K, V>, bool> filterF,
Func<double, RDD<dynamic>, RDD<dynamic>> prevF)
Func<KeyValuePair<K, V>, bool> filterF)
{
reduceFunc = reduceF;
invReduceFunc = invReduceF;
this.numPartitions = numPartitions;
filterFunc = filterF;
prevFunc = prevF;
}
internal RDD<dynamic> Reduce(double t, RDD<dynamic> a, RDD<dynamic> b)
{
if (prevFunc != null)
b = prevFunc(t, b);
var r = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc);
b.partitioner = new Partitioner(numPartitions, null);
var r = b.ConvertTo<KeyValuePair<K, V>>();
if (a != null)
{
if (prevFunc != null)
a = prevFunc(t, a);
r = a.ConvertTo<KeyValuePair<K, V>>().Union(r).ReduceByKey<K, V>(reduceFunc);
a.partitioner = b.partitioner;
r = a.ConvertTo<KeyValuePair<K, V>>().Union(r);
}
r = r.ReduceByKey<K, V>(reduceFunc, numPartitions);
if (filterFunc != null)
r.Filter(filterFunc);
return r.ConvertTo<dynamic>();
@ -587,13 +639,8 @@ namespace Microsoft.Spark.CSharp.Streaming
internal RDD<dynamic> InvReduce(double t, RDD<dynamic> a, RDD<dynamic> b)
{
if (prevFunc != null)
{
a = prevFunc(t, a);
b = prevFunc(t, b);
}
var rddb = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc);
a.partitioner = b.partitioner = new Partitioner(numPartitions, null);
var rddb = b.ConvertTo<KeyValuePair<K, V>>().ReduceByKey<K, V>(reduceFunc, numPartitions);
var rdda = a.ConvertTo<KeyValuePair<K, V>>();
var joined = rdda.Join<K, V, V>(rddb, numPartitions);
var r = joined.MapValues<K, Tuple<V, V>, V>(kv => kv.Item2 != null ? invReduceFunc(kv.Item1, kv.Item2) : kv.Item1);
@ -621,14 +668,14 @@ namespace Microsoft.Spark.CSharp.Streaming
internal class UpdateStateByKeysHelper<K, V, S>
{
private readonly Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> func;
private readonly Func<double, RDD<dynamic>, RDD<dynamic>> prevFunc;
private readonly RDD<KeyValuePair<K, S>> initialState;
private readonly int numPartitions;
internal UpdateStateByKeysHelper(
Func<int, IEnumerable<KeyValuePair<K, Tuple<IEnumerable<V>, S>>>, IEnumerable<KeyValuePair<K, S>>> f,
Func<double, RDD<dynamic>, RDD<dynamic>> prevF, int numPartitions)
RDD<KeyValuePair<K, S>> initialState, int numPartitions)
{
func = f;
prevFunc = prevF;
this.initialState = initialState;
this.numPartitions = numPartitions;
}
@ -637,10 +684,21 @@ namespace Microsoft.Spark.CSharp.Streaming
RDD<KeyValuePair<K, S>> state = null;
RDD<KeyValuePair<K, Tuple<IEnumerable<V>, S>>> g = null;
if (prevFunc != null)
valuesRDD = prevFunc(t, valuesRDD);
// call into scala side partitionBy directly since AddShuffleKey already applied
var values = new RDD<KeyValuePair<K, V>>(valuesRDD.sparkContext.SparkContextProxy.CreatePairwiseRDD(valuesRDD.rddProxy, numPartitions, 0), valuesRDD.sparkContext);
values.partitioner = new Partitioner(numPartitions, null);
var values = valuesRDD.ConvertTo<KeyValuePair<K, V>>();
if (stateRDD == null)
{
if (initialState != null)
{
if (initialState.sparkContext == null)
{
initialState.sparkContext = valuesRDD.sparkContext;
}
stateRDD = initialState.ConvertTo<dynamic>();
}
}
if (stateRDD == null)
{
@ -649,7 +707,6 @@ namespace Microsoft.Spark.CSharp.Streaming
else
{
state = stateRDD.ConvertTo<KeyValuePair<K, S>>();
values = values.PartitionBy(numPartitions);
state.partitioner = values.partitioner;
g = state.GroupWith(values, numPartitions).MapValues(x => new Tuple<IEnumerable<V>, S>(new List<V>(x.Item2), x.Item1.Count > 0 ? x.Item1[0] : default(S)));
}
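
A hedged sketch of the new initial-state overload of `UpdateStateByKey` above; `pairs` is assumed to be an existing `DStream<KeyValuePair<string, int>>`, the seed values are illustrative, and `System.Linq` is assumed for `Count()`:

``` c#
// Seed the state with counts carried over from a previous run (placeholder values).
var initialState = ssc.SparkContext.Parallelize(new[]
{
    new KeyValuePair<string, int>("the", 100),
    new KeyValuePair<string, int>("spark", 42)
});

var runningCounts = pairs.UpdateStateByKey<string, int, int>(
    (newValues, oldCount) => oldCount + newValues.Count(),   // fold this batch's values into the state
    initialState,
    4);                                                       // numPartitions
```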

View file

@ -52,10 +52,15 @@ namespace Microsoft.Spark.CSharp.Streaming
this.streamingContextProxy = streamingContextProxy;
}
public StreamingContext(SparkContext sparkContext, long durationMs)
/// <summary>
/// Initializes a new instance of StreamingContext with an existing SparkContext
/// </summary>
/// <param name="sparkContext">An existing SparkContext</param>
/// <param name="durationSeconds">the time interval at which streaming data will be divided into batches</param>
public StreamingContext(SparkContext sparkContext, int durationSeconds)
{
this.sparkContext = sparkContext;
streamingContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateStreamingContext(sparkContext, durationMs);
streamingContextProxy = SparkCLREnvironment.SparkCLRProxy.CreateStreamingContext(sparkContext, durationSeconds);
}
/// <summary>
@ -79,11 +84,17 @@ namespace Microsoft.Spark.CSharp.Streaming
return new StreamingContext(SparkCLREnvironment.SparkCLRProxy.CreateStreamingContext(checkpointPath));
}
/// <summary>
/// Start the execution of the streams.
/// </summary>
public void Start()
{
streamingContextProxy.Start();
}
/// <summary>
/// Stop the execution of the streams.
/// </summary>
public void Stop()
{
streamingContextProxy.Stop();
@ -95,10 +106,10 @@ namespace Microsoft.Spark.CSharp.Streaming
/// collection. This method allows the developer to specify how long to remember the RDDs (
/// if the developer wishes to query old data outside the DStream computation).
/// </summary>
/// <param name="durationMs">Minimum duration that each DStream should remember its RDDs</param>
public void Remember(long durationMs)
/// <param name="durationSeconds">Minimum duration that each DStream should remember its RDDs</param>
public void Remember(int durationSeconds)
{
streamingContextProxy.Remember(durationMs);
streamingContextProxy.Remember(durationSeconds);
}
/// <summary>
@ -152,10 +163,10 @@ namespace Microsoft.Spark.CSharp.Streaming
/// <summary>
/// Wait for the execution to stop.
/// </summary>
/// <param name="timeout">time to wait in seconds</param>
public void AwaitTerminationOrTimeout(int timeout)
/// <param name="timeout">time to wait in milliseconds</param>
public void AwaitTerminationOrTimeout(long timeout)
{
streamingContextProxy.AwaitTermination(timeout);
streamingContextProxy.AwaitTerminationOrTimeout(timeout);
}
/// <summary>
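
For clarity on the revised time units above, a short hedged sketch (`sparkContext` is assumed to exist): the constructor and `Remember` now take seconds, while `AwaitTerminationOrTimeout` takes milliseconds.

``` c#
var ssc = new StreamingContext(sparkContext, 10);   // 10-second batch interval
ssc.Remember(600);                                  // remember RDDs for 10 minutes (seconds)
ssc.Start();
ssc.AwaitTerminationOrTimeout(30000);               // wait up to 30 seconds (milliseconds)
ssc.Stop();
```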

View file

@ -2,7 +2,7 @@
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:template match="/">
##<center><H1><font color="darkorchid4">SparkCLR API Documentation<!--xsl:value-of select="$AssemblyName"/--></font></H1></center>
##<center><H1><font color="darkorchid4">Mobius API Documentation<!--xsl:value-of select="$AssemblyName"/--></font></H1></center>
<xsl:apply-templates select="//member[contains(@name,'T:') and not(contains(@name,'Helper')) and not(contains(@name,'Wrapper')) and not(contains(@name,'Configuration')) and not(contains(@name,'Proxy')) and not(contains(@name,'Interop')) and not(contains(@name,'Services'))]"/>
</xsl:template>

File diff not shown because it is too large

File diff hidden because one or more lines are too long

File diff hidden because one or more lines are too long

View file

@ -1,21 +1,14 @@
using System;
using System.IO;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Net;
using System.Net.Sockets;
using System.Runtime.Serialization.Formatters.Binary;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Interop.Ipc;
using NUnit.Framework;
using Moq;
using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Network;
namespace AdapterTest
{
@ -27,7 +20,7 @@ namespace AdapterTest
public class AccumulatorTest
{
private SparkContext sc;
private Socket sock;
private ISocketWrapper sock;
[SetUp]
@ -38,7 +31,7 @@ namespace AdapterTest
// get accumulator server port and connect to accumulator server
int serverPort = (sc.SparkContextProxy as MockSparkContextProxy).AccumulatorServerPort;
sock = new Socket(AddressFamily.InterNetwork, SocketType.Stream, ProtocolType.Tcp);
sock = SocketFactory.CreateSocket();
sock.Connect(IPAddress.Loopback, serverPort);
}
@ -49,29 +42,31 @@ namespace AdapterTest
try
{
using (var s = new NetworkStream(sock))
using (var s = sock.GetStream())
{
int numUpdates = 0;
SerDe.Write(s, numUpdates);
}
sock.Close();
}
catch
{
// do nothing here
}
finally
{
sock.Close();
}
}
/// <summary>
/// test when no errors, accumuator server receives data as expected and exit with 0
/// test that, when there are no errors, the accumulator server receives data as expected and exits with 0
/// </summary>
[Test]
public void TestAccumuatorSuccess()
{
Accumulator<int> accumulator = sc.Accumulator<int>(0);
using (var s = new NetworkStream(sock))
using (var s = sock.GetStream())
{
// write numUpdates
int numUpdates = 1;
@ -102,7 +97,7 @@ namespace AdapterTest
[Test]
public void TestUndefinedAccumuator()
{
using (var s = new NetworkStream(sock))
using (var s = sock.GetStream())
{
// write numUpdates
int numUpdates = 1;

View file

@ -72,6 +72,7 @@
<Compile Include="DataFrameNaFunctionsTest.cs" />
<Compile Include="DataFrameReaderTest.cs" />
<Compile Include="DataFrameWriterTest.cs" />
<Compile Include="EventHubsUtilsTest.cs" />
<Compile Include="JsonSerDeTest.cs" />
<Compile Include="FunctionsTest.cs" />
<Compile Include="Mocks\MockDataFrameReaderProxy.cs" />
@ -81,6 +82,7 @@
<Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="RowTest.cs" />
<Compile Include="SerDeTest.cs" />
<Compile Include="HiveContextTest.cs" />
<Compile Include="StatusTrackerTest.cs" />
<Compile Include="TestWithMoqDemo.cs" />
<Compile Include="Mocks\MockStructTypeProxy.cs" />
@ -107,12 +109,17 @@
<Compile Include="ComparableRDDTest.cs" />
<Compile Include="DoubleRDDTest.cs" />
<Compile Include="UserDefinedFunctionTest.cs" />
<Compile Include="WeakObjectManagerTest.cs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Adapter\Microsoft.Spark.CSharp\Adapter.csproj">
<Project>{ce999a96-f42b-4e80-b208-709d7f49a77c}</Project>
<Name>Adapter</Name>
</ProjectReference>
<ProjectReference Include="..\Tests.Common\Tests.Common.csproj">
<Project>{e4479c4c-e106-4b90-bf0c-319561cea9c4}</Project>
<Name>Tests.Common</Name>
</ProjectReference>
</ItemGroup>
<ItemGroup />
<ItemGroup>

View file

@ -108,7 +108,8 @@ namespace AdapterTest
// worker side operations
Broadcast<int> broadcastVarInWorker = CreateBroadcastVarInWorker(expectedValue, out bid, out dumpPath);
Broadcast.broadcastRegistry.Remove(bid);
Broadcast bc;
Broadcast.broadcastRegistry.TryRemove(bid, out bc);
// assert
Assert.Throws<ArgumentException>(() => { var broadcastValueInWorker = broadcastVarInWorker.Value; });

View file

@ -195,6 +195,24 @@ namespace AdapterTest
mockColumnProxy.Verify(m => m.BinOp("bitwiseXOR", column2.ColumnProxy), Times.Once);
}
[Test]
public void TestColumnGetHashCode()
{
var column1 = new Column(null);
Assert.AreEqual(0, column1.GetHashCode());
var column2 = new Column(mockColumnProxy.Object);
Assert.AreNotEqual(0, column2.GetHashCode());
}
[Test]
public void TestColumnEquals()
{
var column1 = new Column(mockColumnProxy.Object);
var column2 = new Column(mockColumnProxy.Object);
Assert.IsTrue(column1.Equals(column2));
}
[Test]
public void TestColumnLike()
{

View file

@ -4,6 +4,7 @@ using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using NUnit.Framework;
using System.Linq;
namespace AdapterTest
{
@ -39,6 +40,12 @@ namespace AdapterTest
Assert.AreEqual(2, taken.Length);
Assert.AreEqual("brown", taken[0]);
Assert.AreEqual("dog", taken[1]);
taken = words.Distinct().TakeOrdered(2, x => new string(x.ToCharArray().Reverse().ToArray()));
Array.Sort(taken, StringComparer.Ordinal);
Assert.AreEqual(2, taken.Length);
Assert.AreEqual("The", taken[0]);
Assert.AreEqual("the", taken[1]);
}
[Test]

View file

@ -4,9 +4,13 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Streaming;
using Moq;
using NUnit.Framework;
namespace AdapterTest
@ -17,7 +21,7 @@ namespace AdapterTest
[Test]
public void TestDStreamMapReduce()
{
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
var ssc = new StreamingContext(new SparkContext("", ""), 1);
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
var lines = ssc.TextFileStream(Path.GetTempPath());
@ -27,7 +31,8 @@ namespace AdapterTest
words.Slice(DateTime.MinValue, DateTime.MaxValue);
words.Cache();
words.Checkpoint(1000);
words.Checkpoint(1);
words.Window(1, 1);
words.Count().ForeachRDD((time, rdd) =>
{
@ -78,7 +83,7 @@ namespace AdapterTest
[Test]
public void TestDStreamTransform()
{
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
var ssc = new StreamingContext(new SparkContext("", ""), 1);
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
var lines = ssc.TextFileStream(Path.GetTempPath());
@ -134,7 +139,7 @@ namespace AdapterTest
[Test]
public void TestDStreamJoin()
{
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
var ssc = new StreamingContext(new SparkContext("", ""), 1);
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
var lines = ssc.TextFileStream(Path.GetTempPath());
@ -241,7 +246,7 @@ namespace AdapterTest
[Test]
public void TestDStreamUpdateStateByKey()
{
var ssc = new StreamingContext(new SparkContext("", ""), 1000);
var ssc = new StreamingContext(new SparkContext("", ""), 1);
Assert.IsNotNull((ssc.streamingContextProxy as MockStreamingContextProxy));
var lines = ssc.TextFileStream(Path.GetTempPath());
@ -267,8 +272,23 @@ namespace AdapterTest
// disable pipelining to UpdateStateByKey, which relies on checkpointing that the mock proxy doesn't support
pairs.Cache();
var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count);
var initialStateRdd = ssc.SparkContext.Parallelize(new[] { "AAA" }).Map( w => new KeyValuePair<string, int>("AAA", 22));
var state = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count, initialStateRdd);
state.ForeachRDD((time, rdd) =>
{
var taken = rdd.Collect();
Assert.AreEqual(taken.Length, 10);
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
}
});
// test when initialStateRdd is not provided
var state2 = pairs.UpdateStateByKey<string, int, int>((v, s) => s + (v as List<int>).Count);
state2.ForeachRDD((time, rdd) =>
{
var taken = rdd.Collect();
Assert.AreEqual(taken.Length, 9);
@ -276,9 +296,146 @@ namespace AdapterTest
foreach (object record in taken)
{
KeyValuePair<string, int> countByWord = (KeyValuePair<string, int>)record;
Assert.AreEqual(countByWord.Value, countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 24 : 23);
Assert.AreEqual(countByWord.Key == "The" || countByWord.Key == "dog" || countByWord.Key == "lazy" ? 23 : 22, countByWord.Value);
}
});
}
[Test]
public void TestDStreamMapWithState()
{
var mapwithStateDStreamProxy = new Mock<IDStreamProxy>();
var streamingContextProxy = new Mock<IStreamingContextProxy>();
streamingContextProxy.Setup(p =>
p.CreateCSharpStateDStream(It.IsAny<IDStreamProxy>(), It.IsAny<byte[]>(), It.IsAny<string>(), It.IsAny<string>(), It.IsAny<string>()))
.Returns(mapwithStateDStreamProxy.Object);
var sparkContextProxy = new Mock<ISparkContextProxy>();
var sparkConfProxy = new Mock<ISparkConfProxy>();
var sparkClrProxy = new Mock<ISparkCLRProxy>();
sparkClrProxy.Setup(p => p.StreamingContextProxy).Returns(streamingContextProxy.Object);
sparkClrProxy.Setup(p => p.SparkContextProxy).Returns(sparkContextProxy.Object);
sparkClrProxy.Setup(p => p.CreateSparkContext(It.IsAny<ISparkConfProxy>())).Returns(sparkContextProxy.Object);
sparkClrProxy.Setup(p => p.CreateSparkConf(It.IsAny<bool>())).Returns(sparkConfProxy.Object);
// reset SparkCLRProxy after the test completes
var originalSparkCLRProxy = SparkCLREnvironment.SparkCLRProxy;
try
{
SparkCLREnvironment.SparkCLRProxy = sparkClrProxy.Object;
var sparkConf = new SparkConf(false);
var ssc = new StreamingContext(new SparkContext(sparkContextProxy.Object, sparkConf), 10);
var dstreamProxy = new Mock<IDStreamProxy>();
var pairDStream = new DStream<KeyValuePair<string, int>>(dstreamProxy.Object, ssc);
var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v);
var stateDStream = pairDStream.MapWithState(stateSpec);
var snapshotDStream = stateDStream.StateSnapshots();
Assert.IsNotNull(stateDStream);
Assert.IsNotNull(snapshotDStream);
}
finally
{
SparkCLREnvironment.SparkCLRProxy = originalSparkCLRProxy;
}
}
[Test]
public void TestDStreamMapWithStateMapWithStateHelper()
{
// test when initialStateRdd is null
var stateSpec = new StateSpec<string, int, int, int>((k, v, s) => v).NumPartitions(2).Timeout(TimeSpan.FromSeconds(100));
var helper = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec);
var sparkContextProxy = new Mock<ISparkContextProxy>();
var sc = new SparkContext(sparkContextProxy.Object, null);
var pairwiseRddProxy = new Mock<IRDDProxy>();
sparkContextProxy.Setup(p => p.CreatePairwiseRDD(It.IsAny<IRDDProxy>(), It.IsAny<int>(), It.IsAny<long>())).Returns(pairwiseRddProxy.Object);
var pipelinedRddProxy = new Mock<IRDDProxy>();
pipelinedRddProxy.Setup(p => p.Union(It.IsAny<IRDDProxy>())).Returns(new Mock<IRDDProxy>().Object);
sparkContextProxy.Setup(p =>
p.CreateCSharpRdd(It.IsAny<IRDDProxy>(), It.IsAny<byte[]>(), It.IsAny<Dictionary<string, string>>(), It.IsAny<List<string>>(), It.IsAny<bool>(), It.IsAny<List<Broadcast>>(), It.IsAny<List<byte[]>>()))
.Returns(pipelinedRddProxy.Object);
var valueRddProxy = new Mock<IRDDProxy>();
var valuesRdd = new RDD<dynamic>(valueRddProxy.Object, sc);
var resultRdd = helper.Execute(DateTime.UtcNow.Millisecond, null, valuesRdd);
Assert.IsNotNull(resultRdd);
// test when initialStateRdd is not null
var initialStateRdd = new RDD<KeyValuePair<string, int>>(new Mock<IRDDProxy>().Object, null);
var stateSpec2 = new StateSpec<string, int, int, int>((k, v, s) => v).InitialState(initialStateRdd).NumPartitions(2);
var helper2 = new MapWithStateHelper<string, int, int, int>((t, rdd) => rdd, stateSpec2);
var resultRdd2 = helper2.Execute(DateTime.UtcNow.Millisecond, null, valuesRdd);
Assert.IsNotNull(resultRdd2);
}
[Test]
public void TestDStreamMapWithStateUpdateStateHelper()
{
var ticks = DateTime.UtcNow.Ticks;
var helper = new UpdateStateHelper<string, int, int, int>(
(k, v, state) =>
{
if (v < 0 && state.Exists())
{
state.Remove();
}
else if(!state.IsTimingOut())
{
state.Update(v + state.Get());
}
return v;
},
ticks, true, TimeSpan.FromSeconds(10));
var input = new dynamic[4];
var preStateRddRecord = new MapWithStateRDDRecord<string, int, int>(ticks - TimeSpan.FromSeconds(2).Ticks, new [] { new KeyValuePair<string, int>("1", 1), new KeyValuePair<string, int>("2", 2)});
preStateRddRecord.stateMap.Add("expired", new KeyedState<int>(0, ticks - TimeSpan.FromSeconds(60).Ticks));
input[0] = preStateRddRecord;
input[1] = new KeyValuePair<string, int>("1", -1);
input[2] = new KeyValuePair<string, int>("2", 2);
input[3] = new KeyValuePair<string, int>("3", 3);
var result = helper.Execute(1, input).GetEnumerator();
Assert.IsNotNull(result);
Assert.IsTrue(result.MoveNext());
MapWithStateRDDRecord<string, int, int> stateRddRecord = result.Current;
Assert.IsNotNull(stateRddRecord);
Assert.AreEqual(stateRddRecord.mappedData.Count, 4); // timed-out record also appears in the returned results
Assert.AreEqual(stateRddRecord.stateMap.Count, 2);
}
[Test]
public void TestConstantInputDStream()
{
var sc = new SparkContext("", "");
var rdd = sc.Parallelize(Enumerable.Range(0, 10), 1);
var ssc = new StreamingContext(sc, 1);
// test when rdd is null
Assert.Throws<ArgumentNullException>(() => new ConstantInputDStream<int>(null, ssc));
var constantInputDStream = new ConstantInputDStream<int>(rdd, ssc);
Assert.IsNotNull(constantInputDStream);
Assert.AreEqual(ssc, constantInputDStream.streamingContext);
}
}
}
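
Outside the test harness, the ConstantInputDStream exercised above could be used roughly as follows (a sketch; `sc` and `ssc` are assumed to be an existing SparkContext and StreamingContext, and `System.Linq` is assumed for `Sum()`):

``` c#
var rdd = sc.Parallelize(Enumerable.Range(0, 10), 1);
var constantStream = new ConstantInputDStream<int>(rdd, ssc);   // emits the same RDD in every batch

constantStream.ForeachRDD(batch =>
    Console.WriteLine("sum per batch: " + batch.Collect().Sum()));
```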

View file

@ -42,6 +42,13 @@ namespace AdapterTest
public void TestDropWithAny()
{
// arrange
const string columnName = "column1";
var mockSchemaProxy = new Mock<IStructTypeProxy>();
var mockFieldProxy = new Mock<IStructFieldProxy>();
mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
var sparkContext = new SparkContext("", "");
mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
@ -50,12 +57,21 @@ namespace AdapterTest
// act
var cols = new[] { "col1", "col2" };
var df = f.Drop("any", cols);
var df1 = f.Drop("any", cols);
var df2 = f.Drop();
var df3 = f.Drop("any");
// verify
Assert.IsNotNull(df);
Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
Assert.IsNotNull(df1);
Assert.AreEqual(df1.DataFrameProxy, dataFrame.DataFrameProxy);
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(cols.Length, cols), Times.Once);
Assert.IsNotNull(df2);
Assert.AreEqual(df2.DataFrameProxy, dataFrame.DataFrameProxy);
Assert.IsNotNull(df3);
Assert.AreEqual(df3.DataFrameProxy, dataFrame.DataFrameProxy);
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(1, new[] { columnName }), Times.Exactly(2));
}
[Test]
@ -106,6 +122,29 @@ namespace AdapterTest
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>()), Times.Never);
}
[Test]
public void TestDropWithMinNonNulls()
{
const string columnName = "column1";
var mockSchemaProxy = new Mock<IStructTypeProxy>();
var mockFieldProxy = new Mock<IStructFieldProxy>();
mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
var sparkContext = new SparkContext("", "");
mockDataFrameNaFunctionsProxy.Setup(m => m.Drop(It.IsAny<int>(), It.IsAny<string[]>())).Returns(mockDataFrameProxy.Object);
var dataFrame = new DataFrame(mockDataFrameProxy.Object, sparkContext);
var f = new DataFrameNaFunctions(mockDataFrameNaFunctionsProxy.Object, dataFrame, sparkContext);
var df = f.Drop(20);
Assert.IsNotNull(df);
Assert.AreEqual(df.DataFrameProxy, dataFrame.DataFrameProxy);
Assert.AreNotSame(dataFrame, df);
mockDataFrameNaFunctionsProxy.Verify(m => m.Drop(20, new[] { columnName }), Times.Once);
}
[Test]
public void TestFill()
{

View file

@ -44,6 +44,33 @@ namespace AdapterTest
SparkCLREnvironment.SparkCLRProxy = new MockSparkCLRProxy();
}
[Test]
public void TestRegisterTempTable()
{
mockDataFrameProxy.Setup(m => m.RegisterTempTable(It.IsAny<string>()));
var dataFrame = new DataFrame(mockDataFrameProxy.Object, null);
dataFrame.RegisterTempTable("TestTable");
mockDataFrameProxy.Verify(m => m.RegisterTempTable("TestTable"), Times.Once);
}
[Test]
public void TestDataFrameCount()
{
mockDataFrameProxy.Setup(m => m.Count()).Returns(1);
var dataFrame = new DataFrame(mockDataFrameProxy.Object, null);
Assert.AreEqual(1, dataFrame.Count());
mockDataFrameProxy.Verify(m => m.Count(), Times.Once);
}
[Test]
public void TestShow()
{
mockDataFrameProxy.Setup(m => m.GetShowString(It.IsAny<int>(), It.IsAny<bool>())).Returns("Show");
var dataFrame = new DataFrame(mockDataFrameProxy.Object, null);
dataFrame.Show();
mockDataFrameProxy.Verify(m => m.GetShowString(20, true), Times.Once);
}
[Test]
public void TestDataFrameJoin()
{
@ -51,10 +78,54 @@ namespace AdapterTest
var dataFrame = sqlContext.Read().Json(@"c:\path\to\input.json");
var dataFrame2 = sqlContext.Read().Json(@"c:\path\to\input2.json");
var joinedDataFrame = dataFrame.Join(dataFrame2, "JoinCol");
var paramValuesToJoinMethod = (joinedDataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference as object[];
var paramValuesToSecondDataFrameJsonFileMethod = ((paramValuesToJoinMethod[0] as MockDataFrameProxy).mockDataFrameReference as object[]);
var paramValuesToJoinMethod = (joinedDataFrame.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod = (paramValuesToJoinMethod[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod[0]);
Assert.AreEqual("JoinCol", paramValuesToJoinMethod[1]);
var joinedDataFrame2 = dataFrame.Join(dataFrame2, new[] {"JoinCol1", "JoinCol2"});
var paramValuesToJoinMethod2 = (joinedDataFrame2.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod2 = (paramValuesToJoinMethod2[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod2[0]);
Assert.AreEqual("JoinCol1", (paramValuesToJoinMethod2[1] as string[])[0]);
Assert.AreEqual("JoinCol2", (paramValuesToJoinMethod2[1] as string[])[1]);
var mockColumnProxy = new Mock<IColumnProxy>().Object;
var mockColumn = new Column(mockColumnProxy);
var joinedDataFrame3 = dataFrame.Join(dataFrame2, mockColumn);
var paramValuesToJoinMethod3 = (joinedDataFrame3.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod3 = (paramValuesToJoinMethod3[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod3[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod3[1]);
Assert.AreEqual(JoinType.Inner.Value, paramValuesToJoinMethod3[2]);
var joinedDataFrame4 = dataFrame.Join(dataFrame2, mockColumn, JoinType.Outer);
var paramValuesToJoinMethod4 = (joinedDataFrame4.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod4 = (paramValuesToJoinMethod4[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod4[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod4[1]);
Assert.AreEqual(JoinType.Outer.Value, paramValuesToJoinMethod4[2]);
var joinedDataFrame5 = dataFrame.Join(dataFrame2, mockColumn, JoinType.LeftOuter);
var paramValuesToJoinMethod5 = (joinedDataFrame5.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod5 = (paramValuesToJoinMethod5[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod5[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod5[1]);
Assert.AreEqual(JoinType.LeftOuter.Value, paramValuesToJoinMethod5[2]);
var joinedDataFrame6 = dataFrame.Join(dataFrame2, mockColumn, JoinType.RightOuter);
var paramValuesToJoinMethod6 = (joinedDataFrame6.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod6 = (paramValuesToJoinMethod6[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod6[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod6[1]);
Assert.AreEqual(JoinType.RightOuter.Value, paramValuesToJoinMethod6[2]);
var joinedDataFrame7 = dataFrame.Join(dataFrame2, mockColumn, JoinType.LeftSemi);
var paramValuesToJoinMethod7 = (joinedDataFrame7.DataFrameProxy as MockDataFrameProxy).mockDataFrameReference;
var paramValuesToSecondDataFrameJsonFileMethod7 = (paramValuesToJoinMethod7[0] as MockDataFrameProxy).mockDataFrameReference;
Assert.AreEqual(@"c:\path\to\input2.json", paramValuesToSecondDataFrameJsonFileMethod7[0]);
Assert.AreEqual(mockColumnProxy, paramValuesToJoinMethod7[1]);
Assert.AreEqual(JoinType.LeftSemi.Value, paramValuesToJoinMethod7[2]);
}
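
The Join overloads exercised in the test above map to application code roughly as follows (a sketch; the file paths and column names are placeholders, and the column-expression form assumes Column's == operator accepts another Column):

``` c#
var orders = sqlContext.Read().Json(@"hdfs:///data/orders.json");
var users = sqlContext.Read().Json(@"hdfs:///data/users.json");

var j1 = orders.Join(users, "userId");                          // single join column
var j2 = orders.Join(users, new[] { "userId", "region" });      // multiple join columns
var j3 = orders.Join(users, orders["userId"] == users["id"],    // join expression + join type
                     JoinType.LeftOuter);
```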
@ -187,6 +258,48 @@ namespace AdapterTest
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}
[Test]
public void TestFillNa()
{
// Arrange
const string columnName = "column1";
var mockSchemaProxy = new Mock<IStructTypeProxy>();
var mockFieldProxy = new Mock<IStructFieldProxy>();
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.GetSchema()).Returns(mockSchemaProxy.Object);
// dataframeNaFunctionsProxy
var dataFrameNaFunctionsProxy = new Mock<IDataFrameNaFunctionsProxy>();
dataFrameNaFunctionsProxy.Setup(d => d.Fill(It.IsAny<double>(), It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
dataFrameNaFunctionsProxy.Setup(d => d.Fill(It.IsAny<string>(), It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
dataFrameNaFunctionsProxy.Setup(d => d.Fill(It.IsAny<Dictionary<string, object>>())).Returns(expectedResultDataFrameProxy);
mockDataFrameProxy.Setup(m => m.Na()).Returns(dataFrameNaFunctionsProxy.Object);
mockSchemaProxy.Setup(m => m.GetStructTypeFields()).Returns(new List<IStructFieldProxy> { mockFieldProxy.Object });
mockFieldProxy.Setup(m => m.GetStructFieldName()).Returns(columnName);
var sc = new SparkContext(null);
var dict = new Dictionary<string, object> {{columnName, 1}};
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame1 = originalDataFrame.FillNa(1);
var actualResultDataFrame2 = originalDataFrame.FillNa("1", new[] {columnName});
var actualResultDataFrame3 = originalDataFrame.FillNa(dict);
// Assert
// assert Fill on the proxy was invoked with correct parameters
dataFrameNaFunctionsProxy.Verify(m => m.Fill(1, It.Is<string[]>(subset => subset.Length == 1 &&
subset.Contains(columnName))));
dataFrameNaFunctionsProxy.Verify(m => m.Fill("1", It.Is<string[]>(subset => subset.Length == 1 &&
subset.Contains(columnName))));
dataFrameNaFunctionsProxy.Verify(m => m.Fill(dict));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame1.DataFrameProxy);
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame2.DataFrameProxy);
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame3.DataFrameProxy);
}
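For reference, the three FillNa overloads verified above surface DataFrameNaFunctions.Fill; a minimal usage sketch follows, where `df` stands for any existing DataFrame and the "age"/"name" columns are hypothetical:

``` c#
// Sketch only: df is an existing DataFrame; "age" (numeric) and "name" (string) are placeholder columns.
DataFrame noNullAges = df.FillNa(0);                               // replace nulls in all numeric columns
DataFrame noNullNames = df.FillNa("unknown", new[] { "name" });    // replace nulls only in "name"
DataFrame filled = df.FillNa(new Dictionary<string, object>        // per-column replacement values
{
    { "age", 0 },
    { "name", "unknown" }
});
```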
[Test]
public void TestDropDuplicates()
{
@ -352,7 +465,7 @@ namespace AdapterTest
}
[Test]
public void TestSort_ColumnNames()
public void TestSort()
{
// Arrange
const string columnName = "column1";
@ -374,6 +487,28 @@ namespace AdapterTest
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrameProxy.DataFrameProxy);
}
[Test]
public void TestSortWithinPartitions()
{
// Arrange
const string columnName = "column1";
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
var mockColumnProxy = new Mock<IColumnProxy>();
var mockSortedColumnProxy = new Mock<IColumnProxy>();
mockColumnProxy.Setup(m => m.UnaryOp(It.IsAny<string>())).Returns(mockSortedColumnProxy.Object);
mockDataFrameProxy.Setup(m => m.GetColumn(It.IsAny<string>())).Returns(mockColumnProxy.Object);
mockDataFrameProxy.Setup(m => m.SortWithinPartitions(It.IsAny<IColumnProxy[]>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrameProxy = originalDataFrame.SortWithinPartitions(new[] { columnName });
// Assert
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrameProxy.DataFrameProxy);
}
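SortWithinPartitions orders rows inside each partition without imposing a global ordering, which is what this test drives through the proxy. A hedged sketch of the column-name overload (column name hypothetical):

``` c#
// Sketch only: rows are ordered by "timestamp" within each partition; no ordering across partitions is implied.
DataFrame locallySorted = df.SortWithinPartitions(new[] { "timestamp" });
```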
[Test]
public void TestAlias()
{
@ -544,6 +679,30 @@ namespace AdapterTest
mockDataFrameProxy.Verify(m => m.Repartition(numPartitions), Times.Once());
}
[Test]
public void TestRepartition2()
{
// arrange
mockDataFrameProxy.Setup(m => m.Repartition(It.IsAny<int>(), It.IsAny<IColumnProxy[]>()));
var sc = new SparkContext(null);
var dataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
const int numPartitions = 5;
IColumnProxy mockColumn1Proxy = new Mock<IColumnProxy>().Object;
Column mockColumn = new Column(mockColumn1Proxy);
// act
dataFrame.Repartition(new[] { mockColumn }, numPartitions);
// assert
mockDataFrameProxy.Verify(m => m.Repartition(numPartitions, new[] { mockColumn1Proxy }), Times.Once());
// act
dataFrame.Repartition(new[] { mockColumn });
// assert
mockDataFrameProxy.Verify(m => m.Repartition(new[] { mockColumn1Proxy }), Times.Once());
}
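The two Repartition overloads exercised here partition by columns, optionally with an explicit partition count. A sketch, assuming a hypothetical "dc" column, the DataFrame column indexer, and that these overloads return the repartitioned DataFrame like Repartition(int) does:

``` c#
// Sketch only: "dc" is a placeholder column; the second call lets Spark choose the partition count.
Column dc = df["dc"];
DataFrame byDcInFive = df.Repartition(new[] { dc }, 5);
DataFrame byDc = df.Repartition(new[] { dc });
```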
[Test]
public void TestSample()
{
@ -968,6 +1127,60 @@ namespace AdapterTest
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}
[Test]
public void TestSelect_ColumnName()
{
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.Select(It.IsAny<string>(), It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);
const string column1Name = "colName1";
const string column2Name = "colName2";
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame = originalDataFrame.Select(column1Name, column2Name);
// Assert
mockDataFrameProxy.Verify(m => m.Select(column1Name, new [] { column2Name } ));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}
[Test]
public void TestSelectExpr()
{
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.SelectExpr(It.IsAny<string[]>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);
const string columnExpr = "colB as newName";
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame = originalDataFrame.SelectExpr(columnExpr);
// Assert
mockDataFrameProxy.Verify(m => m.SelectExpr(new[] { columnExpr }));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}
[Test]
public void TestWhere()
{
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
mockDataFrameProxy.Setup(m => m.Filter(It.IsAny<string>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);
const string condition = "Filter Condition";
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResultDataFrame = originalDataFrame.Where(condition);
// Assert
mockDataFrameProxy.Verify(m => m.Filter(condition));
Assert.AreEqual(expectedResultDataFrameProxy, actualResultDataFrame.DataFrameProxy);
}
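Taken together, the Select, SelectExpr and Where tests above correspond to the following usage pattern; the column names and expressions are placeholders, and as the last verification shows, Where simply delegates to the string-based Filter:

``` c#
// Sketch only: column names and expressions are placeholders.
DataFrame projected = df.Select("name", "age");                       // select columns by name
DataFrame computed  = df.SelectExpr("name", "age * 2 as doubleAge");  // SQL expression per selected column
DataFrame filtered  = df.Where("age > 21");                           // delegates to Filter(string)
```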
[Test]
public void TestWithColumn()
{
@ -1186,6 +1399,26 @@ namespace AdapterTest
#region GroupedDataTest
[Test]
public void TestAgg()
{
// Arrange
var expectedResultDataFrameProxy = new Mock<IDataFrameProxy>().Object;
var mockGroupedDataProxy = new Mock<IGroupedDataProxy>();
mockDataFrameProxy.Setup(m => m.GroupBy()).Returns(mockGroupedDataProxy.Object);
mockDataFrameProxy.Setup(m => m.Agg(It.IsAny<IGroupedDataProxy>(), It.IsAny<Dictionary<string, string>>())).Returns(expectedResultDataFrameProxy);
var sc = new SparkContext(null);
var columnNameAggFuncDic = new Dictionary<string, string> {{"name", "count"}};
// Act
var originalDataFrame = new DataFrame(mockDataFrameProxy.Object, sc);
var actualResult = originalDataFrame.Agg(columnNameAggFuncDic);
// Assert
mockDataFrameProxy.Verify(m => m.Agg(mockGroupedDataProxy.Object, columnNameAggFuncDic)); // assert Agg was invoked with correct parameters
Assert.AreEqual(expectedResultDataFrameProxy, actualResult.DataFrameProxy);
}
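The Agg test confirms that a column-name-to-aggregate-function dictionary first groups over the whole DataFrame (GroupBy with no columns) and then aggregates. Roughly, from user code (column names are placeholders):

``` c#
// Sketch only: counts "name" and averages "age" over the entire DataFrame.
DataFrame aggregated = df.Agg(new Dictionary<string, string>
{
    { "name", "count" },
    { "age", "avg" }
});
aggregated.Show();
```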
[Test]
public void TestCount()
{

View file

@ -0,0 +1,39 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using System.Collections.Generic;
using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Streaming;
using Moq;
using NUnit.Framework;
namespace AdapterTest
{
[TestFixture]
public class EventHubsUtilsTest
{
[Test]
public void TestCreateUnionStream()
{
var streamingContextProxy = new Mock<IStreamingContextProxy>();
var mockDstreamProxy = new Mock<IDStreamProxy>().Object;
streamingContextProxy.Setup(
m => m.EventHubsUnionStream(It.IsAny<Dictionary<string, string>>(), It.IsAny<StorageLevelType>()))
.Returns(mockDstreamProxy);
var mockSparkClrProxy = new Mock<ISparkCLRProxy>();
mockSparkClrProxy.Setup(m => m.CreateStreamingContext(It.IsAny<SparkContext>(), It.IsAny<int>()))
.Returns(streamingContextProxy.Object);
SparkCLREnvironment.SparkCLRProxy = mockSparkClrProxy.Object;
var sparkContext = new SparkContext(SparkCLREnvironment.SparkCLRProxy.SparkContextProxy, new SparkConf(new Mock<ISparkConfProxy>().Object));
var streamingContext = new StreamingContext(sparkContext, 123);
var dstream = EventHubsUtils.CreateUnionStream(streamingContext, new Dictionary<string, string>());
Assert.AreEqual(mockDstreamProxy, dstream.DStreamProxy);
}
}
}
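This test drives EventHubsUtils.CreateUnionStream entirely through mocked proxies; in an application it is attached to a real StreamingContext. A hedged sketch follows, assuming an existing sparkContext; the Event Hubs parameter keys and values are illustrative only (consult the Event Hubs receiver documentation for the supported settings), and the batch-duration argument uses whatever unit this version's StreamingContext constructor expects (the mocks in this commit moved from milliseconds to seconds):

``` c#
// Sketch only: parameter keys/values below are illustrative placeholders.
var ssc = new StreamingContext(sparkContext, 10);
var eventHubsParams = new Dictionary<string, string>
{
    { "eventhubs.namespace", "<namespace>" },
    { "eventhubs.name", "<eventhub-name>" },
    { "eventhubs.policyname", "<policy-name>" },
    { "eventhubs.policykey", "<policy-key>" }
};
var messages = EventHubsUtils.CreateUnionStream(ssc, eventHubsParams);
messages.ForeachRDD(rdd => Console.WriteLine("batch size: " + rdd.Count()));
ssc.Start();
ssc.AwaitTermination();
```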

View file

@ -565,25 +565,25 @@ namespace AdapterTest
{
mockSparkContextProxy.Setup(m => m.CreateWindowFunction(It.IsAny<string>()));
Functions.RowNumber();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("rowNumber"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("row_number"), Times.Once);
Functions.DenseRank();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("denseRank"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("dense_rank"), Times.Once);
Functions.Rank();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("rank"), Times.Once);
Functions.CumeDist();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("cumeDist"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("cume_dist"), Times.Once);
Functions.PercentRank();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("percentRank"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("percent_rank"), Times.Once);
Functions.MonotonicallyIncreasingId();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("monotonicallyIncreasingId"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("monotonically_increasing_id"), Times.Once);
Functions.SparkPartitionId();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("sparkPartitionId"), Times.Once);
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("spark_partition_id"), Times.Once);
Functions.Rand();
mockSparkContextProxy.Verify(m => m.CreateWindowFunction("rand"), Times.Once);
@ -594,6 +594,60 @@ namespace AdapterTest
#endregion
#region udf functions
[Test]
public void TestUdfFunction()
{
var mockUdfProxy = new Mock<IUDFProxy>();
mockUdfProxy.Setup(m => m.Apply(It.IsAny<IColumnProxy[]>()));
mockSparkContextProxy.Setup(m => m.CreateUserDefinedCSharpFunction(It.IsAny<string>(), It.IsAny<byte[]>(), It.IsAny<string>())).Returns(mockUdfProxy.Object);
Functions.Udf(() => 0).Invoke();
mockUdfProxy.Verify(m => m.Apply(new IColumnProxy[] { }), Times.Once);
var column1 = GeneratorColum();
Functions.Udf<int, int>(i => 1).Invoke(column1);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy }), Times.Once);
var column2 = GeneratorColum();
Functions.Udf<int, int, int>( (i1, i2) => 2).Invoke(column1, column2);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy }), Times.Once);
var column3 = GeneratorColum();
Functions.Udf<int, int, int, int>((i1, i2, i3) => 3).Invoke(column1, column2, column3);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy }), Times.Once);
var column4 = GeneratorColum();
Functions.Udf<int, int, int, int, int>((i1, i2, i3, i4) => 4).Invoke(column1, column2, column3, column4);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy }), Times.Once);
var column5 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int>((i1, i2, i3, i4, i5) => 5).Invoke(column1, column2, column3, column4, column5);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy }), Times.Once);
var column6 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6) => 6).Invoke(column1, column2, column3, column4, column5, column6);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy }), Times.Once);
var column7 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7) => 7).Invoke(column1, column2, column3, column4, column5, column6, column7);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy }), Times.Once);
var column8 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7, i8) => 8).Invoke(column1, column2, column3, column4, column5, column6, column7, column8);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy, column8.ColumnProxy }), Times.Once);
var column9 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7, i8, i9) => 9).Invoke(column1, column2, column3, column4, column5, column6, column7, column8, column9);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy, column8.ColumnProxy, column9.ColumnProxy }), Times.Once);
var column10 = GeneratorColum();
Functions.Udf<int, int, int, int, int, int, int, int, int, int, int>((i1, i2, i3, i4, i5, i6, i7, i8, i9, i10) => 10).Invoke(column1, column2, column3, column4, column5, column6, column7, column8, column9, column10);
mockUdfProxy.Verify(m => m.Apply(new[] { column1.ColumnProxy, column2.ColumnProxy, column3.ColumnProxy, column4.ColumnProxy, column5.ColumnProxy, column6.ColumnProxy, column7.ColumnProxy, column8.ColumnProxy, column9.ColumnProxy, column10.ColumnProxy }), Times.Once);
}
#endregion
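Functions.Udf wraps plain C# delegates (up to ten arguments, as the verifications above enumerate) into functions over Columns. A small sketch against an assumed DataFrame df with placeholder columns, assuming the invoked UDF yields a Column that can be fed to WithColumn:

``` c#
// Sketch only: df is an existing DataFrame; "name", "age" and "bonus" are placeholder columns.
var toUpper = Functions.Udf<string, string>(s => s.ToUpper());
var add = Functions.Udf<int, int, int>((x, y) => x + y);

DataFrame withUpperName = df.WithColumn("nameUpper", toUpper.Invoke(df["name"]));
DataFrame withTotal = df.WithColumn("total", add.Invoke(df["age"], df["bonus"]));
```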
private Column GeneratorColum()
{
Mock<IColumnProxy> mockColumnProxy = new Mock<IColumnProxy>();

View file

@ -0,0 +1,67 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE file in the project root for full license information.
using System;
using AdapterTest.Mocks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Sql;
using NUnit.Framework;
using Moq;
using Microsoft.Spark.CSharp.Interop;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using System.Collections.Generic;
namespace AdapterTest
{
/// <summary>
/// Validates interaction between HiveContext and its proxies
/// </summary>
[TestFixture]
public class HiveContextTest
{
private static Mock<ISqlContextProxy> mockSqlContextProxy;
[OneTimeSetUp]
public static void ClassInitialize()
{
mockSqlContextProxy = new Mock<ISqlContextProxy>();
}
[SetUp]
public void TestInitialize()
{
mockSqlContextProxy.Reset();
}
[TearDown]
public void TestCleanUp()
{
// Revert to the static mock class to avoid blocking other test methods that use it
SparkCLREnvironment.SparkCLRProxy = new MockSparkCLRProxy();
}
[Test]
public void TestHiveContextConstructor()
{
var hiveContext = new HiveContext(new SparkContext("", ""));
Assert.IsNotNull((hiveContext.SqlContextProxy as MockSqlContextProxy).mockSqlContextReference);
}
[Test]
public void TestHiveContextRefreshTable()
{
// arrange
var mockSparkContextProxy = new Mock<ISparkContextProxy>();
mockSqlContextProxy.Setup(m => m.RefreshTable(It.IsAny<string>()));
var hiveContext = new HiveContext(new SparkContext("", ""), mockSqlContextProxy.Object);
// act
hiveContext.RefreshTable("table");
// assert
mockSqlContextProxy.Verify(m => m.RefreshTable("table"));
}
}
}
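HiveContext layers Hive-specific operations such as RefreshTable on top of SqlContext. A hedged usage sketch, assuming a Hive-enabled Spark build, an existing sparkContext, a placeholder table name, and the Sql method inherited from SqlContext:

``` c#
// Sketch only: "web_logs" is a placeholder Hive table name.
var hiveContext = new HiveContext(sparkContext);
var errors = hiveContext.Sql("SELECT * FROM web_logs WHERE level = 'ERROR'");
errors.Show();
hiveContext.RefreshTable("web_logs");   // re-read table metadata after the underlying files change
```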

View file

@ -57,7 +57,7 @@ namespace AdapterTest.Mocks
{
}
public void Checkpoint(long intervalMs)
public void Checkpoint(int intervalSeconds)
{
}

View file

@ -146,12 +146,12 @@ namespace AdapterTest.Mocks
public IDataFrameProxy Join(IDataFrameProxy otherScalaDataFrameReference, string[] joinColumnNames)
{
throw new NotImplementedException();
return new MockDataFrameProxy(new object[] { otherScalaDataFrameReference, joinColumnNames }, SqlContextProxy);
}
public IDataFrameProxy Join(IDataFrameProxy otherScalaDataFrameReference, IColumnProxy scalaColumnReference, string joinType)
{
throw new NotImplementedException();
return new MockDataFrameProxy(new object[] { otherScalaDataFrameReference, scalaColumnReference, joinType }, SqlContextProxy);
}
public bool IsLocal
@ -329,5 +329,20 @@ namespace AdapterTest.Mocks
{
throw new NotImplementedException();
}
public IDataFrameProxy Repartition(int numPartitions, IColumnProxy[] columns)
{
throw new NotImplementedException();
}
public IDataFrameProxy Repartition(IColumnProxy[] columns)
{
throw new NotImplementedException();
}
public IDataFrameProxy SortWithinPartitions(IColumnProxy[] columns)
{
throw new NotImplementedException();
}
}
}

View file

@ -18,6 +18,7 @@ using NUnit.Framework;
namespace AdapterTest.Mocks
{
[Serializable]
internal class MockRddProxy : IRDDProxy
{
internal IEnumerable<dynamic> result;
@ -64,11 +65,6 @@ namespace AdapterTest.Mocks
return MockSparkContextProxy.RunJob(this);
}
public int PartitionLength()
{
return 1;
}
public void Cache()
{ }

View file

@ -58,7 +58,7 @@ namespace AdapterTest.Mocks
return false;
}
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, long durationMs)
public IStreamingContextProxy CreateStreamingContext(SparkContext sparkContext, int durationSeconds)
{
streamingContextProxy = new MockStreamingContextProxy();
return streamingContextProxy;

View file

@ -3,21 +3,16 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Net;
using System.Net.Sockets;
using System.Runtime.CompilerServices;
using System.Runtime.Serialization;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Threading.Tasks;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Proxy;
using Microsoft.Spark.CSharp.Proxy.Ipc;
using Microsoft.Spark.CSharp.Interop.Ipc;
using NUnit.Framework;
using Microsoft.Spark.CSharp.Network;
namespace AdapterTest.Mocks
{
@ -33,7 +28,7 @@ namespace AdapterTest.Mocks
}
public void AddFile(string filePath)
{}
{ }
public IRDDProxy TextFile(string filePath, int minPartitions)
{
@ -84,14 +79,14 @@ namespace AdapterTest.Mocks
}
}
public IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions)
public IRDDProxy CreatePairwiseRDD(IRDDProxy javaReferenceInByteArrayRdd, int numPartitions, long partitionFuncId)
{
return javaReferenceInByteArrayRdd;
}
public void SetLogLevel(string logLevel)
{}
{ }
public string Version
{
@ -204,13 +199,13 @@ namespace AdapterTest.Mocks
return ms.ToArray();
});
TcpListener listener = new TcpListener(IPAddress.Loopback, 0);
listener.Start();
var listener = SocketFactory.CreateSocket();
listener.Listen();
Task.Run(() =>
{
using (Socket socket = listener.AcceptSocket())
using (Stream ns = new NetworkStream(socket))
using (var socket = listener.Accept())
using (var ns = socket.GetStream())
{
foreach (var item in result)
{
@ -219,7 +214,7 @@ namespace AdapterTest.Mocks
}
}
});
return (listener.LocalEndpoint as IPEndPoint).Port;
return (listener.LocalEndPoint as IPEndPoint).Port;
}
public int RunJob(IRDDProxy rdd, IEnumerable<int> partitions)
@ -282,6 +277,11 @@ namespace AdapterTest.Mocks
return new MockSqlContextProxy(this);
}
public ISqlContextProxy CreateHiveContext()
{
return new MockSqlContextProxy(this);
}
public IRDDProxy Parallelize(IEnumerable<byte[]> values, int numSlices)
{
return new MockRddProxy(null);

View file

@ -68,5 +68,80 @@ namespace AdapterTest.Mocks
{
throw new NotImplementedException();
}
public ISqlContextProxy NewSession()
{
throw new NotImplementedException();
}
public string GetConf(string key, string defaultValue)
{
throw new NotImplementedException();
}
public void SetConf(string key, string value)
{
throw new NotImplementedException();
}
public void RegisterDataFrameAsTable(IDataFrameProxy dataFrameProxy, string tableName)
{
throw new NotImplementedException();
}
public void DropTempTable(string tableName)
{
throw new NotImplementedException();
}
public IDataFrameProxy Table(string tableName)
{
throw new NotImplementedException();
}
public IDataFrameProxy Tables()
{
throw new NotImplementedException();
}
public IDataFrameProxy Tables(string databaseName)
{
throw new NotImplementedException();
}
public IEnumerable<string> TableNames()
{
throw new NotImplementedException();
}
public void CacheTable(string tableName)
{
throw new NotImplementedException();
}
public void UncacheTable(string tableName)
{
throw new NotImplementedException();
}
public void ClearCache()
{
throw new NotImplementedException();
}
public IEnumerable<string> TableNames(string databaseName)
{
throw new NotImplementedException();
}
public bool IsCached(string tableName)
{
throw new NotImplementedException();
}
public void RefreshTable(string tableName)
{
throw new NotImplementedException();
}
}
}

View file

@ -18,20 +18,16 @@ namespace AdapterTest.Mocks
{
private IFormatter formatter = new BinaryFormatter();
public void Start()
{
}
{}
public void Stop()
{
}
{}
public void Remember(long durationMs)
{
}
public void Remember(int durationSeconds)
{}
public void Checkpoint(string directory)
{
}
{}
public IDStreamProxy TextFileStream(string directory)
{
@ -53,6 +49,12 @@ namespace AdapterTest.Mocks
return new MockDStreamProxy();
}
public IDStreamProxy DirectKafkaStreamWithRepartition(List<string> topics, Dictionary<string, string> kafkaParams, Dictionary<string, long> fromOffsets,
int numPartitions, byte[] readFunc, string serializationMode)
{
return new MockDStreamProxy();
}
public IDStreamProxy Union(IDStreamProxy firstDStreams, IDStreamProxy[] otherDStreams)
{
return new MockDStreamProxy();
@ -62,7 +64,7 @@ namespace AdapterTest.Mocks
{
}
public void AwaitTermination(int timeout)
public void AwaitTerminationOrTimeout(long timeout)
{
}
@ -102,10 +104,24 @@ namespace AdapterTest.Mocks
{
Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>> f = (Func<double, RDD<dynamic>, RDD<dynamic>, RDD<dynamic>>)formatter.Deserialize(new MemoryStream(func));
RDD<dynamic> rdd = f(DateTime.UtcNow.Ticks,
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")),
null,
new RDD<dynamic>((jdstream as MockDStreamProxy).rddProxy ?? new MockRddProxy(null), new SparkContext("", "")));
return new MockDStreamProxy(rdd.RddProxy);
}
public IDStreamProxy CreateConstantInputDStream(IRDDProxy rddProxy)
{
return new MockDStreamProxy();
}
public IDStreamProxy EventHubsUnionStream(Dictionary<string, string> eventHubsParams, StorageLevelType storageLevelType)
{
throw new NotImplementedException();
}
public IDStreamProxy KafkaMetaStream(byte[] metaParams, uint numPartitions)
{
throw new NotImplementedException();
}
}
}

View file

@ -1,9 +1,8 @@
using System;
using System.Collections.Generic;
using System.IO;
using AdapterTest.Mocks;
using System.Linq;
using Microsoft.Spark.CSharp.Core;
using Microsoft.Spark.CSharp.Interop.Ipc;
using NUnit.Framework;
namespace AdapterTest
@ -155,6 +154,30 @@ namespace AdapterTest
Assert.AreEqual(9, records.Length);
}
[Test]
public void TestPairRddPartitionBy()
{
Func<dynamic, int> partitionFunc = key => 1;
var rddPartitionBy = pairs.PartitionBy(3, partitionFunc);
Assert.AreEqual(new Partitioner(3, partitionFunc), rddPartitionBy.partitioner);
}
[Test]
public void TestPairRddSortByKey()
{
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(true, null, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}
[Test]
public void TestPairRddSortByKey2()
{
var expectedSortedRdd = pairs.Collect().OrderBy(kv => kv.Key, StringComparer.OrdinalIgnoreCase).ToArray();
var rddSortByKey = pairs.SortByKey(true, 1, key => key.ToLowerInvariant()).Collect();
CollectionAssert.AreEqual(expectedSortedRdd, rddSortByKey);
}
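The new pair-RDD tests above cover custom partitioning and key-function-based sorting. Roughly, from user code, where `pairs` stands for an RDD of KeyValuePair&lt;string, int&gt; built elsewhere and the partition/key functions are illustrative:

``` c#
// Sketch only: pairs is an existing RDD<KeyValuePair<string, int>>.
Func<dynamic, int> partitionFunc = key => key.ToString().Length % 3;   // route keys to one of 3 partitions
var partitioned = pairs.PartitionBy(3, partitionFunc);

// Ascending, case-insensitive sort by key; null leaves the partition count to the default.
var sortedPairs = pairs.SortByKey(true, null, key => key.ToLowerInvariant()).Collect();
```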
[Test]
public void TestPairRddProxy()
{

Some files were not shown because too many files have changed in this diff. Show more