    _   _ _____ _____    __              ____                   _
   | \ | | ____|_   _|  / _| ___  _ __  / ___| _ __   __ _ _ __| | __
   |  \| |  _|   | |   | |_ / _ \| '__| \___ \| '_ \ / _` | '__| |/ /
  _| |\  | |___  | |   |  _| (_) | |     ___) | |_) | (_| | |  |   <
 (_)_| \_|_____| |_|   |_|  \___/|_|    |____/| .__/ \__,_|_|  |_|\_\
                                              |_|
dotnet-bot authored 2019-04-23 21:08:44 -07:00, committed by Immo Landwerth
Commit 23ed6cddfb
No key found matching this signature
GPG key ID: 962A13C9167CE951
198 changed files with 31477 additions and 0 deletions

158
.editorconfig Normal file

@@ -0,0 +1,158 @@
# editorconfig.org
# top-most EditorConfig file
root = true
# Default settings:
# A newline ending every file
# Use 4 spaces as indentation
[*]
insert_final_newline = true
indent_style = space
indent_size = 4
[project.json]
indent_size = 2
# C# files
[*.cs]
# New line preferences
csharp_new_line_before_open_brace = all
csharp_new_line_before_else = true
csharp_new_line_before_catch = true
csharp_new_line_before_finally = true
csharp_new_line_before_members_in_object_initializers = true
csharp_new_line_before_members_in_anonymous_types = true
csharp_new_line_between_query_expression_clauses = true
# Indentation preferences
csharp_indent_block_contents = true
csharp_indent_braces = false
csharp_indent_case_contents = true
csharp_indent_switch_labels = true
csharp_indent_labels = one_less_than_current
# avoid this. unless absolutely necessary
dotnet_style_qualification_for_field = false:suggestion
dotnet_style_qualification_for_property = false:suggestion
dotnet_style_qualification_for_method = false:suggestion
dotnet_style_qualification_for_event = false:suggestion
# only use var when it's obvious what the variable type is
csharp_style_var_for_built_in_types = false:none
csharp_style_var_when_type_is_apparent = false:none
csharp_style_var_elsewhere = false:suggestion
# use language keywords instead of BCL types
dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion
dotnet_style_predefined_type_for_member_access = true:suggestion
# name all constant fields using PascalCase
dotnet_naming_rule.constant_fields_should_be_pascal_case.severity = suggestion
dotnet_naming_rule.constant_fields_should_be_pascal_case.symbols = constant_fields
dotnet_naming_rule.constant_fields_should_be_pascal_case.style = pascal_case_style
dotnet_naming_symbols.constant_fields.applicable_kinds = field
dotnet_naming_symbols.constant_fields.required_modifiers = const
dotnet_naming_style.pascal_case_style.capitalization = pascal_case
# static fields should have s_ prefix
dotnet_naming_rule.static_fields_should_have_prefix.severity = suggestion
dotnet_naming_rule.static_fields_should_have_prefix.symbols = static_fields
dotnet_naming_rule.static_fields_should_have_prefix.style = static_prefix_style
dotnet_naming_symbols.static_fields.applicable_kinds = field
dotnet_naming_symbols.static_fields.required_modifiers = static
dotnet_naming_style.static_prefix_style.required_prefix = s_
dotnet_naming_style.static_prefix_style.capitalization = camel_case
# internal and private fields should be _camelCase
dotnet_naming_rule.camel_case_for_private_internal_fields.severity = suggestion
dotnet_naming_rule.camel_case_for_private_internal_fields.symbols = private_internal_fields
dotnet_naming_rule.camel_case_for_private_internal_fields.style = camel_case_underscore_style
dotnet_naming_symbols.private_internal_fields.applicable_kinds = field
dotnet_naming_symbols.private_internal_fields.applicable_accessibilities = private, internal
dotnet_naming_style.camel_case_underscore_style.required_prefix = _
dotnet_naming_style.camel_case_underscore_style.capitalization = camel_case
# Code style defaults
dotnet_sort_system_directives_first = true
csharp_preserve_single_line_blocks = true
csharp_preserve_single_line_statements = false
# Expression-level preferences
dotnet_style_object_initializer = true:suggestion
dotnet_style_collection_initializer = true:suggestion
dotnet_style_explicit_tuple_names = true:suggestion
dotnet_style_coalesce_expression = true:suggestion
dotnet_style_null_propagation = true:suggestion
# Expression-bodied members
csharp_style_expression_bodied_methods = false:none
csharp_style_expression_bodied_constructors = false:none
csharp_style_expression_bodied_operators = false:none
csharp_style_expression_bodied_properties = true:none
csharp_style_expression_bodied_indexers = true:none
csharp_style_expression_bodied_accessors = true:none
# Pattern matching
csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
csharp_style_inlined_variable_declaration = true:suggestion
# Null checking preferences
csharp_style_throw_expression = true:suggestion
csharp_style_conditional_delegate_call = true:suggestion
# Space preferences
csharp_space_after_cast = false
csharp_space_after_colon_in_inheritance_clause = true
csharp_space_after_comma = true
csharp_space_after_dot = false
csharp_space_after_keywords_in_control_flow_statements = true
csharp_space_after_semicolon_in_for_statement = true
csharp_space_around_binary_operators = before_and_after
csharp_space_around_declaration_statements = do_not_ignore
csharp_space_before_colon_in_inheritance_clause = true
csharp_space_before_comma = false
csharp_space_before_dot = false
csharp_space_before_open_square_brackets = false
csharp_space_before_semicolon_in_for_statement = false
csharp_space_between_empty_square_brackets = false
csharp_space_between_method_call_empty_parameter_list_parentheses = false
csharp_space_between_method_call_name_and_opening_parenthesis = false
csharp_space_between_method_call_parameter_list_parentheses = false
csharp_space_between_method_declaration_empty_parameter_list_parentheses = false
csharp_space_between_method_declaration_name_and_open_parenthesis = false
csharp_space_between_method_declaration_parameter_list_parentheses = false
csharp_space_between_parentheses = false
csharp_space_between_square_brackets = false
# Blocks are allowed
csharp_prefer_braces = true:silent
# Xml project files
[*.{csproj,vcxproj,vcxproj.filters,proj,nativeproj,locproj}]
indent_size = 2
# Xml build files
[*.builds]
indent_size = 2
# Xml files
[*.{xml,stylecop,resx,ruleset}]
indent_size = 2
# Xml config files
[*.{props,targets,config,nuspec}]
indent_size = 2
# Shell scripts
[*.sh]
end_of_line = lf
[*.{cmd,bat}]
end_of_line = crlf
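As an illustration of the field-naming rules configured above, here is a minimal hypothetical C# class (not part of this commit) that satisfies them: constant fields in PascalCase, static fields prefixed with s_, and private instance fields in _camelCase with no this. qualification.

internal class NamingExample
{
    private const int DefaultRetryCount = 3;    // constant field: PascalCase
    private static int s_instanceCount;         // static field: s_ prefix
    private readonly string _connectionString;  // private field: _camelCase

    internal NamingExample(string connectionString)
    {
        _connectionString = connectionString;   // no "this." qualification, per the rules above
        s_instanceCount++;
    }
}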

366
.gitignore vendored Normal file

@@ -0,0 +1,366 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# Below are ignore rules for Java, taken from github/gitignore.
# Compiled class file
*.class
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package Files #
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
# IntelliJ file
*.iml
# The target folder contains the output of building
**/target/**

31
CONTRIBUTING.md Normal file

@@ -0,0 +1,31 @@
# Welcome!
If you are here, it means you are interested in helping us out. A hearty welcome and thank you! There are many ways you can contribute to the .NET for Apache Spark project:
* Offer PRs to fix bugs or implement new features.
* Give us feedback and bug reports regarding the software or the documentation.
* Improve our examples, tutorials, and documentation.
## Getting started:
Please make sure to take a look at the project [roadmap](ROADMAP.md).
### Pull requests
If you are new to GitHub [here](https://help.github.com/categories/collaborating-with-issues-and-pull-requests/) is a detailed help source on getting involved with development on GitHub.
As a first-time contributor, you will be invited to sign the Contributor License Agreement (CLA). Please follow the instructions of the .NET Foundation bot reviewer on your PR to sign the agreement, indicating that you have the appropriate rights to your contribution.
Your pull request needs to reference a filed issue. Please fill in the template that is populated for the pull request. Only pull requests that address small typos may omit an associated issue.
A .NET for Apache Spark team member will be assigned to your pull request once the continuous integration checks have passed successfully.
All commits in a pull request will be squashed to a single commit with the original creator as author.
# Contributing
See [Contributing](docs/contributing.md) for information about coding styles, source structure, making pull requests, and more.
# Developers
See the [Developer Guide](docs/developer-guide.md) for details about developing in this repo.

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 .NET Foundation
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

9
PULL_REQUEST_TEMPLATE.md Normal file

@@ -0,0 +1,9 @@
We are excited to review your PR.
So we can do the best job, please check:
- [ ] There's a descriptive title that will make sense to other developers some time from now.
- [ ] There are associated issues. All PRs should have an associated issue unless the change is trivial and self-evident, such as fixing a typo. You can use the format `Fixes #nnnn` in your description to have GitHub automatically close the issue(s) when your PR is merged.
- [ ] Your change description explains what the change does, why you chose your approach, and anything else that reviewers should know.
- [ ] You have included any necessary tests in the same PR.

72
README.md Normal file

@@ -0,0 +1,72 @@
# Spark .NET
![Icon](docs/img/spark-dot-net-logo.PNG)
Spark .NET is the .NET API for [Apache Spark](https://spark.apache.org/).
## Build Status
| ![Ubuntu icon](docs/img/ubuntu-icon-32.png) | ![Ubuntu icon](docs/img/ubuntu-icon-32.png) | ![Windows icon](docs/img/windows-icon-32.png) |
| :---: | :---: | :---: |
| Ubuntu 16.04 | Ubuntu 18.04 | Windows 10 |
| | | [![Build Status](https://dnceng.visualstudio.com/internal/_apis/build/status/spark.net?branchName=master)](https://dnceng.visualstudio.com/internal/_build/latest?definitionId=301&branchName=master)|
## Table of Contents
- [Introduction](#introduction)
- [Quick Start (TL;DR)](#quick-start)
- [Features](docs/features.md)
- [FAQ/Troubleshooting](#faq)
- [Inspiration and Special Thanks](#inspiration)
- [How to Engage, Contribute and Provide Feedback](#community)
- [.NET Foundation](#net-foundation)
- [Code of Conduct](#code-of-conduct)
- [License](#license)
<a name="introduction"></a>
## Introduction
<a name="quick-start"></a>
## Quick Start (TL;DR)
Spark .NET will eventually be distributed as a NuGet package and as a formal release here on GitHub to help you build your applications easily. Until then, please feel free to build it locally on your machine and reference it from your project. Building from source is easy, and the whole process (from cloning to being able to run your app) should take less than 15 minutes! A minimal example application is sketched after the table below.
| | | Instructions |
| :---: | :--- | :--- |
| ![Windows icon](docs/img/windows-icon-32.png) | **Windows** | <ul><li>Local - [.NET Framework 4.6.1](docs/building/windows-instructions.md#using-visual-studio-for-net-framework-461)</li><li>Local - [.NET Core 2.1.x](docs/building/windows-instructions.md#using-net-core-cli-for-net-core-21x)</li></ul> |
| ![Ubuntu icon](docs/img/ubuntu-icon-32.png) | **Ubuntu** | <ul><li>Local - [.NET Core 2.1.x](docs/building/ubuntu-instructions.md)</li><li>[Azure HDInsight Spark - .NET Core 2.1.x](deployment/README.md)</li></ul> |
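Purely for illustration, the sketch below shows a minimal .NET for Apache Spark application; the `HelloSpark` namespace, app name, and dataset path are placeholders and not part of this commit, and only APIs that appear elsewhere in this commit are used. Like the benchmark app, it would be launched through `spark-submit` with the `org.apache.spark.deploy.DotnetRunner` class.

using Microsoft.Spark.Sql;

namespace HelloSpark
{
    internal class Program
    {
        private static void Main(string[] args)
        {
            // args[0]: path to any Parquet dataset.
            SparkSession spark = SparkSession
                .Builder()
                .AppName("HelloSpark")
                .GetOrCreate();

            // Read the data and print the first rows.
            DataFrame df = spark.Read().Parquet(args[0]);
            df.Show();

            // Run the same data through Spark SQL.
            df.CreateOrReplaceTempView("data");
            spark.Sql("SELECT COUNT(*) FROM data").Show();

            spark.Stop();
        }
    }
}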
## Contributing
We welcome contributions! Please review our [contribution guide](CONTRIBUTING.md).
<a name="features"></a>
## Features
<a name="faq"></a>
## Frequently Asked Questions
<a name="inspiration"></a>
## Inspiration
## Community
<a name="contact"></a>
## How to Engage, Contribute and Provide Feedback
The Spark .NET team encourages [contributions](docs/contributing.md), both issues and PRs. The first step is finding an [existing issue](https://github.com/dotnet/spark/issues) you want to contribute to; if you cannot find one, [open an issue](https://github.com/dotnet/spark/issues?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+).
<a name="net-foundation"></a>
## .NET Foundation
The Spark .NET project is part of the [.NET Foundation](http://www.dotnetfoundation.org).
<a name="code-of-conduct"></a>
## Code of Conduct
This project has adopted the code of conduct defined by the Contributor Covenant
to clarify expected behavior in our community.
For more information, see the [.NET Foundation Code of Conduct](https://dotnetfoundation.org/code-of-conduct).
<a name="license"></a>
## License
.NET for Apache Spark is licensed under the [MIT license](LICENSE).

41
ROADMAP.md Normal file

@@ -0,0 +1,41 @@
# .NET for Apache Spark Roadmap
The goal of the .NET for Apache Spark project is to provide an easy-to-use, .NET-friendly integration with the popular big data platform, Apache Spark. This document describes the tentative plan for the project in the short and long term.
.NET for Apache Spark is a community effort, and we welcome community feedback on our plans. The best way to give feedback is to open an issue in this repo. We are also excited to receive contributions (check out the [contribution guide](docs/contributing.md)). It's always a good idea to open an issue for discussion before embarking on a large code change, to make sure effort is not duplicated. Where we know that efforts are already underway, we have marked them with (*) below.
## Short Term
### User Experience
* 1:1 API compatibility for DataFrames with Apache Spark 2.3.x, Apache Spark 2.4.x and Apache Spark 3.0.x (*)
### Performance Optimizations
* Improvements to C# Pickling Library
* Improvements to Arrow .NET Library
* Exploiting .NET Core 3.0 Vectorization (*)
* Micro-benchmarking framework for Interop
### Benchmarks
* Benchmarking scripts for all languages that include generating the dataset and running queries against it (*)
* Published reproducible benchmarks against [TPC-H](http://www.tpc.org/tpch/) (industry-standard database benchmark) (*)
### Tooling Improvements
* VS Code support (*)
* Jupyter integration with C# & F# notebook support (*)
* Improved user experience for .NET app submission to a remote Spark cluster
## Longer Term
### User Experience
* Idiomatic C# and F# APIs
### Performance Optimizations
* Contribute extensible interop layer to Apache Spark
### Benchmarks
* Published reproducible benchmarks against [TPC-DS](http://www.tpc.org/tpcds/default.asp) (industry-standard database benchmark)
### Tooling Improvements
* Visual Studio Extension for .NET app submission to a remote Spark cluster
* Visual Studio Extension for .NET app debugging
* Make it easy to copy/paste Scala examples into Visual Studio

10
THIRD-PARTY-NOTICES.TXT Normal file

@@ -0,0 +1,10 @@
.NET for Apache Spark uses third-party libraries or other resources that may be
distributed under licenses different than the .NET for Apache Spark software.
In the event that we accidentally failed to list a required notice, please
bring it to our attention. Post an issue or email us:
dotnet@microsoft.com
The attached notices are provided for information only.

137
azure-pipelines.yml Normal file

@@ -0,0 +1,137 @@
# Spark .NET build

trigger:
- master

pool:
  vmImage: 'VS2017-Win2016'

variables:
  solution: '**/*.sln'
  buildConfiguration: 'Release'

steps:
- task: NuGetToolInstaller@0
  inputs:
    versionSpec: '4.9.2'

- task: DotNetCoreCLI@2
  displayName: '.NET restore'
  inputs:
    # Use a custom restore command because the built in restore command uses a temp nuget.config
    # which overwrites the MSBuild restore properties
    command: 'custom'
    custom: 'restore'
    projects: '$(solution)'

- task: DotNetCoreCLI@2
  displayName: '.NET build'
  inputs:
    command: build
    projects: '$(solution)'
    arguments: '--configuration $(buildConfiguration)'

- task: BatchScript@1
  displayName: Publish Microsoft.Spark.Worker
  inputs:
    filename: script\publish-workers.cmd
    arguments: $(Build.SourcesDirectory) $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker $(buildConfiguration)

- task: DotNetCoreCLI@2
  displayName: '.NET unit tests'
  inputs:
    command: test
    projects: '**/*UnitTest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'

- task: Maven@3
  displayName: 'Maven build src'
  inputs:
    mavenPomFile: src/scala/pom.xml

- task: Maven@3
  displayName: 'Maven build benchmark'
  inputs:
    mavenPomFile: benchmark/scala/pom.xml

- task: NuGetCommand@2
  inputs:
    command: pack
    packagesToPack: '$(Build.SourcesDirectory)\src\csharp\Microsoft.Spark.nuspec'

- task: PublishBuildArtifacts@1
  inputs:
    pathtoPublish: '$(Build.ArtifactStagingDirectory)'
    artifactName: Microsoft.Spark.Binaries

- task: BatchScript@1
  displayName: Download Spark Distros & Winutils.exe
  inputs:
    filename: script\download-spark-distros.cmd
    arguments: $(Build.BinariesDirectory)

- task: DotNetCoreCLI@2
  displayName: 'E2E tests for Spark 2.3.0'
  inputs:
    command: test
    projects: '**/Microsoft.Spark.E2ETest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
  env:
    SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7
    HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
    DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64

- task: DotNetCoreCLI@2
  displayName: 'E2E tests for Spark 2.3.1'
  inputs:
    command: test
    projects: '**/Microsoft.Spark.E2ETest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
  env:
    SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7
    HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
    DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64

- task: DotNetCoreCLI@2
  displayName: 'E2E tests for Spark 2.3.2'
  inputs:
    command: test
    projects: '**/Microsoft.Spark.E2ETest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
  env:
    SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7
    HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
    DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64

- task: DotNetCoreCLI@2
  displayName: 'E2E tests for Spark 2.3.3'
  inputs:
    command: test
    projects: '**/Microsoft.Spark.E2ETest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
  env:
    SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7
    HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
    DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64

- task: DotNetCoreCLI@2
  displayName: 'E2E tests for Spark 2.4.0'
  inputs:
    command: test
    projects: '**/Microsoft.Spark.E2ETest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
  env:
    SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7
    HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
    DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64

- task: DotNetCoreCLI@2
  displayName: 'E2E tests for Spark 2.4.1'
  inputs:
    command: test
    projects: '**/Microsoft.Spark.E2ETest/*.csproj'
    arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
  env:
    SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7
    HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
    DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64


@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" TreatAsLocalProperty="ExcludeRestorePackageImports">
<Import Project="..\..\src\csharp\Directory.Build.props" />
</Project>


@@ -0,0 +1,37 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.168
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tpch", "Tpch\Tpch.csproj", "{C1A5ED09-7924-4784-A880-97DF975DE78A}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark", "..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj", "{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Worker", "..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj", "{A267D1A0-8EF6-475F-B118-67DDACD4373A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Release|Any CPU.Build.0 = Release|Any CPU
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Debug|Any CPU.Build.0 = Debug|Any CPU
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Release|Any CPU.ActiveCfg = Release|Any CPU
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Release|Any CPU.Build.0 = Release|Any CPU
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {88B4A99F-3096-495A-9055-7A270C9269B2}
EndGlobalSection
EndGlobal


@@ -0,0 +1,55 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Diagnostics;
using Microsoft.Spark.Sql;
namespace Tpch
{
internal class Program
{
private static void Main(string[] args)
{
if (args.Length != 4)
{
Console.WriteLine("Usage:");
Console.WriteLine("\t<spark-submit> --master local");
Console.WriteLine("\t\t--class org.apache.spark.deploy.DotnetRunner <path-to-microsoft-spark-jar>");
Console.WriteLine("\t\tTpch.exe <tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>");
}
var tpchRoot = args[0];
var queryNumber = int.Parse(args[1]);
var numIteration = int.Parse(args[2]);
var isSQL = bool.Parse(args[3]);
for (var i = 0; i < numIteration; ++i)
{
SparkSession spark = SparkSession
.Builder()
.AppName("TPC-H Benchmark for DotNet")
.GetOrCreate();
Stopwatch sw = Stopwatch.StartNew();
if (!isSQL)
{
var tpchFunctional = new TpchFunctionalQueries(tpchRoot, spark);
tpchFunctional.Run(queryNumber.ToString());
}
else
{
var tpchSql = new TpchSqlQueries(tpchRoot, spark);
tpchSql.Run(queryNumber.ToString());
}
sw.Stop();
var typeStr = isSQL ? "SQL" : "Functional";
Console.WriteLine($"TPCH_Result,DotNet,{typeStr},{queryNumber},{i},{sw.ElapsedMilliseconds}");
spark.Stop();
}
}
}
}


@@ -0,0 +1,16 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Text.RegularExpressions;
namespace Tpch
{
internal static class StringExtensions
{
internal static string StripMargin(this string s)
{
return Regex.Replace(s, @"[ \t]+\|", string.Empty);
}
}
}
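A small hypothetical usage sketch follows (the StripMarginExample class is not part of this commit and assumes it is compiled into the same assembly as the extension above): StripMargin removes each run of spaces or tabs that is followed by a '|', which is how the margin-formatted SQL strings in TpchSqlQueries become plain SQL.

namespace Tpch
{
    internal static class StripMarginExample
    {
        internal static string Demo()
        {
            string withMargin = "select\n   |  l_orderkey\n   |from\n   |  lineitem";

            // Every "[ \t]+|" sequence is removed, so this returns:
            // "select\n  l_orderkey\nfrom\n  lineitem"
            return withMargin.StripMargin();
        }
    }
}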


@@ -0,0 +1,16 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
<RootNamespace>Tpch</RootNamespace>
<AssemblyName>Tpch</AssemblyName>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
<ProjectReference Include="..\..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj" />
</ItemGroup>
</Project>


@@ -0,0 +1,29 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.IO;
using Microsoft.Spark.Sql;
namespace Tpch
{
internal class TpchBase
{
protected readonly DataFrame _customer, _lineitem, _nation, _orders,
_part, _partsupp, _region, _supplier;
internal TpchBase(string tpchRoot, SparkSession spark)
{
// Load all the TPC-H tables.
tpchRoot += Path.DirectorySeparatorChar;
_customer = spark.Read().Parquet($"{tpchRoot}customer");
_lineitem = spark.Read().Parquet($"{tpchRoot}lineitem");
_nation = spark.Read().Parquet($"{tpchRoot}nation");
_orders = spark.Read().Parquet($"{tpchRoot}orders");
_part = spark.Read().Parquet($"{tpchRoot}part");
_partsupp = spark.Read().Parquet($"{tpchRoot}partsupp");
_region = spark.Read().Parquet($"{tpchRoot}region");
_supplier = spark.Read().Parquet($"{tpchRoot}supplier");
}
}
}


@@ -0,0 +1,512 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Diagnostics;
using System.Reflection;
using System.Text.RegularExpressions;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace Tpch
{
internal class TpchFunctionalQueries : TpchBase
{
internal TpchFunctionalQueries(string tpchRoot, SparkSession spark)
: base(tpchRoot, spark)
{
}
internal void RunAll()
{
for (var i = 1; i <= 22; i++)
{
Run(i.ToString());
}
}
internal void Run(string queryNumber)
{
Console.WriteLine($"Spark.NET TPCH Functional Query: #{queryNumber}");
Type thisType = GetType();
MethodInfo queryMethod = thisType.GetMethod(
$"Q{queryNumber}", BindingFlags.Instance | BindingFlags.NonPublic);
var sw = Stopwatch.StartNew();
queryMethod.Invoke(this, null);
Console.WriteLine($"\tElapsed: {sw.Elapsed}");
}
internal void Q1()
{
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
Func<Column, Column, Column> increase = Udf<double, double, double>((x, y) => x * (1 + y));
_lineitem.Filter(Col("l_shipdate") <= "1998-09-02")
.GroupBy(Col("l_returnflag"), Col("l_linestatus"))
.Agg(Sum(Col("l_quantity")).As("sum_qty"), Sum(Col("l_extendedprice")).As("sum_base_price"),
Sum(decrease(Col("l_extendedprice"), Col("l_discount"))).As("sum_disc_price"),
Sum(increase(decrease(Col("l_extendedprice"), Col("l_discount")), Col("l_tax"))).As("sum_charge"),
Avg(Col("l_quantity")).As("avg_qty"),
Avg(Col("l_extendedprice")).As("avg_price"),
Avg(Col("l_discount")).As("avg_disc"),
Count(Col("l_quantity")).As("count_order")
)
.Sort(Col("l_returnflag"), Col("l_linestatus"))
.Show();
}
internal void Q2()
{
DataFrame europe = _region.Filter(Col("r_name") == "EUROPE")
.Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
.Join(_partsupp, _supplier["s_suppkey"] == _partsupp["ps_suppkey"]);
DataFrame brass = _part
.Filter(_part["p_size"] == 15 & _part["p_type"].EndsWith("BRASS"))
.Join(europe, europe["ps_partkey"] == Col("p_partkey"));
DataFrame minCost = brass.GroupBy(brass["ps_partkey"])
.Agg(Min("ps_supplycost").As("min"));
brass.Join(minCost, brass["ps_partkey"] == minCost["ps_partkey"])
.Filter(brass["ps_supplycost"] == minCost["min"])
.Select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment")
.Sort(Col("s_acctbal").Desc(), Col("n_name"), Col("s_name"), Col("p_partkey"))
.Show();
}
internal void Q3()
{
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
DataFrame fcust = _customer.Filter(Col("c_mktsegment") == "BUILDING");
DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-03-15");
DataFrame flineitems = _lineitem.Filter(Col("l_shipdate") > "1995-03-15");
fcust.Join(forders, Col("c_custkey") == forders["o_custkey"])
.Select(Col("o_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
.Join(flineitems, Col("o_orderkey") == flineitems["l_orderkey"])
.Select(Col("l_orderkey"),
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
Col("o_orderdate"), Col("o_shippriority"))
.GroupBy(Col("l_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
.Agg(Sum(Col("volume")).As("revenue"))
.Sort(Col("revenue").Desc(), Col("o_orderdate"))
.Show();
}
internal void Q4()
{
DataFrame forders = _orders.Filter(Col("o_orderdate") >= "1993-07-01" &
Col("o_orderdate") < "1993-10-01");
DataFrame flineitems = _lineitem.Filter(Col("l_commitdate") < Col("l_receiptdate"))
.Select($"l_orderkey")
.Distinct();
flineitems.Join(forders, Col("l_orderkey") == forders["o_orderkey"])
.GroupBy(Col("o_orderpriority"))
.Agg(Count(Col("o_orderpriority")).As("order_count"))
.Sort(Col("o_orderpriority"))
.Show();
}
internal void Q5()
{
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-01-01" & Col("o_orderdate") >= "1994-01-01");
_region.Filter(Col("r_name") == "ASIA")
.Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
.Join(_lineitem, Col("s_suppkey") == _lineitem["l_suppkey"])
.Select(Col("n_name"), Col("l_extendedprice"), Col("l_discount"), Col("l_orderkey"), Col("s_nationkey"))
.Join(forders, Col("l_orderkey") == forders["o_orderkey"])
.Join(_customer, Col("o_custkey") == _customer["c_custkey"]
& Col("s_nationkey") == _customer["c_nationkey"])
.Select(Col("n_name"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value"))
.GroupBy(Col("n_name"))
.Agg(Sum(Col("value")).As("revenue"))
.Sort(Col("revenue").Desc())
.Show();
}
internal void Q6()
{
_lineitem.Filter(Col("l_shipdate") >= "1994-01-01" & Col("l_shipdate") < "1995-01-01"
& Col("l_discount") >= 0.05 & Col("l_discount") <= 0.07 & Col("l_quantity") < 24)
.Agg(Sum(Col("l_extendedprice") * Col("l_discount")).As("revenue"))
.Show();
}
// C#, Scala and SparkSQL results match but SparkSQL has different precision.
internal void Q7()
{
Func<Column, Column> getYear = Udf<string, string>(x => x.Substring(0, 4));
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
DataFrame fnation = _nation.Filter(Col("n_name") == "FRANCE" | Col("n_name") == "GERMANY");
DataFrame fline = _lineitem.Filter(Col("l_shipdate") >= "1995-01-01" & Col("l_shipdate") <= "1996-12-31");
DataFrame supNation = fnation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
.Join(fline, Col("s_suppkey") == fline["l_suppkey"])
.Select(Col("n_name").As("supp_nation"), Col("l_orderkey"), Col("l_extendedprice"), Col("l_discount"), Col("l_shipdate"));
fnation.Join(_customer, Col("n_nationkey") == _customer["c_nationkey"])
.Join(_orders, Col("c_custkey") == _orders["o_custkey"])
.Select(Col("n_name").As("cust_nation"), Col("o_orderkey"))
.Join(supNation, Col("o_orderkey") == supNation["l_orderkey"])
.Filter(Col("supp_nation") == "FRANCE" & Col("cust_nation") == "GERMANY"
| Col("supp_nation") == "GERMANY" & Col("cust_nation") == "FRANCE")
.Select(Col("supp_nation"), Col("cust_nation"),
getYear(Col("l_shipdate")).As("l_year"),
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
.GroupBy(Col("supp_nation"), Col("cust_nation"), Col("l_year"))
.Agg(Sum(Col("volume")).As("revenue"))
.Sort(Col("supp_nation"), Col("cust_nation"), Col("l_year"))
.Show();
}
internal void Q8()
{
Func<Column, Column> getYear = Udf<string, string>(x => x.Substring(0, 4));
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
Func<Column, Column, Column> isBrazil = Udf<string, double, double>((x, y) => x == "BRAZIL" ? y : 0);
DataFrame fregion = _region.Filter(Col("r_name") == "AMERICA");
DataFrame forder = _orders.Filter(Col("o_orderdate") <= "1996-12-31" & Col("o_orderdate") >= "1995-01-01");
DataFrame fpart = _part.Filter(Col("p_type") == "ECONOMY ANODIZED STEEL");
DataFrame nat = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]);
DataFrame line = _lineitem.Select(Col("l_partkey"), Col("l_suppkey"), Col("l_orderkey"),
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
.Join(fpart, Col("l_partkey") == fpart["p_partkey"])
.Join(nat, Col("l_suppkey") == nat["s_suppkey"]);
_nation.Join(fregion, Col("n_regionkey") == fregion["r_regionkey"])
.Select(Col("n_nationkey"))
.Join(_customer, Col("n_nationkey") == _customer["c_nationkey"])
.Select(Col("c_custkey"))
.Join(forder, Col("c_custkey") == forder["o_custkey"])
.Select(Col("o_orderkey"), Col("o_orderdate"))
.Join(line, Col("o_orderkey") == line["l_orderkey"])
.Select(getYear(Col("o_orderdate")).As("o_year"), Col("volume"),
isBrazil(Col("n_name"), Col("volume")).As("case_volume"))
.GroupBy(Col("o_year"))
.Agg((Sum(Col("case_volume")) / Sum("volume")).As("mkt_share"))
.Sort(Col("o_year"))
.Show();
}
internal void Q9()
{
Func<Column, Column> getYear = Udf<string, string>(x => x.Substring(0, 4));
Func<Column, Column, Column, Column, Column> expr = Udf<double, double, double, double, double>((x, y, v, w) => x * (1 - y) - (v * w));
DataFrame linePart = _part.Filter(Col("p_name").Contains("green"))
.Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"]);
DataFrame natSup = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]);
linePart.Join(natSup, Col("l_suppkey") == natSup["s_suppkey"])
.Join(_partsupp, Col("l_suppkey") == _partsupp["ps_suppkey"]
& Col("l_partkey") == _partsupp["ps_partkey"])
.Join(_orders, Col("l_orderkey") == _orders["o_orderkey"])
.Select(Col("n_name"), getYear(Col("o_orderdate")).As("o_year"),
expr(Col("l_extendedprice"), Col("l_discount"), Col("ps_supplycost"), Col("l_quantity")).As("amount"))
.GroupBy(Col("n_name"), Col("o_year"))
.Agg(Sum(Col("amount")))
.Sort(Col("n_name"), Col("o_year").Desc())
.Show();
}
internal void Q10()
{
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
DataFrame flineitem = _lineitem.Filter(Col("l_returnflag") == "R");
_orders.Filter(Col("o_orderdate") < "1994-01-01" & Col("o_orderdate") >= "1993-10-01")
.Join(_customer, Col("o_custkey") == _customer["c_custkey"])
.Join(_nation, Col("c_nationkey") == _nation["n_nationkey"])
.Join(flineitem, Col("o_orderkey") == flineitem["l_orderkey"])
.Select(Col("c_custkey"), Col("c_name"),
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
Col("c_acctbal"), Col("n_name"), Col("c_address"), Col("c_phone"), Col("c_comment"))
.GroupBy(Col("c_custkey"), Col("c_name"), Col("c_acctbal"), Col("c_phone"), Col("n_name"), Col("c_address"), Col("c_comment"))
.Agg(Sum(Col("volume")).As("revenue"))
.Sort(Col("revenue").Desc())
.Show();
}
internal void Q11()
{
Func<Column, Column, Column> mul = Udf<double, int, double>((x, y) => x * y);
Func<Column, Column> mul01 = Udf<double, double>(x => x * 0.0001);
DataFrame tmp = _nation.Filter(Col("n_name") == "GERMANY")
.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
.Select(Col("s_suppkey"))
.Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
.Select(Col("ps_partkey"), mul(Col("ps_supplycost"), Col("ps_availqty")).As("value"));
DataFrame sumRes = tmp.Agg(Sum("value").As("total_value"));
tmp.GroupBy(Col("ps_partkey")).Agg(Sum("value").As("part_value"))
.Join(sumRes, Col("part_value") > mul01(Col("total_value")))
.Sort(Col("part_value").Desc())
.Show();
}
internal void Q12()
{
Func<Column, Column, Column> mul = Udf<double, int, double>((x, y) => x * y);
Func<Column, Column> highPriority = Udf<string, int>(x => (x == "1-URGENT" || x == "2-HIGH") ? 1 : 0);
Func<Column, Column> lowPriority = Udf<string, int>(x => (x != "1-URGENT" && x != "2-HIGH") ? 1 : 0);
_lineitem.Filter((
Col("l_shipmode") == "MAIL" | Col("l_shipmode") == "SHIP") &
Col("l_commitdate") < Col("l_receiptdate") &
Col("l_shipdate") < Col("l_commitdate") &
Col("l_receiptdate") >= "1994-01-01" & Col("l_receiptdate") < "1995-01-01")
.Join(_orders, Col("l_orderkey") == _orders["o_orderkey"])
.Select(Col("l_shipmode"), Col("o_orderpriority"))
.GroupBy(Col("l_shipmode"))
.Agg(Sum(highPriority(Col("o_orderpriority"))).As("sum_highorderpriority"),
Sum(lowPriority(Col("o_orderpriority"))).As("sum_loworderpriority"))
.Sort(Col("l_shipmode"))
.Show();
}
private static readonly Regex s_q13SpecialRegex = new Regex("^.*special.*requests.*", RegexOptions.Compiled);
internal void Q13()
{
Func<Column, Column> special = Udf<string, bool>((x) => s_q13SpecialRegex.IsMatch(x));
DataFrame c_orders = _customer.Join(_orders, Col("c_custkey") == _orders["o_custkey"]
& !special(_orders["o_comment"]), "left_outer")
.GroupBy(Col("c_custkey"))
.Agg(Count(Col("o_orderkey")).As("c_count"));
c_orders
.GroupBy(Col("c_count"))
.Agg(Count(Col("*")).As("custdist"))
.Sort(Col("custdist").Desc(), Col("c_count").Desc())
.Show();
}
internal void Q14()
{
Func<Column, Column, Column> reduce = Udf<double, double, double>((x, y) => x * (1 - y));
Func<Column, Column, Column> promo = Udf<string, double, double>((x, y) => x.StartsWith("PROMO") ? y : 0);
_part.Join(_lineitem, Col("l_partkey") == Col("p_partkey") &
Col("l_shipdate") >= "1995-09-01" & Col("l_shipdate") < "1995-10-01")
.Select(Col("p_type"), reduce(Col("l_extendedprice"), Col("l_discount")).As("value"))
.Agg(Sum(promo(Col("p_type"), Col("value"))) * 100 / Sum(Col("value")))
.Show();
}
internal void Q15()
{
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
DataFrame revenue = _lineitem.Filter(Col("l_shipdate") >= "1996-01-01" &
Col("l_shipdate") < "1996-04-01")
.Select(Col("l_suppkey"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value"))
.GroupBy(Col("l_suppkey"))
.Agg(Sum(Col("value")).As("total"));
revenue.Agg(Max(Col("total")).As("max_total"))
.Join(revenue, Col("max_total") == revenue["total"])
.Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"])
.Select(Col("s_suppkey"), Col("s_name"), Col("s_address"), Col("s_phone"), Col("total"))
.Sort(Col("s_suppkey"))
.Show();
}
private static readonly Regex s_q16CompainsRegex = new Regex(".*Customer.*Complaints.*", RegexOptions.Compiled);
private static readonly Regex s_q16NumbersRegex = new Regex("^(49|14|23|45|19|3|36|9)$", RegexOptions.Compiled);
internal void Q16()
{
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
Func<Column, Column> complains = Udf<string, bool>((x) => s_q16CompainsRegex.Match(x).Success);
Func<Column, Column> polished = Udf<string, bool>((x) => x.StartsWith("MEDIUM POLISHED"));
Func<Column, Column> numbers = Udf<int, bool>((x) => s_q16NumbersRegex.Match(x.ToString()).Success);
DataFrame fparts = _part.Filter((Col("p_brand") != "Brand#45") & !polished(Col("p_type")) &
numbers(Col("p_size")))
.Select(Col("p_partkey"), Col("p_brand"), Col("p_type"), Col("p_size"));
_supplier.Filter(!complains(Col("s_comment")))
.Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
.Select(Col("ps_partkey"), Col("ps_suppkey"))
.Join(fparts, Col("ps_partkey") == fparts["p_partkey"])
.GroupBy(Col("p_brand"), Col("p_type"), Col("p_size"))
.Agg(CountDistinct(Col("ps_suppkey")).As("supplier_count"))
.Sort(Col("supplier_count").Desc(), Col("p_brand"), Col("p_type"), Col("p_size"))
.Show();
}
internal void Q17()
{
Func<Column, Column> mul02 = Udf<double, double>((x) => x * 0.2);
DataFrame flineitem = _lineitem.Select(Col("l_partkey"), Col("l_quantity"), Col("l_extendedprice"));
DataFrame fpart = _part.Filter(Col("p_brand") == "Brand#23" & Col("p_container") == "MED BOX")
.Select(Col("p_partkey"))
.Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"], "left_outer");
fpart.GroupBy("p_partkey")
.Agg(mul02(Avg(Col("l_quantity"))).As("avg_quantity"))
.Select(Col("p_partkey").As("key"), Col("avg_quantity"))
.Join(fpart, Col("key") == fpart["p_partkey"])
.Filter(Col("l_quantity") < Col("avg_quantity"))
.Agg((Sum(Col("l_extendedprice")) / 7.0).As("avg_yearly"))
.Show();
}
internal void Q18()
{
_lineitem.GroupBy(Col("l_orderkey"))
.Agg(Sum(Col("l_quantity")).As("sum_quantity"))
.Filter(Col("sum_quantity") > 300)
.Select(Col("l_orderkey").As("key"), Col("sum_quantity"))
.Join(_orders, _orders["o_orderkey"] == Col("key"))
.Join(_lineitem, Col("o_orderkey") == _lineitem["l_orderkey"])
.Join(_customer, _customer["c_custkey"] == Col("o_custkey"))
.Select(Col("l_quantity"), Col("c_name"), Col("c_custkey"), Col("o_orderkey"), Col("o_orderdate"), Col("o_totalprice"))
.GroupBy(Col("c_name"), Col("c_custkey"), Col("o_orderkey"), Col("o_orderdate"), Col("o_totalprice"))
.Agg(Sum("l_quantity"))
.Sort(Col("o_totalprice").Desc(), Col("o_orderdate"))
.Show();
}
private static readonly Regex s_q19SmRegex = new Regex("SM CASE|SM BOX|SM PACK|SM PKG", RegexOptions.Compiled);
private static readonly Regex s_q19MdRegex = new Regex("MED BAG|MED BOX|MED PKG|MED PACK", RegexOptions.Compiled);
private static readonly Regex s_q19LgRegex = new Regex("LG CASE|LG BOX|LG PACK|LG PKG", RegexOptions.Compiled);
internal void Q19()
{
Func<Column, Column> sm = Udf<string, bool>(x => s_q19SmRegex.IsMatch(x));
Func<Column, Column> md = Udf<string, bool>(x => s_q19MdRegex.IsMatch(x));
Func<Column, Column> lg = Udf<string, bool>(x => s_q19LgRegex.IsMatch(x));
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
_part.Join(_lineitem, Col("l_partkey") == Col("p_partkey"))
.Filter((Col("l_shipmode") == "AIR" | Col("l_shipmode") == "AIR REG") &
Col("l_shipinstruct") == "DELIVER IN PERSON")
.Filter(
((Col("p_brand") == "Brand#12") &
sm(Col("p_container")) &
Col("l_quantity") >= 1 & Col("l_quantity") <= 11 &
Col("p_size") >= 1 & Col("p_size") <= 5) |
((Col("p_brand") == "Brand#23") &
md(Col("p_container")) &
Col("l_quantity") >= 10 & Col("l_quantity") <= 20 &
Col("p_size") >= 1 & Col("p_size") <= 10) |
((Col("p_brand") == "Brand#34") &
lg(Col("p_container")) &
Col("l_quantity") >= 20 & Col("l_quantity") <= 30 &
Col("p_size") >= 1 & Col("p_size") <= 15))
.Select(decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
.Agg(Sum("volume").As("revenue"))
.Show();
}
internal void Q20()
{
Func<Column, Column> forest = Udf<string, bool>(x => x.StartsWith("forest"));
DataFrame flineitem = _lineitem.Filter(Col("l_shipdate") >= "1994-01-01" & Col("l_shipdate") < "1995-01-01")
.GroupBy(Col("l_partkey"), Col("l_suppkey"))
.Agg((Sum(Col("l_quantity")) * 0.5).As("sum_quantity"));
DataFrame fnation = _nation.Filter(Col("n_name") == "CANADA");
DataFrame nat_supp = _supplier.Select(Col("s_suppkey"), Col("s_name"), Col("s_nationkey"), Col("s_address"))
.Join(fnation, Col("s_nationkey") == fnation["n_nationkey"]);
_part.Filter(forest(Col("p_name")))
.Select(Col("p_partkey"))
.Distinct()
.Join(_partsupp, Col("p_partkey") == _partsupp["ps_partkey"])
.Join(flineitem, Col("ps_suppkey") == flineitem["l_suppkey"] & Col("ps_partkey") == flineitem["l_partkey"])
.Filter(Col("ps_availqty") > Col("sum_quantity"))
.Select(Col("ps_suppkey"))
.Distinct()
.Join(nat_supp, Col("ps_suppkey") == nat_supp["s_suppkey"])
.Select(Col("s_name"), Col("s_address"))
.Sort(Col("s_name"))
.Show();
}
internal void Q21()
{
DataFrame fsupplier = _supplier.Select(Col("s_suppkey"), Col("s_nationkey"), Col("s_name"));
DataFrame plineitem = _lineitem
.Select(Col("l_suppkey"), Col("l_orderkey"), Col("l_receiptdate"), Col("l_commitdate"));
DataFrame flineitem = plineitem.Filter(Col("l_receiptdate") > Col("l_commitdate"));
DataFrame line1 = plineitem.GroupBy(Col("l_orderkey"))
.Agg(CountDistinct(Col("l_suppkey")).As("suppkey_count"), Max(Col("l_suppkey")).As("suppkey_max"))
.Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));
DataFrame line2 = flineitem.GroupBy(Col("l_orderkey"))
.Agg(CountDistinct(Col("l_suppkey")).As("suppkey_count"), Max(Col("l_suppkey")).As("suppkey_max"))
.Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));
DataFrame forder = _orders.Select(Col("o_orderkey"), Col("o_orderstatus"))
.Filter(Col("o_orderstatus") == "F");
_nation.Filter(Col("n_name") == "SAUDI ARABIA")
.Join(fsupplier, Col("n_nationkey") == fsupplier["s_nationkey"])
.Join(flineitem, Col("s_suppkey") == flineitem["l_suppkey"])
.Join(forder, Col("l_orderkey") == forder["o_orderkey"])
.Join(line1, Col("l_orderkey") == line1["key"])
.Filter(Col("suppkey_count") > 1)
.Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"))
.Join(line2, Col("l_orderkey") == line2["key"], "left_outer")
.Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"), Col("suppkey_count"), Col("suppkey_max"))
.Filter(Col("suppkey_count") == 1 & Col("l_suppkey") == Col("suppkey_max"))
.GroupBy(Col("s_name"))
.Agg(Count(Col("l_suppkey")).As("numwait"))
.Sort(Col("numwait").Desc(), Col("s_name"))
.Show();
}
private static readonly Regex s_q22PhoneRegex = new Regex("^(13|31|23|29|30|18|17)$", RegexOptions.Compiled);
internal void Q22()
{
Func<Column, Column> sub2 = Udf<string, string>(x => x.Substring(0, 2));
Func<Column, Column> phone = Udf<string, bool>(x => s_q22PhoneRegex.IsMatch(x));
DataFrame fcustomer = _customer.Select(Col("c_acctbal"), Col("c_custkey"), sub2(Col("c_phone")).As("cntrycode"))
.Filter(phone(Col("cntrycode")));
DataFrame avg_customer = fcustomer.Filter(Col("c_acctbal") > 0.0)
.Agg(Avg(Col("c_acctbal")).As("avg_acctbal"));
_orders.GroupBy(Col("o_custkey"))
.Agg(Col("o_custkey")).Select(Col("o_custkey"))
.Join(fcustomer, Col("o_custkey") == fcustomer["c_custkey"], "right_outer")
.Filter(Col("o_custkey").IsNull())
.Join(avg_customer)
.Filter(Col("c_acctbal") > Col("avg_acctbal"))
.GroupBy(Col("cntrycode"))
.Agg(Count(Col("c_acctbal")).As("numcust"), Sum(Col("c_acctbal")).As("totacctbal"))
.Sort(Col("cntrycode"))
.Show();
}
}
}


@@ -0,0 +1,758 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Diagnostics;
using System.Reflection;
using Microsoft.Spark.Sql;
/**
* The queries are based on the original SparkSQL TPC-H queries from:
* https://github.com/databricks/spark-sql-perf/tree/master/src/main/resources/tpch/queries
*
* Also see:
* https://github.com/chiwanpark/tpch-benchmark/tree/master/spark/src/main/scala/tpch/spark/query
*
* A non-SQL version of the queries that directly uses Scala's DataFrame API can be obtained here:
* https://github.com/ssavvides/tpch-spark/tree/master/src/main/scala
*/
namespace Tpch
{
internal class TpchSqlQueries : TpchBase
{
private readonly SparkSession _spark;
internal TpchSqlQueries(string tpchRoot, SparkSession spark)
: base(tpchRoot, spark)
{
_spark = spark;
// Register SQL views
_customer.CreateOrReplaceTempView("customer");
_lineitem.CreateOrReplaceTempView("lineitem");
_nation.CreateOrReplaceTempView("nation");
_orders.CreateOrReplaceTempView("orders");
_part.CreateOrReplaceTempView("part");
_partsupp.CreateOrReplaceTempView("partsupp");
_region.CreateOrReplaceTempView("region");
_supplier.CreateOrReplaceTempView("supplier");
}
internal void RunAll()
{
for (var i = 1; i <= 22; i++)
{
Run(i.ToString());
}
}
internal void Run(string queryNumber)
{
Console.WriteLine($"Spark .NET TPCH SQL Query: #{queryNumber}");
Type thisType = GetType();
var queryString = (string)thisType.GetField(
$"s_q{queryNumber}", BindingFlags.Static | BindingFlags.NonPublic).GetValue(null);
var sw = Stopwatch.StartNew();
_spark.Sql(queryString).Show(numRows: 20, truncate: 0);
Console.WriteLine($"\tElapsed: {sw.Elapsed}");
}
private static readonly string s_q1 = @"
select
| l_returnflag,
| l_linestatus,
| sum(l_quantity) as sum_qty,
| sum(l_extendedprice) as sum_base_price,
| sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
| sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
| avg(l_quantity) as avg_qty,
| avg(l_extendedprice) as avg_price,
| avg(l_discount) as avg_disc,
| count(*) as count_order
| from
| lineitem
| where
| l_shipdate <= date '1998-12-01' - interval '90' day
| group by
| l_returnflag,
| l_linestatus
| order by
| l_returnflag,
| l_linestatus
".StripMargin();
private static readonly string s_q2 = @"
select
| s_acctbal,
| s_name,
| n_name,
| p_partkey,
| p_mfgr,
| s_address,
| s_phone,
| s_comment
| from
| part,
| supplier,
| partsupp,
| nation,
| region
| where
| p_partkey = ps_partkey
| and s_suppkey = ps_suppkey
| and p_size = 15
| and p_type like '%BRASS'
| and s_nationkey = n_nationkey
| and n_regionkey = r_regionkey
| and r_name = 'EUROPE'
| and ps_supplycost = (
| select
| min(ps_supplycost)
| from
| partsupp,
| supplier,
| nation,
| region
| where
| p_partkey = ps_partkey
| and s_suppkey = ps_suppkey
| and s_nationkey = n_nationkey
| and n_regionkey = r_regionkey
| and r_name = 'EUROPE'
| )
| order by
| s_acctbal desc,
| n_name,
| s_name,
| p_partkey
".StripMargin();
private static readonly string s_q3 = @"
select
| l_orderkey,
| sum(l_extendedprice * (1 - l_discount)) as revenue,
| o_orderdate,
| o_shippriority
| from
| customer,
| orders,
| lineitem
| where
| c_mktsegment = 'BUILDING'
| and c_custkey = o_custkey
| and l_orderkey = o_orderkey
| and o_orderdate < date '1995-03-15'
| and l_shipdate > date '1995-03-15'
| group by
| l_orderkey,
| o_orderdate,
| o_shippriority
| order by
| revenue desc,
| o_orderdate
".StripMargin();
private static readonly string s_q4 = @"
select
| o_orderpriority,
| count(*) as order_count
| from
| orders
| where
| o_orderdate >= date '1993-07-01'
| and o_orderdate < date '1993-07-01' + interval '3' month
| and exists(
| select
| *
| from
| lineitem
| where
| l_orderkey = o_orderkey
| and l_commitdate < l_receiptdate
| )
| group by
| o_orderpriority
| order by
| o_orderpriority
".StripMargin();
private static readonly string s_q5 = @"
select
| n_name,
| sum(l_extendedprice * (1 - l_discount)) as revenue
| from
| customer,
| orders,
| lineitem,
| supplier,
| nation,
| region
| where
| c_custkey = o_custkey
| and l_orderkey = o_orderkey
| and l_suppkey = s_suppkey
| and c_nationkey = s_nationkey
| and s_nationkey = n_nationkey
| and n_regionkey = r_regionkey
| and r_name = 'ASIA'
| and o_orderdate >= date '1994-01-01'
| and o_orderdate < date '1994-01-01' + interval '1' year
| group by
| n_name
| order by
| revenue desc
".StripMargin();
private static readonly string s_q6 = @"
select
| sum(l_extendedprice * l_discount) as revenue
| from
| lineitem
| where
| l_shipdate >= date '1994-01-01'
| and l_shipdate < date '1994-01-01' + interval '1' year
| and l_discount between .06 - 0.01 and .06 + 0.01
| and l_quantity < 24
".StripMargin();
private static readonly string s_q7 = @"
select
| supp_nation,
| cust_nation,
| l_year,
| cast(sum(volume) as double) as revenue
| from
| (
| select
| n1.n_name as supp_nation,
| n2.n_name as cust_nation,
| year(l_shipdate) as l_year,
| cast(l_extendedprice * (1 - l_discount) as double) as volume
| from
| supplier,
| lineitem,
| orders,
| customer,
| nation n1,
| nation n2
| where
| s_suppkey = l_suppkey
| and o_orderkey = l_orderkey
| and c_custkey = o_custkey
| and s_nationkey = n1.n_nationkey
| and c_nationkey = n2.n_nationkey
| and(
| (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
| or(n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
| )
| and l_shipdate between date '1995-01-01' and date '1996-12-31'
| ) as shipping
| group by
| supp_nation,
| cust_nation,
| l_year
| order by
| supp_nation,
| cust_nation,
| l_year
".StripMargin();
private static readonly string s_q8 = @"
select
| o_year,
| sum(case
| when nation = 'BRAZIL' then volume
| else 0
| end) / sum(volume) as mkt_share
| from
| (
| select
| year(o_orderdate) as o_year,
| l_extendedprice * (1 - l_discount) as volume,
| n2.n_name as nation
| from
| part,
| supplier,
| lineitem,
| orders,
| customer,
| nation n1,
| nation n2,
| region
| where
| p_partkey = l_partkey
| and s_suppkey = l_suppkey
| and l_orderkey = o_orderkey
| and o_custkey = c_custkey
| and c_nationkey = n1.n_nationkey
| and n1.n_regionkey = r_regionkey
| and r_name = 'AMERICA'
| and s_nationkey = n2.n_nationkey
| and o_orderdate between date '1995-01-01' and date '1996-12-31'
| and p_type = 'ECONOMY ANODIZED STEEL'
| ) as all_nations
| group by
| o_year
| order by
| o_year
".StripMargin();
private static readonly string s_q9 = @"
select
| nation,
| o_year,
| sum(amount) as sum_profit
| from
| (
| select
| n_name as nation,
| year(o_orderdate) as o_year,
| l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
| from
| part,
| supplier,
| lineitem,
| partsupp,
| orders,
| nation
| where
| s_suppkey = l_suppkey
| and ps_suppkey = l_suppkey
| and ps_partkey = l_partkey
| and p_partkey = l_partkey
| and o_orderkey = l_orderkey
| and s_nationkey = n_nationkey
| and p_name like '%green%'
| ) as profit
| group by
| nation,
| o_year
| order by
| nation,
| o_year desc
".StripMargin();
private static readonly string s_q10 = @"
select
| c_custkey,
| c_name,
| sum(l_extendedprice * (1 - l_discount)) as revenue,
| c_acctbal,
| n_name,
| c_address,
| c_phone,
| c_comment
| from
| customer,
| orders,
| lineitem,
| nation
| where
| c_custkey = o_custkey
| and l_orderkey = o_orderkey
| and o_orderdate >= date '1993-10-01'
| and o_orderdate < date '1993-10-01' + interval '3' month
| and l_returnflag = 'R'
| and c_nationkey = n_nationkey
| group by
| c_custkey,
| c_name,
| c_acctbal,
| c_phone,
| n_name,
| c_address,
| c_comment
| order by
| revenue desc
".StripMargin();
private static readonly string s_q11 = @"
select
| ps_partkey,
| sum(ps_supplycost * ps_availqty) as value
| from
| partsupp,
| supplier,
| nation
| where
| ps_suppkey = s_suppkey
| and s_nationkey = n_nationkey
| and n_name = 'GERMANY'
| group by
| ps_partkey having
| sum(ps_supplycost * ps_availqty) > (
| select
| sum(ps_supplycost * ps_availqty) * 0.0001000000
| from
| partsupp,
| supplier,
| nation
| where
| ps_suppkey = s_suppkey
| and s_nationkey = n_nationkey
| and n_name = 'GERMANY'
| )
| order by
| value desc
".StripMargin();
private static readonly string s_q12 = @"
select
| l_shipmode,
| sum(case
| when o_orderpriority = '1-URGENT'
| or o_orderpriority = '2-HIGH'
| then 1
| else 0
| end) as sum_highorderpriority,
| sum(case
| when o_orderpriority <> '1-URGENT'
| and o_orderpriority <> '2-HIGH'
| then 1
| else 0
| end) as sum_loworderpriority
| from
| orders,
| lineitem
| where
| o_orderkey = l_orderkey
| and l_shipmode in ('MAIL', 'SHIP')
| and l_commitdate < l_receiptdate
| and l_shipdate < l_commitdate
| and l_receiptdate >= date '1994-01-01'
| and l_receiptdate < date '1994-01-01' + interval '1' year
| group by
| l_shipmode
| order by
| l_shipmode
".StripMargin();
private static readonly string s_q13 = @"
select
| c_count,
| count(*) as custdist
| from
| (
| select
| c_custkey,
| count(o_orderkey) as c_count
| from
| customer left outer join orders on
| c_custkey = o_custkey
| and o_comment not like '%special%requests%'
| group by
| c_custkey
| ) as c_orders
| group by
| c_count
| order by
| custdist desc,
| c_count desc".StripMargin();
private static readonly string s_q14 = @"
| select
| 100.00 * sum(case
| when p_type like 'PROMO%'
| then l_extendedprice * (1 - l_discount)
| else 0
| end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
| from
| lineitem,
| part
| where
| l_partkey = p_partkey
| and l_shipdate >= date '1995-09-01'
| and l_shipdate < date '1995-09-01' + interval '1' month
".StripMargin();
private static readonly string s_q15 = @"
with revenue0 as
| (select
| l_suppkey as supplier_no,
| sum(l_extendedprice * (1 - l_discount)) as total_revenue
| from
| lineitem
| where
| l_shipdate >= date '1996-01-01'
| and l_shipdate < date '1996-01-01' + interval '3' month
| group by
| l_suppkey)
|
| select
| s_suppkey,
| s_name,
| s_address,
| s_phone,
| total_revenue
| from
| supplier,
| revenue0
| where
| s_suppkey = supplier_no
| and total_revenue = (
| select
| max(total_revenue)
| from
| revenue0
| )
| order by
| s_suppkey
".StripMargin();
private static readonly string s_q16 = @"
| select
| p_brand,
| p_type,
| p_size,
| count(distinct ps_suppkey) as supplier_cnt
| from
| partsupp,
| part
| where
| p_partkey = ps_partkey
| and p_brand <> 'Brand#45'
| and p_type not like 'MEDIUM POLISHED%'
| and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
| and ps_suppkey not in (
| select
| s_suppkey
| from
| supplier
| where
| s_comment like '%Customer%Complaints%'
| )
| group by
| p_brand,
| p_type,
| p_size
| order by
| supplier_cnt desc,
| p_brand,
| p_type,
| p_size
| limit 20
".StripMargin();
private static readonly string s_q17 = @"
| select
| sum(l_extendedprice) / 7.0 as avg_yearly
| from
| lineitem,
| part
| where
| p_partkey = l_partkey
| and p_brand = 'Brand#23'
| and p_container = 'MED BOX'
| and l_quantity < (
| select
| 0.2 * avg(l_quantity)
| from
| lineitem
| where
| l_partkey = p_partkey
| )
".StripMargin();
private static readonly string s_q18 = @"
| select
| c_name,
| c_custkey,
| o_orderkey,
| o_orderdate,
| o_totalprice,
| sum(l_quantity)
| from
| customer,
| orders,
| lineitem
| where
| o_orderkey in (
| select
| l_orderkey
| from
| lineitem
| group by
| l_orderkey having
| sum(l_quantity) > 300
| )
| and c_custkey = o_custkey
| and o_orderkey = l_orderkey
| group by
| c_name,
| c_custkey,
| o_orderkey,
| o_orderdate,
| o_totalprice
| order by
| o_totalprice desc,
| o_orderdate
".StripMargin();
private static readonly string s_q19 = @"
| select
| sum(l_extendedprice * (1 - l_discount)) as revenue
| from
| lineitem,
| part
| where
| (
| p_partkey = l_partkey
| and p_brand = 'Brand#12'
| and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
| and l_quantity >= 1 and l_quantity <= 1 + 10
| and p_size between 1 and 5
| and l_shipmode in ('AIR', 'AIR REG')
| and l_shipinstruct = 'DELIVER IN PERSON'
| )
| or
| (
| p_partkey = l_partkey
| and p_brand = 'Brand#23'
| and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
| and l_quantity >= 10 and l_quantity <= 10 + 10
| and p_size between 1 and 10
| and l_shipmode in ('AIR', 'AIR REG')
| and l_shipinstruct = 'DELIVER IN PERSON'
| )
| or
| (
| p_partkey = l_partkey
| and p_brand = 'Brand#34'
| and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
| and l_quantity >= 20 and l_quantity <= 20 + 10
| and p_size between 1 and 15
| and l_shipmode in ('AIR', 'AIR REG')
| and l_shipinstruct = 'DELIVER IN PERSON'
| )
".StripMargin();
private static readonly string s_q20 = @"
| select
| s_name,
| s_address
| from
| supplier,
| nation
| where
| s_suppkey in (
| select
| ps_suppkey
| from
| partsupp
| where
| ps_partkey in (
| select
| p_partkey
| from
| part
| where
| p_name like 'forest%'
| )
| and ps_availqty > (
| select
| 0.5 * sum(l_quantity)
| from
| lineitem
| where
| l_partkey = ps_partkey
| and l_suppkey = ps_suppkey
| and l_shipdate >= date '1994-01-01'
| and l_shipdate < date '1994-01-01' + interval '1' year
| )
| )
| and s_nationkey = n_nationkey
| and n_name = 'CANADA'
| order by
| s_name
".StripMargin();
private static readonly string s_q21 = @"
| select
| s_name,
| count(*) as numwait
| from
| supplier,
| lineitem l1,
| orders,
| nation
| where
| s_suppkey = l1.l_suppkey
| and o_orderkey = l1.l_orderkey
| and o_orderstatus = 'F'
| and l1.l_receiptdate > l1.l_commitdate
| and exists(
| select
| *
| from
| lineitem l2
| where
| l2.l_orderkey = l1.l_orderkey
| and l2.l_suppkey <> l1.l_suppkey
| )
| and not exists(
| select
| *
| from
| lineitem l3
| where
| l3.l_orderkey = l1.l_orderkey
| and l3.l_suppkey <> l1.l_suppkey
| and l3.l_receiptdate > l3.l_commitdate
| )
| and s_nationkey = n_nationkey
| and n_name = 'SAUDI ARABIA'
| group by
| s_name
| order by
| numwait desc,
| s_name
".StripMargin();
private static readonly string s_q22 = @"
select
| cntrycode,
| count(*) as numcust,
| sum(c_acctbal) as totacctbal
| from
| (
| select
| substring(c_phone, 1, 2) as cntrycode,
| c_acctbal
| from
| customer
| where
| substring(c_phone, 1, 2) in
| ('13', '31', '23', '29', '30', '18', '17')
| and c_acctbal > (
| select
| avg(c_acctbal)
| from
| customer
| where
| c_acctbal > 0.00
| and substring(c_phone, 1, 2) in
| ('13', '31', '23', '29', '30', '18', '17')
| )
| and not exists(
| select
| *
| from
| orders
| where
| o_custkey = c_custkey
| )
| ) as custsale
| group by
| cntrycode
| order by
| cntrycode
".StripMargin();
}
}

43
benchmark/python/tpch.py Normal file
View file

@ -0,0 +1,43 @@
# Licensed to the .NET Foundation under one or more agreements.
# The .NET Foundation licenses this file to you under the MIT license.
# See the LICENSE file in the project root for more information.
import sys
import time

from tpch_functional_queries import *
from tpch_sql_queries import *
from pyspark.sql import SparkSession


def main():
    if len(sys.argv) != 5:
        print("Usage:")
        print("\t<spark-submit> --master local tpch.py")
        print("\t\t<tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>")
        sys.exit(1)

    input_dir = sys.argv[1]
    query_number = sys.argv[2]
    num_iterations = int(sys.argv[3])
    is_sql = sys.argv[4].lower() == "true"

    for iteration in range(0, num_iterations):
        print("TPCH Starting iteration {0} with query #{1}".format(iteration, query_number))
        spark = SparkSession.builder.appName('TPCH Benchmark for Python').getOrCreate()

        start = time.time()
        if is_sql:
            queries = TpchSqlQueries(spark, input_dir)
        else:
            queries = TpchFunctionalQueries(spark, input_dir)
        getattr(queries, "q" + query_number)()
        end = time.time()

        type_str = "SQL" if is_sql else "Functional"
        print("TPCH_Result,Python,%s,%s,%d,%d" % (type_str, query_number, iteration, (end - start) * 1000))
        spark.stop()


if __name__ == '__main__':
    main()
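The driver resolves the query method purely by name, so q1 through q22 are looked up at run time with getattr. A minimal, self-contained sketch of that dispatch pattern (the demo class and instance below are hypothetical, not part of the benchmark):

# Minimal sketch of the name-based dispatch used by main(); _DemoQueries is hypothetical.
class _DemoQueries:
    def q6(self):
        print("running q6")

queries = _DemoQueries()
query_number = "6"
getattr(queries, "q" + query_number)()  # equivalent to calling queries.q6() directly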

View file

@ -0,0 +1,17 @@
# Licensed to the .NET Foundation under one or more agreements.
# The .NET Foundation licenses this file to you under the MIT license.
# See the LICENSE file in the project root for more information.
import pyspark
from pyspark.sql import SparkSession


class TpchBase:
    def __init__(self, spark, dir):
        self.customer = spark.read.parquet(dir + "customer")
        self.lineitem = spark.read.parquet(dir + "lineitem")
        self.nation = spark.read.parquet(dir + "nation")
        self.region = spark.read.parquet(dir + "region")
        self.orders = spark.read.parquet(dir + "orders")
        self.part = spark.read.parquet(dir + "part")
        self.partsupp = spark.read.parquet(dir + "partsupp")
        self.supplier = spark.read.parquet(dir + "supplier")
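Because the constructor concatenates table names directly onto dir, the data-root argument presumably needs to end with a path separator (e.g. "/data/tpch/" rather than "/data/tpch"). A minimal usage sketch under that assumption; the parquet root and app name below are illustrative only:

# Hypothetical usage of TpchBase; the parquet root and app name are illustrative.
from pyspark.sql import SparkSession
from tpch_base import TpchBase

spark = SparkSession.builder.appName("tpch-base-demo").getOrCreate()
tables = TpchBase(spark, "/data/tpch/")   # trailing slash: table names are appended directly
print(tables.lineitem.columns)
spark.stop()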

View file

@ -0,0 +1,449 @@
# Licensed to the .NET Foundation under one or more agreements.
# The .NET Foundation licenses this file to you under the MIT license.
# See the LICENSE file in the project root for more information.
from tpch_base import TpchBase
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import *
import re
class TpchFunctionalQueries(TpchBase):
def __init__(self, spark, dir):
TpchBase.__init__(self, spark, dir)
def q1(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
increase = udf(lambda x, y: x * (1 + y), FloatType())
self.lineitem.filter(col("l_shipdate") <= "1998-09-02") \
.groupBy(col("l_returnflag"), col("l_linestatus")) \
.agg(F.sum(col("l_quantity")).alias("sum_qty"),
F.sum(col("l_extendedprice")).alias("sum_base_price"),
F.sum(decrease(col("l_extendedprice"), col("l_discount"))).alias("sum_disc_price"),
F.sum(increase(decrease(col("l_extendedprice"), col("l_discount")), col("l_tax"))).alias("sum_charge"),
F.avg(col("l_quantity")).alias("avg_qty"),
F.avg(col("l_extendedprice")).alias("avg_price"),
F.avg(col("l_discount")).alias("avg_disc"),
F.count(col("l_quantity")).alias("count_order")) \
.sort(col("l_returnflag"), col("l_linestatus")) \
.show()
def q2(self):
europe = self.region.filter(col("r_name") == "EUROPE") \
.join(self.nation, col("r_regionkey") == col("n_regionkey")) \
.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
.join(self.partsupp, self.supplier.s_suppkey == self.partsupp.ps_suppkey)
brass = self.part.filter((col("p_size") == 15) & (self.part.p_type.endswith("BRASS"))) \
.join(europe, col("p_partkey") == europe.ps_partkey)
minimumCost = brass.groupBy(col("ps_partkey")) \
.agg(F.min(col("ps_supplycost")).alias("min"))
brass.join(minimumCost, brass.ps_partkey == minimumCost.ps_partkey) \
.filter(brass.ps_supplycost == minimumCost.min) \
.select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment") \
.sort(col("s_acctbal").desc(), col("n_name"), col("s_name"), col("p_partkey")) \
.show()
def q3(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
filteredCustomers = self.customer.filter(col("c_mktsegment") == "BUILDING")
filteredOrders = self.orders.filter(col("o_orderdate") < "1995-03-15")
filteredLineItems = self.lineitem.filter(col("l_shipdate") > "1995-03-15")
filteredCustomers.join(filteredOrders, col("c_custkey") == col("o_custkey")) \
.select("o_orderkey", "o_orderdate", "o_shippriority") \
.join(filteredLineItems, col("o_orderkey") == col("l_orderkey")) \
.select(col("l_orderkey"),
decrease(col("l_extendedprice"), col("l_discount")).alias("volume"),
col("o_orderdate"), col("o_shippriority")) \
.groupBy(col("l_orderkey"), col("o_orderdate"), col("o_shippriority")) \
.agg(F.sum(col("volume")).alias("revenue")) \
.sort(col("revenue").desc(), col("o_orderdate")) \
.show()
def q4(self):
filteredOrders = self.orders.filter((col("o_orderdate") >= "1993-07-01") & (col("o_orderdate") < "1993-10-01"))
filteredLineItems = self.lineitem.filter(col("l_commitdate") < col("l_receiptdate")) \
.select("l_orderkey") \
.distinct()
filteredLineItems.join(filteredOrders, col("l_orderkey") == col("o_orderkey")) \
.groupBy("o_orderpriority") \
.agg(F.count(col("o_orderpriority")).alias("order_count")) \
.sort(col("o_orderpriority")) \
.show()
def q5(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
filteredOrders = self.orders.filter((col("o_orderdate") < "1995-01-01") & (col("o_orderdate") >= "1994-01-01"))
self.region.filter(col("r_name") == "ASIA") \
.join(self.nation, col("r_regionkey") == col("n_regionkey")) \
.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
.join(self.lineitem, col("s_suppkey") == col("l_suppkey")) \
.select("n_name", "l_extendedprice", "l_discount", "l_orderkey", "s_nationkey") \
.join(filteredOrders, col("l_orderkey") == col("o_orderkey")) \
.join(self.customer, (col("o_custkey") == col("c_custkey")) & (col("s_nationkey") == col("c_nationkey"))) \
.select(col("n_name"), decrease(col("l_extendedprice"), col("l_discount")).alias("value")) \
.groupBy("n_name") \
.agg(F.sum(col("value")).alias("revenue")) \
.sort(col("revenue").desc()) \
.show()
def q6(self):
self.lineitem.filter((col("l_shipdate") >= "1994-01-01")
& (col("l_shipdate") < "1995-01-01")
& (col("l_discount") >= 0.05)
& (col("l_discount") <= 0.07)
& (col("l_quantity") < 24)) \
.agg(F.sum(col("l_extendedprice") * col("l_discount")).alias("revenue")) \
.show()
def q7(self):
getYear = udf(lambda x: x[0:4], StringType())
decrease = udf(lambda x, y: x * (1 - y), FloatType())
filteredNations = self.nation.filter((col("n_name") == "FRANCE") | (col("n_name") == "GERMANY"))
filteredLineitems = self.lineitem.filter((col("l_shipdate") >= "1995-01-01") & (col("l_shipdate") <= "1996-12-31"))
supplierNations = filteredNations.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
.join(filteredLineitems, col("s_suppkey") == col("l_suppkey")) \
.select(col("n_name").alias("supp_nation"), col("l_orderkey"), col("l_extendedprice"), col("l_discount"), col("l_shipdate"))
filteredNations.join(self.customer, col("n_nationkey") == col("c_nationkey")) \
.join(self.orders, col("c_custkey") == col("o_custkey")) \
.select(col("n_name").alias("cust_nation"), col("o_orderkey")) \
.join(supplierNations, col("o_orderkey") == col("l_orderkey")) \
.filter(((col("supp_nation") == "FRANCE") & (col("cust_nation") == "GERMANY"))
| ((col("supp_nation") == "GERMANY") & (col("cust_nation") == "FRANCE"))) \
.select(col("supp_nation"), col("cust_nation"),
getYear(col("l_shipdate")).alias("l_year"),
decrease(col("l_extendedprice"), col("l_discount")).alias("volume")) \
.groupBy(col("supp_nation"), col("cust_nation"), col("l_year")) \
.agg(F.sum(col("volume")).alias("revenue")) \
.sort(col("supp_nation"), col("cust_nation"), col("l_year")) \
.show()
def q8(self):
getYear = udf(lambda x: x[0:4], StringType())
decrease = udf(lambda x, y: x * (1 - y), FloatType())
isBrazil = udf(lambda x, y: (y if (x == "BRAZIL") else 0), FloatType())
filteredRegions = self.region.filter(col("r_name") == "AMERICA")
filteredOrders = self.orders.filter((col("o_orderdate") <= "1996-12-31") & (col("o_orderdate") >= "1995-01-01"))
filteredParts = self.part.filter(col("p_type") == "ECONOMY ANODIZED STEEL")
filteredNations = self.nation.join(self.supplier, col("n_nationkey") == col("s_nationkey"))
filteredLineitems = self.lineitem.select(col("l_partkey"), col("l_suppkey"), col("l_orderkey"),
decrease(col("l_extendedprice"), col("l_discount")).alias("volume")) \
.join(filteredParts, col("l_partkey") == col("p_partkey")) \
.join(filteredNations, col("l_suppkey") == col("s_suppkey"))
self.nation.join(filteredRegions, col("n_regionkey") == col("r_regionkey")) \
.select(col("n_nationkey")) \
.join(self.customer, col("n_nationkey") == col("c_nationkey")) \
.select(col("c_custkey")) \
.join(filteredOrders, col("c_custkey") == col("o_custkey")) \
.select(col("o_orderkey"), col("o_orderdate")) \
.join(filteredLineitems, col("o_orderkey") == col("l_orderkey")) \
.select(getYear(col("o_orderdate")).alias("o_year"), col("volume"),
isBrazil(col("n_name"), col("volume")).alias("case_volume")) \
.groupBy(col("o_year")) \
.agg((F.sum(col("case_volume")) / F.sum(col("volume"))).alias("mkt_share")) \
.sort(col("o_year")) \
.show()
def q9(self):
getYear = udf(lambda x: x[0:4], StringType())
expression = udf(lambda x, y, v, w: x * (1 - y) - (v * w), FloatType())
lineitemParts = self.part.filter(col("p_name").contains("green")) \
.join(self.lineitem, col("p_partkey") == col("l_partkey"))
nationPartSuppliers = self.nation.join(self.supplier, col("n_nationkey") == col("s_nationkey"))
lineitemParts.join(nationPartSuppliers, col("l_suppkey") == col("s_suppkey")) \
.join(self.partsupp, (col("l_suppkey") == col("ps_suppkey"))
& (col("l_partkey") == col("ps_partkey"))) \
.join(self.orders, col("l_orderkey") == col("o_orderkey")) \
.select(col("n_name"), getYear(col("o_orderdate")).alias("o_year"),
expression(col("l_extendedprice"), col("l_discount"),
col("ps_supplycost"), col("l_quantity")).alias("amount")) \
.groupBy(col("n_name"), col("o_year")) \
.agg(F.sum(col("amount"))) \
.sort(col("n_name"), col("o_year").desc()) \
.show()
def q10(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
filteredLineitems = self.lineitem.filter(col("l_returnflag") == "R")
self.orders.filter((col("o_orderdate") < "1994-01-01") & (col("o_orderdate") >= "1993-10-01")) \
.join(self.customer, col("o_custkey") == col("c_custkey")) \
.join(self.nation, col("c_nationkey") == col("n_nationkey")) \
.join(filteredLineitems, col("o_orderkey") == col("l_orderkey")) \
.select(col("c_custkey"), col("c_name"), decrease(col("l_extendedprice"), col("l_discount")).alias("volume"),
col("c_acctbal"), col("n_name"), col("c_address"), col("c_phone"), col("c_comment")) \
.groupBy(col("c_custkey"), col("c_name"), col("c_acctbal"), col("c_phone"), col("n_name"), col("c_address"), col("c_comment")) \
.agg(F.sum(col("volume")).alias("revenue")) \
.sort(col("revenue").desc()) \
.limit(20) \
.show()
def q11(self):
multiplication = udf(lambda x, y: x * y, FloatType())
division = udf(lambda x: x * 0.0001, FloatType())
nationPartSuppliers = self.nation.filter(col("n_name") == "GERMANY") \
.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
.select(col("s_suppkey")) \
.join(self.partsupp, col("s_suppkey") == col("ps_suppkey")) \
.select(col("ps_partkey"), multiplication(col("ps_supplycost"), col("ps_availqty")).alias("value"))
aggregatedValue = nationPartSuppliers.agg(F.sum(col("value")).alias("total_value"))
nationPartSuppliers.groupBy(col("ps_partkey")).agg(F.sum(col("value")).alias("part_value")) \
.join(aggregatedValue, col("part_value") > division(col("total_value"))) \
.sort(col("part_value").desc()) \
.show()
def q12(self):
highPriority = udf(lambda x: (1 if ((x == "1-URGENT") or (x == "2-HIGH")) else 0), IntegerType())
lowPriority = udf(lambda x: (1 if ((x != "1-URGENT") and (x != "2-HIGH")) else 0), IntegerType())
self.lineitem.filter(((col("l_shipmode") == "MAIL") | (col("l_shipmode") == "SHIP"))
& (col("l_commitdate") < col("l_receiptdate"))
& (col("l_shipdate") < col("l_commitdate"))
& (col("l_receiptdate") >= "1994-01-01")
& (col("l_receiptdate") < "1995-01-01")) \
.join(self.orders, col("l_orderkey") == col("o_orderkey")) \
.select(col("l_shipmode"), col("o_orderpriority")) \
.groupBy(col("l_shipmode")) \
.agg(F.sum(highPriority(col("o_orderpriority"))).alias("sum_highorderpriority"),
F.sum(lowPriority(col("o_orderpriority"))).alias("sum_loworderpriority")) \
.sort(col("l_shipmode")) \
.show()
def q13(self):
special_regex = re.compile(".*special.*requests.*")
special = udf(lambda x: special_regex.match(x) is not None, BooleanType())
self.customer.join(self.orders, (col("c_custkey") == col("o_custkey"))
& ~special(col("o_comment")), "left_outer") \
.groupBy(col("c_custkey")) \
.agg(F.count(col("o_orderkey")).alias("c_count")) \
.groupBy(col("c_count")) \
.agg(F.count(col("*")).alias("custdist")) \
.sort(col("custdist").desc(), col("c_count").desc()) \
.show()
def q14(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
promotion = udf(lambda x, y: (y if (x.startswith("PROMO")) else 0), FloatType())
self.part.join(self.lineitem, (col("l_partkey") == col("p_partkey"))
& (col("l_shipdate") >= "1995-09-01")
& (col("l_shipdate") < "1995-10-01")) \
.select(col("p_type"), decrease(col("l_extendedprice"), col("l_discount")).alias("value")) \
.agg(F.sum(promotion(col("p_type"), col("value"))) * 100 / F.sum(col("value"))) \
.show()
def q15(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
revenue = self.lineitem.filter((col("l_shipdate") >= "1996-01-01")
& (col("l_shipdate") < "1996-04-01")) \
.select(col("l_suppkey"), decrease(col("l_extendedprice"), col("l_discount")).alias("value")) \
.groupBy(col("l_suppkey")) \
.agg(F.sum(col("value")).alias("total"))
revenue.agg(F.max(col("total")).alias("max_total")) \
.join(revenue, col("max_total") == col("total")) \
.join(self.supplier, col("l_suppkey") == col("s_suppkey")) \
.select(col("s_suppkey"), col("s_name"), col("s_address"), col("s_phone"), col("total")) \
.sort(col("s_suppkey")) \
.show()
def q16(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
polished = udf(lambda x: x.startswith("MEDIUM POLISHED"), BooleanType())
complains_regex = re.compile(".*Customer.*Complaints.*")
complains = udf(lambda x: complains_regex.match(x) is not None, BooleanType())
numbers_regex = re.compile("^(49|14|23|45|19|3|36|9)$")
numbers = udf(lambda x: numbers_regex.match(str(x)) is not None, BooleanType())
filteredParts = self.part.filter((col("p_brand") != "Brand#45")
& (~polished(col("p_type")))
& numbers(col("p_size"))) \
.select(col("p_partkey"), col("p_brand"), col("p_type"), col("p_size"))
self.supplier.filter(~complains(col("s_comment"))) \
.join(self.partsupp, col("s_suppkey") == col("ps_suppkey")) \
.select(col("ps_partkey"), col("ps_suppkey")) \
.join(filteredParts, col("ps_partkey") == col("p_partkey")) \
.groupBy(col("p_brand"), col("p_type"), col("p_size")) \
.agg(F.countDistinct(col("ps_suppkey")).alias("supplier_count")) \
.sort(col("supplier_count").desc(), col("p_brand"), col("p_type"), col("p_size")) \
.show()
def q17(self):
multiplier = udf(lambda x: x * 0.2, FloatType())
filteredLineitems = self.lineitem.select(col("l_partkey"), col("l_quantity"), col("l_extendedprice"))
filteredParts = self.part.filter((col("p_brand") == "Brand#23") & (col("p_container") == "MED BOX")) \
.select(col("p_partkey")) \
.join(self.lineitem, col("p_partkey") == col("l_partkey"), "left_outer")
filteredParts.groupBy(col("p_partkey")) \
.agg(multiplier(F.avg(col("l_quantity"))).alias("avg_quantity")) \
.select(col("p_partkey").alias("key"), col("avg_quantity")) \
.join(filteredParts, col("key") == col("p_partkey")) \
.filter(col("l_quantity") < col("avg_quantity")) \
.agg((F.sum(col("l_extendedprice")) / 7.0).alias("avg_yearly")) \
.show()
def q18(self):
self.lineitem.groupBy(col("l_orderkey")) \
.agg(F.sum(col("l_quantity")).alias("sum_quantity")) \
.filter(col("sum_quantity") > 300) \
.select(col("l_orderkey").alias("key"), col("sum_quantity")) \
.join(self.orders, col("o_orderkey") == col("key")) \
.join(self.lineitem, col("o_orderkey") == col("l_orderkey")) \
.join(self.customer, col("c_custkey") == col("o_custkey")) \
.select(col("l_quantity"), col("c_name"), col("c_custkey"), col("o_orderkey"), col("o_orderdate"), col("o_totalprice")) \
.groupBy(col("c_name"), col("c_custkey"), col("o_orderkey"), col("o_orderdate"), col("o_totalprice")) \
.agg(F.sum(col("l_quantity"))) \
.sort(col("o_totalprice").desc(), col("o_orderdate")) \
.show()
def q19(self):
decrease = udf(lambda x, y: x * (1 - y), FloatType())
sm_regex = re.compile("SM CASE|SM BOX|SM PACK|SM PKG")
sm = udf(lambda x: sm_regex.match(x) is not None, BooleanType())
med_regex = re.compile("MED BAG|MED BOX|MED PKG|MED PACK")
med = udf(lambda x: med_regex.match(x) is not None, BooleanType())
lg_regex = re.compile("LG CASE|LG BOX|LG PACK|LG PKG")
lg = udf(lambda x: lg_regex.match(x) is not None, BooleanType())
self.part.join(self.lineitem, col("l_partkey") == col("p_partkey")) \
.filter(((col("l_shipmode") == "AIR")
| (col("l_shipmode") == "AIR REG"))
& (col("l_shipinstruct") == "DELIVER IN PERSON")) \
.filter(((col("p_brand") == "Brand#12")
& (sm(col("p_container")))
& (col("l_quantity") >= 1)
& (col("l_quantity") <= 11)
& (col("p_size") >= 1)
& (col("p_size") <= 5))
| ((col("p_brand") == "Brand#23")
& (med(col("p_container")))
& (col("l_quantity") >= 10)
& (col("l_quantity") <= 20)
& (col("p_size") >= 1)
& (col("p_size") <= 10))
| ((col("p_brand") == "Brand#34")
& (lg(col("p_container")))
& (col("l_quantity") >= 20)
& (col("l_quantity") <= 30)
& (col("p_size") >= 1)
& (col("p_size") <= 15))) \
.select(decrease(col("l_extendedprice"), col("l_discount")).alias("volume")) \
.agg(F.sum(col("volume")).alias("revenue")) \
.show()
def q20(self):
forest = udf(lambda x: x.startswith("forest"), BooleanType())
filteredLineitems = self.lineitem.filter((col("l_shipdate") >= "1994-01-01") & (col("l_shipdate") < "1995-01-01")) \
.groupBy(col("l_partkey"), col("l_suppkey")) \
.agg((F.sum(col("l_quantity")) * 0.5).alias("sum_quantity"))
filteredNations = self.nation.filter(col("n_name") == "CANADA")
nationSuppliers = self.supplier.select(col("s_suppkey"), col("s_name"), col("s_nationkey"), col("s_address")) \
.join(filteredNations, col("s_nationkey") == col("n_nationkey"))
self.part.filter(forest(col("p_name"))) \
.select(col("p_partkey")).distinct() \
.join(self.partsupp, col("p_partkey") == col("ps_partkey")) \
.join(filteredLineitems, (col("ps_suppkey") == col("l_suppkey")) & (col("ps_partkey") == col("l_partkey"))) \
.filter(col("ps_availqty") > col("sum_quantity")) \
.select(col("ps_suppkey")).distinct() \
.join(nationSuppliers, col("ps_suppkey") == col("s_suppkey")) \
.select(col("s_name"), col("s_address")) \
.sort(col("s_name")) \
.show()
def q21(self):
filteredSuppliers = self.supplier.select(col("s_suppkey"), col("s_nationkey"), col("s_name"))
selectedLineitems = self.lineitem.select(col("l_suppkey"), col("l_orderkey"), col("l_receiptdate"), col("l_commitdate"))
filteredLineitems = selectedLineitems.filter(col("l_receiptdate") > col("l_commitdate"))
selectedGroupedLineItems = selectedLineitems.groupBy(col("l_orderkey")) \
.agg(F.countDistinct(col("l_suppkey")).alias("suppkey_count"), F.max(col("l_suppkey")).alias("suppkey_max")) \
.select(col("l_orderkey").alias("key"), col("suppkey_count"), col("suppkey_max"))
filteredGroupedLineItems = filteredLineitems.groupBy(col("l_orderkey")) \
.agg(F.countDistinct(col("l_suppkey")).alias("suppkey_count"), F.max(col("l_suppkey")).alias("suppkey_max")) \
.select(col("l_orderkey").alias("key"), col("suppkey_count"), col("suppkey_max"))
filteredOrders = self.orders.select(col("o_orderkey"), col("o_orderstatus")) \
.filter(col("o_orderstatus") == "F")
self.nation.filter(col("n_name") == "SAUDI ARABIA") \
.join(filteredSuppliers, col("n_nationkey") == col("s_nationkey")) \
.join(filteredLineitems, col("s_suppkey") == col("l_suppkey")) \
.join(filteredOrders, col("l_orderkey") == col("o_orderkey")) \
.join(selectedGroupedLineItems, col("l_orderkey") == col("key")) \
.filter(col("suppkey_count") > 1) \
.select(col("s_name"), col("l_orderkey"), col("l_suppkey")) \
.join(filteredGroupedLineItems, col("l_orderkey") == col("key"), "left_outer") \
.select(col("s_name"), col("l_orderkey"), col("l_suppkey"), col("suppkey_count"), col("suppkey_max")) \
.filter((col("suppkey_count") == 1)
& (col("l_suppkey") == col("suppkey_max"))) \
.groupBy(col("s_name")) \
.agg(F.count(col("l_suppkey")).alias("numwait")) \
.sort(col("numwait").desc(), col("s_name")) \
.show()
def q22(self):
substring = udf(lambda x: x[0:2], StringType())
phone_regex = re.compile("^(13|31|23|29|30|18|17)$")
phone = udf(lambda x: phone_regex.match(x) is not None, BooleanType())
filteredCustomers = self.customer.select(col("c_acctbal"), col("c_custkey"), substring(col("c_phone")).alias("cntrycode")) \
.filter(phone(col("cntrycode")))
customerAverage = filteredCustomers.filter(col("c_acctbal") > 0.0) \
.agg(F.avg(col("c_acctbal")).alias("avg_acctbal"))
self.orders.groupBy(col("o_custkey")) \
.agg(col("o_custkey")) \
.select(col("o_custkey")) \
.join(filteredCustomers, col("o_custkey") == col("c_custkey"), "right_outer") \
.filter(col("o_custkey").isNull()) \
.join(customerAverage) \
.filter(col("c_acctbal") > col("avg_acctbal")) \
.groupBy(col("cntrycode")) \
.agg(F.count(col("c_acctbal")).alias("numcust"), F.sum(col("c_acctbal")).alias("totalacctbal")) \
.sort(col("cntrycode")) \
.show()
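Many of these functional queries wrap simple arithmetic such as x * (1 - y) in Python UDFs, mirroring the Scala port they were derived from. The same expressions can also be written with built-in Column arithmetic, which avoids Python UDF serialization overhead. A minimal sketch of the q1 discounted-price aggregation using native expressions; the one-row in-memory DataFrame merely stands in for the real lineitem table:

# Sketch only: native Column arithmetic instead of the decrease/increase UDFs used in q1.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("q1-native-sketch").getOrCreate()
lineitem = spark.createDataFrame(
    [("N", "O", 17.0, 1000.0, 0.05, 0.02)],
    ["l_returnflag", "l_linestatus", "l_quantity", "l_extendedprice", "l_discount", "l_tax"])

disc_price = F.col("l_extendedprice") * (1 - F.col("l_discount"))
lineitem.groupBy("l_returnflag", "l_linestatus") \
    .agg(F.sum("l_quantity").alias("sum_qty"),
         F.sum(disc_price).alias("sum_disc_price"),
         F.sum(disc_price * (1 + F.col("l_tax"))).alias("sum_charge")) \
    .show()
spark.stop()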

View file

@ -0,0 +1,719 @@
# Licensed to the .NET Foundation under one or more agreements.
# The .NET Foundation licenses this file to you under the MIT license.
# See the LICENSE file in the project root for more information.
from tpch_base import TpchBase
from pyspark.sql import SparkSession
class TpchSqlQueries(TpchBase):
def __init__(self, spark, dir):
TpchBase.__init__(self, spark, dir)
self.spark = spark
self.customer.createOrReplaceTempView("customer")
self.lineitem.createOrReplaceTempView("lineitem")
self.nation.createOrReplaceTempView("nation")
self.region.createOrReplaceTempView("region")
self.orders.createOrReplaceTempView("orders")
self.part.createOrReplaceTempView("part")
self.partsupp.createOrReplaceTempView("partsupp")
self.supplier.createOrReplaceTempView("supplier")
def q1(self):
query = """select
l_returnflag,
l_linestatus,
sum(l_quantity) as sum_qty,
sum(l_extendedprice) as sum_base_price,
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
avg(l_quantity) as avg_qty,
avg(l_extendedprice) as avg_price,
avg(l_discount) as avg_disc,
count(*) as count_order
from
lineitem
where
l_shipdate <= date '1998-12-01' - interval '90' day
group by
l_returnflag,
l_linestatus
order by
l_returnflag,
l_linestatus"""
self.spark.sql(query).show()
def q2(self):
query = """select
s_acctbal,
s_name,
n_name,
p_partkey,
p_mfgr,
s_address,
s_phone,
s_comment
from
part,
supplier,
partsupp,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and p_size = 15
and p_type like '%BRASS'
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
and ps_supplycost = (
select
min(ps_supplycost)
from
partsupp,
supplier,
nation,
region
where
p_partkey = ps_partkey
and s_suppkey = ps_suppkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'EUROPE'
)
order by
s_acctbal desc,
n_name,
s_name,
p_partkey"""
self.spark.sql(query).show()
def q3(self):
query = """select
l_orderkey,
sum(l_extendedprice * (1 - l_discount)) as revenue,
o_orderdate,
o_shippriority
from
customer,
orders,
lineitem
where
c_mktsegment = 'BUILDING'
and c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate < date '1995-03-15'
and l_shipdate > date '1995-03-15'
group by
l_orderkey,
o_orderdate,
o_shippriority
order by
revenue desc,
o_orderdate"""
self.spark.sql(query).show()
def q4(self):
query = """select
o_orderpriority,
count(*) as order_count
from
orders
where
o_orderdate >= date '1993-07-01'
and o_orderdate < date '1993-07-01' + interval '3' month
and exists(
select
*
from
lineitem
where
l_orderkey = o_orderkey
and l_commitdate < l_receiptdate
)
group by
o_orderpriority
order by
o_orderpriority"""
self.spark.sql(query).show()
def q5(self):
query = """select
n_name,
sum(l_extendedprice * (1 - l_discount)) as revenue
from
customer,
orders,
lineitem,
supplier,
nation,
region
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and l_suppkey = s_suppkey
and c_nationkey = s_nationkey
and s_nationkey = n_nationkey
and n_regionkey = r_regionkey
and r_name = 'ASIA'
and o_orderdate >= date '1994-01-01'
and o_orderdate < date '1994-01-01' + interval '1' year
group by
n_name
order by
revenue desc"""
self.spark.sql(query).show()
def q6(self):
query = """select
sum(l_extendedprice * l_discount) as revenue
from
lineitem
where
l_shipdate >= date '1994-01-01'
and l_shipdate < date '1994-01-01' + interval '1' year
and l_discount between .06 - 0.01 and .06 + 0.01
and l_quantity < 24"""
self.spark.sql(query).show()
def q7(self):
query = """select
supp_nation,
cust_nation,
l_year,
cast(sum(volume) as double) as revenue
from
(
select
n1.n_name as supp_nation,
n2.n_name as cust_nation,
year(l_shipdate) as l_year,
cast(l_extendedprice * (1 - l_discount) as double) as volume
from
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
where
s_suppkey = l_suppkey
and o_orderkey = l_orderkey
and c_custkey = o_custkey
and s_nationkey = n1.n_nationkey
and c_nationkey = n2.n_nationkey
and(
(n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
or(n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
)
and l_shipdate between date '1995-01-01' and date '1996-12-31'
) as shipping
group by
supp_nation,
cust_nation,
l_year
order by
supp_nation,
cust_nation,
l_year"""
self.spark.sql(query).show()
def q8(self):
query = """select
o_year,
sum(case
when nation = 'BRAZIL' then volume
else 0
end) / sum(volume) as mkt_share
from
(
select
year(o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) as volume,
n2.n_name as nation
from
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
where
p_partkey = l_partkey
and s_suppkey = l_suppkey
and l_orderkey = o_orderkey
and o_custkey = c_custkey
and c_nationkey = n1.n_nationkey
and n1.n_regionkey = r_regionkey
and r_name = 'AMERICA'
and s_nationkey = n2.n_nationkey
and o_orderdate between date '1995-01-01' and date '1996-12-31'
and p_type = 'ECONOMY ANODIZED STEEL'
) as all_nations
group by
o_year
order by
o_year"""
self.spark.sql(query).show()
def q9(self):
query = """select
nation,
o_year,
sum(amount) as sum_profit
from
(
select
n_name as nation,
year(o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
from
part,
supplier,
lineitem,
partsupp,
orders,
nation
where
s_suppkey = l_suppkey
and ps_suppkey = l_suppkey
and ps_partkey = l_partkey
and p_partkey = l_partkey
and o_orderkey = l_orderkey
and s_nationkey = n_nationkey
and p_name like '%green%'
) as profit
group by
nation,
o_year
order by
nation,
o_year desc"""
self.spark.sql(query).show()
def q10(self):
query = """select
c_custkey,
c_name,
sum(l_extendedprice * (1 - l_discount)) as revenue,
c_acctbal,
n_name,
c_address,
c_phone,
c_comment
from
customer,
orders,
lineitem,
nation
where
c_custkey = o_custkey
and l_orderkey = o_orderkey
and o_orderdate >= date '1993-10-01'
and o_orderdate < date '1993-10-01' + interval '3' month
and l_returnflag = 'R'
and c_nationkey = n_nationkey
group by
c_custkey,
c_name,
c_acctbal,
c_phone,
n_name,
c_address,
c_comment
order by
revenue desc"""
self.spark.sql(query).show()
def q11(self):
query = """select
ps_partkey,
sum(ps_supplycost * ps_availqty) as value
from
partsupp,
supplier,
nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'GERMANY'
group by
ps_partkey having
sum(ps_supplycost * ps_availqty) > (
select
sum(ps_supplycost * ps_availqty) * 0.0001000000
from
partsupp,
supplier,
nation
where
ps_suppkey = s_suppkey
and s_nationkey = n_nationkey
and n_name = 'GERMANY'
)
order by
value desc"""
self.spark.sql(query).show()
def q12(self):
query = """select
l_shipmode,
sum(case
when o_orderpriority = '1-URGENT'
or o_orderpriority = '2-HIGH'
then 1
else 0
end) as sum_highorderpriority,
sum(case
when o_orderpriority <> '1-URGENT'
and o_orderpriority <> '2-HIGH'
then 1
else 0
end) as sum_loworderpriority
from
orders,
lineitem
where
o_orderkey = l_orderkey
and l_shipmode in ('MAIL', 'SHIP')
and l_commitdate < l_receiptdate
and l_shipdate < l_commitdate
and l_receiptdate >= date '1994-01-01'
and l_receiptdate < date '1994-01-01' + interval '1' year
group by
l_shipmode
order by
l_shipmode"""
self.spark.sql(query).show()
def q13(self):
query = """select
c_count,
count(*) as custdist
from
(
select
c_custkey,
count(o_orderkey) as c_count
from
customer left outer join orders on
c_custkey = o_custkey
and o_comment not like '%special%requests%'
group by
c_custkey
) as c_orders
group by
c_count
order by
custdist desc,
c_count desc"""
self.spark.sql(query).show()
def q14(self):
query = """
select
100.00 * sum(case
when p_type like 'PROMO%'
then l_extendedprice * (1 - l_discount)
else 0
end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
from
lineitem,
part
where
l_partkey = p_partkey
and l_shipdate >= date '1995-09-01'
and l_shipdate < date '1995-09-01' + interval '1' month"""
self.spark.sql(query).show()
def q15(self):
query = """with revenue0 as
(select
l_suppkey as supplier_no,
sum(l_extendedprice * (1 - l_discount)) as total_revenue
from
lineitem
where
l_shipdate >= date '1996-01-01'
and l_shipdate < date '1996-01-01' + interval '3' month
group by
l_suppkey)
select
s_suppkey,
s_name,
s_address,
s_phone,
total_revenue
from
supplier,
revenue0
where
s_suppkey = supplier_no
and total_revenue = (
select
max(total_revenue)
from
revenue0
)
order by
s_suppkey"""
self.spark.sql(query).show()
def q16(self):
query = """select
p_brand,
p_type,
p_size,
count(distinct ps_suppkey) as supplier_cnt
from
partsupp,
part
where
p_partkey = ps_partkey
and p_brand <> 'Brand#45'
and p_type not like 'MEDIUM POLISHED%'
and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
and ps_suppkey not in (
select
s_suppkey
from
supplier
where
s_comment like '%Customer%Complaints%'
)
group by
p_brand,
p_type,
p_size
order by
supplier_cnt desc,
p_brand,
p_type,
p_size
limit 20"""
self.spark.sql(query).show()
def q17(self):
query = """select
sum(l_extendedprice) / 7.0 as avg_yearly
from
lineitem,
part
where
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container = 'MED BOX'
and l_quantity < (
select
0.2 * avg(l_quantity)
from
lineitem
where
l_partkey = p_partkey
)"""
self.spark.sql(query).show()
def q18(self):
query = """
select
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice,
sum(l_quantity)
from
customer,
orders,
lineitem
where
o_orderkey in (
select
l_orderkey
from
lineitem
group by
l_orderkey having
sum(l_quantity) > 300
)
and c_custkey = o_custkey
and o_orderkey = l_orderkey
group by
c_name,
c_custkey,
o_orderkey,
o_orderdate,
o_totalprice
order by
o_totalprice desc,
o_orderdate"""
self.spark.sql(query).show()
def q19(self):
query = """select
sum(l_extendedprice * (1 - l_discount)) as revenue
from
lineitem,
part
where
(
p_partkey = l_partkey
and p_brand = 'Brand#12'
and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
and l_quantity >= 1 and l_quantity <= 1 + 10
and p_size between 1 and 5
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
and l_quantity >= 10 and l_quantity <= 10 + 10
and p_size between 1 and 10
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#34'
and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
and l_quantity >= 20 and l_quantity <= 20 + 10
and p_size between 1 and 15
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)"""
self.spark.sql(query).show()
def q20(self):
query = """
select
s_name,
s_address
from
supplier,
nation
where
s_suppkey in (
select
ps_suppkey
from
partsupp
where
ps_partkey in (
select
p_partkey
from
part
where
p_name like 'forest%'
)
and ps_availqty > (
select
0.5 * sum(l_quantity)
from
lineitem
where
l_partkey = ps_partkey
and l_suppkey = ps_suppkey
and l_shipdate >= date '1994-01-01'
and l_shipdate < date '1994-01-01' + interval '1' year
)
)
and s_nationkey = n_nationkey
and n_name = 'CANADA'
order by
s_name"""
self.spark.sql(query).show()
def q21(self):
query = """select
s_name,
count(*) as numwait
from
supplier,
lineitem l1,
orders,
nation
where
s_suppkey = l1.l_suppkey
and o_orderkey = l1.l_orderkey
and o_orderstatus = 'F'
and l1.l_receiptdate > l1.l_commitdate
and exists(
select
*
from
lineitem l2
where
l2.l_orderkey = l1.l_orderkey
and l2.l_suppkey <> l1.l_suppkey
)
and not exists(
select
*
from
lineitem l3
where
l3.l_orderkey = l1.l_orderkey
and l3.l_suppkey <> l1.l_suppkey
and l3.l_receiptdate > l3.l_commitdate
)
and s_nationkey = n_nationkey
and n_name = 'SAUDI ARABIA'
group by
s_name
order by
numwait desc,
s_name"""
self.spark.sql(query).show()
def q22(self):
query = """select
cntrycode,
count(*) as numcust,
sum(c_acctbal) as totacctbal
from
(
select
substring(c_phone, 1, 2) as cntrycode,
c_acctbal
from
customer
where
substring(c_phone, 1, 2) in
('13', '31', '23', '29', '30', '18', '17')
and c_acctbal > (
select
avg(c_acctbal)
from
customer
where
c_acctbal > 0.00
and substring(c_phone, 1, 2) in
('13', '31', '23', '29', '30', '18', '17')
)
and not exists(
select
*
from
orders
where
o_custkey = c_custkey
)
) as custsale
group by
cntrycode
order by
cntrycode"""
self.spark.sql(query).show()
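Each qN method here only submits SQL text against the temp views registered in __init__, so once a TpchSqlQueries instance exists, ad-hoc SQL can be run against the same views. A minimal usage sketch; the parquet root path below is illustrative only:

# Hypothetical usage of TpchSqlQueries; the data path is illustrative.
from pyspark.sql import SparkSession
from tpch_sql_queries import TpchSqlQueries

spark = SparkSession.builder.appName("tpch-sql-demo").getOrCreate()
queries = TpchSqlQueries(spark, "/data/tpch/")      # registers the temp views
queries.q6()                                        # runs the TPC-H Q6 text shown above
spark.sql("select count(*) from lineitem").show()   # views remain available for ad-hoc SQL
spark.stop()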

88
benchmark/scala/pom.xml Normal file
View file

@ -0,0 +1,88 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.microsoft.spark</groupId>
<artifactId>microsoft-spark-benchmark</artifactId>
<version>0.1.0</version>
<inceptionYear>2019</inceptionYear>
<properties>
<encoding>UTF-8</encoding>
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<spark.version>2.3.2</spark.version>
</properties>
<pluginRepositories>
<pluginRepository>
<id>scala</id>
<name>Scala Tools</name>
<url>http://scala-tools.org/repo-releases/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</pluginRepository>
</pluginRepositories>
<dependencies>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>${scala.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.4</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.specs</groupId>
<artifactId>specs</artifactId>
<version>1.2.5</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</plugin>
</plugins>
</build>
</project>

View file

@ -0,0 +1,386 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<!--
If you wish to turn off checking for a section of code, you can put a comment in the source
before and after the section, with the following syntax:
// scalastyle:off
... // stuff that breaks the styles
// scalastyle:on
You can also disable only one rule, by specifying its rule id, as specified in:
http://www.scalastyle.org/rules-0.7.0.html
// scalastyle:off no.finalize
override def finalize(): Unit = ...
// scalastyle:on no.finalize
This file is divided into 3 sections:
(1) rules that we enforce.
(2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet
(or we need to make the scalastyle rule more configurable).
(3) rules that we don't want to enforce.
-->
<scalastyle>
<name>Scalastyle standard configuration</name>
<!-- ================================================================================ -->
<!-- rules we enforce -->
<!-- ================================================================================ -->
<check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
<parameters>
<!--parameter name="header"><![CDATA[/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/]]></parameter-->
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
<parameters>
<parameter name="maxLineLength"><![CDATA[100]]></parameter>
<parameter name="tabSize"><![CDATA[2]]></parameter>
<parameter name="ignoreImports">true</parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
<parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
<parameters><parameter name="regex"><![CDATA[(config|[A-Z][A-Za-z]*)]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
<parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters>
</check>
<check customId="argcount" level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
<parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
<parameters>
<parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
<parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
<check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
<parameters>
<parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
<parameters>
<parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
</parameters>
</check>
<!-- ??? usually shouldn't be checked into the code base. -->
<check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
<!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuite directly -->
<check customId="funsuite" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">^FunSuite[A-Za-z]*$</parameter></parameters>
<customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
</check>
<!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' -->
<check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">^println$</parameter></parameters>
<customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with
// scalastyle:off println
println(...)
// scalastyle:on println]]></customMessage>
</check>
<check customId="hadoopconfiguration" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">spark(.sqlContext)?.sparkContext.hadoopConfiguration</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use sparkContext.hadoopConfiguration? In most cases, you should use
spark.sessionState.newHadoopConf() instead, so that the hadoop configurations specified in Spark session
configuration will come into effect.
If you must use sparkContext.hadoopConfiguration, wrap the code block with
// scalastyle:off hadoopconfiguration
spark.sparkContext.hadoopConfiguration...
// scalastyle:on hadoopconfiguration
]]></customMessage>
</check>
<check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">@VisibleForTesting</parameter></parameters>
<customMessage><![CDATA[
@VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615).
]]></customMessage>
</check>
<check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use
ShutdownHookManager.addShutdownHook instead.
If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with
// scalastyle:off runtimeaddshutdownhook
Runtime.getRuntime.addShutdownHook(...)
// scalastyle:on runtimeaddshutdownhook
]]></customMessage>
</check>
<check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use
java.util.concurrent.ConcurrentLinkedQueue instead.
If you must use mutable.SynchronizedBuffer, wrap the code block with
// scalastyle:off mutablesynchronizedbuffer
mutable.SynchronizedBuffer[...]
// scalastyle:on mutablesynchronizedbuffer
]]></customMessage>
</check>
<check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">Class\.forName</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead.
If you must use Class.forName, wrap the code block with
// scalastyle:off classforname
Class.forName(...)
// scalastyle:on classforname
]]></customMessage>
</check>
<check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">Await\.result</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead.
If you must use Await.result, wrap the code block with
// scalastyle:off awaitresult
Await.result(...)
// scalastyle:on awaitresult
]]></customMessage>
</check>
<check customId="awaitready" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">Await\.ready</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Await.ready? In most cases, you should use ThreadUtils.awaitReady instead.
If you must use Await.ready, wrap the code block with
// scalastyle:off awaitready
Await.ready(...)
// scalastyle:on awaitready
]]></customMessage>
</check>
<check customId="caselocale" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">(\.toUpperCase|\.toLowerCase)(?!(\(|\(Locale.ROOT\)))</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use toUpperCase or toLowerCase without the root locale? In most cases, you
should use toUpperCase(Locale.ROOT) or toLowerCase(Locale.ROOT) instead.
If you must use toUpperCase or toLowerCase without the root locale, wrap the code block with
// scalastyle:off caselocale
.toUpperCase
.toLowerCase
// scalastyle:on caselocale
]]></customMessage>
</check>
<check customId="throwerror" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">throw new \w+Error\(</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to throw Error? In most cases, you should use appropriate Exception instead.
If you must throw Error, wrap the code block with
// scalastyle:off throwerror
throw new XXXError(...)
// scalastyle:on throwerror
]]></customMessage>
</check>
<!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters -->
<check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">JavaConversions</parameter></parameters>
<customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import
scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage>
</check>
<check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
<customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage>
</check>
<check customId="extractopt" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">extractOpt</parameter></parameters>
<customMessage>Use jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
is slower. </customMessage>
</check>
<check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true">
<parameters>
<parameter name="groups">java,scala,3rdParty,spark</parameter>
<parameter name="group.java">javax?\..*</parameter>
<parameter name="group.scala">scala\..*</parameter>
<parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter>
<parameter name="group.spark">org\.apache\.spark\..*</parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true">
<parameters>
<parameter name="tokens">COMMA</parameter>
</parameters>
</check>
<!-- SPARK-3854: Single Space between ')' and '{' -->
<check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">\)\{</parameter></parameters>
<customMessage><![CDATA[
Single Space between ')' and `{`.
]]></customMessage>
</check>
<check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters>
<customMessage>Use Javadoc style indentation for multiline comments</customMessage>
</check>
<check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters>
<customMessage>Omit braces in case clauses.</customMessage>
</check>
<!-- SPARK-16877: Avoid Java annotations -->
<check level="error" class="org.scalastyle.scalariform.OverrideJavaChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check>
<!-- ================================================================================ -->
<!-- rules we'd like to enforce, but haven't cleaned up the codebase yet -->
<!-- ================================================================================ -->
<!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
<!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check>
<!-- This breaks symbolic method names so we don't turn it on. -->
<!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. -->
<check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
</parameters>
</check>
<!-- Should turn this on, but we have a few places that need to be fixed first -->
<check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
<!-- ================================================================================ -->
<!-- rules we don't want -->
<!-- ================================================================================ -->
<check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false">
<parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters>
</check>
<!-- We want the opposite of this: NewLineAtEofChecker -->
<check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
<!-- This one complains about all kinds of random things. Disable. -->
<check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>
<!-- We use return quite a bit for control flows and guards -->
<check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check>
<!-- We use null a lot in low level code and to interface with 3rd party code -->
<check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false">
<parameters><parameter name="maxFileLength">800></parameter></parameters>
</check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false">
<parameters><parameter name="maxTypes">30</parameter></parameters>
</check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false">
<parameters><parameter name="maximum">10</parameter></parameters>
</check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false">
<parameters><parameter name="maxLength">50</parameter></parameters>
</check>
<!-- Not exactly feasible to enforce this right now. -->
<!-- It is also infrequent that somebody introduces a new class with a lot of methods. -->
<check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false">
<parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters>
</check>
<!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... -->
<check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
<parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters>
</check>
</scalastyle>


@ -0,0 +1,52 @@
/*
* Licensed to the .NET Foundation under one or more agreements.
* The .NET Foundation licenses this file to you under the MIT license.
* See the LICENSE file in the project root for more information.
*/
package com.microsoft.tpch
import scala.util.Try
import org.apache.spark.sql.SparkSession
object App {
def main(args: Array[String]) {
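// Expected arguments: <tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>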
if (args.length != 4) {
println("Usage:")
println("\t<spark-submit> --master local --class com.microsoft.tpch.App microsoft-spark-examples-<version>.jar")
println("\t\t<tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>")
sys.exit(1)
}
val tpchRoot = args(0)
val queryNumber = args(1).toInt
val numIteration = args(2).toInt
val isSql = Try(args(3).toBoolean).getOrElse(false)
for (i <- 0 until numIteration) {
val spark = SparkSession
.builder()
.appName("TPC-H Benchmark for Scala")
.getOrCreate()
val startTs = System.currentTimeMillis
if (!isSql) {
val tpchFunctional = new TpchFunctionalQueries(spark, tpchRoot)
tpchFunctional.run(queryNumber.toString)
}
else {
}
val endTs = System.currentTimeMillis
val totalTime = endTs - startTs
val typeStr = if (isSql) "SQL"
else "Functional"
println(s"TPCH_Result,Scala,$typeStr,$queryNumber,$i,$totalTime")
spark.stop()
}
}
}


@ -0,0 +1,20 @@
/*
* Licensed to the .NET Foundation under one or more agreements.
* The .NET Foundation licenses this file to you under the MIT license.
* See the LICENSE file in the project root for more information.
*/
package com.microsoft.tpch
import org.apache.spark.sql.{DataFrame, SparkSession}
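// Loads the eight TPC-H tables from Parquet files under tpchRoot (expected to end with a path separator) as DataFrames shared by the query classes.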
class TpchBase(spark: SparkSession, tpchRoot: String) {
val customer: DataFrame = spark.read.parquet(s"${tpchRoot}customer")
val lineitem: DataFrame = spark.read.parquet(s"${tpchRoot}lineitem")
val nation: DataFrame = spark.read.parquet(s"${tpchRoot}nation")
val order: DataFrame = spark.read.parquet(s"${tpchRoot}orders")
val part: DataFrame = spark.read.parquet(s"${tpchRoot}part")
val partsupp: DataFrame = spark.read.parquet(s"${tpchRoot}partsupp")
val region: DataFrame = spark.read.parquet(s"${tpchRoot}region")
val supplier: DataFrame = spark.read.parquet(s"${tpchRoot}supplier")
}


@ -0,0 +1,445 @@
/*
* Licensed to the .NET Foundation under one or more agreements.
* The .NET Foundation licenses this file to you under the MIT license.
* See the LICENSE file in the project root for more information.
*/
package com.microsoft.tpch
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
/*
* Note that the queries are taken from: https://github.com/ssavvides/tpch-spark and updated.
*/
class TpchFunctionalQueries(spark: SparkSession, tpchRoot: String)
extends TpchBase(spark, tpchRoot) {
import spark.implicits._
def run(queryNumber: String): Unit = {
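// Looks up the query method by name (q1 through q22) and invokes it via reflection.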
val method = this.getClass.getMethod("q" + queryNumber)
method.invoke(this)
}
def q1(): Unit = {
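// Q1 (pricing summary report): decrease(x, y) = x * (1 - y) gives the discounted price; increase then applies the tax on top of it.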
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val increase = udf { (x: Double, y: Double) => x * (1 + y) }
lineitem.filter($"l_shipdate" <= "1998-09-02")
.groupBy($"l_returnflag", $"l_linestatus")
.agg(sum($"l_quantity"), sum($"l_extendedprice"),
sum(decrease($"l_extendedprice", $"l_discount")),
sum(increase(decrease($"l_extendedprice", $"l_discount"), $"l_tax")),
avg($"l_quantity"), avg($"l_extendedprice"), avg($"l_discount"), count($"l_quantity"))
.sort($"l_returnflag", $"l_linestatus")
.show()
}
def q2(): Unit = {
val europe = region.filter($"r_name" === "EUROPE")
.join(nation, $"r_regionkey" === nation("n_regionkey"))
.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
.join(partsupp, supplier("s_suppkey") === partsupp("ps_suppkey"))
val brass = part.filter(part("p_size") === 15 && part("p_type").endsWith("BRASS"))
.join(europe, europe("ps_partkey") === $"p_partkey")
val minCost = brass.groupBy(brass("ps_partkey"))
.agg(min("ps_supplycost").as("min"))
brass.join(minCost, brass("ps_partkey") === minCost("ps_partkey"))
.filter(brass("ps_supplycost") === minCost("min"))
.select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment")
.sort($"s_acctbal".desc, $"n_name", $"s_name", $"p_partkey")
.show()
}
def q3(): Unit = {
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val fcust = customer.filter($"c_mktsegment" === "BUILDING")
val forders = order.filter($"o_orderdate" < "1995-03-15")
val flineitems = lineitem.filter($"l_shipdate" > "1995-03-15")
fcust.join(forders, $"c_custkey" === forders("o_custkey"))
.select($"o_orderkey", $"o_orderdate", $"o_shippriority")
.join(flineitems, $"o_orderkey" === flineitems("l_orderkey"))
.select($"l_orderkey",
decrease($"l_extendedprice", $"l_discount").as("volume"),
$"o_orderdate", $"o_shippriority")
.groupBy($"l_orderkey", $"o_orderdate", $"o_shippriority")
.agg(sum($"volume").as("revenue"))
.sort($"revenue".desc, $"o_orderdate")
.show()
}
def q4(): Unit = {
val forders = order.filter($"o_orderdate" >= "1993-07-01" && $"o_orderdate" < "1993-10-01")
val flineitems = lineitem.filter($"l_commitdate" < $"l_receiptdate")
.select($"l_orderkey")
.distinct
flineitems.join(forders, $"l_orderkey" === forders("o_orderkey"))
.groupBy($"o_orderpriority")
.agg(count($"o_orderpriority"))
.sort($"o_orderpriority")
.show()
}
def q5(): Unit = {
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val forders = order.filter($"o_orderdate" < "1995-01-01" && $"o_orderdate" >= "1994-01-01")
region.filter($"r_name" === "ASIA")
.join(nation, $"r_regionkey" === nation("n_regionkey"))
.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
.join(lineitem, $"s_suppkey" === lineitem("l_suppkey"))
.select($"n_name", $"l_extendedprice", $"l_discount", $"l_orderkey", $"s_nationkey")
.join(forders, $"l_orderkey" === forders("o_orderkey"))
.join(customer, $"o_custkey" === customer("c_custkey") && $"s_nationkey" === customer("c_nationkey"))
.select($"n_name", decrease($"l_extendedprice", $"l_discount").as("value"))
.groupBy($"n_name")
.agg(sum($"value").as("revenue"))
.sort($"revenue".desc)
.show()
}
def q6(): Unit = {
lineitem.filter($"l_shipdate" >= "1994-01-01" && $"l_shipdate" < "1995-01-01" && $"l_discount" >= 0.05 && $"l_discount" <= 0.07 && $"l_quantity" < 24)
.agg(sum($"l_extendedprice" * $"l_discount"))
.show()
}
def q7(): Unit = {
val getYear = udf { x: String => x.substring(0, 4) }
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val fnation = nation.filter($"n_name" === "FRANCE" || $"n_name" === "GERMANY")
val fline = lineitem.filter($"l_shipdate" >= "1995-01-01" && $"l_shipdate" <= "1996-12-31")
val supNation = fnation.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
.join(fline, $"s_suppkey" === fline("l_suppkey"))
.select($"n_name".as("supp_nation"), $"l_orderkey", $"l_extendedprice", $"l_discount", $"l_shipdate")
fnation.join(customer, $"n_nationkey" === customer("c_nationkey"))
.join(order, $"c_custkey" === order("o_custkey"))
.select($"n_name".as("cust_nation"), $"o_orderkey")
.join(supNation, $"o_orderkey" === supNation("l_orderkey"))
.filter($"supp_nation" === "FRANCE" && $"cust_nation" === "GERMANY"
|| $"supp_nation" === "GERMANY" && $"cust_nation" === "FRANCE")
.select($"supp_nation", $"cust_nation",
getYear($"l_shipdate").as("l_year"),
decrease($"l_extendedprice", $"l_discount").as("volume"))
.groupBy($"supp_nation", $"cust_nation", $"l_year")
.agg(sum($"volume").as("revenue"))
.sort($"supp_nation", $"cust_nation", $"l_year")
.show()
}
def q8(): Unit = {
val getYear = udf { x: String => x.substring(0, 4) }
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val isBrazil = udf { (x: String, y: Double) => if (x == "BRAZIL") y else 0 }
val fregion = region.filter($"r_name" === "AMERICA")
val forder = order.filter($"o_orderdate" <= "1996-12-31" && $"o_orderdate" >= "1995-01-01")
val fpart = part.filter($"p_type" === "ECONOMY ANODIZED STEEL")
val nat = nation.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
val line = lineitem.select($"l_partkey", $"l_suppkey", $"l_orderkey",
decrease($"l_extendedprice", $"l_discount").as("volume")).
join(fpart, $"l_partkey" === fpart("p_partkey"))
.join(nat, $"l_suppkey" === nat("s_suppkey"))
nation.join(fregion, $"n_regionkey" === fregion("r_regionkey"))
.select($"n_nationkey")
.join(customer, $"n_nationkey" === customer("c_nationkey"))
.select($"c_custkey")
.join(forder, $"c_custkey" === forder("o_custkey"))
.select($"o_orderkey", $"o_orderdate")
.join(line, $"o_orderkey" === line("l_orderkey"))
.select(getYear($"o_orderdate").as("o_year"), $"volume",
isBrazil($"n_name", $"volume").as("case_volume"))
.groupBy($"o_year")
.agg(sum($"case_volume") / sum("volume"))
.sort($"o_year")
.show()
}
def q9(): Unit = {
val getYear = udf { x: String => x.substring(0, 4) }
val expr = udf { (x: Double, y: Double, v: Double, w: Double) => x * (1 - y) - (v * w) }
val linePart = part.filter($"p_name".contains("green"))
.join(lineitem, $"p_partkey" === lineitem("l_partkey"))
val natSup = nation.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
linePart.join(natSup, $"l_suppkey" === natSup("s_suppkey"))
.join(partsupp, $"l_suppkey" === partsupp("ps_suppkey")
&& $"l_partkey" === partsupp("ps_partkey"))
.join(order, $"l_orderkey" === order("o_orderkey"))
.select($"n_name", getYear($"o_orderdate").as("o_year"),
expr($"l_extendedprice", $"l_discount", $"ps_supplycost", $"l_quantity").as("amount"))
.groupBy($"n_name", $"o_year")
.agg(sum($"amount"))
.sort($"n_name", $"o_year".desc)
.show()
}
def q10(): Unit = {
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val flineitem = lineitem.filter($"l_returnflag" === "R")
order.filter($"o_orderdate" < "1994-01-01" && $"o_orderdate" >= "1993-10-01")
.join(customer, $"o_custkey" === customer("c_custkey"))
.join(nation, $"c_nationkey" === nation("n_nationkey"))
.join(flineitem, $"o_orderkey" === flineitem("l_orderkey"))
.select($"c_custkey", $"c_name",
decrease($"l_extendedprice", $"l_discount").as("volume"),
$"c_acctbal", $"n_name", $"c_address", $"c_phone", $"c_comment")
.groupBy($"c_custkey", $"c_name", $"c_acctbal", $"c_phone", $"n_name", $"c_address", $"c_comment")
.agg(sum($"volume").as("revenue"))
.sort($"revenue".desc)
.show()
}
def q11(): Unit = {
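// Q11 (important stock identification): keeps parts whose stock value exceeds 0.0001 of the nation-wide total (mul01 applies the threshold).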
val mul = udf { (x: Double, y: Int) => x * y }
val mul01 = udf { x: Double => x * 0.0001 }
val tmp = nation.filter($"n_name" === "GERMANY")
.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
.select($"s_suppkey")
.join(partsupp, $"s_suppkey" === partsupp("ps_suppkey"))
.select($"ps_partkey", mul($"ps_supplycost", $"ps_availqty").as("value"))
// .cache()
val sumRes = tmp.agg(sum("value").as("total_value"))
tmp.groupBy($"ps_partkey").agg(sum("value").as("part_value"))
.join(sumRes, $"part_value" > mul01($"total_value"))
.sort($"part_value".desc)
.show()
}
def q12(): Unit = {
val highPriority = udf { x: String => if (x == "1-URGENT" || x == "2-HIGH") 1 else 0 }
val lowPriority = udf { x: String => if (x != "1-URGENT" && x != "2-HIGH") 1 else 0 }
lineitem.filter((
$"l_shipmode" === "MAIL" || $"l_shipmode" === "SHIP") &&
$"l_commitdate" < $"l_receiptdate" &&
$"l_shipdate" < $"l_commitdate" &&
$"l_receiptdate" >= "1994-01-01" && $"l_receiptdate" < "1995-01-01")
.join(order, $"l_orderkey" === order("o_orderkey"))
.select($"l_shipmode", $"o_orderpriority")
.groupBy($"l_shipmode")
.agg(sum(highPriority($"o_orderpriority")).as("sum_highorderpriority"),
sum(lowPriority($"o_orderpriority")).as("sum_loworderpriority"))
.sort($"l_shipmode")
.show()
}
def q13(): Unit = {
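// Q13: the 'special' UDF flags order comments matching ".*special.*requests.*", which the join condition excludes.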
val special = udf { x: String => x.matches(".*special.*requests.*") }
customer.join(order, $"c_custkey" === order("o_custkey")
&& !special(order("o_comment")), "left_outer")
.groupBy($"o_custkey")
.agg(count($"o_orderkey").as("c_count"))
.groupBy($"c_count")
.agg(count($"o_custkey").as("custdist"))
.sort($"custdist".desc, $"c_count".desc)
.show()
}
def q14(): Unit = {
val reduce = udf { (x: Double, y: Double) => x * (1 - y) }
val promo = udf { (x: String, y: Double) => if (x.startsWith("PROMO")) y else 0 }
part.join(lineitem, $"l_partkey" === $"p_partkey" &&
$"l_shipdate" >= "1995-09-01" && $"l_shipdate" < "1995-10-01")
.select($"p_type", reduce($"l_extendedprice", $"l_discount").as("value"))
.agg(sum(promo($"p_type", $"value")) * 100 / sum($"value"))
.show()
}
def q15(): Unit = {
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
val revenue = lineitem.filter($"l_shipdate" >= "1996-01-01" &&
$"l_shipdate" < "1996-04-01")
.select($"l_suppkey", decrease($"l_extendedprice", $"l_discount").as("value"))
.groupBy($"l_suppkey")
.agg(sum($"value").as("total"))
revenue.agg(max($"total").as("max_total"))
.join(revenue, $"max_total" === revenue("total"))
.join(supplier, $"l_suppkey" === supplier("s_suppkey"))
.select($"s_suppkey", $"s_name", $"s_address", $"s_phone", $"total")
.sort($"s_suppkey")
.show()
}
def q16(): Unit = {
val complains = udf { x: String => x.matches(".*Customer.*Complaints.*") }
val polished = udf { x: String => x.startsWith("MEDIUM POLISHED") }
val numbers = udf { x: Int => x.toString.matches("49|14|23|45|19|3|36|9") }
val fparts = part.filter(($"p_brand" =!= "Brand#45") && !polished($"p_type") &&
numbers($"p_size"))
.select($"p_partkey", $"p_brand", $"p_type", $"p_size")
supplier.filter(!complains($"s_comment"))
.join(partsupp, $"s_suppkey" === partsupp("ps_suppkey"))
.select($"ps_partkey", $"ps_suppkey")
.join(fparts, $"ps_partkey" === fparts("p_partkey"))
.groupBy($"p_brand", $"p_type", $"p_size")
.agg(countDistinct($"ps_suppkey").as("supplier_count"))
.sort($"supplier_count".desc, $"p_brand", $"p_type", $"p_size")
.show()
}
def q17(): Unit = {
val mul02 = udf { x: Double => x * 0.2 }
val fpart = part.filter($"p_brand" === "Brand#23" && $"p_container" === "MED BOX")
.select($"p_partkey")
.join(lineitem, $"p_partkey" === lineitem("l_partkey"), "left_outer")
fpart.groupBy("p_partkey")
.agg(mul02(avg($"l_quantity")).as("avg_quantity"))
.select($"p_partkey".as("key"), $"avg_quantity")
.join(fpart, $"key" === fpart("p_partkey"))
.filter($"l_quantity" < $"avg_quantity")
.agg(sum($"l_extendedprice") / 7.0)
.show()
}
def q18(): Unit = {
lineitem.groupBy($"l_orderkey")
.agg(sum($"l_quantity").as("sum_quantity"))
.filter($"sum_quantity" > 300)
.select($"l_orderkey".as("key"), $"sum_quantity")
.join(order, order("o_orderkey") === $"key")
.join(lineitem, $"o_orderkey" === lineitem("l_orderkey"))
.join(customer, customer("c_custkey") === $"o_custkey")
.select($"l_quantity", $"c_name", $"c_custkey", $"o_orderkey", $"o_orderdate", $"o_totalprice")
.groupBy($"c_name", $"c_custkey", $"o_orderkey", $"o_orderdate", $"o_totalprice")
.agg(sum("l_quantity"))
.sort($"o_totalprice".desc, $"o_orderdate")
.show()
}
def q19(): Unit = {
val sm = udf { x: String => x.matches("SM CASE|SM BOX|SM PACK|SM PKG") }
val md = udf { x: String => x.matches("MED BAG|MED BOX|MED PKG|MED PACK") }
val lg = udf { x: String => x.matches("LG CASE|LG BOX|LG PACK|LG PKG") }
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
part.join(lineitem, $"l_partkey" === $"p_partkey")
.filter(($"l_shipmode" === "AIR" || $"l_shipmode" === "AIR REG") &&
$"l_shipinstruct" === "DELIVER IN PERSON")
.filter(
(($"p_brand" === "Brand#12") &&
sm($"p_container") &&
$"l_quantity" >= 1 && $"l_quantity" <= 11 &&
$"p_size" >= 1 && $"p_size" <= 5) ||
(($"p_brand" === "Brand#23") &&
md($"p_container") &&
$"l_quantity" >= 10 && $"l_quantity" <= 20 &&
$"p_size" >= 1 && $"p_size" <= 10) ||
(($"p_brand" === "Brand#34") &&
lg($"p_container") &&
$"l_quantity" >= 20 && $"l_quantity" <= 30 &&
$"p_size" >= 1 && $"p_size" <= 15))
.select(decrease($"l_extendedprice", $"l_discount").as("volume"))
.agg(sum("volume"))
.show()
}
def q20(): Unit = {
val forest = udf { x: String => x.startsWith("forest") }
val flineitem = lineitem.filter($"l_shipdate" >= "1994-01-01" && $"l_shipdate" < "1995-01-01")
.groupBy($"l_partkey", $"l_suppkey")
.agg((sum($"l_quantity") * 0.5).as("sum_quantity"))
val fnation = nation.filter($"n_name" === "CANADA")
val nat_supp = supplier.select($"s_suppkey", $"s_name", $"s_nationkey", $"s_address")
.join(fnation, $"s_nationkey" === fnation("n_nationkey"))
part.filter(forest($"p_name"))
.select($"p_partkey").distinct
.join(partsupp, $"p_partkey" === partsupp("ps_partkey"))
.join(flineitem, $"ps_suppkey" === flineitem("l_suppkey") && $"ps_partkey" === flineitem("l_partkey"))
.filter($"ps_availqty" > $"sum_quantity")
.select($"ps_suppkey").distinct
.join(nat_supp, $"ps_suppkey" === nat_supp("s_suppkey"))
.select($"s_name", $"s_address")
.sort($"s_name")
.show()
}
def q21(): Unit = {
val fsupplier = supplier.select($"s_suppkey", $"s_nationkey", $"s_name")
val plineitem = lineitem.select($"l_suppkey", $"l_orderkey", $"l_receiptdate", $"l_commitdate")
val flineitem = plineitem.filter($"l_receiptdate" > $"l_commitdate")
val line1 = plineitem.groupBy($"l_orderkey")
.agg(countDistinct($"l_suppkey").as("suppkey_count"), max($"l_suppkey").as("suppkey_max"))
.select($"l_orderkey".as("key"), $"suppkey_count", $"suppkey_max")
val line2 = flineitem.groupBy($"l_orderkey")
.agg(countDistinct($"l_suppkey").as("suppkey_count"), max($"l_suppkey").as("suppkey_max"))
.select($"l_orderkey".as("key"), $"suppkey_count", $"suppkey_max")
val forder = order.select($"o_orderkey", $"o_orderstatus")
.filter($"o_orderstatus" === "F")
nation.filter($"n_name" === "SAUDI ARABIA")
.join(fsupplier, $"n_nationkey" === fsupplier("s_nationkey"))
.join(flineitem, $"s_suppkey" === flineitem("l_suppkey"))
.join(forder, $"l_orderkey" === forder("o_orderkey"))
.join(line1, $"l_orderkey" === line1("key"))
.filter($"suppkey_count" > 1 || ($"suppkey_count" == 1 && $"l_suppkey" == $"max_suppkey"))
.select($"s_name", $"l_orderkey", $"l_suppkey")
.join(line2, $"l_orderkey" === line2("key"), "left_outer")
.select($"s_name", $"l_orderkey", $"l_suppkey", $"suppkey_count", $"suppkey_max")
.filter($"suppkey_count" === 1 && $"l_suppkey" === $"suppkey_max")
.groupBy($"s_name")
.agg(count($"l_suppkey").as("numwait"))
.sort($"numwait".desc, $"s_name")
.show()
}
def q22(): Unit = {
val sub2 = udf { x: String => x.substring(0, 2) }
val phone = udf { x: String => x.matches("13|31|23|29|30|18|17") }
val fcustomer = customer.select($"c_acctbal", $"c_custkey", sub2($"c_phone").as("cntrycode"))
.filter(phone($"cntrycode"))
val avg_customer = fcustomer.filter($"c_acctbal" > 0.0)
.agg(avg($"c_acctbal").as("avg_acctbal"))
order.groupBy($"o_custkey")
.agg($"o_custkey").select($"o_custkey")
.join(fcustomer, $"o_custkey" === fcustomer("c_custkey"), "right_outer")
.filter($"o_custkey".isNull)
.join(avg_customer)
.filter($"c_acctbal" > $"avg_acctbal")
.groupBy($"cntrycode")
.agg(count($"c_acctbal"), sum($"c_acctbal"))
.sort($"cntrycode")
.show()
}
}

204
deployment/README.md Normal file

@ -0,0 +1,204 @@
Deploying your App on the Cloud
===
# Table of Contents
- [Pre-requisites](#pre-requisites)
- [Preparing Worker Dependencies](#preparing-worker-dependencies)
- [Cloud Deployment](#cloud-deployment)
- [Azure HDInsight Spark](#azure-hdinsight-spark)
- [Deploy Worker to Spark Cluster](#deploy-microsoftsparkworker)
- [App deployment using spark-submit](#using-spark-submit)
- [App deployment using Apache Livy](#using-apache-livy)
- [Amazon EMR Spark](#amazon-emr-spark)
- [Deploy Worker to Spark Cluster](#deploy-microsoftsparkworker-1)
- [App deployment using spark-submit](#using-spark-submit-1)
- [App deployment using Amazon EMR Steps](#using-amazon-emr-steps)
- [Databricks (Azure & AWS)](#databricks)
- [Deploy Worker to Spark Cluster](#deploy-microsoftsparkworker-2)
- [App deployment using spark-submit](#using-spark-submit-2)
# Pre-requisites:
1. Clone and successfully build [Spark .NET](https://github.com/dotnet) by following the [Quick Start instructions](https://github.com/dotnet/spark#quick-start-tldr).
2. Download and install [.NET Core](https://dotnet.microsoft.com/download) <span style="color: red">2.1+</span> for your operating system.
3. Tool for creating a `tgz` file: `tar` on Linux, [7-ZIP](https://www.7-zip.org/) on Windows, etc.
4. Tool to copy files to a distributed file system.
- ADLS, WASB &rarr; [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/)
- S3 &rarr; [AWS CLI](https://aws.amazon.com/cli/)
5. Download [install-worker.sh](install-worker.sh) to your local machine. This is a helper script that we will use later in the installation section to copy Spark .NET dependent files into your Spark cluster's worker nodes. install-worker.sh takes in three parameters:
1. The Cloud Provider: `azure` or `aws`
2. URI where `worker.tgz` is uploaded.
3. Path on the executor node where the worker package will be installed (the path should be a directory that the `yarn` user has access to).
Example Usage:
```shell
install-worker.sh azure adl://<cluster name>.azuredatalakestore.net/<some dir>/worker.tgz /usr/local/bin
```
# Preparing Worker Dependencies
Microsoft.Spark.Worker is a backend component that lives on the individual worker nodes of your Spark cluster. When you want to execute a C# UDF (user-defined function), Spark needs to understand how to launch the .NET CLR to execute this UDF. Microsoft.Spark.Worker provides a collection of classes to Spark that enable this functionality.
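As a rough illustration (the file name, column name, and app name below are hypothetical, not taken from this repo), a query like the following is what triggers that hand-off: the lambda inside `Udf` cannot run inside the JVM, so each executor forwards the relevant rows to Microsoft.Spark.Worker, which hosts the .NET runtime that evaluates it.
```csharp
// Minimal sketch of a C# UDF; "people.json" and the "name" column are placeholders.
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

class UdfSketch
{
    static void Main(string[] args)
    {
        SparkSession spark = SparkSession.Builder().AppName("udf-sketch").GetOrCreate();
        DataFrame df = spark.Read().Json("people.json");

        // The lambda below runs in the .NET CLR hosted by Microsoft.Spark.Worker
        // on each worker node, not inside the JVM executor itself.
        Func<Column, Column> shout = Udf<string, string>(s => s.ToUpper());
        df.Select(shout(df["name"])).Show();

        spark.Stop();
    }
}
```
The steps below publish and package Microsoft.Spark.Worker so that every worker node has it available at run time.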
## Microsoft.Spark.Worker
1. Publish Microsoft.Spark.Worker as self-contained.
```shell
# For example, you can run the following on Linux.
foo@bar:~/dotnet/spark/src/csharp/Microsoft.Spark.Worker$ dotnet publish -c Release -f netcoreapp2.1 -r ubuntu.16.04-x64
```
> **Note**: Ensure that the correct [dotnet Runtime Identifier](https://github.com/dotnet/corefx/blob/master/pkg/Microsoft.NETCore.Platforms/runtime.json) is used for your cluster.
2. Produce `worker.tgz` for the published files.
```shell
# For example, you can run the following on Linux using `tar`.
foo@bar:~/dotnet/spark/src/csharp/Microsoft.Spark.Worker$ tar czvf worker.tgz -C bin/Release/netcoreapp2.1/ubuntu.16.04-x64/publish/ .
```
3. Upload `worker.tgz` and [install-worker.sh](install-worker.sh) to a distributed file system (e.g., HDFS, WASB, ADLS, S3) that your cluster has access to.
## Your Spark .NET `app`
1. Publish your Spark .NET `app` as self-contained.
```shell
# For example, you can run the following on Linux.
foo@bar:~/path/to/app$ dotnet publish -c Release -f netcoreapp2.1 -r ubuntu.16.04-x64
```
2. Produce `<your app>.zip` for the published files.
```shell
# For example, you can run the following on Linux using `zip`.
foo@bar:~/path/to/app/bin/Release/netcoreapp2.1/ubuntu.16.04-x64/publish$ zip -r <your app>.zip .
```
3. Upload the following to a distributed file system (e.g., HDFS, WASB, ADLS, S3) that your cluster has access to:
* `microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar` (created in the [Build](../README.md#build) step)
* `<your app>.zip`
* Files (e.g., dependency files, common data accessible to every worker) or Assemblies (e.g., DLLs that contain your user-defined functions, libraries that your `app` depends on) to be placed in the working directory of each executor.
# Cloud Deployment
## Azure HDInsight Spark
[Azure HDInsight Spark](https://docs.microsoft.com/en-us/azure/hdinsight/spark/apache-spark-overview) is the Microsoft implementation of Apache Spark in the cloud that allows users to launch and configure Spark clusters in Azure. You can use HDInsight Spark clusters to process your data stored in Azure (e.g., [Azure Storage](https://azure.microsoft.com/en-us/services/storage/) and [Azure Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)).
> **Note**: Azure HDInsight Spark is Linux-based. Therefore, if you are interested in deploying your app to Azure HDInsight Spark, make sure your app is .NET Standard compatible and that you use [.NET Core compiler](https://dotnet.microsoft.com/download) to compile your app.
### Deploy Microsoft.Spark.Worker
*Note that this step is required only once*
#### Run HDInsight Script Action
Run `install-worker.sh` on the cluster using [HDInsight Script Actions](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-linux):
* Script type: Custom
* Name: Install Microsoft.Spark.Worker (or anything that is descriptive)
* Bash script URI: The URI to which you uploaded `install-worker.sh` (e.g. adl://\<cluster name\>.azuredatalakestore.net/\<some dir\>/install-worker.sh)
* Node type(s): Worker
* Parameters: Parameters to `install-worker.sh`. For example, if you uploaded to Azure Data Lake then it would be `azure adl://<cluster name>.azuredatalakestore.net/<some dir>/worker.tgz /usr/local/bin`.
The following captures the settings for an HDInsight Script Action:
<img src="../docs/img/deployment-hdi-action-script.png" alt="ScriptActionImage" width="500"/>
### Run your app on the cloud!
#### Using [spark-submit](https://spark.apache.org/docs/latest/submitting-applications.html)
1. `ssh` into one of the head nodes in the cluster.
2. Run `spark-submit`:
```shell
foo@bar:~$ $SPARK_HOME/bin/spark-submit \
--master yarn \
--class org.apache.spark.deploy.DotnetRunner \
--files <comma-separated list of assemblies that contain UDF definitions, if any> \
adl://<cluster name>.azuredatalakestore.net/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar \
adl://<cluster name>.azuredatalakestore.net/<some dir>/<your app>.zip <your app> <app arg 1> <app arg 2> ... <app arg n>
```
#### Using [Apache Livy](https://livy.incubator.apache.org/)
You can use Apache Livy, the Apache Spark REST API, to submit Spark .NET jobs to an Azure HDInsight Spark cluster as documented in [Remote jobs with Apache Livy](https://docs.microsoft.com/en-us/azure/hdinsight/spark/apache-spark-livy-rest-interface).
```shell
# For example, you can run the following on Linux using `curl`.
foo@bar:~$ curl -k -v -X POST "https://<your spark cluster>.azurehdinsight.net/livy/batches" \
-u "<hdinsight username>:<hdinsight password>" \
-H "Content-Type: application/json" \
-H "X-Requested-By: <hdinsight username>" \
-d @- << EOF
{
"file":"adl://<cluster name>.azuredatalakestore.net/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar",
"className":"org.apache.spark.deploy.DotnetRunner",
"files":["adl://<cluster name>.azuredatalakestore.net/<some dir>/<udf assembly>", "adl://<cluster name>.azuredatalakestore.net/<some dir>/<file>"],
"args":["adl://<cluster name>.azuredatalakestore.net/<some dir>/<your app>.zip","<your app>","<app arg 1>","<app arg 2>,"...","<app arg n>"]
}
EOF
```
## Amazon EMR Spark
[Amazon EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-what-is-emr.html) is a managed cluster platform that simplifies running big data frameworks on AWS.
> **Note**: AWS EMR Spark is Linux-based. Therefore, if you are interested in deploying your app to AWS EMR Spark, make sure your app is .NET Standard compatible and that you use [.NET Core compiler](https://dotnet.microsoft.com/download) to compile your app.
### Deploy Microsoft.Spark.Worker
*Note that this step is only required at cluster creation*
#### Create cluster using Amazon EMR Bootstrap Actions
Run `install-worker.sh` during cluster creation using [Bootstrap Actions](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-bootstrap.html).
```shell
# For example, you can run the following on Linux using `aws` cli.
foo@bar:~$ aws emr create-cluster \
--name "Test cluster" \
--release-label emr-5.23.0 \
--use-default-roles \
--ec2-attributes KeyName=myKey \
--applications Name=Spark \
--instance-count 3 \
--instance-type m1.medium \
--bootstrap-actions Path=s3://mybucket/<some dir>/install-worker.sh,Name="Install Microsoft.Spark.Worker",Args=["aws","s3://mybucket/<some dir>/worker.tgz","/usr/local/bin"]
```
### Run your app on the cloud!
Upload the following to an S3 bucket your cluster has access to:
* `microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar` (created in the [Build](../README.md#build) step)
* `<your app>.zip`
* Files (e.g., dependency files, common data accessible to every worker) or Assemblies (e.g., DLLs that contain your user-defined functions, libraries that your `app` depends on) to be placed in the working directory of each executor.
#### Using [spark-submit](https://spark.apache.org/docs/latest/submitting-applications.html)
1. `ssh` into one of the nodes in the cluster.
2. Run `spark-submit`:
```shell
foo@bar:~$ spark-submit \
--master yarn \
--class org.apache.spark.deploy.DotnetRunner \
--files <comma-separated list of assemblies that contain UDF definitions, if any> \
s3://mybucket/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar \
s3://mybucket/<some dir>/<your app>.zip <your app> <app args>
```
#### Using [Amazon EMR Steps](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-submit-step.html)
Amazon EMR Steps can be used to submit jobs to the Spark framework installed on the EMR cluster.
```bash
# For example, you can run the following on Linux using `aws` cli.
foo@bar:~$ aws emr add-steps \
--cluster-id j-xxxxxxxxxxxxx \
--steps Type=spark,Name="Spark Program",Args=[--master,yarn,--files,s3://mybucket/<some dir>/<udf assembly>,--class,org.apache.spark.deploy.DotnetRunner,s3://mybucket/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar,s3://mybucket/<some dir>/<your app>.zip,<your app>,<app arg 1>,<app arg 2>,...,<app arg n>],ActionOnFailure=CONTINUE
```
## Databricks
[Databricks](http://databricks.com) is a platform that provides cloud-based big data processing using Apache Spark.
> **Note**: [Azure](https://azure.microsoft.com/en-us/services/databricks/) and [AWS](https://databricks.com/aws) Databricks is Linux-based. Therefore, if you are interested in deploying your app to Databricks, make sure your app is .NET Standard compatible and that you use [.NET Core compiler](https://dotnet.microsoft.com/download) to compile your app.
### Deploy Microsoft.Spark.Worker
*Note that this step is required only once*
#### Cluster Node Initialization Scripts
Using Databricks' [init script](https://docs.databricks.com/user-guide/clusters/init-scripts.html) mechanism, we will run a shell script during startup for each cluster node before the Spark driver or worker JVM starts.
1. Configure your [Data Source](https://docs.databricks.com/spark/latest/data-sources/index.html) and mount it using [Databricks File System](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#dbfs).
2. Use the following [init script](https://docs.databricks.com/user-guide/clusters/init-scripts.html) to install `Microsoft.Spark.Worker` on the cluster nodes.
```scala
dbutils.fs.put("dbfs:/databricks/<cluster-scoped or global path>/install-worker-wrapper.sh" ,"""
#!/bin/bash
set +e
/bin/bash /dbfs/<your mount>/<path to>/install-worker.sh local /dbfs/<your mount>/<path to>/worker.tgz /usr/local/bin
""", true)
```
3. Restart the cluster.
### Run your app on the cloud!
#### Using [spark-submit](https://spark.apache.org/docs/latest/submitting-applications.html)
1. [Create a Job](https://docs.databricks.com/user-guide/jobs.html) and select *Configure spark-submit*.
2. Configure `spark-submit` with the following parameters:
```shell
["--files","/dbfs/<your mount>/<path-to>/<app assembly/file to deploy to worker>","--class"," org.apache.spark.deploy.DotnetRunner","/dbfs/<your mount>/<path to>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar","/dbfs/<your mount>/<path to>/<app name>.zip","<app bin name>","app arg1","app arg2"]
```


@ -0,0 +1,40 @@
#!/bin/bash
set +e
# Cloud Provider
CLOUD_PROVIDER=$1
# Path where packaged worker file (tgz) exists.
WORKER_PATH=$2
# The path on the executor nodes where Microsoft.Spark.Worker executable is installed.
DEST_PATH=$3
# The path where all the dependent libraries are installed so that it doesn't
# pollute the $DEST_PATH.
DEST_PATH_BINARIES=$DEST_PATH/microsoft.spark.worker
# Temporary worker file.
TEMP_WORKER_FILENAME=/tmp/temp_worker.tgz
# Clean up any existing files.
sudo rm -f $DEST_PATH/Microsoft.Spark.Worker
sudo rm -rf $DEST_PATH_BINARIES
# Copy the worker file to a local temporary file.
if [ "${CLOUD_PROVIDER,,}" = "azure" ]; then
hdfs dfs -get $WORKER_PATH $TEMP_WORKER_FILENAME
elif [ "${CLOUD_PROVIDER,,}" = "aws" ]; then
aws s3 cp $WORKER_PATH $TEMP_WORKER_FILENAME
else
cp -f $WORKER_PATH $TEMP_WORKER_FILENAME
fi
# Untar the file.
sudo mkdir -p $DEST_PATH_BINARIES
sudo tar xzf $TEMP_WORKER_FILENAME -C $DEST_PATH_BINARIES
# Make the file executable since dotnet doesn't set this correctly.
sudo chmod 755 $DEST_PATH_BINARIES/Microsoft.Spark.Worker
# Create a symlink.
sudo ln -sf $DEST_PATH_BINARIES/Microsoft.Spark.Worker $DEST_PATH/Microsoft.Spark.Worker
# Remove the temporary worker file.
sudo rm $TEMP_WORKER_FILENAME

14
dev/.scalafmt.conf Normal file

@ -0,0 +1,14 @@
# The following configs are taken from https://github.com/apache/spark/blob/master/dev/.scalafmt.conf
align = none
align.openParenDefnSite = false
align.openParenCallSite = false
align.tokens = []
optIn = {
configStyleArguments = false
}
danglingParentheses = false
docstrings = JavaDoc
maxColumn = 98
# The following are specific to dotnet/spark.
importSelectors = singleLine


@ -0,0 +1,247 @@
Building Spark .NET on Ubuntu 18.04
==========================
# Table of Contents
- [Open Issues](#open-issues)
- [Pre-requisites](#pre-requisites)
- [Building](#building)
- [Building Spark .NET Scala Extensions Layer](#building-spark-net-scala-extensions-layer)
- [Building .NET Sample Applications using .NET Core CLI](#building-net-sample-applications-using-net-core-cli)
- [Run Samples](#run-samples)
# Open Issues:
- [Building through Visual Studio Code]()
# Pre-requisites:
If you already have all the pre-requisites, skip to the [build](ubuntu-instructions.md#building) steps below.
1. Download and install **[.NET Core 2.1 SDK](https://dotnet.microsoft.com/download/dotnet-core/2.1)** or the **[.NET Core 3.0 preview SDK](https://dotnet.microsoft.com/download/dotnet-core/3.0)** - installing the SDK will add the `dotnet` toolchain to your path.
2. Install **[OpenJDK 8](https://openjdk.java.net/install/)**
- You can use the following command:
```bash
sudo apt install openjdk-8-jdk
```
- Verify you are able to run `java` from your command-line
<details>
<summary>&#x1F4D9; Click to see sample java -version output</summary>
```
openjdk version "1.8.0_191"
OpenJDK Runtime Environment (build 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12)
OpenJDK 64-Bit Server VM (build 25.191-b12, mixed mode)
```
</details>
- If you already have multiple OpenJDK versions installed and want to select OpenJDK 8, use the following command:
```bash
sudo update-alternatives --config java
```
3. Install **[Apache Maven 3.6.0+](https://maven.apache.org/download.cgi)**
- Run the following command:
```bash
mkdir -p ~/bin/maven
cd ~/bin/maven
wget https://www-us.apache.org/dist/maven/maven-3/3.6.0/binaries/apache-maven-3.6.0-bin.tar.gz
tar -xvzf apache-maven-3.6.0-bin.tar.gz
ln -s apache-maven-3.6.0 current
export M2_HOME=~/bin/maven/current
export PATH=${M2_HOME}/bin:${PATH}
source ~/.bashrc
```
Note that these environment variables will be lost when you close your terminal. If you want the changes to be permanent, add the `export` lines to your `~/.bashrc` file.
- Verify you are able to run `mvn` from your command-line
<details>
<summary>&#x1F4D9; Click to see sample mvn -version output</summary>
```
Apache Maven 3.6.0 (97c98ec64a1fdfee7767ce5ffb20918da4f719f3; 2018-10-24T18:41:47Z)
Maven home: ~/bin/apache-maven-3.6.0
Java version: 1.8.0_191, vendor: Oracle Corporation, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre
Default locale: en, platform encoding: UTF-8
OS name: "linux", version: "4.4.0-17763-microsoft", arch: "amd64", family: "unix"
```
</details>
4. Install **[Apache Spark 2.3+](https://spark.apache.org/downloads.html)**
- Download [Apache Spark 2.3+](https://spark.apache.org/downloads.html) and extract it into a local folder (e.g., `~/bin/spark-2.3.2-bin-hadoop2.7`)
- Add the necessary [environment variables](https://www.java.com/en/download/help/path.xml) `SPARK_HOME` e.g., `~/bin/spark-2.3.2-bin-hadoop2.7/`
```bash
export SPARK_HOME=~/bin/spark-2.3.2-bin-hadoop2.7
export PATH="$SPARK_HOME/bin:$PATH"
source ~/.bashrc
```
Note that these environment variables will be lost when you close your terminal. If you want the changes to be permanent, add the `export` lines to your `~/.bashrc` file.
- Verify you are able to run `spark-shell` from your command-line
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.3.2
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_201)
Type in expressions to have them evaluated.
Type :help for more information.
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6eaa6b0c
```
</details>
Please make sure you are able to run `dotnet`, `java`, `mvn`, `spark-shell` from your command-line before you move to the next section. Feel there is a better way? Please [open an issue](https://github.com/dotnet/spark/issues) and feel free to contribute.
# Building
For the rest of this section, it is assumed that you have cloned the Spark .NET repo onto your machine, e.g., `~/dotnet.spark/`
```
git clone https://github.com/dotnet/spark.git ~/dotnet.spark
```
## Building Spark .NET Scala Extensions Layer
When you submit a .NET application, Spark .NET has the necessary logic written in Scala that informs Apache Spark how to handle your requests (e.g., a request to create a new Spark Session, a request to transfer data from the .NET side to the JVM side, etc.). This logic can be found in the [Spark .NET Scala Source Code](../../../src/scala).
Let us now build the Spark .NET Scala extension layer. This is easy to do:
```
cd src/scala
mvn clean package
```
You should see JARs created for the supported Spark versions:
* `microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-<version>.jar`
* `microsoft-spark-2.4.x/target/microsoft-spark-2.4.x-<version>.jar`
## Building .NET Sample Applications using .NET Core CLI
1. Build the Worker
```bash
cd ~/dotnet.spark/src/csharp/Microsoft.Spark.Worker/
dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64
```
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
user@machine:/home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker$ dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64
Welcome to .NET Core!
---------------------
Learn more about .NET Core: https://aka.ms/dotnet-docs
Use 'dotnet --help' to see available commands or visit: https://aka.ms/dotnet-cli-docs
...
output omitted
...
Restore completed in 20.09 sec for /home/user/dotnet.spark/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj.
Installing runtime.linux-x64.Microsoft.NETCore.DotNetAppHost 2.1.9.
Installing runtime.linux-x64.Microsoft.NETCore.DotNetHostResolver 2.1.9.
Installing runtime.linux-x64.Microsoft.NETCore.DotNetHostPolicy 2.1.9.
Installing runtime.linux-x64.Microsoft.NETCore.App 2.1.9.
Generating MSBuild file /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/obj/Microsoft.Spark.Worker.csproj.nuget.g.props.
Generating MSBuild file /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/obj/Microsoft.Spark.Worker.csproj.nuget.g.targets.
Restore completed in 37.09 sec for /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj.
Microsoft.Spark -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark/bin/Debug/netstandard2.0/Microsoft.Spark.dll
Microsoft.Spark.Worker -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/Microsoft.Spark.Worker.dll
Microsoft.Spark.Worker -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/
```
</details>
2. Build the Samples
**.NET Core 2.1.x**
Due to a bug in the .NET Core 2.1.x CLI that causes problems when building a dependency project that creates executables, we have to resort to modifying the `.csproj` file. We are working with the .NET team to resolve this.
```
cd ~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/
cat Microsoft.Spark.CSharp.Examples.csproj | grep -v "Microsoft.Spark.Worker.csproj" > Microsoft.Spark.CSharp.Examples.Patched.csproj
dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64 Microsoft.Spark.CSharp.Examples.Patched.csproj
```
**.NET Core 3.x**
If you are using .NET Core 3.x, you can avoid creating a new patched `.csproj` file and instead compile the project directly:
```
cd ~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/
dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64 Microsoft.Spark.CSharp.Examples.csproj
```
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
user@machine:/home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples$ dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64 Microsoft.Spark.CSharp.Examples.Patched.csproj
Microsoft (R) Build Engine version 15.9.20+g88f5fadfbe for .NET Core
Copyright (C) Microsoft Corporation. All rights reserved.
Restoring packages for /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/Microsoft.Spark.CSharp.Examples.Patched.csproj...
Restore completed in 53 ms for /home/user/dotnet.spark/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj.
Generating MSBuild file /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/obj/Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.props.
Generating MSBuild file /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/obj/Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.targets.
Restore completed in 305.72 ms for /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/Microsoft.Spark.CSharp.Examples.Patched.csproj.
Microsoft.Spark -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark/bin/Debug/netstandard2.0/Microsoft.Spark.dll
Microsoft.Spark.CSharp.Examples.Patched -> /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/Microsoft.Spark.CSharp.Examples.dll
Microsoft.Spark.CSharp.Examples.Patched -> /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/
```
</details>
3. Manually copy Worker binaries into the Samples output location.
```
cp ~/dotnet.spark/src/csharp/Microsoft.Spark.Worker/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/* ~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/
```
# Run Samples
Once you build the samples, you can use `spark-submit` to submit your .NET Core apps. Make sure you have followed the [pre-requisites](#pre-requisites) section and installed Apache Spark.
1. Open a terminal and go to the directory where your app binary has been generated (e.g., `~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish`)
2. Running your app follows the basic structure:
```bash
spark-submit \
[--jars <any-jars-your-app-is-dependent-on>] \
--class org.apache.spark.deploy.DotnetRunner \
--master local \
<path-to-microsoft-spark-jar> \
<path-to-your-app-binary> <argument(s)-to-your-app>
```
Here are some examples you can run:
- **[Microsoft.Spark.Examples.Sql.Basic](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Basic.cs)**
```bash
spark-submit \
--class org.apache.spark.deploy.DotnetRunner \
--master local \
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
Microsoft.Spark.CSharp.Examples Sql.Basic $SPARK_HOME/examples/src/main/resources/people.json
```
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredNetworkWordCount](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredNetworkWordCount.cs)**
```bash
spark-submit \
--class org.apache.spark.deploy.DotnetRunner \
--master local \
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredNetworkWordCount localhost 9999
```
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (maven accessible)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
```bash
spark-submit \
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.2 \
--class org.apache.spark.deploy.DotnetRunner \
--master local \
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
```
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (jars provided)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
```bash
spark-submit \
--jars path/to/net.jpountz.lz4/lz4-1.3.0.jar,path/to/org.apache.kafka/kafka-clients-0.10.0.1.jar,path/to/org.apache.spark/spark-sql-kafka-0-10_2.11-2.3.2.jar,path/to/org.slf4j/slf4j-api-1.7.6.jar,path/to/org.spark-project.spark/unused-1.0.0.jar,path/to/org.xerial.snappy/snappy-java-1.1.2.6.jar \
--class org.apache.spark.deploy.DotnetRunner \
--master local \
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
```
Feel this experience is complicated? Help us by taking up [Simplify User Experience for Running an App](https://github.com/dotnet/spark/issues/6)


@ -0,0 +1,267 @@
Building Spark .NET on Windows
==========================
# Table of Contents
- [Open Issues](#open-issues)
- [Pre-requisites](#pre-requisites)
- [Building](#building)
- [Building Spark .NET Scala Extensions Layer](#building-spark-net-scala-extensions-layer)
- [Building .NET Samples Application](#building-net-samples-application)
- [Using Visual Studio for .NET Framework](#using-visual-studio-for-net-framework)
- [Using .NET Core CLI for .NET Core](#using-net-core-cli-for-net-core)
- [Run Samples](#run-samples)
# Open Issues:
- [Allow users to choose which .NET framework to build for]()
- [Building through Visual Studio Code]()
- [Building fully automatically through .NET Core CLI]()
# Pre-requisites:
If you already have all the pre-requisites, skip to the [build](windows-instructions.md#building) steps below.
1. Download and install the **[.NET Core SDK](https://dotnet.microsoft.com/download/dotnet-core/2.1)** - installing the SDK will add the `dotnet` toolchain to your path. .NET Core 2.1, 2.2 and 3.0 preview are supported.
2. Install any edition of **[Visual Studio 2019](https://www.visualstudio.com/downloads/)** or [Visual Studio 2017](https://www.visualstudio.com/downloads/). The Community version is completely free. When configuring your installation, include these components at minimum:
* .NET desktop development
* All Required Components
* .NET Framework 4.6.1 Development Tools
* .NET Core cross-platform development
* All Required Components
3. Install **[Java 1.8](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html)**
- Select the appropriate version for your operating system e.g., jdk-8u201-windows-x64.exe for Win x64 machine.
- Install using the installer and verify you are able to run `java` from your command-line
4. Install **[Apache Maven 3.6.0+](https://maven.apache.org/download.cgi)**
- Download [Apache Maven 3.6.0](http://mirror.metrocast.net/apache/maven/maven-3/3.6.0/binaries/apache-maven-3.6.0-bin.zip)
- Extract to a local directory e.g., `c:\bin\apache-maven-3.6.0\`
- Add Apache Maven to your [PATH environment variable](https://www.java.com/en/download/help/path.xml) e.g., `c:\bin\apache-maven-3.6.0\bin`
- Verify you are able to run `mvn` from your command-line
5. Install **[Apache Spark 2.3+](https://spark.apache.org/downloads.html)**
- Download [Apache Spark 2.3+](https://spark.apache.org/downloads.html) and extract it into a local folder (e.g., `c:\bin\spark-2.3.2-bin-hadoop2.7\`) using [7-zip](https://www.7-zip.org/).
- Add Apache Spark to your [PATH environment variable](https://www.java.com/en/download/help/path.xml) e.g., `c:\bin\spark-2.3.2-bin-hadoop2.7\bin`
- Add a [new environment variable](https://www.java.com/en/download/help/path.xml) `SPARK_HOME` e.g., `C:\bin\spark-2.3.2-bin-hadoop2.7\`
- Verify you are able to run `spark-shell` from your command-line
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
Welcome to
____ __
/ __/__ ___ _____/ /__
_\ \/ _ \/ _ `/ __/ '_/
/___/ .__/\_,_/_/ /_/\_\ version 2.3.2
/_/
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_201)
Type in expressions to have them evaluated.
Type :help for more information.
scala> sc
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6eaa6b0c
```
Note: If you observe the following:
> ERROR Shell:397 - Failed to locate the winutils binary in the hadoop binary path
> java.io.IOException: Could not locate executable null\bin\winutils.exe in the Hadoop binaries.
You can ignore this if you are planning on running Spark in [Standalone mode](https://spark.apache.org/docs/latest/spark-standalone.html). If not, you will need to set up **[WinUtils](https://github.com/steveloughran/winutils)**:
- Download winutils.exe binary from [WinUtils repository](https://github.com/steveloughran/winutils). You should select the version of Hadoop the Spark distribution was compiled with, e.g. use hadoop-2.7.1 for Spark 2.3.2.
- Save winutils.exe binary to a directory of your choice, e.g. c:\hadoop\bin.
- Set `HADOOP_HOME` to reflect the directory with winutils.exe (without bin). For instance, using command-line:
```
set HADOOP_HOME=c:\hadoop
```
- Set PATH environment variable to include `%HADOOP_HOME%\bin`. For instance, using command-line:
```
set PATH=%HADOOP_HOME%\bin;%PATH%
```
</details>
Please make sure you are able to run `dotnet`, `java`, `mvn`, and `spark-shell` from your command-line before you move to the next section. If you think there is a better way, please [open an issue](https://github.com/dotnet/spark/issues) and feel free to contribute.
> **Note**: A new instance of the command-line may be required if any environment variables were updated.
# Building
For the rest of this section, it is assumed that you have cloned the Spark .NET repo onto your machine, e.g., into `c:\github\dotnet-spark\`:
```
git clone https://github.com/dotnet/spark.git c:\github\dotnet-spark
```
## Building Spark .NET Scala Extensions Layer
When you submit a .NET application, Spark .NET has the necessary logic, written in Scala, that informs Apache Spark how to handle your requests (e.g., a request to create a new Spark session, or to transfer data from the .NET side to the JVM side). This logic can be found in the [Spark .NET Scala Source Code](../../../src/scala).
Regardless of whether you are using .NET Framework or .NET Core, you will need to build the Spark .NET Scala extension layer. This is easy to do:
```
cd src\scala
mvn clean package
```
You should see JARs created for the supported Spark versions:
* `microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-<version>.jar`
* `microsoft-spark-2.4.x\target\microsoft-spark-2.4.x-<version>.jar`
## Building .NET Samples Application
### Using Visual Studio for .NET Framework
1. Open `src\csharp\Microsoft.Spark.sln` in Visual Studio and build the `Microsoft.Spark.CSharp.Examples` project under the `examples` folder (this will in turn build the .NET bindings project as well). If you want, you can write your own code in the `Microsoft.Spark.Examples` project:
```csharp
// Instantiate a session
var spark = SparkSession
.Builder()
.AppName("Hello Spark!")
.GetOrCreate();
var df = spark.Read().Json(args[0]);
// Print schema
df.PrintSchema();
// Apply a filter and show results
df.Filter(df["age"] > 21).Show();
```
Once the build is successful, you will see the appropriate binaries produced in the output directory.
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
Directory: C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\net461
Mode LastWriteTime Length Name
---- ------------- ------ ----
-a---- 3/6/2019 12:18 AM 125440 Apache.Arrow.dll
-a---- 3/16/2019 12:00 AM 13824 Microsoft.Spark.CSharp.Examples.exe
-a---- 3/16/2019 12:00 AM 19423 Microsoft.Spark.CSharp.Examples.exe.config
-a---- 3/16/2019 12:00 AM 2720 Microsoft.Spark.CSharp.Examples.pdb
-a---- 3/16/2019 12:00 AM 143360 Microsoft.Spark.dll
-a---- 3/16/2019 12:00 AM 63388 Microsoft.Spark.pdb
-a---- 3/16/2019 12:00 AM 34304 Microsoft.Spark.Worker.exe
-a---- 3/16/2019 12:00 AM 19423 Microsoft.Spark.Worker.exe.config
-a---- 3/16/2019 12:00 AM 11900 Microsoft.Spark.Worker.pdb
-a---- 3/16/2019 12:00 AM 23552 Microsoft.Spark.Worker.xml
-a---- 3/16/2019 12:00 AM 332363 Microsoft.Spark.xml
------------------------------------------- More framework files -------------------------------------
```
</details>
### Using .NET Core CLI for .NET Core
> Note: We are currently working on automating .NET Core builds for Spark .NET. Until then, we appreciate your patience in performing some of the steps manually.
1. Build the Worker
```
cd C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\
dotnet publish -f netcoreapp2.1 -r win10-x64
```
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
PS C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker> dotnet publish -f netcoreapp2.1 -r win10-x64
Microsoft (R) Build Engine version 15.9.20+g88f5fadfbe for .NET Core
Copyright (C) Microsoft Corporation. All rights reserved.
Restoring packages for C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj...
Restore completed in 37.29 ms for C:\github\dotnet-spark\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj.
Generating MSBuild file C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\obj\Microsoft.Spark.Worker.csproj.nuget.g.props.
Generating MSBuild file C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\obj\Microsoft.Spark.Worker.csproj.nuget.g.targets.
Restore completed in 230.49 ms for C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj.
Microsoft.Spark -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark\bin\Debug\netstandard2.0\Microsoft.Spark.dll
Microsoft.Spark.Worker -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\bin\Debug\netcoreapp2.1\win10-x64\Microsoft.Spark.Worker.dll
Microsoft.Spark.Worker -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\bin\Debug\netcoreapp2.1\win10-x64\publish\
```
</details>
2. Build the Samples
```
cd C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\
Get-Content .\Microsoft.Spark.CSharp.Examples.csproj | Where-Object {$_ -notmatch 'Microsoft.Spark.Worker.csproj'} | Set-Content .\Microsoft.Spark.CSharp.Examples.Patched.csproj
dotnet publish -f netcoreapp2.1 -r win10-x64 .\Microsoft.Spark.CSharp.Examples.Patched.csproj
```
Note the creation of a new, patched `.csproj` file: the `Get-Content`/`Set-Content` pipeline above strips out the `Microsoft.Spark.Worker.csproj` project reference. This works around a bug in the .NET Core CLI that causes problems when building a dependency project that produces executables; we are working with the .NET team to resolve it.
<details>
<summary>&#x1F4D9; Click to see sample console output</summary>
```
PS C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples> dotnet publish -f netcoreapp2.1 -r win10-x64 .\Microsoft.Spark.CSharp.Examples.Patched.csproj
Microsoft (R) Build Engine version 15.9.20+g88f5fadfbe for .NET Core
Copyright (C) Microsoft Corporation. All rights reserved.
Restoring packages for C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\Microsoft.Spark.CSharp.Examples.Patched.csproj...
Restoring packages for C:\github\dotnet-spark\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj...
Generating MSBuild file C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\obj\Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.props.
Generating MSBuild file C:\github\dotnet-spark\src\csharp\Microsoft.Spark\obj\Microsoft.Spark.csproj.nuget.g.props.
Generating MSBuild file C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\obj\Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.targets.
Restore completed in 208.34 ms for C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\Microsoft.Spark.CSharp.Examples.Patched.csproj.
Restore completed in 208.34 ms for C:\github\dotnet-spark\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj.
Microsoft.Spark -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark\bin\Debug\netstandard2.0\Microsoft.Spark.dll
Microsoft.Spark.CSharp.Examples.Patched -> C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\Microsoft.Spark.CSharp.Examples.dll
Microsoft.Spark.CSharp.Examples.Patched -> C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\publish\
```
</details>
3. Manually copy Worker binaries into the Samples output location.
```
cp c:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\bin\Debug\netcoreapp2.1\win10-x64\publish\* C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\publish\
```
# Run Samples
Once you have built the samples, you run them through `spark-submit` regardless of whether you are targeting .NET Framework or .NET Core. Make sure you have followed the [pre-requisites](#pre-requisites) section and installed Apache Spark.
1. Open PowerShell and go to the directory where your app binary has been generated (e.g., `c:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\net461` for .NET Framework, `c:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\publish` for .NET Core)
2. Running your app follows the basic structure:
```powershell
spark-submit.cmd `
[--jars <any-jars-your-app-is-dependent-on>] `
--class org.apache.spark.deploy.DotnetRunner `
--master local `
<path-to-microsoft-spark-jar> `
<path-to-your-app-exe> <argument(s)-to-your-app>
```
Here are some examples you can run:
- **[Microsoft.Spark.Examples.Sql.Basic](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Basic.cs)**
```powershell
spark-submit.cmd `
--class org.apache.spark.deploy.DotnetRunner `
--master local `
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
Microsoft.Spark.CSharp.Examples.exe Sql.Basic %SPARK_HOME%\examples\src\main\resources\people.json
```
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredNetworkWordCount](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredNetworkWordCount.cs)**
```powershell
spark-submit.cmd `
--class org.apache.spark.deploy.DotnetRunner `
--master local `
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
Microsoft.Spark.CSharp.Examples.exe Sql.Streaming.StructuredNetworkWordCount localhost 9999
```
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (maven accessible)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
```powershell
spark-submit.cmd `
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.2 `
--class org.apache.spark.deploy.DotnetRunner `
--master local `
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
Microsoft.Spark.CSharp.Examples.exe Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
```
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (jars provided)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
```powershell
spark-submit.cmd `
--jars path\to\net.jpountz.lz4\lz4-1.3.0.jar,path\to\org.apache.kafka\kafka-clients-0.10.0.1.jar,path\to\org.apache.spark\spark-sql-kafka-0-10_2.11-2.3.2.jar,path\to\org.slf4j\slf4j-api-1.7.6.jar,path\to\org.spark-project.spark\unused-1.0.0.jar,path\to\org.xerial.snappy\snappy-java-1.1.2.6.jar `
--class org.apache.spark.deploy.DotnetRunner `
--master local `
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
Microsoft.Spark.CSharp.Examples.exe Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
```
Does this experience feel complicated? Help us simplify it by taking up [Simplify User Experience for Running an App](https://github.com/dotnet/spark/issues/6)

@@ -0,0 +1,89 @@
C# Coding Style
===============
We use the same [coding style](https://github.com/dotnet/corefx/blob/master/Documentation/coding-guidelines/coding-style.md) and [EditorConfig](https://editorconfig.org "EditorConfig homepage") file (`.editorconfig`) used by [dotnet/corefx](https://github.com/dotnet/corefx) with the following differences:
* **A single-line statement block must be wrapped in braces.**
```C#
// OK
if (foo)
{
return false;
}
// NOT OK
if (foo) return false;
if (foo) { return false; }
```
* **Use the prefix increment/decrement operator.**
Unless the post-increment/decrement behavior is specifically intended, use the prefix form.
```C#
// OK
for (int i = 0; i < arr.Length; ++i)
// NOT OK
for (int i = 0; i < arr.Length; i++)
// OK
arr[i++]; // Post increment operator usage is intended.
```
* **The max number of characters in a line is 100.**
This can be easily done using the following line-break rules:
(If you cannot find a rule for your scenario, please look through the existing code to find a match and create an issue to update this list.)
* Line-break for the assignment
```C#
// Try the following first to fit within the limit.
SomeType someVariable
= SomeMethod(arg1, arg2, arg3, arg4, arg5);
// Then fall back to this.
SomeType someVariable = SomeMethod(
arg1,
arg2,
arg3,
arg4,
arg5);
```
* Line-break for each method parameter:
```C#
return UserDefinedFunction.Create(
name,
CommandSerDe.Serialize(
execute,
CommandSerDe.SerializedMode.Row,
CommandSerDe.SerializedMode.Row),
UdfUtils.GetPythonEvalType(),
UdfUtils.GetReturnType(typeof(RT)));
```
* Line-break for each method call:
```C#
// If you have chained method calls, line-break each method call
Enumerable.Range(0, numRows)
.Select(i => i.ToString())
.ToArray();
```
There are a few exceptions to this rule:
* Log message with string interpolation:
```C#
Logger.LogInfo($"This message {someVariable} is too long but try your best to fit in 100 character limit.");
```
* The method signature, excluding its parameters, is already long due to type parameters:
```C#
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT>(
Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT> udf)
```
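For quick reference, here is a short, purely illustrative snippet (the method and variable names are made up for this sketch) that applies the rules above: braces around single-line blocks, prefix increment, and lines kept under 100 characters.
```C#
// Illustrative sketch only; not code from this repository.
public static int CountAboveThreshold(int[] values, int threshold)
{
    int count = 0;
    for (int i = 0; i < values.Length; ++i) // prefix increment
    {
        if (values[i] > threshold)
        {
            ++count; // a single-line block still gets braces
        }
    }

    return count;
}
```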

@@ -0,0 +1,6 @@
Scala Coding Style
===============
* For Scala code, we follow the official [Scala style guide](https://docs.scala-lang.org/style/).
* For formatting, [scalafmt](https://scalameta.org/scalafmt) is used with the custom configuration (found in [/dev/.scalafmt.conf](/dev/.scalafmt.conf))
* Installation instructions for `scalafmt` can be found [here](https://scalameta.org/scalafmt/docs/installation.html)

docs/contributing.md Normal file
@@ -0,0 +1,9 @@
Contributing to dotnet/spark
======================
This document describes contribution guidelines.
Coding Style
------------
We intend to bring dotnet/spark into full conformance with the following style guidelines:
* [C# Coding Style](coding-guidelines/csharp-coding-style.md)
* [Scala Coding Style](coding-guidelines/scala-coding-style.md)

docs/developer-guide.md Normal file
@@ -0,0 +1 @@
# Developer Guide

docs/features.md Normal file
@@ -0,0 +1 @@
# Features

docs/img/.gitignore Normal file
@@ -0,0 +1 @@

docs/img/deployment-hdi-action-script.png (binary, 32 KiB, new file)
docs/img/spark-dot-net-logo.PNG (binary, 20 KiB, new file)
docs/img/ubuntu-icon-32.png (binary, 1.1 KiB, new file)
docs/img/windows-icon-32.png (binary, 1.0 KiB, new file)

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" TreatAsLocalProperty="ExcludeRestorePackageImports">
<Import Project="..\src\csharp\Directory.Build.props" />
</Project>

@@ -0,0 +1,11 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples
{
internal interface IExample
{
void Run(string[] args);
}
}
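Every example in this assembly implements `IExample` and is then located by name via reflection in `Program.cs` further down. As a minimal sketch (the `HelloWorld` class below is hypothetical and not part of this commit), an implementation would look like:
```csharp
// Hypothetical example, for illustration only; not included in this commit.
using System;

namespace Microsoft.Spark.Examples
{
    internal sealed class HelloWorld : IExample
    {
        // Program.Main discovers this type via reflection and calls Run with the
        // remaining command-line arguments.
        public void Run(string[] args)
        {
            Console.WriteLine($"Hello from Spark .NET! Received {args.Length} argument(s).");
        }
    }
}
```
With such a class in place, passing `HelloWorld` as the first argument to the examples executable would resolve and invoke it through the reflection-based lookup in `Program.cs`.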

@@ -0,0 +1,16 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
<RootNamespace>Microsoft.Spark.Examples</RootNamespace>
<AssemblyName>Microsoft.Spark.CSharp.Examples</AssemblyName>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
<ProjectReference Include="..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj" />
</ItemGroup>
</Project>

@@ -0,0 +1,63 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
namespace Microsoft.Spark.Examples
{
public class Program
{
public static void Main(string[] args)
{
string rootNamespace = MethodBase.GetCurrentMethod().DeclaringType.Namespace;
// Find all types in the current assembly that implement IExample and are in
// rootNamespace or one of its sub-namespaces. Track each type's fully qualified
// name with the rootNamespace prefix stripped.
IEnumerable<string> examples = Assembly.GetExecutingAssembly().GetTypes()
.Where(t =>
typeof(IExample).IsAssignableFrom(t) &&
!t.IsInterface &&
!t.IsAbstract &&
t.Namespace.StartsWith(rootNamespace) &&
((t.Namespace.Length == rootNamespace.Length) ||
(t.Namespace[rootNamespace.Length] == '.')))
.Select(t => t.FullName.Substring(rootNamespace.Length + 1));
if ((args.Length == 0) || !TryFindExample(examples, args[0], out string exampleName))
{
PrintUsage(examples);
return;
}
string[] exampleArgs = args.Skip(1).ToArray();
Type type = Assembly.GetExecutingAssembly().GetType($"{rootNamespace}.{exampleName}");
object instance = Activator.CreateInstance(type);
MethodInfo method = type.GetMethod("Run");
method.Invoke(instance, new object[] { exampleArgs });
}
private static void PrintUsage(IEnumerable<string> examples)
{
string assemblyName = Assembly.GetExecutingAssembly().GetName().Name;
Console.WriteLine($"Usage: {assemblyName} <example> <example args>");
if (examples.Any())
{
Console.WriteLine("Examples:\n\t*" + string.Join("\n\t*", examples));
}
Console.WriteLine($"\n'{assemblyName} <example>' to get the usage info of each example.");
}
private static bool TryFindExample(IEnumerable<string> examples, string search,
out string found)
{
found = examples.FirstOrDefault(e =>
e.Equals(search, StringComparison.InvariantCultureIgnoreCase));
return !string.IsNullOrWhiteSpace(found);
}
}
}

@@ -0,0 +1,96 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.Examples.Sql
{
/// <summary>
/// A simple example demonstrating basic Spark SQL features.
/// </summary>
internal sealed class Basic : IExample
{
public void Run(string[] args)
{
if (args.Length != 1)
{
Console.Error.WriteLine(
"Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
Environment.Exit(1);
}
SparkSession spark = SparkSession
.Builder()
.AppName(".NET Spark SQL basic example")
.Config("spark.some.config.option", "some-value")
.GetOrCreate();
// Need to explicitly specify the schema since pickling vs. arrow formatting
// will return different types. Pickling will turn longs into ints if the values fit.
DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]);
Spark.Sql.Types.StructType schema = df.Schema();
Console.WriteLine(schema.SimpleString);
System.Collections.Generic.IEnumerable<Row> rows = df.Collect();
foreach (Row row in rows)
{
Console.WriteLine(row);
}
df.Show();
df.PrintSchema();
df.Select("name", "age", "age", "name").Show();
df.Select(df["name"], df["age"] + 1).Show();
df.Filter(df["age"] > 21).Show();
df.GroupBy("age")
.Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
.Show();
df.CreateOrReplaceTempView("people");
// Registering Udf for SQL expression.
DataFrame sqlDf = spark.Sql("SELECT * FROM people");
sqlDf.Show();
spark.Udf().Register<int?, string, string>(
"my_udf",
(age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));
sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
sqlDf.Show();
// Using UDF via data frames.
Func<Column, Column, Column> addition = Udf<int?, string, string>(
(age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));
df.Select(addition(df["age"], df["name"])).Show();
// Chaining example:
Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");
df.Select(addition2(addition(df["age"], df["name"]))).Show();
// Multiple UDF example:
df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();
// Joins.
DataFrame joinedDf = df.Join(df, "name");
joinedDf.Show();
DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });
joinedDf2.Show();
DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");
joinedDf3.Show();
spark.Stop();
}
}
}

@@ -0,0 +1,58 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.Examples.Sql.Streaming
{
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
/// </summary>
internal sealed class StructuredKafkaWordCount : IExample
{
public void Run(string[] args)
{
if (args.Length != 3)
{
Console.Error.WriteLine(
"Usage: StructuredKafkaWordCount " +
"<bootstrap-servers> <subscribe-type> <topics>");
Environment.Exit(1);
}
string bootstrapServers = args[0];
string subscribeType = args[1];
string topics = args[2];
SparkSession spark = SparkSession
.Builder()
.AppName("StructuredKafkaWordCount")
.GetOrCreate();
DataFrame lines = spark
.ReadStream()
.Format("kafka")
.Option("kafka.bootstrap.servers", bootstrapServers)
.Option(subscribeType, topics)
.Load()
.SelectExpr("CAST(value AS STRING)");
DataFrame words = lines
.Select(Explode(Split(lines["value"], " "))
.Alias("word"));
DataFrame wordCounts = words.GroupBy("word").Count();
Spark.Sql.Streaming.StreamingQuery query = wordCounts
.WriteStream()
.OutputMode("complete")
.Format("console")
.Start();
query.AwaitTermination();
}
}
}

@@ -0,0 +1,59 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.Examples.Sql.Streaming
{
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
internal sealed class StructuredNetworkWordCount : IExample
{
public void Run(string[] args)
{
if (args.Length != 2)
{
Console.Error.WriteLine(
"Usage: StructuredNetworkWordCount <hostname> <port>");
Environment.Exit(1);
}
string hostname = args[0];
var port = int.Parse(args[1]);
SparkSession spark = SparkSession
.Builder()
.AppName("StructuredNetworkWordCount")
.GetOrCreate();
DataFrame lines = spark
.ReadStream()
.Format("socket")
.Option("host", hostname)
.Option("port", port)
.Load();
DataFrame words = lines
.Select(Explode(Split(lines["value"], " "))
.Alias("word"));
DataFrame wordCounts = words.GroupBy("word").Count();
Spark.Sql.Streaming.StreamingQuery query = wordCounts
.WriteStream()
.OutputMode("complete")
.Format("console")
.Start();
query.AwaitTermination();
}
}
}

@@ -0,0 +1,77 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.Examples.Sql.Streaming
{
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
internal sealed class StructuredNetworkWordCountWindowed : IExample
{
public void Run(string[] args)
{
if (args.Length != 3 && args.Length != 4)
{
Console.Error.WriteLine(
"Usage: StructuredNetworkWordCountWindowed " +
"<hostname> <port> <window duration in seconds> " +
"[<slide duration in seconds>]");
Environment.Exit(1);
}
string hostname = args[0];
var port = int.Parse(args[1]);
var windowSize = int.Parse(args[2]);
var slideSize = (args.Length == 3) ? windowSize : int.Parse(args[3]);
if (slideSize > windowSize)
{
Console.Error.WriteLine(
"<slide duration> must be less than or equal " +
"to <window duration>");
}
var windowDuration = $"{windowSize} seconds";
var slideDuration = $"{slideSize} seconds";
SparkSession spark = SparkSession
.Builder()
.AppName("StructuredNetworkWordCountWindowed")
.GetOrCreate();
DataFrame lines = spark
.ReadStream()
.Format("socket")
.Option("host", hostname)
.Option("port", port)
.Option("includeTimestamp", true)
.Load();
DataFrame words = lines
.Select(Explode(Split(lines["value"], " "))
.Alias("word"), lines["timestamp"]);
DataFrame windowedCounts = words
.GroupBy(Window(words["timestamp"], windowDuration, slideDuration),
words["word"])
.Count()
.OrderBy("window");
Spark.Sql.Streaming.StreamingQuery query = windowedCounts
.WriteStream()
.OutputMode("complete")
.Format("console")
.Option("truncate", false)
.Start();
query.AwaitTermination();
}
}
}

@@ -0,0 +1,54 @@
Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.421
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.CSharp.Examples", "Microsoft.Spark.CSharp.Examples\Microsoft.Spark.CSharp.Examples.csproj", "{32A34828-20F4-40FE-A3D5-C9458BF424E6}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark", "..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj", "{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Worker", "..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj", "{606FBD5E-D5C6-48F6-8FBF-2F0E25527760}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Reference", "Reference", "{CE5FBCF2-F92E-4A2F-A76E-149B7118491B}"
EndProject
Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Microsoft.Spark.FSharp.Examples", "Microsoft.Spark.FSharp.Examples\Microsoft.Spark.FSharp.Examples.fsproj", "{127370FE-D19D-4489-AB7C-2F1AA7908994}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{7BC2C5FB-10A1-492D-952B-D8662C368CB2}"
ProjectSection(SolutionItems) = preProject
..\.editorconfig = ..\.editorconfig
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{32A34828-20F4-40FE-A3D5-C9458BF424E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{32A34828-20F4-40FE-A3D5-C9458BF424E6}.Debug|Any CPU.Build.0 = Debug|Any CPU
{32A34828-20F4-40FE-A3D5-C9458BF424E6}.Release|Any CPU.ActiveCfg = Release|Any CPU
{32A34828-20F4-40FE-A3D5-C9458BF424E6}.Release|Any CPU.Build.0 = Release|Any CPU
{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Release|Any CPU.Build.0 = Release|Any CPU
{606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Debug|Any CPU.Build.0 = Debug|Any CPU
{606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Release|Any CPU.ActiveCfg = Release|Any CPU
{606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Release|Any CPU.Build.0 = Release|Any CPU
{127370FE-D19D-4489-AB7C-2F1AA7908994}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{127370FE-D19D-4489-AB7C-2F1AA7908994}.Debug|Any CPU.Build.0 = Debug|Any CPU
{127370FE-D19D-4489-AB7C-2F1AA7908994}.Release|Any CPU.ActiveCfg = Release|Any CPU
{127370FE-D19D-4489-AB7C-2F1AA7908994}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(NestedProjects) = preSolution
{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF} = {CE5FBCF2-F92E-4A2F-A76E-149B7118491B}
{606FBD5E-D5C6-48F6-8FBF-2F0E25527760} = {CE5FBCF2-F92E-4A2F-A76E-149B7118491B}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {9D60F114-5B77-445C-B67C-DCACC90A35CD}
EndGlobalSection
EndGlobal

@@ -0,0 +1,8 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples
type IExample =
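// Run returns an exit code (0 for success, non-zero for usage errors), unlike the C# IExample, whose Run returns void.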
abstract member Run : string[] -> int

@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
<RootNamespace>Microsoft.Spark.Examples</RootNamespace>
<AssemblyName>Microsoft.Spark.FSharp.Examples</AssemblyName>
</PropertyGroup>
<ItemGroup>
<Compile Include="IExample.fs" />
<Compile Include="Sql\Streaming\StructuredNetworkWordCountWindowed.fs" />
<Compile Include="Sql\Streaming\StructuredNetworkWordCount.fs" />
<Compile Include="Sql\Streaming\StructuredKafkaWordCount.fs" />
<Compile Include="Sql\Basic.fs" />
<Compile Include="Program.fs" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
<ProjectReference Include="..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj" />
</ItemGroup>
</Project>

@@ -0,0 +1,60 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
module Microsoft.Spark.Examples.Main
open System
open System.Collections.Generic
open System.Linq
open System.Reflection
open System.Runtime.InteropServices
let printUsage (examples : IEnumerable<string>) =
let assemblyName = Assembly.GetExecutingAssembly().GetName().Name
printfn "Usage: %s <example> <example args>" assemblyName
if examples.Any() then
printfn "Examples:\n\t*%s" (examples |> String.concat "\n\t*")
printfn "\n'%s <example>' to get the usage info of each example." assemblyName
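// Mirrors TryFindExample in the C# examples project: a case-insensitive lookup that returns the matching example name through a byref parameter.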
let tryFindExample (examples: IEnumerable<string>, search: string, [<Out>] found : string byref) =
found <- examples.FirstOrDefault(fun e ->
e.Equals(search, StringComparison.InvariantCultureIgnoreCase))
not (String.IsNullOrWhiteSpace(found))
[<EntryPoint>]
let main args =
let rootNamespace = MethodBase.GetCurrentMethod().DeclaringType.Namespace
// Find all types in the current assembly that implement IExample and are in
// rootNamespace or one of its sub-namespaces. Track each type's fully qualified
// name with the rootNamespace prefix stripped.
let examples =
Assembly.GetExecutingAssembly().GetTypes()
.Where(fun t ->
typeof<IExample>.IsAssignableFrom(t) &&
not t.IsInterface &&
not t.IsAbstract &&
t.Namespace.StartsWith(rootNamespace) &&
((t.Namespace.Length = rootNamespace.Length) ||
(t.Namespace.[rootNamespace.Length] = '.')))
.Select(fun t -> t.FullName.Substring(rootNamespace.Length + 1))
match args with
| [||] ->
printUsage(examples)
1
| _ ->
let mutable exampleName = String.Empty
if not (tryFindExample(examples, args.[0], &exampleName)) then
printUsage(examples)
1
else
let exampleArgs = args.Skip(1).ToArray()
let exampleType =
Assembly.GetExecutingAssembly()
.GetType(sprintf "%s.%s" rootNamespace exampleName)
let instance = Activator.CreateInstance(exampleType)
let method = exampleType.GetMethod("Run")
method.Invoke(instance, [|exampleArgs|]) :?> int

@@ -0,0 +1,89 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples.Sql
open System
open Microsoft.Spark.Examples
open Microsoft.Spark.Sql
type Basic() =
member this.Run(args : string[]) =
match args with
| [| filePath |] ->
let spark = SparkSession.Builder().AppName("Hello F#").GetOrCreate()
let df = spark.Read().Json(filePath)
let schema = df.Schema()
printfn "%s" (schema.SimpleString)
for row in df.Collect() do
printfn "%s" (row.ToString())
df.Show()
df.PrintSchema()
df.Select("name", "age", "age", "name").Show()
df.Select(df.["name"], df.["age"] + 1).Show()
df.Filter(df.["age"].Gt(21)).Show()
df.GroupBy("age")
.Agg(Functions.Avg(df.["age"]),
Functions.Avg(df.["age"]),
Functions.CountDistinct(df.["age"], df.["age"]))
.Show()
// SQL example.
df.CreateOrReplaceTempView("people")
// Registering UDF for SQL expression.
let sqlDf = spark.Sql("SELECT * FROM people")
sqlDf.Show()
spark.Udf().Register<Nullable<int>, string, string>(
"my_udf",
fun age name ->
name + " with " + (if age.HasValue then (string)(age.Value) else "null"))
let sqlDf = spark.Sql("SELECT my_udf(*) FROM people")
sqlDf.Show()
// Using UDF via data frames.
let addition = Functions.Udf<Nullable<int>, string, string>(
fun age name ->
name + " is " +
(if age.HasValue then (string)(age.Value + 10) else "0"))
df.Select(addition.Invoke(df.["age"], df.["name"])).Show()
// Chaining example:
let addition2 = Functions.Udf<string, string>(fun str -> "hello " + str + "!")
df.Select(addition2.Invoke(addition.Invoke(df.["age"], df.["name"]))).Show()
// Multiple UDF example:
df.Select(addition.Invoke(df.["age"], df.["name"]), addition2.Invoke(df.["name"]))
.Show()
// Joins.
let joinedDf = df.Join(df, "name")
joinedDf.Show()
let joinedDf2 = df.Join(df, ["name"; "age"] |> List.toSeq)
joinedDf2.Show()
let joinedDf3 = df.Join(df, df.["name"].EqualTo(df.["name"]), "outer")
joinedDf3.Show()
spark.Stop()
0
| _ ->
printfn "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>"
1
interface IExample with
member this.Run (args) = this.Run (args)

@@ -0,0 +1,43 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples.Sql.Streaming
open Microsoft.Spark.Examples
open Microsoft.Spark.Sql
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
/// </summary>
type StructuredKafkaWordCount() =
member this.Run(args : string[]) =
match args with
| [| bootstrapServers; subscribeType; topics |] ->
let spark = SparkSession.Builder().AppName("StructuredKafkaWordCount").GetOrCreate()
let lines =
spark.ReadStream()
.Format("kafka")
.Option("kafka.bootstrap.servers", bootstrapServers)
.Option(subscribeType, topics)
.Load()
.SelectExpr("CAST(value AS STRING)")
let words =
lines.Select(Functions.Explode(Functions.Split(lines.["value"], " "))
.Alias("word"))
let wordCounts = words.GroupBy("word").Count()
let query = wordCounts.WriteStream().OutputMode("complete").Format("console").Start()
query.AwaitTermination()
0
| _ ->
printfn "Usage: StructuredKafkaWordCount <bootstrap-servers> <subscribe-type> <topics>"
1
interface IExample with
member this.Run (args) = this.Run (args)

@@ -0,0 +1,48 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples.Sql.Streaming
open Microsoft.Spark.Examples
open Microsoft.Spark.Sql
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
type StructuredNetworkWordCount() =
member this.Run(args : string[]) =
match args with
| [| hostname; portStr |] ->
let port = portStr |> int64
let spark = SparkSession.Builder().AppName("StructuredNetworkWordCount").GetOrCreate()
let lines =
spark.ReadStream()
.Format("socket")
.Option("host", hostname)
.Option("port", port)
.Load()
let words =
lines.Select(Functions.Explode(Functions.Split(lines.["value"], " "))
.Alias("word"))
let wordCounts = words.GroupBy("word").Count()
let query = wordCounts.WriteStream().OutputMode("complete").Format("console").Start()
query.AwaitTermination()
0
| _ ->
printfn "Usage: StructuredNetworkWordCount <hostname> <port>"
1
interface IExample with
member this.Run (args) = this.Run (args)

@@ -0,0 +1,66 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
namespace Microsoft.Spark.Examples.Sql.Streaming
open Microsoft.Spark.Examples
open Microsoft.Spark.Sql
/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
type StructuredNetworkWordCountWindowed() =
member this.Run(args : string[]) =
match args with
| ([| hostname; portStr; windowSizeStr |] | [| hostname; portStr; windowSizeStr; _ |]) ->
let port = portStr |> int64
let windowSize = windowSizeStr |> int64
let slideSize = if (args.Length = 3) then windowSize else (args.[3] |> int64)
if (slideSize > windowSize) then
printfn "<slide duration> must be less than or equal to <window duration>"
let windowDuration = sprintf "%d seconds" windowSize
let slideDuration = sprintf "%d seconds" slideSize
let spark =
SparkSession.Builder().AppName("StructuredNetworkWordCountWindowed").GetOrCreate()
let lines =
spark.ReadStream()
.Format("socket")
.Option("host", hostname)
.Option("port", port)
.Option("includeTimestamp", true)
.Load()
let words =
lines.Select(Functions.Explode(Functions.Split(lines.["value"], " "))
.Alias("word"), lines.["timestamp"])
let windowedCounts =
words.GroupBy(Functions.Window(words.["timestamp"], windowDuration, slideDuration),
words.["word"])
.Count()
.OrderBy("window")
let query =
windowedCounts.WriteStream()
.OutputMode("complete")
.Format("console")
.Option("truncate", false)
.Start()
query.AwaitTermination()
0
| _ ->
printfn "Usage: StructuredNetworkWordCountWindowed \
<hostname> <port> <window duration in seconds> \
[<slide duration in seconds>]"
1
interface IExample with
member this.Run (args) = this.Run (args)

@@ -0,0 +1,23 @@
@echo off
setlocal
set OutputDir=%1
cd %OutputDir%
echo "Download Hadoop binaries for Windows."
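rem Only winutils.exe from this archive is needed: Spark requires it to run on Windows when no full Hadoop installation is present.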
curl -k -L -o hadoop.zip https://github.com/steveloughran/winutils/releases/download/tag_2017-08-29-hadoop-2.8.1-native/hadoop-2.8.1.zip
unzip hadoop.zip
mkdir -p hadoop\bin
cp hadoop-2.8.1\winutils.exe hadoop\bin
echo "Downloading Spark distros."
curl -k -L -o spark-2.3.0.tgz https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz && tar xzvf spark-2.3.0.tgz
curl -k -L -o spark-2.3.1.tgz https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz && tar xzvf spark-2.3.1.tgz
curl -k -L -o spark-2.3.2.tgz https://archive.apache.org/dist/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz && tar xzvf spark-2.3.2.tgz
curl -k -L -o spark-2.3.3.tgz https://archive.apache.org/dist/spark/spark-2.3.3/spark-2.3.3-bin-hadoop2.7.tgz && tar xzvf spark-2.3.3.tgz
curl -k -L -o spark-2.4.0.tgz https://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz && tar xzvf spark-2.4.0.tgz
curl -k -L -o spark-2.4.1.tgz https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz && tar xzvf spark-2.4.1.tgz
endlocal

@@ -0,0 +1,22 @@
@echo off
setlocal
set Build.SourcesDirectory=%1
set Build.ArtifactStagingDirectory=%2
set Build.Configuration=%3
CALL :PublishWorker net461, win-x64
CALL :PublishWorker netcoreapp2.1, win-x64
CALL :PublishWorker netcoreapp2.1, ubuntu.16.04-x64
CALL :PublishWorker netcoreapp2.1, ubuntu.18.04-x64
EXIT /B %ERRORLEVEL%
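rem PublishWorker: publishes Microsoft.Spark.Worker for the framework (%~1) and runtime (%~2) passed in, into the artifact staging directory.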
:PublishWorker
set Framework=%~1
set Runtime=%~2
mkdir %Build.ArtifactStagingDirectory%\%Framework%\%Runtime%
dotnet publish %Build.SourcesDirectory%\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj --configuration %Build.Configuration% --framework %Framework% --runtime %Runtime% --output %Build.ArtifactStagingDirectory%\%Framework%\%Runtime%
EXIT /B 0
endlocal

@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" TreatAsLocalProperty="ExcludeRestorePackageImports">
<PropertyGroup>
<CheckForOverflowUnderflow>false</CheckForOverflowUnderflow>
<Deterministic>true</Deterministic>
<Features>strict</Features>
<LangVersion>latest</LangVersion>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<WarningLevel>4</WarningLevel>
<RestoreSources>
https://api.nuget.org/v3/index.json;
https://dotnet.myget.org/F/dotnet-core/api/v3/index.json;
</RestoreSources>
</PropertyGroup>
</Project>

@@ -0,0 +1,49 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using Xunit;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class PairRDDFunctionsTests
{
private readonly SparkContext _sc;
public PairRDDFunctionsTests()
{
_sc = SparkContext.GetOrCreate(new SparkConf());
}
[Fact]
public void TestCollect()
{
RDD<Tuple<string, int>> rdd = _sc.Parallelize(new[] {
new Tuple<string, int>("a", 1),
new Tuple<string, int>("b", 2) });
// Validate CollectAsMap().
{
var expected = new Dictionary<string, int>
{
["a"] = 1,
["b"] = 2
};
Assert.Equal(expected, rdd.CollectAsMap());
}
// Validate Keys().
{
Assert.Equal(new[] { "a", "b" }, rdd.Keys().Collect());
}
// Validate Values().
{
Assert.Equal(new[] { 1, 2 }, rdd.Values().Collect());
}
}
}
}

@@ -0,0 +1,115 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Linq;
using Xunit;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class RDDTests
{
private readonly SparkContext _sc;
public RDDTests()
{
_sc = SparkContext.GetOrCreate(new SparkConf());
}
[Fact]
public void TestParallelize()
{
{
RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5));
Assert.Equal(new[] { 0, 1, 2, 3, 4 }, rdd.Collect());
}
{
var strs = new[] { "hello", "spark", "for", "dotnet" };
RDD<string> rdd = _sc.Parallelize(strs);
Assert.Equal(strs, rdd.Collect());
}
}
[Fact]
public void TestTextFile()
{
RDD<string> rdd = _sc.TextFile(TestEnvironment.ResourceDirectory + "people.txt");
var strs = new[] { "Michael, 29", "Andy, 30", "Justin, 19" };
Assert.Equal(strs, rdd.Collect());
// Test a transformation so that SerializedMode is correctly propagated.
RDD<int> intRdd = rdd.Map(str => 0);
Assert.Equal(new[] { 0, 0, 0 }, intRdd.Collect());
}
[Fact]
public void TestMap()
{
RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5))
.Map(x => x * 2);
Assert.Equal(new[] { 0, 2, 4, 6, 8 }, rdd.Collect());
}
[Fact]
public void TestFlatMap()
{
RDD<string> rdd = _sc.Parallelize(new[] { "hello spark", "for dotnet" })
.FlatMap(str => str.Split(new char[] { ' ' }));
Assert.Equal(new[] { "hello", "spark", "for", "dotnet" }, rdd.Collect());
}
[Fact]
public void TestMapPartitions()
{
RDD<string> rdd = _sc.Parallelize(Enumerable.Range(0, 5))
.MapPartitions(inputs => inputs.Select(input => $"str{input}"));
Assert.Equal(new[] { "str0", "str1", "str2", "str3", "str4" }, rdd.Collect());
}
[Fact]
public void TestMapPartitionsWithIndex()
{
RDD<string> rdd = _sc.Parallelize(Enumerable.Range(0, 3))
.MapPartitionsWithIndex(
(pid, inputs) => inputs.Select(input => $"str_{pid}_{input}"));
Assert.Equal(new[] { "str_0_0", "str_0_1", "str_0_2" }, rdd.Collect());
}
[Fact]
public void TestPipelinedRDD()
{
RDD<string> rdd = _sc.Parallelize(Enumerable.Range(0, 3))
.Map(i => i + 5)
.Map(i => i * 2)
.Map(i => $"str_{i}")
.FlatMap(str => str.Split(new[] { '_' }));
Assert.Equal(new[] { "str", "10", "str", "12", "str", "14" }, rdd.Collect());
}
[Fact]
public void TestFilter()
{
RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5))
.Filter(x => (x % 2) == 0);
Assert.Equal(new[] { 0, 2, 4 }, rdd.Collect());
}
[Fact]
public void TestSample()
{
RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 10))
.Sample(true, 0.9, 0);
var count = rdd.Collect().Count();
Assert.True(count > 0);
Assert.True(count <= 10);
}
}
}

@@ -0,0 +1,55 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using System.Linq;
using Xunit;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class SparkConfTests
{
private readonly SparkFixture _fixture;
public SparkConfTests(SparkFixture fixture)
{
_fixture = fixture;
}
[Fact]
public void TestSparkConf()
{
var sparkConf = new SparkConf(false);
sparkConf.SetMaster("master");
sparkConf.SetAppName("test");
sparkConf.SetSparkHome("test home");
sparkConf.Set("key_string", "value");
sparkConf.Set("key_int", "100");
var expectedConfigs = new Dictionary<string, string>()
{
{ "spark.master", "master" },
{ "spark.app.name", "test" },
{ "spark.home", "test home" },
{ "key_string", "value" },
{ "key_int", "100" }
};
foreach (KeyValuePair<string, string> kv in expectedConfigs)
{
Assert.Equal(kv.Value, sparkConf.Get(kv.Key, string.Empty));
}
Assert.Equal(100, sparkConf.GetInt("key_int", 0));
// Validate GetAll().
Dictionary<string, string> actualAllConfigs =
sparkConf.GetAll().ToDictionary(x => x.Key, x => x.Value);
Assert.Equal(expectedConfigs, actualAllConfigs);
}
}
}

@@ -0,0 +1,40 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Xunit;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class SparkContextTests
{
/// <summary>
/// Test signatures for APIs up to Spark 2.3.*.
/// </summary>
/// <remarks>
/// For the RDD related tests, refer to <see cref="RDDTests"/>.
/// </remarks>
[Fact]
public void TestSignaturesV2_3_X()
{
SparkContext sc = SparkContext.GetOrCreate(new SparkConf());
_ = sc.GetConf();
_ = sc.DefaultParallelism;
sc.SetJobDescription("job description");
sc.SetJobGroup("group id", "description");
sc.SetJobGroup("group id", "description", true);
sc.ClearJobGroup();
string filePath = TestEnvironment.ResourceDirectory + "people.txt";
sc.AddFile(filePath);
sc.AddFile(filePath, true);
sc.SetCheckpointDir(TestEnvironment.ResourceDirectory);
}
}
}

@@ -0,0 +1,142 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Sql;
using Xunit;
using static Microsoft.Spark.Sql.Expressions.Window;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class ColumnTests
{
/// <summary>
/// Test signatures for APIs up to Spark 2.3.*.
/// </summary>
[Fact]
public void TestSignaturesV2_3_X()
{
Column col = Column("col");
Column col1 = Column("col1");
Column col2 = Column("col2");
col = -col1;
col = !col;
col = col1 == col2;
col = col1.EqualTo(col2);
col = col1 != col2;
col = col1.NotEqual(col2);
col = col1 > col2;
col = col1 > "hello";
col = col1.Gt(col2);
col = col1.Gt("hello");
col = col1 < col2;
col = col1 < "hello";
col = col1.Lt(col2);
col = col1.Lt("hello");
col = col1 <= col2;
col = col1 <= "hello";
col = col1.Leq(col2);
col = col1.Leq("hello");
col = col1 >= col2;
col = col1 >= "hello";
col = col1.Geq(col2);
col = col1.Geq("hello");
col = col1.EqNullSafe(col2);
col = col1.EqNullSafe("hello");
col = When(col1 == col2, 0).When(col1 == col2, 0);
col = When(col1 == col2, 0).Otherwise(col2);
col = When(col1 == col2, 0).Otherwise("hello");
col = col1.Between(col1, col2);
col = col1.Between(1, 3);
col = col.IsNaN();
col = col.IsNotNull();
col = col1 | col2;
col = col.Or(col2);
col = col1 & col2;
col = col.And(col2);
col = col1 + col2;
col = col1.Plus(col2);
col = col1 - col2;
col = col1.Minus(col2);
col = col1 * col2;
col = col1.Multiply(col2);
col = col1 / col2;
col = col1.Divide(col2);
col = col1 % col2;
col = col1.Mod(col2);
col = col1.Like("hello");
col = col1.RLike("hello");
col = col1.GetItem(1);
col = col1.GetItem("key");
col = col1.GetField("field");
col = col1.SubStr(col1, col2);
col = col1.SubStr(0, 5);
col = col1.Contains(col2);
col = col1.Contains("hello");
col = col1.StartsWith(col2);
col = col1.StartsWith("hello");
col = col1.EndsWith(col2);
col = col1.EndsWith("hello");
col = col1.Alias("alias");
col = col1.As("alias");
col = col1.As(new string[] { });
col = col1.As(new[] { "alias1", "alias2" });
col = col1.Name("alias");
col = col1.Cast("string");
col = col1.Desc();
col = col1.DescNullsFirst();
col = col1.DescNullsLast();
col = col1.Asc();
col = col1.AscNullsFirst();
col = col1.AscNullsLast();
col.Explain(true);
col = col1.BitwiseOR(col2);
col = col1.BitwiseAND(col2);
col = col1.BitwiseXOR(col2);
col = col1.Over(PartitionBy(col1));
col = col1.Over();
}
}
}

@@ -0,0 +1,95 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using Microsoft.Spark.Sql;
using Xunit;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class DataFrameFunctionsTests
{
private readonly SparkSession _spark;
private readonly DataFrame _df;
public DataFrameFunctionsTests(SparkFixture fixture)
{
_spark = fixture.Spark;
_df = _spark
.Read()
.Json(TestEnvironment.ResourceDirectory + "people.json");
}
[Fact]
public void TestDataFrameNaFunctionSignatures()
{
DataFrameNaFunctions dfNaFuncs = _df.Na();
var emptyColumn = new string[] { };
var validColumn = new string[] { "age" };
DataFrame df = dfNaFuncs.Drop("any");
df = dfNaFuncs.Drop("all");
df = dfNaFuncs.Drop(emptyColumn);
df = dfNaFuncs.Drop(validColumn);
df = dfNaFuncs.Drop("any", emptyColumn);
df = dfNaFuncs.Drop("all", validColumn);
df = dfNaFuncs.Drop(20);
df = dfNaFuncs.Drop(20, emptyColumn);
df = dfNaFuncs.Drop(20, validColumn);
df = dfNaFuncs.Fill(100L);
df = dfNaFuncs.Fill(100.0);
df = dfNaFuncs.Fill("hello");
df = dfNaFuncs.Fill(false);
df = dfNaFuncs.Fill(100L, emptyColumn);
df = dfNaFuncs.Fill(100L, validColumn);
df = dfNaFuncs.Fill(100.0, emptyColumn);
df = dfNaFuncs.Fill(100.0, validColumn);
df = dfNaFuncs.Fill("hello", emptyColumn);
df = dfNaFuncs.Fill("hello", validColumn);
df = dfNaFuncs.Fill(true, emptyColumn);
df = dfNaFuncs.Fill(true, validColumn);
df = dfNaFuncs.Fill(new Dictionary<string, int>() { { "age", 10 } });
df = dfNaFuncs.Fill(new Dictionary<string, long>() { { "age", 10L } });
df = dfNaFuncs.Fill(new Dictionary<string, double>() { { "age", 10.0 } });
df = dfNaFuncs.Fill(new Dictionary<string, string>() { { "age", "name" } });
df = dfNaFuncs.Fill(new Dictionary<string, bool>() { { "age", false } });
var doubleReplacement = new Dictionary<double, double>() { { 1.0, 5.0 } };
var boolReplacement = new Dictionary<bool, bool>() { { true, false } };
var stringReplacement = new Dictionary<string, string>() { { "a", "b" } };
df = dfNaFuncs.Replace("age", doubleReplacement);
df = dfNaFuncs.Replace("age", boolReplacement);
df = dfNaFuncs.Replace("age", stringReplacement);
df = dfNaFuncs.Replace(emptyColumn, doubleReplacement);
df = dfNaFuncs.Replace(validColumn, doubleReplacement);
df = dfNaFuncs.Replace(emptyColumn, boolReplacement);
df = dfNaFuncs.Replace(validColumn, boolReplacement);
df = dfNaFuncs.Replace(emptyColumn, stringReplacement);
df = dfNaFuncs.Replace(validColumn, stringReplacement);
}
[Fact]
public void TestDataFrameStatFunctionSignatures()
{
DataFrameStatFunctions stat = _df.Stat();
double[] result = stat.ApproxQuantile("age", new[] { 0.5, 0.5 }, 0.3);
double cov = stat.Cov("age", "age");
double corr = stat.Corr("age", "age", "pearson");
corr = stat.Corr("age", "age");
var columnNames = new[] { "age", "name" };
DataFrame df = stat.FreqItems(columnNames, 0.2);
df = stat.FreqItems(columnNames);
df = stat.SampleBy("age", new Dictionary<int, double> { { 1, 0.5 } }, 100);
}
}
}

@@ -0,0 +1,329 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Linq;
using System.Reflection;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Microsoft.Spark.Utils;
using Xunit;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class DataFrameTests
{
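// Reflection handle to the private UdfUtils.s_useArrow flag so the tests can exercise both the Arrow and pickling serialization paths.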
private static FieldInfo s_udfUtilsUseArrow =
typeof(UdfUtils).GetField("s_useArrow", BindingFlags.Static | BindingFlags.NonPublic);
private readonly SparkSession _spark;
private readonly DataFrame _df;
public DataFrameTests(SparkFixture fixture)
{
_spark = fixture.Spark;
_df = _spark
.Read()
.Schema("age INT, name STRING")
.Json(TestEnvironment.ResourceDirectory + "people.json");
}
[Fact]
public void TestCollect()
{
Row[] rows = _df.Collect().ToArray();
Assert.Equal(3, rows.Length);
Row row1 = rows[0];
Assert.Equal("Michael", row1.GetAs<string>("name"));
Assert.Null(row1.Get("age"));
Row row2 = rows[1];
Assert.Equal("Andy", row2.GetAs<string>("name"));
Assert.Equal(30, row2.GetAs<int>("age"));
Row row3 = rows[2];
Assert.Equal("Justin", row3.GetAs<string>("name"));
Assert.Equal(19, row3.GetAs<int>("age"));
}
[Theory]
[InlineData(true)]
[InlineData(false)]
public void TestUDF(bool useArrow)
{
bool originalUseArrow = GetUseArrowValue();
SetUseArrowValue(useArrow);
try
{
// Single UDF.
Func<Column, Column, Column> udf1 = Udf<int?, string, string>(
(age, name) => name + " is " + (age ?? 0));
{
Row[] rows = _df.Select(udf1(_df["age"], _df["name"])).Collect().ToArray();
Assert.Equal(3, rows.Length);
Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
}
// Chained UDFs.
Func<Column, Column> udf2 = Udf<string, string>(str => $"hello {str}!");
{
Row[] rows = _df
.Select(udf2(udf1(_df["age"], _df["name"])))
.Collect()
.ToArray();
Assert.Equal(3, rows.Length);
Assert.Equal("hello Michael is 0!", rows[0].GetAs<string>(0));
Assert.Equal("hello Andy is 30!", rows[1].GetAs<string>(0));
Assert.Equal("hello Justin is 19!", rows[2].GetAs<string>(0));
}
// Multiple UDFs:
{
Row[] rows = _df
.Select(udf1(_df["age"], _df["name"]), udf2(_df["name"]))
.Collect()
.ToArray();
Assert.Equal(3, rows.Length);
Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
Assert.Equal("hello Michael!", rows[0].GetAs<string>(1));
Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
Assert.Equal("hello Andy!", rows[1].GetAs<string>(1));
Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
Assert.Equal("hello Justin!", rows[2].GetAs<string>(1));
}
}
finally
{
SetUseArrowValue(originalUseArrow);
}
}
/// <summary>
/// Test signatures for APIs up to Spark 2.3.*.
/// </summary>
[Fact]
public void TestSignaturesV2_3_X()
{
Column col = _df["name"];
col = _df["age"];
DataFrame df = _df.ToDF();
df = df.ToDF("name2", "age2");
StructType schema = _df.Schema();
Assert.NotNull(schema);
_df.PrintSchema();
_df.Explain();
_df.Explain(true);
_df.Explain(false);
Assert.Equal(2, _df.Columns().ToArray().Length);
_df.IsLocal();
_df.IsStreaming();
// The following is required for *CheckPoint().
_spark.SparkContext.SetCheckpointDir(TestEnvironment.ResourceDirectory);
_df.Checkpoint();
_df.Checkpoint(false);
_df.LocalCheckpoint();
_df.LocalCheckpoint(false);
_df.WithWatermark("time", "10 minutes");
_df.Show();
_df.Show(10);
_df.Show(10, 10);
_df.Show(10, 10, true);
_df.Join(_df);
_df.Join(_df, "name");
_df.Join(_df, new[] { "name" });
_df.Join(_df, new[] { "name" }, "outer");
_df.Join(_df, _df["age"] == _df["age"]);
_df.Join(_df, _df["age"] == _df["age"], "outer");
_df.CrossJoin(_df);
_df.SortWithinPartitions("age");
_df.SortWithinPartitions("age", "name");
_df.SortWithinPartitions();
_df.SortWithinPartitions(_df["age"]);
_df.SortWithinPartitions(_df["age"], _df["name"]);
_df.Sort("age");
_df.Sort("age", "name");
_df.Sort();
_df.Sort(_df["age"]);
_df.Sort(_df["age"], _df["name"]);
_df.OrderBy("age");
_df.OrderBy("age", "name");
_df.OrderBy();
_df.OrderBy(_df["age"]);
_df.OrderBy(_df["age"], _df["name"]);
_df.Hint("broadcast");
_df.Hint("broadcast", new[] { "hello", "world" });
_df.Col("age");
_df.ColRegex("age");
_df.As("alias");
_df.Alias("alias");
_df.Select("age");
_df.Select("age", "name");
_df.Select();
_df.Select(_df["age"]);
_df.Select(_df["age"], _df["name"]);
_df.SelectExpr();
_df.SelectExpr("age * 2");
_df.SelectExpr("age * 2", "abs(age)");
_df.Filter(_df["age"] > 21);
_df.Filter("age > 21");
_df.Where(_df["age"] > 21);
_df.Where("age > 21");
_df.GroupBy("age");
_df.GroupBy("age", "name");
_df.GroupBy();
_df.GroupBy(_df["age"]);
_df.GroupBy(_df["age"], _df["name"]);
_df.Rollup("age");
_df.Rollup("age", "name");
_df.Rollup();
_df.Rollup(_df["age"]);
_df.Rollup(_df["age"], _df["name"]);
_df.Cube("age");
_df.Cube("age", "name");
_df.Cube();
_df.Cube(_df["age"]);
_df.Cube(_df["age"], _df["name"]);
_df.Agg(Avg(_df["age"]));
_df.Agg(Avg(_df["age"]), Avg(_df["name"]));
_df.Limit(10);
_df.Union(_df);
_df.UnionByName(_df);
_df.Intersect(_df);
_df.Except(_df);
_df.Sample(0.5);
_df.Sample(0.5, true);
_df.Sample(0.5, false, 12345);
_df.RandomSplit(new[] { 0.2, 0.8 });
_df.RandomSplit(new[] { 0.2, 0.8 }, 12345);
_df.WithColumn("age2", _df["age"]);
_df.WithColumnRenamed("age", "age2");
_df.Drop();
_df.Drop("age");
_df.Drop("age", "name");
_df.Drop(_df["age"]);
_df.DropDuplicates();
_df.DropDuplicates("age");
_df.DropDuplicates("age", "name");
_df.Describe();
_df.Describe("age");
_df.Describe("age", "name");
_df.Summary();
_df.Summary("count");
_df.Summary("count", "mean");
_df.Head(2);
_df.Head();
_df.First();
_df.Take(3).ToArray();
_df.Collect().ToArray();
_df.ToLocalIterator().ToArray();
_df.Count();
_df.Repartition(2);
_df.Repartition(2, _df["age"]);
_df.Repartition(_df["age"]);
_df.Repartition();
_df.RepartitionByRange(2, _df["age"]);
_df.RepartitionByRange(_df["age"]);
_df.Coalesce(1);
_df.Distinct();
_df.Persist();
_df.Cache();
_df.Unpersist();
_df.CreateTempView("view");
_df.CreateOrReplaceTempView("view");
_df.CreateGlobalTempView("global_view");
_df.CreateOrReplaceGlobalTempView("global_view");
}
/// <summary>
/// Test signatures for APIs introduced in Spark 2.4.*.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V2_4_0)]
public void TestSignaturesV2_4_X()
{
_df.IsEmpty();
_df.IntersectAll(_df);
_df.ExceptAll(_df);
}
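// The helpers below toggle the private UdfUtils.s_useArrow field (captured via reflection
// in s_udfUtilsUseArrow above) so that TestUDF can run with and without Arrow-based
// UDF serialization.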
private static bool GetUseArrowValue()
{
return (bool)s_udfUtilsUseArrow.GetValue(null);
}
private static void SetUseArrowValue(bool value)
{
s_udfUtilsUseArrow.SetValue(null, value);
}
}
}

View file

@ -0,0 +1,48 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Expressions;
using Xunit;
using static Microsoft.Spark.Sql.Expressions.Window;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class WindowSpecTests
{
/// <summary>
/// Test signatures for APIs up to Spark 2.3.*.
/// </summary>
[Fact]
public void TestSignaturesV2_3_X()
{
Column col1 = Column("age");
Column col2 = Column("name");
WindowSpec windowSpec = PartitionBy("age");
windowSpec = windowSpec.PartitionBy("age");
windowSpec = windowSpec.PartitionBy("age", "name");
windowSpec = windowSpec.PartitionBy();
windowSpec = windowSpec.PartitionBy(col1);
windowSpec = windowSpec.PartitionBy(col1, col2);
windowSpec = windowSpec.OrderBy("age");
windowSpec = windowSpec.OrderBy("age", "name");
windowSpec = windowSpec.OrderBy();
windowSpec = windowSpec.OrderBy(col1);
windowSpec = windowSpec.OrderBy(col1, col2);
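// Frame specification: the first two calls use the frame-boundary constants defined on
// Window, while the final RangeBetween overload takes the Column-returning
// UnboundedPreceding()/UnboundedFollowing() helpers from Functions.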
windowSpec = windowSpec.RowsBetween(
Sql.Expressions.Window.UnboundedPreceding,
Sql.Expressions.Window.UnboundedFollowing);
windowSpec = windowSpec.RangeBetween(
Sql.Expressions.Window.UnboundedPreceding,
Sql.Expressions.Window.UnboundedFollowing);
windowSpec = windowSpec.RangeBetween(UnboundedPreceding(), UnboundedFollowing());
}
}
}

View file

@ -0,0 +1,51 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Expressions;
using Xunit;
using static Microsoft.Spark.Sql.Expressions.Window;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class WindowTests
{
/// <summary>
/// Test signatures for APIs up to Spark 2.3.*.
/// </summary>
[Fact]
public void TestSignaturesV2_3_X()
{
Column col1 = Column("age");
Column col2 = Column("name");
_ = Sql.Expressions.Window.UnboundedPreceding;
_ = Sql.Expressions.Window.UnboundedFollowing;
_ = Sql.Expressions.Window.CurrentRow;
WindowSpec windowSpec = PartitionBy("age");
windowSpec = PartitionBy("age", "name");
windowSpec = PartitionBy();
windowSpec = PartitionBy(col1);
windowSpec = PartitionBy(col1, col2);
windowSpec = OrderBy("age");
windowSpec = OrderBy("age", "name");
windowSpec = OrderBy();
windowSpec = OrderBy(col1);
windowSpec = OrderBy(col1, col2);
windowSpec = RowsBetween(
Sql.Expressions.Window.UnboundedPreceding,
Sql.Expressions.Window.UnboundedFollowing);
windowSpec = RangeBetween(
Sql.Expressions.Window.UnboundedPreceding,
Sql.Expressions.Window.UnboundedFollowing);
windowSpec = RangeBetween(UnboundedPreceding(), UnboundedFollowing());
}
}
}

View file

@ -0,0 +1,703 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using Microsoft.Spark.E2ETest.Utils;
using Microsoft.Spark.Sql;
using Xunit;
using static Microsoft.Spark.Sql.Functions;
namespace Microsoft.Spark.E2ETest.IpcTests
{
[Collection("Spark E2E Tests")]
public class FunctionsTests
{
private readonly SparkSession _spark;
public FunctionsTests(SparkFixture fixture)
{
_spark = fixture.Spark;
}
/// <summary>
/// Test signatures for APIs up to Spark 2.3.*.
/// The purpose of this test is to ensure that JVM calls can be successfully made.
/// Note that this does not test the functionality of each function.
/// </summary>
[Fact]
public void TestSignaturesV2_3_X()
{
//////////////////////////////
// Basic Functions
//////////////////////////////
Column col = Column("col1");
col = Col("col2");
col = Lit(1);
col = Lit("some column");
col = Lit(col);
//////////////////////////////
// Sort Functions
//////////////////////////////
col = Asc("col");
col = AscNullsFirst("col");
col = AscNullsLast("col");
col = Desc("col");
col = DescNullsFirst("col");
col = DescNullsLast("col");
//////////////////////////////
// Aggregate Functions
//////////////////////////////
col = Column("col");
col = ApproxCountDistinct(col);
col = ApproxCountDistinct("col");
col = ApproxCountDistinct(col, 0.05);
col = ApproxCountDistinct("col", 0.05);
col = Avg(col);
col = Avg("col");
col = CollectList(col);
col = CollectList("col");
col = CollectSet(col);
col = CollectSet("col");
col = Corr(col, col);
col = Corr("col1", "col2");
col = Count(col);
col = Count("col");
col = CountDistinct(col);
col = CountDistinct(col, col);
col = CountDistinct(col, col, col);
col = CountDistinct("col1");
col = CountDistinct("col1", "col2");
col = CountDistinct("col1", "col2", "col3");
col = CovarPop(col, col);
col = CovarPop("col1", "col2");
col = CovarSamp(col, col);
col = CovarSamp("col1", "col2");
col = First(col);
col = First(col, true);
col = First(col, false);
col = First("col");
col = First("col", true);
col = First("col", false);
col = Grouping(col);
col = Grouping("col");
col = GroupingId();
col = GroupingId(col);
col = GroupingId(col, col);
col = GroupingId("col1");
col = GroupingId("col1", "col2");
col = GroupingId("col1", "col2", "col3");
col = Kurtosis(col);
col = Kurtosis("col");
col = Last(col);
col = Last(col, true);
col = Last(col, false);
col = Last("col");
col = Last("col", true);
col = Last("col", false);
col = Max(col);
col = Max("col");
col = Mean(col);
col = Mean("col");
col = Min(col);
col = Min("col");
col = Skewness(col);
col = Skewness("col");
col = Stddev(col);
col = Stddev("col");
col = StddevSamp(col);
col = StddevSamp("col");
col = StddevPop(col);
col = StddevPop("col");
col = Sum(col);
col = Sum("col");
col = SumDistinct(col);
col = SumDistinct("col");
col = Variance(col);
col = Variance("col");
col = VarSamp(col);
col = VarSamp("col");
col = VarPop(col);
col = VarPop("col");
//////////////////////////////
// Window Functions
//////////////////////////////
col = UnboundedPreceding();
col = UnboundedFollowing();
col = CurrentRow();
col = CumeDist();
col = DenseRank();
col = Lag(col, 0);
col = Lag(col, 2, "col2");
col = Lag("col", 0);
col = Lag("col", 2, "col2");
col = Lead(col, 0);
col = Lead(col, 2, "col2");
col = Lead("col", 0);
col = Lead("col", 2, "col2");
col = Ntile(100);
col = PercentRank();
col = Rank();
col = RowNumber();
//////////////////////////////
// Non-Aggregate Functions
//////////////////////////////
col = Column("col");
col = Abs(col);
col = Array();
col = Array(col);
col = Array(col, col);
col = Array("col1");
col = Array("col1", "col2");
col = Array("col1", "col2", "col3");
col = Map();
col = Map(col);
col = Map(col, col);
DataFrame df = _spark
.Read()
.Json(TestEnvironment.ResourceDirectory + "people.json");
df = Broadcast(df);
col = Coalesce();
col = Coalesce(col);
col = Coalesce(col, col);
col = InputFileName();
col = IsNaN(col);
col = IsNull(col);
col = MonotonicallyIncreasingId();
col = NaNvl(col, col);
col = Negate(col);
col = Not(col);
col = Rand(12345);
col = Rand();
col = Randn(12345);
col = Randn();
col = SparkPartitionId();
col = Sqrt(col);
col = Sqrt("col");
col = Struct();
col = Struct(col);
col = Struct(col, col);
col = Struct("col1");
col = Struct("col1", "col2");
col = Struct("col1", "col2", "col3");
col = When(col, col);
col = When(col, "col");
col = When(col, 12345);
col = BitwiseNOT(col);
col = Expr("expr");
//////////////////////////////
// Math Functions
//////////////////////////////
col = Column("col");
col = Acos(col);
col = Acos("col");
col = Asin(col);
col = Asin("col");
col = Atan(col);
col = Atan("col");
col = Atan2(col, col);
col = Atan2(col, "x");
col = Atan2("y", col);
col = Atan2("y", "x");
col = Atan2(col, 0.5);
col = Atan2("y", 0.5);
col = Atan2(0.5, col);
col = Atan2(0.5, "x");
col = Bin(col);
col = Bin("col");
col = Cbrt(col);
col = Cbrt("col");
col = Ceil(col);
col = Ceil("col");
col = Conv(col, 2, 10);
col = Cos(col);
col = Cos("col");
col = Cosh(col);
col = Cosh("col");
col = Exp(col);
col = Exp("col");
col = Expm1(col);
col = Expm1("col");
col = Factorial(col);
col = Floor(col);
col = Floor("col");
col = Greatest();
col = Greatest(col);
col = Greatest(col, col);
col = Greatest("col1");
col = Greatest("col1", "col2");
col = Greatest("col1", "col2", "col3");
col = Hex(col);
col = Unhex(col);
col = Hypot(col, col);
col = Hypot(col, "right");
col = Hypot("left", col);
col = Hypot("left", "right");
col = Hypot(col, 0.5);
col = Hypot("left", 0.5);
col = Hypot(0.5, col);
col = Hypot(0.5, "right");
col = Least();
col = Least(col);
col = Least(col, col);
col = Least("col1");
col = Least("col1", "col2");
col = Least("col1", "col2", "col3");
col = Log(col);
col = Log("col");
col = Log(2.0, col);
col = Log(2.0, "col");
col = Log10(col);
col = Log10("col");
col = Log1p(col);
col = Log1p("col");
col = Log2(col);
col = Log2("col");
col = Pow(col, col);
col = Pow(col, "right");
col = Pow("left", col);
col = Pow("left", "right");
col = Pow(col, 0.5);
col = Pow("left", 0.5);
col = Pow(0.5, col);
col = Pow(0.5, "right");
col = Pmod(col, col);
col = Rint(col);
col = Rint("col");
col = Round(col);
col = Round(col, 10);
col = Bround(col);
col = Bround(col, 10);
col = ShiftLeft(col, 4);
col = ShiftRight(col, 4);
col = ShiftRightUnsigned(col, 4);
col = Signum(col);
col = Signum("col");
col = Sin(col);
col = Sin("col");
col = Sinh(col);
col = Sinh("col");
col = Tan(col);
col = Tan("col");
col = Tanh(col);
col = Tanh("col");
col = Degrees(col);
col = Degrees("col");
col = Radians(col);
col = Radians("col");
//////////////////////////////
// Miscellaneous Functions
//////////////////////////////
col = Md5(col);
col = Sha1(col);
col = Sha2(col, 224);
col = Crc32(col);
col = Hash();
col = Hash(col);
col = Hash(col, col);
//////////////////////////////
// String Functions
//////////////////////////////
col = Ascii(col);
col = Base64(col);
col = ConcatWs(";");
col = ConcatWs(";", col);
col = ConcatWs(";", col, col);
col = Decode(col, "UTF-8");
col = Encode(col, "UTF-8");
col = FormatNumber(col, 2);
col = FormatString("%s %d");
col = FormatString("%s %d", col);
col = FormatString("%s %d", col, col);
col = InitCap(col);
col = Instr(col, "abc");
col = Length(col);
col = Lower(col);
col = Levenshtein(col, col);
col = Locate("abc", col);
col = Locate("abc", col, 3);
col = Lpad(col, 3, "pad");
col = Ltrim(col);
col = Ltrim(col, "\n");
col = RegexpExtract(col, "[a-z]", 0);
col = RegexpReplace(col, "[a-z]", "hello");
col = RegexpReplace(col, col, col);
col = Unbase64(col);
col = Rpad(col, 3, "pad");
col = Repeat(col, 3);
col = Rtrim(col);
col = Rtrim(col, "\n");
col = Soundex(col);
col = Split(col, "\t");
col = Substring(col, 0, 5);
col = SubstringIndex(col, ";", 5);
col = Translate(col, "abc", "edf");
col = Trim(col);
col = Trim(col, "\n");
col = Upper(col);
//////////////////////////////
// DateTime Functions
//////////////////////////////
col = AddMonths(col, 3);
col = CurrentDate();
col = CurrentTimestamp();
col = DateFormat(col, "format");
col = DateAdd(col, 5);
col = DateSub(col, 5);
col = DateDiff(col, col);
col = Year(col);
col = Quarter(col);
col = Month(col);
col = DayOfWeek(col);
col = DayOfMonth(col);
col = DayOfYear(col);
col = Hour(col);
col = LastDay(col);
col = Minute(col);
col = MonthsBetween(col, col);
col = NextDay(col, "Mon");
col = Second(col);
col = WeekOfYear(col);
col = FromUnixTime(col);
col = FromUnixTime(col, "yyyy-MM-dd HH:mm:ss");
col = UnixTimestamp();
col = UnixTimestamp(col);
col = UnixTimestamp(col, "yyyy-MM-dd HH:mm:ss");
col = ToTimestamp(col);
col = ToTimestamp(col, "yyyy-MM-dd HH:mm:ss");
col = ToDate(col);
col = ToDate(col, "yyyy-MM-dd HH:mm:ss");
col = Trunc(col, "yyyy");
col = DateTrunc("mon", col);
col = FromUtcTimestamp(col, "GMT+1");
col = ToUtcTimestamp(col, "GMT+1");
col = Window(col, "1 minute", "10 seconds");
col = Window(col, "1 minute", "10 seconds", "5 seconds");
col = Window(col, "1 minute");
//////////////////////////////
// Collection Functions
//////////////////////////////
col = ArrayContains(col, 12345);
col = ArrayContains(col, "str");
col = Concat();
col = Concat(col);
col = Concat(col, col);
col = Explode(col);
col = ExplodeOuter(col);
col = PosExplode(col);
col = PosExplodeOuter(col);
col = GetJsonObject(col, "abc.json");
col = JsonTuple(col, "a");
col = JsonTuple(col, "a", "b");
var options = new Dictionary<string, string>() { { "hello", "world" } };
col = FromJson(col, "a Int");
col = FromJson(col, "a Int", options);
col = ToJson(col);
col = ToJson(col, options);
col = Size(col);
col = SortArray(col);
col = SortArray(col, true);
col = SortArray(col, false);
col = Reverse(col);
col = MapKeys(col);
col = MapValues(col);
//////////////////////////////
// Udf Functions
//////////////////////////////
col = Udf(() => 1)();
col = Udf<int, int>((a1) => 1)(col);
col = Udf<int, int, int>((a1, a2) => 1)(col, col);
col = Udf<int, int, int, int>((a1, a2, a3) => 1)(col, col, col);
col = Udf<int, int, int, int, int>((a1, a2, a3, a4) => 1)(col, col, col, col);
col = Udf<int, int, int, int, int, int>(
(a1, a2, a3, a4, a5) => 1)(col, col, col, col, col);
col = Udf<int, int, int, int, int, int, int>(
(a1, a2, a3, a4, a5, a6) => 1)(col, col, col, col, col, col);
col = Udf<int, int, int, int, int, int, int, int>(
(a1, a2, a3, a4, a5, a6, a7) => 1)(col, col, col, col, col, col, col);
col = Udf<int, int, int, int, int, int, int, int, int>(
(a1, a2, a3, a4, a5, a6, a7, a8) => 1)(col, col, col, col, col, col, col, col);
col = Udf<int, int, int, int, int, int, int, int, int, int>(
(a1, a2, a3, a4, a5, a6, a7, a8, a9) => 1)(
col, col, col, col, col, col, col, col, col);
col = Udf<int, int, int, int, int, int, int, int, int, int, int>(
(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) => 1)(
col, col, col, col, col, col, col, col, col, col);
col = CallUDF("udf");
col = CallUDF("udf", col);
col = CallUDF("udf", col, col);
}
/// <summary>
/// Test signatures for APIs introduced in Spark 2.4.*.
/// </summary>
[SkipIfSparkVersionIsLessThan(Versions.V2_4_0)]
public void TestSignaturesV2_4_X()
{
Column col = Column("col");
col = MapFromArrays(col, col);
col = MonthsBetween(col, col, false);
col = FromUtcTimestamp(col, col);
col = ToUtcTimestamp(col, col);
col = ArraysOverlap(col, col);
col = Slice(col, 0, 4);
col = ArrayJoin(col, ":", "replacement");
col = ArrayJoin(col, ":");
col = ArrayPosition(col, 1);
col = ElementAt(col, 1);
col = ArraySort(col);
col = ArrayRemove(col, "elementToRemove");
col = ArrayDistinct(col);
col = ArrayIntersect(col, col);
col = ArrayUnion(col, col);
col = ArrayExcept(col, col);
var options = new Dictionary<string, string>() { { "hello", "world" } };
Column schema = SchemaOfJson("[{\"col\":0}]");
col = FromJson(col, schema);
col = FromJson(col, schema, options);
col = SchemaOfJson("{}");
col = SchemaOfJson(col);
col = ArrayMin(col);
col = ArrayMax(col);
col = Shuffle(col);
col = Reverse(col);
col = Flatten(col);
col = Sequence(col, col, col);
col = Sequence(col, col);
col = ArrayRepeat(col, col);
col = ArrayRepeat(col, 5);
col = MapFromEntries(col);
col = ArraysZip();
col = ArraysZip(col);
col = ArraysZip(col, col);
col = MapConcat();
col = MapConcat(col);
col = MapConcat(col, col);
}
}
}

View file

@ -0,0 +1,30 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
<IsPackable>false</IsPackable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="coverlet.msbuild" Version="2.4.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
<PackageReference Include="xunit" Version="2.4.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
</ItemGroup>
<ItemGroup>
<Content Include="Resources\*">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
</Content>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
<ProjectReference Include="..\Microsoft.Spark\Microsoft.Spark.csproj" />
</ItemGroup>
</Project>

View file

@ -0,0 +1,4 @@
{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}

View file

@ -0,0 +1,4 @@
# Set everything to be logged to the console
log4j.rootCategory=ERROR,console
# Use NullAppender for E2E testing. There is a deadlock issue using ConsoleAppender when the JVM process is launched from the C# process.
log4j.appender.console=org.apache.log4j.varia.NullAppender

View file

@ -0,0 +1,3 @@
name;age;job
Jorge;30;Developer
Bob;32;Developer

View file

@ -0,0 +1,3 @@
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

View file

@ -0,0 +1,3 @@
Michael, 29
Andy, 30
Justin, 19

Binary data
src/csharp/Microsoft.Spark.E2ETest/Resources/users.parquet Normal file

Binary file not shown.

View file

@ -0,0 +1,165 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Diagnostics;
using System.IO;
using System.Runtime.InteropServices;
using Microsoft.Spark.Sql;
using Xunit;
namespace Microsoft.Spark.E2ETest
{
/// <summary>
/// SparkFixture acts as a global fixture that starts a Spark application in debug
/// mode through spark-submit. It also provides a default SparkSession
/// object that any test can use.
/// </summary>
public class SparkFixture : IDisposable
{
private Process _process = new Process();
internal SparkSession Spark { get; }
public SparkFixture()
{
string workerPathKey = Services.ConfigurationService.WorkerPathSettingKey;
#if NET461
// Set the path for the worker executable to the location of the current
// assembly, since the xunit framework copies Microsoft.Spark.dll to an
// isolated location for testing; by default, the worker executable's directory
// is derived from the location of Microsoft.Spark.dll.
Environment.SetEnvironmentVariable(
workerPathKey,
AppDomain.CurrentDomain.BaseDirectory);
#elif NETCOREAPP2_1
// For .NET Core, the user must have published the worker as a standalone
// executable and set DotnetWorkerPath to the published directory.
if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable(workerPathKey)))
{
throw new Exception(
$"Environment variable '{workerPathKey}' must be set for .NET Core.");
}
#else
// Compile-time error for unsupported frameworks.
throw new NotSupportedException("Unsupported target framework.");
#endif
BuildSparkCmd(out var filename, out var args);
// Configure the process using the StartInfo properties.
_process.StartInfo.FileName = filename;
_process.StartInfo.Arguments = args;
// UseShellExecute defaults to true in .NET Framework,
// but defaults to false in .NET Core. To support both, set it
// to false which is required for stream redirection.
_process.StartInfo.UseShellExecute = false;
_process.StartInfo.RedirectStandardInput = true;
_process.StartInfo.RedirectStandardOutput = true;
_process.StartInfo.RedirectStandardError = true;
bool isSparkReady = false;
_process.OutputDataReceived += (sender, arguments) =>
{
// The Scala-side driver for .NET emits the following message once it is
// launched and ready to accept connections.
if (!isSparkReady &&
arguments.Data.Contains("Backend running debug mode"))
{
isSparkReady = true;
}
};
_process.Start();
_process.BeginOutputReadLine();
bool processExited = false;
while (!isSparkReady && !processExited)
{
processExited = _process.WaitForExit(500);
}
if (processExited)
{
_process.Dispose();
// The process should not have exited.
throw new Exception(
$"Process exited prematurely with '{filename} {args}'.");
}
Spark = SparkSession
.Builder()
.AppName("Microsoft.Spark.E2ETest")
.GetOrCreate();
}
public void Dispose()
{
Spark.Dispose();
// CSparkRunner will exit upon receiving newline from
// the standard input stream.
_process.StandardInput.WriteLine("done");
_process.StandardInput.Flush();
_process.WaitForExit();
}
private void BuildSparkCmd(out string filename, out string args)
{
string sparkHome = SparkSettings.SparkHome;
// Build the executable name.
char sep = Path.DirectorySeparatorChar;
filename = $"{sparkHome}{sep}bin{sep}spark-submit";
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
{
filename += ".cmd";
}
if (!File.Exists(filename))
{
throw new FileNotFoundException($"{filename} does not exist.");
}
// Build the arguments for the spark-submit.
string classArg = "--class org.apache.spark.deploy.DotnetRunner";
string curDir = AppDomain.CurrentDomain.BaseDirectory;
string jarPrefix = GetJarPrefix(sparkHome);
string scalaDir = $"{curDir}{sep}..{sep}..{sep}..{sep}..{sep}..{sep}scala";
string jarDir = $"{scalaDir}{sep}{jarPrefix}{sep}target";
string jar = $"{jarDir}{sep}{jarPrefix}-0.1.0.jar";
if (!File.Exists(jar))
{
throw new FileNotFoundException($"{jar} does not exist.");
}
// If log4j.properties exists in the SPARK_HOME/conf directory, Spark 2.3.* through
// 2.4.0 hangs in the E2E tests. The reverse is true for Spark 2.4.1: the tests hang
// if log4j.properties does not exist.
// Note that the hang happens in the JVM when it tries to append a console logger (log4j).
// The solution is to use a custom log configuration that appends a NullLogger, which
// works across all Spark versions.
string resourceUri = new Uri(TestEnvironment.ResourceDirectory).AbsoluteUri;
string logOption = $"--conf spark.driver.extraJavaOptions=-Dlog4j.configuration=" +
$"{resourceUri}/log4j.properties";
args = $"{logOption} {classArg} --master local {jar} debug";
}
private string GetJarPrefix(string sparkHome)
{
Version sparkVersion = SparkSettings.Version;
return $"microsoft-spark-{sparkVersion.Major}.{sparkVersion.Minor}.x";
}
}
[CollectionDefinition("Spark E2E Tests")]
public class SparkCollection : ICollectionFixture<SparkFixture>
{
// This class has no code, and is never created. Its purpose is simply
// to be the place to apply [CollectionDefinition] and all the
// ICollectionFixture<> interfaces.
}
}

View file

@ -0,0 +1,41 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.IO;
using System.Linq;
using Xunit.Sdk;
namespace Microsoft.Spark.E2ETest
{
internal static class SparkSettings
{
internal static Version Version { get; private set; }
internal static string SparkHome { get; private set; }
static SparkSettings()
{
InitSparkHome();
InitVersion();
}
private static void InitSparkHome()
{
SparkHome = Environment.GetEnvironmentVariable("SPARK_HOME");
if (SparkHome == null)
{
throw new NullException("SPARK_HOME environment variable is not set.");
}
}
private static void InitVersion()
{
// First line of the RELEASE file under SPARK_HOME will be something similar to:
// Spark 2.3.2 built for Hadoop 2.7.3
string firstLine =
File.ReadLines($"{SparkHome}{Path.DirectorySeparatorChar}RELEASE").First();
Version = new Version(firstLine.Split(' ')[1]);
}
}
}

View file

@ -0,0 +1,33 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.IO;
namespace Microsoft.Spark.E2ETest
{
/// <summary>
/// TestEnvironment provides functionality related to the E2E test environment.
/// </summary>
internal static class TestEnvironment
{
private static string s_resourceDirectory;
internal static string ResourceDirectory
{
get
{
if (s_resourceDirectory is null)
{
s_resourceDirectory =
AppDomain.CurrentDomain.BaseDirectory +
Path.DirectorySeparatorChar +
"Resources" +
Path.DirectorySeparatorChar;
}
return s_resourceDirectory;
}
}
}
}

View file

@ -0,0 +1,20 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Xunit;
namespace Microsoft.Spark.E2ETest.Utils
{
public sealed class SkipIfSparkVersionIsLessThan : FactAttribute
{
public SkipIfSparkVersionIsLessThan(string version)
{
if (SparkSettings.Version < new Version(version))
{
Skip = $"Ignore on Spark version ({SparkSettings.Version}) <= {version}";
}
}
}
}

View file

@ -0,0 +1,117 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.Spark.Sql;
using Microsoft.Spark.UnitTest.TestUtils;
using Xunit;
namespace Microsoft.Spark.UnitTest
{
public class CommandSerDeTests
{
[Fact]
public void TestCommandSerDeForSqlPickling()
{
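// Round-trip a pickling UDF through CommandSerDe.Serialize/Deserialize and verify both
// the recorded serialization modes and the behavior of the rehydrated delegate.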
var udfWrapper = new Sql.PicklingUdfWrapper<string, string>((str) => $"hello {str}");
var workerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute);
var serializedCommand = Utils.CommandSerDe.Serialize(
workerFunction.Func,
Utils.CommandSerDe.SerializedMode.Row,
Utils.CommandSerDe.SerializedMode.Row);
using (var ms = new MemoryStream(serializedCommand))
{
var deserializedWorkerFunction = new Sql.PicklingWorkerFunction(
Utils.CommandSerDe.Deserialize<Sql.PicklingWorkerFunction.ExecuteDelegate>(
ms,
out Utils.CommandSerDe.SerializedMode serializerMode,
out Utils.CommandSerDe.SerializedMode deserializerMode,
out var runMode));
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
Assert.Equal("N", runMode);
var result = deserializedWorkerFunction.Func(0, new[] { "spark" }, new[] { 0 });
Assert.Equal("hello spark", result);
}
}
[Fact]
public void TestCommandSerDeForSqlArrow()
{
var udfWrapper = new ArrowUdfWrapper<string, string>((str) => $"hello {str}");
var workerFunction = new ArrowWorkerFunction(udfWrapper.Execute);
var serializedCommand = Utils.CommandSerDe.Serialize(
workerFunction.Func,
Utils.CommandSerDe.SerializedMode.Row,
Utils.CommandSerDe.SerializedMode.Row);
using (var ms = new MemoryStream(serializedCommand))
{
var deserializedWorkerFunction = new ArrowWorkerFunction(
Utils.CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
ms,
out Utils.CommandSerDe.SerializedMode serializerMode,
out Utils.CommandSerDe.SerializedMode deserializerMode,
out var runMode));
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
Assert.Equal("N", runMode);
Apache.Arrow.IArrowArray input = ArrowArrayHelpers.ToArrowArray(new[] { "spark" });
Apache.Arrow.IArrowArray result =
deserializedWorkerFunction.Func(0, new[] { input }, new[] { 0 });
ArrowTestUtils.AssertEquals("hello spark", result);
}
}
[Fact]
public void TestCommandSerDeForRDD()
{
// Construct the UDF tree such that func1, func2, and func3
// are executed in that order.
var func1 = new RDD.WorkerFunction(
new RDD<int>.MapUdfWrapper<int, int>((a) => a + 3).Execute);
var func2 = new RDD.WorkerFunction(
new RDD<int>.MapUdfWrapper<int, int>((a) => a * 2).Execute);
var func3 = new RDD.WorkerFunction(
new RDD<int>.MapUdfWrapper<int, int>((a) => a + 5).Execute);
var chainedFunc1 = RDD.WorkerFunction.Chain(func1, func2);
var chainedFunc2 = RDD.WorkerFunction.Chain(chainedFunc1, func3);
var serializedCommand = Utils.CommandSerDe.Serialize(
chainedFunc2.Func,
Utils.CommandSerDe.SerializedMode.Byte,
Utils.CommandSerDe.SerializedMode.Byte);
using (var ms = new MemoryStream(serializedCommand))
{
var deserializedWorkerFunction = new RDD.WorkerFunction(
Utils.CommandSerDe.Deserialize<RDD.WorkerFunction.ExecuteDelegate>(
ms,
out Utils.CommandSerDe.SerializedMode serializerMode,
out Utils.CommandSerDe.SerializedMode deserializerMode,
out var runMode));
Assert.Equal(Utils.CommandSerDe.SerializedMode.Byte, serializerMode);
Assert.Equal(Utils.CommandSerDe.SerializedMode.Byte, deserializerMode);
Assert.Equal("N", runMode);
IEnumerable<object> result =
deserializedWorkerFunction.Func(0, new object[] { 1, 2, 3 });
Assert.Equal(new[] { 13, 15, 17 }, result.Cast<int>());
}
}
}
}

View file

@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
<RootNamespace>Microsoft.Spark.UnitTest</RootNamespace>
<IsPackable>false</IsPackable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="coverlet.msbuild" Version="2.4.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
<PackageReference Include="Moq" Version="4.10.0" />
<PackageReference Include="System.Memory" Version="4.5.2" />
<PackageReference Include="xunit" Version="2.4.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
</ItemGroup>
<ItemGroup Condition=" '$(TargetFramework)' != 'netcoreapp2.1' ">
<Reference Include="System" />
<Reference Include="Microsoft.CSharp" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Microsoft.Spark\Microsoft.Spark.csproj" />
</ItemGroup>
</Project>

View file

@ -0,0 +1,70 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.IO;
using Microsoft.Spark.Interop.Ipc;
using Xunit;
namespace Microsoft.Spark.UnitTest
{
public class SerDeTests
{
[Fact]
public void TestReadAndWrite()
{
using (var ms = new MemoryStream())
{
// Test bool.
SerDe.Write(ms, true);
ms.Seek(0, SeekOrigin.Begin);
Assert.True(SerDe.ReadBool(ms));
ms.Seek(0, SeekOrigin.Begin);
SerDe.Write(ms, false);
ms.Seek(0, SeekOrigin.Begin);
Assert.False(SerDe.ReadBool(ms));
ms.Seek(0, SeekOrigin.Begin);
// Test int.
SerDe.Write(ms, 12345);
ms.Seek(0, SeekOrigin.Begin);
Assert.Equal(12345, SerDe.ReadInt32(ms));
ms.Seek(0, SeekOrigin.Begin);
// Test long.
SerDe.Write(ms, 123456789000);
ms.Seek(0, SeekOrigin.Begin);
Assert.Equal(123456789000, SerDe.ReadInt64(ms));
ms.Seek(0, SeekOrigin.Begin);
// Test double.
SerDe.Write(ms, Math.PI);
ms.Seek(0, SeekOrigin.Begin);
Assert.Equal(Math.PI, SerDe.ReadDouble(ms));
ms.Seek(0, SeekOrigin.Begin);
// Test string.
SerDe.Write(ms, "hello world!");
ms.Seek(0, SeekOrigin.Begin);
Assert.Equal("hello world!", SerDe.ReadString(ms));
ms.Seek(0, SeekOrigin.Begin);
}
}
[Fact]
public void TestReadBytes()
{
// Test the case where an invalid length is given.
Assert.Throws<ArgumentOutOfRangeException>(
() => SerDe.ReadBytes(new MemoryStream(), -1));
// Test reading null length.
var ms = new MemoryStream();
SerDe.Write(ms, (int)SpecialLengths.NULL);
ms.Seek(0, SeekOrigin.Begin);
Assert.Null(SerDe.ReadBytes(ms));
}
}
}

View file

@ -0,0 +1,488 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Expressions;
using Moq;
using Xunit;
namespace Microsoft.Spark.UnitTest
{
public class ColumnTestsFixture : IDisposable
{
internal Mock<IJvmBridge> MockJvm { get; }
public ColumnTestsFixture()
{
MockJvm = new Mock<IJvmBridge>();
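// Every static and non-static JVM call on the mock returns a JvmObjectReference with
// id "result", which the tests use to verify that the returned Column wraps the object
// produced by the bridge call.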
MockJvm
.Setup(m => m.CallStaticJavaMethod(
It.IsAny<string>(),
It.IsAny<string>(),
It.IsAny<object>()))
.Returns(
new JvmObjectReference("result", MockJvm.Object));
MockJvm
.Setup(m => m.CallStaticJavaMethod(
It.IsAny<string>(),
It.IsAny<string>(),
It.IsAny<object>(),
It.IsAny<object>()))
.Returns(
new JvmObjectReference("result", MockJvm.Object));
MockJvm
.Setup(m => m.CallStaticJavaMethod(
It.IsAny<string>(),
It.IsAny<string>(),
It.IsAny<object[]>()))
.Returns(
new JvmObjectReference("result", MockJvm.Object));
MockJvm
.Setup(m => m.CallNonStaticJavaMethod(
It.IsAny<JvmObjectReference>(),
It.IsAny<string>(),
It.IsAny<object>()))
.Returns(
new JvmObjectReference("result", MockJvm.Object));
MockJvm
.Setup(m => m.CallNonStaticJavaMethod(
It.IsAny<JvmObjectReference>(),
It.IsAny<string>(),
It.IsAny<object>(),
It.IsAny<object>()))
.Returns(
new JvmObjectReference("result", MockJvm.Object));
MockJvm
.Setup(m => m.CallNonStaticJavaMethod(
It.IsAny<JvmObjectReference>(),
It.IsAny<string>(),
It.IsAny<object[]>()))
.Returns(
new JvmObjectReference("result", MockJvm.Object));
}
public void Dispose()
{
}
}
public class ColumnTests : IClassFixture<ColumnTestsFixture>
{
private readonly Mock<IJvmBridge> _mockJvm;
public ColumnTests(ColumnTestsFixture fixture)
{
_mockJvm = fixture.MockJvm;
}
private static JvmObjectId GetId(IJvmObjectReferenceProvider provider) => provider.Reference.Id;
[Fact]
public void TestColumnNegateOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = -column1;
_mockJvm.Verify(m => m.CallStaticJavaMethod(
"org.apache.spark.sql.functions",
"negate",
column1), Times.Once);
Assert.Equal("result", GetId(column2));
}
[Fact]
public void TestNotOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = !column1;
_mockJvm.Verify(m => m.CallStaticJavaMethod(
"org.apache.spark.sql.functions",
"not",
column1), Times.Once);
Assert.Equal("result", GetId(column2));
}
[Fact]
public void TestEqualOperator()
{
{
// Column as a right operand.
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 == column2;
VerifyNonStaticCall(column1, "equalTo", column2);
Assert.Equal("result", GetId(result));
}
{
// String as a right operand.
// Note that any object can be used in place of string.
Column column1 = CreateColumn("col1");
Column result = column1 == "abc";
VerifyNonStaticCall(column1, "equalTo", "abc");
Assert.Equal("result", GetId(result));
}
}
[Fact]
public void TestNotEqualOperator()
{
{
// Column as a right operand.
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 != column2;
VerifyNonStaticCall(column1, "notEqual", column2);
Assert.Equal("result", GetId(result));
}
{
// String as a right operand.
// Note that any object can be used in place of string.
Column column1 = CreateColumn("col1");
Column result = column1 != "abc";
VerifyNonStaticCall(column1, "notEqual", "abc");
Assert.Equal("result", GetId(result));
}
}
[Fact]
public void TestGreaterThanOperator()
{
{
// Column as a right operand.
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 > column2;
VerifyNonStaticCall(column1, "gt", column2);
Assert.Equal("result", GetId(result));
}
{
// String as a right operand.
// Note that any object can be used in place of string.
Column column1 = CreateColumn("col1");
Column result = column1 > "abc";
VerifyNonStaticCall(column1, "gt", "abc");
Assert.Equal("result", GetId(result));
}
}
[Fact]
public void TestLessThanOperator()
{
{
// Column as a right operand.
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 < column2;
VerifyNonStaticCall(column1, "lt", column2);
Assert.Equal("result", GetId(result));
}
{
// String as a right operand.
// Note that any object can be used in place of string.
Column column1 = CreateColumn("col1");
Column result = column1 < "abc";
VerifyNonStaticCall(column1, "lt", "abc");
Assert.Equal("result", GetId(result));
}
}
[Fact]
public void TestLessThanEqualToOperator()
{
{
// Column as a right operand.
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 <= column2;
VerifyNonStaticCall(column1, "leq", column2);
Assert.Equal("result", GetId(result));
}
{
// String as a right operand.
// Note that any object can be used in place of string.
Column column1 = CreateColumn("col1");
Column result = column1 <= "abc";
VerifyNonStaticCall(column1, "leq", "abc");
Assert.Equal("result", GetId(result));
}
}
[Fact]
public void TestGreaterThanEqualToOperator()
{
{
// Column as a right operand.
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 >= column2;
VerifyNonStaticCall(column1, "geq", column2);
Assert.Equal("result", GetId(result));
}
{
// String as a right operand.
// Note that any object can be used in place of string.
Column column1 = CreateColumn("col1");
Column result = column1 >= "abc";
VerifyNonStaticCall(column1, "geq", "abc");
Assert.Equal("result", GetId(result));
}
}
[Fact]
public void TestAndOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 & column2;
VerifyNonStaticCall(column1, "and", column2);
Assert.Equal("result", GetId(result));
}
[Fact]
public void TestOrOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 | column2;
VerifyNonStaticCall(column1, "or", column2);
Assert.Equal("result", GetId(result));
}
[Fact]
public void TestPlusOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 + column2;
VerifyNonStaticCall(column1, "plus", column2);
Assert.Equal("result", GetId(result));
}
[Fact]
public void TestMinusOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 - column2;
VerifyNonStaticCall(column1, "minus", column2);
Assert.Equal("result", GetId(result));
}
[Fact]
public void TestMultiplyOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 * column2;
VerifyNonStaticCall(column1, "multiply", column2);
Assert.Equal("result", GetId(result));
}
[Fact]
public void TestDivideOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 / column2;
VerifyNonStaticCall(column1, "divide", column2);
Assert.Equal("result", GetId(result));
}
[Fact]
public void TestModOperator()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
Column result = column1 % column2;
VerifyNonStaticCall(column1, "mod", column2);
}
[Fact]
public void TestWhenCondition()
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
var value = 0;
column1.When(column2, value);
VerifyNonStaticCall(column1, "when", column2, value);
}
[Fact]
public void TestBetweenCondition()
{
Column column1 = CreateColumn("col1");
var val1 = 1;
var val2 = 2;
column1.Between(val1, val2);
VerifyNonStaticCall(column1, "between", val1, val2);
}
[Fact]
public void TestSubStr()
{
{
Column column1 = CreateColumn("col1");
var pos = 1;
var len = 2;
column1.SubStr(pos, len);
VerifyNonStaticCall(column1, "substr", pos, len);
}
{
Column column1 = CreateColumn("col1");
Column pos = CreateColumn("col2");
Column len = CreateColumn("col3");
column1.SubStr(pos, len);
VerifyNonStaticCall(column1, "substr", pos, len);
}
}
[Fact]
public void TestOver()
{
{
Column column1 = CreateColumn("col1");
var windowSpec =
new WindowSpec(new JvmObjectReference("windowSpec", _mockJvm.Object));
column1.Over();
VerifyNonStaticCall(column1, "over");
}
{
Column column1 = CreateColumn("col1");
var windowSpec =
new WindowSpec(new JvmObjectReference("windowSpec", _mockJvm.Object));
column1.Over(windowSpec);
VerifyNonStaticCall(column1, "over", windowSpec);
}
}
private void VerifyNonStaticCall(
IJvmObjectReferenceProvider obj,
string methodName,
object arg0)
{
_mockJvm.Verify(m => m.CallNonStaticJavaMethod(
obj.Reference,
methodName,
arg0));
}
private void VerifyNonStaticCall(
IJvmObjectReferenceProvider obj,
string methodName,
object arg0,
object arg1)
{
_mockJvm.Verify(m => m.CallNonStaticJavaMethod(
obj.Reference,
methodName,
arg0, arg1));
}
private void VerifyNonStaticCall(
IJvmObjectReferenceProvider obj,
string methodName,
params object[] args)
{
_mockJvm.Verify(m => m.CallNonStaticJavaMethod(
obj.Reference,
methodName,
args));
}
[Theory]
[InlineData("EqNullSafe", "eqNullSafe")]
[InlineData("Or", "or")]
[InlineData("And", "and")]
[InlineData("Contains", "contains")]
[InlineData("StartsWith", "startsWith")]
[InlineData("EndsWith", "endsWith")]
[InlineData("EqualTo", "equalTo")]
[InlineData("NotEqual", "notEqual")]
[InlineData("Gt", "gt")]
[InlineData("Lt", "lt")]
[InlineData("Leq", "leq")]
[InlineData("Geq", "geq")]
[InlineData("Otherwise", "otherwise")]
[InlineData("Plus", "plus")]
[InlineData("Minus", "minus")]
[InlineData("Multiply", "multiply")]
[InlineData("Divide", "divide")]
[InlineData("Mod", "mod")]
[InlineData("GetItem", "getItem")]
[InlineData("BitwiseOR", "bitwiseOR")]
[InlineData("BitwiseAND", "bitwiseAND")]
[InlineData("BitwiseXOR", "bitwiseXOR")]
public void TestNamedOperators(string funcName, string opName)
{
Column column1 = CreateColumn("col1");
Column column2 = CreateColumn("col2");
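// Look up the Column method by name so a single theory covers every binary operator
// that takes a Column operand; opName is the method expected to be invoked on the JVM side.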
System.Reflection.MethodInfo func = column1.GetType().GetMethod(
funcName,
new Type[] { typeof(Column) });
var result = func.Invoke(column1, new[] { column2 }) as Column;
VerifyNonStaticCall(column1, opName, column2);
Assert.Equal("result", GetId(result));
}
[Theory]
[InlineData("Contains", "contains")]
[InlineData("StartsWith", "startsWith")]
[InlineData("EndsWith", "endsWith")]
[InlineData("Alias", "alias")]
[InlineData("As", "alias")]
[InlineData("Name", "name")]
[InlineData("Cast", "cast")]
[InlineData("Otherwise", "otherwise")]
[InlineData("Like", "like")]
[InlineData("RLike", "rlike")]
[InlineData("GetItem", "getItem")]
[InlineData("GetField", "getField")]
public void TestNamedOperatorsWithString(string funcName, string opName)
{
// These operators take string as the operand.
Column column = CreateColumn("col");
var literal = "hello";
System.Reflection.MethodInfo func = column.GetType().GetMethod(
funcName,
new Type[] { typeof(string) });
var result = func.Invoke(column, new[] { literal }) as Column;
Assert.Equal("result", GetId(result));
VerifyNonStaticCall(column, opName, literal);
}
[Theory]
[InlineData("Asc", "asc")]
[InlineData("AscNullsFirst", "asc_nulls_first")]
[InlineData("AscNullsLast", "asc_nulls_last")]
[InlineData("Desc", "desc")]
[InlineData("DescNullsFirst", "desc_nulls_first")]
[InlineData("DescNullsLast", "desc_nulls_last")]
[InlineData("IsNaN", "isNaN")]
[InlineData("IsNull", "isNull")]
[InlineData("IsNotNull", "isNotNull")]
public void TestUnaryOperators(string funcName, string opName)
{
Column column = CreateColumn("col");
// Use an empty array of Type objects to get a method that takes no parameters.
System.Reflection.MethodInfo func =
column.GetType().GetMethod(funcName, Type.EmptyTypes);
var result = func.Invoke(column, null) as Column;
Assert.Equal("result", GetId(result));
VerifyNonStaticCall(column, opName);
}
private Column CreateColumn(string id)
{
return new Column(new JvmObjectReference(id, _mockJvm.Object));
}
}
}

View file

@ -0,0 +1,141 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Network;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Microsoft.Spark.UnitTest.TestUtils;
using Microsoft.Spark.Utils;
using Moq;
using Razorvine.Pickle;
using Xunit;
namespace Microsoft.Spark.UnitTest
{
public class RowTests
{
private readonly string _testJsonSchema =
@"{
""type"":""struct"",
""fields"":[
{
""name"":""age"",
""type"":""integer"",
""nullable"":true,
""metadata"":{}
},
{
""name"":""name"",
""type"":""string"",
""nullable"":false,
""metadata"":{}
}
]}";
[Fact]
public void RowTest()
{
var structFields = new List<StructField>()
{
new StructField("col1", new IntegerType()),
new StructField("col2", new StringType()),
};
var schema = new StructType(structFields);
var row = new Row(new object[] { 1, "abc" }, schema);
// Validate Size().
Assert.Equal(2, row.Size());
// Validate [] operator.
Assert.Equal(1, row[0]);
Assert.Equal("abc", row[1]);
// Validate Get*(int).
Assert.Equal(1, row.Get(0));
Assert.Equal("abc", row.Get(1));
Assert.Equal(1, row.GetAs<int>(0));
Assert.ThrowsAny<Exception>(() => row.GetAs<string>(0));
Assert.Equal("abc", row.GetAs<string>(1));
Assert.ThrowsAny<Exception>(() => row.GetAs<int>(1));
// Validate Get*(string).
Assert.Equal(1, row.Get("col1"));
Assert.Equal("abc", row.Get("col2"));
Assert.Equal(1, row.GetAs<int>("col1"));
Assert.ThrowsAny<Exception>(() => row.GetAs<string>("col1"));
Assert.Equal("abc", row.GetAs<string>("col2"));
Assert.ThrowsAny<Exception>(() => row.GetAs<int>("col2"));
}
[Fact]
public void RowConstructorTest()
{
Pickler pickler = CreatePickler();
var schema = (StructType)DataType.ParseDataType(_testJsonSchema);
var row1 = new Row(new object[] { 10, "name1" }, schema);
var row2 = new Row(new object[] { 15, "name2" }, schema);
var pickledBytes = pickler.dumps(new[] { row1, row2 });
// Note that the following will invoke RowConstructor.construct().
var unpickledData = PythonSerDe.GetUnpickledObjects(new MemoryStream(pickledBytes));
Assert.Equal(2, unpickledData.Length);
Assert.Equal(row1, (unpickledData[0] as RowConstructor).GetRow());
Assert.Equal(row2, (unpickledData[1] as RowConstructor).GetRow());
}
[Fact]
public void RowCollectorTest()
{
var stream = new MemoryStream();
Pickler pickler = CreatePickler();
var schema = (StructType)DataType.ParseDataType(_testJsonSchema);
// Pickle two rows in one batch.
var row1 = new Row(new object[] { 10, "name1" }, schema);
var row2 = new Row(new object[] { 15, "name2" }, schema);
var batch1 = pickler.dumps(new[] { row1, row2 });
SerDe.Write(stream, batch1.Length);
SerDe.Write(stream, batch1);
// Pickle one row in one batch.
var row3 = new Row(new object[] { 20, "name3" }, schema);
var batch2 = pickler.dumps(new[] { row3 });
SerDe.Write(stream, batch2.Length);
SerDe.Write(stream, batch2);
// Rewind the memory stream so that the row collector can read from the beginning.
stream.Seek(0, SeekOrigin.Begin);
// Set up the mock to return memory stream to which pickled data is written.
var socket = new Mock<ISocketWrapper>();
socket.Setup(m => m.InputStream).Returns(stream);
socket.Setup(m => m.OutputStream).Returns(stream);
var rowCollector = new RowCollector();
Row[] rows = rowCollector.Collect(socket.Object).ToArray();
Assert.Equal(3, rows.Length);
Assert.Equal(row1, rows[0]);
Assert.Equal(row2, rows[1]);
Assert.Equal(row3, rows[2]);
}
private Pickler CreatePickler()
{
new StructTypePickler().Register();
new RowPickler().Register();
return new Pickler();
}
}
}

View file

@ -0,0 +1,91 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Sql.Types;
using Newtonsoft.Json.Linq;
using Xunit;
namespace Microsoft.Spark.UnitTest
{
public class TypesTests
{
[Theory]
[InlineData("null")]
[InlineData("string")]
[InlineData("binary")]
[InlineData("boolean")]
[InlineData("date")]
[InlineData("timestamp")]
[InlineData("double")]
[InlineData("float")]
[InlineData("byte")]
[InlineData("integer")]
[InlineData("long")]
[InlineData("short")]
public void TestSimpleTypes(string typeName)
{
var atomicType = DataType.ParseDataType($@"""{typeName}""");
Assert.Equal(typeName, atomicType.TypeName);
Assert.Equal(typeName, atomicType.SimpleString);
}
[Fact]
public void TestArrayType()
{
var schemaJson =
@"{
""type"":""array"",
""elementType"":""integer"",
""containsNull"":true
}";
var arrayType = (ArrayType)DataType.ParseDataType(schemaJson);
Assert.Equal("array", arrayType.TypeName);
Assert.Equal("array<integer>", arrayType.SimpleString);
Assert.Equal("integer", arrayType.ElementType.TypeName);
Assert.True(arrayType.ContainsNull);
}
[Fact]
public void TestStructTypeAndStructFieldTypes()
{
var schemaJson =
@"{
""type"":""struct"",
""fields"":[
{
""name"":""age"",
""type"":""long"",
""nullable"":true,
""metadata"":{}
},
{
""name"":""name"",
""type"":""string"",
""nullable"":false,
""metadata"":{}
}
]}";
var structType = (StructType)DataType.ParseDataType(schemaJson);
Assert.Equal("struct", structType.TypeName);
Assert.Equal("struct<age:long,name:string>", structType.SimpleString);
Assert.Equal(2, structType.Fields.Count);
{
StructField field = structType.Fields[0];
Assert.Equal("age", field.Name);
Assert.Equal("long", field.DataType.TypeName);
Assert.True(field.IsNullable);
Assert.Equal(new JObject(), field.Metadata);
}
{
StructField field = structType.Fields[1];
Assert.Equal("name", field.Name);
Assert.Equal("string", field.DataType.TypeName);
Assert.False(field.IsNullable);
Assert.Equal(new JObject(), field.Metadata);
}
}
}
}

View file

@ -0,0 +1,20 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Apache.Arrow;
using Xunit;
namespace Microsoft.Spark.UnitTest.TestUtils
{
public static class ArrowTestUtils
{
public static void AssertEquals(string expectedValue, IArrowArray arrowArray)
{
Assert.IsType<StringArray>(arrowArray);
var stringArray = (StringArray)arrowArray;
Assert.Equal(1, stringArray.Length);
Assert.Equal(expectedValue, stringArray.GetString(0));
}
}
}

View file

@ -0,0 +1,92 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.IO;
using System.Text;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Razorvine.Pickle;
namespace Microsoft.Spark.UnitTest.TestUtils
{
/// <summary>
/// Custom pickler for StructType objects.
/// Refer to
/// spark/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
/// </summary>
internal class StructTypePickler : IObjectPickler
{
private readonly string _module = "pyspark.sql.types";
public void Register()
{
Pickler.registerCustomPickler(GetType(), this);
Pickler.registerCustomPickler(typeof(StructType), this);
}
public void pickle(object o, Stream stream, Pickler currentPickler)
{
if (!(o is StructType schema))
{
throw new InvalidOperationException("A StructType object is expected.");
}
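// Emit GLOBAL, the saved JSON schema, TUPLE1, and REDUCE so that unpickling calls
// pyspark.sql.types._parse_datatype_json_string(json) and rebuilds the schema on the
// Python side, mirroring EvaluatePython.scala.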
SerDe.Write(stream, Opcodes.GLOBAL);
SerDe.Write(stream, Encoding.UTF8.GetBytes(
$"{_module}\n_parse_datatype_json_string\n"));
currentPickler.save(schema.Json);
SerDe.Write(stream, Opcodes.TUPLE1);
SerDe.Write(stream, Opcodes.REDUCE);
}
}
/// <summary>
/// Custom pickler for Row objects.
/// Refer to
/// spark/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
/// </summary>
internal class RowPickler : IObjectPickler
{
private readonly string _module = "pyspark.sql.types";
public void Register()
{
Pickler.registerCustomPickler(GetType(), this);
Pickler.registerCustomPickler(typeof(Row), this);
}
public void pickle(object o, Stream stream, Pickler currentPickler)
{
if (o.Equals(this))
{
SerDe.Write(stream, Opcodes.GLOBAL);
SerDe.Write(stream, Encoding.UTF8.GetBytes(
$"{_module}\n_create_row_inbound_converter\n"));
}
else
{
if (!(o is Row row))
{
throw new InvalidOperationException("A Row object is expected.");
}
currentPickler.save(this);
currentPickler.save(row.Schema);
SerDe.Write(stream, Opcodes.TUPLE1);
SerDe.Write(stream, Opcodes.REDUCE);
SerDe.Write(stream, Opcodes.MARK);
for (int i = 0; i < row.Size(); ++i)
{
currentPickler.save(row.Get(i));
}
SerDe.Write(stream, Opcodes.TUPLE);
SerDe.Write(stream, Opcodes.REDUCE);
}
}
}
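/// <summary>
/// A minimal usage sketch (illustrative only; it assumes nothing beyond the
/// Pickler and Row APIs referenced above): register both custom picklers and
/// pickle a Row so that the Python side can rebuild it through the registered
/// pyspark.sql.types converter.
/// </summary>
internal static class PicklerUsageSketch
{
    internal static byte[] PickleRow(Row row)
    {
        // Registration is global on the Pickler type, so once per process is enough.
        new StructTypePickler().Register();
        new RowPickler().Register();

        // dumps() dispatches to RowPickler.pickle for the Row, which emits the
        // GLOBAL/TUPLE1/REDUCE opcode sequence shown above.
        return new Pickler().dumps(new[] { row });
    }
}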
}


@ -0,0 +1,268 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections.Generic;
using System.Linq;
using Apache.Arrow;
using Microsoft.Spark.Sql;
using Microsoft.Spark.UnitTest.TestUtils;
using Xunit;
namespace Microsoft.Spark.UnitTest
{
public class UdfWrapperTests
{
[Fact]
public void TestPicklingUdfWrapper0()
{
var udfWrapper = new PicklingUdfWrapper<int>(() => 10);
Assert.Equal(10, udfWrapper.Execute(0, null, null));
}
[Fact]
public void TestPicklingUdfWrapper1()
{
var udfWrapper = new PicklingUdfWrapper<string, string>(
(str1) => str1);
ValidatePicklingWrapper(1, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper2()
{
var udfWrapper = new PicklingUdfWrapper<string, string, string>(
(str1, str2) => str1 + str2);
ValidatePicklingWrapper(2, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper3()
{
var udfWrapper = new PicklingUdfWrapper<string, string, string, string>(
(str1, str2, str3) => str1 + str2 + str3);
ValidatePicklingWrapper(3, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper4()
{
var udfWrapper = new PicklingUdfWrapper<string, string, string, string, string>(
(str1, str2, str3, str4) => str1 + str2 + str3 + str4);
ValidatePicklingWrapper(4, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper5()
{
var udfWrapper = new PicklingUdfWrapper<string, string, string, string, string, string>(
(str1, str2, str3, str4, str5) => str1 + str2 + str3 + str4 + str5);
ValidatePicklingWrapper(5, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper6()
{
var udfWrapper = new PicklingUdfWrapper<
string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6)
=> str1 + str2 + str3 + str4 + str5 + str6);
ValidatePicklingWrapper(6, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper7()
{
var udfWrapper = new PicklingUdfWrapper<
string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7);
ValidatePicklingWrapper(7, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper8()
{
var udfWrapper = new PicklingUdfWrapper<
string, string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7, str8)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8);
ValidatePicklingWrapper(8, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper9()
{
var udfWrapper = new PicklingUdfWrapper<
string, string, string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7, str8, str9)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9);
ValidatePicklingWrapper(9, udfWrapper);
}
[Fact]
public void TestPicklingUdfWrapper10()
{
var udfWrapper = new PicklingUdfWrapper<
string, string, string, string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7, str8, str9, str10)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9 + str10);
ValidatePicklingWrapper(10, udfWrapper);
}
// Validates the given udfWrapper, whose internal UDF concatenates all the input strings.
private void ValidatePicklingWrapper(int numArgs, dynamic udfWrapper)
{
// Create one more input value than the given numArgs to validate
// that the indexing works correctly inside UdfWrapper.
var input = new List<string>();
for (int i = 0; i < numArgs + 1; ++i)
{
input.Add($"arg{i}");
}
// First, create argOffsets from 0 to numArgs - 1.
// For example, if numArgs is 3, the expected string is "arg0arg1arg2"
// since the argOffsets are { 0, 1, 2 }.
Assert.Equal(
string.Join("", input.GetRange(0, numArgs)),
udfWrapper.Execute(0, input.ToArray(), Enumerable.Range(0, numArgs).ToArray()));
// Then create argOffsets from 1 to numArgs.
// For example, if numArgs is 3, the expected string is "arg1arg2arg3"
// since the argOffsets are { 1, 2, 3 }.
Assert.Equal(
string.Join("", input.GetRange(1, numArgs)),
udfWrapper.Execute(0, input.ToArray(), Enumerable.Range(1, numArgs).ToArray()));
}
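// A minimal sketch (illustrative; it relies only on the Execute(splitId, input, argOffsets)
// shape used throughout these tests): argOffsets select which input columns feed
// the UDF, so { 1, 2 } reads input[1] and input[2] no matter how many columns
// the row actually carries.
private static void ArgOffsetsSketch()
{
    var wrapper = new PicklingUdfWrapper<string, string, string>((a, b) => a + b);
    object result = wrapper.Execute(0, new object[] { "x", "y", "z" }, new[] { 1, 2 });
    Assert.Equal("yz", result);
}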
[Fact]
public void TestArrowUdfWrapper0()
{
var udfWrapper = new ArrowUdfWrapper<int>(() => 10);
IArrowArray result = udfWrapper.Execute(0, null, null);
Assert.IsType<Int32Array>(result);
var intArray = (Int32Array)result;
Assert.Equal(1, intArray.Length);
Assert.Equal(10, intArray.Values[0]);
}
[Fact]
public void TestArrowUdfWrapper1()
{
var udfWrapper = new ArrowUdfWrapper<string, string>(
(str1) => str1);
ValidateArrowWrapper(1, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper2()
{
var udfWrapper = new ArrowUdfWrapper<string, string, string>(
(str1, str2) => str1 + str2);
ValidateArrowWrapper(2, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper3()
{
var udfWrapper = new ArrowUdfWrapper<string, string, string, string>(
(str1, str2, str3) => str1 + str2 + str3);
ValidateArrowWrapper(3, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper4()
{
var udfWrapper = new ArrowUdfWrapper<string, string, string, string, string>(
(str1, str2, str3, str4) => str1 + str2 + str3 + str4);
ValidateArrowWrapper(4, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper5()
{
var udfWrapper = new ArrowUdfWrapper<string, string, string, string, string, string>(
(str1, str2, str3, str4, str5) => str1 + str2 + str3 + str4 + str5);
ValidateArrowWrapper(5, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper6()
{
var udfWrapper = new ArrowUdfWrapper<
string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6)
=> str1 + str2 + str3 + str4 + str5 + str6);
ValidateArrowWrapper(6, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper7()
{
var udfWrapper = new ArrowUdfWrapper<
string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7);
ValidateArrowWrapper(7, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper8()
{
var udfWrapper = new ArrowUdfWrapper<
string, string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7, str8)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8);
ValidateArrowWrapper(8, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper9()
{
var udfWrapper = new ArrowUdfWrapper<
string, string, string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7, str8, str9)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9);
ValidateArrowWrapper(9, udfWrapper);
}
[Fact]
public void TestArrowUdfWrapper10()
{
var udfWrapper = new ArrowUdfWrapper<
string, string, string, string, string, string, string, string, string, string, string>(
(str1, str2, str3, str4, str5, str6, str7, str8, str9, str10)
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9 + str10);
ValidateArrowWrapper(10, udfWrapper);
}
// Validates the given udfWrapper, whose internal UDF concatenates all the input strings.
private void ValidateArrowWrapper(int numArgs, dynamic udfWrapper)
{
// Create one more input value than the given numArgs to validate
// that the indexing works correctly inside ArrowUdfWrapper.
var input = new IArrowArray[numArgs + 1];
var inputStrings = new List<string>();
for (int i = 0; i < input.Length; ++i)
{
inputStrings.Add($"arg{i}");
input[i] = ArrowArrayHelpers.ToArrowArray(new string[] { $"arg{i}" });
}
// First, create argOffsets from 0 to numArgs - 1.
// For example, if numArgs is 3, the expected string is "arg0arg1arg2"
// since the argOffsets are { 0, 1, 2 }.
ArrowTestUtils.AssertEquals(
string.Join("", inputStrings.GetRange(0, numArgs)),
udfWrapper.Execute(0, input, Enumerable.Range(0, numArgs).ToArray()));
// Then create argOffsets from 1 to numArgs.
// For example, if numArgs is 3, the expected string is "arg1arg2arg3"
// since the argOffsets are { 1, 2, 3 }.
ArrowTestUtils.AssertEquals(
string.Join("", inputStrings.GetRange(1, numArgs)),
udfWrapper.Execute(0, input, Enumerable.Range(1, numArgs).ToArray()));
}
}
}


@ -0,0 +1,164 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using Apache.Arrow;
using Microsoft.Spark.Sql;
using Microsoft.Spark.UnitTest.TestUtils;
using Xunit;
using static Microsoft.Spark.Sql.ArrowArrayHelpers;
namespace Microsoft.Spark.UnitTest
{
public class WorkerFunctionTests
{
[Fact]
public void TestPicklingWorkerFunction()
{
var func = new PicklingWorkerFunction(
new PicklingUdfWrapper<string, string>(
(str) => str).Execute);
string[] input = { "arg1" };
Assert.Equal(input[0], func.Func(0, input, new[] { 0 }));
}
[Fact]
public void TestChainingPicklingWorkerFunction()
{
var func1 = new PicklingWorkerFunction(
new PicklingUdfWrapper<int, string, string>(
(number, str) => $"{str}:{number}").Execute);
var func2 = new PicklingWorkerFunction(
new PicklingUdfWrapper<string, string>(
(str) => $"outer1:{str}").Execute);
var func3 = new PicklingWorkerFunction(
new PicklingUdfWrapper<string, string>(
(str) => $"outer2:{str}").Execute);
object[] input = { 100, "name" };
// Validate one-level chaining.
var chainedFunc1 = PicklingWorkerFunction.Chain(func1, func2);
Assert.Equal("outer1:name:100", chainedFunc1.Func(0, input, new[] { 0, 1 }));
// Validate two-level chaining.
var chainedFunc2 = PicklingWorkerFunction.Chain(chainedFunc1, func3);
Assert.Equal("outer2:outer1:name:100", chainedFunc2.Func(0, input, new[] { 0, 1 }));
}
[Fact]
public void TestInvalidChainingPickling()
{
var func1 = new PicklingWorkerFunction(
new PicklingUdfWrapper<int, string, string>(
(number, str) => $"{str}:{number}").Execute);
var func2 = new PicklingWorkerFunction(
new PicklingUdfWrapper<string, string>(
(str) => $"outer1:{str}").Execute);
object[] input = { 100, "name" };
// The argument order does not align since func2 is executed first.
var chainedFunc1 = PicklingWorkerFunction.Chain(func2, func1);
Assert.ThrowsAny<Exception>(() => chainedFunc1.Func(0, input, new[] { 0, 1 }));
}
[Fact]
public void TestArrowWorkerFunction()
{
var func = new ArrowWorkerFunction(
new ArrowUdfWrapper<string, string>(
(str) => str).Execute);
string[] input = { "arg1" };
ArrowTestUtils.AssertEquals(
input[0],
func.Func(0, new[] { ToArrowArray(input) }, new[] { 0 }));
}
/// <summary>
/// Tests that the ArrowWorkerFunction handles boolean types correctly
/// for both input and output.
/// </summary>
[Fact]
public void TestArrowWorkerFunctionForBool()
{
var func = new ArrowWorkerFunction(
new ArrowUdfWrapper<string, bool, bool>(
(str, flag) => flag || str.Contains("true")).Execute);
IArrowArray[] input = new[]
{
ToArrowArray(new[] { "arg1_true", "arg1_true", "arg1_false", "arg1_false" }),
ToArrowArray(new[] { true, false, true, false }),
};
var results = (BooleanArray)func.Func(0, input, new[] { 0, 1 });
Assert.Equal(4, results.Length);
Assert.True(results.GetBoolean(0));
Assert.True(results.GetBoolean(1));
Assert.True(results.GetBoolean(2));
Assert.False(results.GetBoolean(3));
}
[Fact]
public void TestChainingArrowWorkerFunction()
{
var func1 = new ArrowWorkerFunction(
new ArrowUdfWrapper<int, string, string>(
(number, str) => $"{str}:{number}").Execute);
var func2 = new ArrowWorkerFunction(
new ArrowUdfWrapper<string, string>(
(str) => $"outer1:{str}").Execute);
var func3 = new ArrowWorkerFunction(
new ArrowUdfWrapper<string, string>(
(str) => $"outer2:{str}").Execute);
Apache.Arrow.IArrowArray[] input = new[]
{
ToArrowArray(new[] { 100 }),
ToArrowArray(new[] { "name" })
};
// Validate one-level chaining.
var chainedFunc1 = ArrowWorkerFunction.Chain(func1, func2);
ArrowTestUtils.AssertEquals(
"outer1:name:100",
chainedFunc1.Func(0, input, new[] { 0, 1 }));
// Validate two-level chaining.
var chainedFunc2 = ArrowWorkerFunction.Chain(chainedFunc1, func3);
ArrowTestUtils.AssertEquals(
"outer2:outer1:name:100",
chainedFunc2.Func(0, input, new[] { 0, 1 }));
}
[Fact]
public void TestInvalidChainingArrow()
{
var func1 = new ArrowWorkerFunction(
new ArrowUdfWrapper<int, string, string>(
(number, str) => $"{str}:{number}").Execute);
var func2 = new ArrowWorkerFunction(
new ArrowUdfWrapper<string, string>(
(str) => $"outer1:{str}").Execute);
Apache.Arrow.IArrowArray[] input = new[]
{
ToArrowArray(new[] { 100 }),
ToArrowArray(new[] { "name" })
};
// The argument order does not align since func2 is executed first.
var chainedFunc1 = ArrowWorkerFunction.Chain(func2, func1);
Assert.ThrowsAny<Exception>(() => chainedFunc1.Func(0, input, new[] { 0, 1 }));
}
}
}


@ -0,0 +1,553 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Runtime.Serialization.Formatters.Binary;
using System.Threading;
using System.Threading.Tasks;
using Apache.Arrow;
using Apache.Arrow.Ipc;
using Apache.Arrow.Types;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Utils;
using Microsoft.Spark.Worker.Command;
using Razorvine.Pickle;
using Xunit;
using static Microsoft.Spark.Sql.ArrowArrayHelpers;
namespace Microsoft.Spark.Worker.UnitTest
{
public class CommandExecutorTests
{
[Fact]
public void TestPicklingSqlCommandExecutorWithSingleCommand()
{
var udfWrapper = new Sql.PicklingUdfWrapper<string, string>(
(str) => "udf: " + ((str is null) ? "NULL" : str));
var command = new SqlCommand()
{
ArgOffsets = new[] { 0 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
Commands = new[] { command }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
int numRows = 10;
// Write test data to the input stream.
var pickler = new Pickler();
for (int i = 0; i < numRows; ++i)
{
var pickled = pickler.dumps(
new[] { new object[] { (i % 2 == 0) ? null : i.ToString() } });
SerDe.Write(inputStream, pickled.Length);
SerDe.Write(inputStream, pickled);
}
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
inputStream.Seek(0, SeekOrigin.Begin);
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate that all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(10, stat.NumEntriesProcessed);
// Validate the output stream.
outputStream.Seek(0, SeekOrigin.Begin);
var unpickler = new Unpickler();
// One row was written per batch above, so 'numRows' batches need to be read.
List<object> rows = new List<object>();
for (int i = 0; i < numRows; ++i)
{
int length = SerDe.ReadInt32(outputStream);
byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object);
}
Assert.Equal(numRows, rows.Count);
// Validate the single command.
for (int i = 0; i < numRows; ++i)
{
Assert.Equal(
"udf: " + ((i % 2 == 0) ? "NULL" : i.ToString()),
(string)rows[i]);
}
// Validate all the data on the stream is read.
Assert.Equal(outputStream.Length, outputStream.Position);
}
}
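// A small helper sketch (an assumption for illustration; it mirrors the framing
// written inline above): each pickled batch is a 4-byte length followed by the
// payload, and the data section is terminated by END_OF_DATA_SECTION.
private static void WritePickledRows(Stream stream, IEnumerable<object[]> rows)
{
    var pickler = new Pickler();
    foreach (object[] row in rows)
    {
        byte[] pickled = pickler.dumps(new[] { row });
        SerDe.Write(stream, pickled.Length);
        SerDe.Write(stream, pickled);
    }
    SerDe.Write(stream, (int)SpecialLengths.END_OF_DATA_SECTION);
}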
[Fact]
public void TestPicklingSqlCommandExecutorWithMultiCommands()
{
var udfWrapper1 = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
var udfWrapper2 = new Sql.PicklingUdfWrapper<int, int, int>(
(arg1, arg2) => arg1 * arg2);
var command1 = new SqlCommand()
{
ArgOffsets = new[] { 0 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper1.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var command2 = new SqlCommand()
{
ArgOffsets = new[] { 1, 2 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper2.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
Commands = new[] { command1, command2 }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
int numRows = 10;
// Write test data to the input stream.
var pickler = new Pickler();
for (int i = 0; i < numRows; ++i)
{
byte[] pickled = pickler.dumps(
new[] { new object[] { i.ToString(), i, i } });
SerDe.Write(inputStream, pickled.Length);
SerDe.Write(inputStream, pickled);
}
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
inputStream.Seek(0, SeekOrigin.Begin);
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(10, stat.NumEntriesProcessed);
// Validate the output stream.
outputStream.Seek(0, SeekOrigin.Begin);
var unpickler = new Unpickler();
// One row was written per batch above, so 'numRows' batches need to be read.
List<object[]> rows = new List<object[]>();
for (int i = 0; i < numRows; ++i)
{
int length = SerDe.ReadInt32(outputStream);
byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object[]);
}
Assert.Equal(numRows, rows.Count);
for (int i = 0; i < numRows; ++i)
{
// There were two UDFs each of which produces one column.
object[] columns = rows[i];
Assert.Equal($"udf: {i}", (string)columns[0]);
Assert.Equal(i * i, (int)columns[1]);
}
// Validate all the data on the stream is read.
Assert.Equal(outputStream.Length, outputStream.Position);
}
}
[Fact]
public void TestPicklingSqlCommandExecutorWithEmptyInput()
{
var udfWrapper = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
var command = new SqlCommand()
{
ArgOffsets = new[] { 0 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
Commands = new[] { command }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
// Write test data to the input stream. For the empty input scenario,
// only send SpecialLengths.END_OF_DATA_SECTION.
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
inputStream.Seek(0, SeekOrigin.Begin);
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate that all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(0, stat.NumEntriesProcessed);
// Validate the output stream.
Assert.Equal(0, outputStream.Length);
}
}
[Fact]
public async Task TestArrowSqlCommandExecutorWithSingleCommand()
{
var udfWrapper = new Sql.ArrowUdfWrapper<string, string>((str) => $"udf: {str}");
var command = new SqlCommand()
{
ArgOffsets = new[] { 0 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
Commands = new[] { command }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
int numRows = 10;
// Write test data to the input stream.
Schema schema = new Schema.Builder()
.Field(b => b.Name("arg1").DataType(StringType.Default))
.Build();
var arrowWriter = new ArrowStreamWriter(inputStream, schema);
await arrowWriter.WriteRecordBatchAsync(
new RecordBatch(
schema,
new[]
{
ToArrowArray(
Enumerable.Range(0, numRows)
.Select(i => i.ToString())
.ToArray())
},
numRows));
inputStream.Seek(0, SeekOrigin.Begin);
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate that all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(numRows, stat.NumEntriesProcessed);
// Validate the output stream.
outputStream.Seek(0, SeekOrigin.Begin);
int arrowLength = SerDe.ReadInt32(outputStream);
Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
var arrowReader = new ArrowStreamReader(outputStream);
RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
Assert.Equal(numRows, outputBatch.Length);
Assert.Single(outputBatch.Arrays);
var array = (StringArray)outputBatch.Arrays.ElementAt(0);
// Validate the single command.
for (int i = 0; i < numRows; ++i)
{
Assert.Equal($"udf: {i}", array.GetString(i));
}
int end = SerDe.ReadInt32(outputStream);
Assert.Equal(0, end);
// Validate all the data on the stream is read.
Assert.Equal(outputStream.Length, outputStream.Position);
}
}
[Fact]
public async Task TestArrowSqlCommandExecutorWithMultiCommands()
{
var udfWrapper1 = new Sql.ArrowUdfWrapper<string, string>((str) => $"udf: {str}");
var udfWrapper2 = new Sql.ArrowUdfWrapper<int, int, int>((arg1, arg2) => arg1 * arg2);
var command1 = new SqlCommand()
{
ArgOffsets = new[] { 0 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper1.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var command2 = new SqlCommand()
{
ArgOffsets = new[] { 1, 2 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper2.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
Commands = new[] { command1, command2 }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
int numRows = 10;
// Write test data to the input stream.
Schema schema = new Schema.Builder()
.Field(b => b.Name("arg1").DataType(StringType.Default))
.Field(b => b.Name("arg2").DataType(Int32Type.Default))
.Field(b => b.Name("arg3").DataType(Int32Type.Default))
.Build();
var arrowWriter = new ArrowStreamWriter(inputStream, schema);
await arrowWriter.WriteRecordBatchAsync(
new RecordBatch(
schema,
new[]
{
ToArrowArray(
Enumerable.Range(0, numRows)
.Select(i => i.ToString())
.ToArray()),
ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
},
numRows));
inputStream.Seek(0, SeekOrigin.Begin);
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(numRows, stat.NumEntriesProcessed);
// Validate the output stream.
outputStream.Seek(0, SeekOrigin.Begin);
var arrowLength = SerDe.ReadInt32(outputStream);
Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
var arrowReader = new ArrowStreamReader(outputStream);
RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
Assert.Equal(numRows, outputBatch.Length);
Assert.Equal(2, outputBatch.Arrays.Count());
var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
for (int i = 0; i < numRows; ++i)
{
Assert.Equal($"udf: {i}", array1.GetString(i));
Assert.Equal(i * i, array2.Values[i]);
}
int end = SerDe.ReadInt32(outputStream);
Assert.Equal(0, end);
// Validate all the data on the stream is read.
Assert.Equal(outputStream.Length, outputStream.Position);
}
}
/// <summary>
/// Tests that, when Spark writes an input stream that contains only a
/// schema and no record batches, CommandExecutor writes the
/// appropriate response back.
/// </summary>
[Fact]
public void TestArrowSqlCommandExecutorWithEmptyInput()
{
var udfWrapper = new Sql.ArrowUdfWrapper<string, string>((str) => $"udf: {str}");
var command = new SqlCommand()
{
ArgOffsets = new[] { 0 },
NumChainedFunctions = 1,
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper.Execute),
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
Commands = new[] { command }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
// Write test data to the input stream.
Schema schema = new Schema.Builder()
.Field(b => b.Name("arg1").DataType(StringType.Default))
.Build();
var arrowWriter = new ArrowStreamWriter(inputStream, schema);
// The .NET ArrowStreamWriter doesn't currently support writing just a
// schema with no batches - but Java does. We use Reflection to simulate
// the request Spark sends.
MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
"WriteSchemaAsync",
BindingFlags.NonPublic | BindingFlags.Instance);
writeSchemaMethod.Invoke(
arrowWriter,
new object[] { schema, CancellationToken.None });
SerDe.Write(inputStream, 0);
inputStream.Seek(0, SeekOrigin.Begin);
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate that all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(0, stat.NumEntriesProcessed);
// Validate the output stream.
outputStream.Seek(0, SeekOrigin.Begin);
int arrowLength = SerDe.ReadInt32(outputStream);
Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
var arrowReader = new ArrowStreamReader(outputStream);
RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();
Assert.Equal(1, outputBatch.Schema.Fields.Count);
Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);
Assert.Equal(0, outputBatch.Length);
Assert.Single(outputBatch.Arrays);
var array = (StringArray)outputBatch.Arrays.ElementAt(0);
Assert.Equal(0, array.Length);
int end = SerDe.ReadInt32(outputStream);
Assert.Equal(0, end);
// Validate all the data on the stream is read.
Assert.Equal(outputStream.Length, outputStream.Position);
}
}
[Fact]
public void TestRDDCommandExecutor()
{
int mapUdf(int a) => a + 3;
var command = new RDDCommand()
{
WorkerFunction = new RDD.WorkerFunction(
new RDD<int>.MapUdfWrapper<int, int>(mapUdf).Execute),
SerializerMode = CommandSerDe.SerializedMode.Byte,
DeserializerMode = CommandSerDe.SerializedMode.Byte
};
var commandPayload = new Worker.CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.NON_UDF,
Commands = new[] { command }
};
using (var inputStream = new MemoryStream())
using (var outputStream = new MemoryStream())
{
// Write test data to the input stream.
var formatter = new BinaryFormatter();
var memoryStream = new MemoryStream();
var inputs = new[] { 0, 1, 2, 3, 4 };
var values = new List<byte[]>();
foreach (int input in inputs)
{
memoryStream.Position = 0;
formatter.Serialize(memoryStream, input);
values.Add(memoryStream.ToArray());
}
foreach (byte[] value in values)
{
SerDe.Write(inputStream, value.Length);
SerDe.Write(inputStream, value);
}
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
inputStream.Seek(0, SeekOrigin.Begin);
// Execute the command.
CommandExecutorStat stat = new CommandExecutor().Execute(
inputStream,
outputStream,
0,
commandPayload);
// Validate all the data on the stream is read.
Assert.Equal(inputStream.Length, inputStream.Position);
Assert.Equal(5, stat.NumEntriesProcessed);
// Validate the output stream.
outputStream.Seek(0, SeekOrigin.Begin);
for (int i = 0; i < inputs.Length; ++i)
{
Assert.True(SerDe.ReadInt32(outputStream) > 0);
Assert.Equal(
mapUdf(i),
formatter.Deserialize(outputStream));
}
// Validate all the data on the stream is read.
Assert.Equal(outputStream.Length, outputStream.Position);
}
}
}
}


@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
<IsPackable>false</IsPackable>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="coverlet.msbuild" Version="2.4.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
<PackageReference Include="xunit" Version="2.4.1" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
<ProjectReference Include="..\Microsoft.Spark\Microsoft.Spark.csproj" />
</ItemGroup>
</Project>


@ -0,0 +1,154 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.IO;
using System.Linq;
using System.Net;
using Microsoft.Spark.Network;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Utils;
using Microsoft.Spark.Worker.Processor;
using Xunit;
namespace Microsoft.Spark.Worker.UnitTest
{
public class PayloadProcessorTests
{
[Theory]
[InlineData(Versions.V2_3_0)]
[InlineData(Versions.V2_3_1)]
[InlineData(Versions.V2_3_2)]
[InlineData(Versions.V2_3_3)]
[InlineData(Versions.V2_4_0)]
public void TestPayloadProcessor(string version)
{
CommandPayload commandPayload = TestData.GetDefaultCommandPayload();
PayloadWriter payloadWriter = new PayloadWriterFactory().Create(new Version(version));
Payload payload = TestData.GetDefaultPayload();
Payload actualPayload = null;
using (var outStream = new MemoryStream())
{
payloadWriter.Write(outStream, payload, commandPayload);
using (var inputStream = new MemoryStream(outStream.ToArray()))
{
actualPayload =
new PayloadProcessor(payloadWriter.Version).Process(inputStream);
}
}
// Validate the read payload.
Assert.Equal(payload.SplitIndex, actualPayload.SplitIndex);
Assert.Equal(payload.Version, actualPayload.Version);
Assert.Equal(payload.TaskContext, actualPayload.TaskContext);
Assert.Equal(payload.SparkFilesDir, actualPayload.SparkFilesDir);
Assert.Equal(payload.IncludeItems, actualPayload.IncludeItems);
Assert.Equal(payload.BroadcastVariables.Count, actualPayload.BroadcastVariables.Count);
ValidateCommandPayload(commandPayload, actualPayload.Command);
// Validate the UDFs.
var actualCommand1 = (SqlCommand)actualPayload.Command.Commands[0];
var result1 = ((PicklingWorkerFunction)actualCommand1.WorkerFunction).Func(
0,
new object[] { "hello", 10, 20 },
actualCommand1.ArgOffsets);
Assert.Equal("udf2 udf1 hello", result1);
var actualCommand2 = (SqlCommand)actualPayload.Command.Commands[1];
var result2 = ((PicklingWorkerFunction)actualCommand2.WorkerFunction).Func(
0,
new object[] { "hello", 10, 20 },
actualCommand2.ArgOffsets);
Assert.Equal(30, result2);
}
[Fact]
public void TestClosedStreamWithSocket()
{
var commandPayload = new CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
Commands = new Command[] { }
};
PayloadWriter payloadWriter = new PayloadWriterFactory().Create();
Payload payload = TestData.GetDefaultPayload();
var serverListener = new DefaultSocketWrapper();
serverListener.Listen();
var port = (serverListener.LocalEndPoint as IPEndPoint).Port;
var clientSocket = new DefaultSocketWrapper();
clientSocket.Connect(IPAddress.Loopback, port, null);
using (ISocketWrapper serverSocket = serverListener.Accept())
{
Stream outStream = serverSocket.OutputStream;
payloadWriter.Write(outStream, payload, commandPayload);
outStream.Flush();
}
// At this point, the server socket is closed.
Stream inStream = clientSocket.InputStream;
// Consume bytes already written to the socket.
var payloadProcessor = new PayloadProcessor(payloadWriter.Version);
Payload actualPayload = payloadProcessor.Process(inStream);
Assert.Equal(payload.SplitIndex, actualPayload.SplitIndex);
Assert.Equal(payload.Version, actualPayload.Version);
Assert.Equal(payload.TaskContext, actualPayload.TaskContext);
Assert.Equal(payload.SparkFilesDir, actualPayload.SparkFilesDir);
Assert.Equal(payload.IncludeItems, actualPayload.IncludeItems);
Assert.Equal(payload.BroadcastVariables.Count, actualPayload.BroadcastVariables.Count);
ValidateCommandPayload(commandPayload, actualPayload.Command);
// Another read will detect that the socket is closed.
Assert.Null(payloadProcessor.Process(inStream));
}
[Fact]
public void TestClosedStreamWithMemoryStream()
{
var inputStream = new MemoryStream();
// Version is not used in this scenario.
var processor = new PayloadProcessor(null);
// Nothing is written to the stream.
Assert.Null(processor.Process(inputStream));
inputStream.Dispose();
// The stream is closed, so a null payload is expected.
Assert.Null(processor.Process(inputStream));
}
private void ValidateCommandPayload(
CommandPayload expected,
Worker.CommandPayload actual)
{
Assert.Equal(expected.EvalType, actual.EvalType);
Assert.Equal(expected.Commands.Length, actual.Commands.Count());
for (int i = 0; i < expected.Commands.Length; ++i)
{
Command expectedCommand = expected.Commands[i];
var actualCommand = (SqlCommand)actual.Commands[i];
Assert.Equal(expectedCommand.ArgOffsets, actualCommand.ArgOffsets);
Assert.Equal(
expectedCommand.ChainedUdfs.Length,
actualCommand.NumChainedFunctions);
Assert.Equal(
expectedCommand.SerializerMode,
actualCommand.SerializerMode);
Assert.Equal(
expectedCommand.DeserializerMode,
actualCommand.DeserializerMode);
}
}
}
}


@ -0,0 +1,310 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Utils;
using static Microsoft.Spark.Utils.UdfUtils;
namespace Microsoft.Spark.Worker.UnitTest
{
/// <summary>
/// Command stores data necessary to create a payload for a single command,
/// which can have chained UDFs. Microsoft.Spark.Worker.Command cannot be
/// used here because it stores a WorkerFunction, which already abstracts
/// away the chained UDFs.
/// </summary>
internal sealed class Command
{
internal Delegate[] ChainedUdfs { get; set; }
internal int[] ArgOffsets { get; set; }
internal CommandSerDe.SerializedMode SerializerMode { get; set; }
internal CommandSerDe.SerializedMode DeserializerMode { get; set; }
}
/// <summary>
/// CommandPayload stores data necessary to create a payload for multiple commands.
/// </summary>
internal sealed class CommandPayload
{
internal PythonEvalType EvalType { get; set; }
internal Command[] Commands { get; set; }
}
///////////////////////////////////////////////////////////////////////////
// TaskContext writer for different Spark versions.
///////////////////////////////////////////////////////////////////////////
internal interface ITaskContextWriter
{
void Write(Stream stream, TaskContext taskContext);
}
/// <summary>
/// TaskContextWriter for version 2.3.*.
/// </summary>
internal sealed class TaskContextWriterV2_3_X : ITaskContextWriter
{
public void Write(Stream stream, TaskContext taskContext)
{
SerDe.Write(stream, taskContext.StageId);
SerDe.Write(stream, taskContext.PartitionId);
SerDe.Write(stream, taskContext.AttemptNumber);
SerDe.Write(stream, taskContext.AttemptId);
}
}
/// <summary>
/// TaskContextWriter for version 2.4.*.
/// </summary>
internal sealed class TaskContextWriterV2_4_X : ITaskContextWriter
{
public void Write(Stream stream, TaskContext taskContext)
{
SerDe.Write(stream, taskContext.IsBarrier);
SerDe.Write(stream, taskContext.Port);
SerDe.Write(stream, taskContext.Secret);
SerDe.Write(stream, taskContext.StageId);
SerDe.Write(stream, taskContext.PartitionId);
SerDe.Write(stream, taskContext.AttemptNumber);
SerDe.Write(stream, taskContext.AttemptId);
SerDe.Write(stream, taskContext.LocalProperties.Count);
foreach (KeyValuePair<string, string> kv in taskContext.LocalProperties)
{
SerDe.Write(stream, kv.Key);
SerDe.Write(stream, kv.Value);
}
}
}
///////////////////////////////////////////////////////////////////////////
// BroadcastVariable writer for different Spark versions.
///////////////////////////////////////////////////////////////////////////
internal interface IBroadcastVariableWriter
{
void Write(Stream stream, BroadcastVariables broadcastVars);
}
/// <summary>
/// BroadcastVariableWriter for version 2.3.0 and 2.3.1.
/// </summary>
internal sealed class BroadcastVariableWriterV2_3_0 : IBroadcastVariableWriter
{
public void Write(Stream stream, BroadcastVariables broadcastVars)
{
Debug.Assert(broadcastVars.Count == 0);
SerDe.Write(stream, broadcastVars.Count);
}
}
/// <summary>
/// BroadcastVariableWriter for version 2.3.2 and up.
/// </summary>
internal sealed class BroadcastVariableWriterV2_3_2 : IBroadcastVariableWriter
{
public void Write(Stream stream, BroadcastVariables broadcastVars)
{
SerDe.Write(stream, broadcastVars.DecryptionServerNeeded);
SerDe.Write(stream, broadcastVars.Count);
Debug.Assert(broadcastVars.Count == 0);
if (broadcastVars.DecryptionServerNeeded)
{
SerDe.Write(stream, broadcastVars.DecryptionServerPort);
SerDe.Write(stream, broadcastVars.Secret);
}
}
}
///////////////////////////////////////////////////////////////////////////
// Command writer for different Spark versions.
///////////////////////////////////////////////////////////////////////////
internal interface ICommandWriter
{
void Write(Stream stream, CommandPayload commandPayload);
}
/// <summary>
/// Provides functionality for writing Command[].
/// </summary>
internal abstract class CommandWriterBase
{
public void Write(Stream stream, Command[] commands)
{
SerDe.Write(stream, commands.Length);
foreach (Command command in commands)
{
SerDe.Write(stream, command.ArgOffsets.Length);
foreach (int argOffset in command.ArgOffsets)
{
SerDe.Write(stream, argOffset);
}
SerDe.Write(stream, command.ChainedUdfs.Length);
foreach (Delegate udf in command.ChainedUdfs)
{
byte[] serializedCommand = CommandSerDe.Serialize(
udf,
CommandSerDe.SerializedMode.Row,
CommandSerDe.SerializedMode.Row);
SerDe.Write(stream, serializedCommand.Length);
SerDe.Write(stream, serializedCommand);
}
}
}
}
/// <summary>
/// CommandWriter for version 2.3.*.
/// </summary>
internal sealed class CommandWriterV2_3_X : CommandWriterBase, ICommandWriter
{
public void Write(Stream stream, CommandPayload commandPayload)
{
SerDe.Write(stream, (int)commandPayload.EvalType);
Write(stream, commandPayload.Commands);
if ((commandPayload.EvalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF) ||
(commandPayload.EvalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF))
{
SerDe.Write(stream, "unused timezone");
}
}
}
/// <summary>
/// CommandWriter for version 2.4.*.
/// </summary>
internal sealed class CommandWriterV2_4_X : CommandWriterBase, ICommandWriter
{
public void Write(Stream stream, CommandPayload commandPayload)
{
if (commandPayload.EvalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF ||
commandPayload.EvalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF ||
commandPayload.EvalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF ||
commandPayload.EvalType == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF)
{
SerDe.Write(stream, 1);
for (int i = 0; i < 1; ++i)
{
SerDe.Write(stream, "unused key");
SerDe.Write(stream, "unused value");
}
}
SerDe.Write(stream, (int)commandPayload.EvalType);
Write(stream, commandPayload.Commands);
}
}
/// <summary>
/// Payload writer that supports different Spark versions.
/// </summary>
internal sealed class PayloadWriter
{
private readonly ITaskContextWriter _taskContextWriter;
private readonly IBroadcastVariableWriter _broadcastVariableWriter;
private readonly ICommandWriter _commandWriter;
internal PayloadWriter(
Version version,
ITaskContextWriter taskContextWriter,
IBroadcastVariableWriter broadcastVariableWriter,
ICommandWriter commandWriter)
{
Version = version;
_taskContextWriter = taskContextWriter;
_broadcastVariableWriter = broadcastVariableWriter;
_commandWriter = commandWriter;
}
internal Version Version { get; }
internal void Write(
Stream stream,
Payload payload,
CommandPayload commandPayload)
{
SerDe.Write(stream, payload.SplitIndex);
SerDe.Write(stream, payload.Version);
_taskContextWriter.Write(stream, payload.TaskContext);
SerDe.Write(stream, payload.SparkFilesDir);
Write(stream, payload.IncludeItems);
_broadcastVariableWriter.Write(stream, payload.BroadcastVariables);
_commandWriter.Write(stream, commandPayload);
}
private static void Write(Stream stream, IEnumerable<string> includeItems)
{
if (includeItems is null)
{
SerDe.Write(stream, 0);
return;
}
SerDe.Write(stream, includeItems.Count());
foreach (string includeItem in includeItems)
{
SerDe.Write(stream, includeItem);
}
}
}
/// <summary>
/// Factory class for creating a PayloadWriter given a version.
/// </summary>
internal sealed class PayloadWriterFactory
{
internal PayloadWriter Create(Version version = null)
{
if (version == null)
{
version = new Version(Versions.V2_4_0);
}
switch (version.ToString())
{
case Versions.V2_3_0:
case Versions.V2_3_1:
return new PayloadWriter(
version,
new TaskContextWriterV2_3_X(),
new BroadcastVariableWriterV2_3_0(),
new CommandWriterV2_3_X());
case Versions.V2_3_2:
case Versions.V2_3_3:
return new PayloadWriter(
version,
new TaskContextWriterV2_3_X(),
new BroadcastVariableWriterV2_3_2(),
new CommandWriterV2_3_X());
case Versions.V2_4_0:
return new PayloadWriter(
version,
new TaskContextWriterV2_4_X(),
new BroadcastVariableWriterV2_3_2(),
new CommandWriterV2_4_X());
default:
throw new NotSupportedException($"Spark {version} is not supported.");
}
}
}
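/// <summary>
/// A minimal sketch (illustrative only; it simply composes the helpers above):
/// serialize a payload and its command payload for a given Spark version into
/// an in-memory buffer, the same way the processor tests feed PayloadProcessor.
/// </summary>
internal static class PayloadWriterSketch
{
    internal static byte[] ToBytes(
        string version, Payload payload, CommandPayload commandPayload)
    {
        // The factory picks the version-specific TaskContext, broadcast-variable,
        // and command writers.
        PayloadWriter writer = new PayloadWriterFactory().Create(new Version(version));
        using (var stream = new MemoryStream())
        {
            writer.Write(stream, payload, commandPayload);
            return stream.ToArray();
        }
    }
}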
}


@ -0,0 +1,119 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.Collections;
using System.Collections.Generic;
using System.Net;
using System.Threading.Tasks;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Network;
using Razorvine.Pickle;
using Xunit;
namespace Microsoft.Spark.Worker.UnitTest
{
public class TaskRunnerTests
{
[Fact]
public void TestTaskRunner()
{
using (var serverListener = new DefaultSocketWrapper())
{
serverListener.Listen();
var port = (serverListener.LocalEndPoint as IPEndPoint).Port;
var clientSocket = new DefaultSocketWrapper();
clientSocket.Connect(IPAddress.Loopback, port, null);
PayloadWriter payloadWriter = new PayloadWriterFactory().Create();
var taskRunner = new TaskRunner(0, clientSocket, false, payloadWriter.Version);
var clientTask = Task.Run(() => taskRunner.Run());
using (ISocketWrapper serverSocket = serverListener.Accept())
{
System.IO.Stream inputStream = serverSocket.InputStream;
System.IO.Stream outputStream = serverSocket.OutputStream;
Payload payload = TestData.GetDefaultPayload();
CommandPayload commandPayload = TestData.GetDefaultCommandPayload();
payloadWriter.Write(outputStream, payload, commandPayload);
// Write 10 rows to the output stream.
var pickler = new Pickler();
for (int i = 0; i < 10; ++i)
{
var pickled = pickler.dumps(
new[] { new object[] { i.ToString(), i, i } });
SerDe.Write(outputStream, pickled.Length);
SerDe.Write(outputStream, pickled);
}
// Signal the end of data and stream.
SerDe.Write(outputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
SerDe.Write(outputStream, (int)SpecialLengths.END_OF_STREAM);
outputStream.Flush();
// Now process the bytes flowing in from the client.
var timingDataReceived = false;
var exceptionThrown = false;
var rowsReceived = new List<object[]>();
while (true)
{
var length = SerDe.ReadInt32(inputStream);
if (length > 0)
{
var pickledBytes = SerDe.ReadBytes(inputStream, length);
var unpickler = new Unpickler();
var rows = unpickler.loads(pickledBytes) as ArrayList;
foreach (object row in rows)
{
rowsReceived.Add((object[])row);
}
}
else if (length == (int)SpecialLengths.TIMING_DATA)
{
var bootTime = SerDe.ReadInt64(inputStream);
var initTime = SerDe.ReadInt64(inputStream);
var finishTime = SerDe.ReadInt64(inputStream);
var memoryBytesSpilled = SerDe.ReadInt64(inputStream);
var diskBytesSpilled = SerDe.ReadInt64(inputStream);
timingDataReceived = true;
}
else if (length == (int)SpecialLengths.PYTHON_EXCEPTION_THROWN)
{
SerDe.ReadString(inputStream);
exceptionThrown = true;
break;
}
else if (length == (int)SpecialLengths.END_OF_DATA_SECTION)
{
var numAccumulatorUpdates = SerDe.ReadInt32(inputStream);
SerDe.ReadInt32(inputStream);
break;
}
}
Assert.True(timingDataReceived);
Assert.False(exceptionThrown);
// Validate rows received.
Assert.Equal(10, rowsReceived.Count);
for (int i = 0; i < 10; ++i)
{
// Two UDFs were registered, so two columns are expected.
// Refer to TestData.GetDefaultCommandPayload().
var row = rowsReceived[i];
Assert.Equal(2, rowsReceived[i].Length);
Assert.Equal($"udf2 udf1 {i}", row[0]);
Assert.Equal(i + i, row[1]);
}
}
Assert.True(clientTask.Wait(5000));
}
}
}
}


@ -0,0 +1,78 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using Microsoft.Spark.Sql;
using Microsoft.Spark.Utils;
namespace Microsoft.Spark.Worker.UnitTest
{
/// <summary>
/// TestData provides helper functions to create default test data.
/// </summary>
internal static class TestData
{
internal static Payload GetDefaultPayload()
{
var taskContext = new TaskContext()
{
StageId = 1,
PartitionId = 2,
AttemptNumber = 1,
AttemptId = 100L,
Port = 9999,
Secret = "secret"
};
var broadcastVars = new BroadcastVariables()
{
DecryptionServerNeeded = true,
DecryptionServerPort = 9999,
Secret = "secret"
};
return new Payload()
{
SplitIndex = 10,
Version = "1.0",
TaskContext = taskContext,
SparkFilesDir = "directory",
IncludeItems = new[] { "file1", "file2" },
BroadcastVariables = broadcastVars
};
}
internal static CommandPayload GetDefaultCommandPayload()
{
var udfWrapper1 = new PicklingUdfWrapper<string, string>((str) => $"udf1 {str}");
var udfWrapper2 = new PicklingUdfWrapper<string, string>((str) => $"udf2 {str}");
var udfWrapper3 = new PicklingUdfWrapper<int, int, int>((arg1, arg2) => arg1 + arg2);
var command1 = new Command()
{
ChainedUdfs = new PicklingWorkerFunction.ExecuteDelegate[]
{
udfWrapper1.Execute,
udfWrapper2.Execute
},
ArgOffsets = new[] { 0 },
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
var command2 = new Command()
{
ChainedUdfs = new PicklingWorkerFunction.ExecuteDelegate[] { udfWrapper3.Execute },
ArgOffsets = new[] { 1, 2 },
SerializerMode = CommandSerDe.SerializedMode.Row,
DeserializerMode = CommandSerDe.SerializedMode.Row
};
return new CommandPayload()
{
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
Commands = new[] { command1, command2 }
};
}
}
}


@ -0,0 +1,65 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System.IO;
using System.Linq;
namespace Microsoft.Spark.Worker.Command
{
/// <summary>
/// CommandExecutorStat stores statistics captured while executing a command payload.
/// </summary>
internal sealed class CommandExecutorStat
{
/// <summary>
/// Number of non-null entries received/processed.
/// </summary>
internal int NumEntriesProcessed { get; set; }
}
/// <summary>
/// CommandExecutor reads input data from the input stream,
/// runs commands on it, and writes the results to the output stream.
/// </summary>
internal sealed class CommandExecutor
{
/// <summary>
/// Executes the commands on the input data read from input stream
/// and writes results to the output stream.
/// </summary>
/// <param name="inputStream">Input stream to read data from</param>
/// <param name="outputStream">Output stream to write results to</param>
/// <param name="splitIndex">Split index for this task</param>
/// <param name="commandPayload">Contains the commands to execute</param>
/// <returns>Statistics captured during the Execute() run</returns>
internal CommandExecutorStat Execute(
Stream inputStream,
Stream outputStream,
int splitIndex,
CommandPayload commandPayload)
{
if (commandPayload.EvalType == Spark.Utils.UdfUtils.PythonEvalType.NON_UDF)
{
if (commandPayload.Commands.Length != 1)
{
throw new System.Exception(
"Invalid number of commands for RDD: {commandPayload.Commands.Length}");
}
return new RDDCommandExecutor().Execute(
inputStream,
outputStream,
splitIndex,
(RDDCommand)commandPayload.Commands[0]);
}
return SqlCommandExecutor.Execute(
inputStream,
outputStream,
commandPayload.EvalType,
commandPayload.Commands.Cast<SqlCommand>().ToArray());
}
}
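/// <summary>
/// A minimal sketch (illustrative only; it mirrors how the worker unit tests
/// drive CommandExecutor): run a command payload over in-memory streams and
/// return the captured statistics.
/// </summary>
internal static class CommandExecutorSketch
{
    internal static CommandExecutorStat Run(CommandPayload commandPayload, byte[] inputBytes)
    {
        using (var inputStream = new MemoryStream(inputBytes))
        using (var outputStream = new MemoryStream())
        {
            // The split index is forwarded only to the RDD path; the SQL path ignores it.
            return new CommandExecutor().Execute(inputStream, outputStream, 0, commandPayload);
        }
    }
}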
}


@ -0,0 +1,128 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Collections.Generic;
using System.IO;
using System.Runtime.Serialization.Formatters.Binary;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Utils;
namespace Microsoft.Spark.Worker.Command
{
/// <summary>
/// RDDCommandExecutor reads input data from the input stream,
/// runs the command on it, and writes the results to the output stream.
/// </summary>
internal class RDDCommandExecutor
{
[ThreadStatic]
private static MemoryStream s_writeOutputStream;
[ThreadStatic]
private static BinaryFormatter s_binaryFormatter;
/// <summary>
/// Executes the commands on the input data read from input stream
/// and writes results to the output stream.
/// </summary>
/// <param name="inputStream">Input stream to read data from</param>
/// <param name="outputStream">Output stream to write results to</param>
/// <param name="splitIndex">Split index for this task</param>
/// <param name="command">Contains the commands to execute</param>
/// <returns>Statistics captured during the Execute() run</returns>
internal CommandExecutorStat Execute(
Stream inputStream,
Stream outputStream,
int splitIndex,
RDDCommand command)
{
var stat = new CommandExecutorStat();
CommandSerDe.SerializedMode serializerMode = command.SerializerMode;
CommandSerDe.SerializedMode deserializerMode = command.DeserializerMode;
RDD.WorkerFunction.ExecuteDelegate func = command.WorkerFunction.Func;
foreach (object output in func(
splitIndex,
GetInputIterator(inputStream, deserializerMode)))
{
WriteOutput(outputStream, serializerMode, output);
++stat.NumEntriesProcessed;
}
return stat;
}
/// <summary>
/// Create input iterator from the given input stream.
/// </summary>
/// <param name="inputStream">Stream to read from</param>
/// <param name="deserializerMode">Mode for deserialization</param>
/// <returns>An enumerable over the deserialized input objects</returns>
private IEnumerable<object> GetInputIterator(
Stream inputStream,
CommandSerDe.SerializedMode deserializerMode)
{
RDD.Collector.IDeserializer deserializer =
RDD.Collector.GetDeserializer(deserializerMode);
var messageLength = 0;
while ((messageLength = SerDe.ReadInt32(inputStream)) !=
(int)SpecialLengths.END_OF_DATA_SECTION)
{
if ((messageLength > 0) || (messageLength == (int)SpecialLengths.NULL))
{
yield return deserializer.Deserialize(inputStream, messageLength);
}
}
}
/// <summary>
/// Writes the given message to the stream.
/// </summary>
/// <param name="stream">Stream to write to</param>
/// <param name="serializerMode">Mode for serialization</param>
/// <param name="message">Message to write to</param>
private void WriteOutput(
Stream stream,
CommandSerDe.SerializedMode serializerMode,
object message)
{
MemoryStream writeOutputStream = s_writeOutputStream ??
(s_writeOutputStream = new MemoryStream());
writeOutputStream.Position = 0;
Serialize(serializerMode, message, writeOutputStream);
SerDe.Write(stream, (int)writeOutputStream.Position);
SerDe.Write(stream, writeOutputStream.GetBuffer(), (int)writeOutputStream.Position);
}
/// <summary>
/// Serialize a row based on the given serializer mode.
/// </summary>
/// <param name="serializerMode"></param>
/// <param name="message"></param>
/// <param name="stream"></param>
private void Serialize(
CommandSerDe.SerializedMode serializerMode,
object message,
MemoryStream stream)
{
switch (serializerMode)
{
case CommandSerDe.SerializedMode.Byte:
BinaryFormatter formatter = s_binaryFormatter ??
(s_binaryFormatter = new BinaryFormatter());
formatter.Serialize(stream, message);
break;
case CommandSerDe.SerializedMode.None:
case CommandSerDe.SerializedMode.String:
case CommandSerDe.SerializedMode.Pair:
default:
throw new NotImplementedException(
$"Unsupported serializerMode: {serializerMode}");
}
}
}
}


@ -0,0 +1,532 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.
using System;
using System.Buffers;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using Apache.Arrow;
using Apache.Arrow.Ipc;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.IO;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Utils;
using Razorvine.Pickle;
namespace Microsoft.Spark.Worker.Command
{
/// <summary>
/// SqlCommandExecutor reads input data from the input stream,
/// runs commands on it, and writes the results to the output stream.
/// </summary>
internal abstract class SqlCommandExecutor
{
/// <summary>
/// Executes the commands on the input data read from input stream
/// and writes results to the output stream.
/// </summary>
/// <param name="inputStream">Input stream to read data from</param>
/// <param name="outputStream">Output stream to write results to</param>
/// <param name="evalType">Evaluation type for the current commands</param>
/// <param name="commands">Contains the commands to execute</param>
/// <returns>Statistics captured during the Execute() run</returns>
internal static CommandExecutorStat Execute(
Stream inputStream,
Stream outputStream,
UdfUtils.PythonEvalType evalType,
SqlCommand[] commands)
{
if (commands.Length <= 0)
{
throw new ArgumentException("Commands cannot be empty.");
}
if (commands.Any(c =>
(c.SerializerMode != CommandSerDe.SerializedMode.Row) ||
(c.DeserializerMode != CommandSerDe.SerializedMode.Row)))
{
throw new ArgumentException("Unexpected serialization mode found.");
}
SqlCommandExecutor executor;
if (evalType == UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF)
{
executor = new ArrowSqlCommandExecutor();
}
else if (evalType == UdfUtils.PythonEvalType.SQL_BATCHED_UDF)
{
executor = new PicklingSqlCommandExecutor();
}
else
{
throw new NotSupportedException($"{evalType} is not supported.");
}
return executor.ExecuteCore(inputStream, outputStream, commands);
}
protected abstract CommandExecutorStat ExecuteCore(
Stream inputStream,
Stream outputStream,
SqlCommand[] commands);
}
/// <summary>
/// A SqlCommandExecutor that reads and writes using the
/// Python pickling format.
/// </summary>
internal class PicklingSqlCommandExecutor : SqlCommandExecutor
{
[ThreadStatic]
private static MemoryStream s_writeOutputStream;
[ThreadStatic]
private static MaxLengthReadStream s_slicedReadStream;
[ThreadStatic]
private static Pickler s_pickler;
protected override CommandExecutorStat ExecuteCore(
Stream inputStream,
Stream outputStream,
SqlCommand[] commands)
{
var stat = new CommandExecutorStat();
ICommandRunner commandRunner = CreateCommandRunner(commands);
// On the Spark side, each object in the following List<> is considered a row.
// See the ICommandRunner comments below for the expected types of a row.
var outputRows = new List<object>();
// If the input is empty (no rows) or all rows have been read, then
// SpecialLengths.END_OF_DATA_SECTION is sent as the messageLength.
// For example, no rows:
// +---+----+
// |age|name|
// +---+----+
// +---+----+
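// Writer-side sketch of the framing this loop consumes, assuming the same
// int32-length-prefixed protocol used by SerDe (payload is a placeholder):
//
//     SerDe.Write(stream, payload.Length);            // > 0: a pickled batch follows
//     SerDe.Write(stream, payload, payload.Length);   // pickled row data
//     ...repeat per batch...
//     SerDe.Write(stream, (int)SpecialLengths.END_OF_DATA_SECTION);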
int messageLength = 0;
while ((messageLength = SerDe.ReadInt32(inputStream)) !=
(int)SpecialLengths.END_OF_DATA_SECTION)
{
if ((messageLength > 0) || (messageLength == (int)SpecialLengths.NULL))
{
if (messageLength <= 0)
{
throw new InvalidDataException(
$"Invalid message length: {messageLength}");
}
MaxLengthReadStream readStream = s_slicedReadStream ??
(s_slicedReadStream = new MaxLengthReadStream());
readStream.Reset(inputStream, messageLength);
// Each row in inputRows is of type object[]. If a null is present in a row,
// then the corresponding element of that row's object[] will be null.
// For example, (inputRows.Length == 2) and (inputRows[0][0] == null)
// +----+
// | age|
// +----+
// |null|
// | 11|
// +----+
object[] inputRows = PythonSerDe.GetUnpickledObjects(readStream);
for (int i = 0; i < inputRows.Length; ++i)
{
// Split id is not used for SQL UDFs, so 0 is passed.
outputRows.Add(commandRunner.Run(0, inputRows[i]));
}
WriteOutput(outputStream, outputRows);
stat.NumEntriesProcessed += inputRows.Length;
outputRows.Clear();
}
}
return stat;
}
/// <summary>
/// Pickles the given rows and writes them to the stream.
/// </summary>
/// <param name="stream">Stream to write to</param>
/// <param name="rows">Rows to write</param>
private void WriteOutput(Stream stream, IEnumerable<object> rows)
{
MemoryStream writeOutputStream = s_writeOutputStream ??
(s_writeOutputStream = new MemoryStream());
writeOutputStream.Position = 0;
Pickler pickler = s_pickler ?? (s_pickler = new Pickler(false));
pickler.dump(rows, writeOutputStream);
if (writeOutputStream.Position == 0)
{
throw new Exception("Pickled output buffer is empty.");
}
SerDe.Write(stream, (int)writeOutputStream.Position);
SerDe.Write(stream, writeOutputStream.GetBuffer(), (int)writeOutputStream.Position);
}
/// <summary>
/// Creates an ICommandRunner instance based on the given commands.
/// </summary>
/// <param name="commands">Commands used for creating a command runner</param>
/// <returns>An ICommandRunner instance</returns>
private static ICommandRunner CreateCommandRunner(SqlCommand[] commands)
{
return (commands.Length == 1) ?
(ICommandRunner)new SingleCommandRunner(commands[0]) :
new MultiCommandRunner(commands);
}
/// <summary>
/// Interface for running commands.
/// On the Spark side, the following is expected for the Pickling to work:
/// If there is a single command (one UDF), the computed value is returned
/// as an object (one element). If there are multiple commands (multiple UDF scenario),
/// the computed value should be an array (not IEnumerable) where each element
/// in the array corresponds to the value returned by a command.
/// Refer to EvaluatePython.scala for the StructType case.
/// </summary>
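/// <example>
/// Sketch of the expected return shapes (placeholder names):
/// <code>
/// // single command:    return udfResult;
/// // multiple commands: return new object[] { udf1Result, udf2Result };
/// </code>
/// </example>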
private interface ICommandRunner
{
/// <summary>
/// Runs commands based on the given split id and input.
/// </summary>
/// <param name="splitId">Split id for the commands to run</param>
/// <param name="input">Input data for the commands to run</param>
/// <returns>Value returned by running the commands</returns>
object Run(int splitId, object input);
}
/// <summary>
/// SingleCommandRunner handles running a single command.
/// </summary>
private sealed class SingleCommandRunner : ICommandRunner
{
/// <summary>
/// A command to run.
/// </summary>
private readonly SqlCommand _command;
/// <summary>
/// Constructor.
/// </summary>
/// <param name="command">A command to run</param>
internal SingleCommandRunner(SqlCommand command)
{
_command = command;
}
/// <summary>
/// Runs a single command.
/// </summary>
/// <param name="splitId">Split id for the command to run</param>
/// <param name="input">Input data for the command to run</param>
/// <returns>Value returned by running the command</returns>
public object Run(int splitId, object input)
{
return ((PicklingWorkerFunction)_command.WorkerFunction).Func(
splitId,
(object[])input,
_command.ArgOffsets);
}
}
/// <summary>
/// MultiCommandRunner handles running multiple commands.
/// </summary>
private sealed class MultiCommandRunner : ICommandRunner
{
/// <summary>
/// Commands to run.
/// </summary>
private readonly SqlCommand[] _commands;
/// <summary>
/// Constructor.
/// </summary>
/// <param name="commands">Multiple commands top run</param>
internal MultiCommandRunner(SqlCommand[] commands)
{
_commands = commands;
}
/// <summary>
/// Runs multiple commands.
/// </summary>
/// <param name="splitId">Split id for the commands to run</param>
/// <param name="input">Input data for the commands to run</param>
/// <returns>An array of values returned by running the commands</returns>
public object Run(int splitId, object input)
{
var row = new object[_commands.Length];
for (int i = 0; i < _commands.Length; ++i)
{
SqlCommand command = _commands[i];
row[i] = ((PicklingWorkerFunction)command.WorkerFunction).Func(
splitId,
(object[])input,
command.ArgOffsets);
}
return row;
}
}
}
/// <summary>
/// A SqlCommandExecutor that reads and writes using the
/// Apache Arrow format.
/// </summary>
internal class ArrowSqlCommandExecutor : SqlCommandExecutor
{
[ThreadStatic]
private static MemoryStream s_writeOutputStream;
protected override CommandExecutorStat ExecuteCore(
Stream inputStream,
Stream outputStream,
SqlCommand[] commands)
{
var stat = new CommandExecutorStat();
ICommandRunner commandRunner = CreateCommandRunner(commands);
SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);
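// Overall output layout produced below (sketch):
//     START_ARROW_STREAM marker (written above)
//     Arrow IPC stream: schema followed by one record batch per input batch
//     trailing int32 0 as the end-of-stream marker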
// TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
// For now, we write to a temporary seekable MemoryStream which we then copy to
// the actual destination stream.
MemoryStream tmp = s_writeOutputStream ?? (s_writeOutputStream = new MemoryStream());
ArrowStreamWriter writer = null;
Schema resultSchema = null;
foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
{
// Split id is currently not used, so 0 is passed.
IArrowArray[] results = commandRunner.Run(0, input);
// Assumes all columns have the same length, so the 0th column's length is used as the number of entries.
int numEntries = results[0].Length;
stat.NumEntriesProcessed += numEntries;
tmp.SetLength(0);
if (writer == null)
{
Debug.Assert(resultSchema == null);
resultSchema = BuildSchema(results);
writer = new ArrowStreamWriter(tmp, resultSchema, leaveOpen: true);
}
var recordBatch = new RecordBatch(resultSchema, results, numEntries);
// TODO: Remove sync-over-async once WriteRecordBatch exists.
writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
tmp.Position = 0;
tmp.CopyTo(outputStream);
outputStream.Flush();
}
SerDe.Write(outputStream, 0);
if (writer != null)
{
writer.Dispose();
}
return stat;
}
/// <summary>
/// Creates an input iterator over the record batches read from the given input stream.
/// </summary>
/// <param name="inputStream">Stream to read from</param>
/// <returns>Columns of each record batch, exposed as pooled arrays</returns>
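/// <remarks>
/// The backing array is rented from ArrayPool and reused for every yielded
/// batch, and it is returned to the pool when iteration completes, so callers
/// should finish with each ReadOnlyMemory before advancing the iterator.
/// </remarks>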
private IEnumerable<ReadOnlyMemory<IArrowArray>> GetInputIterator(Stream inputStream)
{
IArrowArray[] arrays = null;
int columnCount = 0;
try
{
using (var reader = new ArrowStreamReader(inputStream, leaveOpen: true))
{
RecordBatch batch;
while ((batch = reader.ReadNextRecordBatch()) != null)
{
columnCount = batch.ColumnCount;
if (arrays == null)
{
// Note that every batch in a stream has the same schema.
arrays = ArrayPool<IArrowArray>.Shared.Rent(columnCount);
}
for (int i = 0; i < columnCount; ++i)
{
arrays[i] = batch.Column(i);
}
yield return new ReadOnlyMemory<IArrowArray>(arrays, 0, columnCount);
}
if (arrays == null)
{
// When no input batches were received, yield a single set of null columns
// so that the result schema can still be created and written back.
columnCount = reader.Schema.Fields.Count;
arrays = ArrayPool<IArrowArray>.Shared.Rent(columnCount);
for (int i = 0; i < columnCount; ++i)
{
arrays[i] = null;
}
yield return new ReadOnlyMemory<IArrowArray>(arrays, 0, columnCount);
}
}
}
finally
{
if (arrays != null)
{
arrays.AsSpan(0, columnCount).Clear();
ArrayPool<IArrowArray>.Shared.Return(arrays);
}
}
}
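/// <summary>
/// Builds the result schema from the given result columns. A single column is
/// named "Result"; multiple columns are named "Result0", "Result1", and so on.
/// All columns are marked as non-nullable.
/// </summary>
/// <param name="resultColumns">Result columns to build the schema from</param>
/// <returns>Schema describing the result columns</returns>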
private static Schema BuildSchema(IArrowArray[] resultColumns)
{
var schemaBuilder = new Schema.Builder();
if (resultColumns.Length == 1)
{
schemaBuilder = schemaBuilder
.Field(f => f.Name("Result")
.DataType(resultColumns[0].Data.DataType)
.Nullable(false));
}
else
{
for (int i = 0; i < resultColumns.Length; ++i)
{
schemaBuilder = schemaBuilder
.Field(f => f.Name("Result" + i)
.DataType(resultColumns[i].Data.DataType)
.Nullable(false));
}
}
return schemaBuilder.Build();
}
/// <summary>
/// Creates an ICommandRunner instance based on the given commands.
/// </summary>
/// <param name="commands">Commands used for creating a command runner</param>
/// <returns>An ICommandRunner instance</returns>
private static ICommandRunner CreateCommandRunner(SqlCommand[] commands)
{
return (commands.Length == 1) ?
(ICommandRunner)new SingleCommandRunner(commands[0]) :
new MultiCommandRunner(commands);
}
/// <summary>
/// Interface for running commands.
/// On the Spark side, the following is expected for the Arrow serialization to work:
/// each command (one UDF) produces one result column, and the runner returns an
/// array with one IArrowArray per command, in command order, for both the single
/// command and the multiple commands (multiple UDF scenario) cases.
/// </summary>
private interface ICommandRunner
{
/// <summary>
/// Runs commands based on the given split id and input.
/// </summary>
/// <param name="splitId">Split id for the commands to run</param>
/// <param name="input">Input data for the commands to run</param>
/// <returns>Value returned by running the commands</returns>
IArrowArray[] Run(int splitId, ReadOnlyMemory<IArrowArray> input);
}
/// <summary>
/// SingleCommandRunner handles running a single command.
/// </summary>
private sealed class SingleCommandRunner : ICommandRunner
{
/// <summary>
/// A command to run.
/// </summary>
private readonly SqlCommand _command;
/// <summary>
/// Constructor.
/// </summary>
/// <param name="command">A command to run</param>
internal SingleCommandRunner(SqlCommand command)
{
_command = command;
}
/// <summary>
/// Runs a single command.
/// </summary>
/// <param name="splitId">Split id for the command to run</param>
/// <param name="input">Input data for the command to run</param>
/// <returns>Value returned by running the command</returns>
public IArrowArray[] Run(int splitId, ReadOnlyMemory<IArrowArray> input)
{
return new[] { ((ArrowWorkerFunction)_command.WorkerFunction).Func(
splitId,
input,
_command.ArgOffsets) };
}
}
/// <summary>
/// MultiCommandRunner handles running multiple commands.
/// </summary>
private sealed class MultiCommandRunner : ICommandRunner
{
/// <summary>
/// Commands to run.
/// </summary>
private readonly SqlCommand[] _commands;
/// <summary>
/// Constructor.
/// </summary>
/// <param name="commands">Multiple commands top run</param>
internal MultiCommandRunner(SqlCommand[] commands)
{
_commands = commands;
}
/// <summary>
/// Runs multiple commands.
/// </summary>
/// <param name="splitId">Split id for the commands to run</param>
/// <param name="input">Input data for the commands to run</param>
/// <returns>An array of values returned by running the commands</returns>
public IArrowArray[] Run(int splitId, ReadOnlyMemory<IArrowArray> input)
{
var resultColumns = new IArrowArray[_commands.Length];
for (int i = 0; i < resultColumns.Length; ++i)
{
SqlCommand command = _commands[i];
resultColumns[i] = ((ArrowWorkerFunction)command.WorkerFunction).Func(
splitId,
input,
command.ArgOffsets);
}
return resultColumns;
}
}
}
}

Some files were not shown because too many files changed in this diff.