зеркало из https://github.com/dotnet/spark.git
Initial commit
_ _ _____ _____ __ ____ _ | \ | | ____|_ _| / _| ___ _ __ / ___| _ __ __ _ _ __| | __ | \| | _| | | | |_ / _ \| '__| \___ \| '_ \ / _` | '__| |/ / _| |\ | |___ | | | _| (_) | | ___) | |_) | (_| | | | < (_)_| \_|_____| |_| |_| \___/|_| |____/| .__/ \__,_|_| |_|\_\ |_|
This commit is contained in:
Коммит
23ed6cddfb
|
@ -0,0 +1,158 @@
|
|||
# editorconfig.org
|
||||
|
||||
# top-most EditorConfig file
|
||||
root = true
|
||||
|
||||
# Default settings:
|
||||
# A newline ending every file
|
||||
# Use 4 spaces as indentation
|
||||
[*]
|
||||
insert_final_newline = true
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
|
||||
[project.json]
|
||||
indent_size = 2
|
||||
|
||||
# C# files
|
||||
[*.cs]
|
||||
# New line preferences
|
||||
csharp_new_line_before_open_brace = all
|
||||
csharp_new_line_before_else = true
|
||||
csharp_new_line_before_catch = true
|
||||
csharp_new_line_before_finally = true
|
||||
csharp_new_line_before_members_in_object_initializers = true
|
||||
csharp_new_line_before_members_in_anonymous_types = true
|
||||
csharp_new_line_between_query_expression_clauses = true
|
||||
|
||||
# Indentation preferences
|
||||
csharp_indent_block_contents = true
|
||||
csharp_indent_braces = false
|
||||
csharp_indent_case_contents = true
|
||||
csharp_indent_switch_labels = true
|
||||
csharp_indent_labels = one_less_than_current
|
||||
|
||||
# avoid this. unless absolutely necessary
|
||||
dotnet_style_qualification_for_field = false:suggestion
|
||||
dotnet_style_qualification_for_property = false:suggestion
|
||||
dotnet_style_qualification_for_method = false:suggestion
|
||||
dotnet_style_qualification_for_event = false:suggestion
|
||||
|
||||
# only use var when it's obvious what the variable type is
|
||||
csharp_style_var_for_built_in_types = false:none
|
||||
csharp_style_var_when_type_is_apparent = false:none
|
||||
csharp_style_var_elsewhere = false:suggestion
|
||||
|
||||
# use language keywords instead of BCL types
|
||||
dotnet_style_predefined_type_for_locals_parameters_members = true:suggestion
|
||||
dotnet_style_predefined_type_for_member_access = true:suggestion
|
||||
|
||||
# name all constant fields using PascalCase
|
||||
dotnet_naming_rule.constant_fields_should_be_pascal_case.severity = suggestion
|
||||
dotnet_naming_rule.constant_fields_should_be_pascal_case.symbols = constant_fields
|
||||
dotnet_naming_rule.constant_fields_should_be_pascal_case.style = pascal_case_style
|
||||
|
||||
dotnet_naming_symbols.constant_fields.applicable_kinds = field
|
||||
dotnet_naming_symbols.constant_fields.required_modifiers = const
|
||||
|
||||
dotnet_naming_style.pascal_case_style.capitalization = pascal_case
|
||||
|
||||
# static fields should have s_ prefix
|
||||
dotnet_naming_rule.static_fields_should_have_prefix.severity = suggestion
|
||||
dotnet_naming_rule.static_fields_should_have_prefix.symbols = static_fields
|
||||
dotnet_naming_rule.static_fields_should_have_prefix.style = static_prefix_style
|
||||
|
||||
dotnet_naming_symbols.static_fields.applicable_kinds = field
|
||||
dotnet_naming_symbols.static_fields.required_modifiers = static
|
||||
|
||||
dotnet_naming_style.static_prefix_style.required_prefix = s_
|
||||
dotnet_naming_style.static_prefix_style.capitalization = camel_case
|
||||
|
||||
# internal and private fields should be _camelCase
|
||||
dotnet_naming_rule.camel_case_for_private_internal_fields.severity = suggestion
|
||||
dotnet_naming_rule.camel_case_for_private_internal_fields.symbols = private_internal_fields
|
||||
dotnet_naming_rule.camel_case_for_private_internal_fields.style = camel_case_underscore_style
|
||||
|
||||
dotnet_naming_symbols.private_internal_fields.applicable_kinds = field
|
||||
dotnet_naming_symbols.private_internal_fields.applicable_accessibilities = private, internal
|
||||
|
||||
dotnet_naming_style.camel_case_underscore_style.required_prefix = _
|
||||
dotnet_naming_style.camel_case_underscore_style.capitalization = camel_case
|
||||
|
||||
# Code style defaults
|
||||
dotnet_sort_system_directives_first = true
|
||||
csharp_preserve_single_line_blocks = true
|
||||
csharp_preserve_single_line_statements = false
|
||||
|
||||
# Expression-level preferences
|
||||
dotnet_style_object_initializer = true:suggestion
|
||||
dotnet_style_collection_initializer = true:suggestion
|
||||
dotnet_style_explicit_tuple_names = true:suggestion
|
||||
dotnet_style_coalesce_expression = true:suggestion
|
||||
dotnet_style_null_propagation = true:suggestion
|
||||
|
||||
# Expression-bodied members
|
||||
csharp_style_expression_bodied_methods = false:none
|
||||
csharp_style_expression_bodied_constructors = false:none
|
||||
csharp_style_expression_bodied_operators = false:none
|
||||
csharp_style_expression_bodied_properties = true:none
|
||||
csharp_style_expression_bodied_indexers = true:none
|
||||
csharp_style_expression_bodied_accessors = true:none
|
||||
|
||||
# Pattern matching
|
||||
csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
|
||||
csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
|
||||
csharp_style_inlined_variable_declaration = true:suggestion
|
||||
|
||||
# Null checking preferences
|
||||
csharp_style_throw_expression = true:suggestion
|
||||
csharp_style_conditional_delegate_call = true:suggestion
|
||||
|
||||
# Space preferences
|
||||
csharp_space_after_cast = false
|
||||
csharp_space_after_colon_in_inheritance_clause = true
|
||||
csharp_space_after_comma = true
|
||||
csharp_space_after_dot = false
|
||||
csharp_space_after_keywords_in_control_flow_statements = true
|
||||
csharp_space_after_semicolon_in_for_statement = true
|
||||
csharp_space_around_binary_operators = before_and_after
|
||||
csharp_space_around_declaration_statements = do_not_ignore
|
||||
csharp_space_before_colon_in_inheritance_clause = true
|
||||
csharp_space_before_comma = false
|
||||
csharp_space_before_dot = false
|
||||
csharp_space_before_open_square_brackets = false
|
||||
csharp_space_before_semicolon_in_for_statement = false
|
||||
csharp_space_between_empty_square_brackets = false
|
||||
csharp_space_between_method_call_empty_parameter_list_parentheses = false
|
||||
csharp_space_between_method_call_name_and_opening_parenthesis = false
|
||||
csharp_space_between_method_call_parameter_list_parentheses = false
|
||||
csharp_space_between_method_declaration_empty_parameter_list_parentheses = false
|
||||
csharp_space_between_method_declaration_name_and_open_parenthesis = false
|
||||
csharp_space_between_method_declaration_parameter_list_parentheses = false
|
||||
csharp_space_between_parentheses = false
|
||||
csharp_space_between_square_brackets = false
|
||||
|
||||
# Blocks are allowed
|
||||
csharp_prefer_braces = true:silent
|
||||
|
||||
# Xml project files
|
||||
[*.{csproj,vcxproj,vcxproj.filters,proj,nativeproj,locproj}]
|
||||
indent_size = 2
|
||||
|
||||
# Xml build files
|
||||
[*.builds]
|
||||
indent_size = 2
|
||||
|
||||
# Xml files
|
||||
[*.{xml,stylecop,resx,ruleset}]
|
||||
indent_size = 2
|
||||
|
||||
# Xml config files
|
||||
[*.{props,targets,config,nuspec}]
|
||||
indent_size = 2
|
||||
|
||||
# Shell scripts
|
||||
[*.sh]
|
||||
end_of_line = lf
|
||||
[*.{cmd, bat}]
|
||||
end_of_line = crlf
|
|
@ -0,0 +1,366 @@
|
|||
## Ignore Visual Studio temporary files, build results, and
|
||||
## files generated by popular Visual Studio add-ons.
|
||||
##
|
||||
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
|
||||
|
||||
# User-specific files
|
||||
*.rsuser
|
||||
*.suo
|
||||
*.user
|
||||
*.userosscache
|
||||
*.sln.docstates
|
||||
|
||||
# User-specific files (MonoDevelop/Xamarin Studio)
|
||||
*.userprefs
|
||||
|
||||
# Build results
|
||||
[Dd]ebug/
|
||||
[Dd]ebugPublic/
|
||||
[Rr]elease/
|
||||
[Rr]eleases/
|
||||
x64/
|
||||
x86/
|
||||
bld/
|
||||
[Bb]in/
|
||||
[Oo]bj/
|
||||
[Ll]og/
|
||||
|
||||
# Visual Studio 2015/2017 cache/options directory
|
||||
.vs/
|
||||
# Uncomment if you have tasks that create the project's static files in wwwroot
|
||||
#wwwroot/
|
||||
|
||||
# Visual Studio 2017 auto generated files
|
||||
Generated\ Files/
|
||||
|
||||
# MSTest test Results
|
||||
[Tt]est[Rr]esult*/
|
||||
[Bb]uild[Ll]og.*
|
||||
|
||||
# NUNIT
|
||||
*.VisualState.xml
|
||||
TestResult.xml
|
||||
|
||||
# Build Results of an ATL Project
|
||||
[Dd]ebugPS/
|
||||
[Rr]eleasePS/
|
||||
dlldata.c
|
||||
|
||||
# Benchmark Results
|
||||
BenchmarkDotNet.Artifacts/
|
||||
|
||||
# .NET Core
|
||||
project.lock.json
|
||||
project.fragment.lock.json
|
||||
artifacts/
|
||||
|
||||
# StyleCop
|
||||
StyleCopReport.xml
|
||||
|
||||
# Files built by Visual Studio
|
||||
*_i.c
|
||||
*_p.c
|
||||
*_h.h
|
||||
*.ilk
|
||||
*.meta
|
||||
*.obj
|
||||
*.iobj
|
||||
*.pch
|
||||
*.pdb
|
||||
*.ipdb
|
||||
*.pgc
|
||||
*.pgd
|
||||
*.rsp
|
||||
*.sbr
|
||||
*.tlb
|
||||
*.tli
|
||||
*.tlh
|
||||
*.tmp
|
||||
*.tmp_proj
|
||||
*_wpftmp.csproj
|
||||
*.log
|
||||
*.vspscc
|
||||
*.vssscc
|
||||
.builds
|
||||
*.pidb
|
||||
*.svclog
|
||||
*.scc
|
||||
|
||||
# Chutzpah Test files
|
||||
_Chutzpah*
|
||||
|
||||
# Visual C++ cache files
|
||||
ipch/
|
||||
*.aps
|
||||
*.ncb
|
||||
*.opendb
|
||||
*.opensdf
|
||||
*.sdf
|
||||
*.cachefile
|
||||
*.VC.db
|
||||
*.VC.VC.opendb
|
||||
|
||||
# Visual Studio profiler
|
||||
*.psess
|
||||
*.vsp
|
||||
*.vspx
|
||||
*.sap
|
||||
|
||||
# Visual Studio Trace Files
|
||||
*.e2e
|
||||
|
||||
# TFS 2012 Local Workspace
|
||||
$tf/
|
||||
|
||||
# Guidance Automation Toolkit
|
||||
*.gpState
|
||||
|
||||
# ReSharper is a .NET coding add-in
|
||||
_ReSharper*/
|
||||
*.[Rr]e[Ss]harper
|
||||
*.DotSettings.user
|
||||
|
||||
# JustCode is a .NET coding add-in
|
||||
.JustCode
|
||||
|
||||
# TeamCity is a build add-in
|
||||
_TeamCity*
|
||||
|
||||
# DotCover is a Code Coverage Tool
|
||||
*.dotCover
|
||||
|
||||
# AxoCover is a Code Coverage Tool
|
||||
.axoCover/*
|
||||
!.axoCover/settings.json
|
||||
|
||||
# Visual Studio code coverage results
|
||||
*.coverage
|
||||
*.coveragexml
|
||||
|
||||
# NCrunch
|
||||
_NCrunch_*
|
||||
.*crunch*.local.xml
|
||||
nCrunchTemp_*
|
||||
|
||||
# MightyMoose
|
||||
*.mm.*
|
||||
AutoTest.Net/
|
||||
|
||||
# Web workbench (sass)
|
||||
.sass-cache/
|
||||
|
||||
# Installshield output folder
|
||||
[Ee]xpress/
|
||||
|
||||
# DocProject is a documentation generator add-in
|
||||
DocProject/buildhelp/
|
||||
DocProject/Help/*.HxT
|
||||
DocProject/Help/*.HxC
|
||||
DocProject/Help/*.hhc
|
||||
DocProject/Help/*.hhk
|
||||
DocProject/Help/*.hhp
|
||||
DocProject/Help/Html2
|
||||
DocProject/Help/html
|
||||
|
||||
# Click-Once directory
|
||||
publish/
|
||||
|
||||
# Publish Web Output
|
||||
*.[Pp]ublish.xml
|
||||
*.azurePubxml
|
||||
# Note: Comment the next line if you want to checkin your web deploy settings,
|
||||
# but database connection strings (with potential passwords) will be unencrypted
|
||||
*.pubxml
|
||||
*.publishproj
|
||||
|
||||
# Microsoft Azure Web App publish settings. Comment the next line if you want to
|
||||
# checkin your Azure Web App publish settings, but sensitive information contained
|
||||
# in these scripts will be unencrypted
|
||||
PublishScripts/
|
||||
|
||||
# NuGet Packages
|
||||
*.nupkg
|
||||
# The packages folder can be ignored because of Package Restore
|
||||
**/[Pp]ackages/*
|
||||
# except build/, which is used as an MSBuild target.
|
||||
!**/[Pp]ackages/build/
|
||||
# Uncomment if necessary however generally it will be regenerated when needed
|
||||
#!**/[Pp]ackages/repositories.config
|
||||
# NuGet v3's project.json files produces more ignorable files
|
||||
*.nuget.props
|
||||
*.nuget.targets
|
||||
|
||||
# Microsoft Azure Build Output
|
||||
csx/
|
||||
*.build.csdef
|
||||
|
||||
# Microsoft Azure Emulator
|
||||
ecf/
|
||||
rcf/
|
||||
|
||||
# Windows Store app package directories and files
|
||||
AppPackages/
|
||||
BundleArtifacts/
|
||||
Package.StoreAssociation.xml
|
||||
_pkginfo.txt
|
||||
*.appx
|
||||
|
||||
# Visual Studio cache files
|
||||
# files ending in .cache can be ignored
|
||||
*.[Cc]ache
|
||||
# but keep track of directories ending in .cache
|
||||
!*.[Cc]ache/
|
||||
|
||||
# Others
|
||||
ClientBin/
|
||||
~$*
|
||||
*~
|
||||
*.dbmdl
|
||||
*.dbproj.schemaview
|
||||
*.jfm
|
||||
*.pfx
|
||||
*.publishsettings
|
||||
orleans.codegen.cs
|
||||
|
||||
# Including strong name files can present a security risk
|
||||
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
||||
#*.snk
|
||||
|
||||
# Since there are multiple workflows, uncomment next line to ignore bower_components
|
||||
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
|
||||
#bower_components/
|
||||
|
||||
# RIA/Silverlight projects
|
||||
Generated_Code/
|
||||
|
||||
# Backup & report files from converting an old project file
|
||||
# to a newer Visual Studio version. Backup files are not needed,
|
||||
# because we have git ;-)
|
||||
_UpgradeReport_Files/
|
||||
Backup*/
|
||||
UpgradeLog*.XML
|
||||
UpgradeLog*.htm
|
||||
ServiceFabricBackup/
|
||||
*.rptproj.bak
|
||||
|
||||
# SQL Server files
|
||||
*.mdf
|
||||
*.ldf
|
||||
*.ndf
|
||||
|
||||
# Business Intelligence projects
|
||||
*.rdl.data
|
||||
*.bim.layout
|
||||
*.bim_*.settings
|
||||
*.rptproj.rsuser
|
||||
|
||||
# Microsoft Fakes
|
||||
FakesAssemblies/
|
||||
|
||||
# GhostDoc plugin setting file
|
||||
*.GhostDoc.xml
|
||||
|
||||
# Node.js Tools for Visual Studio
|
||||
.ntvs_analysis.dat
|
||||
node_modules/
|
||||
|
||||
# Visual Studio 6 build log
|
||||
*.plg
|
||||
|
||||
# Visual Studio 6 workspace options file
|
||||
*.opt
|
||||
|
||||
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
|
||||
*.vbw
|
||||
|
||||
# Visual Studio LightSwitch build output
|
||||
**/*.HTMLClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/ModelManifest.xml
|
||||
**/*.Server/GeneratedArtifacts
|
||||
**/*.Server/ModelManifest.xml
|
||||
_Pvt_Extensions
|
||||
|
||||
# Paket dependency manager
|
||||
.paket/paket.exe
|
||||
paket-files/
|
||||
|
||||
# FAKE - F# Make
|
||||
.fake/
|
||||
|
||||
# JetBrains Rider
|
||||
.idea/
|
||||
*.sln.iml
|
||||
|
||||
# CodeRush personal settings
|
||||
.cr/personal
|
||||
|
||||
# Python Tools for Visual Studio (PTVS)
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
# Cake - Uncomment if you are using it
|
||||
# tools/**
|
||||
# !tools/packages.config
|
||||
|
||||
# Tabs Studio
|
||||
*.tss
|
||||
|
||||
# Telerik's JustMock configuration file
|
||||
*.jmconfig
|
||||
|
||||
# BizTalk build output
|
||||
*.btp.cs
|
||||
*.btm.cs
|
||||
*.odx.cs
|
||||
*.xsd.cs
|
||||
|
||||
# OpenCover UI analysis results
|
||||
OpenCover/
|
||||
|
||||
# Azure Stream Analytics local run output
|
||||
ASALocalRun/
|
||||
|
||||
# MSBuild Binary and Structured Log
|
||||
*.binlog
|
||||
|
||||
# NVidia Nsight GPU debugger configuration file
|
||||
*.nvuser
|
||||
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
.mfractor/
|
||||
|
||||
# Local History for Visual Studio
|
||||
.localhistory/
|
||||
|
||||
# Below is for ignore files for Java taken from github/gitignore.
|
||||
|
||||
# Compiled class file
|
||||
*.class
|
||||
|
||||
# Log file
|
||||
*.log
|
||||
|
||||
# BlueJ files
|
||||
*.ctxt
|
||||
|
||||
# Mobile Tools for Java (J2ME)
|
||||
.mtj.tmp/
|
||||
|
||||
# Package Files #
|
||||
*.jar
|
||||
*.war
|
||||
*.nar
|
||||
*.ear
|
||||
*.zip
|
||||
*.tar.gz
|
||||
*.rar
|
||||
|
||||
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
|
||||
hs_err_pid*
|
||||
|
||||
# IntelliJ file
|
||||
*.iml
|
||||
|
||||
# The target folder contains the output of building
|
||||
**/target/**
|
|
@ -0,0 +1,31 @@
|
|||
# Welcome!
|
||||
|
||||
If you are here, it means you are interested in helping us out. A hearty welcome and thank you! There are many ways you can contribute to the .NET for Apache Spark project:
|
||||
|
||||
* Offer PR's to fix bugs or implement new features.
|
||||
* Give us feedback and bug reports regarding the software or the documentation.
|
||||
* Improve our examples, tutorials, and documentation.
|
||||
|
||||
## Getting started:
|
||||
|
||||
Please make sure to take a look at the project [roadmap](ROADMAP.md).
|
||||
|
||||
### Pull requests
|
||||
|
||||
If you are new to GitHub [here](https://help.github.com/categories/collaborating-with-issues-and-pull-requests/) is a detailed help source on getting involved with development on GitHub.
|
||||
|
||||
As a first time contributor, you will be invited to sign the Contributor License Agreement (CLA). Please follow the instructions of the dotnet foundation bot reviewer on your PR to sign the agreement indicating that you have appropriate rights to your contribution.
|
||||
|
||||
Your pull request needs to reference a filed issue. Please fill in the template that is populated for the pull request. Only pull requests addressing small typos can have no issues associated with them.
|
||||
|
||||
A .NET for Apache Spark team member will be assigned to your pull request once the continuous integration checks have passed successfully.
|
||||
|
||||
All commits in a pull request will be squashed to a single commit with the original creator as author.
|
||||
|
||||
# Contributing
|
||||
|
||||
See [Contributing](docs/contributing.md) for information about coding styles, source structure, making pull requests, and more.
|
||||
|
||||
# Developers
|
||||
|
||||
See the [Developer Guide](docs/developer-guide.md) for details about developing in this repo.
|
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2019 .NET Foundation
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,9 @@
|
|||
We are excited to review your PR.
|
||||
|
||||
So we can do the best job, please check:
|
||||
|
||||
- [ ] There's a descriptive title that will make sense to other developers some time from now.
|
||||
- [ ] There's associated issues. All PR's should have issue(s) associated - unless a trivial self-evident change such as fixing a typo. You can use the format `Fixes #nnnn` in your description to cause GitHub to automatically close the issue(s) when your PR is merged.
|
||||
- [ ] Your change description explains what the change does, why you chose your approach, and anything else that reviewers should know.
|
||||
- [ ] You have included any necessary tests in the same PR.
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
# Spark .NET
|
||||
|
||||
![Icon](docs/img/spark-dot-net-logo.PNG)
|
||||
|
||||
Spark .NET is the .NET API for [Apache Spark](https://spark.apache.org/).
|
||||
|
||||
## Build Status
|
||||
| ![Ubuntu icon](docs/img/ubuntu-icon-32.png) | ![Ubuntu icon](docs/img/ubuntu-icon-32.png) | ![Windows icon](docs/img/windows-icon-32.png) |
|
||||
| :---: | :---: | :---: |
|
||||
| Ubuntu 16.04 | Ubuntu 18.04 | Windows 10 |
|
||||
| | | [![Build Status](https://dnceng.visualstudio.com/internal/_apis/build/status/spark.net?branchName=master)](https://dnceng.visualstudio.com/internal/_build/latest?definitionId=301?branchName=master)|
|
||||
|
||||
## Table of Contents
|
||||
|
||||
- [Introduction](#introduction)
|
||||
- [Quick Start (TL;DR)](#quick-start)
|
||||
- [Features](docs/features.md)
|
||||
- [FAQ/Troubleshooting](#faq)
|
||||
- [Inspiration and Special Thanks](#inspiration)
|
||||
- [How to Engage, Contribute and Provide Feedback](#community)
|
||||
- [.NET Foundation](#net-foundation)
|
||||
- [Code of Conduct](#code-of-conduct)
|
||||
- [License](#license)
|
||||
|
||||
<a name="introduction"></a>
|
||||
## Introduction
|
||||
|
||||
<a name="quick-start"></a>
|
||||
## Quick Start (TL;DR)
|
||||
|
||||
Spark .NET will be redistributed as a Nuget package and a formal release here on Github eventually to help you build your applications easily. Until then, please feel free to build it locally on your machine and link it appropriately. Building from source is very easy and the whole process (from cloning to being able to run your app) should take less than 15 minutes!
|
||||
|
||||
| | | Instructions |
|
||||
| :---: | :--- | :--- |
|
||||
| ![Windows icon](docs/img/windows-icon-32.png) | **Windows** | <ul><li>Local - [.NET Framework 4.6.1](docs/building/windows-instructions.md#using-visual-studio-for-net-framework-461)</li><li>Local - [.NET Core 2.1.x](docs/building/windows-instructions.md#using-net-core-cli-for-net-core-21x)</li><ul> |
|
||||
| ![Ubuntu icon](docs/img/ubuntu-icon-32.png) | **Ubuntu** | <ul><li>Local - [.NET Core 2.1.x](docs/building/ubuntu-instructions.md)</li><li>[Azure HDInsight Spark - .NET Core 2.1.x](deployment/README.md)</li></ul> |
|
||||
|
||||
## Contributing
|
||||
We welcome contributions! Please review our [contribution guide](CONTRIBUTING.md).
|
||||
|
||||
<a name="features"></a>
|
||||
## Features
|
||||
|
||||
<a name="faq"></a>
|
||||
## Frequently Asked Questions
|
||||
|
||||
<a name="inspiration"></a>
|
||||
## Inspiration
|
||||
|
||||
## Community
|
||||
|
||||
<a name="contact"></a>
|
||||
## How to Engage, Contribute and Provide Feedback
|
||||
|
||||
The Spark .NET team encourages [contributions](docs/contributing.md), both issues and PRs. The first step is finding an [existing issue](https://github.com/dotnet/spark/issues) you want to contribute to or if you cannot find any, [open an issue](https://github.com/dotnet/spark/issues?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+).
|
||||
|
||||
<a name="net-foundation"></a>
|
||||
## .NET Foundation
|
||||
|
||||
The Spark .NET project is part of the [.NET Foundation](http://www.dotnetfoundation.org).
|
||||
|
||||
<a name="code-of-conduct"></a>
|
||||
## Code of Conduct
|
||||
|
||||
This project has adopted the code of conduct defined by the Contributor Covenant
|
||||
to clarify expected behavior in our community.
|
||||
For more information, see the [.NET Foundation Code of Conduct](https://dotnetfoundation.org/code-of-conduct).
|
||||
|
||||
<a name="license"></a>
|
||||
## License
|
||||
|
||||
.NET for Apache Spark is licensed under the [MIT license](LICENSE).
|
|
@ -0,0 +1,41 @@
|
|||
# .NET for Apache Spark Roadmap
|
||||
|
||||
The goal of the .NET for Apache Spark project is to provide an easy to use, .NET-friendly integration to the popular big data platform, Apache Spark. This document describes the tentative plan for the project in the short and long-term.
|
||||
|
||||
.NET for Apache Spark is a community effort and we welcome community feedback on our plans. The best way to give feedback is to open an issue in this repo. We are also excited to receive contributions (check out the [contribution guide](docs/contributing.md)). It's always a good idea to open an issue for a discussion before embarking on a large code change to make sure there is not duplicated effort. Where we do know that efforts are already underway, we have used the (*) marker below.
|
||||
|
||||
## Short Term
|
||||
|
||||
### User Experience
|
||||
* 1:1 API compatibility for Dataframes with Apache Spark 2.3.x, Apache Spark 2.4.x and Apache Spark 3.0.x (*)
|
||||
|
||||
### Performance Optimizations
|
||||
* Improvements to C# Pickling Library
|
||||
* Improvements to Arrow .NET Library
|
||||
* Exploiting .NET Core 3.0 Vectorization (*)
|
||||
* Micro-benchmarking framework for Interop
|
||||
|
||||
### Benchmarks
|
||||
* Benchmarking scripts for all languages that include generating the dataset and running queries against it (*)
|
||||
* Published reproducible benchmarks against [TPC-H](http://www.tpc.org/tpch/) (industry-standard database benchmark) (*)
|
||||
|
||||
### Tooling Improvements
|
||||
* VS Code support (*)
|
||||
* Apache Jupyter integration with C# & F# Notebook Support (*)
|
||||
* Improved user experience for .NET app submission to a remote Spark cluster
|
||||
|
||||
## Longer Term
|
||||
|
||||
### User Experience
|
||||
* Idiomatic C# and F# APIs
|
||||
|
||||
### Performance Optimizations
|
||||
* Contribute extensible interop layer to Apache Spark
|
||||
|
||||
### Benchmarks
|
||||
* Published reproducible benchmarks against [TPC-DS](http://www.tpc.org/tpcds/default.asp) (industry-standard database benchmark)
|
||||
|
||||
### Tooling Improvements
|
||||
* Visual Studio Extension for .NET app submission to a remote Spark cluster
|
||||
* Visual Studio Extension for .NET app debugging
|
||||
* Make it easy to copy/paste Scala examples into Visual Studio
|
|
@ -0,0 +1,10 @@
|
|||
.NET for Apache Spark uses third-party libraries or other resources that may be
|
||||
distributed under licenses different than the .NET for Apache Spark software.
|
||||
|
||||
In the event that we accidentally failed to list a required notice, please
|
||||
bring it to our attention. Post an issue or email us:
|
||||
|
||||
dotnet@microsoft.com
|
||||
|
||||
The attached notices are provided for information only.
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
# Spark .NET build
|
||||
|
||||
trigger:
|
||||
- master
|
||||
|
||||
pool:
|
||||
vmImage: 'VS2017-Win2016'
|
||||
|
||||
variables:
|
||||
solution: '**/*.sln'
|
||||
buildConfiguration: 'Release'
|
||||
|
||||
steps:
|
||||
- task: NuGetToolInstaller@0
|
||||
inputs:
|
||||
versionSpec: '4.9.2'
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: '.NET restore'
|
||||
inputs:
|
||||
# Use a custom restore command because the built in restore command uses a temp nuget.config
|
||||
# which overwrites the MSBuild restore properties
|
||||
command: 'custom'
|
||||
custom: 'restore'
|
||||
projects: '$(solution)'
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: '.NET build'
|
||||
inputs:
|
||||
command: build
|
||||
projects: '$(solution)'
|
||||
arguments: '--configuration $(buildConfiguration)'
|
||||
|
||||
- task: BatchScript@1
|
||||
displayName: Publish Microsoft.Spark.Worker
|
||||
inputs:
|
||||
filename: script\publish-workers.cmd
|
||||
arguments: $(Build.SourcesDirectory) $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker $(buildConfiguration)
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: '.NET unit tests'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/*UnitTest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
|
||||
- task: Maven@3
|
||||
displayName: 'Maven build src'
|
||||
inputs:
|
||||
mavenPomFile: src/scala/pom.xml
|
||||
|
||||
- task: Maven@3
|
||||
displayName: 'Maven build benchmark'
|
||||
inputs:
|
||||
mavenPomFile: benchmark/scala/pom.xml
|
||||
|
||||
- task: NuGetCommand@2
|
||||
inputs:
|
||||
command: pack
|
||||
packagesToPack: '$(Build.SourcesDirectory)\src\csharp\Microsoft.Spark.nuspec'
|
||||
|
||||
- task: PublishBuildArtifacts@1
|
||||
inputs:
|
||||
pathtoPublish: '$(Build.ArtifactStagingDirectory)'
|
||||
artifactName: Microsoft.Spark.Binaries
|
||||
|
||||
- task: BatchScript@1
|
||||
displayName: Download Spark Distros & Winutils.exe
|
||||
inputs:
|
||||
filename: script\download-spark-distros.cmd
|
||||
arguments: $(Build.BinariesDirectory)
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: 'E2E tests for Spark 2.3.0'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/Microsoft.Spark.E2ETest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
env:
|
||||
SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.0-bin-hadoop2.7
|
||||
HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
|
||||
DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: 'E2E tests for Spark 2.3.1'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/Microsoft.Spark.E2ETest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
env:
|
||||
SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.1-bin-hadoop2.7
|
||||
HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
|
||||
DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: 'E2E tests for Spark 2.3.2'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/Microsoft.Spark.E2ETest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
env:
|
||||
SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.2-bin-hadoop2.7
|
||||
HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
|
||||
DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: 'E2E tests for Spark 2.3.3'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/Microsoft.Spark.E2ETest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
env:
|
||||
SPARK_HOME: $(Build.BinariesDirectory)\spark-2.3.3-bin-hadoop2.7
|
||||
HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
|
||||
DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: 'E2E tests for Spark 2.4.0'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/Microsoft.Spark.E2ETest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
env:
|
||||
SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.0-bin-hadoop2.7
|
||||
HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
|
||||
DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64
|
||||
|
||||
- task: DotNetCoreCLI@2
|
||||
displayName: 'E2E tests for Spark 2.4.1'
|
||||
inputs:
|
||||
command: test
|
||||
projects: '**/Microsoft.Spark.E2ETest/*.csproj'
|
||||
arguments: '--configuration $(buildConfiguration) /p:CollectCoverage=true /p:CoverletOutputFormat=cobertura'
|
||||
env:
|
||||
SPARK_HOME: $(Build.BinariesDirectory)\spark-2.4.1-bin-hadoop2.7
|
||||
HADOOP_HOME: $(Build.BinariesDirectory)\hadoop
|
||||
DotnetWorkerPath: $(Build.ArtifactStagingDirectory)\Microsoft.Spark.Worker\netcoreapp2.1\win-x64
|
|
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" TreatAsLocalProperty="ExcludeRestorePackageImports">
|
||||
<Import Project="..\..\src\csharp\Directory.Build.props" />
|
||||
</Project>
|
|
@ -0,0 +1,37 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 15
|
||||
VisualStudioVersion = 15.0.28307.168
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Tpch", "Tpch\Tpch.csproj", "{C1A5ED09-7924-4784-A880-97DF975DE78A}"
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark", "..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj", "{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}"
|
||||
EndProject
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Worker", "..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj", "{A267D1A0-8EF6-475F-B118-67DDACD4373A}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
Release|Any CPU = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{C1A5ED09-7924-4784-A880-97DF975DE78A}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{8EB85725-BEB0-4807-B9E2-2BB26B1D7D55}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{A267D1A0-8EF6-475F-B118-67DDACD4373A}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {88B4A99F-3096-495A-9055-7A270C9269B2}
|
||||
EndGlobalSection
|
||||
EndGlobal
|
|
@ -0,0 +1,55 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
||||
namespace Tpch
|
||||
{
|
||||
internal class Program
|
||||
{
|
||||
private static void Main(string[] args)
|
||||
{
|
||||
if (args.Length != 4)
|
||||
{
|
||||
Console.WriteLine("Usage:");
|
||||
Console.WriteLine("\t<spark-submit> --master local");
|
||||
Console.WriteLine("\t\t--class org.apache.spark.deploy.DotnetRunner <path-to-microsoft-spark-jar>");
|
||||
Console.WriteLine("\t\tTpch.exe <tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>");
|
||||
}
|
||||
|
||||
var tpchRoot = args[0];
|
||||
var queryNumber = int.Parse(args[1]);
|
||||
var numIteration = int.Parse(args[2]);
|
||||
var isSQL = bool.Parse(args[3]);
|
||||
|
||||
for (var i = 0; i < numIteration; ++i)
|
||||
{
|
||||
SparkSession spark = SparkSession
|
||||
.Builder()
|
||||
.AppName("TPC-H Benchmark for DotNet")
|
||||
.GetOrCreate();
|
||||
|
||||
Stopwatch sw = Stopwatch.StartNew();
|
||||
if (!isSQL)
|
||||
{
|
||||
var tpchFunctional = new TpchFunctionalQueries(tpchRoot, spark);
|
||||
tpchFunctional.Run(queryNumber.ToString());
|
||||
}
|
||||
else
|
||||
{
|
||||
var tpchSql = new TpchSqlQueries(tpchRoot, spark);
|
||||
tpchSql.Run(queryNumber.ToString());
|
||||
}
|
||||
sw.Stop();
|
||||
|
||||
var typeStr = isSQL ? "SQL" : "Functional";
|
||||
Console.WriteLine($"TPCH_Result,DotNet,{typeStr},{queryNumber},{i},{sw.ElapsedMilliseconds}");
|
||||
|
||||
spark.Stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Text.RegularExpressions;
|
||||
|
||||
namespace Tpch
|
||||
{
|
||||
internal static class StringExtensions
|
||||
{
|
||||
internal static string StripMargin(this string s)
|
||||
{
|
||||
return Regex.Replace(s, @"[ \t]+\|", string.Empty);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
|
||||
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
|
||||
<RootNamespace>Tpch</RootNamespace>
|
||||
<AssemblyName>Tpch</AssemblyName>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
|
||||
<ProjectReference Include="..\..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
|
@ -0,0 +1,29 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.IO;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
||||
namespace Tpch
|
||||
{
|
||||
internal class TpchBase
|
||||
{
|
||||
protected readonly DataFrame _customer, _lineitem, _nation, _orders,
|
||||
_part, _partsupp, _region, _supplier;
|
||||
|
||||
internal TpchBase(string tpchRoot, SparkSession spark)
|
||||
{
|
||||
// Load all the TPC-H tables.
|
||||
tpchRoot += Path.DirectorySeparatorChar;
|
||||
_customer = spark.Read().Parquet($"{tpchRoot}customer");
|
||||
_lineitem = spark.Read().Parquet($"{tpchRoot}lineitem");
|
||||
_nation = spark.Read().Parquet($"{tpchRoot}nation");
|
||||
_orders = spark.Read().Parquet($"{tpchRoot}orders");
|
||||
_part = spark.Read().Parquet($"{tpchRoot}part");
|
||||
_partsupp = spark.Read().Parquet($"{tpchRoot}partsupp");
|
||||
_region = spark.Read().Parquet($"{tpchRoot}region");
|
||||
_supplier = spark.Read().Parquet($"{tpchRoot}supplier");
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,512 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Reflection;
|
||||
using System.Text.RegularExpressions;
|
||||
using Microsoft.Spark.Sql;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Tpch
|
||||
{
|
||||
internal class TpchFunctionalQueries : TpchBase
|
||||
{
|
||||
internal TpchFunctionalQueries(string tpchRoot, SparkSession spark)
|
||||
: base(tpchRoot, spark)
|
||||
{
|
||||
}
|
||||
|
||||
internal void RunAll()
|
||||
{
|
||||
for (var i = 1; i <= 22; i++)
|
||||
{
|
||||
Run(i.ToString());
|
||||
}
|
||||
}
|
||||
|
||||
internal void Run(string queryNumber)
|
||||
{
|
||||
Console.WriteLine($"Spark.NET TPCH Functional Query: #{queryNumber}");
|
||||
Type thisType = GetType();
|
||||
MethodInfo queryMethod = thisType.GetMethod(
|
||||
$"Q{queryNumber}", BindingFlags.Instance | BindingFlags.NonPublic);
|
||||
|
||||
var sw = Stopwatch.StartNew();
|
||||
queryMethod.Invoke(this, null);
|
||||
Console.WriteLine($"\tElapsed: {sw.Elapsed}");
|
||||
}
|
||||
|
||||
internal void Q1()
|
||||
{
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
Func<Column, Column, Column> increase = Udf<double, double, double>((x, y) => x * (1 + y));
|
||||
|
||||
_lineitem.Filter(Col("l_shipdate") <= "1998-09-02")
|
||||
.GroupBy(Col("l_returnflag"), Col("l_linestatus"))
|
||||
.Agg(Sum(Col("l_quantity")).As("sum_qty"), Sum(Col("l_extendedprice")).As("sum_base_price"),
|
||||
Sum(decrease(Col("l_extendedprice"), Col("l_discount"))).As("sum_disc_price"),
|
||||
Sum(increase(decrease(Col("l_extendedprice"), Col("l_discount")), Col("l_tax"))).As("sum_charge"),
|
||||
Avg(Col("l_quantity")).As("avg_qty"),
|
||||
Avg(Col("l_extendedprice")).As("avg_price"),
|
||||
Avg(Col("l_discount")).As("avg_disc"),
|
||||
Count(Col("l_quantity")).As("count_order")
|
||||
)
|
||||
.Sort(Col("l_returnflag"), Col("l_linestatus"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q2()
|
||||
{
|
||||
DataFrame europe = _region.Filter(Col("r_name") == "EUROPE")
|
||||
.Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
|
||||
.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
|
||||
.Join(_partsupp, _supplier["s_suppkey"] == _partsupp["ps_suppkey"]);
|
||||
|
||||
DataFrame brass = _part
|
||||
.Filter(_part["p_size"] == 15 & _part["p_type"].EndsWith("BRASS"))
|
||||
.Join(europe, europe["ps_partkey"] == Col("p_partkey"));
|
||||
|
||||
DataFrame minCost = brass.GroupBy(brass["ps_partkey"])
|
||||
.Agg(Min("ps_supplycost").As("min"));
|
||||
|
||||
brass.Join(minCost, brass["ps_partkey"] == minCost["ps_partkey"])
|
||||
.Filter(brass["ps_supplycost"] == minCost["min"])
|
||||
.Select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment")
|
||||
.Sort(Col("s_acctbal").Desc(), Col("n_name"), Col("s_name"), Col("p_partkey"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q3()
|
||||
{
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
DataFrame fcust = _customer.Filter(Col("c_mktsegment") == "BUILDING");
|
||||
DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-03-15");
|
||||
DataFrame flineitems = _lineitem.Filter(Col("l_shipdate") > "1995-03-15");
|
||||
|
||||
fcust.Join(forders, Col("c_custkey") == forders["o_custkey"])
|
||||
.Select(Col("o_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
|
||||
.Join(flineitems, Col("o_orderkey") == flineitems["l_orderkey"])
|
||||
.Select(Col("l_orderkey"),
|
||||
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
|
||||
Col("o_orderdate"), Col("o_shippriority"))
|
||||
.GroupBy(Col("l_orderkey"), Col("o_orderdate"), Col("o_shippriority"))
|
||||
.Agg(Sum(Col("volume")).As("revenue"))
|
||||
.Sort(Col("revenue").Desc(), Col("o_orderdate"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q4()
|
||||
{
|
||||
DataFrame forders = _orders.Filter(Col("o_orderdate") >= "1993-07-01" &
|
||||
Col("o_orderdate") < "1993-10-01");
|
||||
DataFrame flineitems = _lineitem.Filter(Col("l_commitdate") < Col("l_receiptdate"))
|
||||
.Select($"l_orderkey")
|
||||
.Distinct();
|
||||
|
||||
flineitems.Join(forders, Col("l_orderkey") == forders["o_orderkey"])
|
||||
.GroupBy(Col("o_orderpriority"))
|
||||
.Agg(Count(Col("o_orderpriority")).As("order_count"))
|
||||
.Sort(Col("o_orderpriority"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q5()
|
||||
{
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
DataFrame forders = _orders.Filter(Col("o_orderdate") < "1995-01-01" & Col("o_orderdate") >= "1994-01-01");
|
||||
|
||||
_region.Filter(Col("r_name") == "ASIA")
|
||||
.Join(_nation, Col("r_regionkey") == _nation["n_regionkey"])
|
||||
.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
|
||||
.Join(_lineitem, Col("s_suppkey") == _lineitem["l_suppkey"])
|
||||
.Select(Col("n_name"), Col("l_extendedprice"), Col("l_discount"), Col("l_orderkey"), Col("s_nationkey"))
|
||||
.Join(forders, Col("l_orderkey") == forders["o_orderkey"])
|
||||
.Join(_customer, Col("o_custkey") == _customer["c_custkey"]
|
||||
& Col("s_nationkey") == _customer["c_nationkey"])
|
||||
.Select(Col("n_name"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value"))
|
||||
.GroupBy(Col("n_name"))
|
||||
.Agg(Sum(Col("value")).As("revenue"))
|
||||
.Sort(Col("revenue").Desc())
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q6()
|
||||
{
|
||||
_lineitem.Filter(Col("l_shipdate") >= "1994-01-01" & Col("l_shipdate") < "1995-01-01"
|
||||
& Col("l_discount") >= 0.05 & Col("l_discount") <= 0.07 & Col("l_quantity") < 24)
|
||||
.Agg(Sum(Col("l_extendedprice") * Col("l_discount")).As("revenue"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
// C#, Scala and SparkSQL results match but SparkSQL has different precision.
|
||||
internal void Q7()
|
||||
{
|
||||
Func<Column, Column> getYear = Udf<string, string>(x => x.Substring(0, 4));
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
DataFrame fnation = _nation.Filter(Col("n_name") == "FRANCE" | Col("n_name") == "GERMANY");
|
||||
DataFrame fline = _lineitem.Filter(Col("l_shipdate") >= "1995-01-01" & Col("l_shipdate") <= "1996-12-31");
|
||||
|
||||
DataFrame supNation = fnation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
|
||||
.Join(fline, Col("s_suppkey") == fline["l_suppkey"])
|
||||
.Select(Col("n_name").As("supp_nation"), Col("l_orderkey"), Col("l_extendedprice"), Col("l_discount"), Col("l_shipdate"));
|
||||
|
||||
fnation.Join(_customer, Col("n_nationkey") == _customer["c_nationkey"])
|
||||
.Join(_orders, Col("c_custkey") == _orders["o_custkey"])
|
||||
.Select(Col("n_name").As("cust_nation"), Col("o_orderkey"))
|
||||
.Join(supNation, Col("o_orderkey") == supNation["l_orderkey"])
|
||||
.Filter(Col("supp_nation") == "FRANCE" & Col("cust_nation") == "GERMANY"
|
||||
| Col("supp_nation") == "GERMANY" & Col("cust_nation") == "FRANCE")
|
||||
.Select(Col("supp_nation"), Col("cust_nation"),
|
||||
getYear(Col("l_shipdate")).As("l_year"),
|
||||
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
|
||||
.GroupBy(Col("supp_nation"), Col("cust_nation"), Col("l_year"))
|
||||
.Agg(Sum(Col("volume")).As("revenue"))
|
||||
.Sort(Col("supp_nation"), Col("cust_nation"), Col("l_year"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q8()
|
||||
{
|
||||
Func<Column, Column> getYear = Udf<string, string>(x => x.Substring(0, 4));
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
Func<Column, Column, Column> isBrazil = Udf<string, double, double>((x, y) => x == "BRAZIL" ? y : 0);
|
||||
|
||||
DataFrame fregion = _region.Filter(Col("r_name") == "AMERICA");
|
||||
DataFrame forder = _orders.Filter(Col("o_orderdate") <= "1996-12-31" & Col("o_orderdate") >= "1995-01-01");
|
||||
DataFrame fpart = _part.Filter(Col("p_type") == "ECONOMY ANODIZED STEEL");
|
||||
|
||||
DataFrame nat = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]);
|
||||
|
||||
DataFrame line = _lineitem.Select(Col("l_partkey"), Col("l_suppkey"), Col("l_orderkey"),
|
||||
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
|
||||
.Join(fpart, Col("l_partkey") == fpart["p_partkey"])
|
||||
.Join(nat, Col("l_suppkey") == nat["s_suppkey"]);
|
||||
|
||||
_nation.Join(fregion, Col("n_regionkey") == fregion["r_regionkey"])
|
||||
.Select(Col("n_nationkey"))
|
||||
.Join(_customer, Col("n_nationkey") == _customer["c_nationkey"])
|
||||
.Select(Col("c_custkey"))
|
||||
.Join(forder, Col("c_custkey") == forder["o_custkey"])
|
||||
.Select(Col("o_orderkey"), Col("o_orderdate"))
|
||||
.Join(line, Col("o_orderkey") == line["l_orderkey"])
|
||||
.Select(getYear(Col("o_orderdate")).As("o_year"), Col("volume"),
|
||||
isBrazil(Col("n_name"), Col("volume")).As("case_volume"))
|
||||
.GroupBy(Col("o_year"))
|
||||
.Agg((Sum(Col("case_volume")) / Sum("volume")).As("mkt_share"))
|
||||
.Sort(Col("o_year"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q9()
|
||||
{
|
||||
Func<Column, Column> getYear = Udf<string, string>(x => x.Substring(0, 4));
|
||||
Func<Column, Column, Column, Column, Column> expr = Udf<double, double, double, double, double>((x, y, v, w) => x * (1 - y) - (v * w));
|
||||
|
||||
DataFrame linePart = _part.Filter(Col("p_name").Contains("green"))
|
||||
.Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"]);
|
||||
|
||||
DataFrame natSup = _nation.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"]);
|
||||
|
||||
linePart.Join(natSup, Col("l_suppkey") == natSup["s_suppkey"])
|
||||
.Join(_partsupp, Col("l_suppkey") == _partsupp["ps_suppkey"]
|
||||
& Col("l_partkey") == _partsupp["ps_partkey"])
|
||||
.Join(_orders, Col("l_orderkey") == _orders["o_orderkey"])
|
||||
.Select(Col("n_name"), getYear(Col("o_orderdate")).As("o_year"),
|
||||
expr(Col("l_extendedprice"), Col("l_discount"), Col("ps_supplycost"), Col("l_quantity")).As("amount"))
|
||||
.GroupBy(Col("n_name"), Col("o_year"))
|
||||
.Agg(Sum(Col("amount")))
|
||||
.Sort(Col("n_name"), Col("o_year").Desc())
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q10()
|
||||
{
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
DataFrame flineitem = _lineitem.Filter(Col("l_returnflag") == "R");
|
||||
|
||||
_orders.Filter(Col("o_orderdate") < "1994-01-01" & Col("o_orderdate") >= "1993-10-01")
|
||||
.Join(_customer, Col("o_custkey") == _customer["c_custkey"])
|
||||
.Join(_nation, Col("c_nationkey") == _nation["n_nationkey"])
|
||||
.Join(flineitem, Col("o_orderkey") == flineitem["l_orderkey"])
|
||||
.Select(Col("c_custkey"), Col("c_name"),
|
||||
decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"),
|
||||
Col("c_acctbal"), Col("n_name"), Col("c_address"), Col("c_phone"), Col("c_comment"))
|
||||
.GroupBy(Col("c_custkey"), Col("c_name"), Col("c_acctbal"), Col("c_phone"), Col("n_name"), Col("c_address"), Col("c_comment"))
|
||||
.Agg(Sum(Col("volume")).As("revenue"))
|
||||
.Sort(Col("revenue").Desc())
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q11()
|
||||
{
|
||||
Func<Column, Column, Column> mul = Udf<double, int, double>((x, y) => x * y);
|
||||
Func<Column, Column> mul01 = Udf<double, double>(x => x * 0.0001);
|
||||
|
||||
DataFrame tmp = _nation.Filter(Col("n_name") == "GERMANY")
|
||||
.Join(_supplier, Col("n_nationkey") == _supplier["s_nationkey"])
|
||||
.Select(Col("s_suppkey"))
|
||||
.Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
|
||||
.Select(Col("ps_partkey"), mul(Col("ps_supplycost"), Col("ps_availqty")).As("value"));
|
||||
|
||||
DataFrame sumRes = tmp.Agg(Sum("value").As("total_value"));
|
||||
|
||||
tmp.GroupBy(Col("ps_partkey")).Agg(Sum("value").As("part_value"))
|
||||
.Join(sumRes, Col("part_value") > mul01(Col("total_value")))
|
||||
.Sort(Col("part_value").Desc())
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q12()
|
||||
{
|
||||
Func<Column, Column, Column> mul = Udf<double, int, double>((x, y) => x * y);
|
||||
Func<Column, Column> highPriority = Udf<string, int>(x => (x == "1-URGENT" || x == "2-HIGH") ? 1 : 0);
|
||||
Func<Column, Column> lowPriority = Udf<string, int>(x => (x != "1-URGENT" && x != "2-HIGH") ? 1 : 0);
|
||||
|
||||
_lineitem.Filter((
|
||||
Col("l_shipmode") == "MAIL" | Col("l_shipmode") == "SHIP") &
|
||||
Col("l_commitdate") < Col("l_receiptdate") &
|
||||
Col("l_shipdate") < Col("l_commitdate") &
|
||||
Col("l_receiptdate") >= "1994-01-01" & Col("l_receiptdate") < "1995-01-01")
|
||||
.Join(_orders, Col("l_orderkey") == _orders["o_orderkey"])
|
||||
.Select(Col("l_shipmode"), Col("o_orderpriority"))
|
||||
.GroupBy(Col("l_shipmode"))
|
||||
.Agg(Sum(highPriority(Col("o_orderpriority"))).As("sum_highorderpriority"),
|
||||
Sum(lowPriority(Col("o_orderpriority"))).As("sum_loworderpriority"))
|
||||
.Sort(Col("l_shipmode"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
private static readonly Regex s_q13SpecialRegex = new Regex("^.*special.*requests.*", RegexOptions.Compiled);
|
||||
internal void Q13()
|
||||
{
|
||||
Func<Column, Column> special = Udf<string, bool>((x) => s_q13SpecialRegex.IsMatch(x));
|
||||
|
||||
DataFrame c_orders = _customer.Join(_orders, Col("c_custkey") == _orders["o_custkey"]
|
||||
& !special(_orders["o_comment"]), "left_outer")
|
||||
.GroupBy(Col("c_custkey"))
|
||||
.Agg(Count(Col("o_orderkey")).As("c_count"));
|
||||
|
||||
c_orders
|
||||
.GroupBy(Col("c_count"))
|
||||
.Agg(Count(Col("*")).As("custdist"))
|
||||
.Sort(Col("custdist").Desc(), Col("c_count").Desc())
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q14()
|
||||
{
|
||||
Func<Column, Column, Column> reduce = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
Func<Column, Column, Column> promo = Udf<string, double, double>((x, y) => x.StartsWith("PROMO") ? y : 0);
|
||||
|
||||
_part.Join(_lineitem, Col("l_partkey") == Col("p_partkey") &
|
||||
Col("l_shipdate") >= "1995-09-01" & Col("l_shipdate") < "1995-10-01")
|
||||
.Select(Col("p_type"), reduce(Col("l_extendedprice"), Col("l_discount")).As("value"))
|
||||
.Agg(Sum(promo(Col("p_type"), Col("value"))) * 100 / Sum(Col("value")))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q15()
|
||||
{
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
DataFrame revenue = _lineitem.Filter(Col("l_shipdate") >= "1996-01-01" &
|
||||
Col("l_shipdate") < "1996-04-01")
|
||||
.Select(Col("l_suppkey"), decrease(Col("l_extendedprice"), Col("l_discount")).As("value"))
|
||||
.GroupBy(Col("l_suppkey"))
|
||||
.Agg(Sum(Col("value")).As("total"));
|
||||
|
||||
revenue.Agg(Max(Col("total")).As("max_total"))
|
||||
.Join(revenue, Col("max_total") == revenue["total"])
|
||||
.Join(_supplier, Col("l_suppkey") == _supplier["s_suppkey"])
|
||||
.Select(Col("s_suppkey"), Col("s_name"), Col("s_address"), Col("s_phone"), Col("total"))
|
||||
.Sort(Col("s_suppkey"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
private static readonly Regex s_q16CompainsRegex = new Regex(".*Customer.*Complaints.*", RegexOptions.Compiled);
|
||||
private static readonly Regex s_q16NumbersRegex = new Regex("^(49|14|23|45|19|3|36|9)$", RegexOptions.Compiled);
|
||||
internal void Q16()
|
||||
{
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
Func<Column, Column> complains = Udf<string, bool>((x) => s_q16CompainsRegex.Match(x).Success);
|
||||
|
||||
Func<Column, Column> polished = Udf<string, bool>((x) => x.StartsWith("MEDIUM POLISHED"));
|
||||
|
||||
Func<Column, Column> numbers = Udf<int, bool>((x) => s_q16NumbersRegex.Match(x.ToString()).Success);
|
||||
|
||||
DataFrame fparts = _part.Filter((Col("p_brand") != "Brand#45") & !polished(Col("p_type")) &
|
||||
numbers(Col("p_size")))
|
||||
.Select(Col("p_partkey"), Col("p_brand"), Col("p_type"), Col("p_size"));
|
||||
|
||||
_supplier.Filter(!complains(Col("s_comment")))
|
||||
.Join(_partsupp, Col("s_suppkey") == _partsupp["ps_suppkey"])
|
||||
.Select(Col("ps_partkey"), Col("ps_suppkey"))
|
||||
.Join(fparts, Col("ps_partkey") == fparts["p_partkey"])
|
||||
.GroupBy(Col("p_brand"), Col("p_type"), Col("p_size"))
|
||||
.Agg(CountDistinct(Col("ps_suppkey")).As("supplier_count"))
|
||||
.Sort(Col("supplier_count").Desc(), Col("p_brand"), Col("p_type"), Col("p_size"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q17()
|
||||
{
|
||||
Func<Column, Column> mul02 = Udf<double, double>((x) => x * 0.2);
|
||||
|
||||
DataFrame flineitem = _lineitem.Select(Col("l_partkey"), Col("l_quantity"), Col("l_extendedprice"));
|
||||
|
||||
DataFrame fpart = _part.Filter(Col("p_brand") == "Brand#23" & Col("p_container") == "MED BOX")
|
||||
.Select(Col("p_partkey"))
|
||||
.Join(_lineitem, Col("p_partkey") == _lineitem["l_partkey"], "left_outer");
|
||||
|
||||
fpart.GroupBy("p_partkey")
|
||||
.Agg(mul02(Avg(Col("l_quantity"))).As("avg_quantity"))
|
||||
.Select(Col("p_partkey").As("key"), Col("avg_quantity"))
|
||||
.Join(fpart, Col("key") == fpart["p_partkey"])
|
||||
.Filter(Col("l_quantity") < Col("avg_quantity"))
|
||||
.Agg((Sum(Col("l_extendedprice")) / 7.0).As("avg_yearly"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q18()
|
||||
{
|
||||
_lineitem.GroupBy(Col("l_orderkey"))
|
||||
.Agg(Sum(Col("l_quantity")).As("sum_quantity"))
|
||||
.Filter(Col("sum_quantity") > 300)
|
||||
.Select(Col("l_orderkey").As("key"), Col("sum_quantity"))
|
||||
.Join(_orders, _orders["o_orderkey"] == Col("key"))
|
||||
.Join(_lineitem, Col("o_orderkey") == _lineitem["l_orderkey"])
|
||||
.Join(_customer, _customer["c_custkey"] == Col("o_custkey"))
|
||||
.Select(Col("l_quantity"), Col("c_name"), Col("c_custkey"), Col("o_orderkey"), Col("o_orderdate"), Col("o_totalprice"))
|
||||
.GroupBy(Col("c_name"), Col("c_custkey"), Col("o_orderkey"), Col("o_orderdate"), Col("o_totalprice"))
|
||||
.Agg(Sum("l_quantity"))
|
||||
.Sort(Col("o_totalprice").Desc(), Col("o_orderdate"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
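// Compiled regexes matching the SM, MED, and LG container groups used in Q19's filter.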
private static readonly Regex s_q19SmRegex = new Regex("SM CASE|SM BOX|SM PACK|SM PKG", RegexOptions.Compiled);
|
||||
private static readonly Regex s_q19MdRegex = new Regex("MED BAG|MED BOX|MED PKG|MED PACK", RegexOptions.Compiled);
|
||||
private static readonly Regex s_q19LgRegex = new Regex("LG CASE|LG BOX|LG PACK|LG PKG", RegexOptions.Compiled);
|
||||
internal void Q19()
|
||||
{
|
||||
Func<Column, Column> sm = Udf<string, bool>(x => s_q19SmRegex.IsMatch(x));
|
||||
Func<Column, Column> md = Udf<string, bool>(x => s_q19MdRegex.IsMatch(x));
|
||||
Func<Column, Column> lg = Udf<string, bool>(x => s_q19LgRegex.IsMatch(x));
|
||||
|
||||
Func<Column, Column, Column> decrease = Udf<double, double, double>((x, y) => x * (1 - y));
|
||||
|
||||
_part.Join(_lineitem, Col("l_partkey") == Col("p_partkey"))
|
||||
.Filter((Col("l_shipmode") == "AIR" | Col("l_shipmode") == "AIR REG") &
|
||||
Col("l_shipinstruct") == "DELIVER IN PERSON")
|
||||
.Filter(
|
||||
((Col("p_brand") == "Brand#12") &
|
||||
sm(Col("p_container")) &
|
||||
Col("l_quantity") >= 1 & Col("l_quantity") <= 11 &
|
||||
Col("p_size") >= 1 & Col("p_size") <= 5) |
|
||||
((Col("p_brand") == "Brand#23") &
|
||||
md(Col("p_container")) &
|
||||
Col("l_quantity") >= 10 & Col("l_quantity") <= 20 &
|
||||
Col("p_size") >= 1 & Col("p_size") <= 10) |
|
||||
((Col("p_brand") == "Brand#34") &
|
||||
lg(Col("p_container")) &
|
||||
Col("l_quantity") >= 20 & Col("l_quantity") <= 30 &
|
||||
Col("p_size") >= 1 & Col("p_size") <= 15))
|
||||
.Select(decrease(Col("l_extendedprice"), Col("l_discount")).As("volume"))
|
||||
.Agg(Sum("volume").As("revenue"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q20()
|
||||
{
|
||||
Func<Column, Column> forest = Udf<string, bool>(x => x.StartsWith("forest"));
|
||||
|
||||
DataFrame flineitem = _lineitem.Filter(Col("l_shipdate") >= "1994-01-01" & Col("l_shipdate") < "1995-01-01")
|
||||
.GroupBy(Col("l_partkey"), Col("l_suppkey"))
|
||||
.Agg((Sum(Col("l_quantity")) * 0.5).As("sum_quantity"));
|
||||
|
||||
DataFrame fnation = _nation.Filter(Col("n_name") == "CANADA");
|
||||
DataFrame nat_supp = _supplier.Select(Col("s_suppkey"), Col("s_name"), Col("s_nationkey"), Col("s_address"))
|
||||
.Join(fnation, Col("s_nationkey") == fnation["n_nationkey"]);
|
||||
|
||||
_part.Filter(forest(Col("p_name")))
|
||||
.Select(Col("p_partkey"))
|
||||
.Distinct()
|
||||
.Join(_partsupp, Col("p_partkey") == _partsupp["ps_partkey"])
|
||||
.Join(flineitem, Col("ps_suppkey") == flineitem["l_suppkey"] & Col("ps_partkey") == flineitem["l_partkey"])
|
||||
.Filter(Col("ps_availqty") > Col("sum_quantity"))
|
||||
.Select(Col("ps_suppkey"))
|
||||
.Distinct()
|
||||
.Join(nat_supp, Col("ps_suppkey") == nat_supp["s_suppkey"])
|
||||
.Select(Col("s_name"), Col("s_address"))
|
||||
.Sort(Col("s_name"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
internal void Q21()
|
||||
{
|
||||
DataFrame fsupplier = _supplier.Select(Col("s_suppkey"), Col("s_nationkey"), Col("s_name"));
|
||||
|
||||
DataFrame plineitem = _lineitem
|
||||
.Select(Col("l_suppkey"), Col("l_orderkey"), Col("l_receiptdate"), Col("l_commitdate"));
|
||||
|
||||
DataFrame flineitem = plineitem.Filter(Col("l_receiptdate") > Col("l_commitdate"));
|
||||
|
||||
DataFrame line1 = plineitem.GroupBy(Col("l_orderkey"))
|
||||
.Agg(CountDistinct(Col("l_suppkey")).As("suppkey_count"), Max(Col("l_suppkey")).As("suppkey_max"))
|
||||
.Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));
|
||||
|
||||
DataFrame line2 = flineitem.GroupBy(Col("l_orderkey"))
|
||||
.Agg(CountDistinct(Col("l_suppkey")).As("suppkey_count"), Max(Col("l_suppkey")).As("suppkey_max"))
|
||||
.Select(Col("l_orderkey").As("key"), Col("suppkey_count"), Col("suppkey_max"));
|
||||
|
||||
DataFrame forder = _orders.Select(Col("o_orderkey"), Col("o_orderstatus"))
|
||||
.Filter(Col("o_orderstatus") == "F");
|
||||
|
||||
_nation.Filter(Col("n_name") == "SAUDI ARABIA")
|
||||
.Join(fsupplier, Col("n_nationkey") == fsupplier["s_nationkey"])
|
||||
.Join(flineitem, Col("s_suppkey") == flineitem["l_suppkey"])
|
||||
.Join(forder, Col("l_orderkey") == forder["o_orderkey"])
|
||||
.Join(line1, Col("l_orderkey") == line1["key"])
|
||||
.Filter(Col("suppkey_count") > 1)
|
||||
.Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"))
|
||||
.Join(line2, Col("l_orderkey") == line2["key"], "left_outer")
|
||||
.Select(Col("s_name"), Col("l_orderkey"), Col("l_suppkey"), Col("suppkey_count"), Col("suppkey_max"))
|
||||
.Filter(Col("suppkey_count") == 1 & Col("l_suppkey") == Col("suppkey_max"))
|
||||
.GroupBy(Col("s_name"))
|
||||
.Agg(Count(Col("l_suppkey")).As("numwait"))
|
||||
.Sort(Col("numwait").Desc(), Col("s_name"))
|
||||
.Show();
|
||||
}
|
||||
|
||||
private static readonly Regex s_q22PhoneRegex = new Regex("^(13|31|23|29|30|18|17)$", RegexOptions.Compiled);
|
||||
internal void Q22()
|
||||
{
|
||||
Func<Column, Column> sub2 = Udf<string, string>(x => x.Substring(0, 2));
|
||||
|
||||
Func<Column, Column> phone = Udf<string, bool>(x => s_q22PhoneRegex.IsMatch(x));
|
||||
|
||||
DataFrame fcustomer = _customer.Select(Col("c_acctbal"), Col("c_custkey"), sub2(Col("c_phone")).As("cntrycode"))
|
||||
.Filter(phone(Col("cntrycode")));
|
||||
|
||||
DataFrame avg_customer = fcustomer.Filter(Col("c_acctbal") > 0.0)
|
||||
.Agg(Avg(Col("c_acctbal")).As("avg_acctbal"));
|
||||
|
||||
_orders.GroupBy(Col("o_custkey"))
|
||||
.Agg(Col("o_custkey")).Select(Col("o_custkey"))
|
||||
.Join(fcustomer, Col("o_custkey") == fcustomer["c_custkey"], "right_outer")
|
||||
.Filter(Col("o_custkey").IsNull())
|
||||
.Join(avg_customer)
|
||||
.Filter(Col("c_acctbal") > Col("avg_acctbal"))
|
||||
.GroupBy(Col("cntrycode"))
|
||||
.Agg(Count(Col("c_acctbal")).As("numcust"), Sum(Col("c_acctbal")).As("totacctbal"))
|
||||
.Sort(Col("cntrycode"))
|
||||
.Show();
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,758 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Reflection;
|
||||
using Microsoft.Spark.Sql;
|
||||
|
||||
/**
|
||||
* The queries are based on the original SparkSQL TPC-H queries from:
|
||||
* https://github.com/databricks/spark-sql-perf/tree/master/src/main/resources/tpch/queries
|
||||
*
|
||||
* Also see:
|
||||
* https://github.com/chiwanpark/tpch-benchmark/tree/master/spark/src/main/scala/tpch/spark/query
|
||||
*
|
||||
* A non-SQL version of the queries that directly uses Scala's DataFrame API can be obtained here:
|
||||
* https://github.com/ssavvides/tpch-spark/tree/master/src/main/scala
|
||||
*/
|
||||
namespace Tpch
|
||||
{
|
||||
internal class TpchSqlQueries : TpchBase
|
||||
{
|
||||
private readonly SparkSession _spark;
|
||||
|
||||
internal TpchSqlQueries(string tpchRoot, SparkSession spark)
|
||||
: base(tpchRoot, spark)
|
||||
{
|
||||
_spark = spark;
|
||||
|
||||
// Register SQL views
|
||||
_customer.CreateOrReplaceTempView("customer");
|
||||
_lineitem.CreateOrReplaceTempView("lineitem");
|
||||
_nation.CreateOrReplaceTempView("nation");
|
||||
_orders.CreateOrReplaceTempView("orders");
|
||||
_part.CreateOrReplaceTempView("part");
|
||||
_partsupp.CreateOrReplaceTempView("partsupp");
|
||||
_region.CreateOrReplaceTempView("region");
|
||||
_supplier.CreateOrReplaceTempView("supplier");
|
||||
}
|
||||
|
||||
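// Runs all 22 TPC-H SQL queries in order.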
internal void RunAll()
|
||||
{
|
||||
for (var i = 1; i <= 22; i++)
|
||||
{
|
||||
Run(i.ToString());
|
||||
}
|
||||
}
|
||||
|
||||
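// Resolves the matching private static query string (s_q1..s_q22) via reflection,
// executes it through Spark SQL, and prints the elapsed time.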
internal void Run(string queryNumber)
|
||||
{
|
||||
Console.WriteLine($"Spark .NET TPCH SQL Query: #{queryNumber}");
|
||||
Type thisType = GetType();
|
||||
var queryString = (string)thisType.GetField(
|
||||
$"s_q{queryNumber}", BindingFlags.Static | BindingFlags.NonPublic).GetValue(null);
|
||||
|
||||
var sw = Stopwatch.StartNew();
|
||||
_spark.Sql(queryString).Show(numRows: 20, truncate: 0);
|
||||
Console.WriteLine($"\tElapsed: {sw.Elapsed}");
|
||||
}
|
||||
|
||||
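// Query strings use a Scala-style leading '|' margin; StripMargin() removes it before the text is sent to Spark SQL.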
private static readonly string s_q1 = @"
|
||||
select
|
||||
| l_returnflag,
|
||||
| l_linestatus,
|
||||
| sum(l_quantity) as sum_qty,
|
||||
| sum(l_extendedprice) as sum_base_price,
|
||||
| sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
|
||||
| sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
|
||||
| avg(l_quantity) as avg_qty,
|
||||
| avg(l_extendedprice) as avg_price,
|
||||
| avg(l_discount) as avg_disc,
|
||||
| count(*) as count_order
|
||||
| from
|
||||
| lineitem
|
||||
| where
|
||||
| l_shipdate <= date '1998-12-01' - interval '90' day
|
||||
| group by
|
||||
| l_returnflag,
|
||||
| l_linestatus
|
||||
| order by
|
||||
| l_returnflag,
|
||||
| l_linestatus
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q2 = @"
|
||||
select
|
||||
| s_acctbal,
|
||||
| s_name,
|
||||
| n_name,
|
||||
| p_partkey,
|
||||
| p_mfgr,
|
||||
| s_address,
|
||||
| s_phone,
|
||||
| s_comment
|
||||
| from
|
||||
| part,
|
||||
| supplier,
|
||||
| partsupp,
|
||||
| nation,
|
||||
| region
|
||||
| where
|
||||
| p_partkey = ps_partkey
|
||||
| and s_suppkey = ps_suppkey
|
||||
| and p_size = 15
|
||||
| and p_type like '%BRASS'
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_regionkey = r_regionkey
|
||||
| and r_name = 'EUROPE'
|
||||
| and ps_supplycost = (
|
||||
| select
|
||||
| min(ps_supplycost)
|
||||
| from
|
||||
| partsupp,
|
||||
| supplier,
|
||||
| nation,
|
||||
| region
|
||||
| where
|
||||
| p_partkey = ps_partkey
|
||||
| and s_suppkey = ps_suppkey
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_regionkey = r_regionkey
|
||||
| and r_name = 'EUROPE'
|
||||
| )
|
||||
| order by
|
||||
| s_acctbal desc,
|
||||
| n_name,
|
||||
| s_name,
|
||||
| p_partkey
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q3 = @"
|
||||
select
|
||||
| l_orderkey,
|
||||
| sum(l_extendedprice * (1 - l_discount)) as revenue,
|
||||
| o_orderdate,
|
||||
| o_shippriority
|
||||
| from
|
||||
| customer,
|
||||
| orders,
|
||||
| lineitem
|
||||
| where
|
||||
| c_mktsegment = 'BUILDING'
|
||||
| and c_custkey = o_custkey
|
||||
| and l_orderkey = o_orderkey
|
||||
| and o_orderdate < date '1995-03-15'
|
||||
| and l_shipdate > date '1995-03-15'
|
||||
| group by
|
||||
| l_orderkey,
|
||||
| o_orderdate,
|
||||
| o_shippriority
|
||||
| order by
|
||||
| revenue desc,
|
||||
| o_orderdate
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q4 = @"
|
||||
select
|
||||
| o_orderpriority,
|
||||
| count(*) as order_count
|
||||
| from
|
||||
| orders
|
||||
| where
|
||||
| o_orderdate >= date '1993-07-01'
|
||||
| and o_orderdate < date '1993-07-01' + interval '3' month
|
||||
| and exists(
|
||||
| select
|
||||
| *
|
||||
| from
|
||||
| lineitem
|
||||
| where
|
||||
| l_orderkey = o_orderkey
|
||||
| and l_commitdate < l_receiptdate
|
||||
| )
|
||||
| group by
|
||||
| o_orderpriority
|
||||
| order by
|
||||
| o_orderpriority
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q5 = @"
|
||||
select
|
||||
| n_name,
|
||||
| sum(l_extendedprice * (1 - l_discount)) as revenue
|
||||
| from
|
||||
| customer,
|
||||
| orders,
|
||||
| lineitem,
|
||||
| supplier,
|
||||
| nation,
|
||||
| region
|
||||
| where
|
||||
| c_custkey = o_custkey
|
||||
| and l_orderkey = o_orderkey
|
||||
| and l_suppkey = s_suppkey
|
||||
| and c_nationkey = s_nationkey
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_regionkey = r_regionkey
|
||||
| and r_name = 'ASIA'
|
||||
| and o_orderdate >= date '1994-01-01'
|
||||
| and o_orderdate < date '1994-01-01' + interval '1' year
|
||||
| group by
|
||||
| n_name
|
||||
| order by
|
||||
| revenue desc
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q6 = @"
|
||||
select
|
||||
| sum(l_extendedprice * l_discount) as revenue
|
||||
| from
|
||||
| lineitem
|
||||
| where
|
||||
| l_shipdate >= date '1994-01-01'
|
||||
| and l_shipdate < date '1994-01-01' + interval '1' year
|
||||
| and l_discount between .06 - 0.01 and .06 + 0.01
|
||||
| and l_quantity < 24
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q7 = @"
|
||||
select
|
||||
| supp_nation,
|
||||
| cust_nation,
|
||||
| l_year,
|
||||
| cast(sum(volume) as double) as revenue
|
||||
| from
|
||||
| (
|
||||
| select
|
||||
| n1.n_name as supp_nation,
|
||||
| n2.n_name as cust_nation,
|
||||
| year(l_shipdate) as l_year,
|
||||
| cast(l_extendedprice * (1 - l_discount) as double) as volume
|
||||
| from
|
||||
| supplier,
|
||||
| lineitem,
|
||||
| orders,
|
||||
| customer,
|
||||
| nation n1,
|
||||
| nation n2
|
||||
| where
|
||||
| s_suppkey = l_suppkey
|
||||
| and o_orderkey = l_orderkey
|
||||
| and c_custkey = o_custkey
|
||||
| and s_nationkey = n1.n_nationkey
|
||||
| and c_nationkey = n2.n_nationkey
|
||||
| and(
|
||||
| (n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
|
||||
| or(n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
|
||||
| )
|
||||
| and l_shipdate between date '1995-01-01' and date '1996-12-31'
|
||||
| ) as shipping
|
||||
| group by
|
||||
| supp_nation,
|
||||
| cust_nation,
|
||||
| l_year
|
||||
| order by
|
||||
| supp_nation,
|
||||
| cust_nation,
|
||||
| l_year
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q8 = @"
|
||||
select
|
||||
| o_year,
|
||||
| sum(case
|
||||
|
||||
| when nation = 'BRAZIL' then volume
|
||||
| else 0
|
||||
| end) / sum(volume) as mkt_share
|
||||
| from
|
||||
| (
|
||||
| select
|
||||
| year(o_orderdate) as o_year,
|
||||
| l_extendedprice * (1 - l_discount) as volume,
|
||||
| n2.n_name as nation
|
||||
| from
|
||||
| part,
|
||||
| supplier,
|
||||
| lineitem,
|
||||
| orders,
|
||||
| customer,
|
||||
| nation n1,
|
||||
| nation n2,
|
||||
| region
|
||||
| where
|
||||
| p_partkey = l_partkey
|
||||
| and s_suppkey = l_suppkey
|
||||
| and l_orderkey = o_orderkey
|
||||
| and o_custkey = c_custkey
|
||||
| and c_nationkey = n1.n_nationkey
|
||||
| and n1.n_regionkey = r_regionkey
|
||||
| and r_name = 'AMERICA'
|
||||
| and s_nationkey = n2.n_nationkey
|
||||
| and o_orderdate between date '1995-01-01' and date '1996-12-31'
|
||||
| and p_type = 'ECONOMY ANODIZED STEEL'
|
||||
| ) as all_nations
|
||||
| group by
|
||||
| o_year
|
||||
| order by
|
||||
| o_year
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q9 = @"
|
||||
select
|
||||
| nation,
|
||||
| o_year,
|
||||
| sum(amount) as sum_profit
|
||||
| from
|
||||
| (
|
||||
| select
|
||||
| n_name as nation,
|
||||
| year(o_orderdate) as o_year,
|
||||
| l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
|
||||
| from
|
||||
| part,
|
||||
| supplier,
|
||||
| lineitem,
|
||||
| partsupp,
|
||||
| orders,
|
||||
| nation
|
||||
| where
|
||||
| s_suppkey = l_suppkey
|
||||
| and ps_suppkey = l_suppkey
|
||||
| and ps_partkey = l_partkey
|
||||
| and p_partkey = l_partkey
|
||||
| and o_orderkey = l_orderkey
|
||||
| and s_nationkey = n_nationkey
|
||||
| and p_name like '%green%'
|
||||
| ) as profit
|
||||
| group by
|
||||
| nation,
|
||||
| o_year
|
||||
| order by
|
||||
| nation,
|
||||
| o_year desc
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q10 = @"
|
||||
select
|
||||
| c_custkey,
|
||||
| c_name,
|
||||
| sum(l_extendedprice * (1 - l_discount)) as revenue,
|
||||
| c_acctbal,
|
||||
| n_name,
|
||||
| c_address,
|
||||
| c_phone,
|
||||
| c_comment
|
||||
| from
|
||||
| customer,
|
||||
| orders,
|
||||
| lineitem,
|
||||
| nation
|
||||
| where
|
||||
| c_custkey = o_custkey
|
||||
| and l_orderkey = o_orderkey
|
||||
| and o_orderdate >= date '1993-10-01'
|
||||
| and o_orderdate < date '1993-10-01' + interval '3' month
|
||||
| and l_returnflag = 'R'
|
||||
| and c_nationkey = n_nationkey
|
||||
| group by
|
||||
| c_custkey,
|
||||
| c_name,
|
||||
| c_acctbal,
|
||||
| c_phone,
|
||||
| n_name,
|
||||
| c_address,
|
||||
| c_comment
|
||||
| order by
|
||||
| revenue desc
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q11 = @"
|
||||
select
|
||||
| ps_partkey,
|
||||
| sum(ps_supplycost * ps_availqty) as value
|
||||
| from
|
||||
| partsupp,
|
||||
| supplier,
|
||||
| nation
|
||||
| where
|
||||
| ps_suppkey = s_suppkey
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_name = 'GERMANY'
|
||||
| group by
|
||||
| ps_partkey having
|
||||
| sum(ps_supplycost * ps_availqty) > (
|
||||
| select
|
||||
| sum(ps_supplycost * ps_availqty) * 0.0001000000
|
||||
| from
|
||||
| partsupp,
|
||||
| supplier,
|
||||
| nation
|
||||
| where
|
||||
| ps_suppkey = s_suppkey
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_name = 'GERMANY'
|
||||
| )
|
||||
| order by
|
||||
| value desc
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q12 = @"
|
||||
select
|
||||
| l_shipmode,
|
||||
| sum(case
|
||||
|
||||
| when o_orderpriority = '1-URGENT'
|
||||
| or o_orderpriority = '2-HIGH'
|
||||
| then 1
|
||||
| else 0
|
||||
| end) as sum_highorderpriority,
|
||||
| sum(case
|
||||
|
||||
| when o_orderpriority <> '1-URGENT'
|
||||
| and o_orderpriority <> '2-HIGH'
|
||||
| then 1
|
||||
| else 0
|
||||
| end) as sum_loworderpriority
|
||||
| from
|
||||
| orders,
|
||||
| lineitem
|
||||
| where
|
||||
| o_orderkey = l_orderkey
|
||||
| and l_shipmode in ('MAIL', 'SHIP')
|
||||
| and l_commitdate < l_receiptdate
|
||||
| and l_shipdate < l_commitdate
|
||||
| and l_receiptdate >= date '1994-01-01'
|
||||
| and l_receiptdate < date '1994-01-01' + interval '1' year
|
||||
| group by
|
||||
| l_shipmode
|
||||
| order by
|
||||
| l_shipmode
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q13 = @"
|
||||
select
|
||||
| c_count,
|
||||
| count(*) as custdist
|
||||
| from
|
||||
| (
|
||||
| select
|
||||
| c_custkey,
|
||||
| count(o_orderkey) as c_count
|
||||
| from
|
||||
| customer left outer join orders on
|
||||
| c_custkey = o_custkey
|
||||
| and o_comment not like '%special%requests%'
|
||||
| group by
|
||||
| c_custkey
|
||||
| ) as c_orders
|
||||
| group by
|
||||
| c_count
|
||||
| order by
|
||||
| custdist desc,
|
||||
| c_count desc".StripMargin();
|
||||
|
||||
private static readonly string s_q14 = @"
|
||||
| select
|
||||
| 100.00 * sum(case
|
||||
|
||||
| when p_type like 'PROMO%'
|
||||
| then l_extendedprice * (1 - l_discount)
|
||||
| else 0
|
||||
| end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
|
||||
| from
|
||||
| lineitem,
|
||||
| part
|
||||
| where
|
||||
| l_partkey = p_partkey
|
||||
| and l_shipdate >= date '1995-09-01'
|
||||
| and l_shipdate < date '1995-09-01' + interval '1' month
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q15 = @"
|
||||
with revenue0 as
|
||||
| (select
|
||||
| l_suppkey as supplier_no,
|
||||
| sum(l_extendedprice * (1 - l_discount)) as total_revenue
|
||||
| from
|
||||
| lineitem
|
||||
| where
|
||||
| l_shipdate >= date '1996-01-01'
|
||||
| and l_shipdate < date '1996-01-01' + interval '3' month
|
||||
| group by
|
||||
| l_suppkey)
|
||||
|
|
||||
| select
|
||||
| s_suppkey,
|
||||
| s_name,
|
||||
| s_address,
|
||||
| s_phone,
|
||||
| total_revenue
|
||||
| from
|
||||
| supplier,
|
||||
| revenue0
|
||||
| where
|
||||
| s_suppkey = supplier_no
|
||||
| and total_revenue = (
|
||||
| select
|
||||
| max(total_revenue)
|
||||
| from
|
||||
| revenue0
|
||||
| )
|
||||
| order by
|
||||
| s_suppkey
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q16 = @"
|
||||
| select
|
||||
| p_brand,
|
||||
| p_type,
|
||||
| p_size,
|
||||
| count(distinct ps_suppkey) as supplier_cnt
|
||||
| from
|
||||
| partsupp,
|
||||
| part
|
||||
| where
|
||||
| p_partkey = ps_partkey
|
||||
| and p_brand <> 'Brand#45'
|
||||
| and p_type not like 'MEDIUM POLISHED%'
|
||||
| and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
|
||||
| and ps_suppkey not in (
|
||||
| select
|
||||
| s_suppkey
|
||||
| from
|
||||
| supplier
|
||||
| where
|
||||
| s_comment like '%Customer%Complaints%'
|
||||
| )
|
||||
| group by
|
||||
| p_brand,
|
||||
| p_type,
|
||||
| p_size
|
||||
| order by
|
||||
| supplier_cnt desc,
|
||||
| p_brand,
|
||||
| p_type,
|
||||
| p_size
|
||||
| limit 20
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q17 = @"
|
||||
| select
|
||||
| sum(l_extendedprice) / 7.0 as avg_yearly
|
||||
| from
|
||||
| lineitem,
|
||||
| part
|
||||
| where
|
||||
| p_partkey = l_partkey
|
||||
| and p_brand = 'Brand#23'
|
||||
| and p_container = 'MED BOX'
|
||||
| and l_quantity < (
|
||||
| select
|
||||
| 0.2 * avg(l_quantity)
|
||||
| from
|
||||
| lineitem
|
||||
| where
|
||||
| l_partkey = p_partkey
|
||||
| )
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q18 = @"
|
||||
| select
|
||||
| c_name,
|
||||
| c_custkey,
|
||||
| o_orderkey,
|
||||
| o_orderdate,
|
||||
| o_totalprice,
|
||||
| sum(l_quantity)
|
||||
| from
|
||||
| customer,
|
||||
| orders,
|
||||
| lineitem
|
||||
| where
|
||||
| o_orderkey in (
|
||||
| select
|
||||
| l_orderkey
|
||||
| from
|
||||
| lineitem
|
||||
| group by
|
||||
| l_orderkey having
|
||||
| sum(l_quantity) > 300
|
||||
| )
|
||||
| and c_custkey = o_custkey
|
||||
| and o_orderkey = l_orderkey
|
||||
| group by
|
||||
| c_name,
|
||||
| c_custkey,
|
||||
| o_orderkey,
|
||||
| o_orderdate,
|
||||
| o_totalprice
|
||||
| order by
|
||||
| o_totalprice desc,
|
||||
| o_orderdate
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q19 = @"
|
||||
| select
|
||||
| sum(l_extendedprice * (1 - l_discount)) as revenue
|
||||
| from
|
||||
| lineitem,
|
||||
| part
|
||||
| where
|
||||
| (
|
||||
| p_partkey = l_partkey
|
||||
| and p_brand = 'Brand#12'
|
||||
| and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
|
||||
| and l_quantity >= 1 and l_quantity <= 1 + 10
|
||||
| and p_size between 1 and 5
|
||||
| and l_shipmode in ('AIR', 'AIR REG')
|
||||
| and l_shipinstruct = 'DELIVER IN PERSON'
|
||||
| )
|
||||
| or
|
||||
| (
|
||||
| p_partkey = l_partkey
|
||||
| and p_brand = 'Brand#23'
|
||||
| and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
|
||||
| and l_quantity >= 10 and l_quantity <= 10 + 10
|
||||
| and p_size between 1 and 10
|
||||
| and l_shipmode in ('AIR', 'AIR REG')
|
||||
| and l_shipinstruct = 'DELIVER IN PERSON'
|
||||
| )
|
||||
| or
|
||||
| (
|
||||
| p_partkey = l_partkey
|
||||
| and p_brand = 'Brand#34'
|
||||
| and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
|
||||
| and l_quantity >= 20 and l_quantity <= 20 + 10
|
||||
| and p_size between 1 and 15
|
||||
| and l_shipmode in ('AIR', 'AIR REG')
|
||||
| and l_shipinstruct = 'DELIVER IN PERSON'
|
||||
| )
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q20 = @"
|
||||
| select
|
||||
| s_name,
|
||||
| s_address
|
||||
| from
|
||||
| supplier,
|
||||
| nation
|
||||
| where
|
||||
| s_suppkey in (
|
||||
| select
|
||||
| ps_suppkey
|
||||
| from
|
||||
| partsupp
|
||||
| where
|
||||
| ps_partkey in (
|
||||
| select
|
||||
| p_partkey
|
||||
| from
|
||||
| part
|
||||
| where
|
||||
| p_name like 'forest%'
|
||||
| )
|
||||
| and ps_availqty > (
|
||||
| select
|
||||
| 0.5 * sum(l_quantity)
|
||||
| from
|
||||
| lineitem
|
||||
| where
|
||||
| l_partkey = ps_partkey
|
||||
| and l_suppkey = ps_suppkey
|
||||
| and l_shipdate >= date '1994-01-01'
|
||||
| and l_shipdate < date '1994-01-01' + interval '1' year
|
||||
| )
|
||||
| )
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_name = 'CANADA'
|
||||
| order by
|
||||
| s_name
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q21 = @"
|
||||
| select
|
||||
| s_name,
|
||||
| count(*) as numwait
|
||||
| from
|
||||
| supplier,
|
||||
| lineitem l1,
|
||||
| orders,
|
||||
| nation
|
||||
| where
|
||||
| s_suppkey = l1.l_suppkey
|
||||
| and o_orderkey = l1.l_orderkey
|
||||
| and o_orderstatus = 'F'
|
||||
| and l1.l_receiptdate > l1.l_commitdate
|
||||
| and exists(
|
||||
| select
|
||||
| *
|
||||
| from
|
||||
| lineitem l2
|
||||
| where
|
||||
| l2.l_orderkey = l1.l_orderkey
|
||||
| and l2.l_suppkey <> l1.l_suppkey
|
||||
| )
|
||||
| and not exists(
|
||||
| select
|
||||
| *
|
||||
| from
|
||||
| lineitem l3
|
||||
| where
|
||||
| l3.l_orderkey = l1.l_orderkey
|
||||
| and l3.l_suppkey <> l1.l_suppkey
|
||||
| and l3.l_receiptdate > l3.l_commitdate
|
||||
| )
|
||||
| and s_nationkey = n_nationkey
|
||||
| and n_name = 'SAUDI ARABIA'
|
||||
| group by
|
||||
| s_name
|
||||
| order by
|
||||
| numwait desc,
|
||||
| s_name
|
||||
".StripMargin();
|
||||
|
||||
private static readonly string s_q22 = @"
|
||||
select
|
||||
| cntrycode,
|
||||
| count(*) as numcust,
|
||||
| sum(c_acctbal) as totacctbal
|
||||
| from
|
||||
| (
|
||||
| select
|
||||
| substring(c_phone, 1, 2) as cntrycode,
|
||||
| c_acctbal
|
||||
| from
|
||||
| customer
|
||||
| where
|
||||
| substring(c_phone, 1, 2) in
|
||||
| ('13', '31', '23', '29', '30', '18', '17')
|
||||
| and c_acctbal > (
|
||||
| select
|
||||
| avg(c_acctbal)
|
||||
| from
|
||||
| customer
|
||||
| where
|
||||
| c_acctbal > 0.00
|
||||
| and substring(c_phone, 1, 2) in
|
||||
| ('13', '31', '23', '29', '30', '18', '17')
|
||||
| )
|
||||
| and not exists(
|
||||
| select
|
||||
| *
|
||||
| from
|
||||
| orders
|
||||
| where
|
||||
| o_custkey = c_custkey
|
||||
| )
|
||||
| ) as custsale
|
||||
| group by
|
||||
| cntrycode
|
||||
| order by
|
||||
| cntrycode
|
||||
".StripMargin();
|
||||
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,43 @@
|
|||
# Licensed to the .NET Foundation under one or more agreements.
|
||||
# The .NET Foundation licenses this file to you under the MIT license.
|
||||
# See the LICENSE file in the project root for more information.
|
||||
|
||||
import sys
|
||||
import time
|
||||
|
||||
from tpch_functional_queries import TpchFunctionalQueries
|
||||
from tpch_sql_queries import TpchSqlQueries
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
def main():
|
||||
if len(sys.argv) != 5:
|
||||
print("Usage:")
|
||||
print("\t<spark-submit> --master local tpch.py")
|
||||
print("\t\t<tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>")
|
||||
sys.exit(1)
|
||||
|
||||
input_dir = sys.argv[1]
|
||||
query_number = sys.argv[2]
|
||||
num_iterations = int(sys.argv[3])
|
||||
is_sql = sys.argv[4].lower() == "true"
|
||||
|
||||
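# Each iteration gets a SparkSession, runs the chosen query, and reports the elapsed time in milliseconds.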
for iter in range(0, num_iterations):
|
||||
print("TPCH Starting iteration {0} with query #{1}".format(iter, query_number))
|
||||
spark = SparkSession.builder.appName('TPCH Benchmark for Python').getOrCreate()
|
||||
|
||||
start = time.time()
|
||||
if not is_sql:
|
||||
queries = TpchFunctionalQueries(spark, input_dir)
|
||||
getattr(queries, "q" + query_number)()
|
||||
else:
|
||||
queries = TpchSqlQueries(spark, input_dir)
|
||||
getattr(queries, "q" + query_number)()
|
||||
end = time.time()
|
||||
|
||||
typeStr = "SQL" if is_sql else "Functional"
|
||||
print("TPCH_Result,Python,%s,%s,%d,%d" % (typeStr, query_number, iter, (end-start) * 1000))
|
||||
|
||||
spark.stop()
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
|
@@ -0,0 +1,17 @@
|
|||
# Licensed to the .NET Foundation under one or more agreements.
|
||||
# The .NET Foundation licenses this file to you under the MIT license.
|
||||
# See the LICENSE file in the project root for more information.
|
||||
|
||||
import pyspark
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
class TpchBase:
|
||||
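# Reads the eight TPC-H tables from parquet files under the given root directory.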
def __init__(self, spark, dir):
|
||||
self.customer = spark.read.parquet(dir + "customer")
|
||||
self.lineitem = spark.read.parquet(dir + "lineitem")
|
||||
self.nation = spark.read.parquet(dir + "nation")
|
||||
self.region = spark.read.parquet(dir + "region")
|
||||
self.orders = spark.read.parquet(dir + "orders")
|
||||
self.part = spark.read.parquet(dir + "part")
|
||||
self.partsupp = spark.read.parquet(dir + "partsupp")
|
||||
self.supplier = spark.read.parquet(dir + "supplier")
|
|
@@ -0,0 +1,449 @@
|
|||
# Licensed to the .NET Foundation under one or more agreements.
|
||||
# The .NET Foundation licenses this file to you under the MIT license.
|
||||
# See the LICENSE file in the project root for more information.
|
||||
|
||||
from tpch_base import TpchBase
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql import functions as F
|
||||
from pyspark.sql.functions import *
|
||||
from pyspark.sql.types import *
|
||||
import re
|
||||
|
||||
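# DataFrame-API (functional) implementations of the TPC-H queries q1..q22, mirroring the C# TpchFunctionalQueries class.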
class TpchFunctionalQueries(TpchBase):
|
||||
def __init__(self, spark, dir):
|
||||
TpchBase.__init__(self, spark, dir)
|
||||
|
||||
def q1(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
increase = udf(lambda x, y: x * (1 + y), FloatType())
|
||||
|
||||
self.lineitem.filter(col("l_shipdate") <= "1998-09-02") \
|
||||
.groupBy(col("l_returnflag"), col("l_linestatus")) \
|
||||
.agg(F.sum(col("l_quantity")).alias("sum_qty"),
|
||||
F.sum(col("l_extendedprice")).alias("sum_base_price"),
|
||||
F.sum(decrease(col("l_extendedprice"), col("l_discount"))).alias("sum_disc_price"),
|
||||
F.sum(increase(decrease(col("l_extendedprice"), col("l_discount")), col("l_tax"))).alias("sum_charge"),
|
||||
F.avg(col("l_quantity")).alias("avg_qty"),
|
||||
F.avg(col("l_extendedprice")).alias("avg_price"),
|
||||
F.avg(col("l_discount")).alias("avg_disc"),
|
||||
F.count(col("l_quantity")).alias("count_order")) \
|
||||
.sort(col("l_returnflag"), col("l_linestatus")) \
|
||||
.show()
|
||||
|
||||
def q2(self):
|
||||
europe = self.region.filter(col("r_name") == "EUROPE") \
|
||||
.join(self.nation, col("r_regionkey") == col("n_regionkey")) \
|
||||
.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
|
||||
.join(self.partsupp, self.supplier.s_suppkey == self.partsupp.ps_suppkey)
|
||||
|
||||
brass = self.part.filter((col("p_size") == 15) & (self.part.p_type.endswith("BRASS"))) \
|
||||
.join(europe, col("p_partkey") == europe.ps_partkey)
|
||||
|
||||
minimumCost = brass.groupBy(col("ps_partkey")) \
|
||||
.agg(F.min(col("ps_supplycost")).alias("min"))
|
||||
|
||||
brass.join(minimumCost, brass.ps_partkey == minimumCost.ps_partkey) \
|
||||
.filter(brass.ps_supplycost == minimumCost.min) \
|
||||
.select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment") \
|
||||
.sort(col("s_acctbal").desc(), col("n_name"), col("s_name"), col("p_partkey")) \
|
||||
.show()
|
||||
|
||||
def q3(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
|
||||
filteredCustomers = self.customer.filter(col("c_mktsegment") == "BUILDING")
|
||||
filteredOrders = self.orders.filter(col("o_orderdate") < "1995-03-15")
|
||||
filteredLineItems = self.lineitem.filter(col("l_shipdate") > "1995-03-15")
|
||||
|
||||
filteredCustomers.join(filteredOrders, col("c_custkey") == col("o_custkey")) \
|
||||
.select("o_orderkey", "o_orderdate", "o_shippriority") \
|
||||
.join(filteredLineItems, col("o_orderkey") == col("l_orderkey")) \
|
||||
.select(col("l_orderkey"),
|
||||
decrease(col("l_extendedprice"), col("l_discount")).alias("volume"),
|
||||
col("o_orderdate"), col("o_shippriority")) \
|
||||
.groupBy(col("l_orderkey"), col("o_orderdate"), col("o_shippriority")) \
|
||||
.agg(F.sum(col("volume")).alias("revenue")) \
|
||||
.sort(col("revenue").desc(), col("o_orderdate")) \
|
||||
.show()
|
||||
|
||||
def q4(self):
|
||||
filteredOrders = self.orders.filter((col("o_orderdate") >= "1993-07-01") & (col("o_orderdate") < "1993-10-01"))
|
||||
|
||||
filteredLineItems = self.lineitem.filter(col("l_commitdate") < col("l_receiptdate")) \
|
||||
.select("l_orderkey") \
|
||||
.distinct()
|
||||
|
||||
filteredLineItems.join(filteredOrders, col("l_orderkey") == col("o_orderkey")) \
|
||||
.groupBy("o_orderpriority") \
|
||||
.agg(F.count(col("o_orderpriority")).alias("order_count")) \
|
||||
.sort(col("o_orderpriority")) \
|
||||
.show()
|
||||
|
||||
def q5(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
|
||||
filteredOrders = self.orders.filter((col("o_orderdate") < "1995-01-01") & (col("o_orderdate") >= "1994-01-01"))
|
||||
|
||||
self.region.filter(col("r_name") == "ASIA") \
|
||||
.join(self.nation, col("r_regionkey") == col("n_regionkey")) \
|
||||
.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
|
||||
.join(self.lineitem, col("s_suppkey") == col("l_suppkey")) \
|
||||
.select("n_name", "l_extendedprice", "l_discount", "l_orderkey", "s_nationkey") \
|
||||
.join(filteredOrders, col("l_orderkey") == col("o_orderkey")) \
|
||||
.join(self.customer, (col("o_custkey") == col("c_custkey")) & (col("s_nationkey") == col("c_nationkey"))) \
|
||||
.select(col("n_name"), decrease(col("l_extendedprice"), col("l_discount")).alias("value")) \
|
||||
.groupBy("n_name") \
|
||||
.agg(F.sum(col("value")).alias("revenue")) \
|
||||
.sort(col("revenue").desc()) \
|
||||
.show()
|
||||
|
||||
def q6(self):
|
||||
self.lineitem.filter((col("l_shipdate") >= "1994-01-01")
|
||||
& (col("l_shipdate") < "1995-01-01")
|
||||
& (col("l_discount") >= 0.05)
|
||||
& (col("l_discount") <= 0.07)
|
||||
& (col("l_quantity") < 24)) \
|
||||
.agg(F.sum(col("l_extendedprice") * col("l_discount")).alias("revenue")) \
|
||||
.show()
|
||||
|
||||
def q7(self):
|
||||
getYear = udf(lambda x: x[0:4], StringType())
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
|
||||
filteredNations = self.nation.filter((col("n_name") == "FRANCE") | (col("n_name") == "GERMANY"))
|
||||
|
||||
filteredLineitems = self.lineitem.filter((col("l_shipdate") >= "1995-01-01") & (col("l_shipdate") <= "1996-12-31"))
|
||||
|
||||
supplierNations = filteredNations.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
|
||||
.join(filteredLineitems, col("s_suppkey") == col("l_suppkey")) \
|
||||
.select(col("n_name").alias("supp_nation"), col("l_orderkey"), col("l_extendedprice"), col("l_discount"), col("l_shipdate"))
|
||||
|
||||
filteredNations.join(self.customer, col("n_nationkey") == col("c_nationkey")) \
|
||||
.join(self.orders, col("c_custkey") == col("o_custkey")) \
|
||||
.select(col("n_name").alias("cust_nation"), col("o_orderkey")) \
|
||||
.join(supplierNations, col("o_orderkey") == col("l_orderkey")) \
|
||||
.filter(((col("supp_nation") == "FRANCE") & (col("cust_nation") == "GERMANY"))
|
||||
| ((col("supp_nation") == "GERMANY") & (col("cust_nation") == "FRANCE"))) \
|
||||
.select(col("supp_nation"), col("cust_nation"),
|
||||
getYear(col("l_shipdate")).alias("l_year"),
|
||||
decrease(col("l_extendedprice"), col("l_discount")).alias("volume")) \
|
||||
.groupBy(col("supp_nation"), col("cust_nation"), col("l_year")) \
|
||||
.agg(F.sum(col("volume")).alias("revenue")) \
|
||||
.sort(col("supp_nation"), col("cust_nation"), col("l_year")) \
|
||||
.show()
|
||||
|
||||
def q8(self):
|
||||
getYear = udf(lambda x: x[0:4], StringType())
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
isBrazil = udf(lambda x, y: (y if (x == "BRAZIL") else 0), FloatType())
|
||||
|
||||
filteredRegions = self.region.filter(col("r_name") == "AMERICA")
|
||||
filteredOrders = self.orders.filter((col("o_orderdate") <= "1996-12-31") & (col("o_orderdate") >= "1995-01-01"))
|
||||
filteredParts = self.part.filter(col("p_type") == "ECONOMY ANODIZED STEEL")
|
||||
|
||||
filteredNations = self.nation.join(self.supplier, col("n_nationkey") == col("s_nationkey"))
|
||||
|
||||
filteredLineitems = self.lineitem.select(col("l_partkey"), col("l_suppkey"), col("l_orderkey"),
|
||||
decrease(col("l_extendedprice"), col("l_discount")).alias("volume")) \
|
||||
.join(filteredParts, col("l_partkey") == col("p_partkey")) \
|
||||
.join(filteredNations, col("l_suppkey") == col("s_suppkey"))
|
||||
|
||||
self.nation.join(filteredRegions, col("n_regionkey") == col("r_regionkey")) \
|
||||
.select(col("n_nationkey")) \
|
||||
.join(self.customer, col("n_nationkey") == col("c_nationkey")) \
|
||||
.select(col("c_custkey")) \
|
||||
.join(filteredOrders, col("c_custkey") == col("o_custkey")) \
|
||||
.select(col("o_orderkey"), col("o_orderdate")) \
|
||||
.join(filteredLineitems, col("o_orderkey") == col("l_orderkey")) \
|
||||
.select(getYear(col("o_orderdate")).alias("o_year"), col("volume"),
|
||||
isBrazil(col("n_name"), col("volume")).alias("case_volume")) \
|
||||
.groupBy(col("o_year")) \
|
||||
.agg((F.sum(col("case_volume")) / F.sum(col("volume"))).alias("mkt_share")) \
|
||||
.sort(col("o_year")) \
|
||||
.show()
|
||||
|
||||
def q9(self):
|
||||
getYear = udf(lambda x: x[0:4], StringType())
|
||||
expression = udf(lambda x, y, v, w: x * (1 - y) - (v * w), FloatType())
|
||||
|
||||
lineitemParts = self.part.filter(col("p_name").contains("green")) \
|
||||
.join(self.lineitem, col("p_partkey") == col("l_partkey"))
|
||||
|
||||
nationPartSuppliers = self.nation.join(self.supplier, col("n_nationkey") == col("s_nationkey"))
|
||||
|
||||
lineitemParts.join(nationPartSuppliers, col("l_suppkey") == col("s_suppkey")) \
|
||||
.join(self.partsupp, (col("l_suppkey") == col("ps_suppkey"))
|
||||
& (col("l_partkey") == col("ps_partkey"))) \
|
||||
.join(self.orders, col("l_orderkey") == col("o_orderkey")) \
|
||||
.select(col("n_name"), getYear(col("o_orderdate")).alias("o_year"),
|
||||
expression(col("l_extendedprice"), col("l_discount"),
|
||||
col("ps_supplycost"), col("l_quantity")).alias("amount")) \
|
||||
.groupBy(col("n_name"), col("o_year")) \
|
||||
.agg(F.sum(col("amount"))) \
|
||||
.sort(col("n_name"), col("o_year").desc()) \
|
||||
.show()
|
||||
|
||||
def q10(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
|
||||
filteredLineitems = self.lineitem.filter(col("l_returnflag") == "R")
|
||||
|
||||
self.orders.filter((col("o_orderdate") < "1994-01-01") & (col("o_orderdate") >= "1993-10-01")) \
|
||||
.join(self.customer, col("o_custkey") == col("c_custkey")) \
|
||||
.join(self.nation, col("c_nationkey") == col("n_nationkey")) \
|
||||
.join(filteredLineitems, col("o_orderkey") == col("l_orderkey")) \
|
||||
.select(col("c_custkey"), col("c_name"), decrease(col("l_extendedprice"), col("l_discount")).alias("volume"),
|
||||
col("c_acctbal"), col("n_name"), col("c_address"), col("c_phone"), col("c_comment")) \
|
||||
.groupBy(col("c_custkey"), col("c_name"), col("c_acctbal"), col("c_phone"), col("n_name"), col("c_address"), col("c_comment")) \
|
||||
.agg(F.sum(col("volume")).alias("revenue")) \
|
||||
.sort(col("revenue").desc()) \
|
||||
.limit(20) \
|
||||
.show()
|
||||
|
||||
def q11(self):
|
||||
multiplication = udf(lambda x, y: x * y, FloatType())
|
||||
division = udf(lambda x: x * 0.0001, FloatType())
|
||||
|
||||
nationPartSuppliers = self.nation.filter(col("n_name") == "GERMANY") \
|
||||
.join(self.supplier, col("n_nationkey") == col("s_nationkey")) \
|
||||
.select(col("s_suppkey")) \
|
||||
.join(self.partsupp, col("s_suppkey") == col("ps_suppkey")) \
|
||||
.select(col("ps_partkey"), multiplication(col("ps_supplycost"), col("ps_availqty")).alias("value"))
|
||||
|
||||
aggregatedValue = nationPartSuppliers.agg(F.sum(col("value")).alias("total_value"))
|
||||
|
||||
nationPartSuppliers.groupBy(col("ps_partkey")).agg(F.sum(col("value")).alias("part_value")) \
|
||||
.join(aggregatedValue, col("part_value") > division(col("total_value"))) \
|
||||
.sort(col("part_value").desc()) \
|
||||
.show()
|
||||
|
||||
def q12(self):
|
||||
highPriority = udf(lambda x: (1 if ((x == "1-URGENT") or (x == "2-HIGH")) else 0), IntegerType())
|
||||
lowPriority = udf(lambda x: (1 if ((x != "1-URGENT") and (x != "2-HIGH")) else 0), IntegerType())
|
||||
|
||||
self.lineitem.filter(((col("l_shipmode") == "MAIL") | (col("l_shipmode") == "SHIP"))
|
||||
& (col("l_commitdate") < col("l_receiptdate"))
|
||||
& (col("l_shipdate") < col("l_commitdate"))
|
||||
& (col("l_receiptdate") >= "1994-01-01")
|
||||
& (col("l_receiptdate") < "1995-01-01")) \
|
||||
.join(self.orders, col("l_orderkey") == col("o_orderkey")) \
|
||||
.select(col("l_shipmode"), col("o_orderpriority")) \
|
||||
.groupBy(col("l_shipmode")) \
|
||||
.agg(F.sum(highPriority(col("o_orderpriority"))).alias("sum_highorderpriority"),
|
||||
F.sum(lowPriority(col("o_orderpriority"))).alias("sum_loworderpriority")) \
|
||||
.sort(col("l_shipmode")) \
|
||||
.show()
|
||||
|
||||
def q13(self):
|
||||
special_regex = re.compile(".*special.*requests.*")
|
||||
special = udf(lambda x: special_regex.match(x) is not None, BooleanType())
|
||||
|
||||
self.customer.join(self.orders, (col("c_custkey") == col("o_custkey"))
|
||||
& ~special(col("o_comment")), "left_outer") \
|
||||
.groupBy(col("c_custkey")) \
|
||||
.agg(F.count(col("o_orderkey")).alias("c_count")) \
|
||||
.groupBy(col("c_count")) \
|
||||
.agg(F.count(col("*")).alias("custdist")) \
|
||||
.sort(col("custdist").desc(), col("c_count").desc()) \
|
||||
.show()
|
||||
|
||||
def q14(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
promotion = udf(lambda x, y: (y if (x.startswith("PROMO")) else 0), FloatType())
|
||||
|
||||
self.part.join(self.lineitem, (col("l_partkey") == col("p_partkey"))
|
||||
& (col("l_shipdate") >= "1995-09-01")
|
||||
& (col("l_shipdate") < "1995-10-01")) \
|
||||
.select(col("p_type"), decrease(col("l_extendedprice"), col("l_discount")).alias("value")) \
|
||||
.agg(F.sum(promotion(col("p_type"), col("value"))) * 100 / F.sum(col("value"))) \
|
||||
.show()
|
||||
|
||||
def q15(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
|
||||
revenue = self.lineitem.filter((col("l_shipdate") >= "1996-01-01")
|
||||
& (col("l_shipdate") < "1996-04-01")) \
|
||||
.select(col("l_suppkey"), decrease(col("l_extendedprice"), col("l_discount")).alias("value")) \
|
||||
.groupBy(col("l_suppkey")) \
|
||||
.agg(F.sum(col("value")).alias("total"))
|
||||
|
||||
revenue.agg(F.max(col("total")).alias("max_total")) \
|
||||
.join(revenue, col("max_total") == col("total")) \
|
||||
.join(self.supplier, col("l_suppkey") == col("s_suppkey")) \
|
||||
.select(col("s_suppkey"), col("s_name"), col("s_address"), col("s_phone"), col("total")) \
|
||||
.sort(col("s_suppkey")) \
|
||||
.show()
|
||||
|
||||
def q16(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
polished = udf(lambda x: x.startswith("MEDIUM POLISHED"), BooleanType())
|
||||
|
||||
complains_regex = re.compile(".*Customer.*Complaints.*")
|
||||
complains = udf(lambda x: complains_regex.match(x) is not None, BooleanType())
|
||||
|
||||
numbers_regex = re.compile("^(49|14|23|45|19|3|36|9)$")
|
||||
numbers = udf(lambda x: numbers_regex.match(str(x)) is not None, BooleanType())
|
||||
|
||||
filteredParts = self.part.filter((col("p_brand") != "Brand#45")
|
||||
& (~polished(col("p_type")))
|
||||
& numbers(col("p_size"))) \
|
||||
.select(col("p_partkey"), col("p_brand"), col("p_type"), col("p_size"))
|
||||
|
||||
self.supplier.filter(~complains(col("s_comment"))) \
|
||||
.join(self.partsupp, col("s_suppkey") == col("ps_suppkey")) \
|
||||
.select(col("ps_partkey"), col("ps_suppkey")) \
|
||||
.join(filteredParts, col("ps_partkey") == col("p_partkey")) \
|
||||
.groupBy(col("p_brand"), col("p_type"), col("p_size")) \
|
||||
.agg(F.countDistinct(col("ps_suppkey")).alias("supplier_count")) \
|
||||
.sort(col("supplier_count").desc(), col("p_brand"), col("p_type"), col("p_size")) \
|
||||
.show()
|
||||
|
||||
def q17(self):
|
||||
multiplier = udf(lambda x: x * 0.2, FloatType())
|
||||
|
||||
filteredLineitems = self.lineitem.select(col("l_partkey"), col("l_quantity"), col("l_extendedprice"))
|
||||
|
||||
filteredParts = self.part.filter((col("p_brand") == "Brand#23") & (col("p_container") == "MED BOX")) \
|
||||
.select(col("p_partkey")) \
|
||||
.join(self.lineitem, col("p_partkey") == col("l_partkey"), "left_outer")
|
||||
|
||||
filteredParts.groupBy(col("p_partkey")) \
|
||||
.agg(multiplier(F.avg(col("l_quantity"))).alias("avg_quantity")) \
|
||||
.select(col("p_partkey").alias("key"), col("avg_quantity")) \
|
||||
.join(filteredParts, col("key") == col("p_partkey")) \
|
||||
.filter(col("l_quantity") < col("avg_quantity")) \
|
||||
.agg((F.sum(col("l_extendedprice")) / 7.0).alias("avg_yearly")) \
|
||||
.show()
|
||||
|
||||
def q18(self):
|
||||
self.lineitem.groupBy(col("l_orderkey")) \
|
||||
.agg(F.sum(col("l_quantity")).alias("sum_quantity")) \
|
||||
.filter(col("sum_quantity") > 300) \
|
||||
.select(col("l_orderkey").alias("key"), col("sum_quantity")) \
|
||||
.join(self.orders, col("o_orderkey") == col("key")) \
|
||||
.join(self.lineitem, col("o_orderkey") == col("l_orderkey")) \
|
||||
.join(self.customer, col("c_custkey") == col("o_custkey")) \
|
||||
.select(col("l_quantity"), col("c_name"), col("c_custkey"), col("o_orderkey"), col("o_orderdate"), col("o_totalprice")) \
|
||||
.groupBy(col("c_name"), col("c_custkey"), col("o_orderkey"), col("o_orderdate"), col("o_totalprice")) \
|
||||
.agg(F.sum(col("l_quantity"))) \
|
||||
.sort(col("o_totalprice").desc(), col("o_orderdate")) \
|
||||
.show()
|
||||
|
||||
def q19(self):
|
||||
decrease = udf(lambda x, y: x * (1 - y), FloatType())
|
||||
|
||||
sm_regex = re.compile("SM CASE|SM BOX|SM PACK|SM PKG")
|
||||
sm = udf(lambda x: sm_regex.match(x) is not None, BooleanType())
|
||||
|
||||
med_regex = re.compile("MED BAG|MED BOX|MED PKG|MED PACK")
|
||||
med = udf(lambda x: med_regex.match(x) is not None, BooleanType())
|
||||
|
||||
lg_regex = re.compile("LG CASE|LG BOX|LG PACK|LG PKG")
|
||||
lg = udf(lambda x: lg_regex.match(x) is not None, BooleanType())
|
||||
|
||||
self.part.join(self.lineitem, col("l_partkey") == col("p_partkey")) \
|
||||
.filter(((col("l_shipmode") == "AIR")
|
||||
| (col("l_shipmode") == "AIR REG"))
|
||||
& (col("l_shipinstruct") == "DELIVER IN PERSON")) \
|
||||
.filter(((col("p_brand") == "Brand#12")
|
||||
& (sm(col("p_container")))
|
||||
& (col("l_quantity") >= 1)
|
||||
& (col("l_quantity") <= 11)
|
||||
& (col("p_size") >= 1)
|
||||
& (col("p_size") <= 5))
|
||||
| ((col("p_brand") == "Brand#23")
|
||||
& (med(col("p_container")))
|
||||
& (col("l_quantity") >= 10)
|
||||
& (col("l_quantity") <= 20)
|
||||
& (col("p_size") >= 1)
|
||||
& (col("p_size") <= 10))
|
||||
| ((col("p_brand") == "Brand#34")
|
||||
& (lg(col("p_container")))
|
||||
& (col("l_quantity") >= 20)
|
||||
& (col("l_quantity") <= 30)
|
||||
& (col("p_size") >= 1)
|
||||
& (col("p_size") <= 15))) \
|
||||
.select(decrease(col("l_extendedprice"), col("l_discount")).alias("volume")) \
|
||||
.agg(F.sum(col("volume")).alias("revenue")) \
|
||||
.show()
|
||||
|
||||
def q20(self):
|
||||
forest = udf(lambda x: x.startswith("forest"), BooleanType())
|
||||
|
||||
filteredLineitems = self.lineitem.filter((col("l_shipdate") >= "1994-01-01") & (col("l_shipdate") < "1995-01-01")) \
|
||||
.groupBy(col("l_partkey"), col("l_suppkey")) \
|
||||
.agg((F.sum(col("l_quantity")) * 0.5).alias("sum_quantity"))
|
||||
|
||||
filteredNations = self.nation.filter(col("n_name") == "CANADA")
|
||||
|
||||
nationSuppliers = self.supplier.select(col("s_suppkey"), col("s_name"), col("s_nationkey"), col("s_address")) \
|
||||
.join(filteredNations, col("s_nationkey") == col("n_nationkey"))
|
||||
|
||||
self.part.filter(forest(col("p_name"))) \
|
||||
.select(col("p_partkey")).distinct() \
|
||||
.join(self.partsupp, col("p_partkey") == col("ps_partkey")) \
|
||||
.join(filteredLineitems, (col("ps_suppkey") == col("l_suppkey")) & (col("ps_partkey") == col("l_partkey"))) \
|
||||
.filter(col("ps_availqty") > col("sum_quantity")) \
|
||||
.select(col("ps_suppkey")).distinct() \
|
||||
.join(nationSuppliers, col("ps_suppkey") == col("s_suppkey")) \
|
||||
.select(col("s_name"), col("s_address")) \
|
||||
.sort(col("s_name")) \
|
||||
.show()
|
||||
|
||||
def q21(self):
|
||||
filteredSuppliers = self.supplier.select(col("s_suppkey"), col("s_nationkey"), col("s_name"))
|
||||
|
||||
selectedLineitems = self.lineitem.select(col("l_suppkey"), col("l_orderkey"), col("l_receiptdate"), col("l_commitdate"))
|
||||
filteredLineitems = selectedLineitems.filter(col("l_receiptdate") > col("l_commitdate"))
|
||||
|
||||
selectedGroupedLineItems = selectedLineitems.groupBy(col("l_orderkey")) \
|
||||
.agg(F.countDistinct(col("l_suppkey")).alias("suppkey_count"), F.max(col("l_suppkey")).alias("suppkey_max")) \
|
||||
.select(col("l_orderkey").alias("key"), col("suppkey_count"), col("suppkey_max"))
|
||||
|
||||
filteredGroupedLineItems = filteredLineitems.groupBy(col("l_orderkey")) \
|
||||
.agg(F.countDistinct(col("l_suppkey")).alias("suppkey_count"), F.max(col("l_suppkey")).alias("suppkey_max")) \
|
||||
.select(col("l_orderkey").alias("key"), col("suppkey_count"), col("suppkey_max"))
|
||||
|
||||
filteredOrders = self.orders.select(col("o_orderkey"), col("o_orderstatus")) \
|
||||
.filter(col("o_orderstatus") == "F")
|
||||
|
||||
self.nation.filter(col("n_name") == "SAUDI ARABIA") \
|
||||
.join(filteredSuppliers, col("n_nationkey") == col("s_nationkey")) \
|
||||
.join(filteredLineitems, col("s_suppkey") == col("l_suppkey")) \
|
||||
.join(filteredOrders, col("l_orderkey") == col("o_orderkey")) \
|
||||
.join(selectedGroupedLineItems, col("l_orderkey") == col("key")) \
|
||||
.filter(col("suppkey_count") > 1) \
|
||||
.select(col("s_name"), col("l_orderkey"), col("l_suppkey")) \
|
||||
.join(filteredGroupedLineItems, col("l_orderkey") == col("key"), "left_outer") \
|
||||
.select(col("s_name"), col("l_orderkey"), col("l_suppkey"), col("suppkey_count"), col("suppkey_max")) \
|
||||
.filter((col("suppkey_count") == 1)
|
||||
& (col("l_suppkey") == col("suppkey_max"))) \
|
||||
.groupBy(col("s_name")) \
|
||||
.agg(F.count(col("l_suppkey")).alias("numwait")) \
|
||||
.sort(col("numwait").desc(), col("s_name")) \
|
||||
.show()
|
||||
|
||||
def q22(self):
|
||||
substring = udf(lambda x: x[0:2], StringType())
|
||||
|
||||
phone_regex = re.compile("^(13|31|23|29|30|18|17)$")
|
||||
phone = udf(lambda x: phone_regex.match(x) is not None, BooleanType())
|
||||
|
||||
filteredCustomers = self.customer.select(col("c_acctbal"), col("c_custkey"), substring(col("c_phone")).alias("cntrycode")) \
|
||||
.filter(phone(col("cntrycode")))
|
||||
|
||||
customerAverage = filteredCustomers.filter(col("c_acctbal") > 0.0) \
|
||||
.agg(F.avg(col("c_acctbal")).alias("avg_acctbal"))
|
||||
|
||||
self.orders.groupBy(col("o_custkey")) \
|
||||
.agg(col("o_custkey")) \
|
||||
.select(col("o_custkey")) \
|
||||
.join(filteredCustomers, col("o_custkey") == col("c_custkey"), "right_outer") \
|
||||
.filter(col("o_custkey").isNull()) \
|
||||
.join(customerAverage) \
|
||||
.filter(col("c_acctbal") > col("avg_acctbal")) \
|
||||
.groupBy(col("cntrycode")) \
|
||||
.agg(F.count(col("c_acctbal")).alias("numcust"), F.sum(col("c_acctbal")).alias("totacctbal")) \
|
||||
.sort(col("cntrycode")) \
|
||||
.show()
|
|
@@ -0,0 +1,719 @@
|
|||
# Licensed to the .NET Foundation under one or more agreements.
|
||||
# The .NET Foundation licenses this file to you under the MIT license.
|
||||
# See the LICENSE file in the project root for more information.
|
||||
|
||||
from tpch_base import TpchBase
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
|
||||
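# SQL-text implementations of the TPC-H queries, executed via spark.sql() against the temp views registered in __init__.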
class TpchSqlQueries(TpchBase):
|
||||
def __init__(self, spark, dir):
|
||||
TpchBase.__init__(self, spark, dir)
|
||||
|
||||
self.spark = spark
|
||||
self.customer.createOrReplaceTempView("customer")
|
||||
self.lineitem.createOrReplaceTempView("lineitem")
|
||||
self.nation.createOrReplaceTempView("nation")
|
||||
self.region.createOrReplaceTempView("region")
|
||||
self.orders.createOrReplaceTempView("orders")
|
||||
self.part.createOrReplaceTempView("part")
|
||||
self.partsupp.createOrReplaceTempView("partsupp")
|
||||
self.supplier.createOrReplaceTempView("supplier")
|
||||
|
||||
def q1(self):
|
||||
query = """select
|
||||
l_returnflag,
|
||||
l_linestatus,
|
||||
sum(l_quantity) as sum_qty,
|
||||
sum(l_extendedprice) as sum_base_price,
|
||||
sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
|
||||
sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
|
||||
avg(l_quantity) as avg_qty,
|
||||
avg(l_extendedprice) as avg_price,
|
||||
avg(l_discount) as avg_disc,
|
||||
count(*) as count_order
|
||||
from
|
||||
lineitem
|
||||
where
|
||||
l_shipdate <= date '1998-12-01' - interval '90' day
|
||||
group by
|
||||
l_returnflag,
|
||||
l_linestatus
|
||||
order by
|
||||
l_returnflag,
|
||||
l_linestatus"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q2(self):
|
||||
query = """select
|
||||
s_acctbal,
|
||||
s_name,
|
||||
n_name,
|
||||
p_partkey,
|
||||
p_mfgr,
|
||||
s_address,
|
||||
s_phone,
|
||||
s_comment
|
||||
from
|
||||
part,
|
||||
supplier,
|
||||
partsupp,
|
||||
nation,
|
||||
region
|
||||
where
|
||||
p_partkey = ps_partkey
|
||||
and s_suppkey = ps_suppkey
|
||||
and p_size = 15
|
||||
and p_type like '%BRASS'
|
||||
and s_nationkey = n_nationkey
|
||||
and n_regionkey = r_regionkey
|
||||
and r_name = 'EUROPE'
|
||||
and ps_supplycost = (
|
||||
select
|
||||
min(ps_supplycost)
|
||||
from
|
||||
partsupp,
|
||||
supplier,
|
||||
nation,
|
||||
region
|
||||
where
|
||||
p_partkey = ps_partkey
|
||||
and s_suppkey = ps_suppkey
|
||||
and s_nationkey = n_nationkey
|
||||
and n_regionkey = r_regionkey
|
||||
and r_name = 'EUROPE'
|
||||
)
|
||||
order by
|
||||
s_acctbal desc,
|
||||
n_name,
|
||||
s_name,
|
||||
p_partkey"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q3(self):
|
||||
query = """select
|
||||
l_orderkey,
|
||||
sum(l_extendedprice * (1 - l_discount)) as revenue,
|
||||
o_orderdate,
|
||||
o_shippriority
|
||||
from
|
||||
customer,
|
||||
orders,
|
||||
lineitem
|
||||
where
|
||||
c_mktsegment = 'BUILDING'
|
||||
and c_custkey = o_custkey
|
||||
and l_orderkey = o_orderkey
|
||||
and o_orderdate < date '1995-03-15'
|
||||
and l_shipdate > date '1995-03-15'
|
||||
group by
|
||||
l_orderkey,
|
||||
o_orderdate,
|
||||
o_shippriority
|
||||
order by
|
||||
revenue desc,
|
||||
o_orderdate"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q4(self):
|
||||
query = """select
|
||||
o_orderpriority,
|
||||
count(*) as order_count
|
||||
from
|
||||
orders
|
||||
where
|
||||
o_orderdate >= date '1993-07-01'
|
||||
and o_orderdate < date '1993-07-01' + interval '3' month
|
||||
and exists(
|
||||
select
|
||||
*
|
||||
from
|
||||
lineitem
|
||||
where
|
||||
l_orderkey = o_orderkey
|
||||
and l_commitdate < l_receiptdate
|
||||
)
|
||||
group by
|
||||
o_orderpriority
|
||||
order by
|
||||
o_orderpriority"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q5(self):
|
||||
query = """select
|
||||
n_name,
|
||||
sum(l_extendedprice * (1 - l_discount)) as revenue
|
||||
from
|
||||
customer,
|
||||
orders,
|
||||
lineitem,
|
||||
supplier,
|
||||
nation,
|
||||
region
|
||||
where
|
||||
c_custkey = o_custkey
|
||||
and l_orderkey = o_orderkey
|
||||
and l_suppkey = s_suppkey
|
||||
and c_nationkey = s_nationkey
|
||||
and s_nationkey = n_nationkey
|
||||
and n_regionkey = r_regionkey
|
||||
and r_name = 'ASIA'
|
||||
and o_orderdate >= date '1994-01-01'
|
||||
and o_orderdate < date '1994-01-01' + interval '1' year
|
||||
group by
|
||||
n_name
|
||||
order by
|
||||
revenue desc"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q6(self):
|
||||
query = """select
|
||||
sum(l_extendedprice * l_discount) as revenue
|
||||
from
|
||||
lineitem
|
||||
where
|
||||
l_shipdate >= date '1994-01-01'
|
||||
and l_shipdate < date '1994-01-01' + interval '1' year
|
||||
and l_discount between .06 - 0.01 and .06 + 0.01
|
||||
and l_quantity < 24"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q7(self):
|
||||
query = """select
|
||||
supp_nation,
|
||||
cust_nation,
|
||||
l_year,
|
||||
cast(sum(volume) as double) as revenue
|
||||
from
|
||||
(
|
||||
select
|
||||
n1.n_name as supp_nation,
|
||||
n2.n_name as cust_nation,
|
||||
year(l_shipdate) as l_year,
|
||||
cast(l_extendedprice * (1 - l_discount) as double) as volume
|
||||
from
|
||||
supplier,
|
||||
lineitem,
|
||||
orders,
|
||||
customer,
|
||||
nation n1,
|
||||
nation n2
|
||||
where
|
||||
s_suppkey = l_suppkey
|
||||
and o_orderkey = l_orderkey
|
||||
and c_custkey = o_custkey
|
||||
and s_nationkey = n1.n_nationkey
|
||||
and c_nationkey = n2.n_nationkey
|
||||
and(
|
||||
(n1.n_name = 'FRANCE' and n2.n_name = 'GERMANY')
|
||||
or(n1.n_name = 'GERMANY' and n2.n_name = 'FRANCE')
|
||||
)
|
||||
and l_shipdate between date '1995-01-01' and date '1996-12-31'
|
||||
) as shipping
|
||||
group by
|
||||
supp_nation,
|
||||
cust_nation,
|
||||
l_year
|
||||
order by
|
||||
supp_nation,
|
||||
cust_nation,
|
||||
l_year"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q8(self):
|
||||
query = """select
|
||||
o_year,
|
||||
sum(case
|
||||
|
||||
when nation = 'BRAZIL' then volume
|
||||
else 0
|
||||
end) / sum(volume) as mkt_share
|
||||
from
|
||||
(
|
||||
select
|
||||
year(o_orderdate) as o_year,
|
||||
l_extendedprice * (1 - l_discount) as volume,
|
||||
n2.n_name as nation
|
||||
from
|
||||
part,
|
||||
supplier,
|
||||
lineitem,
|
||||
orders,
|
||||
customer,
|
||||
nation n1,
|
||||
nation n2,
|
||||
region
|
||||
where
|
||||
p_partkey = l_partkey
|
||||
and s_suppkey = l_suppkey
|
||||
and l_orderkey = o_orderkey
|
||||
and o_custkey = c_custkey
|
||||
and c_nationkey = n1.n_nationkey
|
||||
and n1.n_regionkey = r_regionkey
|
||||
and r_name = 'AMERICA'
|
||||
and s_nationkey = n2.n_nationkey
|
||||
and o_orderdate between date '1995-01-01' and date '1996-12-31'
|
||||
and p_type = 'ECONOMY ANODIZED STEEL'
|
||||
) as all_nations
|
||||
group by
|
||||
o_year
|
||||
order by
|
||||
o_year"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q9(self):
|
||||
query = """select
|
||||
nation,
|
||||
o_year,
|
||||
sum(amount) as sum_profit
|
||||
from
|
||||
(
|
||||
select
|
||||
n_name as nation,
|
||||
year(o_orderdate) as o_year,
|
||||
l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
|
||||
from
|
||||
part,
|
||||
supplier,
|
||||
lineitem,
|
||||
partsupp,
|
||||
orders,
|
||||
nation
|
||||
where
|
||||
s_suppkey = l_suppkey
|
||||
and ps_suppkey = l_suppkey
|
||||
and ps_partkey = l_partkey
|
||||
and p_partkey = l_partkey
|
||||
and o_orderkey = l_orderkey
|
||||
and s_nationkey = n_nationkey
|
||||
and p_name like '%green%'
|
||||
) as profit
|
||||
group by
|
||||
nation,
|
||||
o_year
|
||||
order by
|
||||
nation,
|
||||
o_year desc"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q10(self):
|
||||
query = """select
|
||||
c_custkey,
|
||||
c_name,
|
||||
sum(l_extendedprice * (1 - l_discount)) as revenue,
|
||||
c_acctbal,
|
||||
n_name,
|
||||
c_address,
|
||||
c_phone,
|
||||
c_comment
|
||||
from
|
||||
customer,
|
||||
orders,
|
||||
lineitem,
|
||||
nation
|
||||
where
|
||||
c_custkey = o_custkey
|
||||
and l_orderkey = o_orderkey
|
||||
and o_orderdate >= date '1993-10-01'
|
||||
and o_orderdate < date '1993-10-01' + interval '3' month
|
||||
and l_returnflag = 'R'
|
||||
and c_nationkey = n_nationkey
|
||||
group by
|
||||
c_custkey,
|
||||
c_name,
|
||||
c_acctbal,
|
||||
c_phone,
|
||||
n_name,
|
||||
c_address,
|
||||
c_comment
|
||||
order by
|
||||
revenue desc"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q11(self):
|
||||
query = """select
|
||||
ps_partkey,
|
||||
sum(ps_supplycost * ps_availqty) as value
|
||||
from
|
||||
partsupp,
|
||||
supplier,
|
||||
nation
|
||||
where
|
||||
ps_suppkey = s_suppkey
|
||||
and s_nationkey = n_nationkey
|
||||
and n_name = 'GERMANY'
|
||||
group by
|
||||
ps_partkey having
|
||||
sum(ps_supplycost * ps_availqty) > (
|
||||
select
|
||||
sum(ps_supplycost * ps_availqty) * 0.0001000000
|
||||
from
|
||||
partsupp,
|
||||
supplier,
|
||||
nation
|
||||
where
|
||||
ps_suppkey = s_suppkey
|
||||
and s_nationkey = n_nationkey
|
||||
and n_name = 'GERMANY'
|
||||
)
|
||||
order by
|
||||
value desc"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q12(self):
|
||||
query = """select
|
||||
l_shipmode,
|
||||
sum(case
|
||||
|
||||
when o_orderpriority = '1-URGENT'
|
||||
or o_orderpriority = '2-HIGH'
|
||||
then 1
|
||||
else 0
|
||||
end) as sum_highorderpriority,
|
||||
sum(case
|
||||
|
||||
when o_orderpriority <> '1-URGENT'
|
||||
and o_orderpriority <> '2-HIGH'
|
||||
then 1
|
||||
else 0
|
||||
end) as sum_loworderpriority
|
||||
from
|
||||
orders,
|
||||
lineitem
|
||||
where
|
||||
o_orderkey = l_orderkey
|
||||
and l_shipmode in ('MAIL', 'SHIP')
|
||||
and l_commitdate < l_receiptdate
|
||||
and l_shipdate < l_commitdate
|
||||
and l_receiptdate >= date '1994-01-01'
|
||||
and l_receiptdate < date '1994-01-01' + interval '1' year
|
||||
group by
|
||||
l_shipmode
|
||||
order by
|
||||
l_shipmode"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q13(self):
|
||||
query = """select
|
||||
c_count,
|
||||
count(*) as custdist
|
||||
from
|
||||
(
|
||||
select
|
||||
c_custkey,
|
||||
count(o_orderkey) as c_count
|
||||
from
|
||||
customer left outer join orders on
|
||||
c_custkey = o_custkey
|
||||
and o_comment not like '%special%requests%'
|
||||
group by
|
||||
c_custkey
|
||||
) as c_orders
|
||||
group by
|
||||
c_count
|
||||
order by
|
||||
custdist desc,
|
||||
c_count desc"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q14(self):
|
||||
query = """
|
||||
select
|
||||
100.00 * sum(case
|
||||
when p_type like 'PROMO%'
|
||||
then l_extendedprice * (1 - l_discount)
|
||||
else 0
|
||||
end) / sum(l_extendedprice * (1 - l_discount)) as promo_revenue
|
||||
from
|
||||
lineitem,
|
||||
part
|
||||
where
|
||||
l_partkey = p_partkey
|
||||
and l_shipdate >= date '1995-09-01'
|
||||
and l_shipdate < date '1995-09-01' + interval '1' month"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q15(self):
|
||||
query = """with revenue0 as
|
||||
(select
|
||||
l_suppkey as supplier_no,
|
||||
sum(l_extendedprice * (1 - l_discount)) as total_revenue
|
||||
from
|
||||
lineitem
|
||||
where
|
||||
l_shipdate >= date '1996-01-01'
|
||||
and l_shipdate < date '1996-01-01' + interval '3' month
|
||||
group by
|
||||
l_suppkey)
|
||||
|
||||
select
|
||||
s_suppkey,
|
||||
s_name,
|
||||
s_address,
|
||||
s_phone,
|
||||
total_revenue
|
||||
from
|
||||
supplier,
|
||||
revenue0
|
||||
where
|
||||
s_suppkey = supplier_no
|
||||
and total_revenue = (
|
||||
select
|
||||
max(total_revenue)
|
||||
from
|
||||
revenue0
|
||||
)
|
||||
order by
|
||||
s_suppkey"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q16(self):
|
||||
query = """select
|
||||
p_brand,
|
||||
p_type,
|
||||
p_size,
|
||||
count(distinct ps_suppkey) as supplier_cnt
|
||||
from
|
||||
partsupp,
|
||||
part
|
||||
where
|
||||
p_partkey = ps_partkey
|
||||
and p_brand <> 'Brand#45'
|
||||
and p_type not like 'MEDIUM POLISHED%'
|
||||
and p_size in (49, 14, 23, 45, 19, 3, 36, 9)
|
||||
and ps_suppkey not in (
|
||||
select
|
||||
s_suppkey
|
||||
from
|
||||
supplier
|
||||
where
|
||||
s_comment like '%Customer%Complaints%'
|
||||
)
|
||||
group by
|
||||
p_brand,
|
||||
p_type,
|
||||
p_size
|
||||
order by
|
||||
supplier_cnt desc,
|
||||
p_brand,
|
||||
p_type,
|
||||
p_size
|
||||
limit 20"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q17(self):
|
||||
query = """select
|
||||
sum(l_extendedprice) / 7.0 as avg_yearly
|
||||
from
|
||||
lineitem,
|
||||
part
|
||||
where
|
||||
p_partkey = l_partkey
|
||||
and p_brand = 'Brand#23'
|
||||
and p_container = 'MED BOX'
|
||||
and l_quantity < (
|
||||
select
|
||||
0.2 * avg(l_quantity)
|
||||
from
|
||||
lineitem
|
||||
where
|
||||
l_partkey = p_partkey
|
||||
)"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q18(self):
|
||||
query = """
|
||||
select
|
||||
c_name,
|
||||
c_custkey,
|
||||
o_orderkey,
|
||||
o_orderdate,
|
||||
o_totalprice,
|
||||
sum(l_quantity)
|
||||
from
|
||||
customer,
|
||||
orders,
|
||||
lineitem
|
||||
where
|
||||
o_orderkey in (
|
||||
select
|
||||
l_orderkey
|
||||
from
|
||||
lineitem
|
||||
group by
|
||||
l_orderkey having
|
||||
sum(l_quantity) > 300
|
||||
)
|
||||
and c_custkey = o_custkey
|
||||
and o_orderkey = l_orderkey
|
||||
group by
|
||||
c_name,
|
||||
c_custkey,
|
||||
o_orderkey,
|
||||
o_orderdate,
|
||||
o_totalprice
|
||||
order by
|
||||
o_totalprice desc,
|
||||
o_orderdate"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q19(self):
|
||||
query = """select
|
||||
sum(l_extendedprice * (1 - l_discount)) as revenue
|
||||
from
|
||||
lineitem,
|
||||
part
|
||||
where
|
||||
(
|
||||
p_partkey = l_partkey
|
||||
and p_brand = 'Brand#12'
|
||||
and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
|
||||
and l_quantity >= 1 and l_quantity <= 1 + 10
|
||||
and p_size between 1 and 5
|
||||
and l_shipmode in ('AIR', 'AIR REG')
|
||||
and l_shipinstruct = 'DELIVER IN PERSON'
|
||||
)
|
||||
or
|
||||
(
|
||||
p_partkey = l_partkey
|
||||
and p_brand = 'Brand#23'
|
||||
and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
|
||||
and l_quantity >= 10 and l_quantity <= 10 + 10
|
||||
and p_size between 1 and 10
|
||||
and l_shipmode in ('AIR', 'AIR REG')
|
||||
and l_shipinstruct = 'DELIVER IN PERSON'
|
||||
)
|
||||
or
|
||||
(
|
||||
p_partkey = l_partkey
|
||||
and p_brand = 'Brand#34'
|
||||
and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
|
||||
and l_quantity >= 20 and l_quantity <= 20 + 10
|
||||
and p_size between 1 and 15
|
||||
and l_shipmode in ('AIR', 'AIR REG')
|
||||
and l_shipinstruct = 'DELIVER IN PERSON'
|
||||
)"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q20(self):
|
||||
query = """
|
||||
select
|
||||
s_name,
|
||||
s_address
|
||||
from
|
||||
supplier,
|
||||
nation
|
||||
where
|
||||
s_suppkey in (
|
||||
select
|
||||
ps_suppkey
|
||||
from
|
||||
partsupp
|
||||
where
|
||||
ps_partkey in (
|
||||
select
|
||||
p_partkey
|
||||
from
|
||||
part
|
||||
where
|
||||
p_name like 'forest%'
|
||||
)
|
||||
and ps_availqty > (
|
||||
select
|
||||
0.5 * sum(l_quantity)
|
||||
from
|
||||
lineitem
|
||||
where
|
||||
l_partkey = ps_partkey
|
||||
and l_suppkey = ps_suppkey
|
||||
and l_shipdate >= date '1994-01-01'
|
||||
and l_shipdate < date '1994-01-01' + interval '1' year
|
||||
)
|
||||
)
|
||||
and s_nationkey = n_nationkey
|
||||
and n_name = 'CANADA'
|
||||
order by
|
||||
s_name"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q21(self):
|
||||
query = """select
|
||||
s_name,
|
||||
count(*) as numwait
|
||||
from
|
||||
supplier,
|
||||
lineitem l1,
|
||||
orders,
|
||||
nation
|
||||
where
|
||||
s_suppkey = l1.l_suppkey
|
||||
and o_orderkey = l1.l_orderkey
|
||||
and o_orderstatus = 'F'
|
||||
and l1.l_receiptdate > l1.l_commitdate
|
||||
and exists(
|
||||
select
|
||||
*
|
||||
from
|
||||
lineitem l2
|
||||
where
|
||||
l2.l_orderkey = l1.l_orderkey
|
||||
and l2.l_suppkey <> l1.l_suppkey
|
||||
)
|
||||
and not exists(
|
||||
select
|
||||
*
|
||||
from
|
||||
lineitem l3
|
||||
where
|
||||
l3.l_orderkey = l1.l_orderkey
|
||||
and l3.l_suppkey <> l1.l_suppkey
|
||||
and l3.l_receiptdate > l3.l_commitdate
|
||||
)
|
||||
and s_nationkey = n_nationkey
|
||||
and n_name = 'SAUDI ARABIA'
|
||||
group by
|
||||
s_name
|
||||
order by
|
||||
numwait desc,
|
||||
s_name"""
|
||||
self.spark.sql(query).show()
|
||||
|
||||
def q22(self):
|
||||
query = """select
|
||||
cntrycode,
|
||||
count(*) as numcust,
|
||||
sum(c_acctbal) as totacctbal
|
||||
from
|
||||
(
|
||||
select
|
||||
substring(c_phone, 1, 2) as cntrycode,
|
||||
c_acctbal
|
||||
from
|
||||
customer
|
||||
where
|
||||
substring(c_phone, 1, 2) in
|
||||
('13', '31', '23', '29', '30', '18', '17')
|
||||
and c_acctbal > (
|
||||
select
|
||||
avg(c_acctbal)
|
||||
from
|
||||
customer
|
||||
where
|
||||
c_acctbal > 0.00
|
||||
and substring(c_phone, 1, 2) in
|
||||
('13', '31', '23', '29', '30', '18', '17')
|
||||
)
|
||||
and not exists(
|
||||
select
|
||||
*
|
||||
from
|
||||
orders
|
||||
where
|
||||
o_custkey = c_custkey
|
||||
)
|
||||
) as custsale
|
||||
group by
|
||||
cntrycode
|
||||
order by
|
||||
cntrycode"""
|
||||
self.spark.sql(query).show()
|
|
@ -0,0 +1,88 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>com.microsoft.spark</groupId>
|
||||
<artifactId>microsoft-spark-benchmark</artifactId>
|
||||
<version>0.1.0</version>
|
||||
<inceptionYear>2019</inceptionYear>
|
||||
<properties>
|
||||
<encoding>UTF-8</encoding>
|
||||
<scala.version>2.11.8</scala.version>
|
||||
<scala.binary.version>2.11</scala.binary.version>
|
||||
<spark.version>2.3.2</spark.version>
|
||||
</properties>
|
||||
|
||||
<pluginRepositories>
|
||||
<pluginRepository>
|
||||
<id>scala</id>
|
||||
<name>Scala Tools</name>
|
||||
<url>http://scala-tools.org/repo-releases/</url>
|
||||
<releases>
|
||||
<enabled>true</enabled>
|
||||
</releases>
|
||||
<snapshots>
|
||||
<enabled>false</enabled>
|
||||
</snapshots>
|
||||
</pluginRepository>
|
||||
</pluginRepositories>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.scala-lang</groupId>
|
||||
<artifactId>scala-library</artifactId>
|
||||
<version>${scala.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_${scala.binary.version}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_${scala.binary.version}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.4</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.specs</groupId>
|
||||
<artifactId>specs</artifactId>
|
||||
<version>1.2.5</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<sourceDirectory>src/main/scala</sourceDirectory>
|
||||
<testSourceDirectory>src/test/scala</testSourceDirectory>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.scala-tools</groupId>
|
||||
<artifactId>maven-scala-plugin</artifactId>
|
||||
<version>2.15.2</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
<args>
|
||||
<arg>-target:jvm-1.8</arg>
|
||||
<arg>-deprecation</arg>
|
||||
<arg>-feature</arg>
|
||||
</args>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,386 @@
|
|||
<!--
|
||||
~ Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
~ contributor license agreements. See the NOTICE file distributed with
|
||||
~ this work for additional information regarding copyright ownership.
|
||||
~ The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
~ (the "License"); you may not use this file except in compliance with
|
||||
~ the License. You may obtain a copy of the License at
|
||||
~
|
||||
~ http://www.apache.org/licenses/LICENSE-2.0
|
||||
~
|
||||
~ Unless required by applicable law or agreed to in writing, software
|
||||
~ distributed under the License is distributed on an "AS IS" BASIS,
|
||||
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
~ See the License for the specific language governing permissions and
|
||||
~ limitations under the License.
|
||||
-->
|
||||
<!--
|
||||
|
||||
If you wish to turn off checking for a section of code, you can put a comment in the source
|
||||
before and after the section, with the following syntax:
|
||||
|
||||
// scalastyle:off
|
||||
... // stuff that breaks the styles
|
||||
// scalastyle:on
|
||||
|
||||
You can also disable only one rule, by specifying its rule id, as specified in:
|
||||
http://www.scalastyle.org/rules-0.7.0.html
|
||||
|
||||
// scalastyle:off no.finalize
|
||||
override def finalize(): Unit = ...
|
||||
// scalastyle:on no.finalize
|
||||
|
||||
This file is divided into 3 sections:
|
||||
(1) rules that we enforce.
|
||||
(2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet
|
||||
(or we need to make the scalastyle rule more configurable).
|
||||
(3) rules that we don't want to enforce.
|
||||
-->
|
||||
|
||||
<scalastyle>
|
||||
<name>Scalastyle standard configuration</name>
|
||||
|
||||
<!-- ================================================================================ -->
|
||||
<!-- rules we enforce -->
|
||||
<!-- ================================================================================ -->
|
||||
|
||||
<check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
|
||||
<parameters>
|
||||
<!--parameter name="header"><![CDATA[/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/]]></parameter-->
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
|
||||
<parameters>
|
||||
<parameter name="maxLineLength"><![CDATA[100]]></parameter>
|
||||
<parameter name="tabSize"><![CDATA[2]]></parameter>
|
||||
<parameter name="ignoreImports">true</parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
|
||||
<parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
|
||||
<parameters><parameter name="regex"><![CDATA[(config|[A-Z][A-Za-z]*)]]></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
|
||||
<parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<check customId="argcount" level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
|
||||
<parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
|
||||
<parameters>
|
||||
<parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
|
||||
<parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
|
||||
|
||||
<check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
|
||||
<parameters>
|
||||
<parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
|
||||
<parameters>
|
||||
<parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<!-- ??? usually shouldn't be checked into the code base. -->
|
||||
<check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
|
||||
|
||||
<!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuite directly -->
|
||||
<check customId="funsuite" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
|
||||
<parameters><parameter name="regex">^FunSuite[A-Za-z]*$</parameter></parameters>
|
||||
<customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
|
||||
</check>
|
||||
|
||||
<!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' -->
|
||||
<check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
|
||||
<parameters><parameter name="regex">^println$</parameter></parameters>
|
||||
<customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with
|
||||
// scalastyle:off println
|
||||
println(...)
|
||||
// scalastyle:on println]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="hadoopconfiguration" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">spark(.sqlContext)?.sparkContext.hadoopConfiguration</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use sparkContext.hadoopConfiguration? In most cases, you should use
|
||||
spark.sessionState.newHadoopConf() instead, so that the hadoop configurations specified in Spark session
|
||||
configuration will come into effect.
|
||||
If you must use sparkContext.hadoopConfiguration, wrap the code block with
|
||||
// scalastyle:off hadoopconfiguration
|
||||
spark.sparkContext.hadoopConfiguration...
|
||||
// scalastyle:on hadoopconfiguration
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">@VisibleForTesting</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
@VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615).
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use
|
||||
ShutdownHookManager.addShutdownHook instead.
|
||||
If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with
|
||||
// scalastyle:off runtimeaddshutdownhook
|
||||
Runtime.getRuntime.addShutdownHook(...)
|
||||
// scalastyle:on runtimeaddshutdownhook
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use
|
||||
java.util.concurrent.ConcurrentLinkedQueue instead.
|
||||
If you must use mutable.SynchronizedBuffer, wrap the code block with
|
||||
// scalastyle:off mutablesynchronizedbuffer
|
||||
mutable.SynchronizedBuffer[...]
|
||||
// scalastyle:on mutablesynchronizedbuffer
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">Class\.forName</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead.
|
||||
If you must use Class.forName, wrap the code block with
|
||||
// scalastyle:off classforname
|
||||
Class.forName(...)
|
||||
// scalastyle:on classforname
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">Await\.result</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead.
|
||||
If you must use Await.result, wrap the code block with
|
||||
// scalastyle:off awaitresult
|
||||
Await.result(...)
|
||||
// scalastyle:on awaitresult
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="awaitready" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">Await\.ready</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use Await.ready? In most cases, you should use ThreadUtils.awaitReady instead.
|
||||
If you must use Await.ready, wrap the code block with
|
||||
// scalastyle:off awaitready
|
||||
Await.ready(...)
|
||||
// scalastyle:on awaitready
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="caselocale" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">(\.toUpperCase|\.toLowerCase)(?!(\(|\(Locale.ROOT\)))</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to use toUpperCase or toLowerCase without the root locale? In most cases, you
|
||||
should use toUpperCase(Locale.ROOT) or toLowerCase(Locale.ROOT) instead.
|
||||
If you must use toUpperCase or toLowerCase without the root locale, wrap the code block with
|
||||
// scalastyle:off caselocale
|
||||
.toUpperCase
|
||||
.toLowerCase
|
||||
// scalastyle:on caselocale
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="throwerror" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">throw new \w+Error\(</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Are you sure that you want to throw Error? In most cases, you should use appropriate Exception instead.
|
||||
If you must throw Error, wrap the code block with
|
||||
// scalastyle:off throwerror
|
||||
throw new XXXError(...)
|
||||
// scalastyle:on throwerror
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters -->
|
||||
<check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
|
||||
<parameters><parameter name="regex">JavaConversions</parameter></parameters>
|
||||
<customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import
|
||||
scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
|
||||
<parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
|
||||
<customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
|
||||
of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="extractopt" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
|
||||
<parameters><parameter name="regex">extractOpt</parameter></parameters>
|
||||
<customMessage>Use jsonOption(x).map(_.extract[T]) instead of .extractOpt[T], as the latter
|
||||
is slower. </customMessage>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true">
|
||||
<parameters>
|
||||
<parameter name="groups">java,scala,3rdParty,spark</parameter>
|
||||
<parameter name="group.java">javax?\..*</parameter>
|
||||
<parameter name="group.scala">scala\..*</parameter>
|
||||
<parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter>
|
||||
<parameter name="group.spark">org\.apache\.spark\..*</parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true">
|
||||
<parameters>
|
||||
<parameter name="tokens">COMMA</parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<!-- SPARK-3854: Single Space between ')' and '{' -->
|
||||
<check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">\)\{</parameter></parameters>
|
||||
<customMessage><![CDATA[
|
||||
Single Space between ')' and `{`.
|
||||
]]></customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters>
|
||||
<customMessage>Use Javadoc style indentation for multiline comments</customMessage>
|
||||
</check>
|
||||
|
||||
<check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
|
||||
<parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters>
|
||||
<customMessage>Omit braces in case clauses.</customMessage>
|
||||
</check>
|
||||
|
||||
<!-- SPARK-16877: Avoid Java annotations -->
|
||||
<check level="error" class="org.scalastyle.scalariform.OverrideJavaChecker" enabled="true"></check>
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check>
|
||||
|
||||
<!-- ================================================================================ -->
|
||||
<!-- rules we'd like to enforce, but haven't cleaned up the codebase yet -->
|
||||
<!-- ================================================================================ -->
|
||||
|
||||
<!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
|
||||
<!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
|
||||
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
|
||||
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check>
|
||||
|
||||
<!-- This breaks symbolic method names so we don't turn it on. -->
|
||||
<!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. -->
|
||||
<check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
|
||||
<parameters>
|
||||
<parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
|
||||
</parameters>
|
||||
</check>
|
||||
|
||||
<!-- Should turn this on, but we have a few places that need to be fixed first -->
|
||||
<check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
|
||||
|
||||
<!-- ================================================================================ -->
|
||||
<!-- rules we don't want -->
|
||||
<!-- ================================================================================ -->
|
||||
|
||||
<check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false">
|
||||
<parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<!-- We want the opposite of this: NewLineAtEofChecker -->
|
||||
<check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
|
||||
|
||||
<!-- This one complains about all kinds of random things. Disable. -->
|
||||
<check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>
|
||||
|
||||
<!-- We use return quite a bit for control flows and guards -->
|
||||
<check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check>
|
||||
|
||||
<!-- We use null a lot in low level code and to interface with 3rd party code -->
|
||||
<check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check>
|
||||
|
||||
<!-- Doesn't seem super big deal here ... -->
|
||||
<check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check>
|
||||
|
||||
<!-- Doesn't seem super big deal here ... -->
|
||||
<check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false">
|
||||
<parameters><parameter name="maxFileLength">800></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<!-- Doesn't seem super big deal here ... -->
|
||||
<check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false">
|
||||
<parameters><parameter name="maxTypes">30</parameter></parameters>
|
||||
</check>
|
||||
|
||||
<!-- Doesn't seem super big deal here ... -->
|
||||
<check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false">
|
||||
<parameters><parameter name="maximum">10</parameter></parameters>
|
||||
</check>
|
||||
|
||||
<!-- Doesn't seem super big deal here ... -->
|
||||
<check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false">
|
||||
<parameters><parameter name="maxLength">50</parameter></parameters>
|
||||
</check>
|
||||
|
||||
<!-- Not exactly feasible to enforce this right now. -->
|
||||
<!-- It is also infrequent that somebody introduces a new class with a lot of methods. -->
|
||||
<check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false">
|
||||
<parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters>
|
||||
</check>
|
||||
|
||||
<!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... -->
|
||||
<check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
|
||||
<parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters>
|
||||
</check>
|
||||
|
||||
</scalastyle>
|
|
@ -0,0 +1,52 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package com.microsoft.tpch
|
||||
|
||||
import scala.util.Try
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
object App {
|
||||
def main(args: Array[String]): Unit = {
|
||||
if (args.length != 4) {
|
||||
println("Usage:")
|
||||
println("\t<spark-submit> --master local --class com.microsoft.tpch.App microsoft-spark-examples-<version>.jar")
|
||||
println("\t\t<tpch_data_root_path> <query_number> <num_iterations> <true for SQL | false for functional>")
|
||||
sys.exit(1)
}
|
||||
|
||||
val tpchRoot = args(0)
|
||||
val queryNumber = args(1).toInt
|
||||
val numIteration = args(2).toInt
|
||||
val isSql = Try(args(3).toBoolean).getOrElse(false)
|
||||
|
||||
for (i <- 0 until numIteration) {
|
||||
val spark = SparkSession
|
||||
.builder()
|
||||
.appName("TPC-H Benchmark for Scala")
|
||||
.getOrCreate()
|
||||
|
||||
val startTs = System.currentTimeMillis
|
||||
|
||||
if (!isSql) {
|
||||
val tpchFunctional = new TpchFunctionalQueries(spark, tpchRoot)
|
||||
tpchFunctional.run(queryNumber.toString)
|
||||
}
|
||||
else {
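// The SQL variant is not implemented in this Scala benchmark; when isSql is true, nothing runs in this branch.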
|
||||
}
|
||||
|
||||
val endTs = System.currentTimeMillis
|
||||
val totalTime = endTs - startTs
|
||||
|
||||
val typeStr = if (isSql) "SQL"
|
||||
else "Functional"
|
||||
|
||||
println(s"TPCH_Result,Scala,$typeStr,$queryNumber,$i,$totalTime")
|
||||
|
||||
spark.stop()
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package com.microsoft.tpch
|
||||
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
class TpchBase(spark: SparkSession, tpchRoot: String) {
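// Each TPC-H table is read from parquet at s"${tpchRoot}<table>", so tpchRoot is expected to end with a path separator.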
|
||||
val customer: DataFrame = spark.read.parquet(s"${tpchRoot}customer")
|
||||
val lineitem: DataFrame = spark.read.parquet(s"${tpchRoot}lineitem")
|
||||
val nation: DataFrame = spark.read.parquet(s"${tpchRoot}nation")
|
||||
val order: DataFrame = spark.read.parquet(s"${tpchRoot}orders")
|
||||
val part: DataFrame = spark.read.parquet(s"${tpchRoot}part")
|
||||
val partsupp: DataFrame = spark.read.parquet(s"${tpchRoot}partsupp")
|
||||
val region: DataFrame = spark.read.parquet(s"${tpchRoot}region")
|
||||
val supplier: DataFrame = spark.read.parquet(s"${tpchRoot}supplier")
|
||||
}
|
|
@ -0,0 +1,445 @@
|
|||
/*
|
||||
* Licensed to the .NET Foundation under one or more agreements.
|
||||
* The .NET Foundation licenses this file to you under the MIT license.
|
||||
* See the LICENSE file in the project root for more information.
|
||||
*/
|
||||
|
||||
package com.microsoft.tpch
|
||||
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.functions._
|
||||
|
||||
/*
|
||||
* Note that the queries are taken from: https://github.com/ssavvides/tpch-spark and updated.
|
||||
*/
|
||||
class TpchFunctionalQueries(spark: SparkSession, tpchRoot: String)
|
||||
extends TpchBase(spark, tpchRoot) {
|
||||
|
||||
import spark.implicits._
|
||||
|
||||
def run(queryNumber: String): Unit = {
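// Resolve the query method (q1 ... q22) by name via reflection and invoke it.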
|
||||
val method = this.getClass.getMethod("q" + queryNumber)
|
||||
method.invoke(this)
|
||||
}
|
||||
|
||||
def q1(): Unit = {
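// decrease computes the discounted price x * (1 - y); composing increase on top adds tax,
// mirroring Q1's sum_disc_price and sum_charge aggregates.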
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
val increase = udf { (x: Double, y: Double) => x * (1 + y) }
|
||||
|
||||
lineitem.filter($"l_shipdate" <= "1998-09-02")
|
||||
.groupBy($"l_returnflag", $"l_linestatus")
|
||||
.agg(sum($"l_quantity"), sum($"l_extendedprice"),
|
||||
sum(decrease($"l_extendedprice", $"l_discount")),
|
||||
sum(increase(decrease($"l_extendedprice", $"l_discount"), $"l_tax")),
|
||||
avg($"l_quantity"), avg($"l_extendedprice"), avg($"l_discount"), count($"l_quantity"))
|
||||
.sort($"l_returnflag", $"l_linestatus")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q2(): Unit = {
|
||||
val europe = region.filter($"r_name" === "EUROPE")
|
||||
.join(nation, $"r_regionkey" === nation("n_regionkey"))
|
||||
.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
|
||||
.join(partsupp, supplier("s_suppkey") === partsupp("ps_suppkey"))
|
||||
|
||||
val brass = part.filter(part("p_size") === 15 && part("p_type").endsWith("BRASS"))
|
||||
.join(europe, europe("ps_partkey") === $"p_partkey")
|
||||
|
||||
val minCost = brass.groupBy(brass("ps_partkey"))
|
||||
.agg(min("ps_supplycost").as("min"))
|
||||
|
||||
brass.join(minCost, brass("ps_partkey") === minCost("ps_partkey"))
|
||||
.filter(brass("ps_supplycost") === minCost("min"))
|
||||
.select("s_acctbal", "s_name", "n_name", "p_partkey", "p_mfgr", "s_address", "s_phone", "s_comment")
|
||||
.sort($"s_acctbal".desc, $"n_name", $"s_name", $"p_partkey")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q3(): Unit = {
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
|
||||
val fcust = customer.filter($"c_mktsegment" === "BUILDING")
|
||||
val forders = order.filter($"o_orderdate" < "1995-03-15")
|
||||
val flineitems = lineitem.filter($"l_shipdate" > "1995-03-15")
|
||||
|
||||
fcust.join(forders, $"c_custkey" === forders("o_custkey"))
|
||||
.select($"o_orderkey", $"o_orderdate", $"o_shippriority")
|
||||
.join(flineitems, $"o_orderkey" === flineitems("l_orderkey"))
|
||||
.select($"l_orderkey",
|
||||
decrease($"l_extendedprice", $"l_discount").as("volume"),
|
||||
$"o_orderdate", $"o_shippriority")
|
||||
.groupBy($"l_orderkey", $"o_orderdate", $"o_shippriority")
|
||||
.agg(sum($"volume").as("revenue"))
|
||||
.sort($"revenue".desc, $"o_orderdate")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q4(): Unit = {
|
||||
val forders = order.filter($"o_orderdate" >= "1993-07-01" && $"o_orderdate" < "1993-10-01")
|
||||
val flineitems = lineitem.filter($"l_commitdate" < $"l_receiptdate")
|
||||
.select($"l_orderkey")
|
||||
.distinct
|
||||
|
||||
flineitems.join(forders, $"l_orderkey" === forders("o_orderkey"))
|
||||
.groupBy($"o_orderpriority")
|
||||
.agg(count($"o_orderpriority"))
|
||||
.sort($"o_orderpriority")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q5(): Unit = {
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
|
||||
val forders = order.filter($"o_orderdate" < "1995-01-01" && $"o_orderdate" >= "1994-01-01")
|
||||
|
||||
region.filter($"r_name" === "ASIA")
|
||||
.join(nation, $"r_regionkey" === nation("n_regionkey"))
|
||||
.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
|
||||
.join(lineitem, $"s_suppkey" === lineitem("l_suppkey"))
|
||||
.select($"n_name", $"l_extendedprice", $"l_discount", $"l_orderkey", $"s_nationkey")
|
||||
.join(forders, $"l_orderkey" === forders("o_orderkey"))
|
||||
.join(customer, $"o_custkey" === customer("c_custkey") && $"s_nationkey" === customer("c_nationkey"))
|
||||
.select($"n_name", decrease($"l_extendedprice", $"l_discount").as("value"))
|
||||
.groupBy($"n_name")
|
||||
.agg(sum($"value").as("revenue"))
|
||||
.sort($"revenue".desc)
|
||||
.show()
|
||||
}
|
||||
|
||||
def q6(): Unit = {
|
||||
lineitem.filter($"l_shipdate" >= "1994-01-01" && $"l_shipdate" < "1995-01-01" && $"l_discount" >= 0.05 && $"l_discount" <= 0.07 && $"l_quantity" < 24)
|
||||
.agg(sum($"l_extendedprice" * $"l_discount"))
|
||||
.show()
|
||||
}
|
||||
|
||||
def q7(): Unit = {
|
||||
val getYear = udf { x: String => x.substring(0, 4) }
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
|
||||
val fnation = nation.filter($"n_name" === "FRANCE" || $"n_name" === "GERMANY")
|
||||
val fline = lineitem.filter($"l_shipdate" >= "1995-01-01" && $"l_shipdate" <= "1996-12-31")
|
||||
|
||||
val supNation = fnation.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
|
||||
.join(fline, $"s_suppkey" === fline("l_suppkey"))
|
||||
.select($"n_name".as("supp_nation"), $"l_orderkey", $"l_extendedprice", $"l_discount", $"l_shipdate")
|
||||
|
||||
fnation.join(customer, $"n_nationkey" === customer("c_nationkey"))
|
||||
.join(order, $"c_custkey" === order("o_custkey"))
|
||||
.select($"n_name".as("cust_nation"), $"o_orderkey")
|
||||
.join(supNation, $"o_orderkey" === supNation("l_orderkey"))
|
||||
.filter($"supp_nation" === "FRANCE" && $"cust_nation" === "GERMANY"
|
||||
|| $"supp_nation" === "GERMANY" && $"cust_nation" === "FRANCE")
|
||||
.select($"supp_nation", $"cust_nation",
|
||||
getYear($"l_shipdate").as("l_year"),
|
||||
decrease($"l_extendedprice", $"l_discount").as("volume"))
|
||||
.groupBy($"supp_nation", $"cust_nation", $"l_year")
|
||||
.agg(sum($"volume").as("revenue"))
|
||||
.sort($"supp_nation", $"cust_nation", $"l_year")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q8(): Unit = {
|
||||
val getYear = udf { x: String => x.substring(0, 4) }
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
val isBrazil = udf { (x: String, y: Double) => if (x == "BRAZIL") y else 0 }
|
||||
|
||||
val fregion = region.filter($"r_name" === "AMERICA")
|
||||
val forder = order.filter($"o_orderdate" <= "1996-12-31" && $"o_orderdate" >= "1995-01-01")
|
||||
val fpart = part.filter($"p_type" === "ECONOMY ANODIZED STEEL")
|
||||
|
||||
val nat = nation.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
|
||||
|
||||
val line = lineitem.select($"l_partkey", $"l_suppkey", $"l_orderkey",
|
||||
decrease($"l_extendedprice", $"l_discount").as("volume")).
|
||||
join(fpart, $"l_partkey" === fpart("p_partkey"))
|
||||
.join(nat, $"l_suppkey" === nat("s_suppkey"))
|
||||
|
||||
nation.join(fregion, $"n_regionkey" === fregion("r_regionkey"))
|
||||
.select($"n_nationkey")
|
||||
.join(customer, $"n_nationkey" === customer("c_nationkey"))
|
||||
.select($"c_custkey")
|
||||
.join(forder, $"c_custkey" === forder("o_custkey"))
|
||||
.select($"o_orderkey", $"o_orderdate")
|
||||
.join(line, $"o_orderkey" === line("l_orderkey"))
|
||||
.select(getYear($"o_orderdate").as("o_year"), $"volume",
|
||||
isBrazil($"n_name", $"volume").as("case_volume"))
|
||||
.groupBy($"o_year")
|
||||
.agg(sum($"case_volume") / sum("volume"))
|
||||
.sort($"o_year")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q9(): Unit = {
|
||||
val getYear = udf { x: String => x.substring(0, 4) }
|
||||
val expr = udf { (x: Double, y: Double, v: Double, w: Double) => x * (1 - y) - (v * w) }
|
||||
|
||||
val linePart = part.filter($"p_name".contains("green"))
|
||||
.join(lineitem, $"p_partkey" === lineitem("l_partkey"))
|
||||
|
||||
val natSup = nation.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
|
||||
|
||||
linePart.join(natSup, $"l_suppkey" === natSup("s_suppkey"))
|
||||
.join(partsupp, $"l_suppkey" === partsupp("ps_suppkey")
|
||||
&& $"l_partkey" === partsupp("ps_partkey"))
|
||||
.join(order, $"l_orderkey" === order("o_orderkey"))
|
||||
.select($"n_name", getYear($"o_orderdate").as("o_year"),
|
||||
expr($"l_extendedprice", $"l_discount", $"ps_supplycost", $"l_quantity").as("amount"))
|
||||
.groupBy($"n_name", $"o_year")
|
||||
.agg(sum($"amount"))
|
||||
.sort($"n_name", $"o_year".desc)
|
||||
.show()
|
||||
}
|
||||
|
||||
def q10(): Unit = {
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
|
||||
val flineitem = lineitem.filter($"l_returnflag" === "R")
|
||||
|
||||
order.filter($"o_orderdate" < "1994-01-01" && $"o_orderdate" >= "1993-10-01")
|
||||
.join(customer, $"o_custkey" === customer("c_custkey"))
|
||||
.join(nation, $"c_nationkey" === nation("n_nationkey"))
|
||||
.join(flineitem, $"o_orderkey" === flineitem("l_orderkey"))
|
||||
.select($"c_custkey", $"c_name",
|
||||
decrease($"l_extendedprice", $"l_discount").as("volume"),
|
||||
$"c_acctbal", $"n_name", $"c_address", $"c_phone", $"c_comment")
|
||||
.groupBy($"c_custkey", $"c_name", $"c_acctbal", $"c_phone", $"n_name", $"c_address", $"c_comment")
|
||||
.agg(sum($"volume").as("revenue"))
|
||||
.sort($"revenue".desc)
|
||||
.show()
|
||||
}
|
||||
|
||||
def q11(): Unit = {
|
||||
val mul = udf { (x: Double, y: Int) => x * y }
|
||||
val mul01 = udf { x: Double => x * 0.0001 }
|
||||
|
||||
val tmp = nation.filter($"n_name" === "GERMANY")
|
||||
.join(supplier, $"n_nationkey" === supplier("s_nationkey"))
|
||||
.select($"s_suppkey")
|
||||
.join(partsupp, $"s_suppkey" === partsupp("ps_suppkey"))
|
||||
.select($"ps_partkey", mul($"ps_supplycost", $"ps_availqty").as("value"))
|
||||
// .cache()
|
||||
|
||||
val sumRes = tmp.agg(sum("value").as("total_value"))
|
||||
|
||||
tmp.groupBy($"ps_partkey").agg(sum("value").as("part_value"))
|
||||
.join(sumRes, $"part_value" > mul01($"total_value"))
|
||||
.sort($"part_value".desc)
|
||||
.show()
|
||||
}
|
||||
|
||||
def q12(): Unit = {
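// highPriority/lowPriority mirror Q12's CASE expressions: count urgent/high-priority orders vs. the rest.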
|
||||
val highPriority = udf { x: String => if (x == "1-URGENT" || x == "2-HIGH") 1 else 0 }
|
||||
val lowPriority = udf { x: String => if (x != "1-URGENT" && x != "2-HIGH") 1 else 0 }
|
||||
|
||||
lineitem.filter((
|
||||
$"l_shipmode" === "MAIL" || $"l_shipmode" === "SHIP") &&
|
||||
$"l_commitdate" < $"l_receiptdate" &&
|
||||
$"l_shipdate" < $"l_commitdate" &&
|
||||
$"l_receiptdate" >= "1994-01-01" && $"l_receiptdate" < "1995-01-01")
|
||||
.join(order, $"l_orderkey" === order("o_orderkey"))
|
||||
.select($"l_shipmode", $"o_orderpriority")
|
||||
.groupBy($"l_shipmode")
|
||||
.agg(sum(highPriority($"o_orderpriority")).as("sum_highorderpriority"),
|
||||
sum(lowPriority($"o_orderpriority")).as("sum_loworderpriority"))
|
||||
.sort($"l_shipmode")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q13(): Unit = {
|
||||
val special = udf { x: String => x.matches(".*special.*requests.*") }
|
||||
|
||||
customer.join(order, $"c_custkey" === order("o_custkey")
|
||||
&& !special(order("o_comment")), "left_outer")
|
||||
.groupBy($"o_custkey")
|
||||
.agg(count($"o_orderkey").as("c_count"))
|
||||
.groupBy($"c_count")
|
||||
.agg(count($"o_custkey").as("custdist"))
|
||||
.sort($"custdist".desc, $"c_count".desc)
|
||||
.show()
|
||||
}
|
||||
|
||||
def q14(): Unit = {
|
||||
val reduce = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
val promo = udf { (x: String, y: Double) => if (x.startsWith("PROMO")) y else 0 }
|
||||
|
||||
part.join(lineitem, $"l_partkey" === $"p_partkey" &&
|
||||
$"l_shipdate" >= "1995-09-01" && $"l_shipdate" < "1995-10-01")
|
||||
.select($"p_type", reduce($"l_extendedprice", $"l_discount").as("value"))
|
||||
.agg(sum(promo($"p_type", $"value")) * 100 / sum($"value"))
|
||||
.show()
|
||||
}
|
||||
|
||||
def q15(): Unit = {
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
|
||||
val revenue = lineitem.filter($"l_shipdate" >= "1996-01-01" &&
|
||||
$"l_shipdate" < "1996-04-01")
|
||||
.select($"l_suppkey", decrease($"l_extendedprice", $"l_discount").as("value"))
|
||||
.groupBy($"l_suppkey")
|
||||
.agg(sum($"value").as("total"))
|
||||
|
||||
revenue.agg(max($"total").as("max_total"))
|
||||
.join(revenue, $"max_total" === revenue("total"))
|
||||
.join(supplier, $"l_suppkey" === supplier("s_suppkey"))
|
||||
.select($"s_suppkey", $"s_name", $"s_address", $"s_phone", $"total")
|
||||
.sort($"s_suppkey")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q16(): Unit = {
|
||||
val complains = udf { x: String => x.matches(".*Customer.*Complaints.*") }
|
||||
val polished = udf { x: String => x.startsWith("MEDIUM POLISHED") }
|
||||
val numbers = udf { x: Int => x.toString.matches("49|14|23|45|19|3|36|9") }
|
||||
|
||||
val fparts = part.filter(($"p_brand" =!= "Brand#45") && !polished($"p_type") &&
|
||||
numbers($"p_size"))
|
||||
.select($"p_partkey", $"p_brand", $"p_type", $"p_size")
|
||||
|
||||
supplier.filter(!complains($"s_comment"))
|
||||
.join(partsupp, $"s_suppkey" === partsupp("ps_suppkey"))
|
||||
.select($"ps_partkey", $"ps_suppkey")
|
||||
.join(fparts, $"ps_partkey" === fparts("p_partkey"))
|
||||
.groupBy($"p_brand", $"p_type", $"p_size")
|
||||
.agg(countDistinct($"ps_suppkey").as("supplier_count"))
|
||||
.sort($"supplier_count".desc, $"p_brand", $"p_type", $"p_size")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q17(): Unit = {
|
||||
val mul02 = udf { x: Double => x * 0.2 }
|
||||
|
||||
val fpart = part.filter($"p_brand" === "Brand#23" && $"p_container" === "MED BOX")
|
||||
.select($"p_partkey")
|
||||
.join(lineitem, $"p_partkey" === lineitem("l_partkey"), "left_outer")
|
||||
|
||||
fpart.groupBy("p_partkey")
|
||||
.agg(mul02(avg($"l_quantity")).as("avg_quantity"))
|
||||
.select($"p_partkey".as("key"), $"avg_quantity")
|
||||
.join(fpart, $"key" === fpart("p_partkey"))
|
||||
.filter($"l_quantity" < $"avg_quantity")
|
||||
.agg(sum($"l_extendedprice") / 7.0)
|
||||
.show()
|
||||
}
|
||||
|
||||
def q18(): Unit = {
|
||||
lineitem.groupBy($"l_orderkey")
|
||||
.agg(sum($"l_quantity").as("sum_quantity"))
|
||||
.filter($"sum_quantity" > 300)
|
||||
.select($"l_orderkey".as("key"), $"sum_quantity")
|
||||
.join(order, order("o_orderkey") === $"key")
|
||||
.join(lineitem, $"o_orderkey" === lineitem("l_orderkey"))
|
||||
.join(customer, customer("c_custkey") === $"o_custkey")
|
||||
.select($"l_quantity", $"c_name", $"c_custkey", $"o_orderkey", $"o_orderdate", $"o_totalprice")
|
||||
.groupBy($"c_name", $"c_custkey", $"o_orderkey", $"o_orderdate", $"o_totalprice")
|
||||
.agg(sum("l_quantity"))
|
||||
.sort($"o_totalprice".desc, $"o_orderdate")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q19(): Unit = {
|
||||
val sm = udf { x: String => x.matches("SM CASE|SM BOX|SM PACK|SM PKG") }
|
||||
val md = udf { x: String => x.matches("MED BAG|MED BOX|MED PKG|MED PACK") }
|
||||
val lg = udf { x: String => x.matches("LG CASE|LG BOX|LG PACK|LG PKG") }
|
||||
|
||||
val decrease = udf { (x: Double, y: Double) => x * (1 - y) }
|
||||
|
||||
part.join(lineitem, $"l_partkey" === $"p_partkey")
|
||||
.filter(($"l_shipmode" === "AIR" || $"l_shipmode" === "AIR REG") &&
|
||||
$"l_shipinstruct" === "DELIVER IN PERSON")
|
||||
.filter(
|
||||
(($"p_brand" === "Brand#12") &&
|
||||
sm($"p_container") &&
|
||||
$"l_quantity" >= 1 && $"l_quantity" <= 11 &&
|
||||
$"p_size" >= 1 && $"p_size" <= 5) ||
|
||||
(($"p_brand" === "Brand#23") &&
|
||||
md($"p_container") &&
|
||||
$"l_quantity" >= 10 && $"l_quantity" <= 20 &&
|
||||
$"p_size" >= 1 && $"p_size" <= 10) ||
|
||||
(($"p_brand" === "Brand#34") &&
|
||||
lg($"p_container") &&
|
||||
$"l_quantity" >= 20 && $"l_quantity" <= 30 &&
|
||||
$"p_size" >= 1 && $"p_size" <= 15))
|
||||
.select(decrease($"l_extendedprice", $"l_discount").as("volume"))
|
||||
.agg(sum("volume"))
|
||||
.show()
|
||||
}
|
||||
|
||||
def q20(): Unit = {
|
||||
val forest = udf { x: String => x.startsWith("forest") }
|
||||
|
||||
val flineitem = lineitem.filter($"l_shipdate" >= "1994-01-01" && $"l_shipdate" < "1995-01-01")
|
||||
.groupBy($"l_partkey", $"l_suppkey")
|
||||
.agg((sum($"l_quantity") * 0.5).as("sum_quantity"))
|
||||
|
||||
val fnation = nation.filter($"n_name" === "CANADA")
|
||||
val nat_supp = supplier.select($"s_suppkey", $"s_name", $"s_nationkey", $"s_address")
|
||||
.join(fnation, $"s_nationkey" === fnation("n_nationkey"))
|
||||
|
||||
part.filter(forest($"p_name"))
|
||||
.select($"p_partkey").distinct
|
||||
.join(partsupp, $"p_partkey" === partsupp("ps_partkey"))
|
||||
.join(flineitem, $"ps_suppkey" === flineitem("l_suppkey") && $"ps_partkey" === flineitem("l_partkey"))
|
||||
.filter($"ps_availqty" > $"sum_quantity")
|
||||
.select($"ps_suppkey").distinct
|
||||
.join(nat_supp, $"ps_suppkey" === nat_supp("s_suppkey"))
|
||||
.select($"s_name", $"s_address")
|
||||
.sort($"s_name")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q21(): Unit = {
|
||||
val fsupplier = supplier.select($"s_suppkey", $"s_nationkey", $"s_name")
|
||||
|
||||
val plineitem = lineitem.select($"l_suppkey", $"l_orderkey", $"l_receiptdate", $"l_commitdate")
|
||||
|
||||
val flineitem = plineitem.filter($"l_receiptdate" > $"l_commitdate")
|
||||
|
||||
val line1 = plineitem.groupBy($"l_orderkey")
|
||||
.agg(countDistinct($"l_suppkey").as("suppkey_count"), max($"l_suppkey").as("suppkey_max"))
|
||||
.select($"l_orderkey".as("key"), $"suppkey_count", $"suppkey_max")
|
||||
|
||||
val line2 = flineitem.groupBy($"l_orderkey")
|
||||
.agg(countDistinct($"l_suppkey").as("suppkey_count"), max($"l_suppkey").as("suppkey_max"))
|
||||
.select($"l_orderkey".as("key"), $"suppkey_count", $"suppkey_max")
|
||||
|
||||
val forder = order.select($"o_orderkey", $"o_orderstatus")
|
||||
.filter($"o_orderstatus" === "F")
|
||||
|
||||
nation.filter($"n_name" === "SAUDI ARABIA")
|
||||
.join(fsupplier, $"n_nationkey" === fsupplier("s_nationkey"))
|
||||
.join(flineitem, $"s_suppkey" === flineitem("l_suppkey"))
|
||||
.join(forder, $"l_orderkey" === forder("o_orderkey"))
|
||||
.join(line1, $"l_orderkey" === line1("key"))
|
||||
.filter($"suppkey_count" > 1 || ($"suppkey_count" == 1 && $"l_suppkey" == $"max_suppkey"))
|
||||
.select($"s_name", $"l_orderkey", $"l_suppkey")
|
||||
.join(line2, $"l_orderkey" === line2("key"), "left_outer")
|
||||
.select($"s_name", $"l_orderkey", $"l_suppkey", $"suppkey_count", $"suppkey_max")
|
||||
.filter($"suppkey_count" === 1 && $"l_suppkey" === $"suppkey_max")
|
||||
.groupBy($"s_name")
|
||||
.agg(count($"l_suppkey").as("numwait"))
|
||||
.sort($"numwait".desc, $"s_name")
|
||||
.show()
|
||||
}
|
||||
|
||||
def q22(): Unit = {
|
||||
val sub2 = udf { x: String => x.substring(0, 2) }
|
||||
val phone = udf { x: String => x.matches("13|31|23|29|30|18|17") }
|
||||
|
||||
val fcustomer = customer.select($"c_acctbal", $"c_custkey", sub2($"c_phone").as("cntrycode"))
|
||||
.filter(phone($"cntrycode"))
|
||||
|
||||
val avg_customer = fcustomer.filter($"c_acctbal" > 0.0)
|
||||
.agg(avg($"c_acctbal").as("avg_acctbal"))
|
||||
|
||||
order.groupBy($"o_custkey")
|
||||
.agg($"o_custkey").select($"o_custkey")
|
||||
.join(fcustomer, $"o_custkey" === fcustomer("c_custkey"), "right_outer")
|
||||
.filter($"o_custkey".isNull)
|
||||
.join(avg_customer)
|
||||
.filter($"c_acctbal" > $"avg_acctbal")
|
||||
.groupBy($"cntrycode")
|
||||
.agg(count($"c_acctbal"), sum($"c_acctbal"))
|
||||
.sort($"cntrycode")
|
||||
.show()
|
||||
}
|
||||
}
@ -0,0 +1,204 @@
|
|||
Deploying your App on the Cloud
|
||||
===
|
||||
|
||||
# Table of Contents
|
||||
- [Pre-requisites](#pre-requisites)
|
||||
- [Preparing Worker Dependencies](#preparing-worker-dependencies)
|
||||
- [Cloud Deployment](#cloud-deployment)
|
||||
- [Azure HDInsight Spark](#azure-hdinsight-spark)
|
||||
- [Deploy Worker to Spark Cluster](#deploy-microsoftsparkworker)
|
||||
- [App deployment using spark-submit](#using-spark-submit)
|
||||
- [App deployment using Apache Livy](#using-apache-livy)
|
||||
- [Amazon EMR Spark](#amazon-emr-spark)
|
||||
- [Deploy Worker to Spark Cluster](#deploy-microsoftsparkworker-1)
|
||||
- [App deployment using spark-submit](#using-spark-submit-1)
|
||||
- [App deployment using Amazon EMR Steps](#using-amazon-emr-steps)
|
||||
- [Databricks (Azure & AWS)](#databricks)
|
||||
- [Deploy Worker to Spark Cluster](#deploy-microsoftsparkworker-2)
|
||||
- [App deployment using spark-submit](#using-spark-submit-2)
|
||||
|
||||
# Pre-requisites:
|
||||
1. Clone and successfully build [Spark .NET](https://github.com/dotnet/spark) by following the [Quick Start instructions](https://github.com/dotnet/spark#quick-start-tldr).
|
||||
2. Download and install [.NET Core](https://dotnet.microsoft.com/download) <span style="color: red">2.1+</span> for your operating system.
|
||||
3. Tool for creating a `tgz` file: `tar` on Linux, [7-ZIP](https://www.7-zip.org/) on Windows, etc.
|
||||
4. Tool to copy files to a distributed file system.
|
||||
- ADLS, WASB → [Azure Storage Explorer](https://azure.microsoft.com/en-us/features/storage-explorer/)
|
||||
- S3 → [AWS CLI](https://aws.amazon.com/cli/)
|
||||
5. Download [install-worker.sh](install-worker.sh) to your local machine. This is a helper script that we will use later in the installation section to copy Spark .NET dependent files into your Spark cluster's worker nodes. install-worker.sh takes in three parameters:
|
||||
1. The Cloud Provider: `azure` or `aws`
|
||||
2. URI where `worker.tgz` is uploaded.
|
||||
3. Path on the executor node where the worker package will be installed (the path should be the directory that `yarn` user has access to).
|
||||
|
||||
Example Usage:
|
||||
```shell
|
||||
install-worker.sh azure adl://<cluster name>.azuredatalakestore.net/<some dir>/worker.tgz /usr/local/bin
|
||||
```
|
||||
|
||||
# Preparing Worker Dependencies
|
||||
Microsoft.Spark.Worker is a backend component that lives on the individual worker nodes of your Spark cluster. When you want to execute a C# UDF (user-defined function), Spark needs to understand how to launch the .NET CLR to execute this UDF. Microsoft.Spark.Worker provides a collection of classes to Spark that enable this functionality.
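For reference, such a UDF is just a .NET lambda registered through the `Udf` API used throughout this repo's examples. The sketch below is illustrative only (the class name, app name, and input path are placeholders, and the calls mirror the repo's Basic.cs example); the lambda passed to `Udf` is the code that Microsoft.Spark.Worker ends up executing in the .NET CLR on each worker node.

```csharp
// Minimal sketch, modeled on the repo's Basic.cs example; names and paths are illustrative.
using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

internal class UdfSketch
{
    public static void Main(string[] args)
    {
        SparkSession spark = SparkSession
            .Builder()
            .AppName("udf-sketch")
            .GetOrCreate();

        // Any JSON input that the cluster can read works here.
        DataFrame df = spark.Read().Json(args[0]);

        // The lambda below runs in the .NET worker process launched by
        // Microsoft.Spark.Worker, not inside the JVM executor.
        Func<Column, Column> greet = Udf<string, string>(name => $"hello {name}!");
        df.Select(greet(df["name"])).Show();

        spark.Stop();
    }
}
```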
|
||||
|
||||
## Microsoft.Spark.Worker
|
||||
1. Publish Microsoft.Spark.Worker as self-contained.
|
||||
```shell
|
||||
# For example, you can run the following on Linux.
|
||||
foo@bar:~/dotnet/spark/src/csharp/Microsoft.Spark.Worker$ dotnet publish -c Release -f netcoreapp2.1 -r ubuntu.16.04-x64
|
||||
```
|
||||
> **Note**: Ensure that the correct [dotnet Runtime Identifier](https://github.com/dotnet/corefx/blob/master/pkg/Microsoft.NETCore.Platforms/runtime.json) is used for your cluster.
|
||||
|
||||
2. Produce `worker.tgz` for the published files.
|
||||
```shell
|
||||
# For example, you can run the following on Linux using `tar`.
|
||||
foo@bar:~/dotnet/spark/src/csharp/Microsoft.Spark.Worker$ tar czvf worker.tgz -C bin/Release/netcoreapp2.1/ubuntu.16.04-x64/publish/ .
|
||||
```
|
||||
|
||||
3. Upload `worker.tgz` and [install-worker.sh](install-worker.sh) to a distributed file system (e.g., HDFS, WASB, ADLS, S3) that your cluster has access to.
|
||||
|
||||
## Your Spark .NET `app`
|
||||
1. Publish your Spark .NET `app` as self-contained.
|
||||
```shell
|
||||
# For example, you can run the following on Linux.
|
||||
foo@bar:~/path/to/app$ dotnet publish -c Release -f netcoreapp2.1 -r ubuntu.16.04-x64
|
||||
```
|
||||
2. Produce `<your app>.zip` for the published files.
|
||||
```shell
|
||||
# For example, you can run the following on Linux using `zip`.
|
||||
foo@bar:~/path/to/app/bin/Release/netcoreapp2.1/ubuntu.16.04-x64/publish$ zip -r <your app>.zip .
|
||||
```
|
||||
3. Upload the following to a distributed file system (e.g., HDFS, WASB, ADLS, S3) that your cluster has access to:
|
||||
* `microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar` (created in the [Build](../README.md#build) step)
|
||||
* `<your app>.zip`
|
||||
* Files (e.g., dependency files, common data accessible to every worker) or Assemblies (e.g., DLLs that contain your user-defined functions, libraries that your `app` depends on) to be placed in the working directory of each executor.
|
||||
|
||||
# Cloud Deployment
|
||||
## Azure HDInsight Spark
|
||||
[Azure HDInsight Spark](https://docs.microsoft.com/en-us/azure/hdinsight/spark/apache-spark-overview) is the Microsoft implementation of Apache Spark in the cloud that allows users to launch and configure Spark clusters in Azure. You can use HDInsight Spark clusters to process your data stored in Azure (e.g., [Azure Storage](https://azure.microsoft.com/en-us/services/storage/) and [Azure Data Lake Storage](https://docs.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-introduction)).
|
||||
|
||||
> **Note**: Azure HDInsight Spark is Linux-based. Therefore, if you are interested in deploying your app to Azure HDInsight Spark, make sure your app is .NET Standard compatible and that you use [.NET Core compiler](https://dotnet.microsoft.com/download) to compile your app.
|
||||
|
||||
### Deploy Microsoft.Spark.Worker
|
||||
*Note that this step is required only once*
|
||||
|
||||
#### Run HDInsight Script Action
|
||||
Run `install-worker.sh` on the cluster using [HDInsight Script Actions](https://docs.microsoft.com/en-us/azure/hdinsight/hdinsight-hadoop-customize-cluster-linux):
|
||||
|
||||
* Script type: Custom
|
||||
* Name: Install Microsoft.Spark.Worker (or anything that is descriptive)
|
||||
* Bash script URI: The URI to which you uploaded `install-worker.sh` (e.g. adl://\<cluster name\>.azuredatalakestore.net/\<some dir\>/install-worker.sh)
|
||||
* Node type(s): Worker
|
||||
* Parameters: Parameters to `install-worker.sh`. For example, if you uploaded to Azure Data Lake then it would be `azure adl://<cluster name>.azuredatalakestore.net/<some dir>/worker.tgz /usr/local/bin`.
|
||||
|
||||
The following image captures the settings for an HDInsight Script Action:
|
||||
|
||||
<img src="../docs/img/deployment-hdi-action-script.png" alt="ScriptActionImage" width="500"/>
|
||||
|
||||
### Run your app on the cloud!
|
||||
#### Using [spark-submit](https://spark.apache.org/docs/latest/submitting-applications.html)
|
||||
1. `ssh` into one of the head nodes in the cluster.
|
||||
2. Run `spark-submit`:
|
||||
```shell
|
||||
foo@bar:~$ $SPARK_HOME/bin/spark-submit \
|
||||
--master yarn \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--files <comma-separated list of assemblies that contain UDF definitions, if any> \
|
||||
adl://<cluster name>.azuredatalakestore.net/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar \
|
||||
adl://<cluster name>.azuredatalakestore.net/<some dir>/<your app>.zip <your app> <app arg 1> <app arg 2> ... <app arg n>
|
||||
```
|
||||
|
||||
#### Using [Apache Livy](https://livy.incubator.apache.org/)
|
||||
You can use Apache Livy, the Apache Spark REST API, to submit Spark .NET jobs to an Azure HDInsight Spark cluster as documented in [Remote jobs with Apache Livy](https://docs.microsoft.com/en-us/azure/hdinsight/spark/apache-spark-livy-rest-interface).
|
||||
```shell
|
||||
# For example, you can run the following on Linux using `curl`.
|
||||
foo@bar:~$ curl -k -v -X POST "https://<your spark cluster>.azurehdinsight.net/livy/batches" \
|
||||
-u "<hdinsight username>:<hdinsight password>" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "X-Requested-By: <hdinsight username>" \
|
||||
-d @- << EOF
|
||||
{
|
||||
"file":"adl://<cluster name>.azuredatalakestore.net/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar",
|
||||
"className":"org.apache.spark.deploy.DotnetRunner",
|
||||
"files":["adl://<cluster name>.azuredatalakestore.net/<some dir>/<udf assembly>", "adl://<cluster name>.azuredatalakestore.net/<some dir>/<file>"],
|
||||
"args":["adl://<cluster name>.azuredatalakestore.net/<some dir>/<your app>.zip","<your app>","<app arg 1>","<app arg 2>,"...","<app arg n>"]
|
||||
}
|
||||
EOF
|
||||
```
|
||||
|
||||
## Amazon EMR Spark
|
||||
[Amazon EMR](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-what-is-emr.html) is a managed cluster platform that simplifies running big data frameworks on AWS.
|
||||
|
||||
> **Note**: AWS EMR Spark is Linux-based. Therefore, if you are interested in deploying your app to AWS EMR Spark, make sure your app is .NET Standard compatible and that you use [.NET Core compiler](https://dotnet.microsoft.com/download) to compile your app.
|
||||
|
||||
### Deploy Microsoft.Spark.Worker
|
||||
*Note that this step is only required at cluster creation*
|
||||
|
||||
#### Create cluster using Amazon EMR Bootstrap Actions
|
||||
Run `install-worker.sh` during cluster creation using [Bootstrap Actions](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-bootstrap.html).
|
||||
```shell
|
||||
# For example, you can run the following on Linux using `aws` cli.
|
||||
foo@bar:~$ aws emr create-cluster \
|
||||
--name "Test cluster" \
|
||||
--release-label emr-5.23.0 \
|
||||
--use-default-roles \
|
||||
--ec2-attributes KeyName=myKey \
|
||||
--applications Name=Spark \
|
||||
--instance-count 3 \
|
||||
--instance-type m1.medium \
|
||||
--bootstrap-actions Path=s3://mybucket/<some dir>/install-worker.sh,Name="Install Microsoft.Spark.Worker",Args=["aws","s3://mybucket/<some dir>/worker.tgz","/usr/local/bin"]
|
||||
```
|
||||
|
||||
### Run your app on the cloud!
|
||||
Upload the following to an S3 bucket your cluster has access to:
|
||||
* `microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar` (created in the [Build](../README.md#build) step)
|
||||
* `<your app>.zip`
|
||||
* Files (e.g., dependency files, common data accessible to every worker) or Assemblies (e.g., DLLs that contain your user-defined functions, libraries that your `app` depends on) to be placed in the working directory of each executor.
|
||||
|
||||
#### Using [spark-submit](https://spark.apache.org/docs/latest/submitting-applications.html)
|
||||
1. `ssh` into one of the nodes in the cluster.
|
||||
2. Run `spark-submit`:
|
||||
```shell
|
||||
foo@bar:~$ spark-submit \
|
||||
--master yarn \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--files <comma-separated list of assemblies that contain UDF definitions, if any> \
|
||||
s3://mybucket/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar \
|
||||
s3://mybucket/<some dir>/<your app>.zip <your app> <app args>
|
||||
```
|
||||
|
||||
#### Using [Amazon EMR Steps](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-submit-step.html)
|
||||
Amazon EMR Steps can be used to submit jobs to the Spark framework installed on the EMR cluster.
|
||||
```bash
|
||||
# For example, you can run the following on Linux using `aws` cli.
|
||||
foo@bar:~$ aws emr add-steps \
|
||||
--cluster-id j-xxxxxxxxxxxxx \
|
||||
--steps Type=spark,Name="Spark Program",Args=[--master,yarn,--files,s3://mybucket/<some dir>/<udf assembly>,--class,org.apache.spark.deploy.DotnetRunner,s3://mybucket/<some dir>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar,s3://mybucket/<some dir>/<your app>.zip,<your app>,<app arg 1>,<app arg 2>,...,<app arg n>],ActionOnFailure=CONTINUE
|
||||
```
|
||||
|
||||
## Databricks
|
||||
[Databricks](http://databricks.com) is a platform that provides cloud-based big data processing using Apache Spark.
|
||||
|
||||
> **Note**: [Azure](https://azure.microsoft.com/en-us/services/databricks/) and [AWS](https://databricks.com/aws) Databricks is Linux-based. Therefore, if you are interested in deploying your app to Databricks, make sure your app is .NET Standard compatible and that you use [.NET Core compiler](https://dotnet.microsoft.com/download) to compile your app.
|
||||
|
||||
### Deploy Microsoft.Spark.Worker
|
||||
*Note that this step is required only once*
|
||||
|
||||
#### Cluster Node Initialization Scripts
|
||||
Using Databricks' [init script](https://docs.databricks.com/user-guide/clusters/init-scripts.html) mechanism, we will run a shell script during startup on each cluster node before the Spark driver or worker JVM starts.
|
||||
|
||||
1. Configure your [Data Source](https://docs.databricks.com/spark/latest/data-sources/index.html) and mount it using [Databricks File System](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html#dbfs).
|
||||
2. Use the following [init script](https://docs.databricks.com/user-guide/clusters/init-scripts.html) to install `Microsoft.Spark.Worker` on the cluster nodes.
|
||||
```scala
|
||||
dbutils.fs.put("dbfs:/databricks/<cluster-scoped or global path>/install-worker-wrapper.sh" ,"""
|
||||
#!/bin/bash
|
||||
set +e
|
||||
|
||||
/bin/bash /dbfs/<your mount>/<path to>/install-worker.sh local /dbfs/<your mount>/<path to>/worker.tgz /usr/local/bin
|
||||
""", true)
|
||||
```
|
||||
3. Restart the cluster.
|
||||
|
||||
### Run your app on the cloud!
|
||||
#### Using [spark-submit](https://spark.apache.org/docs/latest/submitting-applications.html)
|
||||
1. [Create a Job](https://docs.databricks.com/user-guide/jobs.html) and select *Configure spark-submit*.
|
||||
2. Configure `spark-submit` with the following parameters:
|
||||
```shell
|
||||
["--files","/dbfs/<your mount>/<path-to>/<app assembly/file to deploy to worker>","--class"," org.apache.spark.deploy.DotnetRunner","/dbfs/<your mount>/<path to>/microsoft-spark-<spark_majorversion.spark_minorversion.x>-<spark_dotnet_version>.jar","/dbfs/<your mount>/<path to>/<app name>.zip","<app bin name>","app arg1","app arg2"]
|
||||
```
@ -0,0 +1,40 @@
|
|||
#!/bin/bash
|
||||
set +e
|
||||
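# Usage (the three parameters are described in the cloud deployment documentation):
#   install-worker.sh <cloud provider: azure, aws, or any other value for a local copy> <URI/path of worker.tgz> <install dir on the node>
# Example:
#   install-worker.sh azure adl://<cluster name>.azuredatalakestore.net/<some dir>/worker.tgz /usr/local/bin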
|
||||
# Cloud Provider
|
||||
CLOUD_PROVIDER=$1
|
||||
# Path where packaged worker file (tgz) exists.
|
||||
WORKER_PATH=$2
|
||||
# The path on the executor nodes where Microsoft.Spark.Worker executable is installed.
|
||||
DEST_PATH=$3
|
||||
# The path where all the dependent libraries are installed so that it doesn't
|
||||
# pollute the $DEST_PATH.
|
||||
DEST_PATH_BINARIES=$DEST_PATH/microsoft.spark.worker
|
||||
# Temporary worker file.
|
||||
TEMP_WORKER_FILENAME=/tmp/temp_worker.tgz
|
||||
|
||||
# Clean up any existing files.
|
||||
sudo rm -f $DEST_PATH/Microsoft.Spark.Worker
|
||||
sudo rm -rf $DEST_PATH_BINARIES
|
||||
|
||||
# Copy the worker file to a local temporary file.
|
||||
if [ "${CLOUD_PROVIDER,,}" = "azure" ]; then
|
||||
hdfs dfs -get $WORKER_PATH $TEMP_WORKER_FILENAME
|
||||
elif [ "${CLOUD_PROVIDER,,}" = "aws" ]; then
|
||||
aws s3 cp $WORKER_PATH $TEMP_WORKER_FILENAME
|
||||
else
|
||||
cp -f $WORKER_PATH $TEMP_WORKER_FILENAME
|
||||
fi
|
||||
|
||||
# Untar the file.
|
||||
sudo mkdir -p $DEST_PATH_BINARIES
|
||||
sudo tar xzf $TEMP_WORKER_FILENAME -C $DEST_PATH_BINARIES
|
||||
|
||||
# Make the file executable since dotnet doesn't set this correctly.
|
||||
sudo chmod 755 $DEST_PATH_BINARIES/Microsoft.Spark.Worker
|
||||
|
||||
# Create a symlink.
|
||||
sudo ln -sf $DEST_PATH_BINARIES/Microsoft.Spark.Worker $DEST_PATH/Microsoft.Spark.Worker
|
||||
|
||||
# Remove the temporary worker file.
|
||||
sudo rm $TEMP_WORKER_FILENAME
@ -0,0 +1,14 @@
|
|||
# The following configs are taken from https://github.com/apache/spark/blob/master/dev/.scalafmt.conf
|
||||
align = none
|
||||
align.openParenDefnSite = false
|
||||
align.openParenCallSite = false
|
||||
align.tokens = []
|
||||
optIn = {
|
||||
configStyleArguments = false
|
||||
}
|
||||
danglingParentheses = false
|
||||
docstrings = JavaDoc
|
||||
maxColumn = 98
|
||||
|
||||
# The following are specific to dotnet/spark.
|
||||
importSelectors = singleLine
@ -0,0 +1,247 @@
|
|||
Building Spark .NET on Ubuntu 18.04
|
||||
==========================
|
||||
|
||||
# Table of Contents
|
||||
- [Open Issues](#open-issues)
|
||||
- [Pre-requisites](#pre-requisites)
|
||||
- [Building](#building)
|
||||
- [Building Spark .NET Scala Extensions Layer](#building-spark-net-scala-extensions-layer)
|
||||
- [Building .NET Sample Applications using .NET Core CLI](#building-net-sample-applications-using-net-core-cli)
|
||||
- [Run Samples](#run-samples)
|
||||
|
||||
# Open Issues:
|
||||
- [Building through Visual Studio Code]()
|
||||
|
||||
# Pre-requisites:
|
||||
|
||||
If you already have all the pre-requisites, skip to the [build](ubuntu-instructions.md#building) steps below.
|
||||
|
||||
1. Download and install **[.NET Core 2.1 SDK](https://dotnet.microsoft.com/download/dotnet-core/2.1)** or the **[.NET Core 3.0 preview SDK](https://dotnet.microsoft.com/download/dotnet-core/3.0)** - installing the SDK will add the `dotnet` toolchain to your path.
|
||||
2. Install **[OpenJDK 8](https://openjdk.java.net/install/)**
|
||||
- You can use the following command:
|
||||
```bash
|
||||
sudo apt install openjdk-8-jdk
|
||||
```
|
||||
- Verify you are able to run `java` from your command-line
|
||||
<details>
|
||||
<summary>📙 Click to see sample java -version output</summary>
|
||||
|
||||
```
|
||||
openjdk version "1.8.0_191"
|
||||
OpenJDK Runtime Environment (build 1.8.0_191-8u191-b12-2ubuntu0.18.04.1-b12)
|
||||
OpenJDK 64-Bit Server VM (build 25.191-b12, mixed mode)
|
||||
```
|
||||
- If you already have multiple OpenJDK versions installed and want to select OpenJDK 8, use the following command:
|
||||
```bash
|
||||
sudo update-alternatives --config java
|
||||
```
|
||||
3. Install **[Apache Maven 3.6.0+](https://maven.apache.org/download.cgi)**
|
||||
- Run the following command:
|
||||
```bash
|
||||
mkdir -p ~/bin/maven
|
||||
cd ~/bin/maven
|
||||
wget https://www-us.apache.org/dist/maven/maven-3/3.6.0/binaries/apache-maven-3.6.0-bin.tar.gz
|
||||
tar -xvzf apache-maven-3.6.0-bin.tar.gz
|
||||
ln -s apache-maven-3.6.0 current
|
||||
export M2_HOME=~/bin/maven/current
|
||||
export PATH=${M2_HOME}/bin:${PATH}
|
||||
source ~/.bashrc
|
||||
```
|
||||
|
||||
Note that these environment variables will be lost when you close your terminal. If you want the changes to be permanent, add the `export` lines to your `~/.bashrc` file.
|
||||
- Verify you are able to run `mvn` from your command-line
|
||||
<details>
|
||||
<summary>📙 Click to see sample mvn -version output</summary>
|
||||
|
||||
```
|
||||
Apache Maven 3.6.0 (97c98ec64a1fdfee7767ce5ffb20918da4f719f3; 2018-10-24T18:41:47Z)
|
||||
Maven home: ~/bin/apache-maven-3.6.0
|
||||
Java version: 1.8.0_191, vendor: Oracle Corporation, runtime: /usr/lib/jvm/java-8-openjdk-amd64/jre
|
||||
Default locale: en, platform encoding: UTF-8
|
||||
OS name: "linux", version: "4.4.0-17763-microsoft", arch: "amd64", family: "unix"
|
||||
```
|
||||
4. Install **[Apache Spark 2.3+](https://spark.apache.org/downloads.html)**
|
||||
- Download [Apache Spark 2.3+](https://spark.apache.org/downloads.html) and extract it into a local folder (e.g., `~/bin/spark-2.3.2-bin-hadoop2.7`)
|
||||
- Add the necessary [environment variable](https://www.java.com/en/download/help/path.xml) `SPARK_HOME`, e.g., `~/bin/spark-2.3.2-bin-hadoop2.7/`
|
||||
```bash
|
||||
export SPARK_HOME=~/bin/spark-2.3.2-bin-hadoop2.7
|
||||
export PATH="$SPARK_HOME/bin:$PATH"
|
||||
source ~/.bashrc
|
||||
```
|
||||
|
||||
Note that these environment variables will be lost when you close your terminal. If you want the changes to be permanent, add the `export` lines to your `~/.bashrc` file.
|
||||
- Verify you are able to run `spark-shell` from your command-line
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
Welcome to
|
||||
____ __
|
||||
/ __/__ ___ _____/ /__
|
||||
_\ \/ _ \/ _ `/ __/ '_/
|
||||
/___/ .__/\_,_/_/ /_/\_\ version 2.3.2
|
||||
/_/
|
||||
|
||||
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_201)
|
||||
Type in expressions to have them evaluated.
|
||||
Type :help for more information.
|
||||
|
||||
scala> sc
|
||||
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6eaa6b0c
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
Please make sure you are able to run `dotnet`, `java`, `mvn`, `spark-shell` from your command-line before you move to the next section. Feel there is a better way? Please [open an issue](https://github.com/dotnet/spark/issues) and feel free to contribute.
|
||||
|
||||
# Building
|
||||
|
||||
For the rest of the section, it is assumed that you have cloned Spark .NET repo into your machine e.g., `~/dotnet.spark/`
|
||||
|
||||
```
|
||||
git clone https://github.com/dotnet/spark.git ~/dotnet.spark
|
||||
```
|
||||
|
||||
## Building Spark .NET Scala Extensions Layer
|
||||
|
||||
When you submit a .NET application, Spark .NET has the necessary logic written in Scala that informs Apache Spark how to handle your requests (e.g., a request to create a new Spark Session, or a request to transfer data from the .NET side to the JVM side). This logic can be found in the [Spark .NET Scala Source Code](../../../src/scala).
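As a rough illustration (names and paths are placeholders, and the calls mirror this repo's examples), each of the .NET calls below becomes a request that this Scala layer relays to Apache Spark on the JVM side:

```csharp
// Minimal sketch: every call here crosses the .NET <-> JVM boundary via the
// microsoft-spark JAR built in this section. Names and paths are illustrative.
using Microsoft.Spark.Sql;

internal class ScalaLayerSketch
{
    public static void Main(string[] args)
    {
        SparkSession spark = SparkSession       // request: create a Spark Session
            .Builder()
            .AppName("Hello Spark!")
            .GetOrCreate();

        DataFrame df = spark.Read().Json(args[0]);   // request: read data on the JVM side
        df.Show();                                   // request: evaluate and print rows

        spark.Stop();                                // request: stop the session
    }
}
```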
|
||||
|
||||
Let us now build the Spark .NET Scala extension layer. This is easy to do:
|
||||
|
||||
```
|
||||
cd src/scala
|
||||
mvn clean package
|
||||
```
|
||||
You should see JARs created for the supported Spark versions:
|
||||
* `microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-<version>.jar`
|
||||
* `microsoft-spark-2.4.x/target/microsoft-spark-2.4.x-<version>.jar`
|
||||
|
||||
## Building .NET Sample Applications using .NET Core CLI
|
||||
|
||||
1. Build the Worker
|
||||
```bash
|
||||
cd ~/dotnet.spark/src/csharp/Microsoft.Spark.Worker/
|
||||
dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64
|
||||
```
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
user@machine:/home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker$ dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64
|
||||
|
||||
Welcome to .NET Core!
|
||||
---------------------
|
||||
Learn more about .NET Core: https://aka.ms/dotnet-docs
|
||||
Use 'dotnet --help' to see available commands or visit: https://aka.ms/dotnet-cli-docs
|
||||
|
||||
...
|
||||
output omitted
|
||||
...
|
||||
|
||||
Restore completed in 20.09 sec for /home/user/dotnet.spark/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj.
|
||||
Installing runtime.linux-x64.Microsoft.NETCore.DotNetAppHost 2.1.9.
|
||||
Installing runtime.linux-x64.Microsoft.NETCore.DotNetHostResolver 2.1.9.
|
||||
Installing runtime.linux-x64.Microsoft.NETCore.DotNetHostPolicy 2.1.9.
|
||||
Installing runtime.linux-x64.Microsoft.NETCore.App 2.1.9.
|
||||
Generating MSBuild file /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/obj/Microsoft.Spark.Worker.csproj.nuget.g.props.
|
||||
Generating MSBuild file /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/obj/Microsoft.Spark.Worker.csproj.nuget.g.targets.
|
||||
Restore completed in 37.09 sec for /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/Microsoft.Spark.Worker.csproj.
|
||||
Microsoft.Spark -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark/bin/Debug/netstandard2.0/Microsoft.Spark.dll
|
||||
Microsoft.Spark.Worker -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/Microsoft.Spark.Worker.dll
|
||||
Microsoft.Spark.Worker -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark.Worker/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/
|
||||
```
|
||||
|
||||
</details>
|
||||
2. Build the Samples
|
||||
|
||||
**.NET Core 2.1.x**
|
||||
Due to a bug in .NET Core 2.1.x CLI that causes problems with building a dependency project that creates executables, we have to resort to modifying the `.csproj` file. We are working with the .NET team towards resolving this.
|
||||
```
|
||||
cd ~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/
|
||||
cat Microsoft.Spark.CSharp.Examples.csproj | grep -v "Microsoft.Spark.Worker.csproj" > Microsoft.Spark.CSharp.Examples.Patched.csproj
|
||||
dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64 Microsoft.Spark.CSharp.Examples.Patched.csproj
|
||||
```
|
||||
|
||||
**.NET Core 3.x**
|
||||
If you are using .NET Core 3.x, you can avoid creating a new patched `.csproj` file and instead compile the project directly:
|
||||
```
|
||||
cd ~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/
|
||||
dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64 Microsoft.Spark.CSharp.Examples.csproj
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
user@machine:/home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples$ dotnet publish -f netcoreapp2.1 -r ubuntu.18.04-x64 Microsoft.Spark.CSharp.Examples.Patched.csproj
|
||||
Microsoft (R) Build Engine version 15.9.20+g88f5fadfbe for .NET Core
|
||||
Copyright (C) Microsoft Corporation. All rights reserved.
|
||||
|
||||
Restoring packages for /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/Microsoft.Spark.CSharp.Examples.Patched.csproj...
|
||||
Restore completed in 53 ms for /home/user/dotnet.spark/src/csharp/Microsoft.Spark/Microsoft.Spark.csproj.
|
||||
Generating MSBuild file /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/obj/Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.props.
|
||||
Generating MSBuild file /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/obj/Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.targets.
|
||||
Restore completed in 305.72 ms for /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/Microsoft.Spark.CSharp.Examples.Patched.csproj.
|
||||
Microsoft.Spark -> /home/user/dotnet.spark/src/csharp/Microsoft.Spark/bin/Debug/netstandard2.0/Microsoft.Spark.dll
|
||||
Microsoft.Spark.CSharp.Examples.Patched -> /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/Microsoft.Spark.CSharp.Examples.dll
|
||||
Microsoft.Spark.CSharp.Examples.Patched -> /home/user/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/
|
||||
```
|
||||
|
||||
</details>
|
||||
3. Manually copy Worker binaries into the Samples output location.
|
||||
```
|
||||
cp ~/dotnet.spark/src/csharp/Microsoft.Spark.Worker/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/* ~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish/
|
||||
```
|
||||
|
||||
# Run Samples
|
||||
|
||||
Once you build the samples, you can use `spark-submit` to submit your .NET Core apps. Make sure you have followed the [pre-requisites](#pre-requisites) section and installed Apache Spark.
|
||||
|
||||
1. Open a terminal and go to the directory where your app binary has been generated (e.g., `~/dotnet.spark/examples/Microsoft.Spark.CSharp.Examples/bin/Debug/netcoreapp2.1/ubuntu.18.04-x64/publish`)
|
||||
2. Running your app follows the basic structure:
|
||||
```bash
|
||||
spark-submit \
|
||||
[--jars <any-jars-your-app-is-dependent-on>] \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--master local \
|
||||
<path-to-microsoft-spark-jar> \
|
||||
<path-to-your-app-binary> <argument(s)-to-your-app>
|
||||
```
|
||||
|
||||
Here are some examples you can run:
|
||||
- **[Microsoft.Spark.Examples.Sql.Basic](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Basic.cs)**
|
||||
```bash
|
||||
spark-submit \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--master local \
|
||||
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
|
||||
Microsoft.Spark.CSharp.Examples Sql.Basic $SPARK_HOME/examples/src/main/resources/people.json
|
||||
```
|
||||
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredNetworkWordCount](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredNetworkWordCount.cs)**
|
||||
```bash
|
||||
spark-submit \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--master local \
|
||||
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
|
||||
Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredNetworkWordCount localhost 9999
|
||||
```
|
||||
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (maven accessible)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
|
||||
```bash
|
||||
spark-submit \
|
||||
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.2 \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--master local \
|
||||
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
|
||||
Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
|
||||
```
|
||||
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (jars provided)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
|
||||
```bash
|
||||
spark-submit \
|
||||
--jars path/to/net.jpountz.lz4/lz4-1.3.0.jar,path/to/org.apache.kafka/kafka-clients-0.10.0.1.jar,path/to/org.apache.spark/spark-sql-kafka-0-10_2.11-2.3.2.jar,path/to/org.slf4j/slf4j-api-1.7.6.jar,path/to/org.spark-project.spark/unused-1.0.0.jar,path/to/org.xerial.snappy/snappy-java-1.1.2.6.jar \
|
||||
--class org.apache.spark.deploy.DotnetRunner \
|
||||
--master local \
|
||||
~/dotnet.spark/src/scala/microsoft-spark-2.3.x/target/microsoft-spark-2.3.x-1.0.0-alpha.jar \
|
||||
Microsoft.Spark.CSharp.Examples Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
|
||||
```
|
||||
|
||||
Feel this experience is complicated? Help us by taking up [Simplify User Experience for Running an App](https://github.com/dotnet/spark/issues/6)
@ -0,0 +1,267 @@
|
|||
Building Spark .NET on Windows
|
||||
==========================
|
||||
|
||||
# Table of Contents
|
||||
- [Open Issues](#open-issues)
|
||||
- [Pre-requisites](#pre-requisites)
|
||||
- [Building](#building)
|
||||
- [Building Spark .NET Scala Extensions Layer](#building-spark-net-scala-extensions-layer)
|
||||
- [Building .NET Samples Application](#building-net-samples-application)
|
||||
- [Using Visual Studio for .NET Framework](#using-visual-studio-for-net-framework)
|
||||
- [Using .NET Core CLI for .NET Core](#using-net-core-cli-for-net-core)
|
||||
- [Run Samples](#run-samples)
|
||||
|
||||
# Open Issues:
|
||||
- [Allow users to choose which .NET framework to build for]()
|
||||
- [Building through Visual Studio Code]()
|
||||
- [Building fully automatically through .NET Core CLI]()
|
||||
|
||||
# Pre-requisites:
|
||||
|
||||
If you already have all the pre-requisites, skip to the [build](windows-instructions.md#building) steps below.
|
||||
|
||||
1. Download and install the **[.NET Core SDK](https://dotnet.microsoft.com/download/dotnet-core/2.1)** - installing the SDK will add the `dotnet` toolchain to your path. .NET Core 2.1, 2.2 and 3.0 preview are supported.
|
||||
2. Install any edition of **[Visual Studio 2019](https://www.visualstudio.com/downloads/)** or [Visual Studio 2017](https://www.visualstudio.com/downloads/). The Community version is completely free. When configuring your installation, include these components at minimum:
|
||||
* .NET desktop development
|
||||
* All Required Components
|
||||
* .NET Framework 4.6.1 Development Tools
|
||||
* .NET Core cross-platform development
|
||||
* All Required Components
|
||||
3. Install **[Java 1.8](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html)**
|
||||
- Select the appropriate version for your operating system e.g., jdk-8u201-windows-x64.exe for Win x64 machine.
|
||||
- Install using the installer and verify you are able to run `java` from your command-line
|
||||
4. Install **[Apache Maven 3.6.0+](https://maven.apache.org/download.cgi)**
|
||||
- Download [Apache Maven 3.6.0](http://mirror.metrocast.net/apache/maven/maven-3/3.6.0/binaries/apache-maven-3.6.0-bin.zip)
|
||||
- Extract to a local directory e.g., `c:\bin\apache-maven-3.6.0\`
|
||||
- Add Apache Maven to your [PATH environment variable](https://www.java.com/en/download/help/path.xml) e.g., `c:\bin\apache-maven-3.6.0\bin`
|
||||
- Verify you are able to run `mvn` from your command-line
|
||||
5. Install **[Apache Spark 2.3+](https://spark.apache.org/downloads.html)**
|
||||
- Download [Apache Spark 2.3+](https://spark.apache.org/downloads.html) and extract it into a local folder (e.g., `c:\bin\spark-2.3.2-bin-hadoop2.7\`) using [7-zip](https://www.7-zip.org/).
|
||||
- Add Apache Spark to your [PATH environment variable](https://www.java.com/en/download/help/path.xml) e.g., `c:\bin\spark-2.3.2-bin-hadoop2.7\bin`
|
||||
- Add a [new environment variable](https://www.java.com/en/download/help/path.xml) `SPARK_HOME` e.g., `C:\bin\spark-2.3.2-bin-hadoop2.7\`
|
||||
- Verify you are able to run `spark-shell` from your command-line
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
Welcome to
|
||||
____ __
|
||||
/ __/__ ___ _____/ /__
|
||||
_\ \/ _ \/ _ `/ __/ '_/
|
||||
/___/ .__/\_,_/_/ /_/\_\ version 2.3.2
|
||||
/_/
|
||||
|
||||
Using Scala version 2.11.8 (Java HotSpot(TM) 64-Bit Server VM, Java 1.8.0_201)
|
||||
Type in expressions to have them evaluated.
|
||||
Type :help for more information.
|
||||
|
||||
scala> sc
|
||||
res0: org.apache.spark.SparkContext = org.apache.spark.SparkContext@6eaa6b0c
|
||||
```
|
||||
|
||||
Note: If you observe the following:
|
||||
> ERROR Shell:397 - Failed to locate the winutils binary in the hadoop binary path
|
||||
> java.io.IOException: Could not locate executable null\bin\winutils.exe in the Hadoop binaries.
|
||||
|
||||
You can ignore this if you are planning on running Spark in [Standalone mode](https://spark.apache.org/docs/latest/spark-standalone.html). If not, you would have to set up **[WinUtils](https://github.com/steveloughran/winutils)**.
|
||||
|
||||
- Download winutils.exe binary from [WinUtils repository](https://github.com/steveloughran/winutils). You should select the version of Hadoop the Spark distribution was compiled with, e.g. use hadoop-2.7.1 for Spark 2.3.2.
|
||||
- Save winutils.exe binary to a directory of your choice, e.g. c:\hadoop\bin.
|
||||
- Set `HADOOP_HOME` to reflect the directory with winutils.exe (without bin). For instance, using command-line:
|
||||
```
|
||||
set HADOOP_HOME=c:\hadoop
|
||||
```
|
||||
- Set PATH environment variable to include `%HADOOP_HOME%\bin`. For instance, using command-line:
|
||||
```
|
||||
set PATH=%HADOOP_HOME%\bin;%PATH%
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
Please make sure you are able to run `dotnet`, `java`, `mvn`, `spark-shell` from your command-line before you move to the next section. Feel there is a better way? Please [open an issue](https://github.com/dotnet/spark/issues) and feel free to contribute.
|
||||
|
||||
> **Note**: A new instance of the command-line may be required if any environment variables were updated.
|
||||
|
||||
# Building
|
||||
|
||||
For the rest of the section, it is assumed that you have cloned Spark .NET repo into your machine e.g., `c:\github\dotnet-spark\`
|
||||
|
||||
```
|
||||
git clone https://github.com/dotnet/spark.git c:\github\dotnet-spark
|
||||
```
|
||||
|
||||
## Building Spark .NET Scala Extensions Layer
|
||||
|
||||
When you submit a .NET application, Spark .NET has the necessary logic written in Scala that informs Apache Spark how to handle your requests (e.g., a request to create a new Spark Session, or a request to transfer data from the .NET side to the JVM side). This logic can be found in the [Spark .NET Scala Source Code](../../../src/scala).
|
||||
|
||||
Regardless of whether you are using .NET Framework or .NET Core, you will need to build the Spark .NET Scala extension layer. This is easy to do:
|
||||
|
||||
```
|
||||
cd src\scala
|
||||
mvn clean package
|
||||
```
|
||||
You should see JARs created for the supported Spark versions:
|
||||
* `microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-<version>.jar`
|
||||
* `microsoft-spark-2.4.x\target\microsoft-spark-2.4.x-<version>.jar`
|
||||
|
||||
## Building .NET Samples Application
|
||||
|
||||
### Using Visual Studio for .NET Framework
|
||||
|
||||
1. Open `src\csharp\Microsoft.Spark.sln` in Visual Studio and build the `Microsoft.Spark.CSharp.Examples` project under the `examples` folder (this will in turn build the .NET bindings project as well). If you want, you can write your own code in the `Microsoft.Spark.Examples` project:
|
||||
|
||||
```csharp
|
||||
// Instantiate a session
|
||||
var spark = SparkSession
|
||||
.Builder()
|
||||
.AppName("Hello Spark!")
|
||||
.GetOrCreate();
|
||||
|
||||
var df = spark.Read().Json(args[0]);
|
||||
|
||||
// Print schema
|
||||
df.PrintSchema();
|
||||
|
||||
// Apply a filter and show results
|
||||
df.Filter(df["age"] > 21).Show();
|
||||
```
|
||||
Once the build is successful, you will see the appropriate binaries produced in the output directory.
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
Directory: C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\net461
|
||||
|
||||
|
||||
Mode LastWriteTime Length Name
|
||||
---- ------------- ------ ----
|
||||
-a---- 3/6/2019 12:18 AM 125440 Apache.Arrow.dll
|
||||
-a---- 3/16/2019 12:00 AM 13824 Microsoft.Spark.CSharp.Examples.exe
|
||||
-a---- 3/16/2019 12:00 AM 19423 Microsoft.Spark.CSharp.Examples.exe.config
|
||||
-a---- 3/16/2019 12:00 AM 2720 Microsoft.Spark.CSharp.Examples.pdb
|
||||
-a---- 3/16/2019 12:00 AM 143360 Microsoft.Spark.dll
|
||||
-a---- 3/16/2019 12:00 AM 63388 Microsoft.Spark.pdb
|
||||
-a---- 3/16/2019 12:00 AM 34304 Microsoft.Spark.Worker.exe
|
||||
-a---- 3/16/2019 12:00 AM 19423 Microsoft.Spark.Worker.exe.config
|
||||
-a---- 3/16/2019 12:00 AM 11900 Microsoft.Spark.Worker.pdb
|
||||
-a---- 3/16/2019 12:00 AM 23552 Microsoft.Spark.Worker.xml
|
||||
-a---- 3/16/2019 12:00 AM 332363 Microsoft.Spark.xml
|
||||
------------------------------------------- More framework files -------------------------------------
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
### Using .NET Core CLI for .NET Core
|
||||
|
||||
> Note: We are currently working on automating .NET Core builds for Spark .NET. Until then, we appreciate your patience in performing some of the steps manually.
|
||||
|
||||
1. Build the Worker
|
||||
```
|
||||
cd C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\
|
||||
dotnet publish -f netcoreapp2.1 -r win10-x64
|
||||
```
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
PS C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker> dotnet publish -f netcoreapp2.1 -r win10-x64
|
||||
Microsoft (R) Build Engine version 15.9.20+g88f5fadfbe for .NET Core
|
||||
Copyright (C) Microsoft Corporation. All rights reserved.
|
||||
|
||||
Restoring packages for C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj...
|
||||
Restore completed in 37.29 ms for C:\github\dotnet-spark\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj.
|
||||
Generating MSBuild file C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\obj\Microsoft.Spark.Worker.csproj.nuget.g.props.
|
||||
Generating MSBuild file C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\obj\Microsoft.Spark.Worker.csproj.nuget.g.targets.
|
||||
Restore completed in 230.49 ms for C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj.
|
||||
Microsoft.Spark -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark\bin\Debug\netstandard2.0\Microsoft.Spark.dll
|
||||
Microsoft.Spark.Worker -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\bin\Debug\netcoreapp2.1\win10-x64\Microsoft.Spark.Worker.dll
|
||||
Microsoft.Spark.Worker -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\bin\Debug\netcoreapp2.1\win10-x64\publish\
|
||||
```
|
||||
|
||||
</details>
|
||||
2. Build the Samples
|
||||
```
|
||||
cd C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\
|
||||
Get-Content .\Microsoft.Spark.CSharp.Examples.csproj | Where-Object {$_ -notmatch 'Microsoft.Spark.Worker.csproj'} | Set-Content .\Microsoft.Spark.CSharp.Examples.Patched.csproj
|
||||
dotnet publish -f netcoreapp2.1 -r win10-x64 .\Microsoft.Spark.CSharp.Examples.Patched.csproj
|
||||
```
|
||||
Note the creation of a new patched `.csproj` file. This is due to a bug in the .NET Core CLI that causes problems when building a dependency project that creates executables; we are working with the .NET team towards resolving this.
|
||||
|
||||
<details>
|
||||
<summary>📙 Click to see sample console output</summary>
|
||||
|
||||
```
|
||||
PS C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples> dotnet publish -f netcoreapp2.1 -r win10-x64 .\Microsoft.Spark.CSharp.Examples.Patched.csproj
|
||||
Microsoft (R) Build Engine version 15.9.20+g88f5fadfbe for .NET Core
|
||||
Copyright (C) Microsoft Corporation. All rights reserved.
|
||||
|
||||
Restoring packages for C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\Microsoft.Spark.CSharp.Examples.Patched.csproj...
|
||||
Restoring packages for C:\github\dotnet-spark\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj...
|
||||
Generating MSBuild file C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\obj\Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.props.
|
||||
Generating MSBuild file C:\github\dotnet-spark\src\csharp\Microsoft.Spark\obj\Microsoft.Spark.csproj.nuget.g.props.
|
||||
Generating MSBuild file C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\obj\Microsoft.Spark.CSharp.Examples.Patched.csproj.nuget.g.targets.
|
||||
Restore completed in 208.34 ms for C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\Microsoft.Spark.CSharp.Examples.Patched.csproj.
|
||||
Restore completed in 208.34 ms for C:\github\dotnet-spark\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj.
|
||||
Microsoft.Spark -> C:\github\dotnet-spark\src\csharp\Microsoft.Spark\bin\Debug\netstandard2.0\Microsoft.Spark.dll
|
||||
Microsoft.Spark.CSharp.Examples.Patched -> C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\Microsoft.Spark.CSharp.Examples.dll
|
||||
Microsoft.Spark.CSharp.Examples.Patched -> C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\publish\
|
||||
```
|
||||
|
||||
</details>
|
||||
3. Manually copy Worker binaries into the Samples output location.
|
||||
```
|
||||
cp c:\github\dotnet-spark\src\csharp\Microsoft.Spark.Worker\bin\Debug\netcoreapp2.1\win10-x64\publish\* C:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\publish\
|
||||
```
|
||||
|
||||
# Run Samples
|
||||
|
||||
Once you build the samples, running them will be through `spark-submit` regardless of whether you are targeting .NET Framework or .NET Core apps. Make sure you have followed the [pre-requisites](#pre-requisites) section and installed Apache Spark.
|
||||
|
||||
1. Open PowerShell and go to the directory where your app binary has been generated (e.g., `c:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\net461` for .NET Framework, `c:\github\dotnet-spark\examples\Microsoft.Spark.CSharp.Examples\bin\Debug\netcoreapp2.1\win10-x64\publish` for .NET Core)
|
||||
2. Running your app follows the basic structure:
|
||||
```powershell
|
||||
spark-submit.cmd `
|
||||
[--jars <any-jars-your-app-is-dependent-on>] `
|
||||
--class org.apache.spark.deploy.DotnetRunner `
|
||||
--master local `
|
||||
<path-to-microsoft-spark-jar> `
|
||||
<path-to-your-app-exe> <argument(s)-to-your-app>
|
||||
```
|
||||
|
||||
Here are some examples you can run:
|
||||
- **[Microsoft.Spark.Examples.Sql.Basic](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Basic.cs)**
|
||||
```powershell
|
||||
spark-submit.cmd `
|
||||
--class org.apache.spark.deploy.DotnetRunner `
|
||||
--master local `
|
||||
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
|
||||
Microsoft.Spark.CSharp.Examples.exe Sql.Basic %SPARK_HOME%\examples\src\main\resources\people.json
|
||||
```
|
||||
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredNetworkWordCount](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredNetworkWordCount.cs)**
|
||||
```powershell
|
||||
spark-submit.cmd `
|
||||
--class org.apache.spark.deploy.DotnetRunner `
|
||||
--master local `
|
||||
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
|
||||
Microsoft.Spark.CSharp.Examples.exe Sql.Streaming.StructuredNetworkWordCount localhost 9999
|
||||
```
|
||||
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (maven accessible)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
|
||||
```powershell
|
||||
spark-submit.cmd `
|
||||
--packages org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.2 `
|
||||
--class org.apache.spark.deploy.DotnetRunner `
|
||||
--master local `
|
||||
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
|
||||
Microsoft.Spark.CSharp.Examples.exe Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
|
||||
```
|
||||
- **[Microsoft.Spark.Examples.Sql.Streaming.StructuredKafkaWordCount (jars provided)](../../examples/Microsoft.Spark.CSharp.Examples/Sql/Streaming/StructuredKafkaWordCount.cs)**
|
||||
```powershell
|
||||
spark-submit.cmd `
|
||||
--jars path\to\net.jpountz.lz4\lz4-1.3.0.jar,path\to\org.apache.kafka\kafka-clients-0.10.0.1.jar,path\to\org.apache.spark\spark-sql-kafka-0-10_2.11-2.3.2.jar,path\to\org.slf4j\slf4j-api-1.7.6.jar,path\to\org.spark-project.spark\unused-1.0.0.jar,path\to\org.xerial.snappy\snappy-java-1.1.2.6.jar `
|
||||
--class org.apache.spark.deploy.DotnetRunner `
|
||||
--master local `
|
||||
C:\github\dotnet-spark\src\scala\microsoft-spark-2.3.x\target\microsoft-spark-2.3.x-1.0.0-alpha.jar `
|
||||
Microsoft.Spark.CSharp.Examples.exe Sql.Streaming.StructuredKafkaWordCount localhost:9092 subscribe test
|
||||
```
|
||||
|
||||
Feel this experience is complicated? Help us by taking up [Simplify User Experience for Running an App](https://github.com/dotnet/spark/issues/6)
@ -0,0 +1,89 @@
|
|||
C# Coding Style
|
||||
===============
|
||||
|
||||
We use the same [coding style](https://github.com/dotnet/corefx/blob/master/Documentation/coding-guidelines/coding-style.md) and [EditorConfig](https://editorconfig.org "EditorConfig homepage") file (`.editorconfig`) used by [dotnet/corefx](https://github.com/dotnet/corefx) with the following differences:
|
||||
|
||||
* **A single-line statement block must be enclosed in braces.**
|
||||
|
||||
```C#
|
||||
// OK
|
||||
if (foo)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
// NOT OK
|
||||
if (foo) return false;
|
||||
if (foo) { return false };
|
||||
|
||||
```
|
||||
|
||||
* **Use prefix increment/decrement operator.**
|
||||
|
||||
Unless post increment/decrement operator usage is intended, use prefix increment/decrement operator.
|
||||
|
||||
```C#
|
||||
// OK
|
||||
for (int i = 0; i < arr.Length; ++i)
|
||||
|
||||
// NOT OK
|
||||
for (int i = 0; i < arr.Length; i++)
|
||||
|
||||
// OK
|
||||
arr[i++]; // Post increment operator usage is intended.
|
||||
```
|
||||
|
||||
* **The max number of characters in a line is 100.**
|
||||
|
||||
This can be easily done using the following line-break rules:
|
||||
|
||||
(If you cannot find a rule for your scenario, please look through the existing code to find a match and create an issue to update this list.)
|
||||
|
||||
* Line-break for the assignment
|
||||
```C#
|
||||
// Try the following first to fit within the limit.
|
||||
SomeType someVariable
|
||||
= SomeMethod(arg1, arg2, arg3, arg4, arg5);
|
||||
|
||||
// Then fall back to this.
|
||||
SomeType someVariable = SomeMethod(
|
||||
arg1,
|
||||
arg2,
|
||||
arg3,
|
||||
arg4,
|
||||
arg5);
|
||||
```
|
||||
|
||||
* Line-break for each method parameters:
|
||||
```C#
|
||||
return UserDefinedFunction.Create(
|
||||
name,
|
||||
CommandSerDe.Serialize(
|
||||
execute,
|
||||
CommandSerDe.SerializedMode.Row,
|
||||
CommandSerDe.SerializedMode.Row),
|
||||
UdfUtils.GetPythonEvalType(),
|
||||
UdfUtils.GetReturnType(typeof(RT)));
|
||||
```
|
||||
|
||||
* Line-break for each method call:
|
||||
```C#
|
||||
// If you have chained method calls, line-break each method call
|
||||
Enumerable.Range(0, numRows)
|
||||
.Select(i => i.ToString())
|
||||
.ToArray();
|
||||
```
|
||||
|
||||
There are a few exceptions to this rule:
|
||||
|
||||
* Log message with string interpolation:
|
||||
```C#
|
||||
Logger.LogInfo($"This message {someVariable} is too long but try your best to fit in 100 character limit.");
|
||||
```
|
||||
|
||||
* The method signature without method parameters is long due to type parameters:
|
||||
```C#
|
||||
public static Func<Column, Column, Column, Column, Column, Column, Column, Column, Column, Column, Column> Udf<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT>(
|
||||
Func<A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT> udf)
|
||||
```
@ -0,0 +1,6 @@
|
|||
Scala Coding Style
|
||||
===============
|
||||
|
||||
* For Scala code, we follow the official [Scala style guide](https://docs.scala-lang.org/style/).
|
||||
* For formatting, [scalafmt](https://scalameta.org/scalafmt) is used with the custom configuration (found in [/dev/.scalafmt.conf](/dev/.scalafmt.conf))
|
||||
* Installation instructions for `scalafmt` can be found [here](https://scalameta.org/scalafmt/docs/installation.html).
@ -0,0 +1,9 @@
|
|||
Contributing to dotnet/spark
|
||||
======================
|
||||
This document describes contribution guidelines.
|
||||
|
||||
Coding Style
|
||||
------------
|
||||
We intend to bring dotnet/spark into full conformance with the following style guidelines:
|
||||
* [C# Coding Style](coding-guidelines/csharp-coding-style.md)
|
||||
* [Scala Coding Style](coding-guidelines/scala-coding-style.md)
@ -0,0 +1 @@
|
|||
# Developer Guide
@ -0,0 +1 @@
|
|||
# Features
@ -0,0 +1 @@
|
|||
|
Binary file not shown.
After Width: | Height: | Size: 32 KiB |
Binary file not shown.
After Width: | Height: | Size: 20 KiB |
Binary file not shown.
After Width: | Height: | Size: 1.1 KiB |
Binary file not shown.
After Width: | Height: | Size: 1.0 KiB |
@ -0,0 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" TreatAsLocalProperty="ExcludeRestorePackageImports">
|
||||
<Import Project="..\src\csharp\Directory.Build.props" />
|
||||
</Project>
@ -0,0 +1,11 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
namespace Microsoft.Spark.Examples
|
||||
{
|
||||
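/// <summary>
/// Contract implemented by every runnable example in this project; Program.Main
/// discovers implementations via reflection and invokes Run with the example's arguments.
/// </summary>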
internal interface IExample
|
||||
{
|
||||
void Run(string[] args);
|
||||
}
|
||||
}
@ -0,0 +1,16 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Exe</OutputType>
|
||||
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
|
||||
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
|
||||
<RootNamespace>Microsoft.Spark.Examples</RootNamespace>
|
||||
<AssemblyName>Microsoft.Spark.CSharp.Examples</AssemblyName>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
|
||||
<ProjectReference Include="..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
@ -0,0 +1,63 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
|
||||
namespace Microsoft.Spark.Examples
|
||||
{
|
||||
public class Program
|
||||
{
|
||||
public static void Main(string[] args)
|
||||
{
|
||||
string rootNamespace = MethodBase.GetCurrentMethod().DeclaringType.Namespace;
|
||||
|
||||
// Find all types in the current assembly that implement IExample
|
||||
// and whose namespace is rootNamespace or starts with it. Track the fully qualified
|
||||
// name of the type after the rootNamespace.
|
||||
IEnumerable<string> examples = Assembly.GetExecutingAssembly().GetTypes()
|
||||
.Where(t =>
|
||||
typeof(IExample).IsAssignableFrom(t) &&
|
||||
!t.IsInterface &&
|
||||
!t.IsAbstract &&
|
||||
t.Namespace.StartsWith(rootNamespace) &&
|
||||
((t.Namespace.Length == rootNamespace.Length) ||
|
||||
(t.Namespace[rootNamespace.Length] == '.')))
|
||||
.Select(t => t.FullName.Substring(rootNamespace.Length + 1));
|
||||
|
||||
if ((args.Length == 0) || !TryFindExample(examples, args[0], out string exampleName))
|
||||
{
|
||||
PrintUsage(examples);
|
||||
return;
|
||||
}
|
||||
|
||||
string[] exampleArgs = args.Skip(1).ToArray();
|
||||
Type type = Assembly.GetExecutingAssembly().GetType($"{rootNamespace}.{exampleName}");
|
||||
object instance = Activator.CreateInstance(type);
|
||||
MethodInfo method = type.GetMethod("Run");
|
||||
method.Invoke(instance, new object[] { exampleArgs });
|
||||
}
|
||||
|
||||
private static void PrintUsage(IEnumerable<string> examples)
|
||||
{
|
||||
string assemblyName = Assembly.GetExecutingAssembly().GetName().Name;
|
||||
Console.WriteLine($"Usage: {assemblyName} <example> <example args>");
|
||||
if (examples.Any())
|
||||
{
|
||||
Console.WriteLine("Examples:\n\t*" + string.Join("\n\t*", examples));
|
||||
}
|
||||
Console.WriteLine($"\n'{assemblyName} <example>' to get the usage info of each example.");
|
||||
}
|
||||
|
||||
private static bool TryFindExample(IEnumerable<string> examples, string search,
|
||||
out string found)
|
||||
{
|
||||
found = examples.FirstOrDefault(e =>
|
||||
e.Equals(search, StringComparison.InvariantCultureIgnoreCase));
|
||||
return !string.IsNullOrWhiteSpace(found);
|
||||
}
|
||||
}
|
||||
}
@ -0,0 +1,96 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using Microsoft.Spark.Sql;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Microsoft.Spark.Examples.Sql
|
||||
{
|
||||
/// <summary>
|
||||
/// A simple example demonstrating basic Spark SQL features.
|
||||
/// </summary>
|
||||
internal sealed class Basic : IExample
|
||||
{
|
||||
public void Run(string[] args)
|
||||
{
|
||||
if (args.Length != 1)
|
||||
{
|
||||
Console.Error.WriteLine(
|
||||
"Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>");
|
||||
Environment.Exit(1);
|
||||
}
|
||||
|
||||
SparkSession spark = SparkSession
|
||||
.Builder()
|
||||
.AppName(".NET Spark SQL basic example")
|
||||
.Config("spark.some.config.option", "some-value")
|
||||
.GetOrCreate();
|
||||
|
||||
// Need to explicitly specify the schema since pickling vs. arrow formatting
|
||||
// will return different types. Pickling will turn longs into ints if the values fit.
|
||||
DataFrame df = spark.Read().Schema("age INT, name STRING").Json(args[0]);
|
||||
|
||||
Spark.Sql.Types.StructType schema = df.Schema();
|
||||
Console.WriteLine(schema.SimpleString);
|
||||
|
||||
System.Collections.Generic.IEnumerable<Row> rows = df.Collect();
|
||||
foreach (Row row in rows)
|
||||
{
|
||||
Console.WriteLine(row);
|
||||
}
|
||||
|
||||
df.Show();
|
||||
|
||||
df.PrintSchema();
|
||||
|
||||
df.Select("name", "age", "age", "name").Show();
|
||||
|
||||
df.Select(df["name"], df["age"] + 1).Show();
|
||||
|
||||
df.Filter(df["age"] > 21).Show();
|
||||
|
||||
df.GroupBy("age")
|
||||
.Agg(Avg(df["age"]), Avg(df["age"]), CountDistinct(df["age"], df["age"]))
|
||||
.Show();
|
||||
|
||||
df.CreateOrReplaceTempView("people");
|
||||
|
||||
// Registering Udf for SQL expression.
|
||||
DataFrame sqlDf = spark.Sql("SELECT * FROM people");
|
||||
sqlDf.Show();
|
||||
|
||||
spark.Udf().Register<int?, string, string>(
|
||||
"my_udf",
|
||||
(age, name) => name + " with " + ((age.HasValue) ? age.Value.ToString() : "null"));
|
||||
|
||||
sqlDf = spark.Sql("SELECT my_udf(*) FROM people");
|
||||
sqlDf.Show();
|
||||
|
||||
// Using UDF via data frames.
|
||||
Func<Column, Column, Column> addition = Udf<int?, string, string>(
|
||||
(age, name) => name + " is " + (age.HasValue ? age.Value + 10 : 0));
|
||||
df.Select(addition(df["age"], df["name"])).Show();
|
||||
|
||||
// Chaining example:
|
||||
Func<Column, Column> addition2 = Udf<string, string>(str => $"hello {str}!");
|
||||
df.Select(addition2(addition(df["age"], df["name"]))).Show();
|
||||
|
||||
// Multiple UDF example:
|
||||
df.Select(addition(df["age"], df["name"]), addition2(df["name"])).Show();
|
||||
|
||||
// Joins.
|
||||
DataFrame joinedDf = df.Join(df, "name");
|
||||
joinedDf.Show();
|
||||
|
||||
DataFrame joinedDf2 = df.Join(df, new[] { "name", "age" });
|
||||
joinedDf2.Show();
|
||||
|
||||
DataFrame joinedDf3 = df.Join(df, df["name"] == df["name"], "outer");
|
||||
joinedDf3.Show();
|
||||
|
||||
spark.Stop();
|
||||
}
|
||||
}
|
||||
}
|
|
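A small clarifying sketch (not part of the original commit): because the people view is read with the schema "age INT, name STRING", the star in `SELECT my_udf(*)` expands positionally to those two columns, so it is equivalent to passing them to the two-argument UDF explicitly. The variable name below is hypothetical.

            // Equivalent, more explicit form of the my_udf query in the example above.
            DataFrame sqlDfExplicit = spark.Sql("SELECT my_udf(age, name) FROM people");
            sqlDfExplicit.Show();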
@@ -0,0 +1,58 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

namespace Microsoft.Spark.Examples.Sql.Streaming
{
    /// <summary>
    /// The example is taken/modified from
    /// spark/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
    /// </summary>
    internal sealed class StructuredKafkaWordCount : IExample
    {
        public void Run(string[] args)
        {
            if (args.Length != 3)
            {
                Console.Error.WriteLine(
                    "Usage: StructuredKafkaWordCount " +
                    "<bootstrap-servers> <subscribe-type> <topics>");
                Environment.Exit(1);
            }

            string bootstrapServers = args[0];
            string subscribeType = args[1];
            string topics = args[2];

            SparkSession spark = SparkSession
                .Builder()
                .AppName("StructuredKafkaWordCount")
                .GetOrCreate();

            DataFrame lines = spark
                .ReadStream()
                .Format("kafka")
                .Option("kafka.bootstrap.servers", bootstrapServers)
                .Option(subscribeType, topics)
                .Load()
                .SelectExpr("CAST(value AS STRING)");

            DataFrame words = lines
                .Select(Explode(Split(lines["value"], " "))
                    .Alias("word"));
            DataFrame wordCounts = words.GroupBy("word").Count();

            Spark.Sql.Streaming.StreamingQuery query = wordCounts
                .WriteStream()
                .OutputMode("complete")
                .Format("console")
                .Start();

            query.AwaitTermination();
        }
    }
}
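For context, a hedged sketch (not part of the original commit) of what the <subscribe-type> and <topics> arguments typically look like for Spark's Kafka source: "subscribe" with a comma-separated topic list. The bootstrap server address, topic name, and variable name below are placeholders.

            // Same ReadStream pipeline as above, with concrete placeholder values.
            DataFrame linesFromKafka = spark
                .ReadStream()
                .Format("kafka")
                .Option("kafka.bootstrap.servers", "localhost:9092")
                .Option("subscribe", "wordcount-topic")
                .Load()
                .SelectExpr("CAST(value AS STRING)");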
@@ -0,0 +1,59 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

namespace Microsoft.Spark.Examples.Sql.Streaming
{
    /// <summary>
    /// The example is taken/modified from
    /// spark/examples/src/main/python/sql/streaming/structured_network_wordcount.py
    ///
    /// You can set up the data source as follows in a separate terminal:
    /// `$ nc -lk 9999`
    /// to start writing standard input to port 9999.
    /// </summary>
    internal sealed class StructuredNetworkWordCount : IExample
    {
        public void Run(string[] args)
        {
            if (args.Length != 2)
            {
                Console.Error.WriteLine(
                    "Usage: StructuredNetworkWordCount <hostname> <port>");
                Environment.Exit(1);
            }

            string hostname = args[0];
            var port = int.Parse(args[1]);

            SparkSession spark = SparkSession
                .Builder()
                .AppName("StructuredNetworkWordCount")
                .GetOrCreate();

            DataFrame lines = spark
                .ReadStream()
                .Format("socket")
                .Option("host", hostname)
                .Option("port", port)
                .Load();

            DataFrame words = lines
                .Select(Explode(Split(lines["value"], " "))
                    .Alias("word"));
            DataFrame wordCounts = words.GroupBy("word").Count();

            Spark.Sql.Streaming.StreamingQuery query = wordCounts
                .WriteStream()
                .OutputMode("complete")
                .Format("console")
                .Start();

            query.AwaitTermination();
        }
    }
}
@@ -0,0 +1,77 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using Microsoft.Spark.Sql;
using static Microsoft.Spark.Sql.Functions;

namespace Microsoft.Spark.Examples.Sql.Streaming
{
    /// <summary>
    /// The example is taken/modified from
    /// spark/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
    ///
    /// You can set up the data source as follows in a separate terminal:
    /// `$ nc -lk 9999`
    /// to start writing standard input to port 9999.
    /// </summary>
    internal sealed class StructuredNetworkWordCountWindowed : IExample
    {
        public void Run(string[] args)
        {
            if (args.Length != 3 && args.Length != 4)
            {
                Console.Error.WriteLine(
                    "Usage: StructuredNetworkWordCountWindowed " +
                    "<hostname> <port> <window duration in seconds> " +
                    "[<slide duration in seconds>]");
                Environment.Exit(1);
            }

            string hostname = args[0];
            var port = int.Parse(args[1]);
            var windowSize = int.Parse(args[2]);
            var slideSize = (args.Length == 3) ? windowSize : int.Parse(args[3]);
            if (slideSize > windowSize)
            {
                Console.Error.WriteLine(
                    "<slide duration> must be less than or equal " +
                    "to <window duration>");
            }
            var windowDuration = $"{windowSize} seconds";
            var slideDuration = $"{slideSize} seconds";

            SparkSession spark = SparkSession
                .Builder()
                .AppName("StructuredNetworkWordCountWindowed")
                .GetOrCreate();

            DataFrame lines = spark
                .ReadStream()
                .Format("socket")
                .Option("host", hostname)
                .Option("port", port)
                .Option("includeTimestamp", true)
                .Load();

            DataFrame words = lines
                .Select(Explode(Split(lines["value"], " "))
                    .Alias("word"), lines["timestamp"]);
            DataFrame windowedCounts = words
                .GroupBy(Window(words["timestamp"], windowDuration, slideDuration),
                    words["word"])
                .Count()
                .OrderBy("window");

            Spark.Sql.Streaming.StreamingQuery query = windowedCounts
                .WriteStream()
                .OutputMode("complete")
                .Format("console")
                .Option("truncate", false)
                .Start();

            query.AwaitTermination();
        }
    }
}
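A brief illustration (not part of the original commit) of how the window and slide durations interact: with a 10-second window sliding every 5 seconds, each timestamped word contributes to two overlapping windows. The variable name below is hypothetical; the call mirrors the Window usage in the example above with concrete duration strings.

            // Sketch: concrete duration strings in place of windowDuration/slideDuration.
            Column tenSecondWindows = Window(words["timestamp"], "10 seconds", "5 seconds");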
@@ -0,0 +1,54 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.421
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.CSharp.Examples", "Microsoft.Spark.CSharp.Examples\Microsoft.Spark.CSharp.Examples.csproj", "{32A34828-20F4-40FE-A3D5-C9458BF424E6}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark", "..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj", "{F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.Spark.Worker", "..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj", "{606FBD5E-D5C6-48F6-8FBF-2F0E25527760}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Reference", "Reference", "{CE5FBCF2-F92E-4A2F-A76E-149B7118491B}"
EndProject
Project("{6EC3EE1D-3C4E-46DD-8F32-0CC8E7565705}") = "Microsoft.Spark.FSharp.Examples", "Microsoft.Spark.FSharp.Examples\Microsoft.Spark.FSharp.Examples.fsproj", "{127370FE-D19D-4489-AB7C-2F1AA7908994}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{7BC2C5FB-10A1-492D-952B-D8662C368CB2}"
    ProjectSection(SolutionItems) = preProject
        ..\.editorconfig = ..\.editorconfig
    EndProjectSection
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|Any CPU = Debug|Any CPU
        Release|Any CPU = Release|Any CPU
    EndGlobalSection
    GlobalSection(ProjectConfigurationPlatforms) = postSolution
        {32A34828-20F4-40FE-A3D5-C9458BF424E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {32A34828-20F4-40FE-A3D5-C9458BF424E6}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {32A34828-20F4-40FE-A3D5-C9458BF424E6}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {32A34828-20F4-40FE-A3D5-C9458BF424E6}.Release|Any CPU.Build.0 = Release|Any CPU
        {F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {F5C246F7-9CDE-44DC-8D4F-DAAC557048EF}.Release|Any CPU.Build.0 = Release|Any CPU
        {606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {606FBD5E-D5C6-48F6-8FBF-2F0E25527760}.Release|Any CPU.Build.0 = Release|Any CPU
        {127370FE-D19D-4489-AB7C-2F1AA7908994}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
        {127370FE-D19D-4489-AB7C-2F1AA7908994}.Debug|Any CPU.Build.0 = Debug|Any CPU
        {127370FE-D19D-4489-AB7C-2F1AA7908994}.Release|Any CPU.ActiveCfg = Release|Any CPU
        {127370FE-D19D-4489-AB7C-2F1AA7908994}.Release|Any CPU.Build.0 = Release|Any CPU
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
    EndGlobalSection
    GlobalSection(NestedProjects) = preSolution
        {F5C246F7-9CDE-44DC-8D4F-DAAC557048EF} = {CE5FBCF2-F92E-4A2F-A76E-149B7118491B}
        {606FBD5E-D5C6-48F6-8FBF-2F0E25527760} = {CE5FBCF2-F92E-4A2F-A76E-149B7118491B}
    EndGlobalSection
    GlobalSection(ExtensibilityGlobals) = postSolution
        SolutionGuid = {9D60F114-5B77-445C-B67C-DCACC90A35CD}
    EndGlobalSection
EndGlobal
@@ -0,0 +1,8 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.Spark.Examples

type IExample =
    abstract member Run : string[] -> int
@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
    <TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
    <RootNamespace>Microsoft.Spark.Examples</RootNamespace>
    <AssemblyName>Microsoft.Spark.FSharp.Examples</AssemblyName>
  </PropertyGroup>

  <ItemGroup>
    <Compile Include="IExample.fs" />
    <Compile Include="Sql\Streaming\StructuredNetworkWordCountWindowed.fs" />
    <Compile Include="Sql\Streaming\StructuredNetworkWordCount.fs" />
    <Compile Include="Sql\Streaming\StructuredKafkaWordCount.fs" />
    <Compile Include="Sql\Basic.fs" />
    <Compile Include="Program.fs" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\..\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
    <ProjectReference Include="..\..\src\csharp\Microsoft.Spark\Microsoft.Spark.csproj" />
  </ItemGroup>

</Project>
@@ -0,0 +1,60 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

module Microsoft.Spark.Examples.Main

open System
open System.Collections.Generic
open System.Linq
open System.Reflection
open System.Runtime.InteropServices

let printUsage (examples : IEnumerable<string>) =
    let assemblyName = Assembly.GetExecutingAssembly().GetName().Name
    printfn "Usage: %s <example> <example args>" assemblyName

    if examples.Any() then
        printfn "Examples:\n\t*%s" (examples |> String.concat "\n\t*")
    printfn "\n'%s <example>' to get the usage info of each example." assemblyName

let tryFindExample (examples: IEnumerable<string>, search: string, [<Out>] found : string byref) =
    found <- examples.FirstOrDefault(fun e ->
        e.Equals(search, StringComparison.InvariantCultureIgnoreCase))
    not (String.IsNullOrWhiteSpace(found))

[<EntryPoint>]
let main args =
    let rootNamespace = MethodBase.GetCurrentMethod().DeclaringType.Namespace

    // Find all types in the current assembly that implement IExample
    // and are in or under rootNamespace. Track the fully qualified
    // name of the type after the rootNamespace.
    let examples =
        Assembly.GetExecutingAssembly().GetTypes()
            .Where(fun t ->
                typeof<IExample>.IsAssignableFrom(t) &&
                not t.IsInterface &&
                not t.IsAbstract &&
                t.Namespace.StartsWith(rootNamespace) &&
                ((t.Namespace.Length = rootNamespace.Length) ||
                 (t.Namespace.[rootNamespace.Length] = '.')))
            .Select(fun t -> t.FullName.Substring(rootNamespace.Length + 1))

    match args with
    | [||] ->
        printUsage(examples)
        1
    | _ ->
        let mutable exampleName = String.Empty
        if not (tryFindExample(examples, args.[0], &exampleName)) then
            printUsage(examples)
            1
        else
            let exampleArgs = args.Skip(1).ToArray()
            let exampleType =
                Assembly.GetExecutingAssembly()
                    .GetType(sprintf "%s.%s" rootNamespace exampleName)
            let instance = Activator.CreateInstance(exampleType)
            let method = exampleType.GetMethod("Run")
            method.Invoke(instance, [|exampleArgs|]) :?> int
@@ -0,0 +1,89 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.Spark.Examples.Sql

open System
open Microsoft.Spark.Examples
open Microsoft.Spark.Sql

type Basic() =
    member this.Run(args : string[]) =
        match args with
        | [| filePath |] ->
            let spark = SparkSession.Builder().AppName("Hello F#").GetOrCreate()

            let df = spark.Read().Json(filePath)

            let schema = df.Schema()
            printfn "%s" (schema.SimpleString)

            for row in df.Collect() do
                printfn "%s" (row.ToString())

            df.Show()

            df.PrintSchema()

            df.Select("name", "age", "age", "name").Show()

            df.Select(df.["name"], df.["age"] + 1).Show()

            df.Filter(df.["age"].Gt(21)).Show()

            df.GroupBy("age")
                .Agg(Functions.Avg(df.["age"]),
                     Functions.Avg(df.["age"]),
                     Functions.CountDistinct(df.["age"], df.["age"]))
                .Show()

            // SQL example.
            df.CreateOrReplaceTempView("people")

            // Registering UDF for SQL expression.
            let sqlDf = spark.Sql("SELECT * FROM people")
            sqlDf.Show()

            spark.Udf().Register<Nullable<int>, string, string>(
                "my_udf",
                fun age name ->
                    name + " with " + (if age.HasValue then (string)(age.Value) else "null"))

            let sqlDf = spark.Sql("SELECT my_udf(*) FROM people")
            sqlDf.Show()

            // Using UDF via data frames.
            let addition = Functions.Udf<Nullable<int>, string, string>(
                fun age name ->
                    name + " is " +
                    (if age.HasValue then (string)(age.Value + 10) else "0"))

            df.Select(addition.Invoke(df.["age"], df.["name"])).Show()

            // Chaining example:
            let addition2 = Functions.Udf<string, string>(fun str -> "hello " + str + "!")
            df.Select(addition2.Invoke(addition.Invoke(df.["age"], df.["name"]))).Show()

            // Multiple UDF example:
            df.Select(addition.Invoke(df.["age"], df.["name"]), addition2.Invoke(df.["name"]))
                .Show()

            // Joins.
            let joinedDf = df.Join(df, "name")
            joinedDf.Show()

            let joinedDf2 = df.Join(df, ["name"; "age"] |> List.toSeq)
            joinedDf2.Show()

            let joinedDf3 = df.Join(df, df.["name"].EqualTo(df.["name"]), "outer")
            joinedDf3.Show()

            spark.Stop()
            0
        | _ ->
            printfn "Usage: Basic <path to SPARK_HOME/examples/src/main/resources/people.json>"
            1

    interface IExample with
        member this.Run (args) = this.Run (args)
@@ -0,0 +1,43 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.Spark.Examples.Sql.Streaming

open Microsoft.Spark.Examples
open Microsoft.Spark.Sql

/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_kafka_wordcount.py
/// </summary>
type StructuredKafkaWordCount() =
    member this.Run(args : string[]) =
        match args with
        | [| bootstrapServers; subscribeType; topics |] ->
            let spark = SparkSession.Builder().AppName("StructuredKafkaWordCount").GetOrCreate()

            let lines =
                spark.ReadStream()
                    .Format("kafka")
                    .Option("kafka.bootstrap.servers", bootstrapServers)
                    .Option(subscribeType, topics)
                    .Load()
                    .SelectExpr("CAST(value AS STRING)")

            let words =
                lines.Select(Functions.Explode(Functions.Split(lines.["value"], " "))
                    .Alias("word"))
            let wordCounts = words.GroupBy("word").Count()

            let query = wordCounts.WriteStream().OutputMode("complete").Format("console").Start()

            query.AwaitTermination()

            0
        | _ ->
            printfn "Usage: StructuredKafkaWordCount <bootstrap-servers> <subscribe-type> <topics>"
            1

    interface IExample with
        member this.Run (args) = this.Run (args)
@@ -0,0 +1,48 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.Spark.Examples.Sql.Streaming

open Microsoft.Spark.Examples
open Microsoft.Spark.Sql

/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
type StructuredNetworkWordCount() =
    member this.Run(args : string[]) =
        match args with
        | [| hostname; portStr |] ->
            let port = portStr |> int64

            let spark = SparkSession.Builder().AppName("StructuredNetworkWordCount").GetOrCreate()

            let lines =
                spark.ReadStream()
                    .Format("socket")
                    .Option("host", hostname)
                    .Option("port", port)
                    .Load()

            let words =
                lines.Select(Functions.Explode(Functions.Split(lines.["value"], " "))
                    .Alias("word"))
            let wordCounts = words.GroupBy("word").Count()

            let query = wordCounts.WriteStream().OutputMode("complete").Format("console").Start()

            query.AwaitTermination()

            0
        | _ ->
            printfn "Usage: StructuredNetworkWordCount <hostname> <port>"
            1

    interface IExample with
        member this.Run (args) = this.Run (args)
@@ -0,0 +1,66 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

namespace Microsoft.Spark.Examples.Sql.Streaming

open Microsoft.Spark.Examples
open Microsoft.Spark.Sql

/// <summary>
/// The example is taken/modified from
/// spark/examples/src/main/python/sql/streaming/structured_network_wordcount_windowed.py
///
/// You can set up the data source as follows in a separate terminal:
/// `$ nc -lk 9999`
/// to start writing standard input to port 9999.
/// </summary>
type StructuredNetworkWordCountWindowed() =
    member this.Run(args : string[]) =
        match args with
        | ([| hostname; portStr; windowSizeStr |] | [| hostname; portStr; windowSizeStr; _ |]) ->
            let port = portStr |> int64
            let windowSize = windowSizeStr |> int64
            let slideSize = if (args.Length = 3) then windowSize else (args.[3] |> int64)
            if (slideSize > windowSize) then
                printfn "<slide duration> must be less than or equal to <window duration>"
            let windowDuration = sprintf "%d seconds" windowSize
            let slideDuration = sprintf "%d seconds" slideSize

            let spark =
                SparkSession.Builder().AppName("StructuredNetworkWordCountWindowed").GetOrCreate()

            let lines =
                spark.ReadStream()
                    .Format("socket")
                    .Option("host", hostname)
                    .Option("port", port)
                    .Option("includeTimestamp", true)
                    .Load()

            let words =
                lines.Select(Functions.Explode(Functions.Split(lines.["value"], " "))
                    .Alias("word"), lines.["timestamp"])
            let windowedCounts =
                words.GroupBy(Functions.Window(words.["timestamp"], windowDuration, slideDuration),
                              words.["word"])
                    .Count()
                    .OrderBy("window")

            let query =
                windowedCounts.WriteStream()
                    .OutputMode("complete")
                    .Format("console")
                    .Option("truncate", false)
                    .Start()

            query.AwaitTermination()
            0
        | _ ->
            printfn "Usage: StructuredNetworkWordCountWindowed \
                     <hostname> <port> <window duration in seconds> \
                     [<slide duration in seconds>]"
            1

    interface IExample with
        member this.Run (args) = this.Run (args)
@@ -0,0 +1,23 @@
@echo off

setlocal

set OutputDir=%1
cd %OutputDir%

echo "Download Hadoop binaries for Windows."
curl -k -L -o hadoop.zip https://github.com/steveloughran/winutils/releases/download/tag_2017-08-29-hadoop-2.8.1-native/hadoop-2.8.1.zip
unzip hadoop.zip
mkdir -p hadoop\bin
cp hadoop-2.8.1\winutils.exe hadoop\bin

echo "Downloading Spark distros."

curl -k -L -o spark-2.3.0.tgz https://archive.apache.org/dist/spark/spark-2.3.0/spark-2.3.0-bin-hadoop2.7.tgz && tar xzvf spark-2.3.0.tgz
curl -k -L -o spark-2.3.1.tgz https://archive.apache.org/dist/spark/spark-2.3.1/spark-2.3.1-bin-hadoop2.7.tgz && tar xzvf spark-2.3.1.tgz
curl -k -L -o spark-2.3.2.tgz https://archive.apache.org/dist/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz && tar xzvf spark-2.3.2.tgz
curl -k -L -o spark-2.3.3.tgz https://archive.apache.org/dist/spark/spark-2.3.3/spark-2.3.3-bin-hadoop2.7.tgz && tar xzvf spark-2.3.3.tgz
curl -k -L -o spark-2.4.0.tgz https://archive.apache.org/dist/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz && tar xzvf spark-2.4.0.tgz
curl -k -L -o spark-2.4.1.tgz https://archive.apache.org/dist/spark/spark-2.4.1/spark-2.4.1-bin-hadoop2.7.tgz && tar xzvf spark-2.4.1.tgz

endlocal
@@ -0,0 +1,22 @@
@echo off

setlocal

set Build.SourcesDirectory=%1
set Build.ArtifactStagingDirectory=%2
set Build.Configuration=%3

CALL :PublishWorker net461, win-x64
CALL :PublishWorker netcoreapp2.1, win-x64
CALL :PublishWorker netcoreapp2.1, ubuntu.16.04-x64
CALL :PublishWorker netcoreapp2.1, ubuntu.18.04-x64
EXIT /B %ERRORLEVEL%

:PublishWorker
set Framework=%~1
set Runtime=%~2
mkdir %Build.ArtifactStagingDirectory%\%Framework%\%Runtime%
dotnet publish %Build.SourcesDirectory%\src\csharp\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj --configuration %Build.Configuration% --framework %Framework% --runtime %Runtime% --output %Build.ArtifactStagingDirectory%\%Framework%\%Runtime%
EXIT /B 0

endlocal
@@ -0,0 +1,15 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" TreatAsLocalProperty="ExcludeRestorePackageImports">
  <PropertyGroup>
    <CheckForOverflowUnderflow>false</CheckForOverflowUnderflow>
    <Deterministic>true</Deterministic>
    <Features>strict</Features>
    <LangVersion>latest</LangVersion>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <WarningLevel>4</WarningLevel>
    <RestoreSources>
      https://api.nuget.org/v3/index.json;
      https://dotnet.myget.org/F/dotnet-core/api/v3/index.json;
    </RestoreSources>
  </PropertyGroup>
</Project>
@@ -0,0 +1,49 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests
{
    [Collection("Spark E2E Tests")]
    public class PairRDDFunctionsTests
    {
        private readonly SparkContext _sc;

        public PairRDDFunctionsTests()
        {
            _sc = SparkContext.GetOrCreate(new SparkConf());
        }

        [Fact]
        public void TestCollect()
        {
            RDD<Tuple<string, int>> rdd = _sc.Parallelize(new[] {
                new Tuple<string, int>("a", 1),
                new Tuple<string, int>("b", 2) });

            // Validate CollectAsMap().
            {
                var expected = new Dictionary<string, int>
                {
                    ["a"] = 1,
                    ["b"] = 2
                };

                Assert.Equal(expected, rdd.CollectAsMap());
            }
            // Validate Keys().
            {
                Assert.Equal(new[] { "a", "b" }, rdd.Keys().Collect());
            }

            // Validate Values().
            {
                Assert.Equal(new[] { 1, 2 }, rdd.Values().Collect());
            }
        }
    }
}
@@ -0,0 +1,115 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Linq;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests
{
    [Collection("Spark E2E Tests")]
    public class RDDTests
    {
        private readonly SparkContext _sc;

        public RDDTests()
        {
            _sc = SparkContext.GetOrCreate(new SparkConf());
        }

        [Fact]
        public void TestParallelize()
        {
            {
                RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5));
                Assert.Equal(new[] { 0, 1, 2, 3, 4 }, rdd.Collect());
            }
            {
                var strs = new[] { "hello", "spark", "for", "dotnet" };
                RDD<string> rdd = _sc.Parallelize(strs);
                Assert.Equal(strs, rdd.Collect());
            }
        }

        [Fact]
        public void TestTextFile()
        {
            RDD<string> rdd = _sc.TextFile(TestEnvironment.ResourceDirectory + "people.txt");
            var strs = new[] { "Michael, 29", "Andy, 30", "Justin, 19" };
            Assert.Equal(strs, rdd.Collect());

            // Test a transformation so that SerializedMode is correctly propagated.
            RDD<int> intRdd = rdd.Map(str => 0);
            Assert.Equal(new[] { 0, 0, 0 }, intRdd.Collect());
        }

        [Fact]
        public void TestMap()
        {
            RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5))
                .Map(x => x * 2);

            Assert.Equal(new[] { 0, 2, 4, 6, 8 }, rdd.Collect());
        }

        [Fact]
        public void TestFlatMap()
        {
            RDD<string> rdd = _sc.Parallelize(new[] { "hello spark", "for dotnet" })
                .FlatMap(str => str.Split(new char[] { ' ' }));

            Assert.Equal(new[] { "hello", "spark", "for", "dotnet" }, rdd.Collect());
        }

        [Fact]
        public void TestMapPartitions()
        {
            RDD<string> rdd = _sc.Parallelize(Enumerable.Range(0, 5))
                .MapPartitions(inputs => inputs.Select(input => $"str{input}"));

            Assert.Equal(new[] { "str0", "str1", "str2", "str3", "str4" }, rdd.Collect());
        }

        [Fact]
        public void TestMapPartitionsWithIndex()
        {
            RDD<string> rdd = _sc.Parallelize(Enumerable.Range(0, 3))
                .MapPartitionsWithIndex(
                    (pid, inputs) => inputs.Select(input => $"str_{pid}_{input}"));

            Assert.Equal(new[] { "str_0_0", "str_0_1", "str_0_2" }, rdd.Collect());
        }

        [Fact]
        public void TestPipelinedRDD()
        {
            RDD<string> rdd = _sc.Parallelize(Enumerable.Range(0, 3))
                .Map(i => i + 5)
                .Map(i => i * 2)
                .Map(i => $"str_{i}")
                .FlatMap(str => str.Split(new[] { '_' }));

            Assert.Equal(new[] { "str", "10", "str", "12", "str", "14" }, rdd.Collect());
        }

        [Fact]
        public void TestFilter()
        {
            RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 5))
                .Filter(x => (x % 2) == 0);

            Assert.Equal(new[] { 0, 2, 4 }, rdd.Collect());
        }

        [Fact]
        public void TestSample()
        {
            RDD<int> rdd = _sc.Parallelize(Enumerable.Range(0, 10))
                .Sample(true, 0.9, 0);

            var count = rdd.Collect().Count();
            Assert.True(count > 0);
            Assert.True(count <= 10);
        }
    }
}
@@ -0,0 +1,55 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System.Collections.Generic;
using System.Linq;
using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests
{
    [Collection("Spark E2E Tests")]
    public class SparkConfTests
    {
        private readonly SparkFixture _fixture;

        public SparkConfTests(SparkFixture fixture)
        {
            _fixture = fixture;
        }

        [Fact]
        public void TestSparkConf()
        {
            var sparkConf = new SparkConf(false);

            sparkConf.SetMaster("master");
            sparkConf.SetAppName("test");
            sparkConf.SetSparkHome("test home");
            sparkConf.Set("key_string", "value");
            sparkConf.Set("key_int", "100");

            var expectedConfigs = new Dictionary<string, string>()
            {
                { "spark.master", "master" },
                { "spark.app.name", "test" },
                { "spark.home", "test home" },
                { "key_string", "value" },
                { "key_int", "100" }
            };

            foreach (KeyValuePair<string, string> kv in expectedConfigs)
            {
                Assert.Equal(kv.Value, sparkConf.Get(kv.Key, string.Empty));
            }

            Assert.Equal(100, sparkConf.GetInt("key_int", 0));

            // Validate GetAll().
            Dictionary<string, string> actualAllConfigs =
                sparkConf.GetAll().ToDictionary(x => x.Key, x => x.Value);

            Assert.Equal(expectedConfigs, actualAllConfigs);
        }
    }
}
@@ -0,0 +1,40 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Xunit;

namespace Microsoft.Spark.E2ETest.IpcTests
{
    [Collection("Spark E2E Tests")]
    public class SparkContextTests
    {
        /// <summary>
        /// Test signatures for APIs up to Spark 2.3.*.
        /// </summary>
        /// <remarks>
        /// For the RDD related tests, refer to <see cref="RDDTests"/>.
        /// </remarks>
        [Fact]
        public void TestSignaturesV2_3_X()
        {
            SparkContext sc = SparkContext.GetOrCreate(new SparkConf());

            _ = sc.GetConf();
            _ = sc.DefaultParallelism;

            sc.SetJobDescription("job description");

            sc.SetJobGroup("group id", "description");
            sc.SetJobGroup("group id", "description", true);

            sc.ClearJobGroup();

            string filePath = TestEnvironment.ResourceDirectory + "people.txt";
            sc.AddFile(filePath);
            sc.AddFile(filePath, true);

            sc.SetCheckpointDir(TestEnvironment.ResourceDirectory);
        }
    }
}
@ -0,0 +1,142 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.Expressions.Window;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
|
||||
public class ColumnTests
|
||||
{
|
||||
/// <summary>
|
||||
/// Test signatures for APIs up to Spark 2.3.*.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestSignaturesV2_3_X()
|
||||
{
|
||||
Column col = Column("col");
|
||||
Column col1 = Column("col1");
|
||||
Column col2 = Column("col2");
|
||||
|
||||
col = -col1;
|
||||
|
||||
col = !col;
|
||||
|
||||
col = col1 == col2;
|
||||
col = col1.EqualTo(col2);
|
||||
|
||||
col = col1 != col2;
|
||||
col = col1.NotEqual(col2);
|
||||
|
||||
col = col1 > col2;
|
||||
col = col1 > "hello";
|
||||
col = col1.Gt(col2);
|
||||
col = col1.Gt("hello");
|
||||
|
||||
col = col1 < col2;
|
||||
col = col1 < "hello";
|
||||
col = col1.Lt(col2);
|
||||
col = col1.Lt("hello");
|
||||
|
||||
col = col1 <= col2;
|
||||
col = col1 <= "hello";
|
||||
col = col1.Leq(col2);
|
||||
col = col1.Leq("hello");
|
||||
|
||||
col = col1 >= col2;
|
||||
col = col1 >= "hello";
|
||||
col = col1.Geq(col2);
|
||||
col = col1.Geq("hello");
|
||||
|
||||
col = col1.EqNullSafe(col2);
|
||||
col = col1.EqNullSafe("hello");
|
||||
|
||||
col = When(col1 == col2, 0).When(col1 == col2, 0);
|
||||
|
||||
col = When(col1 == col2, 0).Otherwise(col2);
|
||||
col = When(col1 == col2, 0).Otherwise("hello");
|
||||
|
||||
col = col1.Between(col1, col2);
|
||||
col = col1.Between(1, 3);
|
||||
|
||||
col = col.IsNaN();
|
||||
|
||||
col = col.IsNotNull();
|
||||
|
||||
col = col1 | col2;
|
||||
col = col.Or(col2);
|
||||
|
||||
col = col1 & col2;
|
||||
col = col.And(col2);
|
||||
|
||||
col = col1 + col2;
|
||||
col = col1.Plus(col2);
|
||||
|
||||
col = col1 - col2;
|
||||
col = col1.Minus(col2);
|
||||
|
||||
col = col1 * col2;
|
||||
col = col1.Multiply(col2);
|
||||
|
||||
col = col1 / col2;
|
||||
col = col1.Divide(col2);
|
||||
|
||||
col = col1 % col2;
|
||||
col = col1.Mod(col2);
|
||||
|
||||
col = col1.Like("hello");
|
||||
|
||||
col = col1.RLike("hello");
|
||||
|
||||
col = col1.GetItem(1);
|
||||
col = col1.GetItem("key");
|
||||
|
||||
col = col1.GetField("field");
|
||||
|
||||
col = col1.SubStr(col1, col2);
|
||||
col = col1.SubStr(0, 5);
|
||||
|
||||
col = col1.Contains(col2);
|
||||
col = col1.Contains("hello");
|
||||
|
||||
col = col1.StartsWith(col2);
|
||||
col = col1.StartsWith("hello");
|
||||
|
||||
col = col1.EndsWith(col2);
|
||||
col = col1.EndsWith("hello");
|
||||
|
||||
col = col1.Alias("alias");
|
||||
|
||||
col = col1.As("alias");
|
||||
col = col1.As(new string[] { });
|
||||
col = col1.As(new[] { "alias1", "alias2" });
|
||||
|
||||
col = col1.Name("alias");
|
||||
|
||||
col = col1.Cast("string");
|
||||
|
||||
col = col1.Desc();
|
||||
col = col1.DescNullsFirst();
|
||||
col = col1.DescNullsLast();
|
||||
|
||||
col = col1.Asc();
|
||||
col = col1.AscNullsFirst();
|
||||
col = col1.AscNullsLast();
|
||||
|
||||
col.Explain(true);
|
||||
|
||||
col = col1.BitwiseOR(col2);
|
||||
|
||||
col = col1.BitwiseAND(col2);
|
||||
|
||||
col = col1.BitwiseXOR(col2);
|
||||
|
||||
col = col1.Over(PartitionBy(col1));
|
||||
col = col1.Over();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,95 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
|
||||
public class DataFrameFunctionsTests
|
||||
{
|
||||
private readonly SparkSession _spark;
|
||||
private readonly DataFrame _df;
|
||||
|
||||
public DataFrameFunctionsTests(SparkFixture fixture)
|
||||
{
|
||||
_spark = fixture.Spark;
|
||||
_df = _spark
|
||||
.Read()
|
||||
.Json(TestEnvironment.ResourceDirectory + "people.json");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestDataFrameNaFunctionSignatures()
|
||||
{
|
||||
DataFrameNaFunctions dfNaFuncs = _df.Na();
|
||||
|
||||
var emptyColumn = new string[] { };
|
||||
var validColumn = new string[] { "age" };
|
||||
|
||||
DataFrame df = dfNaFuncs.Drop("any");
|
||||
df = dfNaFuncs.Drop("all");
|
||||
df = dfNaFuncs.Drop(emptyColumn);
|
||||
df = dfNaFuncs.Drop(validColumn);
|
||||
df = dfNaFuncs.Drop("any", emptyColumn);
|
||||
df = dfNaFuncs.Drop("all", validColumn);
|
||||
df = dfNaFuncs.Drop(20);
|
||||
df = dfNaFuncs.Drop(20, emptyColumn);
|
||||
df = dfNaFuncs.Drop(20, validColumn);
|
||||
|
||||
df = dfNaFuncs.Fill(100L);
|
||||
df = dfNaFuncs.Fill(100.0);
|
||||
df = dfNaFuncs.Fill("hello");
|
||||
df = dfNaFuncs.Fill(false);
|
||||
df = dfNaFuncs.Fill(100L, emptyColumn);
|
||||
df = dfNaFuncs.Fill(100L, validColumn);
|
||||
df = dfNaFuncs.Fill(100.0, emptyColumn);
|
||||
df = dfNaFuncs.Fill(100.0, validColumn);
|
||||
df = dfNaFuncs.Fill("hello", emptyColumn);
|
||||
df = dfNaFuncs.Fill("hello", validColumn);
|
||||
df = dfNaFuncs.Fill(true, emptyColumn);
|
||||
df = dfNaFuncs.Fill(true, validColumn);
|
||||
df = dfNaFuncs.Fill(new Dictionary<string, int>() { { "age", 10 } });
|
||||
df = dfNaFuncs.Fill(new Dictionary<string, long>() { { "age", 10L } });
|
||||
df = dfNaFuncs.Fill(new Dictionary<string, double>() { { "age", 10.0 } });
|
||||
df = dfNaFuncs.Fill(new Dictionary<string, string>() { { "age", "name" } });
|
||||
df = dfNaFuncs.Fill(new Dictionary<string, bool>() { { "age", false } });
|
||||
|
||||
var doubleReplacement = new Dictionary<double, double>() { { 1.0, 5.0 } };
|
||||
var boolReplacement = new Dictionary<bool, bool>() { { true, false } };
|
||||
var stringReplacement = new Dictionary<string, string>() { { "a", "b" } };
|
||||
|
||||
df = dfNaFuncs.Replace("age", doubleReplacement);
|
||||
df = dfNaFuncs.Replace("age", boolReplacement);
|
||||
df = dfNaFuncs.Replace("age", stringReplacement);
|
||||
df = dfNaFuncs.Replace(emptyColumn, doubleReplacement);
|
||||
df = dfNaFuncs.Replace(validColumn, doubleReplacement);
|
||||
df = dfNaFuncs.Replace(emptyColumn, boolReplacement);
|
||||
df = dfNaFuncs.Replace(validColumn, boolReplacement);
|
||||
df = dfNaFuncs.Replace(emptyColumn, stringReplacement);
|
||||
df = dfNaFuncs.Replace(validColumn, stringReplacement);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestDataFrameStatFunctionSignatures()
|
||||
{
|
||||
DataFrameStatFunctions stat = _df.Stat();
|
||||
|
||||
double[] result = stat.ApproxQuantile("age", new[] { 0.5, 0.5 }, 0.3);
|
||||
|
||||
double cov = stat.Cov("age", "age");
|
||||
|
||||
double corr = stat.Corr("age", "age", "pearson");
|
||||
corr = stat.Corr("age", "age");
|
||||
|
||||
var columnNames = new[] { "age", "name" };
|
||||
DataFrame df = stat.FreqItems(columnNames, 0.2);
|
||||
df = stat.FreqItems(columnNames);
|
||||
|
||||
df = stat.SampleBy("age", new Dictionary<int, double> { { 1, 0.5 } }, 100);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,329 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using Microsoft.Spark.E2ETest.Utils;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Types;
|
||||
using Microsoft.Spark.Utils;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
|
||||
public class DataFrameTests
|
||||
{
|
||||
private static FieldInfo s_udfUtilsUseArrow =
|
||||
typeof(UdfUtils).GetField("s_useArrow", BindingFlags.Static | BindingFlags.NonPublic);
|
||||
|
||||
private readonly SparkSession _spark;
|
||||
private readonly DataFrame _df;
|
||||
|
||||
public DataFrameTests(SparkFixture fixture)
|
||||
{
|
||||
_spark = fixture.Spark;
|
||||
_df = _spark
|
||||
.Read()
|
||||
.Schema("age INT, name STRING")
|
||||
.Json(TestEnvironment.ResourceDirectory + "people.json");
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestCollect()
|
||||
{
|
||||
Row[] rows = _df.Collect().ToArray();
|
||||
Assert.Equal(3, rows.Length);
|
||||
|
||||
Row row1 = rows[0];
|
||||
Assert.Equal("Michael", row1.GetAs<string>("name"));
|
||||
Assert.Null(row1.Get("age"));
|
||||
|
||||
Row row2 = rows[1];
|
||||
Assert.Equal("Andy", row2.GetAs<string>("name"));
|
||||
Assert.Equal(30, row2.GetAs<int>("age"));
|
||||
|
||||
Row row3 = rows[2];
|
||||
Assert.Equal("Justin", row3.GetAs<string>("name"));
|
||||
Assert.Equal(19, row3.GetAs<int>("age"));
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(true)]
|
||||
[InlineData(false)]
|
||||
public void TestUDF(bool useArrow)
|
||||
{
|
||||
bool originalUseArrow = GetUseArrowValue();
|
||||
SetUseArrowValue(useArrow);
|
||||
|
||||
try
|
||||
{
|
||||
// Single UDF.
|
||||
Func<Column, Column, Column> udf1 = Udf<int?, string, string>(
|
||||
(age, name) => name + " is " + (age ?? 0));
|
||||
{
|
||||
Row[] rows = _df.Select(udf1(_df["age"], _df["name"])).Collect().ToArray();
|
||||
Assert.Equal(3, rows.Length);
|
||||
Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
|
||||
Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
|
||||
Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
|
||||
}
|
||||
|
||||
// Chained UDFs.
|
||||
Func<Column, Column> udf2 = Udf<string, string>(str => $"hello {str}!");
|
||||
{
|
||||
Row[] rows = _df
|
||||
.Select(udf2(udf1(_df["age"], _df["name"])))
|
||||
.Collect()
|
||||
.ToArray();
|
||||
Assert.Equal(3, rows.Length);
|
||||
Assert.Equal("hello Michael is 0!", rows[0].GetAs<string>(0));
|
||||
Assert.Equal("hello Andy is 30!", rows[1].GetAs<string>(0));
|
||||
Assert.Equal("hello Justin is 19!", rows[2].GetAs<string>(0));
|
||||
}
|
||||
|
||||
// Multiple UDFs:
|
||||
{
|
||||
Row[] rows = _df
|
||||
.Select(udf1(_df["age"], _df["name"]), udf2(_df["name"]))
|
||||
.Collect()
|
||||
.ToArray();
|
||||
Assert.Equal(3, rows.Length);
|
||||
Assert.Equal("Michael is 0", rows[0].GetAs<string>(0));
|
||||
Assert.Equal("hello Michael!", rows[0].GetAs<string>(1));
|
||||
|
||||
Assert.Equal("Andy is 30", rows[1].GetAs<string>(0));
|
||||
Assert.Equal("hello Andy!", rows[1].GetAs<string>(1));
|
||||
|
||||
Assert.Equal("Justin is 19", rows[2].GetAs<string>(0));
|
||||
Assert.Equal("hello Justin!", rows[2].GetAs<string>(1));
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
SetUseArrowValue(originalUseArrow);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test signatures for APIs up to Spark 2.3.*.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestSignaturesV2_3_X()
|
||||
{
|
||||
Column col = _df["name"];
|
||||
col = _df["age"];
|
||||
|
||||
DataFrame df = _df.ToDF();
|
||||
df = df.ToDF("name2", "age2");
|
||||
|
||||
StructType schema = _df.Schema();
|
||||
Assert.NotNull(schema);
|
||||
|
||||
_df.PrintSchema();
|
||||
|
||||
_df.Explain();
|
||||
_df.Explain(true);
|
||||
_df.Explain(false);
|
||||
|
||||
Assert.Equal(2, _df.Columns().ToArray().Length);
|
||||
|
||||
_df.IsLocal();
|
||||
|
||||
_df.IsStreaming();
|
||||
|
||||
// The following is required for *CheckPoint().
|
||||
_spark.SparkContext.SetCheckpointDir(TestEnvironment.ResourceDirectory);
|
||||
|
||||
_df.Checkpoint();
|
||||
_df.Checkpoint(false);
|
||||
|
||||
_df.LocalCheckpoint();
|
||||
_df.LocalCheckpoint(false);
|
||||
|
||||
_df.WithWatermark("time", "10 minutes");
|
||||
|
||||
_df.Show();
|
||||
_df.Show(10);
|
||||
_df.Show(10, 10);
|
||||
_df.Show(10, 10, true);
|
||||
|
||||
_df.Join(_df);
|
||||
_df.Join(_df, "name");
|
||||
_df.Join(_df, new[] { "name" });
|
||||
_df.Join(_df, new[] { "name" }, "outer");
|
||||
_df.Join(_df, _df["age"] == _df["age"]);
|
||||
_df.Join(_df, _df["age"] == _df["age"], "outer");
|
||||
|
||||
_df.CrossJoin(_df);
|
||||
|
||||
_df.SortWithinPartitions("age");
|
||||
_df.SortWithinPartitions("age", "name");
|
||||
_df.SortWithinPartitions();
|
||||
_df.SortWithinPartitions(_df["age"]);
|
||||
_df.SortWithinPartitions(_df["age"], _df["name"]);
|
||||
|
||||
_df.Sort("age");
|
||||
_df.Sort("age", "name");
|
||||
_df.Sort();
|
||||
_df.Sort(_df["age"]);
|
||||
_df.Sort(_df["age"], _df["name"]);
|
||||
|
||||
_df.OrderBy("age");
|
||||
_df.OrderBy("age", "name");
|
||||
_df.OrderBy();
|
||||
_df.OrderBy(_df["age"]);
|
||||
_df.OrderBy(_df["age"], _df["name"]);
|
||||
|
||||
_df.Hint("broadcast");
|
||||
_df.Hint("broadcast", new[] { "hello", "world" });
|
||||
|
||||
_df.Col("age");
|
||||
|
||||
_df.ColRegex("age");
|
||||
|
||||
_df.As("alias");
|
||||
|
||||
_df.Alias("alias");
|
||||
|
||||
_df.Select("age");
|
||||
_df.Select("age", "name");
|
||||
_df.Select();
|
||||
_df.Select(_df["age"]);
|
||||
_df.Select(_df["age"], _df["name"]);
|
||||
|
||||
_df.SelectExpr();
|
||||
_df.SelectExpr("age * 2");
|
||||
_df.SelectExpr("age * 2", "abs(age)");
|
||||
|
||||
_df.Filter(_df["age"] > 21);
|
||||
_df.Filter("age > 21");
|
||||
|
||||
_df.Where(_df["age"] > 21);
|
||||
_df.Where("age > 21");
|
||||
|
||||
_df.GroupBy("age");
|
||||
_df.GroupBy("age", "name");
|
||||
_df.GroupBy();
|
||||
_df.GroupBy(_df["age"]);
|
||||
_df.GroupBy(_df["age"], _df["name"]);
|
||||
|
||||
_df.Rollup("age");
|
||||
_df.Rollup("age", "name");
|
||||
_df.Rollup();
|
||||
_df.Rollup(_df["age"]);
|
||||
_df.Rollup(_df["age"], _df["name"]);
|
||||
|
||||
_df.Cube("age");
|
||||
_df.Cube("age", "name");
|
||||
_df.Cube();
|
||||
_df.Cube(_df["age"]);
|
||||
_df.Cube(_df["age"], _df["name"]);
|
||||
|
||||
_df.Agg(Avg(_df["age"]));
|
||||
_df.Agg(Avg(_df["age"]), Avg(_df["name"]));
|
||||
|
||||
_df.Limit(10);
|
||||
|
||||
_df.Union(_df);
|
||||
|
||||
_df.UnionByName(_df);
|
||||
|
||||
_df.Intersect(_df);
|
||||
|
||||
_df.Except(_df);
|
||||
|
||||
_df.Sample(0.5);
|
||||
_df.Sample(0.5, true);
|
||||
_df.Sample(0.5, false, 12345);
|
||||
|
||||
_df.RandomSplit(new[] { 0.2, 0.8 });
|
||||
_df.RandomSplit(new[] { 0.2, 0.8 }, 12345);
|
||||
|
||||
_df.WithColumn("age2", _df["age"]);
|
||||
|
||||
_df.WithColumnRenamed("age", "age2");
|
||||
|
||||
_df.Drop();
|
||||
_df.Drop("age");
|
||||
_df.Drop("age", "name");
|
||||
|
||||
_df.Drop(_df["age"]);
|
||||
|
||||
_df.DropDuplicates();
|
||||
_df.DropDuplicates("age");
|
||||
_df.DropDuplicates("age", "name");
|
||||
|
||||
_df.Describe();
|
||||
_df.Describe("age");
|
||||
_df.Describe("age", "name");
|
||||
|
||||
_df.Summary();
|
||||
_df.Summary("count");
|
||||
_df.Summary("count", "mean");
|
||||
|
||||
_df.Head(2);
|
||||
_df.Head();
|
||||
|
||||
_df.First();
|
||||
|
||||
_df.Take(3).ToArray();
|
||||
|
||||
_df.Collect().ToArray();
|
||||
|
||||
_df.ToLocalIterator().ToArray();
|
||||
|
||||
_df.Count();
|
||||
|
||||
_df.Repartition(2);
|
||||
_df.Repartition(2, _df["age"]);
|
||||
_df.Repartition(_df["age"]);
|
||||
_df.Repartition();
|
||||
|
||||
_df.RepartitionByRange(2, _df["age"]);
|
||||
_df.RepartitionByRange(_df["age"]);
|
||||
|
||||
_df.Coalesce(1);
|
||||
|
||||
_df.Distinct();
|
||||
|
||||
_df.Persist();
|
||||
|
||||
_df.Cache();
|
||||
|
||||
_df.Unpersist();
|
||||
|
||||
_df.CreateTempView("view");
|
||||
_df.CreateOrReplaceTempView("view");
|
||||
|
||||
_df.CreateGlobalTempView("global_view");
|
||||
_df.CreateOrReplaceGlobalTempView("global_view");
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test signatures for APIs introduced in Spark 2.4.*.
|
||||
/// </summary>
|
||||
[SkipIfSparkVersionIsLessThan(Versions.V2_4_0)]
|
||||
public void TestSignaturesV2_4_X()
|
||||
{
|
||||
_df.IsEmpty();
|
||||
|
||||
_df.IntersectAll(_df);
|
||||
|
||||
_df.ExceptAll(_df);
|
||||
}
|
||||
|
||||
private static bool GetUseArrowValue()
|
||||
{
|
||||
return (bool)s_udfUtilsUseArrow.GetValue(null);
|
||||
}
|
||||
|
||||
private static void SetUseArrowValue(bool value)
|
||||
{
|
||||
s_udfUtilsUseArrow.SetValue(null, value);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,48 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Expressions;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.Expressions.Window;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
|
||||
public class WindowSpecTests
|
||||
{
|
||||
/// <summary>
|
||||
/// Test signatures for APIs up to Spark 2.3.*.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestSignaturesV2_3_X()
|
||||
{
|
||||
Column col1 = Column("age");
|
||||
Column col2 = Column("name");
|
||||
WindowSpec windowSpec = PartitionBy("age");
|
||||
|
||||
windowSpec = windowSpec.PartitionBy("age");
|
||||
windowSpec = windowSpec.PartitionBy("age", "name");
|
||||
windowSpec = windowSpec.PartitionBy();
|
||||
windowSpec = windowSpec.PartitionBy(col1);
|
||||
windowSpec = windowSpec.PartitionBy(col1, col2);
|
||||
|
||||
windowSpec = windowSpec.OrderBy("age");
|
||||
windowSpec = windowSpec.OrderBy("age", "name");
|
||||
windowSpec = windowSpec.OrderBy();
|
||||
windowSpec = windowSpec.OrderBy(col1);
|
||||
windowSpec = windowSpec.OrderBy(col1, col2);
|
||||
|
||||
windowSpec = windowSpec.RowsBetween(
|
||||
Sql.Expressions.Window.UnboundedPreceding,
|
||||
Sql.Expressions.Window.UnboundedFollowing);
|
||||
|
||||
windowSpec = windowSpec.RangeBetween(
|
||||
Sql.Expressions.Window.UnboundedPreceding,
|
||||
Sql.Expressions.Window.UnboundedFollowing);
|
||||
windowSpec = windowSpec.RangeBetween(UnboundedPreceding(), UnboundedFollowing());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Expressions;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.Expressions.Window;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
|
||||
public class WindowTests
|
||||
{
|
||||
/// <summary>
|
||||
/// Test signatures for APIs up to Spark 2.3.*.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestSignaturesV2_3_X()
|
||||
{
|
||||
Column col1 = Column("age");
|
||||
Column col2 = Column("name");
|
||||
|
||||
_ = Sql.Expressions.Window.UnboundedPreceding;
|
||||
_ = Sql.Expressions.Window.UnboundedFollowing;
|
||||
_ = Sql.Expressions.Window.CurrentRow;
|
||||
|
||||
WindowSpec windowSpec = PartitionBy("age");
|
||||
windowSpec = PartitionBy("age", "name");
|
||||
windowSpec = PartitionBy();
|
||||
windowSpec = PartitionBy(col1);
|
||||
windowSpec = PartitionBy(col1, col2);
|
||||
|
||||
windowSpec = OrderBy("age");
|
||||
windowSpec = OrderBy("age", "name");
|
||||
windowSpec = OrderBy();
|
||||
windowSpec = OrderBy(col1);
|
||||
windowSpec = OrderBy(col1, col2);
|
||||
|
||||
windowSpec = RowsBetween(
|
||||
Sql.Expressions.Window.UnboundedPreceding,
|
||||
Sql.Expressions.Window.UnboundedFollowing);
|
||||
|
||||
windowSpec = RangeBetween(
|
||||
Sql.Expressions.Window.UnboundedPreceding,
|
||||
Sql.Expressions.Window.UnboundedFollowing);
|
||||
windowSpec = RangeBetween(UnboundedPreceding(), UnboundedFollowing());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,703 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using Microsoft.Spark.E2ETest.Utils;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.Functions;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.IpcTests
|
||||
{
|
||||
[Collection("Spark E2E Tests")]
|
||||
public class FunctionsTests
|
||||
{
|
||||
private readonly SparkSession _spark;
|
||||
|
||||
public FunctionsTests(SparkFixture fixture)
|
||||
{
|
||||
_spark = fixture.Spark;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test signatures for APIs up to Spark 2.3.*.
|
||||
/// The purpose of this test is to ensure that JVM calls can be successfully made.
|
||||
/// Note that this is not testing functionality of each function.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestSignaturesV2_3_X()
|
||||
{
|
||||
//////////////////////////////
|
||||
// Basic Functions
|
||||
//////////////////////////////
|
||||
|
||||
Column col = Column("col1");
|
||||
|
||||
col = Col("col2");
|
||||
|
||||
col = Lit(1);
|
||||
col = Lit("some column");
|
||||
col = Lit(col);
|
||||
|
||||
//////////////////////////////
|
||||
// Sort Functions
|
||||
//////////////////////////////
|
||||
col = Asc("col");
|
||||
|
||||
col = AscNullsFirst("col");
|
||||
|
||||
col = AscNullsLast("col");
|
||||
|
||||
col = Desc("col");
|
||||
|
||||
col = DescNullsFirst("col");
|
||||
|
||||
col = DescNullsLast("col");
|
||||
|
||||
//////////////////////////////
|
||||
// Aggregate Functions
|
||||
//////////////////////////////
|
||||
col = Column("col");
|
||||
|
||||
col = ApproxCountDistinct(col);
|
||||
col = ApproxCountDistinct("col");
|
||||
col = ApproxCountDistinct(col, 0.05);
|
||||
col = ApproxCountDistinct("col", 0.05);
|
||||
|
||||
col = Avg(col);
|
||||
col = Avg("col");
|
||||
|
||||
col = CollectList(col);
|
||||
col = CollectList("col");
|
||||
|
||||
col = CollectSet(col);
|
||||
col = CollectSet("col");
|
||||
|
||||
col = Corr(col, col);
|
||||
col = Corr("col1", "col2");
|
||||
|
||||
col = Count(col);
|
||||
col = Count("col");
|
||||
|
||||
col = CountDistinct(col);
|
||||
col = CountDistinct(col, col);
|
||||
col = CountDistinct(col, col, col);
|
||||
col = CountDistinct("col1");
|
||||
col = CountDistinct("col1", "col2");
|
||||
col = CountDistinct("col1", "col2", "col3");
|
||||
|
||||
col = CovarPop(col, col);
|
||||
col = CovarPop("col1", "col2");
|
||||
|
||||
col = CovarSamp(col, col);
|
||||
col = CovarSamp("col1", "col2");
|
||||
|
||||
col = First(col);
|
||||
col = First(col, true);
|
||||
col = First(col, false);
|
||||
col = First("col");
|
||||
col = First("col", true);
|
||||
col = First("col", false);
|
||||
|
||||
col = Grouping(col);
|
||||
col = Grouping("col");
|
||||
|
||||
col = GroupingId();
|
||||
col = GroupingId(col);
|
||||
col = GroupingId(col, col);
|
||||
col = GroupingId("col1");
|
||||
col = GroupingId("col1", "col2");
|
||||
col = GroupingId("col1", "col2", "col3");
|
||||
|
||||
col = Kurtosis(col);
|
||||
col = Kurtosis("col");
|
||||
|
||||
col = Last(col);
|
||||
col = Last(col, true);
|
||||
col = Last(col, false);
|
||||
col = Last("col");
|
||||
col = Last("col", true);
|
||||
col = Last("col", false);
|
||||
|
||||
col = Max(col);
|
||||
col = Max("col");
|
||||
|
||||
col = Mean(col);
|
||||
col = Mean("col");
|
||||
|
||||
col = Min(col);
|
||||
col = Min("col");
|
||||
|
||||
col = Skewness(col);
|
||||
col = Skewness("col");
|
||||
|
||||
col = Stddev(col);
|
||||
col = Stddev("col");
|
||||
|
||||
col = StddevSamp(col);
|
||||
col = StddevSamp("col");
|
||||
|
||||
col = StddevPop(col);
|
||||
col = StddevPop("col");
|
||||
|
||||
col = Sum(col);
|
||||
col = Sum("col");
|
||||
|
||||
col = SumDistinct(col);
|
||||
col = SumDistinct("col");
|
||||
|
||||
col = Variance(col);
|
||||
col = Variance("col");
|
||||
|
||||
col = VarSamp(col);
|
||||
col = VarSamp("col");
|
||||
|
||||
col = VarPop(col);
|
||||
col = VarPop("col");
|
||||
|
||||
//////////////////////////////
|
||||
// Window Functions
|
||||
//////////////////////////////
|
||||
col = UnboundedPreceding();
|
||||
|
||||
col = UnboundedFollowing();
|
||||
|
||||
col = CurrentRow();
|
||||
|
||||
col = CumeDist();
|
||||
|
||||
col = DenseRank();
|
||||
|
||||
col = Lag(col, 0);
|
||||
col = Lag(col, 2, "col2");
|
||||
col = Lag("col", 0);
|
||||
col = Lag("col", 2, "col2");
|
||||
|
||||
col = Lead(col, 0);
|
||||
col = Lead(col, 2, "col2");
|
||||
col = Lead("col", 0);
|
||||
col = Lead("col", 2, "col2");
|
||||
|
||||
col = Ntile(100);
|
||||
|
||||
col = PercentRank();
|
||||
|
||||
col = Rank();
|
||||
|
||||
col = RowNumber();
|
||||
|
||||
//////////////////////////////
|
||||
// Non-Aggregate Functions
|
||||
//////////////////////////////
|
||||
col = Column("col");
|
||||
|
||||
col = Abs(col);
|
||||
|
||||
col = Array();
|
||||
col = Array(col);
|
||||
col = Array(col, col);
|
||||
col = Array("col1");
|
||||
col = Array("col1", "col2");
|
||||
col = Array("col1", "col2", "col3");
|
||||
|
||||
col = Map();
|
||||
col = Map(col);
|
||||
col = Map(col, col);
|
||||
|
||||
DataFrame df = _spark
|
||||
.Read()
|
||||
.Json(TestEnvironment.ResourceDirectory + "people.json");
|
||||
df = Broadcast(df);
|
||||
|
||||
col = Coalesce();
|
||||
col = Coalesce(col);
|
||||
col = Coalesce(col, col);
|
||||
|
||||
col = InputFileName();
|
||||
|
||||
col = IsNaN(col);
|
||||
|
||||
col = IsNull(col);
|
||||
|
||||
col = MonotonicallyIncreasingId();
|
||||
|
||||
col = NaNvl(col, col);
|
||||
|
||||
col = Negate(col);
|
||||
|
||||
col = Not(col);
|
||||
|
||||
col = Rand(12345);
|
||||
col = Rand();
|
||||
|
||||
col = Randn(12345);
|
||||
col = Randn();
|
||||
|
||||
col = SparkPartitionId();
|
||||
|
||||
col = Sqrt(col);
|
||||
col = Sqrt("col");
|
||||
|
||||
col = Struct();
|
||||
col = Struct(col);
|
||||
col = Struct(col, col);
|
||||
col = Struct("col1");
|
||||
col = Struct("col1", "col2");
|
||||
col = Struct("col1", "col2", "col3");
|
||||
|
||||
col = When(col, col);
|
||||
col = When(col, "col");
|
||||
col = When(col, 12345);
|
||||
|
||||
col = BitwiseNOT(col);
|
||||
|
||||
col = Expr("expr");
|
||||
|
||||
//////////////////////////////
|
||||
// Math Functions
|
||||
//////////////////////////////
|
||||
col = Column("col");
|
||||
|
||||
col = Acos(col);
|
||||
col = Acos("col");
|
||||
|
||||
col = Asin(col);
|
||||
col = Asin("col");
|
||||
|
||||
col = Atan(col);
|
||||
col = Atan("col");
|
||||
|
||||
col = Atan2(col, col);
|
||||
col = Atan2(col, "x");
|
||||
col = Atan2("y", col);
|
||||
col = Atan2("y", "x");
|
||||
col = Atan2(col, 0.5);
|
||||
col = Atan2("y", 0.5);
|
||||
col = Atan2(0.5, col);
|
||||
col = Atan2(0.5, "x");
|
||||
|
||||
col = Bin(col);
|
||||
col = Bin("col");
|
||||
|
||||
col = Cbrt(col);
|
||||
col = Cbrt("col");
|
||||
|
||||
col = Ceil(col);
|
||||
col = Ceil("col");
|
||||
|
||||
col = Conv(col, 2, 10);
|
||||
|
||||
col = Cos(col);
|
||||
col = Cos("col");
|
||||
|
||||
col = Cosh(col);
|
||||
col = Cosh("col");
|
||||
|
||||
col = Exp(col);
|
||||
col = Exp("col");
|
||||
|
||||
col = Expm1(col);
|
||||
col = Expm1("col");
|
||||
|
||||
col = Factorial(col);
|
||||
|
||||
col = Floor(col);
|
||||
col = Floor("col");
|
||||
|
||||
col = Greatest();
|
||||
col = Greatest(col);
|
||||
col = Greatest(col, col);
|
||||
col = Greatest("col1");
|
||||
col = Greatest("col1", "col2");
|
||||
col = Greatest("col1", "col2", "col3");
|
||||
|
||||
col = Hex(col);
|
||||
|
||||
col = Unhex(col);
|
||||
|
||||
col = Hypot(col, col);
|
||||
col = Hypot(col, "right");
|
||||
col = Hypot("left", col);
|
||||
col = Hypot("left", "right");
|
||||
col = Hypot(col, 0.5);
|
||||
col = Hypot("left", 0.5);
|
||||
col = Hypot(0.5, col);
|
||||
col = Hypot(0.5, "right");
|
||||
|
||||
col = Least();
|
||||
col = Least(col);
|
||||
col = Least(col, col);
|
||||
col = Least("col1");
|
||||
col = Least("col1", "col2");
|
||||
col = Least("col1", "col2", "col3");
|
||||
|
||||
col = Log(col);
|
||||
col = Log("col");
|
||||
col = Log(2.0, col);
|
||||
col = Log(2.0, "col");
|
||||
|
||||
col = Log10(col);
|
||||
col = Log10("col");
|
||||
|
||||
col = Log1p(col);
|
||||
col = Log1p("col");
|
||||
|
||||
col = Log2(col);
|
||||
col = Log2("col");
|
||||
|
||||
col = Pow(col, col);
|
||||
col = Pow(col, "right");
|
||||
col = Pow("left", col);
|
||||
col = Pow("left", "right");
|
||||
col = Pow(col, 0.5);
|
||||
col = Pow("left", 0.5);
|
||||
col = Pow(0.5, col);
|
||||
col = Pow(0.5, "right");
|
||||
|
||||
col = Pmod(col, col);
|
||||
|
||||
col = Rint(col);
|
||||
col = Rint("col");
|
||||
|
||||
col = Round(col);
|
||||
col = Round(col, 10);
|
||||
|
||||
col = Bround(col);
|
||||
col = Bround(col, 10);
|
||||
|
||||
col = ShiftLeft(col, 4);
|
||||
|
||||
col = ShiftRight(col, 4);
|
||||
|
||||
col = ShiftRightUnsigned(col, 4);
|
||||
|
||||
col = Signum(col);
|
||||
col = Signum("col");
|
||||
|
||||
col = Sin(col);
|
||||
col = Sin("col");
|
||||
|
||||
col = Sinh(col);
|
||||
col = Sinh("col");
|
||||
|
||||
col = Tan(col);
|
||||
col = Tan("col");
|
||||
|
||||
col = Tanh(col);
|
||||
col = Tanh("col");
|
||||
|
||||
col = Degrees(col);
|
||||
col = Degrees("col");
|
||||
|
||||
col = Radians(col);
|
||||
col = Radians("col");
|
||||
|
||||
//////////////////////////////
|
||||
// Miscellaneous Functions
|
||||
//////////////////////////////
|
||||
col = Md5(col);
|
||||
|
||||
col = Sha1(col);
|
||||
|
||||
col = Sha2(col, 224);
|
||||
|
||||
col = Crc32(col);
|
||||
|
||||
col = Hash();
|
||||
col = Hash(col);
|
||||
col = Hash(col, col);
|
||||
|
||||
//////////////////////////////
|
||||
// String Functions
|
||||
//////////////////////////////
|
||||
col = Ascii(col);
|
||||
|
||||
col = Base64(col);
|
||||
|
||||
col = ConcatWs(";");
|
||||
col = ConcatWs(";", col);
|
||||
col = ConcatWs(";", col, col);
|
||||
|
||||
col = Decode(col, "UTF-8");
|
||||
|
||||
col = Encode(col, "UTF-8");
|
||||
|
||||
col = FormatNumber(col, 2);
|
||||
|
||||
col = FormatString("%s %d");
|
||||
col = FormatString("%s %d", col);
|
||||
col = FormatString("%s %d", col, col);
|
||||
|
||||
col = InitCap(col);
|
||||
|
||||
col = Instr(col, "abc");
|
||||
|
||||
col = Length(col);
|
||||
|
||||
col = Lower(col);
|
||||
|
||||
col = Levenshtein(col, col);
|
||||
|
||||
col = Locate("abc", col);
|
||||
col = Locate("abc", col, 3);
|
||||
|
||||
col = Lpad(col, 3, "pad");
|
||||
|
||||
col = Ltrim(col);
|
||||
col = Ltrim(col, "\n");
|
||||
|
||||
col = RegexpExtract(col, "[a-z]", 0);
|
||||
|
||||
col = RegexpReplace(col, "[a-z]", "hello");
|
||||
col = RegexpReplace(col, col, col);
|
||||
|
||||
col = Unbase64(col);
|
||||
|
||||
col = Rpad(col, 3, "pad");
|
||||
|
||||
col = Repeat(col, 3);
|
||||
|
||||
col = Rtrim(col);
|
||||
col = Rtrim(col, "\n");
|
||||
|
||||
col = Soundex(col);
|
||||
|
||||
col = Split(col, "\t");
|
||||
|
||||
col = Substring(col, 0, 5);
|
||||
|
||||
col = SubstringIndex(col, ";", 5);
|
||||
|
||||
col = Translate(col, "abc", "edf");
|
||||
|
||||
col = Trim(col);
|
||||
col = Trim(col, "\n");
|
||||
|
||||
col = Upper(col);
|
||||
|
||||
//////////////////////////////
|
||||
// DateTime Functions
|
||||
//////////////////////////////
|
||||
col = AddMonths(col, 3);
|
||||
|
||||
col = CurrentDate();
|
||||
|
||||
col = CurrentTimestamp();
|
||||
|
||||
col = DateFormat(col, "format");
|
||||
|
||||
col = DateAdd(col, 5);
|
||||
|
||||
col = DateSub(col, 5);
|
||||
|
||||
col = DateDiff(col, col);
|
||||
|
||||
col = Year(col);
|
||||
|
||||
col = Quarter(col);
|
||||
|
||||
col = Month(col);
|
||||
|
||||
col = DayOfWeek(col);
|
||||
|
||||
col = DayOfMonth(col);
|
||||
|
||||
col = DayOfYear(col);
|
||||
|
||||
col = Hour(col);
|
||||
|
||||
col = LastDay(col);
|
||||
|
||||
col = Minute(col);
|
||||
|
||||
col = MonthsBetween(col, col);
|
||||
|
||||
col = NextDay(col, "Mon");
|
||||
|
||||
col = Second(col);
|
||||
|
||||
col = WeekOfYear(col);
|
||||
|
||||
col = FromUnixTime(col);
|
||||
col = FromUnixTime(col, "yyyy-MM-dd HH:mm:ss");
|
||||
|
||||
col = UnixTimestamp();
|
||||
col = UnixTimestamp(col);
|
||||
col = UnixTimestamp(col, "yyyy-MM-dd HH:mm:ss");
|
||||
|
||||
col = ToTimestamp(col);
|
||||
col = ToTimestamp(col, "yyyy-MM-dd HH:mm:ss");
|
||||
|
||||
col = ToDate(col);
|
||||
col = ToDate(col, "yyyy-MM-dd HH:mm:ss");
|
||||
|
||||
col = Trunc(col, "yyyy");
|
||||
|
||||
col = DateTrunc("mon", col);
|
||||
|
||||
col = FromUtcTimestamp(col, "GMT+1");
|
||||
|
||||
col = ToUtcTimestamp(col, "GMT+1");
|
||||
|
||||
col = Window(col, "1 minute", "10 seconds");
|
||||
col = Window(col, "1 minute", "10 seconds", "5 seconds");
|
||||
col = Window(col, "1 minute");
|
||||
|
||||
//////////////////////////////
|
||||
// Collection Functions
|
||||
//////////////////////////////
|
||||
col = ArrayContains(col, 12345);
|
||||
col = ArrayContains(col, "str");
|
||||
|
||||
col = Concat();
|
||||
col = Concat(col);
|
||||
col = Concat(col, col);
|
||||
|
||||
col = Explode(col);
|
||||
|
||||
col = ExplodeOuter(col);
|
||||
|
||||
col = PosExplode(col);
|
||||
|
||||
col = PosExplodeOuter(col);
|
||||
|
||||
col = GetJsonObject(col, "abc.json");
|
||||
|
||||
col = JsonTuple(col, "a");
|
||||
col = JsonTuple(col, "a", "b");
|
||||
|
||||
var options = new Dictionary<string, string>() { { "hello", "world" } };
|
||||
|
||||
col = FromJson(col, "a Int");
|
||||
col = FromJson(col, "a Int", options);
|
||||
|
||||
col = ToJson(col);
|
||||
col = ToJson(col, options);
|
||||
|
||||
col = Size(col);
|
||||
|
||||
col = SortArray(col);
|
||||
col = SortArray(col, true);
|
||||
col = SortArray(col, false);
|
||||
|
||||
col = Reverse(col);
|
||||
|
||||
col = MapKeys(col);
|
||||
|
||||
col = MapValues(col);
|
||||
|
||||
//////////////////////////////
|
||||
// Udf Functions
|
||||
//////////////////////////////
|
||||
col = Udf(() => 1)();
|
||||
|
||||
col = Udf<int, int>((a1) => 1)(col);
|
||||
|
||||
col = Udf<int, int, int>((a1, a2) => 1)(col, col);
|
||||
|
||||
col = Udf<int, int, int, int>((a1, a2, a3) => 1)(col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int>((a1, a2, a3, a4) => 1)(col, col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int, int>(
|
||||
(a1, a2, a3, a4, a5) => 1)(col, col, col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int, int, int>(
|
||||
(a1, a2, a3, a4, a5, a6) => 1)(col, col, col, col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int, int, int, int>(
|
||||
(a1, a2, a3, a4, a5, a6, a7) => 1)(col, col, col, col, col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int, int, int, int, int>(
|
||||
(a1, a2, a3, a4, a5, a6, a7, a8) => 1)(col, col, col, col, col, col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int, int, int, int, int, int>(
|
||||
(a1, a2, a3, a4, a5, a6, a7, a8, a9) => 1)(
|
||||
col, col, col, col, col, col, col, col, col);
|
||||
|
||||
col = Udf<int, int, int, int, int, int, int, int, int, int, int>(
|
||||
(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) => 1)(
|
||||
col, col, col, col, col, col, col, col, col, col);
|
||||
|
||||
col = CallUDF("udf");
|
||||
col = CallUDF("udf", col);
|
||||
col = CallUDF("udf", col, col);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Test signatures for APIs introduced in Spark 2.4.*.
|
||||
/// </summary>
|
||||
[SkipIfSparkVersionIsLessThan(Versions.V2_4_0)]
|
||||
public void TestSignaturesV2_4_X()
|
||||
{
|
||||
Column col = Column("col");
|
||||
|
||||
col = MapFromArrays(col, col);
|
||||
|
||||
col = MonthsBetween(col, col, false);
|
||||
|
||||
col = FromUtcTimestamp(col, col);
|
||||
|
||||
col = ToUtcTimestamp(col, col);
|
||||
|
||||
col = ArraysOverlap(col, col);
|
||||
|
||||
col = Slice(col, 0, 4);
|
||||
|
||||
col = ArrayJoin(col, ":", "replacement");
|
||||
col = ArrayJoin(col, ":");
|
||||
|
||||
col = ArrayPosition(col, 1);
|
||||
|
||||
col = ElementAt(col, 1);
|
||||
|
||||
col = ArraySort(col);
|
||||
|
||||
col = ArrayRemove(col, "elementToRemove");
|
||||
|
||||
col = ArrayDistinct(col);
|
||||
|
||||
col = ArrayIntersect(col, col);
|
||||
|
||||
col = ArrayUnion(col, col);
|
||||
|
||||
col = ArrayExcept(col, col);
|
||||
|
||||
var options = new Dictionary<string, string>() { { "hello", "world" } };
|
||||
Column schema = SchemaOfJson("[{\"col\":0}]");
|
||||
|
||||
col = FromJson(col, schema);
|
||||
col = FromJson(col, schema, options);
|
||||
|
||||
col = SchemaOfJson("{}");
|
||||
col = SchemaOfJson(col);
|
||||
|
||||
col = ArrayMin(col);
|
||||
|
||||
col = ArrayMax(col);
|
||||
|
||||
col = Shuffle(col);
|
||||
|
||||
col = Reverse(col);
|
||||
|
||||
col = Flatten(col);
|
||||
|
||||
col = Sequence(col, col, col);
|
||||
col = Sequence(col, col);
|
||||
|
||||
col = ArrayRepeat(col, col);
|
||||
col = ArrayRepeat(col, 5);
|
||||
|
||||
col = MapFromEntries(col);
|
||||
|
||||
col = ArraysZip();
|
||||
col = ArraysZip(col);
|
||||
col = ArraysZip(col, col);
|
||||
|
||||
col = MapConcat();
|
||||
col = MapConcat(col);
|
||||
col = MapConcat(col, col);
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,30 @@
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
|
||||
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
|
||||
<IsPackable>false</IsPackable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="coverlet.msbuild" Version="2.4.0">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
|
||||
</PackageReference>
|
||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
|
||||
<PackageReference Include="xunit" Version="2.4.1" />
|
||||
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Content Include="Resources\*">
|
||||
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
|
||||
</Content>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
|
||||
<ProjectReference Include="..\Microsoft.Spark\Microsoft.Spark.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>

@@ -0,0 +1,4 @@
{"name":"Michael", "salary":3000}
{"name":"Andy", "salary":4500}
{"name":"Justin", "salary":3500}
{"name":"Berta", "salary":4000}

@@ -0,0 +1,4 @@
# Set everything to be logged to the console
log4j.rootCategory=ERROR,console
# Use NullAppender for E2E testing. There is a deadlock issue using ConsoleAppender when the JVM process is launched from the C# process.
log4j.appender.console=org.apache.log4j.varia.NullAppender

@@ -0,0 +1,3 @@
name;age;job
Jorge;30;Developer
Bob;32;Developer

@@ -0,0 +1,3 @@
{"name":"Michael"}
{"name":"Andy", "age":30}
{"name":"Justin", "age":19}

@@ -0,0 +1,3 @@
Michael, 29
Andy, 30
Justin, 19

Binary file not shown.

@@ -0,0 +1,165 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Runtime.InteropServices;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest
|
||||
{
|
||||
/// <summary>
|
||||
/// SparkFixture acts as a global fixture that starts a Spark application in debug
/// mode through spark-submit. It also provides a default SparkSession
/// object that any test can use.
|
||||
/// </summary>
|
||||
public class SparkFixture : IDisposable
|
||||
{
|
||||
private Process _process = new Process();
|
||||
|
||||
internal SparkSession Spark { get; }
|
||||
|
||||
public SparkFixture()
|
||||
{
|
||||
string workerPathKey = Services.ConfigurationService.WorkerPathSettingKey;
|
||||
#if NET461
|
||||
// Set the path for the worker executable to the location of the current
|
||||
// assembly since xunit framework copies the Microsoft.Spark.dll to an
|
||||
// isolated location for testing; the default mechanism of getting the directory
|
||||
// of the worker executable is the location of the Microsoft.Spark.dll.
|
||||
Environment.SetEnvironmentVariable(
|
||||
workerPathKey,
|
||||
AppDomain.CurrentDomain.BaseDirectory);
|
||||
#elif NETCOREAPP2_1
|
||||
// For .NET Core, the user must have published the worker as a standalone
|
||||
// executable and set DotnetWorkerPath to the published directory.
|
||||
if (string.IsNullOrEmpty(Environment.GetEnvironmentVariable(workerPathKey)))
|
||||
{
|
||||
throw new Exception(
|
||||
$"Environment variable '{workerPathKey}' must be set for .NET Core.");
|
||||
}
|
||||
#else
|
||||
// Fail for target frameworks that are not supported.
|
||||
throw new NotSupportedException("Unsupported target framework.");
|
||||
#endif
|
||||
BuildSparkCmd(out var filename, out var args);
|
||||
|
||||
// Configure the process using the StartInfo properties.
|
||||
_process.StartInfo.FileName = filename;
|
||||
_process.StartInfo.Arguments = args;
|
||||
// UseShellExecute defaults to true in .NET Framework,
|
||||
// but defaults to false in .NET Core. To support both, set it
|
||||
// to false which is required for stream redirection.
|
||||
_process.StartInfo.UseShellExecute = false;
|
||||
_process.StartInfo.RedirectStandardInput = true;
|
||||
_process.StartInfo.RedirectStandardOutput = true;
|
||||
_process.StartInfo.RedirectStandardError = true;
|
||||
|
||||
bool isSparkReady = false;
|
||||
_process.OutputDataReceived += (sender, arguments) =>
|
||||
{
|
||||
// Scala-side driver for .NET emits the following message after it is
|
||||
// launched and ready to accept connections.
|
||||
if (!isSparkReady &&
|
||||
arguments.Data.Contains("Backend running debug mode"))
|
||||
{
|
||||
isSparkReady = true;
|
||||
}
|
||||
};
|
||||
|
||||
_process.Start();
|
||||
_process.BeginOutputReadLine();
|
||||
|
||||
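// Poll in 500 ms intervals until the backend reports readiness or the process exits.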
bool processExited = false;
|
||||
while (!isSparkReady && !processExited)
|
||||
{
|
||||
processExited = _process.WaitForExit(500);
|
||||
}
|
||||
|
||||
if (processExited)
|
||||
{
|
||||
_process.Dispose();
|
||||
|
||||
// The process should not have exited at this point.
|
||||
throw new Exception(
|
||||
$"Process exited prematurely with '{filename} {args}'.");
|
||||
}
|
||||
|
||||
Spark = SparkSession
|
||||
.Builder()
|
||||
.AppName("Microsoft.Spark.E2ETest")
|
||||
.GetOrCreate();
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
Spark.Dispose();
|
||||
|
||||
// The DotnetRunner backend will exit upon receiving a newline from
|
||||
// the standard input stream.
|
||||
_process.StandardInput.WriteLine("done");
|
||||
_process.StandardInput.Flush();
|
||||
_process.WaitForExit();
|
||||
}
|
||||
|
||||
private void BuildSparkCmd(out string filename, out string args)
|
||||
{
|
||||
string sparkHome = SparkSettings.SparkHome;
|
||||
|
||||
// Build the executable name.
|
||||
char sep = Path.DirectorySeparatorChar;
|
||||
filename = $"{sparkHome}{sep}bin{sep}spark-submit";
|
||||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
|
||||
{
|
||||
filename += ".cmd";
|
||||
}
|
||||
|
||||
if (!File.Exists(filename))
|
||||
{
|
||||
throw new FileNotFoundException($"{filename} does not exist.");
|
||||
}
|
||||
|
||||
// Build the arguments for the spark-submit.
|
||||
string classArg = "--class org.apache.spark.deploy.DotnetRunner";
|
||||
string curDir = AppDomain.CurrentDomain.BaseDirectory;
|
||||
string jarPrefix = GetJarPrefix(sparkHome);
|
||||
string scalaDir = $"{curDir}{sep}..{sep}..{sep}..{sep}..{sep}..{sep}scala";
|
||||
string jarDir = $"{scalaDir}{sep}{jarPrefix}{sep}target";
|
||||
string jar = $"{jarDir}{sep}{jarPrefix}-0.1.0.jar";
|
||||
|
||||
if (!File.Exists(jar))
|
||||
{
|
||||
throw new FileNotFoundException($"{jar} does not exist.");
|
||||
}
|
||||
|
||||
// If log4j.properties exists in the SPARK_HOME/conf directory, Spark 2.3.*
// through 2.4.0 hangs in the E2E tests. The reverse is true for Spark 2.4.1: if
// log4j.properties does not exist, the tests hang.
// Note that the hang happens in the JVM when it tries to append a console logger (log4j).
// The solution is to use a custom log configuration that appends a NullAppender, which
|
||||
// works across all Spark versions.
|
||||
string resourceUri = new Uri(TestEnvironment.ResourceDirectory).AbsoluteUri;
|
||||
string logOption = $"--conf spark.driver.extraJavaOptions=-Dlog4j.configuration=" +
|
||||
$"{resourceUri}/log4j.properties";
|
||||
|
||||
args = $"{logOption} {classArg} --master local {jar} debug";
|
||||
}
|
||||
|
||||
private string GetJarPrefix(string sparkHome)
|
||||
{
|
||||
Version sparkVersion = SparkSettings.Version;
|
||||
return $"microsoft-spark-{sparkVersion.Major}.{sparkVersion.Minor}.x";
|
||||
}
|
||||
}
|
||||
|
||||
[CollectionDefinition("Spark E2E Tests")]
|
||||
public class SparkCollection : ICollectionFixture<SparkFixture>
|
||||
{
|
||||
// This class has no code, and is never created. Its purpose is simply
|
||||
// to be the place to apply [CollectionDefinition] and all the
|
||||
// ICollectionFixture<> interfaces.
|
||||
}
|
||||
}

@@ -0,0 +1,41 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Xunit.Sdk;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest
|
||||
{
|
||||
internal static class SparkSettings
|
||||
{
|
||||
internal static Version Version { get; private set; }
|
||||
internal static string SparkHome { get; private set; }
|
||||
|
||||
static SparkSettings()
|
||||
{
|
||||
InitSparkHome();
|
||||
InitVersion();
|
||||
}
|
||||
|
||||
private static void InitSparkHome()
|
||||
{
|
||||
SparkHome = Environment.GetEnvironmentVariable("SPARK_HOME");
|
||||
if (SparkHome == null)
|
||||
{
|
||||
throw new NullException("SPARK_HOME environment variable is not set.");
|
||||
}
|
||||
}
|
||||
|
||||
private static void InitVersion()
|
||||
{
|
||||
// First line of the RELEASE file under SPARK_HOME will be something similar to:
|
||||
// Spark 2.3.2 built for Hadoop 2.7.3
|
||||
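// For that example line, Split(' ')[1] yields "2.3.2", which Version can parse directly.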
string firstLine =
|
||||
File.ReadLines($"{SparkHome}{Path.DirectorySeparatorChar}RELEASE").First();
|
||||
Version = new Version(firstLine.Split(' ')[1]);
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,33 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest
|
||||
{
|
||||
/// <summary>
|
||||
/// TestEnvironment provides functionality related to the E2E test environment.
|
||||
/// </summary>
|
||||
internal static class TestEnvironment
|
||||
{
|
||||
private static string s_resourceDirectory;
|
||||
internal static string ResourceDirectory
|
||||
{
|
||||
get
|
||||
{
|
||||
if (s_resourceDirectory is null)
|
||||
{
|
||||
s_resourceDirectory =
|
||||
AppDomain.CurrentDomain.BaseDirectory +
|
||||
Path.DirectorySeparatorChar +
|
||||
"Resources" +
|
||||
Path.DirectorySeparatorChar;
|
||||
}
|
||||
|
||||
return s_resourceDirectory;
|
||||
}
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,20 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.E2ETest.Utils
|
||||
{
|
||||
public sealed class SkipIfSparkVersionIsLessThan : FactAttribute
|
||||
{
|
||||
public SkipIfSparkVersionIsLessThan(string version)
|
||||
{
|
||||
if (SparkSettings.Version < new Version(version))
|
||||
{
|
||||
Skip = $"Ignore on Spark version ({SparkSettings.Version}) < {version}";
|
||||
}
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,117 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.UnitTest.TestUtils;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.UnitTest
|
||||
{
|
||||
public class CommandSerDeTests
|
||||
{
|
||||
[Fact]
|
||||
public void TestCommandSerDeForSqlPickling()
|
||||
{
|
||||
var udfWrapper = new Sql.PicklingUdfWrapper<string, string>((str) => $"hello {str}");
|
||||
var workerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute);
|
||||
|
||||
var serializedCommand = Utils.CommandSerDe.Serialize(
|
||||
workerFunction.Func,
|
||||
Utils.CommandSerDe.SerializedMode.Row,
|
||||
Utils.CommandSerDe.SerializedMode.Row);
|
||||
|
||||
using (var ms = new MemoryStream(serializedCommand))
|
||||
{
|
||||
var deserializedWorkerFunction = new Sql.PicklingWorkerFunction(
|
||||
Utils.CommandSerDe.Deserialize<Sql.PicklingWorkerFunction.ExecuteDelegate>(
|
||||
ms,
|
||||
out Utils.CommandSerDe.SerializedMode serializerMode,
|
||||
out Utils.CommandSerDe.SerializedMode deserializerMode,
|
||||
out var runMode));
|
||||
|
||||
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
|
||||
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
|
||||
Assert.Equal("N", runMode);
|
||||
|
||||
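// Invoke the round-tripped UDF; the arguments appear to be a split id, the input values, and the argument offsets.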
var result = deserializedWorkerFunction.Func(0, new[] { "spark" }, new[] { 0 });
|
||||
Assert.Equal("hello spark", result);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestCommandSerDeForSqlArrow()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<string, string>((str) => $"hello {str}");
|
||||
var workerFunction = new ArrowWorkerFunction(udfWrapper.Execute);
|
||||
|
||||
var serializedCommand = Utils.CommandSerDe.Serialize(
|
||||
workerFunction.Func,
|
||||
Utils.CommandSerDe.SerializedMode.Row,
|
||||
Utils.CommandSerDe.SerializedMode.Row);
|
||||
|
||||
using (var ms = new MemoryStream(serializedCommand))
|
||||
{
|
||||
var deserializedWorkerFunction = new ArrowWorkerFunction(
|
||||
Utils.CommandSerDe.Deserialize<ArrowWorkerFunction.ExecuteDelegate>(
|
||||
ms,
|
||||
out Utils.CommandSerDe.SerializedMode serializerMode,
|
||||
out Utils.CommandSerDe.SerializedMode deserializerMode,
|
||||
out var runMode));
|
||||
|
||||
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, serializerMode);
|
||||
Assert.Equal(Utils.CommandSerDe.SerializedMode.Row, deserializerMode);
|
||||
Assert.Equal("N", runMode);
|
||||
|
||||
Apache.Arrow.IArrowArray input = ArrowArrayHelpers.ToArrowArray(new[] { "spark" });
|
||||
Apache.Arrow.IArrowArray result =
|
||||
deserializedWorkerFunction.Func(0, new[] { input }, new[] { 0 });
|
||||
ArrowTestUtils.AssertEquals("hello spark", result);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestCommandSerDeForRDD()
|
||||
{
|
||||
// Construct the UDF tree such that func1, func2, and func3
|
||||
// are executed in that order.
|
||||
var func1 = new RDD.WorkerFunction(
|
||||
new RDD<int>.MapUdfWrapper<int, int>((a) => a + 3).Execute);
|
||||
|
||||
var func2 = new RDD.WorkerFunction(
|
||||
new RDD<int>.MapUdfWrapper<int, int>((a) => a * 2).Execute);
|
||||
|
||||
var func3 = new RDD.WorkerFunction(
|
||||
new RDD<int>.MapUdfWrapper<int, int>((a) => a + 5).Execute);
|
||||
|
||||
var chainedFunc1 = RDD.WorkerFunction.Chain(func1, func2);
|
||||
var chainedFunc2 = RDD.WorkerFunction.Chain(chainedFunc1, func3);
|
||||
|
||||
var serializedCommand = Utils.CommandSerDe.Serialize(
|
||||
chainedFunc2.Func,
|
||||
Utils.CommandSerDe.SerializedMode.Byte,
|
||||
Utils.CommandSerDe.SerializedMode.Byte);
|
||||
|
||||
using (var ms = new MemoryStream(serializedCommand))
|
||||
{
|
||||
var deserializedWorkerFunction = new RDD.WorkerFunction(
|
||||
Utils.CommandSerDe.Deserialize<RDD.WorkerFunction.ExecuteDelegate>(
|
||||
ms,
|
||||
out Utils.CommandSerDe.SerializedMode serializerMode,
|
||||
out Utils.CommandSerDe.SerializedMode deserializerMode,
|
||||
out var runMode));
|
||||
|
||||
Assert.Equal(Utils.CommandSerDe.SerializedMode.Byte, serializerMode);
|
||||
Assert.Equal(Utils.CommandSerDe.SerializedMode.Byte, deserializerMode);
|
||||
Assert.Equal("N", runMode);
|
||||
|
||||
IEnumerable<object> result =
|
||||
deserializedWorkerFunction.Func(0, new object[] { 1, 2, 3 });
|
||||
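// For inputs { 1, 2, 3 }, ((x + 3) * 2) + 5 evaluates to 13, 15, and 17.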
Assert.Equal(new[] { 13, 15, 17 }, result.Cast<int>());
|
||||
}
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,31 @@
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
|
||||
<TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
|
||||
<RootNamespace>Microsoft.Spark.UnitTest</RootNamespace>
|
||||
<IsPackable>false</IsPackable>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="coverlet.msbuild" Version="2.4.0">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
|
||||
</PackageReference>
|
||||
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
|
||||
<PackageReference Include="Moq" Version="4.10.0" />
|
||||
<PackageReference Include="System.Memory" Version="4.5.2" />
|
||||
<PackageReference Include="xunit" Version="2.4.1" />
|
||||
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup Condition=" '$(TargetFramework)' != 'netcoreapp2.1' ">
|
||||
<Reference Include="System" />
|
||||
<Reference Include="Microsoft.CSharp" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\Microsoft.Spark\Microsoft.Spark.csproj" />
|
||||
</ItemGroup>
|
||||
|
||||
</Project>

@@ -0,0 +1,70 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.UnitTest
|
||||
{
|
||||
public class SerDeTests
|
||||
{
|
||||
[Fact]
|
||||
public void TestReadAndWrite()
|
||||
{
|
||||
using (var ms = new MemoryStream())
|
||||
{
|
||||
// Test bool.
|
||||
SerDe.Write(ms, true);
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.True(SerDe.ReadBool(ms));
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
SerDe.Write(ms, false);
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.False(SerDe.ReadBool(ms));
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
// Test int.
|
||||
SerDe.Write(ms, 12345);
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.Equal(12345, SerDe.ReadInt32(ms));
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
// Test long.
|
||||
SerDe.Write(ms, 123456789000);
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.Equal(123456789000, SerDe.ReadInt64(ms));
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
// Test double.
|
||||
SerDe.Write(ms, Math.PI);
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.Equal(Math.PI, SerDe.ReadDouble(ms));
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
// Test string.
|
||||
SerDe.Write(ms, "hello world!");
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.Equal("hello world!", SerDe.ReadString(ms));
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestReadBytes()
|
||||
{
|
||||
// Test the case where invalid length is given.
|
||||
Assert.Throws<ArgumentOutOfRangeException>(
|
||||
() => SerDe.ReadBytes(new MemoryStream(), -1));
|
||||
|
||||
// Test reading null length.
|
||||
var ms = new MemoryStream();
|
||||
SerDe.Write(ms, (int)SpecialLengths.NULL);
|
||||
ms.Seek(0, SeekOrigin.Begin);
|
||||
Assert.Null(SerDe.ReadBytes(ms));
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,488 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Expressions;
|
||||
using Moq;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.UnitTest
|
||||
{
|
||||
public class ColumnTestsFixture : IDisposable
|
||||
{
|
||||
internal Mock<IJvmBridge> MockJvm { get; }
|
||||
|
||||
public ColumnTestsFixture()
|
||||
{
|
||||
MockJvm = new Mock<IJvmBridge>();
|
||||
|
||||
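// Every mocked JVM call below returns a JvmObjectReference with Id "result",
// so tests can verify the Id of the Column produced by each operation.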
MockJvm
|
||||
.Setup(m => m.CallStaticJavaMethod(
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<object>()))
|
||||
.Returns(
|
||||
new JvmObjectReference("result", MockJvm.Object));
|
||||
MockJvm
|
||||
.Setup(m => m.CallStaticJavaMethod(
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<object>(),
|
||||
It.IsAny<object>()))
|
||||
.Returns(
|
||||
new JvmObjectReference("result", MockJvm.Object));
|
||||
MockJvm
|
||||
.Setup(m => m.CallStaticJavaMethod(
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<object[]>()))
|
||||
.Returns(
|
||||
new JvmObjectReference("result", MockJvm.Object));
|
||||
|
||||
MockJvm
|
||||
.Setup(m => m.CallNonStaticJavaMethod(
|
||||
It.IsAny<JvmObjectReference>(),
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<object>()))
|
||||
.Returns(
|
||||
new JvmObjectReference("result", MockJvm.Object));
|
||||
MockJvm
|
||||
.Setup(m => m.CallNonStaticJavaMethod(
|
||||
It.IsAny<JvmObjectReference>(),
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<object>(),
|
||||
It.IsAny<object>()))
|
||||
.Returns(
|
||||
new JvmObjectReference("result", MockJvm.Object));
|
||||
MockJvm
|
||||
.Setup(m => m.CallNonStaticJavaMethod(
|
||||
It.IsAny<JvmObjectReference>(),
|
||||
It.IsAny<string>(),
|
||||
It.IsAny<object[]>()))
|
||||
.Returns(
|
||||
new JvmObjectReference("result", MockJvm.Object));
|
||||
}
|
||||
|
||||
public void Dispose()
|
||||
{
|
||||
}
|
||||
}
|
||||
|
||||
public class ColumnTests : IClassFixture<ColumnTestsFixture>
|
||||
{
|
||||
private readonly Mock<IJvmBridge> _mockJvm;
|
||||
|
||||
public ColumnTests(ColumnTestsFixture fixture)
|
||||
{
|
||||
_mockJvm = fixture.MockJvm;
|
||||
}
|
||||
|
||||
private static JvmObjectId GetId(IJvmObjectReferenceProvider provider) => provider.Reference.Id;
|
||||
|
||||
[Fact]
|
||||
public void TestColumnNegateOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = -column1;
|
||||
|
||||
_mockJvm.Verify(m => m.CallStaticJavaMethod(
|
||||
"org.apache.spark.sql.functions",
|
||||
"negate",
|
||||
column1), Times.Once);
|
||||
|
||||
Assert.Equal("result", GetId(column2));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestNotOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = !column1;
|
||||
|
||||
_mockJvm.Verify(m => m.CallStaticJavaMethod(
|
||||
"org.apache.spark.sql.functions",
|
||||
"not",
|
||||
column1), Times.Once);
|
||||
|
||||
Assert.Equal("result", GetId(column2));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestEqualOperator()
|
||||
{
|
||||
{
|
||||
// Column as a right operand.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 == column2;
|
||||
VerifyNonStaticCall(column1, "equalTo", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
{
|
||||
// String as a right operand.
|
||||
// Note that any object can be used in place of string.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column result = column1 == "abc";
|
||||
VerifyNonStaticCall(column1, "equalTo", "abc");
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestNotEqualOperator()
|
||||
{
|
||||
{
|
||||
// Column as a right operand.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 != column2;
|
||||
VerifyNonStaticCall(column1, "notEqual", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
{
|
||||
// String as a right operand.
|
||||
// Note that any object can be used in place of string.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column result = column1 != "abc";
|
||||
VerifyNonStaticCall(column1, "notEqual", "abc");
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestGreaterThanOperator()
|
||||
{
|
||||
{
|
||||
// Column as a right operand.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 > column2;
|
||||
VerifyNonStaticCall(column1, "gt", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
{
|
||||
// String as a right operand.
|
||||
// Note that any object can be used in place of string.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column result = column1 > "abc";
|
||||
VerifyNonStaticCall(column1, "gt", "abc");
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestLessThanOperator()
|
||||
{
|
||||
{
|
||||
// Column as a right operand.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 < column2;
|
||||
VerifyNonStaticCall(column1, "lt", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
{
|
||||
// String as a right operand.
|
||||
// Note that any object can be used in place of string.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column result = column1 < "abc";
|
||||
VerifyNonStaticCall(column1, "lt", "abc");
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestLessThanEqualToOperator()
|
||||
{
|
||||
{
|
||||
// Column as a right operand.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 <= column2;
|
||||
VerifyNonStaticCall(column1, "leq", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
{
|
||||
// String as a right operand.
|
||||
// Note that any object can be used in place of string.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column result = column1 <= "abc";
|
||||
VerifyNonStaticCall(column1, "leq", "abc");
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestGreaterThanEqualToOperator()
|
||||
{
|
||||
{
|
||||
// Column as a right operand.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 >= column2;
|
||||
VerifyNonStaticCall(column1, "geq", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
{
|
||||
// String as a right operand.
|
||||
// Note that any object can be used in place of string.
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column result = column1 >= "abc";
|
||||
VerifyNonStaticCall(column1, "geq", "abc");
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestAndOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 & column2;
|
||||
VerifyNonStaticCall(column1, "and", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestOrOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 | column2;
|
||||
VerifyNonStaticCall(column1, "or", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPlusOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 + column2;
|
||||
VerifyNonStaticCall(column1, "plus", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestMinusOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 - column2;
|
||||
VerifyNonStaticCall(column1, "minus", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestMultiplyOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 * column2;
|
||||
VerifyNonStaticCall(column1, "multiply", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestDivideOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 / column2;
|
||||
VerifyNonStaticCall(column1, "divide", column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestModOperator()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
Column result = column1 % column2;
|
||||
VerifyNonStaticCall(column1, "mod", column2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestWhenCondition()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
var value = 0;
|
||||
column1.When(column2, value);
|
||||
VerifyNonStaticCall(column1, "when", column2, value);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestBetweenCondition()
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
var val1 = 1;
|
||||
var val2 = 2;
|
||||
column1.Between(val1, val2);
|
||||
VerifyNonStaticCall(column1, "between", val1, val2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestSubStr()
|
||||
{
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
var pos = 1;
|
||||
var len = 2;
|
||||
column1.SubStr(pos, len);
|
||||
VerifyNonStaticCall(column1, "substr", pos, len);
|
||||
}
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column pos = CreateColumn("col2");
|
||||
Column len = CreateColumn("col3");
|
||||
column1.SubStr(pos, len);
|
||||
VerifyNonStaticCall(column1, "substr", pos, len);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestOver()
|
||||
{
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
var windowSpec =
|
||||
new WindowSpec(new JvmObjectReference("windowSpec", _mockJvm.Object));
|
||||
column1.Over();
|
||||
VerifyNonStaticCall(column1, "over");
|
||||
}
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
var windowSpec =
|
||||
new WindowSpec(new JvmObjectReference("windowSpec", _mockJvm.Object));
|
||||
column1.Over(windowSpec);
|
||||
VerifyNonStaticCall(column1, "over", windowSpec);
|
||||
}
|
||||
}
|
||||
|
||||
private void VerifyNonStaticCall(
|
||||
IJvmObjectReferenceProvider obj,
|
||||
string methodName,
|
||||
object arg0)
|
||||
{
|
||||
_mockJvm.Verify(m => m.CallNonStaticJavaMethod(
|
||||
obj.Reference,
|
||||
methodName,
|
||||
arg0));
|
||||
}
|
||||
|
||||
private void VerifyNonStaticCall(
|
||||
IJvmObjectReferenceProvider obj,
|
||||
string methodName,
|
||||
object arg0,
|
||||
object arg1)
|
||||
{
|
||||
_mockJvm.Verify(m => m.CallNonStaticJavaMethod(
|
||||
obj.Reference,
|
||||
methodName,
|
||||
arg0, arg1));
|
||||
}
|
||||
|
||||
private void VerifyNonStaticCall(
|
||||
IJvmObjectReferenceProvider obj,
|
||||
string methodName,
|
||||
params object[] args)
|
||||
{
|
||||
_mockJvm.Verify(m => m.CallNonStaticJavaMethod(
|
||||
obj.Reference,
|
||||
methodName,
|
||||
args));
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData("EqNullSafe", "eqNullSafe")]
|
||||
[InlineData("Or", "or")]
|
||||
[InlineData("And", "and")]
|
||||
[InlineData("Contains", "contains")]
|
||||
[InlineData("StartsWith", "startsWith")]
|
||||
[InlineData("EndsWith", "endsWith")]
|
||||
[InlineData("EqualTo", "equalTo")]
|
||||
[InlineData("NotEqual", "notEqual")]
|
||||
[InlineData("Gt", "gt")]
|
||||
[InlineData("Lt", "lt")]
|
||||
[InlineData("Leq", "leq")]
|
||||
[InlineData("Geq", "geq")]
|
||||
[InlineData("Otherwise", "otherwise")]
|
||||
[InlineData("Plus", "plus")]
|
||||
[InlineData("Minus", "minus")]
|
||||
[InlineData("Multiply", "multiply")]
|
||||
[InlineData("Divide", "divide")]
|
||||
[InlineData("Mod", "mod")]
|
||||
[InlineData("GetItem", "getItem")]
|
||||
[InlineData("BitwiseOR", "bitwiseOR")]
|
||||
[InlineData("BitwiseAND", "bitwiseAND")]
|
||||
[InlineData("BitwiseXOR", "bitwiseXOR")]
|
||||
public void TestNamedOperators(string funcName, string opName)
|
||||
{
|
||||
Column column1 = CreateColumn("col1");
|
||||
Column column2 = CreateColumn("col2");
|
||||
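// Look up the Column method by name that takes a single Column argument, then invoke it via reflection.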
System.Reflection.MethodInfo func = column1.GetType().GetMethod(
|
||||
funcName,
|
||||
new Type[] { typeof(Column) });
|
||||
var result = func.Invoke(column1, new[] { column2 }) as Column;
|
||||
VerifyNonStaticCall(column1, opName, column2);
|
||||
Assert.Equal("result", GetId(result));
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData("Contains", "contains")]
|
||||
[InlineData("StartsWith", "startsWith")]
|
||||
[InlineData("EndsWith", "endsWith")]
|
||||
[InlineData("Alias", "alias")]
|
||||
[InlineData("As", "alias")]
|
||||
[InlineData("Name", "name")]
|
||||
[InlineData("Cast", "cast")]
|
||||
[InlineData("Otherwise", "otherwise")]
|
||||
[InlineData("Like", "like")]
|
||||
[InlineData("RLike", "rlike")]
|
||||
[InlineData("GetItem", "getItem")]
|
||||
[InlineData("GetField", "getField")]
|
||||
public void TestNamedOperatorsWithString(string funcName, string opName)
|
||||
{
|
||||
// These operators take string as the operand.
|
||||
Column column = CreateColumn("col");
|
||||
var literal = "hello";
|
||||
System.Reflection.MethodInfo func = column.GetType().GetMethod(
|
||||
funcName,
|
||||
new Type[] { typeof(string) });
|
||||
var result = func.Invoke(column, new[] { literal }) as Column;
|
||||
Assert.Equal("result", GetId(result));
|
||||
VerifyNonStaticCall(column, opName, literal);
|
||||
}
|
||||
|
||||
[Theory]
|
||||
[InlineData("Asc", "asc")]
|
||||
[InlineData("AscNullsFirst", "asc_nulls_first")]
|
||||
[InlineData("AscNullsLast", "asc_nulls_last")]
|
||||
[InlineData("Desc", "desc")]
|
||||
[InlineData("DescNullsFirst", "desc_nulls_first")]
|
||||
[InlineData("DescNullsLast", "desc_nulls_last")]
|
||||
[InlineData("IsNaN", "isNaN")]
|
||||
[InlineData("IsNull", "isNull")]
|
||||
[InlineData("IsNotNull", "isNotNull")]
|
||||
public void TestUnaryOperators(string funcName, string opName)
|
||||
{
|
||||
Column column = CreateColumn("col");
|
||||
|
||||
// Use an empty array of Type objects to get a method that takes no parameters.
|
||||
System.Reflection.MethodInfo func =
|
||||
column.GetType().GetMethod(funcName, Type.EmptyTypes);
|
||||
var result = func.Invoke(column, null) as Column;
|
||||
Assert.Equal("result", GetId(result));
|
||||
VerifyNonStaticCall(column, opName);
|
||||
}
|
||||
|
||||
private Column CreateColumn(string id)
|
||||
{
|
||||
return new Column(new JvmObjectReference(id, _mockJvm.Object));
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,141 @@
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Network;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Sql.Types;
|
||||
using Microsoft.Spark.UnitTest.TestUtils;
|
||||
using Microsoft.Spark.Utils;
|
||||
using Moq;
|
||||
using Razorvine.Pickle;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.UnitTest
|
||||
{
|
||||
public class RowTests
|
||||
{
|
||||
private readonly string _testJsonSchema =
|
||||
@"{
|
||||
""type"":""struct"",
|
||||
""fields"":[
|
||||
{
|
||||
""name"":""age"",
|
||||
""type"":""integer"",
|
||||
""nullable"":true,
|
||||
""metadata"":{}
|
||||
},
|
||||
{
|
||||
""name"":""name"",
|
||||
""type"":""string"",
|
||||
""nullable"":false,
|
||||
""metadata"":{}
|
||||
}
|
||||
]}";
|
||||
|
||||
[Fact]
|
||||
public void RowTest()
|
||||
{
|
||||
var structFields = new List<StructField>()
|
||||
{
|
||||
new StructField("col1", new IntegerType()),
|
||||
new StructField("col2", new StringType()),
|
||||
};
|
||||
|
||||
var schema = new StructType(structFields);
|
||||
|
||||
var row = new Row(new object[] { 1, "abc" }, schema);
|
||||
|
||||
// Validate Size().
|
||||
Assert.Equal(2, row.Size());
|
||||
|
||||
// Validate [] operator.
|
||||
Assert.Equal(1, row[0]);
|
||||
Assert.Equal("abc", row[1]);
|
||||
|
||||
// Validate Get*(int).
|
||||
Assert.Equal(1, row.Get(0));
|
||||
Assert.Equal("abc", row.Get(1));
|
||||
Assert.Equal(1, row.GetAs<int>(0));
|
||||
Assert.ThrowsAny<Exception>(() => row.GetAs<string>(0));
|
||||
Assert.Equal("abc", row.GetAs<string>(1));
|
||||
Assert.ThrowsAny<Exception>(() => row.GetAs<int>(1));
|
||||
|
||||
// Validate Get*(string).
|
||||
Assert.Equal(1, row.Get("col1"));
|
||||
Assert.Equal("abc", row.Get("col2"));
|
||||
Assert.Equal(1, row.GetAs<int>("col1"));
|
||||
Assert.ThrowsAny<Exception>(() => row.GetAs<string>("col1"));
|
||||
Assert.Equal("abc", row.GetAs<string>("col2"));
|
||||
Assert.ThrowsAny<Exception>(() => row.GetAs<int>("col2"));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RowConstructorTest()
|
||||
{
|
||||
Pickler pickler = CreatePickler();
|
||||
|
||||
var schema = (StructType)DataType.ParseDataType(_testJsonSchema);
|
||||
var row1 = new Row(new object[] { 10, "name1" }, schema);
|
||||
var row2 = new Row(new object[] { 15, "name2" }, schema);
|
||||
var pickledBytes = pickler.dumps(new[] { row1, row2 });
|
||||
|
||||
// Note that the following will invoke RowConstructor.construct().
|
||||
var unpickledData = PythonSerDe.GetUnpickledObjects(new MemoryStream(pickledBytes));
|
||||
|
||||
Assert.Equal(2, unpickledData.Length);
|
||||
Assert.Equal(row1, (unpickledData[0] as RowConstructor).GetRow());
|
||||
Assert.Equal(row2, (unpickledData[1] as RowConstructor).GetRow());
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void RowCollectorTest()
|
||||
{
|
||||
var stream = new MemoryStream();
|
||||
Pickler pickler = CreatePickler();
|
||||
|
||||
var schema = (StructType)DataType.ParseDataType(_testJsonSchema);
|
||||
|
||||
// Pickle two rows in one batch.
|
||||
var row1 = new Row(new object[] { 10, "name1" }, schema);
|
||||
var row2 = new Row(new object[] { 15, "name2" }, schema);
|
||||
var batch1 = pickler.dumps(new[] { row1, row2 });
|
||||
SerDe.Write(stream, batch1.Length);
|
||||
SerDe.Write(stream, batch1);
|
||||
|
||||
// Pickle one row in one batch.
|
||||
var row3 = new Row(new object[] { 20, "name3" }, schema);
|
||||
var batch2 = pickler.dumps(new[] { row3 });
|
||||
SerDe.Write(stream, batch2.Length);
|
||||
SerDe.Write(stream, batch2);
|
||||
|
||||
// Rewind the memory stream so that the row collector can read from the beginning.
|
||||
stream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
// Set up the mock to return the memory stream to which the pickled data was written.
|
||||
var socket = new Mock<ISocketWrapper>();
|
||||
socket.Setup(m => m.InputStream).Returns(stream);
|
||||
socket.Setup(m => m.OutputStream).Returns(stream);
|
||||
|
||||
var rowCollector = new RowCollector();
|
||||
Row[] rows = rowCollector.Collect(socket.Object).ToArray();
|
||||
|
||||
Assert.Equal(3, rows.Length);
|
||||
Assert.Equal(row1, rows[0]);
|
||||
Assert.Equal(row2, rows[1]);
|
||||
Assert.Equal(row3, rows[2]);
|
||||
}
|
||||
|
||||
private Pickler CreatePickler()
|
||||
{
|
||||
new StructTypePickler().Register();
|
||||
new RowPickler().Register();
|
||||
return new Pickler();
|
||||
}
|
||||
}
|
||||
}

@@ -0,0 +1,91 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Microsoft.Spark.Sql.Types;
using Newtonsoft.Json.Linq;
using Xunit;

namespace Microsoft.Spark.UnitTest
{
    public class TypesTests
    {
        [Theory]
        [InlineData("null")]
        [InlineData("string")]
        [InlineData("binary")]
        [InlineData("boolean")]
        [InlineData("date")]
        [InlineData("timestamp")]
        [InlineData("double")]
        [InlineData("float")]
        [InlineData("byte")]
        [InlineData("integer")]
        [InlineData("long")]
        [InlineData("short")]
        public void TestSimpleTypes(string typeName)
        {
            var atomicType = DataType.ParseDataType($@"""{typeName}""");
            Assert.Equal(typeName, atomicType.TypeName);
            Assert.Equal(typeName, atomicType.SimpleString);
        }

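        // DataType.ParseDataType accepts Spark's JSON schema representation: atomic
        // types are plain JSON strings (e.g. "integer"), while complex types such as
        // array and struct are JSON objects, as the tests below exercise.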
        [Fact]
        public void TestArrayType()
        {
            var schemaJson =
                @"{
                    ""type"":""array"",
                    ""elementType"":""integer"",
                    ""containsNull"":true
                }";
            var arrayType = (ArrayType)DataType.ParseDataType(schemaJson);
            Assert.Equal("array", arrayType.TypeName);
            Assert.Equal("array<integer>", arrayType.SimpleString);
            Assert.Equal("integer", arrayType.ElementType.TypeName);
            Assert.True(arrayType.ContainsNull);
        }

        [Fact]
        public void TestStructTypeAndStructFieldTypes()
        {
            var schemaJson =
                @"{
                    ""type"":""struct"",
                    ""fields"":[
                        {
                            ""name"":""age"",
                            ""type"":""long"",
                            ""nullable"":true,
                            ""metadata"":{}
                        },
                        {
                            ""name"":""name"",
                            ""type"":""string"",
                            ""nullable"":false,
                            ""metadata"":{}
                        }
                    ]}";

            var structType = (StructType)DataType.ParseDataType(schemaJson);
            Assert.Equal("struct", structType.TypeName);
            Assert.Equal("struct<age:long,name:string>", structType.SimpleString);
            Assert.Equal(2, structType.Fields.Count);

            {
                StructField field = structType.Fields[0];
                Assert.Equal("age", field.Name);
                Assert.Equal("long", field.DataType.TypeName);
                Assert.True(field.IsNullable);
                Assert.Equal(new JObject(), field.Metadata);
            }
            {
                StructField field = structType.Fields[1];
                Assert.Equal("name", field.Name);
                Assert.Equal("string", field.DataType.TypeName);
                Assert.False(field.IsNullable);
                Assert.Equal(new JObject(), field.Metadata);
            }
        }
    }
}
@@ -0,0 +1,20 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using Apache.Arrow;
using Xunit;

namespace Microsoft.Spark.UnitTest.TestUtils
{
    public static class ArrowTestUtils
    {
        public static void AssertEquals(string expectedValue, IArrowArray arrowArray)
        {
            Assert.IsType<StringArray>(arrowArray);
            var stringArray = (StringArray)arrowArray;
            Assert.Equal(1, stringArray.Length);
            Assert.Equal(expectedValue, stringArray.GetString(0));
        }
    }
}
@@ -0,0 +1,92 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.IO;
using System.Text;
using Microsoft.Spark.Interop.Ipc;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Types;
using Razorvine.Pickle;

namespace Microsoft.Spark.UnitTest.TestUtils
{
    /// <summary>
    /// Custom pickler for StructType objects.
    /// Refer to
    /// spark/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
    /// </summary>
    internal class StructTypePickler : IObjectPickler
    {
        private readonly string _module = "pyspark.sql.types";

        public void Register()
        {
            Pickler.registerCustomPickler(GetType(), this);
            Pickler.registerCustomPickler(typeof(StructType), this);
        }

        public void pickle(object o, Stream stream, Pickler currentPickler)
        {
            if (!(o is StructType schema))
            {
                throw new InvalidOperationException("A StructType object is expected.");
            }

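            // Emit a pickle program equivalent to calling
            // pyspark.sql.types._parse_datatype_json_string(schema.Json) on the Python
            // side: GLOBAL pushes the callable, the saved JSON string is wrapped into a
            // one-element tuple by TUPLE1, and REDUCE applies the callable to that tuple.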
            SerDe.Write(stream, Opcodes.GLOBAL);
            SerDe.Write(stream, Encoding.UTF8.GetBytes(
                $"{_module}\n_parse_datatype_json_string\n"));
            currentPickler.save(schema.Json);
            SerDe.Write(stream, Opcodes.TUPLE1);
            SerDe.Write(stream, Opcodes.REDUCE);
        }
    }

    /// <summary>
    /// Custom pickler for Row objects.
    /// Refer to
    /// spark/sql/core/src/main/scala/org/apache/spark/sql/execution/python/EvaluatePython.scala
    /// </summary>
    internal class RowPickler : IObjectPickler
    {
        private readonly string _module = "pyspark.sql.types";

        public void Register()
        {
            Pickler.registerCustomPickler(GetType(), this);
            Pickler.registerCustomPickler(typeof(Row), this);
        }

        public void pickle(object o, Stream stream, Pickler currentPickler)
        {
            if (o.Equals(this))
            {
                SerDe.Write(stream, Opcodes.GLOBAL);
                SerDe.Write(stream, Encoding.UTF8.GetBytes(
                    $"{_module}\n_create_row_inbound_converter\n"));
            }
            else
            {
                if (!(o is Row row))
                {
                    throw new InvalidOperationException("A Row object is expected.");
                }

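                // Saving 'this' re-enters pickle() above and pushes
                // pyspark.sql.types._create_row_inbound_converter; TUPLE1 + REDUCE call
                // it with the pickled schema, and the final MARK/TUPLE/REDUCE sequence
                // applies the resulting converter to the row's field values.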
                currentPickler.save(this);
                currentPickler.save(row.Schema);
                SerDe.Write(stream, Opcodes.TUPLE1);
                SerDe.Write(stream, Opcodes.REDUCE);

                SerDe.Write(stream, Opcodes.MARK);
                for (int i = 0; i < row.Size(); ++i)
                {
                    currentPickler.save(row.Get(i));
                }

                SerDe.Write(stream, Opcodes.TUPLE);
                SerDe.Write(stream, Opcodes.REDUCE);
            }
        }
    }
}
@ -0,0 +1,268 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using Apache.Arrow;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.UnitTest.TestUtils;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.UnitTest
|
||||
{
|
||||
public class UdfWrapperTests
|
||||
{
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper0()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<int>(() => 10);
|
||||
Assert.Equal(10, udfWrapper.Execute(0, null, null));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper1()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<string, string>(
|
||||
(str1) => str1);
|
||||
ValidatePicklingWrapper(1, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper2()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<string, string, string>(
|
||||
(str1, str2) => str1 + str2);
|
||||
ValidatePicklingWrapper(2, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper3()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<string, string, string, string>(
|
||||
(str1, str2, str3) => str1 + str2 + str3);
|
||||
ValidatePicklingWrapper(3, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper4()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<string, string, string, string, string>(
|
||||
(str1, str2, str3, str4) => str1 + str2 + str3 + str4);
|
||||
ValidatePicklingWrapper(4, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper5()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5) => str1 + str2 + str3 + str4 + str5);
|
||||
ValidatePicklingWrapper(5, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper6()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<
|
||||
string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6);
|
||||
ValidatePicklingWrapper(6, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper7()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<
|
||||
string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7);
|
||||
ValidatePicklingWrapper(7, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper8()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<
|
||||
string, string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7, str8)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8);
|
||||
ValidatePicklingWrapper(8, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper9()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<
|
||||
string, string, string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7, str8, str9)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9);
|
||||
ValidatePicklingWrapper(9, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingUdfWrapper10()
|
||||
{
|
||||
var udfWrapper = new PicklingUdfWrapper<
|
||||
string, string, string, string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7, str8, str9, str10)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9 + str10);
|
||||
ValidatePicklingWrapper(10, udfWrapper);
|
||||
}
|
||||
|
||||
// Validates the given udfWrapper, whose internal UDF concatenates all the input strings.
|
||||
private void ValidatePicklingWrapper(int numArgs, dynamic udfWrapper)
|
||||
{
|
||||
// Create one more input data than the given numArgs to validate
|
||||
// the indexing is working correctly inside UdfWrapper.
|
||||
var input = new List<string>();
|
||||
for (int i = 0; i < numArgs + 1; ++i)
|
||||
{
|
||||
input.Add($"arg{i}");
|
||||
}
|
||||
|
||||
// First create argOffsets from 0 to numArgs.
|
||||
// For example, the numArgs was 3, the expected strings is "arg0arg1arg2"
|
||||
// where the argOffsets are created with { 0, 1, 2 }.
|
||||
Assert.Equal(
|
||||
string.Join("", input.GetRange(0, numArgs)),
|
||||
udfWrapper.Execute(0, input.ToArray(), Enumerable.Range(0, numArgs).ToArray()));
|
||||
|
||||
// Create argOffsets from 1 to numArgs + 1.
|
||||
// For example, the numArgs was 3, the expected strings is "arg1arg2arg3"
|
||||
// where the argOffsets are created with { 1, 2, 3 }.
|
||||
Assert.Equal(
|
||||
string.Join("", input.GetRange(1, numArgs)),
|
||||
udfWrapper.Execute(0, input.ToArray(), Enumerable.Range(1, numArgs).ToArray()));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper0()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<int>(() => 10);
|
||||
IArrowArray result = udfWrapper.Execute(0, null, null);
|
||||
Assert.IsType<Int32Array>(result);
|
||||
var intArray = (Int32Array)result;
|
||||
Assert.Equal(1, intArray.Length);
|
||||
Assert.Equal(10, intArray.Values[0]);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper1()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<string, string>(
|
||||
(str1) => str1);
|
||||
ValidateArrowWrapper(1, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper2()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<string, string, string>(
|
||||
(str1, str2) => str1 + str2);
|
||||
ValidateArrowWrapper(2, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper3()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<string, string, string, string>(
|
||||
(str1, str2, str3) => str1 + str2 + str3);
|
||||
ValidateArrowWrapper(3, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper4()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<string, string, string, string, string>(
|
||||
(str1, str2, str3, str4) => str1 + str2 + str3 + str4);
|
||||
ValidateArrowWrapper(4, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper5()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5) => str1 + str2 + str3 + str4 + str5);
|
||||
ValidateArrowWrapper(5, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper6()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<
|
||||
string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6);
|
||||
ValidateArrowWrapper(6, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper7()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<
|
||||
string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7);
|
||||
ValidateArrowWrapper(7, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper8()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<
|
||||
string, string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7, str8)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8);
|
||||
ValidateArrowWrapper(8, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper9()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<
|
||||
string, string, string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7, str8, str9)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9);
|
||||
ValidateArrowWrapper(9, udfWrapper);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowUdfWrapper10()
|
||||
{
|
||||
var udfWrapper = new ArrowUdfWrapper<
|
||||
string, string, string, string, string, string, string, string, string, string, string>(
|
||||
(str1, str2, str3, str4, str5, str6, str7, str8, str9, str10)
|
||||
=> str1 + str2 + str3 + str4 + str5 + str6 + str7 + str8 + str9 + str10);
|
||||
ValidateArrowWrapper(10, udfWrapper);
|
||||
}
|
||||
|
||||
// Validates the given udfWrapper, whose internal UDF concatenates all the input strings.
|
||||
private void ValidateArrowWrapper(int numArgs, dynamic udfWrapper)
|
||||
{
|
||||
// Create one more input data than the given numArgs to validate
|
||||
// the indexing is working correctly inside ArrowUdfWrapper.
|
||||
var input = new IArrowArray[numArgs + 1];
|
||||
var inputStrings = new List<string>();
|
||||
for (int i = 0; i < input.Length; ++i)
|
||||
{
|
||||
inputStrings.Add($"arg{i}");
|
||||
input[i] = ArrowArrayHelpers.ToArrowArray(new string[] { $"arg{i}" });
|
||||
}
|
||||
|
||||
// First create argOffsets from 0 to numArgs.
|
||||
// For example, the numArgs was 3, the expected strings is "arg0arg1arg2"
|
||||
// where the argOffsets are created with { 0, 1, 2 }.
|
||||
ArrowTestUtils.AssertEquals(
|
||||
string.Join("", inputStrings.GetRange(0, numArgs)),
|
||||
udfWrapper.Execute(0, input, Enumerable.Range(0, numArgs).ToArray()));
|
||||
|
||||
// Create argOffsets from 1 to numArgs + 1.
|
||||
// For example, the numArgs was 3, the expected strings is "arg1arg2arg3"
|
||||
// where the argOffsets are created with { 1, 2, 3 }.
|
||||
ArrowTestUtils.AssertEquals(
|
||||
string.Join("", inputStrings.GetRange(1, numArgs)),
|
||||
udfWrapper.Execute(0, input, Enumerable.Range(1, numArgs).ToArray()));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,164 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using Apache.Arrow;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.UnitTest.TestUtils;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.ArrowArrayHelpers;
|
||||
|
||||
namespace Microsoft.Spark.UnitTest
|
||||
{
|
||||
public class WorkerFunctionTests
|
||||
{
|
||||
[Fact]
|
||||
public void TestPicklingWorkerFunction()
|
||||
{
|
||||
var func = new PicklingWorkerFunction(
|
||||
new PicklingUdfWrapper<string, string>(
|
||||
(str) => str).Execute);
|
||||
|
||||
string[] input = { "arg1" };
|
||||
Assert.Equal(input[0], func.Func(0, input, new[] { 0 }));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestChainingPicklingWorkerFunction()
|
||||
{
|
||||
var func1 = new PicklingWorkerFunction(
|
||||
new PicklingUdfWrapper<int, string, string>(
|
||||
(number, str) => $"{str}:{number}").Execute);
|
||||
|
||||
var func2 = new PicklingWorkerFunction(
|
||||
new PicklingUdfWrapper<string, string>(
|
||||
(str) => $"outer1:{str}").Execute);
|
||||
|
||||
var func3 = new PicklingWorkerFunction(
|
||||
new PicklingUdfWrapper<string, string>(
|
||||
(str) => $"outer2:{str}").Execute);
|
||||
|
||||
object[] input = { 100, "name" };
|
||||
|
||||
// Validate one-level chaining.
|
||||
var chainedFunc1 = PicklingWorkerFunction.Chain(func1, func2);
|
||||
Assert.Equal("outer1:name:100", chainedFunc1.Func(0, input, new[] { 0, 1 }));
|
||||
|
||||
// Validate two-level chaining.
|
||||
var chainedFunc2 = PicklingWorkerFunction.Chain(chainedFunc1, func3);
|
||||
Assert.Equal("outer2:outer1:name:100", chainedFunc2.Func(0, input, new[] { 0, 1 }));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestInvalidChainingPickling()
|
||||
{
|
||||
var func1 = new PicklingWorkerFunction(
|
||||
new PicklingUdfWrapper<int, string, string>(
|
||||
(number, str) => $"{str}:{number}").Execute);
|
||||
|
||||
var func2 = new PicklingWorkerFunction(
|
||||
new PicklingUdfWrapper<string, string>(
|
||||
(str) => $"outer1:{str}").Execute);
|
||||
|
||||
object[] input = { 100, "name" };
|
||||
|
||||
// The order does not align since workerFunction2 is executed first.
|
||||
var chainedFunc1 = PicklingWorkerFunction.Chain(func2, func1);
|
||||
Assert.ThrowsAny<Exception>(() => chainedFunc1.Func(0, input, new[] { 0, 1 }));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestArrowWorkerFunction()
|
||||
{
|
||||
var func = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<string, string>(
|
||||
(str) => str).Execute);
|
||||
|
||||
string[] input = { "arg1" };
|
||||
ArrowTestUtils.AssertEquals(
|
||||
input[0],
|
||||
func.Func(0, new[] { ToArrowArray(input) }, new[] { 0 }));
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Tests the ArrowWorkerFunction handles boolean types correctly
|
||||
/// for both input and output.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestArrowWorkerFunctionForBool()
|
||||
{
|
||||
var func = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<string, bool, bool>(
|
||||
(str, flag) => flag || str.Contains("true")).Execute);
|
||||
|
||||
IArrowArray[] input = new[]
|
||||
{
|
||||
ToArrowArray(new[] { "arg1_true", "arg1_true", "arg1_false", "arg1_false" }),
|
||||
ToArrowArray(new[] { true, false, true, false }),
|
||||
};
|
||||
var results = (BooleanArray)func.Func(0, input, new[] { 0, 1 });
|
||||
Assert.Equal(4, results.Length);
|
||||
Assert.True(results.GetBoolean(0));
|
||||
Assert.True(results.GetBoolean(1));
|
||||
Assert.True(results.GetBoolean(2));
|
||||
Assert.False(results.GetBoolean(3));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestChainingArrowWorkerFunction()
|
||||
{
|
||||
var func1 = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<int, string, string>(
|
||||
(number, str) => $"{str}:{number}").Execute);
|
||||
|
||||
var func2 = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<string, string>(
|
||||
(str) => $"outer1:{str}").Execute);
|
||||
|
||||
var func3 = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<string, string>(
|
||||
(str) => $"outer2:{str}").Execute);
|
||||
|
||||
Apache.Arrow.IArrowArray[] input = new[]
|
||||
{
|
||||
ToArrowArray(new[] { 100 }),
|
||||
ToArrowArray(new[] { "name" })
|
||||
};
|
||||
|
||||
// Validate one-level chaining.
|
||||
var chainedFunc1 = ArrowWorkerFunction.Chain(func1, func2);
|
||||
ArrowTestUtils.AssertEquals(
|
||||
"outer1:name:100",
|
||||
chainedFunc1.Func(0, input, new[] { 0, 1 }));
|
||||
|
||||
// Validate two-level chaining.
|
||||
var chainedFunc2 = ArrowWorkerFunction.Chain(chainedFunc1, func3);
|
||||
ArrowTestUtils.AssertEquals(
|
||||
"outer2:outer1:name:100",
|
||||
chainedFunc2.Func(0, input, new[] { 0, 1 }));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestInvalidChainingArrow()
|
||||
{
|
||||
var func1 = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<int, string, string>(
|
||||
(number, str) => $"{str}:{number}").Execute);
|
||||
|
||||
var func2 = new ArrowWorkerFunction(
|
||||
new ArrowUdfWrapper<string, string>(
|
||||
(str) => $"outer1:{str}").Execute);
|
||||
|
||||
Apache.Arrow.IArrowArray[] input = new[]
|
||||
{
|
||||
ToArrowArray(new[] { 100 }),
|
||||
ToArrowArray(new[] { "name" })
|
||||
};
|
||||
|
||||
// The order does not align since workerFunction2 is executed first.
|
||||
var chainedFunc1 = ArrowWorkerFunction.Chain(func2, func1);
|
||||
Assert.ThrowsAny<Exception>(() => chainedFunc1.Func(0, input, new[] { 0, 1 }));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,553 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Reflection;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using Apache.Arrow;
|
||||
using Apache.Arrow.Ipc;
|
||||
using Apache.Arrow.Types;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Utils;
|
||||
using Microsoft.Spark.Worker.Command;
|
||||
using Razorvine.Pickle;
|
||||
using Xunit;
|
||||
using static Microsoft.Spark.Sql.ArrowArrayHelpers;
|
||||
|
||||
namespace Microsoft.Spark.Worker.UnitTest
|
||||
{
|
||||
public class CommandExecutorTests
|
||||
{
|
||||
[Fact]
|
||||
public void TestPicklingSqlCommandExecutorWithSingleCommand()
|
||||
{
|
||||
var udfWrapper = new Sql.PicklingUdfWrapper<string, string>(
|
||||
(str) => "udf: " + ((str is null) ? "NULL" : str));
|
||||
var command = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 0 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
|
||||
Commands = new[] { command }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
int numRows = 10;
|
||||
|
||||
// Write test data to the input stream.
|
||||
var pickler = new Pickler();
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
var pickled = pickler.dumps(
|
||||
new[] { new object[] { (i % 2 == 0) ? null : i.ToString() } });
|
||||
SerDe.Write(inputStream, pickled.Length);
|
||||
SerDe.Write(inputStream, pickled);
|
||||
}
|
||||
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate that all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(10, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
outputStream.Seek(0, SeekOrigin.Begin);
|
||||
var unpickler = new Unpickler();
|
||||
|
||||
// One row was written as a batch above, thus need to read 'numRows' batches.
|
||||
List<object> rows = new List<object>();
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
int length = SerDe.ReadInt32(outputStream);
|
||||
byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
|
||||
rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object);
|
||||
}
|
||||
|
||||
Assert.Equal(numRows, rows.Count);
|
||||
|
||||
// Validate the single command.
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
Assert.Equal(
|
||||
"udf: " + ((i % 2 == 0) ? "NULL" : i.ToString()),
|
||||
(string)rows[i]);
|
||||
}
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(outputStream.Length, outputStream.Position);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingSqlCommandExecutorWithMultiCommands()
|
||||
{
|
||||
var udfWrapper1 = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
|
||||
var udfWrapper2 = new Sql.PicklingUdfWrapper<int, int, int>(
|
||||
(arg1, arg2) => arg1 * arg2);
|
||||
|
||||
var command1 = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 0 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper1.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var command2 = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 1, 2 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper2.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
|
||||
Commands = new[] { command1, command2 }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
int numRows = 10;
|
||||
|
||||
// Write test data to the input stream.
|
||||
var pickler = new Pickler();
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
byte[] pickled = pickler.dumps(
|
||||
new[] { new object[] { i.ToString(), i, i } });
|
||||
SerDe.Write(inputStream, pickled.Length);
|
||||
SerDe.Write(inputStream, pickled);
|
||||
}
|
||||
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(10, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
outputStream.Seek(0, SeekOrigin.Begin);
|
||||
var unpickler = new Unpickler();
|
||||
|
||||
// One row was written as a batch above, thus need to read 'numRows' batches.
|
||||
List<object[]> rows = new List<object[]>();
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
int length = SerDe.ReadInt32(outputStream);
|
||||
byte[] pickledBytes = SerDe.ReadBytes(outputStream, length);
|
||||
rows.Add((unpickler.loads(pickledBytes) as ArrayList)[0] as object[]);
|
||||
}
|
||||
|
||||
Assert.Equal(numRows, rows.Count);
|
||||
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
// There were two UDFs each of which produces one column.
|
||||
object[] columns = rows[i];
|
||||
Assert.Equal($"udf: {i}", (string)columns[0]);
|
||||
Assert.Equal(i * i, (int)columns[1]);
|
||||
}
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(outputStream.Length, outputStream.Position);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestPicklingSqlCommandExecutorWithEmptyInput()
|
||||
{
|
||||
var udfWrapper = new Sql.PicklingUdfWrapper<string, string>((str) => $"udf: {str}");
|
||||
var command = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 0 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.PicklingWorkerFunction(udfWrapper.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
|
||||
Commands = new[] { command }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
// Write test data to the input stream. For the empty input scenario,
|
||||
// only send SpecialLengths.END_OF_DATA_SECTION.
|
||||
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate that all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(0, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
Assert.Equal(0, outputStream.Length);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TestArrowSqlCommandExecutorWithSingleCommand()
|
||||
{
|
||||
var udfWrapper = new Sql.ArrowUdfWrapper<string, string>((str) => $"udf: {str}");
|
||||
var command = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 0 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
|
||||
Commands = new[] { command }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
int numRows = 10;
|
||||
|
||||
// Write test data to the input stream.
|
||||
Schema schema = new Schema.Builder()
|
||||
.Field(b => b.Name("arg1").DataType(StringType.Default))
|
||||
.Build();
|
||||
var arrowWriter = new ArrowStreamWriter(inputStream, schema);
|
||||
await arrowWriter.WriteRecordBatchAsync(
|
||||
new RecordBatch(
|
||||
schema,
|
||||
new[]
|
||||
{
|
||||
ToArrowArray(
|
||||
Enumerable.Range(0, numRows)
|
||||
.Select(i => i.ToString())
|
||||
.ToArray())
|
||||
},
|
||||
numRows));
|
||||
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate that all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(numRows, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
outputStream.Seek(0, SeekOrigin.Begin);
|
||||
int arrowLength = SerDe.ReadInt32(outputStream);
|
||||
Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
|
||||
var arrowReader = new ArrowStreamReader(outputStream);
|
||||
RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
|
||||
|
||||
Assert.Equal(numRows, outputBatch.Length);
|
||||
Assert.Single(outputBatch.Arrays);
|
||||
var array = (StringArray)outputBatch.Arrays.ElementAt(0);
|
||||
// Validate the single command.
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
Assert.Equal($"udf: {i}", array.GetString(i));
|
||||
}
|
||||
|
||||
int end = SerDe.ReadInt32(outputStream);
|
||||
Assert.Equal(0, end);
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(outputStream.Length, outputStream.Position);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public async Task TestArrowSqlCommandExecutorWithMultiCommands()
|
||||
{
|
||||
var udfWrapper1 = new Sql.ArrowUdfWrapper<string, string>((str) => $"udf: {str}");
|
||||
var udfWrapper2 = new Sql.ArrowUdfWrapper<int, int, int>((arg1, arg2) => arg1 * arg2);
|
||||
|
||||
var command1 = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 0 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper1.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var command2 = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 1, 2 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper2.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
|
||||
Commands = new[] { command1, command2 }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
int numRows = 10;
|
||||
|
||||
// Write test data to the input stream.
|
||||
Schema schema = new Schema.Builder()
|
||||
.Field(b => b.Name("arg1").DataType(StringType.Default))
|
||||
.Field(b => b.Name("arg2").DataType(Int32Type.Default))
|
||||
.Field(b => b.Name("arg3").DataType(Int32Type.Default))
|
||||
.Build();
|
||||
var arrowWriter = new ArrowStreamWriter(inputStream, schema);
|
||||
await arrowWriter.WriteRecordBatchAsync(
|
||||
new RecordBatch(
|
||||
schema,
|
||||
new[]
|
||||
{
|
||||
ToArrowArray(
|
||||
Enumerable.Range(0, numRows)
|
||||
.Select(i => i.ToString())
|
||||
.ToArray()),
|
||||
ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
|
||||
ToArrowArray(Enumerable.Range(0, numRows).ToArray()),
|
||||
},
|
||||
numRows));
|
||||
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(numRows, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
outputStream.Seek(0, SeekOrigin.Begin);
|
||||
var arrowLength = SerDe.ReadInt32(outputStream);
|
||||
Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
|
||||
var arrowReader = new ArrowStreamReader(outputStream);
|
||||
RecordBatch outputBatch = await arrowReader.ReadNextRecordBatchAsync();
|
||||
|
||||
Assert.Equal(numRows, outputBatch.Length);
|
||||
Assert.Equal(2, outputBatch.Arrays.Count());
|
||||
var array1 = (StringArray)outputBatch.Arrays.ElementAt(0);
|
||||
var array2 = (Int32Array)outputBatch.Arrays.ElementAt(1);
|
||||
for (int i = 0; i < numRows; ++i)
|
||||
{
|
||||
Assert.Equal($"udf: {i}", array1.GetString(i));
|
||||
Assert.Equal(i * i, array2.Values[i]);
|
||||
}
|
||||
|
||||
int end = SerDe.ReadInt32(outputStream);
|
||||
Assert.Equal(0, end);
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(outputStream.Length, outputStream.Position);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Tests when Spark writes an input stream that only contains a
|
||||
/// Schema, and no record batches, that CommandExecutor writes the
|
||||
/// appropriate response back.
|
||||
/// </summary>
|
||||
[Fact]
|
||||
public void TestArrowSqlCommandExecutorWithEmptyInput()
|
||||
{
|
||||
var udfWrapper = new Sql.ArrowUdfWrapper<string, string>((str) => $"udf: {str}");
|
||||
var command = new SqlCommand()
|
||||
{
|
||||
ArgOffsets = new[] { 0 },
|
||||
NumChainedFunctions = 1,
|
||||
WorkerFunction = new Sql.ArrowWorkerFunction(udfWrapper.Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF,
|
||||
Commands = new[] { command }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
// Write test data to the input stream.
|
||||
Schema schema = new Schema.Builder()
|
||||
.Field(b => b.Name("arg1").DataType(StringType.Default))
|
||||
.Build();
|
||||
var arrowWriter = new ArrowStreamWriter(inputStream, schema);
|
||||
|
||||
// The .NET ArrowStreamWriter doesn't currently support writing just a
|
||||
// schema with no batches - but Java does. We use Reflection to simulate
|
||||
// the request Spark sends.
|
||||
MethodInfo writeSchemaMethod = arrowWriter.GetType().GetMethod(
|
||||
"WriteSchemaAsync",
|
||||
BindingFlags.NonPublic | BindingFlags.Instance);
|
||||
|
||||
writeSchemaMethod.Invoke(
|
||||
arrowWriter,
|
||||
new object[] { schema, CancellationToken.None });
|
||||
|
||||
SerDe.Write(inputStream, 0);
|
||||
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate that all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(0, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
outputStream.Seek(0, SeekOrigin.Begin);
|
||||
int arrowLength = SerDe.ReadInt32(outputStream);
|
||||
Assert.Equal((int)SpecialLengths.START_ARROW_STREAM, arrowLength);
|
||||
var arrowReader = new ArrowStreamReader(outputStream);
|
||||
RecordBatch outputBatch = arrowReader.ReadNextRecordBatch();
|
||||
|
||||
Assert.Equal(1, outputBatch.Schema.Fields.Count);
|
||||
Assert.IsType<StringType>(outputBatch.Schema.GetFieldByIndex(0).DataType);
|
||||
|
||||
Assert.Equal(0, outputBatch.Length);
|
||||
Assert.Single(outputBatch.Arrays);
|
||||
|
||||
var array = (StringArray)outputBatch.Arrays.ElementAt(0);
|
||||
Assert.Equal(0, array.Length);
|
||||
|
||||
int end = SerDe.ReadInt32(outputStream);
|
||||
Assert.Equal(0, end);
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(outputStream.Length, outputStream.Position);
|
||||
}
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestRDDCommandExecutor()
|
||||
{
|
||||
int mapUdf(int a) => a + 3;
|
||||
var command = new RDDCommand()
|
||||
{
|
||||
WorkerFunction = new RDD.WorkerFunction(
|
||||
new RDD<int>.MapUdfWrapper<int, int>(mapUdf).Execute),
|
||||
SerializerMode = CommandSerDe.SerializedMode.Byte,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Byte
|
||||
};
|
||||
|
||||
var commandPayload = new Worker.CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.NON_UDF,
|
||||
Commands = new[] { command }
|
||||
};
|
||||
|
||||
using (var inputStream = new MemoryStream())
|
||||
using (var outputStream = new MemoryStream())
|
||||
{
|
||||
// Write test data to the input stream.
|
||||
var formatter = new BinaryFormatter();
|
||||
var memoryStream = new MemoryStream();
|
||||
|
||||
var inputs = new[] { 0, 1, 2, 3, 4 };
|
||||
|
||||
var values = new List<byte[]>();
|
||||
foreach (int input in inputs)
|
||||
{
|
||||
memoryStream.Position = 0;
|
||||
formatter.Serialize(memoryStream, input);
|
||||
values.Add(memoryStream.ToArray());
|
||||
}
|
||||
|
||||
foreach (byte[] value in values)
|
||||
{
|
||||
SerDe.Write(inputStream, value.Length);
|
||||
SerDe.Write(inputStream, value);
|
||||
}
|
||||
|
||||
SerDe.Write(inputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
|
||||
inputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
// Execute the command.
|
||||
CommandExecutorStat stat = new CommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
0,
|
||||
commandPayload);
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(inputStream.Length, inputStream.Position);
|
||||
Assert.Equal(5, stat.NumEntriesProcessed);
|
||||
|
||||
// Validate the output stream.
|
||||
outputStream.Seek(0, SeekOrigin.Begin);
|
||||
|
||||
for (int i = 0; i < inputs.Length; ++i)
|
||||
{
|
||||
Assert.True(SerDe.ReadInt32(outputStream) > 0);
|
||||
Assert.Equal(
|
||||
mapUdf(i),
|
||||
formatter.Deserialize(outputStream));
|
||||
}
|
||||
|
||||
// Validate all the data on the stream is read.
|
||||
Assert.Equal(outputStream.Length, outputStream.Position);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,24 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <TargetFrameworks>net461;netcoreapp2.1</TargetFrameworks>
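    <!-- net461 is built only on Windows; non-Windows builds target netcoreapp2.1 only. -->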
    <TargetFrameworks Condition="'$(OS)' != 'Windows_NT'">netcoreapp2.1</TargetFrameworks>
    <IsPackable>false</IsPackable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="coverlet.msbuild" Version="2.4.0">
      <PrivateAssets>all</PrivateAssets>
      <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
    </PackageReference>
    <PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
    <PackageReference Include="xunit" Version="2.4.1" />
    <PackageReference Include="xunit.runner.visualstudio" Version="2.4.1" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\Microsoft.Spark.Worker\Microsoft.Spark.Worker.csproj" />
    <ProjectReference Include="..\Microsoft.Spark\Microsoft.Spark.csproj" />
  </ItemGroup>

</Project>
@ -0,0 +1,154 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using System.Net;
|
||||
using Microsoft.Spark.Network;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Utils;
|
||||
using Microsoft.Spark.Worker.Processor;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.Worker.UnitTest
|
||||
{
|
||||
public class PayloadProcessorTests
|
||||
{
|
||||
[Theory]
|
||||
[InlineData(Versions.V2_3_0)]
|
||||
[InlineData(Versions.V2_3_1)]
|
||||
[InlineData(Versions.V2_3_2)]
|
||||
[InlineData(Versions.V2_3_3)]
|
||||
[InlineData(Versions.V2_4_0)]
|
||||
public void TestPayloadProcessor(string version)
|
||||
{
|
||||
CommandPayload commandPayload = TestData.GetDefaultCommandPayload();
|
||||
PayloadWriter payloadWriter = new PayloadWriterFactory().Create(new Version(version));
|
||||
Payload payload = TestData.GetDefaultPayload();
|
||||
|
||||
Payload actualPayload = null;
|
||||
using (var outStream = new MemoryStream())
|
||||
{
|
||||
payloadWriter.Write(outStream, payload, commandPayload);
|
||||
|
||||
using (var inputStream = new MemoryStream(outStream.ToArray()))
|
||||
{
|
||||
actualPayload =
|
||||
new PayloadProcessor(payloadWriter.Version).Process(inputStream);
|
||||
}
|
||||
}
|
||||
|
||||
// Validate the read payload.
|
||||
Assert.Equal(payload.SplitIndex, actualPayload.SplitIndex);
|
||||
Assert.Equal(payload.Version, actualPayload.Version);
|
||||
Assert.Equal(payload.TaskContext, actualPayload.TaskContext);
|
||||
Assert.Equal(payload.SparkFilesDir, actualPayload.SparkFilesDir);
|
||||
Assert.Equal(payload.IncludeItems, actualPayload.IncludeItems);
|
||||
Assert.Equal(payload.BroadcastVariables.Count, actualPayload.BroadcastVariables.Count);
|
||||
ValidateCommandPayload(commandPayload, actualPayload.Command);
|
||||
|
||||
// Validate the UDFs.
|
||||
var actualCommand1 = (SqlCommand)actualPayload.Command.Commands[0];
|
||||
var result1 = ((PicklingWorkerFunction)actualCommand1.WorkerFunction).Func(
|
||||
0,
|
||||
new object[] { "hello", 10, 20 },
|
||||
actualCommand1.ArgOffsets);
|
||||
Assert.Equal("udf2 udf1 hello", result1);
|
||||
|
||||
var actualCommand2 = (SqlCommand)actualPayload.Command.Commands[1];
|
||||
var result2 = ((PicklingWorkerFunction)actualCommand2.WorkerFunction).Func(
|
||||
0,
|
||||
new object[] { "hello", 10, 20 },
|
||||
actualCommand2.ArgOffsets);
|
||||
Assert.Equal(30, result2);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestClosedStreamWithSocket()
|
||||
{
|
||||
var commandPayload = new CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
|
||||
Commands = new Command[] { }
|
||||
};
|
||||
|
||||
PayloadWriter payloadWriter = new PayloadWriterFactory().Create();
|
||||
Payload payload = TestData.GetDefaultPayload();
|
||||
|
||||
var serverListener = new DefaultSocketWrapper();
|
||||
serverListener.Listen();
|
||||
|
||||
var port = (serverListener.LocalEndPoint as IPEndPoint).Port;
|
||||
var clientSocket = new DefaultSocketWrapper();
|
||||
clientSocket.Connect(IPAddress.Loopback, port, null);
|
||||
|
||||
using (ISocketWrapper serverSocket = serverListener.Accept())
|
||||
{
|
||||
Stream outStream = serverSocket.OutputStream;
|
||||
payloadWriter.Write(outStream, payload, commandPayload);
|
||||
outStream.Flush();
|
||||
}
|
||||
|
||||
// At this point server socket is closed.
|
||||
Stream inStream = clientSocket.InputStream;
|
||||
|
||||
// Consume bytes already written to the socket.
|
||||
var payloadProcessor = new PayloadProcessor(payloadWriter.Version);
|
||||
Payload actualPayload = payloadProcessor.Process(inStream);
|
||||
|
||||
Assert.Equal(payload.SplitIndex, actualPayload.SplitIndex);
|
||||
Assert.Equal(payload.Version, actualPayload.Version);
|
||||
Assert.Equal(payload.TaskContext, actualPayload.TaskContext);
|
||||
Assert.Equal(payload.SparkFilesDir, actualPayload.SparkFilesDir);
|
||||
Assert.Equal(payload.IncludeItems, actualPayload.IncludeItems);
|
||||
Assert.Equal(payload.BroadcastVariables.Count, actualPayload.BroadcastVariables.Count);
|
||||
ValidateCommandPayload(commandPayload, actualPayload.Command);
|
||||
|
||||
// Another read will detect that the socket is closed.
|
||||
Assert.Null(payloadProcessor.Process(inStream));
|
||||
}
|
||||
|
||||
[Fact]
|
||||
public void TestClosedStreamWithMemoryStream()
|
||||
{
|
||||
var inputStream = new MemoryStream();
|
||||
|
||||
// Version is not used in this scenario.
|
||||
var processor = new PayloadProcessor(null);
|
||||
|
||||
// Nothing is written to the stream.
|
||||
Assert.Null(processor.Process(inputStream));
|
||||
|
||||
inputStream.Dispose();
|
||||
|
||||
// The stream is closed. Payload with null is expected.
|
||||
Assert.Null(processor.Process(inputStream));
|
||||
}
|
||||
|
||||
private void ValidateCommandPayload(
|
||||
CommandPayload expected,
|
||||
Worker.CommandPayload actual)
|
||||
{
|
||||
Assert.Equal(expected.EvalType, actual.EvalType);
|
||||
Assert.Equal(expected.Commands.Length, actual.Commands.Count());
|
||||
|
||||
for (int i = 0; i < expected.Commands.Length; ++i)
|
||||
{
|
||||
Command expectedCommand = expected.Commands[i];
|
||||
var actualCommand = (SqlCommand)actual.Commands[i];
|
||||
Assert.Equal(expectedCommand.ArgOffsets, actualCommand.ArgOffsets);
|
||||
Assert.Equal(
|
||||
expectedCommand.ChainedUdfs.Length,
|
||||
actualCommand.NumChainedFunctions);
|
||||
Assert.Equal(
|
||||
expectedCommand.SerializerMode,
|
||||
actualCommand.SerializerMode);
|
||||
Assert.Equal(
|
||||
expectedCommand.DeserializerMode,
|
||||
actualCommand.DeserializerMode);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,310 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Utils;
|
||||
using static Microsoft.Spark.Utils.UdfUtils;
|
||||
|
||||
namespace Microsoft.Spark.Worker.UnitTest
|
||||
{
|
||||
/// <summary>
|
||||
/// Command stores data necessary to create a payload for a single command,
|
||||
/// which can have chained UDFs. The reason Microsoft.Spark.Worker.Command
|
||||
/// cannot be used is because it stores WorkerFunction which already abstracts
|
||||
/// out the chained UDFs.
|
||||
/// </summary>
|
||||
internal sealed class Command
|
||||
{
|
||||
internal Delegate[] ChainedUdfs { get; set; }
|
||||
|
||||
internal int[] ArgOffsets { get; set; }
|
||||
|
||||
internal CommandSerDe.SerializedMode SerializerMode { get; set; }
|
||||
|
||||
internal CommandSerDe.SerializedMode DeserializerMode { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// CommandPayload stores data necessary to create a payload for multiple commands.
|
||||
/// </summary>
|
||||
internal sealed class CommandPayload
|
||||
{
|
||||
internal PythonEvalType EvalType { get; set; }
|
||||
|
||||
internal Command[] Commands { get; set; }
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// TaskContext writer for different Spark versions.
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
internal interface ITaskContextWriter
|
||||
{
|
||||
void Write(Stream stream, TaskContext taskContext);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// TaskContextWriter for version 2.3.*.
|
||||
/// </summary>
|
||||
internal sealed class TaskContextWriterV2_3_X : ITaskContextWriter
|
||||
{
|
||||
public void Write(Stream stream, TaskContext taskContext)
|
||||
{
|
||||
SerDe.Write(stream, taskContext.StageId);
|
||||
SerDe.Write(stream, taskContext.PartitionId);
|
||||
SerDe.Write(stream, taskContext.AttemptNumber);
|
||||
SerDe.Write(stream, taskContext.AttemptId);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// TaskContextWriter for version 2.4.*.
|
||||
/// </summary>
|
||||
internal sealed class TaskContextWriterV2_4_X : ITaskContextWriter
|
||||
{
|
||||
public void Write(Stream stream, TaskContext taskContext)
|
||||
{
|
||||
SerDe.Write(stream, taskContext.IsBarrier);
|
||||
SerDe.Write(stream, taskContext.Port);
|
||||
SerDe.Write(stream, taskContext.Secret);
|
||||
|
||||
SerDe.Write(stream, taskContext.StageId);
|
||||
SerDe.Write(stream, taskContext.PartitionId);
|
||||
SerDe.Write(stream, taskContext.AttemptNumber);
|
||||
SerDe.Write(stream, taskContext.AttemptId);
|
||||
|
||||
SerDe.Write(stream, taskContext.LocalProperties.Count);
|
||||
foreach (KeyValuePair<string, string> kv in taskContext.LocalProperties)
|
||||
{
|
||||
SerDe.Write(stream, kv.Key);
|
||||
SerDe.Write(stream, kv.Value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// BroadcastVariable writer for different Spark versions.
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
internal interface IBroadcastVariableWriter
|
||||
{
|
||||
void Write(Stream stream, BroadcastVariables broadcastVars);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// BroadcastVariableWriter for version 2.3.0 and 2.3.1.
|
||||
/// </summary>
|
||||
internal sealed class BroadcastVariableWriterV2_3_0 : IBroadcastVariableWriter
|
||||
{
|
||||
public void Write(Stream stream, BroadcastVariables broadcastVars)
|
||||
{
|
||||
Debug.Assert(broadcastVars.Count == 0);
|
||||
SerDe.Write(stream, broadcastVars.Count);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// BroadcastVariableWriter for version 2.3.2 and up.
|
||||
/// </summary>
|
||||
internal sealed class BroadcastVariableWriterV2_3_2 : IBroadcastVariableWriter
|
||||
{
|
||||
public void Write(Stream stream, BroadcastVariables broadcastVars)
|
||||
{
|
||||
SerDe.Write(stream, broadcastVars.DecryptionServerNeeded);
|
||||
SerDe.Write(stream, broadcastVars.Count);
|
||||
|
||||
Debug.Assert(broadcastVars.Count == 0);
|
||||
|
||||
if (broadcastVars.DecryptionServerNeeded)
|
||||
{
|
||||
SerDe.Write(stream, broadcastVars.DecryptionServerPort);
|
||||
SerDe.Write(stream, broadcastVars.Secret);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// Command writer for different Spark versions.
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
internal interface ICommandWriter
|
||||
{
|
||||
void Write(Stream stream, CommandPayload commandPayload);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Provides a functionality to write Command[].
|
||||
/// </summary>
|
||||
internal abstract class CommandWriterBase
|
||||
{
|
||||
public void Write(Stream stream, Command[] commands)
|
||||
{
|
||||
SerDe.Write(stream, commands.Length);
|
||||
foreach (Command command in commands)
|
||||
{
|
||||
SerDe.Write(stream, command.ArgOffsets.Length);
|
||||
foreach (int argOffset in command.ArgOffsets)
|
||||
{
|
||||
SerDe.Write(stream, argOffset);
|
||||
}
|
||||
|
||||
SerDe.Write(stream, command.ChainedUdfs.Length);
|
||||
foreach (Delegate udf in command.ChainedUdfs)
|
||||
{
|
||||
byte[] serializedCommand = CommandSerDe.Serialize(
|
||||
udf,
|
||||
CommandSerDe.SerializedMode.Row,
|
||||
CommandSerDe.SerializedMode.Row);
|
||||
|
||||
SerDe.Write(stream, serializedCommand.Length);
|
||||
SerDe.Write(stream, serializedCommand);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// CommandWriter for version 2.3.*.
|
||||
/// </summary>
|
||||
internal sealed class CommandWriterV2_3_X : CommandWriterBase, ICommandWriter
|
||||
{
|
||||
public void Write(Stream stream, CommandPayload commandPayload)
|
||||
{
|
||||
SerDe.Write(stream, (int)commandPayload.EvalType);
|
||||
|
||||
Write(stream, commandPayload.Commands);
|
||||
|
||||
if ((commandPayload.EvalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF) ||
|
||||
(commandPayload.EvalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF))
|
||||
{
|
||||
SerDe.Write(stream, "unused timezone");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// CommandWriter for version 2.4.*.
|
||||
/// </summary>
|
||||
internal sealed class CommandWriterV2_4_X : CommandWriterBase, ICommandWriter
|
||||
{
|
||||
public void Write(Stream stream, CommandPayload commandPayload)
|
||||
{
|
||||
|
||||
if (commandPayload.EvalType == PythonEvalType.SQL_SCALAR_PANDAS_UDF ||
|
||||
commandPayload.EvalType == PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF ||
|
||||
commandPayload.EvalType == PythonEvalType.SQL_GROUPED_AGG_PANDAS_UDF ||
|
||||
commandPayload.EvalType == PythonEvalType.SQL_WINDOW_AGG_PANDAS_UDF)
|
||||
{
|
||||
SerDe.Write(stream, 1);
|
||||
for (int i = 0; i < 1; ++i)
|
||||
{
|
||||
SerDe.Write(stream, "unused key");
|
||||
SerDe.Write(stream, "unused value");
|
||||
}
|
||||
}
|
||||
|
||||
SerDe.Write(stream, (int)commandPayload.EvalType);
|
||||
|
||||
Write(stream, commandPayload.Commands);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Payload writer that supports different Spark versions.
|
||||
/// </summary>
|
||||
internal sealed class PayloadWriter
|
||||
{
|
||||
private readonly ITaskContextWriter _taskContextWriter;
|
||||
private readonly IBroadcastVariableWriter _broadcastVariableWriter;
|
||||
private readonly ICommandWriter _commandWriter;
|
||||
|
||||
internal PayloadWriter(
|
||||
Version version,
|
||||
ITaskContextWriter taskContextWriter,
|
||||
IBroadcastVariableWriter broadcastVariableWriter,
|
||||
ICommandWriter commandWriter)
|
||||
{
|
||||
Version = version;
|
||||
_taskContextWriter = taskContextWriter;
|
||||
_broadcastVariableWriter = broadcastVariableWriter;
|
||||
_commandWriter = commandWriter;
|
||||
}
|
||||
|
||||
internal Version Version { get; }
|
||||
|
||||
internal void Write(
|
||||
Stream stream,
|
||||
Payload payload,
|
||||
CommandPayload commandPayload)
|
||||
{
|
||||
SerDe.Write(stream, payload.SplitIndex);
|
||||
SerDe.Write(stream, payload.Version);
|
||||
_taskContextWriter.Write(stream, payload.TaskContext);
|
||||
SerDe.Write(stream, payload.SparkFilesDir);
|
||||
Write(stream, payload.IncludeItems);
|
||||
_broadcastVariableWriter.Write(stream, payload.BroadcastVariables);
|
||||
_commandWriter.Write(stream, commandPayload);
|
||||
}
|
||||
|
||||
private static void Write(Stream stream, IEnumerable<string> includeItems)
|
||||
{
|
||||
if (includeItems is null)
|
||||
{
|
||||
SerDe.Write(stream, 0);
|
||||
return;
|
||||
}
|
||||
|
||||
SerDe.Write(stream, includeItems.Count());
|
||||
foreach (string includeItem in includeItems)
|
||||
{
|
||||
SerDe.Write(stream, includeItem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Factory class for creating a PayloadWriter given a version.
|
||||
/// </summary>
|
||||
internal sealed class PayloadWriterFactory
|
||||
{
|
||||
internal PayloadWriter Create(Version version = null)
|
||||
{
|
||||
if (version == null)
|
||||
{
|
||||
version = new Version(Versions.V2_4_0);
|
||||
}
|
||||
|
||||
switch (version.ToString())
|
||||
{
|
||||
case Versions.V2_3_0:
|
||||
case Versions.V2_3_1:
|
||||
return new PayloadWriter(
|
||||
version,
|
||||
new TaskContextWriterV2_3_X(),
|
||||
new BroadcastVariableWriterV2_3_0(),
|
||||
new CommandWriterV2_3_X());
|
||||
case Versions.V2_3_2:
|
||||
case Versions.V2_3_3:
|
||||
return new PayloadWriter(
|
||||
version,
|
||||
new TaskContextWriterV2_3_X(),
|
||||
new BroadcastVariableWriterV2_3_2(),
|
||||
new CommandWriterV2_3_X());
|
||||
case Versions.V2_4_0:
|
||||
return new PayloadWriter(
|
||||
version,
|
||||
new TaskContextWriterV2_4_X(),
|
||||
new BroadcastVariableWriterV2_3_2(),
|
||||
new CommandWriterV2_4_X());
|
||||
default:
|
||||
throw new NotSupportedException($"Spark {version} is not supported.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
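A minimal usage sketch, added here for illustration only (it is not part of this commit): assuming the PayloadWriterFactory, Versions and TestData types from the surrounding test sources, it writes a default Spark 2.4.0 payload into an in-memory stream.

// Illustrative sketch: a MemoryStream stands in for the worker's socket stream.
using (var stream = new System.IO.MemoryStream())
{
    PayloadWriter payloadWriter =
        new PayloadWriterFactory().Create(new System.Version(Versions.V2_4_0));
    payloadWriter.Write(
        stream,
        TestData.GetDefaultPayload(),
        TestData.GetDefaultCommandPayload());
    // Create() throws NotSupportedException for a version string it does not know.
}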
@ -0,0 +1,119 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.Collections;
|
||||
using System.Collections.Generic;
|
||||
using System.Net;
|
||||
using System.Threading.Tasks;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Network;
|
||||
using Razorvine.Pickle;
|
||||
using Xunit;
|
||||
|
||||
namespace Microsoft.Spark.Worker.UnitTest
|
||||
{
|
||||
public class TaskRunnerTests
|
||||
{
|
||||
[Fact]
|
||||
public void TestTaskRunner()
|
||||
{
|
||||
using (var serverListener = new DefaultSocketWrapper())
|
||||
{
|
||||
serverListener.Listen();
|
||||
|
||||
var port = (serverListener.LocalEndPoint as IPEndPoint).Port;
|
||||
var clientSocket = new DefaultSocketWrapper();
|
||||
clientSocket.Connect(IPAddress.Loopback, port, null);
|
||||
|
||||
PayloadWriter payloadWriter = new PayloadWriterFactory().Create();
|
||||
var taskRunner = new TaskRunner(0, clientSocket, false, payloadWriter.Version);
|
||||
var clientTask = Task.Run(() => taskRunner.Run());
|
||||
|
||||
using (ISocketWrapper serverSocket = serverListener.Accept())
|
||||
{
|
||||
System.IO.Stream inputStream = serverSocket.InputStream;
|
||||
System.IO.Stream outputStream = serverSocket.OutputStream;
|
||||
|
||||
Payload payload = TestData.GetDefaultPayload();
|
||||
CommandPayload commandPayload = TestData.GetDefaultCommandPayload();
|
||||
|
||||
payloadWriter.Write(outputStream, payload, commandPayload);
|
||||
|
||||
// Write 10 rows to the output stream.
|
||||
var pickler = new Pickler();
|
||||
for (int i = 0; i < 10; ++i)
|
||||
{
|
||||
var pickled = pickler.dumps(
|
||||
new[] { new object[] { i.ToString(), i, i } });
|
||||
SerDe.Write(outputStream, pickled.Length);
|
||||
SerDe.Write(outputStream, pickled);
|
||||
}
|
||||
|
||||
// Signal the end of data and stream.
|
||||
SerDe.Write(outputStream, (int)SpecialLengths.END_OF_DATA_SECTION);
|
||||
SerDe.Write(outputStream, (int)SpecialLengths.END_OF_STREAM);
|
||||
outputStream.Flush();
|
||||
|
||||
// Now process the bytes flowing in from the client.
|
||||
var timingDataReceived = false;
|
||||
var exceptionThrown = false;
|
||||
var rowsReceived = new List<object[]>();
|
||||
|
||||
while (true)
|
||||
{
|
||||
var length = SerDe.ReadInt32(inputStream);
|
||||
if (length > 0)
|
||||
{
|
||||
var pickledBytes = SerDe.ReadBytes(inputStream, length);
|
||||
var unpickler = new Unpickler();
|
||||
var rows = unpickler.loads(pickledBytes) as ArrayList;
|
||||
foreach (object row in rows)
|
||||
{
|
||||
rowsReceived.Add((object[])row);
|
||||
}
|
||||
}
|
||||
else if (length == (int)SpecialLengths.TIMING_DATA)
|
||||
{
|
||||
var bootTime = SerDe.ReadInt64(inputStream);
|
||||
var initTime = SerDe.ReadInt64(inputStream);
|
||||
var finishTime = SerDe.ReadInt64(inputStream);
|
||||
var memoryBytesSpilled = SerDe.ReadInt64(inputStream);
|
||||
var diskBytesSpilled = SerDe.ReadInt64(inputStream);
|
||||
timingDataReceived = true;
|
||||
}
|
||||
else if (length == (int)SpecialLengths.PYTHON_EXCEPTION_THROWN)
|
||||
{
|
||||
SerDe.ReadString(inputStream);
|
||||
exceptionThrown = true;
|
||||
break;
|
||||
}
|
||||
else if (length == (int)SpecialLengths.END_OF_DATA_SECTION)
|
||||
{
|
||||
var numAccumulatorUpdates = SerDe.ReadInt32(inputStream);
|
||||
SerDe.ReadInt32(inputStream);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Assert.True(timingDataReceived);
|
||||
Assert.False(exceptionThrown);
|
||||
|
||||
// Validate rows received.
|
||||
Assert.Equal(10, rowsReceived.Count);
|
||||
for (int i = 0; i < 10; ++i)
|
||||
{
|
||||
// Two commands are registered, thus expecting two columns.
|
||||
// Refer to TestData.GetDefaultCommandPayload().
|
||||
var row = rowsReceived[i];
|
||||
Assert.Equal(2, rowsReceived[i].Length);
|
||||
Assert.Equal($"udf2 udf1 {i}", row[0]);
|
||||
Assert.Equal(i + i, row[1]);
|
||||
}
|
||||
}
|
||||
|
||||
Assert.True(clientTask.Wait(5000));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,78 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Utils;
|
||||
|
||||
namespace Microsoft.Spark.Worker.UnitTest
|
||||
{
|
||||
/// <summary>
|
||||
/// TestData provides helper functions to create default test data.
|
||||
/// </summary>
|
||||
internal static class TestData
|
||||
{
|
||||
internal static Payload GetDefaultPayload()
|
||||
{
|
||||
var taskContext = new TaskContext()
|
||||
{
|
||||
StageId = 1,
|
||||
PartitionId = 2,
|
||||
AttemptNumber = 1,
|
||||
AttemptId = 100L,
|
||||
Port = 9999,
|
||||
Secret = "secret"
|
||||
};
|
||||
|
||||
var broadcastVars = new BroadcastVariables()
|
||||
{
|
||||
DecryptionServerNeeded = true,
|
||||
DecryptionServerPort = 9999,
|
||||
Secret = "secret"
|
||||
};
|
||||
|
||||
return new Payload()
|
||||
{
|
||||
SplitIndex = 10,
|
||||
Version = "1.0",
|
||||
TaskContext = taskContext,
|
||||
SparkFilesDir = "directory",
|
||||
IncludeItems = new[] { "file1", "file2" },
|
||||
BroadcastVariables = broadcastVars
|
||||
};
|
||||
}
|
||||
|
||||
internal static CommandPayload GetDefaultCommandPayload()
|
||||
{
|
||||
var udfWrapper1 = new PicklingUdfWrapper<string, string>((str) => $"udf1 {str}");
|
||||
var udfWrapper2 = new PicklingUdfWrapper<string, string>((str) => $"udf2 {str}");
|
||||
var udfWrapper3 = new PicklingUdfWrapper<int, int, int>((arg1, arg2) => arg1 + arg2);
|
||||
|
||||
var command1 = new Command()
|
||||
{
|
||||
ChainedUdfs = new PicklingWorkerFunction.ExecuteDelegate[]
|
||||
{
|
||||
udfWrapper1.Execute,
|
||||
udfWrapper2.Execute
|
||||
},
|
||||
ArgOffsets = new[] { 0 },
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
var command2 = new Command()
|
||||
{
|
||||
ChainedUdfs = new PicklingWorkerFunction.ExecuteDelegate[] { udfWrapper3.Execute },
|
||||
ArgOffsets = new[] { 1, 2 },
|
||||
SerializerMode = CommandSerDe.SerializedMode.Row,
|
||||
DeserializerMode = CommandSerDe.SerializedMode.Row
|
||||
};
|
||||
|
||||
return new CommandPayload()
|
||||
{
|
||||
EvalType = UdfUtils.PythonEvalType.SQL_BATCHED_UDF,
|
||||
Commands = new[] { command1, command2 }
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,65 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
|
||||
namespace Microsoft.Spark.Worker.Command
|
||||
{
|
||||
/// <summary>
|
||||
/// CommandExecutorStat stores statistics information for executing a command payload.
|
||||
/// </summary>
|
||||
internal sealed class CommandExecutorStat
|
||||
{
|
||||
/// <summary>
|
||||
/// Number of non-null entries received/processed.
|
||||
/// </summary>
|
||||
internal int NumEntriesProcessed { get; set; }
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// CommandExecutor reads input data from the input stream,
|
||||
/// runs commands on it, and writes the results to the output stream.
|
||||
/// </summary>
|
||||
internal sealed class CommandExecutor
|
||||
{
|
||||
/// <summary>
|
||||
/// Executes the commands on the input data read from input stream
|
||||
/// and writes results to the output stream.
|
||||
/// </summary>
|
||||
/// <param name="inputStream">Input stream to read data from</param>
|
||||
/// <param name="outputStream">Output stream to write results to</param>
|
||||
/// <param name="splitIndex">Split index for this task</param>
|
||||
/// <param name="commandPayload">Contains the commands to execute</param>
|
||||
/// <returns>Statistics captured during the Execute() run</returns>
|
||||
internal CommandExecutorStat Execute(
|
||||
Stream inputStream,
|
||||
Stream outputStream,
|
||||
int splitIndex,
|
||||
CommandPayload commandPayload)
|
||||
{
|
||||
if (commandPayload.EvalType == Spark.Utils.UdfUtils.PythonEvalType.NON_UDF)
|
||||
{
|
||||
if (commandPayload.Commands.Length != 1)
|
||||
{
|
||||
throw new System.Exception(
|
||||
"Invalid number of commands for RDD: {commandPayload.Commands.Length}");
|
||||
}
|
||||
|
||||
return new RDDCommandExecutor().Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
splitIndex,
|
||||
(RDDCommand)commandPayload.Commands[0]);
|
||||
}
|
||||
|
||||
return SqlCommandExecutor.Execute(
|
||||
inputStream,
|
||||
outputStream,
|
||||
commandPayload.EvalType,
|
||||
commandPayload.Commands.Cast<SqlCommand>().ToArray());
|
||||
|
||||
}
|
||||
}
|
||||
}
|
|
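A hedged calling sketch, added as commentary (not from the commit): the worker's task loop would pass CommandExecutor the socket streams and the payloads it has already deserialized; socket, payload and commandPayload below are placeholders for those values.

// Illustrative sketch: NON_UDF payloads go to RDDCommandExecutor, everything else
// to SqlCommandExecutor; the returned stat carries the processed-entry count.
CommandExecutorStat stat = new CommandExecutor().Execute(
    socket.InputStream,
    socket.OutputStream,
    payload.SplitIndex,
    commandPayload);
int processed = stat.NumEntriesProcessed;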
@ -0,0 +1,128 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.IO;
|
||||
using System.Runtime.Serialization.Formatters.Binary;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.Utils;
|
||||
|
||||
namespace Microsoft.Spark.Worker.Command
|
||||
{
|
||||
/// <summary>
|
||||
/// RDDCommandExecutor reads input data from the input stream,
|
||||
/// runs commands on it, and writes the results to the output stream.
|
||||
/// </summary>
|
||||
internal class RDDCommandExecutor
|
||||
{
|
||||
[ThreadStatic]
|
||||
private static MemoryStream s_writeOutputStream;
|
||||
[ThreadStatic]
|
||||
private static BinaryFormatter s_binaryFormatter;
|
||||
|
||||
/// <summary>
|
||||
/// Executes the commands on the input data read from input stream
|
||||
/// and writes results to the output stream.
|
||||
/// </summary>
|
||||
/// <param name="inputStream">Input stream to read data from</param>
|
||||
/// <param name="outputStream">Output stream to write results to</param>
|
||||
/// <param name="splitIndex">Split index for this task</param>
|
||||
/// <param name="command">Contains the commands to execute</param>
|
||||
/// <returns>Statistics captured during the Execute() run</returns>
|
||||
internal CommandExecutorStat Execute(
|
||||
Stream inputStream,
|
||||
Stream outputStream,
|
||||
int splitIndex,
|
||||
RDDCommand command)
|
||||
{
|
||||
var stat = new CommandExecutorStat();
|
||||
|
||||
CommandSerDe.SerializedMode serializerMode = command.SerializerMode;
|
||||
CommandSerDe.SerializedMode deserializerMode = command.DeserializerMode;
|
||||
|
||||
RDD.WorkerFunction.ExecuteDelegate func = command.WorkerFunction.Func;
|
||||
foreach (object output in func(
|
||||
splitIndex,
|
||||
GetInputIterator(inputStream, deserializerMode)))
|
||||
{
|
||||
WriteOutput(outputStream, serializerMode, output);
|
||||
|
||||
++stat.NumEntriesProcessed;
|
||||
}
|
||||
|
||||
return stat;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create input iterator from the given input stream.
|
||||
/// </summary>
|
||||
/// <param name="inputStream">Stream to read from</param>
|
||||
/// <param name="deserializerMode">Mode for deserialization</param>
|
||||
/// <returns>Iterator over the deserialized input objects</returns>
|
||||
private IEnumerable<object> GetInputIterator(
|
||||
Stream inputStream,
|
||||
CommandSerDe.SerializedMode deserializerMode)
|
||||
{
|
||||
RDD.Collector.IDeserializer deserializer =
|
||||
RDD.Collector.GetDeserializer(deserializerMode);
|
||||
|
||||
var messageLength = 0;
|
||||
while ((messageLength = SerDe.ReadInt32(inputStream)) !=
|
||||
(int)SpecialLengths.END_OF_DATA_SECTION)
|
||||
{
|
||||
if ((messageLength > 0) || (messageLength == (int)SpecialLengths.NULL))
|
||||
{
|
||||
yield return deserializer.Deserialize(inputStream, messageLength);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes the given message to the stream.
|
||||
/// </summary>
|
||||
/// <param name="stream">Stream to write to</param>
|
||||
/// <param name="serializerMode">Mode for serialization</param>
|
||||
/// <param name="message">Message to write to</param>
|
||||
private void WriteOutput(
|
||||
Stream stream,
|
||||
CommandSerDe.SerializedMode serializerMode,
|
||||
object message)
|
||||
{
|
||||
MemoryStream writeOutputStream = s_writeOutputStream ??
|
||||
(s_writeOutputStream = new MemoryStream());
|
||||
writeOutputStream.Position = 0;
|
||||
Serialize(serializerMode, message, writeOutputStream);
|
||||
SerDe.Write(stream, (int)writeOutputStream.Position);
|
||||
SerDe.Write(stream, writeOutputStream.GetBuffer(), (int)writeOutputStream.Position);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Serialize a row based on the given serializer mode.
|
||||
/// </summary>
|
||||
/// <param name="serializerMode"></param>
|
||||
/// <param name="message"></param>
|
||||
/// <param name="stream"></param>
|
||||
private void Serialize(
|
||||
CommandSerDe.SerializedMode serializerMode,
|
||||
object message,
|
||||
MemoryStream stream)
|
||||
{
|
||||
switch (serializerMode)
|
||||
{
|
||||
case CommandSerDe.SerializedMode.Byte:
|
||||
BinaryFormatter formatter = s_binaryFormatter ??
|
||||
(s_binaryFormatter = new BinaryFormatter());
|
||||
formatter.Serialize(stream, message);
|
||||
break;
|
||||
case CommandSerDe.SerializedMode.None:
|
||||
case CommandSerDe.SerializedMode.String:
|
||||
case CommandSerDe.SerializedMode.Pair:
|
||||
default:
|
||||
throw new NotImplementedException(
|
||||
$"Unsupported serializerMode: {serializerMode}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
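A small framing sketch, added for illustration (not from the commit): the WriteOutput method above emits length-prefixed frames, which is exactly what the reader loop in TaskRunnerTests above consumes; outputStream and payloadBytes are placeholder names.

// Illustrative sketch: one frame is a 4-byte length followed by that many payload
// bytes; negative lengths are reserved for SpecialLengths markers such as
// END_OF_DATA_SECTION, which terminates the data section.
SerDe.Write(outputStream, payloadBytes.Length);
SerDe.Write(outputStream, payloadBytes);
SerDe.Write(outputStream, (int)SpecialLengths.END_OF_DATA_SECTION);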
@ -0,0 +1,532 @@
|
|||
// Licensed to the .NET Foundation under one or more agreements.
|
||||
// The .NET Foundation licenses this file to you under the MIT license.
|
||||
// See the LICENSE file in the project root for more information.
|
||||
|
||||
using System;
|
||||
using System.Buffers;
|
||||
using System.Collections.Generic;
|
||||
using System.Diagnostics;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
using Apache.Arrow;
|
||||
using Apache.Arrow.Ipc;
|
||||
using Microsoft.Spark.Interop.Ipc;
|
||||
using Microsoft.Spark.IO;
|
||||
using Microsoft.Spark.Sql;
|
||||
using Microsoft.Spark.Utils;
|
||||
using Razorvine.Pickle;
|
||||
|
||||
namespace Microsoft.Spark.Worker.Command
|
||||
{
|
||||
/// <summary>
|
||||
/// SqlCommandExecutor reads input data from the input stream,
|
||||
/// runs commands on them, and writes result to the output stream.
|
||||
/// </summary>
|
||||
internal abstract class SqlCommandExecutor
|
||||
{
|
||||
/// <summary>
|
||||
/// Executes the commands on the input data read from input stream
|
||||
/// and writes results to the output stream.
|
||||
/// </summary>
|
||||
/// <param name="inputStream">Input stream to read data from</param>
|
||||
/// <param name="outputStream">Output stream to write results to</param>
|
||||
/// <param name="evalType">Evaluation type for the current commands</param>
|
||||
/// <param name="commands">Contains the commands to execute</param>
|
||||
/// <returns>Statistics captured during the Execute() run</returns>
|
||||
internal static CommandExecutorStat Execute(
|
||||
Stream inputStream,
|
||||
Stream outputStream,
|
||||
UdfUtils.PythonEvalType evalType,
|
||||
SqlCommand[] commands)
|
||||
{
|
||||
if (commands.Length <= 0)
|
||||
{
|
||||
throw new ArgumentException("Commands cannot be empty.");
|
||||
}
|
||||
|
||||
if (commands.Any(c =>
|
||||
(c.SerializerMode != CommandSerDe.SerializedMode.Row) ||
|
||||
(c.DeserializerMode != CommandSerDe.SerializedMode.Row)))
|
||||
{
|
||||
throw new ArgumentException("Unexpected serialization mode found.");
|
||||
}
|
||||
|
||||
SqlCommandExecutor executor;
|
||||
if (evalType == UdfUtils.PythonEvalType.SQL_SCALAR_PANDAS_UDF)
|
||||
{
|
||||
executor = new ArrowSqlCommandExecutor();
|
||||
}
|
||||
else if (evalType == UdfUtils.PythonEvalType.SQL_BATCHED_UDF)
|
||||
{
|
||||
executor = new PicklingSqlCommandExecutor();
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new NotSupportedException($"{evalType} is not supported.");
|
||||
}
|
||||
|
||||
return executor.ExecuteCore(inputStream, outputStream, commands);
|
||||
}
|
||||
|
||||
protected abstract CommandExecutorStat ExecuteCore(
|
||||
Stream inputStream,
|
||||
Stream outputStream,
|
||||
SqlCommand[] commands);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A SqlCommandExecutor that reads and writes using the
|
||||
/// Python pickling format.
|
||||
/// </summary>
|
||||
internal class PicklingSqlCommandExecutor : SqlCommandExecutor
|
||||
{
|
||||
[ThreadStatic]
|
||||
private static MemoryStream s_writeOutputStream;
|
||||
[ThreadStatic]
|
||||
private static MaxLengthReadStream s_slicedReadStream;
|
||||
[ThreadStatic]
|
||||
private static Pickler s_pickler;
|
||||
|
||||
protected override CommandExecutorStat ExecuteCore(
|
||||
Stream inputStream,
|
||||
Stream outputStream,
|
||||
SqlCommand[] commands)
|
||||
{
|
||||
var stat = new CommandExecutorStat();
|
||||
ICommandRunner commandRunner = CreateCommandRunner(commands);
|
||||
|
||||
// On the Spark side, each object in the following List<> is considered as a row.
|
||||
// See the ICommandRunner comments above for the types for a row.
|
||||
var outputRows = new List<object>();
|
||||
|
||||
// If the input is empty (no rows) or all rows have been read, then
|
||||
// SpecialLengths.END_OF_DATA_SECTION is sent as the messageLength.
|
||||
// For example, no rows:
|
||||
// +---+----+
|
||||
// |age|name|
|
||||
// +---+----+
|
||||
// +---+----+
|
||||
int messageLength = 0;
|
||||
while ((messageLength = SerDe.ReadInt32(inputStream)) !=
|
||||
(int)SpecialLengths.END_OF_DATA_SECTION)
|
||||
{
|
||||
if ((messageLength > 0) || (messageLength == (int)SpecialLengths.NULL))
|
||||
{
|
||||
if (messageLength <= 0)
|
||||
{
|
||||
throw new InvalidDataException(
|
||||
$"Invalid message length: {messageLength}");
|
||||
}
|
||||
|
||||
MaxLengthReadStream readStream = s_slicedReadStream ??
|
||||
(s_slicedReadStream = new MaxLengthReadStream());
|
||||
readStream.Reset(inputStream, messageLength);
|
||||
|
||||
// Each row in inputRows is of type object[]. If a null is present in a row
|
||||
// then the corresponding index column of the row object[] will be set to null.
|
||||
// For example, (inputRows.Length == 2) and (inputRows[0][0] == null)
|
||||
// +----+
|
||||
// | age|
|
||||
// +----+
|
||||
// |null|
|
||||
// | 11|
|
||||
// +----+
|
||||
object[] inputRows = PythonSerDe.GetUnpickledObjects(readStream);
|
||||
|
||||
for (int i = 0; i < inputRows.Length; ++i)
|
||||
{
|
||||
// Split id is not used for SQL UDFs, so 0 is passed.
|
||||
outputRows.Add(commandRunner.Run(0, inputRows[i]));
|
||||
}
|
||||
|
||||
WriteOutput(outputStream, outputRows);
|
||||
stat.NumEntriesProcessed += inputRows.Length;
|
||||
outputRows.Clear();
|
||||
}
|
||||
}
|
||||
|
||||
return stat;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Writes the given message to the stream.
|
||||
/// </summary>
|
||||
/// <param name="stream">Stream to write to</param>
|
||||
/// <param name="rows">Rows to write to</param>
|
||||
private void WriteOutput(Stream stream, IEnumerable<object> rows)
|
||||
{
|
||||
MemoryStream writeOutputStream = s_writeOutputStream ??
|
||||
(s_writeOutputStream = new MemoryStream());
|
||||
writeOutputStream.Position = 0;
|
||||
|
||||
Pickler pickler = s_pickler ?? (s_pickler = new Pickler(false));
|
||||
pickler.dump(rows, writeOutputStream);
|
||||
|
||||
if (writeOutputStream.Position == 0)
|
||||
{
|
||||
throw new Exception("Message buffer cannot be null.");
|
||||
}
|
||||
|
||||
SerDe.Write(stream, (int)writeOutputStream.Position);
|
||||
SerDe.Write(stream, writeOutputStream.GetBuffer(), (int)writeOutputStream.Position);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates an ICommandRunner instance based on the given commands.
|
||||
/// </summary>
|
||||
/// <param name="commands">Commands used for creating a command runner</param>
|
||||
/// <returns>An ICommandRunner instance</returns>
|
||||
private static ICommandRunner CreateCommandRunner(SqlCommand[] commands)
|
||||
{
|
||||
return (commands.Length == 1) ?
|
||||
(ICommandRunner)new SingleCommandRunner(commands[0]) :
|
||||
new MultiCommandRunner(commands);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for running commands.
|
||||
/// On the Spark side, the following is expected for the Pickling to work:
|
||||
/// If there is a single command (one UDF), the computed value is returned
|
||||
/// as an object (one element). If there are multiple commands (multiple UDF scenario),
|
||||
/// the computed value should be an array (not IEnumerable) where each element
|
||||
/// in the array corresponds to the value returned by a command.
|
||||
/// Refer to EvaluatePython.scala for StructType case.
|
||||
/// </summary>
|
||||
private interface ICommandRunner
|
||||
{
|
||||
/// <summary>
|
||||
/// Runs commands based on the given split id and input.
|
||||
/// </summary>
|
||||
/// <param name="splitId">Split id for the commands to run</param>
|
||||
/// <param name="input">Input data for the commands to run</param>
|
||||
/// <returns>Value returned by running the commands</returns>
|
||||
object Run(int splitId, object input);
|
||||
}
|
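// Added illustration (not in the original source): given TestData's commands and an
// input row { "5", 5, 5 }, a SingleCommandRunner wrapping command1 alone would return
// the value "udf2 udf1 5", while the MultiCommandRunner over both commands returns
// new object[] { "udf2 udf1 5", 10 } (one element per command), which is the array
// shape EvaluatePython.scala expects for a StructType result.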
||||
|
||||
/// <summary>
|
||||
/// SingleCommandRunner handles running a single command.
|
||||
/// </summary>
|
||||
private sealed class SingleCommandRunner : ICommandRunner
|
||||
{
|
||||
/// <summary>
|
||||
/// A command to run.
|
||||
/// </summary>
|
||||
private readonly SqlCommand _command;
|
||||
|
||||
/// <summary>
|
||||
/// Constructor.
|
||||
/// </summary>
|
||||
/// <param name="command">A command to run</param>
|
||||
internal SingleCommandRunner(SqlCommand command)
|
||||
{
|
||||
_command = command;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs a single command.
|
||||
/// </summary>
|
||||
/// <param name="splitId">Split id for the command to run</param>
|
||||
/// <param name="input">Input data for the command to run</param>
|
||||
/// <returns>Value returned by running the command</returns>
|
||||
public object Run(int splitId, object input)
|
||||
{
|
||||
return ((PicklingWorkerFunction)_command.WorkerFunction).Func(
|
||||
splitId,
|
||||
(object[])input,
|
||||
_command.ArgOffsets);
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// MultiCommandRunner handles running multiple commands.
|
||||
/// </summary>
|
||||
private sealed class MultiCommandRunner : ICommandRunner
|
||||
{
|
||||
/// <summary>
|
||||
/// Commands to run.
|
||||
/// </summary>
|
||||
private readonly SqlCommand[] _commands;
|
||||
|
||||
/// <summary>
|
||||
/// Constructor.
|
||||
/// </summary>
|
||||
/// <param name="commands">Multiple commands top run</param>
|
||||
internal MultiCommandRunner(SqlCommand[] commands)
|
||||
{
|
||||
_commands = commands;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs multiple commands.
|
||||
/// </summary>
|
||||
/// <param name="splitId">Split id for the commands to run</param>
|
||||
/// <param name="input">Input data for the commands to run</param>
|
||||
/// <returns>An array of values returned by running the commands</returns>
|
||||
public object Run(int splitId, object input)
|
||||
{
|
||||
var row = new object[_commands.Length];
|
||||
for (int i = 0; i < _commands.Length; ++i)
|
||||
{
|
||||
SqlCommand command = _commands[i];
|
||||
row[i] = ((PicklingWorkerFunction)command.WorkerFunction).Func(
|
||||
splitId,
|
||||
(object[])input,
|
||||
command.ArgOffsets);
|
||||
}
|
||||
|
||||
return row;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// A SqlCommandExecutor that reads and writes using the
|
||||
/// Apache Arrow format.
|
||||
/// </summary>
|
||||
internal class ArrowSqlCommandExecutor : SqlCommandExecutor
|
||||
{
|
||||
[ThreadStatic]
|
||||
private static MemoryStream s_writeOutputStream;
|
||||
|
||||
protected override CommandExecutorStat ExecuteCore(
|
||||
Stream inputStream,
|
||||
Stream outputStream,
|
||||
SqlCommand[] commands)
|
||||
{
|
||||
var stat = new CommandExecutorStat();
|
||||
ICommandRunner commandRunner = CreateCommandRunner(commands);
|
||||
|
||||
SerDe.Write(outputStream, (int)SpecialLengths.START_ARROW_STREAM);
|
||||
|
||||
// TODO: Remove this MemoryStream once the arrow writer supports non-seekable streams.
|
||||
// For now, we write to a temporary seekable MemoryStream which we then copy to
|
||||
// the actual destination stream.
|
||||
MemoryStream tmp = s_writeOutputStream ?? (s_writeOutputStream = new MemoryStream());
|
||||
|
||||
ArrowStreamWriter writer = null;
|
||||
Schema resultSchema = null;
|
||||
foreach (ReadOnlyMemory<IArrowArray> input in GetInputIterator(inputStream))
|
||||
{
|
||||
// Split id is currently not used, so 0 is passed.
|
||||
IArrowArray[] results = commandRunner.Run(0, input);
|
||||
|
||||
// Assumes all columns have the same length, so uses 0th for num entries.
|
||||
int numEntries = results[0].Length;
|
||||
stat.NumEntriesProcessed += numEntries;
|
||||
|
||||
tmp.SetLength(0);
|
||||
|
||||
if (writer == null)
|
||||
{
|
||||
Debug.Assert(resultSchema == null);
|
||||
resultSchema = BuildSchema(results);
|
||||
|
||||
writer = new ArrowStreamWriter(tmp, resultSchema, leaveOpen: true);
|
||||
}
|
||||
|
||||
var recordBatch = new RecordBatch(resultSchema, results, numEntries);
|
||||
|
||||
// TODO: Remove sync-over-async once WriteRecordBatch exists.
|
||||
writer.WriteRecordBatchAsync(recordBatch).GetAwaiter().GetResult();
|
||||
|
||||
tmp.Position = 0;
|
||||
tmp.CopyTo(outputStream);
|
||||
outputStream.Flush();
|
||||
}
|
||||
|
||||
SerDe.Write(outputStream, 0);
|
||||
|
||||
if (writer != null)
|
||||
{
|
||||
writer.Dispose();
|
||||
}
|
||||
|
||||
return stat;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Create input iterator from the given input stream.
|
||||
/// </summary>
|
||||
/// <param name="inputStream">Stream to read from</param>
|
||||
/// <returns>Iterator over the columns of each record batch read from the stream</returns>
|
||||
private IEnumerable<ReadOnlyMemory<IArrowArray>> GetInputIterator(Stream inputStream)
|
||||
{
|
||||
IArrowArray[] arrays = null;
|
||||
int columnCount = 0;
|
||||
try
|
||||
{
|
||||
using (var reader = new ArrowStreamReader(inputStream, leaveOpen: true))
|
||||
{
|
||||
RecordBatch batch;
|
||||
while ((batch = reader.ReadNextRecordBatch()) != null)
|
||||
{
|
||||
columnCount = batch.ColumnCount;
|
||||
if (arrays == null)
|
||||
{
|
||||
// Note that every batch in a stream has the same schema.
|
||||
arrays = ArrayPool<IArrowArray>.Shared.Rent(columnCount);
|
||||
}
|
||||
|
||||
for (int i = 0; i < columnCount; ++i)
|
||||
{
|
||||
arrays[i] = batch.Column(i);
|
||||
}
|
||||
|
||||
yield return new ReadOnlyMemory<IArrowArray>(arrays, 0, columnCount);
|
||||
}
|
||||
|
||||
if (arrays == null)
|
||||
{
|
||||
// When no input batches were received, return empty IArrowArrays
|
||||
// in order to create and write back the result schema.
|
||||
columnCount = reader.Schema.Fields.Count;
|
||||
arrays = ArrayPool<IArrowArray>.Shared.Rent(columnCount);
|
||||
|
||||
for (int i = 0; i < columnCount; ++i)
|
||||
{
|
||||
arrays[i] = null;
|
||||
}
|
||||
yield return new ReadOnlyMemory<IArrowArray>(arrays, 0, columnCount);
|
||||
}
|
||||
}
|
||||
}
|
||||
finally
|
||||
{
|
||||
if (arrays != null)
|
||||
{
|
||||
arrays.AsSpan(0, columnCount).Clear();
|
||||
ArrayPool<IArrowArray>.Shared.Return(arrays);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static Schema BuildSchema(IArrowArray[] resultColumns)
|
||||
{
|
||||
var schemaBuilder = new Schema.Builder();
|
||||
if (resultColumns.Length == 1)
|
||||
{
|
||||
schemaBuilder = schemaBuilder
|
||||
.Field(f => f.Name("Result")
|
||||
.DataType(resultColumns[0].Data.DataType)
|
||||
.Nullable(false));
|
||||
}
|
||||
else
|
||||
{
|
||||
for (int i = 0; i < resultColumns.Length; ++i)
|
||||
{
|
||||
schemaBuilder = schemaBuilder
|
||||
.Field(f => f.Name("Result" + i)
|
||||
.DataType(resultColumns[i].Data.DataType)
|
||||
.Nullable(false));
|
||||
}
|
||||
}
|
||||
return schemaBuilder.Build();
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Creates an ICommandRunner instance based on the given commands.
|
||||
/// </summary>
|
||||
/// <param name="commands">Commands used for creating a command runner</param>
|
||||
/// <returns>An ICommandRunner instance</returns>
|
||||
private static ICommandRunner CreateCommandRunner(SqlCommand[] commands)
|
||||
{
|
||||
return (commands.Length == 1) ?
|
||||
(ICommandRunner)new SingleCommandRunner(commands[0]) :
|
||||
new MultiCommandRunner(commands);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Interface for running commands.
|
||||
/// On the Spark side, the following is expected for the Pickling to work:
|
||||
/// If there is a single command (one UDF), the computed value is returned
|
||||
/// as an object (one element). If there are multiple commands (multiple UDF scenario),
|
||||
/// the computed value should be an array (not IEnumerable) where each element
|
||||
/// in the array corresponds to the value returned by a command.
|
||||
/// Refer to EvaluatePython.scala for StructType case.
|
||||
/// </summary>
|
||||
private interface ICommandRunner
|
||||
{
|
||||
/// <summary>
|
||||
/// Runs commands based on the given split id and input.
|
||||
/// </summary>
|
||||
/// <param name="splitId">Split id for the commands to run</param>
|
||||
/// <param name="input">Input data for the commands to run</param>
|
||||
/// <returns>Value returned by running the commands</returns>
|
||||
IArrowArray[] Run(int splitId, ReadOnlyMemory<IArrowArray> input);
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// SingleCommandRunner handles running a single command.
|
||||
/// </summary>
|
||||
private sealed class SingleCommandRunner : ICommandRunner
|
||||
{
|
||||
/// <summary>
|
||||
/// A command to run.
|
||||
/// </summary>
|
||||
private readonly SqlCommand _command;
|
||||
|
||||
/// <summary>
|
||||
/// Constructor.
|
||||
/// </summary>
|
||||
/// <param name="command">A command to run</param>
|
||||
internal SingleCommandRunner(SqlCommand command)
|
||||
{
|
||||
_command = command;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs a single command.
|
||||
/// </summary>
|
||||
/// <param name="splitId">Split id for the command to run</param>
|
||||
/// <param name="input">Input data for the command to run</param>
|
||||
/// <returns>Value returned by running the command</returns>
|
||||
public IArrowArray[] Run(int splitId, ReadOnlyMemory<IArrowArray> input)
|
||||
{
|
||||
return new[] { ((ArrowWorkerFunction)_command.WorkerFunction).Func(
|
||||
splitId,
|
||||
input,
|
||||
_command.ArgOffsets) };
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// MultiCommandRunner handles running multiple commands.
|
||||
/// </summary>
|
||||
private sealed class MultiCommandRunner : ICommandRunner
|
||||
{
|
||||
/// <summary>
|
||||
/// Commands to run.
|
||||
/// </summary>
|
||||
private readonly SqlCommand[] _commands;
|
||||
|
||||
/// <summary>
|
||||
/// Constructor.
|
||||
/// </summary>
|
||||
/// <param name="commands">Multiple commands top run</param>
|
||||
internal MultiCommandRunner(SqlCommand[] commands)
|
||||
{
|
||||
_commands = commands;
|
||||
}
|
||||
|
||||
/// <summary>
|
||||
/// Runs multiple commands.
|
||||
/// </summary>
|
||||
/// <param name="splitId">Split id for the commands to run</param>
|
||||
/// <param name="input">Input data for the commands to run</param>
|
||||
/// <returns>An array of values returned by running the commands</returns>
|
||||
public IArrowArray[] Run(int splitId, ReadOnlyMemory<IArrowArray> input)
|
||||
{
|
||||
var resultColumns = new IArrowArray[_commands.Length];
|
||||
for (int i = 0; i < resultColumns.Length; ++i)
|
||||
{
|
||||
SqlCommand command = _commands[i];
|
||||
resultColumns[i] = ((ArrowWorkerFunction)command.WorkerFunction).Func(
|
||||
splitId,
|
||||
input,
|
||||
command.ArgOffsets);
|
||||
}
|
||||
return resultColumns;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Some files were not shown because too many files changed in this diff.