Initial Checkin
This commit is contained in:
Parent
43cd5e5fdf
Commit
c8104966a1

@ -1,330 +1,7 @@
## Ignore Visual Studio temporary files, build results, and
|
||||
## files generated by popular Visual Studio add-ons.
|
||||
##
|
||||
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
|
||||
################################################################################
|
||||
# This .gitignore file was automatically created by Microsoft(R) Visual Studio.
|
||||
################################################################################
|
||||
|
||||
# User-specific files
|
||||
*.suo
|
||||
*.user
|
||||
*.userosscache
|
||||
*.sln.docstates
|
||||
|
||||
# User-specific files (MonoDevelop/Xamarin Studio)
|
||||
*.userprefs
|
||||
|
||||
# Build results
|
||||
[Dd]ebug/
|
||||
[Dd]ebugPublic/
|
||||
[Rr]elease/
|
||||
[Rr]eleases/
|
||||
x64/
|
||||
x86/
|
||||
bld/
|
||||
[Bb]in/
|
||||
[Oo]bj/
|
||||
[Ll]og/
|
||||
|
||||
# Visual Studio 2015/2017 cache/options directory
|
||||
.vs/
|
||||
# Uncomment if you have tasks that create the project's static files in wwwroot
|
||||
#wwwroot/
|
||||
|
||||
# Visual Studio 2017 auto generated files
|
||||
Generated\ Files/
|
||||
|
||||
# MSTest test Results
|
||||
[Tt]est[Rr]esult*/
|
||||
[Bb]uild[Ll]og.*
|
||||
|
||||
# NUNIT
|
||||
*.VisualState.xml
|
||||
TestResult.xml
|
||||
|
||||
# Build Results of an ATL Project
|
||||
[Dd]ebugPS/
|
||||
[Rr]eleasePS/
|
||||
dlldata.c
|
||||
|
||||
# Benchmark Results
|
||||
BenchmarkDotNet.Artifacts/
|
||||
|
||||
# .NET Core
|
||||
project.lock.json
|
||||
project.fragment.lock.json
|
||||
artifacts/
|
||||
**/Properties/launchSettings.json
|
||||
|
||||
# StyleCop
|
||||
StyleCopReport.xml
|
||||
|
||||
# Files built by Visual Studio
|
||||
*_i.c
|
||||
*_p.c
|
||||
*_i.h
|
||||
*.ilk
|
||||
*.meta
|
||||
*.obj
|
||||
*.iobj
|
||||
*.pch
|
||||
*.pdb
|
||||
*.ipdb
|
||||
*.pgc
|
||||
*.pgd
|
||||
*.rsp
|
||||
*.sbr
|
||||
*.tlb
|
||||
*.tli
|
||||
*.tlh
|
||||
*.tmp
|
||||
*.tmp_proj
|
||||
*.log
|
||||
*.vspscc
|
||||
*.vssscc
|
||||
.builds
|
||||
*.pidb
|
||||
*.svclog
|
||||
*.scc
|
||||
|
||||
# Chutzpah Test files
|
||||
_Chutzpah*
|
||||
|
||||
# Visual C++ cache files
|
||||
ipch/
|
||||
*.aps
|
||||
*.ncb
|
||||
*.opendb
|
||||
*.opensdf
|
||||
*.sdf
|
||||
*.cachefile
|
||||
*.VC.db
|
||||
*.VC.VC.opendb
|
||||
|
||||
# Visual Studio profiler
|
||||
*.psess
|
||||
*.vsp
|
||||
*.vspx
|
||||
*.sap
|
||||
|
||||
# Visual Studio Trace Files
|
||||
*.e2e
|
||||
|
||||
# TFS 2012 Local Workspace
|
||||
$tf/
|
||||
|
||||
# Guidance Automation Toolkit
|
||||
*.gpState
|
||||
|
||||
# ReSharper is a .NET coding add-in
|
||||
_ReSharper*/
|
||||
*.[Rr]e[Ss]harper
|
||||
*.DotSettings.user
|
||||
|
||||
# JustCode is a .NET coding add-in
|
||||
.JustCode
|
||||
|
||||
# TeamCity is a build add-in
|
||||
_TeamCity*
|
||||
|
||||
# DotCover is a Code Coverage Tool
|
||||
*.dotCover
|
||||
|
||||
# AxoCover is a Code Coverage Tool
|
||||
.axoCover/*
|
||||
!.axoCover/settings.json
|
||||
|
||||
# Visual Studio code coverage results
|
||||
*.coverage
|
||||
*.coveragexml
|
||||
|
||||
# NCrunch
|
||||
_NCrunch_*
|
||||
.*crunch*.local.xml
|
||||
nCrunchTemp_*
|
||||
|
||||
# MightyMoose
|
||||
*.mm.*
|
||||
AutoTest.Net/
|
||||
|
||||
# Web workbench (sass)
|
||||
.sass-cache/
|
||||
|
||||
# Installshield output folder
|
||||
[Ee]xpress/
|
||||
|
||||
# DocProject is a documentation generator add-in
|
||||
DocProject/buildhelp/
|
||||
DocProject/Help/*.HxT
|
||||
DocProject/Help/*.HxC
|
||||
DocProject/Help/*.hhc
|
||||
DocProject/Help/*.hhk
|
||||
DocProject/Help/*.hhp
|
||||
DocProject/Help/Html2
|
||||
DocProject/Help/html
|
||||
|
||||
# Click-Once directory
|
||||
publish/
|
||||
|
||||
# Publish Web Output
|
||||
*.[Pp]ublish.xml
|
||||
*.azurePubxml
|
||||
# Note: Comment the next line if you want to checkin your web deploy settings,
|
||||
# but database connection strings (with potential passwords) will be unencrypted
|
||||
*.pubxml
|
||||
*.publishproj
|
||||
|
||||
# Microsoft Azure Web App publish settings. Comment the next line if you want to
|
||||
# checkin your Azure Web App publish settings, but sensitive information contained
|
||||
# in these scripts will be unencrypted
|
||||
PublishScripts/
|
||||
|
||||
# NuGet Packages
|
||||
*.nupkg
|
||||
# The packages folder can be ignored because of Package Restore
|
||||
**/[Pp]ackages/*
|
||||
# except build/, which is used as an MSBuild target.
|
||||
!**/[Pp]ackages/build/
|
||||
# Uncomment if necessary however generally it will be regenerated when needed
|
||||
#!**/[Pp]ackages/repositories.config
|
||||
# NuGet v3's project.json files produces more ignorable files
|
||||
*.nuget.props
|
||||
*.nuget.targets
|
||||
|
||||
# Microsoft Azure Build Output
|
||||
csx/
|
||||
*.build.csdef
|
||||
|
||||
# Microsoft Azure Emulator
|
||||
ecf/
|
||||
rcf/
|
||||
|
||||
# Windows Store app package directories and files
|
||||
AppPackages/
|
||||
BundleArtifacts/
|
||||
Package.StoreAssociation.xml
|
||||
_pkginfo.txt
|
||||
*.appx
|
||||
|
||||
# Visual Studio cache files
|
||||
# files ending in .cache can be ignored
|
||||
*.[Cc]ache
|
||||
# but keep track of directories ending in .cache
|
||||
!*.[Cc]ache/
|
||||
|
||||
# Others
|
||||
ClientBin/
|
||||
~$*
|
||||
*~
|
||||
*.dbmdl
|
||||
*.dbproj.schemaview
|
||||
*.jfm
|
||||
*.pfx
|
||||
*.publishsettings
|
||||
orleans.codegen.cs
|
||||
|
||||
# Including strong name files can present a security risk
|
||||
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
|
||||
#*.snk
|
||||
|
||||
# Since there are multiple workflows, uncomment next line to ignore bower_components
|
||||
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
|
||||
#bower_components/
|
||||
|
||||
# RIA/Silverlight projects
|
||||
Generated_Code/
|
||||
|
||||
# Backup & report files from converting an old project file
|
||||
# to a newer Visual Studio version. Backup files are not needed,
|
||||
# because we have git ;-)
|
||||
_UpgradeReport_Files/
|
||||
Backup*/
|
||||
UpgradeLog*.XML
|
||||
UpgradeLog*.htm
|
||||
ServiceFabricBackup/
|
||||
*.rptproj.bak
|
||||
|
||||
# SQL Server files
|
||||
*.mdf
|
||||
*.ldf
|
||||
*.ndf
|
||||
|
||||
# Business Intelligence projects
|
||||
*.rdl.data
|
||||
*.bim.layout
|
||||
*.bim_*.settings
|
||||
*.rptproj.rsuser
|
||||
|
||||
# Microsoft Fakes
|
||||
FakesAssemblies/
|
||||
|
||||
# GhostDoc plugin setting file
|
||||
*.GhostDoc.xml
|
||||
|
||||
# Node.js Tools for Visual Studio
|
||||
.ntvs_analysis.dat
|
||||
node_modules/
|
||||
|
||||
# Visual Studio 6 build log
|
||||
*.plg
|
||||
|
||||
# Visual Studio 6 workspace options file
|
||||
*.opt
|
||||
|
||||
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
|
||||
*.vbw
|
||||
|
||||
# Visual Studio LightSwitch build output
|
||||
**/*.HTMLClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/GeneratedArtifacts
|
||||
**/*.DesktopClient/ModelManifest.xml
|
||||
**/*.Server/GeneratedArtifacts
|
||||
**/*.Server/ModelManifest.xml
|
||||
_Pvt_Extensions
|
||||
|
||||
# Paket dependency manager
|
||||
.paket/paket.exe
|
||||
paket-files/
|
||||
|
||||
# FAKE - F# Make
|
||||
.fake/
|
||||
|
||||
# JetBrains Rider
|
||||
.idea/
|
||||
*.sln.iml
|
||||
|
||||
# CodeRush
|
||||
.cr/
|
||||
|
||||
# Python Tools for Visual Studio (PTVS)
|
||||
__pycache__/
|
||||
*.pyc
|
||||
|
||||
# Cake - Uncomment if you are using it
|
||||
# tools/**
|
||||
# !tools/packages.config
|
||||
|
||||
# Tabs Studio
|
||||
*.tss
|
||||
|
||||
# Telerik's JustMock configuration file
|
||||
*.jmconfig
|
||||
|
||||
# BizTalk build output
|
||||
*.btp.cs
|
||||
*.btm.cs
|
||||
*.odx.cs
|
||||
*.xsd.cs
|
||||
|
||||
# OpenCover UI analysis results
|
||||
OpenCover/
|
||||
|
||||
# Azure Stream Analytics local run output
|
||||
ASALocalRun/
|
||||
|
||||
# MSBuild Binary and Structured Log
|
||||
*.binlog
|
||||
|
||||
# NVidia Nsight GPU debugger configuration file
|
||||
*.nvuser
|
||||
|
||||
# MFractors (Xamarin productivity tool) working folder
|
||||
.mfractor/
|
||||
/.vs
|
||||
/v15
|
||||
npm-debug.log
|
||||
|
@ -0,0 +1 @@
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
@ -0,0 +1,35 @@
# Feedback

- Ask a question on [Stack Overflow](https://stackoverflow.com/questions/tagged/data-accelerator)
- Request new features on [GitHub](CONTRIBUTING.md)
- Open a new issue on [GitHub](https://github.com/Microsoft/data-accelerator/issues)

# Contributing

# Building
* The Services can be built using the Visual Studio solution. See the [Guide](Services/CONTRIBUTING.md)
* The Spark folder can be built using Maven. See the [Guide](Spark/CONTRIBUTING.md)
* The Website can be built using NPM. See the [Guide](Website/CONTRIBUTING.md)

# Create a change
The development workflow, including debugging and running tests, is described in the per-component guides linked above.

# Coding Guidelines
* The services use .editorconfig files to maintain coding guidelines. See [editorconfig](Services/.editorconfig)
* The website uses the Prettier extension. See [Prettier](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode)
* The Spark folder doesn't have guidelines yet. For contributions, please keep the code similar in style to the rest of the file being modified.

# Submitting pull requests

You will need to sign a Contributor License Agreement (CLA) when submitting your pull request. To complete the CLA, follow the instructions provided by the CLA bot when you send the pull request. This only needs to be done once for any .NET Foundation OSS project.

If you don't know what a pull request is, read this article: https://help.github.com/articles/using-pull-requests. Make sure the repository builds and all tests pass. Familiarize yourself with the project workflow and our coding conventions. The coding, style, and general engineering guidelines are published on the Engineering guidelines page.

Please also see our [Code of Conduct](CODE_OF_CONDUCT.md).

# Contributor License Agreement
This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.

When you submit a pull request, a CLA bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.

This project has adopted the Microsoft Open Source Code of Conduct. For more information, see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.

@ -0,0 +1,8 @@
*.class
*.log
*/.vs
.idea
*/target
*/src/main/resources
*.iml
out/

@ -0,0 +1,54 @@
# Building the Spark engine
In order to build the Data Accelerator Spark engine, you will need the following:

# Requirements
- Maven
- Java SDK
- Ensure JAVA_HOME, M2_HOME, and MAVEN_HOME are properly defined in your environment

# How to build
From a prompt:

> mvn package -f project-name

Examples:
```
> mvn package -f datax-core
> mvn package -f datax-keyvault
> mvn package -f datax-utility
> mvn package -f datax-host
> mvn package -f datax-udf-samples
```

## Publish to Maven Repo
<TODO replace with external repo>

> mvn deploy -f project-name

Examples:
```
> mvn deploy -f datax-core
> mvn deploy -f datax-utility
> mvn deploy -f datax-host
> mvn deploy -f datax-udf-samples
```

## Publish to the cluster's storage account
Note: you will have to run `az login` first in order to use the login mode when uploading blobs to the remote storage account; your account also needs permission to the storage account associated with the cluster.

> deploy module-name staging

Examples:
```
> deploy core staging
> deploy utility staging
> deploy host staging
> deploy udf-samples staging
```

# How to create a PR
- Ensure all tests are passing
- Create a pull request against the master branch

@ -0,0 +1,25 @@
|
|||
|
||||
Microsoft Visual Studio Solution File, Format Version 12.00
|
||||
# Visual Studio 15
|
||||
VisualStudioVersion = 15.0.28307.168
|
||||
MinimumVisualStudioVersion = 10.0.40219.1
|
||||
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DataX.Utility.CodeSign", "DataX.Utility.CodeSign\DataX.Utility.CodeSign.csproj", "{F42E9A11-BA7C-4050-B8E4-33532615C230}"
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug|Any CPU = Debug|Any CPU
|
||||
Release|Any CPU = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(ProjectConfigurationPlatforms) = postSolution
|
||||
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
|
||||
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Debug|Any CPU.Build.0 = Debug|Any CPU
|
||||
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Release|Any CPU.ActiveCfg = Release|Any CPU
|
||||
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Release|Any CPU.Build.0 = Release|Any CPU
|
||||
EndGlobalSection
|
||||
GlobalSection(SolutionProperties) = preSolution
|
||||
HideSolutionNode = FALSE
|
||||
EndGlobalSection
|
||||
GlobalSection(ExtensibilityGlobals) = postSolution
|
||||
SolutionGuid = {6EBAC466-53ED-4C98-8790-81D8AF7ACC06}
|
||||
EndGlobalSection
|
||||
EndGlobal
|
|
@ -0,0 +1,34 @@
|
|||
<Project Sdk="Microsoft.NET.Sdk">
|
||||
|
||||
<PropertyGroup>
|
||||
<OutputType>Library</OutputType>
|
||||
<TargetFramework>netcoreapp2.1</TargetFramework>
|
||||
<ApplicationIcon />
|
||||
<StartupObject />
|
||||
<SignAssembly>true</SignAssembly>
|
||||
<AssemblyOriginatorKeyFile>FinalPublicKey.snk</AssemblyOriginatorKeyFile>
|
||||
<DelaySign>true</DelaySign>
|
||||
<BaseOutputDirectory>bin</BaseOutputDirectory>
|
||||
<OutDir>$(BaseOutputDirectory)</OutDir>
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Microsoft.VisualStudioEng.MicroBuild.Core" Version="0.4.1">
|
||||
<PrivateAssets>all</PrivateAssets>
|
||||
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
|
||||
</PackageReference>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<FilesToSign Include="$(OutDir)\**\*.nupkg">
|
||||
<Authenticode>NuGet</Authenticode>
|
||||
</FilesToSign>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<FilesToSign Include="$(OutDir)\**\*.jar">
|
||||
<Authenticode>MicrosoftJARSHA2</Authenticode>
|
||||
</FilesToSign>
|
||||
</ItemGroup>
|
||||
|
||||
</Project>
|
Binary data
DataProcessing/DataX.Utilities/DataX.Utility.CodeSign/FinalPublicKey.snk
Normal file
Binary file not shown.
|
@ -0,0 +1,26 @@
# Data Accelerator for Spark Engine

## Project Structure

# Core
Interface and class definitions for the contracts of the Data Accelerator Engine

# Host
Spark-specific app jar for Data Accelerator

# Samples
Examples for UDFs and UDAFs in Scala

# Utility
Common classes and singleton helpers used across projects

## Properties

Some basic rules:
* Property names are akin to a full JSON path that locates a leaf in the JSON config object
* The root namespace is **datax.job**
* Property names are all lowercase for known fields from the JSON config object, except for the Map and Array cases:
  * Map case - e.g. outputs is a Map of string to individual output configs; in this case, put the string key into the property name as part of the path
  * Array case - e.g. timeWindows is an Array of time window specs; in this case, extract the spec name as part of the path into the property name
* When flattening a Map/Array, change plural words into their singular form, e.g. change *outputs* to *output*, *timeWindows* to *timewindow*, etc. (a hypothetical example is sketched below)

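To make the flattening rules concrete, here is a hypothetical sketch; the field names and values below are invented for illustration only, while the `datax.job` root and the `outputs` → `output` singularization follow from the rules above.

```scala
// Hypothetical illustration of the flattening rules; keys and values are invented.
val flattened: Map[String, String] = Map(
  // Plain field: lowercased path under the datax.job root.
  "datax.job.name" -> "myJob",
  // Map case: the entry key ("myBlob") joins the path, and plural "outputs" becomes singular "output".
  "datax.job.output.myBlob.compressiontype" -> "gzip"
)
```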
@ -0,0 +1,33 @@
|
|||
<?xml version="1.0"?>
|
||||
<package xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<metadata xmlns="http://schemas.microsoft.com/packaging/2010/07/nuspec.xsd">
|
||||
<id>Microsoft.DataX.Spark</id>
|
||||
<version>$version$</version>
|
||||
<authors>Microsoft</authors>
|
||||
<projectUrl>http://aka.ms/data-accelerator</projectUrl>
|
||||
<license type="expression">MIT</license>
|
||||
<iconUrl>https://raw.githubusercontent.com/wiki/Microsoft/data-accelerator/tutorials/images/roundwhite6464.PNG</iconUrl>
|
||||
<requireLicenseAcceptance>true</requireLicenseAcceptance>
|
||||
<description>Data Accelerator for Apache Spark simplifies streaming of Big Data using Spark. It is used internally at Microsoft for processing streamed data from multiple products every day, handling data volumes at Microsoft scale. It offers a no-code experience to build Rules and Alerts, as well as numerous productivity improvements to develop and manage Spark SQL jobs on Azure HDInsights. This is the package of jar files for Data Processing.</description>
|
||||
<copyright>© Microsoft Corporation. All rights reserved.</copyright>
|
||||
<tags>DataX.Spark, DataX</tags>
|
||||
<dependencies>
|
||||
</dependencies>
|
||||
</metadata>
|
||||
<files>
|
||||
<file src="**\applicationinsights-core-2.2.1.jar" target="lib" />
|
||||
<file src="**\azure-documentdb-1.16.1.jar" target="lib" />
|
||||
<file src="**\azure-eventhubs-1.2.1.jar" target="lib" />
|
||||
<file src="**\azure-eventhubs-spark_2.11-2.3.6.jar" target="lib" />
|
||||
<file src="**\azure-keyvault-webkey-1.1.jar" target="lib" />
|
||||
<file src="**\datax-core_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
|
||||
<file src="**\datax-host_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
|
||||
<file src="**\datax-utility_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
|
||||
<file src="**\datax-keyvault_2.11-1.0.0-SNAPSHOT-with-dependencies.jar" target="lib" />
|
||||
<file src="**\datax-udf-samples_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
|
||||
<file src="**\java-uuid-generator-3.1.5.jar" target="lib" />
|
||||
<file src="**\proton-j-0.27.1.jar" target="lib" />
|
||||
<file src="**\scala-java8-compat_2.11-0.9.0.jar" target="lib" />
|
||||
<file src="NOTICE.txt" target="" />
|
||||
</files>
|
||||
</package>
|
|
@ -0,0 +1,199 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<!--
|
||||
MIT License
|
||||
|
||||
Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE
|
||||
-->
|
||||
<developers>
|
||||
<developer>
|
||||
<id>microsoft</id>
|
||||
<name>Microsoft</name>
|
||||
</developer>
|
||||
</developers>
|
||||
|
||||
<licenses>
|
||||
<license>
|
||||
<name>MIT License</name>
|
||||
<url>http://opensource.org/licenses/MIT</url>
|
||||
<distribution>repo</distribution>
|
||||
</license>
|
||||
</licenses>
|
||||
|
||||
<scm>
|
||||
<connection>scm:git:git@github.com:Microsoft/data-accelerator.git</connection>
|
||||
<developerConnection>scm:git:git@github.com:Microsoft/data-accelerator.git</developerConnection>
|
||||
<url>https://github.com/Microsoft/data-accelerator.git</url>
|
||||
</scm>
|
||||
|
||||
<groupId>com.microsoft.datax</groupId>
|
||||
<artifactId>datax-core_2.11</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
|
||||
<properties>
|
||||
<spark.version>2.3.0</spark.version>
|
||||
<scala.version.major>2.11</scala.version.major>
|
||||
<scala.version.minor>8</scala.version.minor>
|
||||
<scala.version>${scala.version.major}.${scala.version.minor}</scala.version>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<repositories>
|
||||
</repositories>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.json4s</groupId>
|
||||
<artifactId>json4s-jackson_2.11</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.scala-lang.modules</groupId>
|
||||
<artifactId>scala-parser-combinators_2.11</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-streaming_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-mllib_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>org.scala-lang.modules</groupId>
|
||||
<artifactId>scala-parser-combinators_2.11</artifactId>
|
||||
</exclusion>
|
||||
<exclusion>
|
||||
<groupId>org.typelevel</groupId>
|
||||
<artifactId>macro-compat_2.11</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jmockit</groupId>
|
||||
<artifactId>jmockit</artifactId>
|
||||
<version>1.34</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest_2.11</artifactId>
|
||||
<version>2.2.6</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>3.2.2</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
<args>
|
||||
<arg>-deprecation</arg>
|
||||
<arg>-feature</arg>
|
||||
</args>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.3</version>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<version>3.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/dependency</outputDirectory>
|
||||
<overWriteReleases>false</overWriteReleases>
|
||||
<overWriteSnapshots>true</overWriteSnapshots>
|
||||
<excludeTransitive>true</excludeTransitive>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!-- enable scalatest -->
|
||||
<plugin>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest-maven-plugin</artifactId>
|
||||
<version>1.0</version>
|
||||
<configuration>
|
||||
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
|
||||
<junitxml>.</junitxml>
|
||||
<filereports>WDF TestSuite.txt</filereports>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>test</id>
|
||||
<goals>
|
||||
<goal>test</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,11 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.config
|
||||
|
||||
import datax.constants.ProductConstant
|
||||
|
||||
object DefaultValue {
|
||||
def DefaultAppName = ProductConstant.DefaultAppName
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.config
|
||||
|
||||
import datax.constants.{JobArgument, ProductConstant}
|
||||
import datax.exception.EngineException
|
||||
import org.apache.log4j.Level
|
||||
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
|
||||
/***
|
||||
* represents configuration of a job
|
||||
*
|
||||
* @param elems starting elements in the configuration
|
||||
* @param parentPrefix prefix of all of the setting names in this dictionary, mainly for logging
|
||||
*/
|
||||
case class SettingDictionary(elems: Map[String, String], parentPrefix: String = SettingNamespace.DefaultSettingName){
|
||||
val dict = elems
|
||||
val size = dict.size
|
||||
|
||||
def getDictMap() = dict
|
||||
def get(key: String) = dict.get(key)
|
||||
def getOrNull(key: String) = get(key).orNull
|
||||
def getDefault() = get(SettingNamespace.DefaultSettingName)
|
||||
|
||||
def getOrThrow[T](opt: Option[T], key: String) = {
|
||||
opt match {
|
||||
case Some(v)=> v
|
||||
case None => throw new EngineException(s"config setting '${parentPrefix+key}' is not found")
|
||||
}
|
||||
}
|
||||
|
||||
def getString(key: String) = getOrThrow(dict.get(key), key)
|
||||
def getOrElse(key: String, defaultValue: String) = dict.getOrElse(key, defaultValue)
|
||||
def getIntOption(key: String) = get(key).map(_.toInt)
|
||||
def getLongOption(key: String) = get(key).map(_.toLong)
|
||||
def getLong(key: String) = getOrThrow(getLongOption(key), key)
|
||||
|
||||
def getDoubleOption(key: String) = get(key).map(_.toDouble)
|
||||
def getDouble(key: String) = getOrThrow(getDoubleOption(key), key)
|
||||
def getBooleanOption(key: String) = get(key).map(_.toBoolean)
|
||||
def getDurationOption(key: String) = get(key).map(Duration.create(_))
|
||||
def getDuration(key: String) = getOrThrow(getDurationOption(key), key)
|
||||
|
||||
def getStringSeqOption(key: String) = get(key).map(str => {
|
||||
val seq = str.split(SettingNamespace.ValueSeperator).filterNot(_.isEmpty).toSeq
|
||||
if (seq.length > 0) seq else null
|
||||
})
|
||||
|
||||
private def findWithPrefix(prefix: String): Map[String, String] = dict.filter(kv=>kv._1.startsWith(prefix))
|
||||
|
||||
private def stripKeys(dict: Map[String, String], startIndex: Int) = {
|
||||
dict.filter(kv=>kv._1!=null&&kv._1.length>startIndex).map{case(k, v) => k.substring(startIndex)->v}
|
||||
}
|
||||
|
||||
private def stripKeysByNamespace(dict: Map[String, String], namespace: String) = {
|
||||
val prefixLength = (namespace+SettingNamespace.Seperator).length
|
||||
dict.filter(kv=>kv._1!=null&&kv._1.length>=namespace.length).map{case(k, v) => {
|
||||
if(k==namespace)
|
||||
SettingNamespace.DefaultSettingName -> v
|
||||
else
|
||||
k.substring(prefixLength)->v
|
||||
}}
|
||||
}
|
||||
|
||||
/** group the dictionary into sub dictionary based on namespaces
|
||||
*
|
||||
* strips the prefix off every setting name and distributes the settings into groups of [[SettingDictionary]], where the group name
|
||||
* is the first namespace in the setting name.
|
||||
*
|
||||
* @param prefix the prefix to look for and strip off when grouping
|
||||
* @return a group of SettingDictionary
|
||||
*/
|
||||
def groupBySubNamespace(prefix: String = null) = {
|
||||
val sub = if(prefix==null || prefix.isEmpty)
|
||||
dict
|
||||
else
|
||||
stripKeys(findWithPrefix(prefix), prefix.length)
|
||||
|
||||
sub.groupBy(kv=>SettingNamespace.getSubNamespace(kv._1, 0))
|
||||
.filterKeys(_!=null)
|
||||
.map{case (k, v) => k-> SettingDictionary(stripKeysByNamespace(v, k), parentPrefix + k + SettingNamespace.Seperator)}
|
||||
}
|
||||
|
||||
/** get a sub [[SettingDictionary]] containing only the settings whose names start with the given prefix
|
||||
*
|
||||
* @param prefix prefix to filter the setting name
|
||||
* @return a [[SettingDictionary]] instance containing only the settings with prefix in the name
|
||||
*/
|
||||
def getSubDictionary(prefix: String) = {
|
||||
SettingDictionary(stripKeys(findWithPrefix(prefix), prefix.length), parentPrefix+prefix)
|
||||
}
|
||||
|
||||
def buildConfigIterable[TConf](builder: (SettingDictionary, String)=>TConf, prefix: String = null) = {
|
||||
groupBySubNamespace(prefix)
|
||||
.map{case(k,v)=>builder(v, k)}
|
||||
}
|
||||
|
||||
def buildConfigMap[TConf](builder: (SettingDictionary, String)=>TConf, prefix: String = null) = {
|
||||
groupBySubNamespace(prefix)
|
||||
.map{case(k,v)=>k->builder(v, k)}
|
||||
}
|
||||
|
||||
/***
|
||||
* get name of the job
|
||||
* @return name of the job
|
||||
*/
|
||||
def getAppName(): String = {
|
||||
dict.getOrElse(JobArgument.ConfName_AppName, DefaultValue.DefaultAppName)
|
||||
}
|
||||
|
||||
def getJobName(): String = {
|
||||
dict.get(SettingNamespace.JobNameFullPath).getOrElse(getAppName())
|
||||
}
|
||||
|
||||
def getMetricAppName() = {
|
||||
ProductConstant.MetricAppNamePrefix + getJobName()
|
||||
}
|
||||
|
||||
def getClientNodeName() = {
|
||||
SparkEnvVariables.getClientNodeName(this.getAppName())
|
||||
}
|
||||
|
||||
/***
|
||||
* get path to configuration file of the job
|
||||
* @return path to the configuration file
|
||||
*/
|
||||
def getAppConfigurationFile(): String = {
|
||||
dict.getOrElse(JobArgument.ConfName_AppConf, null)
|
||||
}
|
||||
|
||||
/***
|
||||
* get setting of logging level on driver nodes of the job
|
||||
* @return logging level on driver nodes
|
||||
*/
|
||||
def getDriverLogLevel(): Option[Level] = {
|
||||
dict.get(JobArgument.ConfName_DriverLogLevel).map(Level.toLevel(_))
|
||||
}
|
||||
|
||||
/***
|
||||
* get setting of logging level on executor nodes of the job
|
||||
* @return logging level on executor nodes
|
||||
*/
|
||||
def getExecutorLogLevel(): Option[Level] = {
|
||||
dict.get(JobArgument.ConfName_LogLevel).map(Level.toLevel(_))
|
||||
}
|
||||
}
|
||||
|
|
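A minimal usage sketch of the dictionary above, assuming the default `datax` prefix; the keys and values are invented for illustration:

```scala
import datax.config.{SettingDictionary, SettingNamespace}

// Invented settings for this sketch.
val all = SettingDictionary(Map(
  "datax.job.name"                    -> "myJob",
  "datax.job.output.default.blob.dir" -> "wasbs://container@account/path"
))

// Strip the "datax.job." prefix to get job-level settings: keys become "name" and "output.default.blob.dir".
val job = all.getSubDictionary(SettingNamespace.JobPrefix)

// Group by the first remaining namespace segment: yields groups keyed "name" and "output".
val groups = job.groupBySubNamespace()
val outputSettings = groups("output")   // contains "default.blob.dir" -> "wasbs://container@account/path"
```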
@ -0,0 +1,48 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.config
|
||||
|
||||
import datax.constants.ProductConstant
|
||||
|
||||
object SettingNamespace {
|
||||
val DefaultSettingName = ""
|
||||
val Seperator = "."
|
||||
val ValueSeperator = ";"
|
||||
def Root = ProductConstant.ProductRoot
|
||||
def RootPrefix = Root + Seperator
|
||||
val Job = "job"
|
||||
def JobPrefix = RootPrefix + Job + Seperator
|
||||
|
||||
val JobName = "name"
|
||||
def JobNameFullPath = JobPrefix + JobName
|
||||
|
||||
val JobInput = "input.default"
|
||||
def JobInputPrefix = JobPrefix + JobInput + Seperator
|
||||
|
||||
val JobProcess = "process"
|
||||
def JobProcessPrefix = JobPrefix + JobProcess + Seperator
|
||||
|
||||
val JobOutput = "output"
|
||||
def JobOutputPrefix = JobPrefix + JobOutput + Seperator
|
||||
|
||||
val JobOutputDefault = "default"
|
||||
def JobOutputDefaultPreifx = JobOutputPrefix + JobOutputDefault + Seperator
|
||||
|
||||
def buildSettingPath(names: String*) = {
|
||||
names.filterNot(_==null).mkString(Seperator)
|
||||
}
|
||||
|
||||
def getSubNamespace(propName: String, startIndex: Int): String = {
|
||||
if(propName.length>startIndex) {
|
||||
val pos = propName.indexOf(SettingNamespace.Seperator, startIndex)
|
||||
if(pos>=0)
|
||||
propName.substring(startIndex, pos)
|
||||
else
|
||||
propName.substring(startIndex)
|
||||
}
|
||||
else
|
||||
null
|
||||
}
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.config
|
||||
|
||||
import org.apache.spark.{SparkEnv, TaskContext}
|
||||
|
||||
object SparkEnvVariables {
|
||||
def getClientNodeName(appName: String): String = {
|
||||
appName+"-"+SparkEnv.get.executorId
|
||||
}
|
||||
|
||||
def getLoggerSuffix(): String = {
|
||||
val tc = TaskContext.get()
|
||||
if(tc==null)"" else s"-P${tc.partitionId()}-T${tc.taskAttemptId()}"
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.config
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
|
||||
case class UnifiedConfig(
|
||||
sparkConf: SparkConf,
|
||||
dict: SettingDictionary
|
||||
)
|
|
@ -0,0 +1,23 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
|
||||
object ColumnName {
|
||||
|
||||
|
||||
// Define constants for column names
|
||||
val RawObjectColumn = "Raw"
|
||||
val EventNameColumn = "EventName"
|
||||
def PropertiesColumn = s"${NamePrefix.Value}Properties"
|
||||
|
||||
def InternalColumnPrefix = s"__${NamePrefix.Value}_"
|
||||
def InternalColumnFileInfo = InternalColumnPrefix + "FileInfo"
|
||||
def MetadataColumnPrefix = s"__${NamePrefix.Value}Metadata_"
|
||||
def MetadataColumnOutputPartitionTime = MetadataColumnPrefix + "OutputPartitionTime"
|
||||
|
||||
def OutputGroupColumn = s"${NamePrefix.Value}OutputGroup"
|
||||
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
object DatasetName {
|
||||
def DataStreamRaw = s"${NamePrefix.Value}RawInput"
|
||||
def DataStreamProjection = s"${NamePrefix.Value}ProcessedInput"
|
||||
def DataStreamProjectionBatch = s"${NamePrefix.Value}ProcessedInput_Batch"
|
||||
def DataStreamProjectionWithWindow = s"${NamePrefix.Value}ProcessedInput_Window"
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
object FeatureName {
|
||||
// Define built-in functions
|
||||
val FunctionDisableCommonCaching = "disableCommonCaching"
|
||||
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
|
||||
object JobArgument {
|
||||
def ConfNamePrefix = s"${NamePrefix.Value}_".toUpperCase
|
||||
def ConfName_AppConf = s"${ConfNamePrefix}APPCONF"
|
||||
def ConfName_AppName = s"${ConfNamePrefix}APPNAME"
|
||||
def ConfName_LogLevel = s"${ConfNamePrefix}LOGLEVEL"
|
||||
def ConfName_DriverLogLevel = s"${ConfNamePrefix}DRIVERLOGLEVEL"
|
||||
def ConfName_CheckpointEnabled = s"${ConfNamePrefix}CHECKPOINTENABLED"
|
||||
def ConfName_AppInsightKeyRef = s"${ConfNamePrefix}APPINSIGHTKEYREF"
|
||||
def ConfName_BlobWriterTimeout: String = s"${ConfNamePrefix}BlobWriterTimeout"
|
||||
def ConfName_DefaultVaultName: String = s"${ConfNamePrefix}DEFAULTVAULTNAME"
|
||||
}
|
|
@ -0,0 +1,9 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
object MetricName {
|
||||
val MetricSinkPrefix="Sink_"
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
object NamePrefix {
|
||||
val DefaultValue = "DataX"
|
||||
val ConfSetting = "DATAX_NAMEPREFIX"
|
||||
val Value: String = sys.env.getOrElse(ConfSetting, DefaultValue)
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
object ProcessingPropertyName {
|
||||
// Define constants for names in the processing Properties column
|
||||
val BlobPathHint = "Partition"
|
||||
val BatchTime = "BatchTime"
|
||||
val BlobTime = "InputTime"
|
||||
val CPTime = "CPTime"
|
||||
val CPExecutor = "CPExecutor"
|
||||
}
|
|
@ -0,0 +1,19 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.constants
|
||||
|
||||
object ProductConstant {
|
||||
def DefaultAppName = s"${NamePrefix.Value}_Unknown_App"
|
||||
def MetricAppNamePrefix = s"${NamePrefix.Value}-".toUpperCase
|
||||
def ProductRoot = s"${NamePrefix.Value}".toLowerCase
|
||||
def ProductJobTags = s"${NamePrefix.Value}JobTags"
|
||||
def ProductRedisBase = s"${NamePrefix.Value}_RedisBase"
|
||||
def ProductRedisStandardConnection = s"${NamePrefix.Value}_RedisStandardConnection"
|
||||
def ProductRedisClusterConnection = s"${NamePrefix.Value}_RedisClusterConnection"
|
||||
def DataStreamProcessDataSetLogger = s"${NamePrefix.Value}-ProcessDataset"
|
||||
def ProductInstrumentLogger = s"${NamePrefix.Value}-Instrument"
|
||||
def ProductOutputFilter = s"${NamePrefix.Value}OutputFilter"
|
||||
def ProductQuery = s"^--${NamePrefix.Value}Query--"
|
||||
}
|
|
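A small sketch of how these constants compose when the `DATAX_NAMEPREFIX` environment variable is left unset, so the default `DataX` prefix applies:

```scala
import datax.constants.{ColumnName, NamePrefix, ProductConstant}

object ConstantsDemo extends App {
  println(NamePrefix.Value)               // "DataX" unless DATAX_NAMEPREFIX overrides it
  println(ProductConstant.ProductRoot)    // "datax" - the root of every setting path
  println(ColumnName.PropertiesColumn)    // "DataXProperties"
  println(ProductConstant.DefaultAppName) // "DataX_Unknown_App"
}
```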
@ -0,0 +1,8 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.exception
|
||||
|
||||
case class EngineException(private val message: String = "", private val cause: Throwable = None.orNull)
|
||||
extends Exception(message, cause)
|
|
@ -0,0 +1,48 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.extension
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import datax.config.SettingDictionary
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
object DynamicUDF{
|
||||
type IntervalUpdateHandler = (SparkSession, Timestamp)=>Unit
|
||||
|
||||
trait DynamicUDFTrait {
|
||||
val name: String
|
||||
val funcRef: AnyRef
|
||||
val onInterval: IntervalUpdateHandler
|
||||
}
|
||||
|
||||
case class UDF0[RT] (func: ()=>RT, onInterval: IntervalUpdateHandler = null)
|
||||
|
||||
case class UDF1[T1, RT] (func: T1=>RT, onInterval: IntervalUpdateHandler = null)
|
||||
|
||||
case class UDF2[T1, T2, RT] (func: (T1, T2) => RT, onInterval: IntervalUpdateHandler = null)
|
||||
|
||||
case class UDF3[T1, T2, T3, RT] (func: (T1, T2, T3) => RT,onInterval: IntervalUpdateHandler = null)
|
||||
|
||||
case class UDF4[T1, T2, T3, T4, RT] (func: (T1, T2, T3, T4) => RT,onInterval: IntervalUpdateHandler = null)
|
||||
|
||||
trait Generator0[RT] extends Serializable{
|
||||
def initialize(spark:SparkSession, dict: SettingDictionary): UDF0[RT]
|
||||
}
|
||||
|
||||
trait Generator1[T1, RT] extends Serializable {
|
||||
def initialize(spark:SparkSession, dict: SettingDictionary): UDF1[T1, RT]
|
||||
}
|
||||
|
||||
trait Generator2[T1, T2, RT] extends Serializable {
|
||||
def initialize(spark:SparkSession, dict: SettingDictionary): UDF2[T1, T2, RT]
|
||||
}
|
||||
|
||||
trait Generator3[T1, T2, T3, RT] extends Serializable {
|
||||
def initialize(spark:SparkSession, dict: SettingDictionary): UDF3[T1, T2, T3, RT]
|
||||
}
|
||||
}
|
||||
|
||||
|
|
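As a rough sketch of how these extension points might be implemented (the class below is hypothetical and not part of this commit), a one-argument UDF generator could look like:

```scala
import datax.config.SettingDictionary
import datax.extension.DynamicUDF
import org.apache.spark.sql.SparkSession

// Hypothetical sample: exposes a one-argument UDF that upper-cases its input string.
class UpperCaseUdfGenerator extends DynamicUDF.Generator1[String, String] {
  override def initialize(spark: SparkSession, dict: SettingDictionary): DynamicUDF.UDF1[String, String] =
    DynamicUDF.UDF1(func = (s: String) => if (s == null) null else s.toUpperCase)
}
```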
@ -0,0 +1,12 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.extension
|
||||
|
||||
import datax.config.SettingDictionary
|
||||
import org.apache.spark.sql.{DataFrame, SparkSession}
|
||||
|
||||
trait PreProjectionProcessor extends Serializable{
|
||||
def initialize(spark:SparkSession, dict: SettingDictionary): DataFrame=>DataFrame
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.extension
|
||||
|
||||
/*
|
||||
Extension to normalize input strings in the streaming pipeline before parsing them as JSON objects.
|
||||
*/
|
||||
trait StringNormalizer {
|
||||
def normalize(str: String): String
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.host
|
||||
|
||||
import datax.service.{ConfigService, TelemetryService}
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
trait AppHost {
|
||||
def getConfigService(): ConfigService
|
||||
def getTelemetryService(): TelemetryService
|
||||
def getSpark(sparkConf: SparkConf): SparkSession
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.service
|
||||
|
||||
import datax.config.{SettingDictionary}
|
||||
|
||||
trait ConfigService {
|
||||
def getActiveDictionary(): SettingDictionary
|
||||
def setActiveDictionary(conf: SettingDictionary)
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.service
|
||||
|
||||
trait TelemetryService {
|
||||
def trackEvent(event: String, properties: Map[String, String], measurements: Map[String, Double])
|
||||
def trackException(e: Exception, properties: Map[String, String], measurements: Map[String, Double])
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import datax.config.SettingDictionary
|
||||
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
|
||||
|
||||
package object sink {
|
||||
type SinkDelegate = (Row, Seq[Row], Timestamp, Int, String)=>Map[String, Int]
|
||||
type Metrics = Map[String, Double]
|
||||
|
||||
trait SinkOperatorFactory{
|
||||
def getSinkOperator(dict:SettingDictionary, name: String):SinkOperator
|
||||
|
||||
def getSettingNamespace(): String
|
||||
}
|
||||
|
||||
case class SinkOperator(name: String,
|
||||
isEnabled: Boolean,
|
||||
flagColumnExprGenerator: () => String,
|
||||
generator: (Int)=>SinkDelegate,
|
||||
onInitialization: (SparkSession)=>Unit = null,
|
||||
onBatch: (SparkSession, Timestamp, Set[String])=>Unit = null
|
||||
)
|
||||
|
||||
case class OutputOperator(name: String,
|
||||
onInitialization: (SparkSession) => Unit,
|
||||
onBatch: (SparkSession, Timestamp, Set[String]) => Unit,
|
||||
output: (DataFrame, Timestamp) => Map[String, Int])
|
||||
}
|
||||
|
|
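A rough, hypothetical sketch of a sink plugged into these contracts; the factory below is not part of the commit, the namespace string is invented, and the delegate's parameter roles are assumed from the type alias only:

```scala
import java.sql.Timestamp
import datax.config.SettingDictionary
import datax.sink.{SinkDelegate, SinkOperator, SinkOperatorFactory}
import org.apache.spark.sql.Row

// Hypothetical console sink: prints each row and reports how many rows it handled.
object ConsoleSinkFactory extends SinkOperatorFactory {
  override def getSettingNamespace(): String = "console"   // invented namespace for this sketch

  override def getSinkOperator(dict: SettingDictionary, name: String): SinkOperator = {
    val delegate: SinkDelegate = (row: Row, rows: Seq[Row], batchTime: Timestamp, partitionId: Int, tag: String) => {
      rows.foreach(r => println(s"[$tag] $r"))
      Map(name -> rows.size)                                // metric: rows written by this sink
    }
    SinkOperator(
      name = name,
      isEnabled = dict.getOrElse("enabled", "true").toBoolean,
      flagColumnExprGenerator = () => null,                 // no flag column in this sketch
      generator = (_: Int) => delegate)
  }
}
```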
@ -0,0 +1,225 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<!--
|
||||
MIT License
|
||||
|
||||
Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE
|
||||
-->
|
||||
<developers>
|
||||
<developer>
|
||||
<id>microsoft</id>
|
||||
<name>Microsoft</name>
|
||||
</developer>
|
||||
</developers>
|
||||
|
||||
<licenses>
|
||||
<license>
|
||||
<name>MIT License</name>
|
||||
<url>http://opensource.org/licenses/MIT</url>
|
||||
<distribution>repo</distribution>
|
||||
</license>
|
||||
</licenses>
|
||||
|
||||
<scm>
|
||||
<connection>scm:git:git@github.com:Microsoft/data-accelerator.git</connection>
|
||||
<developerConnection>scm:git:git@github.com:Microsoft/data-accelerator.git</developerConnection>
|
||||
<url>https://github.com/Microsoft/data-accelerator.git</url>
|
||||
</scm>
|
||||
|
||||
<groupId>com.microsoft.datax</groupId>
|
||||
<artifactId>datax-host_2.11</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
|
||||
<properties>
|
||||
<spark.version>2.3.0</spark.version>
|
||||
<scala.version.major>2.11</scala.version.major>
|
||||
<scala.version.minor>8</scala.version.minor>
|
||||
<scala.version>${scala.version.major}.${scala.version.minor}</scala.version>
|
||||
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
|
||||
</properties>
|
||||
|
||||
<repositories>
|
||||
</repositories>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-core_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-sql_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-streaming_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.spark</groupId>
|
||||
<artifactId>spark-mllib_${scala.version.major}</artifactId>
|
||||
<version>${spark.version}</version>
|
||||
<scope>provided</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.jmockit</groupId>
|
||||
<artifactId>jmockit</artifactId>
|
||||
<version>1.34</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest_2.11</artifactId>
|
||||
<version>2.2.6</version>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>datax</groupId>
|
||||
<artifactId>datax-core_2.11</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>datax</groupId>
|
||||
<artifactId>datax-utility_2.11</artifactId>
|
||||
<version>1.0.0-SNAPSHOT</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.microsoft.azure</groupId>
|
||||
<artifactId>azure-eventhubs-spark_2.11</artifactId>
|
||||
<version>2.3.6</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>io.lettuce</groupId>
|
||||
<artifactId>lettuce-core</artifactId>
|
||||
<version>5.0.4.RELEASE</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.microsoft.azure</groupId>
|
||||
<artifactId>azure-storage</artifactId>
|
||||
<version>5.3.0</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.microsoft.azure</groupId>
|
||||
<artifactId>azure-documentdb</artifactId>
|
||||
<version>1.16.1</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
|
||||
<build>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>net.alchim31.maven</groupId>
|
||||
<artifactId>scala-maven-plugin</artifactId>
|
||||
<version>3.2.2</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
<goal>testCompile</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
<configuration>
|
||||
<scalaVersion>${scala.version}</scalaVersion>
|
||||
<args>
|
||||
<arg>-deprecation</arg>
|
||||
<arg>-feature</arg>
|
||||
</args>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>3.3</version>
|
||||
<configuration>
|
||||
<source>1.8</source>
|
||||
<target>1.8</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-dependency-plugin</artifactId>
|
||||
<version>3.0.1</version>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>copy-dependencies</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>copy-dependencies</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<outputDirectory>${project.build.directory}/dependency</outputDirectory>
|
||||
<overWriteReleases>false</overWriteReleases>
|
||||
<overWriteSnapshots>true</overWriteSnapshots>
|
||||
<excludeTransitive>true</excludeTransitive>
|
||||
</configuration>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<!-- enable scalatest -->
|
||||
<plugin>
|
||||
<groupId>org.scalatest</groupId>
|
||||
<artifactId>scalatest-maven-plugin</artifactId>
|
||||
<version>1.0</version>
|
||||
<configuration>
|
||||
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
|
||||
<junitxml>.</junitxml>
|
||||
<filereports>WDF TestSuite.txt</filereports>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>test</id>
|
||||
<goals>
|
||||
<goal>test</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<artifactId>maven-assembly-plugin</artifactId>
|
||||
<version>2.4.1</version>
|
||||
<configuration>
|
||||
<descriptors>
|
||||
<descriptor>with-dependencies.xml</descriptor>
|
||||
</descriptors>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
<id>make-assembly</id>
|
||||
<phase>package</phase>
|
||||
<goals>
|
||||
<goal>single</goal>
|
||||
</goals>
|
||||
</execution>
|
||||
</executions>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,16 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.app
|
||||
|
||||
import datax.host.BlobBatchingHost
|
||||
import datax.processor.CommonProcessorFactory
|
||||
|
||||
object BatchApp {
|
||||
def main(inputArguments: Array[String]): Unit = {
|
||||
BlobBatchingHost.runBatchApp(
|
||||
inputArguments,
|
||||
config => CommonProcessorFactory.createProcessor(config).asBlobPointerProcessor())
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.app
|
||||
|
||||
import datax.host.StreamingHost
|
||||
import datax.processor.CommonProcessorFactory
|
||||
|
||||
object BlobStreamingApp {
|
||||
def main(inputArguments: Array[String]): Unit = {
|
||||
StreamingHost.runStreamingApp(
|
||||
inputArguments,
|
||||
config => CommonProcessorFactory.createProcessor(config).asBlobPointerProcessor())
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.app
|
||||
|
||||
import datax.host.StreamingHost
|
||||
import datax.processor.CommonProcessorFactory
|
||||
|
||||
object DirectLocalStreamingApp {
|
||||
|
||||
def main(inputArguments: Array[String]): Unit = {
|
||||
StreamingHost.runLocalStreamingApp(
|
||||
inputArguments,
|
||||
config => CommonProcessorFactory.createProcessor(config).asDirectLocalProcessor())
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.app
|
||||
|
||||
import datax.host.StreamingHost
|
||||
import datax.processor.CommonProcessorFactory
|
||||
|
||||
object DirectStreamingApp {
|
||||
def main(inputArguments: Array[String]): Unit = {
|
||||
StreamingHost.runStreamingApp(
|
||||
inputArguments,
|
||||
config => CommonProcessorFactory.createProcessor(config).asDirectProcessor())
|
||||
}
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.checkpoint
|
||||
|
||||
import datax.fs.HadoopClient
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileUtil, Path}
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.eventhubs.{EventHubsConf, EventPosition, NameAndPartition}
|
||||
|
||||
object EventhubCheckpointer {
|
||||
def getCheckpointFile(checkpointDir: String) = checkpointDir+"/offsets.txt"
|
||||
|
||||
val OffsetTokenSeparator = ","
|
||||
def readOffsetsFromFile(file: String): Iterable[(Long, String, Int, Long, Long)] =
|
||||
HadoopClient.readHdfsFile(file).filterNot(s=>s==null || s.isEmpty).map(s=>{
|
||||
val offset = s.split(OffsetTokenSeparator)
|
||||
Tuple5(offset(0).toLong, offset(1), offset(2).toInt, offset(3).toLong, offset(4).toLong)
|
||||
})
|
||||
|
||||
def readOffsetsFromCheckpoints(checkpointDir: String): Iterable[(Long, String, Int, Long, Long)] = {
|
||||
val conf = HadoopClient.getConf()
|
||||
val checkpointFile = getCheckpointFile(checkpointDir)
|
||||
val path = new Path(checkpointFile)
|
||||
val fs = path.getFileSystem(conf)
|
||||
if(fs.exists(path))
|
||||
readOffsetsFromFile(checkpointFile)
|
||||
else{
|
||||
val backupPath = path.suffix(".old")
|
||||
if(fs.exists(backupPath)){
|
||||
val logger = LogManager.getLogger("readOffsetsFromCheckpoints")
|
||||
logger.warn(s"offsets file at checkpoint folder is not found, but found a backup one: ${backupPath.toUri}")
|
||||
FileUtil.copy(fs, backupPath, fs, path, false, conf)
|
||||
readOffsetsFromFile(checkpointFile)
|
||||
}
|
||||
else
|
||||
null
|
||||
}
|
||||
}
|
||||
|
||||
def writeOffsetsToCheckpoints(checkpointDir: String, offsets: Seq[(Long, String, Int, Long, Long)], conf: Configuration) = {
|
||||
val folder = new Path(checkpointDir)
|
||||
val fs = folder.getFileSystem(conf)
|
||||
if(!fs.exists(folder)){
|
||||
fs.mkdirs(folder)
|
||||
}
|
||||
|
||||
val checkpointFile = getCheckpointFile(checkpointDir)
|
||||
val path = new Path(checkpointFile)
|
||||
|
||||
if(fs.exists(path)){
|
||||
// backup the old one
|
||||
val backupPath = path.suffix(".old")
|
||||
FileUtil.copy(fs, path, fs, backupPath, false, true, conf)
|
||||
}
|
||||
|
||||
HadoopClient.writeHdfsFile(checkpointFile,
|
||||
offsets.map(v=>v._1+OffsetTokenSeparator+v._2+OffsetTokenSeparator+v._3+OffsetTokenSeparator+v._4+OffsetTokenSeparator+v._5).mkString("\n"), true)
|
||||
}
|
||||
|
||||
def applyCheckpointsIfExists(ehConf: EventHubsConf, checkpointDir: String) = {
|
||||
val fromOffsets = readOffsetsFromCheckpoints(checkpointDir)
|
||||
val logger = LogManager.getLogger("EventHubConfBuilder")
|
||||
if(fromOffsets!=null) {
|
||||
logger.warn(s"Checkpoints of offsets are detected. Applying the offsets:\n" + fromOffsets.mkString("\n"))
|
||||
ehConf.setStartingPositions(fromOffsets.map { v => new NameAndPartition(v._2, v._3) -> EventPosition.fromSequenceNumber(v._5) }.toMap)
|
||||
}
|
||||
else{
|
||||
logger.warn(s"Checkpoints don't exist, skipped.")
|
||||
}
|
||||
}
|
||||
}
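// --------------------------------------------------------------------------------
// Hedged usage sketch (not part of the original commit): illustrates the offsets.txt
// line format assumed by readOffsetsFromFile/writeOffsetsToCheckpoints above, i.e. one
// "<timestamp>,<eventHubName>,<partitionId>,<offset>,<sequenceNumber>" tuple per line.
// The field names are assumptions inferred from how the tuple is used; only the
// positions and types come from the code above.
object EventhubCheckpointerFormatExample {
  // Serialize a single offset tuple the same way writeOffsetsToCheckpoints does.
  def toLine(v: (Long, String, Int, Long, Long)): String =
    Seq(v._1, v._2, v._3, v._4, v._5).mkString(EventhubCheckpointer.OffsetTokenSeparator)

  // Parse it back, mirroring readOffsetsFromFile.
  def fromLine(s: String): (Long, String, Int, Long, Long) = {
    val p = s.split(EventhubCheckpointer.OffsetTokenSeparator)
    (p(0).toLong, p(1), p(2).toInt, p(3).toLong, p(4).toLong)
  }

  def main(args: Array[String]): Unit = {
    val line = toLine((1546300800000L, "myhub", 3, 4096L, 12345L))
    println(line)           // 1546300800000,myhub,3,4096,12345
    println(fromLine(line)) // (1546300800000,myhub,3,4096,12345)
  }
}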
|
|
@ -0,0 +1,156 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.classloader
|
||||
|
||||
import java.net.{URL, URLClassLoader}
|
||||
|
||||
import org.apache.spark.SparkEnv
|
||||
import org.apache.spark.sql.catalyst.ScalaReflection
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
import scala.collection.mutable.HashMap
|
||||
|
||||
object ClassLoaderHost {
|
||||
/**
|
||||
* Get the ClassLoader which loaded Spark.
|
||||
*/
|
||||
def getSparkClassLoader: ClassLoader = getClass.getClassLoader
|
||||
|
||||
/**
|
||||
* Get the Context ClassLoader on this thread or, if not present, the ClassLoader that
|
||||
* loaded Spark.
|
||||
*/
|
||||
def getContextOrSparkClassLoader: ClassLoader =
|
||||
Option(Thread.currentThread().getContextClassLoader).getOrElse(getSparkClassLoader)
|
||||
|
||||
/**
|
||||
* Create a ClassLoader for use in tasks, adding any JARs specified by the user or any classes
|
||||
* created by the interpreter to the search path
|
||||
*/
|
||||
private def createClassLoader(): MutableURLClassLoader = {
|
||||
// Bootstrap the list of jars with the user class path.
|
||||
val now = System.currentTimeMillis()
|
||||
userClassPath.foreach { url =>
|
||||
currentJars(url.getPath().split("/").last) = now
|
||||
}
|
||||
|
||||
val currentLoader = getContextOrSparkClassLoader
|
||||
val userClassPathFirst = true
|
||||
|
||||
// For each of the jars in the jarSet, add them to the class loader.
|
||||
// We assume each of the files has already been fetched.
|
||||
val urls = userClassPath.toArray ++ currentJars.keySet.map { uri =>
|
||||
new java.io.File(uri.split("/").last).toURI.toURL
|
||||
}
|
||||
if (userClassPathFirst) {
|
||||
new ChildFirstURLClassLoader(urls, currentLoader)
|
||||
} else {
|
||||
new MutableURLClassLoader(urls, currentLoader)
|
||||
}
|
||||
}
|
||||
|
||||
val userClassPath: Seq[URL] = Nil
|
||||
val currentJars = new HashMap[String, Long]
|
||||
val urlClassLoader = createClassLoader()
|
||||
val derivedClassLoader = urlClassLoader
|
||||
|
||||
val env = SparkEnv.get
|
||||
if(env!=null){
|
||||
// Set the classloader for serializer
|
||||
env.serializer.setDefaultClassLoader(derivedClassLoader)
|
||||
// SPARK-21928. SerializerManager's internal instance of Kryo might get used in netty threads
|
||||
// for fetching remote cached RDD blocks, so need to make sure it uses the right classloader too.
|
||||
env.serializerManager.setDefaultClassLoader(derivedClassLoader)
|
||||
}
|
||||
|
||||
/** Preferred alternative to Class.forName(className) */
|
||||
def classForName(className: String): Class[_] = {
|
||||
Class.forName(className, true, derivedClassLoader)
|
||||
// scalastyle:on classforname
|
||||
}
|
||||
|
||||
def getType[T](clazz: Class[T])(implicit runtimeMirror: scala.reflect.runtime.universe.Mirror) =
|
||||
runtimeMirror.classSymbol(clazz).toType
|
||||
|
||||
def javaTypeToDataType(t: java.lang.reflect.Type) = {
|
||||
val mirror = scala.reflect.runtime.universe.runtimeMirror(derivedClassLoader)
|
||||
//TODO: ParameterizedType (aka. generic type) cannot be cast to Class[_],
|
||||
// thus getJavaUDFReturnDataType should be used instead in most cases.
|
||||
val udfScalaType = mirror.classSymbol(t.asInstanceOf[Class[_]]).toType
|
||||
ScalaReflection.schemaFor(udfScalaType).dataType
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* A class loader which makes some protected methods in ClassLoader accessible.
|
||||
*/
|
||||
class ParentClassLoader(parent: ClassLoader) extends ClassLoader(parent) {
|
||||
|
||||
override def findClass(name: String): Class[_] = {
|
||||
super.findClass(name)
|
||||
}
|
||||
|
||||
override def loadClass(name: String): Class[_] = {
|
||||
super.loadClass(name)
|
||||
}
|
||||
|
||||
override def loadClass(name: String, resolve: Boolean): Class[_] = {
|
||||
super.loadClass(name, resolve)
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader.
|
||||
*/
|
||||
class MutableURLClassLoader(urls: Array[URL], parent: ClassLoader)
|
||||
extends URLClassLoader(urls, parent) {
|
||||
|
||||
override def addURL(url: URL): Unit = {
|
||||
super.addURL(url)
|
||||
}
|
||||
|
||||
override def getURLs(): Array[URL] = {
|
||||
super.getURLs()
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* A mutable class loader that gives preference to its own URLs over the parent class loader
|
||||
* when loading classes and resources.
|
||||
*/
|
||||
class ChildFirstURLClassLoader(urls: Array[URL], parent: ClassLoader)
|
||||
extends MutableURLClassLoader(urls, null) {
|
||||
|
||||
private val parentClassLoader = new ParentClassLoader(parent)
|
||||
|
||||
override def loadClass(name: String, resolve: Boolean): Class[_] = {
|
||||
try {
|
||||
super.loadClass(name, resolve)
|
||||
} catch {
|
||||
case e: ClassNotFoundException =>
|
||||
parentClassLoader.loadClass(name, resolve)
|
||||
}
|
||||
}
|
||||
|
||||
override def getResource(name: String): URL = {
|
||||
val url = super.findResource(name)
|
||||
val res = if (url != null) url else parentClassLoader.getResource(name)
|
||||
res
|
||||
}
|
||||
|
||||
override def getResources(name: String): java.util.Enumeration[URL] = {
|
||||
val childUrls = super.findResources(name).asScala
|
||||
val parentUrls = parentClassLoader.getResources(name).asScala
|
||||
(childUrls ++ parentUrls).asJavaEnumeration
|
||||
}
|
||||
|
||||
override def addURL(url: URL): Unit = {
|
||||
super.addURL(url)
|
||||
}
|
||||
|
||||
}
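// --------------------------------------------------------------------------------
// Hedged usage sketch (not part of the original commit): shows the child-first lookup
// order implemented above, where user JARs are consulted before the parent loader and
// resolution only falls back to the parent on ClassNotFoundException. The jar path below
// is a placeholder, not an artifact shipped with this commit.
object ChildFirstClassLoaderExample {
  def main(args: Array[String]): Unit = {
    val urls = Array(new java.io.File("/tmp/udf.jar").toURI.toURL)
    val loader = new ChildFirstURLClassLoader(urls, getClass.getClassLoader)
    // A class present in udf.jar wins over the parent; JDK/Spark classes still resolve
    // through the parent loader because the child cannot find them.
    val cls = Class.forName("java.lang.String", true, loader)
    println(cls.getClassLoader) // null, since java.lang.String comes from the bootstrap loader
  }
}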
|
|
@ -0,0 +1,18 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.cosmosdb
|
||||
|
||||
/***
|
||||
* Represents the connection info for the sinker to access Cosmos DB
|
||||
* @param name name of this Cosmos DB connection, used mainly for logging purposes
|
||||
* @param connectionString connection string to the Cosmos DB account
|
||||
* @param database name of the cosmos database
|
||||
* @param collection name of the collection in cosmos db
|
||||
*/
|
||||
case class CosmosDBConf(name: String, connectionString: String, database: String, collection: String)
|
||||
|
||||
object CosmosDBBase {
|
||||
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.eventhub
|
||||
|
||||
import java.util.concurrent.Executors
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventHubClient
|
||||
import org.apache.spark.eventhubs.ConnectionStringBuilder
|
||||
|
||||
object EventHubBase {
|
||||
val executorService = Executors.newSingleThreadExecutor
|
||||
|
||||
def buildConnectionString(namespace: String, name: String, policy: String, key: String) = {
|
||||
ConnectionStringBuilder()
|
||||
.setNamespaceName(namespace)
|
||||
.setEventHubName(name)
|
||||
.setSasKeyName(policy)
|
||||
.setSasKey(key)
|
||||
.build
|
||||
}
|
||||
|
||||
def getOutputClient(connString: String) = {
|
||||
EventHubClient.createSync(connString,executorService)
|
||||
}
|
||||
}
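// Hedged note (not part of the original commit): buildConnectionString above is expected to
// produce a standard Event Hubs connection string, roughly of the form
//   Endpoint=sb://<namespace>.servicebus.windows.net/;SharedAccessKeyName=<policy>;SharedAccessKey=<key>;EntityPath=<name>
// so that e.g. EventHubBase.buildConnectionString("myns", "myhub", "send-policy", "<key>") can be
// fed directly into getOutputClient. The exact formatting is delegated to ConnectionStringBuilder.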
|
|
@ -0,0 +1,7 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.eventhub
|
||||
|
||||
case class EventHubConf(name: String, connectionString: String)
|
|
@ -0,0 +1,69 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.eventhub
|
||||
|
||||
import com.microsoft.azure.eventhubs.{EventData, EventHubClient}
|
||||
import datax.exception.EngineException
|
||||
import org.apache.log4j.{LogManager, Logger}
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
class EventHubSender(client: EventHubClient){
|
||||
val senderName : String = this.getClass.getSimpleName
|
||||
def getLogger(): Logger = LogManager.getLogger(this.getClass)
|
||||
|
||||
def sendString(data: String, properties: Map[String, String]) ={
|
||||
if(data!=null) {
|
||||
sendBytes(data.getBytes, properties)
|
||||
}
|
||||
}
|
||||
|
||||
def sendBytes(data: Array[Byte], properties: Map[String, String]) = {
|
||||
val eventData = EventData.create(data)
|
||||
if(properties!=null)
|
||||
eventData.getProperties().putAll(properties.asJava)
|
||||
val t1=System.nanoTime()
|
||||
client.send(eventData)
|
||||
val et = (System.nanoTime()-t1)/1E9
|
||||
getLogger().info(s"sent ${data.length} bytes in $et seconds")
|
||||
}
|
||||
|
||||
def sendAvroData(data: Array[Byte] ) ={
|
||||
if(data!=null) {
|
||||
val eventData = EventData.create(data)
|
||||
client.send(eventData)
|
||||
getLogger.info("avro eventData length = " + data.length)
|
||||
}
|
||||
}
|
||||
|
||||
def close() = {
|
||||
if(client!=null){
|
||||
client.closeSync()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
object EventHubSenderPool {
|
||||
private var sender:EventHubSender = null
|
||||
def getSender(outputEventHubConf: EventHubConf): EventHubSender ={
|
||||
if(outputEventHubConf == null
|
||||
|| outputEventHubConf.connectionString==null
|
||||
|| outputEventHubConf.connectionString.isEmpty){
|
||||
throw new EngineException(s"Unexpected empty eventhub conf")
|
||||
}
|
||||
|
||||
if(sender==null){
|
||||
this.synchronized {
|
||||
if (sender == null) {
|
||||
LogManager.getLogger(this.getClass).warn(s"Constructing eventhub sender for ${outputEventHubConf.name}")
|
||||
sender = new EventHubSender(EventHubBase.getOutputClient(outputEventHubConf.connectionString))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sender
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.http
|
||||
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.{CloseableHttpResponse, HttpPost}
|
||||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.{CloseableHttpClient, HttpClientBuilder}
|
||||
import org.apache.http.util.EntityUtils
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
class HttpPostSender(url: String) {
|
||||
def createClient(): CloseableHttpClient ={
|
||||
val timeout = 5*1000
|
||||
val requestConfig = RequestConfig.custom()
|
||||
.setConnectionRequestTimeout(timeout)
|
||||
.setConnectTimeout(timeout)
|
||||
.setSocketTimeout(timeout)
|
||||
.build()
|
||||
HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).build()
|
||||
}
|
||||
|
||||
private val client = createClient()
|
||||
private val poster = new HttpPost(url)
|
||||
poster.addHeader("Content-Type", "application/json")
|
||||
|
||||
protected val senderName: String = this.getClass.getSimpleName
|
||||
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
logger.info(s"Constructing HttpPoster '$senderName' with url '$url'")
|
||||
|
||||
def sendString(data: String): String = {
|
||||
val t1 = System.nanoTime()
|
||||
val result = this.synchronized{
|
||||
poster.setEntity(new StringEntity(data))
|
||||
var response: CloseableHttpResponse = null
|
||||
try {
|
||||
response = client.execute(poster)
|
||||
EntityUtils.toString(response.getEntity())
|
||||
}
|
||||
catch {
|
||||
case e: Exception =>
|
||||
val msg = s"!!HttpPoster failed to send '$data' to '$url'"
|
||||
logger.error(msg, e)
|
||||
msg
|
||||
}
|
||||
finally {
|
||||
if(response!=null){
|
||||
response.close()
|
||||
response=null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
val time = System.nanoTime()-t1
|
||||
logger.warn(s"HttpPoster Result:'$result' within ${time/1E9} seconds")
|
||||
result
|
||||
}
|
||||
}
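// Hedged usage sketch (not part of the original commit); the URL and payload are placeholders:
//   val poster = new HttpPostSender("https://example.org/api/ingest")
//   val reply = poster.sendString("""{"ping":true}""")
// sendString serializes calls on the single shared HttpPost instance, so one HttpPostSender
// can be shared across threads at the cost of sequential requests.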
|
||||
|
||||
|
|
@ -0,0 +1,181 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.redis
|
||||
|
||||
import java.net.SocketAddress
|
||||
import java.time.Duration
|
||||
import java.util.concurrent.ConcurrentHashMap
|
||||
|
||||
import io.lettuce.core._
|
||||
import io.lettuce.core.api.sync.{RedisSortedSetCommands, RedisStringCommands}
|
||||
import io.lettuce.core.cluster.ClusterTopologyRefreshOptions.RefreshTrigger
|
||||
import io.lettuce.core.cluster.{ClusterClientOptions, ClusterTopologyRefreshOptions, RedisClusterClient}
|
||||
import datax.constants.ProductConstant
|
||||
import datax.exception.EngineException
|
||||
import org.apache.log4j.LogManager
|
||||
import datax.utility.ConverterUtil.scalaFunctionToJava
|
||||
|
||||
case class RedisServerConf(name:String, host: String, port: Int, key: String, timeout: Int, useSsl: Boolean, isCluster: Boolean)
|
||||
|
||||
object RedisBase {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
|
||||
/**
|
||||
* Convert a Redis cache connection string into a RedisServerConf.
|
||||
* The expected format of the connection string is:
|
||||
* <host>:<port>,password=<password>,ssl=True|False,cluster=True|False,timeout=<timeout>
|
||||
* @param connString connection string to parse
|
||||
* @return a RedisServerConf object
|
||||
*/
|
||||
def parseConnectionString(connString: String): RedisServerConf = {
|
||||
if(connString == null || connString.isEmpty) return null
|
||||
val parts = connString.split(",")
|
||||
val hostAndPort = parts(0).trim.split(":")
|
||||
if(hostAndPort.length!=2) throw new EngineException(s"Malformed format of host and port in redis connection string")
|
||||
val host = hostAndPort(0)
|
||||
val port = hostAndPort(1).toInt
|
||||
|
||||
val dict = parts.drop(1).map(p=>{
|
||||
val pos = p.indexOf("=")
|
||||
if(pos>0){
|
||||
p.substring(0, pos) -> p.substring(pos+1)
|
||||
}
|
||||
else
|
||||
throw new EngineException(s"Malformed format of parts in redis connection string")
|
||||
}).toMap
|
||||
|
||||
RedisServerConf(
|
||||
name = host,
|
||||
host = host,
|
||||
port = port,
|
||||
key = dict.get("password").orNull,
|
||||
timeout = dict.getOrElse("timeout","3000").toInt,
|
||||
useSsl = dict.getOrElse("ssl", "True").toBoolean,
|
||||
isCluster = dict.getOrElse("cluster", "True").toBoolean
|
||||
)
|
||||
}
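// Hedged example (not part of the original commit): a connection string of the form
//   "mycache.redis.cache.windows.net:6380,password=<key>,ssl=True,cluster=True,timeout=3000"
// parses into RedisServerConf(name = "mycache.redis.cache.windows.net",
//   host = "mycache.redis.cache.windows.net", port = 6380, key = "<key>",
//   timeout = 3000, useSsl = true, isCluster = true); the host name is a placeholder.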
|
||||
|
||||
def buildUri(conf: RedisServerConf, clientName: String = ProductConstant.ProductRedisBase) ={
|
||||
RedisURI.Builder.redis(conf.host)
|
||||
.withPassword(conf.key)
|
||||
.withPort(conf.port.toInt)
|
||||
.withSsl(conf.useSsl)
|
||||
.withVerifyPeer(false)
|
||||
.withClientName(clientName)
|
||||
.withTimeout(Duration.ofMillis(conf.timeout))
|
||||
.build()
|
||||
}
|
||||
|
||||
def getClusterConnection(client: RedisClusterClient) = {
|
||||
val topologyRefreshOptions = ClusterTopologyRefreshOptions.builder()
|
||||
.enablePeriodicRefresh(false)
|
||||
//.refreshPeriod(Duration.ofMinutes(30))
|
||||
.enableAdaptiveRefreshTrigger(RefreshTrigger.MOVED_REDIRECT, RefreshTrigger.PERSISTENT_RECONNECTS)
|
||||
.adaptiveRefreshTriggersTimeout(Duration.ofSeconds(120))
|
||||
.build()
|
||||
|
||||
val so = SocketOptions.builder()
|
||||
.connectTimeout(Duration.ofSeconds(120))
|
||||
.keepAlive(true)
|
||||
.tcpNoDelay(true)
|
||||
.build()
|
||||
|
||||
client.setOptions(ClusterClientOptions.builder()
|
||||
.socketOptions(so)
|
||||
.validateClusterNodeMembership(false)
|
||||
.topologyRefreshOptions(topologyRefreshOptions)
|
||||
.build())
|
||||
|
||||
client.connect
|
||||
}
|
||||
|
||||
def getStandardConnection(client: RedisClient) = {
|
||||
client.connect()
|
||||
}
|
||||
|
||||
private def getInternalConnection(conf: RedisServerConf, clientName: String):RedisConnection = {
|
||||
if (conf.isCluster)
|
||||
new RedisClusterConnection(conf, clientName)
|
||||
else
|
||||
new RedisStandardConnection(conf, clientName)
|
||||
}
|
||||
|
||||
private val connectionPool = new ConcurrentHashMap[String, RedisConnection]
|
||||
def getConnection(connectionString: String, clientName: String) = {
|
||||
val conf = parseConnectionString(connectionString)
|
||||
connectionPool.computeIfAbsent(clientName, (k: String) => getInternalConnection(conf, clientName))
|
||||
}
|
||||
}
|
||||
|
||||
class RedisStandardConnection(redisServerConf: RedisServerConf, clientName: String = ProductConstant.ProductRedisStandardConnection) extends RedisConnection{
|
||||
private val loggerPrefix = s"RedisClusterConnection-${redisServerConf.host}"
|
||||
private val logger = LogManager.getLogger(loggerPrefix)
|
||||
private val uri = RedisBase.buildUri(redisServerConf, clientName)
|
||||
private val client = RedisClient.create(uri)
|
||||
client.setDefaultTimeout(Duration.ofSeconds(120))
|
||||
|
||||
logger.warn(s"${loggerPrefix}:Init connection to ${uri.getHost}")
|
||||
var connection = RedisBase.getStandardConnection(client)
|
||||
logger.info(s"${loggerPrefix}:Created connection")
|
||||
|
||||
override def getStringCommands: RedisStringCommands[String, String] = {
|
||||
connection.sync()
|
||||
}
|
||||
|
||||
override def getSortedSetCommands: RedisSortedSetCommands[String, String] = {
|
||||
connection.sync()
|
||||
}
|
||||
|
||||
override def reconnect: Unit = {
|
||||
this.synchronized{
|
||||
logger.warn(s"${loggerPrefix}:Closing the connection for reconnect")
|
||||
connection.close()
|
||||
connection=RedisBase.getStandardConnection(client)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class RedisClusterConnection(redisServerConf: RedisServerConf, clientName: String = ProductConstant.ProductRedisClusterConnection) extends RedisConnection{
|
||||
private val loggerPrefix = s"RedisClusterConnection-${redisServerConf.host}"
|
||||
private val logger = LogManager.getLogger(loggerPrefix)
|
||||
private val uri = RedisBase.buildUri(redisServerConf, clientName)
|
||||
private val client = RedisClusterClient.create(uri)
|
||||
client.setDefaultTimeout(Duration.ofSeconds(120))
|
||||
|
||||
logger.warn(s"${loggerPrefix}:Init connection")
|
||||
private var connection = RedisBase.getClusterConnection(client)
|
||||
|
||||
client.addListener(new RedisConnectionStateListener(){
|
||||
override def onRedisConnected(connection: RedisChannelHandler[_, _], socketAddress: SocketAddress): Unit = {
|
||||
logger.warn(s"${loggerPrefix}:Connected with socket:${socketAddress}")
|
||||
}
|
||||
|
||||
override def onRedisDisconnected(redisChannelHandler: RedisChannelHandler[_, _]): Unit = {
|
||||
logger.warn(s"${loggerPrefix}:Lost connection")
|
||||
}
|
||||
|
||||
override def onRedisExceptionCaught(redisChannelHandler: RedisChannelHandler[_, _], throwable: Throwable): Unit = {
|
||||
logger.error(s"${loggerPrefix}:Encounter exception", throwable)
|
||||
}
|
||||
})
|
||||
|
||||
logger.info(s"${loggerPrefix}:Created connection")
|
||||
override def getStringCommands: RedisStringCommands[String, String] = {
|
||||
connection.sync()
|
||||
}
|
||||
|
||||
override def getSortedSetCommands: RedisSortedSetCommands[String, String] = {
|
||||
connection.sync()
|
||||
}
|
||||
|
||||
override def reconnect: Unit = {
|
||||
this.synchronized{
|
||||
|
||||
logger.warn(s"${loggerPrefix}:Closing the connection for reconnect")
|
||||
connection.close()
|
||||
connection=RedisBase.getClusterConnection(client)
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,14 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.client.redis
|
||||
|
||||
import io.lettuce.core.api.sync.{RedisSortedSetCommands, RedisStringCommands}
|
||||
|
||||
trait RedisConnection{
|
||||
def getStringCommands: RedisStringCommands[String, String]
|
||||
def getSortedSetCommands: RedisSortedSetCommands[String, String]
|
||||
def reconnect(): Unit
|
||||
}
|
||||
|
|
@ -0,0 +1,136 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.config
|
||||
|
||||
import datax.constants.JobArgument
|
||||
import datax.exception.EngineException
|
||||
import datax.fs.HadoopClient
|
||||
import datax.service.ConfigService
|
||||
import datax.utility.ArgumentsParser
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.SparkConf
|
||||
|
||||
/***
|
||||
* Singleton service to set and get the active SettingDictionary
|
||||
*/
|
||||
object ConfigManager extends ConfigService {
|
||||
private val logger = LogManager.getLogger("DictionaryigManager")
|
||||
|
||||
def initSparkConf() = new SparkConf()
|
||||
|
||||
def loadLocalConfigIfExists[T](configurationFile: String)(implicit mf: Manifest[T]) : Option[T] = {
|
||||
val configString = HadoopClient.readLocalFileIfExists(configurationFile)
|
||||
|
||||
if(configString==null)
|
||||
None
|
||||
else {
|
||||
implicit val formats = org.json4s.DefaultFormats
|
||||
Some(org.json4s.jackson.parseJson(configString).extract[T])
|
||||
}
|
||||
}
|
||||
|
||||
private def getLocalEnvVars(): Map[String, String] = {
|
||||
sys.env.filterKeys(_.startsWith(JobArgument.ConfNamePrefix))
|
||||
}
|
||||
|
||||
private def getLocalConf(): SettingDictionary = {
|
||||
SettingDictionary(getLocalEnvVars())
|
||||
}
|
||||
|
||||
var singletonConfDict: SettingDictionary = null
|
||||
def getActiveDictionary(): SettingDictionary = {
|
||||
if(singletonConfDict==null){
|
||||
ConfigManager.synchronized{
|
||||
if(singletonConfDict==null){
|
||||
singletonConfDict = getLocalConf()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
singletonConfDict
|
||||
}
|
||||
|
||||
def setActiveDictionary(conf: SettingDictionary) = {
|
||||
ConfigManager.synchronized{
|
||||
singletonConfDict = conf
|
||||
}
|
||||
}
|
||||
|
||||
def getConfigurationFromArguments(inputArguments: Array[String]):SettingDictionary = {
|
||||
val namedArgs = ArgumentsParser.getNamedArgs(inputArguments)
|
||||
|
||||
logger.warn("cmd line args:"+namedArgs)
|
||||
|
||||
if (!namedArgs.contains("conf")) {
|
||||
throw new Error("configuration file is not specified.")
|
||||
}
|
||||
|
||||
val envs = getLocalEnvVars()
|
||||
val convertedConf = Map(
|
||||
JobArgument.ConfName_AppConf -> namedArgs.getOrElse("conf", null),
|
||||
JobArgument.ConfName_DriverLogLevel -> namedArgs.getOrElse("driverLogLevel", null),
|
||||
JobArgument.ConfName_LogLevel -> namedArgs.getOrElse("executorLogLevel", null),
|
||||
JobArgument.ConfName_CheckpointEnabled -> namedArgs.getOrElse("checkpointEnabled", null)
|
||||
).filter(_._2!=null)
|
||||
|
||||
logger.warn("local env:"+envs.mkString(","))
|
||||
setActiveDictionary(SettingDictionary(envs ++ namedArgs ++ convertedConf))
|
||||
singletonConfDict
|
||||
}
|
||||
|
||||
private def replaceTokens(src: String, tokens: Map[String, String]) = {
|
||||
if(tokens==null || src==null ||src.isEmpty)
|
||||
src
|
||||
else
|
||||
tokens.foldLeft(src)((r, kv) => r.replaceAllLiterally("${" + kv._1 + "}", kv._2))
|
||||
}
|
||||
|
||||
private def readJsonFile[T](configurationFile: String, replacements: Map[String, String])(implicit mf: Manifest[T]): T = {
|
||||
val configString = HadoopClient.readHdfsFile(configurationFile).mkString("")
|
||||
implicit val formats = org.json4s.DefaultFormats
|
||||
org.json4s.jackson
|
||||
.parseJson(replaceTokens(configString, replacements))
|
||||
.extract[T]
|
||||
}
|
||||
|
||||
private def splitString(s: String):(String, String) = {
|
||||
val pos = s.indexOf("=")
|
||||
if(pos==0)
|
||||
""->s
|
||||
else if(pos>0)
|
||||
s.substring(0, pos).trim()->s.substring(pos+1).trim()
|
||||
else
|
||||
s.trim()->null
|
||||
}
|
||||
|
||||
private def readConfFile(filePath: String, replacements: Map[String, String]) = {
|
||||
if(filePath==null)
|
||||
throw new EngineException(s"No conf file is provided")
|
||||
else if(!filePath.toLowerCase().endsWith(".conf"))
|
||||
throw new EngineException(s"non-conf file is not supported as configuration input")
|
||||
|
||||
parseConfLines(HadoopClient.readHdfsFile(filePath), replacements)
|
||||
}
|
||||
|
||||
def loadConfig(sparkConf: SparkConf): UnifiedConfig = {
|
||||
val dict = getActiveDictionary()
|
||||
val confFile = dict.getAppConfigurationFile()
|
||||
val confProps = readConfFile(confFile, dict.dict)
|
||||
val newdict = SettingDictionary(dict.dict ++ confProps)
|
||||
setActiveDictionary(newdict)
|
||||
|
||||
logger.warn("Load Dictionary as following:\n"+newdict.dict.map(kv=>s"${kv._1}->${kv._2}").mkString("\n"))
|
||||
UnifiedConfig(sparkConf = sparkConf, dict = newdict)
|
||||
}
|
||||
|
||||
private def parseConfLines(lines: Iterable[String], replacements: Map[String, String]) = {
|
||||
lines
|
||||
// skip empty lines or commented lines
|
||||
.filter(l=>l!=null && !l.trim().isEmpty && !l.trim().startsWith("#"))
|
||||
.map(splitString)
|
||||
.map{case(k,v)=>k->replaceTokens(v, replacements)}
|
||||
.toMap
|
||||
}
|
||||
}
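// --------------------------------------------------------------------------------
// Hedged sketch (not part of the original commit): demonstrates the .conf line format
// consumed by parseConfLines/replaceTokens above. Blank and "#"-prefixed lines are
// skipped, each remaining line is "key=value", and "${token}" occurrences in values are
// substituted from the token map. The setting names below are made up for illustration.
object ConfFormatExample {
  def main(args: Array[String]): Unit = {
    val lines = Seq(
      "# sample configuration",
      "datax.job.name=${JobName}",
      "datax.job.input.eventhub=myhub"
    )
    val tokens = Map("JobName" -> "sample-job")
    val parsed = lines
      .filter(l => l != null && l.trim.nonEmpty && !l.trim.startsWith("#"))
      .map { l =>
        val pos = l.indexOf("=")
        l.substring(0, pos).trim -> l.substring(pos + 1).trim
      }
      .map { case (k, v) =>
        k -> tokens.foldLeft(v)((r, kv) => r.replaceAllLiterally("${" + kv._1 + "}", kv._2))
      }
      .toMap
    println(parsed) // Map(datax.job.name -> sample-job, datax.job.input.eventhub -> myhub)
  }
}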
|
|
@ -0,0 +1,36 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.data
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import datax.utility.DateTimeUtil
|
||||
import org.apache.spark.SparkEnv
|
||||
import org.apache.spark.sql.Row
|
||||
import org.apache.spark.sql.functions.udf
|
||||
|
||||
case class FileInternal(inputPath: String = null,
|
||||
outputFolders: scala.collection.Map[String, String] = null,
|
||||
outputFileName: String = null,
|
||||
fileTime: Timestamp = null,
|
||||
ruleIndexPrefix: String = null,
|
||||
target: String = null
|
||||
)
|
||||
|
||||
object FileInternal {
|
||||
def appendEmptyInternalInfo(json: String) = {
|
||||
(FileInternal(), json)
|
||||
}
|
||||
|
||||
def udfEmptyInternalInfo = udf(()=>FileInternal())
|
||||
|
||||
val getInfoInputPath = (rowInfo: Row) => rowInfo.getString(0)
|
||||
val getInfoOutputFolder = (rowInfo: Row, group: String) => if(rowInfo.isNullAt(1)) null else rowInfo.getMap[String, String](1).getOrElse(group, null)
|
||||
val getInfoOutputFileName = (rowInfo: Row) => rowInfo.getString(2)
|
||||
val getInfoFileTimeString = (rowInfo: Row) => if(rowInfo.isNullAt(3)) null else rowInfo.getTimestamp(3).toString
|
||||
val getInfoRuleIndexPrefix = (rowInfo: Row) => rowInfo.getString(4)
|
||||
val getInfoTargetTag = (rowInfo: Row) => rowInfo.getString(5)
|
||||
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.data
|
||||
|
||||
case class ProcessResult(blobsCount: Long, eventsCount: Long)
|
||||
|
|
@ -0,0 +1,778 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.fs
|
||||
|
||||
import java.io._
|
||||
import java.net.URI
|
||||
import java.nio.channels.FileChannel
|
||||
import java.nio.file.Files
|
||||
import java.util.concurrent.{Executors, TimeUnit}
|
||||
import java.util.zip.GZIPInputStream
|
||||
|
||||
import com.google.common.io.{Files => GFiles}
|
||||
import datax.config.SparkEnvVariables
|
||||
import datax.constants.ProductConstant
|
||||
import datax.exception.EngineException
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.telemetry.AppInsightLogger
|
||||
import org.apache.commons.codec.digest.DigestUtils
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path, RemoteIterator}
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
import scala.language.implicitConversions
|
||||
import scala.collection.mutable
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.concurrent.duration.Duration
|
||||
import scala.concurrent.{Await, ExecutionContext, Future, TimeoutException}
|
||||
import scala.io.Source
|
||||
|
||||
object HadoopClient {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
private val threadPool = Executors.newFixedThreadPool(5)
|
||||
implicit private val ec = ExecutionContext.fromExecutorService(threadPool)
|
||||
|
||||
private var hadoopConf:Configuration = null
|
||||
|
||||
/***
|
||||
* initialize the cached hadoop configuration
|
||||
* @param conf hadoop configuration for initialization
|
||||
*/
|
||||
def setConf(conf: Configuration = null): Unit ={
|
||||
if(conf==null)
|
||||
hadoopConf = new Configuration()
|
||||
else
|
||||
hadoopConf = conf
|
||||
}
|
||||
|
||||
/***
|
||||
* get the cached hadoop configuration
|
||||
* @return the cached hadoop configuration
|
||||
*/
|
||||
def getConf() = {
|
||||
if(hadoopConf==null)
|
||||
this.synchronized{
|
||||
if(hadoopConf==null)
|
||||
setConf()
|
||||
}
|
||||
|
||||
hadoopConf
|
||||
}
|
||||
|
||||
/***
|
||||
* get the name of storage account from a wasb-format path
|
||||
* @param path an HDFS path
|
||||
* @return the storage account name if the wasb/wasbs path contains one, else null
|
||||
*/
|
||||
private def getWasbStorageAccount(path: String): String = {
|
||||
val uri = new URI(path.replace(" ", "%20"))
|
||||
val scheme = uri.getScheme
|
||||
if(scheme == "wasb" || scheme == "wasbs")
|
||||
Option(uri.getHost) match {
|
||||
case Some(host) => host.toLowerCase().replace(".blob.core.windows.net", "")
|
||||
case None => null
|
||||
}
|
||||
else
|
||||
null
|
||||
}
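// Hedged example (not part of the original commit): for a path such as
//   "wasbs://mycontainer@myaccount.blob.core.windows.net/data/2019/01/01/blob.json"
// getWasbStorageAccount returns "myaccount"; for non-wasb schemes (abfss, hdfs, file) it
// returns null. The account and container names above are placeholders.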
|
||||
|
||||
/***
|
||||
* get a distinct set of storage accounts from a list of file paths
|
||||
* @param paths a list of HDFS paths which might contain wasb/wasbs paths
|
||||
* @return a distinct set of names of storage accounts
|
||||
*/
|
||||
private def getWasbStorageAccounts(paths: Seq[String]): Set[String] = {
|
||||
paths.map(getWasbStorageAccount _).filter(_!=null).toSet
|
||||
}
|
||||
|
||||
/***
|
||||
* internal cache of storage keys against storage account names.
|
||||
*/
|
||||
private val storageAccountKeys = new mutable.HashMap[String, String]
|
||||
|
||||
/***
|
||||
* set key for storage account for azure-hadoop adapter to access that later
|
||||
* @param sa name of the storage account
|
||||
* @param key key to the storage account
|
||||
*/
|
||||
private def setStorageAccountKey(sa: String, key: String): Unit ={
|
||||
storageAccountKeys.synchronized{
|
||||
storageAccountKeys += sa->key
|
||||
}
|
||||
|
||||
// get the default storage account
|
||||
val defaultFS = getConf().get("fs.defaultFS","")
|
||||
// set the key only if it's a non-default storage account
|
||||
if(!defaultFS.toLowerCase().contains(s"$sa.blob.core.windows.net")) {
|
||||
logger.warn(s"Setting the key in hdfs conf for storage account $sa")
|
||||
getConf().set(s"fs.azure.account.key.$sa.blob.core.windows.net", key)
|
||||
}
|
||||
else {
|
||||
logger.warn(s"Default storage account $sa found, skipping setting the key")
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
* resolve key for storage account with a keyvault name
|
||||
* warn if the key is not found, but continue so that static key settings configured outside the job can still work
|
||||
* @param vaultName key vault name to get the key of storage account
|
||||
* @param sa name of the storage account
|
||||
*/
|
||||
private def resolveStorageAccount(vaultName: String, sa: String) = {
|
||||
val secretId = s"keyvault://$vaultName/${ProductConstant.ProductRoot}-sa-$sa"
|
||||
KeyVaultClient.getSecret(secretId) match {
|
||||
case Some(value)=>
|
||||
logger.warn(s"Retrieved key for storage account '$sa' with secretid:'$secretId'")
|
||||
setStorageAccountKey(sa, value)
|
||||
case None =>
|
||||
logger.warn(s"Failed to find key for storage account '$sa' with secretid:'$secretId'")
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* set key for storage account required by the specified hdfs path
|
||||
* @param path HDFS path whose storage account key is resolved if it is a valid wasb/wasbs path; do nothing otherwise
|
||||
*/
|
||||
private def resolveStorageAccountKeyForPath(path: String) = {
|
||||
val sa = getWasbStorageAccount(path)
|
||||
|
||||
if(sa != null && !sa.isEmpty){
|
||||
KeyVaultClient.withKeyVault {vaultName => resolveStorageAccount(vaultName, sa)}
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* resolve key for storage accounts required by the specified hdfs paths
|
||||
* @param paths a list of hdfs paths, do nothing if there isn't any valid wasb/wasbs paths
|
||||
*/
|
||||
private def resolveStorageAccountKeysForPaths(paths: Seq[String]) = {
|
||||
val storageAccounts = getWasbStorageAccounts(paths)
|
||||
.filter(p=>p!=null & !p.isEmpty)
|
||||
.filterNot(storageAccountKeys.contains(_)) //TODO: make storageAccountKeys thread-safe
|
||||
|
||||
if(!storageAccounts.isEmpty){
|
||||
KeyVaultClient.withKeyVault {vaultName => storageAccounts.foreach(sa=>resolveStorageAccount(vaultName, sa))}
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* export storage account keys to an immutable dictionary for serialization
|
||||
* @param paths hdfs paths to determine the storage accounts we need
|
||||
* @return storage accounts and corresponding keys resolved from the input hdfs paths
|
||||
*/
|
||||
private def exportWasbKeys(paths: Seq[String]): Map[String, String] = {
|
||||
//TODO: make storageAccountKeys thread-safe
|
||||
getWasbStorageAccounts(paths).map(sa => sa->storageAccountKeys.getOrElse(sa, null))
|
||||
.filter(_._2!=null)
|
||||
.toMap
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a Hadoop FileSystem with the scheme encoded in the given path.
|
||||
* @param path hdfs path to determine the file system from
|
||||
* @param conf hadoop configuration for the determination
|
||||
*/
|
||||
private def getHadoopFileSystem(path: URI, conf: Configuration): FileSystem = {
|
||||
FileSystem.get(path, conf)
|
||||
}
|
||||
|
||||
/***
|
||||
* read local file (non-hadoop) from disk if it exists
|
||||
* @param fileName path to the local file
|
||||
* @return content of the file if it exists, else null.
|
||||
*/
|
||||
def readLocalFileIfExists(fileName: String): String = {
|
||||
val file = new File(fileName)
|
||||
if(file.exists()){
|
||||
val openFile = Source.fromFile(file)
|
||||
val result = openFile.getLines().mkString
|
||||
openFile.close()
|
||||
result
|
||||
}
|
||||
else{
|
||||
null
|
||||
}
|
||||
}
|
||||
|
||||
private def readLocalFile(fileName: String): String = {
|
||||
val file = Source.fromFile(fileName)
|
||||
val result = file.getLines().mkString
|
||||
file.close()
|
||||
result
|
||||
}
|
||||
|
||||
def fileExists(hdfsPath: String): Boolean = {
|
||||
val path = new Path(hdfsPath)
|
||||
val fs = path.getFileSystem(getConf())
|
||||
fs.exists(path)
|
||||
}
|
||||
|
||||
/***
|
||||
* read a hdfs file
|
||||
* @param hdfsPath path to the hdfs file
|
||||
* @param gzip whether it is a gzipped file
|
||||
* @throws IOException if any
|
||||
* @return an iterable of strings with the content of the file
|
||||
*/
|
||||
@throws[IOException]
|
||||
def readHdfsFile(hdfsPath: String, gzip:Boolean=false): Iterable[String] = {
|
||||
val logger = LogManager.getLogger(s"FileLoader${SparkEnvVariables.getLoggerSuffix()}")
|
||||
|
||||
// resolve key to access azure storage account
|
||||
resolveStorageAccountKeyForPath(hdfsPath)
|
||||
|
||||
val lines = new ListBuffer[String]
|
||||
val t1= System.nanoTime()
|
||||
logger.info(s"Loading '$hdfsPath'")
|
||||
|
||||
try{
|
||||
val path = new Path(hdfsPath)
|
||||
val fs = path.getFileSystem(getConf())
|
||||
val is = fs.open(path)
|
||||
|
||||
//val source = Source.fromInputStream(is)
|
||||
val inputStream = if(gzip)new GZIPInputStream(is) else is
|
||||
val reader = new BufferedReader(new InputStreamReader(inputStream))
|
||||
|
||||
try{
|
||||
//source.getLines().toList
|
||||
var line = reader.readLine()
|
||||
while(line!=null){
|
||||
lines += line
|
||||
line = reader.readLine()
|
||||
}
|
||||
}
|
||||
finally {
|
||||
reader.close()
|
||||
}
|
||||
}
|
||||
catch {
|
||||
case e: Exception =>{
|
||||
logger.error(s"Error in reading '$hdfsPath'", e)
|
||||
AppInsightLogger.trackException(e, Map(
|
||||
"errorLocation" -> "readHdfsFile",
|
||||
"errorMessage" -> "Error in reading file",
|
||||
"failedHdfsPath" -> hdfsPath
|
||||
), null)
|
||||
|
||||
throw e
|
||||
}
|
||||
}
|
||||
|
||||
val elapsedTime = (System.nanoTime()-t1)/1E9
|
||||
logger.info(s"Done loading '$hdfsPath', count: ${lines.size}, elapsed time: $elapsedTime seconds")
|
||||
|
||||
//TODO: return an iterator instead of the entire list to reduce memory consumption; may also help optimize job performance
|
||||
lines
|
||||
}
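// Hedged usage note (not part of the original commit); the path below is a placeholder:
//   val lines = HadoopClient.readHdfsFile("wasbs://container@account.blob.core.windows.net/conf/job.conf")
// The storage account key is resolved through Key Vault before the file is opened, and the
// whole file is materialized in memory (see the TODO above about returning an iterator).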
|
||||
|
||||
/**
|
||||
* write string content to a specified hdfs path
|
||||
* @param hdfsPath path to the specified hdfs file
|
||||
* @param content string content to write into the file
|
||||
* @param overwriteIfExists flag to specify if the file needs to be overwritten if it already exists in hdfs
|
||||
* @throws IOException if any occurs in the write operation
|
||||
*/
|
||||
@throws[IOException]
|
||||
def writeHdfsFile(hdfsPath: String, content: String, overwriteIfExists:Boolean) {
|
||||
writeHdfsFile(hdfsPath, content.getBytes("UTF-8"), getConf(), overwriteIfExists)
|
||||
}
|
||||
|
||||
/**
|
||||
* generate a random file name
|
||||
* @return a random file name of 8 characters.
|
||||
*/
|
||||
private def randomFileName():String = {
|
||||
java.util.UUID.randomUUID().toString.substring(0, 8)
|
||||
}
|
||||
|
||||
/**
|
||||
* generate a deterministic prefix for a temp file name, derived from hashing the seed
|
||||
* @param seed seed for the randomization of names
|
||||
* @return an 8-character string for prefixing file names
|
||||
*/
|
||||
def tempFilePrefix(seed: String): String = {
|
||||
DigestUtils.sha256Hex(seed).substring(0, 8)
|
||||
}
|
||||
|
||||
/**
|
||||
* write to a specified hdfs file with retries
|
||||
* @param hdfsPath the specified hdfs file
|
||||
* @param content content to write into the file
|
||||
* @param timeout timeout duration for the write operation, by default 5 seconds
|
||||
* @param retries number of retries, 0 by default meaning no retries.
|
||||
*/
|
||||
def writeWithTimeoutAndRetries(hdfsPath: String,
|
||||
content: Array[Byte],
|
||||
timeout: Duration = Duration(5, TimeUnit.SECONDS),
|
||||
retries: Int = 0
|
||||
) = {
|
||||
val logger = LogManager.getLogger(s"FileWriter${SparkEnvVariables.getLoggerSuffix()}")
|
||||
def f = Future{
|
||||
writeHdfsFile(hdfsPath, content, getConf(), false)
|
||||
}
|
||||
var remainingAttempts = retries+1
|
||||
while(remainingAttempts>0) {
|
||||
try {
|
||||
remainingAttempts -= 1
|
||||
logger.info(s"writing to $hdfsPath with remaining attempts: $remainingAttempts")
|
||||
Await.result(f, timeout)
|
||||
remainingAttempts = 0
|
||||
}
|
||||
catch {
|
||||
case e: TimeoutException =>
|
||||
// Only rethrow when no attempts remain; otherwise log and retry on the next loop iteration.
if (remainingAttempts <= 0)
throw e
else
logger.warn(s"Timed out writing to $hdfsPath; retrying ($remainingAttempts attempts left)", e)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* make sure parent folder exists for path, create the folder if it doesn't exist
|
||||
* @param path specified path to check its parent folder
|
||||
*/
|
||||
def ensureParentFolderExists(path: String): Unit = {
|
||||
val file = new Path(path)
|
||||
val folder = file.getParent
|
||||
val fs = folder.getFileSystem(getConf())
|
||||
if(!fs.exists(folder)){
|
||||
fs.mkdirs(folder)
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* write content to a hdfs file
|
||||
* @param hdfsPath path to the specified hdfs file
|
||||
* @param content content to write into the file
|
||||
* @param conf hadoop configuration
|
||||
* @param overwriteIfExists flag to specify if the file needs to be overwritten if it already exists in hdfs
|
||||
* @throws IOException if any from lower file system operation
|
||||
*/
|
||||
@throws[IOException]
|
||||
def writeHdfsFile(hdfsPath: String, content: Array[Byte], conf: Configuration, overwriteIfExists:Boolean) {
|
||||
resolveStorageAccountKeyForPath(hdfsPath)
|
||||
|
||||
val logger = LogManager.getLogger("writeHdfsFile")
|
||||
|
||||
val path = new Path(hdfsPath)
|
||||
val uri = path.toUri
|
||||
val fsy = path.getFileSystem(conf)
|
||||
|
||||
// If output file already exists and overwrite flag is not set, bail out
|
||||
if(fsy.exists(path) && !overwriteIfExists){
|
||||
logger.warn(s"Output file ${path} already exists and overwrite flag ${overwriteIfExists}. Skipping writing again .")
|
||||
return
|
||||
}
|
||||
|
||||
val tempHdfsPath = new URI(uri.getScheme, uri.getAuthority, "/_$tmpHdfsFolder$/"+tempFilePrefix(hdfsPath) + "-" + path.getName, null, null)
|
||||
//val pos = hdfsPath.lastIndexOf('/')
|
||||
//val tempHdfsPath = hdfsPath.patch(pos, "/_temporary", 0)
|
||||
// TODO: create unique name for each temp file.
|
||||
val tempPath = new Path(tempHdfsPath)
|
||||
val fs = path.getFileSystem(conf)
|
||||
val bs = new BufferedOutputStream(fs.create(tempPath, true))
|
||||
bs.write(content)
|
||||
bs.close()
|
||||
|
||||
// If output file already exists and overwrite flag is set, delete old file and then rewrite new file
|
||||
if(fs.exists(path) && overwriteIfExists){
|
||||
logger.warn(s"Output file ${path} already exists and overwrite flag ${overwriteIfExists}. Deleting it.")
|
||||
fs.delete(path, true)
|
||||
}
|
||||
|
||||
if(!fs.rename(tempPath, path)) {
|
||||
// Rename failed, check if it was due to destination path already exists.
|
||||
// If yes, fail only if overwrite is set. If destination does not exist, then fail as-well.
|
||||
val fileExists = fs.exists(path)
|
||||
|
||||
if (!fileExists || (fileExists && overwriteIfExists)) {
|
||||
val parent = path.getParent
|
||||
val msg = if(fs.exists(parent)) s"Move ${tempPath} to ${path} did not succeed"
|
||||
else s"Move ${tempPath} to ${path} did not succeed since parent folder does not exist!"
|
||||
throw new IOException(msg)
|
||||
}
|
||||
else {
|
||||
logger.warn(s"Blob rename from ${tempPath} to ${path} failed, but moving on since target already exists and overwrite is set to false.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* create a folder at the specified path
|
||||
* @param folderPath path to create the folder
|
||||
*/
|
||||
def createFolder(folderPath: String): Unit ={
|
||||
resolveStorageAccountKeyForPath(folderPath)
|
||||
val path = new Path(folderPath)
|
||||
val fs = path.getFileSystem(getConf())
|
||||
fs.mkdirs(path)
|
||||
}
|
||||
|
||||
/**
|
||||
* implicitly convert a RemoteIterator to a Scala Iterator
|
||||
* @param underlying the underlying RemoteIterator instance
|
||||
* @tparam T type of the element in Iterator
|
||||
* @return an Iterator instance
|
||||
*/
|
||||
implicit def convertToScalaIterator[T](underlying: RemoteIterator[T]): Iterator[T] = {
|
||||
case class wrapper(underlying: RemoteIterator[T]) extends Iterator[T] {
|
||||
override def hasNext = underlying.hasNext
|
||||
|
||||
override def next = underlying.next
|
||||
}
|
||||
wrapper(underlying)
|
||||
}
|
||||
|
||||
/**
|
||||
* list files under a folder
|
||||
* @param folder path to the specified folder
|
||||
* @return a list of file paths under the folder
|
||||
*/
|
||||
def listFiles(folder: String): Iterator[String] = {
|
||||
val path = new Path(folder)
|
||||
val fs = path.getFileSystem(getConf)
|
||||
|
||||
if(fs.exists(path))
|
||||
fs.listFiles(path, true).map(f=>f.getPath.toString)
|
||||
else
|
||||
Iterator.empty
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
/**
|
||||
* Execute a block of code, then a finally block, but if exceptions happen in
|
||||
* the finally block, do not suppress the original exception.
|
||||
*
|
||||
* This is primarily an issue with `finally { out.close() }` blocks, where
|
||||
* close needs to be called to clean up `out`, but if an exception happened
|
||||
* in `out.write`, it's likely `out` may be corrupted and `out.close` will
|
||||
* fail as well. This would then suppress the original/likely more meaningful
|
||||
* exception from the original `out.write` call.
|
||||
*/
|
||||
def tryWithSafeFinally[T](block: => T)(finallyBlock: => Unit): T = {
|
||||
var originalThrowable: Throwable = null
|
||||
try {
|
||||
block
|
||||
} catch {
|
||||
case t: Throwable =>
|
||||
// Purposefully not using NonFatal, because even fatal exceptions
|
||||
// we don't want to have our finallyBlock suppress
|
||||
originalThrowable = t
|
||||
throw originalThrowable
|
||||
} finally {
|
||||
try {
|
||||
finallyBlock
|
||||
} catch {
|
||||
case t: Throwable if (originalThrowable != null && originalThrowable != t) =>
|
||||
originalThrowable.addSuppressed(t)
|
||||
val logger = LogManager.getLogger("TryWithSafe")
|
||||
logger.warn(s"Suppressing exception in finally: ${t.getMessage}", t)
|
||||
throw originalThrowable
|
||||
}
|
||||
}
|
||||
}
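// Hedged usage sketch (not part of the original commit), mirroring how copyStream below
// uses this helper: an exception thrown by the finally block is added as suppressed to the
// original exception instead of replacing it.
//   val out = new FileOutputStream("/tmp/example.bin") // placeholder path
//   tryWithSafeFinally {
//     out.write(Array[Byte](1, 2, 3))
//   } {
//     out.close()
//   }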
|
||||
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
def copyFileStreamNIO(
|
||||
input: FileChannel,
|
||||
output: FileChannel,
|
||||
startPosition: Long,
|
||||
bytesToCopy: Long): Unit = {
|
||||
val initialPos = output.position()
|
||||
var count = 0L
|
||||
// In case transferTo method transferred less data than we have required.
|
||||
while (count < bytesToCopy) {
|
||||
count += input.transferTo(count + startPosition, bytesToCopy - count, output)
|
||||
}
|
||||
assert(count == bytesToCopy,
|
||||
s"request to copy $bytesToCopy bytes, but actually copied $count bytes.")
|
||||
|
||||
// Check the position after transferTo loop to see if it is in the right position and
|
||||
// give user information if not.
|
||||
// Position will not be increased to the expected length after calling transferTo in
|
||||
// kernel version 2.6.32, this issue can be seen in
|
||||
// https://bugs.openjdk.java.net/browse/JDK-7052359
|
||||
// This will lead to stream corruption issue when using sort-based shuffle (SPARK-3948).
|
||||
val finalPos = output.position()
|
||||
val expectedPos = initialPos + bytesToCopy
|
||||
assert(finalPos == expectedPos,
|
||||
s"""
|
||||
|Current position $finalPos do not equal to expected position $expectedPos
|
||||
|after transferTo, please check your kernel version to see if it is 2.6.32,
|
||||
|this is a kernel bug which will lead to unexpected behavior when using transferTo.
|
||||
|You can set spark.file.transferTo = false to disable this NIO feature.
|
||||
""".stripMargin)
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
/**
|
||||
* Copy all data from an InputStream to an OutputStream. NIO way of file stream to file stream
|
||||
* copying is disabled by default unless explicitly set transferToEnabled as true,
|
||||
* the parameter transferToEnabled should be configured by spark.file.transferTo = [true|false].
|
||||
*/
|
||||
def copyStream(
|
||||
in: InputStream,
|
||||
out: OutputStream,
|
||||
closeStreams: Boolean = false,
|
||||
transferToEnabled: Boolean = false): Long = {
|
||||
tryWithSafeFinally {
|
||||
if (in.isInstanceOf[FileInputStream] && out.isInstanceOf[FileOutputStream]
|
||||
&& transferToEnabled) {
|
||||
// When both streams are File stream, use transferTo to improve copy performance.
|
||||
val inChannel = in.asInstanceOf[FileInputStream].getChannel()
|
||||
val outChannel = out.asInstanceOf[FileOutputStream].getChannel()
|
||||
val size = inChannel.size()
|
||||
copyFileStreamNIO(inChannel, outChannel, 0, size)
|
||||
size
|
||||
} else {
|
||||
var count = 0L
|
||||
val buf = new Array[Byte](8192)
|
||||
var n = 0
|
||||
while (n != -1) {
|
||||
n = in.read(buf)
|
||||
if (n != -1) {
|
||||
out.write(buf, 0, n)
|
||||
count += n
|
||||
}
|
||||
}
|
||||
count
|
||||
}
|
||||
} {
|
||||
if (closeStreams) {
|
||||
try {
|
||||
in.close()
|
||||
} finally {
|
||||
out.close()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
/**
|
||||
* Copy `sourceFile` to `destFile`.
|
||||
*
|
||||
* If `destFile` already exists:
|
||||
* - no-op if its contents equal those of `sourceFile`,
|
||||
* - throw an exception if `fileOverwrite` is false,
|
||||
* - attempt to overwrite it otherwise.
|
||||
*
|
||||
* @param url URL that `sourceFile` originated from, for logging purposes.
|
||||
* @param sourceFile File path to copy/move from.
|
||||
* @param destFile File path to copy/move to.
|
||||
* @param fileOverwrite Whether to delete/overwrite an existing `destFile` that does not match
|
||||
* `sourceFile`
|
||||
* @param removeSourceFile Whether to remove `sourceFile` after / as part of moving/copying it to
|
||||
* `destFile`.
|
||||
*/
|
||||
private def copyFile(
|
||||
url: String,
|
||||
sourceFile: File,
|
||||
destFile: File,
|
||||
fileOverwrite: Boolean,
|
||||
removeSourceFile: Boolean = false): Unit = {
|
||||
|
||||
val logger = LogManager.getLogger("CopyFile")
|
||||
if (destFile.exists) {
|
||||
if (!filesEqualRecursive(sourceFile, destFile)) {
|
||||
if (fileOverwrite) {
|
||||
logger.info(
|
||||
s"File $destFile exists and does not match contents of $url, replacing it with $url"
|
||||
)
|
||||
if (!destFile.delete()) {
|
||||
throw new EngineException(
|
||||
"Failed to delete %s while attempting to overwrite it with %s".format(
|
||||
destFile.getAbsolutePath,
|
||||
sourceFile.getAbsolutePath
|
||||
)
|
||||
)
|
||||
}
|
||||
} else {
|
||||
throw new EngineException(
|
||||
s"File $destFile exists and does not match contents of $url")
|
||||
}
|
||||
} else {
|
||||
// Do nothing if the file contents are the same, i.e. this file has been copied
|
||||
// previously.
|
||||
logger.info(
|
||||
"%s has been previously copied to %s".format(
|
||||
sourceFile.getAbsolutePath,
|
||||
destFile.getAbsolutePath
|
||||
)
|
||||
)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// The file does not exist in the target directory. Copy or move it there.
|
||||
if (removeSourceFile) {
|
||||
Files.move(sourceFile.toPath, destFile.toPath)
|
||||
} else {
|
||||
logger.info(s"Copying ${sourceFile.getAbsolutePath} to ${destFile.getAbsolutePath}")
|
||||
copyRecursive(sourceFile, destFile)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* A copy of the Apache License can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
private def filesEqualRecursive(file1: File, file2: File): Boolean = {
|
||||
if (file1.isDirectory && file2.isDirectory) {
|
||||
val subfiles1 = file1.listFiles()
|
||||
val subfiles2 = file2.listFiles()
|
||||
if (subfiles1.size != subfiles2.size) {
|
||||
return false
|
||||
}
|
||||
subfiles1.sortBy(_.getName).zip(subfiles2.sortBy(_.getName)).forall {
|
||||
case (f1, f2) => filesEqualRecursive(f1, f2)
|
||||
}
|
||||
} else if (file1.isFile && file2.isFile) {
|
||||
GFiles.equal(file1, file2)
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* A copy of the Apache License can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
private def copyRecursive(source: File, dest: File): Unit = {
|
||||
if (source.isDirectory) {
|
||||
if (!dest.mkdir()) {
|
||||
throw new IOException(s"Failed to create directory ${dest.getPath}")
|
||||
}
|
||||
val subfiles = source.listFiles()
|
||||
subfiles.foreach(f => copyRecursive(f, new File(dest, f.getName)))
|
||||
} else {
|
||||
Files.copy(source.toPath, dest.toPath)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* A copy of the Apache License can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
/**
|
||||
* Download `in` to `tempFile`, then move it to `destFile`.
|
||||
*
|
||||
* If `destFile` already exists:
|
||||
* - no-op if its contents equal those of `sourceFile`,
|
||||
* - throw an exception if `fileOverwrite` is false,
|
||||
* - attempt to overwrite it otherwise.
|
||||
*
|
||||
* @param url URL that `sourceFile` originated from, for logging purposes.
|
||||
* @param in InputStream to download.
|
||||
* @param destFile File path to move `tempFile` to.
|
||||
* @param fileOverwrite Whether to delete/overwrite an existing `destFile` that does not match
|
||||
* `sourceFile`
|
||||
*/
|
||||
private def downloadFile(
|
||||
url: String,
|
||||
in: InputStream,
|
||||
destFile: File,
|
||||
fileOverwrite: Boolean): Unit = {
|
||||
val logger = LogManager.getLogger("DownloadFile")
|
||||
val tempFile = File.createTempFile("fetchFileTemp", null,
|
||||
new File(destFile.getParentFile.getAbsolutePath))
|
||||
logger.info(s"Fetching $url to $tempFile")
|
||||
|
||||
try {
|
||||
val out = new FileOutputStream(tempFile)
|
||||
copyStream(in, out, closeStreams = true)
|
||||
copyFile(url, tempFile, destFile, fileOverwrite, removeSourceFile = true)
|
||||
} finally {
|
||||
// Catch-all for the couple of cases where for some reason we didn't move `tempFile` to
|
||||
// `destFile`.
|
||||
if (tempFile.exists()) {
|
||||
tempFile.delete()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* A copy of the Apache License can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
def fetchHdfsFile(path: Path,
|
||||
targetDir: java.io.File,
|
||||
fs: FileSystem,
|
||||
hadoopConf: Configuration,
|
||||
fileOverwrite: Boolean,
|
||||
filename: Option[String] = None): Unit = {
|
||||
if (!targetDir.exists() && !targetDir.mkdir()) {
|
||||
throw new IOException(s"Failed to create directory ${targetDir.getPath}")
|
||||
}
|
||||
val dest = new File(targetDir, filename.getOrElse(path.getName))
|
||||
if (fs.isFile(path)) {
|
||||
val in = fs.open(path)
|
||||
try {
|
||||
downloadFile(path.toString, in, dest, fileOverwrite)
|
||||
} finally {
|
||||
in.close()
|
||||
}
|
||||
} else {
|
||||
fs.listStatus(path).foreach { fileStatus =>
|
||||
fetchHdfsFile(fileStatus.getPath(), dest, fs, hadoopConf, fileOverwrite)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* This function is a modified version of Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
|
||||
* A copy of the Apache License can be obtained from http://www.apache.org/licenses/LICENSE-2.0
|
||||
* */
|
||||
/**
|
||||
* Download a file or directory to target directory. Supports fetching the file in a variety of
|
||||
* ways, including HTTP, Hadoop-compatible filesystems, and files on a standard filesystem, based
|
||||
* on the URL parameter. Fetching directories is only supported from Hadoop-compatible
|
||||
* filesystems.
|
||||
* The 'resolveStorageKey' param controls whether to retrieve the storage key from Key Vault.
* Throws EngineException if the target file already exists and has different contents than
* the requested file.
|
||||
*/
|
||||
def fetchFile(url: String,
|
||||
targetDir: java.io.File,
|
||||
filename: String, resolveStorageKey:Boolean=true): java.io.File = {
|
||||
val targetFile = new File(targetDir, filename)
|
||||
val uri = new URI(url)
|
||||
val fileOverwrite = false
|
||||
Option(uri.getScheme).getOrElse("file") match {
|
||||
case "file" =>
|
||||
// In the case of a local file, copy the local file to the target directory.
|
||||
// Note the difference between uri vs url.
|
||||
val sourceFile = if (uri.isAbsolute) new File(uri) else new File(url)
|
||||
copyFile(url, sourceFile, targetFile, fileOverwrite)
|
||||
case "wasb" | "wasbs" =>
|
||||
if(resolveStorageKey) {
|
||||
resolveStorageAccountKeyForPath(url)
|
||||
}
|
||||
val conf = getConf()
|
||||
val path = new Path(uri)
|
||||
val fs = path.getFileSystem(conf)
|
||||
fetchHdfsFile(path, targetDir, fs, conf, fileOverwrite, filename = Some(filename))
|
||||
case other =>
|
||||
throw new EngineException(s"unsupported file paths with '$other' scheme")
|
||||
}
|
||||
|
||||
targetFile
|
||||
}
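// Usage sketch (illustrative only; the wasbs URL and local directory are hypothetical):
//   val localJar = fetchFile(
//     url = "wasbs://jars@myaccount.blob.core.windows.net/udfs/custom.jar",
//     targetDir = new java.io.File("/tmp/datax-files"),
//     filename = "custom.jar")
// The returned java.io.File points at targetDir/custom.jar once the copy completes.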
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.exception.EngineException
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.utility.AzureFunctionCaller
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
object AzureFunctionHandler {
|
||||
case class AzureFunctionConf(name: String, serviceEndpoint: String, api: String, code: String, params:Array[String])
|
||||
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingAzureFunction = "azurefunction"
|
||||
val SettingAzureFunctionServiceEndpoint = "serviceendpoint"
|
||||
val SettingAzureFunctionApi = "api"
|
||||
val SettingAzureFunctionCode = "code"
|
||||
val SettingAzureFunctionParams = "params"
|
||||
|
||||
private def buildAzureFunctionConf(dict: SettingDictionary, name: String): AzureFunctionConf = {
|
||||
AzureFunctionConf(
|
||||
name = name,
|
||||
serviceEndpoint = dict.getOrNull(SettingAzureFunctionServiceEndpoint),
|
||||
api = dict.getOrNull(SettingAzureFunctionApi),
|
||||
code = KeyVaultClient.resolveSecretIfAny(dict.getOrNull(SettingAzureFunctionCode)),
|
||||
params = dict.getStringSeqOption(SettingAzureFunctionParams).map(_.toArray).orNull
|
||||
)
|
||||
}
|
||||
|
||||
private def buildAzureFunctionConfArray(dict: SettingDictionary, prefix: String): List[AzureFunctionConf] = {
|
||||
dict.buildConfigIterable(buildAzureFunctionConf, prefix).toList
|
||||
}
|
||||
|
||||
def initialize(spark: SparkSession, dict: SettingDictionary) = {
|
||||
val azFuncs = buildAzureFunctionConfArray(dict, SettingNamespace.JobProcessPrefix+SettingAzureFunction+SettingNamespace.Seperator)
|
||||
for (azFunc <- azFuncs) {
|
||||
val azFuncAccessCode = KeyVaultClient.getSecretOrThrow(azFunc.code)
|
||||
azFunc.params.length match {
|
||||
case 0 => spark.udf.register(azFunc.name, () => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, null))
|
||||
case 1 => spark.udf.register(azFunc.name, (s:String) => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, Map(
|
||||
azFunc.params(0)->s
|
||||
)))
|
||||
case 2 => spark.udf.register(azFunc.name,(s1:String, s2: String) => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, Map(
|
||||
azFunc.params(0)->s1,
|
||||
azFunc.params(1)->s2
|
||||
)))
|
||||
case 3 => spark.udf.register(azFunc.name,(s1:String, s2: String, s3: String) => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, Map(
|
||||
azFunc.params(0)->s1,
|
||||
azFunc.params(1)->s2,
|
||||
azFunc.params(2)->s3
|
||||
)))
|
||||
// TODO: Add support for more than 3 input parameters for AzureFunction
|
||||
case _=> throw new EngineException("AzureFunction with more than 3 input parameters are currently not supported. Please contact datax dev team for adding support if needed.")
|
||||
}
|
||||
}
|
||||
}
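// Configuration sketch (illustrative only; the function name, endpoint, api, and parameter
// names are hypothetical, and the exact key layout depends on SettingDictionary.buildConfigIterable).
// Settings shaped like
//   azurefunction.geoLookup.serviceendpoint = https://myfuncs.azurewebsites.net
//   azurefunction.geoLookup.api = api/GeoLookup
//   azurefunction.geoLookup.code = <access code or Key Vault reference>
//   azurefunction.geoLookup.params = ipAddress
// under the job-process namespace prefix register a one-argument UDF, callable from the
// transform SQL as geoLookup(ipAddress).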
|
||||
}
|
|
@ -0,0 +1,18 @@
|
|||
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler

import datax.config.SettingDictionary
import datax.utility.{AzureFunctionCaller, ConcurrentDateFormat}
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession

object BuiltInFunctionsHandler {
  val logger = LogManager.getLogger(this.getClass)

  def initialize(spark: SparkSession, dict: SettingDictionary) = {
    spark.udf.register("stringToTimestamp", ConcurrentDateFormat.stringToTimestamp _)
  }
}
|
|
@ -0,0 +1,101 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.classloader.ClassLoaderHost
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.exception.EngineException
|
||||
import datax.extension.DynamicUDF._
|
||||
import org.apache.spark.sql.SparkSession
|
||||
import org.apache.spark.sql.catalyst.ScalaReflection
|
||||
import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF}
|
||||
import org.apache.spark.sql.types.DataType
|
||||
|
||||
object ExtendedUDFHandler {
|
||||
val NamespacePrefix = SettingNamespace.JobProcessPrefix + "udf."
|
||||
|
||||
def getUdfClasses(dict: SettingDictionary) = {
|
||||
dict.getSubDictionary(NamespacePrefix)
|
||||
}
|
||||
|
||||
def initialize(spark: SparkSession, dict: SettingDictionary) = {
|
||||
getUdfClasses(dict).getDictMap().par.map{ case(k, v)=>{
|
||||
k-> registerUdf(k, v, spark, dict)
|
||||
}}
|
||||
}
|
||||
|
||||
val ClassNamePrefix = classOf[Generator0[_]].getCanonicalName.dropRight(1)
|
||||
|
||||
val mirror = scala.reflect.runtime.universe.runtimeMirror(ClassLoaderHost.derivedClassLoader)
|
||||
|
||||
def getUdfBaseClassNames(className: String) = {
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
val ts = mirror.classSymbol(clazz).typeSignature
|
||||
ts.baseClasses.map(s=>s.fullName)
|
||||
}
|
||||
|
||||
def registerUdf(name: String, className: String, spark: SparkSession, dict: SettingDictionary) = {
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
val ts = mirror.classSymbol(clazz).typeSignature
|
||||
val udfInterfaces = ts.baseClasses.filter(c=>c.fullName.startsWith(ClassNamePrefix))
|
||||
|
||||
if (udfInterfaces.length == 0) {
|
||||
throw new EngineException(s"UDF class $className doesn't implement any UDF interface")
|
||||
} else if (udfInterfaces.length > 1) {
|
||||
throw new EngineException(s"It is invalid to implement multiple UDF interfaces, UDF class $className")
|
||||
} else {
|
||||
val udfInterface = udfInterfaces(0)
|
||||
val typeArgs = ts.baseType(udfInterface).typeArgs
|
||||
val returnType = ScalaReflection.schemaFor(typeArgs.last).dataType
|
||||
val udf = clazz.newInstance()
|
||||
val argumentCount = typeArgs.length - 1
|
||||
val wrap = generateFunctionRef(udf, argumentCount, spark, dict)
|
||||
registerFunction(spark, name, wrap.func, returnType, argumentCount)
|
||||
wrap.onInterval
|
||||
}
|
||||
}
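// Configuration sketch (illustrative only; the UDF name and class are hypothetical). A setting
// such as
//   udf.myStringLength = com.example.StringLengthUdf
// under the job-process namespace prefix causes a class implementing Generator1[String, Int]
// to be registered here as a one-argument SQL UDF named "myStringLength", with the return
// type inferred from the last type argument.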
|
||||
|
||||
case class UdfWrap(func: AnyRef, onInterval: IntervalUpdateHandler)
|
||||
|
||||
def generateFunctionRef(udf: Any, argumentCount: Int, spark: SparkSession, dict: SettingDictionary):UdfWrap = {
|
||||
argumentCount match {
|
||||
case 0=> initializeUdf0(udf, spark, dict)
|
||||
case 1=> initializeUdf1(udf, spark, dict)
|
||||
case 2=> initializeUdf2(udf, spark, dict)
|
||||
case 3=> initializeUdf3(udf, spark, dict)
|
||||
case _=> throw new EngineException(s"UDF with $argumentCount arguments is not supported yet.")
|
||||
}
|
||||
}
|
||||
|
||||
def initializeUdf0(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
|
||||
val obj = udf.asInstanceOf[Generator0[Any]].initialize(spark, dict)
|
||||
UdfWrap(obj.func, obj.onInterval)
|
||||
}
|
||||
|
||||
def initializeUdf1(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
|
||||
val obj = udf.asInstanceOf[Generator1[Any, Any]].initialize(spark, dict)
|
||||
UdfWrap(obj.func.apply(_:Any), obj.onInterval)
|
||||
}
|
||||
|
||||
def initializeUdf2(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
|
||||
val obj = udf.asInstanceOf[Generator2[Any, Any, Any]].initialize(spark, dict)
|
||||
UdfWrap(obj.func.apply(_:Any, _:Any), obj.onInterval)
|
||||
}
|
||||
|
||||
def initializeUdf3(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
|
||||
val obj = udf.asInstanceOf[Generator3[Any, Any, Any, Any]].initialize(spark, dict)
|
||||
UdfWrap(obj.func.apply(_:Any, _:Any, _:Any), obj.onInterval)
|
||||
}
|
||||
|
||||
def registerFunction(spark:SparkSession, name: String, func: AnyRef, returnType: DataType, argumentCount: Int) = {
|
||||
def builder(e: Seq[Expression]) = if (e.length == argumentCount) {
|
||||
ScalaUDF(func, returnType, e, udfName = Some(name))
|
||||
} else {
|
||||
throw new EngineException(s"Invalid number of arguments for function $name. Expected: $argumentCount; Found: ${e.length}")
|
||||
}
|
||||
|
||||
spark.sessionState.functionRegistry.createOrReplaceTempFunction(name, builder)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.classloader.ClassLoaderHost
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.extension.StringNormalizer
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
object InputNormalizerHandler {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingPreProjection = SettingNamespace.JobProcessPrefix + "inputnormalizer"
|
||||
def initialize(spark: SparkSession, dict: SettingDictionary) = {
|
||||
loadProcessor(spark,
|
||||
dict,
|
||||
"default",
|
||||
dict.get(SettingPreProjection).orNull)
|
||||
}
|
||||
|
||||
def loadProcessor(spark:SparkSession, dict: SettingDictionary, processorName: String, className: String) = {
|
||||
if(className==null||className.isEmpty){
|
||||
logger.warn(s"no input normalizer processor is defined")
|
||||
null
|
||||
}
|
||||
else {
|
||||
logger.warn(s"loading class ${className} for input normalizer handler '${processorName}'")
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
val processor = clazz.newInstance().asInstanceOf[StringNormalizer]
|
||||
logger.warn(s"loaded class ${className} for input normalizer handler '${processorName}'")
|
||||
|
||||
// transform the method to a delegate
|
||||
processor.normalize _
|
||||
}
|
||||
}
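// Implementation sketch (illustrative only; assumes StringNormalizer exposes a
// normalize(String): String method, as implied by the delegate above):
//   class TrimNormalizer extends StringNormalizer {
//     override def normalize(s: String): String = if (s == null) null else s.trim
//   }
// configured via the job-process setting inputnormalizer = com.example.TrimNormalizer.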
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.host.SparkJarLoader
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
|
||||
object JarUDFHandler {
|
||||
case class JarUDFConf(name: String, path: String, `class`: String, libs: List[String])
|
||||
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingJarUDF = "jar.udf"
|
||||
val SettingJarUDAF = "jar.udaf"
|
||||
val SettingJarUDFPath = "path"
|
||||
val SettingJarUDFClass = "class"
|
||||
val SettingJarUDFLibs = "libs"
|
||||
|
||||
private def buildJarUdfConf(dict: SettingDictionary, name: String): JarUDFConf = {
|
||||
JarUDFConf(
|
||||
name = name,
|
||||
path = dict.getOrNull(SettingJarUDFPath),
|
||||
`class` = dict.getOrNull(SettingJarUDFClass),
|
||||
libs = dict.getStringSeqOption(SettingJarUDFLibs).map(_.toList).orNull
|
||||
)
|
||||
}
|
||||
|
||||
private def buildJarUdfConfArray(dict: SettingDictionary, prefix: String): List[JarUDFConf] = {
|
||||
logger.warn("#### JarUDFHandler prefix=" + prefix)
|
||||
dict.groupBySubNamespace(prefix)
|
||||
.map{ case(k, v) =>
|
||||
buildJarUdfConf(v, k)
|
||||
}
|
||||
.toList
|
||||
}
|
||||
|
||||
def loadJarUdf(spark: SparkSession, dict: SettingDictionary) = {
|
||||
val jarUDFs = buildJarUdfConfArray(dict, SettingNamespace.JobProcessPrefix+SettingJarUDF+SettingNamespace.Seperator)
|
||||
val jarUDAFs = buildJarUdfConfArray(dict, SettingNamespace.JobProcessPrefix+SettingJarUDAF+SettingNamespace.Seperator)
|
||||
|
||||
val libs = jarUDFs.flatMap(c=>if(c.libs==null) Seq(c.path) else c.libs:+c.path).toSet ++ jarUDAFs.flatMap(c=>if(c.libs==null) Seq(c.path) else c.libs:+c.path).toSet
|
||||
libs.foreach(libPath => {
|
||||
logger.warn(s"Adding JAR at $libPath to driver and executors")
|
||||
val actualPath = KeyVaultClient.resolveSecretIfAny(libPath)
|
||||
SparkJarLoader.addJar(spark, actualPath)
|
||||
})
|
||||
|
||||
jarUDFs.foreach(udf => {
|
||||
logger.warn(s"###########jarUDFs class name ="+ udf.`class`)
|
||||
SparkJarLoader.registerJavaUDF(spark.udf, udf.name, udf.`class`, null)
|
||||
})
|
||||
|
||||
jarUDAFs.foreach(udf=>{
|
||||
logger.warn(s"###########jarUDAFs class name ="+ udf.`class`)
|
||||
SparkJarLoader.registerJavaUDAF(spark.udf, udf.name, udf.`class`)
|
||||
})
|
||||
}
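// Configuration sketch (illustrative only; all names and paths are hypothetical, and the exact
// key layout depends on SettingDictionary.groupBySubNamespace). Settings like
//   jar.udf.parseUserAgent.path = wasbs://jars@myaccount.blob.core.windows.net/ua-udf.jar
//   jar.udf.parseUserAgent.class = com.example.ParseUserAgentUdf
//   jar.udf.parseUserAgent.libs = wasbs://jars@myaccount.blob.core.windows.net/ua-core.jar
// under the job-process namespace prefix add the listed jars to the driver and executors and
// register "parseUserAgent" as a SQL UDF.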
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler

import datax.config.{SettingDictionary, SettingNamespace}
import datax.securedsetting.KeyVaultClient
import org.apache.log4j.LogManager

case class MetricSinkConf(redis: String, eventhub: String, httpEndpoint: String)

object MetricsHandler {
  val Namespace = "metric"
  val NamespacePrefix = SettingNamespace.JobProcessPrefix + Namespace + SettingNamespace.Seperator
  val SettingRedisConnection = "redis"
  val SettingEventHubConnection = "eventhub"
  val SettingHttpConnection = "httppost"

  val logger = LogManager.getLogger(this.getClass)

  def getMetricsSinkConf(dict: SettingDictionary): MetricSinkConf = {
    val prefix = NamespacePrefix
    val subdict = dict.getSubDictionary(prefix)

    MetricSinkConf(
      redis = KeyVaultClient.resolveSecretIfAny(subdict.getOrNull(SettingRedisConnection)),
      eventhub = KeyVaultClient.resolveSecretIfAny(subdict.getOrNull(SettingEventHubConnection)),
      httpEndpoint = subdict.getOrNull(SettingHttpConnection)
    )
  }

}
|
|
@ -0,0 +1,38 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.classloader.ClassLoaderHost
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.extension.PreProjectionProcessor
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
object PreProjectionHandler {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingPreProjection = SettingNamespace.JobProcessPrefix + "preprojection"
|
||||
|
||||
def initialize(spark: SparkSession, dict: SettingDictionary) = {
|
||||
loadProcessor(spark,
|
||||
dict,
|
||||
"default",
|
||||
dict.get(SettingPreProjection).orNull)
|
||||
}
|
||||
|
||||
def loadProcessor(spark:SparkSession, dict: SettingDictionary, processorName: String, className: String) = {
|
||||
if(className==null||className.isEmpty){
|
||||
logger.warn(s"no preprojection processor is defined")
|
||||
null
|
||||
}
|
||||
else {
|
||||
logger.warn(s"loading class ${className} for preprojection handler '${processorName}'")
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
val generator = clazz.newInstance().asInstanceOf[PreProjectionProcessor]
|
||||
val processor = generator.initialize(spark, dict)
|
||||
logger.warn(s"loaded class ${className} for preprojection handler '${processorName}'")
|
||||
processor
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,45 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.fs.HadoopClient
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.utility.Validation
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
import scala.concurrent.{ExecutionContext, Future}
|
||||
|
||||
object ProjectionHandler {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingProjection = "projection"
|
||||
|
||||
private def getProjectionFilePaths(dict: SettingDictionary): Option[Seq[String]] = {
|
||||
dict.getStringSeqOption(SettingNamespace.JobProcessPrefix + SettingProjection)
|
||||
}
|
||||
|
||||
def loadProjectionsFuture(dict: SettingDictionary)(implicit ec: ExecutionContext): Future[Seq[Seq[String]]] = {
|
||||
getProjectionFilePaths(dict) match {
|
||||
case Some(projections) => {
|
||||
Future.sequence(projections.map(projectionFile => Future {
|
||||
logger.warn(s"Load projection file from '$projectionFile'")
|
||||
val filePath = KeyVaultClient.resolveSecretIfAny(projectionFile)
|
||||
HadoopClient.readHdfsFile(filePath).toSeq
|
||||
}))
|
||||
}
|
||||
case None => Future {
|
||||
Seq()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def validateProjections(projections: Seq[Seq[String]]) = {
|
||||
for(i <- 0 until projections.length){
|
||||
val expr = projections(i)
|
||||
Validation.ensureNotNull(expr, s"projection-$i")
|
||||
logger.warn(s"Projection Step #$i = \n" + expr.mkString("\n"))
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,36 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.constants.ProcessingPropertyName
|
||||
import datax.data.FileInternal
|
||||
import datax.utility.DateTimeUtil
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.SparkEnv
|
||||
import org.apache.spark.sql.functions.udf
|
||||
import org.apache.spark.sql.{Row, SparkSession}
|
||||
|
||||
object PropertiesHandler {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingPropertyToAppend = "appendproperty"
|
||||
|
||||
def buildAppendProperties(dict: SettingDictionary, prefix: String):Map[String, String] = {
|
||||
dict.getSubDictionary(prefix).getDictMap()
|
||||
}
|
||||
|
||||
def initialize(spark: SparkSession, dict: SettingDictionary) = {
|
||||
val appendProperties = buildAppendProperties(dict, SettingNamespace.JobProcessPrefix+SettingPropertyToAppend+SettingNamespace.Seperator)
|
||||
udf((internalColumn:Row, batchTime:Timestamp) =>
|
||||
(appendProperties ++ Map(
|
||||
ProcessingPropertyName.BatchTime -> batchTime.toString,
|
||||
ProcessingPropertyName.BlobTime -> FileInternal.getInfoFileTimeString(internalColumn),
|
||||
ProcessingPropertyName.BlobPathHint -> FileInternal.getInfoOutputFileName(internalColumn),
|
||||
ProcessingPropertyName.CPTime -> DateTimeUtil.getCurrentTime().toString,
|
||||
ProcessingPropertyName.CPExecutor -> SparkEnv.get.executorId)).filter(_._2!=null))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.telemetry.AppInsightLogger
|
||||
import datax.utility.CSVUtil
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
import scala.concurrent.{ExecutionContext, Future}
|
||||
|
||||
object ReferenceDataHandler {
|
||||
case class ReferenceDataConf(format:String, name:String, path:String, delimiter: Option[String], header: Option[String])
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val Namespace = "referencedata"
|
||||
val SettingFormat = "format"
|
||||
val SettingPath = "path"
|
||||
val SettingDelimiter = "delimiter"
|
||||
val SettingHeader = "header"
|
||||
|
||||
private def buildReferenceDataConf(dict: SettingDictionary, name: String): ReferenceDataConf = {
|
||||
ReferenceDataConf(
|
||||
format = dict.getOrNull(SettingFormat),
|
||||
name = name,
|
||||
path = dict.getOrNull(SettingPath),
|
||||
delimiter = dict.get(SettingDelimiter),
|
||||
header = dict.get(SettingHeader)
|
||||
)
|
||||
}
|
||||
|
||||
private def buildReferenceDataConfArray(dict: SettingDictionary, prefix: String): List[ReferenceDataConf] = {
|
||||
logger.warn("#### ReferenceDataHandler prefix:" +prefix)
|
||||
dict.groupBySubNamespace(prefix)
|
||||
.map{ case(k, v) => buildReferenceDataConf(v, k)}
|
||||
.toList
|
||||
}
|
||||
|
||||
def loadReferenceDataFuture(spark: SparkSession, dict: SettingDictionary)(implicit ec: ExecutionContext): Future[Int] = {
|
||||
Future {
|
||||
val rds = buildReferenceDataConfArray(dict, SettingNamespace.JobInputPrefix+Namespace + SettingNamespace.Seperator)
|
||||
rds.foreach(rd => {
|
||||
val actualPath = KeyVaultClient.resolveSecretIfAny(rd.path)
|
||||
rd.format.toLowerCase() match {
|
||||
case "csv" => {
|
||||
CSVUtil.loadCSVReferenceData(spark, rd.format, actualPath, rd.name, rd.delimiter, rd.header, AppInsightLogger)
|
||||
true
|
||||
}
|
||||
case other: String => throw new Error(s"The ReferenceDataType:'$other' at path '${rd.path}' as specified in the configuration is not currently supported.")
|
||||
}
|
||||
})
|
||||
|
||||
val count = rds.length
|
||||
logger.warn(s"Loaded ${count} reference data as tables")
|
||||
count
|
||||
}
|
||||
}
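// Configuration sketch (illustrative only; names and paths are hypothetical, and the exact key
// layout depends on SettingDictionary.groupBySubNamespace). Settings like
//   referencedata.ipRanges.format = csv
//   referencedata.ipRanges.path = wasbs://refdata@myaccount.blob.core.windows.net/ip-ranges.csv
//   referencedata.ipRanges.delimiter = ,
//   referencedata.ipRanges.header = true
// under the job-input namespace prefix load the CSV and register it as a table named "ipRanges".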
|
||||
}
|
|
@ -0,0 +1,129 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config._
|
||||
import datax.exception.EngineException
|
||||
import datax.fs.HadoopClient
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.utility.AzureFunctionCaller
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
import scala.collection.mutable.HashMap
|
||||
|
||||
object StateTableHandler {
|
||||
case class StateTableConf(name: String, schema: String, location: String)
|
||||
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val SettingStateTable = "statetable"
|
||||
val SettingStateTableSchema = "schema"
|
||||
val SettingStateTableLocation = "location"
|
||||
|
||||
// table initialization
|
||||
val TableMetadata_Active = "active"
|
||||
val TableMetadata_Standby = "standby"
|
||||
private def readTableMetadata(metadataFile: String): HashMap[String, String] = {
|
||||
if(HadoopClient.fileExists(metadataFile)) {
|
||||
HashMap(
|
||||
HadoopClient.readHdfsFile(metadataFile).map(s => {
|
||||
val pos = s.indexOf('=')
|
||||
if (pos <= 0) {
|
||||
throw new EngineException(s"Invalid content in '${metadataFile}': '${s}'")
|
||||
}
|
||||
else {
|
||||
(s.substring(0, pos), s.substring(pos + 1, s.length))
|
||||
}
|
||||
}).toSeq: _*)
|
||||
}
|
||||
else{
|
||||
HadoopClient.ensureParentFolderExists(metadataFile)
|
||||
HashMap[String, String](
|
||||
TableMetadata_Active -> "A",
|
||||
TableMetadata_Standby -> "B"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private def writeTableMetadata(metadataFile: String, parameters: HashMap[String, String]): Unit ={
|
||||
HadoopClient.writeHdfsFile(metadataFile, parameters.map(i=>i._1+"="+i._2).mkString("\n"), true)
|
||||
}
|
||||
|
||||
private def getTableNameVersioned(name: String, suffix: String) = name+"_"+suffix
|
||||
|
||||
private def buildAccumulationTableConf(dict: SettingDictionary, name: String): StateTableConf = {
|
||||
StateTableConf(
|
||||
name = name,
|
||||
schema = dict.getOrNull(SettingStateTableSchema),
|
||||
location = dict.getOrNull(SettingStateTableLocation)
|
||||
)
|
||||
}
|
||||
|
||||
private def buildAccumulationTableArrayConf(dict: SettingDictionary, prefix: String) ={
|
||||
dict.groupBySubNamespace(prefix)
|
||||
.map{ case(k, v) => buildAccumulationTableConf(v, k)}
|
||||
.toList
|
||||
}
|
||||
|
||||
def createTables(spark:SparkSession, dict: SettingDictionary) = {
|
||||
buildAccumulationTableArrayConf(dict, SettingNamespace.JobProcessPrefix+SettingStateTable+SettingNamespace.Seperator)
|
||||
.map(t=>{
|
||||
val location = t.location.stripSuffix("/") + "/"
|
||||
val metadataFile = location + "metadata.info"
|
||||
val parameters = readTableMetadata(metadataFile)
|
||||
var parametersModified = false
|
||||
|
||||
val tables = Seq("active", "standby").map(version => {
|
||||
val suffix = parameters.get(version).get
|
||||
val spec = s"STORED AS PARQUET LOCATION '${location+suffix}/'"
|
||||
val tableName = getTableNameVersioned(t.name, suffix)
|
||||
val sql =s"CREATE TABLE IF NOT EXISTS $tableName (${t.schema}) $spec"
|
||||
logger.warn(s"Creating '$version' Table ${t.name}: $sql")
|
||||
spark.sql(sql)
|
||||
|
||||
suffix -> tableName
|
||||
}).toMap
|
||||
|
||||
t.name -> new {
|
||||
private val tableLogger = LogManager.getLogger(s"TableStore-${t.name}")
|
||||
def getActiveTableName(): String = {
|
||||
getTableNameVersioned(t.name, parameters(TableMetadata_Active))
|
||||
}
|
||||
|
||||
def getStandbyTableName(): String = {
|
||||
getTableNameVersioned(t.name, parameters(TableMetadata_Standby))
|
||||
}
|
||||
|
||||
def overwrite(selectSql: String) = {
|
||||
val standbyTableName = getStandbyTableName()
|
||||
tableLogger.warn(s"Overwriting standby table: $standbyTableName")
|
||||
val sql = s"INSERT OVERWRITE TABLE $standbyTableName $selectSql"
|
||||
spark.sql(sql)
|
||||
}
|
||||
|
||||
def flip():String = {
|
||||
parameters ++= Map(
|
||||
TableMetadata_Active -> parameters(TableMetadata_Standby),
|
||||
TableMetadata_Standby -> parameters(TableMetadata_Active)
|
||||
)
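// Toggle the modified flag rather than set it: presumably, flipping an even number of times
// restores the original active/standby mapping, in which case there is nothing new to persist.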
|
||||
parametersModified = !parametersModified
|
||||
val result = getActiveTableName()
|
||||
tableLogger.warn(s"Fliped active and standby, now active instance is ${result}")
|
||||
result
|
||||
}
|
||||
|
||||
def persist() = {
|
||||
if(parametersModified) {
|
||||
tableLogger.warn(s"persisting parameters: ${parameters}")
|
||||
writeTableMetadata(metadataFile, parameters)
|
||||
}
|
||||
else{
|
||||
tableLogger.warn(s"Skip persisting parameters: ${parameters}")
|
||||
}
|
||||
}
|
||||
}
|
||||
}).toMap
|
||||
}
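// Usage sketch (illustrative only; assumes a state table named "accounts" is configured):
//   val tables = createTables(spark, dict)
//   val accounts = tables("accounts")
//   accounts.overwrite("SELECT * FROM staged_accounts")   // rewrites the standby instance
//   accounts.flip()                                        // standby becomes active
//   accounts.persist()                                     // records the new mapping in metadata.info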
|
||||
}
|
|
@ -0,0 +1,68 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.exception.EngineException
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.{Column, SparkSession}
|
||||
import org.apache.spark.sql.functions.col
|
||||
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
case class TimeWindowConf(
|
||||
windows: Map[String, Duration],
|
||||
isEnabled: Boolean,
|
||||
timestampColumn: Column,
|
||||
watermark: Duration,
|
||||
maxWindow: Duration
|
||||
)
|
||||
|
||||
object TimeWindowHandler {
|
||||
val NamespacePrefix=SettingNamespace.JobProcessPrefix
|
||||
val SettingWatermark = "watermark"
|
||||
val SettingTimestampColumn = "timestampcolumn"
|
||||
val SettingTimeWindow = "timewindow"
|
||||
val SettingTimeWindowDuration = "windowduration"
|
||||
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
|
||||
|
||||
private def buildTimeWindowConf(dict: SettingDictionary, name: String) = {
|
||||
dict.getDuration(SettingTimeWindowDuration)
|
||||
}
|
||||
|
||||
private def buildTimeWindowsConf(dict: SettingDictionary, prefix: String)= {
|
||||
dict.buildConfigMap(buildTimeWindowConf, prefix)
|
||||
}
|
||||
|
||||
def initialize(spark: SparkSession, dict: SettingDictionary) = {
|
||||
val windows = buildTimeWindowsConf(dict, NamespacePrefix + SettingTimeWindow + SettingNamespace.Seperator)
|
||||
val watermark = dict.get(NamespacePrefix + SettingWatermark)
|
||||
val timestampColumn = dict.get(NamespacePrefix + SettingTimestampColumn)
|
||||
val isEnabled = windows.size > 0 && watermark.isDefined && timestampColumn.isDefined
|
||||
|
||||
if (isEnabled) {
|
||||
logger.warn(s"Windowing is ON, window are ${windows}, watermark is ${watermark}")
|
||||
TimeWindowConf(
|
||||
windows = windows,
|
||||
isEnabled = windows.size > 0 && watermark.isDefined && timestampColumn.isDefined,
|
||||
timestampColumn =col(timestampColumn.get),
|
||||
watermark = Duration.create(watermark.get),
|
||||
maxWindow = windows.values.max
|
||||
)
|
||||
}
|
||||
else {
|
||||
logger.warn(s"Windowing is OFF")
|
||||
TimeWindowConf(
|
||||
null,
|
||||
false,
|
||||
null,
|
||||
Duration.Zero,
|
||||
Duration.Zero
|
||||
)
|
||||
}
|
||||
}
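// Configuration sketch (illustrative only; the column name and durations are hypothetical, and
// duration strings are assumed to be parseable by dict.getDuration / Duration.create):
//   timestampcolumn = eventTime
//   watermark = 10 minutes
//   timewindow.short.windowduration = 5 minutes
//   timewindow.long.windowduration = 1 hour
// under the job-process namespace prefix enables windowing with a 10-minute watermark and a
// maximum window of 1 hour.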
|
||||
}
|
|
@ -0,0 +1,54 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.handler
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.fs.HadoopClient
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.sql.{ParsedResult, TransformSQLParser}
|
||||
import datax.utility.Validation
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
import scala.concurrent.{ExecutionContext, Future}
|
||||
|
||||
object TransformHandler {
|
||||
private def getTransformFilePath(dict: SettingDictionary): Option[String] = {
|
||||
dict.get(SettingNamespace.JobProcessPrefix + "transform")
|
||||
}
|
||||
|
||||
def shouldCacheCommonViews(dict: SettingDictionary): Boolean = {
|
||||
dict.getOrElse(SettingNamespace.JobProcessPrefix + "cachecommonviews", "True").toBoolean
|
||||
}
|
||||
|
||||
def loadTransformFuture(dict: SettingDictionary)(implicit ec: ExecutionContext): Future[ParsedResult] = {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
getTransformFilePath(dict) match {
|
||||
case Some(transform) =>Future {
|
||||
logger.warn(s"Load transform script from '${transform}'")
|
||||
val filePath = KeyVaultClient.resolveSecretIfAny(transform)
|
||||
val sqlParsed = TransformSQLParser.parse(HadoopClient.readHdfsFile(filePath).toSeq)
|
||||
|
||||
if(sqlParsed!=null) {
|
||||
val queries = sqlParsed.commands.filter(_.commandType==TransformSQLParser.CommandType_Query)
|
||||
for (i <- 0 until queries.length) {
|
||||
val transformation = queries(i).text
|
||||
Validation.ensureNotNull(transformation, s"transform-$i")
|
||||
logger.warn(s"Transform step #$i = \n" + transformation)
|
||||
}
|
||||
|
||||
sqlParsed.viewReferenceCount.foreach(v=>{
|
||||
logger.warn(s"View ${v._1} is referenced ${v._2} times")
|
||||
})
|
||||
}
|
||||
|
||||
sqlParsed
|
||||
}
|
||||
case None => Future {
|
||||
logger.warn(s"Transform file is not defined.")
|
||||
null
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,103 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.host
|
||||
|
||||
import java.sql.Timestamp
|
||||
import java.text.SimpleDateFormat
|
||||
import java.time.Instant
|
||||
import java.util.concurrent.Executors
|
||||
|
||||
import datax.config.UnifiedConfig
|
||||
import datax.constants.ProductConstant
|
||||
import datax.fs.HadoopClient
|
||||
import datax.input.BatchBlobInputSetting
|
||||
import datax.processor.BlobPointerProcessor
|
||||
import datax.telemetry.AppInsightLogger
|
||||
import datax.utility.DataMerger
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
import scala.language.postfixOps
|
||||
import scala.collection.mutable.{HashSet, ListBuffer}
|
||||
import scala.collection.parallel.ExecutionContextTaskSupport
|
||||
import scala.concurrent.ExecutionContext
|
||||
import scala.concurrent.duration._
|
||||
|
||||
object BlobBatchingHost {
|
||||
def getInputBlobPathPrefixes(prefix: String, datetimeFormat: String, startTime: Instant, durationInSeconds: Long, timeIntervalInSeconds: Long):Iterable[(String, Timestamp)] = {
|
||||
val result = new ListBuffer[(String, Timestamp)]
|
||||
val cache = new HashSet[String]
|
||||
|
||||
var t:Long = 0
|
||||
//val utcZoneId = ZoneId.of("UTC")
|
||||
val dateFormat = new SimpleDateFormat(datetimeFormat)
|
||||
while(t<durationInSeconds){
|
||||
val timestamp = Timestamp.from(startTime.plusSeconds(t))
|
||||
val partitionFolder = dateFormat.format(timestamp)
|
||||
if(!cache.contains(partitionFolder)){
|
||||
val path = prefix+partitionFolder
|
||||
result += Tuple2(path, timestamp)
|
||||
cache += partitionFolder
|
||||
}
|
||||
t+= timeIntervalInSeconds
|
||||
}
|
||||
|
||||
result
|
||||
}
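// Example (illustrative; the prefix is hypothetical and formatting uses the JVM's default
// time zone): with prefix "wasbs://logs@myaccount.blob.core.windows.net/events/",
// datetimeFormat "yyyy/MM/dd/HH", startTime 2019-01-01T00:00:00Z, durationInSeconds = 7200 and
// timeIntervalInSeconds = 3600, this returns two (path, timestamp) pairs, one per hour:
//   .../events/2019/01/01/00 and .../events/2019/01/01/01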
|
||||
|
||||
def runBatchApp(inputArguments: Array[String],processorGenerator: UnifiedConfig=>BlobPointerProcessor ) = {
|
||||
val appLog = LogManager.getLogger("runBatchApp")
|
||||
val (appHost, config) = CommonAppHost.initApp(inputArguments)
|
||||
|
||||
appLog.warn(s"Batch Mode Work Started")
|
||||
|
||||
val blobsConf = BatchBlobInputSetting.getInputBlobsArrayConf(config.dict)
|
||||
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/batch/app/begin")
|
||||
|
||||
val prefixes = blobsConf.flatMap(blobs=>{
|
||||
val inputBlobPathPrefix = blobs.pathPrefix
|
||||
val inputBlobDateTimeFormat = blobs.pathPartitionFolderFormat
|
||||
val inputBlobStartTime = Instant.parse(blobs.startTime)
|
||||
val inputBlobDurationInHours = blobs.durationInHours
|
||||
val inputBlobTimeIntervalInHours = 1
|
||||
|
||||
getInputBlobPathPrefixes(
|
||||
prefix = inputBlobPathPrefix,
|
||||
datetimeFormat = inputBlobDateTimeFormat,
|
||||
startTime = inputBlobStartTime,
|
||||
durationInSeconds = inputBlobDurationInHours*3600,
|
||||
timeIntervalInSeconds = inputBlobTimeIntervalInHours*3600
|
||||
)
|
||||
}).par
|
||||
|
||||
val spark = appHost.getSpark(config.sparkConf)
|
||||
val sc = spark.sparkContext
|
||||
val processor = processorGenerator(config)
|
||||
|
||||
val ec = new ExecutionContext {
|
||||
val threadPool = Executors.newFixedThreadPool(16)
|
||||
def execute(runnable: Runnable) {
|
||||
threadPool.submit(runnable)
|
||||
}
|
||||
def reportFailure(t: Throwable) {}
|
||||
}
|
||||
|
||||
prefixes.tasksupport = new ExecutionContextTaskSupport(ec)
|
||||
|
||||
val batchResult = prefixes.map(prefix =>{
|
||||
appLog.warn(s"Start processing ${prefix}")
|
||||
val namespace = "_"+HadoopClient.tempFilePrefix(prefix._1)
|
||||
appLog.warn(s"Namespace for prefix ${prefix._1} is '$namespace'")
|
||||
val pathsRDD = sc.makeRDD(HadoopClient.listFiles(prefix._1).toSeq)
|
||||
val result = processor.processPathsRDD(pathsRDD, prefix._2, 1 hour, prefix._2, namespace)
|
||||
appLog.warn(s"End processing ${prefix}")
|
||||
|
||||
result
|
||||
}).reduce(DataMerger.mergeMapOfDoubles)
|
||||
|
||||
appLog.warn(s"Batch Mode Work Ended, processed metrics: $batchResult")
|
||||
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/batch/end", null, batchResult)
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,52 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.host
|
||||
|
||||
import datax.config.{ConfigManager, UnifiedConfig}
|
||||
import datax.constants._
|
||||
import datax.fs.HadoopClient
|
||||
import datax.service.{ConfigService, TelemetryService}
|
||||
import datax.telemetry.AppInsightLogger
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
object CommonAppHost extends AppHost {
|
||||
override def getConfigService(): ConfigService = ConfigManager
|
||||
override def getTelemetryService(): TelemetryService = AppInsightLogger
|
||||
|
||||
def initApp(inputArguments: Array[String]): (AppHost, UnifiedConfig) = {
|
||||
val appLog = LogManager.getLogger(this.getClass)
|
||||
appLog.warn("===App log turned ON===")
|
||||
|
||||
val sparkConf = ConfigManager.initSparkConf
|
||||
|
||||
// Get the singleton instance of SparkSession
|
||||
val spark = SparkSessionSingleton.getInstance(sparkConf)
|
||||
|
||||
val conf = ConfigManager.getConfigurationFromArguments(inputArguments)
|
||||
|
||||
// Initialize FileSystemUtil
|
||||
HadoopClient.setConf(spark.sparkContext.hadoopConfiguration)
|
||||
|
||||
appLog.warn(s"initializing with conf:"+ conf.toString)
|
||||
|
||||
AppInsightLogger.initForApp(spark.sparkContext.appName)
|
||||
conf.getDriverLogLevel() match {
|
||||
case Some(level) => Logger.setLogLevel(level)
|
||||
case None =>
|
||||
}
|
||||
|
||||
AppInsightLogger.trackEvent(DatasetName.DataStreamProjection + "/app/init")
|
||||
|
||||
val unifiedConfig = ConfigManager.loadConfig(sparkConf)
|
||||
|
||||
(this, unifiedConfig)
|
||||
}
|
||||
|
||||
def getSpark(sparkConf: SparkConf): SparkSession = {
|
||||
SparkSessionSingleton.getInstance(sparkConf)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,17 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host

import org.apache.log4j.{Level, LogManager}

object Logger {
  var logLevel = LogManager.getRootLogger.getLevel
  def setLogLevel(level: Level) = {
    logLevel = level
    val logger = LogManager.getRootLogger
    logger.setLevel(level)
    logger.warn(s"root logger level set to ${level}")
  }
}
@ -0,0 +1,211 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.host
|
||||
|
||||
import java.lang.reflect.ParameterizedType
|
||||
import java.net.URI
|
||||
|
||||
import datax.classloader.ClassLoaderHost
|
||||
import datax.constants.ProductConstant
|
||||
import datax.exception.EngineException
|
||||
import datax.fs.HadoopClient
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.SparkFiles
|
||||
import org.apache.spark.sql.api.java._
|
||||
import org.apache.spark.sql.catalyst.ScalaReflection
|
||||
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
|
||||
import org.apache.spark.sql.types.{DataType, StructType}
|
||||
import org.apache.spark.sql.{Row, SparkSession, UDFRegistration}
|
||||
|
||||
import scala.collection.mutable.HashMap
|
||||
|
||||
object SparkJarLoader {
|
||||
val currentJars = new HashMap[String, Long]
|
||||
|
||||
def getJavaUDFReturnDataType(t: Class[_]): DataType = {
|
||||
val mirror = scala.reflect.runtime.universe.runtimeMirror(ClassLoaderHost.derivedClassLoader)
|
||||
val ts = mirror.classSymbol(t).typeSignature
|
||||
val udfInterface = ts.baseClasses.filter(c=>c.fullName.startsWith("org.apache.spark.sql.api.java.UDF"))(0)
|
||||
ScalaReflection.schemaFor(ts.baseType(udfInterface).typeArgs.last).dataType
|
||||
}
|
||||
|
||||
def addJarOnDriver(spark: SparkSession, jarPath: String, timestamp: Long = 0, resolveStorageKey:Boolean=true) = {
|
||||
val logger = LogManager.getLogger("AddJar")
|
||||
val localName = new URI(jarPath).getPath.split("/").last
|
||||
val currentTimeStamp = currentJars.get(jarPath)
|
||||
.orElse(currentJars.get(localName))
|
||||
.getOrElse(-1L)
|
||||
if (currentTimeStamp < timestamp) {
|
||||
logger.warn("Fetching " + jarPath + " with timestamp " + timestamp)
|
||||
// Fetch file with useCache mode, close cache for local mode..
|
||||
// resolveStorageKey controls whether to retrieve the actual jarPath from
|
||||
// keyvault for the case where jarPath is keyvault url
|
||||
HadoopClient.fetchFile(jarPath,
|
||||
new java.io.File(SparkFiles.getRootDirectory()),
|
||||
localName, resolveStorageKey)
|
||||
|
||||
// Add it to our class loader
|
||||
val url = new java.io.File(SparkFiles.getRootDirectory(), localName).toURI.toURL
|
||||
if (!ClassLoaderHost.urlClassLoader.getURLs().contains(url)) {
|
||||
logger.info("Adding " + url + " to class loader")
|
||||
ClassLoaderHost.urlClassLoader.addURL(url)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
def addJar(spark: SparkSession, jarPath: String) = {
|
||||
addJarOnDriver(spark, jarPath)
|
||||
spark.sparkContext.addJar(jarPath)
|
||||
}
|
||||
|
||||
def loadUdf(spark: SparkSession, udfName: String, jarPath: String, mainClass: String, method: String) = {
|
||||
addJar(spark, jarPath)
|
||||
registerJavaUDF(spark.udf, udfName, mainClass, null)
|
||||
}
|
||||
|
||||
/**
|
||||
* Register a Java UDF class using reflection
|
||||
*
|
||||
* @param name udf name
|
||||
* @param className fully qualified class name of udf
|
||||
* @param returnDataType return type of udf. If it is null, spark would try to infer
|
||||
* via reflection.
|
||||
*/
|
||||
def registerJavaUDF(udfReg: UDFRegistration, name: String, className: String, returnDataType: DataType): Unit = {
|
||||
try {
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
val udfInterfaces = clazz.getGenericInterfaces
|
||||
.filter(_.isInstanceOf[ParameterizedType])
|
||||
.map(_.asInstanceOf[ParameterizedType])
|
||||
.filter(e => e.getRawType.isInstanceOf[Class[_]] && e.getRawType.asInstanceOf[Class[_]].getCanonicalName.startsWith("org.apache.spark.sql.api.java.UDF"))
|
||||
if (udfInterfaces.length == 0) {
|
||||
throw new EngineException(s"UDF class $className doesn't implement any UDF interface")
|
||||
} else if (udfInterfaces.length > 1) {
|
||||
throw new EngineException(s"It is invalid to implement multiple UDF interfaces, UDF class $className")
|
||||
} else {
|
||||
try {
|
||||
val udf = clazz.newInstance()
|
||||
//val udfReturnType = udfInterfaces(0).getActualTypeArguments.last
|
||||
val returnType = if(returnDataType==null) getJavaUDFReturnDataType(clazz) else returnDataType
|
||||
|
||||
udfInterfaces(0).getActualTypeArguments.length match {
|
||||
case 1 => udfReg.register(name, udf.asInstanceOf[UDF0[_]], returnType)
|
||||
case 2 => udfReg.register(name, udf.asInstanceOf[UDF1[_, _]], returnType)
|
||||
case 3 => udfReg.register(name, udf.asInstanceOf[UDF2[_, _, _]], returnType)
|
||||
case 4 => udfReg.register(name, udf.asInstanceOf[UDF3[_, _, _, _]], returnType)
|
||||
case 5 => udfReg.register(name, udf.asInstanceOf[UDF4[_, _, _, _, _]], returnType)
|
||||
case 6 => udfReg.register(name, udf.asInstanceOf[UDF5[_, _, _, _, _, _]], returnType)
|
||||
case 7 => udfReg.register(name, udf.asInstanceOf[UDF6[_, _, _, _, _, _, _]], returnType)
|
||||
case 8 => udfReg.register(name, udf.asInstanceOf[UDF7[_, _, _, _, _, _, _, _]], returnType)
|
||||
case 9 => udfReg.register(name, udf.asInstanceOf[UDF8[_, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 10 => udfReg.register(name, udf.asInstanceOf[UDF9[_, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 11 => udfReg.register(name, udf.asInstanceOf[UDF10[_, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 12 => udfReg.register(name, udf.asInstanceOf[UDF11[_, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 13 => udfReg.register(name, udf.asInstanceOf[UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 14 => udfReg.register(name, udf.asInstanceOf[UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 15 => udfReg.register(name, udf.asInstanceOf[UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 16 => udfReg.register(name, udf.asInstanceOf[UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 17 => udfReg.register(name, udf.asInstanceOf[UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 18 => udfReg.register(name, udf.asInstanceOf[UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 19 => udfReg.register(name, udf.asInstanceOf[UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 20 => udfReg.register(name, udf.asInstanceOf[UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 21 => udfReg.register(name, udf.asInstanceOf[UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 22 => udfReg.register(name, udf.asInstanceOf[UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case 23 => udfReg.register(name, udf.asInstanceOf[UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
|
||||
case n =>
|
||||
throw new EngineException(s"UDF class with $n type arguments is not supported.")
|
||||
}
|
||||
} catch {
|
||||
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
|
||||
throw new EngineException(s"Can not instantiate class $className, please make sure it has public non argument constructor")
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
case e: ClassNotFoundException => throw new EngineException(s"Can not load class $className, please make sure it is on the classpath")
|
||||
}
|
||||
}
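// Usage sketch (illustrative only; the class and UDF name are hypothetical). A Java-style UDF
//   class ToUpper extends org.apache.spark.sql.api.java.UDF1[String, String] {
//     override def call(s: String): String = if (s == null) null else s.toUpperCase
//   }
// can be registered as registerJavaUDF(spark.udf, "toUpper", "com.example.ToUpper", null),
// where the null return type lets the registration infer it via reflection.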
|
||||
|
||||
/**
|
||||
* Register a Java UDAF class using reflection, for use from pyspark
|
||||
*
|
||||
* @param name UDAF name
|
||||
* @param className fully qualified class name of UDAF
|
||||
*/
|
||||
def registerJavaUDAF(udfReg: UDFRegistration, name: String, className: String): Unit = {
|
||||
try {
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
if (!classOf[UserDefinedAggregateFunction].isAssignableFrom(clazz)) {
|
||||
throw new EngineException(s"class $className doesn't implement interface UserDefinedAggregateFunction")
|
||||
}
|
||||
val udaf = clazz.newInstance().asInstanceOf[UserDefinedAggregateFunction]
|
||||
udfReg.register(name, udaf)
|
||||
} catch {
|
||||
case e: ClassNotFoundException => throw new EngineException(s"Can not load class ${className}, please make sure it is on the classpath")
|
||||
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
|
||||
throw new EngineException(s"Can not instantiate class ${className}, please make sure it has public non argument constructor")
|
||||
}
|
||||
}
|
||||
|
||||
case class CaseUDAF(inputType: StructType, bufferType: StructType, returnType: DataType) extends UserDefinedAggregateFunction{
|
||||
override def inputSchema: StructType = inputType
|
||||
|
||||
override def bufferSchema: StructType = bufferType
|
||||
|
||||
override def dataType: DataType = returnType
|
||||
|
||||
override def deterministic: Boolean = true
|
||||
|
||||
override def initialize(buffer: MutableAggregationBuffer): Unit = ???
|
||||
|
||||
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = ???
|
||||
|
||||
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = ???
|
||||
|
||||
override def evaluate(buffer: Row): Any = ???
|
||||
}
|
||||
|
||||
/**
|
||||
* Register a UDAF class derived from api using reflection
|
||||
*
|
||||
* @param name UDAF name
|
||||
* @param className fully qualified class name of UDAF
|
||||
*/
|
||||
def registerApiUDAF(spark: SparkSession, name: String, className: String): Unit = {
|
||||
try {
|
||||
val clazz = ClassLoaderHost.classForName(className)
|
||||
val udfInterfaces = clazz.getGenericInterfaces
|
||||
.filter(_.isInstanceOf[ParameterizedType])
|
||||
.map(_.asInstanceOf[ParameterizedType])
|
||||
.filter(e => e.getRawType.isInstanceOf[Class[_]] && e.getRawType.asInstanceOf[Class[_]].getCanonicalName.startsWith(ProductConstant.ProductRoot + ".api.udf.UDAF"))
|
||||
|
||||
if (udfInterfaces.length == 0) {
|
||||
throw new EngineException(s"UDF class $className doesn't implement any ${ProductConstant.ProductRoot}.api.udf.UDF interface")
|
||||
} else if (udfInterfaces.length > 1) {
|
||||
throw new EngineException(s"It is invalid to implement multiple UDF interfaces, UDF class $className")
|
||||
} else {
|
||||
try {
|
||||
val udf = clazz.newInstance()
|
||||
val typeArguments = udfInterfaces(0).getActualTypeArguments
|
||||
val (inputTypes, bufferAndOutputTypes) = typeArguments.splitAt(typeArguments.length-2)
|
||||
|
||||
val returnType = ClassLoaderHost.javaTypeToDataType(bufferAndOutputTypes(1))
|
||||
val bufferType = ClassLoaderHost.javaTypeToDataType(bufferAndOutputTypes(0))
|
||||
|
||||
//TODO: complete the implementation
|
||||
|
||||
} catch {
|
||||
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
|
||||
throw new EngineException(s"Can not instantiate class $className, please make sure it has public non argument constructor")
|
||||
}
|
||||
}
|
||||
val udaf = clazz.newInstance().asInstanceOf[UserDefinedAggregateFunction]
|
||||
spark.udf.register(name, udaf)
|
||||
} catch {
|
||||
case e: ClassNotFoundException => throw new EngineException(s"Can not load class ${className}, please make sure it is on the classpath")
|
||||
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
|
||||
throw new EngineException(s"Can not instantiate class ${className}, please make sure it has public non argument constructor")
|
||||
}
|
||||
}
|
||||
}
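
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): a minimal
// UserDefinedAggregateFunction of the shape registerJavaUDAF above expects.
// The class name "LongSumUDAF" and the column names are assumptions made
// for this example only.
// ---------------------------------------------------------------------
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, LongType, StructType}

class LongSumUDAF extends UserDefinedAggregateFunction {
  override def inputSchema: StructType = new StructType().add("value", LongType)
  override def bufferSchema: StructType = new StructType().add("sum", LongType)
  override def dataType: DataType = LongType
  override def deterministic: Boolean = true
  override def initialize(buffer: MutableAggregationBuffer): Unit = buffer(0) = 0L
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit =
    if (!input.isNullAt(0)) buffer(0) = buffer.getLong(0) + input.getLong(0)
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit =
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
  override def evaluate(buffer: Row): Any = buffer.getLong(0)
}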
@@ -0,0 +1,30 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host

import datax.telemetry.AppInsightLogger
import org.apache.log4j.{Level, LogManager}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object SparkSessionSingleton {
  def getLogger = LogManager.getLogger(this.getClass)

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate()
    }

    instance
  }
}
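
// ---------------------------------------------------------------------
// Illustrative usage sketch (not part of the original file): how a driver
// might obtain the shared session from SparkSessionSingleton above. The app
// name and master are placeholders; enableHiveSupport() additionally needs
// the spark-hive dependency on the classpath at runtime.
// ---------------------------------------------------------------------
object SparkSessionSingletonUsageSketch {
  import org.apache.spark.SparkConf

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("datax-demo").setMaster("local[2]")
    val spark = SparkSessionSingleton.getInstance(conf)
    // Repeated calls return the same cached session.
    assert(spark eq SparkSessionSingleton.getInstance(conf))
    spark.stop()
  }
}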
@@ -0,0 +1,143 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host

import java.sql.Timestamp

import com.microsoft.azure.eventhubs.EventData
import datax.config._
import datax.constants.{JobArgument, ProductConstant}
import datax.exception.EngineException
import datax.input._
import datax.processor.EventHubStreamingProcessor
import datax.telemetry.AppInsightLogger
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.concurrent.duration._

object StreamingHost {
  def getLogger = LogManager.getLogger(this.getClass)

  def createStreamingContext(spark: SparkSession, intervalInSeconds: Long) = {
    new StreamingContext(spark.sparkContext, Seconds(intervalInSeconds))
  }

  def createStreamingContextWithCheckpoint(spark: SparkSession, streamingCheckpointDir: String, intervalInSeconds: Long) = {
    val streamingContext = createStreamingContext(spark, intervalInSeconds)
    getLogger.warn("create a new streaming context with checkpointDir=" + streamingCheckpointDir)
    streamingContext.checkpoint(streamingCheckpointDir)
    streamingContext
  }

  def initStreamingContext(spark: SparkSession, streamingCheckpointDir: String, intervalInSeconds: Long) = {
    getLogger.warn("spark streaming checkpointDir=" + streamingCheckpointDir)
    StreamingContext.getOrCreate(streamingCheckpointDir,
      () => createStreamingContextWithCheckpoint(spark, streamingCheckpointDir, intervalInSeconds),
      spark.sparkContext.hadoopConfiguration,
      false
    )
  }

  def runStreamingApp(inputArguments: Array[String], processorGenerator: UnifiedConfig => EventHubStreamingProcessor): Unit = {
    val (appHost, config) = CommonAppHost.initApp(inputArguments)
    val spark = CommonAppHost.getSpark(config.sparkConf)

    val dict = config.dict
    val streamingConf = StreamingInputSetting.getStreamingInputConf(dict)
    val eventhubConf = EventHubInputSetting.getInputEventHubConf(dict)
    if (eventhubConf == null)
      throw new EngineException("No valid eventhub input configuration was provided")

    val logger = LogManager.getLogger("runStreamingApp")
    logger.warn(s"Get or create streaming context from checkpoint folder: ${streamingConf.checkpointDir}")
    val checkpointEnabled = dict.getOrElse(JobArgument.ConfName_CheckpointEnabled, "false").toBoolean

    def createSC() = {
      val createStreamContextLogger = LogManager.getLogger("runStreamingApp-createSC")
      val spark = CommonAppHost.getSpark(config.sparkConf)
      createStreamContextLogger.warn(s"Create streaming context checkpoints folder=${streamingConf.checkpointDir}, intervalInSeconds=${streamingConf.intervalInSeconds}")
      val streamingContext = createStreamingContext(spark, streamingConf.intervalInSeconds)
      val batchInterval = streamingConf.intervalInSeconds.seconds
      val repartitionNumber = eventhubConf.repartition.getOrElse(0)
      val repartition = if (repartitionNumber == 0) (r: RDD[EventData]) => r else (r: RDD[EventData]) => r.repartition(repartitionNumber)
      EventHubStreamingFactory.getStream(streamingContext, eventhubConf, (rdd, time) => {
        val streamingLogger = LogManager.getLogger("EventHubStreamingLoop")
        val batchTime = new Timestamp(time.milliseconds)
        val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
        streamingLogger.warn(s"===============================Batch $batchTimeStr Started===============================")
        val processor = EventHubStreamingFactory.getOrCreateProcessor(config, processorGenerator)
        processor.process(repartition(rdd), batchTime, batchInterval)
        streamingLogger.warn(s"===============================Batch $batchTimeStr End ===============================")
      })

      streamingContext
    }

    val streamingContext = if (checkpointEnabled)
      StreamingContext.getOrCreate(
        streamingConf.checkpointDir,
        createSC _,
        spark.sparkContext.hadoopConfiguration,
        false)
    else createSC()

    //streamingContext.remember(org.apache.spark.streaming.Duration(65000))
    streamingContext.start()

    AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/streaming/app/start")
    logger.warn(s"Streaming Context Started")
    streamingContext.awaitTermination()
  }

  def runLocalStreamingApp(inputArguments: Array[String], processorGenerator: UnifiedConfig => EventHubStreamingProcessor): Unit = {
    val (appHost, config) = CommonAppHost.initApp(inputArguments)
    val spark = CommonAppHost.getSpark(config.sparkConf)

    val dict = config.dict
    val streamingConf = StreamingInputSetting.getStreamingInputConf(dict)
    val logger = LogManager.getLogger("runLocalStreamingApp")
    logger.warn(s"Get or create streaming context from checkpoint folder: ${streamingConf.checkpointDir}")
    val checkpointEnabled = dict.getOrElse(JobArgument.ConfName_CheckpointEnabled, "false").toBoolean

    def createSC() = {
      val createStreamContextLogger = LogManager.getLogger("runLocalStreamingApp-createSC")
      val spark = CommonAppHost.getSpark(config.sparkConf)
      createStreamContextLogger.warn(s"Create streaming context checkpoints folder=${streamingConf.checkpointDir}, intervalInSeconds=${streamingConf.intervalInSeconds}")
      val streamingContext = createStreamingContext(spark, streamingConf.intervalInSeconds)
      val batchInterval = streamingConf.intervalInSeconds.seconds

      val inputSchema = SchemaFile.loadInputSchema(dict)
      LocalStreamingFactory.getStream(streamingContext, inputSchema, (rdd, time) => {
        val streamingLogger = LogManager.getLogger("LocalStreamingLoop")
        val batchTime = new Timestamp(time.milliseconds)
        val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
        streamingLogger.warn(s"===============================Batch $batchTimeStr Started===============================")
        val processor = LocalStreamingFactory.getOrCreateProcessor(config, processorGenerator)
        processor.process(rdd, batchTime, batchInterval)
        streamingLogger.warn(s"===============================Batch $batchTimeStr End ===============================")
      })

      streamingContext
    }

    val streamingContext = if (checkpointEnabled)
      StreamingContext.getOrCreate(
        streamingConf.checkpointDir,
        createSC _,
        spark.sparkContext.hadoopConfiguration,
        false)
    else createSC()

    streamingContext.start()

    AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/localStreaming/app/start")
    logger.warn(s"Local Streaming Context Started")
    streamingContext.awaitTermination()
  }
}
@@ -0,0 +1,19 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host

import datax.config.SettingDictionary
import org.apache.spark.sql.SparkSession

object UdfInitializer {
  def initialize(spark: SparkSession, dict: SettingDictionary) = {
    // Register UDF functions
    spark.udf.register("filterNull", filterNull _)
  }

  def filterNull(elems: Seq[Map[String, String]]): Seq[Map[String, String]] = {
    elems.filter(_ != null)
  }
}
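
// ---------------------------------------------------------------------
// Illustrative usage sketch (not part of the original file): calling the
// filterNull UDF registered by UdfInitializer above from Spark SQL. The
// local session bootstrap and the literal data are assumptions for this
// example; the SettingDictionary argument is not used by the registration.
// ---------------------------------------------------------------------
object FilterNullUsageSketch {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[1]").appName("filterNull-demo").getOrCreate()
    UdfInitializer.initialize(spark, null)
    // Null entries are dropped from the array of maps.
    spark.sql("SELECT filterNull(array(map('k', 'v'), null)) AS elems").show(false)
    spark.stop()
  }
}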
@@ -0,0 +1,33 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config.{SettingDictionary, SettingNamespace}
import org.apache.log4j.LogManager

import scala.collection.mutable

case class InputBlobsConf(pathPrefix: String,
                          pathPartitionFolderFormat: String,
                          startTime: String,
                          durationInHours: Long)

object BatchBlobInputSetting {
  val NamespaceBlobsSource = "blobs"
  val NamespacePrefix = SettingNamespace.JobInputPrefix + NamespaceBlobsSource + "."

  private def buildInputBlobsConf(dict: SettingDictionary, name: String): InputBlobsConf = {
    InputBlobsConf(
      pathPrefix = dict.getOrNull("pathprefix"),
      pathPartitionFolderFormat = dict.getOrNull("pathpartitionfolderformat"),
      startTime = dict.getOrNull("starttime"),
      durationInHours = dict.getLong("durationinhours")
    )
  }

  def getInputBlobsArrayConf(dict: SettingDictionary): Seq[InputBlobsConf] = {
    dict.buildConfigIterable(buildInputBlobsConf, NamespacePrefix).toSeq
  }
}
@@ -0,0 +1,161 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import java.sql.Timestamp
import java.text.SimpleDateFormat

import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty}
import com.fasterxml.jackson.databind.ObjectMapper
import com.microsoft.azure.eventhubs.EventData
import datax.config._
import datax.data.FileInternal
import datax.exception.EngineException
import datax.input.BlobPointerInputSetting.BlobPointerInputConf
import datax.sink.BlobOutputSetting.BlobOutputConf
import datax.sink.{BlobOutputSetting, BlobSinker}
import datax.telemetry.AppInsightLogger
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.mutable
import scala.util.matching.Regex

case class BlobPointer @JsonCreator()(@JsonProperty("BlobPath") BlobPath: String)

object BlobPointerInput {
  val logger = LogManager.getLogger(this.getClass)

  val blobPointerMapper = new ObjectMapper()
  blobPointerMapper.registerSubtypes(classOf[BlobPointer])
  def parseBlobPath(eventData: EventData) =
    blobPointerMapper.readValue(eventData.getBytes, classOf[BlobPointer]).BlobPath

  private def loadBlobPointerSchema() = {
    (new StructType).add("BlobPath", StringType)
  }

  private val saRegex = """wasbs?://[\w-]+@([\w\d]+)\.blob.core.windows.net/.*""".r
  private def extractSourceId(blobPath: String, regex: String): String = {
    val r = if (regex == null) saRegex else regex.r
    r.findFirstMatchIn(blobPath) match {
      case Some(partition) => partition.group(1)
      case None => null
    }
  }

  private def extractTimeFromBlobPath(blobPath: String, fileTimeRegex: Regex, fileTimeFormat: String): Timestamp = {
    fileTimeRegex.findFirstMatchIn(blobPath) match {
      case Some(timeStr) => try {
        if (fileTimeFormat == null) {
          Timestamp.valueOf(timeStr.group(1).replace('_', ':').replace('T', ' '))
        }
        else {
          val format = new SimpleDateFormat(fileTimeFormat)
          new Timestamp(format.parse(timeStr.group(1)).getTime())
        }
      }
      catch {
        case e: Exception =>
          logger.error(s"Error when parsing date time string from path $blobPath: $e")
          AppInsightLogger.trackException(e, Map(
            "errorLocation" -> "extractTimeFromBlobPath",
            "errorMessage" -> "Error in parsing date time string",
            "failedBlobPath" -> blobPath
          ), null)
          null
      }
      case None =>
        logger.error(s"Failed to extract blob time from path $blobPath")
        AppInsightLogger.trackException(new EngineException(s"Cannot find blob time from path $blobPath"), Map(
          "errorLocation" -> "extractTimeFromBlobPath",
          "errorMessage" -> "Failed to extract blob time",
          "failedBlobPath" -> blobPath
        ), null)
        null
    }
  }

  private def pathHintsFromBlobPath(blobPath: String, blobPathRegex: Regex): String = {
    blobPathRegex.findFirstMatchIn(blobPath) match {
      case Some(m) => try {
        m.subgroups.mkString("-")
      }
      catch {
        case e: Exception =>
          val msg = s"Error occurred while generating the output file name from the blob path. \n Please check: \nregex='$blobPathRegex'\nblobPath='$blobPath'\nmatch='$m'"
          logger.error(msg, e)
          AppInsightLogger.trackException(e, Map(
            "errorLocation" -> "pathHintsFromBlobPath",
            "errorMessage" -> "Error occurred while generating the output file name from the blob path",
            "failedBlobPath" -> blobPath,
            "regex" -> blobPathRegex.toString()
          ), null)
          //null
          throw new EngineException(msg, e)
      }
      case None =>
        val msg = s"Error occurred while extracting the file name from the blob path. \n Please check: \nregex='$blobPathRegex'\nblobPath='$blobPath'"
        logger.error(msg)
        AppInsightLogger.trackException(new EngineException("Cannot find file name from blob path"), Map(
          "errorLocation" -> "pathHintsFromBlobPath",
          "errorMessage" -> "Error occurred while extracting the file name from the blob path",
          "failedBlobPath" -> blobPath,
          "regex" -> blobPathRegex.toString()
        ), null)
        //null
        throw new EngineException(msg)
    }
  }

  private def inputPathToInternalProps(inputFilePath: String,
                                       inputConf: BlobPointerInputConf,
                                       outputConf: BlobOutputConf,
                                       outputTimestamp: Timestamp) = {
    val sourceId = extractSourceId(inputFilePath, inputConf.sourceIdRegex)
    inputConf.sources.get(sourceId) match {
      case Some(source) =>
        val fileTime = extractTimeFromBlobPath(inputFilePath, inputConf.fileTimeRegex.r, inputConf.fileTimeFormat)
        val outputPartitionTime = if (outputTimestamp == null) fileTime else outputTimestamp
        FileInternal(inputPath = inputFilePath,
          outputFolders = outputConf.groups.map { case (k, v) =>
            k -> BlobSinker.generateOutputFolderPath(v.folder, outputPartitionTime, Some(source.target))
          },
          outputFileName = pathHintsFromBlobPath(inputFilePath, inputConf.blobPathRegex.r),
          fileTime = fileTime,
          ruleIndexPrefix = source.catalogPrefix.getOrElse(""),
          target = source.target
        )
      case None =>
        FileInternal(inputPath = inputFilePath)
    }
  }

  def pathsToGroups(rdd: RDD[String],
                    jobName: String,
                    dict: SettingDictionary,
                    outputTimestamp: Timestamp) = {
    val initialSet = mutable.HashSet.empty[FileInternal]
    val inputConf = BlobPointerInputSetting.getInputConfig(dict)
    val blobOutputConf = BlobOutputSetting.getDefaultBlobOutputConf(dict)
    rdd.map(s => {
      val propsFile = inputPathToInternalProps(s, inputConf, blobOutputConf, outputTimestamp)
      (if (propsFile.outputFolders == null || propsFile.outputFolders.isEmpty) null else jobName, propsFile)
    })
      .aggregateByKey(initialSet)(_ += _, _ ++ _) // drop duplicates
      .collect()
  }

  def filterPathGroups(groups: Array[(String, mutable.HashSet[FileInternal])]) = {
    groups.find(_._1 == null) match {
      case Some(invalidPaths) =>
        logger.warn("Found out-of-scope paths count=" + invalidPaths._2.size + ", First File=" + invalidPaths._2.head.inputPath)
        groups.filter(_._1 != null)
      case None =>
        groups
    }
  }
}
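
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): how the default
// storage-account regex used by extractSourceId above resolves a source id.
// The blob path below is a made-up example value.
// ---------------------------------------------------------------------
object SourceIdRegexSketch {
  def main(args: Array[String]): Unit = {
    val saRegex = """wasbs?://[\w-]+@([\w\d]+)\.blob.core.windows.net/.*""".r
    val path = "wasbs://events@myaccount.blob.core.windows.net/2019/01/01/00/blob1.json"
    // Capture group 1 is the storage account name.
    val sourceId = saRegex.findFirstMatchIn(path).map(_.group(1))
    println(sourceId) // prints Some(myaccount)
  }
}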
@@ -0,0 +1,83 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config.{SettingDictionary, SettingNamespace}
import datax.input.EventHubInputSetting.InputEventHubConf
import datax.input.StreamingInputSetting.StreamingConf
import org.apache.log4j.LogManager

import scala.collection.mutable

object BlobPointerInputSetting {
  case class InputSource(target: String, catalogPrefix: Option[String])
  case class BlobPointerInputConf(sources: Map[String, InputSource],
                                  eventhub: InputEventHubConf,
                                  streaming: StreamingConf,
                                  sourceIdRegex: String,
                                  eventNamePath: String,
                                  blobPathRegex: String,
                                  fileTimeRegex: String,
                                  fileTimeFormat: String)

  val NamespacePrefix = SettingNamespace.JobInputPrefix
  val SettingSourceIdRegex = "sourceidregex"
  val SettingEventNamePath = "eventnamepath"
  val SettingBlobPathRegex = "blobpathregex"
  val SettingFileTimeRegex = "filetimeregex"
  val SettingFileTimeFormat = "filetimeformat"

  val NamespaceSource = "source"
  val SettingInputSourceTarget = "target"
  val SettingInputSourceCatalogPrefix = "catalogprefix"

  private def buildInputSource(dict: SettingDictionary, name: String): InputSource = {
    InputSource(
      target = dict.getOrNull(SettingInputSourceTarget),
      catalogPrefix = dict.get(SettingInputSourceCatalogPrefix)
    )
  }

  def getInputConfig(dict: SettingDictionary): BlobPointerInputConf = {
    val logger = LogManager.getLogger(this.getClass)

    var sources: Map[String, InputSource] = null
    var eventhub: InputEventHubConf = null
    var streaming: StreamingConf = null
    var sourceIdRegex: String = null
    var eventNamePath: String = null
    var blobPathRegex: String = null
    var fileTimeRegex: String = null
    var fileTimeFormat: String = null

    dict.groupBySubNamespace(NamespacePrefix)
      .foreach { case (g, v) => {
        g match {
          case NamespaceSource => sources = v.buildConfigMap(buildInputSource)
          case EventHubInputSetting.NamespaceEventHub => eventhub = EventHubInputSetting.buildInputEventHubConf(v)
          case StreamingInputSetting.NamespaceStreaming => streaming = StreamingInputSetting.buildStreamingConf(v)
          case SettingSourceIdRegex => sourceIdRegex = v.getDefault().orNull
          case SettingEventNamePath => eventNamePath = v.getDefault().orNull
          case SettingBlobPathRegex => blobPathRegex = v.getDefault().orNull
          case SettingFileTimeRegex => fileTimeRegex = v.getDefault().orNull
          case SettingFileTimeFormat => fileTimeFormat = v.getDefault().orNull
          case "blobschemafile" =>
          case groupName: String =>
            logger.warn(s"Unsupported setting group '$groupName' under namespace '$NamespacePrefix': \n ${v.getDictMap().keys.mkString("\n")}")
        }
      }}

    BlobPointerInputConf(
      sources = sources,
      eventhub = eventhub,
      streaming = streaming,
      sourceIdRegex = sourceIdRegex,
      eventNamePath = eventNamePath,
      blobPathRegex = blobPathRegex,
      fileTimeRegex = fileTimeRegex,
      fileTimeFormat = fileTimeFormat
    )
  }
}
@@ -0,0 +1,56 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config.{SettingDictionary, SettingNamespace}
import org.apache.log4j.LogManager

object EventHubInputSetting {
  case class InputEventHubConf(connectionString: String,
                               consumerGroup: String,
                               checkpointDir: String,
                               checkpointInterval: String,
                               maxRate: String,
                               startEnqueueTime: Option[Long],
                               flushExistingCheckpoints: Option[Boolean],
                               repartition: Option[Int]
                              )

  val NamespaceEventHub = "eventhub"
  val NamespacePrefix = SettingNamespace.JobInputPrefix + NamespaceEventHub + SettingNamespace.Seperator
  val SettingConnectionString = "connectionstring"
  val SettingConsumerGroup = "consumergroup"
  val SettingCheckpointDir = "checkpointdir"
  val SettingCheckpointInterval = "checkpointinterval"
  val SettingMaxRate = "maxrate"
  val SettingStartEnqueueTime = "startenqueuetime"
  val SettingFlushExistingCheckpoints = "flushexistingcheckpoints"
  val SettingRepartition = "repartition"

  private val logger = LogManager.getLogger("EventHubInputSetting")

  def buildInputEventHubConf(dict: SettingDictionary): InputEventHubConf = {
    dict.get(SettingConnectionString) match {
      case Some(connectionString) =>
        InputEventHubConf(
          connectionString = connectionString,
          consumerGroup = dict.getString(SettingConsumerGroup),
          checkpointDir = dict.getString(SettingCheckpointDir),
          checkpointInterval = dict.getString(SettingCheckpointInterval),
          maxRate = dict.getString(SettingMaxRate),
          startEnqueueTime = dict.getLongOption(SettingStartEnqueueTime),
          flushExistingCheckpoints = dict.getBooleanOption(SettingFlushExistingCheckpoints),
          repartition = dict.getIntOption(SettingRepartition)
        )
      case None =>
        null
    }
  }

  def getInputEventHubConf(dict: SettingDictionary): InputEventHubConf = {
    logger.warn("EventHub NamespacePrefix=" + NamespacePrefix)
    buildInputEventHubConf(dict.getSubDictionary(NamespacePrefix))
  }
}
@@ -0,0 +1,135 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import java.sql.Timestamp
import java.time.Instant

import com.microsoft.azure.eventhubs.EventData
import datax.checkpoint.EventhubCheckpointer
import datax.config.UnifiedConfig
import datax.constants.ProductConstant
import datax.exception.EngineException
import datax.input.EventHubInputSetting.InputEventHubConf
import datax.processor.EventHubStreamingProcessor
import datax.securedsetting.KeyVaultClient
import datax.telemetry.AppInsightLogger
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.eventhubs.{EventHubsConf, EventHubsUtils, EventPosition}
import org.apache.spark.eventhubs.rdd.HasOffsetRanges
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Time}

object EventHubStreamingFactory {
  def getEventHubConf(eventhubInput: InputEventHubConf) = {
    val logger = LogManager.getLogger("EventHubConfBuilder")

    val connectionString = KeyVaultClient.resolveSecretIfAny(eventhubInput.connectionString)
    if (connectionString == null || connectionString.isEmpty) {
      val errMsg = "Connection string is empty for the eventhub input"
      logger.error(errMsg)
      throw new EngineException(errMsg)
    }

    val checkpointDir = eventhubInput.checkpointDir
    val consumerGroup = eventhubInput.consumerGroup

    logger.warn("eventhub checkpointDir=" + checkpointDir)
    logger.warn("eventhub consumerGroup=" + consumerGroup)

    val ehConf = EventHubsConf(connectionString = connectionString)
      .setConsumerGroup(consumerGroup)
      .setMaxRatePerPartition(eventhubInput.maxRate.toInt)
      .setReceiverTimeout(java.time.Duration.ofSeconds(60))
      .setOperationTimeout(java.time.Duration.ofSeconds(120))

    eventhubInput.startEnqueueTime match {
      case Some(startEnqueueTimeInSeconds) =>
        if (startEnqueueTimeInSeconds < 0) {
          // A negative value is treated as an offset in seconds relative to now.
          val startEnqueueTime = Instant.now.plusSeconds(startEnqueueTimeInSeconds)
          ehConf.setStartingPosition(EventPosition.fromEnqueuedTime(startEnqueueTime))
          logger.warn(s"eventhub startEnqueueTime from config:${startEnqueueTimeInSeconds}, passing startEnqueueTime=$startEnqueueTime")
        }
        else if (startEnqueueTimeInSeconds > 0) {
          // A positive value is treated as an absolute epoch time in seconds.
          val startEnqueueTime = Instant.ofEpochSecond(startEnqueueTimeInSeconds)
          ehConf.setStartingPosition(EventPosition.fromEnqueuedTime(startEnqueueTime))
          logger.warn(s"eventhub startEnqueueTime from config:${startEnqueueTimeInSeconds}, passing startEnqueueTime=$startEnqueueTime")
        }
        else {
          ehConf.setStartingPosition(EventPosition.fromStartOfStream)
        }
      case None =>
        ehConf.setStartingPosition(EventPosition.fromEndOfStream)
    }

    ehConf
  }

  def getStream(streamingContext: StreamingContext,
                eventhubInput: InputEventHubConf,
                foreachRDDHandler: (RDD[EventData], Time) => Unit
               ) = {
    ///////////////////////////////////////////////////////////////
    // Create direct stream from EventHub
    ///////////////////////////////////////////////////////////////
    val preparationLogger = LogManager.getLogger("PrepareEventHubDirectStream")
    val checkpointDir = eventhubInput.checkpointDir
    val ehConf = getEventHubConf(eventhubInput)
    if (eventhubInput.flushExistingCheckpoints.getOrElse(false))
      preparationLogger.warn("Flush the existing checkpoints according to configuration")
    else
      EventhubCheckpointer.applyCheckpointsIfExists(ehConf, checkpointDir)

    val checkpointIntervalInMilliseconds = eventhubInput.checkpointInterval.toLong * 1000
    EventHubsUtils.createDirectStream(streamingContext, ehConf)
      //.persist()
      //.window(org.apache.spark.streaming.Duration.10))
      .foreachRDD((rdd, time) => {
        AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/streaming/batch/begin", Map("batchTime" -> time.toString), null)
        val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        val batchTime = new Timestamp(time.milliseconds)
        val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
        val streamLogger = LogManager.getLogger(s"CheckOffsets-${batchTimeStr}")

        streamLogger.warn(s"Processing offsets: \n" +
          offsetRanges.map(offset => s"${offset.name}-${offset.partitionId.toString}: from=${offset.fromSeqNo}, until=${offset.untilSeqNo}").mkString("\n"))

        try {
          foreachRDDHandler(rdd, time)
        }
        catch {
          case e: Exception =>
            AppInsightLogger.trackException(e,
              Map("batchTime" -> time.toString()),
              offsetRanges.map(offset => s"${offset.name}-${offset.partitionId.toString}-fromSeqNo" -> offset.fromSeqNo.toDouble).toMap)
            throw e
        }

        if (time.isMultipleOf(org.apache.spark.streaming.Duration(checkpointIntervalInMilliseconds))) {
          streamLogger.info(s"Start writing eventhub checkpoints to ${checkpointDir}")
          val conf = rdd.sparkContext.hadoopConfiguration
          EventhubCheckpointer.writeOffsetsToCheckpoints(checkpointDir, offsetRanges.map(r => (time.milliseconds, r.nameAndPartition.ehName, r.nameAndPartition.partitionId, r.fromSeqNo, r.untilSeqNo)), conf)
          streamLogger.warn(s"Done writing eventhub checkpoints to ${checkpointDir}")
        }

        AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/streaming/batch/end", Map("batchTime" -> time.toString), null)
      })
  }

  @volatile private var instance: EventHubStreamingProcessor = null
  def getOrCreateProcessor(config: UnifiedConfig,
                           generator: UnifiedConfig => EventHubStreamingProcessor) = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = generator(config)
        }
      }
    }

    instance
  }
}
@@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config._

object InputManager {
  val NamespacePrefix = SettingNamespace.JobInputPrefix
}
@@ -0,0 +1,76 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import java.sql.Timestamp
import java.time.Instant

import com.microsoft.azure.eventhubs.EventData
import datax.checkpoint.EventhubCheckpointer
import datax.config.UnifiedConfig
import datax.constants.ProductConstant
import datax.exception.EngineException
import datax.input.EventHubInputSetting.InputEventHubConf
import datax.processor.EventHubStreamingProcessor
import datax.securedsetting.KeyVaultClient
import datax.telemetry.AppInsightLogger
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.eventhubs.{EventHubsConf, EventHubsUtils, EventPosition}
import org.apache.spark.eventhubs.rdd.HasOffsetRanges
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.streaming.{StreamingContext, Time}

// Factory class for streaming local events
object LocalStreamingFactory {

  def getStream(streamingContext: StreamingContext,
                inputSchema: DataType,
                foreachRDDHandler: (RDD[EventData], Time) => Unit
               ) = {

    val preparationLogger = LogManager.getLogger("PrepareLocalDirectStream")
    ///////////////////////////////////////////////////////////////
    // Create direct stream from custom receiver
    ///////////////////////////////////////////////////////////////
    streamingContext.receiverStream(new LocalStreamingSource(inputSchema))
      .foreachRDD((rdd, time) => {
        AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/localstreaming/batch/begin", Map("batchTime" -> time.toString), null)
        val batchTime = new Timestamp(time.milliseconds)
        val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
        val streamLogger = LogManager.getLogger(s"CheckOffsets-${batchTimeStr}")

        try {
          foreachRDDHandler(rdd, time)
        }
        catch {
          case e: Exception =>
            AppInsightLogger.trackException(e,
              Map("batchTime" -> time.toString()),
              Map("batchMetric" -> 1))
            throw e
        }

        AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/localstreaming/batch/end", Map("batchTime" -> time.toString), null)
      })
  }

  @volatile private var instance: EventHubStreamingProcessor = null
  def getOrCreateProcessor(config: UnifiedConfig,
                           generator: UnifiedConfig => EventHubStreamingProcessor) = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = generator(config)
        }
      }
    }

    instance
  }
}
@@ -0,0 +1,40 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import java.nio.charset.Charset

import com.microsoft.azure.eventhubs.EventData
import org.apache.log4j.LogManager
import org.apache.spark.storage._
import org.apache.spark.streaming.receiver._
import org.apache.spark.sql.types.{DataType, StructType}
import datax.utility

/** This is a test receiver that generates data. */
class LocalStreamingSource(inputSchema: DataType) extends Receiver[EventData](StorageLevel.MEMORY_AND_DISK_2) {

  /** Start the thread that receives data over a connection */
  def onStart() {
    new Thread("Local Data Source") { override def run() { receive() } }.start()
  }

  def onStop() { }

  /** Periodically generate random data based on given schema */
  private def receive() {
    val logger = LogManager.getLogger("LocalStreamingSource")

    while (!isStopped()) {
      val jsonStr = DataGenerator.getRandomJson(inputSchema)
      logger.warn("Generated json=" + jsonStr)
      val eventData = EventData.create(jsonStr.getBytes(Charset.defaultCharset()))
      store(Iterator(eventData))
      Thread.sleep(500)
    }
  }
}
@@ -0,0 +1,60 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config.{SettingDictionary, SettingNamespace}
import datax.securedsetting.KeyVaultClient

object RouterSetting {
  val NamespacePrefix = SettingNamespace.JobPrefix + "router."
  val NamespaceFilterJobPrefix = NamespacePrefix + "job."
  val NamespaceFilterPrefix = "filter."

  case class FilterOutput(compressionType: String, eventhub: String, folder: String, format: String, outputType: String)
  case class SPFilter(sourceIdRegex: String, mappingOperations: Map[String, String], filterCondition: String, filterType: String, jobName: String, coalescingRatio: Double, numPartitions: String, output: FilterOutput)

  def buildFilterOutput(dict: SettingDictionary) = {
    FilterOutput(
      compressionType = dict.getOrNull("compressiontype"),
      eventhub = KeyVaultClient.resolveSecretIfAny(dict.getOrNull("eventhub")),
      folder = dict.getOrNull("folder"),
      format = dict.getOrNull("format"),
      outputType = dict.getOrNull("outputType")
    )
  }

  def buildMappingOperations(s: Option[String]) = {
    s match {
      // Each mapping is expected in the form "key=value", separated by ';'.
      case Some(str) => str.split(";").map(p => {
        val parts = p.split("=", 2)
        if (parts.length == 2)
          parts(0).trim() -> parts(1).trim()
        else
          parts(0).trim() -> null
      }).toMap

      case None => null
    }
  }

  def buildFilterJob(dict: SettingDictionary, name: String) = {
    SPFilter(
      sourceIdRegex = dict.getOrNull("sourceidregex"),
      mappingOperations = buildMappingOperations(dict.get("mappingoperations")),
      filterCondition = dict.getOrNull("filterCondition"),
      filterType = dict.getOrNull("filterType"),
      jobName = name,
      coalescingRatio = dict.getDouble("coalescingRatio"),
      numPartitions = dict.getOrNull("numPartitions"),
      output = buildFilterOutput(dict.getSubDictionary(NamespaceFilterPrefix))
    )
  }

  def getFiltersConfig(dict: SettingDictionary) = {
    dict.buildConfigIterable(buildFilterJob, NamespaceFilterJobPrefix).toSeq
  }
}
@@ -0,0 +1,35 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config.{ConfigManager, SettingDictionary, SettingNamespace}
import datax.fs.HadoopClient
import datax.securedsetting.KeyVaultClient
import org.apache.log4j.LogManager
import org.apache.spark.sql.types.{DataType, StructType}

import scala.concurrent.{ExecutionContext, Future}

object SchemaFile {
  val SettingSchemaFile = "blobschemafile"

  private def getInputBlobSchemaFilePath(dict: SettingDictionary) = {
    dict.getOrNull(SettingNamespace.JobInputPrefix + SettingSchemaFile)
  }

  private def loadRawBlobSchema(blobSchemaFile: String) = {
    // Schema of VS block extraction data
    val schemaJsonString = HadoopClient.readHdfsFile(blobSchemaFile).mkString("")
    DataType.fromJson(schemaJsonString)
  }

  def loadInputSchema(dict: SettingDictionary) = {
    val file = getInputBlobSchemaFilePath(dict)
    val logger = LogManager.getLogger(this.getClass)
    logger.warn(s"Load input schema from '${file}'")
    val filePath = KeyVaultClient.resolveSecretIfAny(file)
    loadRawBlobSchema(filePath)
  }
}
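
// ---------------------------------------------------------------------
// Illustrative sketch (not part of the original file): the kind of schema
// JSON that loadRawBlobSchema above feeds into DataType.fromJson. The field
// names are assumptions made for this example only.
// ---------------------------------------------------------------------
object SchemaJsonSketch {
  import org.apache.spark.sql.types.DataType

  def main(args: Array[String]): Unit = {
    val schemaJson =
      """{"type":"struct","fields":[
        |  {"name":"deviceId","type":"string","nullable":true,"metadata":{}},
        |  {"name":"reading","type":"double","nullable":true,"metadata":{}}
        |]}""".stripMargin
    val schema = DataType.fromJson(schemaJson)
    println(schema.prettyJson)
  }
}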
@@ -0,0 +1,26 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input

import datax.config._
import org.apache.log4j.LogManager

object StreamingInputSetting {
  case class StreamingConf(checkpointDir: String, intervalInSeconds: Long)

  val NamespaceStreaming = "streaming"
  val NamespacePrefix = SettingNamespace.JobInputPrefix + NamespaceStreaming + "."

  def buildStreamingConf(dict: SettingDictionary): StreamingConf = {
    StreamingConf(
      checkpointDir = dict.getOrNull("checkpointdir"),
      intervalInSeconds = dict.getOrElse("intervalinseconds", "0").toLong
    )
  }

  def getStreamingInputConf(dict: SettingDictionary): StreamingConf = {
    buildStreamingConf(dict.getSubDictionary(NamespacePrefix))
  }
}
@@ -0,0 +1,24 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor

import java.sql.Timestamp

import com.microsoft.azure.eventhubs.EventData
import datax.input.BlobPointerInput
import datax.utility.DateTimeUtil
import org.apache.spark.rdd.RDD

import scala.concurrent.duration.Duration

class BlobPointerProcessor(processPaths: (RDD[String], Timestamp, Duration, Timestamp, String) => Map[String, Double])
  extends EventHubStreamingProcessor {
  val processPathsRDD = processPaths

  override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
    val currentTime = DateTimeUtil.getCurrentTime()
    processPaths(rdd.map(BlobPointerInput.parseBlobPath), batchTime, batchInterval, currentTime, "")
  }
}
@@ -0,0 +1,26 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor

import java.sql.Timestamp

import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery

import scala.concurrent.duration.Duration

case class CommonProcessor(processJson: (RDD[String], Timestamp, Duration, Timestamp) => Map[String, Double],
                           processEventHubDataFrame: (DataFrame) => Map[String, StreamingQuery],
                           processEventData: (RDD[EventData], Timestamp, Duration, Timestamp) => Map[String, Double],
                           processPaths: (RDD[String], Timestamp, Duration, Timestamp, String) => Map[String, Double]) {

  def asBlobPointerProcessor() = new BlobPointerProcessor(processPaths = this.processPaths)
  def asJsonProcessor() = new JsonProcessor(processJson = this.processJson)
  def asDirectProcessor() = new DirectProcessor(processEventData = this.processEventData)
  def asStructuredStreamingProcessor = new EventHubStructuredStreamingProcessor(processDataFrame = this.processEventHubDataFrame)
  def asDirectLocalProcessor() = new DirectLocalProcessor(processEventData = this.processEventData)
}
|
|
@ -0,0 +1,569 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import java.sql.Timestamp
|
||||
import java.util.concurrent.Executors
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
import datax.config._
|
||||
import datax.constants.{ColumnName, DatasetName, FeatureName, ProcessingPropertyName, ProductConstant}
|
||||
import datax.data.FileInternal
|
||||
import datax.exception.EngineException
|
||||
import datax.fs.HadoopClient
|
||||
import datax.host.{AppHost, CommonAppHost, SparkSessionSingleton, UdfInitializer}
|
||||
import datax.input.{BlobPointerInput, InputManager, SchemaFile, StreamingInputSetting}
|
||||
import datax.sink.{OutputManager, OutputOperator}
|
||||
import datax.telemetry.{AppInsightLogger, MetricLoggerFactory}
|
||||
import datax.utility._
|
||||
import datax.handler._
|
||||
import datax.sql.TransformSQLParser
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
|
||||
import org.apache.spark.sql.functions._
|
||||
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
|
||||
import org.apache.spark.storage.StorageLevel
|
||||
|
||||
import scala.language.{postfixOps, reflectiveCalls}
|
||||
import scala.collection.mutable.{HashMap, HashSet, ListBuffer}
|
||||
import scala.concurrent.{Await, ExecutionContext, Future}
|
||||
import scala.concurrent.duration._
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
/*
|
||||
Generate the common processor
|
||||
*/
|
||||
object CommonProcessorFactory {
|
||||
private val threadPool = Executors.newFixedThreadPool(8)
|
||||
implicit private val ec = ExecutionContext.fromExecutorService(threadPool)
|
||||
val appHost = CommonAppHost
|
||||
|
||||
/*
|
||||
Create the processor based on input config, initialize all functions to be used in streaming iterations
|
||||
*/
|
||||
def createProcessor(config: UnifiedConfig):CommonProcessor = {
|
||||
val sparkConf = config.sparkConf
|
||||
val spark = appHost.getSpark(sparkConf)
|
||||
|
||||
val dict = config.dict
|
||||
import spark.implicits._
|
||||
|
||||
// Load and initialize functions in parallel to be used in streaming iterations.
|
||||
val loadings = Tuple5(
|
||||
Future{SchemaFile.loadInputSchema(dict)},
|
||||
ProjectionHandler.loadProjectionsFuture(dict),
|
||||
TransformHandler.loadTransformFuture(dict),
|
||||
ReferenceDataHandler.loadReferenceDataFuture(spark, dict),
|
||||
Future {ExtendedUDFHandler.initialize(spark, dict)}
|
||||
)
|
||||
|
||||
val (rawBlockSchema, projections, sqlParsed, referencedDataLoaded, udfs) =
|
||||
Await.result(for {
|
||||
r1 <- loadings._1
|
||||
r2 <- loadings._2
|
||||
r3 <- loadings._3
|
||||
r4 <- loadings._4
|
||||
r5 <- loadings._5
|
||||
} yield (r1, r2, r3, r4, r5), 10 minutes)
|
||||
|
||||
BuiltInFunctionsHandler.initialize(spark, dict)
|
||||
ProjectionHandler.validateProjections(projections)
|
||||
JarUDFHandler.loadJarUdf(spark, dict)
|
||||
AzureFunctionHandler.initialize(spark, dict)
|
||||
UdfInitializer.initialize(spark, dict)
|
||||
val createdTables = StateTableHandler.createTables(spark, dict)
|
||||
val inputNormalizer = InputNormalizerHandler.initialize(spark, dict)
|
||||
val inputNormalizerUdf = if(inputNormalizer==null) udf((s:String)=>s) else udf(inputNormalizer)
|
||||
val preProjection = PreProjectionHandler.initialize(spark, dict)
|
||||
val buildPropertiesUdf = PropertiesHandler.initialize(spark, dict)
|
||||
|
||||
/*
|
||||
function parse input string into a Raw object column based on the input raw blob schema and also project the data frame
|
||||
based on the columns from projection files
|
||||
*/
|
||||
def project(inputDf: DataFrame, batchTime: Timestamp): DataFrame = {
|
||||
// Initial schema and data set
|
||||
var df = inputDf
|
||||
.withColumn(ColumnName.RawObjectColumn, from_json(inputNormalizerUdf(col(ColumnName.RawObjectColumn)), rawBlockSchema))
|
||||
|
||||
df = if(preProjection==null)df else preProjection(df)
|
||||
val preservedColumns = df.schema.fieldNames.filter(_.startsWith(ColumnName.InternalColumnPrefix))
|
||||
df = df.withColumn(ColumnName.PropertiesColumn, buildPropertiesUdf(col(ColumnName.InternalColumnFileInfo), lit(batchTime)))
|
||||
for(step <- 0 until projections.length)
|
||||
df = df.selectExpr(projections(step)++preservedColumns: _*)
|
||||
|
||||
df
|
||||
}
|
||||
|
||||
// initialize metrics settings
|
||||
val metricAppName = dict.getMetricAppName()
|
||||
val metricConf = MetricsHandler.getMetricsSinkConf(dict)
|
||||
|
||||
// figure out how to output
|
||||
val outputs = OutputManager.getOperatators(dict)
|
||||
val outputCount = outputs.size
|
||||
|
||||
// initialize output handlers
|
||||
outputs.par.foreach(o => {
|
||||
if (o.onInitialization != null)
|
||||
o.onInitialization(spark)
|
||||
})
|
||||
|
||||
// initialize settings of time windows
|
||||
val timewindows = TimeWindowHandler.initialize(spark, dict)
|
||||
|
||||
// store past RDDs for overlapping time windows
|
||||
val pastRdds = new HashMap[Timestamp, RDD[Row]]()
|
||||
|
||||
// store names of data frames for outputting
|
||||
val dataframes = new HashMap[String, DataFrame]
|
||||
|
||||
/*
|
||||
function to execute the queries for transforming data frames and output them accordingly
|
||||
*/
|
||||
def route(projectedDf: DataFrame, batchTime: Timestamp, batchInterval: Duration, outputPartitionTime: Timestamp, targets: Set[String], tableNamespace:String) = {
|
||||
val transformLogger = LogManager.getLogger("Transformer")
|
||||
|
||||
// store the mapping information between table names in the query and the actual data frame name we are processing
|
||||
val tableNames = new HashMap[String, String]
|
||||
|
||||
// register the start input table for the query
|
||||
val initTableName = DatasetName.DataStreamProjection+tableNamespace
|
||||
tableNames(DatasetName.DataStreamProjection) = initTableName
|
||||
dataframes += initTableName -> projectedDf
|
||||
|
||||
// store the data frames that we should unpersist after one iteration
|
||||
val dataFramesToUncache = new ListBuffer[DataFrame]
|
||||
transformLogger.warn("Persisting the current projected dataframe")
|
||||
projectedDf.persist()
|
||||
dataFramesToUncache += projectedDf
|
||||
|
||||
// store the metrics to send after one iteration
|
||||
val outputMetrics = HashMap[String, Double]()
|
||||
|
||||
// log metric of how many events are incoming on this iteration
|
||||
val inputRawEventsCount = projectedDf.count()
|
||||
transformLogger.warn(s"Received $inputRawEventsCount raw input events")
|
||||
outputMetrics += s"Input_Normalized_Events_Count" -> inputRawEventsCount
|
||||
|
||||
if(timewindows.isEnabled){
|
||||
// when time window is turned on, we need to calculate the start and end time of the window and query against
|
||||
// the past RDD from history
|
||||
// we calculate the time window as the maximum time span from the time windows passed-in from settings
|
||||
// this one below determines the end time as event filter, note to minus the watermark span which is the buffer for events to finalize
|
||||
val windowEndTime = Timestamp.from(batchTime.toInstant.minusSeconds(timewindows.watermark.toSeconds))
|
||||
// this one below determines the start time filter, which is maximum start time of all time windows
|
||||
val windowStartTime = Timestamp.from(windowEndTime.toInstant.minusSeconds(timewindows.maxWindow.toSeconds))
|
||||
val rdd = projectedDf.where(timewindows.timestampColumn>=windowEndTime).rdd
|
||||
|
||||
transformLogger.warn("Persisting the windowed projected data frame")
|
||||
rdd.persist(StorageLevel.MEMORY_ONLY_SER)
|
||||
|
||||
// log metric of after filtered by the time window, how many events actually are participating in the transformation
|
||||
val inputEventsCount = rdd.mapPartitions(it=>{
|
||||
val loggerSuffix = SparkEnvVariables.getLoggerSuffix()
|
||||
val instrumentLogger = LogManager.getLogger(s"${ProductConstant.ProductInstrumentLogger}$loggerSuffix")
|
||||
val t1 = System.nanoTime()
|
||||
instrumentLogger.warn(s"Start collecting events at $t1")
|
||||
val count = it.toArray.length
|
||||
val timeNow = System.nanoTime()
|
||||
instrumentLogger.warn(s"Collected $count events for caching, spent time=${(timeNow-t1)/1E9} seconds")
|
||||
Iterator.single(count)
|
||||
}).reduce(_+_)
|
||||
|
||||
transformLogger.warn(s"Received $inputEventsCount input events for ${initTableName}")
|
||||
outputMetrics += s"Input_${DatasetName.DataStreamProjection}_Events_Count" -> inputEventsCount
|
||||
|
||||
// collect data from past RDDs that fits in the time window
|
||||
val cutTime = Timestamp.from(batchTime.toInstant.minusSeconds((timewindows.watermark+timewindows.maxWindow).toSeconds))
|
||||
pastRdds.keys.filter(_.compareTo(cutTime)<=0).foreach(k=>{
|
||||
pastRdds.remove(k) match {
|
||||
case Some(rdd) =>
|
||||
transformLogger.warn(s"removing past RDD at ${k} since it is before or equal to ${cutTime}")
|
||||
rdd.unpersist(false)
|
||||
case None =>
|
||||
transformLogger.warn(s"Unexpectedly ${k} does exist in the pastRDDs")
|
||||
}
|
||||
})
|
||||
|
||||
// union the data from current projected data frame and the past ones
|
||||
val sc = rdd.sparkContext
|
||||
val pastDataUnion = spark.createDataFrame(if(pastRdds.size>1){
|
||||
transformLogger.warn(s"union ${pastRdds.size} batches, including ${pastRdds.keySet.mkString(",")}")
|
||||
sc.union(rdd, pastRdds.values.toSeq: _*)
|
||||
} else rdd, projectedDf.schema)
|
||||
|
||||
val unionTableNameInSql = DatasetName.DataStreamProjectionWithWindow
|
||||
val unionTableName = unionTableNameInSql+tableNamespace
|
||||
pastDataUnion
|
||||
.where(timewindows.timestampColumn>=windowStartTime && timewindows.timestampColumn<windowEndTime)
|
||||
.createOrReplaceTempView(unionTableName)
|
||||
tableNames(unionTableNameInSql) = unionTableName
|
||||
dataframes(unionTableName)=spark.table(unionTableName)
|
||||
|
||||
// register time-windowed tables and their corresponding data frames for different time window spec
|
||||
for (tw <- timewindows.windows) {
|
||||
val winTableName = tw._1
|
||||
val winTableNameInScope = winTableName + tableNamespace
|
||||
val winStartTime = Timestamp.from(windowEndTime.toInstant.minusSeconds(tw._2.toSeconds))
|
||||
transformLogger.warn(s"Create or replace time windowed view '${winTableNameInScope}' within window('$winStartTime' - '$windowEndTime')")
|
||||
pastDataUnion
|
||||
.where(timewindows.timestampColumn>=winStartTime && timewindows.timestampColumn<windowEndTime)
|
||||
.createOrReplaceTempView(winTableNameInScope)
|
||||
tableNames(winTableName) = winTableNameInScope
|
||||
dataframes(winTableNameInScope)=spark.table(winTableNameInScope)
|
||||
}
|
||||
|
||||
// replace the starting table
|
||||
val adjustedBatchStartTime = Timestamp.from(windowEndTime.toInstant.minusSeconds(batchInterval.toSeconds))
|
||||
val cachedProjectedDf = pastDataUnion.where(timewindows.timestampColumn>=adjustedBatchStartTime && timewindows.timestampColumn<windowEndTime)
|
||||
cachedProjectedDf.createOrReplaceTempView(initTableName)
|
||||
|
||||
// register a table to reference to the projected data frame within only the current iteration batch
|
||||
val batchedTableName = DatasetName.DataStreamProjectionBatch + tableNamespace
|
||||
projectedDf.createOrReplaceTempView(batchedTableName)
|
||||
tableNames(DatasetName.DataStreamProjectionBatch) = batchedTableName
|
||||
dataframes(batchedTableName)=projectedDf
|
||||
|
||||
pastRdds(batchTime) = rdd
|
||||
}
|
||||
else{
|
||||
// if time window is not turned on, we simply register the projected data frame as input starting table for query
|
||||
outputMetrics += s"Input_${DatasetName.DataStreamProjection}_Events_Count" -> inputRawEventsCount
|
||||
projectedDf.createOrReplaceTempView(initTableName)
|
||||
}
|
||||
|
||||
// register state-store tables
|
||||
for (elem <- createdTables) {
|
||||
tableNames(elem._1)=elem._2.getActiveTableName()
|
||||
}
|
||||
|
||||
// start executing queries
|
||||
if(sqlParsed!=null && sqlParsed.commands.length>0){
|
||||
val partitionNumber = projectedDf.rdd.getNumPartitions
|
||||
val queries = sqlParsed.commands
|
||||
queries.foreach(expr=>{
|
||||
val statement = TransformSQLParser.replaceTableNames(expr.text, tableNames)
|
||||
expr.commandType match {
|
||||
case TransformSQLParser.CommandType_Command =>
|
||||
transformLogger.warn(s"Executing command '$statement'")
|
||||
spark.sql(statement)
|
||||
case TransformSQLParser.CommandType_Query =>
|
||||
createdTables.find(_._1 == expr.name) match {
|
||||
case Some(t) =>
|
||||
// this case is a query statement assigns data set back to a registered state-store table
|
||||
// so we have to overwrite the existing state-store table with the new data
|
||||
t._2.overwrite(statement)
|
||||
tableNames(t._1) = t._2.flip()
|
||||
case None =>
|
||||
// this is a normal case that we borther to handle state-store tables
|
||||
val tableName = expr.name + tableNamespace
|
||||
transformLogger.warn(s"Creating view '$tableName' for '$statement'")
|
||||
|
||||
val ds = if(partitionNumber > 0) {
|
||||
spark.sql(statement).coalesce(partitionNumber)
|
||||
}
|
||||
else {
|
||||
transformLogger.warn(s"Zero events found for $tableName' for '$statement'")
|
||||
spark.sql(statement)
|
||||
}
|
||||
|
||||
tableNames(expr.name) = tableName
|
||||
dataframes(tableName) = ds
|
||||
|
||||
// cache data frames that are referenced more than once to improve performance
|
||||
if(TransformHandler.shouldCacheCommonViews(dict) && sqlParsed.viewReferenceCount.getOrElse(expr.name, 0)>1){
|
||||
transformLogger.warn(s"Caching view '$tableName' for it would be used more than once")
|
||||
ds.cache()
|
||||
dataFramesToUncache += ds
|
||||
}
|
||||
|
||||
ds.createOrReplaceTempView(tableName)
|
||||
}
|
||||
case _ =>
|
||||
throw new EngineException(s"unknown commandType : ${expr.commandType}")
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// start outputting data
|
||||
def outputHandler(operator: OutputOperator) = {
|
||||
val tableName = operator.name
|
||||
val outputTableName = tableName+tableNamespace
|
||||
dataframes.get(outputTableName) match {
|
||||
case None => throw new EngineException(s"could not find data set name '$outputTableName' for output '${operator.name}'")
|
||||
case Some(df) =>
|
||||
if (operator.onBatch != null) operator.onBatch(df.sparkSession, outputPartitionTime, targets)
|
||||
operator.output(df, outputPartitionTime).map { case (k, v) => (s"Output_${operator.name}_" + k) -> v.toDouble }
|
||||
}
|
||||
}
|
||||
|
||||
var result = Map.empty[String,Double]
|
||||
if(outputCount>0) {
|
||||
// if there are multiple outputs, we kick them off in parallel
|
||||
result = if (outputCount > 1)
|
||||
outputs.par.map(outputHandler).reduce(_ ++ _)
|
||||
else
|
||||
outputHandler(outputs(0))
|
||||
}
|
||||
|
||||
// persisting state-store tables
|
||||
for (elem <- createdTables) {
|
||||
elem._2.persist()
|
||||
}
|
||||
|
||||
// clear cache of the data frames in this batch of iteration
|
||||
transformLogger.warn("Un-persisting the dataframes")
|
||||
dataFramesToUncache.foreach(_.unpersist(false))
|
||||
dataFramesToUncache.clear()
|
||||
|
||||
outputMetrics ++ result
|
||||
}
|
||||
|
||||
/*
|
||||
function to process the unified data frame, which has 4 columns: the raw string input, Properties, SystemProperties and an internal metadata column used during processing
|
||||
*/
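// for reference, the EventHub path below builds this frame via toDF(ColumnName.RawObjectColumn, "Properties", "SystemProperties", ColumnName.InternalColumnFileInfo);
// the other callers (processJson, processPaths) pass their own column layouts, as seen further below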
|
||||
def processDataset(data: DataFrame,
|
||||
batchTime: Timestamp,
|
||||
batchInterval: Duration,
|
||||
outputPartitionTime: Timestamp,
|
||||
targets: Set[String],
|
||||
namespace: String):Map[String, Double] = {
|
||||
val t1 = System.nanoTime()
|
||||
val batchLogger = LogManager.getLogger(ProductConstant.DataStreamProcessDataSetLogger)
|
||||
val metricLogger = MetricLoggerFactory.getMetricLogger(metricAppName, metricConf)
|
||||
val spark = data.sparkSession
|
||||
|
||||
def postMetrics(metrics: Iterable[(String, Double)]): Unit = {
|
||||
batchLogger.warn(s"Sending metrics:\n${metrics.map(m => m._1 + " -> " + m._2).mkString("\n")}")
|
||||
metricLogger.sendBatchMetrics(metrics, batchTime.getTime)
|
||||
}
|
||||
|
||||
try{
|
||||
// call ExtendedUDFs to refresh their data
|
||||
udfs.foreach(udf=>{
|
||||
if(udf._2!=null)udf._2(spark, batchTime)
|
||||
})
|
||||
|
||||
// if the raw input is specified as one of the outputs in the output settings, cache it and register it so it can be emitted
|
||||
val persistRaw = outputs.find(p=>p.name==DatasetName.DataStreamRaw).isDefined
|
||||
if(persistRaw){
|
||||
data.cache()
|
||||
dataframes(DatasetName.DataStreamRaw) = data
|
||||
}
|
||||
|
||||
// main processing steps
|
||||
val baseProjection = project(data, batchTime)
|
||||
val counts = route(baseProjection, batchTime, batchInterval, outputPartitionTime, targets, namespace)
|
||||
|
||||
// clear the cache of raw input table if needed.
|
||||
if(persistRaw){
|
||||
data.unpersist(false)
|
||||
}
|
||||
|
||||
// calculate performance metrics
|
||||
val partitionProcessedTime = System.nanoTime
|
||||
val latencyInSeconds = (DateTimeUtil.getCurrentTime().getTime - batchTime.getTime)/1000D
|
||||
val metrics = Map[String, Double](
|
||||
"Latency-Process" -> (partitionProcessedTime - t1) / 1E9,
|
||||
"Latency-Batch" -> latencyInSeconds
|
||||
) ++ counts
|
||||
|
||||
postMetrics(metrics)
|
||||
metrics
|
||||
}
|
||||
catch{
|
||||
case e: Exception =>
|
||||
appHost.getTelemetryService().trackEvent(ProductConstant.ProductRoot + "/error", Map(
|
||||
"errorLocation" -> "ProcessDataFrame",
|
||||
"errorMessage" -> e.getMessage,
|
||||
"errorStackTrace" -> e.getStackTrace.take(10).mkString("\n"),
|
||||
"batchTime" -> batchTime.toString
|
||||
), null)
|
||||
appHost.getTelemetryService().trackException(e, Map(
|
||||
"errorLocation" -> "ProcessDataFrame",
|
||||
"errorMessage" -> e.getMessage,
|
||||
"batchTime" -> batchTime.toString
|
||||
), null)
|
||||
|
||||
Thread.sleep(1000)
|
||||
throw e
|
||||
}
|
||||
}
|
||||
|
||||
CommonProcessor(
|
||||
/*
|
||||
process a batch of EventData from EventHub
|
||||
*/
|
||||
processEventData = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration, outputPartitionTime: Timestamp) =>{
|
||||
processDataset(rdd
|
||||
.map(d=>{
|
||||
val bodyBytes = d.getBytes
|
||||
if(bodyBytes==null) throw new EngineException(s"null bytes from event: ${d.getObject}, properties:${d.getProperties}, systemProperties:${d.getSystemProperties}")
|
||||
(
|
||||
new String(bodyBytes),
|
||||
d.getProperties.asScala.map{case(k,v)=>k->v.toString},
|
||||
if(d.getSystemProperties!=null) d.getSystemProperties.asScala.map{case(k,v)=>k->v.toString} else Map.empty[String, String],
|
||||
FileInternal())
|
||||
})
|
||||
.toDF(
|
||||
ColumnName.RawObjectColumn,
|
||||
"Properties",
|
||||
"SystemProperties",
|
||||
ColumnName.InternalColumnFileInfo
|
||||
), batchTime, batchInterval, outputPartitionTime, null, "")
|
||||
},
|
||||
|
||||
/*
|
||||
process structured streaming for given data frame
|
||||
Note this is incomplete and not used for now
|
||||
*/
|
||||
processEventHubDataFrame = (df: DataFrame) => {
|
||||
val logger = LogManager.getLogger("processEventHubDataFrame")
|
||||
df
|
||||
.select(
|
||||
from_json(col("body").cast("string"), rawBlockSchema).alias("Raw"),
|
||||
col("properties"),
|
||||
col("enqueuedTime")
|
||||
)
|
||||
.selectExpr("Raw.*", "properties", "enqueuedTime")
|
||||
.withWatermark("enqueuedTime", "60 seconds")
|
||||
.createOrReplaceTempView(DatasetName.DataStreamProjection)
|
||||
val outputs = sqlParsed.commands
|
||||
.filter(n=>n.commandType==TransformSQLParser.CommandType_Query).map(n=>n.name->n.text)
|
||||
.toMap
|
||||
|
||||
val streamingConf = StreamingInputSetting.getStreamingInputConf(dict)
|
||||
val interval = streamingConf.intervalInSeconds
|
||||
|
||||
outputs.map{case(k, v)=>{
|
||||
k-> spark.sql(v).writeStream
|
||||
.outputMode(OutputMode.Append())
|
||||
.format("console")
|
||||
.trigger(Trigger.ProcessingTime(interval, SECONDS))
|
||||
.start()
|
||||
}}
|
||||
},
|
||||
|
||||
/*
|
||||
process json data frame
|
||||
*/
|
||||
processJson = (jsonRdd: RDD[String], batchTime: Timestamp, batchInterval: Duration, outputPartitionTime: Timestamp) =>{
|
||||
processDataset(jsonRdd.map((FileInternal(), _)).toDF(ColumnName.InternalColumnFileInfo, ColumnName.RawObjectColumn),
|
||||
batchTime, batchInterval, outputPartitionTime, null, "")
|
||||
},
|
||||
|
||||
// process blob path pointer data frame
|
||||
processPaths = (pathsRDD: RDD[String],
|
||||
batchTime: Timestamp,
|
||||
batchInterval: Duration,
|
||||
outputPartitionTime: Timestamp,
|
||||
namespace: String) => {
|
||||
val spark = SparkSessionSingleton.getInstance(pathsRDD.sparkContext.getConf)
|
||||
|
||||
val metricLogger = MetricLoggerFactory.getMetricLogger(metricAppName, metricConf)
|
||||
val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
|
||||
val batchLog = LogManager.getLogger(s"BatchProcessor-B$batchTimeStr")
|
||||
|
||||
// Functions used within processPaths
|
||||
val batchTimeInMs = batchTime.getTime
|
||||
|
||||
def postMetrics(metrics: Iterable[(String, Double)]): Unit = {
|
||||
metricLogger.sendBatchMetrics(metrics, batchTimeInMs)
|
||||
batchLog.warn(s"Metric ${metrics.map(m => m._1 + "=" + m._2).mkString(",")}")
|
||||
}
|
||||
|
||||
// Process the array of input files, and sink them
|
||||
// Return metrics: (number of processed blobs, number of processed events, number of filtered events sent to eventhub)
|
||||
def processBlobs(files: Array[FileInternal],
|
||||
outputPartitionTime: Timestamp,
|
||||
partition: String,
|
||||
targetPar: String): Map[String, Double] = {
|
||||
val filesCount = files.length
|
||||
val t1 = System.nanoTime()
|
||||
|
||||
// Get the earliest blob to calculate latency
|
||||
val paths = files.map(_.inputPath)
|
||||
val blobTimes = files.map(_.fileTime).filterNot(_ == null).toList
|
||||
|
||||
postMetrics(Map(s"InputBlobs" -> filesCount.toDouble))
|
||||
|
||||
val (minBlobTime, maxBlobTime) =
|
||||
if(blobTimes.length>0) {
|
||||
val minBlobTime = blobTimes.minBy(_.getTime)
|
||||
val maxBlobTime = blobTimes.maxBy(_.getTime)
|
||||
batchLog.warn(s"partition '$partition': started, size: $filesCount, blob time range[${DateTimeUtil.formatSimple(minBlobTime)}, ${DateTimeUtil.formatSimple(maxBlobTime)}]")
|
||||
(minBlobTime, maxBlobTime)
|
||||
}
|
||||
else{
|
||||
batchLog.warn(s"Cannot figure out timestamp from file name, please check if there is a misconfiguration in the fileTimeRegex setting")
|
||||
(null, null)
|
||||
}
|
||||
|
||||
val pathsList = paths.mkString(",")
|
||||
batchLog.debug(s"Batch loading files:$pathsList")
|
||||
val inputDf = spark.sparkContext.parallelize(files, filesCount)
|
||||
.flatMap(file => HadoopClient.readHdfsFile(file.inputPath, gzip = file.inputPath.endsWith(".gz"))
|
||||
.filter(l=>l!=null && !l.isEmpty).map((file, outputPartitionTime, _)))
|
||||
.toDF(ColumnName.InternalColumnFileInfo, ColumnName.MetadataColumnOutputPartitionTime, ColumnName.RawObjectColumn)
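// one Spark partition per input file; every non-empty line of a file becomes a row of (file info, output partition time, raw line)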
|
||||
|
||||
val targets = files.map(_.target).toSet
|
||||
val processedMetrics = processDataset(inputDf, batchTime, batchInterval, outputPartitionTime, targets, partition)
|
||||
if(minBlobTime!=null){
|
||||
val latencyInSeconds = (DateTimeUtil.getCurrentTime().getTime - minBlobTime.getTime)/1000D
|
||||
val latencyMetrics = Map(s"Latency-Blobs" -> latencyInSeconds)
|
||||
postMetrics(latencyMetrics)
|
||||
latencyMetrics++processedMetrics
|
||||
}
|
||||
else{
|
||||
processedMetrics
|
||||
}
|
||||
}
|
||||
|
||||
def processPartition(v: (String, HashSet[FileInternal])) = {
|
||||
val par = v._1
|
||||
val paths = v._2.toArray
|
||||
processBlobs(paths, outputPartitionTime, par+namespace, par)
|
||||
}
|
||||
|
||||
batchLog.warn(s"Start batch ${batchTime}, output partition time:${outputPartitionTime}, namespace:${namespace}")
|
||||
val t1 = System.nanoTime
|
||||
val pathsGroups = BlobPointerInput.pathsToGroups(rdd = pathsRDD,
|
||||
jobName = dict.getAppName(),
|
||||
dict = dict,
|
||||
outputTimestamp = outputPartitionTime)
|
||||
val pathsFilteredGroups = BlobPointerInput.filterPathGroups(pathsGroups)
|
||||
val pathsCount = pathsFilteredGroups.aggregate(0)(_ + _._2.size, _ + _)
|
||||
//try {
|
||||
val result =
|
||||
if (pathsCount > 0) {
|
||||
batchLog.warn(s"Loading filtered blob files count=$pathsCount, First File=${pathsFilteredGroups.head._2.head}")
|
||||
if (pathsFilteredGroups.length > 1)
|
||||
Await.result(FutureUtil.failFast(pathsFilteredGroups
|
||||
.map(kv => Future {
|
||||
processPartition(kv)
|
||||
})), 5 minutes).reduce(DataMerger.mergeMapOfDoubles)
|
||||
else
|
||||
processPartition(pathsFilteredGroups(0))
|
||||
}
|
||||
else {
|
||||
batchLog.warn(s"No valid paths found to process for this batch")
|
||||
Map[String, Double]()
|
||||
}
|
||||
val batchProcessingTime = (System.nanoTime - t1) / 1E9
|
||||
|
||||
val metrics = Map[String, Double](
|
||||
"BatchProcessedET" -> batchProcessingTime
|
||||
)
|
||||
|
||||
postMetrics(metrics)
|
||||
batchLog.warn(s"End batch ${batchTime}, output partition time:${outputPartitionTime}, namespace:${namespace}")
|
||||
|
||||
metrics ++ result
|
||||
} // end of processPaths
|
||||
) // end of CommonProcessor
|
||||
} // end of init
|
||||
} // end of CommonProcessorFactory
|
|
@ -0,0 +1,22 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
import datax.utility.DateTimeUtil
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
// Data processor for processing events in OneBox mode, where the job is running locally
|
||||
class DirectLocalProcessor(processEventData: (RDD[EventData], Timestamp, Duration, Timestamp) => Map[String, Double])
|
||||
extends EventHubStreamingProcessor{
|
||||
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
|
||||
val outputPartitionTime =DateTimeUtil.getCurrentTime()
|
||||
processEventData(rdd, batchTime, batchInterval, outputPartitionTime)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
import datax.utility.DateTimeUtil
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
class DirectProcessor(processEventData: (RDD[EventData], Timestamp, Duration, Timestamp) => Map[String, Double])
|
||||
extends EventHubStreamingProcessor{
|
||||
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
|
||||
val outputPartitionTime = DateTimeUtil.getCurrentTime()
|
||||
processEventData(rdd, batchTime, batchInterval, outputPartitionTime)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
trait EventHubStreamingProcessor {
|
||||
val process: (RDD[EventData], Timestamp, Duration) => Map[String, Double]
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.streaming.StreamingQuery
|
||||
|
||||
class EventHubStructuredStreamingProcessor(processDataFrame: DataFrame=>Map[String, StreamingQuery])
|
||||
extends StructuredStreamingProcessor {
|
||||
override val process: DataFrame => Map[String, StreamingQuery] = processDataFrame
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
import datax.utility.DateTimeUtil
|
||||
import org.apache.spark.rdd.RDD
|
||||
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
class JsonProcessor(processJson: (RDD[String], Timestamp, Duration, Timestamp) => Map[String, Double])
|
||||
extends EventHubStreamingProcessor{
|
||||
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
|
||||
val outputPartitionTime = DateTimeUtil.getCurrentTime()
|
||||
processJson(rdd.map(w=>new String(w.getBytes)), batchTime, batchInterval, outputPartitionTime)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,12 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.processor
|
||||
|
||||
import org.apache.spark.sql.DataFrame
|
||||
import org.apache.spark.sql.streaming.StreamingQuery
|
||||
|
||||
trait StructuredStreamingProcessor {
|
||||
val process: (DataFrame) => Map[String, StreamingQuery]
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.securedsetting
|
||||
|
||||
import datax.config.ConfigManager
|
||||
import datax.constants.JobArgument
|
||||
import datax.exception.EngineException
|
||||
import datax.keyvault.KeyVaultMsiAuthenticatorClient
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
import scala.collection.mutable
|
||||
|
||||
|
||||
/***
|
||||
* Utility module to access KeyVault service from Azure.
|
||||
*/
|
||||
object KeyVaultClient {
|
||||
private val logger = LogManager.getLogger(this.getClass)
|
||||
private val secretRegex = "^keyvault:\\/\\/([a-zA-Z0-9-_]+)\\/([a-zA-Z0-9-_]+)$".r
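// e.g. (hypothetical names) "keyvault://myvault/my-secret" matches with group(1) = "myvault" and group(2) = "my-secret"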
|
||||
|
||||
private val kvc = KeyVaultMsiAuthenticatorClient.getKeyVaultClient()
|
||||
private val cache = new mutable.HashMap[String, String]
|
||||
|
||||
/**
|
||||
* get value of a matched secretId from keyvault
|
||||
* @param secretId secretId
|
||||
* @return value of the secret
|
||||
*/
|
||||
private def resolveSecret(secretId: String): Option[String] = {
|
||||
if(secretId==null||secretId.isEmpty)
|
||||
return Option(secretId)
|
||||
|
||||
secretRegex.findFirstMatchIn(secretId) match {
|
||||
case Some(secretInfo) => val vaultName = secretInfo.group(1)
|
||||
val secretName = secretInfo.group(2)
|
||||
|
||||
cache.synchronized{
|
||||
cache.get(secretId) match {
|
||||
case Some(value) => Some(value)
|
||||
case None =>
|
||||
val secret = kvc.synchronized{
|
||||
kvc.getSecret(s"https://$vaultName.vault.azure.net",secretName)
|
||||
}
|
||||
|
||||
logger.warn(s"resolved secret:'$secretId'")
|
||||
val value = secret.value()
|
||||
cache(secretId) = value
|
||||
Some(value)
|
||||
}
|
||||
}
|
||||
|
||||
case None =>
|
||||
logger.warn(s"did not resolve:'$secretId', return as is")
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* get secret from KeyVault with the specified name, could throw exception
|
||||
* @param secretId secret uri to retrieve the secret value
|
||||
* @return value of the secret
|
||||
*/
|
||||
@throws[EngineException]
|
||||
def getSecretOrThrow(secretId: String): String = {
|
||||
if(secretId==null || secretId.isEmpty){
|
||||
throw new EngineException(s"secret reference cannot be null or empty")
|
||||
}
|
||||
else{
|
||||
resolveSecret(secretId) match {
|
||||
case Some(m) => m
|
||||
case None => throw new EngineException(s"secret is not found with reference name: '${secretId}'.")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/***
|
||||
* get secret from KeyVault with the specified name, exception handled internally.
|
||||
* @param secretId secret uri to retrieve the secret value
* @return value of the secret or None if any exception occurs.
|
||||
*/
|
||||
def getSecret(secretId: String): Option[String] = {
|
||||
try{
|
||||
Some(getSecretOrThrow(secretId))
|
||||
}
|
||||
catch {
|
||||
case e: Exception =>
|
||||
logger.warn(s"skipped '$secretId': ${e.getMessage}")
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to resolve a secretId from the input string if one is present.
* A secretId has the format "keyvault://keyvault-name/secret-name".
* @param input string that may contain a secretId
* @return the resolved secret value for the secretId, or the input string unchanged if no secretId is found
|
||||
*/
|
||||
def resolveSecretIfAny(input: String): String = {
|
||||
resolveSecret(input).getOrElse(input)
|
||||
}
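// usage sketch (hypothetical values): resolveSecretIfAny("keyvault://myvault/storage-key") returns the stored key,
// while resolveSecretIfAny("plain-connection-string") returns the input unchanged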
|
||||
|
||||
/**
|
||||
* Try to resolve a secretId from the input string if one is present.
* A secretId has the format "keyvault://keyvault-name/secret-name".
* @param input optional string that may contain a secretId
* @return the resolved secret value for the secretId, or the input unchanged if no secretId is found
|
||||
*/
|
||||
def resolveSecretIfAny(input: Option[String]): Option[String] = {
|
||||
input.map(resolveSecretIfAny(_))
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
* a scope to execute operation with the default keyvault name, skip the operation if that doesn't exist.
|
||||
* @param callback execution within the scope
|
||||
*/
|
||||
def withKeyVault(callback: (String)=> Unit) = {
|
||||
ConfigManager.getActiveDictionary().get(JobArgument.ConfName_DefaultVaultName) match {
|
||||
case Some(vaultName) => callback(vaultName)
|
||||
case None => logger.warn(s"No default vault is defined, skipped finding key for storage accounts")
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,44 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
|
||||
object BlobOutputSetting {
|
||||
case class BlobGroupOutputConf(folder: String)
|
||||
case class BlobOutputConf(groupEvaluation: Option[String], groups: Map[String, BlobGroupOutputConf], compressionType: Option[String], format: Option[String])
|
||||
|
||||
val Namespace = "blob"
|
||||
val SettingGroupEvaluation = "groupevaluation"
|
||||
val SettingCompressionType = "compressiontype"
|
||||
val SettingFormat = "format"
|
||||
val SettingGroup = "group"
|
||||
val BlobGroupPrefix = SettingGroup + SettingNamespace.Seperator
|
||||
|
||||
val SettingGroupOutputFolder = "folder"
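// example (assuming SettingNamespace.Seperator is "." and using the hypothetical group name "main"): the key "group.main.folder" under the blob output namespace maps to that group's output folder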
|
||||
|
||||
private def buildBlobGroupOutputConf(dict: SettingDictionary, name: String): BlobGroupOutputConf = {
|
||||
dict.get(SettingGroupOutputFolder).map(BlobGroupOutputConf(_)).orNull
|
||||
}
|
||||
|
||||
def buildBlobOutputConf(dict: SettingDictionary, name: String): BlobOutputConf = {
|
||||
val groups = dict.buildConfigMap(buildBlobGroupOutputConf, BlobGroupPrefix)
|
||||
|
||||
if(groups.size>0)
|
||||
BlobOutputConf(
|
||||
groupEvaluation = dict.get(SettingGroupEvaluation),
|
||||
groups = groups,
|
||||
compressionType = dict.get(SettingCompressionType),
|
||||
format = dict.get(SettingFormat)
|
||||
)
|
||||
else
|
||||
null
|
||||
}
|
||||
|
||||
def getDefaultBlobOutputConf(dict: SettingDictionary): BlobOutputConf = {
|
||||
val prefix = SettingNamespace.JobOutputDefaultPreifx + Namespace + SettingNamespace.Seperator
|
||||
BlobOutputSetting.buildBlobOutputConf(dict.getSubDictionary(prefix), SettingNamespace.JobOutputDefaultPreifx + Namespace)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,211 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
|
||||
package datax.sink
|
||||
|
||||
import java.sql.Timestamp
|
||||
import java.text.SimpleDateFormat
|
||||
|
||||
import datax.config._
|
||||
import datax.constants.{JobArgument, MetricName}
|
||||
import datax.data.{FileInternal, ProcessResult}
|
||||
import datax.fs.HadoopClient
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.sink.BlobOutputSetting.BlobOutputConf
|
||||
import datax.utility.{GZipHelper, SinkerUtil}
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.TaskContext
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.{Row, SparkSession}
|
||||
|
||||
import scala.collection.mutable
|
||||
import scala.concurrent.duration.Duration
|
||||
|
||||
object BlobSinker extends SinkOperatorFactory {
|
||||
val SinkName = "Blobs"
|
||||
val DefaultOutputGroup = "main"
|
||||
|
||||
def generateOutputFolderPath(folderFormat: String, outputTimestamp: Timestamp, target: Option[String]) = {
|
||||
if(folderFormat==null || folderFormat.isEmpty)
|
||||
null
|
||||
else {
|
||||
// val timestamp = new Timestamp(new java.util.Date().getTime)
|
||||
val minute = outputTimestamp.toLocalDateTime().getMinute()
|
||||
val quarter = Array("00", "15", "30", "45")
|
||||
val quarterBucket = quarter(Math.round(minute / 15))
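// integer division maps minutes 0-14 -> "00", 15-29 -> "15", 30-44 -> "30", 45-59 -> "45"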
|
||||
//getLogger.warn("Minute Bucket: " + quarterBucket)
|
||||
val simpleTimeFormat = new SimpleDateFormat("HHmmss")
|
||||
val minuteBucket = simpleTimeFormat.format(outputTimestamp)
|
||||
String.format(folderFormat, outputTimestamp)
|
||||
.replaceAllLiterally("${quarterBucket}", quarterBucket)
|
||||
.replaceAllLiterally("${minuteBucket}", minuteBucket)
|
||||
.replaceAllLiterally("${target}", target.getOrElse("UNKNOWN"))
|
||||
.stripSuffix("/") + "/"
|
||||
}
|
||||
}
|
||||
|
||||
// write events to blob location
|
||||
def writeEventsToBlob(data: Seq[String], outputPath: String, compression: Boolean) {
|
||||
val logger = LogManager.getLogger(s"EventsToBlob-Writer${SparkEnvVariables.getLoggerSuffix()}")
|
||||
val countEvents = data.length
|
||||
|
||||
val t1 = System.nanoTime()
|
||||
var timeLast = t1
|
||||
var timeNow: Long = 0
|
||||
logger.info(s"$timeNow:Partition started")
|
||||
|
||||
//val data = it.toArray
|
||||
if (countEvents > 0) {
|
||||
timeNow = System.nanoTime()
|
||||
logger.info(s"$timeNow:Step 1: collected ${countEvents} records, spent time=${(timeNow - timeLast) / 1E9} seconds")
|
||||
timeLast = timeNow
|
||||
val content = if(compression){
|
||||
val result = GZipHelper.deflateToBytes(data)
|
||||
timeNow = System.nanoTime()
|
||||
logger.info(s"$timeNow:Step 2: compressed to ${result.length} bytes, spent time=${(timeNow - timeLast) / 1E9} seconds")
|
||||
timeLast = timeNow
|
||||
result
|
||||
}
|
||||
else {
|
||||
data.mkString("\n").getBytes
|
||||
}
|
||||
|
||||
HadoopClient.writeWithTimeoutAndRetries(
|
||||
hdfsPath = outputPath,
|
||||
content = content,
|
||||
timeout = Duration.create(ConfigManager.getActiveDictionary().getOrElse(JobArgument.ConfName_BlobWriterTimeout, "10 seconds")),
|
||||
retries = 0
|
||||
)
|
||||
timeNow = System.nanoTime()
|
||||
logger.info(s"$timeNow:Step 3: done writing to $outputPath, spent time=${(timeNow - timeLast) / 1E9} seconds")
|
||||
timeLast = timeNow
|
||||
}
|
||||
|
||||
logger.info(s"$timeNow:Done writing events ${countEvents} events, spent time=${(timeLast - t1) / 1E9} seconds")
|
||||
}
|
||||
|
||||
def writeDatasetToBlobs(rdd: RDD[String], outputFolder: String, fileSuffix: String, compression: Boolean):RDD[ProcessResult] = {
|
||||
val outputPathPrefix = outputFolder.stripSuffix("/")
|
||||
rdd.mapPartitions(it=>{
|
||||
val tc = TaskContext.get()
|
||||
val logger = LogManager.getLogger(s"DatasetToBlobs-Writer${SparkEnvVariables.getLoggerSuffix()}")
|
||||
|
||||
val t1 = System.nanoTime()
|
||||
var timeLast = t1
|
||||
var timeNow: Long = 0
|
||||
logger.info(s"$timeNow:Partition started")
|
||||
|
||||
val dataAll = it.toArray
|
||||
val dataSize = dataAll.length
|
||||
|
||||
timeNow = System.nanoTime()
|
||||
logger.info(s"$timeNow:Collected ${dataSize} events, spent time=${(timeNow - timeLast) / 1E9} seconds")
|
||||
timeLast = timeNow
|
||||
|
||||
val path = outputPathPrefix + "/part-%05d".format(tc.partitionId()) + fileSuffix
|
||||
if(dataSize>0) {
|
||||
writeEventsToBlob(dataAll, path, compression)
|
||||
timeNow = System.nanoTime()
|
||||
logger.info(s"$timeNow:Done writing ${dataAll.length} events, spent time=${(timeNow - timeLast) / 1E9} seconds")
|
||||
timeLast = timeNow
|
||||
Iterator.single(ProcessResult(1, dataSize))
|
||||
}
|
||||
else {
|
||||
logger.warn(s"There are 0 events to output, skipped output partition file:'$path'")
|
||||
Iterator.single(ProcessResult(0, 0))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
val MetricPrefixEvents = s"${MetricName.MetricSinkPrefix}${SinkName}_Events_"
|
||||
val MetricPrefixBlobs = s"${MetricName.MetricSinkPrefix}${SinkName}_Count_"
|
||||
|
||||
def sinkDataGroups(rowInfo: Row,
|
||||
dataGenerator: ()=>Map[String, Iterator[String]],
|
||||
outputFolders: Map[String, String],
|
||||
partitionId: Int,
|
||||
compression: Boolean,
|
||||
loggerSuffix: String): Map[String, Int] = {
|
||||
val logger = LogManager.getLogger(s"Sinker-BlobSinker$loggerSuffix")
|
||||
val dataGroups = dataGenerator()
|
||||
val timeStart = System.nanoTime ()
|
||||
val eventCounts = dataGroups.flatMap {
|
||||
case (group, data) =>
|
||||
val fileName = FileInternal.getInfoOutputFileName(rowInfo)
|
||||
outputFolders.get(group) match {
|
||||
case None =>
|
||||
Seq(s"${MetricPrefixEvents}$group" -> 0, s"${MetricPrefixBlobs}$group" -> 0)
|
||||
case Some(folder) =>
|
||||
val path = folder +
|
||||
(if (fileName == null) s"part-$partitionId" else fileName) + (if(compression) ".json.gz" else ".json")
|
||||
val jsonList = data.toSeq
|
||||
BlobSinker.writeEventsToBlob(jsonList, path, compression )
|
||||
|
||||
Seq(s"${MetricPrefixEvents}$group" -> jsonList.length, s"${MetricPrefixBlobs}$group" -> 1)
|
||||
}
|
||||
}
|
||||
|
||||
val timeNow = System.nanoTime ()
|
||||
logger.info (s"$timeNow:Written all event groups ${eventCounts.toString}, spent time=${(timeNow - timeStart) / 1E9} seconds")
|
||||
eventCounts
|
||||
}
|
||||
|
||||
def getRowsSinkerGenerator(blobOutputConf: BlobOutputConf, flagColumnIndex: Int) : SinkDelegate = {
|
||||
val compressionTypeConf = blobOutputConf.compressionType
|
||||
val formatConf = blobOutputConf.format
|
||||
if(formatConf.isDefined && !formatConf.get.equalsIgnoreCase("json"))
|
||||
throw new Error(s"Output format: ${formatConf.get} as specified in the config is not supported")
|
||||
val outputFolders = blobOutputConf.groups.map{case(k,v)=>k->KeyVaultClient.resolveSecretIfAny(v.folder)}
|
||||
(rowInfo: Row, rows: Seq[Row], outputPartitionTime: Timestamp, partitionId: Int, loggerSuffix: String) => {
|
||||
val target = FileInternal.getInfoTargetTag(rowInfo)
|
||||
if(compressionTypeConf.isDefined && !(compressionTypeConf.get.equalsIgnoreCase("gzip")|| compressionTypeConf.get.equalsIgnoreCase("none")|| compressionTypeConf.get.equals("")))
|
||||
throw new Error(s"Output compressionType: ${compressionTypeConf.get} as specified in the config is not supported")
|
||||
val compression = compressionTypeConf.getOrElse("gzip").equalsIgnoreCase("gzip")
|
||||
|
||||
sinkDataGroups(
|
||||
rowInfo = rowInfo,
|
||||
dataGenerator =
|
||||
if(flagColumnIndex<0)
|
||||
() => Map(DefaultOutputGroup -> rows.iterator.map(_.getString(1)))
|
||||
else
|
||||
() => rows.groupBy(_.getString(flagColumnIndex)).map { case (k, v) => k -> v.iterator.map(_.getString(1)) },
|
||||
outputFolders = outputFolders.map{case (k,v) =>
|
||||
k->generateOutputFolderPath(v, outputPartitionTime, Option(target))},
|
||||
partitionId = partitionId,
|
||||
compression = compression,
|
||||
loggerSuffix = loggerSuffix
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
def getSinkOperator(dict: SettingDictionary, name: String): SinkOperator = {
|
||||
val blobConf = BlobOutputSetting.buildBlobOutputConf(dict, name)
|
||||
SinkOperator(
|
||||
name = SinkName,
|
||||
isEnabled = blobConf!=null,
|
||||
flagColumnExprGenerator = () => blobConf.groupEvaluation.getOrElse(null),
|
||||
generator = flagColumnIndex=>getRowsSinkerGenerator(blobConf, flagColumnIndex),
|
||||
onBatch = (spark: SparkSession, outputPartitionTime: Timestamp, targets: Set[String]) => {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
val groups = blobConf.groups
|
||||
val outputFolders = groups.filter(g=>g._2!=null && g._2.folder!=null)
|
||||
.flatMap(g=>{
|
||||
val actualFolder = KeyVaultClient.resolveSecretIfAny(g._2.folder)
|
||||
if(targets!=null && targets.size>0)
|
||||
targets.map(t=>generateOutputFolderPath(actualFolder, outputPartitionTime, Option(t)))
|
||||
else
|
||||
Seq(generateOutputFolderPath(actualFolder, outputPartitionTime, None))
|
||||
})
|
||||
.filter(_!=null)
|
||||
.toSet
|
||||
|
||||
outputFolders.par.foreach(HadoopClient.createFolder)
|
||||
logger.warn(s"Created folders at ------\n${outputFolders.mkString("\n")}")
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
override def getSettingNamespace(): String = BlobOutputSetting.Namespace
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import datax.client.cosmosdb.CosmosDBConf
|
||||
import datax.config.SettingDictionary
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
|
||||
object CosmosDBOutputSetting {
|
||||
val Namespace = "cosmosdb"
|
||||
val SettingConnectionString = "connectionstring"
|
||||
val SettingDatabase = "database"
|
||||
val SettingCollection = "collection"
|
||||
|
||||
def buildCosmosDBOutputConf(dict: SettingDictionary, name: String): CosmosDBConf = {
|
||||
KeyVaultClient.resolveSecretIfAny(dict.get(SettingConnectionString)) match {
|
||||
case Some(connectionString) =>
|
||||
CosmosDBConf(
|
||||
connectionString = connectionString,
|
||||
name = name,
|
||||
database = dict.getOrNull(SettingDatabase),
|
||||
collection = dict.getOrNull(SettingCollection)
|
||||
)
|
||||
case None => null
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,140 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap
|
||||
|
||||
import com.microsoft.azure.documentdb._
|
||||
import datax.client.cosmosdb.CosmosDBConf
|
||||
import datax.config.SettingDictionary
|
||||
import datax.exception.EngineException
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
import datax.utility.ConverterUtil._
|
||||
import datax.utility.SinkerUtil
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.sql.SparkSession
|
||||
|
||||
class CosmosDBSinker(key: String, conf: CosmosDBConf) {
|
||||
private val logger = LogManager.getLogger(s"CosmosDBSinker-${key}")
|
||||
private val client = CosmosDBSinkerManager.getClient(conf.connectionString)
|
||||
logger.warn(s"Initialized")
|
||||
|
||||
private def getDatabase(databaseName: String) = {
|
||||
val databases = client.queryDatabases(s"SELECT * FROM root r WHERE r.id='${databaseName}'", null)
|
||||
.getQueryIterable().toList
|
||||
if(databases.size()>0){
|
||||
databases.get(0)
|
||||
}
|
||||
else{
|
||||
try{
|
||||
val definition = new Database()
|
||||
definition.setId(databaseName)
|
||||
client.createDatabase(definition, null).getResource()
|
||||
}
|
||||
catch {
|
||||
case e: DocumentClientException => throw e
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private val databaseLink = getDatabase(conf.database).getSelfLink
|
||||
|
||||
private def getCollection(collectionName: String) = {
|
||||
val collections = client.queryCollections(databaseLink, s"SELECT * FROM root r WHERE r.id='$collectionName'", null)
|
||||
.getQueryIterable().toList
|
||||
if(collections.size()>0){
|
||||
collections.get(0)
|
||||
}
|
||||
else{
|
||||
try{
|
||||
val definition = new DocumentCollection()
|
||||
definition.setId(collectionName)
|
||||
client.createCollection(databaseLink, definition, null).getResource()
|
||||
}
|
||||
catch {
|
||||
case e: DocumentClientException => throw e
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private val collectionLink = getCollection(conf.collection).getSelfLink
|
||||
|
||||
def createDocument(json: String) = {
|
||||
val doc = new Document(json)
|
||||
|
||||
try{
|
||||
client.createDocument(collectionLink, doc, null, false)
|
||||
true
|
||||
}
|
||||
catch {
|
||||
case e:DocumentClientException =>
|
||||
throw e
|
||||
}
|
||||
}
|
||||
|
||||
private def createDocuments(jsons: Seq[String]) = {
|
||||
jsons.foreach(createDocument(_))
|
||||
}
|
||||
}
|
||||
|
||||
object CosmosDBSinkerManager extends SinkOperatorFactory {
|
||||
private val SinkName = "CosmosDB"
|
||||
private val logger = LogManager.getLogger("CosmosDBSinkerManager")
|
||||
private val pool = new ConcurrentHashMap[String, CosmosDBSinker]
|
||||
private val clientPool = new ConcurrentHashMap[String, DocumentClient]
|
||||
|
||||
private val connectionStringRegex = "^AccountEndpoint=([^;]*);AccountKey=([^;]*);".r
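// e.g. (hypothetical values) "AccountEndpoint=https://myaccount.documents.azure.com:443/;AccountKey=abc123;" parses to ("https://myaccount.documents.azure.com:443/", "abc123")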
|
||||
def parseConnectionString(conn: String) = {
|
||||
connectionStringRegex.findFirstMatchIn(conn).map(m=>(m.group(1), m.group(2)))
|
||||
}
|
||||
|
||||
def getSinker(conf: CosmosDBConf) = {
|
||||
val key = conf.name
|
||||
pool.computeIfAbsent(key, (k: String) => new CosmosDBSinker(k, conf))
|
||||
}
|
||||
|
||||
def getClient(connectionString: String) = {
|
||||
parseConnectionString(connectionString) match {
|
||||
case Some((serviceEndpoint, masterKey)) =>
|
||||
clientPool.computeIfAbsent(serviceEndpoint, (k: String) => {
|
||||
logger.warn(s"Create new client for serviceEndpoint:$k")
|
||||
new DocumentClient(serviceEndpoint, masterKey, null, null)
|
||||
})
|
||||
case None =>
|
||||
throw new EngineException(s"unexpected connection string:'${connectionString}'")
|
||||
}
|
||||
}
|
||||
|
||||
def getClient(serviceEndpoint: String, masterKey: String) = {
|
||||
clientPool.computeIfAbsent(serviceEndpoint, (k: String) => {
|
||||
logger.warn(s"Create new client for serviceEndpoint:$k")
|
||||
new DocumentClient(serviceEndpoint, masterKey, null, null)
|
||||
})
|
||||
}
|
||||
|
||||
|
||||
def getSinkOperator(dict: SettingDictionary, name: String) : SinkOperator = {
|
||||
val conf = CosmosDBOutputSetting.buildCosmosDBOutputConf(dict, name)
|
||||
SinkOperator(
|
||||
name = SinkName,
|
||||
isEnabled = conf!=null,
|
||||
flagColumnExprGenerator = () => null,
|
||||
generator = flagColumnIndex => SinkerUtil.outputGenerator(
|
||||
(dataToSend:Seq[String],ls: String) => {
|
||||
val cosmosDBSinker = CosmosDBSinkerManager.getSinker(conf)
|
||||
dataToSend.count(d=>cosmosDBSinker.createDocument(d))
|
||||
},
|
||||
SinkName
|
||||
)(flagColumnIndex),
|
||||
onInitialization = (spark: SparkSession) => {
|
||||
val logger = LogManager.getLogger(this.getClass)
|
||||
CosmosDBSinkerManager.getSinker(conf)
|
||||
logger.warn(s"initialize cosmos DB sinker destination at ${conf.name}")
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
override def getSettingNamespace(): String = CosmosDBOutputSetting.Namespace
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
import datax.securedsetting.KeyVaultClient
|
||||
|
||||
object EventHubOutputSetting {
|
||||
case class EventHubOutputConf(connectionString: String,
|
||||
filter: String,
|
||||
appendProperties: Map[String, String],
|
||||
compressionType: String,
|
||||
format: String)
|
||||
|
||||
val Namespace = "eventhub"
|
||||
val SettingConnectionString = "connectionstring"
|
||||
val SettingFilter = "filter"
|
||||
val SettingCompressionType = "compressiontype"
|
||||
val SettingFormat = "format"
|
||||
val SettingAppendProperty = "appendproperty"
|
||||
val AppendPropertyPrefix = SettingAppendProperty + SettingNamespace.Seperator
|
||||
|
||||
val FormatValueJson = "json"
|
||||
val FormatValueDefault = FormatValueJson
|
||||
val CompressionValueNone = "none"
|
||||
val CompressionValueGZip = "gzip"
|
||||
val CompressionValueDefault = CompressionValueGZip
|
||||
|
||||
|
||||
def buildEventHubOutputConf(dict: SettingDictionary, name: String) = {
|
||||
KeyVaultClient.resolveSecretIfAny(dict.get(SettingConnectionString)) match {
|
||||
case Some(connectionString) =>
|
||||
val properties = dict.getSubDictionary(AppendPropertyPrefix).getDictMap()
|
||||
EventHubOutputConf(
|
||||
connectionString = connectionString,
|
||||
appendProperties = properties,
|
||||
filter = dict.getOrNull(SettingFilter),
|
||||
compressionType = dict.get(SettingCompressionType).getOrElse(CompressionValueDefault),
|
||||
format = dict.get(SettingFormat).getOrElse(FormatValueDefault)
|
||||
)
|
||||
case None => null
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
|
||||
package datax.sink
|
||||
|
||||
import datax.utility.{GZipHelper, SinkerUtil}
|
||||
import datax.client.eventhub.{EventHubConf, EventHubSenderPool}
|
||||
import datax.config.SettingDictionary
|
||||
import datax.exception.EngineException
|
||||
import datax.sink.EventHubOutputSetting.EventHubOutputConf
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
object EventHubStreamPoster extends SinkOperatorFactory {
|
||||
val SinkName = "EventHub"
|
||||
def sendFilteredEvents(data: Seq[String],
|
||||
outputEventhubConf: EventHubConf,
|
||||
appendProperties: Map[String, String],
|
||||
loggerSuffix: String,
|
||||
compressionType: String): Int = {
|
||||
|
||||
val logger = LogManager.getLogger(s"FilteredEvent-Sender${loggerSuffix}")
|
||||
val countEvents = data.length
|
||||
val chunkSize = 200
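// events are sent to EventHub in chunks of 200, each chunk joined with newlines into one message payload (e.g. 1,050 events -> 6 sends of 200,200,200,200,200,50)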
|
||||
if (countEvents > 0) {
|
||||
val sender = EventHubSenderPool.getSender(outputEventhubConf)
|
||||
var i = 0
|
||||
data.grouped(chunkSize).foreach(events => {
|
||||
val eventsSize = events.length
|
||||
val json = events.mkString("\n")
|
||||
val t1 = System.nanoTime()
|
||||
val elapsedTime = (System.nanoTime() - t1) / 1E9
|
||||
val stage = s"[$i-${i + eventsSize}]/$countEvents"
|
||||
if(compressionType.equalsIgnoreCase(EventHubOutputSetting.CompressionValueGZip)) {
|
||||
val compressedJson = GZipHelper.deflateToBytes(json)
|
||||
logger.info(s"$stage: compressed filtered events, count=$eventsSize, json=${json.length} bytes, compressed= ${compressedJson.length} bytes, spent time=$elapsedTime seconds")
|
||||
sender.sendBytes(compressedJson, appendProperties)
|
||||
}
|
||||
else
|
||||
{
|
||||
logger.info(s"$stage: sending uncompressed filtered events, count=$eventsSize, json=${json.length} bytes, spent time=$elapsedTime seconds")
|
||||
sender.sendBytes(json.getBytes(), appendProperties)
|
||||
}
|
||||
logger.info(s"$stage: done sending")
|
||||
i += eventsSize
|
||||
eventsSize
|
||||
})
|
||||
countEvents
|
||||
}
|
||||
else 0
|
||||
}
|
||||
|
||||
def getRowsSinkerGenerator(conf: EventHubOutputConf, flagColumnIndex: Int) : SinkDelegate = {
|
||||
val format = conf.format
|
||||
if(!format.equalsIgnoreCase(EventHubOutputSetting.FormatValueJson))
|
||||
throw new EngineException(s"Eventhub: Output format: ${format} as specified in the config is not supported")
|
||||
|
||||
val compressionType = conf.compressionType
|
||||
if(compressionType!=EventHubOutputSetting.CompressionValueNone && compressionType!=EventHubOutputSetting.CompressionValueGZip)
|
||||
throw new EngineException(s"EventHub: compressionType: ${compressionType} as specified in the config is not supported")
|
||||
|
||||
val sender = (dataToSend: Seq[String], ls: String) => EventHubStreamPoster.sendFilteredEvents(dataToSend, EventHubConf(
|
||||
name = SinkerUtil.hashName(conf.connectionString),
|
||||
connectionString = conf.connectionString
|
||||
), conf.appendProperties, ls, compressionType)
|
||||
SinkerUtil.outputGenerator(sender,SinkName)(flagColumnIndex)
|
||||
}
|
||||
|
||||
def getSinkOperator(dict: SettingDictionary, name: String) : SinkOperator = {
|
||||
val conf = EventHubOutputSetting.buildEventHubOutputConf(dict, name)
|
||||
SinkOperator(
|
||||
name = SinkName,
|
||||
isEnabled = conf!=null,
|
||||
flagColumnExprGenerator = () => conf.filter,
|
||||
generator = flagColumnIndex => getRowsSinkerGenerator(conf, flagColumnIndex)
|
||||
)
|
||||
}
|
||||
|
||||
override def getSettingNamespace(): String = EventHubOutputSetting.Namespace
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace}
|
||||
|
||||
object HttpPostOutputSetting {
|
||||
case class HttpPostConf(endpoint: String, filter: String, appendHeaders: Option[Map[String, String]])
|
||||
|
||||
val Namespace = "httppost"
|
||||
val SettingEndpoint = "endpoint"
|
||||
val SettingFilter = "filter"
|
||||
val SettingHeader = "header"
|
||||
val AppendHeaderPrefix = SettingHeader+SettingNamespace.Seperator
|
||||
|
||||
def getHttpPostConf(dict: SettingDictionary, name: String) = {
|
||||
if(dict.size>0) {
|
||||
val headers = dict.getSubDictionary(AppendHeaderPrefix).getDictMap()
|
||||
HttpPostConf(
|
||||
endpoint = dict.getOrNull(SettingEndpoint),
|
||||
filter = dict.getOrNull(SettingFilter),
|
||||
appendHeaders = Option(headers)
|
||||
)
|
||||
}
|
||||
else
|
||||
null
|
||||
}
|
||||
}
|
|
@ -0,0 +1,84 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import datax.config.SettingDictionary
|
||||
import datax.sink.HttpPostOutputSetting.HttpPostConf
|
||||
import datax.utility.SinkerUtil
|
||||
import org.apache.http.client.config.RequestConfig
|
||||
import org.apache.http.client.methods.HttpPost
|
||||
import org.apache.http.entity.StringEntity
|
||||
import org.apache.http.impl.client.{BasicResponseHandler, HttpClients}
|
||||
import org.apache.log4j.LogManager
|
||||
|
||||
object HttpPoster extends SinkOperatorFactory {
|
||||
val SinkName = "HttpPost"
|
||||
|
||||
def getHttpClient() = {
|
||||
val requestConfig = RequestConfig.custom().setConnectionRequestTimeout(5000).setSocketTimeout(5000).build()
|
||||
HttpClients.custom().setDefaultRequestConfig(requestConfig).build()
|
||||
}
|
||||
|
||||
def postEvents(data: Seq[String], httpEndpoint: String, headers: Option[Map[String, String]], loggerSuffix: String): Int = {
|
||||
val logger = LogManager.getLogger(s"HttpPoster${loggerSuffix}")
|
||||
val countEvents = data.length
|
||||
val chunkSize = 200
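// events are posted in chunks of 200; each chunk is serialized as a single JSON array request body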
|
||||
if (countEvents > 0) {
|
||||
val clientItr = getHttpClient
|
||||
val handler = new BasicResponseHandler
|
||||
|
||||
var i = 0
|
||||
data.grouped(chunkSize).foreach(events => {
|
||||
val eventsSize = events.length
|
||||
val json = "[" + events.mkString(",") + "]"
|
||||
val t1 = System.nanoTime()
|
||||
val stage = s"[$i-${i + eventsSize}]/$countEvents"
|
||||
|
||||
// post data
|
||||
try{
|
||||
val post = new HttpPost(httpEndpoint)
|
||||
if(headers.isDefined){
|
||||
headers.get.foreach(h=>post.addHeader(h._1, h._2))
|
||||
}
|
||||
post.setEntity(new StringEntity(json))
|
||||
val response = clientItr.execute(post)
|
||||
val body = handler.handleResponse(response)
|
||||
|
||||
logger.info(s"$stage is sent:${body}")
|
||||
}
|
||||
catch{
|
||||
case e: Exception => {
|
||||
logger.error(s"$stage: failed", e)
|
||||
}
|
||||
}
|
||||
|
||||
i += eventsSize
|
||||
eventsSize
|
||||
})
|
||||
|
||||
clientItr.close()
|
||||
|
||||
countEvents
|
||||
}
|
||||
else 0
|
||||
}
|
||||
|
||||
def getRowsSinkerGenerator(httpPostConf: HttpPostConf, flagColumnIndex: Int) : SinkDelegate = {
|
||||
val sender = (dataToSend:Seq[String],ls: String) => HttpPoster.postEvents(dataToSend, httpPostConf.endpoint, httpPostConf.appendHeaders, ls)
|
||||
SinkerUtil.outputGenerator(sender,SinkName)(flagColumnIndex)
|
||||
}
|
||||
|
||||
def getSinkOperator(dict: SettingDictionary, name: String) : SinkOperator = {
|
||||
val conf = HttpPostOutputSetting.getHttpPostConf(dict, name)
|
||||
SinkOperator(
|
||||
name = SinkName,
|
||||
isEnabled = conf!=null,
|
||||
flagColumnExprGenerator = () => conf.filter,
|
||||
generator = (flagColumnIndex)=>getRowsSinkerGenerator(conf, flagColumnIndex)
|
||||
)
|
||||
}
|
||||
|
||||
override def getSettingNamespace(): String = HttpPostOutputSetting.Namespace
|
||||
}
|
|
@ -0,0 +1,154 @@
|
|||
// *********************************************************************
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License
|
||||
// *********************************************************************
|
||||
package datax.sink
|
||||
|
||||
import java.sql.Timestamp
|
||||
|
||||
import datax.config.{SettingDictionary, SettingNamespace, SparkEnvVariables}
|
||||
import datax.constants.{ColumnName, MetricName, ProductConstant}
|
||||
import datax.data.FileInternal
|
||||
import datax.exception.EngineException
|
||||
import datax.fs.HadoopClient
|
||||
import datax.utility.{DataMerger, DataNormalization, SinkerUtil}
|
||||
import org.apache.log4j.LogManager
|
||||
import org.apache.spark.TaskContext
|
||||
import org.apache.spark.sql.functions.{col, struct, to_json}
|
||||
import org.apache.spark.sql.types.StructType
|
||||
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
|
||||
|
||||
|
||||
object OutputManager {
|
||||
val NamespacePrefix = SettingNamespace.JobOutputPrefix
|
||||
val SettingOutputProcessedSchemaPath = "processedschemapath"
|
||||
|
||||
val sinkFactories = Seq[SinkOperatorFactory](
|
||||
BlobSinker, EventHubStreamPoster, HttpPoster, CosmosDBSinkerManager
|
||||
).map(f=>f.getSettingNamespace()->f).toMap
|
||||
|
||||
def getOperatators(dict: SettingDictionary): Seq[OutputOperator] ={
|
||||
dict.groupBySubNamespace(NamespacePrefix)
|
||||
.map(g=>generateOperator(g._2, g._1))
|
||||
.toSeq
|
||||
}
|
||||
|
||||
def outputResultReducer = (s1: (Int, Map[String, Int]), s2: (Int, Map[String, Int])) => (s1._1 + s2._1, DataMerger.mergeMapOfCounts(s1._2, s2._2))
|
||||
|
||||
def generateOperator(dict:SettingDictionary, name: String) = {
|
||||
val logger = LogManager.getLogger("OutputOperatorsBuilder")
|
||||
var flagColumnIndex = 1
|
||||
|
||||
val processedSchemaPath = dict.get(SettingOutputProcessedSchemaPath).orNull
|
||||
val sinkOperators = dict
|
||||
.groupBySubNamespace()
|
||||
.map { case (k, v) => sinkFactories.get(k).map(_.getSinkOperator(v, k))}
|
||||
.filter(o => o match {
|
||||
case Some(oper) =>
|
||||
logger.info(s"Output '$name':${oper.name} is ${SinkerUtil.boolToOnOff(oper.isEnabled)}")
|
||||
oper.isEnabled
|
||||
case None => false
|
||||
}).map(o=>{
|
||||
val oper = o.get
|
||||
val flagColumnExpr = oper.flagColumnExprGenerator()
|
||||
if(flagColumnExpr==null){
|
||||
logger.warn(s"Output type:'${oper.name}': no flag column")
|
||||
(oper.name, (null, null), oper.generator(-1), oper.onBatch, oper.onInitialization)
|
||||
}
|
||||
else{
|
||||
val appendColumn = (flagColumnExpr, s"_${ProductConstant.ProductOutputFilter}_${oper.name}")
|
||||
logger.warn(s"Output type:'${oper.name}': append column:$appendColumn")
|
||||
flagColumnIndex+=1
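// flagColumnIndex starts at 1 because the sinked rows place the internal file info at column 0 and the JSON payload at column 1 (see sink() below), so the first flag column lands at index 2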
|
||||
(oper.name, appendColumn, oper.generator(flagColumnIndex), oper.onBatch, oper.onInitialization)
|
||||
}
|
||||
}).toSeq
|
||||
|
||||
if(sinkOperators.length==0)throw new EngineException(s"no sink is defined for output '$name'!")
|
||||
logger.warn(s"Output '$name' to ${sinkOperators.length} sinkers: ${sinkOperators.map(s=>s"'${s._1}'").mkString(",")}")
|
||||
|
||||
val flagColumns = sinkOperators.map(_._2).filter(_._1!=null)
|
||||
val sinkers = sinkOperators.map(o=>o._1 -> o._3).toMap
|
||||
val onBatchHandlers = sinkOperators.map(_._4).filter(_!=null)
|
||||
val onInitHandlers = sinkOperators.map(_._5).filter(_!=null)
|
||||
var shouldGenerateProcessedSchema = processedSchemaPath!=null && !processedSchemaPath.isEmpty
|
||||
|
||||
OutputOperator(
|
||||
name = name,
|
||||
onInitialization = if(onInitHandlers.size>0) (spark: SparkSession)=> for (elem <- onInitHandlers) {elem(spark)} else null,
|
||||
onBatch = if(onBatchHandlers.size>0) (spark:SparkSession, time: Timestamp, targets: Set[String]) => {
|
||||
onBatchHandlers.foreach(_(spark, time, targets))
|
||||
} else null,
|
||||
output = (df: DataFrame, partitionTime: Timestamp) => {
|
||||
val outputLogger = LogManager.getLogger(s"Output-${name}")
|
||||
val outputColumns = df.schema.filterNot(_.name.startsWith(ColumnName.InternalColumnPrefix)).toArray
|
||||
if(shouldGenerateProcessedSchema){
|
||||
val spark = df.sparkSession
|
||||
spark.synchronized{
|
||||
if(shouldGenerateProcessedSchema){
|
||||
HadoopClient.writeHdfsFile(processedSchemaPath, new StructType(outputColumns).prettyJson, true)
|
||||
outputLogger.warn(s"Saved processed schema to $processedSchemaPath")
|
||||
shouldGenerateProcessedSchema = false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
val outputColumnNames = outputColumns.map(c=>DataNormalization.sanitizeColumnName(c.name))
|
||||
outputLogger.warn(s"Output fields: ${outputColumnNames.mkString(",")}")
|
||||
|
||||
sink(df, outputColumnNames, partitionTime, flagColumns, sinkers)
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
def sink(df: DataFrame,
|
||||
outputFieldNames: Seq[String],
|
||||
partitionTime: Timestamp,
|
||||
flagColumns: Seq[(String, String)],
|
||||
outputOperators: Map[String, SinkDelegate]) = {
|
||||
val amendDf = if(df.schema.fieldNames.contains(ColumnName.InternalColumnFileInfo))df
|
||||
else {
|
||||
df.withColumn(ColumnName.InternalColumnFileInfo, FileInternal.udfEmptyInternalInfo())
|
||||
}
|
||||
|
||||
val query = amendDf.selectExpr("*" +: flagColumns.map(c => c._1 + " AS " + c._2): _*)
|
||||
.select(Seq(col(ColumnName.InternalColumnFileInfo), to_json(struct(outputFieldNames.map(col): _*))) ++
|
||||
flagColumns.map(_._2).map(col): _*)
|
||||
|
||||
//query.explain will dump the execution plan of sql to stdout
|
||||
//query.explain(true)
|
||||
query
|
||||
.rdd
|
||||
.mapPartitions(it => {
|
||||
val partitionId = TaskContext.getPartitionId()
|
||||
val loggerSuffix = SparkEnvVariables.getLoggerSuffix()
|
||||
val logger = LogManager.getLogger(s"EventsSinker${loggerSuffix}")
|
||||
//val path = outputFileFolder+"/part-"+tc.partitionId().toString + ".json.gz"
|
||||
|
||||
val t1 = System.nanoTime()
|
||||
var timeLast = t1
|
||||
var timeNow: Long = 0
|
||||
logger.info(s"$timeNow:Partition started")
|
||||
|
||||
val dataAll = it.toArray
|
||||
val count = dataAll.length
|
||||
timeNow = System.nanoTime()
|
||||
logger.info(s"$timeNow:Collected $count events, spent time=${(timeNow - timeLast) / 1E9} seconds")
|
||||
timeLast = timeNow
|
||||
|
||||
val inputMetric = Map(s"${MetricName.MetricSinkPrefix}InputEvents" -> count)
|
||||
Seq(if (count > 0) {
|
||||
val rowInfo = dataAll(0).getAs[Row](0)
|
||||
if(outputOperators.size==0)
|
||||
throw new EngineException("no output operators are found!")
|
||||
outputOperators
|
||||
.par
|
||||
.map(_._2(rowInfo, dataAll, partitionTime, partitionId, loggerSuffix))
|
||||
.reduce(DataMerger.mergeMapOfCounts) ++ inputMetric
|
||||
}
|
||||
else
|
||||
inputMetric
|
||||
).iterator
|
||||
})
|
||||
.reduce(DataMerger.mergeMapOfCounts)
|
||||
}
|