This commit is contained in:
Rohit Agrawal 2019-04-15 23:57:37 -07:00
Parent 43cd5e5fdf
Commit c8104966a1
1052 changed files with 82,872 additions and 330 deletions

335
.gitignore vendored

@ -1,330 +1,7 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
################################################################################
# This .gitignore file was automatically created by Microsoft(R) Visual Studio.
################################################################################
# User-specific files
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUNIT
*.VisualState.xml
TestResult.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
**/Properties/launchSettings.json
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_i.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# JustCode is a .NET coding add-in
.JustCode
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# JetBrains Rider
.idea/
*.sln.iml
# CodeRush
.cr/
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
/.vs
/v15
npm-debug.log

1
CODE_OF_CONDUCT.md Normal file

@ -0,0 +1 @@
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

35
CONTRIBUTING.md Normal file

@ -0,0 +1,35 @@
# Feedback
- Ask a question on [Stack Overflow](https://stackoverflow.com/questions/tagged/data-accelerator)
- Request new features on [GitHub](https://github.com/Microsoft/data-accelerator/issues)
- Open a new issue on [GitHub](https://github.com/Microsoft/data-accelerator/issues)
# Contributing
# Building
* The Services can be built using the Visual Studio solution. See [Guide](Services/CONTRIBUTING.md)
* The Spark folder can be built using Maven. See [Guide](Spark/CONTRIBUTING.md)
* The Website can be built using NPM. See [Guide](Website/CONTRIBUTING.md)
# Create a change
The development workflow, including debugging and running tests, is covered in the per-component guides linked above.
# Coding Guidelines
* The services use .editorconfig files to maintain coding guidelines. See [editorconfig](Services/.editorconfig)
* The website uses the Prettier extension. See [Prettier](https://marketplace.visualstudio.com/items?itemName=esbenp.prettier-vscode)
* The Spark folder doesn't have guidelines yet. For contributions, please keep the code style consistent with the rest of the file being modified.
# Submitting pull requests
You will need to sign a Contributor License Agreement when submitting your pull request. To complete the Contributor License Agreement (CLA), follow the instructions provided by the CLA bot when you send the pull request. This only needs to be done once for any .NET Foundation OSS project.
If you don't know what a pull request is, read this article: https://help.github.com/articles/using-pull-requests. Make sure the repository can build and all tests pass. Familiarize yourself with the project workflow and our coding conventions. The coding, style, and general engineering guidelines are published on the Engineering guidelines page.
Please also see our [Code of Conduct](CODE_OF_CONDUCT.md).
# Contributor License Agreement
This project welcomes contributions and suggestions. Most contributions require you to agree to a Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us the rights to use your contribution. For details, visit https://cla.microsoft.com.
When you submit a pull request, a CLA-bot will automatically determine whether you need to provide a CLA and decorate the PR appropriately (e.g., label, comment). Simply follow the instructions provided by the bot. You will only need to do this once across all repos using our CLA.
This project has adopted the Microsoft Open Source Code of Conduct. For more information see the Code of Conduct FAQ or contact opencode@microsoft.com with any additional questions or comments.

8
DataProcessing/.gitignore vendored Normal file

@ -0,0 +1,8 @@
*.class
*.log
*/.vs
.idea
*/target
*/src/main/resources
*.iml
out/


@ -0,0 +1,54 @@
# Building the Spark engine
In order to build the Data Accelerator Spark engine, you will need the following:
# Requirements
- Maven
- Java SDK
- Ensure JAVA_HOME, M2_HOME, and MAVEN_HOME are properly defined in your environment
# How to build
From a prompt:
> mvn package -f project-name
Examples:
```
> mvn package -f datax-core
> mvn package -f datax-keyvault
> mvn package -f datax-utility
> mvn package -f datax-host
> mvn package -f datax-udf-samples
```
## Publish to Maven Repo
<TODO replace with external repo>
> mvn deploy -f project-name
Examples:
```
> mvn deploy -f datax-core
> mvn deploy -f datax-utility
> mvn deploy -f datax-host
> mvn deploy -f datax-udf-samples
```
## Publish to the cluster's Storage Account
Note: you will have to run `az login` first in order to use the login mode when uploading blobs to the remote storage account. Your account must also have permission to the storage account associated with the cluster.
> deploy module-name staging
Examples:
```
> deploy core staging
> deploy utility staging
> deploy host staging
> deploy udf-samples staging
```
# How to create a PR
- Ensure all tests are passing, for example by running the Maven tests as shown below
- Create a pull request against the master branch
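Since the poms in this folder bind the scalatest plugin to the `test` goal, the tests for an individual project can be run with Maven. A sketch (adjust the project names to the ones you changed):
```
> mvn test -f datax-core
> mvn test -f datax-host
```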


@ -0,0 +1,25 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 15
VisualStudioVersion = 15.0.28307.168
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "DataX.Utility.CodeSign", "DataX.Utility.CodeSign\DataX.Utility.CodeSign.csproj", "{F42E9A11-BA7C-4050-B8E4-33532615C230}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Debug|Any CPU.Build.0 = Debug|Any CPU
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Release|Any CPU.ActiveCfg = Release|Any CPU
{F42E9A11-BA7C-4050-B8E4-33532615C230}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {6EBAC466-53ED-4C98-8790-81D8AF7ACC06}
EndGlobalSection
EndGlobal


@ -0,0 +1,34 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Library</OutputType>
<TargetFramework>netcoreapp2.1</TargetFramework>
<ApplicationIcon />
<StartupObject />
<SignAssembly>true</SignAssembly>
<AssemblyOriginatorKeyFile>FinalPublicKey.snk</AssemblyOriginatorKeyFile>
<DelaySign>true</DelaySign>
<BaseOutputDirectory>bin</BaseOutputDirectory>
<OutDir>$(BaseOutputDirectory)</OutDir>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.VisualStudioEng.MicroBuild.Core" Version="0.4.1">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<FilesToSign Include="$(OutDir)\**\*.nupkg">
<Authenticode>NuGet</Authenticode>
</FilesToSign>
</ItemGroup>
<ItemGroup>
<FilesToSign Include="$(OutDir)\**\*.jar">
<Authenticode>MicrosoftJARSHA2</Authenticode>
</FilesToSign>
</ItemGroup>
</Project>

Binary file not shown.

26
DataProcessing/README.md Normal file

@ -0,0 +1,26 @@
# Data Accelerator for Spark Engine
## Projects Structure
### Core
Interface and class definitions for the contracts of the Data Accelerator Engine
### Host
Spark-specific app jar for Data Accelerator
### Samples
Examples for UDFs and UDAFs in Scala
### Utility
Common classes and singleton helpers used across projects
## Properties
Some basic rules (see the sketch below for a concrete illustration):
* Property names are akin to a full JSON path locating a leaf in the JSON config object
* The root namespace is **datax.job**
* Property names are all lowercase for known fields from the JSON config object, except for the Map and Array cases
* Map case - e.g. *outputs* is a Map from string to an individual output config; in this case the string key becomes part of the property-name path
* Array case - e.g. *timeWindows* is an Array of time window specs; in this case the name of each spec becomes part of the property-name path
* When flattening a Map/Array, change the plural word to its singular form, e.g. change *outputs* to *output*, *timeWindows* to *timewindow*, etc.
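A sketch of the flattening rules above. The leaf names (*compressiontype*, *windowduration*), the instance names, and the placement of *timewindow* under *process* are invented for illustration; only the **datax.job** root and the Map/Array flattening come from the rules:
```scala
// Hypothetical flattened property names (leaf and instance names are made up):
val flattenedProps = Map(
  "datax.job.name" -> "MyStreamingJob",
  // Map case: an output named "myBlobOutput" becomes part of the path; plural "outputs" -> singular "output"
  "datax.job.output.myBlobOutput.compressiontype" -> "gzip",
  // Array case: a time window named "1m" becomes part of the path; "timeWindows" -> "timewindow"
  "datax.job.process.timewindow.1m.windowduration" -> "60 seconds"
)
```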


@ -0,0 +1,33 @@
<?xml version="1.0"?>
<package xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<metadata xmlns="http://schemas.microsoft.com/packaging/2010/07/nuspec.xsd">
<id>Microsoft.DataX.Spark</id>
<version>$version$</version>
<authors>Microsoft</authors>
<projectUrl>http://aka.ms/data-accelerator</projectUrl>
<license type="expression">MIT</license>
<iconUrl>https://raw.githubusercontent.com/wiki/Microsoft/data-accelerator/tutorials/images/roundwhite6464.PNG</iconUrl>
<requireLicenseAcceptance>true</requireLicenseAcceptance>
<description>Data Accelerator for Apache Spark simplifies streaming of Big Data using Spark. It is used internally at Microsoft for processing streamed data from multiple products every day, handling data volumes at Microsoft scale. It offers a no-code experience to build Rules and Alerts, as well as numerous productivity improvements to develop and manage Spark SQL jobs on Azure HDInsight. This is the package of jar files for Data Processing.</description>
<copyright>© Microsoft Corporation. All rights reserved.</copyright>
<tags>DataX.Spark, DataX</tags>
<dependencies>
</dependencies>
</metadata>
<files>
<file src="**\applicationinsights-core-2.2.1.jar" target="lib" />
<file src="**\azure-documentdb-1.16.1.jar" target="lib" />
<file src="**\azure-eventhubs-1.2.1.jar" target="lib" />
<file src="**\azure-eventhubs-spark_2.11-2.3.6.jar" target="lib" />
<file src="**\azure-keyvault-webkey-1.1.jar" target="lib" />
<file src="**\datax-core_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
<file src="**\datax-host_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
<file src="**\datax-utility_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
<file src="**\datax-keyvault_2.11-1.0.0-SNAPSHOT-with-dependencies.jar" target="lib" />
<file src="**\datax-udf-samples_2.11-1.0.0-SNAPSHOT.jar" target="lib" />
<file src="**\java-uuid-generator-3.1.5.jar" target="lib" />
<file src="**\proton-j-0.27.1.jar" target="lib" />
<file src="**\scala-java8-compat_2.11-0.9.0.jar" target="lib" />
<file src="NOTICE.txt" target="" />
</files>
</package>


@ -0,0 +1,199 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!--
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
-->
<developers>
<developer>
<id>microsoft</id>
<name>Microsoft</name>
</developer>
</developers>
<licenses>
<license>
<name>MIT License</name>
<url>http://opensource.org/licenses/MIT</url>
<distribution>repo</distribution>
</license>
</licenses>
<scm>
<connection>scm:git:git@github.com:Microsoft/data-accelerator.git</connection>
<developerConnection>scm:git:git@github.com:Microsoft/data-accelerator.git</developerConnection>
<url>https://github.com/Microsoft/data-accelerator.git</url>
</scm>
<groupId>com.microsoft.datax</groupId>
<artifactId>datax-core_2.11</artifactId>
<version>1.0.0-SNAPSHOT</version>
<properties>
<spark.version>2.3.0</spark.version>
<scala.version.major>2.11</scala.version.major>
<scala.version.minor>8</scala.version.minor>
<scala.version>${scala.version.major}.${scala.version.minor}</scala.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<repositories>
</repositories>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.json4s</groupId>
<artifactId>json4s-jackson_2.11</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-parser-combinators_2.11</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-parser-combinators_2.11</artifactId>
</exclusion>
<exclusion>
<groupId>org.typelevel</groupId>
<artifactId>macro-compat_2.11</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.jmockit</groupId>
<artifactId>jmockit</artifactId>
<version>1.34</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>2.2.6</version>
<scope>test</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.3</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/dependency</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>true</overWriteSnapshots>
<excludeTransitive>true</excludeTransitive>
</configuration>
</execution>
</executions>
</plugin>
<!-- enable scalatest -->
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
<junitxml>.</junitxml>
<filereports>WDF TestSuite.txt</filereports>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>


@ -0,0 +1,11 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.config
import datax.constants.ProductConstant
object DefaultValue {
def DefaultAppName = ProductConstant.DefaultAppName
}


@ -0,0 +1,151 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.config
import datax.constants.{JobArgument, ProductConstant}
import datax.exception.EngineException
import org.apache.log4j.Level
import scala.concurrent.duration.Duration
/***
* represents configuration of a job
*
* @param elems starting elements in the configuration
* @param parentPrefix prefix of all of the setting names in this dictionary, mainly for logging
*/
case class SettingDictionary(elems: Map[String, String], parentPrefix: String = SettingNamespace.DefaultSettingName){
val dict = elems
val size = dict.size
def getDictMap() = dict
def get(key: String) = dict.get(key)
def getOrNull(key: String) = get(key).orNull
def getDefault() = get(SettingNamespace.DefaultSettingName)
def getOrThrow[T](opt: Option[T], key: String) = {
opt match {
case Some(v)=> v
case None => throw new EngineException(s"config setting '${parentPrefix+key}' is not found")
}
}
def getString(key: String) = getOrThrow(dict.get(key), key)
def getOrElse(key: String, defaultValue: String) = dict.getOrElse(key, defaultValue)
def getIntOption(key: String) = get(key).map(_.toInt)
def getLongOption(key: String) = get(key).map(_.toLong)
def getLong(key: String) = getOrThrow(getLongOption(key), key)
def getDoubleOption(key: String) = get(key).map(_.toDouble)
def getDouble(key: String) = getOrThrow(getDoubleOption(key), key)
def getBooleanOption(key: String) = get(key).map(_.toBoolean)
def getDurationOption(key: String) = get(key).map(Duration.create(_))
def getDuration(key: String) = getOrThrow(getDurationOption(key), key)
def getStringSeqOption(key: String) = get(key).map(str => {
val seq = str.split(SettingNamespace.ValueSeperator).filterNot(_.isEmpty).toSeq
if (seq.length > 0) seq else null
})
private def findWithPrefix(prefix: String): Map[String, String] = dict.filter(kv=>kv._1.startsWith(prefix))
private def stripKeys(dict: Map[String, String], startIndex: Int) = {
dict.filter(kv=>kv._1!=null&&kv._1.length>startIndex).map{case(k, v) => k.substring(startIndex)->v}
}
private def stripKeysByNamespace(dict: Map[String, String], namespace: String) = {
val prefixLength = (namespace+SettingNamespace.Seperator).length
dict.filter(kv=>kv._1!=null&&kv._1.length>=namespace.length).map{case(k, v) => {
if(k==namespace)
SettingNamespace.DefaultSettingName -> v
else
k.substring(prefixLength)->v
}}
}
/** Group the dictionary into sub-dictionaries based on namespaces.
*
* Strips off the prefix from every setting name and distributes the settings into groups of [[SettingDictionary]],
* where the group name is the first namespace element in the setting name.
*
* @param prefix the prefix to look for and strip off when grouping
* @return a group of SettingDictionary
*/
def groupBySubNamespace(prefix: String = null) = {
val sub = if(prefix==null || prefix.isEmpty)
dict
else
stripKeys(findWithPrefix(prefix), prefix.length)
sub.groupBy(kv=>SettingNamespace.getSubNamespace(kv._1, 0))
.filterKeys(_!=null)
.map{case (k, v) => k-> SettingDictionary(stripKeysByNamespace(v, k), parentPrefix + k + SettingNamespace.Seperator)}
}
/** Get a sub [[SettingDictionary]] with only the settings whose names start with the given prefix
*
* @param prefix prefix to filter the setting name
* @return a [[SettingDictionary]] instance containing only the settings with prefix in the name
*/
def getSubDictionary(prefix: String) = {
SettingDictionary(stripKeys(findWithPrefix(prefix), prefix.length), parentPrefix+prefix)
}
def buildConfigIterable[TConf](builder: (SettingDictionary, String)=>TConf, prefix: String = null) = {
groupBySubNamespace(prefix)
.map{case(k,v)=>builder(v, k)}
}
def buildConfigMap[TConf](builder: (SettingDictionary, String)=>TConf, prefix: String = null) = {
groupBySubNamespace(prefix)
.map{case(k,v)=>k->builder(v, k)}
}
/***
* get name of the job
* @return name of the job
*/
def getAppName(): String = {
dict.getOrElse(JobArgument.ConfName_AppName, DefaultValue.DefaultAppName)
}
def getJobName(): String = {
dict.get(SettingNamespace.JobNameFullPath).getOrElse(getAppName())
}
def getMetricAppName() = {
ProductConstant.MetricAppNamePrefix + getJobName()
}
def getClientNodeName() = {
SparkEnvVariables.getClientNodeName(this.getAppName())
}
/***
* get path to configuration file of the job
* @return path to the configuration file
*/
def getAppConfigurationFile(): String = {
dict.getOrElse(JobArgument.ConfName_AppConf, null)
}
/***
* get setting of logging level on driver nodes of the job
* @return logging level on driver nodes
*/
def getDriverLogLevel(): Option[Level] = {
dict.get(JobArgument.ConfName_DriverLogLevel).map(Level.toLevel(_))
}
/***
* get setting of logging level on executor nodes of the job
* @return logging level on executor nodes
*/
def getExecutorLogLevel(): Option[Level] = {
dict.get(JobArgument.ConfName_LogLevel).map(Level.toLevel(_))
}
}
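A minimal usage sketch of the dictionary above. The keys and the *compressiontype* leaf are made up for illustration; the API calls are the ones defined in this file:
```scala
import datax.config.SettingDictionary

// Hypothetical flattened settings
val dict = SettingDictionary(Map(
  "datax.job.name" -> "MyJob",
  "datax.job.output.eventhub.compressiontype" -> "gzip",
  "datax.job.output.blob.compressiontype" -> "none"
))

// Keep only the settings under "datax.job.output." and strip that prefix
val outputs = dict.getSubDictionary("datax.job.output.")

// Group by the next namespace element: Map("eventhub" -> ..., "blob" -> ...)
val byOutput = outputs.groupBySubNamespace()
byOutput("eventhub").getString("compressiontype")  // returns "gzip"
```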


@ -0,0 +1,48 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.config
import datax.constants.ProductConstant
object SettingNamespace {
val DefaultSettingName = ""
val Seperator = "."
val ValueSeperator = ";"
def Root = ProductConstant.ProductRoot
def RootPrefix = Root + Seperator
val Job = "job"
def JobPrefix = RootPrefix + Job + Seperator
val JobName = "name"
def JobNameFullPath = JobPrefix + JobName
val JobInput = "input.default"
def JobInputPrefix = JobPrefix + JobInput + Seperator
val JobProcess = "process"
def JobProcessPrefix = JobPrefix + JobProcess + Seperator
val JobOutput = "output"
def JobOutputPrefix = JobPrefix + JobOutput + Seperator
val JobOutputDefault = "default"
def JobOutputDefaultPreifx = JobOutputPrefix + JobOutputDefault + Seperator
def buildSettingPath(names: String*) = {
names.filterNot(_==null).mkString(Seperator)
}
def getSubNamespace(propName: String, startIndex: Int): String = {
if(propName.length>startIndex) {
val pos = propName.indexOf(SettingNamespace.Seperator, startIndex)
if(pos>=0)
propName.substring(startIndex, pos)
else
propName.substring(startIndex)
}
else
null
}
}
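With the default name prefix ("DataX", see NamePrefix further below), ProductConstant.ProductRoot is "datax", so the path constants above evaluate as in this sketch:
```scala
import datax.config.SettingNamespace

SettingNamespace.RootPrefix              // "datax."
SettingNamespace.JobPrefix               // "datax.job."
SettingNamespace.JobNameFullPath         // "datax.job.name"
SettingNamespace.JobOutputDefaultPreifx  // "datax.job.output.default."
```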


@ -0,0 +1,18 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.config
import org.apache.spark.{SparkEnv, TaskContext}
object SparkEnvVariables {
def getClientNodeName(appName: String): String = {
appName+"-"+SparkEnv.get.executorId
}
def getLoggerSuffix(): String = {
val tc = TaskContext.get()
if(tc==null)"" else s"-P${tc.partitionId()}-T${tc.taskAttemptId()}"
}
}


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.config
import org.apache.spark.SparkConf
case class UnifiedConfig(
sparkConf: SparkConf,
dict: SettingDictionary
)


@ -0,0 +1,23 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object ColumnName {
// Define constants for column names
val RawObjectColumn = "Raw"
val EventNameColumn = "EventName"
def PropertiesColumn = s"${NamePrefix.Value}Properties"
def InternalColumnPrefix = s"__${NamePrefix.Value}_"
def InternalColumnFileInfo = InternalColumnPrefix + "FileInfo"
def MetadataColumnPrefix = s"__${NamePrefix.Value}Metadata_"
def MetadataColumnOutputPartitionTime = MetadataColumnPrefix + "OutputPartitionTime"
def OutputGroupColumn = s"${NamePrefix.Value}OutputGroup"
}
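With the default name prefix ("DataX"), these column-name constants resolve to the following values (a sketch based on the definitions above):
```scala
import datax.constants.ColumnName

ColumnName.PropertiesColumn                   // "DataXProperties"
ColumnName.InternalColumnFileInfo             // "__DataX_FileInfo"
ColumnName.MetadataColumnOutputPartitionTime  // "__DataXMetadata_OutputPartitionTime"
ColumnName.OutputGroupColumn                  // "DataXOutputGroup"
```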


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object DatasetName {
def DataStreamRaw = s"${NamePrefix.Value}RawInput"
def DataStreamProjection = s"${NamePrefix.Value}ProcessedInput"
def DataStreamProjectionBatch = s"${NamePrefix.Value}ProcessedInput_Batch"
def DataStreamProjectionWithWindow = s"${NamePrefix.Value}ProcessedInput_Window"
}


@ -0,0 +1,11 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object FeatureName {
// Define built-in functions
val FunctionDisableCommonCaching = "disableCommonCaching"
}


@ -0,0 +1,18 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object JobArgument {
def ConfNamePrefix = s"${NamePrefix.Value}_".toUpperCase
def ConfName_AppConf = s"${ConfNamePrefix}APPCONF"
def ConfName_AppName = s"${ConfNamePrefix}APPNAME"
def ConfName_LogLevel = s"${ConfNamePrefix}LOGLEVEL"
def ConfName_DriverLogLevel = s"${ConfNamePrefix}DRIVERLOGLEVEL"
def ConfName_CheckpointEnabled = s"${ConfNamePrefix}CHECKPOINTENABLED"
def ConfName_AppInsightKeyRef = s"${ConfNamePrefix}APPINSIGHTKEYREF"
def ConfName_BlobWriterTimeout: String = s"${ConfNamePrefix}BlobWriterTimeout"
def ConfName_DefaultVaultName: String = s"${ConfNamePrefix}DEFAULTVAULTNAME"
}


@ -0,0 +1,9 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object MetricName {
val MetricSinkPrefix="Sink_"
}


@ -0,0 +1,11 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object NamePrefix {
val DefaultValue = "DataX"
val ConfSetting = "DATAX_NAMEPREFIX"
val Value: String = sys.env.getOrElse(ConfSetting, DefaultValue)
}


@ -0,0 +1,14 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object ProcessingPropertyName {
// Define constants for names in the processing Properties column
val BlobPathHint = "Partition"
val BatchTime = "BatchTime"
val BlobTime = "InputTime"
val CPTime = "CPTime"
val CPExecutor = "CPExecutor"
}


@ -0,0 +1,19 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.constants
object ProductConstant {
def DefaultAppName = s"${NamePrefix.Value}_Unknown_App"
def MetricAppNamePrefix = s"${NamePrefix.Value}-".toUpperCase
def ProductRoot = s"${NamePrefix.Value}".toLowerCase
def ProductJobTags = s"${NamePrefix.Value}JobTags"
def ProductRedisBase = s"${NamePrefix.Value}_RedisBase"
def ProductRedisStandardConnection = s"${NamePrefix.Value}_RedisStandardConnection"
def ProductRedisClusterConnection = s"${NamePrefix.Value}_RedisClusterConnection"
def DataStreamProcessDataSetLogger = s"${NamePrefix.Value}-ProcessDataset"
def ProductInstrumentLogger = s"${NamePrefix.Value}-Instrument"
def ProductOutputFilter = s"${NamePrefix.Value}OutputFilter"
def ProductQuery = s"^--${NamePrefix.Value}Query--"
}


@ -0,0 +1,8 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.exception
case class EngineException(private val message: String = "", private val cause: Throwable = None.orNull)
extends Exception(message, cause)


@ -0,0 +1,48 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.extension
import java.sql.Timestamp
import datax.config.SettingDictionary
import org.apache.spark.sql.SparkSession
object DynamicUDF{
type IntervalUpdateHandler = (SparkSession, Timestamp)=>Unit
trait DynamicUDFTrait {
val name: String
val funcRef: AnyRef
val onInterval: IntervalUpdateHandler
}
case class UDF0[RT] (func: ()=>RT, onInterval: IntervalUpdateHandler = null)
case class UDF1[T1, RT] (func: T1=>RT, onInterval: IntervalUpdateHandler = null)
case class UDF2[T1, T2, RT] (func: (T1, T2) => RT, onInterval: IntervalUpdateHandler = null)
case class UDF3[T1, T2, T3, RT] (func: (T1, T2, T3) => RT,onInterval: IntervalUpdateHandler = null)
case class UDF4[T1, T2, T3, T4, RT] (func: (T1, T2, T3, T4) => RT,onInterval: IntervalUpdateHandler = null)
trait Generator0[RT] extends Serializable{
def initialize(spark:SparkSession, dict: SettingDictionary): UDF0[RT]
}
trait Generator1[T1, RT] extends Serializable {
def initialize(spark:SparkSession, dict: SettingDictionary): UDF1[T1, RT]
}
trait Generator2[T1, T2, RT] extends Serializable {
def initialize(spark:SparkSession, dict: SettingDictionary): UDF2[T1, T2, RT]
}
trait Generator3[T1, T2, T3, RT] extends Serializable {
def initialize(spark:SparkSession, dict: SettingDictionary): UDF3[T1, T2, T3, RT]
}
}
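A hypothetical generator implementation against the extension points above. The class name and the UDF itself are invented for illustration; real examples live in the datax-udf-samples project:
```scala
import datax.config.SettingDictionary
import datax.extension.DynamicUDF
import org.apache.spark.sql.SparkSession

// A UDF generator that produces a string-length UDF; onInterval is left at its default (null)
class StringLengthUdf extends DynamicUDF.Generator1[String, Int] {
  def initialize(spark: SparkSession, dict: SettingDictionary): DynamicUDF.UDF1[String, Int] =
    DynamicUDF.UDF1(func = (s: String) => if (s == null) 0 else s.length)
}
```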


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.extension
import datax.config.SettingDictionary
import org.apache.spark.sql.{DataFrame, SparkSession}
trait PreProjectionProcessor extends Serializable{
def initialize(spark:SparkSession, dict: SettingDictionary): DataFrame=>DataFrame
}


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.extension
/*
Extension to normalize input strings in the streaming pipeline before parsing them as JSON objects
*/
trait StringNormalizer {
def normalize(str: String): String
}


@ -0,0 +1,15 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import datax.service.{ConfigService, TelemetryService}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
trait AppHost {
def getConfigService(): ConfigService
def getTelemetryService(): TelemetryService
def getSpark(sparkConf: SparkConf): SparkSession
}


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.service
import datax.config.{SettingDictionary}
trait ConfigService {
def getActiveDictionary(): SettingDictionary
def setActiveDictionary(conf: SettingDictionary)
}


@ -0,0 +1,10 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.service
trait TelemetryService {
def trackEvent(event: String, properties: Map[String, String], measurements: Map[String, Double])
def trackException(e: Exception, properties: Map[String, String], measurements: Map[String, Double])
}


@ -0,0 +1,35 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax
import java.sql.Timestamp
import datax.config.SettingDictionary
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
package object sink {
type SinkDelegate = (Row, Seq[Row], Timestamp, Int, String)=>Map[String, Int]
type Metrics = Map[String, Double]
trait SinkOperatorFactory{
def getSinkOperator(dict:SettingDictionary, name: String):SinkOperator
def getSettingNamespace(): String
}
case class SinkOperator(name: String,
isEnabled: Boolean,
flagColumnExprGenerator: () => String,
generator: (Int)=>SinkDelegate,
onInitialization: (SparkSession)=>Unit = null,
onBatch: (SparkSession, Timestamp, Set[String])=>Unit = null
)
case class OutputOperator(name: String,
onInitialization: (SparkSession) => Unit,
onBatch: (SparkSession, Timestamp, Set[String]) => Unit,
output: (DataFrame, Timestamp) => Map[String, Int])
}
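A hypothetical factory sketch built against the sink contracts above. The console sink, its setting namespace, and the "enabled" key are invented for illustration:
```scala
import java.sql.Timestamp
import datax.config.SettingDictionary
import datax.constants.MetricName
import datax.sink.{SinkOperator, SinkOperatorFactory}
import org.apache.spark.sql.Row

object ConsoleSinkFactory extends SinkOperatorFactory {
  def getSettingNamespace(): String = "console"

  def getSinkOperator(dict: SettingDictionary, name: String): SinkOperator = SinkOperator(
    name = name,
    isEnabled = dict.getOrElse("enabled", "true").toBoolean,
    flagColumnExprGenerator = () => null,
    generator = (batchId: Int) =>
      // The SinkDelegate writes one partition's rows and reports a row-count metric
      (row: Row, rows: Seq[Row], batchTime: Timestamp, partitionId: Int, loggerSuffix: String) => {
        rows.foreach(r => println(r.mkString(",")))
        Map(MetricName.MetricSinkPrefix + "Console" -> rows.length)
      }
  )
}
```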


@ -0,0 +1,225 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<!--
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE
-->
<developers>
<developer>
<id>microsoft</id>
<name>Microsoft</name>
</developer>
</developers>
<licenses>
<license>
<name>MIT License</name>
<url>http://opensource.org/licenses/MIT</url>
<distribution>repo</distribution>
</license>
</licenses>
<scm>
<connection>scm:git:git@github.com:Microsoft/data-accelerator.git</connection>
<developerConnection>scm:git:git@github.com:Microsoft/data-accelerator.git</developerConnection>
<url>https://github.com/Microsoft/data-accelerator.git</url>
</scm>
<groupId>com.microsoft.datax</groupId>
<artifactId>datax-host_2.11</artifactId>
<version>1.0.0-SNAPSHOT</version>
<properties>
<spark.version>2.3.0</spark.version>
<scala.version.major>2.11</scala.version.major>
<scala.version.minor>8</scala.version.minor>
<scala.version>${scala.version.major}.${scala.version.minor}</scala.version>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
</properties>
<repositories>
</repositories>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_${scala.version.major}</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.jmockit</groupId>
<artifactId>jmockit</artifactId>
<version>1.34</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_2.11</artifactId>
<version>2.2.6</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>datax</groupId>
<artifactId>datax-core_2.11</artifactId>
<version>1.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>datax</groupId>
<artifactId>datax-utility_2.11</artifactId>
<version>1.0.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>com.microsoft.azure</groupId>
<artifactId>azure-eventhubs-spark_2.11</artifactId>
<version>2.3.6</version>
</dependency>
<dependency>
<groupId>io.lettuce</groupId>
<artifactId>lettuce-core</artifactId>
<version>5.0.4.RELEASE</version>
</dependency>
<dependency>
<groupId>com.microsoft.azure</groupId>
<artifactId>azure-storage</artifactId>
<version>5.3.0</version>
</dependency>
<dependency>
<groupId>com.microsoft.azure</groupId>
<artifactId>azure-documentdb</artifactId>
<version>1.16.1</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-deprecation</arg>
<arg>-feature</arg>
</args>
</configuration>
</plugin>
<plugin>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.3</version>
<configuration>
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.1</version>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<outputDirectory>${project.build.directory}/dependency</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>true</overWriteSnapshots>
<excludeTransitive>true</excludeTransitive>
</configuration>
</execution>
</executions>
</plugin>
<!-- enable scalatest -->
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
<junitxml>.</junitxml>
<filereports>WDF TestSuite.txt</filereports>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>2.4.1</version>
<configuration>
<descriptors>
<descriptor>with-dependencies.xml</descriptor>
</descriptors>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>


@ -0,0 +1,16 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.app
import datax.host.BlobBatchingHost
import datax.processor.CommonProcessorFactory
object BatchApp {
def main(inputArguments: Array[String]): Unit = {
BlobBatchingHost.runBatchApp(
inputArguments,
config => CommonProcessorFactory.createProcessor(config).asBlobPointerProcessor())
}
}


@ -0,0 +1,16 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.app
import datax.host.StreamingHost
import datax.processor.CommonProcessorFactory
object BlobStreamingApp {
def main(inputArguments: Array[String]): Unit = {
StreamingHost.runStreamingApp(
inputArguments,
config => CommonProcessorFactory.createProcessor(config).asBlobPointerProcessor())
}
}


@ -0,0 +1,17 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.app
import datax.host.StreamingHost
import datax.processor.CommonProcessorFactory
object DirectLocalStreamingApp {
def main(inputArguments: Array[String]): Unit = {
StreamingHost.runLocalStreamingApp(
inputArguments,
config => CommonProcessorFactory.createProcessor(config).asDirectLocalProcessor())
}
}


@ -0,0 +1,16 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.app
import datax.host.StreamingHost
import datax.processor.CommonProcessorFactory
object DirectStreamingApp {
def main(inputArguments: Array[String]): Unit = {
StreamingHost.runStreamingApp(
inputArguments,
config => CommonProcessorFactory.createProcessor(config).asDirectProcessor())
}
}


@ -0,0 +1,74 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.checkpoint
import datax.fs.HadoopClient
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileUtil, Path}
import org.apache.log4j.LogManager
import org.apache.spark.eventhubs.{EventHubsConf, EventPosition, NameAndPartition}
object EventhubCheckpointer {
def getCheckpointFile(checkpointDir: String) = checkpointDir+"/offsets.txt"
val OffsetTokenSeparator = ","
def readOffsetsFromFile(file: String): Iterable[(Long, String, Int, Long, Long)] =
HadoopClient.readHdfsFile(file).filterNot(s=>s==null || s.isEmpty).map(s=>{
val offset = s.split(OffsetTokenSeparator)
Tuple5(offset(0).toLong, offset(1), offset(2).toInt, offset(3).toLong, offset(4).toLong)
})
def readOffsetsFromCheckpoints(checkpointDir: String): Iterable[(Long, String, Int, Long, Long)] = {
val conf = HadoopClient.getConf()
val checkpointFile = getCheckpointFile(checkpointDir)
val path = new Path(checkpointFile)
val fs = path.getFileSystem(conf)
if(fs.exists(path))
readOffsetsFromFile(checkpointFile)
else{
val backupPath = path.suffix(".old")
if(fs.exists(backupPath)){
val logger = LogManager.getLogger("readOffsetsFromCheckpoints")
logger.warn(s"offsets file at checkpoint folder is not found, but found a backup one: ${backupPath.toUri}")
FileUtil.copy(fs, backupPath, fs, path, false, conf)
readOffsetsFromFile(checkpointFile)
}
else
null
}
}
def writeOffsetsToCheckpoints(checkpointDir: String, offsets: Seq[(Long, String, Int, Long, Long)], conf: Configuration) = {
val folder = new Path(checkpointDir)
val fs = folder.getFileSystem(conf)
if(!fs.exists(folder)){
fs.mkdirs(folder)
}
val checkpointFile = getCheckpointFile(checkpointDir)
val path = new Path(checkpointFile)
if(fs.exists(path)){
// backup the old one
val backupPath = path.suffix(".old")
FileUtil.copy(fs, path, fs, backupPath, false, true, conf)
}
HadoopClient.writeHdfsFile(checkpointFile,
offsets.map(v=>v._1+OffsetTokenSeparator+v._2+OffsetTokenSeparator+v._3+OffsetTokenSeparator+v._4+OffsetTokenSeparator+v._5).mkString("\n"), true)
}
def applyCheckpointsIfExists(ehConf: EventHubsConf, checkpointDir: String) = {
val fromOffsets = readOffsetsFromCheckpoints(checkpointDir)
val logger = LogManager.getLogger("EventHubConfBuilder")
if(fromOffsets!=null) {
logger.warn(s"Checkpoints of offsets are detected. Applying the offsets:\n" + fromOffsets.mkString("\n"))
ehConf.setStartingPositions(fromOffsets.map { v => new NameAndPartition(v._2, v._3) -> EventPosition.fromSequenceNumber(v._5) }.toMap)
}
else{
logger.warn(s"Checkpoints don't exist, skipped.")
}
}
}


@ -0,0 +1,156 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.classloader
import java.net.{URL, URLClassLoader}
import org.apache.spark.SparkEnv
import org.apache.spark.sql.catalyst.ScalaReflection
import scala.collection.JavaConverters._
import scala.collection.mutable.HashMap
object ClassLoaderHost {
/**
* Get the ClassLoader which loaded Spark.
*/
def getSparkClassLoader: ClassLoader = getClass.getClassLoader
/**
* Get the Context ClassLoader on this thread or, if not present, the ClassLoader that
* loaded Spark.
*/
def getContextOrSparkClassLoader: ClassLoader =
Option(Thread.currentThread().getContextClassLoader).getOrElse(getSparkClassLoader)
/**
* Create a ClassLoader for use in tasks, adding any JARs specified by the user or any classes
* created by the interpreter to the search path
*/
private def createClassLoader(): MutableURLClassLoader = {
// Bootstrap the list of jars with the user class path.
val now = System.currentTimeMillis()
userClassPath.foreach { url =>
currentJars(url.getPath().split("/").last) = now
}
val currentLoader = getContextOrSparkClassLoader
val userClassPathFirst = true
// For each of the jars in the jarSet, add them to the class loader.
// We assume each of the files has already been fetched.
val urls = userClassPath.toArray ++ currentJars.keySet.map { uri =>
new java.io.File(uri.split("/").last).toURI.toURL
}
if (userClassPathFirst) {
new ChildFirstURLClassLoader(urls, currentLoader)
} else {
new MutableURLClassLoader(urls, currentLoader)
}
}
val userClassPath: Seq[URL] = Nil
val currentJars = new HashMap[String, Long]
val urlClassLoader = createClassLoader()
val derivedClassLoader = urlClassLoader
val env = SparkEnv.get
if(env!=null){
// Set the classloader for serializer
env.serializer.setDefaultClassLoader(derivedClassLoader)
// SPARK-21928. SerializerManager's internal instance of Kryo might get used in netty threads
// for fetching remote cached RDD blocks, so need to make sure it uses the right classloader too.
env.serializerManager.setDefaultClassLoader(derivedClassLoader)
}
/** Preferred alternative to Class.forName(className) */
def classForName(className: String): Class[_] = {
Class.forName(className, true, derivedClassLoader)
// scalastyle:on classforname
}
def getType[T](clazz: Class[T])(implicit runtimeMirror: scala.reflect.runtime.universe.Mirror) =
runtimeMirror.classSymbol(clazz).toType
def javaTypeToDataType(t: java.lang.reflect.Type) = {
val mirror = scala.reflect.runtime.universe.runtimeMirror(derivedClassLoader)
//TODO: ParameterizedType (aka. generic type) cannot be casted to Class[_],
// thus getJavaUDFReturnDataType should be used instead in most of the case.
val udfScalaType = mirror.classSymbol(t.asInstanceOf[Class[_]]).toType
ScalaReflection.schemaFor(udfScalaType).dataType
}
}
/**
* A class loader which makes some protected methods in ClassLoader accessible.
*/
class ParentClassLoader(parent: ClassLoader) extends ClassLoader(parent) {
override def findClass(name: String): Class[_] = {
super.findClass(name)
}
override def loadClass(name: String): Class[_] = {
super.loadClass(name)
}
override def loadClass(name: String, resolve: Boolean): Class[_] = {
super.loadClass(name, resolve)
}
}
/**
* URL class loader that exposes the `addURL` and `getURLs` methods in URLClassLoader.
*/
class MutableURLClassLoader(urls: Array[URL], parent: ClassLoader)
extends URLClassLoader(urls, parent) {
override def addURL(url: URL): Unit = {
super.addURL(url)
}
override def getURLs(): Array[URL] = {
super.getURLs()
}
}
/**
* A mutable class loader that gives preference to its own URLs over the parent class loader
* when loading classes and resources.
*/
class ChildFirstURLClassLoader(urls: Array[URL], parent: ClassLoader)
extends MutableURLClassLoader(urls, null) {
private val parentClassLoader = new ParentClassLoader(parent)
override def loadClass(name: String, resolve: Boolean): Class[_] = {
try {
super.loadClass(name, resolve)
} catch {
case e: ClassNotFoundException =>
parentClassLoader.loadClass(name, resolve)
}
}
override def getResource(name: String): URL = {
val url = super.findResource(name)
val res = if (url != null) url else parentClassLoader.getResource(name)
res
}
override def getResources(name: String): java.util.Enumeration[URL] = {
val childUrls = super.findResources(name).asScala
val parentUrls = parentClassLoader.getResources(name).asScala
(childUrls ++ parentUrls).asJavaEnumeration
}
override def addURL(url: URL) {
super.addURL(url)
}
}


@ -0,0 +1,18 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.cosmosdb
/***
* Represents the connection info for the sinker to access Cosmos DB
* @param name name of this cosmosdb connection info mainly for logging purpose
* @param connectionString connection string to the cosmosdb
* @param database name of the cosmos database
* @param collection name of the collection in cosmos db
*/
case class CosmosDBConf(name: String, connectionString: String, database: String, collection: String)
object CosmosDBBase {
}


@ -0,0 +1,27 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.eventhub
import java.util.concurrent.Executors
import com.microsoft.azure.eventhubs.EventHubClient
import org.apache.spark.eventhubs.ConnectionStringBuilder
object EventHubBase {
val executorService = Executors.newSingleThreadExecutor
def buildConnectionString(namespace: String, name: String, policy: String, key: String) = {
ConnectionStringBuilder()
.setNamespaceName(namespace)
.setEventHubName(name)
.setSasKeyName(policy)
.setSasKey(key)
.build
}
def getOutputClient(connString: String) = {
EventHubClient.createSync(connString,executorService)
}
}
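// Illustrative sketch of building a connection string and creating a client with the helpers above;
// the namespace, hub name, policy and key values are hypothetical placeholders.
object EventHubBaseExample {
  def main(args: Array[String]): Unit = {
    val connString = EventHubBase.buildConnectionString(
      namespace = "my-namespace",
      name = "my-eventhub",
      policy = "send-policy",
      key = "<sas-key>")
    val client = EventHubBase.getOutputClient(connString)
    client.closeSync()
  }
}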


@ -0,0 +1,7 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.eventhub
case class EventHubConf(name: String, connectionString: String)


@ -0,0 +1,69 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.eventhub
import com.microsoft.azure.eventhubs.{EventData, EventHubClient}
import datax.exception.EngineException
import org.apache.log4j.{LogManager, Logger}
import scala.collection.JavaConverters._
class EventHubSender(client: EventHubClient){
val senderName : String = this.getClass.getSimpleName
def getLogger(): Logger = LogManager.getLogger(this.getClass)
def sendString(data: String, properties: Map[String, String]) ={
if(data!=null) {
sendBytes(data.getBytes, properties)
}
}
def sendBytes(data: Array[Byte], properties: Map[String, String]) = {
val eventData = EventData.create(data)
if(properties!=null)
eventData.getProperties().putAll(properties.asJava)
val t1=System.nanoTime()
client.send(eventData)
val et = (System.nanoTime()-t1)/1E9
getLogger().info(s"sent ${data.length} bytes in $et seconds")
}
def sendAvroData(data: Array[Byte] ) ={
if(data!=null) {
val eventData = EventData.create(data)
client.send(eventData)
getLogger.info("avro eventData length = " + data.length)
}
}
def close() = {
if(client!=null){
client.closeSync()
}
}
}
object EventHubSenderPool {
private var sender:EventHubSender = null
def getSender(outputEventHubConf: EventHubConf): EventHubSender ={
if(outputEventHubConf == null
|| outputEventHubConf.connectionString==null
|| outputEventHubConf.connectionString.isEmpty){
throw new EngineException(s"Unexpected empty eventhub conf")
}
if(sender==null){
this.synchronized {
if (sender == null) {
LogManager.getLogger(this.getClass).warn(s"Constructing eventhub sender for ${outputEventHubConf.name}")
sender = new EventHubSender(EventHubBase.getOutputClient(outputEventHubConf.connectionString))
}
}
}
sender
}
}
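// Illustrative sketch of obtaining the pooled sender and sending a small JSON payload;
// the connection string below is a hypothetical placeholder.
object EventHubSenderExample {
  def main(args: Array[String]): Unit = {
    val conf = EventHubConf(name = "metrics", connectionString = "<eventhub-connection-string>")
    val sender = EventHubSenderPool.getSender(conf)
    sender.sendString("""{"metric":"latency","value":42}""", Map("source" -> "example"))
    sender.close()
  }
}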


@ -0,0 +1,63 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.http
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.{CloseableHttpResponse, HttpPost}
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.{CloseableHttpClient, HttpClientBuilder}
import org.apache.http.util.EntityUtils
import org.apache.log4j.LogManager
class HttpPostSender(url: String) {
def createClient(): CloseableHttpClient ={
val timeout = 5*1000
val requestConfig = RequestConfig.custom()
.setConnectionRequestTimeout(timeout)
.setConnectTimeout(timeout)
.setSocketTimeout(timeout)
.build()
HttpClientBuilder.create().setDefaultRequestConfig(requestConfig).build()
}
private val client = createClient()
private val poster = new HttpPost(url)
poster.addHeader("Content-Type", "application/json")
protected val senderName: String = this.getClass.getSimpleName
val logger = LogManager.getLogger(this.getClass)
logger.info(s"Constructing HttpPoster '$senderName' with url '$url'")
def sendString(data: String): String = {
val t1 = System.nanoTime()
val result = this.synchronized{
poster.setEntity(new StringEntity(data))
var response: CloseableHttpResponse = null
try {
response = client.execute(poster)
EntityUtils.toString(response.getEntity())
}
catch {
case e: Exception =>
val msg = s"!!HttpPoster failed to send '$data' to '$url'"
logger.error(msg, e)
msg
}
finally {
if(response!=null){
response.close()
response=null
}
}
}
val time = System.nanoTime()-t1
logger.warn(s"HttpPoster Result:'$result' within ${time/1E9} seconds")
result
}
}
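// Illustrative sketch of posting a JSON payload with the sender above; the endpoint URL is a
// hypothetical placeholder.
object HttpPostSenderExample {
  def main(args: Array[String]): Unit = {
    val sender = new HttpPostSender("https://example.com/api/ingest")
    val response = sender.sendString("""{"event":"ping"}""")
    println(response)
  }
}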


@ -0,0 +1,181 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.redis
import java.net.SocketAddress
import java.time.Duration
import java.util.concurrent.ConcurrentHashMap
import io.lettuce.core._
import io.lettuce.core.api.sync.{RedisSortedSetCommands, RedisStringCommands}
import io.lettuce.core.cluster.ClusterTopologyRefreshOptions.RefreshTrigger
import io.lettuce.core.cluster.{ClusterClientOptions, ClusterTopologyRefreshOptions, RedisClusterClient}
import datax.constants.ProductConstant
import datax.exception.EngineException
import org.apache.log4j.LogManager
import datax.utility.ConverterUtil.scalaFunctionToJava
case class RedisServerConf(name:String, host: String, port: Int, key: String, timeout: Int, useSsl: Boolean, isCluster: Boolean)
object RedisBase {
val logger = LogManager.getLogger(this.getClass)
/**
* convert a redis cache connection string to a RedisServerConf
* the connection string is expected to be in the form
* <host>:<port>,password=<password>,ssl=True|False,cluster=True|False,timeout=<timeout>
* @param connString connection string to parse
* @return a RedisServerConf object
*/
def parseConnectionString(connString: String): RedisServerConf = {
if(connString == null || connString.isEmpty) return null
val parts = connString.split(",")
val hostAndPort = parts(0).trim.split(":")
if(hostAndPort.length!=2) throw new EngineException(s"Malformed format of host and port in redis connection string")
val host = hostAndPort(0)
val port = hostAndPort(1).toInt
val dict = parts.drop(1).map(p=>{
val pos = p.indexOf("=")
if(pos>0){
p.substring(0, pos) -> p.substring(pos+1)
}
else
throw new EngineException(s"Malformed format of parts in redis connection string")
}).toMap
RedisServerConf(
name = host,
host = host,
port = port,
key = dict.get("password").orNull,
timeout = dict.getOrElse("timeout","3000").toInt,
useSsl = dict.getOrElse("ssl", "True").toBoolean,
isCluster = dict.getOrElse("cluster", "True").toBoolean
)
}
def buildUri(conf: RedisServerConf, clientName: String = ProductConstant.ProductRedisBase) ={
RedisURI.Builder.redis(conf.host)
.withPassword(conf.key)
.withPort(conf.port.toInt)
.withSsl(conf.useSsl)
.withVerifyPeer(false)
.withClientName(clientName)
.withTimeout(Duration.ofMillis(conf.timeout))
.build()
}
def getClusterConnection(client: RedisClusterClient) = {
val topologyRefreshOptions = ClusterTopologyRefreshOptions.builder()
.enablePeriodicRefresh(false)
//.refreshPeriod(Duration.ofMinutes(30))
.enableAdaptiveRefreshTrigger(RefreshTrigger.MOVED_REDIRECT, RefreshTrigger.PERSISTENT_RECONNECTS)
.adaptiveRefreshTriggersTimeout(Duration.ofSeconds(120))
.build()
val so = SocketOptions.builder()
.connectTimeout(Duration.ofSeconds(120))
.keepAlive(true)
.tcpNoDelay(true)
.build()
client.setOptions(ClusterClientOptions.builder()
.socketOptions(so)
.validateClusterNodeMembership(false)
.topologyRefreshOptions(topologyRefreshOptions)
.build())
client.connect
}
def getStandardConnection(client: RedisClient) = {
client.connect()
}
private def getInternalConnection(conf: RedisServerConf, clientName: String):RedisConnection = {
if (conf.isCluster)
new RedisClusterConnection(conf, clientName)
else
new RedisStandardConnection(conf, clientName)
}
private val connectionPool = new ConcurrentHashMap[String, RedisConnection]
def getConnection(connectionString: String, clientName: String) = {
val conf = parseConnectionString(connectionString)
connectionPool.computeIfAbsent(clientName, (k: String) => getInternalConnection(conf, clientName))
}
}
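// Illustrative sketch of parsing a connection string of the documented format; the host and password
// below are hypothetical placeholders.
object RedisConnectionStringExample {
  def main(args: Array[String]): Unit = {
    val conf = RedisBase.parseConnectionString(
      "mycache.redis.cache.windows.net:6380,password=<secret>,ssl=True,cluster=False,timeout=5000")
    // yields RedisServerConf(name = host, host, port = 6380, key = "<secret>",
    //                        timeout = 5000, useSsl = true, isCluster = false)
    println(conf)
  }
}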
class RedisStandardConnection(redisServerConf: RedisServerConf, clientName: String = ProductConstant.ProductRedisStandardConnection) extends RedisConnection{
private val loggerPrefix = s"RedisClusterConnection-${redisServerConf.host}"
private val logger = LogManager.getLogger(loggerPrefix)
private val uri = RedisBase.buildUri(redisServerConf, clientName)
private val client = RedisClient.create(uri)
client.setDefaultTimeout(Duration.ofSeconds(120))
logger.warn(s"${loggerPrefix}:Init connection to ${uri.getHost}")
var connection = RedisBase.getStandardConnection(client)
logger.info(s"${loggerPrefix}:Created connection")
override def getStringCommands: RedisStringCommands[String, String] = {
connection.sync()
}
override def getSortedSetCommands: RedisSortedSetCommands[String, String] = {
connection.sync()
}
override def reconnect: Unit = {
this.synchronized{
logger.warn(s"${loggerPrefix}:Closing the connection for reconnect")
connection.close()
connection=RedisBase.getStandardConnection(client)
}
}
}
class RedisClusterConnection(redisServerConf: RedisServerConf, clientName: String = ProductConstant.ProductRedisClusterConnection) extends RedisConnection{
private val loggerPrefix = s"RedisClusterConnection-${redisServerConf.host}"
private val logger = LogManager.getLogger(loggerPrefix)
private val uri = RedisBase.buildUri(redisServerConf, clientName)
private val client = RedisClusterClient.create(uri)
client.setDefaultTimeout(Duration.ofSeconds(120))
logger.warn(s"${loggerPrefix}:Init connection")
private var connection = RedisBase.getClusterConnection(client)
client.addListener(new RedisConnectionStateListener(){
override def onRedisConnected(connection: RedisChannelHandler[_, _], socketAddress: SocketAddress): Unit = {
logger.warn(s"${loggerPrefix}:Connected with socket:${socketAddress}")
}
override def onRedisDisconnected(redisChannelHandler: RedisChannelHandler[_, _]): Unit = {
logger.warn(s"${loggerPrefix}:Lost connection")
}
override def onRedisExceptionCaught(redisChannelHandler: RedisChannelHandler[_, _], throwable: Throwable): Unit = {
logger.error(s"${loggerPrefix}:Encounter exception", throwable)
}
})
logger.info(s"${loggerPrefix}:Created connection")
override def getStringCommands: RedisStringCommands[String, String] = {
connection.sync()
}
override def getSortedSetCommands: RedisSortedSetCommands[String, String] = {
connection.sync()
}
override def reconnect: Unit = {
this.synchronized{
logger.warn(s"${loggerPrefix}:Closing the connection for reconnect")
connection.close()
connection=RedisBase.getClusterConnection(client)
}
}
}
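// Illustrative sketch of using the pooled connection for simple string commands; the connection string
// and client name are hypothetical placeholders.
object RedisUsageExample {
  def writeAndRead(connectionString: String): String = {
    val conn = RedisBase.getConnection(connectionString, "example-client")
    val strings = conn.getStringCommands
    strings.set("datax:example:key", "value")
    strings.get("datax:example:key")
  }
}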


@ -0,0 +1,14 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.client.redis
import io.lettuce.core.api.sync.{RedisSortedSetCommands, RedisStringCommands}
trait RedisConnection{
def getStringCommands: RedisStringCommands[String, String]
def getSortedSetCommands: RedisSortedSetCommands[String, String]
def reconnect(): Unit
}


@ -0,0 +1,136 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.config
import datax.constants.JobArgument
import datax.exception.EngineException
import datax.fs.HadoopClient
import datax.service.ConfigService
import datax.utility.ArgumentsParser
import org.apache.log4j.LogManager
import org.apache.spark.SparkConf
/***
* Singleton service to set and get the active setting dictionary
*/
object ConfigManager extends ConfigService {
private val logger = LogManager.getLogger("DictionaryigManager")
def initSparkConf() = new SparkConf()
def loadLocalConfigIfExists[T](configurationFile: String)(implicit mf: Manifest[T]) : Option[T] = {
val configString = HadoopClient.readLocalFileIfExists(configurationFile)
if(configString==null)
None
else {
implicit val formats = org.json4s.DefaultFormats
Some(org.json4s.jackson.parseJson(configString).extract[T])
}
}
private def getLocalEnvVars(): Map[String, String] = {
sys.env.filterKeys(_.startsWith(JobArgument.ConfNamePrefix))
}
private def getLocalConf(): SettingDictionary = {
SettingDictionary(getLocalEnvVars())
}
var singletonConfDict: SettingDictionary = null
def getActiveDictionary(): SettingDictionary = {
if(singletonConfDict==null){
ConfigManager.synchronized{
if(singletonConfDict==null){
singletonConfDict = getLocalConf()
}
}
}
singletonConfDict
}
def setActiveDictionary(conf: SettingDictionary) = {
ConfigManager.synchronized{
singletonConfDict = conf
}
}
def getConfigurationFromArguments(inputArguments: Array[String]):SettingDictionary = {
val namedArgs = ArgumentsParser.getNamedArgs(inputArguments)
logger.warn("cmd line args:"+namedArgs)
if (!namedArgs.contains("conf")) {
throw new Error("configuration file is not specified.")
}
val envs = getLocalEnvVars()
val convertedConf = Map(
JobArgument.ConfName_AppConf -> namedArgs.getOrElse("conf", null),
JobArgument.ConfName_DriverLogLevel -> namedArgs.getOrElse("driverLogLevel", null),
JobArgument.ConfName_LogLevel -> namedArgs.getOrElse("executorLogLevel", null),
JobArgument.ConfName_CheckpointEnabled -> namedArgs.getOrElse("checkpointEnabled", null)
).filter(_._2!=null)
logger.warn("local env:"+envs.mkString(","))
setActiveDictionary(SettingDictionary(envs ++ namedArgs ++ convertedConf))
singletonConfDict
}
private def replaceTokens(src: String, tokens: Map[String, String]) = {
if(tokens==null || src==null ||src.isEmpty)
src
else
tokens.foldLeft(src)((r, kv) => r.replaceAllLiterally("${" + kv._1 + "}", kv._2))
}
private def readJsonFile[T](configurationFile: String, replacements: Map[String, String])(implicit mf: Manifest[T]): T = {
val configString = HadoopClient.readHdfsFile(configurationFile).mkString("")
implicit val formats = org.json4s.DefaultFormats
org.json4s.jackson
.parseJson(replaceTokens(configString, replacements))
.extract[T]
}
private def splitString(s: String):(String, String) = {
val pos = s.indexOf("=")
if(pos==0)
""->s
else if(pos>0)
s.substring(0, pos).trim()->s.substring(pos+1).trim()
else
s.trim()->null
}
private def readConfFile(filePath: String, replacements: Map[String, String]) = {
if(filePath==null)
throw new EngineException(s"No conf file is provided")
else if(!filePath.toLowerCase().endsWith(".conf"))
throw new EngineException(s"non-conf file is not supported as configuration input")
parseConfLines(HadoopClient.readHdfsFile(filePath), replacements)
}
def loadConfig(sparkConf: SparkConf): UnifiedConfig = {
val dict = getActiveDictionary()
val confFile = dict.getAppConfigurationFile()
val confProps = readConfFile(confFile, dict.dict)
val newdict = SettingDictionary(dict.dict ++ confProps)
setActiveDictionary(newdict)
logger.warn("Load Dictionary as following:\n"+newdict.dict.map(kv=>s"${kv._1}->${kv._2}").mkString("\n"))
UnifiedConfig(sparkConf = sparkConf, dict = newdict)
}
private def parseConfLines(lines: Iterable[String], replacements: Map[String, String]) = {
lines
// skip empty lines or commented lines
.filter(l=>l!=null && !l.trim().isEmpty && !l.trim().startsWith("#"))
.map(splitString)
.map{case(k,v)=>k->replaceTokens(v, replacements)}
.toMap
}
}
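// Illustrative sketch of the .conf format consumed by readConfFile/parseConfLines: '#' lines are
// comments, settings are key=value pairs, and ${token} references are substituted from the active
// dictionary. The setting names and token below are hypothetical placeholders.
//
//   # sample job configuration
//   datax.job.input.eventhub.connectionstring=keyvault://myvault/input-eventhub-connectionstring
//   datax.job.process.projection=${DATAX_DEFAULTCONTAINER}/projections/default.txt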


@ -0,0 +1,36 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.data
import java.sql.Timestamp
import datax.utility.DateTimeUtil
import org.apache.spark.SparkEnv
import org.apache.spark.sql.Row
import org.apache.spark.sql.functions.udf
case class FileInternal(inputPath: String = null,
outputFolders: scala.collection.Map[String, String] = null,
outputFileName: String = null,
fileTime: Timestamp = null,
ruleIndexPrefix: String = null,
target: String = null
)
object FileInternal {
def appendEmptyInternalInfo(json: String) = {
(FileInternal(), json)
}
def udfEmptyInternalInfo = udf(()=>FileInternal())
val getInfoInputPath = (rowInfo: Row) => rowInfo.getString(0)
val getInfoOutputFolder = (rowInfo: Row, group: String) => if(rowInfo.isNullAt(1)) null else rowInfo.getMap[String, String](1).getOrElse(group, null)
val getInfoOutputFileName = (rowInfo: Row) => rowInfo.getString(2)
val getInfoFileTimeString = (rowInfo: Row) => if(rowInfo.isNullAt(3))null else rowInfo.getTimestamp(3).toString
val getInfoRuleIndexPrefix = (rowInfo: Row) => rowInfo.getString(4)
val getInfoTargetTag = (rowInfo: Row) => rowInfo.getString(5)
}


@ -0,0 +1,8 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.data
case class ProcessResult(blobsCount: Long, eventsCount: Long)


@ -0,0 +1,778 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.fs
import java.io._
import java.net.URI
import java.nio.channels.FileChannel
import java.nio.file.Files
import java.util.concurrent.{Executors, TimeUnit}
import java.util.zip.GZIPInputStream
import com.google.common.io.{Files => GFiles}
import datax.config.SparkEnvVariables
import datax.constants.ProductConstant
import datax.exception.EngineException
import datax.securedsetting.KeyVaultClient
import datax.telemetry.AppInsightLogger
import org.apache.commons.codec.digest.DigestUtils
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path, RemoteIterator}
import org.apache.log4j.LogManager
import scala.language.implicitConversions
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
import scala.concurrent.duration.Duration
import scala.concurrent.{Await, ExecutionContext, Future, TimeoutException}
import scala.io.Source
object HadoopClient {
val logger = LogManager.getLogger(this.getClass)
private val threadPool = Executors.newFixedThreadPool(5)
implicit private val ec = ExecutionContext.fromExecutorService(threadPool)
private var hadoopConf:Configuration = null
/***
* initialize the cached hadoop configuration
* @param conf hadoop configuration for initialization
*/
def setConf(conf: Configuration = null): Unit ={
if(conf==null)
hadoopConf = new Configuration()
else
hadoopConf = conf
}
/***
* get the cached hadoop configuration
* @return the cached hadoop configuration
*/
def getConf() = {
if(hadoopConf==null)
this.synchronized{
if(hadoopConf==null)
setConf()
}
hadoopConf
}
/***
* get the name of storage account from a wasb-format path
* @param path an hdfs path
* @return the storage account name if the wasb/wasbs path contains one, otherwise null
*/
private def getWasbStorageAccount(path: String): String = {
val uri = new URI(path.replace(" ", "%20"))
val scheme = uri.getScheme
if(scheme == "wasb" || scheme == "wasbs")
Option(uri.getHost) match {
case Some(host) => host.toLowerCase().replace(".blob.core.windows.net", "")
case None => null
}
else
null
}
/***
* get a distinct set of storage accounts from a list of file paths
* @param paths a list of hdfs paths which might contain wasb/wasbs paths
* @return a distinct set of names of storage accounts
*/
private def getWasbStorageAccounts(paths: Seq[String]): Set[String] = {
paths.map(getWasbStorageAccount _).filter(_!=null).toSet
}
/***
* internal cache of storage keys against storage account names.
*/
private val storageAccountKeys = new mutable.HashMap[String, String]
/***
* set the key for a storage account so the azure-hadoop adapter can access it later
* @param sa name of the storage account
* @param key key to the storage account
*/
private def setStorageAccountKey(sa: String, key: String): Unit ={
storageAccountKeys.synchronized{
storageAccountKeys += sa->key
}
// get the default storage account
val defaultFS = getConf().get("fs.defaultFS","")
// set the key only if it's a non-default storage account
if(!defaultFS.toLowerCase().contains(s"$sa.blob.core.windows.net")) {
logger.warn(s"Setting the key in hdfs conf for storage account $sa")
getConf().set(s"fs.azure.account.key.$sa.blob.core.windows.net", key)
}
else {
logger.warn(s"Default storage account $sa found, skipping setting the key")
}
}
/***
* resolve key for storage account with a keyvault name
* warn if the key is not found, but continue so that static key settings outside of the job can still work
* @param vaultName key vault name to get the key of storage account
* @param sa name of the storage account
*/
private def resolveStorageAccount(vaultName: String, sa: String) = {
val secretId = s"keyvault://$vaultName/${ProductConstant.ProductRoot}-sa-$sa"
KeyVaultClient.getSecret(secretId) match {
case Some(value)=>
logger.warn(s"Retrieved key for storage account '$sa' with secretid:'$secretId'")
setStorageAccountKey(sa, value)
case None =>
logger.warn(s"Failed to find key for storage account '$sa' with secretid:'$secretId'")
}
}
/***
* set key for storage account required by the specified hdfs path
* @param path hdfs path whose storage account key should be resolved if it is a valid wasb/wasbs path; do nothing otherwise
*/
private def resolveStorageAccountKeyForPath(path: String) = {
val sa = getWasbStorageAccount(path)
if(sa != null && !sa.isEmpty){
KeyVaultClient.withKeyVault {vaultName => resolveStorageAccount(vaultName, sa)}
}
}
/***
* resolve key for storage accounts required by the specified hdfs paths
* @param paths a list of hdfs paths; do nothing if there are no valid wasb/wasbs paths
*/
private def resolveStorageAccountKeysForPaths(paths: Seq[String]) = {
val storageAccounts = getWasbStorageAccounts(paths)
.filter(p=>p!=null & !p.isEmpty)
.filterNot(storageAccountKeys.contains(_)) //TODO: make storageAccountKeys thread-safe
if(!storageAccounts.isEmpty){
KeyVaultClient.withKeyVault {vaultName => storageAccounts.foreach(sa=>resolveStorageAccount(vaultName, sa))}
}
}
/***
* export storage account keys to an immutable dictionary for serialization
* @param paths hdfs paths to determine the storage accounts we need
* @return storage accounts and corresponding keys resolved from the input hdfs paths
*/
private def exportWasbKeys(paths: Seq[String]): Map[String, String] = {
//TODO: make storageAccountKeys thread-safe
getWasbStorageAccounts(paths).map(sa => sa->storageAccountKeys.getOrElse(sa, null))
.filter(_._2!=null)
.toMap
}
/**
* Return a Hadoop FileSystem with the scheme encoded in the given path.
* @param path hdfs path to determine the file system from
* @param conf hadoop configuration for the determination
*/
private def getHadoopFileSystem(path: URI, conf: Configuration): FileSystem = {
FileSystem.get(path, conf)
}
/***
* read local file (non-hadoop) from disk if it exists
* @param fileName path to the local file
* @return content of the file if it exists, else null.
*/
def readLocalFileIfExists(fileName: String): String = {
val file = new File(fileName)
if(file.exists()){
val openFile = Source.fromFile(file)
val result = openFile.getLines().mkString
openFile.close()
result
}
else{
null
}
}
private def readLocalFile(fileName: String): String = {
val file = Source.fromFile(fileName)
val result = file.getLines().mkString
file.close()
result
}
def fileExists(hdfsPath: String): Boolean = {
val path = new Path(hdfsPath)
val fs = path.getFileSystem(getConf())
fs.exists(path)
}
/***
* read a hdfs file
* @param hdfsPath path to the hdfs file
* @param gzip whether it is a gzipped file
* @throws IOException if any
* @return an iterable of strings with the content of the file
*/
@throws[IOException]
def readHdfsFile(hdfsPath: String, gzip:Boolean=false): Iterable[String] = {
val logger = LogManager.getLogger(s"FileLoader${SparkEnvVariables.getLoggerSuffix()}")
// resolve key to access azure storage account
resolveStorageAccountKeyForPath(hdfsPath)
val lines = new ListBuffer[String]
val t1= System.nanoTime()
logger.info(s"Loading '$hdfsPath'")
try{
val path = new Path(hdfsPath)
val fs = path.getFileSystem(getConf())
val is = fs.open(path)
//val source = Source.fromInputStream(is)
val inputStream = if(gzip)new GZIPInputStream(is) else is
val reader = new BufferedReader(new InputStreamReader(inputStream))
try{
//source.getLines().toList
var line = reader.readLine()
while(line!=null){
lines += line
line = reader.readLine()
}
}
finally {
reader.close()
}
}
catch {
case e: Exception =>{
logger.error(s"Error in reading '$hdfsPath'", e)
AppInsightLogger.trackException(e, Map(
"errorLocation" -> "readHdfsFile",
"errorMessage" -> "Error in reading file",
"failedHdfsPath" -> hdfsPath
), null)
throw e
}
}
val elapsedTime = (System.nanoTime()-t1)/1E9
logger.info(s"Done loading '$hdfsPath', count: ${lines.size}, elapsed time: $elapsedTime seconds")
//TODO: return a iterator instead of the entire list to reduce memory consumption, may also possibly help optimize job performance
lines
}
/**
* write string content to a specified hdfs path
* @param hdfsPath path to the specified hdfs file
* @param content string content to write into the file
* @param overwriteIfExists flag to specify if the file needs to be overwritten if it already exists in hdfs
* @throws IOException if any occurs in the write operation
*/
@throws[IOException]
def writeHdfsFile(hdfsPath: String, content: String, overwriteIfExists:Boolean) {
writeHdfsFile(hdfsPath, content.getBytes("UTF-8"), getConf(), overwriteIfExists)
}
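// Illustrative sketch of round-tripping a small text file through the read/write helpers above;
// the wasbs path is a hypothetical placeholder.
private def readWriteExample(): Unit = {
  val path = "wasbs://container@myaccount.blob.core.windows.net/tmp/example.txt"
  writeHdfsFile(path, "hello", overwriteIfExists = true)
  readHdfsFile(path).foreach(println)
}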
/**
* generate a random file name
* @return a random file name of 8 characters.
*/
private def randomFileName():String = {
java.util.UUID.randomUUID().toString.substring(0, 8)
}
/**
* generate a random string for prefixing a temp file name
* @param seed seed for the randomization of names
* @return a random string with 8 characters for prefixing file names
*/
def tempFilePrefix(seed: String): String = {
DigestUtils.sha256Hex(seed).substring(0, 8)
}
/**
* write to a specified hdfs file with retries
* @param hdfsPath the specified hdfs file
* @param content content to write into the file
* @param timeout timeout duration for the write operation, by default 5 seconds
* @param retries number of retries; defaults to 0, meaning no retries.
*/
def writeWithTimeoutAndRetries(hdfsPath: String,
content: Array[Byte],
timeout: Duration = Duration(5, TimeUnit.SECONDS),
retries: Int = 0
) = {
val logger = LogManager.getLogger(s"FileWriter${SparkEnvVariables.getLoggerSuffix()}")
def f = Future{
writeHdfsFile(hdfsPath, content, getConf(), false)
}
var remainingAttempts = retries+1
while(remainingAttempts>0) {
try {
remainingAttempts -= 1
logger.info(s"writing to $hdfsPath with remaining attempts: $remainingAttempts")
Await.result(f, timeout)
remainingAttempts = 0
}
catch {
case e: TimeoutException =>
remainingAttempts = 0
throw e
}
}
}
/**
* make sure the parent folder of the given path exists; create it if it doesn't
* @param path specified path to check its parent folder
*/
def ensureParentFolderExists(path: String): Unit = {
val file = new Path(path)
val folder = file.getParent
val fs = folder.getFileSystem(getConf())
if(!fs.exists(folder)){
fs.mkdirs(folder)
}
}
/**
* write content to a hdfs file
* @param hdfsPath path to the specified hdfs file
* @param content content to write into the file
* @param conf hadoop configuration
* @param overwriteIfExists flag to specify if the file needs to be overwritten if it already exists in hdfs
* @throws IOException if any from lower file system operation
*/
@throws[IOException]
def writeHdfsFile(hdfsPath: String, content: Array[Byte], conf: Configuration, overwriteIfExists:Boolean) {
resolveStorageAccountKeyForPath(hdfsPath)
val logger = LogManager.getLogger("writeHdfsFile")
val path = new Path(hdfsPath)
val uri = path.toUri
val fsy = path.getFileSystem(conf)
// If output file already exists and overwrite flag is not set, bail out
if(fsy.exists(path) && !overwriteIfExists){
logger.warn(s"Output file ${path} already exists and overwrite flag ${overwriteIfExists}. Skipping writing again .")
return
}
val tempHdfsPath = new URI(uri.getScheme, uri.getAuthority, "/_$tmpHdfsFolder$/"+tempFilePrefix(hdfsPath) + "-" + path.getName, null, null)
//val pos = hdfsPath.lastIndexOf('/')
//val tempHdfsPath = hdfsPath.patch(pos, "/_temporary", 0)
// TODO: create unique name for each temp file.
val tempPath = new Path(tempHdfsPath)
val fs = path.getFileSystem(conf)
val bs = new BufferedOutputStream(fs.create(tempPath, true))
bs.write(content)
bs.close()
// If output file already exists and overwrite flag is set, delete old file and then rewrite new file
if(fs.exists(path) && overwriteIfExists){
logger.warn(s"Output file ${path} already exists and overwrite flag ${overwriteIfExists}. Deleting it.")
fs.delete(path, true)
}
if(!fs.rename(tempPath, path)) {
// Rename failed, check if it was due to destination path already exists.
// If yes, fail only if overwrite is set. If destination does not exist, then fail as-well.
val fileExists = fs.exists(path)
if (!fileExists || (fileExists && overwriteIfExists)) {
val parent = path.getParent
val msg = if(fs.exists(parent)) s"Move ${tempPath} to ${path} did not succeed"
else s"Move ${tempPath} to ${path} did not succeed since parent folder does not exist!"
throw new IOException(msg)
}
else {
logger.warn(s"Blob rename from ${tempPath} to ${path} failed, but moving on since target already exists and overwrite is set to false.")
}
}
}
/**
* create a folder at the specified path
* @param folderPath path to create the folder
*/
def createFolder(folderPath: String): Unit ={
resolveStorageAccountKeyForPath(folderPath)
val path = new Path(folderPath)
val fs = path.getFileSystem(getConf())
fs.mkdirs(path)
}
/**
* implicitly convert a RemoteIterator to an Iterator
* @param underlying the underlying RemoteIterator instance
* @tparam T type of the elements in the Iterator
* @return an Iterator instance
*/
implicit def convertToScalaIterator[T](underlying: RemoteIterator[T]): Iterator[T] = {
case class wrapper(underlying: RemoteIterator[T]) extends Iterator[T] {
override def hasNext = underlying.hasNext
override def next = underlying.next
}
wrapper(underlying)
}
/**
* list files under a folder
* @param folder path to the specified folder
* @return a list of file paths under the folder
*/
def listFiles(folder: String): Iterator[String] = {
val path = new Path(folder)
val fs = path.getFileSystem(getConf)
if(fs.exists(path))
fs.listFiles(path, true).map(f=>f.getPath.toString)
else
Iterator.empty
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
/**
* Execute a block of code, then a finally block, but if exceptions happen in
* the finally block, do not suppress the original exception.
*
* This is primarily an issue with `finally { out.close() }` blocks, where
* close needs to be called to clean up `out`, but if an exception happened
* in `out.write`, it's likely `out` may be corrupted and `out.close` will
* fail as well. This would then suppress the original/likely more meaningful
* exception from the original `out.write` call.
*/
def tryWithSafeFinally[T](block: => T)(finallyBlock: => Unit): T = {
var originalThrowable: Throwable = null
try {
block
} catch {
case t: Throwable =>
// Purposefully not using NonFatal, because even fatal exceptions
// we don't want to have our finallyBlock suppress
originalThrowable = t
throw originalThrowable
} finally {
try {
finallyBlock
} catch {
case t: Throwable if (originalThrowable != null && originalThrowable != t) =>
originalThrowable.addSuppressed(t)
val logger = LogManager.getLogger("TryWithSafe")
logger.warn(s"Suppressing exception in finally: ${t.getMessage}", t)
throw originalThrowable
}
}
}
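// Illustrative sketch of typical use of tryWithSafeFinally: a failure while copying is reported as the
// primary exception, while a failure in close() is attached as a suppressed exception.
private def tryWithSafeFinallyExample(in: InputStream, out: OutputStream): Unit = {
  tryWithSafeFinally {
    copyStream(in, out)
  } {
    out.close()
    in.close()
  }
}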
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
def copyFileStreamNIO(
input: FileChannel,
output: FileChannel,
startPosition: Long,
bytesToCopy: Long): Unit = {
val initialPos = output.position()
var count = 0L
// In case transferTo method transferred less data than we have required.
while (count < bytesToCopy) {
count += input.transferTo(count + startPosition, bytesToCopy - count, output)
}
assert(count == bytesToCopy,
s"request to copy $bytesToCopy bytes, but actually copied $count bytes.")
// Check the position after transferTo loop to see if it is in the right position and
// give user information if not.
// Position will not be increased to the expected length after calling transferTo in
// kernel version 2.6.32, this issue can be seen in
// https://bugs.openjdk.java.net/browse/JDK-7052359
// This will lead to stream corruption issue when using sort-based shuffle (SPARK-3948).
val finalPos = output.position()
val expectedPos = initialPos + bytesToCopy
assert(finalPos == expectedPos,
s"""
|Current position $finalPos do not equal to expected position $expectedPos
|after transferTo, please check your kernel version to see if it is 2.6.32,
|this is a kernel bug which will lead to unexpected behavior when using transferTo.
|You can set spark.file.transferTo = false to disable this NIO feature.
""".stripMargin)
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
/**
* Copy all data from an InputStream to an OutputStream. NIO way of file stream to file stream
* copying is disabled by default unless explicitly set transferToEnabled as true,
* the parameter transferToEnabled should be configured by spark.file.transferTo = [true|false].
*/
def copyStream(
in: InputStream,
out: OutputStream,
closeStreams: Boolean = false,
transferToEnabled: Boolean = false): Long = {
tryWithSafeFinally {
if (in.isInstanceOf[FileInputStream] && out.isInstanceOf[FileOutputStream]
&& transferToEnabled) {
// When both streams are File stream, use transferTo to improve copy performance.
val inChannel = in.asInstanceOf[FileInputStream].getChannel()
val outChannel = out.asInstanceOf[FileOutputStream].getChannel()
val size = inChannel.size()
copyFileStreamNIO(inChannel, outChannel, 0, size)
size
} else {
var count = 0L
val buf = new Array[Byte](8192)
var n = 0
while (n != -1) {
n = in.read(buf)
if (n != -1) {
out.write(buf, 0, n)
count += n
}
}
count
}
} {
if (closeStreams) {
try {
in.close()
} finally {
out.close()
}
}
}
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
/**
* Copy `sourceFile` to `destFile`.
*
* If `destFile` already exists:
* - no-op if its contents equal those of `sourceFile`,
* - throw an exception if `fileOverwrite` is false,
* - attempt to overwrite it otherwise.
*
* @param url URL that `sourceFile` originated from, for logging purposes.
* @param sourceFile File path to copy/move from.
* @param destFile File path to copy/move to.
* @param fileOverwrite Whether to delete/overwrite an existing `destFile` that does not match
* `sourceFile`
* @param removeSourceFile Whether to remove `sourceFile` after / as part of moving/copying it to
* `destFile`.
*/
private def copyFile(
url: String,
sourceFile: File,
destFile: File,
fileOverwrite: Boolean,
removeSourceFile: Boolean = false): Unit = {
val logger = LogManager.getLogger("CopyFile")
if (destFile.exists) {
if (!filesEqualRecursive(sourceFile, destFile)) {
if (fileOverwrite) {
logger.info(
s"File $destFile exists and does not match contents of $url, replacing it with $url"
)
if (!destFile.delete()) {
throw new EngineException(
"Failed to delete %s while attempting to overwrite it with %s".format(
destFile.getAbsolutePath,
sourceFile.getAbsolutePath
)
)
}
} else {
throw new EngineException(
s"File $destFile exists and does not match contents of $url")
}
} else {
// Do nothing if the file contents are the same, i.e. this file has been copied
// previously.
logger.info(
"%s has been previously copied to %s".format(
sourceFile.getAbsolutePath,
destFile.getAbsolutePath
)
)
return
}
}
// The file does not exist in the target directory. Copy or move it there.
if (removeSourceFile) {
Files.move(sourceFile.toPath, destFile.toPath)
} else {
logger.info(s"Copying ${sourceFile.getAbsolutePath} to ${destFile.getAbsolutePath}")
copyRecursive(sourceFile, destFile)
}
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
private def filesEqualRecursive(file1: File, file2: File): Boolean = {
if (file1.isDirectory && file2.isDirectory) {
val subfiles1 = file1.listFiles()
val subfiles2 = file2.listFiles()
if (subfiles1.size != subfiles2.size) {
return false
}
subfiles1.sortBy(_.getName).zip(subfiles2.sortBy(_.getName)).forall {
case (f1, f2) => filesEqualRecursive(f1, f2)
}
} else if (file1.isFile && file2.isFile) {
GFiles.equal(file1, file2)
} else {
false
}
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
private def copyRecursive(source: File, dest: File): Unit = {
if (source.isDirectory) {
if (!dest.mkdir()) {
throw new IOException(s"Failed to create directory ${dest.getPath}")
}
val subfiles = source.listFiles()
subfiles.foreach(f => copyRecursive(f, new File(dest, f.getName)))
} else {
Files.copy(source.toPath, dest.toPath)
}
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
/**
* Download `in` to `tempFile`, then move it to `destFile`.
*
* If `destFile` already exists:
* - no-op if its contents equal those of `sourceFile`,
* - throw an exception if `fileOverwrite` is false,
* - attempt to overwrite it otherwise.
*
* @param url URL that `sourceFile` originated from, for logging purposes.
* @param in InputStream to download.
* @param destFile File path to move `tempFile` to.
* @param fileOverwrite Whether to delete/overwrite an existing `destFile` that does not match
* `sourceFile`
*/
private def downloadFile(
url: String,
in: InputStream,
destFile: File,
fileOverwrite: Boolean): Unit = {
val logger = LogManager.getLogger("DownloadFile")
val tempFile = File.createTempFile("fetchFileTemp", null,
new File(destFile.getParentFile.getAbsolutePath))
logger.info(s"Fetching $url to $tempFile")
try {
val out = new FileOutputStream(tempFile)
copyStream(in, out, closeStreams = true)
copyFile(url, tempFile, destFile, fileOverwrite, removeSourceFile = true)
} finally {
// Catch-all for the couple of cases where for some reason we didn't move `tempFile` to
// `destFile`.
if (tempFile.exists()) {
tempFile.delete()
}
}
}
/*
* This function is copied from Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
def fetchHdfsFile(path: Path,
targetDir: java.io.File,
fs: FileSystem,
hadoopConf: Configuration,
fileOverwrite: Boolean,
filename: Option[String] = None): Unit = {
if (!targetDir.exists() && !targetDir.mkdir()) {
throw new IOException(s"Failed to create directory ${targetDir.getPath}")
}
val dest = new File(targetDir, filename.getOrElse(path.getName))
if (fs.isFile(path)) {
val in = fs.open(path)
try {
downloadFile(path.toString, in, dest, fileOverwrite)
} finally {
in.close()
}
} else {
fs.listStatus(path).foreach { fileStatus =>
fetchHdfsFile(fileStatus.getPath(), dest, fs, hadoopConf, fileOverwrite)
}
}
}
/*
* This function is a modified version of Apache Spark source code located at https://github.com/apache/spark/blob/master/core/src/main/scala/org/apache/spark/util/Utils.scala
* Copy of the apache license can be obtained from http://www.apache.org/licenses/LICENSE-2.0
* */
/**
* Download a file or directory to the target directory. Supports fetching the file from a local
* filesystem path or from a Hadoop-compatible (wasb/wasbs) filesystem, based on the URL scheme.
* Fetching directories is only supported from Hadoop-compatible filesystems.
* The 'resolveStorageKey' param controls whether to retrieve the storage key from keyvault.
* Throws an exception if the target file already exists and has different contents than
* the requested file.
*/
def fetchFile(url: String,
targetDir: java.io.File,
filename: String, resolveStorageKey:Boolean=true): java.io.File = {
val targetFile = new File(targetDir, filename)
val uri = new URI(url)
val fileOverwrite = false
Option(uri.getScheme).getOrElse("file") match {
case "file" =>
// In the case of a local file, copy the local file to the target directory.
// Note the difference between uri vs url.
val sourceFile = if (uri.isAbsolute) new File(uri) else new File(url)
copyFile(url, sourceFile, targetFile, fileOverwrite)
case "wasb" | "wasbs" =>
if(resolveStorageKey) {
resolveStorageAccountKeyForPath(url)
}
val conf = getConf()
val path = new Path(uri)
val fs = path.getFileSystem(conf)
fetchHdfsFile(path, targetDir, fs, conf, fileOverwrite, filename = Some(filename))
case other =>
throw new EngineException(s"unsupported file paths with '$other' scheme")
}
targetFile
}
}
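// Illustrative sketch of fetching a configuration file from a wasbs path into a local working
// directory; the account, container and paths below are hypothetical placeholders.
object HadoopClientFetchExample {
  def main(args: Array[String]): Unit = {
    val local = HadoopClient.fetchFile(
      url = "wasbs://container@myaccount.blob.core.windows.net/config/projection.txt",
      targetDir = new java.io.File("/tmp/datax"),
      filename = "projection.txt")
    println(local.getAbsolutePath)
  }
}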


@ -0,0 +1,61 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.exception.EngineException
import datax.securedsetting.KeyVaultClient
import datax.utility.AzureFunctionCaller
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
object AzureFunctionHandler {
case class AzureFunctionConf(name: String, serviceEndpoint: String, api: String, code: String, params:Array[String])
val logger = LogManager.getLogger(this.getClass)
val SettingAzureFunction = "azurefunction"
val SettingAzureFunctionServiceEndpoint = "serviceendpoint"
val SettingAzureFunctionApi = "api"
val SettingAzureFunctionCode = "code"
val SettingAzureFunctionParams = "params"
private def buildAzureFunctionConf(dict: SettingDictionary, name: String): AzureFunctionConf = {
AzureFunctionConf(
name = name,
serviceEndpoint = dict.getOrNull(SettingAzureFunctionServiceEndpoint),
api = dict.getOrNull(SettingAzureFunctionApi),
code = KeyVaultClient.resolveSecretIfAny(dict.getOrNull(SettingAzureFunctionCode)),
params = dict.getStringSeqOption(SettingAzureFunctionParams).map(_.toArray).orNull
)
}
private def buildAzureFunctionConfArray(dict: SettingDictionary, prefix: String): List[AzureFunctionConf] = {
dict.buildConfigIterable(buildAzureFunctionConf, prefix).toList
}
def initialize(spark: SparkSession, dict: SettingDictionary) = {
val azFuncs = buildAzureFunctionConfArray(dict, SettingNamespace.JobProcessPrefix+SettingAzureFunction+SettingNamespace.Seperator)
for (azFunc <- azFuncs) {
val azFuncAccessCode = KeyVaultClient.getSecretOrThrow(azFunc.code)
azFunc.params.length match {
case 0 => spark.udf.register(azFunc.name, () => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, null))
case 1 => spark.udf.register(azFunc.name, (s:String) => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, Map(
azFunc.params(0)->s
)))
case 2 => spark.udf.register(azFunc.name,(s1:String, s2: String) => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, Map(
azFunc.params(0)->s1,
azFunc.params(1)->s2
)))
case 3 => spark.udf.register(azFunc.name,(s1:String, s2: String, s3: String) => AzureFunctionCaller.call(azFunc.serviceEndpoint, azFunc.api, azFuncAccessCode, Map(
azFunc.params(0)->s1,
azFunc.params(1)->s2,
azFunc.params(2)->s3
)))
// TODO: Add support for more than 3 input parameters for AzureFunction
case _=> throw new EngineException("AzureFunction with more than 3 input parameters are currently not supported. Please contact datax dev team for adding support if needed.")
}
}
}
}
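// Illustrative sketch: once initialize(spark, dict) has registered an Azure Function as a UDF, it can
// be invoked from Spark SQL like any other UDF; the UDF, column and table names below are hypothetical.
object AzureFunctionUsageExample {
  def enrich(spark: SparkSession, dict: SettingDictionary) = {
    AzureFunctionHandler.initialize(spark, dict)
    spark.sql("SELECT myAzureFunc(deviceId) AS enrichment FROM events")
  }
}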


@ -0,0 +1,18 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.SettingDictionary
import datax.utility.{AzureFunctionCaller, ConcurrentDateFormat}
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
object BuiltInFunctionsHandler {
val logger = LogManager.getLogger(this.getClass)
def initialize(spark: SparkSession, dict: SettingDictionary) = {
spark.udf.register("stringToTimestamp", ConcurrentDateFormat.stringToTimestamp _)
}
}


@ -0,0 +1,101 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.classloader.ClassLoaderHost
import datax.config.{SettingDictionary, SettingNamespace}
import datax.exception.EngineException
import datax.extension.DynamicUDF._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.expressions.{Expression, ScalaUDF}
import org.apache.spark.sql.types.DataType
object ExtendedUDFHandler {
val NamespacePrefix = SettingNamespace.JobProcessPrefix + "udf."
def getUdfClasses(dict: SettingDictionary) = {
dict.getSubDictionary(NamespacePrefix)
}
def initialize(spark: SparkSession, dict: SettingDictionary) = {
getUdfClasses(dict).getDictMap().par.map{ case(k, v)=>{
k-> registerUdf(k, v, spark, dict)
}}
}
val ClassNamePrefix = classOf[Generator0[_]].getCanonicalName.dropRight(1)
val mirror = scala.reflect.runtime.universe.runtimeMirror(ClassLoaderHost.derivedClassLoader)
def getUdfBaseClassNames(className: String) = {
val clazz = ClassLoaderHost.classForName(className)
val ts = mirror.classSymbol(clazz).typeSignature
ts.baseClasses.map(s=>s.fullName)
}
def registerUdf(name: String, className: String, spark: SparkSession, dict: SettingDictionary) = {
val clazz = ClassLoaderHost.classForName(className)
val ts = mirror.classSymbol(clazz).typeSignature
val udfInterfaces = ts.baseClasses.filter(c=>c.fullName.startsWith(ClassNamePrefix))
if (udfInterfaces.length == 0) {
throw new EngineException(s"UDF class $className doesn't implement any UDF interface")
} else if (udfInterfaces.length > 1) {
throw new EngineException(s"It is invalid to implement multiple UDF interfaces, UDF class $className")
} else {
val udfInterface = udfInterfaces(0)
val typeArgs = ts.baseType(udfInterface).typeArgs
val returnType = ScalaReflection.schemaFor(typeArgs.last).dataType
val udf = clazz.newInstance()
val argumentCount = typeArgs.length - 1
val wrap = generateFunctionRef(udf, argumentCount, spark, dict)
registerFunction(spark, name, wrap.func, returnType, argumentCount)
wrap.onInterval
}
}
case class UdfWrap(func: AnyRef, onInterval: IntervalUpdateHandler)
def generateFunctionRef(udf: Any, argumentCount: Int, spark: SparkSession, dict: SettingDictionary):UdfWrap = {
argumentCount match {
case 0=> initializeUdf0(udf, spark, dict)
case 1=> initializeUdf1(udf, spark, dict)
case 2=> initializeUdf2(udf, spark, dict)
case 3=> initializeUdf3(udf, spark, dict)
case _=> throw new EngineException(s"UDF with $argumentCount arguments is not supported yet.")
}
}
def initializeUdf0(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
val obj = udf.asInstanceOf[Generator0[Any]].initialize(spark, dict)
UdfWrap(obj.func, obj.onInterval)
}
def initializeUdf1(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
val obj = udf.asInstanceOf[Generator1[Any, Any]].initialize(spark, dict)
UdfWrap(obj.func.apply(_:Any), obj.onInterval)
}
def initializeUdf2(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
val obj = udf.asInstanceOf[Generator2[Any, Any, Any]].initialize(spark, dict)
UdfWrap(obj.func.apply(_:Any, _:Any), obj.onInterval)
}
def initializeUdf3(udf: Any, spark: SparkSession, dict: SettingDictionary) = {
val obj = udf.asInstanceOf[Generator3[Any, Any, Any, Any]].initialize(spark, dict)
UdfWrap(obj.func.apply(_:Any, _:Any, _:Any), obj.onInterval)
}
def registerFunction(spark:SparkSession, name: String, func: AnyRef, returnType: DataType, argumentCount: Int) = {
def builder(e: Seq[Expression]) = if (e.length == argumentCount) {
ScalaUDF(func, returnType, e, udfName = Some(name))
} else {
throw new EngineException(s"Invalid number of arguments for function $name. Expected: $argumentCount; Found: ${e.length}")
}
spark.sessionState.functionRegistry.createOrReplaceTempFunction(name, builder)
}
}


@ -0,0 +1,38 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.classloader.ClassLoaderHost
import datax.config.{SettingDictionary, SettingNamespace}
import datax.extension.StringNormalizer
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
object InputNormalizerHandler {
val logger = LogManager.getLogger(this.getClass)
val SettingPreProjection = SettingNamespace.JobProcessPrefix + "inputnormalizer"
def initialize(spark: SparkSession, dict: SettingDictionary) = {
loadProcessor(spark,
dict,
"default",
dict.get(SettingPreProjection).orNull)
}
def loadProcessor(spark:SparkSession, dict: SettingDictionary, processorName: String, className: String) = {
if(className==null||className.isEmpty){
logger.warn(s"no input normalizer processor is defined")
null
}
else {
logger.warn(s"loading class ${className} for input normalizer handler '${processorName}'")
val clazz = ClassLoaderHost.classForName(className)
val processor = clazz.newInstance().asInstanceOf[StringNormalizer]
logger.warn(s"loaded class ${className} for input normalizer handler '${processorName}'")
// transform the method to a delegate
processor.normalize _
}
}
}


@ -0,0 +1,63 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.host.SparkJarLoader
import datax.securedsetting.KeyVaultClient
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
object JarUDFHandler {
case class JarUDFConf(name: String, path: String, `class`: String, libs: List[String])
val logger = LogManager.getLogger(this.getClass)
val SettingJarUDF = "jar.udf"
val SettingJarUDAF = "jar.udaf"
val SettingJarUDFPath = "path"
val SettingJarUDFClass = "class"
val SettingJarUDFLibs = "libs"
private def buildJarUdfConf(dict: SettingDictionary, name: String): JarUDFConf = {
JarUDFConf(
name = name,
path = dict.getOrNull(SettingJarUDFPath),
`class` = dict.getOrNull(SettingJarUDFClass),
libs = dict.getStringSeqOption(SettingJarUDFLibs).map(_.toList).orNull
)
}
private def buildJarUdfConfArray(dict: SettingDictionary, prefix: String): List[JarUDFConf] = {
logger.warn("#### JarUDFHandler prefix=" + prefix)
dict.groupBySubNamespace(prefix)
.map{ case(k, v) =>
buildJarUdfConf(v, k)
}
.toList
}
def loadJarUdf(spark: SparkSession, dict: SettingDictionary) = {
val jarUDFs = buildJarUdfConfArray(dict, SettingNamespace.JobProcessPrefix+SettingJarUDF+SettingNamespace.Seperator)
val jarUDAFs = buildJarUdfConfArray(dict, SettingNamespace.JobProcessPrefix+SettingJarUDAF+SettingNamespace.Seperator)
val libs = jarUDFs.flatMap(c=>if(c.libs==null) Seq(c.path) else c.libs:+c.path).toSet ++ jarUDAFs.flatMap(c=>if(c.libs==null) Seq(c.path) else c.libs:+c.path).toSet
libs.foreach(libPath => {
logger.warn(s"Adding JAR at $libPath to driver and executors")
val actualPath = KeyVaultClient.resolveSecretIfAny(libPath)
SparkJarLoader.addJar(spark, actualPath)
})
jarUDFs.foreach(udf => {
logger.warn(s"###########jarUDFs class name ="+ udf.`class`)
SparkJarLoader.registerJavaUDF(spark.udf, udf.name, udf.`class`, null)
})
jarUDAFs.foreach(udf=>{
logger.warn(s"###########jarUDAFs class name ="+ udf.`class`)
SparkJarLoader.registerJavaUDAF(spark.udf, udf.name, udf.`class`)
})
}
}


@ -0,0 +1,33 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.securedsetting.KeyVaultClient
import org.apache.log4j.LogManager
case class MetricSinkConf(redis: String, eventhub: String, httpEndpoint:String)
object MetricsHandler {
val Namespace = "metric"
val NamespacePrefix = SettingNamespace.JobProcessPrefix+Namespace+SettingNamespace.Seperator
val SettingRedisConnection= "redis"
val SettingEventHubConnection= "eventhub"
val SettingHttpConnection= "httppost"
val logger = LogManager.getLogger(this.getClass)
def getMetricsSinkConf(dict: SettingDictionary):MetricSinkConf = {
val prefix = NamespacePrefix
val subdict = dict.getSubDictionary(prefix)
MetricSinkConf(
redis = KeyVaultClient.resolveSecretIfAny(subdict.getOrNull(SettingRedisConnection)),
eventhub = KeyVaultClient.resolveSecretIfAny(subdict.getOrNull(SettingEventHubConnection)),
httpEndpoint = subdict.getOrNull(SettingHttpConnection)
)
}
}
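// Illustrative sketch of resolving the metric sink settings from a populated dictionary; a field being
// null (per getOrNull) indicates that the corresponding sink is not configured.
object MetricsSinkExample {
  def resolve(dict: SettingDictionary): MetricSinkConf = {
    val sinks = MetricsHandler.getMetricsSinkConf(dict)
    if (sinks.redis == null && sinks.eventhub == null && sinks.httpEndpoint == null)
      MetricsHandler.logger.warn("No metric sink is configured")
    sinks
  }
}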


@ -0,0 +1,38 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.classloader.ClassLoaderHost
import datax.config.{SettingDictionary, SettingNamespace}
import datax.extension.PreProjectionProcessor
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
object PreProjectionHandler {
val logger = LogManager.getLogger(this.getClass)
val SettingPreProjection = SettingNamespace.JobProcessPrefix + "preprojection"
def initialize(spark: SparkSession, dict: SettingDictionary) = {
loadProcessor(spark,
dict,
"default",
dict.get(SettingPreProjection).orNull)
}
def loadProcessor(spark:SparkSession, dict: SettingDictionary, processorName: String, className: String) = {
if(className==null||className.isEmpty){
logger.warn(s"no preprojection processor is defined")
null
}
else {
logger.warn(s"loading class ${className} for preprojection handler '${processorName}'")
val clazz = ClassLoaderHost.classForName(className)
val generator = clazz.newInstance().asInstanceOf[PreProjectionProcessor]
val processor = generator.initialize(spark, dict)
logger.warn(s"loaded class ${className} for preprojection handler '${processorName}'")
processor
}
}
}


@ -0,0 +1,45 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.fs.HadoopClient
import datax.securedsetting.KeyVaultClient
import datax.utility.Validation
import org.apache.log4j.LogManager
import scala.concurrent.{ExecutionContext, Future}
object ProjectionHandler {
val logger = LogManager.getLogger(this.getClass)
val SettingProjection = "projection"
private def getProjectionFilePaths(dict: SettingDictionary): Option[Seq[String]] = {
dict.getStringSeqOption(SettingNamespace.JobProcessPrefix + SettingProjection)
}
def loadProjectionsFuture(dict: SettingDictionary)(implicit ec: ExecutionContext): Future[Seq[Seq[String]]] = {
getProjectionFilePaths(dict) match {
case Some(projections) => {
Future.sequence(projections.map(projectionFile => Future {
logger.warn(s"Load projection file from '$projectionFile'")
val filePath = KeyVaultClient.resolveSecretIfAny(projectionFile)
HadoopClient.readHdfsFile(filePath).toSeq
}))
}
case None => Future {
Seq()
}
}
}
def validateProjections(projections: Seq[Seq[String]]) = {
for(i <- 0 until projections.length){
val expr = projections(i)
Validation.ensureNotNull(expr, s"projection-$i")
logger.warn(s"Projection Step #$i = \n" + expr.mkString("\n"))
}
}
}
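
loadProjectionsFuture reads each configured projection file concurrently and combines the per-file futures with Future.sequence. A minimal sketch of that fan-out/fan-in pattern, with in-memory sequences standing in for the HDFS reads:

import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

// Hypothetical stand-ins for the contents of two projection files.
val projectionFiles = Seq(Seq("col1", "col2"), Seq("col3"))

// Start one Future per file, then turn Seq[Future[Seq[String]]] into Future[Seq[Seq[String]]].
val loaded: Future[Seq[Seq[String]]] =
  Future.sequence(projectionFiles.map(lines => Future { lines }))

println(Await.result(loaded, 10.seconds))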


@ -0,0 +1,36 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import java.sql.Timestamp
import datax.config.{SettingDictionary, SettingNamespace}
import datax.constants.ProcessingPropertyName
import datax.data.FileInternal
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.SparkEnv
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{Row, SparkSession}
object PropertiesHandler {
val logger = LogManager.getLogger(this.getClass)
val SettingPropertyToAppend = "appendproperty"
def buildAppendProperties(dict: SettingDictionary, prefix: String):Map[String, String] = {
dict.getSubDictionary(prefix).getDictMap()
}
def initialize(spark: SparkSession, dict: SettingDictionary) = {
val appendProperties = buildAppendProperties(dict, SettingNamespace.JobProcessPrefix+SettingPropertyToAppend+SettingNamespace.Seperator)
udf((internalColumn:Row, batchTime:Timestamp) =>
(appendProperties ++ Map(
ProcessingPropertyName.BatchTime -> batchTime.toString,
ProcessingPropertyName.BlobTime -> FileInternal.getInfoFileTimeString(internalColumn),
ProcessingPropertyName.BlobPathHint -> FileInternal.getInfoOutputFileName(internalColumn),
ProcessingPropertyName.CPTime -> DateTimeUtil.getCurrentTime().toString,
ProcessingPropertyName.CPExecutor -> SparkEnv.get.executorId)).filter(_._2!=null))
}
}


@ -0,0 +1,61 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.securedsetting.KeyVaultClient
import datax.telemetry.AppInsightLogger
import datax.utility.CSVUtil
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
import scala.concurrent.{ExecutionContext, Future}
object ReferenceDataHandler {
case class ReferenceDataConf(format:String, name:String, path:String, delimiter: Option[String], header: Option[String])
val logger = LogManager.getLogger(this.getClass)
val Namespace = "referencedata"
val SettingFormat = "format"
val SettingPath = "path"
val SettingDelimiter = "delimiter"
val SettingHeader = "header"
private def buildReferenceDataConf(dict: SettingDictionary, name: String): ReferenceDataConf = {
ReferenceDataConf(
format = dict.getOrNull(SettingFormat),
name = name,
path = dict.getOrNull(SettingPath),
delimiter = dict.get(SettingDelimiter),
header = dict.get(SettingHeader)
)
}
private def buildReferenceDataConfArray(dict: SettingDictionary, prefix: String): List[ReferenceDataConf] = {
logger.warn("#### ReferenceDataHandler prefix:" +prefix)
dict.groupBySubNamespace(prefix)
.map{ case(k, v) => buildReferenceDataConf(v, k)}
.toList
}
def loadReferenceDataFuture(spark: SparkSession, dict: SettingDictionary)(implicit ec: ExecutionContext): Future[Int] = {
Future {
val rds = buildReferenceDataConfArray(dict, SettingNamespace.JobInputPrefix+Namespace + SettingNamespace.Seperator)
rds.foreach(rd => {
val actualPath = KeyVaultClient.resolveSecretIfAny(rd.path)
rd.format.toLowerCase() match {
case "csv" => {
CSVUtil.loadCSVReferenceData(spark, rd.format, actualPath, rd.name, rd.delimiter, rd.header, AppInsightLogger)
true
}
case other: String => throw new Error(s"Reference data format '$other' at path '${rd.path}' specified in the configuration is not currently supported.")
}
})
val count = rds.length
logger.warn(s"Loaded ${count} reference data as tables")
count
}
}
}
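
The CSV branch delegates to CSVUtil.loadCSVReferenceData, whose implementation is not shown in this commit. For orientation, the usual Spark pattern for exposing a delimited file as a named reference table looks roughly like this (account, path, and view name are hypothetical):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.master("local[*]").appName("refdata-sketch").getOrCreate()

// Read the CSV with the configured delimiter/header options and register it as a temp view
// so later SQL transformations can join against it by name.
spark.read
  .option("header", "true")
  .option("delimiter", ",")
  .csv("wasbs://refdata@myaccount.blob.core.windows.net/lookup.csv")
  .createOrReplaceTempView("lookup")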


@ -0,0 +1,129 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config._
import datax.exception.EngineException
import datax.fs.HadoopClient
import datax.securedsetting.KeyVaultClient
import datax.utility.AzureFunctionCaller
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
import scala.collection.mutable.HashMap
object StateTableHandler {
case class StateTableConf(name: String, schema: String, location: String)
val logger = LogManager.getLogger(this.getClass)
val SettingStateTable = "statetable"
val SettingStateTableSchema = "schema"
val SettingStateTableLocation = "location"
// table initialization
val TableMetadata_Active = "active"
val TableMetadata_Standby = "standby"
private def readTableMetadata(metadataFile: String): HashMap[String, String] = {
if(HadoopClient.fileExists(metadataFile)) {
HashMap(
HadoopClient.readHdfsFile(metadataFile).map(s => {
val pos = s.indexOf('=')
if (pos <= 0) {
throw new EngineException(s"Invalid content in '${metadataFile}': '${s}'")
}
else {
(s.substring(0, pos), s.substring(pos + 1, s.length))
}
}).toSeq: _*)
}
else{
HadoopClient.ensureParentFolderExists(metadataFile)
HashMap[String, String](
TableMetadata_Active -> "A",
TableMetadata_Standby -> "B"
)
}
}
private def writeTableMetadata(metadataFile: String, parameters: HashMap[String, String]): Unit ={
HadoopClient.writeHdfsFile(metadataFile, parameters.map(i=>i._1+"="+i._2).mkString("\n"), true)
}
private def getTableNameVersioned(name: String, suffix: String) = name+"_"+suffix
private def buildAccumulationTableConf(dict: SettingDictionary, name: String): StateTableConf = {
StateTableConf(
name = name,
schema = dict.getOrNull(SettingStateTableSchema),
location = dict.getOrNull(SettingStateTableLocation)
)
}
private def buildAccumulationTableArrayConf(dict: SettingDictionary, prefix: String) ={
dict.groupBySubNamespace(prefix)
.map{ case(k, v) => buildAccumulationTableConf(v, k)}
.toList
}
def createTables(spark:SparkSession, dict: SettingDictionary) = {
buildAccumulationTableArrayConf(dict, SettingNamespace.JobProcessPrefix+SettingStateTable+SettingNamespace.Seperator)
.map(t=>{
val location = t.location.stripSuffix("/") + "/"
val metadataFile = location + "metadata.info"
val parameters = readTableMetadata(metadataFile)
var parametersModified = false
val tables = Seq("active", "standby").map(version => {
val suffix = parameters.get(version).get
val spec = s"STORED AS PARQUET LOCATION '${location+suffix}/'"
val tableName = getTableNameVersioned(t.name, suffix)
val sql =s"CREATE TABLE IF NOT EXISTS $tableName (${t.schema}) $spec"
logger.warn(s"Creating '$version' Table ${t.name}: $sql")
spark.sql(sql)
suffix -> tableName
}).toMap
t.name -> new {
private val tableLogger = LogManager.getLogger(s"TableStore-${t.name}")
def getActiveTableName(): String = {
getTableNameVersioned(t.name, parameters(TableMetadata_Active))
}
def getStandbyTableName(): String = {
getTableNameVersioned(t.name, parameters(TableMetadata_Standby))
}
def overwrite(selectSql: String) = {
val standbyTableName = getStandbyTableName()
tableLogger.warn(s"Overwriting standby table: $standbyTableName")
val sql = s"INSERT OVERWRITE TABLE $standbyTableName $selectSql"
spark.sql(sql)
}
def flip():String = {
parameters ++= Map(
TableMetadata_Active -> parameters(TableMetadata_Standby),
TableMetadata_Standby -> parameters(TableMetadata_Active)
)
parametersModified = !parametersModified
val result = getActiveTableName()
tableLogger.warn(s"Fliped active and standby, now active instance is ${result}")
result
}
def persist() = {
if(parametersModified) {
tableLogger.warn(s"persisting parameters: ${parameters}")
writeTableMetadata(metadataFile, parameters)
}
else{
tableLogger.warn(s"Skip persisting parameters: ${parameters}")
}
}
}
}).toMap
}
}
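
createTables keeps two physical tables per logical state table and records which suffix is active in a small key=value metadata file; flip() swaps the roles so a freshly overwritten standby table becomes the active one. A minimal sketch of that swap, independent of Spark and HDFS:

import scala.collection.mutable.HashMap

// Metadata as it would be parsed from a file containing "active=A\nstandby=B".
val parameters = HashMap("active" -> "A", "standby" -> "B")

def flip(): String = {
  // The right-hand Map is built from the old values before ++= applies it, so this is a clean swap.
  parameters ++= Map(
    "active" -> parameters("standby"),
    "standby" -> parameters("active")
  )
  parameters("active")
}

println(flip()) // "B" becomes the active suffix
println(parameters.map { case (k, v) => s"$k=$v" }.mkString("\n")) // serialized form written back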


@ -0,0 +1,68 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.exception.EngineException
import org.apache.log4j.LogManager
import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.col
import scala.concurrent.duration.Duration
case class TimeWindowConf(
windows: Map[String, Duration],
isEnabled: Boolean,
timestampColumn: Column,
watermark: Duration,
maxWindow: Duration
)
object TimeWindowHandler {
val NamespacePrefix=SettingNamespace.JobProcessPrefix
val SettingWatermark = "watermark"
val SettingTimestampColumn = "timestampcolumn"
val SettingTimeWindow = "timewindow"
val SettingTimeWindowDuration = "windowduration"
val logger = LogManager.getLogger(this.getClass)
private def buildTimeWindowConf(dict: SettingDictionary, name: String) = {
dict.getDuration(SettingTimeWindowDuration)
}
private def buildTimeWindowsConf(dict: SettingDictionary, prefix: String)= {
dict.buildConfigMap(buildTimeWindowConf, prefix)
}
def initialize(spark: SparkSession, dict: SettingDictionary) = {
val windows = buildTimeWindowsConf(dict, NamespacePrefix + SettingTimeWindow + SettingNamespace.Seperator)
val watermark = dict.get(NamespacePrefix + SettingWatermark)
val timestampColumn = dict.get(NamespacePrefix + SettingTimestampColumn)
val isEnabled = windows.size > 0 && watermark.isDefined && timestampColumn.isDefined
if (isEnabled) {
logger.warn(s"Windowing is ON, window are ${windows}, watermark is ${watermark}")
TimeWindowConf(
windows = windows,
isEnabled = true,
timestampColumn = col(timestampColumn.get),
watermark = Duration.create(watermark.get),
maxWindow = windows.values.max
)
}
else {
logger.warn(s"Windowing is OFF")
TimeWindowConf(
null,
false,
null,
Duration.Zero,
Duration.Zero
)
}
}
}
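
The window and watermark settings resolve to scala.concurrent.duration.Duration values, and maxWindow is simply the largest configured window. A small sketch of how such duration strings parse and compare (window names and values are hypothetical):

import scala.concurrent.duration.Duration

// Hypothetical window settings as they might appear under the timewindow namespace.
val windows = Map(
  "short" -> Duration.create("30 seconds"),
  "long"  -> Duration.create("5 minutes")
)
val watermark = Duration.create("10 seconds")

// The largest window tells the engine how much history each batch must retain.
val maxWindow = windows.values.max
println(s"maxWindow=$maxWindow, watermark=$watermark")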


@ -0,0 +1,54 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.handler
import datax.config.{SettingDictionary, SettingNamespace}
import datax.fs.HadoopClient
import datax.securedsetting.KeyVaultClient
import datax.sql.{ParsedResult, TransformSQLParser}
import datax.utility.Validation
import org.apache.log4j.LogManager
import scala.concurrent.{ExecutionContext, Future}
object TransformHandler {
private def getTransformFilePath(dict: SettingDictionary): Option[String] = {
dict.get(SettingNamespace.JobProcessPrefix + "transform")
}
def shouldCacheCommonViews(dict: SettingDictionary): Boolean = {
dict.getOrElse(SettingNamespace.JobProcessPrefix + "cachecommonviews", "True").toBoolean
}
def loadTransformFuture(dict: SettingDictionary)(implicit ec: ExecutionContext): Future[ParsedResult] = {
val logger = LogManager.getLogger(this.getClass)
getTransformFilePath(dict) match {
case Some(transform) =>Future {
logger.warn(s"Load transform script from '${transform}'")
val filePath = KeyVaultClient.resolveSecretIfAny(transform)
val sqlParsed = TransformSQLParser.parse(HadoopClient.readHdfsFile(filePath).toSeq)
if(sqlParsed!=null) {
val queries = sqlParsed.commands.filter(_.commandType==TransformSQLParser.CommandType_Query)
for (i <- 0 until queries.length) {
val transformation = queries(i).text
Validation.ensureNotNull(transformation, s"transform-$i")
logger.warn(s"Transform step #$i = \n" + transformation)
}
sqlParsed.viewReferenceCount.foreach(v=>{
logger.warn(s"View ${v._1} is referenced ${v._2} times")
})
}
sqlParsed
}
case None => Future {
logger.warn(s"Transform file is not defined.")
null
}
}
}
}


@ -0,0 +1,103 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import java.sql.Timestamp
import java.text.SimpleDateFormat
import java.time.Instant
import java.util.concurrent.Executors
import datax.config.UnifiedConfig
import datax.constants.ProductConstant
import datax.fs.HadoopClient
import datax.input.BatchBlobInputSetting
import datax.processor.BlobPointerProcessor
import datax.telemetry.AppInsightLogger
import datax.utility.DataMerger
import org.apache.log4j.LogManager
import scala.language.postfixOps
import scala.collection.mutable.{HashSet, ListBuffer}
import scala.collection.parallel.ExecutionContextTaskSupport
import scala.concurrent.ExecutionContext
import scala.concurrent.duration._
object BlobBatchingHost {
def getInputBlobPathPrefixes(prefix: String, datetimeFormat: String, startTime: Instant, durationInSeconds: Long, timeIntervalInSeconds: Long):Iterable[(String, Timestamp)] = {
val result = new ListBuffer[(String, Timestamp)]
val cache = new HashSet[String]
var t:Long = 0
//val utcZoneId = ZoneId.of("UTC")
val dateFormat = new SimpleDateFormat(datetimeFormat)
while(t<durationInSeconds){
val timestamp = Timestamp.from(startTime.plusSeconds(t))
val partitionFolder = dateFormat.format(timestamp)
if(!cache.contains(partitionFolder)){
val path = prefix+partitionFolder
result += Tuple2(path, timestamp)
cache += partitionFolder
}
t+= timeIntervalInSeconds
}
result
}
def runBatchApp(inputArguments: Array[String],processorGenerator: UnifiedConfig=>BlobPointerProcessor ) = {
val appLog = LogManager.getLogger("runBatchApp")
val (appHost, config) = CommonAppHost.initApp(inputArguments)
appLog.warn(s"Batch Mode Work Started")
val blobsConf = BatchBlobInputSetting.getInputBlobsArrayConf(config.dict)
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/batch/app/begin")
val prefixes = blobsConf.flatMap(blobs=>{
val inputBlobPathPrefix = blobs.pathPrefix
val inputBlobDateTimeFormat = blobs.pathPartitionFolderFormat
val inputBlobStartTime = Instant.parse(blobs.startTime)
val inputBlobDurationInHours = blobs.durationInHours
val inputBlobTimeIntervalInHours = 1
getInputBlobPathPrefixes(
prefix = inputBlobPathPrefix,
datetimeFormat = inputBlobDateTimeFormat,
startTime = inputBlobStartTime,
durationInSeconds = inputBlobDurationInHours*3600,
timeIntervalInSeconds = inputBlobTimeIntervalInHours*3600
)
}).par
val spark = appHost.getSpark(config.sparkConf)
val sc = spark.sparkContext
val processor = processorGenerator(config)
val ec = new ExecutionContext {
val threadPool = Executors.newFixedThreadPool(16)
def execute(runnable: Runnable) {
threadPool.submit(runnable)
}
def reportFailure(t: Throwable) {}
}
prefixes.tasksupport = new ExecutionContextTaskSupport(ec)
val batchResult = prefixes.map(prefix =>{
appLog.warn(s"Start processing ${prefix}")
val namespace = "_"+HadoopClient.tempFilePrefix(prefix._1)
appLog.warn(s"Namespace for prefix ${prefix._1} is '$namespace'")
val pathsRDD = sc.makeRDD(HadoopClient.listFiles(prefix._1).toSeq)
val result = processor.processPathsRDD(pathsRDD, prefix._2, 1 hour, prefix._2, namespace)
appLog.warn(s"End processing ${prefix}")
result
}).reduce(DataMerger.mergeMapOfDoubles)
appLog.warn(s"Batch Mode Work Ended, processed metrics: $batchResult")
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/batch/end", null, batchResult)
}
}
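
getInputBlobPathPrefixes walks the requested time range in fixed steps and emits one (pathPrefix, timestamp) pair per distinct partition folder. A usage sketch with hypothetical values; note that SimpleDateFormat formats in the JVM's default time zone:

import java.time.Instant

val prefixes = BlobBatchingHost.getInputBlobPathPrefixes(
  prefix = "wasbs://events@myaccount.blob.core.windows.net/data/", // hypothetical container path
  datetimeFormat = "yyyy/MM/dd/HH",
  startTime = Instant.parse("2019-04-15T00:00:00Z"),
  durationInSeconds = 3 * 3600,  // a three-hour batch window
  timeIntervalInSeconds = 3600   // step one hour at a time
)

// Yields three pairs, one per hourly partition folder under the prefix.
prefixes.foreach { case (path, ts) => println(s"$ts -> $path") }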


@ -0,0 +1,52 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import datax.config.{ConfigManager, UnifiedConfig}
import datax.constants._
import datax.fs.HadoopClient
import datax.service.{ConfigService, TelemetryService}
import datax.telemetry.AppInsightLogger
import org.apache.log4j.LogManager
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object CommonAppHost extends AppHost {
override def getConfigService(): ConfigService = ConfigManager
override def getTelemetryService(): TelemetryService = AppInsightLogger
def initApp(inputArguments: Array[String]): (AppHost, UnifiedConfig) = {
val appLog = LogManager.getLogger(this.getClass)
appLog.warn("===App log turned ON===")
val sparkConf = ConfigManager.initSparkConf
// Get the singleton instance of SparkSession
val spark = SparkSessionSingleton.getInstance(sparkConf)
val conf = ConfigManager.getConfigurationFromArguments(inputArguments)
// Initialize FileSystemUtil
HadoopClient.setConf(spark.sparkContext.hadoopConfiguration)
appLog.warn(s"initializing with conf:"+ conf.toString)
AppInsightLogger.initForApp(spark.sparkContext.appName)
conf.getDriverLogLevel() match {
case Some(level) => Logger.setLogLevel(level)
case None =>
}
AppInsightLogger.trackEvent(DatasetName.DataStreamProjection + "/app/init")
val unifiedConfig = ConfigManager.loadConfig(sparkConf)
(this, unifiedConfig)
}
def getSpark(sparkConf: SparkConf): SparkSession = {
SparkSessionSingleton.getInstance(sparkConf)
}
}


@ -0,0 +1,17 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import org.apache.log4j.{Level, LogManager}
object Logger {
var logLevel = LogManager.getRootLogger.getLevel
def setLogLevel(level: Level) = {
logLevel = level
val logger = LogManager.getRootLogger
logger.setLevel(level)
logger.warn(s"root logger level set to ${level}")
}
}


@ -0,0 +1,211 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import java.lang.reflect.ParameterizedType
import java.net.URI
import datax.classloader.ClassLoaderHost
import datax.constants.ProductConstant
import datax.exception.EngineException
import datax.fs.HadoopClient
import org.apache.log4j.LogManager
import org.apache.spark.SparkFiles
import org.apache.spark.sql.api.java._
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{Row, SparkSession, UDFRegistration}
import scala.collection.mutable.HashMap
object SparkJarLoader {
val currentJars = new HashMap[String, Long]
def getJavaUDFReturnDataType(t: Class[_]): DataType = {
val mirror = scala.reflect.runtime.universe.runtimeMirror(ClassLoaderHost.derivedClassLoader)
val ts = mirror.classSymbol(t).typeSignature
val udfInterface = ts.baseClasses.filter(c=>c.fullName.startsWith("org.apache.spark.sql.api.java.UDF"))(0)
ScalaReflection.schemaFor(ts.baseType(udfInterface).typeArgs.last).dataType
}
def addJarOnDriver(spark: SparkSession, jarPath: String, timestamp: Long = 0, resolveStorageKey:Boolean=true) = {
val logger = LogManager.getLogger("AddJar")
val localName = new URI(jarPath).getPath.split("/").last
val currentTimeStamp = currentJars.get(jarPath)
.orElse(currentJars.get(localName))
.getOrElse(-1L)
if (currentTimeStamp < timestamp) {
logger.warn("Fetching " + jarPath + " with timestamp " + timestamp)
// Fetch the file with useCache mode; the cache is not used in local mode.
// resolveStorageKey controls whether to retrieve the actual jarPath from
// keyvault for the case where jarPath is keyvault url
HadoopClient.fetchFile(jarPath,
new java.io.File(SparkFiles.getRootDirectory()),
localName, resolveStorageKey)
// Add it to our class loader
val url = new java.io.File(SparkFiles.getRootDirectory(), localName).toURI.toURL
if (!ClassLoaderHost.urlClassLoader.getURLs().contains(url)) {
logger.info("Adding " + url + " to class loader")
ClassLoaderHost.urlClassLoader.addURL(url)
}
}
}
def addJar(spark: SparkSession, jarPath: String) = {
addJarOnDriver(spark, jarPath)
spark.sparkContext.addJar(jarPath)
}
def loadUdf(spark: SparkSession, udfName: String, jarPath: String, mainClass: String, method: String) = {
addJar(spark, jarPath)
registerJavaUDF(spark.udf, udfName, mainClass, null)
}
/**
* Register a Java UDF class using reflection
*
* @param name udf name
* @param className fully qualified class name of udf
* @param returnDataType return type of udf. If it is null, spark would try to infer
* via reflection.
*/
def registerJavaUDF(udfReg: UDFRegistration, name: String, className: String, returnDataType: DataType): Unit = {
try {
val clazz = ClassLoaderHost.classForName(className)
val udfInterfaces = clazz.getGenericInterfaces
.filter(_.isInstanceOf[ParameterizedType])
.map(_.asInstanceOf[ParameterizedType])
.filter(e => e.getRawType.isInstanceOf[Class[_]] && e.getRawType.asInstanceOf[Class[_]].getCanonicalName.startsWith("org.apache.spark.sql.api.java.UDF"))
if (udfInterfaces.length == 0) {
throw new EngineException(s"UDF class $className doesn't implement any UDF interface")
} else if (udfInterfaces.length > 1) {
throw new EngineException(s"It is invalid to implement multiple UDF interfaces, UDF class $className")
} else {
try {
val udf = clazz.newInstance()
//val udfReturnType = udfInterfaces(0).getActualTypeArguments.last
val returnType = if(returnDataType==null) getJavaUDFReturnDataType(clazz) else returnDataType
udfInterfaces(0).getActualTypeArguments.length match {
case 1 => udfReg.register(name, udf.asInstanceOf[UDF0[_]], returnType)
case 2 => udfReg.register(name, udf.asInstanceOf[UDF1[_, _]], returnType)
case 3 => udfReg.register(name, udf.asInstanceOf[UDF2[_, _, _]], returnType)
case 4 => udfReg.register(name, udf.asInstanceOf[UDF3[_, _, _, _]], returnType)
case 5 => udfReg.register(name, udf.asInstanceOf[UDF4[_, _, _, _, _]], returnType)
case 6 => udfReg.register(name, udf.asInstanceOf[UDF5[_, _, _, _, _, _]], returnType)
case 7 => udfReg.register(name, udf.asInstanceOf[UDF6[_, _, _, _, _, _, _]], returnType)
case 8 => udfReg.register(name, udf.asInstanceOf[UDF7[_, _, _, _, _, _, _, _]], returnType)
case 9 => udfReg.register(name, udf.asInstanceOf[UDF8[_, _, _, _, _, _, _, _, _]], returnType)
case 10 => udfReg.register(name, udf.asInstanceOf[UDF9[_, _, _, _, _, _, _, _, _, _]], returnType)
case 11 => udfReg.register(name, udf.asInstanceOf[UDF10[_, _, _, _, _, _, _, _, _, _, _]], returnType)
case 12 => udfReg.register(name, udf.asInstanceOf[UDF11[_, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 13 => udfReg.register(name, udf.asInstanceOf[UDF12[_, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 14 => udfReg.register(name, udf.asInstanceOf[UDF13[_, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 15 => udfReg.register(name, udf.asInstanceOf[UDF14[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 16 => udfReg.register(name, udf.asInstanceOf[UDF15[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 17 => udfReg.register(name, udf.asInstanceOf[UDF16[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 18 => udfReg.register(name, udf.asInstanceOf[UDF17[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 19 => udfReg.register(name, udf.asInstanceOf[UDF18[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 20 => udfReg.register(name, udf.asInstanceOf[UDF19[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 21 => udfReg.register(name, udf.asInstanceOf[UDF20[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 22 => udfReg.register(name, udf.asInstanceOf[UDF21[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case 23 => udfReg.register(name, udf.asInstanceOf[UDF22[_, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _]], returnType)
case n =>
throw new EngineException(s"UDF class with $n type arguments is not supported.")
}
} catch {
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
throw new EngineException(s"Can not instantiate class $className, please make sure it has public non argument constructor")
}
}
} catch {
case e: ClassNotFoundException => throw new EngineException(s"Cannot load class $className; please make sure it is on the classpath")
}
}
/**
* Register a Java UDAF class using reflection, for use from pyspark
*
* @param name UDAF name
* @param className fully qualified class name of UDAF
*/
def registerJavaUDAF(udfReg: UDFRegistration, name: String, className: String): Unit = {
try {
val clazz = ClassLoaderHost.classForName(className)
if (!classOf[UserDefinedAggregateFunction].isAssignableFrom(clazz)) {
throw new EngineException(s"class $className doesn't implement interface UserDefinedAggregateFunction")
}
val udaf = clazz.newInstance().asInstanceOf[UserDefinedAggregateFunction]
udfReg.register(name, udaf)
} catch {
case e: ClassNotFoundException => throw new EngineException(s"Cannot load class ${className}; please make sure it is on the classpath")
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
throw new EngineException(s"Cannot instantiate class ${className}; please make sure it has a public no-argument constructor")
}
}
case class CaseUDAF(inputType: StructType, bufferType: StructType, returnType: DataType) extends UserDefinedAggregateFunction{
override def inputSchema: StructType = inputType
override def bufferSchema: StructType = bufferType
override def dataType: DataType = returnType
override def deterministic: Boolean = true
override def initialize(buffer: MutableAggregationBuffer): Unit = ???
override def update(buffer: MutableAggregationBuffer, input: Row): Unit = ???
override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = ???
override def evaluate(buffer: Row): Any = ???
}
/**
* Register a UDAF class derived from api using reflection
*
* @param name UDAF name
* @param className fully qualified class name of UDAF
*/
def registerApiUDAF(spark: SparkSession, name: String, className: String): Unit = {
try {
val clazz = ClassLoaderHost.classForName(className)
val udfInterfaces = clazz.getGenericInterfaces
.filter(_.isInstanceOf[ParameterizedType])
.map(_.asInstanceOf[ParameterizedType])
.filter(e => e.getRawType.isInstanceOf[Class[_]] && e.getRawType.asInstanceOf[Class[_]].getCanonicalName.startsWith(ProductConstant.ProductRoot + ".api.udf.UDAF"))
if (udfInterfaces.length == 0) {
throw new EngineException(s"UDF class $className doesn't implement any ${ProductConstant.ProductRoot}.api.udf.UDF interface")
} else if (udfInterfaces.length > 1) {
throw new EngineException(s"It is invalid to implement multiple UDF interfaces, UDF class $className")
} else {
try {
val udf = clazz.newInstance()
val typeArguments = udfInterfaces(0).getActualTypeArguments
val (inputTypes, bufferAndOutputTypes) = typeArguments.splitAt(typeArguments.length-2)
val returnType = ClassLoaderHost.javaTypeToDataType(bufferAndOutputTypes(1))
val bufferType = ClassLoaderHost.javaTypeToDataType(bufferAndOutputTypes(0))
//TODO: complete the implementation
} catch {
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
throw new EngineException(s"Can not instantiate class $className, please make sure it has public non argument constructor")
}
}
val udaf = clazz.newInstance().asInstanceOf[UserDefinedAggregateFunction]
spark.udf.register(name, udaf)
} catch {
case e: ClassNotFoundException => throw new EngineException(s"Cannot load class ${className}; please make sure it is on the classpath")
case e @ (_: InstantiationException | _: IllegalArgumentException) =>
throw new EngineException(s"Cannot instantiate class ${className}; please make sure it has a public no-argument constructor")
}
}
}
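
registerJavaUDF expects the class to implement one of Spark's org.apache.spark.sql.api.java.UDFn interfaces and, when no return type is supplied, infers it from the interface's type arguments. A minimal sketch of such a class (hypothetical, not part of this commit) and the registration it boils down to for the single-argument case:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.api.java.UDF1
import org.apache.spark.sql.types.DataTypes

// A single-argument Java-style UDF: string in, length out.
class StringLengthUdf extends UDF1[String, Integer] {
  def call(s: String): Integer =
    if (s == null) null else Integer.valueOf(s.length)
}

val spark = SparkSession.builder.master("local[*]").appName("udf-sketch").getOrCreate()

// Roughly what the UDF1 branch above does once the class has been loaded reflectively.
spark.udf.register("strlen", new StringLengthUdf, DataTypes.IntegerType)
spark.sql("SELECT strlen('datax')").show()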


@ -0,0 +1,30 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import datax.telemetry.AppInsightLogger
import org.apache.log4j.{Level, LogManager}
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
object SparkSessionSingleton {
def getLogger = LogManager.getLogger(this.getClass)
@transient private var instance: SparkSession = _
def getInstance(sparkConf: SparkConf): SparkSession = {
if (instance == null) {
instance = SparkSession
.builder
.config(sparkConf)
.enableHiveSupport()
.getOrCreate()
}
instance
}
}


@ -0,0 +1,143 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import datax.config._
import datax.constants.{JobArgument, ProductConstant}
import datax.exception.EngineException
import datax.input._
import datax.processor.EventHubStreamingProcessor
import datax.telemetry.AppInsightLogger
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import scala.concurrent.duration._
object StreamingHost {
def getLogger = LogManager.getLogger(this.getClass)
def createStreamingContext(spark: SparkSession, intervalInSeconds: Long) = {
new StreamingContext(spark.sparkContext, Seconds(intervalInSeconds))
}
def createStreamingContextWithCheckpoint(spark:SparkSession, streamingCheckpointDir: String, intervalInSeconds: Long) = {
val streamingContext = createStreamingContext(spark, intervalInSeconds)
getLogger.warn("create a new streaming context with checkpointDir=" + streamingCheckpointDir)
streamingContext.checkpoint(streamingCheckpointDir)
streamingContext
}
def initStreamingContext(spark: SparkSession, streamingCheckpointDir: String, intervalInSeconds: Long) = {
getLogger.warn("spark streaming checkpointDir=" + streamingCheckpointDir)
StreamingContext.getOrCreate(streamingCheckpointDir,
() => createStreamingContextWithCheckpoint(spark, streamingCheckpointDir, intervalInSeconds),
spark.sparkContext.hadoopConfiguration,
false
)
}
def runStreamingApp(inputArguments: Array[String], processorGenerator: UnifiedConfig=>EventHubStreamingProcessor): Unit = {
val (appHost, config) = CommonAppHost.initApp(inputArguments)
val spark = CommonAppHost.getSpark(config.sparkConf)
val dict = config.dict
val streamingConf = StreamingInputSetting.getStreamingInputConf(dict)
val eventhubConf = EventHubInputSetting.getInputEventHubConf(dict)
if(eventhubConf==null)
throw new EngineException(s"No proper eventhub config is provided")
val logger = LogManager.getLogger("runStreamingApp")
logger.warn(s"Get or create streaming context from checkpoint folder:${streamingConf.checkpointDir}")
val checkpointEnabled = dict.getOrElse(JobArgument.ConfName_CheckpointEnabled, "false").toBoolean
def createSC() = {
val createStreamContextLogger = LogManager.getLogger("runStreamingApp-createSC")
val spark = CommonAppHost.getSpark(config.sparkConf)
createStreamContextLogger.warn(s"Create streaming context checkpoints folder=${streamingConf.checkpointDir}, internalInSeconds=${streamingConf.intervalInSeconds}")
val streamingContext = createStreamingContext(spark, streamingConf.intervalInSeconds)
val batchInterval = streamingConf.intervalInSeconds.seconds
val repartitionNumber = eventhubConf.repartition.getOrElse(0)
val repartition = if(repartitionNumber==0) (r:RDD[EventData])=>r else (r:RDD[EventData])=>r.repartition(repartitionNumber)
EventHubStreamingFactory.getStream(streamingContext, eventhubConf, (rdd, time) => {
val streamingLogger = LogManager.getLogger("EventHubStreamingLoop")
val batchTime = new Timestamp(time.milliseconds)
val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
streamingLogger.warn(s"===============================Batch $batchTimeStr Started===============================")
val processor = EventHubStreamingFactory.getOrCreateProcessor(config, processorGenerator)
processor.process(repartition(rdd), batchTime, batchInterval)
streamingLogger.warn(s"===============================Batch $batchTimeStr End ===============================")
})
streamingContext
}
val streamingContext = if(checkpointEnabled)
StreamingContext.getOrCreate(
streamingConf.checkpointDir,
createSC _,
spark.sparkContext.hadoopConfiguration,
false)
else createSC()
//streamingContext.remember(org.apache.spark.streaming.Duration(65000))
streamingContext.start()
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/streaming/app/start")
logger.warn(s"Streaming Context Started")
streamingContext.awaitTermination()
}
def runLocalStreamingApp(inputArguments: Array[String], processorGenerator: UnifiedConfig=>EventHubStreamingProcessor): Unit = {
val (appHost, config) = CommonAppHost.initApp(inputArguments)
val spark = CommonAppHost.getSpark(config.sparkConf)
val dict = config.dict
val streamingConf = StreamingInputSetting.getStreamingInputConf(dict)
val logger = LogManager.getLogger("runLocalStreamingApp")
logger.warn(s"Get or create streaming context from checkpoint folder:${streamingConf.checkpointDir}")
val checkpointEnabled = dict.getOrElse(JobArgument.ConfName_CheckpointEnabled, "false").toBoolean
def createSC() = {
val createStreamContextLogger = LogManager.getLogger("runLocalStreamingApp-createSC")
val spark = CommonAppHost.getSpark(config.sparkConf)
createStreamContextLogger.warn(s"Create streaming context checkpoints folder=${streamingConf.checkpointDir}, internalInSeconds=${streamingConf.intervalInSeconds}")
val streamingContext = createStreamingContext(spark, streamingConf.intervalInSeconds)
val batchInterval = streamingConf.intervalInSeconds.seconds
val inputSchema = SchemaFile.loadInputSchema(dict)
LocalStreamingFactory.getStream(streamingContext, inputSchema, (rdd, time) => {
val streamingLogger = LogManager.getLogger("LocalStreamingLoop")
val batchTime = new Timestamp(time.milliseconds)
val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
streamingLogger.warn(s"===============================Batch $batchTimeStr Started===============================")
val processor = LocalStreamingFactory.getOrCreateProcessor(config, processorGenerator)
processor.process(rdd, batchTime, batchInterval)
streamingLogger.warn(s"===============================Batch $batchTimeStr End ===============================")
})
streamingContext
}
val streamingContext = if(checkpointEnabled)
StreamingContext.getOrCreate(
streamingConf.checkpointDir,
createSC _,
spark.sparkContext.hadoopConfiguration,
false)
else createSC()
streamingContext.start()
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/localStreaming/app/start")
logger.warn(s"Local Streaming Context Started")
streamingContext.awaitTermination()
}
}


@ -0,0 +1,19 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.host
import datax.config.SettingDictionary
import org.apache.spark.sql.SparkSession
object UdfInitializer {
def initialize(spark: SparkSession, dict: SettingDictionary) = {
// Register UDF functions
spark.udf.register("filterNull", filterNull _)
}
def filterNull(elems: Seq[Map[String, String]]) : Seq[Map[String, String]] = {
elems.filter(_!=null)
}
}
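
filterNull simply drops null entries from a sequence of maps; once registered it can be called from SQL as filterNull(column). A direct call of the underlying function:

val props: Seq[Map[String, String]] = Seq(Map("k" -> "v"), null)
println(UdfInitializer.filterNull(props)) // List(Map(k -> v))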


@ -0,0 +1,33 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config.{SettingDictionary, SettingNamespace}
import org.apache.log4j.LogManager
import scala.collection.mutable
case class InputBlobsConf(pathPrefix:String,
pathPartitionFolderFormat: String,
startTime: String,
durationInHours: Long)
object BatchBlobInputSetting {
val NamespaceBlobsSource = "blobs"
val NamespacePrefix = SettingNamespace.JobInputPrefix+NamespaceBlobsSource+"."
private def buildInputBlobsConf(dict: SettingDictionary, name: String): InputBlobsConf = {
InputBlobsConf(
pathPrefix = dict.getOrNull("pathprefix"),
pathPartitionFolderFormat = dict.getOrNull("pathpartitionfolderformat"),
startTime = dict.getOrNull("starttime"),
durationInHours = dict.getLong("durationinhours")
)
}
def getInputBlobsArrayConf(dict: SettingDictionary): Seq[InputBlobsConf] = {
dict.buildConfigIterable(buildInputBlobsConf, NamespacePrefix).toSeq
}
}


@ -0,0 +1,161 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import java.sql.Timestamp
import java.text.SimpleDateFormat
import com.fasterxml.jackson.annotation.{JsonCreator, JsonProperty}
import com.fasterxml.jackson.databind.ObjectMapper
import com.microsoft.azure.eventhubs.EventData
import datax.config._
import datax.data.FileInternal
import datax.exception.EngineException
import datax.input.BlobPointerInputSetting.BlobPointerInputConf
import datax.sink.BlobOutputSetting.BlobOutputConf
import datax.sink.{BlobOutputSetting, BlobSinker}
import datax.telemetry.AppInsightLogger
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructType}
import scala.collection.mutable
import scala.util.matching.Regex
case class BlobPointer @JsonCreator()(@JsonProperty("BlobPath") BlobPath: String)
object BlobPointerInput {
val logger = LogManager.getLogger(this.getClass)
val blobPointerMapper = new ObjectMapper()
blobPointerMapper.registerSubtypes(classOf[BlobPointer])
def parseBlobPath(eventData: EventData) =
blobPointerMapper.readValue(eventData.getBytes, classOf[BlobPointer]).BlobPath
private def loadBlobPointerSchema() = {
(new StructType).add("BlobPath", StringType)
}
private val saRegex = """wasbs?://[\w-]+@([\w\d]+)\.blob\.core\.windows\.net/.*""".r
private def extractSourceId(blobPath: String, regex: String): String = {
val r = if(regex == null) saRegex else regex.r
r.findFirstMatchIn(blobPath) match {
case Some(partition) => partition.group(1)
case None => null
}
}
private def extractTimeFromBlobPath(blobPath: String, fileTimeRegex: Regex, fileTimeFormat: String): Timestamp = {
fileTimeRegex.findFirstMatchIn(blobPath) match {
case Some(timeStr) => try{
if(fileTimeFormat==null){
Timestamp.valueOf(timeStr.group(1).replace('_', ':').replace('T', ' '))
}
else{
val format = new SimpleDateFormat(fileTimeFormat)
new Timestamp(format.parse(timeStr.group(1)).getTime())
}
}
catch {
case e: Exception =>
logger.error(s"Error when parsing date time string from path $blobPath: $e")
AppInsightLogger.trackException(e, Map(
"errorLocation" -> "extractTimeFromBlobPath",
"errorMessage" -> "Error in parsing date time string",
"failedBlobPath" -> blobPath
), null)
null
}
case None =>
logger.error(s"Failed to extract blob time from path $blobPath")
AppInsightLogger.trackException(new EngineException(s"Cannot find blob time from path $blobPath"), Map(
"errorLocation" -> "extractTimeFromBlobPath",
"errorMessage" -> "Failed to extract blob time",
"failedBlobPath" -> blobPath
), null)
null
}
}
private def pathHintsFromBlobPath(blobPath: String, blobPathRegex: Regex): String = {
blobPathRegex.findFirstMatchIn(blobPath) match {
case Some(m) => try{
m.subgroups.mkString("-")
}
catch {
case e: Exception =>
val msg = s"Error occurs in generating output from blob path. \n Please check: \nregex='$blobPathRegex'\nblobPath='$blobPath'\nmatch='$m'"
logger.error(msg, e)
AppInsightLogger.trackException(e, Map(
"errorLocation" -> "pathHintsFromBlobPath",
"errorMessage" -> "Error occurs in generating output file name from blob path",
"failedBlobPath" -> blobPath,
"regex" -> blobPathRegex.toString()
), null)
//null
throw new EngineException(msg, e)
}
case None =>
val msg = s"Error occurs in extract file name from blob path. \n Please check: \nregex='$blobPathRegex'\nblobPath='$blobPath'"
logger.error(msg)
AppInsightLogger.trackException(new EngineException("Cannot find file name from blob path"), Map(
"errorLocation" -> "pathHintsFromBlobPath",
"errorMessage" -> "Error occurs in extracting file name from blob path",
"failedBlobPath" -> blobPath,
"regex" -> blobPathRegex.toString()
), null)
//null
throw new EngineException(msg)
}
}
private def inputPathToInternalProps(inputFilePath: String,
inputConf: BlobPointerInputConf,
outputConf: BlobOutputConf,
outputTimestamp: Timestamp) = {
val sourceId = extractSourceId(inputFilePath, inputConf.sourceIdRegex)
inputConf.sources.get(sourceId) match {
case Some(source) =>
val fileTime = extractTimeFromBlobPath(inputFilePath, inputConf.fileTimeRegex.r, inputConf.fileTimeFormat)
val outputPartitionTime = if(outputTimestamp==null) fileTime else outputTimestamp
FileInternal(inputPath = inputFilePath,
outputFolders = outputConf.groups.map{case (k,v)=>
k-> BlobSinker.generateOutputFolderPath(v.folder, outputPartitionTime, Some(source.target))
},
outputFileName = pathHintsFromBlobPath(inputFilePath, inputConf.blobPathRegex.r),
fileTime = fileTime,
ruleIndexPrefix = source.catalogPrefix.getOrElse(""),
target = source.target
)
case None =>
FileInternal(inputPath = inputFilePath)
}
}
def pathsToGroups(rdd: RDD[String],
jobName: String,
dict: SettingDictionary,
outputTimestamp: Timestamp) = {
val initialSet = mutable.HashSet.empty[FileInternal]
val inputConf = BlobPointerInputSetting.getInputConfig(dict)
val blobOutputConf = BlobOutputSetting.getDefaultBlobOutputConf(dict)
rdd.map(s => {
val propsFile = inputPathToInternalProps(s, inputConf, blobOutputConf, outputTimestamp)
(if(propsFile.outputFolders==null || propsFile.outputFolders.isEmpty) null else jobName, propsFile)
})
.aggregateByKey(initialSet)(_ += _, _ ++ _) // drop duplicates
.collect()
}
def filterPathGroups(groups: Array[(String, mutable.HashSet[FileInternal])]) = {
groups.find(_._1==null) match {
case Some(invalidPaths) =>
logger.warn("Found out-of-scope paths count=" + invalidPaths._2.size + ", First File=" + invalidPaths._2.head.inputPath)
groups.filter(_._1 != null)
case None =>
groups
}
}
}
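
extractSourceId pulls the storage account name out of a wasb/wasbs URL using the default saRegex unless the configuration supplies its own pattern. A standalone sketch of that match (the blob path is hypothetical):

// Capture the storage account between '@' and '.blob.core.windows.net'.
val saRegex = """wasbs?://[\w-]+@([\w\d]+)\.blob\.core\.windows\.net/.*""".r

val blobPath = "wasbs://events@myaccount.blob.core.windows.net/data/2019/04/15/00/part-0000.blob"
val sourceId = saRegex.findFirstMatchIn(blobPath).map(_.group(1)).orNull

println(sourceId) // myaccount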


@ -0,0 +1,83 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config.{SettingDictionary, SettingNamespace}
import datax.input.EventHubInputSetting.InputEventHubConf
import datax.input.StreamingInputSetting.StreamingConf
import org.apache.log4j.LogManager
import scala.collection.mutable
object BlobPointerInputSetting {
case class InputSource(target: String, catalogPrefix:Option[String])
case class BlobPointerInputConf(sources: Map[String, InputSource],
eventhub: InputEventHubConf,
streaming: StreamingConf,
sourceIdRegex: String,
eventNamePath: String,
blobPathRegex: String,
fileTimeRegex: String,
fileTimeFormat: String)
val NamespacePrefix = SettingNamespace.JobInputPrefix
val SettingSourceIdRegex = "sourceidregex"
val SettingEventNamePath = "eventnamepath"
val SettingBlobPathRegex = "blobpathregex"
val SettingFileTimeRegex = "filetimeregex"
val SettingFileTimeFormat = "filetimeformat"
val NamespaceSource = "source"
val SettingInputSourceTarget = "target"
val SettingInputSourceCatalogPrefix = "catalogprefix"
private def buildInputSource(dict: SettingDictionary, name: String): InputSource = {
InputSource(
target = dict.getOrNull(SettingInputSourceTarget),
catalogPrefix = dict.get(SettingInputSourceCatalogPrefix)
)
}
def getInputConfig(dict: SettingDictionary): BlobPointerInputConf = {
val logger = LogManager.getLogger(this.getClass)
var sources: Map[String, InputSource] = null
var eventhub: InputEventHubConf = null
var streaming: StreamingConf = null
var sourceIdRegex: String = null
var eventNamePath: String = null
var blobPathRegex: String = null
var fileTimeRegex: String = null
var fileTimeFormat: String = null
dict.groupBySubNamespace(NamespacePrefix)
.foreach{case (g, v) => {
g match {
case NamespaceSource => sources = v.buildConfigMap(buildInputSource)
case EventHubInputSetting.NamespaceEventHub => eventhub = EventHubInputSetting.buildInputEventHubConf(v)
case StreamingInputSetting.NamespaceStreaming => streaming = StreamingInputSetting.buildStreamingConf(v)
case SettingSourceIdRegex => sourceIdRegex = v.getDefault().orNull
case SettingEventNamePath => eventNamePath = v.getDefault().orNull
case SettingBlobPathRegex => blobPathRegex = v.getDefault().orNull
case SettingFileTimeRegex => fileTimeRegex = v.getDefault().orNull
case SettingFileTimeFormat => fileTimeFormat = v.getDefault().orNull
case "blobschemafile" =>
case groupName:String =>
logger.warn(s"Unsupported setting group '$groupName' under namespace '$NamespacePrefix': \n ${v.getDictMap().keys.mkString("\n")}")
}
}}
BlobPointerInputConf(
sources = sources,
eventhub = eventhub,
streaming = streaming,
sourceIdRegex = sourceIdRegex,
eventNamePath = eventNamePath,
blobPathRegex = blobPathRegex,
fileTimeRegex = fileTimeRegex,
fileTimeFormat = fileTimeFormat
)
}
}


@ -0,0 +1,56 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config.{SettingDictionary, SettingNamespace}
import org.apache.log4j.LogManager
object EventHubInputSetting {
case class InputEventHubConf(connectionString: String,
consumerGroup: String,
checkpointDir: String,
checkpointInterval: String,
maxRate: String,
startEnqueueTime: Option[Long],
flushExistingCheckpoints: Option[Boolean],
repartition: Option[Int]
)
val NamespaceEventHub = "eventhub"
val NamespacePrefix = SettingNamespace.JobInputPrefix + NamespaceEventHub+SettingNamespace.Seperator
val SettingConnectionString = "connectionstring"
val SettingConsumerGroup = "consumergroup"
val SettingCheckpointDir = "checkpointdir"
val SettingCheckpointInterval = "checkpointinterval"
val SettingMaxRate = "maxrate"
val SettingStartEnqueueTime = "startenqueuetime"
val SettingFlushExistingCheckpoints = "flushexistingcheckpoints"
val SettingRepartition = "repartition"
private val logger = LogManager.getLogger("EventHubInputSetting")
def buildInputEventHubConf(dict: SettingDictionary): InputEventHubConf = {
dict.get(SettingConnectionString) match {
case Some(connectionString) =>
InputEventHubConf(
connectionString = connectionString,
consumerGroup = dict.getString(SettingConsumerGroup),
checkpointDir = dict.getString(SettingCheckpointDir),
checkpointInterval = dict.getString(SettingCheckpointInterval),
maxRate = dict.getString(SettingMaxRate),
startEnqueueTime = dict.getLongOption(SettingStartEnqueueTime),
flushExistingCheckpoints = dict.getBooleanOption(SettingFlushExistingCheckpoints),
repartition = dict.getIntOption(SettingRepartition)
)
case None =>
null
}
}
def getInputEventHubConf(dict: SettingDictionary): InputEventHubConf = {
logger.warn("EventHub NamespacePrefix=" + NamespacePrefix)
buildInputEventHubConf(dict.getSubDictionary(NamespacePrefix))
}
}


@ -0,0 +1,135 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import java.sql.Timestamp
import java.time.Instant
import com.microsoft.azure.eventhubs.EventData
import datax.checkpoint.EventhubCheckpointer
import datax.config.UnifiedConfig
import datax.constants.ProductConstant
import datax.exception.EngineException
import datax.input.EventHubInputSetting.InputEventHubConf
import datax.processor.EventHubStreamingProcessor
import datax.securedsetting.KeyVaultClient
import datax.telemetry.AppInsightLogger
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.eventhubs.{EventHubsConf, EventHubsUtils, EventPosition}
import org.apache.spark.eventhubs.rdd.HasOffsetRanges
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Time}
object EventHubStreamingFactory {
def getEventHubConf(eventhubInput:InputEventHubConf) = {
val logger = LogManager.getLogger("EventHubConfBuilder")
val connectionString = KeyVaultClient.resolveSecretIfAny(eventhubInput.connectionString)
if(connectionString==null||connectionString.isEmpty){
val errMsg = s"Connection string is empty for eventhub input"
logger.error(errMsg)
throw new EngineException(errMsg)
}
val checkpointDir = eventhubInput.checkpointDir
val consumerGroup = eventhubInput.consumerGroup
logger.warn("eventhub checkpointDir=" + checkpointDir)
logger.warn("eventhub consumerGroup=" + consumerGroup)
val ehConf = EventHubsConf(connectionString = connectionString)
.setConsumerGroup(consumerGroup)
.setMaxRatePerPartition(eventhubInput.maxRate.toInt)
.setReceiverTimeout(java.time.Duration.ofSeconds(60))
.setOperationTimeout(java.time.Duration.ofSeconds(120))
eventhubInput.startEnqueueTime match {
case Some(startEnqueueTimeInSeconds) =>
if(startEnqueueTimeInSeconds<0){
val startEnqueueTime = Instant.now.plusSeconds(startEnqueueTimeInSeconds)
ehConf.setStartingPosition(EventPosition.fromEnqueuedTime(startEnqueueTime))
logger.warn(s"eventhub startEnqueueTime from config:${startEnqueueTimeInSeconds}, passing startEnqueueTime=$startEnqueueTime")
}
else if(startEnqueueTimeInSeconds>0){
val startEnqueueTime = Instant.ofEpochSecond(startEnqueueTimeInSeconds)
ehConf.setStartingPosition(EventPosition.fromEnqueuedTime(startEnqueueTime))
logger.warn(s"eventhub startEnqueueTime from config:${startEnqueueTimeInSeconds}, passing startEnqueueTime=$startEnqueueTime")
}
else{
ehConf.setStartingPosition(EventPosition.fromStartOfStream)
}
case None =>
ehConf.setStartingPosition(EventPosition.fromEndOfStream)
}
ehConf
}
def getStream(streamingContext: StreamingContext,
eventhubInput:InputEventHubConf,
foreachRDDHandler: (RDD[EventData], Time)=>Unit
) ={
///////////////////////////////////////////////////////////////
//Create direct stream from EventHub
///////////////////////////////////////////////////////////////
val preparationLogger = LogManager.getLogger("PrepareEventHubDirectStream")
val checkpointDir = eventhubInput.checkpointDir
val ehConf = getEventHubConf(eventhubInput)
if(eventhubInput.flushExistingCheckpoints.getOrElse(false))
preparationLogger.warn("Flush the existing checkpoints according to configuration")
else
EventhubCheckpointer.applyCheckpointsIfExists(ehConf, checkpointDir)
val checkpointIntervalInMilliseconds = eventhubInput.checkpointInterval.toLong*1000
EventHubsUtils.createDirectStream(streamingContext, ehConf)
//.persist()
//.window(org.apache.spark.streaming.Duration.10))
.foreachRDD((rdd, time)=>{
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/streaming/batch/begin", Map("batchTime"->time.toString), null)
val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
val batchTime = new Timestamp(time.milliseconds)
val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
val streamLogger = LogManager.getLogger(s"CheckOffsets-${batchTimeStr}")
streamLogger.warn(s"Processing offsets: \n" +
offsetRanges.map(offset=>s"${offset.name}-${offset.partitionId.toString}: from=${offset.fromSeqNo}, until=${offset.untilSeqNo}").mkString("\n"))
try {
foreachRDDHandler(rdd, time)
}
catch {
case e: Exception =>
AppInsightLogger.trackException(e,
Map("batchTime"->time.toString()),
offsetRanges.map(offset=>s"${offset.name}-${offset.partitionId.toString}-fromSeqNo"->offset.fromSeqNo.toDouble).toMap)
throw e
}
if(time.isMultipleOf(org.apache.spark.streaming.Duration(checkpointIntervalInMilliseconds))) {
streamLogger.info(s"Start writing eventhub checkpoints to ${checkpointDir}")
val conf = rdd.sparkContext.hadoopConfiguration
EventhubCheckpointer.writeOffsetsToCheckpoints(checkpointDir, offsetRanges.map(r => (time.milliseconds, r.nameAndPartition.ehName, r.nameAndPartition.partitionId, r.fromSeqNo, r.untilSeqNo)), conf)
streamLogger.warn(s"Done writing eventhub checkpoints to ${checkpointDir}")
}
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/streaming/batch/end", Map("batchTime"->time.toString), null)
})
}
@volatile private var instance: EventHubStreamingProcessor = null
def getOrCreateProcessor(config: UnifiedConfig,
generator: UnifiedConfig =>EventHubStreamingProcessor) = {
if (instance == null) {
synchronized {
if (instance == null) {
instance = generator(config)
}
}
}
instance
}
}
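
getEventHubConf interprets startEnqueueTime as an offset relative to now when negative, as absolute epoch seconds when positive, and as "start of stream" when zero; when the setting is absent, the stream starts from the end. A compact restatement of that mapping using the same EventPosition API:

import java.time.Instant
import org.apache.spark.eventhubs.EventPosition

def startingPosition(startEnqueueTimeInSeconds: Option[Long]): EventPosition =
  startEnqueueTimeInSeconds match {
    case Some(s) if s < 0 => EventPosition.fromEnqueuedTime(Instant.now.plusSeconds(s)) // relative to now
    case Some(s) if s > 0 => EventPosition.fromEnqueuedTime(Instant.ofEpochSecond(s))   // absolute epoch seconds
    case Some(_)          => EventPosition.fromStartOfStream                            // zero: start of stream
    case None             => EventPosition.fromEndOfStream                              // default: end of stream
  }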


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config._
object InputManager {
val NamespacePrefix = SettingNamespace.JobInputPrefix
}


@ -0,0 +1,76 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import java.sql.Timestamp
import java.time.Instant
import com.microsoft.azure.eventhubs.EventData
import datax.checkpoint.EventhubCheckpointer
import datax.config.UnifiedConfig
import datax.constants.ProductConstant
import datax.exception.EngineException
import datax.input.EventHubInputSetting.InputEventHubConf
import datax.processor.EventHubStreamingProcessor
import datax.securedsetting.KeyVaultClient
import datax.telemetry.AppInsightLogger
import datax.utility.DateTimeUtil
import org.apache.log4j.LogManager
import org.apache.spark.eventhubs.{EventHubsConf, EventHubsUtils, EventPosition}
import org.apache.spark.eventhubs.rdd.HasOffsetRanges
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.streaming.{StreamingContext, Time}
// Factory class for streaming local events
object LocalStreamingFactory {
def getStream(streamingContext: StreamingContext,
inputSchema: DataType,
foreachRDDHandler: (RDD[EventData], Time)=>Unit
) ={
val preparationLogger = LogManager.getLogger("PrepareLocalDirectStream")
///////////////////////////////////////////////////////////////
//Create direct stream from custom receiver
///////////////////////////////////////////////////////////////
streamingContext.receiverStream(new LocalStreamingSource(inputSchema))
.foreachRDD((rdd, time)=>{
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/localstreaming/batch/begin", Map("batchTime"->time.toString), null)
val batchTime = new Timestamp(time.milliseconds)
val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
val streamLogger = LogManager.getLogger(s"CheckOffsets-${batchTimeStr}")
try {
foreachRDDHandler(rdd, time)
}
catch {
case e: Exception =>
AppInsightLogger.trackException(e,
Map("batchTime"->time.toString()),
Map("batchMetric"->1))
throw e
}
AppInsightLogger.trackEvent(ProductConstant.ProductRoot + "/localstreaming/batch/end", Map("batchTime"->time.toString), null)
})
}
@volatile private var instance: EventHubStreamingProcessor = null
def getOrCreateProcessor(config: UnifiedConfig,
generator: UnifiedConfig =>EventHubStreamingProcessor) = {
if (instance == null) {
synchronized {
if (instance == null) {
instance = generator(config)
}
}
}
instance
}
}


@ -0,0 +1,40 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import java.nio.charset.Charset
import com.microsoft.azure.eventhubs.EventData
import org.apache.log4j.LogManager
import org.apache.spark.storage._
import org.apache.spark.streaming.receiver._
import org.apache.spark.sql.types.{DataType, StructType}
import datax.utility
/** This is a test receiver that generates data. */
class LocalStreamingSource(inputSchema: DataType) extends Receiver[EventData](StorageLevel.MEMORY_AND_DISK_2) {
/** Start the thread that receives data over a connection */
def onStart() {
new Thread("Local Data Source") { override def run() { receive() } }.start()
}
def onStop() { }
/** Periodically generate random data based on given schema */
private def receive() {
val logger = LogManager.getLogger("LocalStreamingSource")
while(!isStopped()) {
val jsonStr = DataGenerator.getRandomJson(inputSchema)
logger.warn("Generated json="+jsonStr)
val eventData = EventData.create(jsonStr.getBytes(Charset.defaultCharset()))
store(Iterator(eventData))
Thread.sleep(500)
}
}
}


@ -0,0 +1,60 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config.{SettingDictionary, SettingNamespace}
import datax.securedsetting.KeyVaultClient
object RouterSetting {
val NamespacePrefix = SettingNamespace.JobPrefix+"router."
val NamespaceFilterJobPrefix = NamespacePrefix + "job."
val NamespaceFilterPrefix = "filter."
case class FilterOutput(compressionType: String, eventhub: String, folder: String, format: String, outputType: String)
case class SPFilter(sourceIdRegex: String, mappingOperations: Map[String, String], filterCondition: String, filterType: String, jobName:String, coalescingRatio: Double, numPartitions: String, output: FilterOutput)
def buildFilterOutput(dict: SettingDictionary) = {
FilterOutput(
compressionType = dict.getOrNull("compressiontype"),
eventhub = KeyVaultClient.resolveSecretIfAny(dict.getOrNull("eventhub")),
folder = dict.getOrNull("folder"),
format = dict.getOrNull("format"),
outputType = dict.getOrNull("outputType")
)
}
def buildMappingOperations(s: Option[String]) = {
s match {
case Some(str) => str.split(";").map(p=>{
val parts = p.split("=", 2)
if(parts.length==2)
parts(0).trim()->parts(1).trim()
else
parts(0).trim()->null
}).toMap
case None => null
}
}
def buildFilterJob(dict: SettingDictionary, name: String) = {
SPFilter(
sourceIdRegex = dict.getOrNull("sourceidregex"),
mappingOperations = buildMappingOperations(dict.get("mappingoperations")),
filterCondition = dict.getOrNull("filterCondition"),
filterType = dict.getOrNull("filterType"),
jobName = name,
coalescingRatio = dict.getDouble("coalescingRatio"),
numPartitions = dict.getOrNull("numPartitions"),
output = buildFilterOutput(dict.getSubDictionary(NamespaceFilterPrefix))
)
}
def getFiltersConfig(dict: SettingDictionary) = {
dict.buildConfigIterable(buildFilterJob, NamespaceFilterJobPrefix).toSeq
}
}
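A small usage sketch of the mapping-operations format parsed above (the setting values here are hypothetical): entries are semicolon-separated, and each entry is either source=target or a bare name that maps to null.
val ops = RouterSetting.buildMappingOperations(Some("deviceId=deviceName;dropColumn"))
// ops == Map("deviceId" -> "deviceName", "dropColumn" -> null)
val none = RouterSetting.buildMappingOperations(None)
// none == null, meaning no mapping operations were configured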


@ -0,0 +1,35 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config.{ConfigManager, SettingDictionary, SettingNamespace}
import datax.fs.HadoopClient
import datax.securedsetting.KeyVaultClient
import org.apache.log4j.LogManager
import org.apache.spark.sql.types.{DataType, StructType}
import scala.concurrent.{ExecutionContext, Future}
object SchemaFile {
val SettingSchemaFile="blobschemafile"
private def getInputBlobSchemaFilePath(dict: SettingDictionary) = {
dict.getOrNull(SettingNamespace.JobInputPrefix + SettingSchemaFile)
}
private def loadRawBlobSchema(blobSchemaFile: String) = {
// Schema of VS block extraction data
val schemaJsonString = HadoopClient.readHdfsFile(blobSchemaFile).mkString("")
DataType.fromJson(schemaJsonString)
}
def loadInputSchema(dict: SettingDictionary) = {
val file = getInputBlobSchemaFilePath(dict)
val logger = LogManager.getLogger(this.getClass)
logger.warn(s"Load input schema from '${file}'")
val filePath = KeyVaultClient.resolveSecretIfAny(file)
loadRawBlobSchema(filePath)
}
}
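For reference, DataType.fromJson accepts Spark's JSON schema representation, so the blob schema file is expected to hold a serialized StructType; a minimal, hypothetical example of what loadRawBlobSchema would parse:
// Hypothetical schema JSON; any StructType serialized via schema.json has this shape.
val schemaJsonString =
"""{"type":"struct","fields":[
{"name":"deviceId","type":"string","nullable":true,"metadata":{}},
{"name":"temperature","type":"double","nullable":true,"metadata":{}}
]}"""
val schema = org.apache.spark.sql.types.DataType.fromJson(schemaJsonString)
// schema is a StructType with two fields, as loadRawBlobSchema would return for this file.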


@ -0,0 +1,26 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.input
import datax.config._
import org.apache.log4j.LogManager
object StreamingInputSetting {
case class StreamingConf(checkpointDir: String,intervalInSeconds: Long)
val NamespaceStreaming = "streaming"
val NamespacePrefix = SettingNamespace.JobInputPrefix + NamespaceStreaming + "."
def buildStreamingConf(dict: SettingDictionary): StreamingConf = {
StreamingConf(
checkpointDir = dict.getOrNull("checkpointdir"),
intervalInSeconds = dict.getOrElse("intervalinseconds", "0").toLong
)
}
def getStreamingInputConf(dict: SettingDictionary): StreamingConf = {
buildStreamingConf(dict.getSubDictionary(NamespacePrefix))
}
}
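A sketch of how these settings resolve, assuming flat configuration keys under SettingNamespace.JobInputPrefix (the paths and values below are placeholders):
// Illustrative settings (the exact prefix is supplied by SettingNamespace.JobInputPrefix):
//   <job-input-prefix>streaming.checkpointdir     = hdfs://mycluster/checkpoints
//   <job-input-prefix>streaming.intervalinseconds = 60
// getStreamingInputConf strips the prefix via getSubDictionary and yields
//   StreamingConf(checkpointDir = "hdfs://mycluster/checkpoints", intervalInSeconds = 60L)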


@ -0,0 +1,24 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import datax.input.BlobPointerInput
import datax.utility.DateTimeUtil
import org.apache.spark.rdd.RDD
import scala.concurrent.duration.Duration
class BlobPointerProcessor(processPaths: (RDD[String], Timestamp, Duration, Timestamp, String) => Map[String, Double])
extends EventHubStreamingProcessor{
val processPathsRDD = processPaths
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
val currentTime = DateTimeUtil.getCurrentTime()
processPaths(rdd.map(BlobPointerInput.parseBlobPath), batchTime, batchInterval, currentTime, "")
}
}


@ -0,0 +1,26 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery
import scala.concurrent.duration.Duration
case class CommonProcessor( processJson: (RDD[String], Timestamp, Duration, Timestamp) => Map[String, Double],
processEventHubDataFrame: (DataFrame) => Map[String, StreamingQuery],
processEventData: (RDD[EventData], Timestamp, Duration, Timestamp) => Map[String, Double],
processPaths: (RDD[String], Timestamp, Duration, Timestamp, String) => Map[String, Double]){
def asBlobPointerProcessor() = new BlobPointerProcessor(processPaths = this.processPaths)
def asJsonProcessor() = new JsonProcessor(processJson = this.processJson)
def asDirectProcessor() = new DirectProcessor(processEventData = this.processEventData)
def asStructuredStreamingProcessor = new EventHubStructuredStreamingProcessor(processDataFrame = this.processEventHubDataFrame)
def asDirectLocalProcessor() = new DirectLocalProcessor(processEventData = this.processEventData)
}
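As a hedged sketch of how these conversions are intended to be used (assuming a UnifiedConfig instance named config and the CommonProcessorFactory introduced below):
// Build the shared processor once, then pick the facade matching the input mode.
val commonProcessor = CommonProcessorFactory.createProcessor(config)
val directProcessor = commonProcessor.asDirectProcessor()
// directProcessor.process(rdd, batchTime, batchInterval) returns the metrics map for the batch.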


@ -0,0 +1,569 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import java.util.concurrent.Executors
import com.microsoft.azure.eventhubs.EventData
import datax.config._
import datax.constants.{ColumnName, DatasetName, FeatureName, ProcessingPropertyName, ProductConstant}
import datax.data.FileInternal
import datax.exception.EngineException
import datax.fs.HadoopClient
import datax.host.{AppHost, CommonAppHost, SparkSessionSingleton, UdfInitializer}
import datax.input.{BlobPointerInput, InputManager, SchemaFile, StreamingInputSetting}
import datax.sink.{OutputManager, OutputOperator}
import datax.telemetry.{AppInsightLogger, MetricLoggerFactory}
import datax.utility._
import datax.handler._
import datax.sql.TransformSQLParser
import org.apache.log4j.LogManager
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.storage.StorageLevel
import scala.language.{postfixOps, reflectiveCalls}
import scala.collection.mutable.{HashMap, HashSet, ListBuffer}
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration._
import scala.collection.JavaConverters._
/*
Generate the common processor
*/
object CommonProcessorFactory {
private val threadPool = Executors.newFixedThreadPool(8)
implicit private val ec = ExecutionContext.fromExecutorService(threadPool)
val appHost = CommonAppHost
/*
Create the processor based on input config, initialize all functions to be used in streaming iterations
*/
def createProcessor(config: UnifiedConfig):CommonProcessor = {
val sparkConf = config.sparkConf
val spark = appHost.getSpark(sparkConf)
val dict = config.dict
import spark.implicits._
// Load and initialize functions in parallel to be used in streaming iterations.
val loadings = Tuple5(
Future{SchemaFile.loadInputSchema(dict)},
ProjectionHandler.loadProjectionsFuture(dict),
TransformHandler.loadTransformFuture(dict),
ReferenceDataHandler.loadReferenceDataFuture(spark, dict),
Future {ExtendedUDFHandler.initialize(spark, dict)}
)
val (rawBlockSchema, projections, sqlParsed, referencedDataLoaded, udfs) =
Await.result(for {
r1 <- loadings._1
r2 <- loadings._2
r3 <- loadings._3
r4 <- loadings._4
r5 <- loadings._5
} yield (r1, r2, r3, r4, r5), 10 minutes)
BuiltInFunctionsHandler.initialize(spark, dict)
ProjectionHandler.validateProjections(projections)
JarUDFHandler.loadJarUdf(spark, dict)
AzureFunctionHandler.initialize(spark, dict)
UdfInitializer.initialize(spark, dict)
val createdTables = StateTableHandler.createTables(spark, dict)
val inputNormalizer = InputNormalizerHandler.initialize(spark, dict)
val inputNormalizerUdf = if(inputNormalizer==null) udf((s:String)=>s) else udf(inputNormalizer)
val preProjection = PreProjectionHandler.initialize(spark, dict)
val buildPropertiesUdf = PropertiesHandler.initialize(spark, dict)
/*
function to parse the input string into a Raw object column based on the raw blob schema, and to project the data frame
based on the columns from projection files
*/
def project(inputDf: DataFrame, batchTime: Timestamp): DataFrame = {
// Initial schema and data set
var df = inputDf
.withColumn(ColumnName.RawObjectColumn, from_json(inputNormalizerUdf(col(ColumnName.RawObjectColumn)), rawBlockSchema))
df = if(preProjection==null)df else preProjection(df)
val preservedColumns = df.schema.fieldNames.filter(_.startsWith(ColumnName.InternalColumnPrefix))
df = df.withColumn(ColumnName.PropertiesColumn, buildPropertiesUdf(col(ColumnName.InternalColumnFileInfo), lit(batchTime)))
for(step <- 0 until projections.length)
df = df.selectExpr(projections(step)++preservedColumns: _*)
df
}
// initialize metrics settings
val metricAppName = dict.getMetricAppName()
val metricConf = MetricsHandler.getMetricsSinkConf(dict)
// figure out how to output
val outputs = OutputManager.getOperatators(dict)
val outputCount = outputs.size
// initialize output handlers
outputs.par.foreach(o => {
if (o.onInitialization != null)
o.onInitialization(spark)
})
// initialize settings of time windows
val timewindows = TimeWindowHandler.initialize(spark, dict)
// store past RDDs for overlapping time windows
val pastRdds = new HashMap[Timestamp, RDD[Row]]()
// store names of data frames for outputting
val dataframes = new HashMap[String, DataFrame]
/*
function to execute the queries for transforming data frames and output them accordingly
*/
def route(projectedDf: DataFrame, batchTime: Timestamp, batchInterval: Duration, outputPartitionTime: Timestamp, targets: Set[String], tableNamespace:String) = {
val transformLogger = LogManager.getLogger("Transformer")
// store the mapping information between table names in the query and the actual data frame name we are processing
val tableNames = new HashMap[String, String]
// register the start input table for the query
val initTableName = DatasetName.DataStreamProjection+tableNamespace
tableNames(DatasetName.DataStreamProjection) = initTableName
dataframes += initTableName -> projectedDf
// store the data frames that we should unpersist after one iteration
val dataFramesToUncache = new ListBuffer[DataFrame]
transformLogger.warn("Persisting the current projected dataframe")
projectedDf.persist()
dataFramesToUncache += projectedDf
// store the metrics to send after one iteration
val outputMetrics = HashMap[String, Double]()
// log metric of how many events are incoming on this iteration
val inputRawEventsCount = projectedDf.count()
transformLogger.warn(s"Received $inputRawEventsCount raw input events")
outputMetrics += s"Input_Normalized_Events_Count" -> inputRawEventsCount
if(timewindows.isEnabled){
// when time windowing is turned on, we need to calculate the start and end time of the window and query against
// the past RDDs from history
// the overall window is the maximum time span across the time windows passed in from settings
// the line below determines the end-time filter; note that we subtract the watermark span, which is the buffer that lets late events settle
val windowEndTime = Timestamp.from(batchTime.toInstant.minusSeconds(timewindows.watermark.toSeconds))
// the line below determines the start-time filter, derived from the maximum window span
val windowStartTime = Timestamp.from(windowEndTime.toInstant.minusSeconds(timewindows.maxWindow.toSeconds))
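// Worked example (illustrative numbers only): with batchTime = 12:00:00,
// watermark = 60 seconds and maxWindow = 3600 seconds, windowEndTime = 11:59:00
// and windowStartTime = 10:59:00, so the windowed views cover [10:59:00, 11:59:00)
// while the most recent minute is left to settle under the watermark.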
val rdd = projectedDf.where(timewindows.timestampColumn>=windowEndTime).rdd
transformLogger.warn("Persisting the windowed projected data frame")
rdd.persist(StorageLevel.MEMORY_ONLY_SER)
// log metric of after filtered by the time window, how many events actually are participating in the transformation
val inputEventsCount = rdd.mapPartitions(it=>{
val loggerSuffix = SparkEnvVariables.getLoggerSuffix()
val instrumentLogger = LogManager.getLogger(s"${ProductConstant.ProductInstrumentLogger}$loggerSuffix")
val t1 = System.nanoTime()
instrumentLogger.warn(s"Start collecting events at $t1")
val count = it.toArray.length
val timeNow = System.nanoTime()
instrumentLogger.warn(s"Collected $count events for caching, spent time=${(timeNow-t1)/1E9} seconds")
Iterator.single(count)
}).reduce(_+_)
transformLogger.warn(s"Received $inputEventsCount input events for ${initTableName}")
outputMetrics += s"Input_${DatasetName.DataStreamProjection}_Events_Count" -> inputEventsCount
// collect data from past RDDs that fits in the time window
val cutTime = Timestamp.from(batchTime.toInstant.minusSeconds((timewindows.watermark+timewindows.maxWindow).toSeconds))
pastRdds.keys.filter(_.compareTo(cutTime)<=0).foreach(k=>{
pastRdds.remove(k) match {
case Some(rdd) =>
transformLogger.warn(s"removing past RDD at ${k} since it is before or equal to ${cutTime}")
rdd.unpersist(false)
case None =>
transformLogger.warn(s"Unexpectedly ${k} does exist in the pastRDDs")
}
})
// union the data from current projected data frame and the past ones
val sc = rdd.sparkContext
val pastDataUnion = spark.createDataFrame(if(pastRdds.size>1){
transformLogger.warn(s"union ${pastRdds.size} batches, including ${pastRdds.keySet.mkString(",")}")
sc.union(rdd, pastRdds.values.toSeq: _*)
} else rdd, projectedDf.schema)
val unionTableNameInSql = DatasetName.DataStreamProjectionWithWindow
val unionTableName = unionTableNameInSql+tableNamespace
pastDataUnion
.where(timewindows.timestampColumn>=windowStartTime && timewindows.timestampColumn<windowEndTime)
.createOrReplaceTempView(unionTableName)
tableNames(unionTableNameInSql) = unionTableName
dataframes(unionTableName)=spark.table(unionTableName)
// register time-windowed tables and their corresponding data frames for different time window spec
for (tw <- timewindows.windows) {
val winTableName = tw._1
val winTableNameInScope = winTableName + tableNamespace
val winStartTime = Timestamp.from(windowEndTime.toInstant.minusSeconds(tw._2.toSeconds))
transformLogger.warn(s"Create or replace time windowed view '${winTableNameInScope}' within window('$winStartTime' - '$windowEndTime')")
pastDataUnion
.where(timewindows.timestampColumn>=winStartTime && timewindows.timestampColumn<windowEndTime)
.createOrReplaceTempView(winTableNameInScope)
tableNames(winTableName) = winTableNameInScope
dataframes(winTableNameInScope)=spark.table(winTableNameInScope)
}
// replace the starting table
val adjustedBatchStartTime = Timestamp.from(windowEndTime.toInstant.minusSeconds(batchInterval.toSeconds))
val cachedProjectedDf = pastDataUnion.where(timewindows.timestampColumn>=adjustedBatchStartTime && timewindows.timestampColumn<windowEndTime)
cachedProjectedDf.createOrReplaceTempView(initTableName)
// register a table referencing the projected data frame within only the current iteration batch
val batchedTableName = DatasetName.DataStreamProjectionBatch + tableNamespace
projectedDf.createOrReplaceTempView(batchedTableName)
tableNames(DatasetName.DataStreamProjectionBatch) = batchedTableName
dataframes(batchedTableName)=projectedDf
pastRdds(batchTime) = rdd
}
else{
// if time window is not turned on, we simply register the projected data frame as input starting table for query
outputMetrics += s"Input_${DatasetName.DataStreamProjection}_Events_Count" -> inputRawEventsCount
projectedDf.createOrReplaceTempView(initTableName)
}
// register state-store tables
for (elem <- createdTables) {
tableNames(elem._1)=elem._2.getActiveTableName()
}
// start executing queries
if(sqlParsed!=null && sqlParsed.commands.length>0){
val partitionNumber = projectedDf.rdd.getNumPartitions
val queries = sqlParsed.commands
queries.foreach(expr=>{
val statement = TransformSQLParser.replaceTableNames(expr.text, tableNames)
expr.commandType match {
case TransformSQLParser.CommandType_Command =>
transformLogger.warn(s"Executing command '$statement'")
spark.sql(statement)
case TransformSQLParser.CommandType_Query =>
createdTables.find(_._1 == expr.name) match {
case Some(t) =>
// this case is a query statement that assigns a data set back to a registered state-store table
// so we have to overwrite the existing state-store table with the new data
t._2.overwrite(statement)
tableNames(t._1) = t._2.flip()
case None =>
// this is the normal case where we don't need to handle state-store tables
val tableName = expr.name + tableNamespace
transformLogger.warn(s"Creating view '$tableName' for '$statement'")
val ds = if(partitionNumber > 0) {
spark.sql(statement).coalesce(partitionNumber)
}
else {
transformLogger.warn(s"Zero events found for $tableName' for '$statement'")
spark.sql(statement)
}
tableNames(expr.name) = tableName
dataframes(tableName) = ds
// cache data frames that have been referenced more than once to improve performance
if(TransformHandler.shouldCacheCommonViews(dict) && sqlParsed.viewReferenceCount.getOrElse(expr.name, 0)>1){
transformLogger.warn(s"Caching view '$tableName' for it would be used more than once")
ds.cache()
dataFramesToUncache += ds
}
ds.createOrReplaceTempView(tableName)
}
case _ =>
throw new EngineException(s"unknown commandType : ${expr.commandType}")
}
})
}
// start outputting data
def outputHandler(operator: OutputOperator) = {
val tableName = operator.name
val outputTableName = tableName+tableNamespace
dataframes.get(outputTableName) match {
case None => throw new EngineException(s"could not find data set name '$outputTableName' for output '${operator.name}'")
case Some(df) =>
if (operator.onBatch != null) operator.onBatch(df.sparkSession, outputPartitionTime, targets)
operator.output(df, outputPartitionTime).map { case (k, v) => (s"Output_${operator.name}_" + k) -> v.toDouble }
}
}
var result = Map.empty[String,Double]
if(outputCount>0) {
// if there are multiple outputs, we kick off them in parallel
result = if (outputCount > 1)
outputs.par.map(outputHandler).reduce(_ ++ _)
else
outputHandler(outputs(0))
}
// persisting state-store tables
for (elem <- createdTables) {
elem._2.persist()
}
// clear cache of the data frames in this batch of iteration
transformLogger.warn("Un-persisting the dataframes")
dataFramesToUncache.foreach(_.unpersist(false))
dataFramesToUncache.clear()
outputMetrics ++ result
}
/*
function to process unified data frame - which has 4 columns: raw string input, Properties, SystemProperties and an internal metadata column for processing
*/
def processDataset(data: DataFrame,
batchTime: Timestamp,
batchInterval: Duration,
outputPartitionTime: Timestamp,
targets: Set[String],
namespace: String):Map[String, Double] = {
val t1 = System.nanoTime()
val batchLogger = LogManager.getLogger(ProductConstant.DataStreamProcessDataSetLogger)
val metricLogger = MetricLoggerFactory.getMetricLogger(metricAppName, metricConf)
val spark = data.sparkSession
def postMetrics(metrics: Iterable[(String, Double)]): Unit = {
batchLogger.warn(s"Sending metrics:\n${metrics.map(m => m._1 + " -> " + m._2).mkString("\n")}")
metricLogger.sendBatchMetrics(metrics, batchTime.getTime)
}
try{
// call ExtendedUDFs to refresh their data
udfs.foreach(udf=>{
if(udf._2!=null)udf._2(spark, batchTime)
})
// if the raw input is specified as one of the outputs in the output settings, we cache it and register it so it can be output
val persistRaw = outputs.find(p=>p.name==DatasetName.DataStreamRaw).isDefined
if(persistRaw){
data.cache()
dataframes(DatasetName.DataStreamRaw) = data
}
// main processing steps
val baseProjection = project(data, batchTime)
val counts = route(baseProjection, batchTime, batchInterval, outputPartitionTime, targets, namespace)
// clear the cache of raw input table if needed.
if(persistRaw){
data.unpersist(false)
}
// calculate performance metrics
val partitionProcessedTime = System.nanoTime
val latencyInSeconds = (DateTimeUtil.getCurrentTime().getTime - batchTime.getTime)/1000D
val metrics = Map[String, Double](
"Latency-Process" -> (partitionProcessedTime - t1) / 1E9,
"Latency-Batch" -> latencyInSeconds
) ++ counts
postMetrics(metrics)
metrics
}
catch{
case e: Exception =>
appHost.getTelemetryService().trackEvent(ProductConstant.ProductRoot + "/error", Map(
"errorLocation" -> "ProcessDataFrame",
"errorMessage" -> e.getMessage,
"errorStackTrace" -> e.getStackTrace.take(10).mkString("\n"),
"batchTime" -> batchTime.toString
), null)
appHost.getTelemetryService().trackException(e, Map(
"errorLocation" -> "ProcessDataFrame",
"errorMessage" -> e.getMessage,
"batchTime" -> batchTime.toString
), null)
Thread.sleep(1000)
throw e
}
}
CommonProcessor(
/*
process a batch of EventData from EventHub
*/
processEventData = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration, outputPartitionTime: Timestamp) =>{
processDataset(rdd
.map(d=>{
val bodyBytes = d.getBytes
if(bodyBytes==null) throw new EngineException(s"null bytes from event: ${d.getObject}, properties:${d.getProperties}, systemProperties:${d.getSystemProperties}")
(
new String(bodyBytes),
d.getProperties.asScala.map{case(k,v)=>k->v.toString},
if(d.getSystemProperties!=null) d.getSystemProperties.asScala.map{case(k,v)=>k->v.toString} else Map.empty[String, String],
FileInternal())
})
.toDF(
ColumnName.RawObjectColumn,
"Properties",
"SystemProperties",
ColumnName.InternalColumnFileInfo
), batchTime, batchInterval, outputPartitionTime, null, "")
},
/*
process structured streaming for given data frame
Note this is incomplete and not used for now
*/
processEventHubDataFrame = (df: DataFrame) => {
val logger = LogManager.getLogger("processEventHubDataFrame")
df
.select(
from_json(col("body").cast("string"), rawBlockSchema).alias("Raw"),
col("properties"),
col("enqueuedTime")
)
.selectExpr("Raw.*", "properties", "enqueuedTime")
.withWatermark("enqueuedTime", "60 seconds")
.createOrReplaceTempView(DatasetName.DataStreamProjection)
val outputs = sqlParsed.commands
.filter(n=>n.commandType==TransformSQLParser.CommandType_Query).map(n=>n.name->n.text)
.toMap
val streamingConf = StreamingInputSetting.getStreamingInputConf(dict)
val interval = streamingConf.intervalInSeconds
outputs.map{case(k, v)=>{
k-> spark.sql(v).writeStream
.outputMode(OutputMode.Append())
.format("console")
.trigger(Trigger.ProcessingTime(interval, SECONDS))
.start()
}}
},
/*
process json data frame
*/
processJson = (jsonRdd: RDD[String], batchTime: Timestamp, batchInterval: Duration, outputPartitionTime: Timestamp) =>{
processDataset(jsonRdd.map((FileInternal(), _)).toDF(ColumnName.InternalColumnFileInfo, ColumnName.RawObjectColumn),
batchTime, batchInterval, outputPartitionTime, null, "")
},
// process blob path pointer data frame
processPaths = (pathsRDD: RDD[String],
batchTime: Timestamp,
batchInterval: Duration,
outputPartitionTime: Timestamp,
namespace: String) => {
val spark = SparkSessionSingleton.getInstance(pathsRDD.sparkContext.getConf)
val metricLogger = MetricLoggerFactory.getMetricLogger(metricAppName, metricConf)
val batchTimeStr = DateTimeUtil.formatSimple(batchTime)
val batchLog = LogManager.getLogger(s"BatchProcessor-B$batchTimeStr")
// Functions used within processPaths
val batchTimeInMs = batchTime.getTime
def postMetrics(metrics: Iterable[(String, Double)]): Unit = {
metricLogger.sendBatchMetrics(metrics, batchTimeInMs)
batchLog.warn(s"Metric ${metrics.map(m => m._1 + "=" + m._2).mkString(",")}")
}
// Process the array of input files, and sink them
// Return metrics: (number of processed blobs, number of processed events, number of filtered events sent to eventhub)
def processBlobs(files: Array[FileInternal],
outputPartitionTime: Timestamp,
partition: String,
targetPar: String): Map[String, Double] = {
val filesCount = files.length
val t1 = System.nanoTime()
// Get the earliest blob to calculate latency
val paths = files.map(_.inputPath)
val blobTimes = files.map(_.fileTime).filterNot(_ == null).toList
postMetrics(Map(s"InputBlobs" -> filesCount.toDouble))
val (minBlobTime, maxBlobTime) =
if(blobTimes.length>0) {
val minBlobTime = blobTimes.minBy(_.getTime)
val maxBlobTime = blobTimes.maxBy(_.getTime)
batchLog.warn(s"partition '$partition': started, size: $filesCount, blob time range[${DateTimeUtil.formatSimple(minBlobTime)}, ${DateTimeUtil.formatSimple(maxBlobTime)}]")
(minBlobTime, maxBlobTime)
}
else{
batchLog.warn(s"Cannot figure out timestamp from file name, please check if there is misconfiguration in the fileTimeRegex setting")
(null, null)
}
val pathsList = paths.mkString(",")
batchLog.debug(s"Batch loading files:$pathsList")
val inputDf = spark.sparkContext.parallelize(files, filesCount)
.flatMap(file => HadoopClient.readHdfsFile(file.inputPath, gzip = file.inputPath.endsWith(".gz"))
.filter(l=>l!=null && !l.isEmpty).map((file, outputPartitionTime, _)))
.toDF(ColumnName.InternalColumnFileInfo, ColumnName.MetadataColumnOutputPartitionTime, ColumnName.RawObjectColumn)
val targets = files.map(_.target).toSet
val processedMetrics = processDataset(inputDf, batchTime, batchInterval, outputPartitionTime, targets, partition)
if(minBlobTime!=null){
val latencyInSeconds = (DateTimeUtil.getCurrentTime().getTime - minBlobTime.getTime)/1000D
val latencyMetrics = Map(s"Latency-Blobs" -> latencyInSeconds)
postMetrics(latencyMetrics)
latencyMetrics++processedMetrics
}
else{
processedMetrics
}
}
def processPartition(v: (String, HashSet[FileInternal])) = {
val par = v._1
val paths = v._2.toArray
processBlobs(paths, outputPartitionTime, par+namespace, par)
}
batchLog.warn(s"Start batch ${batchTime}, output partition time:${outputPartitionTime}, namespace:${namespace}")
val t1 = System.nanoTime
val pathsGroups = BlobPointerInput.pathsToGroups(rdd = pathsRDD,
jobName = dict.getAppName(),
dict = dict,
outputTimestamp = outputPartitionTime)
val pathsFilteredGroups = BlobPointerInput.filterPathGroups(pathsGroups)
val pathsCount = pathsFilteredGroups.aggregate(0)(_ + _._2.size, _ + _)
//try {
val result =
if (pathsCount > 0) {
batchLog.warn(s"Loading filtered blob files count=$pathsCount, First File=${pathsFilteredGroups.head._2.head}")
if (pathsFilteredGroups.length > 1)
Await.result(FutureUtil.failFast(pathsFilteredGroups
.map(kv => Future {
processPartition(kv)
})), 5 minutes).reduce(DataMerger.mergeMapOfDoubles)
else
processPartition(pathsFilteredGroups(0))
}
else {
batchLog.warn(s"No valid paths is found to process for this batch")
Map[String, Double]()
}
val batchProcessingTime = (System.nanoTime - t1) / 1E9
val metrics = Map[String, Double](
"BatchProcessedET" -> batchProcessingTime
)
postMetrics(metrics)
batchLog.warn(s"End batch ${batchTime}, output partition time:${outputPartitionTime}, namespace:${namespace}")
metrics ++ result
} // end of processPaths
) // end of CommonProcessor
} // end of init
} // end of CommonProcessorFactory


@ -0,0 +1,22 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import datax.utility.DateTimeUtil
import org.apache.spark.rdd.RDD
import scala.concurrent.duration.Duration
// Data processor for processing events in OneBox mode where the job is running locally
class DirectLocalProcessor(processEventData: (RDD[EventData], Timestamp, Duration, Timestamp) => Map[String, Double])
extends EventHubStreamingProcessor{
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
val outputPartitionTime =DateTimeUtil.getCurrentTime()
processEventData(rdd, batchTime, batchInterval, outputPartitionTime)
}
}


@ -0,0 +1,21 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import datax.utility.DateTimeUtil
import org.apache.spark.rdd.RDD
import scala.concurrent.duration.Duration
class DirectProcessor(processEventData: (RDD[EventData], Timestamp, Duration, Timestamp) => Map[String, Double])
extends EventHubStreamingProcessor{
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
val outputPartitionTime = DateTimeUtil.getCurrentTime()
processEventData(rdd, batchTime, batchInterval, outputPartitionTime)
}
}


@ -0,0 +1,16 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.rdd.RDD
import scala.concurrent.duration.Duration
trait EventHubStreamingProcessor {
val process: (RDD[EventData], Timestamp, Duration) => Map[String, Double]
}


@ -0,0 +1,13 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery
class EventHubStructuredStreamingProcessor(processDataFrame: DataFrame=>Map[String, StreamingQuery])
extends StructuredStreamingProcessor {
override val process: DataFrame => Map[String, StreamingQuery] = processDataFrame
}


@ -0,0 +1,21 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import java.sql.Timestamp
import com.microsoft.azure.eventhubs.EventData
import datax.utility.DateTimeUtil
import org.apache.spark.rdd.RDD
import scala.concurrent.duration.Duration
class JsonProcessor(processJson: (RDD[String], Timestamp, Duration, Timestamp) => Map[String, Double])
extends EventHubStreamingProcessor{
override val process = (rdd: RDD[EventData], batchTime: Timestamp, batchInterval: Duration) => {
val outputPartitionTime = DateTimeUtil.getCurrentTime()
processJson(rdd.map(w=>new String(w.getBytes)), batchTime, batchInterval, outputPartitionTime)
}
}


@ -0,0 +1,12 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.processor
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.streaming.StreamingQuery
trait StructuredStreamingProcessor {
val process: (DataFrame) => Map[String, StreamingQuery]
}


@ -0,0 +1,125 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.securedsetting
import datax.config.ConfigManager
import datax.constants.JobArgument
import datax.exception.EngineException
import datax.keyvault.KeyVaultMsiAuthenticatorClient
import org.apache.log4j.LogManager
import scala.collection.mutable
/***
* Utility module to access KeyVault service from Azure.
*/
object KeyVaultClient {
private val logger = LogManager.getLogger(this.getClass)
private val secretRegex = "^keyvault:\\/\\/([a-zA-Z0-9-_]+)\\/([a-zA-Z0-9-_]+)$".r
private val kvc = KeyVaultMsiAuthenticatorClient.getKeyVaultClient()
private val cache = new mutable.HashMap[String, String]
/**
* get value of a matched secretId from keyvault
* @param secretId secretId
* @return value of the secret
*/
private def resolveSecret(secretId: String): Option[String] = {
if(secretId==null||secretId.isEmpty)
return Option(secretId)
secretRegex.findFirstMatchIn(secretId) match {
case Some(secretInfo) => val vaultName = secretInfo.group(1)
val secretName = secretInfo.group(2)
cache.synchronized{
cache.get(secretId) match {
case Some(value) => Some(value)
case None =>
val secret = kvc.synchronized{
kvc.getSecret(s"https://$vaultName.vault.azure.net",secretName)
}
logger.warn(s"resolved secret:'$secretId'")
val value = secret.value()
cache(secretId) = value
Some(value)
}
}
case None =>
logger.warn(s"did not resolve:'$secretId', return as is")
None
}
}
/***
* get secret from KeyVault with the specified name, could throw exception
* @param secretId secret uri to retrieve the secret value
* @return value of the secret
*/
@throws[EngineException]
def getSecretOrThrow(secretId: String): String = {
if(secretId==null || secretId.isEmpty){
throw new EngineException(s"secret reference cannot be null or empty")
}
else{
resolveSecret(secretId) match {
case Some(m) => m
case None => throw new EngineException(s"secret is not found with reference name: '${secretId}'.")
}
}
}
/***
* get secret from KeyVault with the specified name, exception handled internally.
* @param secretId secret uri to retrieve the secret value
* @return value of the secret or None if any exception occurs.
*/
def getSecret(secretId: String): Option[String] = {
try{
Some(getSecretOrThrow(secretId))
}
catch {
case e: Exception =>
logger.warn(s"skipped '$secretId': ${e.getMessage}")
None
}
}
/**
* Try to resolve a secretId from the input string if there is any
* a secretId is in a format like "keyvault://keyvault-name/secret-name"
* @param input string with a potential secretId in it
* @return the resolved secret value for the secretId, or the input string if none is found
*/
def resolveSecretIfAny(input: String): String = {
resolveSecret(input).getOrElse(input)
}
/**
* Try to resolve a secretId from the input string if there is any
* a secretId is in a format like "keyvault://keyvault-name/secret-name"
* @param input string with a potential secretId in it
* @return the resolved secret value for the secretId, or the input string if none is found
*/
def resolveSecretIfAny(input: Option[String]): Option[String] = {
input.map(resolveSecretIfAny(_))
}
/***
* a scope to execute an operation with the default keyvault name; skip the operation if no default vault is defined.
* @param callback execution within the scope
*/
def withKeyVault(callback: (String)=> Unit) = {
ConfigManager.getActiveDictionary().get(JobArgument.ConfName_DefaultVaultName) match {
case Some(vaultName) => callback(vaultName)
case None => logger.warn(s"No default vault is defined, skipped finding key for storage accounts")
}
}
}
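A brief usage sketch of the secret-reference format this client accepts (the vault, secret, and connection-string values are placeholders):
// "keyvault://<vault-name>/<secret-name>" is resolved through the MSI-authenticated client;
// any other string is returned unchanged by resolveSecretIfAny.
val resolved = KeyVaultClient.resolveSecretIfAny("keyvault://myvault/eventhub-connection")
val passthrough = KeyVaultClient.resolveSecretIfAny("Endpoint=sb://example/;EntityPath=hub")
// getSecretOrThrow throws EngineException when the reference cannot be resolved.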


@ -0,0 +1,44 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink
import datax.config.{SettingDictionary, SettingNamespace}
object BlobOutputSetting {
case class BlobGroupOutputConf(folder: String)
case class BlobOutputConf(groupEvaluation: Option[String], groups: Map[String, BlobGroupOutputConf], compressionType: Option[String], format: Option[String])
val Namespace = "blob"
val SettingGroupEvaluation = "groupevaluation"
val SettingCompressionType = "compressiontype"
val SettingFormat = "format"
val SettingGroup = "group"
val BlobGroupPrefix = SettingGroup + SettingNamespace.Seperator
val SettingGroupOutputFolder = "folder"
private def buildBlobGroupOutputConf(dict: SettingDictionary, name: String): BlobGroupOutputConf = {
dict.get(SettingGroupOutputFolder).map(BlobGroupOutputConf(_)).orNull
}
def buildBlobOutputConf(dict: SettingDictionary, name: String): BlobOutputConf = {
val groups = dict.buildConfigMap(buildBlobGroupOutputConf, BlobGroupPrefix)
if(groups.size>0)
BlobOutputConf(
groupEvaluation = dict.get(SettingGroupEvaluation),
groups = groups,
compressionType = dict.get(SettingCompressionType),
format = dict.get(SettingFormat)
)
else
null
}
def getDefaultBlobOutputConf(dict: SettingDictionary): BlobOutputConf = {
val prefix = SettingNamespace.JobOutputDefaultPreifx + Namespace + SettingNamespace.Seperator
BlobOutputSetting.buildBlobOutputConf(dict.getSubDictionary(prefix), SettingNamespace.JobOutputDefaultPreifx + Namespace)
}
}


@ -0,0 +1,211 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink
import java.sql.Timestamp
import java.text.SimpleDateFormat
import datax.config._
import datax.constants.{JobArgument, MetricName}
import datax.data.{FileInternal, ProcessResult}
import datax.fs.HadoopClient
import datax.securedsetting.KeyVaultClient
import datax.sink.BlobOutputSetting.BlobOutputConf
import datax.utility.{GZipHelper, SinkerUtil}
import org.apache.log4j.LogManager
import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import scala.collection.mutable
import scala.concurrent.duration.Duration
object BlobSinker extends SinkOperatorFactory {
val SinkName = "Blobs"
val DefaultOutputGroup = "main"
def generateOutputFolderPath(folderFormat: String, outputTimestamp: Timestamp, target: Option[String]) = {
if(folderFormat==null || folderFormat.isEmpty)
null
else {
// val timestamp = new Timestamp(new java.util.Date().getTime)
val minute = outputTimestamp.toLocalDateTime().getMinute()
val quarter = Array("00", "15", "30", "45")
val quarterBucket = quarter(Math.round(minute / 15))
//getLogger.warn("Minute Bucket: " + quarterBucket)
val simpleTimeFormat = new SimpleDateFormat("HHmmss")
val minuteBucket = simpleTimeFormat.format(outputTimestamp)
String.format(folderFormat, outputTimestamp)
.replaceAllLiterally("${quarterBucket}", quarterBucket)
.replaceAllLiterally("${minuteBucket}", minuteBucket)
.replaceAllLiterally("${target}", target.getOrElse("UNKNOWN"))
.stripSuffix("/") + "/"
}
}
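// Worked example (hypothetical folder format): with
// folderFormat = "wasbs://container@account/output/%1$tY/%1$tm/%1$td/${quarterBucket}/"
// and outputTimestamp = 2019-04-15 10:47:00, String.format fills the %1$t specifiers,
// ${quarterBucket} becomes "45", and the result is
// "wasbs://container@account/output/2019/04/15/45/".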
// write events to blob location
def writeEventsToBlob(data: Seq[String], outputPath: String, compression: Boolean) {
val logger = LogManager.getLogger(s"EventsToBlob-Writer${SparkEnvVariables.getLoggerSuffix()}")
val countEvents = data.length
val t1 = System.nanoTime()
var timeLast = t1
var timeNow: Long = 0
logger.info(s"$timeNow:Partition started")
//val data = it.toArray
if (countEvents > 0) {
timeNow = System.nanoTime()
logger.info(s"$timeNow:Step 1: collected ${countEvents} records, spent time=${(timeNow - timeLast) / 1E9} seconds")
timeLast = timeNow
val content = if(compression){
val result = GZipHelper.deflateToBytes(data)
timeNow = System.nanoTime()
logger.info(s"$timeNow:Step 2: compressed to ${result.length} bytes, spent time=${(timeNow - timeLast) / 1E9} seconds")
timeLast = timeNow
result
}
else {
data.mkString("\n").getBytes
}
HadoopClient.writeWithTimeoutAndRetries(
hdfsPath = outputPath,
content = content,
timeout = Duration.create(ConfigManager.getActiveDictionary().getOrElse(JobArgument.ConfName_BlobWriterTimeout, "10 seconds")),
retries = 0
)
timeNow = System.nanoTime()
logger.info(s"$timeNow:Step 3: done writing to $outputPath, spent time=${(timeNow - timeLast) / 1E9} seconds")
timeLast = timeNow
}
logger.info(s"$timeNow:Done writing events ${countEvents} events, spent time=${(timeLast - t1) / 1E9} seconds")
}
def writeDatasetToBlobs(rdd: RDD[String], outputFolder: String, fileSuffix: String, compression: Boolean):RDD[ProcessResult] = {
val outputPathPrefix = outputFolder.stripSuffix("/")
rdd.mapPartitions(it=>{
val tc = TaskContext.get()
val logger = LogManager.getLogger(s"DatasetToBlobs-Writer${SparkEnvVariables.getLoggerSuffix()}")
val t1 = System.nanoTime()
var timeLast = t1
var timeNow: Long = 0
logger.info(s"$timeNow:Partition started")
val dataAll = it.toArray
val dataSize = dataAll.length
timeNow = System.nanoTime()
logger.info(s"$timeNow:Collected ${dataSize} events, spent time=${(timeNow - timeLast) / 1E9} seconds")
timeLast = timeNow
val path = outputPathPrefix + "/part-%05d".format(tc.partitionId()) + fileSuffix
if(dataSize>0) {
writeEventsToBlob(dataAll, path, compression)
timeNow = System.nanoTime()
logger.info(s"$timeNow:Done writting ${dataAll.length} events, spent time=${(timeNow - timeLast) / 1E9} seconds")
timeLast = timeNow
Iterator.single(ProcessResult(1, dataSize))
}
else {
logger.warn(s"There is 0 events to output, skipped output partition file:'$path'")
Iterator.single(ProcessResult(0, 0))
}
})
}
val MetricPrefixEvents = s"${MetricName.MetricSinkPrefix}${SinkName}_Events_"
val MetricPrefixBlobs = s"${MetricName.MetricSinkPrefix}${SinkName}_Count_"
def sinkDataGroups(rowInfo: Row,
dataGenerator: ()=>Map[String, Iterator[String]],
outputFolders: Map[String, String],
partitionId: Int,
compression: Boolean,
loggerSuffix: String): Map[String, Int] = {
val logger = LogManager.getLogger(s"Sinker-BlobSinker$loggerSuffix")
val dataGroups = dataGenerator()
val timeStart = System.nanoTime ()
val eventCounts = dataGroups.flatMap {
case (group, data) =>
val fileName = FileInternal.getInfoOutputFileName(rowInfo)
outputFolders.get(group) match {
case None =>
Seq(s"${MetricPrefixEvents}$group" -> 0, s"${MetricPrefixBlobs}$group" -> 0)
case Some(folder) =>
val path = folder +
(if (fileName == null) s"part-$partitionId" else fileName) + (if(compression) ".json.gz" else ".json")
val jsonList = data.toSeq
BlobSinker.writeEventsToBlob(jsonList, path, compression )
Seq(s"${MetricPrefixEvents}$group" -> jsonList.length, s"${MetricPrefixBlobs}$group" -> 1)
}
}
val timeNow = System.nanoTime ()
logger.info (s"$timeNow:Written all event groups ${eventCounts.toString}, spent time=${(timeNow - timeStart) / 1E9} seconds")
eventCounts
}
def getRowsSinkerGenerator(blobOutputConf: BlobOutputConf, flagColumnIndex: Int) : SinkDelegate = {
val compressionTypeConf = blobOutputConf.compressionType
val formatConf = blobOutputConf.format
if(formatConf.isDefined && !formatConf.get.equalsIgnoreCase("json"))
throw new Error(s"Output format: ${formatConf.get} as specified in the config is not supported")
val outputFolders = blobOutputConf.groups.map{case(k,v)=>k->KeyVaultClient.resolveSecretIfAny(v.folder)}
(rowInfo: Row, rows: Seq[Row], outputPartitionTime: Timestamp, partitionId: Int, loggerSuffix: String) => {
val target = FileInternal.getInfoTargetTag(rowInfo)
if(compressionTypeConf.isDefined && !(compressionTypeConf.get.equalsIgnoreCase("gzip")|| compressionTypeConf.get.equalsIgnoreCase("none")|| compressionTypeConf.get.equals("")))
throw new Error(s"Output compressionType: ${compressionTypeConf.get} as specified in the config is not supported")
val compression = compressionTypeConf.getOrElse("gzip").equalsIgnoreCase("gzip")
sinkDataGroups(
rowInfo = rowInfo,
dataGenerator =
if(flagColumnIndex<0)
() => Map(DefaultOutputGroup -> rows.iterator.map(_.getString(1)))
else
() => rows.groupBy(_.getString(flagColumnIndex)).map { case (k, v) => k -> v.iterator.map(_.getString(1)) },
outputFolders = outputFolders.map{case (k,v) =>
k->generateOutputFolderPath(v, outputPartitionTime, Option(target))},
partitionId = partitionId,
compression = compression,
loggerSuffix = loggerSuffix
)
}
}
def getSinkOperator(dict: SettingDictionary, name: String): SinkOperator = {
val blobConf = BlobOutputSetting.buildBlobOutputConf(dict, name)
SinkOperator(
name = SinkName,
isEnabled = blobConf!=null,
flagColumnExprGenerator = () => blobConf.groupEvaluation.getOrElse(null),
generator = flagColumnIndex=>getRowsSinkerGenerator(blobConf, flagColumnIndex),
onBatch = (spark: SparkSession, outputPartitionTime: Timestamp, targets: Set[String]) => {
val logger = LogManager.getLogger(this.getClass)
val groups = blobConf.groups
val outputFolders = groups.filter(g=>g._2!=null && g._2.folder!=null)
.flatMap(g=>{
val actualFolder = KeyVaultClient.resolveSecretIfAny(g._2.folder)
if(targets!=null && targets.size>0)
targets.map(t=>generateOutputFolderPath(actualFolder, outputPartitionTime, Option(t)))
else
Seq(generateOutputFolderPath(actualFolder, outputPartitionTime, None))
})
.filter(_!=null)
.toSet
outputFolders.par.foreach(HadoopClient.createFolder)
logger.warn(s"Created folders at ------\n${outputFolders.mkString("\n")}")
}
)
}
override def getSettingNamespace(): String = BlobOutputSetting.Namespace
}


@ -0,0 +1,29 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink
import datax.client.cosmosdb.CosmosDBConf
import datax.config.SettingDictionary
import datax.securedsetting.KeyVaultClient
object CosmosDBOutputSetting {
val Namespace = "cosmosdb"
val SettingConnectionString = "connectionstring"
val SettingDatabase = "database"
val SettingCollection = "collection"
def buildCosmosDBOutputConf(dict: SettingDictionary, name: String): CosmosDBConf = {
KeyVaultClient.resolveSecretIfAny(dict.get(SettingConnectionString)) match {
case Some(connectionString) =>
CosmosDBConf(
connectionString = connectionString,
name = name,
database = dict.getOrNull(SettingDatabase),
collection = dict.getOrNull(SettingCollection)
)
case None => null
}
}
}


@ -0,0 +1,140 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink
import java.util.concurrent.ConcurrentHashMap
import com.microsoft.azure.documentdb._
import datax.client.cosmosdb.CosmosDBConf
import datax.config.SettingDictionary
import datax.exception.EngineException
import datax.securedsetting.KeyVaultClient
import datax.utility.ConverterUtil._
import datax.utility.SinkerUtil
import org.apache.log4j.LogManager
import org.apache.spark.sql.SparkSession
class CosmosDBSinker(key: String, conf: CosmosDBConf) {
private val logger = LogManager.getLogger(s"CosmosDBSinker-${key}")
private val client = CosmosDBSinkerManager.getClient(conf.connectionString)
logger.warn(s"Initialized")
private def getDatabase(databaseName: String) = {
val databases = client.queryDatabases(s"SELECT * FROM root r WHERE r.id='${databaseName}'", null)
.getQueryIterable().toList
if(databases.size()>0){
databases.get(0)
}
else{
try{
val definition = new Database()
definition.setId(databaseName)
client.createDatabase(definition, null).getResource()
}
catch {
case e: DocumentClientException => throw e
}
}
}
private val databaseLink = getDatabase(conf.database).getSelfLink
private def getCollection(collectionName: String) = {
val collections = client.queryCollections(databaseLink, s"SELECT * FROM root r WHERE r.id='$collectionName'", null)
.getQueryIterable().toList
if(collections.size()>0){
collections.get(0)
}
else{
try{
val definition = new DocumentCollection()
definition.setId(collectionName)
client.createCollection(databaseLink, definition, null).getResource()
}
catch {
case e: DocumentClientException => throw e
}
}
}
private val collectionLink = getCollection(conf.collection).getSelfLink
def createDocument(json: String) = {
val doc = new Document(json)
try{
client.createDocument(collectionLink, doc, null, false)
true
}
catch {
case e:DocumentClientException =>
throw e
}
}
private def createDocuments(jsons: Seq[String]) = {
jsons.foreach(createDocument(_))
}
}
object CosmosDBSinkerManager extends SinkOperatorFactory {
private val SinkName = "CosmosDB"
private val logger = LogManager.getLogger("CosmosDBSinkerManager")
private val pool = new ConcurrentHashMap[String, CosmosDBSinker]
private val clientPool = new ConcurrentHashMap[String, DocumentClient]
private val connectionStringRegex = "^AccountEndpoint=([^;]*);AccountKey=([^;]*);".r
def parseConnectionString(conn: String) = {
connectionStringRegex.findFirstMatchIn(conn).map(m=>(m.group(1), m.group(2)))
}
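// Example with fake credentials:
// parseConnectionString("AccountEndpoint=https://myaccount.documents.azure.com:443/;AccountKey=bXlLZXk=;")
// returns Some(("https://myaccount.documents.azure.com:443/", "bXlLZXk="))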
def getSinker(conf: CosmosDBConf) = {
val key = conf.name
pool.computeIfAbsent(key, (k: String) => new CosmosDBSinker(k, conf))
}
def getClient(connectionString: String) = {
parseConnectionString(connectionString) match {
case Some((serviceEndpoint, masterKey)) =>
clientPool.computeIfAbsent(serviceEndpoint, (k: String) => {
logger.warn(s"Create new client for serviceEndpoint:$k")
new DocumentClient(serviceEndpoint, masterKey, null, null)
})
case None =>
throw new EngineException(s"unexpected connection string:'${connectionString}'")
}
}
def getClient(serviceEndpoint: String, masterKey: String) = {
clientPool.computeIfAbsent(serviceEndpoint, (k: String) => {
logger.warn(s"Create new client for serviceEndpoint:$k")
new DocumentClient(serviceEndpoint, masterKey, null, null)
})
}
def getSinkOperator(dict: SettingDictionary, name: String) : SinkOperator = {
val conf = CosmosDBOutputSetting.buildCosmosDBOutputConf(dict, name)
SinkOperator(
name = SinkName,
isEnabled = conf!=null,
flagColumnExprGenerator = () => null,
generator = flagColumnIndex => SinkerUtil.outputGenerator(
(dataToSend:Seq[String],ls: String) => {
val cosmosDBSinker = CosmosDBSinkerManager.getSinker(conf)
dataToSend.count(d=>cosmosDBSinker.createDocument(d))
},
SinkName
)(flagColumnIndex),
onInitialization = (spark: SparkSession) => {
val logger = LogManager.getLogger(this.getClass)
CosmosDBSinkerManager.getSinker(conf)
logger.warn(s"initialize cosmos DB sinker destination at ${conf.name}")
}
)
}
override def getSettingNamespace(): String = CosmosDBOutputSetting.Namespace
}


@ -0,0 +1,46 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink
import datax.config.{SettingDictionary, SettingNamespace}
import datax.securedsetting.KeyVaultClient
object EventHubOutputSetting {
case class EventHubOutputConf(connectionString: String,
filter: String,
appendProperties: Map[String, String],
compressionType: String,
format: String)
val Namespace = "eventhub"
val SettingConnectionString = "connectionstring"
val SettingFilter = "filter"
val SettingCompressionType = "compressiontype"
val SettingFormat = "format"
val SettingAppendProperty = "appendproperty"
val AppendPropertyPrefix = SettingAppendProperty + SettingNamespace.Seperator
val FormatValueJson = "json"
val FormatValueDefault = FormatValueJson
val CompressionValueNone = "none"
val CompressionValueGZip = "gzip"
val CompressionValueDefault = CompressionValueGZip
def buildEventHubOutputConf(dict: SettingDictionary, name: String) = {
KeyVaultClient.resolveSecretIfAny(dict.get(SettingConnectionString)) match {
case Some(connectionString) =>
val properties = dict.getSubDictionary(AppendPropertyPrefix).getDictMap()
EventHubOutputConf(
connectionString = connectionString,
appendProperties = properties,
filter = dict.getOrNull(SettingFilter),
compressionType = dict.get(SettingCompressionType).getOrElse(CompressionValueDefault),
format = dict.get(SettingFormat).getOrElse(FormatValueDefault)
)
case None => null
}
}
}


@ -0,0 +1,81 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink
import datax.utility.{GZipHelper, SinkerUtil}
import datax.client.eventhub.{EventHubConf, EventHubSenderPool}
import datax.config.SettingDictionary
import datax.exception.EngineException
import datax.sink.EventHubOutputSetting.EventHubOutputConf
import org.apache.log4j.LogManager
object EventHubStreamPoster extends SinkOperatorFactory {
val SinkName = "EventHub"
def sendFilteredEvents(data: Seq[String],
outputEventhubConf: EventHubConf,
appendProperties: Map[String, String],
loggerSuffix: String,
compressionType: String): Int = {
val logger = LogManager.getLogger(s"FilteredEvent-Sender${loggerSuffix}")
val countEvents = data.length
val chunkSize = 200
if (countEvents > 0) {
val sender = EventHubSenderPool.getSender(outputEventhubConf)
var i = 0
data.grouped(chunkSize).foreach(events => {
val eventsSize = events.length
val json = events.mkString("\n")
val t1 = System.nanoTime()
val stage = s"[$i-${i + eventsSize}]/$countEvents"
if(compressionType.equalsIgnoreCase(EventHubOutputSetting.CompressionValueGZip)) {
val compressedJson = GZipHelper.deflateToBytes(json)
val elapsedTime = (System.nanoTime() - t1) / 1E9
logger.info(s"$stage: compressed filtered events, count=$eventsSize, json=${json.length} bytes, compressed=${compressedJson.length} bytes, spent time=$elapsedTime seconds")
sender.sendBytes(compressedJson, appendProperties)
}
else
{
val elapsedTime = (System.nanoTime() - t1) / 1E9
logger.info(s"$stage: filtered events, count=$eventsSize, json=${json.length} bytes, spent time=$elapsedTime seconds")
sender.sendBytes(json.getBytes(), appendProperties)
}
logger.info(s"$stage: done sending")
i += eventsSize
eventsSize
})
countEvents
}
else 0
}
  def getRowsSinkerGenerator(conf: EventHubOutputConf, flagColumnIndex: Int): SinkDelegate = {
    val format = conf.format
    if (!format.equalsIgnoreCase(EventHubOutputSetting.FormatValueJson))
      throw new EngineException(s"Eventhub: Output format: ${format} as specified in the config is not supported")

    val compressionType = conf.compressionType
    if (compressionType != EventHubOutputSetting.CompressionValueNone && compressionType != EventHubOutputSetting.CompressionValueGZip)
      throw new EngineException(s"EventHub: compressionType: ${compressionType} as specified in the config is not supported")

    val sender = (dataToSend: Seq[String], ls: String) => EventHubStreamPoster.sendFilteredEvents(dataToSend, EventHubConf(
      name = SinkerUtil.hashName(conf.connectionString),
      connectionString = conf.connectionString
    ), conf.appendProperties, ls, compressionType)

    SinkerUtil.outputGenerator(sender, SinkName)(flagColumnIndex)
  }

  def getSinkOperator(dict: SettingDictionary, name: String): SinkOperator = {
    val conf = EventHubOutputSetting.buildEventHubOutputConf(dict, name)
    SinkOperator(
      name = SinkName,
      isEnabled = conf != null,
      flagColumnExprGenerator = () => conf.filter,
      generator = flagColumnIndex => getRowsSinkerGenerator(conf, flagColumnIndex)
    )
  }

  override def getSettingNamespace(): String = EventHubOutputSetting.Namespace
}
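The compression branch above relies on GZipHelper.deflateToBytes from datax.utility, which is not shown in this excerpt of the commit. Below is a self-contained sketch of what such a helper presumably looks like, built only on java.util.zip; the object name and the sample payload are hypothetical, and the size comparison it prints mirrors the logging above.

import java.io.ByteArrayOutputStream
import java.nio.charset.StandardCharsets
import java.util.zip.GZIPOutputStream

// Minimal deflate-to-bytes sketch; the real GZipHelper lives in datax.utility.
object GZipSketch {
  def deflateToBytes(text: String): Array[Byte] = {
    val buffer = new ByteArrayOutputStream()
    val gzip = new GZIPOutputStream(buffer)
    try gzip.write(text.getBytes(StandardCharsets.UTF_8))
    finally gzip.close() // close() finishes the GZIP stream and writes the trailer
    buffer.toByteArray
  }

  def main(args: Array[String]): Unit = {
    val json = Seq.fill(200)("""{"event":"example"}""").mkString("\n")
    val compressed = deflateToBytes(json)
    println(s"json=${json.length} bytes, compressed=${compressed.length} bytes")
  }
}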


@@ -0,0 +1,30 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink

import datax.config.{SettingDictionary, SettingNamespace}

object HttpPostOutputSetting {
  case class HttpPostConf(endpoint: String, filter: String, appendHeaders: Option[Map[String, String]])

  val Namespace = "httppost"
  val SettingEndpoint = "endpoint"
  val SettingFilter = "filter"
  val SettingHeader = "header"
  val AppendHeaderPrefix = SettingHeader + SettingNamespace.Seperator

  def getHttpPostConf(dict: SettingDictionary, name: String) = {
    if (dict.size > 0) {
      val headers = dict.getSubDictionary(AppendHeaderPrefix).getDictMap()
      HttpPostConf(
        endpoint = dict.getOrNull(SettingEndpoint),
        filter = dict.getOrNull(SettingFilter),
        appendHeaders = Option(headers)
      )
    }
    else
      null
  }
}


@@ -0,0 +1,84 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink

import datax.config.SettingDictionary
import datax.sink.HttpPostOutputSetting.HttpPostConf
import datax.utility.SinkerUtil
import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.{BasicResponseHandler, HttpClients}
import org.apache.log4j.LogManager

object HttpPoster extends SinkOperatorFactory {
  val SinkName = "HttpPost"

  def getHttpClient() = {
    val requestConfig = RequestConfig.custom().setConnectionRequestTimeout(5000).setSocketTimeout(5000).build()
    HttpClients.custom().setDefaultRequestConfig(requestConfig).build()
  }
  def postEvents(data: Seq[String], httpEndpoint: String, headers: Option[Map[String, String]], loggerSuffix: String): Int = {
    val logger = LogManager.getLogger(s"HttpPoster${loggerSuffix}")
    val countEvents = data.length
    val chunkSize = 200
    if (countEvents > 0) {
      val client = getHttpClient()
      val handler = new BasicResponseHandler
      var i = 0
      data.grouped(chunkSize).foreach(events => {
        val eventsSize = events.length
        val json = "[" + events.mkString(",") + "]"
        val t1 = System.nanoTime()
        val stage = s"[$i-${i + eventsSize}]/$countEvents"
        // post the JSON batch to the configured endpoint
        try {
          val post = new HttpPost(httpEndpoint)
          if (headers.isDefined) {
            headers.get.foreach(h => post.addHeader(h._1, h._2))
          }
          post.setEntity(new StringEntity(json))
          val response = client.execute(post)
          val body = handler.handleResponse(response)
          logger.info(s"$stage is sent: ${body}, spent time=${(System.nanoTime() - t1) / 1E9} seconds")
        }
        catch {
          case e: Exception =>
            logger.error(s"$stage: failed", e)
        }
        i += eventsSize
      })
      client.close()
      countEvents
    }
    else 0
  }
  def getRowsSinkerGenerator(httpPostConf: HttpPostConf, flagColumnIndex: Int): SinkDelegate = {
    val sender = (dataToSend: Seq[String], ls: String) => HttpPoster.postEvents(dataToSend, httpPostConf.endpoint, httpPostConf.appendHeaders, ls)
    SinkerUtil.outputGenerator(sender, SinkName)(flagColumnIndex)
  }

  def getSinkOperator(dict: SettingDictionary, name: String): SinkOperator = {
    val conf = HttpPostOutputSetting.getHttpPostConf(dict, name)
    SinkOperator(
      name = SinkName,
      isEnabled = conf != null,
      flagColumnExprGenerator = () => conf.filter,
      generator = (flagColumnIndex) => getRowsSinkerGenerator(conf, flagColumnIndex)
    )
  }

  override def getSettingNamespace(): String = HttpPostOutputSetting.Namespace
}
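The posting path above can be exercised in isolation with the same Apache HttpClient calls used by getHttpClient and postEvents. The standalone sketch below sends one JSON batch to a placeholder endpoint; the object name, endpoint, and sample events are hypothetical.

import org.apache.http.client.config.RequestConfig
import org.apache.http.client.methods.HttpPost
import org.apache.http.entity.StringEntity
import org.apache.http.impl.client.{BasicResponseHandler, HttpClients}

// Build one client, post one JSON array of events, then close the client.
object HttpPostSketch {
  def main(args: Array[String]): Unit = {
    val endpoint = "https://example.invalid/ingest" // placeholder endpoint
    val events = Seq("""{"id":1}""", """{"id":2}""")

    val requestConfig = RequestConfig.custom().setConnectionRequestTimeout(5000).setSocketTimeout(5000).build()
    val client = HttpClients.custom().setDefaultRequestConfig(requestConfig).build()
    try {
      val post = new HttpPost(endpoint)
      post.addHeader("Content-Type", "application/json")
      post.setEntity(new StringEntity("[" + events.mkString(",") + "]"))
      val body = new BasicResponseHandler().handleResponse(client.execute(post))
      println(s"response: $body")
    } finally client.close()
  }
}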


@@ -0,0 +1,154 @@
// *********************************************************************
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License
// *********************************************************************
package datax.sink

import java.sql.Timestamp

import datax.config.{SettingDictionary, SettingNamespace, SparkEnvVariables}
import datax.constants.{ColumnName, MetricName, ProductConstant}
import datax.data.FileInternal
import datax.exception.EngineException
import datax.fs.HadoopClient
import datax.utility.{DataMerger, DataNormalization, SinkerUtil}
import org.apache.log4j.LogManager
import org.apache.spark.TaskContext
import org.apache.spark.sql.functions.{col, struct, to_json}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

object OutputManager {
  val NamespacePrefix = SettingNamespace.JobOutputPrefix
  val SettingOutputProcessedSchemaPath = "processedschemapath"

  val sinkFactories = Seq[SinkOperatorFactory](
    BlobSinker, EventHubStreamPoster, HttpPoster, CosmosDBSinkerManager
  ).map(f => f.getSettingNamespace() -> f).toMap

  def getOperatators(dict: SettingDictionary): Seq[OutputOperator] = {
    dict.groupBySubNamespace(NamespacePrefix)
      .map(g => generateOperator(g._2, g._1))
      .toSeq
  }

  def outputResultReducer = (s1: (Int, Map[String, Int]), s2: (Int, Map[String, Int])) => (s1._1 + s2._1, DataMerger.mergeMapOfCounts(s1._2, s2._2))
  def generateOperator(dict: SettingDictionary, name: String) = {
    val logger = LogManager.getLogger("OutputOperatorsBuilder")
    var flagColumnIndex = 1
    val processedSchemaPath = dict.get(SettingOutputProcessedSchemaPath).orNull
    val sinkOperators = dict
      .groupBySubNamespace()
      .map { case (k, v) => sinkFactories.get(k).map(_.getSinkOperator(v, k)) }
      .filter(o => o match {
        case Some(oper) =>
          logger.info(s"Output '$name':${oper.name} is ${SinkerUtil.boolToOnOff(oper.isEnabled)}")
          oper.isEnabled
        case None => false
      }).map(o => {
        val oper = o.get
        val flagColumnExpr = oper.flagColumnExprGenerator()
        if (flagColumnExpr == null) {
          logger.warn(s"Output type:'${oper.name}': no flag column")
          (oper.name, (null, null), oper.generator(-1), oper.onBatch, oper.onInitialization)
        }
        else {
          val appendColumn = (flagColumnExpr, s"_${ProductConstant.ProductOutputFilter}_${oper.name}")
          logger.warn(s"Output type:'${oper.name}': append column:$appendColumn")
          flagColumnIndex += 1
          (oper.name, appendColumn, oper.generator(flagColumnIndex), oper.onBatch, oper.onInitialization)
        }
      }).toSeq

    if (sinkOperators.length == 0) throw new EngineException(s"no sink is defined for output '$name'!")
    logger.warn(s"Output '$name' to ${sinkOperators.length} sinkers: ${sinkOperators.map(s => s"'${s._1}'").mkString(",")}")

    val flagColumns = sinkOperators.map(_._2).filter(_._1 != null)
    val sinkers = sinkOperators.map(o => o._1 -> o._3).toMap
    val onBatchHandlers = sinkOperators.map(_._4).filter(_ != null)
    val onInitHandlers = sinkOperators.map(_._5).filter(_ != null)
    var shouldGenerateProcessedSchema = processedSchemaPath != null && !processedSchemaPath.isEmpty

    OutputOperator(
      name = name,
      onInitialization = if (onInitHandlers.size > 0) {
        (spark: SparkSession) => for (elem <- onInitHandlers) { elem(spark) }
      } else null,
      onBatch = if (onBatchHandlers.size > 0) {
        (spark: SparkSession, time: Timestamp, targets: Set[String]) => onBatchHandlers.foreach(_(spark, time, targets))
      } else null,
      output = (df: DataFrame, partitionTime: Timestamp) => {
        val outputLogger = LogManager.getLogger(s"Output-${name}")
        val outputColumns = df.schema.filterNot(_.name.startsWith(ColumnName.InternalColumnPrefix)).toArray
        if (shouldGenerateProcessedSchema) {
          val spark = df.sparkSession
          spark.synchronized {
            if (shouldGenerateProcessedSchema) {
              HadoopClient.writeHdfsFile(processedSchemaPath, new StructType(outputColumns).prettyJson, true)
              outputLogger.warn(s"Saved processed schema to $processedSchemaPath")
              shouldGenerateProcessedSchema = false
            }
          }
        }
        val outputColumnNames = outputColumns.map(c => DataNormalization.sanitizeColumnName(c.name))
        outputLogger.warn(s"Output fields: ${outputColumnNames.mkString(",")}")
        sink(df, outputColumnNames, partitionTime, flagColumns, sinkers)
      }
    )
  }
  def sink(df: DataFrame,
           outputFieldNames: Seq[String],
           partitionTime: Timestamp,
           flagColumns: Seq[(String, String)],
           outputOperators: Map[String, SinkDelegate]) = {
    val amendDf =
      if (df.schema.fieldNames.contains(ColumnName.InternalColumnFileInfo)) df
      else df.withColumn(ColumnName.InternalColumnFileInfo, FileInternal.udfEmptyInternalInfo())

    val query = amendDf.selectExpr("*" +: flagColumns.map(c => c._1 + " AS " + c._2): _*)
      .select(Seq(col(ColumnName.InternalColumnFileInfo), to_json(struct(outputFieldNames.map(col): _*))) ++
        flagColumns.map(_._2).map(col): _*)

    // query.explain(true) would dump the SQL execution plan to stdout
    // query.explain(true)

    query
      .rdd
      .mapPartitions(it => {
        val partitionId = TaskContext.getPartitionId()
        val loggerSuffix = SparkEnvVariables.getLoggerSuffix()
        val logger = LogManager.getLogger(s"EventsSinker${loggerSuffix}")
        // val path = outputFileFolder+"/part-"+tc.partitionId().toString + ".json.gz"

        val t1 = System.nanoTime()
        var timeLast = t1
        var timeNow = t1
        logger.info(s"$timeNow:Partition started")

        val dataAll = it.toArray
        val count = dataAll.length
        timeNow = System.nanoTime()
        logger.info(s"$timeNow:Collected $count events, spent time=${(timeNow - timeLast) / 1E9} seconds")
        timeLast = timeNow

        val inputMetric = Map(s"${MetricName.MetricSinkPrefix}InputEvents" -> count)
        Seq(if (count > 0) {
          val rowInfo = dataAll(0).getAs[Row](0)
          if (outputOperators.size == 0)
            throw new EngineException("no output operators are found!")
          outputOperators
            .par
            .map(_._2(rowInfo, dataAll, partitionTime, partitionId, loggerSuffix))
            .reduce(DataMerger.mergeMapOfCounts) ++ inputMetric
        }
        else
          inputMetric
        ).iterator
      })
      .reduce(DataMerger.mergeMapOfCounts)
  }
}
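Both outputResultReducer and the partition-level reduce above rely on DataMerger.mergeMapOfCounts from datax.utility, which is not shown in this commit excerpt; the code appears to assume it sums two count maps per key. A hypothetical stand-in illustrating that assumed behavior, with placeholder metric keys, is sketched below.

// Hypothetical stand-in for DataMerger.mergeMapOfCounts; names and metric keys are placeholders.
object MergeCountsSketch {
  def mergeMapOfCounts(m1: Map[String, Int], m2: Map[String, Int]): Map[String, Int] =
    (m1.keySet ++ m2.keySet).map(k => k -> (m1.getOrElse(k, 0) + m2.getOrElse(k, 0))).toMap

  def main(args: Array[String]): Unit = {
    val partition1 = Map("Sink_InputEvents" -> 120, "Sink_EventHub" -> 120)
    val partition2 = Map("Sink_InputEvents" -> 80, "Sink_HttpPost" -> 80)
    // counts shared between partitions are summed; sink-specific counts are kept per key
    println(mergeMapOfCounts(partition1, partition2))
  }
}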
