Codebase cleanup. Details in description. (#174)

Sabee Grewal 2017-10-16 18:45:51 -07:00 committed by SJ
Parent 68bd06de72
Commit 60931e35e4
62 changed files with 2855 additions and 5140 deletions

2
.gitignore vendored
View file

@ -5,6 +5,6 @@
.idea/
target/
*.iml
scalastyle-output.xml
scalafmt-output.xml
dependency-reduced-pom.xml
metastore_db

11
.scalafmt.conf Normal file
View file

@ -0,0 +1,11 @@
maxColumn = 100
project.git = true
project.excludeFilters = []
# http://docs.scala-lang.org/style/scaladoc.html recommends the JavaDoc style.
# scala/scala is written that way too https://github.com/scala/scala/blob/v2.12.2/src/library/scala/Predef.scala
docstrings = JavaDoc
# This also seems more idiomatic to include whitespace in import x.{ yyy }
spaces.inImportCurlyBraces = true
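
For illustration only (not taken from the repository), the two commented options above affect Scala source like this:

// spaces.inImportCurlyBraces = true formats selector imports with inner spaces:
import scala.collection.mutable.{ ListBuffer, HashMap }

// docstrings = JavaDoc aligns scaladoc continuation asterisks under the first '*' of "/**":
/**
 * A JavaDoc-style scaladoc block; each continuation line starts with " * ".
 */
class Example {
  val names = new ListBuffer[String]
  val index = new HashMap[String, Int]
}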

View file

@ -1,13 +1,18 @@
language: scala
scala:
- 2.11.8
- 2.11.8
jdk:
- oraclejdk8
- oraclejdk8
script:
- ./run_tests.sh
- ./run_tests.sh
branches:
except:
- maven-repo
only:
- master
- 2.1.x
- 2.0.x
- dev
except:
- maven-repo

View file

@ -7,6 +7,7 @@
|Branch|Status|
|------|-------------|
|master|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=master)](https://travis-ci.org/Azure/spark-eventhubs)|
|dev|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=dev)](https://travis-ci.org/Azure/spark-eventhubs)|
|2.1.x|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=2.1.x)](https://travis-ci.org/Azure/spark-eventhubs)|
|2.0.x|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=2.0.x)](https://travis-ci.org/Azure/spark-eventhubs)|

View file

@ -1,116 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.eventhubscommon
import java.util.concurrent.Executors
import scala.concurrent.ExecutionContext
import scala.language.implicitConversions
import com.google.common.util.concurrent.ThreadFactoryBuilder
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.eventhubs.EventHubsUtils
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
/**
* Import the members of this object to enable the use of the unionedEventhubStream and
* eventhubStream methods on the StreamingContext instead of the EventHubsUtils class.
*/
private[eventhubscommon] object Implicits {
// will be used to execute requests to EventHub
private[spark] implicit val exec = {
val tp = new ThreadFactoryBuilder().setDaemon(true).setNameFormat("restclientthread" + "-%d").
build()
ExecutionContext.fromExecutor(Executors.newCachedThreadPool(tp))
}
/**
* Converts the StreamingContext into an EventHub enabled streaming context
*
* @param streamingContext Streaming context to convert
* @return Returns the Azure EventHub enabled StreamingContext
*/
implicit def eventHubContext(streamingContext: StreamingContext): SparkEventHubContext =
new SparkEventHubContext(streamingContext)
/**
* Azure EventHub enabled streaming context
*/
class SparkEventHubContext(ssc: StreamingContext) {
// scalastyle:off
/**
* Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
* The unioned stream will receive message from all partitions of the EventHubs
*
* @param eventhubsParams a Map that contains parameters for EventHubs.
* Required parameters are:
* "eventhubs.policyname": EventHubs policy name
* "eventhubs.policykey": EventHubs policy key
* "eventhubs.namespace": EventHubs namespace
* "eventhubs.name": EventHubs name
* "eventhubs.partition.count": Number of partitions
* "eventhubs.checkpoint.dir": checkpoint directory on HDFS
*
* Optional parameters are:
* "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
* "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
* "eventhubs.filter.enqueuetime": Unix time, seconds since epoch, default to "0"
* "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
* "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
* @param storageLevel Storage level, by default it is MEMORY_ONLY
* @return ReceiverInputStream
*/
// scalastyle:on
def unionedEventHubStream(
eventhubsParams: Map[String, String],
storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): DStream[Array[Byte]] = {
EventHubsUtils.createUnionStream(ssc, eventhubsParams, storageLevel)
}
// scalastyle:off
/**
* Create a single EventHubs stream that receives data from Microsoft Azure EventHubs
* A single stream only receives message from one EventHubs partition
*
* @param eventhubsParams a Map that contains parameters for EventHubs. Same as above.
* @param partitionId Partition ID
* @param storageLevel Storage level
* @param offsetStore Offset store implementation, defaults to DFSBasedOffsetStore
* @param receiverClient the EventHubs client implementation, defaults to EventHubsClientWrapper
* @return ReceiverInputStream
*/
// scalastyle:on
def eventHubStream(
eventhubsParams: Map[String, String],
partitionId: String,
storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
offsetStore: OffsetStore = null,
receiverClient: EventHubsClientWrapper = new EventHubsClientWrapper):
DStream[Array[Byte]] = {
EventHubsUtils.createStream(ssc, eventhubsParams, partitionId, storageLevel, offsetStore,
receiverClient)
}
}
}
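
A hedged sketch of how the removed implicit conversion was meant to be used, per the scaladoc above. Note that the object is private[eventhubscommon], so the import only resolves inside that package; every parameter value below is a placeholder.

import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.eventhubscommon.Implicits._

// Placeholder values; keys follow the scaladoc of the removed unionedEventHubStream.
val eventhubsParams: Map[String, String] = Map(
  "eventhubs.policyname"      -> "listen-policy",         // required
  "eventhubs.policykey"       -> "<policy-key>",          // required
  "eventhubs.namespace"       -> "my-namespace",          // required
  "eventhubs.name"            -> "my-eventhub",           // required
  "eventhubs.partition.count" -> "4",                     // required
  "eventhubs.checkpoint.dir"  -> "/eventhubs/checkpoint", // required
  "eventhubs.consumergroup"   -> "$default")              // optional

def byteStream(ssc: StreamingContext): DStream[Array[Byte]] =
  ssc.unionedEventHubStream(eventhubsParams) // enabled by the eventHubContext implicit above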

View file

@ -20,6 +20,5 @@ package org.apache.spark.eventhubscommon
/**
* this class represents the in-memory offset record hold by [[EventHubsConnector]]s
*/
private[spark] case class OffsetRecord(
timestamp: Long,
offsets: Map[EventHubNameAndPartition, (Long, Long)])
private[spark] case class OffsetRecord(timestamp: Long,
offsets: Map[EventHubNameAndPartition, (Long, Long)])

View file

@ -17,29 +17,36 @@
package org.apache.spark.eventhubscommon
import org.apache.spark.eventhubscommon.client.{EventHubClient, EventHubsClientWrapper, EventHubsOffsetTypes}
import org.apache.spark.eventhubscommon.client.{
Client,
EventHubsClientWrapper,
EventHubsOffsetTypes
}
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.internal.Logging
private[spark] object RateControlUtils extends Logging {
private def maxRateLimitPerPartition(
eventHubName: String,
eventhubsParams: Map[String, _]): Int = {
private def maxRateLimitPerPartition(eventHubName: String,
eventhubsParams: Map[String, _]): Int = {
val maxRate = eventhubsParams.get(eventHubName) match {
case Some(eventHubsConfigEntries) =>
// this part shall be called by direct dstream where the parameters are indexed by eventhubs
// names
eventHubsConfigEntries.asInstanceOf[Map[String, String]].
getOrElse("eventhubs.maxRate", "10000").toInt
eventHubsConfigEntries
.asInstanceOf[Map[String, String]]
.getOrElse("eventhubs.maxRate", "10000")
.toInt
case None =>
// this is called by structured streaming where eventhubsParams only contains the parameters
// for a single eventhubs instance
eventhubsParams.asInstanceOf[Map[String, String]].
getOrElse("eventhubs.maxRate", "10000").toInt
eventhubsParams
.asInstanceOf[Map[String, String]]
.getOrElse("eventhubs.maxRate", "10000")
.toInt
}
require(maxRate > 0,
s"eventhubs.maxRate has to be larger than zero, violated by $eventHubName ($maxRate)")
s"eventhubs.maxRate has to be larger than zero, violated by $eventHubName ($maxRate)")
maxRate
}
@ -55,28 +62,28 @@ private[spark] object RateControlUtils extends Logging {
eventhubsParams: Map[String, _]): Map[EventHubNameAndPartition, Long] = {
highestEndpoints.map {
case (eventHubNameAndPar, (_, latestSeq)) =>
val maximumAllowedMessageCnt = maxRateLimitPerPartition(
eventHubNameAndPar.eventHubName, eventhubsParams)
val endSeq = math.min(latestSeq,
maximumAllowedMessageCnt + currentOffsetsAndSeqNums(eventHubNameAndPar)._2)
val maximumAllowedMessageCnt =
maxRateLimitPerPartition(eventHubNameAndPar.eventHubName, eventhubsParams)
val endSeq =
math.min(latestSeq,
maximumAllowedMessageCnt + currentOffsetsAndSeqNums(eventHubNameAndPar)._2)
(eventHubNameAndPar, endSeq)
}
}
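
In effect, each partition's end sequence number for the next batch is capped at its current sequence number plus eventhubs.maxRate. A standalone sketch with made-up numbers:

// Made-up numbers, mirroring the clamp computed above.
val maxRate    = 10000L  // per-partition "eventhubs.maxRate" (default "10000")
val currentSeq = 250000L // sequence number already processed for this partition
val latestSeq  = 275000L // highest sequence number currently available in the partition
val endSeq     = math.min(latestSeq, currentSeq + maxRate) // 260000: at most maxRate events per batch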
private[spark] def clamp(
currentOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)],
highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)],
eventhubsParams: Map[String, _]): Map[EventHubNameAndPartition, Long] = {
private[spark] def clamp(currentOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)],
highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)],
eventhubsParams: Map[String, _]): Map[EventHubNameAndPartition, Long] = {
defaultRateControl(currentOffsetsAndSeqNums, highestEndpoints, eventhubsParams)
}
private[spark] def fetchLatestOffset(
eventHubClient: EventHubClient,
eventHubClient: Client,
retryIfFail: Boolean,
fetchedHighestOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)]):
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
val r = eventHubClient.endPointOfPartition(
retryIfFail, fetchedHighestOffsetsAndSeqNums.keySet.toList)
fetchedHighestOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)])
: Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
val r =
eventHubClient.endPointOfPartition(retryIfFail, fetchedHighestOffsetsAndSeqNums.keySet.toList)
if (r.isDefined) {
// merge results
val mergedOffsets = if (fetchedHighestOffsetsAndSeqNums != null) {
@ -91,35 +98,44 @@ private[spark] object RateControlUtils extends Logging {
}
private[spark] def validateFilteringParams(
eventHubsClient: EventHubClient,
eventHubsClient: Client,
eventhubsParams: Map[String, _],
ehNameAndPartitions: List[EventHubNameAndPartition]): Unit = {
// first check if the parameters are valid
val latestEnqueueTimeOfPartitions = eventHubsClient.lastEnqueueTimeOfPartitions(
retryIfFail = true, ehNameAndPartitions)
require(latestEnqueueTimeOfPartitions.isDefined, "cannot get latest enqueue time from Event" +
" Hubs Rest Endpoint")
val latestEnqueueTimeOfPartitions =
eventHubsClient.lastEnqueueTimeOfPartitions(retryIfFail = true, ehNameAndPartitions)
require(latestEnqueueTimeOfPartitions.isDefined,
"cannot get latest enqueue time from Event" +
" Hubs Rest Endpoint")
latestEnqueueTimeOfPartitions.get.foreach {
case (ehNameAndPartition, latestEnqueueTime) =>
val passInEnqueueTime = eventhubsParams.get(ehNameAndPartition.eventHubName) match {
case Some(ehParams) =>
ehParams.asInstanceOf[Map[String, String]].getOrElse(
"eventhubs.filter.enqueuetime", Long.MinValue.toString).toLong
ehParams
.asInstanceOf[Map[String, String]]
.getOrElse("eventhubs.filter.enqueuetime", Long.MinValue.toString)
.toLong
case None =>
eventhubsParams.asInstanceOf[Map[String, String]].getOrElse(
"eventhubs.filter.enqueuetime", Long.MinValue.toString).toLong
eventhubsParams
.asInstanceOf[Map[String, String]]
.getOrElse("eventhubs.filter.enqueuetime", Long.MinValue.toString)
.toLong
}
require(latestEnqueueTime >= passInEnqueueTime,
require(
latestEnqueueTime >= passInEnqueueTime,
"you cannot pass in an enqueue time which is later than the highest enqueue time in" +
s" event hubs, ($ehNameAndPartition, pass-in-enqueuetime $passInEnqueueTime," +
s" latest-enqueuetime $latestEnqueueTime)")
s" latest-enqueuetime $latestEnqueueTime)"
)
}
}
private[spark] def composeFromOffsetWithFilteringParams(
eventhubsParams: Map[String, _],
fetchedStartOffsetsInNextBatch: Map[EventHubNameAndPartition, (Long, Long)]):
Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)] = {
fetchedStartOffsetsInNextBatch: Map[EventHubNameAndPartition, (Long, Long)])
: Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)] = {
fetchedStartOffsetsInNextBatch.map {
case (ehNameAndPartition, (offset, seq)) =>
val (offsetType, offsetStr) = EventHubsClientWrapper.configureStartOffset(
@ -129,7 +145,8 @@ private[spark] object RateControlUtils extends Logging {
ehConfig.asInstanceOf[Map[String, String]]
case None =>
eventhubsParams.asInstanceOf[Map[String, String]]
})
}
)
(ehNameAndPartition, (offsetType, offsetStr.toLong))
}
}
@ -137,8 +154,8 @@ private[spark] object RateControlUtils extends Logging {
private[spark] def calculateStartOffset(
ehNameAndPartition: EventHubNameAndPartition,
filteringOffsetAndType: Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)],
startOffsetInNextBatch: Map[EventHubNameAndPartition, (Long, Long)]):
(EventHubsOffsetType, Long) = {
startOffsetInNextBatch: Map[EventHubNameAndPartition, (Long, Long)])
: (EventHubsOffsetType, Long) = {
filteringOffsetAndType.getOrElse(
ehNameAndPartition,
(EventHubsOffsetTypes.PreviousCheckpoint, startOffsetInNextBatch(ehNameAndPartition)._1)

View file

@ -18,27 +18,19 @@
package org.apache.spark.eventhubscommon.client
import scala.collection.mutable
import com.microsoft.azure.eventhubs.{EventHubClient => AzureEventHubClient, EventHubPartitionRuntimeInformation}
import com.microsoft.azure.eventhubs.{ EventHubClient, EventHubPartitionRuntimeInformation }
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.internal.Logging
private[client] class AMQPEventHubsClient(
eventHubNamespace: String,
eventHubsNames: List[String],
ehParams: Map[String, Map[String, String]]) extends EventHubClient with Logging {
private[client] class AMQPEventHubsClient(ehNames: List[String],
ehParams: Map[String, Map[String, String]])
extends Client
with Logging {
private val ehNameToClient = new mutable.HashMap[String, AzureEventHubClient]
init()
private def init(): Unit = {
for (ehName <- eventHubsNames) {
ehNameToClient += ehName ->
new EventHubsClientWrapper().createClient(ehParams(ehName))
}
}
private val nameToClient = new mutable.HashMap[String, EventHubClient]
for (ehName <- ehNames)
nameToClient += ehName -> new EventHubsClientWrapper(ehParams(ehName))
.createClient(ehParams(ehName))
private def getRunTimeInfoOfPartitions(
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]) = {
@ -47,9 +39,10 @@ private[client] class AMQPEventHubsClient(
for (ehNameAndPartition <- targetEventHubNameAndPartitions) {
val ehName = ehNameAndPartition.eventHubName
val partitionId = ehNameAndPartition.partitionId
val client = ehNameToClient.get(ehName)
val client = nameToClient.get(ehName)
require(client.isDefined, "cannot find client for EventHubs instance " + ehName)
val runTimeInfo = client.get.getPartitionRuntimeInformation(partitionId.toString).get()
val runTimeInfo =
client.get.getPartitionRuntimeInformation(partitionId.toString).get()
results += ehNameAndPartition -> runTimeInfo
}
results.toMap.view
@ -65,15 +58,16 @@ private[client] class AMQPEventHubsClient(
*
* @return a map from eventhubName-partition to (offset, seq)
*/
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
override def endPointOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
try {
val runtimeInformation = getRunTimeInfoOfPartitions(targetEventHubNameAndPartitions)
Some(runtimeInformation.map{case (ehNameAndPartition, runTimeInfo) =>
(ehNameAndPartition, (runTimeInfo.getLastEnqueuedOffset.toLong,
runTimeInfo.getLastEnqueuedSequenceNumber))}.toMap)
Some(runtimeInformation.map {
case (ehNameAndPartition, runTimeInfo) =>
(ehNameAndPartition,
(runTimeInfo.getLastEnqueuedOffset.toLong, runTimeInfo.getLastEnqueuedSequenceNumber))
}.toMap)
} catch {
case e: Exception =>
e.printStackTrace()
@ -88,12 +82,14 @@ private[client] class AMQPEventHubsClient(
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
try {
val runtimeInformation = getRunTimeInfoOfPartitions(targetEventHubNameAndPartitions)
Some(runtimeInformation.map{case (ehNameAndPartition, runTimeInfo) =>
(ehNameAndPartition, runTimeInfo.getLastEnqueuedTimeUtc.getEpochSecond)}.toMap)
Some(runtimeInformation.map {
case (ehNameAndPartition, runTimeInfo) =>
(ehNameAndPartition, runTimeInfo.getLastEnqueuedTimeUtc.getEpochSecond)
}.toMap)
} catch {
case e: Exception =>
e.printStackTrace()
@ -106,14 +102,15 @@ private[client] class AMQPEventHubsClient(
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
override def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
try {
val runtimeInformation = getRunTimeInfoOfPartitions(targetEventHubNameAndPartitions)
Some(runtimeInformation.map{case (ehNameAndPartition, runTimeInfo) =>
(ehNameAndPartition, runTimeInfo.getBeginSequenceNumber)}.toMap)
Some(runtimeInformation.map {
case (ehNameAndPartition, runTimeInfo) =>
(ehNameAndPartition, runTimeInfo.getBeginSequenceNumber)
}.toMap)
} catch {
case e: Exception =>
e.printStackTrace()
@ -126,18 +123,15 @@ private[client] class AMQPEventHubsClient(
*/
override def close(): Unit = {
logInfo("close: Closing AMQPEventHubClient.")
for ((_, ehClient) <- ehNameToClient) {
for ((_, ehClient) <- nameToClient) {
ehClient.closeSync()
}
}
}
private[spark] object AMQPEventHubsClient {
def getInstance(eventHubsNamespace: String, eventhubsParams: Map[String, Map[String, String]]):
AMQPEventHubsClient = {
new AMQPEventHubsClient(eventHubsNamespace, eventhubsParams.keys.toList, eventhubsParams)
def getInstance(eventHubsNamespace: String,
eventhubsParams: Map[String, Map[String, String]]): AMQPEventHubsClient = {
new AMQPEventHubsClient(eventhubsParams.keys.toList, eventhubsParams)
}
}

View file

@ -19,35 +19,31 @@ package org.apache.spark.eventhubscommon.client
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
private[spark] trait EventHubClient extends Serializable {
private[spark] trait Client extends Serializable {
/**
* return the start seq number of each partition
* @return a map from eventhubName-partition to seq
*/
def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
Option[Map[EventHubNameAndPartition, Long]]
def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List())
: Option[Map[EventHubNameAndPartition, Long]]
/**
* return the end point of each partition
* @return a map from eventhubName-partition to (offset, seq)
*/
def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
Option[Map[EventHubNameAndPartition, (Long, Long)]]
def endPointOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List())
: Option[Map[EventHubNameAndPartition, (Long, Long)]]
/**
* return the last enqueueTime of each partition
* @return a map from eventHubsNamePartition to EnqueueTime
*/
def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]]
def lastEnqueueTimeOfPartitions(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]]
/**
* close this client

View file

@ -19,11 +19,8 @@ package org.apache.spark.eventhubscommon.client
import java.time.Instant
import scala.collection.JavaConverters._
import EventHubsOffsetTypes.EventHubsOffsetType
import com.microsoft.azure.eventhubs.{EventHubClient => AzureEventHubClient, _}
import org.apache.spark.{SparkEnv, TaskContext}
import com.microsoft.azure.eventhubs._
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.internal.Logging
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
@ -32,49 +29,37 @@ import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
* Wraps a raw EventHubReceiver to make it easier for unit tests
*/
@SerialVersionUID(1L)
private[spark] class EventHubsClientWrapper extends Serializable with EventHubClient with Logging {
private[spark] class EventHubsClientWrapper(
ehParams: Map[String, String]
) extends Serializable
with Client
with Logging {
var eventhubsClient: AzureEventHubClient = _
private val MINIMUM_PREFETCH_COUNT: Int = 10
private var MAXIMUM_PREFETCH_COUNT: Int = 999
private var MAXIMUM_EVENT_RATE: Int = 0
private val DEFAULT_RECEIVER_EPOCH = -1L
// TODO: the design of this class is not simple enough
// ideally, we shall not require the user to explicitly call createReceiver first
// and then call receive
// we shall let the user pass parameters in the constructor directly
private val ehNamespace = ehParams("eventhubs.namespace").toString
private val ehName = ehParams("eventhubs.name").toString
private val ehPolicyName = ehParams("eventhubs.policyname").toString
private val ehPolicy = ehParams("eventhubs.policykey").toString
private def configureGeneralParameters(eventhubsParams: Predef.Map[String, String]) = {
if (eventhubsParams.contains("eventhubs.uri") &&
eventhubsParams.contains("eventhubs.namespace")) {
throw new IllegalArgumentException(s"Eventhubs URI and namespace cannot both be specified" +
s" at the same time.")
}
private val connectionString =
new ConnectionStringBuilder(ehNamespace, ehName, ehPolicyName, ehPolicy).toString
private val consumerGroup = ehParams
.getOrElse("eventhubs.consumergroup", EventHubClient.DEFAULT_CONSUMER_GROUP_NAME)
.toString
private val receiverEpoch = ehParams
.getOrElse("eventhubs.epoch", DEFAULT_RECEIVER_EPOCH.toString)
.toString
.toLong
val namespaceName = if (eventhubsParams.contains("eventhubs.namespace")) {
eventhubsParams.get("eventhubs.namespace")
} else {
eventhubsParams.get("eventhubs.uri")
}
if (namespaceName.isEmpty) {
throw new IllegalArgumentException(s"Either Eventhubs URI or namespace nust be" +
s" specified.")
}
// TODO: validate inputs
val evhName = eventhubsParams("eventhubs.name")
val evhPolicyName = eventhubsParams("eventhubs.policyname")
val evhPolicyKey = eventhubsParams("eventhubs.policykey")
val connectionString = new ConnectionStringBuilder(namespaceName.get, evhName, evhPolicyName,
evhPolicyKey)
// Set the consumer group if specified.
val consumerGroup = eventhubsParams.getOrElse("eventhubs.consumergroup",
AzureEventHubClient.DEFAULT_CONSUMER_GROUP_NAME)
// Set the epoch if specified
val receiverEpoch = eventhubsParams.getOrElse("eventhubs.epoch",
DEFAULT_RECEIVER_EPOCH.toString).toLong
(connectionString, consumerGroup, receiverEpoch)
}
var eventhubsClient: EventHubClient = _
private var eventhubsReceiver: PartitionReceiver = _
private def configureStartOffset(
eventhubsParams: Predef.Map[String, String], offsetStore: OffsetStore):
(EventHubsOffsetType, String) = {
private def configureStartOffset(eventhubsParams: Predef.Map[String, String],
offsetStore: OffsetStore): (EventHubsOffsetType, String) = {
// Determine the offset to start receiving data
val previousOffset = offsetStore.read()
EventHubsClientWrapper.configureStartOffset(previousOffset, eventhubsParams)
@ -84,7 +69,7 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
if (userDefinedEventRate > 0 && userDefinedEventRate < MINIMUM_PREFETCH_COUNT) {
MAXIMUM_PREFETCH_COUNT = MINIMUM_PREFETCH_COUNT
} else if (userDefinedEventRate >= MINIMUM_PREFETCH_COUNT &&
userDefinedEventRate < MAXIMUM_PREFETCH_COUNT) {
userDefinedEventRate < MAXIMUM_PREFETCH_COUNT) {
MAXIMUM_PREFETCH_COUNT = userDefinedEventRate + 1
} else {
MAXIMUM_EVENT_RATE = MAXIMUM_PREFETCH_COUNT - 1
@ -97,95 +82,68 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
*
* the major purpose of this API is for creating AMQP management client
*/
def createClient(eventhubsParams: Map[String, String]): AzureEventHubClient = {
val (connectionString, _, _) = configureGeneralParameters(
eventhubsParams)
eventhubsClient = AzureEventHubClient.createFromConnectionStringSync(connectionString.toString)
eventhubsClient
}
def createClient(eventhubsParams: Map[String, String]): EventHubClient =
EventHubClient.createFromConnectionStringSync(connectionString.toString)
def createReceiver(
eventhubsParams: Predef.Map[String, String],
partitionId: String,
startOffset: String,
offsetType: EventHubsOffsetType,
maximumEventRate: Int): Unit = {
val (connectionString, consumerGroup, receiverEpoch) = configureGeneralParameters(
eventhubsParams)
val currentOffset = startOffset
def createReceiver(partitionId: String,
startOffset: String,
offsetType: EventHubsOffsetType,
maximumEventRate: Int): Unit = {
MAXIMUM_EVENT_RATE = configureMaxEventRate(maximumEventRate)
createReceiverInternal(connectionString.toString,
eventhubsParams("eventhubs.name"),
consumerGroup, partitionId, offsetType,
currentOffset, receiverEpoch)
createReceiverInternal(partitionId, offsetType, startOffset)
}
def createReceiver(
eventhubsParams: Map[String, String],
partitionId: String,
offsetStore: OffsetStore,
maximumEventRate: Int): Unit = {
val (connectionString, consumerGroup, receiverEpoch) = configureGeneralParameters(
eventhubsParams)
val (offsetType, currentOffset) = configureStartOffset(eventhubsParams, offsetStore)
def createReceiver(ehParams: Map[String, String],
partitionId: String,
offsetStore: OffsetStore,
maximumEventRate: Int): Unit = {
val (offsetType, currentOffset) =
configureStartOffset(ehParams, offsetStore)
logInfo(s"start a receiver for partition $partitionId with the start offset $currentOffset")
MAXIMUM_EVENT_RATE = configureMaxEventRate(maximumEventRate)
createReceiverInternal(connectionString.toString,
eventhubsParams("eventhubs.name"),
consumerGroup, partitionId, offsetType,
currentOffset, receiverEpoch)
createReceiverInternal(partitionId, offsetType, currentOffset)
}
private[spark] def createReceiverInternal(
connectionString: String,
eventHubsName: String,
consumerGroup: String,
partitionId: String,
offsetType: EventHubsOffsetType,
currentOffset: String,
receiverEpoch: Long): Unit = {
// Create Eventhubs client
eventhubsClient = AzureEventHubClient.createFromConnectionStringSync(connectionString)
val receiverOption = new ReceiverOptions()
receiverOption.setReceiverRuntimeMetricEnabled(false)
receiverOption.setIdentifier(
s"${SparkEnv.get.executorId}-${TaskContext.get().taskAttemptId()}")
private[spark] def createReceiverInternal(partitionId: String,
offsetType: EventHubsOffsetType,
currentOffset: String): Unit = {
eventhubsClient = EventHubClient.createFromConnectionStringSync(connectionString)
eventhubsReceiver = offsetType match {
case EventHubsOffsetTypes.None | EventHubsOffsetTypes.PreviousCheckpoint
| EventHubsOffsetTypes.InputByteOffset =>
case EventHubsOffsetTypes.None | EventHubsOffsetTypes.PreviousCheckpoint |
EventHubsOffsetTypes.InputByteOffset =>
if (receiverEpoch > DEFAULT_RECEIVER_EPOCH) {
eventhubsClient.createEpochReceiverSync(consumerGroup, partitionId, currentOffset,
receiverEpoch)
eventhubsClient.createEpochReceiverSync(consumerGroup,
partitionId,
currentOffset,
receiverEpoch)
} else {
eventhubsClient.createReceiverSync(consumerGroup, partitionId, currentOffset)
}
case EventHubsOffsetTypes.InputTimeOffset =>
if (receiverEpoch > DEFAULT_RECEIVER_EPOCH) {
eventhubsClient.createEpochReceiverSync(consumerGroup, partitionId,
Instant.ofEpochSecond(currentOffset.toLong), receiverEpoch)
eventhubsClient.createEpochReceiverSync(consumerGroup,
partitionId,
Instant.ofEpochSecond(currentOffset.toLong),
receiverEpoch)
} else {
eventhubsClient.createReceiverSync(consumerGroup, partitionId,
Instant.ofEpochSecond(currentOffset.toLong))
eventhubsClient.createReceiverSync(consumerGroup,
partitionId,
Instant.ofEpochSecond(currentOffset.toLong))
}
}
eventhubsReceiver.setPrefetchCount(MAXIMUM_PREFETCH_COUNT)
}
def receive(): Iterable[EventData] = {
val events = eventhubsReceiver.receive(MAXIMUM_EVENT_RATE).get()
if (events == null) Iterable.empty else events.asScala
}
/**
* starting from EventHubs client 0.13.1, returning a null from receiver means that there is
* no message in server end
*/
def receive(expectedEventNum: Int): Iterable[EventData] = {
val events = eventhubsReceiver.receive(
math.min(expectedEventNum, eventhubsReceiver.getPrefetchCount)).get()
val events = eventhubsReceiver
.receive(math.min(expectedEventNum, eventhubsReceiver.getPrefetchCount))
.get()
if (events != null) events.asScala else null
}
@ -198,18 +156,12 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
eventhubsReceiver.closeSync()
}
private var eventhubsReceiver: PartitionReceiver = _
private val MINIMUM_PREFETCH_COUNT: Int = 10
private var MAXIMUM_PREFETCH_COUNT: Int = 999
private var MAXIMUM_EVENT_RATE: Int = 0
private val DEFAULT_RECEIVER_EPOCH = -1L
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition]):
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
throw new UnsupportedOperationException("endPointOfPartition is not supported by this client" +
" yet, please use AMQPEventHubsClient")
override def endPointOfPartition(retryIfFail: Boolean,
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition])
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
throw new UnsupportedOperationException(
"endPointOfPartition is not supported by this client" +
" yet, please use AMQPEventHubsClient")
}
/**
@ -219,10 +171,11 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Predef.Map[EventHubNameAndPartition, Long]] = {
throw new UnsupportedOperationException("lastEnqueueTimeOfPartitions is not supported by this" +
" client yet, please use AMQPEventHubsClient")
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Predef.Map[EventHubNameAndPartition, Long]] = {
throw new UnsupportedOperationException(
"lastEnqueueTimeOfPartitions is not supported by this" +
" client yet, please use AMQPEventHubsClient")
}
/**
@ -230,17 +183,16 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Predef.Map[EventHubNameAndPartition, Long]] = {
throw new UnsupportedOperationException("startSeqOfPartition is not supported by this client" +
" yet, please use AMQPEventHubsClient")
override def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Predef.Map[EventHubNameAndPartition, Long]] = {
throw new UnsupportedOperationException(
"startSeqOfPartition is not supported by this client" +
" yet, please use AMQPEventHubsClient")
}
}
private[spark] object EventHubsClientWrapper {
private[eventhubscommon] def configureStartOffset(
previousOffset: String,
eventhubsParams: Predef.Map[String, String]): (EventHubsOffsetType, String) = {
@ -255,21 +207,17 @@ private[spark] object EventHubsClientWrapper {
}
}
def getEventHubsClient(eventhubsParams: Map[String, String]): AzureEventHubClient = {
new EventHubsClientWrapper().createClient(eventhubsParams)
}
def getEventHubReceiver(
eventhubsParams: Predef.Map[String, String],
partitionId: Int,
startOffset: Long,
offsetType: EventHubsOffsetType,
maximumEventRate: Int): EventHubsClientWrapper = {
// TODO: reuse client
val eventHubClientWrapperInstance = new EventHubsClientWrapper()
eventHubClientWrapperInstance.createReceiver(eventhubsParams, partitionId.toString,
startOffset.toString, offsetType, maximumEventRate)
def getEventHubReceiver(ehParams: Map[String, String],
partitionId: Int,
startOffset: Long,
offsetType: EventHubsOffsetType,
maximumEventRate: Int): EventHubsClientWrapper = {
val ehName = ehParams.get("eventhubs.name").toString
val eventHubClientWrapperInstance = new EventHubsClientWrapper(ehParams)
eventHubClientWrapperInstance.createReceiver(partitionId.toString,
startOffset.toString,
offsetType,
maximumEventRate)
eventHubClientWrapperInstance
}
}
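
For reference, a hedged sketch of the parameter Map the reworked, constructor-based EventHubsClientWrapper reads; the keys come from the code above, the values are placeholders:

// Placeholder values; the first four keys are required by the constructor,
// the last two fall back to defaults when absent.
val ehParams: Map[String, String] = Map(
  "eventhubs.namespace"     -> "my-namespace",
  "eventhubs.name"          -> "my-eventhub",
  "eventhubs.policyname"    -> "listen-policy",
  "eventhubs.policykey"     -> "<policy-key>",
  "eventhubs.consumergroup" -> "$Default", // optional
  "eventhubs.epoch"         -> "-1")       // optional

val wrapper = new EventHubsClientWrapper(ehParams)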

View file

@ -1,236 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.eventhubscommon.client
import java.net.SocketTimeoutException
import java.time.{Duration, Instant}
import scala.collection.mutable.ListBuffer
import scala.concurrent.{Await, Future}
import scala.concurrent.duration._
import scala.util.{Failure, Success}
import scala.xml.XML
import com.microsoft.azure.eventhubs.SharedAccessSignatureTokenProvider
import scalaj.http.{Http, HttpResponse}
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.internal.Logging
/**
* a Restful API based client of EventHub
*
* @param eventHubNamespace the namespace of eventhub
* @param numPartitionsEventHubs a map from eventHub name to the total number of partitions
* @param consumerGroups a map from eventHub name to consumer group names
* @param policyKeys a map from eventHub name to (policyName, policyKey) pair
* @param threadNum the number of threads used to communicate with remote EventHub
*/
private[spark] class RestfulEventHubClient(
eventHubNamespace: String,
numPartitionsEventHubs: Map[String, Int],
consumerGroups: Map[String, String],
policyKeys: Map[String, (String, String)],
threadNum: Int) extends EventHubClient with Logging {
private val RETRY_INTERVAL_SECONDS = Array(8, 16, 32, 64, 128)
// will be used to execute requests to EventHub
import org.apache.spark.eventhubscommon.Implicits.exec
private def createSasToken(eventHubName: String, policyName: String, policyKey: String):
String = {
// the default value of 10 mins is hardcoded, and this method will be called for everytime when
// a new batch is started, we may figure out whether there will be any negative impact for
// creating a new sasToken everytime
SharedAccessSignatureTokenProvider.generateSharedAccessSignature(
s"$policyName", s"$policyKey",
s"$eventHubNamespace.servicebus.windows.net/$eventHubName",
Duration.ofMinutes(10))
}
private def fromResponseBodyToEndpoint(responseBody: String): (Long, Long) = {
val partitionDescription = XML.loadString(responseBody) \\ "entry" \
"content" \ "PartitionDescription"
((partitionDescription \ "LastEnqueuedOffset").text.toLong,
(partitionDescription \ "EndSequenceNumber").text.toLong)
}
private def fromParametersToURLString(eventHubName: String, partitionId: Int): String = {
s"https://$eventHubNamespace.servicebus.windows.net/$eventHubName" +
s"/consumergroups/${consumerGroups(eventHubName)}/partitions/$partitionId?api-version=2015-01"
}
private def fromResponseBodyToStartSeq(responseBody: String): Long = {
val partitionDescription = XML.loadString(responseBody) \\ "entry" \
"content" \ "PartitionDescription"
(partitionDescription \ "BeginSequenceNumber").text.toLong
}
private def aggregateResults[T](undergoingRequests: List[Future[(EventHubNameAndPartition, T)]]):
Option[Map[EventHubNameAndPartition, T]] = {
Await.ready(Future.sequence(undergoingRequests), 60 seconds).value.get match {
case Success(queryResponse) =>
Some(queryResponse.toMap.map {case (eventHubQueryKey, queryResponseString) =>
(eventHubQueryKey, queryResponseString.asInstanceOf[T])})
case Failure(e) =>
e.printStackTrace()
None
}
}
private def composeQuery[T](
retryIfFail: Boolean,
fromResponseBodyToResult: String => T,
nameAndPartition: EventHubNameAndPartition):
Future[(EventHubNameAndPartition, T)] = {
Future {
var retryTime = 0
var successfullyFetched = false
var response: HttpResponse[String] = null
val ehNameAndPartition = nameAndPartition
val eventHubName = nameAndPartition.eventHubName
val partitionId = nameAndPartition.partitionId
while (!successfullyFetched) {
logDebug(s"start fetching latest offset of $ehNameAndPartition")
val urlString = fromParametersToURLString(eventHubName, partitionId)
try {
response = Http(urlString).header("Authorization",
createSasToken(eventHubName,
policyName = policyKeys(eventHubName)._1,
policyKey = policyKeys(eventHubName)._2)).
header("Content-Type", "application/atom+xml;type=entry;charset=utf-8").
timeout(connTimeoutMs = 3000, readTimeoutMs = 30000).asString
if (response.code != 200) {
if (!retryIfFail || retryTime > RETRY_INTERVAL_SECONDS.length - 1) {
val errorInfoString = s"cannot get latest offset of" +
s" $ehNameAndPartition, status code: ${response.code}, ${response.headers}" +
s" returned error: ${response.body}"
logError(errorInfoString)
throw new Exception(errorInfoString)
} else {
val retryInterval = 1000 * RETRY_INTERVAL_SECONDS(retryTime)
logError(s"cannot get connect with Event Hubs Rest Endpoint for partition" +
s" $ehNameAndPartition, retry after $retryInterval seconds")
Thread.sleep(retryInterval)
retryTime += 1
}
} else {
successfullyFetched = true
}
} catch {
case e: SocketTimeoutException =>
e.printStackTrace()
logError("Event Hubs return ReadTimeout with 30s as threshold, retrying...")
case e: Exception =>
e.printStackTrace()
throw e
}
}
val results = fromResponseBodyToResult(response.body)
logDebug(s"results of $ehNameAndPartition: $results")
(ehNameAndPartition, results)
}
}
private def queryPartitionRuntimeInfo[T](
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition],
fromResponseBodyToResult: String => T, retryIfFail: Boolean):
Option[Map[EventHubNameAndPartition, T]] = {
val futures = new ListBuffer[Future[(EventHubNameAndPartition, T)]]
if (targetEventHubsNameAndPartitions.isEmpty) {
for ((eventHubName, numPartitions) <- numPartitionsEventHubs;
partitionId <- 0 until numPartitions) {
futures += composeQuery(retryIfFail, fromResponseBodyToResult,
EventHubNameAndPartition(eventHubName, partitionId))
}
} else {
for (targetNameAndPartition <- targetEventHubsNameAndPartitions) {
futures += composeQuery(retryIfFail, fromResponseBodyToResult, targetNameAndPartition)
}
}
aggregateResults(futures.toList)
}
override def close(): Unit = {
// empty
}
/**
* return highest offset/seq and latest enqueueTime of each partition
*/
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
queryPartitionRuntimeInfo(targetEventHubsNameAndPartitions,
fromResponseBodyToEndpoint, retryIfFail)
}
private def fromResponseBodyToEnqueueTime(responseBody: String): Long = {
val partitionDescription = XML.loadString(responseBody) \\ "entry" \
"content" \ "PartitionDescription"
Instant.parse((partitionDescription \ "LastEnqueuedTimeUtc").text).getEpochSecond
}
/**
* return the last enqueueTime of each partition
* @return a map from eventHubsNamePartition to EnqueueTime
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
queryPartitionRuntimeInfo(targetEventHubNameAndPartitions,
fromResponseBodyToEnqueueTime, retryIfFail)
}
/**
* return the start seq number of each partition
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
queryPartitionRuntimeInfo(targetEventHubNameAndPartitions,
fromResponseBodyToStartSeq, retryIfFail)
}
}
private[spark] object RestfulEventHubClient {
def getInstance(eventHubNameSpace: String, eventhubsParams: Map[String, Map[String, String]]):
RestfulEventHubClient = {
new RestfulEventHubClient(eventHubNameSpace,
numPartitionsEventHubs = {
eventhubsParams.map { case (eventhubName, params) => (eventhubName,
params("eventhubs.partition.count").toInt)
}
},
consumerGroups = {
eventhubsParams.map { case (eventhubName, params) => (eventhubName,
params("eventhubs.consumergroup"))
}
},
policyKeys = eventhubsParams.map { case (eventhubName, params) => (eventhubName,
(params("eventhubs.policyname"), params("eventhubs.policykey")))
},
threadNum = 15)
}
}
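
For reference, the removed REST client built its per-partition query URL in fromParametersToURLString above; a sketch with placeholder values:

// Placeholder namespace, event hub, and consumer group; the format mirrors
// fromParametersToURLString in the removed code above.
val eventHubNamespace = "my-namespace"
val eventHubName      = "my-eventhub"
val consumerGroup     = "$Default"
val partitionId       = 0
val url = s"https://$eventHubNamespace.servicebus.windows.net/$eventHubName" +
  s"/consumergroups/$consumerGroup/partitions/$partitionId?api-version=2015-01"
// The Atom XML response carries a PartitionDescription element from which
// LastEnqueuedOffset, EndSequenceNumber, BeginSequenceNumber and LastEnqueuedTimeUtc
// were extracted by the parsers above.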

View file

@ -50,21 +50,19 @@ private[spark] object PathTools extends Serializable {
def makeProgressFileName(timestamp: Long): String =
s"progress-$timestamp"
def makeTempFileName(
streamId: Int,
uid: String,
eventHubNameAndPartition: EventHubNameAndPartition,
timestamp: Long): String =
def makeTempFileName(streamId: Int,
uid: String,
eventHubNameAndPartition: EventHubNameAndPartition,
timestamp: Long): String =
s"$streamId-$uid-$eventHubNameAndPartition-$timestamp"
def makeTempFilePath(
basePath: String,
streamId: Int,
uid: String,
eventHubNameAndPartition: EventHubNameAndPartition,
timestamp: Long): Path =
new Path(s"${combineDirectoryNames(
basePath, makeTempFileName(streamId, uid, eventHubNameAndPartition, timestamp))}")
def makeTempFilePath(basePath: String,
streamId: Int,
uid: String,
eventHubNameAndPartition: EventHubNameAndPartition,
timestamp: Long): Path =
new Path(
s"${combineDirectoryNames(basePath, makeTempFileName(streamId, uid, eventHubNameAndPartition, timestamp))}")
def makeMetadataFileName(timestamp: Long): String = timestamp.toString
}

View file

@ -28,13 +28,12 @@ package org.apache.spark.eventhubscommon.progress
* BatchID
*
*/
private[spark] case class ProgressRecord(
timestamp: Long,
uid: String,
eventHubName: String,
partitionId: Int,
offset: Long,
seqId: Long) {
private[spark] case class ProgressRecord(timestamp: Long,
uid: String,
eventHubName: String,
partitionId: Int,
offset: Long,
seqId: Long) {
override def toString: String = {
s"$timestamp $uid $eventHubName $partitionId $offset $seqId"
}
@ -44,10 +43,15 @@ private[spark] object ProgressRecord {
def parse(line: String): Option[ProgressRecord] = {
try {
val Array(timestampStr, namespace, eventHubName, partitionIdStr, offsetStr,
seqStr) = line.split(" ")
Some(ProgressRecord(timestampStr.toLong, namespace, eventHubName,
partitionIdStr.toInt, offsetStr.toLong, seqStr.toLong))
val Array(timestampStr, namespace, eventHubName, partitionIdStr, offsetStr, seqStr) =
line.split(" ")
Some(
ProgressRecord(timestampStr.toLong,
namespace,
eventHubName,
partitionIdStr.toInt,
offsetStr.toLong,
seqStr.toLong))
} catch {
case m: RuntimeException =>
m.printStackTrace()
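
A progress record round-trips through the space-separated format produced by toString above; a sketch with made-up values:

// Made-up values; field order matches toString/parse above:
//   timestamp uid eventHubName partitionId offset seqId
val record = ProgressRecord(1507846380000L, "connector-0", "sample-hub", 3, 4096L, 128L)
val line   = record.toString            // "1507846380000 connector-0 sample-hub 3 4096 128"
val parsed = ProgressRecord.parse(line) // Some(record)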

View file

@ -17,8 +17,8 @@
package org.apache.spark.eventhubscommon.progress
import java.io.{BufferedReader, InputStreamReader, IOException}
import java.util.concurrent.{ScheduledFuture, ScheduledThreadPoolExecutor, TimeUnit}
import java.io.{ BufferedReader, InputStreamReader, IOException }
import java.util.concurrent.{ ScheduledFuture, ScheduledThreadPoolExecutor, TimeUnit }
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
@ -27,17 +27,24 @@ import com.microsoft.azure.eventhubs.PartitionReceiver
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord}
import org.apache.spark.eventhubscommon.{
EventHubNameAndPartition,
EventHubsConnector,
OffsetRecord
}
import org.apache.spark.internal.Logging
private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
progressDir: String, appName: String, hadoopConfiguration: Configuration) extends Logging {
progressDir: String,
appName: String,
hadoopConfiguration: Configuration)
extends Logging {
private[spark] lazy val progressDirectoryStr = PathTools.makeProgressDirectoryStr(progressDir,
appName)
private[spark] lazy val progressDirectoryStr =
PathTools.makeProgressDirectoryStr(progressDir, appName)
private[spark] lazy val tempDirectoryStr = PathTools.makeTempDirectoryStr(progressDir, appName)
private[spark] lazy val metadataDirectoryStr = PathTools.makeMetadataDirectoryStr(progressDir,
appName)
private[spark] lazy val metadataDirectoryStr =
PathTools.makeMetadataDirectoryStr(progressDir, appName)
private[spark] lazy val progressDirectoryPath = new Path(progressDirectoryStr)
private[spark] lazy val tempDirectoryPath = new Path(tempDirectoryStr)
@ -57,10 +64,9 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
private[spark] def fromPathToTimestamp(path: Path): Long =
path.getName.split("-").last.toLong
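
Progress and temp file names end with the batch timestamp (see PathTools in the previous file), which fromPathToTimestamp recovers from the last '-'-separated segment of the file name; a small sketch with a made-up path:

// Made-up path; mirrors fromPathToTimestamp above.
import org.apache.hadoop.fs.Path
val p  = new Path("/checkpoints/app/progress/progress-1507846380000")
val ts = p.getName.split("-").last.toLong // 1507846380000L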
protected def allEventNameAndPartitionExist(
candidateEhNameAndPartitions: Map[String, List[EventHubNameAndPartition]]): Boolean = {
eventHubNameAndPartitions.forall{
eventHubNameAndPartitions.forall {
case (uid, ehNameAndPartitions) =>
candidateEhNameAndPartitions.contains(uid) &&
ehNameAndPartitions.forall(candidateEhNameAndPartitions(uid).contains)
@ -68,21 +74,28 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
}
// no metadata (for backward compatibility)
private def getLatestFileWithoutMetadata(fs: FileSystem, timestamp: Long = Long.MaxValue):
Option[Path] = {
private def getLatestFileWithoutMetadata(fs: FileSystem,
timestamp: Long = Long.MaxValue): Option[Path] = {
val allFiles = fs.listStatus(progressDirectoryPath)
if (allFiles.length < 1) {
None
} else {
Some(allFiles.filter(fsStatus => fromPathToTimestamp(fsStatus.getPath) <= timestamp).
sortWith((f1, f2) => fromPathToTimestamp(f1.getPath) > fromPathToTimestamp(f2.getPath))
(0).getPath)
Some(
allFiles
.filter(fsStatus => fromPathToTimestamp(fsStatus.getPath) <= timestamp)
.sortWith((f1, f2) => fromPathToTimestamp(f1.getPath) > fromPathToTimestamp(f2.getPath))(
0)
.getPath)
}
}
private def getLatestFileWithMetadata(metadataFiles: Array[FileStatus]): Option[Path] = {
val latestMetadata = metadataFiles.sortWith((f1, f2) => f1.getPath.getName.toLong >
f2.getPath.getName.toLong).head
val latestMetadata = metadataFiles
.sortWith(
(f1, f2) =>
f1.getPath.getName.toLong >
f2.getPath.getName.toLong)
.head
logInfo(s"locate latest timestamp in metadata as ${latestMetadata.getPath.getName}")
Some(new Path(progressDirectoryStr + "/progress-" + latestMetadata.getPath.getName))
}
@ -92,12 +105,13 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
*
* NOTE: the additional integer in return value is to simplify the test (could be improved)
*/
private[spark] def getLatestFile(fs: FileSystem, timestamp: Long = Long.MaxValue):
(Int, Option[Path]) = {
private[spark] def getLatestFile(fs: FileSystem,
timestamp: Long = Long.MaxValue): (Int, Option[Path]) = {
// first check metadata directory if exists
if (fs.exists(metadataDirectoryPath)) {
val metadataFiles = fs.listStatus(metadataDirectoryPath).filter(
file => file.isFile && file.getPath.getName.toLong <= timestamp)
val metadataFiles = fs
.listStatus(metadataDirectoryPath)
.filter(file => file.isFile && file.getPath.getName.toLong <= timestamp)
if (metadataFiles.nonEmpty) {
// metadata files exists
(0, getLatestFileWithMetadata(metadataFiles))
@ -136,7 +150,7 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
}
val progressRecord = progressRecordOpt.get
val newList = allProgressFiles.getOrElseUpdate(progressRecord.uid,
List[EventHubNameAndPartition]()) :+
List[EventHubNameAndPartition]()) :+
EventHubNameAndPartition(progressRecord.eventHubName, progressRecord.partitionId)
allProgressFiles(progressRecord.uid) = newList
if (timestamp == -1L) {
@ -161,10 +175,8 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
(allEventNameAndPartitionExist(allProgressFiles.toMap), latestFileOpt)
}
protected def readProgressRecordLines(
progressFilePath: Path,
fs: FileSystem): List[ProgressRecord] = {
protected def readProgressRecordLines(progressFilePath: Path,
fs: FileSystem): List[ProgressRecord] = {
val ret = new ListBuffer[ProgressRecord]
var ins: FSDataInputStream = null
var br: BufferedReader = null
@ -175,8 +187,9 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
while (line != null) {
val progressRecordOpt = ProgressRecord.parse(line)
if (progressRecordOpt.isEmpty) {
throw new IllegalStateException(s"detect corrupt progress tracking file at $line" +
s" it might be a bug in the implementation of underlying file system")
throw new IllegalStateException(
s"detect corrupt progress tracking file at $line" +
s" it might be a bug in the implementation of underlying file system")
}
val progressRecord = progressRecordOpt.get
ret += progressRecord
@ -238,24 +251,28 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
if (progressFileOption.isEmpty) {
// if no progress file, then start from the beginning of the streams
val connectedEventHubs = eventHubNameAndPartitions.find {
case (connectorUID, _) => connectorUID == targetConnectorUID}
require(connectedEventHubs.isDefined, s"cannot find $targetConnectorUID in" +
s" $eventHubNameAndPartitions")
case (connectorUID, _) => connectorUID == targetConnectorUID
}
require(connectedEventHubs.isDefined,
s"cannot find $targetConnectorUID in" +
s" $eventHubNameAndPartitions")
// it's hacky to take timestamp -1 as the start of streams
readTimestamp = -1
recordToReturn = connectedEventHubs.get._2.map(
(_, (PartitionReceiver.START_OF_STREAM.toLong, -1L))).toMap
recordToReturn =
connectedEventHubs.get._2.map((_, (PartitionReceiver.START_OF_STREAM.toLong, -1L))).toMap
} else {
val expectedTimestamp = fromPathToTimestamp(progressFileOption.get)
val progressFilePath = progressFileOption.get
val recordLines = readProgressRecordLines(progressFilePath, fs)
require(recordLines.count(_.timestamp != expectedTimestamp) == 0, "detected inconsistent" +
s" progress record, expected timestamp $expectedTimestamp")
require(recordLines.count(_.timestamp != expectedTimestamp) == 0,
"detected inconsistent" +
s" progress record, expected timestamp $expectedTimestamp")
readTimestamp = expectedTimestamp
recordToReturn = recordLines.filter(
progressRecord => progressRecord.uid == targetConnectorUID).map(
progressRecord => EventHubNameAndPartition(progressRecord.eventHubName,
progressRecord.partitionId) -> (progressRecord.offset, progressRecord.seqId)).toMap
recordToReturn = recordLines
.filter(progressRecord => progressRecord.uid == targetConnectorUID)
.map(progressRecord =>
EventHubNameAndPartition(progressRecord.eventHubName, progressRecord.partitionId) -> (progressRecord.offset, progressRecord.seqId))
.toMap
}
} catch {
case ias: IllegalArgumentException =>
@ -273,16 +290,20 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
var oos: FSDataOutputStream = null
try {
// write progress file
oos = fs.create(new Path(s"$progressDirectoryPath/${PathTools.makeProgressFileName(
commitTime)}"), true)
oos = fs.create(
new Path(s"$progressDirectoryPath/${PathTools.makeProgressFileName(commitTime)}"),
true)
offsetToCommit.foreach {
case (namespace, ehNameAndPartitionToOffsetAndSeq) =>
ehNameAndPartitionToOffsetAndSeq.foreach {
case (nameAndPartitionId, (offset, seq)) =>
oos.writeBytes(
ProgressRecord(commitTime, namespace,
nameAndPartitionId.eventHubName, nameAndPartitionId.partitionId, offset,
seq).toString + "\n"
ProgressRecord(commitTime,
namespace,
nameAndPartitionId.eventHubName,
nameAndPartitionId.partitionId,
offset,
seq).toString + "\n"
)
}
}
@ -301,8 +322,9 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
private def createMetadata(fs: FileSystem, commitTime: Long): Boolean = {
var oos: FSDataOutputStream = null
try {
oos = fs.create(new Path(s"$metadataDirectoryStr/" + s"${PathTools.makeMetadataFileName(
commitTime)}"), true)
oos = fs.create(
new Path(s"$metadataDirectoryStr/" + s"${PathTools.makeMetadataFileName(commitTime)}"),
true)
true
} catch {
case e: Exception =>
@ -316,29 +338,29 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
}
// write offsetToCommit to a progress tracking file
private def transaction(
offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
fs: FileSystem,
commitTime: Long): Unit = {
private def transaction(offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
fs: FileSystem,
commitTime: Long): Unit = {
if (createProgressFile(offsetToCommit, fs, commitTime)) {
if (!createMetadata(fs, commitTime)) {
logError(s"cannot create progress file at $commitTime")
throw new IOException(s"cannot create metadata file at $commitTime," +
s" check the previous exception for the root cause")
throw new IOException(
s"cannot create metadata file at $commitTime," +
s" check the previous exception for the root cause")
}
} else {
logError(s"cannot create progress file at $commitTime")
throw new IOException(s"cannot create progress file at $commitTime," +
s" check the previous exception for the root cause")
throw new IOException(
s"cannot create progress file at $commitTime," +
s" check the previous exception for the root cause")
}
}
/**
* commit offsetToCommit to a new progress tracking file
*/
def commit(
offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
commitTime: Long): Unit = {
def commit(offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
commitTime: Long): Unit = {
val fs = new Path(progressDir).getFileSystem(hadoopConfiguration)
try {
transaction(offsetToCommit, fs, commitTime)
@ -352,25 +374,28 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
}
}
private def allProgressRecords(
timestamp: Long,
ehConnectors: List[EventHubsConnector]): List[Path] = {
private def allProgressRecords(timestamp: Long,
ehConnectors: List[EventHubsConnector]): List[Path] = {
val fs = tempDirectoryPath.getFileSystem(hadoopConfiguration)
ehConnectors.flatMap { ehConnector =>
ehConnector.connectedInstances.map(ehNameAndPartition =>
PathTools.makeTempFilePath(
tempDirectoryStr, ehConnector.streamId, ehConnector.uid, ehNameAndPartition, timestamp))
}.filter(fs.exists)
ehConnectors
.flatMap { ehConnector =>
ehConnector.connectedInstances.map(
ehNameAndPartition =>
PathTools.makeTempFilePath(tempDirectoryStr,
ehConnector.streamId,
ehConnector.uid,
ehNameAndPartition,
timestamp))
}
.filter(fs.exists)
}
/**
* read progress records from temp directories
* @return Map(Namespace -> Map(EventHubNameAndPartition -> (Offset, Seq))
*/
def collectProgressRecordsForBatch(
timestamp: Long,
ehConnectors: List[EventHubsConnector]):
Map[String, Map[EventHubNameAndPartition, (Long, Long)]] = {
def collectProgressRecordsForBatch(timestamp: Long, ehConnectors: List[EventHubsConnector])
: Map[String, Map[EventHubNameAndPartition, (Long, Long)]] = {
val records = new ListBuffer[ProgressRecord]
val ret = new mutable.HashMap[String, Map[EventHubNameAndPartition, (Long, Long)]]
try {
@ -384,10 +409,11 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
// check timestamp consistency
records.foreach(progressRecord =>
if (timestamp != progressRecord.timestamp) {
throw new IllegalStateException(s"detect inconsistent progress tracking file at" +
s" $progressRecord, expected timestamp: $timestamp, it might be a bug in the" +
s" implementation of underlying file system")
})
throw new IllegalStateException(
s"detect inconsistent progress tracking file at" +
s" $progressRecord, expected timestamp: $timestamp, it might be a bug in the" +
s" implementation of underlying file system")
})
} catch {
case ioe: IOException =>
logError(s"error: ${ioe.getMessage}")
@ -410,11 +436,15 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
def cleanProgressFile(timestampToClean: Long): Unit = {
val fs = progressDirectoryPath.getFileSystem(hadoopConfiguration)
val allUselessFiles = fs.listStatus(progressDirectoryPath, new PathFilter {
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
}).map(_.getPath)
val sortedFileList = allUselessFiles.sortWith((p1, p2) => fromPathToTimestamp(p1) >
fromPathToTimestamp(p2))
val allUselessFiles = fs
.listStatus(progressDirectoryPath, new PathFilter {
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
})
.map(_.getPath)
val sortedFileList = allUselessFiles.sortWith(
(p1, p2) =>
fromPathToTimestamp(p1) >
fromPathToTimestamp(p2))
if (sortedFileList.nonEmpty) {
sortedFileList.tail.foreach { filePath =>
logInfo(s"delete $filePath")
@ -422,15 +452,22 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
}
}
// clean temp directory
val allUselessTempFiles = fs.listStatus(tempDirectoryPath, new PathFilter {
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
}).map(_.getPath)
val allUselessTempFiles = fs
.listStatus(tempDirectoryPath, new PathFilter {
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
})
.map(_.getPath)
if (allUselessTempFiles.nonEmpty) {
allUselessTempFiles.groupBy(fromPathToTimestamp).toList.sortWith((p1, p2) => p1._1 > p2._1).
tail.flatMap(_._2).foreach {
filePath => logInfo(s"delete $filePath")
allUselessTempFiles
.groupBy(fromPathToTimestamp)
.toList
.sortWith((p1, p2) => p1._1 > p2._1)
.tail
.flatMap(_._2)
.foreach { filePath =>
logInfo(s"delete $filePath")
fs.delete(filePath, true)
}
}
}
}
@ -439,11 +476,12 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
override def run(): Unit = {
val fs = metadataDirectoryPath.getFileSystem(new Configuration())
val allMetadataFiles = fs.listStatus(metadataDirectoryPath)
val sortedMetadataFiles = allMetadataFiles.sortWith((f1, f2) => f1.getPath.getName.toLong <
f2.getPath.getName.toLong)
sortedMetadataFiles.take(math.max(sortedMetadataFiles.length - 1, 0)).map{
file =>
fs.delete(file.getPath, true)
val sortedMetadataFiles = allMetadataFiles.sortWith(
(f1, f2) =>
f1.getPath.getName.toLong <
f2.getPath.getName.toLong)
sortedMetadataFiles.take(math.max(sortedMetadataFiles.length - 1, 0)).map { file =>
fs.delete(file.getPath, true)
}
}
}
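For orientation, a minimal sketch of the value that commit and collectProgressRecordsForBatch exchange follows. Everything in it (package, connector uid, hub name, offsets) is a made-up assumption, and the tracker itself only appears in a comment because ProgressTrackerBase is abstract and private[spark].

package org.apache.spark.sketches // hypothetical sub-package so private[spark] types resolve

import org.apache.spark.eventhubscommon.EventHubNameAndPartition

object ProgressCommitSketch {
  def main(args: Array[String]): Unit = {
    val batchTime = 1500000000000L // hypothetical batch timestamp in milliseconds
    // connector uid -> ((eventhub, partition) -> (offset, sequence number))
    val offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]] =
      Map("sample-connector-uid" ->
        Map(EventHubNameAndPartition("sample-eventhub", 0) -> (42L, 7L)))
    // a concrete tracker (e.g. the structured streaming one shown later in this commit)
    // would persist this map as a progress file and read it back per batch:
    //   tracker.commit(offsetToCommit, batchTime)
    //   tracker.collectProgressRecordsForBatch(batchTime, connectors)
    println(s"would commit $offsetToCommit at $batchTime")
  }
}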

View file

@ -20,19 +20,19 @@ package org.apache.spark.eventhubscommon.progress
import java.io.IOException
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, Path}
import org.apache.hadoop.fs.{ FSDataOutputStream, Path }
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.internal.Logging
private[spark] class ProgressWriter(
streamId: Int,
uid: String,
eventHubNameAndPartition: EventHubNameAndPartition,
timestamp: Long,
hadoopConfiguration: Configuration,
progressDir: String,
subDirIdentifiers: String*) extends Logging {
private[spark] class ProgressWriter(streamId: Int,
uid: String,
eventHubNameAndPartition: EventHubNameAndPartition,
timestamp: Long,
hadoopConfiguration: Configuration,
progressDir: String,
subDirIdentifiers: String*)
extends Logging {
// TODO: Why can't we get this info from one of the ProgressTrackers?
// TODO: Come up with better name for this guy
@ -51,9 +51,12 @@ private[spark] class ProgressWriter(
// it would be safe to overwrite checkpoint, since we will not start a new job when
// checkpoint hasn't been committed
cpFileStream = fs.create(tempProgressTrackingPointPath, true)
val record = ProgressRecord(recordTime, uid,
eventHubNameAndPartition.eventHubName, eventHubNameAndPartition.partitionId, cpOffset,
cpSeq)
val record = ProgressRecord(recordTime,
uid,
eventHubNameAndPartition.eventHubName,
eventHubNameAndPartition.partitionId,
cpOffset,
cpSeq)
cpFileStream.writeBytes(s"$record")
} catch {
case ioe: IOException =>
@ -66,5 +69,3 @@ private[spark] class ProgressWriter(
}
}
}
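To make the reformatted constructor easier to scan, here is a minimal, hypothetical usage sketch; EventHubsRDD.compute further below drives ProgressWriter in essentially this way. The package, uid, directory and numbers are assumptions, and the sketch pretends to live under org.apache.spark only because the class is private[spark].

package org.apache.spark.sketches // hypothetical

import org.apache.hadoop.conf.Configuration
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.progress.ProgressWriter

object ProgressWriterSketch {
  def main(args: Array[String]): Unit = {
    val batchTime = System.currentTimeMillis()
    val partition = EventHubNameAndPartition("sample-eventhub", 0) // hypothetical hub
    val writer = new ProgressWriter(
      0,                    // streamId
      "sample-connector",   // connector uid (hypothetical)
      partition,
      batchTime,
      new Configuration(),
      "/tmp/progress",      // progress tracking directory (hypothetical)
      "SampleAppName")      // subDirIdentifiers varargs
    // record offset 42 and sequence number 7 for this partition at this batch
    writer.write(batchTime, 42L, 7L)
  }
}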

View file

@ -17,7 +17,6 @@
package org.apache.spark.eventhubscommon.rdd
// scalastyle:off
import scala.collection.mutable.ListBuffer
import com.microsoft.azure.eventhubs.EventData
@ -29,49 +28,55 @@ import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.progress.ProgressWriter
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}
// scalastyle:on
import org.apache.spark.{ Partition, SparkContext, TaskContext }
private class EventHubRDDPartition(
val sparkPartitionId: Int,
val eventHubNameAndPartitionID: EventHubNameAndPartition,
val fromOffset: Long,
val fromSeq: Long,
val untilSeq: Long,
val offsetType: EventHubsOffsetType) extends Partition {
private class EventHubRDDPartition(val sparkPartitionId: Int,
val eventHubNameAndPartitionID: EventHubNameAndPartition,
val fromOffset: Long,
val fromSeq: Long,
val untilSeq: Long,
val offsetType: EventHubsOffsetType)
extends Partition {
override def index: Int = sparkPartitionId
}
private[spark] class EventHubsRDD(
sc: SparkContext,
eventHubsParamsMap: Map[String, Map[String, String]],
val offsetRanges: List[OffsetRange],
batchTime: Long,
offsetParams: OffsetStoreParams,
eventHubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
EventHubsClientWrapper)
extends RDD[EventData](sc, Nil) {
private[spark] class EventHubsRDD(sc: SparkContext,
eventHubsParamsMap: Map[String, Map[String, String]],
val offsetRanges: List[OffsetRange],
batchTime: Long,
offsetParams: OffsetStoreParams,
eventHubReceiverCreator: (Map[String, String],
Int,
Long,
EventHubsOffsetType,
Int) => EventHubsClientWrapper)
extends RDD[EventData](sc, Nil) {
override def getPartitions: Array[Partition] = {
offsetRanges.zipWithIndex.map { case (offsetRange, index) =>
new EventHubRDDPartition(index, offsetRange.eventHubNameAndPartition, offsetRange.fromOffset,
offsetRange.fromSeq, offsetRange.untilSeq, offsetRange.offsetType)
offsetRanges.zipWithIndex.map {
case (offsetRange, index) =>
new EventHubRDDPartition(index,
offsetRange.eventHubNameAndPartition,
offsetRange.fromOffset,
offsetRange.fromSeq,
offsetRange.untilSeq,
offsetRange.offsetType)
}.toArray
}
private def wrappingReceive(
eventHubNameAndPartition: EventHubNameAndPartition,
eventHubClient: EventHubsClientWrapper,
expectedEventNumber: Int,
expectedHighestSeqNum: Long): List[EventData] = {
private def wrappingReceive(eventHubNameAndPartition: EventHubNameAndPartition,
eventHubClient: EventHubsClientWrapper,
expectedEventNumber: Int,
expectedHighestSeqNum: Long): List[EventData] = {
val receivedBuffer = new ListBuffer[EventData]
val receivingTrace = new ListBuffer[Long]
var cnt = 0
while (receivedBuffer.size < expectedEventNumber) {
if (cnt > expectedEventNumber * 2) {
throw new Exception(s"$eventHubNameAndPartition cannot return data, the trace is" +
s" ${receivingTrace.toList}")
throw new Exception(
s"$eventHubNameAndPartition cannot return data, the trace is" +
s" ${receivingTrace.toList}")
}
val receivedEventsItr = eventHubClient.receive(expectedEventNumber - receivedBuffer.size)
if (receivedEventsItr == null) {
@ -83,7 +88,7 @@ private[spark] class EventHubsRDD(
cnt += 1
receivedBuffer ++= receivedEvents
if (receivedBuffer.nonEmpty &&
receivedBuffer.last.getSystemProperties.getSequenceNumber >= expectedHighestSeqNum) {
receivedBuffer.last.getSystemProperties.getSequenceNumber >= expectedHighestSeqNum) {
// this is for the case where user has passed in filtering params and the remaining
// msg number is less than expectedEventNumber
return receivedBuffer.toList
@ -92,60 +97,69 @@ private[spark] class EventHubsRDD(
receivedBuffer.toList
}
private def processFullyConsumedPartition(
ehRDDPartition: EventHubRDDPartition, progressWriter: ProgressWriter): Iterator[EventData] = {
private def processFullyConsumedPartition(ehRDDPartition: EventHubRDDPartition,
progressWriter: ProgressWriter): Iterator[EventData] = {
logInfo(s"No new data in ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
val fromOffset = ehRDDPartition.fromOffset
progressWriter.write(batchTime, ehRDDPartition.fromOffset,
ehRDDPartition.fromSeq)
logInfo(s"write offset $fromOffset, sequence number" +
s" ${ehRDDPartition.fromSeq} for EventHub" +
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
progressWriter.write(batchTime, ehRDDPartition.fromOffset, ehRDDPartition.fromSeq)
logInfo(
s"write offset $fromOffset, sequence number" +
s" ${ehRDDPartition.fromSeq} for EventHub" +
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
Iterator()
}
private def extractOffsetAndSeqToWrite(
receivedEvents: List[EventData],
eventHubReceiver: EventHubsClientWrapper,
ehRDDPartition: EventHubRDDPartition): (Long, Long) = {
private def extractOffsetAndSeqToWrite(receivedEvents: List[EventData],
eventHubReceiver: EventHubsClientWrapper,
ehRDDPartition: EventHubRDDPartition): (Long, Long) = {
if (receivedEvents.nonEmpty) {
val lastEvent = receivedEvents.last
(lastEvent.getSystemProperties.getOffset.toLong,
lastEvent.getSystemProperties.getSequenceNumber)
lastEvent.getSystemProperties.getSequenceNumber)
} else {
val partitionInfo = eventHubReceiver.eventhubsClient.getPartitionRuntimeInformation(
ehRDDPartition.eventHubNameAndPartitionID.partitionId.toString).get()
val partitionInfo = eventHubReceiver.eventhubsClient
.getPartitionRuntimeInformation(
ehRDDPartition.eventHubNameAndPartitionID.partitionId.toString)
.get()
(partitionInfo.getLastEnqueuedOffset.toLong, partitionInfo.getLastEnqueuedSequenceNumber)
}
}
private def retrieveDataFromPartition(
ehRDDPartition: EventHubRDDPartition, progressWriter: ProgressWriter): Iterator[EventData] = {
private def retrieveDataFromPartition(ehRDDPartition: EventHubRDDPartition,
progressWriter: ProgressWriter): Iterator[EventData] = {
val fromOffset = ehRDDPartition.fromOffset
val fromSeq = ehRDDPartition.fromSeq
val untilSeq = ehRDDPartition.untilSeq
val maxRate = (untilSeq - fromSeq).toInt
val startTime = System.currentTimeMillis()
logInfo(s"${ehRDDPartition.eventHubNameAndPartitionID}" +
s" expected rate $maxRate, fromSeq $fromSeq (exclusive) untilSeq" +
s" $untilSeq (inclusive) at $batchTime")
logInfo(
s"${ehRDDPartition.eventHubNameAndPartitionID}" +
s" expected rate $maxRate, fromSeq $fromSeq (exclusive) untilSeq" +
s" $untilSeq (inclusive) at $batchTime")
var eventHubReceiver: EventHubsClientWrapper = null
try {
val eventHubParameters = eventHubsParamsMap(ehRDDPartition.eventHubNameAndPartitionID.
eventHubName)
eventHubReceiver = eventHubReceiverCreator(eventHubParameters,
ehRDDPartition.eventHubNameAndPartitionID.partitionId, fromOffset,
ehRDDPartition.offsetType, maxRate)
val eventHubParameters = eventHubsParamsMap(
ehRDDPartition.eventHubNameAndPartitionID.eventHubName)
eventHubReceiver = eventHubReceiverCreator(
eventHubParameters,
ehRDDPartition.eventHubNameAndPartitionID.partitionId,
fromOffset,
ehRDDPartition.offsetType,
maxRate)
val receivedEvents = wrappingReceive(ehRDDPartition.eventHubNameAndPartitionID,
eventHubReceiver, maxRate, ehRDDPartition.untilSeq)
logInfo(s"received ${receivedEvents.length} messages before Event Hubs server indicates" +
s" there is no more messages, time cost:" +
s" ${(System.currentTimeMillis() - startTime) / 1000.0} seconds")
val (offsetToWrite, seqToWrite) = extractOffsetAndSeqToWrite(receivedEvents, eventHubReceiver,
ehRDDPartition)
eventHubReceiver,
maxRate,
ehRDDPartition.untilSeq)
logInfo(
s"received ${receivedEvents.length} messages before Event Hubs server indicates" +
s" there is no more messages, time cost:" +
s" ${(System.currentTimeMillis() - startTime) / 1000.0} seconds")
val (offsetToWrite, seqToWrite) =
extractOffsetAndSeqToWrite(receivedEvents, eventHubReceiver, ehRDDPartition)
progressWriter.write(batchTime, offsetToWrite, seqToWrite)
logInfo(s"write offset $offsetToWrite, sequence number $seqToWrite for EventHub" +
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
logInfo(
s"write offset $offsetToWrite, sequence number $seqToWrite for EventHub" +
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
receivedEvents.iterator
} catch {
case e: Exception =>
@ -161,9 +175,15 @@ private[spark] class EventHubsRDD(
@DeveloperApi
override def compute(split: Partition, context: TaskContext): Iterator[EventData] = {
val ehRDDPartition = split.asInstanceOf[EventHubRDDPartition]
val progressWriter = new ProgressWriter(offsetParams.streamId, offsetParams.uid,
ehRDDPartition.eventHubNameAndPartitionID, batchTime, new Configuration(),
offsetParams.checkpointDir, offsetParams.subDirs: _*)
val progressWriter = new ProgressWriter(
offsetParams.streamId,
offsetParams.uid,
ehRDDPartition.eventHubNameAndPartitionID,
batchTime,
new Configuration(),
offsetParams.checkpointDir,
offsetParams.subDirs: _*
)
if (ehRDDPartition.fromSeq >= ehRDDPartition.untilSeq) {
processFullyConsumedPartition(ehRDDPartition, progressWriter)
} else {
@ -171,4 +191,3 @@ private[spark] class EventHubsRDD(
}
}
}

View file

@ -20,12 +20,11 @@ package org.apache.spark.eventhubscommon.rdd
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
private[spark] case class OffsetRange(
eventHubNameAndPartition: EventHubNameAndPartition,
fromOffset: Long,
fromSeq: Long,
untilSeq: Long,
offsetType: EventHubsOffsetType) {
private[spark] case class OffsetRange(eventHubNameAndPartition: EventHubNameAndPartition,
fromOffset: Long,
fromSeq: Long,
untilSeq: Long,
offsetType: EventHubsOffsetType) {
private[spark] def toTuple = (eventHubNameAndPartition, fromOffset, fromSeq, untilSeq, offsetType)
}

View file

@ -18,5 +18,7 @@
package org.apache.spark.eventhubscommon.rdd
// a helper object to avoid serializing offset store instances
private[spark] case class OffsetStoreParams(
checkpointDir: String, streamId: Int, uid: String, subDirs: String*)
private[spark] case class OffsetStoreParams(checkpointDir: String,
streamId: Int,
uid: String,
subDirs: String*)
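The two small carriers above fit together as follows: OffsetRange describes what one RDD partition should read (fromSeq exclusive, untilSeq inclusive, as the EventHubsRDD log messages state), while OffsetStoreParams tells the task where to write its progress. A hedged sketch; the enum value PreviousCheckpoint and all literals below are assumptions.

package org.apache.spark.sketches // hypothetical, for private[spark] visibility

import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes
import org.apache.spark.eventhubscommon.rdd.{ OffsetRange, OffsetStoreParams }

object OffsetRangeSketch {
  def main(args: Array[String]): Unit = {
    // read partition 0 of "sample-eventhub" starting after seq 7 (offset 42) up to seq 107
    val range = OffsetRange(EventHubNameAndPartition("sample-eventhub", 0),
                            fromOffset = 42L,
                            fromSeq = 7L,
                            untilSeq = 107L,
                            offsetType = EventHubsOffsetTypes.PreviousCheckpoint) // assumed value
    // progress files for stream 0 / connector "sample-connector" go under /tmp/progress/SampleApp
    val store = OffsetStoreParams("/tmp/progress", 0, "sample-connector", "SampleApp")
    println((range, store))
  }
}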

View file

@ -28,7 +28,9 @@ import org.apache.spark.sql.execution.streaming.Offset
// the descriptor of EventHubsBatchRecord to communicate with StreamExecution
private[streaming] case class EventHubsBatchRecord(
batchId: Long, targetSeqNums: Map[EventHubNameAndPartition, Long]) extends Offset {
batchId: Long,
targetSeqNums: Map[EventHubNameAndPartition, Long])
extends Offset {
override def json: String = JsonUtils.partitionAndSeqNum(batchId, targetSeqNums)
}
@ -37,8 +39,10 @@ private object JsonUtils {
def partitionAndSeqNum(batchId: Long, seqNums: Map[EventHubNameAndPartition, Long]): String = {
val convertedStringIndexedMap = new mutable.HashMap[String, Long]
seqNums.foreach{case (eventHubNameAndPartition, offsetAndSeqNum) =>
convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum}
seqNums.foreach {
case (eventHubNameAndPartition, offsetAndSeqNum) =>
convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum
}
Serialization.write((batchId, convertedStringIndexedMap.toMap))
}
@ -46,19 +50,23 @@ private object JsonUtils {
try {
val deserializedTuple = Serialization.read[(Int, Map[String, Long])](jsonStr)
val batchId = deserializedTuple._1
EventHubsBatchRecord(batchId, deserializedTuple._2.map{case (ehNameAndPartitionStr, seqNum) =>
(EventHubNameAndPartition.fromString(ehNameAndPartitionStr), seqNum)})
EventHubsBatchRecord(batchId, deserializedTuple._2.map {
case (ehNameAndPartitionStr, seqNum) =>
(EventHubNameAndPartition.fromString(ehNameAndPartitionStr), seqNum)
})
} catch {
case NonFatal(x) =>
throw new IllegalArgumentException(s"failed to parse $jsonStr")
}
}
def partitionOffsetAndSeqNums(
batchId: Long, offsets: Map[EventHubNameAndPartition, (Long, Long)]): String = {
def partitionOffsetAndSeqNums(batchId: Long,
offsets: Map[EventHubNameAndPartition, (Long, Long)]): String = {
val convertedStringIndexedMap = new mutable.HashMap[String, (Long, Long)]
offsets.foreach{case (eventHubNameAndPartition, offsetAndSeqNum) =>
convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum}
offsets.foreach {
case (eventHubNameAndPartition, offsetAndSeqNum) =>
convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum
}
Serialization.write((batchId, convertedStringIndexedMap))
}
}
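As a small illustration of the serialization path above, the sketch below builds an EventHubsBatchRecord and prints its json form (which delegates to JsonUtils.partitionAndSeqNum). The package name is assumed to match the source file so that the private[streaming] case class is visible; the hub name and numbers are made up.

package org.apache.spark.sql.streaming.eventhubs // assumed to be the file's own package

import org.apache.spark.eventhubscommon.EventHubNameAndPartition

object BatchRecordSketch {
  def main(args: Array[String]): Unit = {
    val record = EventHubsBatchRecord(
      batchId = 1L,
      targetSeqNums = Map(EventHubNameAndPartition("sample-eventhub", 0) -> 100L))
    // prints the (batchId, partitionName -> targetSeqNum) payload handed to StreamExecution
    println(record.json)
  }
}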

View file

@ -20,27 +20,42 @@ package org.apache.spark.sql.streaming.eventhubs
import java.util.concurrent.Executors
import java.util.concurrent.atomic.AtomicInteger
import scala.concurrent.{ExecutionContext, Future}
import scala.util.{Failure, Success}
import scala.concurrent.{ ExecutionContext, Future }
import scala.util.{ Failure, Success }
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord, RateControlUtils}
import org.apache.spark.eventhubscommon.client.{AMQPEventHubsClient, EventHubClient, EventHubsClientWrapper, RestfulEventHubClient}
import org.apache.spark.eventhubscommon.{
EventHubNameAndPartition,
EventHubsConnector,
OffsetRecord,
RateControlUtils
}
import org.apache.spark.eventhubscommon.client.{
AMQPEventHubsClient,
Client,
EventHubsClientWrapper
}
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.rdd.{EventHubsRDD, OffsetRange, OffsetStoreParams}
import org.apache.spark.eventhubscommon.rdd.{ EventHubsRDD, OffsetRange, OffsetStoreParams }
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset, Source}
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
import org.apache.spark.sql.execution.streaming.{ Offset, SerializedOffset, Source }
import org.apache.spark.sql.streaming.eventhubs.checkpoint.StructuredStreamingProgressTracker
import org.apache.spark.sql.types._
private[spark] class EventHubsSource(
sqlContext: SQLContext,
eventHubsParams: Map[String, String],
eventhubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
EventHubsClientWrapper = EventHubsClientWrapper.getEventHubReceiver,
eventhubClientCreator: (String, Map[String, Map[String, String]]) =>
EventHubClient = AMQPEventHubsClient.getInstance)
extends Source with EventHubsConnector with Logging {
eventhubReceiverCreator: (Map[String, String],
Int,
Long,
EventHubsOffsetType,
Int) => EventHubsClientWrapper =
EventHubsClientWrapper.getEventHubReceiver,
eventhubClientCreator: (String, Map[String, Map[String, String]]) => Client =
AMQPEventHubsClient.getInstance)
extends Source
with EventHubsConnector
with Logging {
case class EventHubsOffset(batchId: Long, offsets: Map[EventHubNameAndPartition, (Long, Long)])
@ -53,15 +68,15 @@ private[spark] class EventHubsSource(
require(eventHubsNamespace != null, "eventhubs.namespace is not defined")
require(eventHubsName != null, "eventhubs.name is not defined")
private var _eventHubsClient: EventHubClient = _
private var _eventHubsClient: Client = _
private var _eventHubsReceiver: (Map[String, String], Int, Long, EventHubsOffsetType, Int)
=> EventHubsClientWrapper = _
private var _eventHubsReceiver
: (Map[String, String], Int, Long, EventHubsOffsetType, Int) => EventHubsClientWrapper = _
private[eventhubs] def eventHubClient = {
if (_eventHubsClient == null) {
_eventHubsClient = eventhubClientCreator(eventHubsNamespace,
Map(eventHubsName -> eventHubsParams))
_eventHubsClient =
eventhubClientCreator(eventHubsNamespace, Map(eventHubsName -> eventHubsParams))
}
_eventHubsClient
}
@ -79,8 +94,8 @@ private[spark] class EventHubsSource(
yield EventHubNameAndPartition(eventHubsName, partitionId)).toList
}
private implicit val cleanupExecutorService = ExecutionContext.fromExecutor(
Executors.newFixedThreadPool(1))
private implicit val cleanupExecutorService =
ExecutionContext.fromExecutor(Executors.newFixedThreadPool(1))
// EventHubsSource is created for each instance of program, that means it is different with
// DStream which will load the serialized Direct DStream instance from checkpoint
@ -88,17 +103,22 @@ private[spark] class EventHubsSource(
// initialize ProgressTracker
private val progressTracker = StructuredStreamingProgressTracker.initInstance(
uid, eventHubsParams("eventhubs.progressTrackingDir"), sqlContext.sparkContext.appName,
uid,
eventHubsParams("eventhubs.progressTrackingDir"),
sqlContext.sparkContext.appName,
sqlContext.sparkContext.hadoopConfiguration)
private[spark] def setEventHubClient(eventHubClient: EventHubClient): EventHubsSource = {
private[spark] def setEventHubClient(eventHubClient: Client): EventHubsSource = {
_eventHubsClient = eventHubClient
this
}
private[spark] def setEventHubsReceiver(
eventhubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
EventHubsClientWrapper): EventHubsSource = {
eventhubReceiverCreator: (Map[String, String],
Int,
Long,
EventHubsOffsetType,
Int) => EventHubsClientWrapper): EventHubsSource = {
_eventHubsReceiver = eventhubReceiverCreator
this
}
@ -116,18 +136,20 @@ private[spark] class EventHubsSource(
EventHubsSourceProvider.sourceSchema(eventHubsParams)
}
private[spark] def composeHighestOffset(retryIfFail: Boolean):
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
RateControlUtils.fetchLatestOffset(eventHubClient,
private[spark] def composeHighestOffset(
retryIfFail: Boolean): Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
RateControlUtils.fetchLatestOffset(
eventHubClient,
retryIfFail = retryIfFail,
if (fetchedHighestOffsetsAndSeqNums == null) {
committedOffsetsAndSeqNums.offsets
} else {
fetchedHighestOffsetsAndSeqNums.offsets
}) match {
}
) match {
case Some(highestOffsets) =>
fetchedHighestOffsetsAndSeqNums = EventHubsOffset(committedOffsetsAndSeqNums.batchId,
highestOffsets)
fetchedHighestOffsetsAndSeqNums =
EventHubsOffset(committedOffsetsAndSeqNums.batchId, highestOffsets)
Some(fetchedHighestOffsetsAndSeqNums.offsets)
case _ =>
logWarning(s"failed to fetch highest offset")
@ -144,8 +166,9 @@ private[spark] class EventHubsSource(
* idea about the highest offset, we shall fail the app when rest endpoint is not responsive, and
* to prevent us from dying too much, we shall retry with 2-power interval in this case
*/
private def failAppIfRestEndpointFail = fetchedHighestOffsetsAndSeqNums == null ||
committedOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
private def failAppIfRestEndpointFail =
fetchedHighestOffsetsAndSeqNums == null ||
committedOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
private def cleanupFiles(batchIdToClean: Long): Unit = {
Future {
@ -154,8 +177,9 @@ private[spark] class EventHubsSource(
case Success(r) =>
logInfo(s"finished cleanup for batch $batchIdToClean")
case Failure(exception) =>
logWarning(s"error happened when clean up for batch $batchIdToClean," +
s" $exception")
logWarning(
s"error happened when clean up for batch $batchIdToClean," +
s" $exception")
}
}
@ -168,8 +192,9 @@ private[spark] class EventHubsSource(
*/
override def getOffset: Option[Offset] = {
val highestOffsetsOpt = composeHighestOffset(failAppIfRestEndpointFail)
require(highestOffsetsOpt.isDefined, "cannot get highest offset from rest endpoint of" +
" eventhubs")
require(highestOffsetsOpt.isDefined,
"cannot get highest offset from rest endpoint of" +
" eventhubs")
if (!firstBatch) {
// committedOffsetsAndSeqNums.batchId is always no larger than the latest finished batch id
val lastCommittedBatchId = committedOffsetsAndSeqNums.batchId
@ -182,11 +207,17 @@ private[spark] class EventHubsSource(
firstBatch = false
}
val targetOffsets = RateControlUtils.clamp(committedOffsetsAndSeqNums.offsets,
highestOffsetsOpt.get, eventHubsParams)
Some(EventHubsBatchRecord(committedOffsetsAndSeqNums.batchId + 1,
targetOffsets.map{case (ehNameAndPartition, seqNum) =>
(ehNameAndPartition, math.min(seqNum,
fetchedHighestOffsetsAndSeqNums.offsets(ehNameAndPartition)._2))}))
highestOffsetsOpt.get,
eventHubsParams)
Some(
EventHubsBatchRecord(
committedOffsetsAndSeqNums.batchId + 1,
targetOffsets.map {
case (ehNameAndPartition, seqNum) =>
(ehNameAndPartition,
math.min(seqNum, fetchedHighestOffsetsAndSeqNums.offsets(ehNameAndPartition)._2))
}
))
}
/**
@ -200,21 +231,29 @@ private[spark] class EventHubsSource(
// a file, we need to read the latest progress file in the directory and see if we have committed
// the offsets (check if the timestamp matches) and then collect the files if necessary
progressTracker.commit(Map(uid -> committedOffsetsAndSeqNums.offsets), committedBatchId)
logInfo(s"committed offsets of batch $committedBatchId, collectedCommits:" +
s" $committedOffsetsAndSeqNums")
logInfo(
s"committed offsets of batch $committedBatchId, collectedCommits:" +
s" $committedOffsetsAndSeqNums")
}
private def fetchEndingOffsetOfLastBatch(committedBatchId: Long) = {
val startOffsetOfUndergoingBatch = progressTracker.collectProgressRecordsForBatch(
committedBatchId, List(this))
val startOffsetOfUndergoingBatch =
progressTracker.collectProgressRecordsForBatch(committedBatchId, List(this))
if (startOffsetOfUndergoingBatch.isEmpty) {
// first batch, take the initial value of the offset, -1
EventHubsOffset(committedBatchId, committedOffsetsAndSeqNums.offsets)
} else {
EventHubsOffset(committedBatchId,
startOffsetOfUndergoingBatch.filter { case (connectorUID, _) =>
connectorUID == uid
}.values.head.filter(_._1.eventHubName == eventHubsParams("eventhubs.name")))
EventHubsOffset(
committedBatchId,
startOffsetOfUndergoingBatch
.filter {
case (connectorUID, _) =>
connectorUID == uid
}
.values
.head
.filter(_._1.eventHubName == eventHubsParams("eventhubs.name"))
)
}
}
@ -225,24 +264,28 @@ private[spark] class EventHubsSource(
require(startSeqs.isDefined, s"cannot fetch start seqs for eventhubs $eventHubsName")
committedOffsetsAndSeqNums = EventHubsOffset(-1, committedOffsetsAndSeqNums.offsets.map {
case (ehNameAndPartition, (offset, _)) =>
(ehNameAndPartition, (offset, startSeqs.get(ehNameAndPartition)))})
RateControlUtils.validateFilteringParams(eventHubClient, eventHubsParams,
ehNameAndPartitions)
(ehNameAndPartition, (offset, startSeqs.get(ehNameAndPartition)))
})
RateControlUtils.validateFilteringParams(eventHubClient,
eventHubsParams,
ehNameAndPartitions)
RateControlUtils.composeFromOffsetWithFilteringParams(eventHubsParams,
committedOffsetsAndSeqNums.offsets)
committedOffsetsAndSeqNums.offsets)
} else {
Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)]()
}
}
endOffset.targetSeqNums.map {
case (ehNameAndPartition, seqNum) =>
val (offsetType, offset) = RateControlUtils.calculateStartOffset(ehNameAndPartition,
filterOffsetAndType, committedOffsetsAndSeqNums.offsets)
val (offsetType, offset) =
RateControlUtils.calculateStartOffset(ehNameAndPartition,
filterOffsetAndType,
committedOffsetsAndSeqNums.offsets)
OffsetRange(ehNameAndPartition,
fromOffset = offset,
fromSeq = committedOffsetsAndSeqNums.offsets(ehNameAndPartition)._2,
untilSeq = seqNum,
offsetType = offsetType)
fromOffset = offset,
fromSeq = committedOffsetsAndSeqNums.offsets(ehNameAndPartition)._2,
untilSeq = seqNum,
offsetType = offsetType)
}.toList
}
@ -254,7 +297,10 @@ private[spark] class EventHubsSource(
offsetRanges,
committedOffsetsAndSeqNums.batchId + 1,
OffsetStoreParams(eventHubsParams("eventhubs.progressTrackingDir"),
streamId, uid = uid, subDirs = sqlContext.sparkContext.appName, uid),
streamId,
uid = uid,
subDirs = sqlContext.sparkContext.appName,
uid),
eventHubsReceiver
)
}
@ -263,27 +309,31 @@ private[spark] class EventHubsSource(
import scala.collection.JavaConverters._
val (containsProperties, userDefinedKeys) =
EventHubsSourceProvider.ifContainsPropertiesAndUserDefinedKeys(eventHubsParams)
val rowRDD = eventHubsRDD.map(eventData =>
Row.fromSeq(Seq(eventData.getBytes, eventData.getSystemProperties.getOffset.toLong,
eventData.getSystemProperties.getSequenceNumber,
eventData.getSystemProperties.getEnqueuedTime.getEpochSecond,
eventData.getSystemProperties.getPublisher,
eventData.getSystemProperties.getPartitionKey
) ++ {
if (containsProperties) {
if (userDefinedKeys.nonEmpty) {
userDefinedKeys.map(k => {
eventData.getProperties.asScala.getOrElse(k, "").toString
})
val rowRDD = eventHubsRDD.map(
eventData =>
Row.fromSeq(Seq(
eventData.getBytes,
eventData.getSystemProperties.getOffset.toLong,
eventData.getSystemProperties.getSequenceNumber,
eventData.getSystemProperties.getEnqueuedTime.getEpochSecond,
eventData.getSystemProperties.getPublisher,
eventData.getSystemProperties.getPartitionKey
) ++ {
if (containsProperties) {
if (userDefinedKeys.nonEmpty) {
userDefinedKeys.map(k => {
eventData.getProperties.asScala.getOrElse(k, "").toString
})
} else {
Seq(eventData.getProperties.asScala.map {
case (k, v) =>
k -> (if (v == null) null else v.toString)
})
}
} else {
Seq(eventData.getProperties.asScala.map { case (k, v) =>
k -> (if (v == null) null else v.toString) })
Seq()
}
} else {
Seq()
}
}
))
}))
sqlContext.createDataFrame(rowRDD, schema)
}
@ -328,8 +378,8 @@ private[spark] class EventHubsSource(
logInfo(s"recovered from a failure, startOffset: $start, endOffset: $end")
val highestOffsets = composeHighestOffset(failAppIfRestEndpointFail)
require(highestOffsets.isDefined, "cannot get highest offsets when recovering from a failure")
fetchedHighestOffsetsAndSeqNums = EventHubsOffset(committedOffsetsAndSeqNums.batchId,
highestOffsets.get)
fetchedHighestOffsetsAndSeqNums =
EventHubsOffset(committedOffsetsAndSeqNums.batchId, highestOffsets.get)
firstBatch = false
}
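For readers who only consume this source, a hedged end-to-end sketch of wiring it into a structured streaming query follows. The option keys for namespace, name, partition count and progress directory are the ones read in this file; the policy name/key options, the projection and every value are assumptions about a typical setup rather than something this diff shows.

import org.apache.spark.sql.SparkSession

object EventHubsSourceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SampleApp").master("local[2]").getOrCreate()
    val stream = spark.readStream
      .format("eventhubs") // short name registered by EventHubsSourceProvider (next file)
      .option("eventhubs.namespace", "sample-namespace")
      .option("eventhubs.name", "sample-eventhub")
      .option("eventhubs.partition.count", "4")
      .option("eventhubs.progressTrackingDir", "/tmp/progress")
      .option("eventhubs.policyname", "samplePolicy") // assumed key
      .option("eventhubs.policykey", "sampleKey")     // assumed key
      .load()
    // the schema built by EventHubsSourceProvider.sourceSchema exposes
    // body, offset, seqNumber, enqueuedTime, publisher, partitionKey
    val query = stream
      .selectExpr("CAST(body AS STRING) AS body", "seqNumber")
      .writeStream
      .format("console")
      .start()
    query.awaitTermination()
  }
}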

View file

@ -20,28 +20,28 @@ package org.apache.spark.sql.streaming.eventhubs
import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
import org.apache.spark.sql.types._
private[sql] class EventHubsSourceProvider extends DataSourceRegister
with StreamSourceProvider with Logging {
private[sql] class EventHubsSourceProvider
extends DataSourceRegister
with StreamSourceProvider
with Logging {
override def shortName(): String = "eventhubs"
override def sourceSchema(
sqlContext: SQLContext,
schema: Option[StructType],
providerName: String,
parameters: Map[String, String]): (String, StructType) = {
override def sourceSchema(sqlContext: SQLContext,
schema: Option[StructType],
providerName: String,
parameters: Map[String, String]): (String, StructType) = {
(shortName(), EventHubsSourceProvider.sourceSchema(parameters))
}
override def createSource(
sqlContext: SQLContext,
metadataPath: String,
schema: Option[StructType],
providerName: String,
parameters: Map[String, String]): Source = {
override def createSource(sqlContext: SQLContext,
metadataPath: String,
schema: Option[StructType],
providerName: String,
parameters: Map[String, String]): Source = {
// TODO: use serviceLoader to pass in customized eventhubReceiverCreator and
// eventhubClientCreator
new EventHubsSource(sqlContext, parameters)
@ -50,10 +50,10 @@ private[sql] class EventHubsSourceProvider extends DataSourceRegister
private[sql] object EventHubsSourceProvider extends Serializable {
private[eventhubs] def ifContainsPropertiesAndUserDefinedKeys(parameters: Map[String, String]):
(Boolean, Seq[String]) = {
val containsProperties = parameters.getOrElse("eventhubs.sql.containsProperties",
"false").toBoolean
private[eventhubs] def ifContainsPropertiesAndUserDefinedKeys(
parameters: Map[String, String]): (Boolean, Seq[String]) = {
val containsProperties =
parameters.getOrElse("eventhubs.sql.containsProperties", "false").toBoolean
val userDefinedKeys = {
if (parameters.contains("eventhubs.sql.userDefinedKeys")) {
parameters("eventhubs.sql.userDefinedKeys").split(",").toSeq
@ -66,21 +66,25 @@ private[sql] object EventHubsSourceProvider extends Serializable {
def sourceSchema(parameters: Map[String, String]): StructType = {
val (containsProperties, userDefinedKeys) = ifContainsPropertiesAndUserDefinedKeys(parameters)
StructType(Seq(
StructField("body", BinaryType),
StructField("offset", LongType),
StructField("seqNumber", LongType),
StructField("enqueuedTime", LongType),
StructField("publisher", StringType),
StructField("partitionKey", StringType)
) ++ {if (containsProperties) {
if (userDefinedKeys.nonEmpty) {
userDefinedKeys.map(key => StructField(key, StringType))
} else {
Seq(StructField("properties", MapType(StringType, StringType, valueContainsNull = true)))
}
} else {
Seq()
}})
StructType(
Seq(
StructField("body", BinaryType),
StructField("offset", LongType),
StructField("seqNumber", LongType),
StructField("enqueuedTime", LongType),
StructField("publisher", StringType),
StructField("partitionKey", StringType)
) ++ {
if (containsProperties) {
if (userDefinedKeys.nonEmpty) {
userDefinedKeys.map(key => StructField(key, StringType))
} else {
Seq(
StructField("properties", MapType(StringType, StringType, valueContainsNull = true)))
}
} else {
Seq()
}
})
}
}
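Because the schema logic above branches on two options, a quick sketch of their effect may help. The package is hypothetical (the companion object is private[sql]), and the key names deviceId and creationTime are invented for illustration.

package org.apache.spark.sql.sketches // hypothetical, for private[sql] visibility

import org.apache.spark.sql.streaming.eventhubs.EventHubsSourceProvider

object SourceSchemaSketch {
  def main(args: Array[String]): Unit = {
    // default columns: body, offset, seqNumber, enqueuedTime, publisher, partitionKey
    println(EventHubsSourceProvider.sourceSchema(Map.empty[String, String]).treeString)
    // containsProperties alone adds a properties map column
    println(EventHubsSourceProvider.sourceSchema(
      Map("eventhubs.sql.containsProperties" -> "true")).treeString)
    // user defined keys become individual string columns instead of the map
    println(EventHubsSourceProvider.sourceSchema(
      Map("eventhubs.sql.containsProperties" -> "true",
          "eventhubs.sql.userDefinedKeys" -> "deviceId,creationTime")).treeString)
  }
}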

View file

@ -21,22 +21,22 @@ import scala.collection.mutable
import org.apache.hadoop.conf.Configuration
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector}
import org.apache.spark.eventhubscommon.progress.{PathTools, ProgressTrackerBase}
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, EventHubsConnector }
import org.apache.spark.eventhubscommon.progress.{ PathTools, ProgressTrackerBase }
private[spark] class StructuredStreamingProgressTracker private[spark](
private[spark] class StructuredStreamingProgressTracker private[spark] (
uid: String,
progressDir: String,
appName: String,
hadoopConfiguration: Configuration)
extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
private[spark] override lazy val progressDirectoryStr = PathTools.makeProgressDirectoryStr(
progressDir, appName, uid)
private[spark] override lazy val tempDirectoryStr = PathTools.makeTempDirectoryStr(progressDir,
appName, uid)
private[spark] override lazy val metadataDirectoryStr = PathTools.makeMetadataDirectoryStr(
progressDir, appName, uid)
private[spark] override lazy val progressDirectoryStr =
PathTools.makeProgressDirectoryStr(progressDir, appName, uid)
private[spark] override lazy val tempDirectoryStr =
PathTools.makeTempDirectoryStr(progressDir, appName, uid)
private[spark] override lazy val metadataDirectoryStr =
PathTools.makeMetadataDirectoryStr(progressDir, appName, uid)
override def eventHubNameAndPartitions: Map[String, List[EventHubNameAndPartition]] = {
val connector = StructuredStreamingProgressTracker.registeredConnectors(uid)
@ -67,7 +67,8 @@ private[spark] class StructuredStreamingProgressTracker private[spark](
if (latestFile.isDefined) {
logWarning(s"latest progress file ${latestFile.get} corrupt, rebuild file...")
val latestFileTimestamp = fromPathToTimestamp(latestFile.get)
val progressRecords = collectProgressRecordsForBatch(latestFileTimestamp,
val progressRecords = collectProgressRecordsForBatch(
latestFileTimestamp,
List(StructuredStreamingProgressTracker.registeredConnectors(uid)))
commit(progressRecords, latestFileTimestamp)
}
@ -111,9 +112,10 @@ object StructuredStreamingProgressTracker {
this.synchronized {
// DirectDStream shall have singleton progress tracker
if (_progressTrackers.get(uid).isEmpty) {
_progressTrackers += uid -> new StructuredStreamingProgressTracker(uid, progressDirStr,
appName,
hadoopConfiguration)
_progressTrackers += uid -> new StructuredStreamingProgressTracker(uid,
progressDirStr,
appName,
hadoopConfiguration)
}
_progressTrackers(uid).init()
}

View file

@ -17,22 +17,26 @@
package org.apache.spark.streaming.eventhubs
import java.io.{IOException, ObjectInputStream}
import java.io.{ IOException, ObjectInputStream }
import scala.collection.mutable
import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.eventhubscommon._
import org.apache.spark.eventhubscommon.client.{AMQPEventHubsClient, EventHubClient, EventHubsClientWrapper}
import org.apache.spark.eventhubscommon.client.{
AMQPEventHubsClient,
Client,
EventHubsClientWrapper
}
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.rdd.{EventHubsRDD, OffsetRange, OffsetStoreParams}
import org.apache.spark.eventhubscommon.rdd.{ EventHubsRDD, OffsetRange, OffsetStoreParams }
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{StreamingContext, Time}
import org.apache.spark.streaming.dstream.{DStreamCheckpointData, InputDStream}
import org.apache.spark.streaming.{ StreamingContext, Time }
import org.apache.spark.streaming.dstream.{ DStreamCheckpointData, InputDStream }
import org.apache.spark.streaming.eventhubs.checkpoint._
import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo}
import org.apache.spark.streaming.scheduler.{ RateController, StreamInputInfo }
import org.apache.spark.streaming.scheduler.rate.RateEstimator
import org.apache.spark.util.Utils
@ -49,11 +53,17 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
private[eventhubs] val eventHubNameSpace: String,
progressDir: String,
eventhubsParams: Map[String, Map[String, String]],
eventhubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
EventHubsClientWrapper = EventHubsClientWrapper.getEventHubReceiver,
eventhubClientCreator: (String, Map[String, Map[String, String]]) =>
EventHubClient = AMQPEventHubsClient.getInstance)
extends InputDStream[EventData](_ssc) with EventHubsConnector with Logging {
eventhubReceiverCreator: (Map[String, String],
Int,
Long,
EventHubsOffsetType,
Int) => EventHubsClientWrapper =
EventHubsClientWrapper.getEventHubReceiver,
eventhubClientCreator: (String, Map[String, Map[String, String]]) => Client =
AMQPEventHubsClient.getInstance)
extends InputDStream[EventData](_ssc)
with EventHubsConnector
with Logging {
private[streaming] override def name: String = s"EventHub direct stream [$id]"
@ -67,8 +77,8 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
private val eventhubNameAndPartitions = {
for (eventHubName <- eventhubsParams.keySet;
partitionId <- 0 until eventhubsParams(eventHubName)(
"eventhubs.partition.count").toInt) yield EventHubNameAndPartition(eventHubName, partitionId)
partitionId <- 0 until eventhubsParams(eventHubName)("eventhubs.partition.count").toInt)
yield EventHubNameAndPartition(eventHubName, partitionId)
}
// uniquely identify the entities in eventhubs side, it can be the namespace or the name of a
@ -87,16 +97,15 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
} else {
None
}
*/
*/
}
@transient private var _eventHubClient: EventHubClient = _
@transient private var _eventHubClient: Client = _
private def progressTracker = DirectDStreamProgressTracker.getInstance.
asInstanceOf[DirectDStreamProgressTracker]
private def progressTracker =
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker]
private[eventhubs] def setEventHubClient(eventHubClient: EventHubClient):
EventHubDirectDStream = {
private[eventhubs] def setEventHubClient(eventHubClient: Client): EventHubDirectDStream = {
_eventHubClient = eventHubClient
this
}
@ -108,22 +117,26 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
_eventHubClient
}
private[eventhubs] var currentOffsetsAndSeqNums = OffsetRecord(-1L,
{eventhubNameAndPartitions.map{ehNameAndSpace => (ehNameAndSpace, (-1L, -1L))}.toMap})
private[eventhubs] var currentOffsetsAndSeqNums = OffsetRecord(-1L, {
eventhubNameAndPartitions.map { ehNameAndSpace =>
(ehNameAndSpace, (-1L, -1L))
}.toMap
})
private[eventhubs] var fetchedHighestOffsetsAndSeqNums: OffsetRecord = _
override def start(): Unit = {
val concurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
require(concurrentJobs == 1,
require(
concurrentJobs == 1,
"due to the limitation from eventhub, we do not allow to have multiple concurrent spark jobs")
DirectDStreamProgressTracker.initInstance(progressDir,
context.sparkContext.appName, context.sparkContext.hadoopConfiguration)
context.sparkContext.appName,
context.sparkContext.hadoopConfiguration)
ProgressTrackingListener.initInstance(ssc, progressDir)
}
override def stop(): Unit = {
logInfo("stop: stopping EventHubDirectDStream")
eventHubClient.close()
}
/**
@ -139,44 +152,50 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
*/
private def fetchStartOffsetForEachPartition(validTime: Time, fallBack: Boolean): OffsetRecord = {
val offsetRecord = progressTracker.read(
eventHubNameSpace, validTime.milliseconds - ssc.graph.batchDuration.milliseconds, fallBack)
eventHubNameSpace,
validTime.milliseconds - ssc.graph.batchDuration.milliseconds,
fallBack)
require(offsetRecord.offsets.nonEmpty, "progress file cannot be empty")
if (offsetRecord.timestamp != -1) {
OffsetRecord(math.max(ssc.graph.startTime.milliseconds, offsetRecord.timestamp),
offsetRecord.offsets)
offsetRecord.offsets)
} else {
// query start startSeqs
val startSeqs = eventHubClient.startSeqOfPartition(retryIfFail = false,
eventhubNameAndPartitions.toList)
require(startSeqs.isDefined, "We cannot get starting seq number of partitions," +
" EventHubs endpoint is not available")
OffsetRecord(math.max(ssc.graph.startTime.milliseconds, offsetRecord.timestamp),
val startSeqs =
eventHubClient.startSeqOfPartition(retryIfFail = false, eventhubNameAndPartitions.toList)
require(startSeqs.isDefined,
"We cannot get starting seq number of partitions," +
" EventHubs endpoint is not available")
OffsetRecord(
math.max(ssc.graph.startTime.milliseconds, offsetRecord.timestamp),
offsetRecord.offsets.map {
case (ehNameAndPartition, (offset, _)) =>
(ehNameAndPartition, (offset, startSeqs.get(ehNameAndPartition)))
})
}
)
}
}
private def reportInputInto(validTime: Time,
offsetRanges: List[OffsetRange], inputSize: Int): Unit = {
offsetRanges: List[OffsetRange],
inputSize: Int): Unit = {
require(inputSize >= 0, s"invalid inputSize ($inputSize) with offsetRanges: $offsetRanges")
val description = offsetRanges.map { offsetRange =>
s"eventhub: ${offsetRange.eventHubNameAndPartition}\t" +
s"starting offsets: ${offsetRange.fromOffset}" +
s"sequenceNumbers: ${offsetRange.fromSeq} to ${offsetRange.untilSeq}"
}.mkString("\n")
val description = offsetRanges
.map { offsetRange =>
s"eventhub: ${offsetRange.eventHubNameAndPartition}\t" +
s"starting offsets: ${offsetRange.fromOffset}" +
s"sequenceNumbers: ${offsetRange.fromSeq} to ${offsetRange.untilSeq}"
}
.mkString("\n")
// Copy offsetRanges to immutable.List to prevent from being modified by the user
val metadata = Map(
"offsets" -> offsetRanges,
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
val metadata =
Map("offsets" -> offsetRanges, StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
val inputInfo = StreamInputInfo(id, inputSize, metadata)
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
}
private def validatePartitions(
validTime: Time,
calculatedPartitions: List[EventHubNameAndPartition]): Unit = {
private def validatePartitions(validTime: Time,
calculatedPartitions: List[EventHubNameAndPartition]): Unit = {
if (currentOffsetsAndSeqNums != null) {
val currentPartitions = currentOffsetsAndSeqNums.offsets.keys.toList
val diff = currentPartitions.diff(calculatedPartitions)
@ -187,24 +206,26 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
}
}
private def clamp(highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)]):
Map[EventHubNameAndPartition, Long] = {
private def clamp(highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)])
: Map[EventHubNameAndPartition, Long] = {
if (rateController.isEmpty) {
RateControlUtils.clamp(currentOffsetsAndSeqNums.offsets,
fetchedHighestOffsetsAndSeqNums.offsets, eventhubsParams)
fetchedHighestOffsetsAndSeqNums.offsets,
eventhubsParams)
} else {
val estimateRateLimit = rateController.map(_.getLatestRate().toInt)
estimateRateLimit.filter(_ > 0) match {
case None =>
highestEndpoints.map{case (ehNameAndPartition, _) =>
(ehNameAndPartition, currentOffsetsAndSeqNums.offsets(ehNameAndPartition)._2)
highestEndpoints.map {
case (ehNameAndPartition, _) =>
(ehNameAndPartition, currentOffsetsAndSeqNums.offsets(ehNameAndPartition)._2)
}
case Some(allowedRate) =>
val lagPerPartition = highestEndpoints.map {
case (eventHubNameAndPartition, (_, latestSeq)) =>
eventHubNameAndPartition ->
math.max(latestSeq - currentOffsetsAndSeqNums.offsets(eventHubNameAndPartition)._2,
0)
0)
}
val totalLag = lagPerPartition.values.sum
lagPerPartition.map {
@ -229,31 +250,36 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
val filteringOffsetAndType = {
if (shouldCareEnqueueTimeOrOffset) {
// first check if the parameters are valid
RateControlUtils.validateFilteringParams(eventHubClient, eventhubsParams,
eventhubNameAndPartitions.toList)
RateControlUtils.validateFilteringParams(eventHubClient,
eventhubsParams,
eventhubNameAndPartitions.toList)
RateControlUtils.composeFromOffsetWithFilteringParams(eventhubsParams,
startOffsetInNextBatch.offsets)
startOffsetInNextBatch.offsets)
} else {
Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)]()
}
}
highestOffsets.map {
case (eventHubNameAndPartition, (_, endSeqNum)) =>
val (offsetType, offset) = RateControlUtils.calculateStartOffset(eventHubNameAndPartition,
filteringOffsetAndType, startOffsetInNextBatch.offsets)
OffsetRange(eventHubNameAndPartition,
val (offsetType, offset) =
RateControlUtils.calculateStartOffset(eventHubNameAndPartition,
filteringOffsetAndType,
startOffsetInNextBatch.offsets)
OffsetRange(
eventHubNameAndPartition,
fromOffset = offset,
fromSeq = startOffsetInNextBatch.offsets(eventHubNameAndPartition)._2,
untilSeq = math.min(clampedSeqIDs(eventHubNameAndPartition), endSeqNum),
offsetType = offsetType)
offsetType = offsetType
)
}.toList
}
private def proceedWithNonEmptyRDD(
validTime: Time,
startOffsetInNextBatch: OffsetRecord,
highestOffsetOfAllPartitions: Map[EventHubNameAndPartition, (Long, Long)]):
Option[EventHubsRDD] = {
highestOffsetOfAllPartitions: Map[EventHubNameAndPartition, (Long, Long)])
: Option[EventHubsRDD] = {
// normal processing
validatePartitions(validTime, startOffsetInNextBatch.offsets.keys.toList)
currentOffsetsAndSeqNums = startOffsetInNextBatch
@ -264,11 +290,15 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
eventhubsParams,
offsetRanges,
validTime.milliseconds,
OffsetStoreParams(progressDir, streamId, uid = eventHubNameSpace,
subDirs = ssc.sparkContext.appName),
eventhubReceiverCreator)
reportInputInto(validTime, offsetRanges,
offsetRanges.map(ofr => ofr.untilSeq - ofr.fromSeq).sum.toInt)
OffsetStoreParams(progressDir,
streamId,
uid = eventHubNameSpace,
subDirs = ssc.sparkContext.appName),
eventhubReceiverCreator
)
reportInputInto(validTime,
offsetRanges,
offsetRanges.map(ofr => ofr.untilSeq - ofr.fromSeq).sum.toInt)
Some(eventHubRDD)
}
@ -288,19 +318,18 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
* idea about the highest offset, we shall fail the app when rest endpoint is not responsive, and
* to prevent us from dying too much, we shall retry with 2-power interval in this case
*/
private def failAppIfRestEndpointFail = fetchedHighestOffsetsAndSeqNums == null ||
currentOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
private def failAppIfRestEndpointFail =
fetchedHighestOffsetsAndSeqNums == null ||
currentOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
private[spark] def composeHighestOffset(validTime: Time, retryIfFail: Boolean) = {
RateControlUtils.fetchLatestOffset(
eventHubClient,
retryIfFail,
if (fetchedHighestOffsetsAndSeqNums == null) {
currentOffsetsAndSeqNums.offsets
} else {
fetchedHighestOffsetsAndSeqNums.offsets
})
match {
RateControlUtils.fetchLatestOffset(eventHubClient,
retryIfFail,
if (fetchedHighestOffsetsAndSeqNums == null) {
currentOffsetsAndSeqNums.offsets
} else {
fetchedHighestOffsetsAndSeqNums.offsets
}) match {
case Some(highestOffsets) =>
fetchedHighestOffsetsAndSeqNums = OffsetRecord(validTime.milliseconds, highestOffsets)
Some(fetchedHighestOffsetsAndSeqNums.offsets)
@ -321,9 +350,10 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
require(progressTracker != null, "ProgressTracker hasn't been initialized")
var startPointRecord = fetchStartOffsetForEachPartition(validTime, !initialized)
while (startPointRecord.timestamp < validTime.milliseconds -
ssc.graph.batchDuration.milliseconds) {
logInfo(s"wait for ProgressTrackingListener to commit offsets at Batch" +
s" ${validTime.milliseconds}")
ssc.graph.batchDuration.milliseconds) {
logInfo(
s"wait for ProgressTrackingListener to commit offsets at Batch" +
s" ${validTime.milliseconds}")
graph.wait()
logInfo(s"wake up at Batch ${validTime.milliseconds} at DStream $id")
startPointRecord = fetchStartOffsetForEachPartition(validTime, !initialized)
@ -334,11 +364,13 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
// 2) when the last batch was delayed, we should catch up by detecting the latest highest
// offset
val highestOffsetOption = composeHighestOffset(validTime, failAppIfRestEndpointFail)
require(highestOffsetOption.isDefined, "We cannot get starting highest offset of partitions," +
" EventHubs endpoint is not available")
require(highestOffsetOption.isDefined,
"We cannot get starting highest offset of partitions," +
" EventHubs endpoint is not available")
logInfo(s"highestOffsetOfAllPartitions at $validTime: ${highestOffsetOption.get}")
logInfo(s"$validTime currentOffsetTimestamp: ${currentOffsetsAndSeqNums.timestamp}\t" +
s" startPointRecordTimestamp: ${startPointRecord.timestamp}")
logInfo(
s"$validTime currentOffsetTimestamp: ${currentOffsetsAndSeqNums.timestamp}\t" +
s" startPointRecordTimestamp: ${startPointRecord.timestamp}")
val rdd = proceedWithNonEmptyRDD(validTime, startPointRecord, highestOffsetOption.get)
initialized = true
rdd
@ -352,11 +384,14 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
}
private[eventhubs] class EventHubDirectDStreamCheckpointData(
eventHubDirectDStream: EventHubDirectDStream) extends DStreamCheckpointData(this) {
eventHubDirectDStream: EventHubDirectDStream)
extends DStreamCheckpointData(this) {
def batchForTime: mutable.HashMap[Time, Array[(EventHubNameAndPartition, Long, Long, Long,
EventHubsOffsetType)]] = {
data.asInstanceOf[mutable.HashMap[Time,
def batchForTime: mutable.HashMap[
Time,
Array[(EventHubNameAndPartition, Long, Long, Long, EventHubsOffsetType)]] = {
data.asInstanceOf[mutable.HashMap[
Time,
Array[(EventHubNameAndPartition, Long, Long, Long, EventHubsOffsetType)]]]
}
@ -372,32 +407,36 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
}
}
override def cleanup(time: Time): Unit = { }
override def cleanup(time: Time): Unit = {}
override def restore(): Unit = {
// we have to initialize here, otherwise there is a race condition when recovering from spark
// checkpoint
logInfo("initialized ProgressTracker")
val appName = context.sparkContext.appName
DirectDStreamProgressTracker.initInstance(progressDir, appName,
context.sparkContext.hadoopConfiguration)
batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) =>
logInfo(s"Restoring EventHubRDD for time $t ${b.mkString("[", ", ", "]")}")
generatedRDDs += t -> new EventHubsRDD(
context.sparkContext,
eventhubsParams,
b.map {case (ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType) =>
OffsetRange(ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType)}.toList,
t.milliseconds,
OffsetStoreParams(progressDir, streamId, uid = eventHubNameSpace,
subDirs = appName),
eventhubReceiverCreator)
DirectDStreamProgressTracker.initInstance(progressDir,
appName,
context.sparkContext.hadoopConfiguration)
batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach {
case (t, b) =>
logInfo(s"Restoring EventHubRDD for time $t ${b.mkString("[", ", ", "]")}")
generatedRDDs += t -> new EventHubsRDD(
context.sparkContext,
eventhubsParams,
b.map {
case (ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType) =>
OffsetRange(ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType)
}.toList,
t.milliseconds,
OffsetStoreParams(progressDir, streamId, uid = eventHubNameSpace, subDirs = appName),
eventhubReceiverCreator
)
}
}
}
private[eventhubs] class EventHubDirectDStreamRateController(id: Int, estimator: RateEstimator)
extends RateController(id, estimator) {
extends RateController(id, estimator) {
override protected def publish(rate: Long): Unit = {
// publish nothing as there is no receiver
}
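To show the configuration shape the DStream path consumes, here is a hedged sketch of the nested eventhubsParams map. Only eventhubs.partition.count is read directly in this diff; the connection-related keys and all values are assumptions, and in practice the stream would be obtained through the EventHubsUtils helpers rather than by instantiating EventHubDirectDStream directly.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }

object DirectDStreamConfigSketch {
  def main(args: Array[String]): Unit = {
    val progressDir = "/tmp/progress" // hypothetical progress tracking directory
    // one inner map per EventHubs instance, keyed by the eventhub name
    val eventhubsParams: Map[String, Map[String, String]] = Map(
      "sample-eventhub" -> Map(
        "eventhubs.namespace" -> "sample-namespace", // assumed key
        "eventhubs.name" -> "sample-eventhub",       // assumed key
        "eventhubs.partition.count" -> "4",
        "eventhubs.policyname" -> "samplePolicy",    // assumed key
        "eventhubs.policykey" -> "sampleKey"))       // assumed key
    val ssc = new StreamingContext(
      new SparkConf().setAppName("SampleApp").setMaster("local[2]"), Seconds(10))
    // an EventHubsUtils factory call would take (ssc, progressDir, eventhubsParams) here
    println((progressDir, eventhubsParams.keys, ssc.sparkContext.appName))
    ssc.stop(stopSparkContext = true)
  }
}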

View file

@ -1,162 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.eventhubs
import java.util.concurrent.ExecutorService
import com.microsoft.azure.eventhubs._
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.eventhubs.checkpoint.{DfsBasedOffsetStore, OffsetStore}
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.util.ThreadUtils
private[eventhubs] class EventHubsReceiver(
eventhubsParams: Map[String, String],
partitionId: String,
storageLevel: StorageLevel,
offsetStore: Option[OffsetStore],
receiverClient: EventHubsClientWrapper,
maximumEventRate: Int) extends Receiver[Array[Byte]](storageLevel) with Logging {
// If offset store is empty we construct one using provided parameters
val myOffsetStore: OffsetStore = offsetStore.getOrElse(new DfsBasedOffsetStore(
eventhubsParams("eventhubs.checkpoint.dir"),
eventhubsParams("eventhubs.namespace"),
eventhubsParams("eventhubs.name"),
partitionId))
/**
* A state communicates between main thread and the MessageHandler thread.
* Note we cannot use Receiver.isStopped() because there could be race condition when the
* MessageHandler thread is started the state of the receiver has not been updated yet.
*/
@volatile private var stopMessageHandler = false
/**
* The latest sequence number this receiver has seen in messages from EventHubs.
* It is used to throw away messages with backwards sequence number, to avoid duplicates
* when receiver is restarted due to transient errors.
* Note that Sequence number is monotonically increasing
*/
// private var latestSequence: Long = Long.MinValue
/** The offset to be saved after current checkpoint interval */
protected var offsetToSave: String = _
private var executorPool: ExecutorService = _
/** The last saved offset */
protected var savedOffset: String = _
def onStop() {
logInfo("Stopping EventHubsReceiver for partition " + partitionId)
stopMessageHandler = true
executorPool.shutdown()
executorPool = null
// Don't need to do anything else here. Message handling thread will check stopMessageHandler
// and close EventHubs client receiver.
}
def onStart() {
logInfo("Starting EventHubsReceiver for partition " + partitionId)
stopMessageHandler = false
executorPool = ThreadUtils.newDaemonFixedThreadPool(1, "EventHubsMessageHandler")
try {
executorPool.submit(new EventHubsMessageHandler)
} catch {
case e: Exception =>
// just in case anything is thrown (TODO: should not have anything here)
e.printStackTrace()
} finally {
executorPool.shutdown() // Just causes threads to terminate after work is done
}
}
def processReceivedMessagesInBatch(eventDataBatch: Iterable[EventData]): Unit = {
store(eventDataBatch.map(x => x.getBytes).toIterator)
val maximumSequenceNumber: Long = eventDataBatch.map(x =>
x.getSystemProperties.getSequenceNumber).reduceLeft { (x, y) => if (x > y) x else y }
// It is guaranteed by Eventhubs that the event data with the highest sequence number has
// the largest offset
offsetToSave = eventDataBatch.find(x => x.getSystemProperties.getSequenceNumber ==
maximumSequenceNumber).get.getSystemProperties.getOffset
}
// Handles EventHubs messages
private[eventhubs] class EventHubsMessageHandler() extends Runnable {
// The checkpoint interval defaults to 10 seconds if not provided
val checkpointInterval = eventhubsParams.getOrElse("eventhubs.checkpoint.interval", "10")
.toLong * 1000
var nextCheckpointTime = System.currentTimeMillis() + checkpointInterval
def run() {
logInfo("Begin EventHubsMessageHandler for partition " + partitionId)
myOffsetStore.open()
// Create an EventHubs client receiver
receiverClient.createReceiver(eventhubsParams, partitionId, myOffsetStore, maximumEventRate)
var lastMaximumSequence = 0L
while (!stopMessageHandler) {
try {
val receivedEvents = receiverClient.receive()
if (receivedEvents != null && receivedEvents.nonEmpty) {
val eventCount = receivedEvents.count(x => x.getBytes.length > 0)
val sequenceNumbers = receivedEvents.map(x =>
x.getSystemProperties.getSequenceNumber)
if (sequenceNumbers != null && sequenceNumbers.nonEmpty) {
val maximumSequenceNumber = sequenceNumbers.max
val minimumSequenceNumber = sequenceNumbers.min
val missingSequenceCount =
maximumSequenceNumber - minimumSequenceNumber - eventCount + 1
val sequenceNumberDiscontinuity = minimumSequenceNumber - (lastMaximumSequence + 1)
lastMaximumSequence = maximumSequenceNumber
logDebug(s"Partition Id: $partitionId, Event Count: $eventCount," +
s" Maximum Sequence Number: $maximumSequenceNumber, Minimum Sequence Number:" +
s" $minimumSequenceNumber," +
s" Missing Sequence Count: $missingSequenceCount," +
s" Sequence Number Discontinuity = $sequenceNumberDiscontinuity")
} else {
logDebug(s"Partition Id: $partitionId, Event Count: $eventCount")
}
processReceivedMessagesInBatch(receivedEvents)
}
val currentTime = System.currentTimeMillis()
if (currentTime >= nextCheckpointTime && offsetToSave != savedOffset) {
logInfo(s"Partition Id: $partitionId, Current Time: $currentTime," +
s" Next Checkpoint Time: $nextCheckpointTime, Saved Offset: $offsetToSave")
myOffsetStore.write(offsetToSave)
savedOffset = offsetToSave
nextCheckpointTime = currentTime + checkpointInterval
}
} catch {
case e: Throwable =>
val errorMsg = s"Error Handling Messages, ${e.getMessage}"
logError(errorMsg)
logInfo(s"recreating the receiver for partition $partitionId")
receiverClient.closeReceiver()
receiverClient.createReceiver(eventhubsParams, partitionId, myOffsetStore,
maximumEventRate)
}
}
}
}
}
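The receiver above is configured entirely through a string map. A minimal sketch of that map, with placeholder values (only the keys are taken from the code above):

// Illustrative sketch only: keys mirror the lookups in EventHubsReceiver; values are placeholders.
val receiverParams: Map[String, String] = Map(
  "eventhubs.namespace" -> "my-namespace", // placeholder namespace
  "eventhubs.name" -> "my-eventhub", // placeholder entity name
  "eventhubs.checkpoint.dir" -> "/eventhubs/checkpoints", // consumed by DfsBasedOffsetStore
  "eventhubs.checkpoint.interval" -> "10" // in seconds; defaults to 10 when omitted
)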


@ -16,19 +16,12 @@
*/
package org.apache.spark.streaming.eventhubs
import scala.collection.Map
import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.SparkConf
import org.apache.spark.eventhubscommon.client.{EventHubClient, EventHubsClientWrapper}
import org.apache.spark.eventhubscommon.client.{ Client, EventHubsClientWrapper }
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
import org.apache.spark.streaming.receiver.Receiver
object EventHubsUtils {
@ -42,66 +35,6 @@ object EventHubsUtils {
new SparkConf().registerKryoClasses(Array(classOf[EventData]))
}
// scalastyle:off
/**
* Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
* The unioned stream will receive message from all partitions of the EventHubs
*
* @param streamingContext Streaming Context object
* @param eventhubsParams a Map that contains parameters for EventHubs.
* Required parameters are:
* "eventhubs.policyname": EventHubs policy name
* "eventhubs.policykey": EventHubs policy key
* "eventhubs.namespace": EventHubs namespace
* "eventhubs.name": EventHubs name
* "eventhubs.partition.count": Number of partitions
* "eventhubs.checkpoint.dir": checkpoint directory on HDFS
*
* Optional parameters are:
* "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
* "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
* "eventhubs.filter.enqueuetime": Unix time, seconds since epoch, default to "0"
* "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
* "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
* @param storageLevel Storage level, by default it is MEMORY_ONLY
* @return ReceiverInputStream
*/
// scalastyle:on
@deprecated("this method is deprecated, please use createDirectStreams", "2.0.5")
def createUnionStream(streamingContext: StreamingContext, eventhubsParams: Map[String, String],
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER): DStream[Array[Byte]] = {
val partitionCount = eventhubsParams("eventhubs.partition.count").toInt
val streams = (0 until partitionCount).map {
i => createStream(streamingContext, eventhubsParams, i.toString, storageLevel)
}
streamingContext.union(streams)
}
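A hedged usage sketch of the deprecated createUnionStream API documented above; the parameter keys come from the scaladoc, while the app name, master, and all values are placeholders:

// Sketch only: builds a local StreamingContext and a placeholder parameter map,
// then unions one receiver stream per partition.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }

val ssc = new StreamingContext(
  new SparkConf().setAppName("eh-union-sketch").setMaster("local[4]"), Seconds(10))
val ehParams = Map(
  "eventhubs.policyname" -> "policyname",
  "eventhubs.policykey" -> "policykey",
  "eventhubs.namespace" -> "namespace",
  "eventhubs.name" -> "name",
  "eventhubs.partition.count" -> "4",
  "eventhubs.checkpoint.dir" -> "checkpointdir")
val unioned = EventHubsUtils.createUnionStream(ssc, ehParams) // one receiver per partition
unioned.map(bytes => new String(bytes, "UTF-8")).print()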
/**
* Create a single EventHubs stream that receives data from Microsoft Azure EventHubs
* A single stream only receives message from one EventHubs partition
*
* @param streamingContext Streaming Context object
* @param eventhubsParams a Map that contains parameters for EventHubs. Same as above.
* @param partitionId Partition ID
* @param storageLevel Storage level
* @param offsetStore Offset store implementation, defaults to DFSBasedOffsetStore
* @param receiverClient the EventHubs client implementation, defaults to EventHubsClientWrapper
* @return ReceiverInputStream
*/
@deprecated("this method is deprecated, please use createDirectStreams", "2.0.5")
def createStream(streamingContext: StreamingContext,
eventhubsParams: Map[String, String],
partitionId: String,
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER,
offsetStore: OffsetStore = null,
receiverClient: EventHubsClientWrapper = new EventHubsClientWrapper):
ReceiverInputDStream[Array[Byte]] = {
streamingContext.receiverStream(
getReceiver(streamingContext, eventhubsParams.toMap, partitionId,
storageLevel, Option(offsetStore), receiverClient))
}
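A similar sketch for the single-partition variant (partition "0" chosen arbitrarily; it reuses the ssc and ehParams placeholders from the previous sketch):

// Sketch only: one receiver for a single EventHubs partition.
val partitionStream = EventHubsUtils.createStream(ssc, ehParams, partitionId = "0")
partitionStream.count().print()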
/**
* create direct stream based on eventhubs
* @param ssc the streaming context this stream belongs to
@ -129,35 +62,20 @@ object EventHubsUtils {
eventHubNamespace: String,
progressDir: String,
eventParams: Predef.Map[String, Predef.Map[String, String]],
eventHubsReceiverCreator: (Predef.Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
EventHubsClientWrapper = EventHubsClientWrapper.getEventHubReceiver,
eventHubsClientCreator: (String, Predef.Map[String, Predef.Map[String, String]]) =>
EventHubClient): EventHubDirectDStream = {
val newStream = new EventHubDirectDStream(ssc, eventHubNamespace, progressDir, eventParams,
eventHubsReceiverCreator, eventHubsClientCreator)
eventHubsReceiverCreator: (Predef.Map[String, String],
Int,
Long,
EventHubsOffsetType,
Int) => EventHubsClientWrapper =
EventHubsClientWrapper.getEventHubReceiver,
eventHubsClientCreator: (String, Predef.Map[String, Predef.Map[String, String]]) => Client)
: EventHubDirectDStream = {
val newStream = new EventHubDirectDStream(ssc,
eventHubNamespace,
progressDir,
eventParams,
eventHubsReceiverCreator,
eventHubsClientCreator)
newStream
}
/**
* A helper function to get EventHubsReceiver or ReliableEventHubsReceiver based on whether
* Write Ahead Log is enabled or not ("spark.streaming.receiver.writeAheadLog.enable")
*/
private[eventhubs] def getReceiver(streamingContext: StreamingContext,
eventhubsParams: scala.collection.immutable.Map[String, String],
partitionId: String,
storageLevel: StorageLevel,
offsetStore: Option[OffsetStore],
receiverClient: EventHubsClientWrapper): Receiver[Array[Byte]] = {
val maximumEventRate = streamingContext.conf.getInt("spark.streaming.receiver.maxRate", 0)
val walEnabled = streamingContext.conf.getBoolean(
"spark.streaming.receiver.writeAheadLog.enable", defaultValue = false)
if (walEnabled) {
new ReliableEventHubsReceiver(eventhubsParams, partitionId, storageLevel, offsetStore,
receiverClient, maximumEventRate)
} else {
new EventHubsReceiver(eventhubsParams, partitionId, storageLevel, offsetStore, receiverClient,
maximumEventRate)
}
}
}
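The getReceiver helper above switches between the two receivers based on two standard Spark settings; a minimal SparkConf sketch that enables the reliable (write-ahead-log backed) receiver and caps the receive rate, with placeholder values:

import org.apache.spark.SparkConf

// Sketch only: these are the two settings consulted by getReceiver above.
val conf = new SparkConf()
  .set("spark.streaming.receiver.writeAheadLog.enable", "true") // selects ReliableEventHubsReceiver
  .set("spark.streaming.receiver.maxRate", "1000") // records per second per receiver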


@ -1,157 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.eventhubs
import java.util.concurrent.ConcurrentHashMap
import scala.collection.mutable.ArrayBuffer
import com.microsoft.azure.eventhubs._
import org.apache.spark.SparkEnv
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
import org.apache.spark.storage.{StorageLevel, StreamBlockId}
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener}
/**
* ReliableEventHubsReceiver offers the ability to reliably store data into BlockManager without
* loss.
* It is turned off by default and will be enabled when
* spark.streaming.receiver.writeAheadLog.enable is true.
*
* The difference compared to EventHubsReceiver is that the offset is updated in the persistent
* store only after the data has been reliably stored in the write-ahead log, so the potential
* data loss problem of EventHubsReceiver is eliminated.
*/
private[eventhubs]
class ReliableEventHubsReceiver(
eventhubsParams: Map[String, String],
partitionId: String,
storageLevel: StorageLevel,
offsetStore: Option[OffsetStore],
receiverClient: EventHubsClientWrapper,
maximumEventRate: Int)
extends EventHubsReceiver(
eventhubsParams, partitionId, storageLevel, offsetStore, receiverClient, maximumEventRate) {
override def onStop() {
super.onStop()
if (blockGenerator != null) {
blockGenerator.stop()
blockGenerator = null
}
if (blockOffsetMap != null) {
blockOffsetMap.clear()
blockOffsetMap = null
}
}
override def onStart() {
blockOffsetMap = new ConcurrentHashMap[StreamBlockId, String]
// Initialize the block generator for storing EventHubs message.
blockGenerator = new BlockGenerator(new GeneratedBlockHandler, streamId, SparkEnv.get.conf)
blockGenerator.start()
super.onStart()
}
override def processReceivedMessagesInBatch(eventDataBatch: Iterable[EventData]): Unit = {
val maximumSequenceNumber = eventDataBatch.map(x => x.getSystemProperties.getSequenceNumber).
reduceLeft { (x, y) => if (x > y) x else y }
val offsetMetadata = eventDataBatch.find(x =>
x.getSystemProperties.getSequenceNumber == maximumSequenceNumber).get.getSystemProperties.
getOffset
/**
* It is guaranteed by Eventhubs that the event data with the highest sequence number has the
* largest offset
*/
blockGenerator.addMultipleDataWithCallback(eventDataBatch.map(x => x.getBytes).toIterator,
offsetMetadata)
}
/**
* Store the ready-to-be-stored block and commit the related offsets to OffsetStore. This method
* will try a fixed number of times to push the block. If the push fails, the receiver is stopped.
*/
private def storeBlockAndCommitOffset(
blockId: StreamBlockId,
arrayBuffer: ArrayBuffer[_]): Unit = {
var count = 0
var pushed = false
var exception: Exception = null
while (!pushed && count < RETRY_COUNT) {
try {
store(arrayBuffer.asInstanceOf[ArrayBuffer[Array[Byte]]])
pushed = true
} catch {
case e: Exception =>
count += 1
exception = e
Thread.sleep(SECONDS_BETWEEN_RETRY * 1000)
}
}
if (pushed) {
// commit the latest offset of the block to offsetToSave; once the checkpoint interval
// passes, the offset is saved to the offset store
offsetToSave = blockOffsetMap.get(blockId)
blockOffsetMap.remove(blockId)
} else {
stop("Error while storing block into Spark", exception)
}
}
/** Class to handle blocks generated by the block generator. */
private final class GeneratedBlockHandler extends BlockGeneratorListener {
def onAddData(data: Any, metadata: Any): Unit = {
// Update the offset of the data that was added to the generator
if (metadata != null) {
val offset = metadata.asInstanceOf[String]
latestOffsetCurBlock = offset
}
}
def onGenerateBlock(blockId: StreamBlockId): Unit = {
// Remember the offsets when a block has been generated
blockOffsetMap.put(blockId, latestOffsetCurBlock)
}
def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {
// Store block and commit the blocks offset
storeBlockAndCommitOffset(blockId, arrayBuffer)
}
def onError(message: String, throwable: Throwable): Unit = {
reportError(message, throwable)
}
}
/** The block generator used to push blocks to the Spark block manager synchronously. */
private var blockGenerator: BlockGenerator = _
/** A string to store the latest offset in the current block for the current partition. */
private var latestOffsetCurBlock: String = _
/** A concurrent HashMap to store the stream block id and related offset snapshot. */
private var blockOffsetMap: ConcurrentHashMap[StreamBlockId, String] = _
private val RETRY_COUNT: Int = 10
private val SECONDS_BETWEEN_RETRY = 1
}


@ -17,7 +17,7 @@
package org.apache.spark.streaming.eventhubs.checkpoint
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.SparkContext
import org.apache.spark.internal.Logging
@ -26,15 +26,13 @@ import org.apache.spark.internal.Logging
* A DFS-based OffsetStore implementation
*/
@SerialVersionUID(1L)
class DfsBasedOffsetStore(
directory: String,
namespace: String,
name: String,
partition: String) extends OffsetStore with Logging {
class DfsBasedOffsetStore(directory: String, namespace: String, name: String, partition: String)
extends OffsetStore
with Logging {
if (!SparkContext.getOrCreate().isLocal) {
require(directory.startsWith("hdfs://") || directory.startsWith("adl://"),
"we only support to store offset in HDFS/ADLS when running Spark in non-local mode ")
"we only support to store offset in HDFS/ADLS when running Spark in non-local mode ")
}
var path: Path = _
@ -45,7 +43,6 @@ class DfsBasedOffsetStore(
/**
* Open two files, the actual checkpoint file and the backup checkpoint file
*/
override def open(): Unit = {
if (checkpointFile == null) {
path = new Path(directory + "/" + namespace + "/" + name + "/" + partition)
@ -183,4 +180,3 @@ class DfsBasedOffsetStore(
// pass
}
}
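A hedged construction sketch for the store above (placeholder paths and names; in non-local mode the directory must be an hdfs:// or adl:// path, per the require shown earlier):

// Sketch only: assumes an active SparkContext and a reachable file system.
val store = new DfsBasedOffsetStore(
  directory = "hdfs://namenode:8020/eventhubs/offsets",
  namespace = "my-namespace",
  name = "my-eventhub",
  partition = "0")
store.open()
val lastOffset = store.read() // the test suites treat "-1" as "no previous checkpoint"
store.close()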


@ -22,7 +22,11 @@ import scala.collection.mutable.ListBuffer
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs._
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord}
import org.apache.spark.eventhubscommon.{
EventHubNameAndPartition,
EventHubsConnector,
OffsetRecord
}
import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
/**
@ -36,23 +40,22 @@ import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
* @param appName the name of Spark application
* @param hadoopConfiguration the hadoop configuration instance
*/
private[spark] class DirectDStreamProgressTracker private[spark](
private[spark] class DirectDStreamProgressTracker private[spark] (
progressDir: String,
appName: String,
hadoopConfiguration: Configuration)
extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
// the lock synchronizing the read and commit operations, since they are executed in the driver
// and the listener thread, respectively.
private val driverLock = new Object
override def eventHubNameAndPartitions: Map[String, List[EventHubNameAndPartition]] = {
DirectDStreamProgressTracker.registeredConnectors.map {
connector => (connector.uid, connector.connectedInstances)
DirectDStreamProgressTracker.registeredConnectors.map { connector =>
(connector.uid, connector.connectedInstances)
}.toMap
}
private def initProgressFileDirectory(): Unit = {
try {
val fs = progressDirectoryPath.getFileSystem(hadoopConfiguration)
@ -118,10 +121,10 @@ private[spark] class DirectDStreamProgressTracker private[spark](
/**
* read the progress record for the specified namespace, streamId and timestamp
*/
override def read(namespace: String, timestamp: Long, fallBack: Boolean):
OffsetRecord = driverLock.synchronized {
super.read(namespace, timestamp, fallBack)
}
override def read(namespace: String, timestamp: Long, fallBack: Boolean): OffsetRecord =
driverLock.synchronized {
super.read(namespace, timestamp, fallBack)
}
def close(): Unit = {}
@ -143,26 +146,32 @@ private[spark] class DirectDStreamProgressTracker private[spark](
fs.delete(filePath, true)
}
}
*/
*/
// clean temp directory
val allUselessTempFiles = fs.listStatus(tempDirectoryPath, new PathFilter {
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
}).map(_.getPath)
val allUselessTempFiles = fs
.listStatus(tempDirectoryPath, new PathFilter {
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
})
.map(_.getPath)
if (allUselessTempFiles.nonEmpty) {
allUselessTempFiles.groupBy(fromPathToTimestamp).toList.sortWith((p1, p2) => p1._1 > p2._1).
tail.flatMap(_._2).foreach {
filePath => logInfo(s"delete $filePath")
fs.delete(filePath, true)
}
allUselessTempFiles
.groupBy(fromPathToTimestamp)
.toList
.sortWith((p1, p2) => p1._1 > p2._1)
.tail
.flatMap(_._2)
.foreach { filePath =>
logInfo(s"delete $filePath")
fs.delete(filePath, true)
}
}
}
/**
* commit offsetToCommit to a new progress tracking file
*/
override def commit(
offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
commitTime: Long): Unit = driverLock.synchronized {
override def commit(offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
commitTime: Long): Unit = driverLock.synchronized {
super.commit(offsetToCommit, commitTime)
}
}
@ -193,9 +202,8 @@ object DirectDStreamProgressTracker {
this.synchronized {
// DirectDStream shall have singleton progress tracker
if (_progressTracker == null) {
_progressTracker = new DirectDStreamProgressTracker(progressDirStr,
appName,
hadoopConfiguration)
_progressTracker =
new DirectDStreamProgressTracker(progressDirStr, appName, hadoopConfiguration)
}
_progressTracker.init()
}


@ -23,10 +23,6 @@ package org.apache.spark.streaming.eventhubs.checkpoint
trait OffsetStore extends Serializable {
def open(): Unit
def write(offset: String): Unit
def read() : String
def read(): String
def close(): Unit
}
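To illustrate the contract, a minimal hypothetical in-memory implementation (for local experiments only; the shipped implementation is the DfsBasedOffsetStore above):

// Hypothetical in-memory OffsetStore; not suitable for fault tolerance.
class InMemoryOffsetStore extends OffsetStore {
  @volatile private var current: String = "-1" // "-1" is used elsewhere to mean "no checkpoint yet"
  override def open(): Unit = ()
  override def write(offset: String): Unit = current = offset
  override def read(): String = current
  override def close(): Unit = ()
}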


@ -20,39 +20,46 @@ package org.apache.spark.streaming.eventhubs.checkpoint
import org.apache.spark.internal.Logging
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.eventhubs.EventHubDirectDStream
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}
import org.apache.spark.streaming.scheduler.{ StreamingListener, StreamingListenerBatchCompleted }
/**
* The listener asynchronously commits the temp checkpoint to the path that is read by the
* DStream driver. It monitors the input size to prevent empty batches from committing checkpoints.
*/
private[eventhubs] class ProgressTrackingListener private (
ssc: StreamingContext, progressDirectory: String) extends StreamingListener with Logging {
private[eventhubs] class ProgressTrackingListener private (ssc: StreamingContext,
progressDirectory: String)
extends StreamingListener
with Logging {
override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
logInfo(s"Batch ${batchCompleted.batchInfo.batchTime} completed")
val batchTime = batchCompleted.batchInfo.batchTime.milliseconds
try {
if (batchCompleted.batchInfo.outputOperationInfos.forall(_._2.failureReason.isEmpty)) {
val progressTracker = DirectDStreamProgressTracker.getInstance.
asInstanceOf[DirectDStreamProgressTracker]
val progressTracker =
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker]
// build current offsets
val allEventDStreams = DirectDStreamProgressTracker.registeredConnectors
// merge with the temp directory
val startTime = System.currentTimeMillis()
val progressInLastBatch = progressTracker.collectProgressRecordsForBatch(
batchTime, allEventDStreams.toList)
val progressInLastBatch =
progressTracker.collectProgressRecordsForBatch(batchTime, allEventDStreams.toList)
logInfo(s"progressInLastBatch $progressInLastBatch")
if (progressInLastBatch.nonEmpty) {
val contentToCommit = allEventDStreams.map {
case dstream: EventHubDirectDStream =>
(dstream.eventHubNameSpace, dstream.currentOffsetsAndSeqNums.offsets)
}.toMap.map { case (namespace, currentOffsets) =>
(namespace, currentOffsets ++ progressInLastBatch.getOrElse(namespace, Map()))
}
val contentToCommit = allEventDStreams
.map {
case dstream: EventHubDirectDStream =>
(dstream.eventHubNameSpace, dstream.currentOffsetsAndSeqNums.offsets)
}
.toMap
.map {
case (namespace, currentOffsets) =>
(namespace, currentOffsets ++ progressInLastBatch.getOrElse(namespace, Map()))
}
progressTracker.commit(contentToCommit, batchTime)
logInfo(s"commit ending offset of Batch $batchTime $contentToCommit time cost:" +
s" ${System.currentTimeMillis() - startTime}")
logInfo(
s"commit ending offset of Batch $batchTime $contentToCommit time cost:" +
s" ${System.currentTimeMillis() - startTime}")
} else {
logInfo(s"read RDD data from Checkpoint at $batchTime, skip commits")
}
@ -73,9 +80,8 @@ private[eventhubs] object ProgressTrackingListener {
private var _progressTrackerListener: ProgressTrackingListener = _
private def getOrCreateProgressTrackerListener(
ssc: StreamingContext,
progressDirectory: String) = {
private def getOrCreateProgressTrackerListener(ssc: StreamingContext,
progressDirectory: String) = {
if (_progressTrackerListener == null) {
_progressTrackerListener = new ProgressTrackingListener(ssc, progressDirectory)
ssc.scheduler.listenerBus.listeners.add(0, _progressTrackerListener)
@ -88,10 +94,8 @@ private[eventhubs] object ProgressTrackingListener {
_progressTrackerListener = null
}
def initInstance(
ssc: StreamingContext,
progressDirectory: String): ProgressTrackingListener = this.synchronized {
getOrCreateProgressTrackerListener(ssc, progressDirectory)
}
def initInstance(ssc: StreamingContext, progressDirectory: String): ProgressTrackingListener =
this.synchronized {
getOrCreateProgressTrackerListener(ssc, progressDirectory)
}
}
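For context, the class above follows Spark Streaming's StreamingListener API; a hypothetical listener that only logs completed batches (registered through the public addStreamingListener call rather than the internal listener bus) could look like this:

import org.apache.spark.streaming.scheduler.{ StreamingListener, StreamingListenerBatchCompleted }

// Hypothetical sketch: logs batch completion, commits nothing.
class BatchLoggingListener extends StreamingListener {
  override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
    println(s"Batch ${batchCompleted.batchInfo.batchTime} completed, " +
      s"records: ${batchCompleted.batchInfo.numRecords}")
  }
}
// usage: ssc.addStreamingListener(new BatchLoggingListener)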


@ -1,47 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.eventhubscommon
import org.scalatest.mock.MockitoSugar
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
class EventhubsImplicitsSuite
extends TestSuiteBase with org.scalatest.Matchers with MockitoSugar {
val ehParams = Map(
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey",
"eventhubs.namespace" -> "namespace",
"eventhubs.name" -> "name",
"eventhubs.partition.count" -> "4",
"eventhubs.checkpoint.dir" -> "checkpointdir",
"eventhubs.checkpoint.interval" -> "1000"
)
test("StreamingContext can be implicitly converted to eventhub streaming context") {
val ssc = new StreamingContext(master, framework, batchDuration)
import org.apache.spark.eventhubscommon.Implicits._
val stream = ssc.unionedEventHubStream(ehParams)
val stream2 = ssc.eventHubStream(ehParams, "0")
ssc.stop()
}
}


@ -16,10 +16,9 @@
*/
package org.apache.spark.eventhubscommon.client
import com.microsoft.azure.eventhubs._
import org.mockito.{Matchers, Mockito}
import org.mockito.{ Matchers, Mockito }
import org.mockito.Mockito._
import org.scalatest.{BeforeAndAfter, FunSuite}
import org.scalatest.{ BeforeAndAfter, FunSuite }
import org.scalatest.mock.MockitoSugar
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
@ -42,83 +41,65 @@ class EventHubsClientWrapperSuite extends FunSuite with BeforeAndAfter with Mock
)
before {
ehClientWrapperMock = spy(new EventHubsClientWrapper)
ehClientWrapperMock = spy(new EventHubsClientWrapper(ehParams))
offsetStoreMock = mock[OffsetStore]
}
test("EventHubsClientWrapper converts parameters correctly when offset was previously saved") {
Mockito.when(offsetStoreMock.read()).thenReturn("2147483647")
Mockito.doNothing().when(ehClientWrapperMock).createReceiverInternal(
Matchers.anyString,
Matchers.anyString,
Matchers.anyString,
Matchers.anyString,
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.PreviousCheckpoint),
Matchers.anyString,
Matchers.anyLong)
Mockito
.doNothing()
.when(ehClientWrapperMock)
.createReceiverInternal(
Matchers.anyString,
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.PreviousCheckpoint),
Matchers.anyString)
ehClientWrapperMock.createReceiver(ehParams, "4", offsetStoreMock, 999)
verify(ehClientWrapperMock, times(1)).createReceiverInternal(
Matchers.eq("Endpoint=amqps://namespace.servicebus.windows.net;EntityPath=name;" +
"SharedAccessKeyName=policyname;" +
"SharedAccessKey=policykey;OperationTimeout=PT1M;RetryPolicy=Default"),
Matchers.anyString,
Matchers.eq(EventHubClient.DEFAULT_CONSUMER_GROUP_NAME),
Matchers.eq("4"),
Matchers.eq(EventHubsOffsetTypes.PreviousCheckpoint),
Matchers.eq("2147483647"),
Matchers.eq(-1L))
Matchers.eq("2147483647"))
}
test("EventHubsClientWrapper converts parameters for consumergroup") {
var ehParams2 = ehParams
ehParams2 += "eventhubs.consumergroup" -> "$consumergroup"
when(offsetStoreMock.read()).thenReturn("-1")
doNothing().when(ehClientWrapperMock).createReceiverInternal(Matchers.anyString,
Matchers.anyString,
Matchers.anyString,
Matchers.anyString,
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.None),
Matchers.anyString,
Matchers.anyLong)
doNothing()
.when(ehClientWrapperMock)
.createReceiverInternal(
Matchers.anyString,
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.None),
Matchers.anyString
)
ehClientWrapperMock.createReceiver(ehParams2, "4", offsetStoreMock, 999)
verify(ehClientWrapperMock, times(1)).createReceiverInternal(
Matchers.eq("Endpoint=amqps://namespace.servicebus.windows.net;EntityPath=name;" +
"SharedAccessKeyName=policyname;" +
"SharedAccessKey=policykey;OperationTimeout=PT1M;RetryPolicy=Default"),
Matchers.anyString,
Matchers.eq("$consumergroup"),
Matchers.eq("4"),
Matchers.eq(EventHubsOffsetTypes.None),
Matchers.eq("-1"),
Matchers.eq(-1L))
Matchers.eq("-1")
)
}
test("EventHubsClientWrapper converts parameters for enqueuetime filter") {
var ehParams2 = ehParams
ehParams2 += "eventhubs.filter.enqueuetime" -> "1433887583"
when(offsetStoreMock.read()).thenReturn("-1")
doNothing().when(ehClientWrapperMock).createReceiverInternal(
Matchers.anyString,
Matchers.anyString,
Matchers.anyString,
Matchers.anyString,
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.InputTimeOffset),
Matchers.anyString,
Matchers.anyLong)
doNothing()
.when(ehClientWrapperMock)
.createReceiverInternal(
Matchers.anyString,
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.InputTimeOffset),
Matchers.anyString
)
ehClientWrapperMock.createReceiver(ehParams2, "4", offsetStoreMock, 999)
verify(ehClientWrapperMock, times(1)).createReceiverInternal(
Matchers.eq("Endpoint=amqps://namespace.servicebus.windows.net;EntityPath=name;" +
"SharedAccessKeyName=policyname;" +
"SharedAccessKey=policykey;OperationTimeout=PT1M;RetryPolicy=Default"),
Matchers.anyString,
Matchers.eq(EventHubClient.DEFAULT_CONSUMER_GROUP_NAME),
Matchers.eq("4"),
Matchers.eq(EventHubsOffsetTypes.InputTimeOffset),
Matchers.eq("1433887583"),
Matchers.eq(-1L))
Matchers.eq("1433887583")
)
}
}
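The expected connection strings asserted above follow one fixed pattern; a hedged sketch of assembling that string from the test's ehParams (manual interpolation for illustration; the wrapper itself may build it differently):

// Illustration only: reproduces the connection-string shape asserted in the tests above.
def connectionString(params: Map[String, String]): String = {
  val namespace = params("eventhubs.namespace")
  val name = params("eventhubs.name")
  val policyName = params("eventhubs.policyname")
  val policyKey = params("eventhubs.policykey")
  s"Endpoint=amqps://$namespace.servicebus.windows.net;EntityPath=$name;" +
    s"SharedAccessKeyName=$policyName;SharedAccessKey=$policyKey;" +
    "OperationTimeout=PT1M;RetryPolicy=Default"
}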


@ -31,8 +31,8 @@ import org.apache.spark.internal.Logging
private[spark] object EventHubsTestUtilities extends Logging {
def simulateEventHubs[T, U](
eventHubsParameters: Map[String, String],
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
eventHubsParameters: Map[String, String],
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
assert(eventHubsParameters != null)
assert(eventHubsParameters.nonEmpty)
@ -45,43 +45,48 @@ private[spark] object EventHubsTestUtilities extends Logging {
yield EventHubNameAndPartition(eventHubsName, i)
}
val payloadPropertyStore = roundRobinAllocation(eventHubsPartitionList.map(x => x -> 0).toMap,
eventPayloadsAndProperties)
eventPayloadsAndProperties)
simulatedEventHubs = new SimulatedEventHubs(eventHubsNamespace, payloadPropertyStore)
simulatedEventHubs
}
def getOrSimulateEventHubs[T, U](
eventHubsParameters: Map[String, String],
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
def getOrSimulateEventHubs[T, U](eventHubsParameters: Map[String, String],
eventPayloadsAndProperties: Seq[(T, Seq[U])] =
Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
if (simulatedEventHubs == null) {
simulatedEventHubs = simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
}
simulatedEventHubs
}
def getHighestOffsetPerPartition(eventHubs: SimulatedEventHubs):
Map[EventHubNameAndPartition, (Long, Long, Long)] = {
def getHighestOffsetPerPartition(
eventHubs: SimulatedEventHubs): Map[EventHubNameAndPartition, (Long, Long, Long)] = {
eventHubs.messageStore.map {
case (ehNameAndPartition, messageQueue) => (ehNameAndPartition,
(messageQueue.length.toLong - 1, messageQueue.length.toLong - 1,
case (ehNameAndPartition, messageQueue) =>
(ehNameAndPartition,
(messageQueue.length.toLong - 1,
messageQueue.length.toLong - 1,
messageQueue.last.getSystemProperties.getEnqueuedTime.getEpochSecond))
}
}
def addEventsToEventHubs[T, U](
eventHubs: SimulatedEventHubs,
eventPayloadsAndProperties: Seq[(T, Seq[U])]): SimulatedEventHubs = {
eventHubs: SimulatedEventHubs,
eventPayloadsAndProperties: Seq[(T, Seq[U])]): SimulatedEventHubs = {
// Round-robin allocation of payloads to partitions
val payloadPropertyStore = roundRobinAllocation(eventHubs.eventHubsNamedPartitions
.map(x => x -> eventHubs.messageStore(x).length).toMap, eventPayloadsAndProperties)
val payloadPropertyStore = roundRobinAllocation(
eventHubs.eventHubsNamedPartitions
.map(x => x -> eventHubs.messageStore(x).length)
.toMap,
eventPayloadsAndProperties)
eventHubs.send(payloadPropertyStore)
eventHubs
}
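The round-robin idea used by the private helper below can be sketched standalone (hypothetical and simplified to payloads only):

// Hypothetical sketch: assign each payload to partitions 0..partitionCount-1 in turn.
def roundRobin[A](payloads: Seq[A], partitionCount: Int): Map[Int, Seq[A]] =
  payloads.zipWithIndex
    .groupBy { case (_, idx) => idx % partitionCount }
    .map { case (partition, entries) => partition -> entries.map(_._1) }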
private def roundRobinAllocation[T, U](
eventHubsPartitionOffsetMap: Map[EventHubNameAndPartition, Int],
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]):
Map[EventHubNameAndPartition, Array[EventData]] = {
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])])
: Map[EventHubNameAndPartition, Array[EventData]] = {
val eventHubsPartitionList = eventHubsPartitionOffsetMap.keys.toSeq
if (eventPayloadsAndProperties.isEmpty) {
eventHubsPartitionList.map(x => x -> Seq.empty[EventData].toArray).toMap
@ -92,45 +97,43 @@ private[spark] object EventHubsTestUtilities extends Logging {
} else {
eventPayloadsAndProperties.zipWithIndex
.map(x => (eventHubsPartitionList(x._2 % eventHubsPartitionList.length), x._1))
.groupBy(_._1).map { case (k, v) => (k, v.map(_._2)) }
.groupBy(_._1)
.map { case (k, v) => (k, v.map(_._2)) }
}.toSeq
}
eventAllocation.map {
case (eventHubNameAndPartition, payloadPropertyBag) =>
(eventHubNameAndPartition,
generateEventData(payloadPropertyBag, eventHubNameAndPartition.partitionId,
eventHubsPartitionOffsetMap(eventHubNameAndPartition)))
generateEventData(payloadPropertyBag,
eventHubNameAndPartition.partitionId,
eventHubsPartitionOffsetMap(eventHubNameAndPartition)))
}.toMap
}
}
private[spark] def generateEventData[T, U](
payloadPropertyBag: Seq[(T, Seq[U])],
partitionId: Int,
startOffset: Int): Array[EventData] = {
private[spark] def generateEventData[T, U](payloadPropertyBag: Seq[(T, Seq[U])],
partitionId: Int,
startOffset: Int): Array[EventData] = {
var offsetSetInQueue = startOffset
val eventDataArray = new Array[EventData](payloadPropertyBag.length)
val publisherName = "Microsoft Corporation"
var enqueueTime = 0L
var eventIndex = 0
for((payload, properties) <- payloadPropertyBag) {
for ((payload, properties) <- payloadPropertyBag) {
val eventData = new EventData(payload.toString.getBytes)
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME,
offsetSetInQueue.toString)
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, offsetSetInQueue.toString)
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME,
Long.box(offsetSetInQueue))
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME,
partitionId.toString)
systemPropertiesMap.put(AmqpConstants.PUBLISHER_ANNOTATION_NAME,
publisherName.toString)
Long.box(offsetSetInQueue))
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, partitionId.toString)
systemPropertiesMap.put(AmqpConstants.PUBLISHER_ANNOTATION_NAME, publisherName.toString)
systemPropertiesMap.put(AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME,
Date.from(Instant.ofEpochSecond(enqueueTime)))
Date.from(Instant.ofEpochSecond(enqueueTime)))
val systemProperties = new SystemProperties(systemPropertiesMap)
Whitebox.setInternalState(eventData, "systemProperties", systemProperties.asInstanceOf[Any])
for (property <- properties) {
property match {
case p@Tuple2(_, _) =>
case p @ Tuple2(_, _) =>
eventData.getProperties.put(p._1.toString, p._2.asInstanceOf[AnyRef])
case _ =>
eventData.getProperties.put("output", property.asInstanceOf[AnyRef])


@ -22,23 +22,26 @@ import scala.collection.mutable.ListBuffer
import com.microsoft.azure.eventhubs.EventData
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.client.{EventHubClient, EventHubsClientWrapper, EventHubsOffsetTypes}
import org.apache.spark.eventhubscommon.client.{
Client,
EventHubsClientWrapper,
EventHubsOffsetTypes
}
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.streaming.StreamingContext
class SimulatedEventHubs(
eventHubsNamespace: String,
initialData: Map[EventHubNameAndPartition, Array[EventData]]) extends Serializable {
class SimulatedEventHubs(eventHubsNamespace: String,
initialData: Map[EventHubNameAndPartition, Array[EventData]])
extends Serializable {
assert(initialData != null)
var messageStore: Map[EventHubNameAndPartition, Array[EventData]] = initialData
val eventHubsNamedPartitions: Seq[EventHubNameAndPartition] = initialData.keys.toSeq
def searchWithTime(
eventHubsNamedPartition: EventHubNameAndPartition,
enqueueTime: Long,
eventCount: Int): List[EventData] = {
def searchWithTime(eventHubsNamedPartition: EventHubNameAndPartition,
enqueueTime: Long,
eventCount: Int): List[EventData] = {
val resultData = new ListBuffer[EventData]
for (msg <- messageStore(eventHubsNamedPartition)) {
if (resultData.length >= eventCount) {
@ -51,8 +54,9 @@ class SimulatedEventHubs(
resultData.toList
}
def search(eventHubsNamedPartition: EventHubNameAndPartition, eventOffset: Int, eventCount: Int):
List[EventData] = {
def search(eventHubsNamedPartition: EventHubNameAndPartition,
eventOffset: Int,
eventCount: Int): List[EventData] = {
val resultData = new ListBuffer[EventData]
for (i <- 0 until eventCount) {
// as in eventhub, offset is exclusive
@ -66,43 +70,43 @@ class SimulatedEventHubs(
def send(newData: Map[EventHubNameAndPartition, Array[EventData]]): Unit = {
val combinedData: Map[EventHubNameAndPartition, Array[EventData]] =
(messageStore.toSeq ++ newData.toSeq).groupBy(_._1)
.map{case (k, v) => (k, v.flatMap(_._2).toArray)}
(messageStore.toSeq ++ newData.toSeq)
.groupBy(_._1)
.map { case (k, v) => (k, v.flatMap(_._2).toArray) }
messageStore = combinedData
}
}
class TestEventHubsReceiver(
eventHubParameters: Map[String, String],
eventHubs: SimulatedEventHubs,
partitionId: Int,
startOffset: Long,
offsetType: EventHubsOffsetType)
extends EventHubsClientWrapper {
val eventHubName = eventHubParameters("eventhubs.name")
class TestEventHubsReceiver(ehParams: Map[String, String],
eventHubs: SimulatedEventHubs,
partitionId: Int,
startOffset: Long,
offsetType: EventHubsOffsetType)
extends EventHubsClientWrapper(ehParams) {
override def receive(expectedEventNum: Int): Iterable[EventData] = {
val eventHubName = eventHubParameters("eventhubs.name")
val eventHubName = ehParams("eventhubs.name")
if (offsetType != EventHubsOffsetTypes.InputTimeOffset) {
eventHubs.search(EventHubNameAndPartition(eventHubName, partitionId), startOffset.toInt,
expectedEventNum)
eventHubs.search(EventHubNameAndPartition(eventHubName, partitionId),
startOffset.toInt,
expectedEventNum)
} else {
eventHubs.searchWithTime(EventHubNameAndPartition(eventHubName, partitionId),
eventHubParameters("eventhubs.filter.enqueuetime").toLong, expectedEventNum)
ehParams("eventhubs.filter.enqueuetime").toLong,
expectedEventNum)
}
}
}
class SimulatedEventHubsRestClient(
eventHubs: SimulatedEventHubs) extends EventHubClient {
class SimulatedEventHubsRestClient(eventHubs: SimulatedEventHubs) extends Client {
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
Some(eventHubs.messageStore
.map(x => x._1 -> (x._2.length.toLong - 1, x._2.length.toLong - 1)))
override def endPointOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
List())
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
Some(
eventHubs.messageStore
.map(x => x._1 -> (x._2.length.toLong - 1, x._2.length.toLong - 1)))
}
override def close(): Unit = {}
@ -114,13 +118,16 @@ class SimulatedEventHubsRestClient(
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map{
ehNameAndPartition =>
(ehNameAndPartition,
eventHubs.messageStore(ehNameAndPartition).last.getSystemProperties.getEnqueuedTime.
toEpochMilli)
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
(ehNameAndPartition,
eventHubs
.messageStore(ehNameAndPartition)
.last
.getSystemProperties
.getEnqueuedTime
.toEpochMilli)
}.toMap)
}
@ -129,26 +136,26 @@ class SimulatedEventHubsRestClient(
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map {
ehNameAndPartition =>
(ehNameAndPartition, -1L)}.toMap)
override def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
(ehNameAndPartition, -1L)
}.toMap)
}
}
class TestRestEventHubClient(
latestRecords: Map[EventHubNameAndPartition, (Long, Long, Long)])
extends EventHubClient {
class TestRestEventHubClient(latestRecords: Map[EventHubNameAndPartition, (Long, Long, Long)])
extends Client {
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
Some(latestRecords.map{case (ehNameAndPartition, (offset, seq, _)) =>
(ehNameAndPartition, (offset, seq))})
override def endPointOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
List())
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
Some(latestRecords.map {
case (ehNameAndPartition, (offset, seq, _)) =>
(ehNameAndPartition, (offset, seq))
})
}
/**
@ -158,11 +165,10 @@ class TestRestEventHubClient(
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map{
ehNameAndPartition =>
(ehNameAndPartition, latestRecords(ehNameAndPartition)._3)
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
(ehNameAndPartition, latestRecords(ehNameAndPartition)._3)
}.toMap)
}
@ -173,22 +179,21 @@ class TestRestEventHubClient(
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map {
ehNameAndPartition =>
(ehNameAndPartition, -1L)}.toMap)
override def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
(ehNameAndPartition, -1L)
}.toMap)
}
}
class FragileEventHubClient private extends EventHubClient {
class FragileEventHubClient private extends Client {
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
override def endPointOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
List())
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
import FragileEventHubClient._
callIndex += 1
@ -208,8 +213,8 @@ class FragileEventHubClient private extends EventHubClient {
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map((_, Long.MaxValue)).toMap)
}
@ -220,13 +225,12 @@ class FragileEventHubClient private extends EventHubClient {
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map {
ehNameAndPartition =>
(ehNameAndPartition, -1L)}.toMap)
override def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
(ehNameAndPartition, -1L)
}.toMap)
}
}
@ -238,28 +242,27 @@ object FragileEventHubClient {
var lastBatchWhenEndpointCrashed = 0
var latestRecords: Map[EventHubNameAndPartition, (Long, Long)] = Map()
def getInstance(eventHubNameSpace: String, eventhubsParams: Map[String, Map[String, String]]):
FragileEventHubClient = {
def getInstance(eventHubNameSpace: String,
eventhubsParams: Map[String, Map[String, String]]): FragileEventHubClient = {
new FragileEventHubClient()
}
}
class FluctuatedEventHubClient(
ssc: StreamingContext,
messagesBeforeEmpty: Long,
numBatchesBeforeNewData: Int,
latestRecords: Map[EventHubNameAndPartition, (Long, Long)]) extends EventHubClient {
class FluctuatedEventHubClient(ssc: StreamingContext,
messagesBeforeEmpty: Long,
numBatchesBeforeNewData: Int,
latestRecords: Map[EventHubNameAndPartition, (Long, Long)])
extends Client {
private var callIndex = -1
override def endPointOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
override def endPointOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
List())
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
callIndex += 1
if (callIndex < numBatchesBeforeNewData) {
Some(latestRecords.map{
Some(latestRecords.map {
case (ehNameAndPartition, _) =>
(ehNameAndPartition, (messagesBeforeEmpty - 1, messagesBeforeEmpty - 1))
})
@ -277,8 +280,8 @@ class FluctuatedEventHubClient(
*/
override def lastEnqueueTimeOfPartitions(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map((_, Long.MaxValue)).toMap)
}
@ -287,13 +290,11 @@ class FluctuatedEventHubClient(
*
* @return a map from eventhubName-partition to seq
*/
override def startSeqOfPartition(
retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map {
ehNameAndPartition =>
(ehNameAndPartition, -1L)}.toMap)
override def startSeqOfPartition(retryIfFail: Boolean,
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
: Option[Map[EventHubNameAndPartition, Long]] = {
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
(ehNameAndPartition, -1L)
}.toMap)
}
}


@ -26,7 +26,7 @@ import org.apache.spark.sql.execution.streaming._
trait StreamAction
case class EventHubsAddDataMemory[A](source: MemoryStream[A], data: Seq[A])
extends EventHubsAddData {
extends EventHubsAddData {
override def toString: String = s"AddData to $source: ${data.mkString(",")}"
override def addData(query: Option[StreamExecution]): (Source, Offset) = {
@ -45,6 +45,7 @@ object EventHubsAddData {
/** A trait that can be extended when testing a source. */
trait EventHubsAddData extends StreamAction with Serializable {
/**
* Called to adding the data to a source. It should find the source to add data to from
* the active query, and then return the source object the data was added, as well as the
@ -53,11 +54,11 @@ trait EventHubsAddData extends StreamAction with Serializable {
def addData(query: Option[StreamExecution]): (Source, Offset)
}
case class AddEventHubsData[T: ClassTag, U: ClassTag](
eventHubsParameters: Map[String, String],
highestBatchId: Long = 0,
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])])
extends EventHubsAddData {
case class AddEventHubsData[T: ClassTag, U: ClassTag](eventHubsParameters: Map[String, String],
highestBatchId: Long = 0,
eventPayloadsAndProperties: Seq[(T, Seq[U])] =
Seq.empty[(T, Seq[U])])
extends EventHubsAddData {
override def addData(query: Option[StreamExecution]): (Source, Offset) = {
val sources = query.get.logicalPlan.collect {
@ -76,8 +77,9 @@ case class AddEventHubsData[T: ClassTag, U: ClassTag](
val eventHubs = EventHubsTestUtilities.getOrSimulateEventHubs(eventHubsParameters)
EventHubsTestUtilities.addEventsToEventHubs(eventHubs, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val targetOffsetPerPartition = highestOffsetPerPartition.map{
case (ehNameAndPartition, (offset, _, _)) => (ehNameAndPartition, offset)}
val targetOffsetPerPartition = highestOffsetPerPartition.map {
case (ehNameAndPartition, (offset, _, _)) => (ehNameAndPartition, offset)
}
val eventHubsBatchRecord = EventHubsBatchRecord(highestBatchId, targetOffsetPerPartition)
(eventHubsSource, eventHubsBatchRecord)
}


@ -23,10 +23,11 @@ import org.apache.spark.sql.test.SharedSQLContext
class EventHubsOffsetSuite extends OffsetSuite with SharedSQLContext {
val eventHubsBatchRecord = EventHubsBatchRecord(0L,
val eventHubsBatchRecord = EventHubsBatchRecord(
0L,
Map(EventHubNameAndPartition("eventhub", 0) -> 0L,
EventHubNameAndPartition("eventhub", 1) -> 100L,
EventHubNameAndPartition("eventhub", 2) -> 200L))
EventHubNameAndPartition("eventhub", 1) -> 100L,
EventHubNameAndPartition("eventhub", 2) -> 200L))
test("basic serialization and deserialization of Eventhubs batch record") {
@ -36,4 +37,3 @@ class EventHubsOffsetSuite extends OffsetSuite with SharedSQLContext {
assert(deserializedEventhubsBatchRecord.targetSeqNums === eventHubsBatchRecord.targetSeqNums)
}
}


@ -24,21 +24,20 @@ import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.utils._
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime}
import org.apache.spark.sql.types.{LongType, TimestampType}
import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
import org.apache.spark.sql.types.{ LongType, TimestampType }
import org.apache.spark.util.Utils
class EventHubsSourceSuite extends EventHubsStreamTest {
private def buildEventHubsParamters(
namespace: String,
name: String,
partitionCount: Int,
maxRate: Int,
containsProperties: Boolean = false,
userDefinedKeys: Option[String] = None,
enqueueTime: Option[Long] = None): Map[String, String] = {
private def buildEventHubsParamters(namespace: String,
name: String,
partitionCount: Int,
maxRate: Int,
containsProperties: Boolean = false,
userDefinedKeys: Option[String] = None,
enqueueTime: Option[Long] = None): Map[String, String] = {
Map[String, String](
"eventhubs.policyname" -> "policyName",
"eventhubs.policykey" -> "policyKey",
@ -70,18 +69,29 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
test("Verify expected offsets are correct when rate is less than the available data") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 2)
val eventPayloadsAndProperties = generateIntKeyedData(6).map{case (body, properties) =>
(body.asInstanceOf[Int], properties)}
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventPayloadsAndProperties = generateIntKeyedData(6).map {
case (body, properties) =>
(body.asInstanceOf[Int], properties)
}
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
assert(offset.batchId == 0)
offset.targetSeqNums.values.foreach(x => assert(x == 1))
@ -90,35 +100,54 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
test("Verify expected offsets are correct when rate is more than the available data") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
val eventPayloadsAndProperties = generateIntKeyedData(6)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
offsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
EventHubsOffsetTypes.PreviousCheckpoint),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
offsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
EventHubsOffsetTypes.PreviousCheckpoint),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
assert(offset.batchId == 0)
offset.targetSeqNums.values.foreach(x => assert(x == 2))
}
test("Verify expected offsets are correct when in subsequent fetch when rate is less than the" +
" available data") {
test(
"Verify expected offsets are correct when in subsequent fetch when rate is less than the" +
" available data") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties = generateIntKeyedData(10)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
// First batch
var offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
var dataFrame = eventHubsSource.getBatch(None, offset)
@ -138,16 +167,25 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
test("Verify expected dataframe size is correct when the rate is less than the available data") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 2)
val eventPayloadsAndProperties = generateIntKeyedData(6)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
@ -158,16 +196,25 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
test("Verify expected dataframe size is correct when the rate is more than the available data") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
val eventPayloadsAndProperties = generateIntKeyedData(6)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
@ -175,20 +222,30 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
assert(dataFrame.select("body").count == 6)
}
test("Verify expected dataframe size is correct in subsequent fetch when the rate is" +
" less than the available data") {
test(
"Verify expected dataframe size is correct in subsequent fetch when the rate is" +
" less than the available data") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties = generateIntKeyedData(10)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
// First batch
var offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
var dataFrame = eventHubsSource.getBatch(None, offset)
@ -203,10 +260,11 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
assert(dataFrame.select("body").count == 4)
}
test("Verify all user-defined keys show up in dataframe schema if not specify" +
" userDefinedKeys") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10,
containsProperties = true)
test(
"Verify all user-defined keys show up in dataframe schema if not specify" +
" userDefinedKeys") {
val eventHubsParameters =
buildEventHubsParamters("ns1", "eh1", 2, 10, containsProperties = true)
val eventPayloadsAndProperties = Seq(
1 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
3 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
@ -215,29 +273,50 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
9 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
11 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1)
)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
eventHubsSource.commit(offset)
val properties = dataFrame.select("properties").rdd.map(r => r.get(0)
.asInstanceOf[Map[String, String]])
assert(properties.collect().forall(propertyMap => propertyMap.keySet == Set("creationTime",
"otherUserDefinedKey")))
val properties = dataFrame
.select("properties")
.rdd
.map(
r =>
r.get(0)
.asInstanceOf[Map[String, String]])
assert(
properties
.collect()
.forall(propertyMap => propertyMap.keySet == Set("creationTime", "otherUserDefinedKey")))
}
test("Verify user-defined keys show up in dataframe schema if specify userDefinedKey") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10,
containsProperties = true, userDefinedKeys = Some("otherUserDefinedKey,"))
val eventHubsParameters =
buildEventHubsParamters("ns1",
"eh1",
2,
10,
containsProperties = true,
userDefinedKeys = Some("otherUserDefinedKey,"))
val eventPayloadsAndProperties = Seq(
1 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
3 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
@ -246,16 +325,25 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
9 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
11 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1)
)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
ehOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
ehOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
ehOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
ehOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
@ -265,90 +353,118 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
}
test("Verify null references in user-defined keys are handled correctly") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10,
containsProperties = true)
val eventHubsParameters =
buildEventHubsParamters("ns1", "eh1", 2, 10, containsProperties = true)
val eventPayloadsAndProperties = generateKeyedDataWithNullValue(6)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
eventHubsSource.commit(offset)
val sparkSession = spark
import sparkSession.implicits._
val bodyDataFrame = dataFrame.select("body")
val bodyDataFrame = dataFrame
.select("body")
.map(r => new String(r.getAs[Array[Byte]](0), "UTF-8"))
val inputArray = eventPayloadsAndProperties.map(x => x._1).toArray
val outputArray = bodyDataFrame.collect()
assert(outputArray.sorted.corresponds(inputArray.sorted) {_ == _})
assert(outputArray.sorted.corresponds(inputArray.sorted) { _ == _ })
}
test("Verify dataframe body is correct for String type") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
val eventPayloadsAndProperties = generateStringKeyedData(6)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
eventHubsSource.commit(offset)
val sparkSession = spark
import sparkSession.implicits._
val bodyDataFrame = dataFrame.select("body")
val bodyDataFrame = dataFrame
.select("body")
.map(r => new String(r.getAs[Array[Byte]](0), "UTF-8"))
val inputArray = eventPayloadsAndProperties.map(x => x._1).toArray
val outputArray = bodyDataFrame.collect()
assert(outputArray.sorted.corresponds(inputArray.sorted) {_ == _})
assert(outputArray.sorted.corresponds(inputArray.sorted) { _ == _ })
}
test("Verify dataframe body is correct for Int type") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
val eventPayloadsAndProperties = generateIntKeyedData(6)
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
val eventHubs =
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
eventHubsOffsetType),
val eventHubsSource = new EventHubsSource(
spark.sqlContext,
eventHubsParameters,
(eventHubsParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParams,
eventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(highestOffsetPerPartition))
new TestRestEventHubClient(highestOffsetPerPartition)
)
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
val dataFrame = eventHubsSource.getBatch(None, offset)
assert(dataFrame.schema == eventHubsSource.schema)
eventHubsSource.commit(offset)
val sparkSession = spark
import sparkSession.implicits._
val bodyDataFrame = dataFrame.select("body")
val bodyDataFrame = dataFrame
.select("body")
.map(r => new String(r.getAs[Array[Byte]](0), "UTF-8").toInt)
val inputArray = eventPayloadsAndProperties.map(x => x._1).toArray
val outputArray = bodyDataFrame.collect()
assert(outputArray.sorted.corresponds(inputArray.sorted) {_ == _})
assert(outputArray.sorted.corresponds(inputArray.sorted) { _ == _ })
}
private def generateInputQuery(
eventHubsParams: Map[String, String],
sparkSession: SparkSession): Dataset[_] = {
private def generateInputQuery(eventHubsParams: Map[String, String],
sparkSession: SparkSession): Dataset[_] = {
import sparkSession.implicits._
val dataSource = spark
.readStream
val dataSource = spark.readStream
.format("eventhubs")
.options(eventHubsParams)
.load()
@ -391,8 +507,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
)
}
test("Verify expected dataframe can be retrieved after data added to source in excess" +
" of the rate") {
test(
"Verify expected dataframe can be retrieved after data added to source in excess" +
" of the rate") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties = generateIntKeyedData(15)
@ -411,8 +528,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
)
}
test("Verify expected dataframe can be retrieved when more data is added to" +
" source after stream has started") {
test(
"Verify expected dataframe can be retrieved when more data is added to" +
" source after stream has started") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties1 = generateIntKeyedData(6)
@ -426,19 +544,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters),
CheckAnswer(1, 3, 5, 2, 4, 6),
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet.toLong,
eventPayloadsAndProperties2),
AddEventHubsData(eventHubsParameters,
highestBatchId.incrementAndGet.toLong,
eventPayloadsAndProperties2),
AdvanceManualClock(10),
CheckAnswer(1, 3, 5, 2, 4, 6, 3, 5, 7, 4, 6, 8),
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet.toLong,
eventPayloadsAndProperties3),
AddEventHubsData(eventHubsParameters,
highestBatchId.incrementAndGet.toLong,
eventPayloadsAndProperties3),
AdvanceManualClock(10),
CheckAnswer(1, 3, 5, 2, 4, 6, 3, 5, 7, 4, 6, 8, 4, 6, 8, 5, 7, 9)
)
}
test("Verify expected dataframe can be retrieved with data added to source after the stream" +
" has started") {
test(
"Verify expected dataframe can be retrieved with data added to source after the stream" +
" has started") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties1 = generateIntKeyedData(6)
@ -450,19 +571,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
testStream(sourceQuery)(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
CheckAnswer(),
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties1),
AddEventHubsData(eventHubsParameters,
highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties1),
AdvanceManualClock(10),
CheckAnswer(1, 3, 5, 2, 4, 6),
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties2),
AddEventHubsData(eventHubsParameters,
highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties2),
AdvanceManualClock(10),
CheckAnswer(1, 3, 5, 2, 4, 6, 3, 5, 7, 4, 6, 8)
)
}
test("Verify expected dataframe can be retrieved from different" +
" sources with same event hubs on different streams on different queries at same rate") {
test(
"Verify expected dataframe can be retrieved from different" +
" sources with same event hubs on different streams on different queries at same rate") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
@ -481,8 +605,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
)
}
test("Verify expected dataframe can be retrieved from different " +
"sources with same event hubs on different streams on different queries at different rates") {
test(
"Verify expected dataframe can be retrieved from different " +
"sources with same event hubs on different streams on different queries at different rates") {
import testImplicits._
val eventHubsParameters1 = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventHubsParameters2 = buildEventHubsParamters("ns1", "eh1", 2, 10)
@ -502,8 +627,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
)
}
test("Verify expected dataframe can be retrieved from same " +
"source on different queries") {
test(
"Verify expected dataframe can be retrieved from same " +
"source on different queries") {
import testImplicits._
val eventHubsParameters1 = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
@ -521,49 +647,53 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
)
}
test("Verify expected dataframe can be retrieved when the stream is stopped before the last" +
" batch's offset is committed") {
test(
"Verify expected dataframe can be retrieved when the stream is stopped before the last" +
" batch's offset is committed") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true, partialType = "delete"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
val thirdBatch = Seq(CheckAnswer(541 to 1000: _*))
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
}
test("Verify expected dataframe can be retrieved when the stream is stopped after the last" +
" batch's offset is committed") {
test(
"Verify expected dataframe can be retrieved when the stream is stopped after the last" +
" batch's offset is committed") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitOffset = true),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
val thirdBatch = Seq(CheckAnswer(601 to 1000: _*))
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
@ -574,20 +704,20 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true,
partialType = "partial"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StopStream(recoverStreamId = true, commitPartialOffset = true, partialType = "partial"),
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
// in structured streaming, even if metadata is not committed, we will be able to skip the
// processed data, since we will pinpoint the progress file with the recovered batch id
@ -595,26 +725,27 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
}
test("Verify expected dataframe can be retrieved when upgrading from a directory without" +
" metadata") {
test(
"Verify expected dataframe can be retrieved when upgrading from a directory without" +
" metadata") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true,
partialType = "nometadata"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StopStream(recoverStreamId = true, commitPartialOffset = true, partialType = "nometadata"),
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
// in structured streaming, even if metadata is not committed, we will be able to skip the
// processed data, since we will pinpoint the progress file with the recovered batch id
@ -627,20 +758,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true,
partialType = "deletemetadata"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StopStream(recoverStreamId = true,
commitPartialOffset = true,
partialType = "deletemetadata"),
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
// in structured streaming, even if metadata is not committed, we will be able to skip the
// processed data, since we will pinpoint the progress file with the recovered batch id
@ -648,8 +781,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
}
test("Verify expected dataframe is retrieved from starting offset" +
" on different streams on the same query") {
test(
"Verify expected dataframe is retrieved from starting offset" +
" on different streams on the same query") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties1 = generateIntKeyedData(6)
@ -664,15 +798,19 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
AdvanceManualClock(10),
CheckAnswer(1, 2, 3, 4, 5, 6),
StopStream(),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map(
"eventhubs.test.checkpointLocation" ->
s"${Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath}",
"eventhubs.test.newSink" -> "true")),
StartStream(
trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs =
Map("eventhubs.test.checkpointLocation" ->
s"${Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath}",
"eventhubs.test.newSink" -> "true")
),
AddEventHubsData(eventHubsParameters),
CheckAnswer(1, 2, 3, 4, 5, 6),
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties2),
AddEventHubsData(eventHubsParameters,
highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties2),
AdvanceManualClock(10),
AdvanceManualClock(10),
CheckAnswer(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
@ -685,28 +823,31 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
}
test("Verify expected dataframe is retrieved with windowing operation") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 40,
containsProperties = true, userDefinedKeys = Some("creationTime"))
val eventHubsParameters = buildEventHubsParamters("ns1",
"eh1",
2,
40,
containsProperties = true,
userDefinedKeys = Some("creationTime"))
val eventPayloadsAndProperties = {
for (time <- Range(0, 10))
yield testDataForWindowingOperation(100, time)
}.reduce((a, b) => a ++ b)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val sourceQuery = spark.readStream.format("eventhubs").options(eventHubsParameters).load()
import sourceQuery.sparkSession.implicits._
import org.apache.spark.sql.functions._
val windowedStream = sourceQuery.groupBy(
window(
$"creationTime".cast(TimestampType),
"3 second",
"1 second")).count().sort("window").select("count")
val windowedStream = sourceQuery
.groupBy(window($"creationTime".cast(TimestampType), "3 second", "1 second"))
.count()
.sort("window")
.select("count")
val manualClock = new StreamManualClock
val firstBatch = Seq(StartStream(trigger = ProcessingTime(1000), triggerClock = manualClock))
val clockMove = Array.fill(13)(AdvanceManualClock(1000)).toSeq
val secondBatch = Seq(
AddEventHubsData(eventHubsParameters, 12),
CheckAnswer(true, 100, 200, 300, 300, 300, 300, 300, 300, 300, 300, 200, 100))
val secondBatch =
Seq(AddEventHubsData(eventHubsParameters, 12),
CheckAnswer(true, 100, 200, 300, 300, 300, 300, 300, 300, 300, 300, 200, 100))
testStream(windowedStream, outputMode = OutputMode.Complete())(
firstBatch ++ clockMove ++ secondBatch: _*)
}
@ -728,25 +869,29 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
}
test("Verify expected dataframe is retrieved with watermarks") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 1, 1,
containsProperties = true, userDefinedKeys = Some("creationTime"))
val eventHubsParameters = buildEventHubsParamters("ns1",
"eh1",
1,
1,
containsProperties = true,
userDefinedKeys = Some("creationTime"))
val eventPayloadsAndProperties = testDataForWatermark(2)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val sourceQuery = spark.readStream.format("eventhubs").options(eventHubsParameters).load()
import sourceQuery.sparkSession.implicits._
import org.apache.spark.sql.functions._
val windowedStream = sourceQuery.selectExpr(
"CAST(creationTime AS TIMESTAMP) as creationTimeT").
withWatermark("creationTimeT", "5 second").
groupBy(window($"creationTimeT", "3 second", "1 second")).
count().select("count")
val windowedStream = sourceQuery
.selectExpr("CAST(creationTime AS TIMESTAMP) as creationTimeT")
.withWatermark("creationTimeT", "5 second")
.groupBy(window($"creationTimeT", "3 second", "1 second"))
.count()
.select("count")
val manualClock = new StreamManualClock
val firstBatch = Seq(StartStream(trigger = ProcessingTime(1000), triggerClock = manualClock))
val clockMove = Array.fill(35)(AdvanceManualClock(1000)).toSeq
val secondBatch = Seq(
AddEventHubsData(eventHubsParameters, 35),
CheckAnswer(true, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6))
val secondBatch =
Seq(AddEventHubsData(eventHubsParameters, 35),
CheckAnswer(true, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6))
testStream(windowedStream, outputMode = OutputMode.Append())(
firstBatch ++ clockMove ++ secondBatch: _*)
}
@ -763,20 +908,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
AddEventHubsData(eventHubsParameters, 2),
UpdatePartialCheck(
EventHubsBatchRecord(0,
Map(EventHubNameAndPartition("eh1", 1) -> 2, EventHubNameAndPartition("eh1", 0) -> 2))),
Map(EventHubNameAndPartition("eh1", 1) -> 2,
EventHubNameAndPartition("eh1", 0) -> 2))),
CheckAnswer(true, false, 7, 8, 9, 10, 11, 12),
// in the second batch we have the right seq number of msgs
UpdatePartialCheck(
EventHubsBatchRecord(1,
Map(EventHubNameAndPartition("eh1", 1) -> 6, EventHubNameAndPartition("eh1", 0) -> 7))),
Map(EventHubNameAndPartition("eh1", 1) -> 6,
EventHubNameAndPartition("eh1", 0) -> 7))),
AdvanceManualClock(10),
CheckAnswer(true, false, 7, 8, 9, 10, 11, 12, 13, 14, 15)
)
}
test("Users cannot submit enqueueTime which is later than the latest in the queue") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3,
enqueueTime = Some(Long.MaxValue))
val eventHubsParameters =
buildEventHubsParamters("ns1", "eh1", 2, 3, enqueueTime = Some(Long.MaxValue))
val eventPayloadsAndProperties = generateIntKeyedData(15)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val sourceQuery = generateInputQuery(eventHubsParameters, spark)


@ -27,9 +27,9 @@ import scala.util.Random
import scala.util.control.NonFatal
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.scalatest.{Assertions, BeforeAndAfter}
import org.scalatest.concurrent.{Eventually, Timeouts}
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.scalatest.{ Assertions, BeforeAndAfter }
import org.scalatest.concurrent.{ Eventually, Timeouts }
import org.scalatest.concurrent.Eventually._
import org.scalatest.concurrent.PatienceConfiguration.Timeout
import org.scalatest.exceptions.TestFailedDueToTimeoutException
@ -41,15 +41,15 @@ import org.apache.spark.eventhubscommon.EventHubsConnector
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
import org.apache.spark.eventhubscommon.utils._
import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row}
import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.{ Dataset, Encoder, QueryTest, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.eventhubs.checkpoint.StructuredStreamingProgressTracker
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}
import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils}
import org.apache.spark.sql.test.{ SharedSQLContext, TestSparkSession }
import org.apache.spark.util.{ Clock, ManualClock, SystemClock, Utils }
/**
* A framework for implementing tests for streaming queries and sources.
@ -75,9 +75,12 @@ import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils}
* avoid hanging forever in the case of failures. However, individual suites can change this
* by overriding `streamingTimeout`.
*/
trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
with SharedSQLContext with Timeouts with Serializable {
trait EventHubsStreamTest
extends QueryTest
with BeforeAndAfter
with SharedSQLContext
with Timeouts
with Serializable {
protected val tempRoot = "/tmp"
@ -88,14 +91,14 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
override protected def createSparkSession: TestSparkSession = {
new TestSparkSession(
sparkConf.set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName).setAppName(
s"EventHubsStreamTest_${System.currentTimeMillis()}"))
sparkConf
.set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName)
.setAppName(s"EventHubsStreamTest_${System.currentTimeMillis()}"))
}
/** How long to wait for an active stream to catch up when checking a result. */
val streamingTimeout = 60 seconds
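The class comment above notes that individual suites can change this wait by overriding `streamingTimeout`. A minimal sketch of such an override (the suite name is hypothetical, not part of this change):

    import scala.concurrent.duration._

    // Hypothetical suite that needs more headroom when checking results.
    class SlowEnvironmentEventHubsSuite extends EventHubsStreamTest {
      override val streamingTimeout = 120.seconds
    }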
/** A trait for actions that can be performed while testing a streaming DataFrame. */
// trait StreamAction
@ -113,33 +116,32 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
*/
object CheckAnswer {
def apply[A : Encoder](isSort: Boolean, data: A*): CheckAnswerRows = {
def apply[A: Encoder](isSort: Boolean, data: A*): CheckAnswerRows = {
val encoder = encoderFor[A]
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
CheckAnswerRows(
data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
lastOnly = false,
isSorted = isSort)
CheckAnswerRows(data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
lastOnly = false,
isSorted = isSort)
}
def apply[A : Encoder](data: A*): CheckAnswerRows = {
def apply[A: Encoder](data: A*): CheckAnswerRows = {
val encoder = encoderFor[A]
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
CheckAnswerRows(
data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
lastOnly = false,
isSorted = false)
CheckAnswerRows(data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
lastOnly = false,
isSorted = false)
}
def apply(rows: Row*): CheckAnswerRows =
CheckAnswerRows(rows, lastOnly = false, isSorted = false)
def apply[A : Encoder](partial: Boolean, lastOnly: Boolean, rows: A*): CheckAnswerRows = {
def apply[A: Encoder](partial: Boolean, lastOnly: Boolean, rows: A*): CheckAnswerRows = {
val encoder = encoderFor[A]
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
CheckAnswerRows(
rows.map(r => toExternalRow.fromRow(encoder.toRow(r))),
isSorted = false, lastOnly = lastOnly, ifCheckPartialResult = partial)
CheckAnswerRows(rows.map(r => toExternalRow.fromRow(encoder.toRow(r))),
isSorted = false,
lastOnly = lastOnly,
ifCheckPartialResult = partial)
}
}
@ -148,29 +150,28 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
* This operation automatically blocks until all added data has been processed.
*/
object CheckLastBatch {
def apply[A : Encoder](data: A*): CheckAnswerRows = {
def apply[A: Encoder](data: A*): CheckAnswerRows = {
apply(isSorted = false, data: _*)
}
def apply[A: Encoder](isSorted: Boolean, data: A*): CheckAnswerRows = {
val encoder = encoderFor[A]
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
CheckAnswerRows(
data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
lastOnly = true,
isSorted = isSorted)
CheckAnswerRows(data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
lastOnly = true,
isSorted = isSorted)
}
def apply(rows: Row*): CheckAnswerRows = CheckAnswerRows(rows, lastOnly = true,
isSorted = false)
def apply(rows: Row*): CheckAnswerRows =
CheckAnswerRows(rows, lastOnly = true, isSorted = false)
}
case class CheckAnswerRows(
expectedAnswer: Seq[Row],
lastOnly: Boolean,
isSorted: Boolean,
ifCheckPartialResult: Boolean = false)
extends StreamAction with StreamMustBeRunning {
case class CheckAnswerRows(expectedAnswer: Seq[Row],
lastOnly: Boolean,
isSorted: Boolean,
ifCheckPartialResult: Boolean = false)
extends StreamAction
with StreamMustBeRunning {
override def toString: String = s"$operatorName: ${expectedAnswer.mkString(",")}"
private def operatorName = if (lastOnly) "CheckLastBatch" else "CheckAnswer"
}
@ -182,19 +183,20 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
commitOffset: Boolean = false,
commitPartialOffset: Boolean = false,
partialType: String = "delete")
extends StreamAction with StreamMustBeRunning
extends StreamAction
with StreamMustBeRunning
/** Starts the stream, resuming if data has already been processed. It must not be running. */
case class StartStream(trigger: Trigger = ProcessingTime(0),
triggerClock: Clock = new SystemClock,
additionalConfs: Map[String, String] = Map.empty)
extends StreamAction
extends StreamAction
/** Advance the trigger clock's time manually. */
case class AdvanceManualClock(timeToAdd: Long) extends StreamAction
/** Signals that a failure is expected and should not kill the test. */
case class ExpectFailure[T <: Throwable : ClassTag]() extends StreamAction {
case class ExpectFailure[T <: Throwable: ClassTag]() extends StreamAction {
val causeClass: Class[T] = implicitly[ClassTag[T]].runtimeClass.asInstanceOf[Class[T]]
override def toString: String = s"ExpectFailure[${causeClass.getName}]"
}
@ -207,13 +209,13 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
object Assert {
def apply(condition: => Boolean, message: String = ""): Assert = new Assert(condition, message)
def apply(message: String)(body: => Unit): Assert = new Assert( { body; true }, message)
def apply(body: => Unit): Assert = new Assert( { body; true }, "")
def apply(message: String)(body: => Unit): Assert = new Assert({ body; true }, message)
def apply(body: => Unit): Assert = new Assert({ body; true }, "")
}
/** Assert that a condition on the active query is true */
class AssertOnQuery(val condition: StreamExecution => Boolean, val message: String)
extends StreamAction {
extends StreamAction {
override def toString: String = s"AssertOnQuery(<condition>, $message)"
}
@ -228,7 +230,8 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
class StreamManualClock(@volatile var currentTime: Long = 0L)
extends ManualClock(currentTime) with Serializable {
extends ManualClock(currentTime)
with Serializable {
private var waitStartTime: Option[Long] = None
@ -280,10 +283,9 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
}
def isStreamWaitingAt(time: Long): Boolean = synchronized {waitStartTime contains time}
def isStreamWaitingAt(time: Long): Boolean = synchronized { waitStartTime contains time }
}
/**
* Executes the specified actions on the given streaming DataFrame and provides helpful
* error messages in the case of failures or incorrect answers.
@ -291,11 +293,11 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
* Note that if the stream is not explicitly started before an action that requires it to be
* running then it will be automatically started before performing any other actions.
*/
def testStream(_stream: Dataset[_],
outputMode: OutputMode = OutputMode.Append)(actions: StreamAction*): Unit = {
def testStream(_stream: Dataset[_], outputMode: OutputMode = OutputMode.Append)(
actions: StreamAction*): Unit = {
val stream = _stream.toDF()
val sparkSession = stream.sparkSession // use the session in DF, not the default session
val sparkSession = stream.sparkSession // use the session in DF, not the default session
var pos = 0
var currentStream: StreamExecution = null
var lastStream: StreamExecution = null
@ -312,14 +314,17 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
actions.takeWhile(!_.isInstanceOf[StreamMustBeRunning]).exists(_.isInstanceOf[StartStream])
val startedTest = if (startedManually) actions else StartStream() +: actions
def testActions = actions.zipWithIndex.map {
case (a, i) =>
if ((pos == i && startedManually) || (pos == (i + 1) && !startedManually)) {
"=> " + a.toString
} else {
" " + a.toString
def testActions =
actions.zipWithIndex
.map {
case (a, i) =>
if ((pos == i && startedManually) || (pos == (i + 1) && !startedManually)) {
"=> " + a.toString
} else {
" " + a.toString
}
}
}.mkString("\n")
.mkString("\n")
def currentOffsets =
if (currentStream != null) currentStream.committedOffsets.toString else "not started"
@ -385,8 +390,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
val c = Option(cause).map(exceptionToString(_))
val m = if (message != null && message.nonEmpty) Some(message) else None
fail(
s"""
fail(s"""
|${(m ++ c).mkString(": ")}
|$testState
""".stripMargin)
@ -399,20 +403,21 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
if (sources.isEmpty) {
throw new Exception("Could not find EventHubs source in the StreamExecution" +
" logical plan to add data to")
throw new Exception(
"Could not find EventHubs source in the StreamExecution" +
" logical plan to add data to")
} else if (sources.size > 1) {
throw new Exception("Could not select the EventHubs source in the StreamExecution " +
"logical plan as there" +
"are multiple EventHubs sources:\n\t" + sources.mkString("\n\t"))
throw new Exception(
"Could not select the EventHubs source in the StreamExecution " +
"logical plan as there" +
"are multiple EventHubs sources:\n\t" + sources.mkString("\n\t"))
}
sources.head
}
def createBrokenProgressFile(
progressTracker: ProgressTrackerBase[_ <: EventHubsConnector],
timestamp: Long,
brokenType: String): Unit = {
def createBrokenProgressFile(progressTracker: ProgressTrackerBase[_ <: EventHubsConnector],
timestamp: Long,
brokenType: String): Unit = {
val progressDir = progressTracker.progressDirectoryPath.toString
val metadataDir = progressTracker.metadataDirectoryPath.toString
val progressFilePath = new Path(s"$progressDir/progress-$timestamp")
@ -423,7 +428,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
fs.delete(metadataFilePath, true)
} else if (brokenType == "deletemetadata") {
fs.delete(metadataFilePath, true)
} else if (brokenType == "partial" ) {
} else if (brokenType == "partial") {
fs.delete(progressFilePath, true)
fs.delete(metadataFilePath, true)
val fsos = fs.create(progressFilePath)
@ -444,9 +449,11 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
action match {
case StartStream(trigger, triggerClock, additionalConfs) =>
verify(currentStream == null, "stream already running")
verify(triggerClock.isInstanceOf[SystemClock]
|| triggerClock.isInstanceOf[StreamManualClock],
"Use either SystemClock or StreamManualClock to start the stream")
verify(
triggerClock.isInstanceOf[SystemClock]
|| triggerClock.isInstanceOf[StreamManualClock],
"Use either SystemClock or StreamManualClock to start the stream"
)
if (triggerClock.isInstanceOf[StreamManualClock]) {
manualClockExpectedTime = triggerClock.asInstanceOf[StreamManualClock].getTimeMillis()
}
@ -461,27 +468,30 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
})
lastStream = currentStream
val createQueryMethod = sparkSession.streams.getClass.getDeclaredMethods.filter(m =>
m.getName == "createQuery").head
val createQueryMethod = sparkSession.streams.getClass.getDeclaredMethods
.filter(m => m.getName == "createQuery")
.head
createQueryMethod.setAccessible(true)
val checkpointLocation = additionalConfs.getOrElse[String](
"eventhubs.test.checkpointLocation",
metadataRoot)
val checkpointLocation =
additionalConfs.getOrElse[String]("eventhubs.test.checkpointLocation", metadataRoot)
if (additionalConfs.contains("eventhubs.test.newSink") &&
additionalConfs("eventhubs.test.newSink").toBoolean) {
additionalConfs("eventhubs.test.newSink").toBoolean) {
sink = new MemorySink(stream.schema, outputMode)
}
currentStream = createQueryMethod.invoke(
sparkSession.streams,
None,
Some(checkpointLocation),
stream,
sink,
outputMode,
Boolean.box(false), // useTempCheckpointLocation
Boolean.box(true), // recoverFromCheckpointLocation
trigger,
triggerClock).asInstanceOf[StreamExecution]
currentStream = createQueryMethod
.invoke(
sparkSession.streams,
None,
Some(checkpointLocation),
stream,
sink,
outputMode,
Boolean.box(false), // useTempCheckpointLocation
Boolean.box(true), // recoverFromCheckpointLocation
trigger,
triggerClock
)
.asInstanceOf[StreamExecution]
triggerClock match {
case smc: StreamManualClock =>
@ -489,30 +499,38 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
case _ =>
}
val activeQueriesField = sparkSession.streams.getClass.getDeclaredFields.filter(f =>
f.getName == "org$apache$spark$sql$streaming$StreamingQueryManager$$activeQueries").
head
val activeQueriesField = sparkSession.streams.getClass.getDeclaredFields
.filter(f =>
f.getName == "org$apache$spark$sql$streaming$StreamingQueryManager$$activeQueries")
.head
activeQueriesField.setAccessible(true)
val activeQueries = activeQueriesField.get(sparkSession.streams).
asInstanceOf[mutable.HashMap[UUID, StreamingQuery]]
val activeQueries = activeQueriesField
.get(sparkSession.streams)
.asInstanceOf[mutable.HashMap[UUID, StreamingQuery]]
activeQueries += currentStream.id -> currentStream
val eventHubsSource = searchCurrentSource()
val eventHubs = EventHubsTestUtilities.getOrSimulateEventHubs(null)
eventHubsSource.setEventHubClient(new SimulatedEventHubsRestClient(eventHubs))
eventHubsSource.setEventHubsReceiver(
(eventHubsParameters: Map[String, String], partitionId: Int,
startOffset: Long, offsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubsParameters, eventHubs, partitionId, startOffset,
offsetType)
(eventHubsParameters: Map[String, String],
partitionId: Int,
startOffset: Long,
offsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubsParameters,
eventHubs,
partitionId,
startOffset,
offsetType)
)
currentStream.start()
case AdvanceManualClock(timeToAdd) =>
verify(currentStream != null,
"can not advance manual clock when a stream is not running")
"can not advance manual clock when a stream is not running")
verify(currentStream.triggerClock.isInstanceOf[StreamManualClock],
s"can not advance clock of type ${currentStream.triggerClock.getClass}")
s"can not advance clock of type ${currentStream.triggerClock.getClass}")
val clock = currentStream.triggerClock.asInstanceOf[StreamManualClock]
assert(manualClockExpectedTime >= 0)
@ -523,15 +541,19 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
clock.advance(timeToAdd)
manualClockExpectedTime += timeToAdd
verify(clock.getTimeMillis() === manualClockExpectedTime,
verify(
clock.getTimeMillis() === manualClockExpectedTime,
s"Unexpected clock time after updating: " +
s"expecting $manualClockExpectedTime, current ${clock.getTimeMillis()}")
s"expecting $manualClockExpectedTime, current ${clock.getTimeMillis()}"
)
case StopStream(recoverStreamId: Boolean, commitOffset: Boolean,
commitPartialOffset: Boolean, partialType: String) =>
case StopStream(recoverStreamId: Boolean,
commitOffset: Boolean,
commitPartialOffset: Boolean,
partialType: String) =>
verify(currentStream != null, "can not stop a stream that is not running")
require(!(commitOffset && commitPartialOffset),
"cannot set both of commitOffset and commitPartialOffset as true")
"cannot set both of commitOffset and commitPartialOffset as true")
if (recoverStreamId) {
EventHubsSource.streamIdGenerator.decrementAndGet()
}
@ -548,15 +570,14 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
source.collectFinishedBatchOffsetsAndCommit(
source.committedOffsetsAndSeqNums.batchId + 1)
createBrokenProgressFile(progressTracker,
source.committedOffsetsAndSeqNums.batchId, partialType)
source.committedOffsetsAndSeqNums.batchId,
partialType)
}
verify(!currentStream.microBatchThread.isAlive,
s"microbatch thread not stopped")
verify(!currentStream.isActive,
"query.isActive() is false even after stopping")
verify(!currentStream.microBatchThread.isAlive, s"microbatch thread not stopped")
verify(!currentStream.isActive, "query.isActive() is false even after stopping")
verify(currentStream.exception.isEmpty,
s"query.exception() is not empty after clean stop: " +
currentStream.exception.map(_.toString()).getOrElse(""))
s"query.exception() is not empty after clean stop: " +
currentStream.exception.map(_.toString()).getOrElse(""))
} catch {
case _: InterruptedException =>
case _: org.scalatest.exceptions.TestFailedDueToTimeoutException =>
@ -578,12 +599,14 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
assert(!currentStream.microBatchThread.isAlive)
}
verify(currentStream.exception === Some(thrownException),
s"incorrect exception returned by query.exception()")
s"incorrect exception returned by query.exception()")
val exception = currentStream.exception.get
verify(exception.cause.getClass === ef.causeClass,
verify(
exception.cause.getClass === ef.causeClass,
"incorrect cause in exception returned by query.exception()\n" +
s"\tExpected: ${ef.causeClass}\n\tReturned: ${exception.cause.getClass}")
s"\tExpected: ${ef.causeClass}\n\tReturned: ${exception.cause.getClass}"
)
} catch {
case _: InterruptedException =>
case _: org.scalatest.exceptions.TestFailedDueToTimeoutException =>
@ -597,7 +620,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
case a: AssertOnQuery =>
verify(currentStream != null || lastStream != null,
"cannot assert when not stream has been started")
"cannot assert when not stream has been started")
val streamToAssert = Option(currentStream).getOrElse(lastStream)
verify(a.condition(streamToAssert), s"Assert on query failed: ${a.message}")
case a: Assert =>
@ -619,14 +642,17 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
// Try to find the index of the source to which data was added. Either get the index
// from the current active query or the original input logical plan.
val sourceIndex =
queryToUse.flatMap { query =>
findSourceIndex(query.logicalPlan, source)
}.orElse {
findSourceIndex(stream.logicalPlan, source)
}.getOrElse {
throw new IllegalArgumentException(
"Could find index of the source to which data was added")
}
queryToUse
.flatMap { query =>
findSourceIndex(query.logicalPlan, source)
}
.orElse {
findSourceIndex(stream.logicalPlan, source)
}
.getOrElse {
throw new IllegalArgumentException(
"Could find index of the source to which data was added")
}
// Store the expected offset of added data to wait for it later
awaiting.put(sourceIndex, offset)
} catch {
@ -640,32 +666,33 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
case CheckAnswerRows(expectedAnswer, lastOnly, isSorted, partial) =>
verify(currentStream != null, "stream not running")
// Get the map of source index to the current source objects
val indexToSource = currentStream
.logicalPlan
val indexToSource = currentStream.logicalPlan
.collect { case StreamingExecutionRelation(s, _) => s }
.zipWithIndex
.map(_.swap)
.toMap
// Block until all data added has been processed for all the source
{if (!partial) awaiting else partialAwaiting}.foreach { case (sourceIndex, offset) =>
try {
failAfter(streamingTimeout) {
currentStream.awaitOffset(indexToSource(sourceIndex), offset)
{ if (!partial) awaiting else partialAwaiting }.foreach {
case (sourceIndex, offset) =>
try {
failAfter(streamingTimeout) {
currentStream.awaitOffset(indexToSource(sourceIndex), offset)
}
} catch {
case e: Exception =>
e.printStackTrace()
throw e
}
} catch {
case e: Exception =>
e.printStackTrace()
throw e
}
}
val sparkAnswer = try if (lastOnly) sink.latestBatchData else sink.allData catch {
val sparkAnswer = try if (lastOnly) sink.latestBatchData else sink.allData
catch {
case e: Exception =>
failTest("Exception while getting data from sink", e)
}
QueryTest.sameRows(expectedAnswer, sparkAnswer, isSorted).foreach {
error => failTest(error)
QueryTest.sameRows(expectedAnswer, sparkAnswer, isSorted).foreach { error =>
failTest(error)
}
}
pos += 1
@ -691,7 +718,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
// Rollback prev configuration values
resetConfValues.foreach {
case (key, Some(value)) => sparkSession.conf.set(key, value)
case (key, None) => sparkSession.conf.unset(key)
case (key, None) => sparkSession.conf.unset(key)
}
}
}
@ -742,7 +769,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
(1 to iterations).foreach { i =>
val rand = Random.nextDouble()
if(!running) {
if (!running) {
rand match {
case r if r < 0.7 => // AddData
addRandomData()
@ -766,7 +793,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
}
}
if(!running) { actions += StartStream() }
if (!running) { actions += StartStream() }
addCheck()
testStream(ds)(actions: _*)
}
@ -783,11 +810,12 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
/** Expect awaitTermination to throw an exception */
case class ExpectException[E <: Exception]()(implicit val t: ClassTag[E])
extends ExpectedBehavior
extends ExpectedBehavior
private val DEFAULT_TEST_TIMEOUT = 1.second
def test(expectedBehavior: ExpectedBehavior, awaitTermFunc: () => Unit,
def test(expectedBehavior: ExpectedBehavior,
awaitTermFunc: () => Unit,
testTimeout: Span = DEFAULT_TEST_TIMEOUT): Unit = {
expectedBehavior match {
case ExpectNotBlocked =>
@ -814,7 +842,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
}
}
assert(thrownException.cause.getClass === e.t.runtimeClass,
"exception of incorrect type was throw")
"exception of incorrect type was throw")
}
}
}
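For orientation, a minimal usage sketch of the awaitTermination helper above (not part of this commit); `query` stands in for an already-started StreamingQuery and is an assumption of the sketch:
// Illustrative only: exercises the ExpectedBehavior helper defined above.
val query: org.apache.spark.sql.streaming.StreamingQuery = ??? // placeholder for a started query
// A bounded awaitTermination call is expected to return promptly rather than block the test thread.
test(ExpectNotBlocked, () => query.awaitTermination(10))
// For a failed query, the helper checks that the cause of the rethrown exception has the expected class.
test(ExpectException[IllegalStateException](), () => query.awaitTermination())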


@ -1,498 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.sql.streaming.eventhubs.checkpoint
import java.nio.file.Files
import java.time.Instant
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.eventhubscommon._
import org.apache.spark.eventhubscommon.progress._
import org.apache.spark.sql.streaming.eventhubs.EventHubsSource
import org.apache.spark.sql.test.SharedSQLContext
class StructuredStreamingProgressTrackerSuite extends SharedSQLContext {
test("progress directory is created properly when it does not exist") {
progressTracker = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
assert(fileSystem.exists(progressTracker.progressDirectoryPath))
}
test("progress directory is created properly when it exists") {
fileSystem.mkdirs(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName))
progressTracker = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
assert(fileSystem.exists(progressTracker.progressDirectoryPath))
}
test("temp progress is not cleaned up when partial temp progress exists") {
val tempPath = PathTools.makeTempDirectoryPath(progressRootPath.toString, appName)
fileSystem.mkdirs(tempPath)
val streamId = EventHubsSource.streamIdGenerator.get()
var tempFilePath = PathTools.makeTempFilePath(tempPath.toString,
streamId, eventhubsSource1.uid, eventhubsNamedPartitions("ns1").head, unixTimestamp)
fileSystem.create(tempFilePath)
tempFilePath = PathTools.makeTempFilePath(tempPath.toString, streamId,
eventhubsSource1.uid, eventhubsNamedPartitions("ns1").tail.head, unixTimestamp)
fileSystem.create(tempFilePath)
val filesBefore = fileSystem.listStatus(tempPath)
assert(filesBefore.size === 2)
progressTracker = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
val filesAfter = fileSystem.listStatus(tempPath)
assert(filesAfter.size === 2)
}
test("incomplete progress will not be discarded") {
// Register two eventhubs connectors to structured streaming progress tracker
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource1.uid -> eventhubsSource1
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource2.uid -> eventhubsSource2
// Progress record of all partitions of eventhubsSource1 are updated
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(Instant.now.getEpochSecond, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(Instant.now.getEpochSecond, 10L, 10L)
// Progress records of all partitions of eventhubsSource2 are not updated
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(Instant.now.getEpochSecond, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(Instant.now.getEpochSecond, 100L, 100L)
StructuredStreamingProgressTracker.initInstance(eventhubsSource1.uid,
progressRootPath.toString, appName, new Configuration())
StructuredStreamingProgressTracker.initInstance(eventhubsSource2.uid,
progressRootPath.toString, appName, new Configuration())
var progressTempPath = PathTools.makeTempDirectoryStr(progressRootPath.toString,
appName, eventhubsSource1.uid)
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp)))
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp)))
progressTempPath = PathTools.makeTempDirectoryStr(
progressRootPath.toString, appName, eventhubsSource2.uid)
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp)))
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp)))
assert(!fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp)))
}
test("start from the beginning of the streams when the latest progress file does not exist") {
// Register the two eventhubs connectors to structured streaming progress tracker
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource3.uid -> eventhubsSource3
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource4.uid -> eventhubsSource4
val progressTracker3 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource3.uid, progressRootPath.toString, appName, new Configuration())
val progressTracker4 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource4.uid, progressRootPath.toString, appName, new Configuration())
val eh3Progress = progressTracker3.read(eventhubsSource3.uid, unixTimestamp - 1000L,
fallBack = false)
val eh4Progress = progressTracker4.read(eventhubsSource4.uid, unixTimestamp - 1000L,
fallBack = false)
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances.head) === (-1L, -1L))
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances(1)) === (-1L, -1L))
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances(2)) === (-1L, -1L))
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances(3)) === (-1L, -1L))
assert(eh4Progress.offsets(eventhubsSource4.connectedInstances.head) === (-1L, -1L))
assert(eh4Progress.offsets(eventhubsSource4.connectedInstances(1)) === (-1L, -1L))
}
test("progress tracker can read back last progress correctly") {
// Register two eventhubs connectors to structured streaming progress tracker
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource1.uid -> eventhubsSource1
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource2.uid -> eventhubsSource2
// Progress record of all partitions of eventhubsSource1 are updated
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 10L, 10L)
// Progress records of all partitions of eventhubsSource2 are updated
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 100L, 100L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 200L, 200L)
val progressTracker1 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
progressTracker1.commit(
progressTracker1.collectProgressRecordsForBatch(unixTimestamp, List(eventhubsSource1)),
unixTimestamp)
val progressTracker2 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource2.uid, progressRootPath.toString, appName, new Configuration())
progressTracker2.commit(progressTracker2.collectProgressRecordsForBatch(unixTimestamp,
List(eventhubsSource2)), unixTimestamp)
val eh1Progress = progressTracker1.read(eventhubsSource1.uid, unixTimestamp,
fallBack = false)
val eh2Progress = progressTracker2.read(eventhubsSource2.uid, unixTimestamp,
fallBack = false)
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances.head) === (0L, 0L))
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances(1)) === (10L, 10L))
assert(eh2Progress.offsets(eventhubsSource2.connectedInstances.head) === (0L, 0L))
assert(eh2Progress.offsets(eventhubsSource2.connectedInstances(1)) === (100L, 100L))
assert(eh2Progress.offsets(eventhubsSource2.connectedInstances(2)) === (200L, 200L))
}
test("inconsistent timestamp in the progress tracks can be detected") {
// Register two eventhubs connectors to structured streaming progress tracker
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource1.uid -> eventhubsSource1
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource2.uid -> eventhubsSource2
// Progress record of all partitions of eventhubsSource1 are updated
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 10L, 10L)
// Progress records of all partitions of eventhubsSource2 are not updated
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 100L, 100L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp + 1000L, 200L, 200L)
val progressTracker1 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(unixTimestamp,
List(eventhubsSource1)), unixTimestamp)
val progressTracker2 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource2.uid, progressRootPath.toString, appName, new Configuration())
intercept[IllegalStateException] {
progressTracker2.commit(progressTracker2.collectProgressRecordsForBatch(unixTimestamp,
List(eventhubsSource2)), unixTimestamp)
}
}
test("latest offsets can be committed correctly and temp directory is not cleaned") {
// Register two eventhubs connectors to structured streaming progress tracker
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource1.uid -> eventhubsSource1
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource2.uid -> eventhubsSource2
// Progress record of all partitions of eventhubsSource1 are updated
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 10L, 10L)
// Progress records of all partitions of eventhubsSource2 are not updated
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 100L, 100L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
progressWriter.write(unixTimestamp, 200L, 200L)
val progressTracker1 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
unixTimestamp, List(eventhubsSource1)), unixTimestamp)
val progressTracker2 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource2.uid, progressRootPath.toString, appName, new Configuration())
progressTracker2.commit(progressTracker2.collectProgressRecordsForBatch(
unixTimestamp, List(eventhubsSource2)), unixTimestamp)
var progressTempPath = PathTools.makeTempDirectoryStr(
progressRootPath.toString, appName, eventhubsSource1.uid)
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp)))
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp)))
progressTempPath = PathTools.makeTempDirectoryStr(
progressRootPath.toString, appName, eventhubsSource2.uid)
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp)))
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp)))
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp)))
}
test("locate progress file correctly based on timestamp") {
// Register one eventhubs connector to structured streaming progress tracker
StructuredStreamingProgressTracker.registeredConnectors +=
eventhubsSource1.uid -> eventhubsSource1
// Progress record of all partitions of eventhubsSource1 are updated
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
// Update progress for unixTimestamp
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 0L, 0L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp, 10L, 10L)
// Update progress for unixTimestamp + 1000L
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp + 1000L,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp + 1000L, 20L, 20L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp + 1000L,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp + 1000L, 30L, 30L)
// Update progress for unixTimestamp + 2000L
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp + 2000L,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp + 2000L, 40L, 40L)
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp + 2000L,
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
progressWriter.write(unixTimestamp + 2000L, 50L, 50L)
val progressTracker1 = StructuredStreamingProgressTracker
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
unixTimestamp, List(eventhubsSource1)), unixTimestamp)
progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
unixTimestamp + 1000L, List(eventhubsSource1)), unixTimestamp + 1000L)
progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
unixTimestamp + 2000L, List(eventhubsSource1)), unixTimestamp + 2000L)
var eh1Progress = progressTracker1.read(eventhubsSource1.uid, unixTimestamp,
fallBack = false)
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances.head) === (0L, 0L))
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances(1)) === (10L, 10L))
eh1Progress = progressTracker1.read(eventhubsSource1.uid, unixTimestamp + 1000L,
fallBack = false)
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances.head) === (20L, 20L))
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances(1)) === (30L, 30L))
val progressFilePath = progressTracker1.pinPointProgressFile(fileSystem, unixTimestamp + 3000L)
assert(progressFilePath === None)
}
override def beforeEach(): Unit = {
super.beforeEach()
init()
}
override def afterEach(): Unit = {
reset()
}
protected def init(): Unit = {
progressRootPath = new Path(Files.createTempDirectory("progress_root").toString)
fileSystem = progressRootPath.getFileSystem(new Configuration())
unixTimestamp = Instant.now.getEpochSecond
}
protected def reset(): Unit = {
StructuredStreamingProgressTracker.reset()
progressTracker = null
}
private val appName = "StructuredStreamingApp"
private val eventhubsNamedPartitions = Map("ns1" -> Seq(EventHubNameAndPartition("eh1", 0),
EventHubNameAndPartition("eh1", 1)),
"ns2" -> Seq(EventHubNameAndPartition("eh2", 0), EventHubNameAndPartition("eh2", 1),
EventHubNameAndPartition("eh", 2)),
"ns3" -> Seq(EventHubNameAndPartition("eh3", 0), EventHubNameAndPartition("eh3", 1),
EventHubNameAndPartition("eh3", 2), EventHubNameAndPartition("eh3", 3),
EventHubNameAndPartition("eh2", 0), EventHubNameAndPartition("eh2", 1)))
private val eventhubsSource1: EventHubsConnector = new EventHubsConnector {
override def streamId = 0
override def uid = "ns1_eh1"
override def connectedInstances : List[EventHubNameAndPartition] =
eventhubsNamedPartitions("ns1").toList
}
private val eventhubsSource2: EventHubsConnector = new EventHubsConnector {
override def streamId = 0
override def uid = "ns2_eh2"
override def connectedInstances : List[EventHubNameAndPartition] =
eventhubsNamedPartitions("ns2").toList
}
private val eventhubsSource3: EventHubsConnector = new EventHubsConnector {
override def streamId = 0
override def uid = "ns3_eh3"
override def connectedInstances : List[EventHubNameAndPartition] =
eventhubsNamedPartitions("ns3").filter(x => x.eventHubName.equals("eh3")).toList
}
private val eventhubsSource4: EventHubsConnector = new EventHubsConnector {
override def streamId = 0
override def uid = "ns3_eh2"
override def connectedInstances : List[EventHubNameAndPartition] =
eventhubsNamedPartitions("ns3").filter(x => x.eventHubName.equals("eh2")).toList
}
private var fileSystem: FileSystem = _
private var progressRootPath: Path = _
private var progressTracker: ProgressTrackerBase[_ <: EventHubsConnector] = _
private var unixTimestamp: Long = _
}
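The deleted suite above exercises a write → commit → read cycle against the progress tracker; a condensed, illustrative sketch of that cycle (not part of this commit, and assuming the same test scaffolding — progressRootPath, appName, and a Hadoop Configuration) is:
// Illustrative sketch of the progress-tracking round trip exercised by the deleted suite.
val connector: EventHubsConnector = new EventHubsConnector {
  override def streamId = 0
  override def uid = "ns1_eh1"
  override def connectedInstances: List[EventHubNameAndPartition] =
    List(EventHubNameAndPartition("eh1", 0))
}
StructuredStreamingProgressTracker.registeredConnectors += connector.uid -> connector
val ts = Instant.now.getEpochSecond
// One temp progress record is written per connected partition.
val writer = new ProgressWriter(0, connector.uid, connector.connectedInstances.head, ts,
  new Configuration(), progressRootPath.toString, appName, connector.uid)
writer.write(ts, 42L, 42L) // (timestamp, offset, sequence number) -- placeholder values
// Committing folds the temp records for this batch into a progress file for `ts` ...
val tracker = StructuredStreamingProgressTracker
  .initInstance(connector.uid, progressRootPath.toString, appName, new Configuration())
tracker.commit(tracker.collectProgressRecordsForBatch(ts, List(connector)), ts)
// ... which read() then returns as an OffsetRecord keyed by EventHubNameAndPartition.
val recovered = tracker.read(connector.uid, ts, fallBack = false)
assert(recovered.offsets(connector.connectedInstances.head) === (42L, 42L))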


@ -17,16 +17,15 @@
package org.apache.spark.streaming.eventhubs
import scala.reflect.ClassTag
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.fs.{ Path, PathFilter }
import org.apache.spark.eventhubscommon.OffsetRecord
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.eventhubs.checkpoint.DirectDStreamProgressTracker
import org.apache.spark.util.ManualClock
import org.apache.spark.{ SparkConf, SparkContext }
import scala.reflect.ClassTag
/**
* A trait that can be mixed in to get methods for testing DStream operations under
@ -36,7 +35,8 @@ import org.apache.spark.util.ManualClock
trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase { self: SharedUtils =>
protected def createContextForCheckpointOperation(
batchDuration: Duration, checkpointDirectory: String): StreamingContext = {
batchDuration: Duration,
checkpointDirectory: String): StreamingContext = {
val conf = new SparkConf().setMaster("local[*]").setAppName(appName)
conf.set("spark.streaming.clock", classOf[ManualClock].getName)
val ssc = new StreamingContext(SparkContext.getOrCreate(conf), batchDuration)
@ -44,32 +44,41 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
ssc
}
protected def getTestOutputStream[V: ClassTag](streams: Array[DStream[_]]):
TestEventHubOutputStream[V] = {
protected def getTestOutputStream[V: ClassTag](
streams: Array[DStream[_]]): TestEventHubOutputStream[V] = {
streams.collect {
case ds: TestEventHubOutputStream[V @unchecked] => ds
}.head
}
private def validateTempFileCleanup(
numNonExistBatch: Int,
numBatches: Int,
expectedFileNum: Int): Unit = {
assert(fs.listStatus(new Path(progressRootPath.toString + s"/${appName}_temp"),
new PathFilter {
override def accept(path: Path): Boolean = {
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker].
fromPathToTimestamp(path) < 1000 * numNonExistBatch
}
}).length == 0)
private def validateTempFileCleanup(numNonExistBatch: Int,
numBatches: Int,
expectedFileNum: Int): Unit = {
assert(
fs.listStatus(
new Path(progressRootPath.toString + s"/${appName}_temp"),
new PathFilter {
override def accept(path: Path): Boolean = {
DirectDStreamProgressTracker.getInstance
.asInstanceOf[DirectDStreamProgressTracker]
.fromPathToTimestamp(path) < 1000 * numNonExistBatch
}
}
)
.length == 0)
// we do not consider APIs like take() here
assert(fs.listStatus(new Path(progressRootPath.toString + s"/${appName}_temp"),
new PathFilter {
override def accept(path: Path): Boolean = {
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker].
fromPathToTimestamp(path) == 1000 * numBatches
}
}).length == expectedFileNum)
assert(
fs.listStatus(
new Path(progressRootPath.toString + s"/${appName}_temp"),
new PathFilter {
override def accept(path: Path): Boolean = {
DirectDStreamProgressTracker.getInstance
.asInstanceOf[DirectDStreamProgressTracker]
.fromPathToTimestamp(path) == 1000 * numBatches
}
}
)
.length == expectedFileNum)
}
// NOTE: due to SPARK-19280 (https://issues.apache.org/jira/browse/SPARK-19280)
@ -96,41 +105,41 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
}
assert(fs.exists(new Path(progressRootPath.toString + s"/$appName/" +
s"progress-${numBatches * 1000}")))
*/
*/
}
protected def testCheckpointedOperation[U: ClassTag, V: ClassTag, W: ClassTag](
input1: Seq[Seq[U]],
input2: Seq[Seq[V]],
eventhubsParams1: Map[String, Map[String, String]],
eventhubsParams2: Map[String, Map[String, String]],
expectedStartingOffsetsAndSeqs1: Map[String, OffsetRecord],
expectedStartingOffsetsAndSeqs2: Map[String, OffsetRecord],
operation: (EventHubDirectDStream, EventHubDirectDStream) => DStream[W],
expectedOutputBeforeRestart: Seq[Seq[W]],
expectedOutputAfterRestart: Seq[Seq[W]]) {
input1: Seq[Seq[U]],
input2: Seq[Seq[V]],
eventhubsParams1: Map[String, Map[String, String]],
eventhubsParams2: Map[String, Map[String, String]],
expectedStartingOffsetsAndSeqs1: Map[String, OffsetRecord],
expectedStartingOffsetsAndSeqs2: Map[String, OffsetRecord],
operation: (EventHubDirectDStream, EventHubDirectDStream) => DStream[W],
expectedOutputBeforeRestart: Seq[Seq[W]],
expectedOutputAfterRestart: Seq[Seq[W]]) {
require(ssc.conf.get("spark.streaming.clock") === classOf[ManualClock].getName,
"Cannot run test without manual clock in the conf")
"Cannot run test without manual clock in the conf")
testBinaryOperation(
input1,
input2,
eventhubsParams1,
eventhubsParams2,
expectedStartingOffsetsAndSeqs1,
expectedStartingOffsetsAndSeqs2,
operation,
expectedOutputBeforeRestart)
testBinaryOperation(input1,
input2,
eventhubsParams1,
eventhubsParams2,
expectedStartingOffsetsAndSeqs1,
expectedStartingOffsetsAndSeqs2,
operation,
expectedOutputBeforeRestart)
validateProgressFileCleanup(expectedOutputBeforeRestart.length - 2,
expectedOutputBeforeRestart.length)
validateTempFileCleanup(expectedOutputBeforeRestart.length - 1,
expectedOutputBeforeRestart.length)
validateTempFileCleanup(
expectedOutputBeforeRestart.length - 1,
expectedOutputBeforeRestart.length,
expectedStartingOffsetsAndSeqs1.values.flatMap(_.offsets).size +
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size)
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size
)
val currentCheckpointDir = ssc.checkpointDir
// simulate the application going down
@ -146,8 +155,9 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
)
runStreamsWithEventHubInput(ssc,
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart, useSet = true)
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart,
useSet = true)
// test cleanup of progress files
validateProgressFileCleanup(
@ -157,7 +167,8 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 2,
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 1,
expectedStartingOffsetsAndSeqs1.values.flatMap(_.offsets).size +
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size)
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size
)
}
protected def runStopAndRecover[U: ClassTag, V: ClassTag](
@ -169,21 +180,19 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
expectedOutputBeforeRestart: Seq[Seq[V]],
useSetFlag: Boolean = false): Unit = {
testUnaryOperation(
input,
eventhubsParams,
expectedStartingOffsetsAndSeqs,
operation,
expectedOutputBeforeRestart,
useSet = useSetFlag)
testUnaryOperation(input,
eventhubsParams,
expectedStartingOffsetsAndSeqs,
operation,
expectedOutputBeforeRestart,
useSet = useSetFlag)
testProgressTracker(eventhubNamespace, expectedOffsetsAndSeqs, 4000L)
validateProgressFileCleanup(expectedOutputBeforeRestart.length - 2,
expectedOutputBeforeRestart.length)
validateTempFileCleanup(
expectedOutputBeforeRestart.length - 1,
expectedOutputBeforeRestart.length,
expectedOffsetsAndSeqs.offsets.size)
expectedOutputBeforeRestart.length)
validateTempFileCleanup(expectedOutputBeforeRestart.length - 1,
expectedOutputBeforeRestart.length,
expectedOffsetsAndSeqs.offsets.size)
val currentCheckpointDir = ssc.checkpointDir
// simulate the application going down
@ -204,10 +213,15 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
directoryToClean: Option[Path] = None) {
require(ssc.conf.get("spark.streaming.clock") === classOf[ManualClock].getName,
"Cannot run test without manual clock in the conf")
"Cannot run test without manual clock in the conf")
runStopAndRecover(input, eventhubsParams, expectedStartingOffsetsAndSeqs,
expectedOffsetsAndSeqs, operation, expectedOutputBeforeRestart, useSetFlag = useSetFlag)
runStopAndRecover(input,
eventhubsParams,
expectedStartingOffsetsAndSeqs,
expectedOffsetsAndSeqs,
operation,
expectedOutputBeforeRestart,
useSetFlag = useSetFlag)
if (directoryToClean.isDefined) {
fs.delete(directoryToClean.get, true)
@ -220,8 +234,10 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
"\n-------------------------------------------\n"
)
runStreamsWithEventHubInput(ssc, expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart, useSet = useSetFlag)
runStreamsWithEventHubInput(ssc,
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart,
useSet = useSetFlag)
validateProgressFileCleanup(
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 3,
@ -229,6 +245,7 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
validateTempFileCleanup(
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 2,
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 1,
expectedOffsetsAndSeqs.offsets.size)
expectedOffsetsAndSeqs.offsets.size
)
}
}
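The helpers above stop a context and restart it from its checkpoint under a manual clock; as a reminder of the driver-side pattern being simulated, here is a generic checkpoint-recovery sketch built only from standard Spark Streaming APIs (the checkpoint path, socket source, and batch interval are illustrative placeholders, not part of this commit):
// Generic Spark Streaming checkpoint-recovery pattern that the stop/recover tests above simulate.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Seconds, StreamingContext }
val checkpointDir = "/tmp/checkpoint-demo" // placeholder path
def createStreamingContext(): StreamingContext = {
  val conf = new SparkConf().setMaster("local[*]").setAppName("checkpoint-demo")
  val ssc = new StreamingContext(conf, Seconds(1))
  ssc.checkpoint(checkpointDir) // metadata checkpointing is what makes recovery possible
  // Placeholder source and output operation; a real job defines its DStream graph here.
  ssc.socketTextStream("localhost", 9999).count().print()
  ssc
}
// On (re)start, reuse the checkpointed graph if one exists; otherwise build a fresh context.
val ssc = StreamingContext.getOrCreate(checkpointDir, createStreamingContext _)
ssc.start()
ssc.awaitTermination()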


@ -20,10 +20,10 @@ package org.apache.spark.streaming.eventhubs
import org.mockito.Mockito
import org.scalatest.mock.MockitoSugar
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
import org.apache.spark.eventhubscommon.client.EventHubClient
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
import org.apache.spark.eventhubscommon.client.Client
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Seconds, Time}
import org.apache.spark.streaming.{ Duration, Seconds, Time }
class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar with SharedUtils {
@ -41,12 +41,15 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
)
test("skip the batch when EH endpoint is unavailable for starting seq number query") {
val ehDStream = new EventHubDirectDStream(ssc, eventhubNamespace, progressRootPath.toString,
Map("eh1" -> eventhubParameters))
val eventHubClientMock = mock[EventHubClient]
Mockito.when(eventHubClientMock.startSeqOfPartition(retryIfFail = false,
ehDStream.connectedInstances)).
thenReturn(None)
val ehDStream = new EventHubDirectDStream(ssc,
eventhubNamespace,
progressRootPath.toString,
Map("eh1" -> eventhubParameters))
val eventHubClientMock = mock[Client]
Mockito
.when(
eventHubClientMock.startSeqOfPartition(retryIfFail = false, ehDStream.connectedInstances))
.thenReturn(None)
ehDStream.setEventHubClient(eventHubClientMock)
ssc.scheduler.start()
intercept[IllegalArgumentException] {
@ -55,17 +58,21 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
}
test("skip the batch when EH endpoint is unavailable for highest offset query") {
val ehDStream = new EventHubDirectDStream(ssc, eventhubNamespace, progressRootPath.toString,
Map("eh1" -> eventhubParameters))
val eventHubClientMock = mock[EventHubClient]
val dummyStartSeqMap = (0 until 32).map(partitionId =>
(EventHubNameAndPartition("eh1", partitionId), 1L)).toMap
Mockito.when(eventHubClientMock.startSeqOfPartition(retryIfFail = false,
ehDStream.connectedInstances)).
thenReturn(Some(dummyStartSeqMap))
Mockito.when(eventHubClientMock.endPointOfPartition(retryIfFail = true,
ehDStream.connectedInstances)).
thenReturn(None)
val ehDStream = new EventHubDirectDStream(ssc,
eventhubNamespace,
progressRootPath.toString,
Map("eh1" -> eventhubParameters))
val eventHubClientMock = mock[Client]
val dummyStartSeqMap =
(0 until 32).map(partitionId => (EventHubNameAndPartition("eh1", partitionId), 1L)).toMap
Mockito
.when(
eventHubClientMock.startSeqOfPartition(retryIfFail = false, ehDStream.connectedInstances))
.thenReturn(Some(dummyStartSeqMap))
Mockito
.when(
eventHubClientMock.endPointOfPartition(retryIfFail = true, ehDStream.connectedInstances))
.thenReturn(None)
ehDStream.setEventHubClient(eventHubClientMock)
ssc.scheduler.start()
intercept[IllegalArgumentException] {
@ -81,112 +88,166 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
test("interaction among Listener/ProgressTracker/Spark Streaming (single stream)") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
val expectedOutput = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutput = Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
testUnaryOperation(
input,
eventhubsParams = Map[String, Map[String, String]](
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput)
testProgressTracker(eventhubNamespace,
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))), 4000L)
expectedOutput
)
testProgressTracker(
eventhubNamespace,
OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L
)
}
test("interaction among Listener/ProgressTracker/Spark Streaming (single stream +" +
" windowing function)") {
test(
"interaction among Listener/ProgressTracker/Spark Streaming (single stream +" +
" windowing function)") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
val expectedOutput = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
val expectedOutput = Seq(Seq(2, 3, 5, 6, 8, 9),
Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
testUnaryOperation(
input,
eventhubsParams = Map[String, Map[String, String]](
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.window(Seconds(2), Seconds(1)).map(
eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput)
testProgressTracker(eventhubNamespace,
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))), 4000L)
inputDStream
.window(Seconds(2), Seconds(1))
.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput
)
testProgressTracker(
eventhubNamespace,
OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L
)
}
test("interaction among Listener/ProgressTracker/Spark Streaming (multi-streams join)") {
import scala.collection.JavaConverters._
val input1 = Seq(
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
val input2 = Seq(
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
val expectedOutput = Seq(
Seq("a" -> 2, "b" -> 4, "c" -> 6, "g" -> 8, "h" -> 10, "i" -> 12, "m" -> 14, "n" -> 16,
"o" -> 18),
Seq("d" -> 8, "e" -> 10, "f" -> 12, "j" -> 14, "k" -> 16, "l" -> 18, "p" -> 2, "q" -> 4,
"r" -> 6))
val input1 = Seq(Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
val input2 = Seq(Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
val expectedOutput = Seq(Seq("a" -> 2,
"b" -> 4,
"c" -> 6,
"g" -> 8,
"h" -> 10,
"i" -> 12,
"m" -> 14,
"n" -> 16,
"o" -> 18),
Seq("d" -> 8,
"e" -> 10,
"f" -> 12,
"j" -> 14,
"k" -> 16,
"l" -> 18,
"p" -> 2,
"q" -> 4,
"r" -> 6))
testBinaryOperation(input1, input2,
testBinaryOperation(
input1,
input2,
eventhubsParams1 = Map[String, Map[String, String]](
"eh11" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "3",
"eventhubs.name" -> "eh11")
"eh11" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "3",
"eventhubs.name" -> "eh11",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
eventhubsParams2 = Map[String, Map[String, String]](
"eh21" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "3",
"eventhubs.name" -> "eh21")
"eventhubs.name" -> "eh21",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs1 = Map("namespace1" ->
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh11", 0) -> (2L, 2L),
EventHubNameAndPartition("eh11", 1) -> (2L, 2L),
EventHubNameAndPartition("eh11", 2) -> (2L, 2L))
)),
expectedOffsetsAndSeqs2 = Map("namespace2" ->
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh21", 0) -> (2L, 2L),
EventHubNameAndPartition("eh21", 1) -> (2L, 2L),
EventHubNameAndPartition("eh21", 2) -> (2L, 2L))
)),
expectedOffsetsAndSeqs1 = Map(
"namespace1" ->
OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh11", 0) -> (2L, 2L),
EventHubNameAndPartition("eh11", 1) -> (2L, 2L),
EventHubNameAndPartition("eh11", 2) -> (2L, 2L)))),
expectedOffsetsAndSeqs2 = Map(
"namespace2" ->
OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh21", 0) -> (2L, 2L),
EventHubNameAndPartition("eh21", 1) -> (2L, 2L),
EventHubNameAndPartition("eh21", 2) -> (2L, 2L)))),
// join and sum up the values
operation = (inputDStream1: EventHubDirectDStream, inputDStream2: EventHubDirectDStream) =>
inputDStream1.flatMap(eventData => eventData.getProperties.asScala).
join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala)).
map{case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int])},
expectedOutput)
testProgressTracker("namespace1",
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh11", 0) -> (5L, 5L),
EventHubNameAndPartition("eh11", 1) -> (5L, 5L),
EventHubNameAndPartition("eh11", 2) -> (5L, 5L))), 3000L)
testProgressTracker("namespace2",
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh21", 0) -> (5L, 5L),
EventHubNameAndPartition("eh21", 1) -> (5L, 5L),
EventHubNameAndPartition("eh21", 2) -> (5L, 5L))), 3000L)
inputDStream1
.flatMap(eventData => eventData.getProperties.asScala)
.join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala))
.map { case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int]) },
expectedOutput
)
testProgressTracker(
"namespace1",
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh11", 0) -> (5L, 5L),
EventHubNameAndPartition("eh11", 1) -> (5L, 5L),
EventHubNameAndPartition("eh11", 2) -> (5L, 5L))),
3000L
)
testProgressTracker(
"namespace2",
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh21", 0) -> (5L, 5L),
EventHubNameAndPartition("eh21", 1) -> (5L, 5L),
EventHubNameAndPartition("eh21", 2) -> (5L, 5L))),
3000L
)
}
test("update offset correctly when RDD operation only involves some of the partitions") {
@ -198,60 +259,81 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))
)),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput,
rddOperation = Some((rdd: RDD[Int], t: Time) => {
Array(rdd.take(1).toSeq)
}))
})
)
testProgressTracker(eventhubNamespace,
OffsetRecord(3000L, Map(
EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))),
4000L)
testProgressTracker(
eventhubNamespace,
OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))),
4000L
)
}
test("continue stream correctly when there is fluctuation") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
val expectedOutput = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(), Seq(), Seq(), Seq(6, 7, 9, 10, 3, 4))
val expectedOutput = Seq(Seq(2, 3, 5, 6, 8, 9),
Seq(4, 5, 7, 8, 10, 2),
Seq(),
Seq(),
Seq(),
Seq(6, 7, 9, 10, 3, 4))
testFluctuatedStream(
input,
eventhubsParams = Map[String, Map[String, String]](
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(5000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(5000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput,
messagesBeforeEmpty = 4,
numBatchesBeforeNewData = 5)
testProgressTracker(eventhubNamespace,
OffsetRecord(6000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
7000L)
numBatchesBeforeNewData = 5
)
testProgressTracker(
eventhubNamespace,
OffsetRecord(6000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
7000L
)
}
test("filter messages for enqueueTime correctly") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
val expectedOutput = Seq(
Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
val expectedOutput = Seq(Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
testUnaryOperation(
input,
eventhubsParams = Map[String, Map[String, String]](
@ -259,27 +341,35 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1",
"eventhubs.filter.enqueuetime" -> "3000"
"eventhubs.filter.enqueuetime" -> "3000",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))
),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput)
testProgressTracker(eventhubNamespace,
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))), 4000L)
expectedOutput
)
testProgressTracker(
eventhubNamespace,
OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L
)
}
test("pass-in enqueuetime is not allowed to be later than the highest enqueuetime") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
val expectedOutput = Seq(
Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
val expectedOutput = Seq(Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
intercept[IllegalArgumentException] {
testUnaryOperation(
input,
@ -291,15 +381,17 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
"eventhubs.filter.enqueuetime" -> "10000"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))
),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").
asInstanceOf[Int] + 1),
expectedOutput)
inputDStream.map(eventData =>
eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput
)
}
}
}
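Beyond reformatting, the hunks above add the connection settings to every per-event-hub parameter map; a minimal illustrative map with placeholder values, wired up the same way the suite does, looks like:
// Minimal per-event-hub parameter map mirroring the tests above; all values are placeholders.
val eventhubParameters: Map[String, String] = Map(
  "eventhubs.namespace" -> "namespace", // Event Hubs namespace
  "eventhubs.name" -> "eh1", // event hub (entity) name
  "eventhubs.policyname" -> "policyname", // SAS policy name
  "eventhubs.policykey" -> "policykey", // SAS policy key
  "eventhubs.partition.count" -> "3", // number of partitions
  "eventhubs.maxRate" -> "2" // max events read per partition per batch
)
// The map is keyed by event hub name and handed to the direct DStream, as in the suite above:
val ehDStream = new EventHubDirectDStream(ssc, eventhubNamespace, progressRootPath.toString,
  Map("eh1" -> eventhubParameters))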


@ -17,32 +17,33 @@
package org.apache.spark.streaming.eventhubs
import java.io.{IOException, ObjectInputStream}
import java.io.{ IOException, ObjectInputStream }
import java.util.concurrent.ConcurrentLinkedQueue
import scala.reflect.ClassTag
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.utils._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.streaming.dstream.{ DStream, ForEachDStream }
import org.apache.spark.streaming.eventhubs.checkpoint.DirectDStreamProgressTracker
import org.apache.spark.util.{ManualClock, Utils}
import org.apache.spark.util.{ ManualClock, Utils }
private[eventhubs] class TestEventHubOutputStream[T: ClassTag](
parent: DStream[T],
val output: ConcurrentLinkedQueue[Seq[Seq[T]]] = new ConcurrentLinkedQueue[Seq[Seq[T]]](),
rddOperation: Option[(RDD[T], Time) => Array[Seq[T]]])
extends ForEachDStream[T](parent, {
(rdd: RDD[T], t: Time) =>
val rddOpToApply = rddOperation.getOrElse(
(rdd: RDD[T], t: Time) => rdd.glom().collect().map(_.toSeq))
val resultsInABatch = rddOpToApply(rdd, t)
output.add(resultsInABatch)
}, false) {
extends ForEachDStream[T](
parent, { (rdd: RDD[T], t: Time) =>
val rddOpToApply =
rddOperation.getOrElse((rdd: RDD[T], t: Time) => rdd.glom().collect().map(_.toSeq))
val resultsInABatch = rddOpToApply(rdd, t)
output.add(resultsInABatch)
},
false
) {
// This is to clear the output buffer every time it is read from a checkpoint
@throws(classOf[IOException])
@ -75,8 +76,8 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
val inputStream1 = setupEventHubInputStream(namespace1, simulatedEventHubs1, eventhubsParams1)
val inputStream2 = setupEventHubInputStream(namespace2, simulatedEventHubs2, eventhubsParams2)
val operatedStream = operation(inputStream1, inputStream2)
val outputStream = new TestEventHubOutputStream(operatedStream,
new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
val outputStream =
new TestEventHubOutputStream(operatedStream, new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
outputStream.register()
ssc
}
@ -88,11 +89,12 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
rddOperation: Option[(RDD[V], Time) => Array[Seq[V]]]): StreamingContext = {
// Setup the stream computation
val inputStream = setupEventHubInputStream(eventhubNamespace, simulatedEventHubs,
eventhubsParams)
val inputStream =
setupEventHubInputStream(eventhubNamespace, simulatedEventHubs, eventhubsParams)
val operatedStream = operation(inputStream)
val outputStream = new TestEventHubOutputStream(operatedStream,
new ConcurrentLinkedQueue[Seq[Seq[V]]], rddOperation)
new ConcurrentLinkedQueue[Seq[Seq[V]]],
rddOperation)
outputStream.register()
ssc
}
@ -107,12 +109,19 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
namespace,
progressRootPath.toString,
eventhubsParams,
(eventHubParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubParams, simulatedEventHubs, partitionId,
startOffset, eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) => FragileEventHubClient.getInstance("",
Map()))
(eventHubParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubParams,
simulatedEventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
FragileEventHubClient.getInstance("", Map())
)
}
private def setupFragileEventHubStream[V: ClassTag](
@ -120,25 +129,23 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
eventhubsParams: Map[String, Map[String, String]],
operation: EventHubDirectDStream => DStream[V]): StreamingContext = {
val inputStream = setupFragileInputStream(eventhubNamespace, simulatedEventHubs,
eventhubsParams)
val inputStream =
setupFragileInputStream(eventhubNamespace, simulatedEventHubs, eventhubsParams)
val operatedStream = operation(inputStream)
val outputStream = new TestEventHubOutputStream(operatedStream,
new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
val outputStream =
new TestEventHubOutputStream(operatedStream, new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
outputStream.register()
ssc
}
def testFragileStream[U: ClassTag, V: ClassTag](
input: Seq[Seq[U]],
eventhubsParams: Map[String, Map[String, String]],
expectedOffsetsAndSeqs: Map[String, OffsetRecord],
operation: EventHubDirectDStream => DStream[V],
expectedOutput: Seq[Seq[V]]) {
def testFragileStream[U: ClassTag, V: ClassTag](input: Seq[Seq[U]],
eventhubsParams: Map[String, Map[String, String]],
expectedOffsetsAndSeqs: Map[String, OffsetRecord],
operation: EventHubDirectDStream => DStream[V],
expectedOutput: Seq[Seq[V]]) {
val numBatches_ = expectedOutput.size
val simulatedEventHubs = createSimulatedEventHub(eventhubNamespace, input, eventhubsParams)
withStreamingContext(
setupFragileEventHubStream(simulatedEventHubs, eventhubsParams, operation)) {
withStreamingContext(setupFragileEventHubStream(simulatedEventHubs, eventhubsParams, operation)) {
ssc =>
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = false)
}
@ -150,23 +157,32 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
simulatedEventHubs: SimulatedEventHubs,
eventhubsParams: Map[String, Map[String, String]]): EventHubDirectDStream = {
val maxOffsetForEachEventHub = EventHubsTestUtilities.getHighestOffsetPerPartition(
simulatedEventHubs)
val maxOffsetForEachEventHub =
EventHubsTestUtilities.getHighestOffsetPerPartition(simulatedEventHubs)
new EventHubDirectDStream(ssc, namespace,
progressRootPath.toString, eventhubsParams,
(eventHubParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubParams, simulatedEventHubs, partitionId, startOffset,
eventHubsOffsetType),
new EventHubDirectDStream(
ssc,
namespace,
progressRootPath.toString,
eventhubsParams,
(eventHubParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubParams,
simulatedEventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new TestRestEventHubClient(maxOffsetForEachEventHub))
new TestRestEventHubClient(maxOffsetForEachEventHub)
)
}
def runEventHubStreams[V: ClassTag](
ssc: StreamingContext,
numBatches: Int,
numExpectedOutput: Int): Seq[Seq[V]] = {
def runEventHubStreams[V: ClassTag](ssc: StreamingContext,
numBatches: Int,
numExpectedOutput: Int): Seq[Seq[V]] = {
// Flatten each RDD into a single Seq
runEventHubStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq)
}
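As a quick illustration of the flattening runEventHubStreams performs above (the values here are made up, not taken from the suite): each batch's per-partition output is merged into one sequence per batch.

// Hypothetical output: 2 batches, each with 2 partitions of Int results.
val perPartition: Seq[Seq[Seq[Int]]] =
  Seq(Seq(Seq(1, 2), Seq(3)), Seq(Seq(4), Seq(5, 6)))
// Same flatten as in runEventHubStreams: partitions are merged within each batch.
val perBatch: Seq[Seq[Int]] = perPartition.map(_.flatten.toSeq)
// perBatch == Seq(Seq(1, 2, 3), Seq(4, 5, 6))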
@ -182,10 +198,9 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
* This function is copied from the Spark code base and modified by changing the TestOutputStream
* implementation
*/
def runEventHubStreamsWithPartitions[V: ClassTag](
ssc: StreamingContext,
numBatches: Int,
numExpectedOutput: Int): Seq[Seq[Seq[V]]] = {
def runEventHubStreamsWithPartitions[V: ClassTag](ssc: StreamingContext,
numBatches: Int,
numExpectedOutput: Int): Seq[Seq[Seq[V]]] = {
import scala.collection.JavaConverters._
@ -194,9 +209,10 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput)
// Get the output buffer
val outputStream = ssc.graph.getOutputStreams.
filter(_.isInstanceOf[TestEventHubOutputStream[_]]).
head.asInstanceOf[TestEventHubOutputStream[V]]
val outputStream = ssc.graph.getOutputStreams
.filter(_.isInstanceOf[TestEventHubOutputStream[_]])
.head
.asInstanceOf[TestEventHubOutputStream[V]]
val output = outputStream.output
try {
@ -220,7 +236,7 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
// Wait until expected number of output items have been generated
val startTime = System.currentTimeMillis()
while (output.size < numExpectedOutput &&
System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput)
ssc.awaitTerminationOrTimeout(50)
}
@ -241,44 +257,46 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
namespace: String,
input: Seq[Seq[U]],
eventhubsParams: Map[String, Map[String, String]]): SimulatedEventHubs = {
val ehAndRawInputMap = eventhubsParams.keys.flatMap {
eventHubName =>
val ehList = {
for (i <- 0 until eventhubsParams(eventHubName)("eventhubs.partition.count").toInt)
yield EventHubNameAndPartition(eventHubName, i)
}.toArray
ehList.zip(input)
val ehAndRawInputMap = eventhubsParams.keys.flatMap { eventHubName =>
val ehList = {
for (i <- 0 until eventhubsParams(eventHubName)("eventhubs.partition.count").toInt)
yield EventHubNameAndPartition(eventHubName, i)
}.toArray
ehList.zip(input)
}.toMap
new SimulatedEventHubs(namespace,
new SimulatedEventHubs(
namespace,
ehAndRawInputMap.map {
case (eventHubNameAndPartition, propertyQueue) =>
(eventHubNameAndPartition,
EventHubsTestUtilities.generateEventData(
propertyQueue.map(property => ('e', Seq(property))),
eventHubNameAndPartition.partitionId,
0))
})
EventHubsTestUtilities.generateEventData(
propertyQueue.map(property => ('e', Seq(property))),
eventHubNameAndPartition.partitionId,
0))
}
)
}
protected def verifyOffsetsAndSeqs(
ssc: StreamingContext,
namespace: String,
expectedOffsetsAndSeqs: Map[String, OffsetRecord]): Unit = {
val producedOffsetsAndSeqs = ssc.graph.getInputStreams().filter(
_.isInstanceOf[EventHubDirectDStream]).map(_.asInstanceOf[EventHubDirectDStream]).
filter(_.eventHubNameSpace == namespace).
map(eventHubStream => (eventHubStream.eventHubNameSpace,
eventHubStream.currentOffsetsAndSeqNums)).toMap
protected def verifyOffsetsAndSeqs(ssc: StreamingContext,
namespace: String,
expectedOffsetsAndSeqs: Map[String, OffsetRecord]): Unit = {
val producedOffsetsAndSeqs = ssc.graph
.getInputStreams()
.filter(_.isInstanceOf[EventHubDirectDStream])
.map(_.asInstanceOf[EventHubDirectDStream])
.filter(_.eventHubNameSpace == namespace)
.map(eventHubStream =>
(eventHubStream.eventHubNameSpace, eventHubStream.currentOffsetsAndSeqNums))
.toMap
assert(expectedOffsetsAndSeqs === producedOffsetsAndSeqs)
}
def testProgressTracker(
namespace: String,
expectedOffsetsAndSeqs: OffsetRecord,
timestamp: Long): Unit = {
val producedOffsetsAndSeqs = DirectDStreamProgressTracker.getInstance.
asInstanceOf[DirectDStreamProgressTracker].read(namespace,
timestamp - batchDuration.milliseconds, fallBack = true)
def testProgressTracker(namespace: String,
expectedOffsetsAndSeqs: OffsetRecord,
timestamp: Long): Unit = {
val producedOffsetsAndSeqs = DirectDStreamProgressTracker.getInstance
.asInstanceOf[DirectDStreamProgressTracker]
.read(namespace, timestamp - batchDuration.milliseconds, fallBack = true)
assert(producedOffsetsAndSeqs === expectedOffsetsAndSeqs)
}
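A small worked example of the timestamp arithmetic in testProgressTracker above, assuming batchDuration = Seconds(1) as these suites use (the variable names are illustrative):

val batchDurationMs = 1000L      // Seconds(1), the batch duration overridden in the suite below
val queryTimestamp = 4000L       // the timestamp argument handed to testProgressTracker
val committedBatchTime = queryTimestamp - batchDurationMs
// committedBatchTime == 3000L: the tracker is asked for the record written for batch 3000,
// which is why the tests pair OffsetRecord(3000L, ...) with a timestamp of 4000L.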
@ -298,19 +316,24 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
val simulatedEventHubs1 = createSimulatedEventHub("namespace1", input1, eventhubsParams1)
val simulatedEventHubs2 = createSimulatedEventHub("namespace2", input2, eventhubsParams2)
withStreamingContext(setupMultiEventHubStreams(simulatedEventHubs1, simulatedEventHubs2,
eventhubsParams1, eventhubsParams2, "namespace1", "namespace2", operation)) { ssc =>
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = true)
withStreamingContext(
setupMultiEventHubStreams(simulatedEventHubs1,
simulatedEventHubs2,
eventhubsParams1,
eventhubsParams2,
"namespace1",
"namespace2",
operation)) { ssc =>
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = true)
}
verifyOffsetsAndSeqs(ssc, "namespace1", expectedOffsetsAndSeqs1)
verifyOffsetsAndSeqs(ssc, "namespace2", expectedOffsetsAndSeqs2)
}
protected def runStreamsWithEventHubInput[V: ClassTag](
ssc: StreamingContext,
numBatches: Int,
expectedOutput: Seq[Seq[V]],
useSet: Boolean): Unit = {
protected def runStreamsWithEventHubInput[V: ClassTag](ssc: StreamingContext,
numBatches: Int,
expectedOutput: Seq[Seq[V]],
useSet: Boolean): Unit = {
val output = runEventHubStreams[V](ssc, numBatches, expectedOutput.size)
verifyOutput[V](output, expectedOutput, useSet)
}
@ -323,20 +346,30 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
eventhubsParams: Map[String, Map[String, String]]): EventHubDirectDStream = {
val maxOffsetForEachEventHub = simulatedEventHubs.messageStore.map {
case (ehNameAndPartition, messageQueue) => (ehNameAndPartition,
(messageQueue.length.toLong - 1, messageQueue.length.toLong - 1))
case (ehNameAndPartition, messageQueue) =>
(ehNameAndPartition, (messageQueue.length.toLong - 1, messageQueue.length.toLong - 1))
}
new EventHubDirectDStream(ssc,
new EventHubDirectDStream(
ssc,
namespace,
progressRootPath.toString,
eventhubsParams,
(eventHubParams: Map[String, String], partitionId: Int, startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
new TestEventHubsReceiver(eventHubParams, simulatedEventHubs, partitionId, startOffset,
eventHubsOffsetType),
(eventHubParams: Map[String, String],
partitionId: Int,
startOffset: Long,
eventHubsOffsetType: EventHubsOffsetType,
_: Int) =>
new TestEventHubsReceiver(eventHubParams,
simulatedEventHubs,
partitionId,
startOffset,
eventHubsOffsetType),
(_: String, _: Map[String, Map[String, String]]) =>
new FluctuatedEventHubClient(ssc, messagesBeforeEmpty, numBatchesBeforeNewData,
maxOffsetForEachEventHub))
new FluctuatedEventHubClient(ssc,
messagesBeforeEmpty,
numBatchesBeforeNewData,
maxOffsetForEachEventHub)
)
}
private def setupFluctuatedEventHubStream[V: ClassTag](
@ -346,11 +379,14 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
messagesBeforeEmpty: Long,
numBatchesBeforeNewData: Int): StreamingContext = {
val inputStream = setupFluctuatedInputStream(eventhubNamespace, simulatedEventHubs,
messagesBeforeEmpty, numBatchesBeforeNewData, eventhubsParams)
val inputStream = setupFluctuatedInputStream(eventhubNamespace,
simulatedEventHubs,
messagesBeforeEmpty,
numBatchesBeforeNewData,
eventhubsParams)
val operatedStream = operation(inputStream)
val outputStream = new TestEventHubOutputStream(operatedStream,
new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
val outputStream =
new TestEventHubOutputStream(operatedStream, new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
outputStream.register()
ssc
}
@ -368,10 +404,12 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
val simulatedEventHubs = createSimulatedEventHub(eventhubNamespace, input, eventhubsParams)
withStreamingContext(
setupFluctuatedEventHubStream(simulatedEventHubs, eventhubsParams, operation,
messagesBeforeEmpty, numBatchesBeforeNewData)) {
ssc =>
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = false)
setupFluctuatedEventHubStream(simulatedEventHubs,
eventhubsParams,
operation,
messagesBeforeEmpty,
numBatchesBeforeNewData)) { ssc =>
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = false)
}
verifyOffsetsAndSeqs(ssc, eventhubNamespace, expectedOffsetsAndSeqs)
}

View file

@ -1,171 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.eventhubs
import scala.collection.mutable.ArrayBuffer
import com.microsoft.azure.eventhubs._
import com.microsoft.azure.eventhubs.EventData.SystemProperties
import com.microsoft.azure.eventhubs.amqp.AmqpConstants
import org.mockito.Mockito._
import org.mockito.internal.util.reflection.Whitebox
import org.scalatest.mock.MockitoSugar
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
import org.apache.spark.streaming.receiver.ReceiverSupervisor
/**
* Suite of EventHubs streaming receiver tests
* This suite contains low-level unit tests that call EventHubsReceiver directly with mocks
*/
class EventHubsReceiverSuite extends TestSuiteBase with MockitoSugar {
var eventhubsClientWrapperMock: EventHubsClientWrapper = _
var offsetStoreMock: OffsetStore = _
var executorMock: ReceiverSupervisor = _
val eventhubParameters = Map(
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey",
"eventhubs.namespace" -> "namespace",
"eventhubs.name" -> "name",
"eventhubs.partition.count" -> "4",
"eventhubs.checkpoint.dir" -> "checkpointdir",
"eventhubs.checkpoint.interval" -> "1000"
)
override def beforeFunction(): Unit = {
eventhubsClientWrapperMock = mock[EventHubsClientWrapper]
offsetStoreMock = mock[OffsetStore]
executorMock = mock[ReceiverSupervisor]
}
override def afterFunction(): Unit = {
super.afterFunction()
// Since this suite was originally written using EasyMock, add this to preserve the old
// mocking semantics (see SPARK-5735 for more details)
// verifyNoMoreInteractions(ehClientWrapperMock, offsetStoreMock)
}
test("EventHubsUtils API works") {
val streamingContext = new StreamingContext(master, framework, batchDuration)
EventHubsUtils.createStream(streamingContext, eventhubParameters, "0", StorageLevel.MEMORY_ONLY)
EventHubsUtils.createUnionStream(streamingContext, eventhubParameters,
StorageLevel.MEMORY_ONLY_2)
streamingContext.stop()
}
test("EventHubsReceiver can receive message with proper checkpointing") {
val eventhubPartitionId = "0"
val eventCheckpointIntervalInSeconds: Int = 1
val eventOffset: String = "2147483647"
val eventSequenceNumber: Long = 1
val maximumEventRate: Int = 999
var updatedEventhubsParams = eventhubParameters
updatedEventhubsParams += "eventhubs.checkpoint.interval" ->
eventCheckpointIntervalInSeconds.toString
var eventData = new EventData(Array.fill(8)((scala.util.Random.nextInt(256) - 128).toByte))
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, eventOffset)
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME,
Long.box(eventSequenceNumber))
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, eventhubPartitionId)
val systemProperties = new SystemProperties(systemPropertiesMap)
Whitebox.setInternalState(eventData, "systemProperties", systemProperties)
val eventDataCollection = new ArrayBuffer[EventData]
eventDataCollection += eventData
when(offsetStoreMock.read()).thenReturn("-1")
when(eventhubsClientWrapperMock.receive()).thenReturn(eventDataCollection)
val receiver = new EventHubsReceiver(updatedEventhubsParams, eventhubPartitionId,
StorageLevel.MEMORY_ONLY, Option(offsetStoreMock), eventhubsClientWrapperMock,
maximumEventRate)
receiver.attachSupervisor(executorMock)
receiver.onStart()
Thread sleep eventCheckpointIntervalInSeconds * 1000
receiver.onStop()
Thread sleep eventCheckpointIntervalInSeconds * 1000
verify(offsetStoreMock, times(1)).open()
verify(offsetStoreMock, times(1)).write(eventOffset)
verify(eventhubsClientWrapperMock, times(1)).createReceiver(updatedEventhubsParams,
eventhubPartitionId, offsetStoreMock, maximumEventRate)
verify(eventhubsClientWrapperMock, atLeastOnce).receive()
}
ignore("EventHubsReceiver can restart when exception is thrown") {
val eventhubPartitionId = "0"
val eventOffset = "2147483647"
val eventSequenceNumber = 1L
val maximumEventRate = 999
val eventData = new EventData(Array.fill(8)((scala.util.Random.nextInt(256) - 128).toByte))
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, eventOffset)
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME,
Long.box(eventSequenceNumber))
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, eventhubPartitionId)
val systemProperties = new SystemProperties(systemPropertiesMap)
Whitebox.setInternalState(eventData, "systemProperties", systemProperties)
val eventDataCollection: ArrayBuffer[EventData] = new ArrayBuffer[EventData]()
eventDataCollection += eventData
val eventhubException = new RuntimeException("error")
when(offsetStoreMock.read()).thenReturn("-1")
when(eventhubsClientWrapperMock.receive()).thenReturn(eventDataCollection).
thenThrow(eventhubException)
val receiver = new EventHubsReceiver(eventhubParameters, eventhubPartitionId,
StorageLevel.MEMORY_ONLY, Option(offsetStoreMock), eventhubsClientWrapperMock,
maximumEventRate)
receiver.attachSupervisor(executorMock)
receiver.onStart()
Thread sleep 1000
receiver.onStop()
verify(executorMock, times(1)).restartReceiver(s"Error handling message," +
s" restarting receiver for partition $eventhubPartitionId", Some(eventhubException))
verify(offsetStoreMock, times(1)).open()
verify(offsetStoreMock, times(1)).close()
verify(eventhubsClientWrapperMock, times(1)).createReceiver(eventhubParameters, "0",
offsetStoreMock, maximumEventRate)
verify(eventhubsClientWrapperMock, times(2)).receive()
verify(eventhubsClientWrapperMock, times(1)).close()
}
}

View file

@ -20,18 +20,20 @@ package org.apache.spark.streaming.eventhubs
import java.nio.file.Files
import scala.collection.JavaConverters._
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
import org.apache.spark.eventhubscommon.utils.FragileEventHubClient
import org.apache.spark.streaming._
import org.apache.spark.streaming.eventhubs.checkpoint.{DirectDStreamProgressTracker, ProgressTrackingListener}
import org.apache.spark.streaming.eventhubs.checkpoint.{
DirectDStreamProgressTracker,
ProgressTrackingListener
}
import org.apache.spark.util.ManualClock
class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTestSuiteBase
with SharedUtils {
class ProgressTrackingAndCheckpointSuite
extends CheckpointAndProgressTrackerTestSuiteBase
with SharedUtils {
override def init(): Unit = {
progressRootPath = new Path(Files.createTempDirectory("progress_root").toString)
@ -39,59 +41,67 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
ssc = createContextForCheckpointOperation(batchDuration, checkpointDirectory)
progressListener = ProgressTrackingListener.initInstance(ssc, progressRootPath.toString)
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString,
appName, new Configuration())
appName,
new Configuration())
}
override def batchDuration: Duration = Seconds(1)
test("currentOffset, ProgressTracker and EventHubClient are setup correctly when" +
" EventHubDirectDStream is recovered") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6),
Seq(4, 5, 6, 7, 8, 9),
Seq(7, 8, 9, 1, 2, 3))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
test(
"currentOffset, ProgressTracker and EventHubClient are setup correctly when" +
" EventHubDirectDStream is recovered") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
runStopAndRecover(
input,
eventhubsParams = Map[String, Map[String, String]](
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedOffsetsAndSeqs =
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
expectedStartingOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart)
val eventHubDirectDStream = ssc.graph.getInputStreams().filter(
_.isInstanceOf[EventHubDirectDStream]).head.asInstanceOf[EventHubDirectDStream]
assert(eventHubDirectDStream.currentOffsetsAndSeqNums ===
OffsetRecord(2000L, Map(
EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
expectedOutputBeforeRestart
)
val eventHubDirectDStream = ssc.graph
.getInputStreams()
.filter(_.isInstanceOf[EventHubDirectDStream])
.head
.asInstanceOf[EventHubDirectDStream]
assert(
eventHubDirectDStream.currentOffsetsAndSeqNums ===
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
assert(DirectDStreamProgressTracker.getInstance != null)
assert(eventHubDirectDStream.eventHubClient != null)
}
test("test integration of spark checkpoint and progress tracking (single stream)") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart =
Seq(Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
testCheckpointedOperation(
input,
@ -99,41 +109,61 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedStartingOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart,
expectedOutputAfterRestart)
expectedOutputAfterRestart
)
}
test("test integration of spark checkpoint and progress tracking (reduceByKeyAndWindow)") {
val input = Seq(
Seq("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
Seq("4", "5", "6", "7", "8", "9", "10", "1", "2", "3"),
Seq("7", "8", "9", "1", "2", "3", "4", "5", "6", "7"))
val input = Seq(Seq("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
Seq("4", "5", "6", "7", "8", "9", "10", "1", "2", "3"),
Seq("7", "8", "9", "1", "2", "3", "4", "5", "6", "7"))
val expectedOutputBeforeRestart = Seq(
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1),
Seq("1" -> 2, "2" -> 1, "4" -> 2, "5" -> 1, "7" -> 2, "8" -> 1, "3" -> 1, "6" -> 1,
"9" -> 1),
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2,
"9" -> 2))
Seq("1" -> 2, "2" -> 1, "4" -> 2, "5" -> 1, "7" -> 2, "8" -> 1, "3" -> 1, "6" -> 1, "9" -> 1),
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2, "9" -> 2)
)
val expectedOutputAfterRestart = Seq(
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2,
"9" -> 2),
Seq("5" -> 2, "6" -> 1, "9" -> 1, "2" -> 1, "3" -> 1, "7" -> 1, "8" -> 2,
"10" -> 1, "1" -> 1, "4" -> 1),
Seq("7" -> 2, "8" -> 1, "10" -> 2, "1" -> 1, "4" -> 1, "5" -> 1, "9" -> 1,
"2" -> 1, "3" -> 1, "6" -> 1))
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2, "9" -> 2),
Seq("5" -> 2,
"6" -> 1,
"9" -> 1,
"2" -> 1,
"3" -> 1,
"7" -> 1,
"8" -> 2,
"10" -> 1,
"1" -> 1,
"4" -> 1),
Seq("7" -> 2,
"8" -> 1,
"10" -> 2,
"1" -> 1,
"4" -> 1,
"5" -> 1,
"9" -> 1,
"2" -> 1,
"3" -> 1,
"6" -> 1)
)
testCheckpointedOperation(
input,
@ -141,39 +171,46 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedStartingOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.flatMap(eventData => eventData.getProperties.asScala.
map{case (_, value) => (value, 1)}).
reduceByKeyAndWindow(_ + _, _ - _, Seconds(2), Seconds(1)),
inputDStream
.flatMap(eventData =>
eventData.getProperties.asScala.map { case (_, value) => (value, 1) })
.reduceByKeyAndWindow(_ + _, _ - _, Seconds(2), Seconds(1)),
expectedOutputBeforeRestart,
expectedOutputAfterRestart,
useSetFlag = true)
useSetFlag = true
)
}
test("test integration of spark checkpoint and progress tracking (single stream +" +
" windowing function)") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4),
Seq(6, 7, 9, 10, 3, 4, 8, 9, 11, 2, 5, 6),
Seq(8, 9, 11, 2, 5, 6, 10, 11, 3, 4, 7, 8), Seq(10, 11, 3, 4, 7, 8))
test(
"test integration of spark checkpoint and progress tracking (single stream +" +
" windowing function)") {
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(Seq(2, 3, 5, 6, 8, 9),
Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4),
Seq(6, 7, 9, 10, 3, 4, 8, 9, 11, 2, 5, 6),
Seq(8, 9, 11, 2, 5, 6, 10, 11, 3, 4, 7, 8),
Seq(10, 11, 3, 4, 7, 8))
testCheckpointedOperation(
input,
@ -181,43 +218,81 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedStartingOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.window(Seconds(2), Seconds(1)).map(
eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
inputDStream
.window(Seconds(2), Seconds(1))
.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart,
expectedOutputAfterRestart)
expectedOutputAfterRestart
)
}
test("test integration of spark checkpoint and progress tracking (multi-streams join)") {
val input1 = Seq(
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6, "g" -> 4, "h" -> 5, "i" -> 6),
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9, "m" -> 7, "n" -> 8, "o" -> 9),
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3))
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3)
)
val input2 = Seq(
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6, "g" -> 4, "h" -> 5, "i" -> 6),
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9, "m" -> 7, "n" -> 8, "o" -> 9),
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3))
val expectedOutputBeforeRestart = Seq(
Seq("a" -> 2, "b" -> 4, "c" -> 6, "g" -> 8, "h" -> 10, "i" -> 12, "m" -> 14, "n" -> 16,
"o" -> 18),
Seq("d" -> 8, "e" -> 10, "f" -> 12, "j" -> 14, "k" -> 16, "l" -> 18, "p" -> 2, "q" -> 4,
"r" -> 6))
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3)
)
val expectedOutputBeforeRestart = Seq(Seq("a" -> 2,
"b" -> 4,
"c" -> 6,
"g" -> 8,
"h" -> 10,
"i" -> 12,
"m" -> 14,
"n" -> 16,
"o" -> 18),
Seq("d" -> 8,
"e" -> 10,
"f" -> 12,
"j" -> 14,
"k" -> 16,
"l" -> 18,
"p" -> 2,
"q" -> 4,
"r" -> 6))
val expectedOutputAfterRestart = Seq(
Seq("d" -> 8, "e" -> 10, "f" -> 12, "j" -> 14, "k" -> 16, "l" -> 18, "p" -> 2, "q" -> 4,
"r" -> 6),
Seq("g" -> 8, "h" -> 10, "i" -> 12, "m" -> 14, "n" -> 16, "o" -> 18,
"a" -> 2, "b" -> 4, "c" -> 6), Seq())
Seq("d" -> 8,
"e" -> 10,
"f" -> 12,
"j" -> 14,
"k" -> 16,
"l" -> 18,
"p" -> 2,
"q" -> 4,
"r" -> 6),
Seq("g" -> 8,
"h" -> 10,
"i" -> 12,
"m" -> 14,
"n" -> 16,
"o" -> 18,
"a" -> 2,
"b" -> 4,
"c" -> 6),
Seq()
)
testCheckpointedOperation(
input1,
@ -226,41 +301,52 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "3",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
eventhubsParams2 = Map[String, Map[String, String]](
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "3",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs1 = Map("namespace1" ->
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
EventHubNameAndPartition("eh1", 2) -> (2L, 2L))
)),
expectedStartingOffsetsAndSeqs2 = Map("namespace2" ->
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
EventHubNameAndPartition("eh1", 2) -> (2L, 2L))
)),
expectedStartingOffsetsAndSeqs1 = Map(
"namespace1" ->
OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
EventHubNameAndPartition("eh1", 2) -> (2L, 2L)))),
expectedStartingOffsetsAndSeqs2 = Map(
"namespace2" ->
OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
EventHubNameAndPartition("eh1", 2) -> (2L, 2L)))),
operation = (inputDStream1: EventHubDirectDStream, inputDStream2: EventHubDirectDStream) =>
inputDStream1.flatMap(eventData => eventData.getProperties.asScala).
join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala)).
map{case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int])},
inputDStream1
.flatMap(eventData => eventData.getProperties.asScala)
.join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala))
.map { case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int]) },
expectedOutputBeforeRestart,
expectedOutputAfterRestart)
expectedOutputAfterRestart
)
}
test("recover from a progress directory where has no metadata record") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart =
Seq(Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
testCheckpointedOperation(
input,
@ -268,34 +354,37 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedStartingOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart,
expectedOutputAfterRestart,
directoryToClean = Some(progressTracker.metadataDirectoryPath))
directoryToClean = Some(progressTracker.metadataDirectoryPath)
)
}
test("recover from progress after updating code (no checkpoint provided)") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(
Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
testUnaryOperation(
input,
@ -303,24 +392,31 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart)
expectedOutputBeforeRestart
)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L)
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L
)
ssc.stop()
reset()
@ -334,36 +430,43 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(6000, Map(EventHubNameAndPartition("eh1", 0) -> (7L, 7L),
EventHubNameAndPartition("eh1", 1) -> (7L, 7L),
EventHubNameAndPartition("eh1", 2) -> (7L, 7L))
)),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(6000,
Map(EventHubNameAndPartition("eh1", 0) -> (7L, 7L),
EventHubNameAndPartition("eh1", 1) -> (7L, 7L),
EventHubNameAndPartition("eh1", 2) -> (7L, 7L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputAfterRestart)
expectedOutputAfterRestart
)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(7000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
8000L)
expectedOffsetsAndSeqs = OffsetRecord(7000L,
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
8000L
)
}
test("recover correctly when checkpoint writing is delayed") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(
Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6),
Seq(10, 11, 3, 4, 7, 8))
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(Seq(4, 5, 7, 8, 10, 2),
Seq(6, 7, 9, 10, 3, 4),
Seq(8, 9, 11, 2, 5, 6),
Seq(10, 11, 3, 4, 7, 8))
testUnaryOperation(
input,
@ -371,24 +474,31 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart)
expectedOutputBeforeRestart
)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L)
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L
)
val currentCheckpointDirectory = ssc.checkpointDir
@ -400,38 +510,43 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
ssc.stop()
reset()
ssc = StreamingContext.getOrCreate(currentCheckpointDirectory,
ssc = StreamingContext.getOrCreate(
currentCheckpointDirectory,
() => createContextForCheckpointOperation(batchDuration, checkpointDirectory))
ssc.graph.getInputStreams().filter(_.isInstanceOf[EventHubDirectDStream]).map(
_.asInstanceOf[EventHubDirectDStream]).head.currentOffsetsAndSeqNums =
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (1L, 1L),
EventHubNameAndPartition("eh1", 1) -> (1L, 1L),
EventHubNameAndPartition("eh1", 2) -> (1L, 1L)))
ssc.graph
.getInputStreams()
.filter(_.isInstanceOf[EventHubDirectDStream])
.map(_.asInstanceOf[EventHubDirectDStream])
.head
.currentOffsetsAndSeqNums = OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (1L, 1L),
EventHubNameAndPartition("eh1", 1) -> (1L, 1L),
EventHubNameAndPartition("eh1", 2) -> (1L, 1L)))
runStreamsWithEventHubInput(ssc,
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart, useSet = true)
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart,
useSet = true)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(5000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
6000L)
expectedOffsetsAndSeqs = OffsetRecord(5000L,
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
6000L
)
}
test("continue processing when the application crash before the last commit finished") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart = Seq(
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val expectedOutputAfterRestart =
Seq(Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
testUnaryOperation(
input,
@ -439,24 +554,31 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart)
expectedOutputBeforeRestart
)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L)
expectedOffsetsAndSeqs = OffsetRecord(3000L,
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
4000L
)
val currentCheckpointDirectory = ssc.checkpointDir
@ -468,47 +590,58 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
fs.delete(new Path(progressRootPath.toString + s"/$appName/progress-3000"), true)
fs.delete(new Path(progressRootPath.toString + s"/${appName}_metadata/3000"), true)
ssc = StreamingContext.getOrCreate(currentCheckpointDirectory,
ssc = StreamingContext.getOrCreate(
currentCheckpointDirectory,
() => createContextForCheckpointOperation(batchDuration, checkpointDirectory))
assert(ssc.graph.getInputStreams().filter(_.isInstanceOf[EventHubDirectDStream]).map(
_.asInstanceOf[EventHubDirectDStream]).head.currentOffsetsAndSeqNums ===
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
assert(
ssc.graph
.getInputStreams()
.filter(_.isInstanceOf[EventHubDirectDStream])
.map(_.asInstanceOf[EventHubDirectDStream])
.head
.currentOffsetsAndSeqNums ===
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
runStreamsWithEventHubInput(ssc,
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart, useSet = true)
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart,
useSet = true)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(5000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
6000L)
expectedOffsetsAndSeqs = OffsetRecord(5000L,
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
6000L
)
}
test("progress files are clean up correctly with a fragile rest endpoint") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart =
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
// the order of the output should look as if there is no issue, because we reuse the fetched
// highest offset
val expectedOutputAfterRestart = Seq(
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq(), Seq(), Seq())
val expectedOutputAfterRestart = Seq(Seq(6, 7, 9, 10, 3, 4),
Seq(8, 9, 11, 2, 5, 6),
Seq(10, 11, 3, 4, 7, 8),
Seq(),
Seq(),
Seq())
// ugly stuff to make things serializable
FragileEventHubClient.numBatchesBeforeCrashedEndpoint = 3
FragileEventHubClient.lastBatchWhenEndpointCrashed = 6
FragileEventHubClient.latestRecords = Map(
EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))
FragileEventHubClient.latestRecords = Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))
testFragileStream(
input,
@ -516,16 +649,22 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eh1" -> Map(
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1")
"eventhubs.name" -> "eh1",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
)),
expectedOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(2000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutput = expectedOutputBeforeRestart)
expectedOutput = expectedOutputBeforeRestart
)
val currentCheckpointDirectory = ssc.checkpointDir
ssc.stop()
@ -534,28 +673,30 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
ssc = new StreamingContext(currentCheckpointDirectory)
runStreamsWithEventHubInput(ssc,
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart, useSet = true)
expectedOutputAfterRestart.length - 1,
expectedOutputAfterRestart,
useSet = true)
testProgressTracker(
eventhubNamespace,
expectedOffsetsAndSeqs =
OffsetRecord(8000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
9000L)
expectedOffsetsAndSeqs = OffsetRecord(8000L,
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
9000L
)
}
test("offset type is saved and recovered correctly from checkpoint") {
val input = Seq(
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(
Seq(4, 5, 7, 8, 10, 2))
val expectedOutputAfterRestart = Seq(
Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6),
Seq(10, 11, 3, 4, 7, 8), Seq())
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
val expectedOutputBeforeRestart = Seq(Seq(4, 5, 7, 8, 10, 2))
val expectedOutputAfterRestart = Seq(Seq(4, 5, 7, 8, 10, 2),
Seq(6, 7, 9, 10, 3, 4),
Seq(8, 9, 11, 2, 5, 6),
Seq(10, 11, 3, 4, 7, 8),
Seq())
testCheckpointedOperation(
input,
@ -564,20 +705,26 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
"eventhubs.partition.count" -> "3",
"eventhubs.maxRate" -> "2",
"eventhubs.name" -> "eh1",
"eventhubs.filter.enqueuetime" -> "2000")
"eventhubs.filter.enqueuetime" -> "2000",
"eventhubs.namespace" -> "namespace",
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey"
)
),
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
OffsetRecord(0L, Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))
)),
expectedStartingOffsetsAndSeqs = Map(
eventhubNamespace ->
OffsetRecord(0L,
Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L)))),
expectedOffsetsAndSeqs = OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))),
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))),
operation = (inputDStream: EventHubDirectDStream) =>
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
expectedOutputBeforeRestart,
expectedOutputAfterRestart)
expectedOutputAfterRestart
)
}
}

Просмотреть файл

@ -1,223 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.streaming.eventhubs
import java.io.File
import scala.collection.mutable.ArrayBuffer
import scala.concurrent.duration._
import com.microsoft.azure.eventhubs.EventData
import com.microsoft.azure.eventhubs.EventData.SystemProperties
import com.microsoft.azure.eventhubs.amqp.AmqpConstants
import org.mockito.internal.util.reflection.Whitebox
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
import org.scalatest.concurrent.Eventually
import org.scalatest.mock.MockitoSugar
import org.apache.spark.SparkConf
import org.apache.spark.eventhubscommon.client.{EventHubsClientWrapper, EventHubsOffsetTypes}
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
import org.apache.spark.util.Utils
/**
* Test suite for ReliableEventHubsReceiver
* This suite of tests uses Spark local mode with an EventHubs dummy receiver for e2e testing
*/
class ReliableEventHubsReceiverSuite extends FunSuite with BeforeAndAfter with BeforeAndAfterAll
with MockitoSugar with Eventually {
private var streamingContext: StreamingContext = _
private var ehClientWrapperMock: EventHubsClientWrapper = _
private var offsetStoreMock: OffsetStore = _
private var tempDirectory: File = _
private val eventhubParameters = Map[String, String] (
"eventhubs.policyname" -> "policyname",
"eventhubs.policykey" -> "policykey",
"eventhubs.namespace" -> "namespace",
"eventhubs.name" -> "name",
"eventhubs.partition.count" -> "4",
"eventhubs.checkpoint.dir" -> "checkpointdir",
"eventhubs.checkpoint.interval" -> "0"
)
private val sparkConf = new SparkConf()
.setMaster("local[3]") // At least 2, 1 for receiver and 1 for data transform
.setAppName("ReliableEventHubsReceiverSuite")
.set("spark.streaming.receiver.writeAheadLog.enable", "true")
.set("spark.driver.allowMultipleContexts", "true")
override def beforeAll() : Unit = {
}
override def afterAll() : Unit = {
}
before {
tempDirectory = Utils.createTempDir()
// tempDirectory.deleteOnExit()
streamingContext = new StreamingContext(sparkConf, Milliseconds(500))
streamingContext.checkpoint(tempDirectory.getAbsolutePath)
offsetStoreMock = new MyMockedOffsetStore
}
after {
if (streamingContext != null) {
streamingContext.stop()
streamingContext = null
}
if(tempDirectory != null) {
// Utils.deleteRecursively(tempDirectory)
tempDirectory.delete()
tempDirectory = null
}
}
// Test ignored due to an issue with the mocking library being unavailable to the executors.
test("Reliable EventHubs input stream") {
// after 100 messages then start to receive null
ehClientWrapperMock = new MyMockedEventHubsClientWrapper(100, -1)
val stream = EventHubsUtils.createStream(streamingContext, eventhubParameters, "0",
StorageLevel.MEMORY_ONLY, offsetStoreMock, ehClientWrapperMock)
var count = 0
stream.map { v => v }.foreachRDD { r =>
val ret = r.collect()
ret.foreach { v =>
count += 1
}
}
streamingContext.start()
eventually(timeout(4000.milliseconds), interval(200.milliseconds)) {
// Make sure we have received 100 messages
assert(count === 100)
}
}
test("Reliable EventHubs input stream recover from exception") {
// After 60 messages then exception, after 100 messages then receive null
ehClientWrapperMock = new MyMockedEventHubsClientWrapper(100, 60)
val stream = EventHubsUtils.createStream(streamingContext, eventhubParameters, "0",
StorageLevel.MEMORY_ONLY, offsetStoreMock, ehClientWrapperMock)
var count = 0
stream.map { v => v }.foreachRDD { r =>
val ret = r.collect()
ret.foreach { v =>
count += 1
}
}
streamingContext.start()
eventually(timeout(10000.milliseconds), interval(200.milliseconds)) {
// Make sure we have received 100 messages
assert(count === 100)
}
}
}
/**
* The Mock class for EventHubsClientWrapper.
* Note this class only supports the offset filter.
*
* @param emitCount the number of messages emitted before it returns null
* @param exceptionCount the number of messages emitted before it throws an exception;
*                       the exception is thrown only once
*/
class MyMockedEventHubsClientWrapper(
emitCount: Int,
exceptionCount: Int) extends EventHubsClientWrapper with MockitoSugar {
var offset: Int = -1
var count = 0
var partition = "0"
var myExceptionCount: Int = exceptionCount
override def createReceiverInternal(
connectionString: String,
eventhubsName: String,
consumerGroup: String,
partitionId: String,
offsetType: EventHubsOffsetType,
currentOffset: String,
receiverEpoch: Long): Unit = {
if (offsetType != EventHubsOffsetTypes.None) {
offset = currentOffset.toInt
partition = partitionId
}
}
override def closeReceiver(): Unit = {
// no ops
}
override def receive(): Iterable[EventData] = {
if (count == myExceptionCount) {
// make sure we only throw the exception once
myExceptionCount = -1
throw new RuntimeException("count = " + count)
}
offset += 1
count += 1
// do not send more than emitCount messages
if (count <= emitCount) {
val eventData = new EventData(Array.fill(8)(
(scala.util.Random.nextInt(256) - 128).toByte))
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, offset.toString)
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, Long.box(count))
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, partition)
val systemProperties = new SystemProperties(systemPropertiesMap)
Whitebox.setInternalState(eventData, "systemProperties", systemProperties)
val eventDataCollection: ArrayBuffer[EventData] = new ArrayBuffer[EventData]()
eventDataCollection += eventData
eventDataCollection
} else {
Thread.sleep(1000)
null
}
}
}
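// Illustrative sketch (not part of the original suite): exercising the receive() contract of the
// mock above. With emitCount = 2 and exceptionCount = 1 the expected call sequence is one
// single-event batch, one RuntimeException (thrown exactly once), one more batch, and then null
// after a one-second sleep once emitCount is exhausted.
object MockedClientWrapperSketch {
  def main(args: Array[String]): Unit = {
    val mock = new MyMockedEventHubsClientWrapper(emitCount = 2, exceptionCount = 1)
    println(mock.receive().size) // 1 event (count = 1)
    try mock.receive()
    catch { case e: RuntimeException => println(s"thrown once: ${e.getMessage}") }
    println(mock.receive().size) // 1 event (count = 2, still <= emitCount)
    println(mock.receive()) // null (count = 3 > emitCount)
  }
}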
/**
* The Mock class for OffsetStore
*/
class MyMockedOffsetStore extends OffsetStore {
var myOffset: String = "-1"
override def open(): Unit = {
}
override def write(offset: String): Unit = {
println("writing offset to MyMockedOffsetStore:" + offset)
myOffset = offset
}
override def read(): String = {
println("reading offset from MyMockedOffsetStore:" + myOffset)
myOffset
}
override def close(): Unit = {
}
}
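// Illustrative sketch (not part of the original suite): the mocked store simply echoes back the
// last offset written to it, which is all the receiver tests above require.
object MockedOffsetStoreSketch {
  def main(args: Array[String]): Unit = {
    val store = new MyMockedOffsetStore
    println(store.read()) // "-1" until an offset is written
    store.write("42")
    println(store.read()) // "42"
  }
}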


@ -20,14 +20,17 @@ package org.apache.spark.streaming.eventhubs
import java.nio.file.Files
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.scalatest.{BeforeAndAfterEach, FunSuite}
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.scalatest.{ BeforeAndAfterEach, FunSuite }
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.eventhubscommon.EventHubsConnector
import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.checkpoint.{DirectDStreamProgressTracker, ProgressTrackingListener}
import org.apache.spark.streaming.{ Duration, Seconds, StreamingContext }
import org.apache.spark.streaming.eventhubs.checkpoint.{
DirectDStreamProgressTracker,
ProgressTrackingListener
}
private[spark] trait SharedUtils extends FunSuite with BeforeAndAfterEach {
@ -59,13 +62,17 @@ private[spark] trait SharedUtils extends FunSuite with BeforeAndAfterEach {
protected def init(): Unit = {
progressRootPath = new Path(Files.createTempDirectory("progress_root").toString)
fs = progressRootPath.getFileSystem(new Configuration())
val sparkContext = new SparkContext(new SparkConf().setAppName(appName).
setMaster("local[*]").set("spark.streaming.clock", streamingClock))
val sparkContext = new SparkContext(
new SparkConf()
.setAppName(appName)
.setMaster("local[*]")
.set("spark.streaming.clock", streamingClock))
sparkContext.setLogLevel("INFO")
ssc = new StreamingContext(sparkContext, batchDuration)
progressListener = ProgressTrackingListener.initInstance(ssc, progressRootPath.toString)
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString, appName,
new Configuration())
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString,
appName,
new Configuration())
}
protected def reset(): Unit = {


@ -17,21 +17,24 @@
package org.apache.spark.streaming.eventhubs.checkpoint
import java.nio.file.{Files, Paths, StandardOpenOption}
import java.nio.file.{ Files, Paths, StandardOpenOption }
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord}
import org.apache.spark.eventhubscommon.progress.{PathTools, ProgressRecord, ProgressWriter}
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.apache.spark.eventhubscommon.progress.{ PathTools, ProgressRecord, ProgressWriter }
import org.apache.spark.eventhubscommon.{
EventHubNameAndPartition,
EventHubsConnector,
OffsetRecord
}
import org.apache.spark.streaming.eventhubs.SharedUtils
class ProgressTrackerSuite extends SharedUtils {
class DummyEventHubsConnector(
sId: Int,
uniqueId: String,
connedInstances: List[EventHubNameAndPartition]) extends EventHubsConnector {
class DummyEventHubsConnector(sId: Int,
uniqueId: String,
connedInstances: List[EventHubNameAndPartition])
extends EventHubsConnector {
override def streamId: Int = sId
override def uid: String = uniqueId
@ -44,16 +47,15 @@ class ProgressTrackerSuite extends SharedUtils {
DirectDStreamProgressTracker.reset()
}
private def writeProgressFile(
progressPath: String,
streamId: Int,
fs: FileSystem,
timestamp: Long,
namespace: String,
ehName: String,
partitionRange: Range,
offset: Int,
seq: Int): Unit = {
private def writeProgressFile(progressPath: String,
streamId: Int,
fs: FileSystem,
timestamp: Long,
namespace: String,
ehName: String,
partitionRange: Range,
offset: Int,
seq: Int): Unit = {
for (partitionId <- partitionRange) {
val filePath = Paths.get(progressPath + s"/${PathTools.makeProgressFileName(timestamp)}")
val stdOpenOption = if (Files.exists(filePath)) {
@ -62,9 +64,9 @@ class ProgressTrackerSuite extends SharedUtils {
StandardOpenOption.CREATE
}
Files.write(filePath,
s"${ProgressRecord(timestamp, namespace, ehName, partitionId, offset, seq)
.toString}\n".getBytes,
Files.write(
filePath,
s"${ProgressRecord(timestamp, namespace, ehName, partitionId, offset, seq).toString}\n".getBytes,
stdOpenOption)
}
}
@ -72,16 +74,18 @@ class ProgressTrackerSuite extends SharedUtils {
private def createMetadataFile(fs: FileSystem, metadataPath: String, timestamp: Long): Unit =
fs.create(new Path(s"$metadataPath/${PathTools.makeMetadataFileName(timestamp)}"))
test("progress temp directory is created properly when progress and progress temp" +
" directory do not exist") {
test(
"progress temp directory is created properly when progress and progress temp" +
" directory do not exist") {
progressTracker = DirectDStreamProgressTracker
.initInstance(progressRootPath.toString, appName, new Configuration())
assert(fs.exists(progressTracker.progressDirectoryPath))
assert(fs.exists(progressTracker.tempDirectoryPath))
}
test("progress temp directory is created properly when progress exists while progress" +
" temp does not") {
test(
"progress temp directory is created properly when progress exists while progress" +
" temp does not") {
fs.mkdirs(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName))
progressTracker = DirectDStreamProgressTracker
.initInstance(progressRootPath.toString, appName, new Configuration())
@ -103,10 +107,14 @@ class ProgressTrackerSuite extends SharedUtils {
}
test("incomplete progress would be discarded") {
createDirectStreams(ssc, "namespace1", progressRootPath.toString,
createDirectStreams(
ssc,
"namespace1",
progressRootPath.toString,
Map("eh1" -> Map("eventhubs.partition.count" -> "1"),
"eh2" -> Map("eventhubs.partition.count" -> "2"),
"eh3" -> Map("eventhubs.partition.count" -> "3")))
"eh3" -> Map("eventhubs.partition.count" -> "3"))
)
val progressPath = PathTools.makeProgressDirectoryStr(progressRootPath.toString, appName)
fs.mkdirs(new Path(progressPath))
@ -137,15 +145,19 @@ class ProgressTrackerSuite extends SharedUtils {
assert(fs.exists(new Path(progressPath + "/progress-1000")))
}
private def verifyProgressFile(
namespace: String, ehName: String, partitionRange: Range,
timestamp: Long, expectedOffsetAndSeq: Seq[(Long, Long)]): Unit = {
val ehMap = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
private def verifyProgressFile(namespace: String,
ehName: String,
partitionRange: Range,
timestamp: Long,
expectedOffsetAndSeq: Seq[(Long, Long)]): Unit = {
val ehMap = progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.read(namespace, timestamp - 1000L, fallBack = false)
var expectedOffsetAndSeqIdx = 0
for (partitionId <- partitionRange) {
assert(ehMap.offsets(EventHubNameAndPartition(ehName, partitionId)) ===
expectedOffsetAndSeq(expectedOffsetAndSeqIdx))
assert(
ehMap.offsets(EventHubNameAndPartition(ehName, partitionId)) ===
expectedOffsetAndSeq(expectedOffsetAndSeqIdx))
expectedOffsetAndSeqIdx += 1
}
}
@ -153,15 +165,23 @@ class ProgressTrackerSuite extends SharedUtils {
test("start from the beginning of the streams when the latest progress file does not exist") {
// generate 6 EventHubNameAndPartitions
val dStream =
createDirectStreams(ssc, "namespace1", progressRootPath.toString,
createDirectStreams(
ssc,
"namespace1",
progressRootPath.toString,
Map("eh1" -> Map("eventhubs.partition.count" -> "1"),
"eh2" -> Map("eventhubs.partition.count" -> "2"),
"eh3" -> Map("eventhubs.partition.count" -> "3")))
"eh2" -> Map("eventhubs.partition.count" -> "2"),
"eh3" -> Map("eventhubs.partition.count" -> "3"))
)
val dStream1 =
createDirectStreams(ssc, "namespace2", progressRootPath.toString,
createDirectStreams(
ssc,
"namespace2",
progressRootPath.toString,
Map("eh11" -> Map("eventhubs.partition.count" -> "1"),
"eh12" -> Map("eventhubs.partition.count" -> "2"),
"eh13" -> Map("eventhubs.partition.count" -> "3")))
"eh12" -> Map("eventhubs.partition.count" -> "2"),
"eh13" -> Map("eventhubs.partition.count" -> "3"))
)
dStream.start()
dStream1.start()
@ -214,16 +234,16 @@ class ProgressTrackerSuite extends SharedUtils {
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh11", 0 to 0, 1, 2)
// write an invalid record
Files.write(
Paths.get(progressPath + s"/progress-1000"),
(ProgressRecord(2000L, "namespace2", "eh12", 0, 2, 3).toString + "\n").getBytes,
StandardOpenOption.APPEND)
Files.write(Paths.get(progressPath + s"/progress-1000"),
(ProgressRecord(2000L, "namespace2", "eh12", 0, 2, 3).toString + "\n").getBytes,
StandardOpenOption.APPEND)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh12", 1 to 1, 2, 3)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh13", 0 to 2, 3, 4)
intercept[IllegalArgumentException] {
progressTracker.asInstanceOf[DirectDStreamProgressTracker]
progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.read("namespace2", 1000L, fallBack = false)
}
}
@ -238,19 +258,40 @@ class ProgressTrackerSuite extends SharedUtils {
val connector1 = new DummyEventHubsConnector(0, "namespace1", connectedInstances)
val connector2 = new DummyEventHubsConnector(0, "namespace2", connectedInstances)
var progressWriter = new ProgressWriter(0, "namespace1", eh1Partition0,
1000L, new Configuration(), progressRootPath.toString, appName)
var progressWriter = new ProgressWriter(0,
"namespace1",
eh1Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 0, 1)
progressWriter = new ProgressWriter(0, "namespace1", eh2Partition0, 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace1",
eh2Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 0, 1)
progressWriter = new ProgressWriter(0, "namespace2", eh1Partition0, 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace2",
eh1Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 10, 20)
progressWriter = new ProgressWriter(0, "namespace2", eh2Partition0, 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace2",
eh2Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 20, 30)
val s = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
val s = progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.collectProgressRecordsForBatch(1000L, List(connector1, connector2))
assert(s.contains("namespace1"))
@ -271,118 +312,147 @@ class ProgressTrackerSuite extends SharedUtils {
val connector1 = new DummyEventHubsConnector(0, "namespace1", connectedInstances)
val connector2 = new DummyEventHubsConnector(0, "namespace2", connectedInstances)
var progressWriter = new ProgressWriter(0, "namespace1", eh1Partition0,
1000L, new Configuration(), progressRootPath.toString, appName)
var progressWriter = new ProgressWriter(0,
"namespace1",
eh1Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 0, 1)
progressWriter = new ProgressWriter(0, "namespace1", eh2Partition0, 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace1",
eh2Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 0, 1)
progressWriter = new ProgressWriter(0, "namespace2", eh1Partition0, 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace2",
eh1Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(2000L, 10, 20)
progressWriter = new ProgressWriter(0, "namespace2", eh2Partition0, 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace2",
eh2Partition0,
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 20, 30)
intercept[IllegalStateException] {
progressTracker.asInstanceOf[DirectDStreamProgressTracker].
collectProgressRecordsForBatch(1000L, List(connector1, connector2))
progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.collectProgressRecordsForBatch(1000L, List(connector1, connector2))
}
}
test("latest offsets can be committed correctly and temp directory is not cleaned") {
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString, appName,
new Configuration())
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString,
appName,
new Configuration())
var progressWriter = new ProgressWriter(0, "namespace1", EventHubNameAndPartition("eh1", 0),
1000L, new Configuration(), progressRootPath.toString, appName)
var progressWriter = new ProgressWriter(0,
"namespace1",
EventHubNameAndPartition("eh1", 0),
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 0, 0)
progressWriter = new ProgressWriter(0, "namespace1", EventHubNameAndPartition("eh2", 0), 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace1",
EventHubNameAndPartition("eh2", 0),
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 1, 1)
progressWriter = new ProgressWriter(0, "namespace2", EventHubNameAndPartition("eh1", 0), 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace2",
EventHubNameAndPartition("eh1", 0),
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 2, 2)
progressWriter = new ProgressWriter(0, "namespace2", EventHubNameAndPartition("eh2", 0), 1000L,
new Configuration(), progressRootPath.toString, appName)
progressWriter = new ProgressWriter(0,
"namespace2",
EventHubNameAndPartition("eh2", 0),
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 3, 3)
val offsetToCommit = Map(
"namespace1" -> Map(
EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
EventHubNameAndPartition("eh2", 1) -> (1L, 1L)),
"namespace2" -> Map(
EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
EventHubNameAndPartition("eh2", 4) -> (3L, 3L)))
"namespace1" -> Map(EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
EventHubNameAndPartition("eh2", 1) -> (1L, 1L)),
"namespace2" -> Map(EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
EventHubNameAndPartition("eh2", 4) -> (3L, 3L))
)
progressTracker.asInstanceOf[DirectDStreamProgressTracker].commit(offsetToCommit, 1000L)
val namespace1Offsets = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
val namespace1Offsets = progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.read("namespace1", 1000L, fallBack = false)
assert(namespace1Offsets === OffsetRecord(1000L, Map(
EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
EventHubNameAndPartition("eh2", 1) -> (1L, 1L))))
val namespace2Offsets = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
assert(
namespace1Offsets === OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
EventHubNameAndPartition("eh2", 1) -> (1L, 1L))))
val namespace2Offsets = progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.read("namespace2", 1000L, fallBack = false)
assert(namespace2Offsets === OffsetRecord(1000L, Map(
EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
EventHubNameAndPartition("eh2", 4) -> (3L, 3L))))
assert(
namespace2Offsets === OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
EventHubNameAndPartition("eh2", 4) -> (3L, 3L))))
// test temp directory cleanup
assert(fs.exists(PathTools.makeTempDirectoryPath(
progressRootPath.toString, appName)))
assert(fs.listStatus(PathTools.makeTempDirectoryPath(
progressRootPath.toString, appName)).length === 4)
assert(fs.exists(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName)))
assert(
fs.listStatus(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName))
.length === 4)
}
test("locate ProgressFile correctly") {
progressTracker = DirectDStreamProgressTracker
.initInstance(progressRootPath.toString, appName, new Configuration())
assert(progressTracker.asInstanceOf[DirectDStreamProgressTracker]
.pinPointProgressFile(fs, 1000L) === None)
assert(
progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.pinPointProgressFile(fs, 1000L) === None)
val progressPath = PathTools.makeProgressDirectoryStr(progressRootPath.toString, appName)
fs.mkdirs(new Path(progressPath))
// 1000
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh1",
0 to 0, 0, 1)
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh2",
0 to 1, 0, 2)
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh3",
0 to 2, 0, 3)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh11",
0 to 0, 1, 2)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh12",
0 to 1, 2, 3)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh13",
0 to 2, 3, 4)
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh1", 0 to 0, 0, 1)
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh2", 0 to 1, 0, 2)
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh3", 0 to 2, 0, 3)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh11", 0 to 0, 1, 2)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh12", 0 to 1, 2, 3)
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh13", 0 to 2, 3, 4)
// 2000
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh1",
0 to 0, 1, 2)
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh2",
0 to 1, 1, 3)
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh3",
0 to 2, 1, 4)
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh11",
0 to 0, 2, 3)
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh12",
0 to 1, 3, 4)
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh13",
0 to 2, 4, 5)
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh1", 0 to 0, 1, 2)
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh2", 0 to 1, 1, 3)
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh3", 0 to 2, 1, 4)
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh11", 0 to 0, 2, 3)
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh12", 0 to 1, 3, 4)
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh13", 0 to 2, 4, 5)
// 3000
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh1",
0 to 0, 2, 3)
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh2",
0 to 1, 2, 4)
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh3",
0 to 2, 2, 5)
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh11",
0 to 0, 3, 4)
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh12",
0 to 1, 4, 5)
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh13",
0 to 2, 5, 6)
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh1", 0 to 0, 2, 3)
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh2", 0 to 1, 2, 4)
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh3", 0 to 2, 2, 5)
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh11", 0 to 0, 3, 4)
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh12", 0 to 1, 4, 5)
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh13", 0 to 2, 5, 6)
// if the latest timestamp is earlier than the specified timestamp,
// then we shall return the latest offsets


@ -19,65 +19,85 @@ package org.apache.spark.streaming.eventhubs.checkpoint
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
import org.apache.spark.eventhubscommon.progress.ProgressWriter
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
import org.apache.spark.streaming.eventhubs.SharedUtils
import org.apache.spark.streaming.scheduler.OutputOperationInfo
import org.apache.spark.streaming.{ Seconds, StreamingContext }
import org.apache.spark.{ SparkConf, SparkContext }
// scalastyle:off
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.scheduler.{BatchInfo, StreamInputInfo, StreamingListenerBatchCompleted}
// scalastyle:on
import org.apache.spark.streaming.scheduler.{
BatchInfo,
StreamInputInfo,
StreamingListenerBatchCompleted
}
class ProgressTrackingListenerSuite extends SharedUtils {
test("commit offsets with a successful micro batch correctly") {
val batchCompletedEvent = StreamingListenerBatchCompleted(BatchInfo(
Time(1000L),
Map(0 -> StreamInputInfo(0, 10000)),
0L,
None,
None,
Map(1 -> OutputOperationInfo(Time(1000L), 1, "output", "", None, None, None))
))
val dstream = createDirectStreams(ssc, eventhubNamespace, progressRootPath.toString,
Map("eh1" -> Map("eventhubs.partition.count" -> "2")))
val batchCompletedEvent = StreamingListenerBatchCompleted(
BatchInfo(
Time(1000L),
Map(0 -> StreamInputInfo(0, 10000)),
0L,
None,
None,
Map(1 -> OutputOperationInfo(Time(1000L), 1, "output", "", None, None, None))
))
val dstream = createDirectStreams(ssc,
eventhubNamespace,
progressRootPath.toString,
Map("eh1" -> Map("eventhubs.partition.count" -> "2")))
dstream.start()
val progressWriter = new ProgressWriter(streamId, eventhubNamespace,
EventHubNameAndPartition("eh1", 1), 1000L,
new Configuration(), progressRootPath.toString, appName)
val progressWriter = new ProgressWriter(streamId,
eventhubNamespace,
EventHubNameAndPartition("eh1", 1),
1000L,
new Configuration(),
progressRootPath.toString,
appName)
progressWriter.write(1000L, 1L, 2L)
assert(fs.exists(progressWriter.tempProgressTrackingPointPath))
progressListener.onBatchCompleted(batchCompletedEvent)
assert(fs.exists(progressWriter.tempProgressTrackingPointPath))
assert(fs.exists(new Path(progressTracker.progressDirectoryPath + "/progress-1000")))
val record = progressTracker.asInstanceOf[DirectDStreamProgressTracker].read(eventhubNamespace,
1000L, fallBack = false)
assert(record === OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 1) -> (1L, 2L))))
val record = progressTracker
.asInstanceOf[DirectDStreamProgressTracker]
.read(eventhubNamespace, 1000L, fallBack = false)
assert(
record === OffsetRecord(1000L,
Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
EventHubNameAndPartition("eh1", 1) -> (1L, 2L))))
}
test("do not commit offsets when there is a failure in microbatch") {
val batchCompletedEvent = StreamingListenerBatchCompleted(BatchInfo(
Time(1000L),
Map(0 -> StreamInputInfo(0, 10000)),
0L,
None,
None,
Map(
1 -> OutputOperationInfo(Time(1000L), 1, "outputWithFailure", "", None, None,
Some("instrumented failure")),
2 -> OutputOperationInfo(Time(1000L), 2, "correct output", "", None, None, None)))
)
val batchCompletedEvent = StreamingListenerBatchCompleted(
BatchInfo(
Time(1000L),
Map(0 -> StreamInputInfo(0, 10000)),
0L,
None,
None,
Map(
1 -> OutputOperationInfo(Time(1000L),
1,
"outputWithFailure",
"",
None,
None,
Some("instrumented failure")),
2 -> OutputOperationInfo(Time(1000L), 2, "correct output", "", None, None, None)
)
))
// build temp directories
val progressWriter = new ProgressWriter(streamId, eventhubNamespace,
EventHubNameAndPartition("eh1", 1), 1000L,
new Configuration(), progressTracker.tempDirectoryPath.toString,
appName)
val progressWriter = new ProgressWriter(streamId,
eventhubNamespace,
EventHubNameAndPartition("eh1", 1),
1000L,
new Configuration(),
progressTracker.tempDirectoryPath.toString,
appName)
progressWriter.write(1000L, 0L, 0L)
assert(fs.exists(progressWriter.tempProgressTrackingPointPath))
progressListener.onBatchCompleted(batchCompletedEvent)
@ -90,20 +110,30 @@ class ProgressTrackingListenerSuite extends SharedUtils {
ProgressTrackingListener.reset(ssc)
ssc.stop()
// create new streaming context
ssc = new StreamingContext(new SparkContext(new SparkConf().setAppName(appName).
setMaster("local[*]")), Seconds(5))
createDirectStreams(ssc, "namespace1", progressRootPath.toString,
ssc = new StreamingContext(
new SparkContext(new SparkConf().setAppName(appName).setMaster("local[*]")),
Seconds(5))
createDirectStreams(
ssc,
"namespace1",
progressRootPath.toString,
Map("eh1" -> Map("eventhubs.partition.count" -> "1"),
"eh2" -> Map("eventhubs.partition.count" -> "2"),
"eh3" -> Map("eventhubs.partition.count" -> "3"))).start()
createDirectStreams(ssc, "namespace2", progressRootPath.toString,
"eh2" -> Map("eventhubs.partition.count" -> "2"),
"eh3" -> Map("eventhubs.partition.count" -> "3"))
).start()
createDirectStreams(
ssc,
"namespace2",
progressRootPath.toString,
Map("eh11" -> Map("eventhubs.partition.count" -> "1"),
"eh12" -> Map("eventhubs.partition.count" -> "2"),
"eh13" -> Map("eventhubs.partition.count" -> "3"))).start()
"eh12" -> Map("eventhubs.partition.count" -> "2"),
"eh13" -> Map("eventhubs.partition.count" -> "3"))
).start()
import scala.collection.JavaConverters._
assert(ssc.scheduler.listenerBus.listeners.asScala.count(
_.isInstanceOf[ProgressTrackingListener]) === 1)
assert(
ssc.scheduler.listenerBus.listeners.asScala
.count(_.isInstanceOf[ProgressTrackingListener]) === 1)
assert(DirectDStreamProgressTracker.registeredConnectors.length === 2)
ssc.stop()
}
}
}


@ -1,38 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.arguments
object EventhubsArgumentKeys extends Enumeration {
val EventhubsNamespace: String = "eventhubsNamespace"
val EventhubsName: String = "eventhubsName"
val PolicyName: String = "policyName"
val PolicyKey: String = "policyKey"
val ConsumerGroup: String = "consumerGroup"
val PartitionCount: String = "partitionCount"
val BatchIntervalInSeconds: String = "batchInterval"
val CheckpointDirectory: String = "checkpointDirectory"
val EventCountFolder: String = "eventCountFolder"
val EventStoreFolder: String = "eventStoreFolder"
val EventHiveTable: String = "eventHiveTable"
val SQLServerFQDN: String = "sqlServerFQDN"
val SQLDatabaseName: String = "sqlDatabaseName"
val DatabaseUsername: String = "databaseUsername"
val DatabasePassword: String = "databasePassword"
val EventSQLTable: String = "eventSQLTable"
val TimeoutInMinutes: String = "jobTimeout"
}


@ -1,173 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.arguments
object EventhubsArgumentParser {
type ArgumentMap = Map[Symbol, Any]
def usageExample(): Unit = {
val eventhubsNamespace: String = "sparkstreamingeventhub-ns"
val eventhubsName: String = "sparkstreamingeventhub"
val policyName: String = "[EventhubsPolicyName]"
val policyKey: String = "[EventhubsPolicyKey]"
val consumerGroup: String = "$default"
val partitionCount: Int = 32
val batchInterval: Int = 10
val checkpointDirectory: String = "/EventCheckpoint10"
val eventCountFolder: String = "/EventCount/EventCount10"
val eventStoreFolder: String = "/EventStore/EventStore10"
val eventHiveTable: String = "EventHiveTable10"
val sqlServerFQDN: String = "servername.database.windows.net"
val sqlDatabaseName: String = "databasename"
val databaseUsername: String = "[DatabaseUsername]"
val databasePassword: String = "[DatabasePassword]"
val eventSQLTable: String = "EventSQLTable10"
val timeoutInMinutes: Long = -1
println()
// scalastyle:off
println(s"Usage [EventhubsEventCount]: spark-submit --master yarn-cluster ..." +
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace \'$eventhubsNamespace\'" +
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
s" --consumer-group \'$consumerGroup\' --partition-count $partitionCount" +
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
s" --event-count-folder \'$eventCountFolder\' --job-timeout-in-minutes $timeoutInMinutes")
println()
println(s"Usage [EventhubsToAzureBlobAsJSON]: spark-submit --master yarn-cluster ..." +
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace \'$eventhubsNamespace\'" +
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
s" --consumer-group \'$consumerGroup\' --partition-count $partitionCount" +
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
s" --event-count-folder \'$eventCountFolder\' --event-store-folder \'$eventStoreFolder\'" +
s" --job-timeout-in-minutes $timeoutInMinutes")
println()
println(s"Usage [EventhubsToHiveTable]: spark-submit --master yarn-cluster ..." +
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace \'$eventhubsNamespace\'" +
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
s" --consumer-group \'$consumerGroup --partition-count $partitionCount" +
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
s" --event-count-folder \'$eventCountFolder\' --event-hive-table \'$eventHiveTable\'" +
s" --job-timeout-in-minutes $timeoutInMinutes")
println()
println(s"Usage [EventhubsToSQLTable]: spark-submit --master yarn-cluster ..." +
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace $eventhubsNamespace" +
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
s" --consumer-group \'$consumerGroup\' --partition-count $partitionCount" +
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
s" --event-count-folder \'$eventCountFolder\' --sql-server-fqdn \'$sqlServerFQDN\'" +
s" --sql-database-name \'$sqlDatabaseName\' --database-username \'$databaseUsername\'" +
s" --database-password \'$databasePassword\' --event-sql-table \'$eventSQLTable\'" +
s" --job-timeout-in-minutes $timeoutInMinutes")
println()
}
def parseArguments(argumentMap : ArgumentMap, argumentList: List[String]) : ArgumentMap = {
argumentList match {
case Nil => argumentMap
case "--eventhubs-namespace" :: value:: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventhubsNamespace) -> value.toString), tail)
case "--eventhubs-name" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventhubsName) -> value.toString), tail)
case "--policy-name" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.PolicyName) -> value.toString), tail)
case "--policy-key" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.PolicyKey) -> value.toString), tail)
case "--consumer-group" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.ConsumerGroup) -> value.toString), tail)
case "--partition-count" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.PartitionCount) -> value.toInt), tail)
case "--batch-interval-in-seconds" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds) -> value.toInt), tail)
case "--checkpoint-directory" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.CheckpointDirectory) -> value.toString), tail)
case "--event-count-folder" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventCountFolder) -> value.toString), tail)
case "--event-store-folder" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventStoreFolder) -> value.toString), tail)
case "--event-hive-table" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventHiveTable) -> value.toString), tail)
case "--sql-server-fqdn" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.SQLServerFQDN) -> value.toString), tail)
case "--sql-database-name" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.SQLDatabaseName) -> value.toString), tail)
case "--database-username" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.DatabaseUsername) -> value.toString), tail)
case "--database-password" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.DatabasePassword) -> value.toString), tail)
case "--event-sql-table" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventSQLTable) -> value.toString), tail)
case "--job-timeout-in-minutes" :: value :: tail =>
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.TimeoutInMinutes) -> value.toLong), tail)
case option :: tail =>
println()
println("Unknown option: " + option)
println()
usageExample()
sys.exit(1)
}
}
// scalastyle:on
def verifyEventhubsEventCountArguments(argumentMap : ArgumentMap): Unit = {
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventhubsNamespace)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventhubsName)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.PolicyName)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.PolicyKey)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.ConsumerGroup)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.PartitionCount)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.CheckpointDirectory)))
assert(argumentMap(Symbol(EventhubsArgumentKeys.PartitionCount)).asInstanceOf[Int] > 0)
assert(argumentMap(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int] > 0)
}
def verifyEventhubsToAzureBlobAsJSONArguments(argumentMap : ArgumentMap): Unit = {
verifyEventhubsEventCountArguments(argumentMap)
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventStoreFolder)))
}
def verifyEventhubsToHiveTableArguments(argumentMap : ArgumentMap): Unit = {
verifyEventhubsEventCountArguments(argumentMap)
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventHiveTable)))
}
def verifyEventhubsToSQLTableArguments(argumentMap : ArgumentMap): Unit = {
verifyEventhubsEventCountArguments(argumentMap)
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.SQLServerFQDN)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.SQLDatabaseName)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.DatabaseUsername)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.DatabasePassword)))
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventSQLTable)))
}
}
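// Illustrative sketch (not part of the original file): parseArguments folds the option list into
// a Symbol-keyed ArgumentMap, which the workloads then read back with the same keys.
object ArgumentParserSketch {
  def main(args: Array[String]): Unit = {
    val parsed = EventhubsArgumentParser.parseArguments(
      Map(),
      List("--eventhubs-name", "sparkstreamingeventhub", "--partition-count", "32"))
    println(parsed(Symbol(EventhubsArgumentKeys.EventhubsName))) // sparkstreamingeventhub
    println(parsed(Symbol(EventhubsArgumentKeys.PartitionCount))) // 32, stored as Int
  }
}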


@ -1,57 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.common
import java.sql.{Connection, DriverManager}
import org.apache.spark.sql.DataFrame
object DataFrameExtensions {
implicit def extendedDataFrame(dataFrame: DataFrame): ExtendedDataFrame =
new ExtendedDataFrame(dataFrame: DataFrame)
class ExtendedDataFrame(dataFrame: DataFrame) {
def insertToAzureSql(sqlDatabaseConnectionString: String, sqlTableName: String): Unit = {
val tableHeader: String = dataFrame.columns.mkString(",")
dataFrame.foreachPartition { partition =>
val sqlExecutorConnection: Connection = DriverManager.getConnection(
sqlDatabaseConnectionString)
// A batch size of 1000 is used since Azure SQL Database cannot insert more than 1000 rows
// in a single INSERT statement.
partition.grouped(1000).foreach {
group =>
val insertString: scala.collection.mutable.StringBuilder = new StringBuilder()
group.foreach {
record => insertString.append("('" + record.mkString(",") + "'),")
}
sqlExecutorConnection.createStatement()
.executeUpdate(f"INSERT INTO [dbo].[$sqlTableName] ($tableHeader) VALUES "
+ insertString.stripSuffix(","))
}
sqlExecutorConnection.close()
}
}
}
}
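// Illustrative sketch (not part of the original file): with the implicit conversion above in
// scope, any DataFrame can be written to Azure SQL in 1000-row batches. The connection string
// and table name below are placeholders.
object DataFrameExtensionsSketch {
  import org.apache.spark.sql.SparkSession
  import com.microsoft.spark.streaming.examples.receiverdstream.common.DataFrameExtensions._
  import com.microsoft.spark.streaming.examples.receiverdstream.common.EventContent

  def writeSample(sqlDatabaseConnectionString: String): Unit = {
    val sparkSession = SparkSession.builder.getOrCreate()
    import sparkSession.implicits._
    val dataFrame = Seq(EventContent("sample-event")).toDF
    dataFrame.insertToAzureSql(sqlDatabaseConnectionString, "EventSQLTable10")
  }
}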


@ -1,20 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.common
case class EventContent(EventDetails: String)


@ -1,29 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.common
object StreamStatistics {
val streamLengthKey: String = "StreamLength"
val streamLength = (values: Seq[Long], state: Option[Long]) => {
val currentCount = values.foldLeft(0L)(_ + _)
val previousCount = state.getOrElse(0L)
Some(currentCount + previousCount)
}
}
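// Illustrative sketch (not part of the original file): streamLength is the update function passed
// to updateStateByKey, so it simply adds the per-batch counts to the running total.
object StreamStatisticsSketch {
  def main(args: Array[String]): Unit = {
    // First batch: three events counted as 1L each, no previous state.
    println(StreamStatistics.streamLength(Seq(1L, 1L, 1L), None)) // Some(3)
    // A later batch: two more events on top of the running total of 3.
    println(StreamStatistics.streamLength(Seq(1L, 1L), Some(3L))) // Some(5)
  }
}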


@ -1,38 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.common
object StreamUtilities {
def getSqlJdbcConnectionString(sqlServerFQDN: String, sqlDatabaseName: String,
databaseUsername: String, databasePassword: String): String = {
val serverName = sqlServerFQDN.split('.')(0)
val certificateHostname = sqlServerFQDN.replace(serverName, "*")
val serverPort = "1433"
val sqlDatabaseConnectionString = f"jdbc:sqlserver://$sqlServerFQDN:$serverPort;" +
f"database=$sqlDatabaseName;" +
f"user=$databaseUsername@$serverName;password=$databasePassword;" +
f"encrypt=true;hostNameInCertificate=$certificateHostname;loginTimeout=30;"
Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver")
sqlDatabaseConnectionString
}
}
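// Illustrative sketch (not part of the original file): for the sample values used in the usage
// examples above, getSqlJdbcConnectionString derives serverName = "servername" and
// certificateHostname = "*.database.windows.net", producing (as one line):
//   jdbc:sqlserver://servername.database.windows.net:1433;database=databasename;
//   user=[DatabaseUsername]@servername;password=[DatabasePassword];encrypt=true;
//   hostNameInCertificate=*.database.windows.net;loginTimeout=30;
// Running it requires the Microsoft SQL Server JDBC driver on the classpath, as in the
// original workloads.
object StreamUtilitiesSketch {
  def main(args: Array[String]): Unit = {
    println(
      StreamUtilities.getSqlJdbcConnectionString("servername.database.windows.net",
                                                 "databasename",
                                                 "[DatabaseUsername]",
                                                 "[DatabasePassword]"))
  }
}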


@ -1,131 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.workloads
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.StreamStatistics
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils
object EventhubsEventCount {
def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {
val eventHubsParameters = Map[String, String](
"eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).
asInstanceOf[String],
"eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).
asInstanceOf[String],
"eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).
asInstanceOf[String],
"eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).
asInstanceOf[String],
"eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).
asInstanceOf[String],
"eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.
BatchIntervalInSeconds)).asInstanceOf[Int].toString,
"eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
asInstanceOf[String]
)
/**
* In Spark 2.0.x, SparkConf must be initialized through EventHubsUtils so that the required
* data structures internal to the Azure Event Hubs client get registered with the Kryo serializer.
*/
val sparkConfiguration = EventHubsUtils.initializeSparkStreamingConfigurations
sparkConfiguration.setAppName(this.getClass.getSimpleName)
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sparkSession = SparkSession.builder().config(sparkConfiguration).getOrCreate()
val streamingContext = new StreamingContext(sparkSession.sparkContext,
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
asInstanceOf[String])
val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)
val eventHubsWindowedStream = eventHubsStream.window(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
// Count the number of events received in the past batch
val batchEventCount = eventHubsWindowedStream.count()
batchEventCount.print()
// Count the number of events received so far
val totalEventCountDStream = eventHubsWindowedStream.map(m =>
(StreamStatistics.streamLengthKey, 1L))
val totalEventCount = totalEventCountDStream.updateStateByKey[Long](
StreamStatistics.streamLength)
totalEventCount.checkpoint(Seconds(inputOptions(Symbol(EventhubsArgumentKeys.
BatchIntervalInSeconds)).asInstanceOf[Int]))
if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {
totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
.asInstanceOf[String])
}
totalEventCount.print()
streamingContext
}
def main(inputArguments: Array[String]): Unit = {
val inputOptions: ArgumentMap = EventhubsArgumentParser.parseArguments(Map(),
inputArguments.toList)
EventhubsArgumentParser.verifyEventhubsEventCountArguments(inputOptions)
// Create or recreate streaming context
val streamingContext = StreamingContext.getOrCreate(inputOptions(Symbol(EventhubsArgumentKeys.
CheckpointDirectory)).asInstanceOf[String], () => createStreamingContext(inputOptions))
streamingContext.start()
if(inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
streamingContext.awaitTerminationOrTimeout(inputOptions(Symbol(EventhubsArgumentKeys.
TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
}
else {
streamingContext.awaitTermination()
}
}
}


@ -1,130 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.workloads
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.{EventContent, StreamStatistics}
import org.apache.spark._
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils
object EventhubsToAzureBlobAsJSON {
def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {
// scalastyle:off
val eventHubsParameters = Map[String, String](
"eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String],
"eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).asInstanceOf[String],
"eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).asInstanceOf[String],
"eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).asInstanceOf[String],
"eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).asInstanceOf[String],
"eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String]
)
// scalastyle:on
/**
* In Spark 2.0.x, SparkConf must be initialized through EventHubsUtils so that the required
* data structures internal to the Azure Event Hubs client get registered with the Kryo serializer.
*/
val sparkConfiguration : SparkConf = EventHubsUtils.initializeSparkStreamingConfigurations
sparkConfiguration.setAppName(this.getClass.getSimpleName)
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sparkSession : SparkSession = SparkSession.builder.config(sparkConfiguration).getOrCreate
val streamingContext = new StreamingContext(sparkSession.sparkContext,
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
asInstanceOf[String])
val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)
val eventHubsWindowedStream = eventHubsStream.window(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
eventHubsWindowedStream.map(x => EventContent(new String(x)))
.foreachRDD(rdd => {
val sparkSession = SparkSession.builder.getOrCreate
import sparkSession.implicits._
rdd.toDS.toJSON.write.mode(SaveMode.Overwrite)
.save(inputOptions(Symbol(EventhubsArgumentKeys.EventStoreFolder)).asInstanceOf[String])
})
// Count the number of events received in the past batch
val batchEventCount = eventHubsWindowedStream.count()
batchEventCount.print()
// Count the number of events received so far
val totalEventCountDStream =
eventHubsWindowedStream.map(m => (StreamStatistics.streamLengthKey, 1L))
val totalEventCount =
totalEventCountDStream.updateStateByKey[Long](StreamStatistics.streamLength)
totalEventCount.checkpoint(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {
totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
.asInstanceOf[String])
}
totalEventCount.print()
streamingContext
}
def main(inputArguments: Array[String]): Unit = {
val inputOptions = EventhubsArgumentParser.parseArguments(Map(), inputArguments.toList)
EventhubsArgumentParser.verifyEventhubsToAzureBlobAsJSONArguments(inputOptions)
val streamingContext = StreamingContext.getOrCreate(
inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String],
() => createStreamingContext(inputOptions))
streamingContext.start()
if(inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
streamingContext.awaitTerminationOrTimeout(
inputOptions(Symbol(EventhubsArgumentKeys.TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
}
else {
streamingContext.awaitTermination()
}
}
}


@ -1,165 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.workloads
import java.sql.{Connection, DriverManager, Statement}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.{EventContent, StreamStatistics, StreamUtilities}
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils
object EventhubsToAzureSQLTable {
def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {
// scalastyle:off
val eventHubsParameters = Map[String, String](
"eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String],
"eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).asInstanceOf[String],
"eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).asInstanceOf[String],
"eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).asInstanceOf[String],
"eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).asInstanceOf[String],
"eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String]
)
// scalastyle:on
val sqlDatabaseConnectionString : String = StreamUtilities.getSqlJdbcConnectionString(
inputOptions(Symbol(EventhubsArgumentKeys.SQLServerFQDN)).asInstanceOf[String],
inputOptions(Symbol(EventhubsArgumentKeys.SQLDatabaseName)).asInstanceOf[String],
inputOptions(Symbol(EventhubsArgumentKeys.DatabaseUsername)).asInstanceOf[String],
inputOptions(Symbol(EventhubsArgumentKeys.DatabasePassword)).asInstanceOf[String])
val sqlTableName: String = inputOptions(Symbol(EventhubsArgumentKeys.EventSQLTable)).
asInstanceOf[String]
/**
* In Spark 2.0.x, SparkConf must be initialized through EventhubsUtil so that required
* data structures internal to Azure Eventhubs Client get registered with the Kryo Serializer.
*/
val sparkConfiguration : SparkConf = EventHubsUtils.initializeSparkStreamingConfigurations
sparkConfiguration.setAppName(this.getClass.getSimpleName)
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sparkSession = SparkSession.builder().config(sparkConfiguration).getOrCreate()
val streamingContext = new StreamingContext(sparkSession.sparkContext,
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
asInstanceOf[String])
val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)
val eventHubsWindowedStream = eventHubsStream.window(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
import com.microsoft.spark.streaming.examples.receiverdstream.common.DataFrameExtensions._
eventHubsWindowedStream.map(m => EventContent(new String(m)))
.foreachRDD { rdd =>
val sparkSession = SparkSession.builder.getOrCreate
import sparkSession.implicits._
rdd.toDF.insertToAzureSql(sqlDatabaseConnectionString, sqlTableName)
}
// Count number of events received the past batch
val batchEventCount = eventHubsWindowedStream.count()
batchEventCount.print()
// Count number of events received so far
val totalEventCountDStream = eventHubsWindowedStream.map(
m => (StreamStatistics.streamLengthKey, 1L))
val totalEventCount = totalEventCountDStream.updateStateByKey[Long](
StreamStatistics.streamLength)
totalEventCount.checkpoint(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {
totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
.asInstanceOf[String])
}
totalEventCount.print()
streamingContext
}
def main(inputArguments: Array[String]): Unit = {
val inputOptions = EventhubsArgumentParser.parseArguments(Map(), inputArguments.toList)
EventhubsArgumentParser.verifyEventhubsToSQLTableArguments(inputOptions)
val sqlDatabaseConnectionString : String = StreamUtilities.getSqlJdbcConnectionString(
inputOptions(Symbol(EventhubsArgumentKeys.SQLServerFQDN)).asInstanceOf[String],
inputOptions(Symbol(EventhubsArgumentKeys.SQLDatabaseName)).asInstanceOf[String],
inputOptions(Symbol(EventhubsArgumentKeys.DatabaseUsername)).asInstanceOf[String],
inputOptions(Symbol(EventhubsArgumentKeys.DatabasePassword)).asInstanceOf[String])
val sqlTableName: String = inputOptions(Symbol(EventhubsArgumentKeys.EventSQLTable)).
asInstanceOf[String]
val sqlDriverConnection = DriverManager.getConnection(sqlDatabaseConnectionString)
sqlDriverConnection.setAutoCommit(false)
val sqlDriverStatement: Statement = sqlDriverConnection.createStatement()
sqlDriverStatement.addBatch(f"IF NOT EXISTS(SELECT * FROM sys.objects WHERE object_id" +
f" = OBJECT_ID(N'[dbo].[$sqlTableName]') AND type in (N'U'))" +
f"\nCREATE TABLE $sqlTableName(EventDetails NVARCHAR(128) NOT NULL)")
sqlDriverStatement.addBatch(f"IF IndexProperty(Object_Id('$sqlTableName'), 'IX_EventDetails'," +
f" 'IndexId') IS NULL" +
f"\nCREATE CLUSTERED INDEX IX_EventDetails ON $sqlTableName(EventDetails)")
sqlDriverStatement.executeBatch()
sqlDriverConnection.commit()
sqlDriverConnection.close()
val streamingContext = StreamingContext.getOrCreate(
inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String],
() => createStreamingContext(inputOptions))
streamingContext.start()
if (inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
streamingContext.awaitTerminationOrTimeout(
inputOptions(Symbol(EventhubsArgumentKeys.TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
} else {
streamingContext.awaitTermination()
}
}
}
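
The insertToAzureSql call above comes from the example project's DataFrameExtensions, which this diff does not show. As a rough equivalent, here is a minimal sketch of the same foreachRDD-to-JDBC pattern using only stock Spark APIs; the object name, case class, and helper signature are illustrative assumptions, not the project's implementation.

import java.util.Properties

import org.apache.spark.sql.{ SaveMode, SparkSession }
import org.apache.spark.streaming.dstream.DStream

object JdbcSinkSketch {
  case class EventContent(EventDetails: String)

  // Writes each micro-batch to a JDBC table; jdbcUrl and table come from the caller, and
  // connection credentials would normally be set on the Properties object.
  def writeToSqlTable(events: DStream[Array[Byte]], jdbcUrl: String, table: String): Unit =
    events.map(bytes => EventContent(new String(bytes))).foreachRDD { rdd =>
      val spark = SparkSession.builder.getOrCreate()
      import spark.implicits._
      rdd.toDF().write.mode(SaveMode.Append).jdbc(jdbcUrl, table, new Properties())
    }
}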

View file

@ -1,147 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.microsoft.spark.streaming.examples.receiverdstream.workloads
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.{EventContent, StreamStatistics}
import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils
object EventhubsToHiveTable {
def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {
// scalastyle:off
val eventHubsParameters = Map[String, String](
"eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String],
"eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).asInstanceOf[String],
"eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).asInstanceOf[String],
"eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).asInstanceOf[String],
"eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).asInstanceOf[String],
"eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds))
.asInstanceOf[Int].toString,
"eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String]
)
// scalastyle:on
/**
* In Spark 2.0.x, SparkConf must be initialized through EventhubsUtil so that required
* data structures internal to Azure Eventhubs Client get registered with the Kryo Serializer.
*/
val sparkConfiguration : SparkConf = EventHubsUtils.initializeSparkStreamingConfigurations
sparkConfiguration.setAppName(this.getClass.getSimpleName)
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")
val sparkSession = SparkSession.builder.config(sparkConfiguration).enableHiveSupport.getOrCreate
val streamingContext = new StreamingContext(sparkSession.sparkContext,
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
asInstanceOf[String])
val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)
val eventHubsWindowedStream = eventHubsStream.window(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
val hiveTableName = inputOptions(Symbol(EventhubsArgumentKeys.EventHiveTable)).
asInstanceOf[String]
// Table needs to be explicitly created to match the Parquet format in which the data is stored
// by default by Spark. If not explicitly created the Hive table cannot be used from Hive and
// can only be used from inside Spark.
val hiveTableDDL =
f"CREATE TABLE IF NOT EXISTS $hiveTableName (EventContent string) STORED AS PARQUET"
sparkSession.sql(hiveTableDDL)
/**
* .saveAsTable does not work so insertInto is used.
* Refer to SPARK-16803 (https://issues.apache.org/jira/browse/SPARK-16803)
*/
eventHubsWindowedStream.map(x => EventContent(new String(x)))
.foreachRDD(rdd => {
val sparkSession = SparkSession.builder.enableHiveSupport.getOrCreate
import sparkSession.implicits._
rdd.toDS.write.mode(org.apache.spark.sql.SaveMode.Append).insertInto(hiveTableName)
})
// Count number of events received the past batch
val batchEventCount = eventHubsWindowedStream.count()
batchEventCount.print()
// Count number of events received so far
val totalEventCountDStream = eventHubsWindowedStream.map(
m => (StreamStatistics.streamLengthKey, 1L))
val totalEventCount = totalEventCountDStream.updateStateByKey[Long](
StreamStatistics.streamLength)
totalEventCount.checkpoint(
Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {
totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
.asInstanceOf[String])
}
totalEventCount.print()
streamingContext
}
def main(inputArguments: Array[String]): Unit = {
val inputOptions = EventhubsArgumentParser.parseArguments(Map(), inputArguments.toList)
EventhubsArgumentParser.verifyEventhubsToHiveTableArguments(inputOptions)
// Create or recreate streaming context
val streamingContext = StreamingContext.getOrCreate(
inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String],
() => createStreamingContext(inputOptions))
streamingContext.start()
if (inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
streamingContext.awaitTerminationOrTimeout(
inputOptions(Symbol(EventhubsArgumentKeys.TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
} else {
streamingContext.awaitTermination()
}
}
}
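
The Hive example above pre-creates a Parquet-backed table and appends with insertInto because of SPARK-16803, as its comments note. Below is a minimal batch-only sketch of that workaround, assuming a Hive-enabled SparkSession and a hypothetical sample_events table; it is not code from this repository.

import org.apache.spark.sql.{ SaveMode, SparkSession }

object HiveInsertIntoSketch {
  case class EventContent(EventDetails: String)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("HiveInsertIntoSketch")
      .enableHiveSupport()
      .getOrCreate()
    import spark.implicits._

    // Pre-create the table in the format Spark writes by default (Parquet) so it is
    // queryable from Hive as well as from Spark.
    spark.sql("CREATE TABLE IF NOT EXISTS sample_events (EventDetails string) STORED AS PARQUET")

    // insertInto appends into the existing table; saveAsTable is avoided per SPARK-16803.
    val batch = Seq(EventContent("event-1"), EventContent("event-2")).toDS()
    batch.write.mode(SaveMode.Append).insertInto("sample_events")

    spark.sql("SELECT COUNT(*) FROM sample_events").show()
  }
}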

32
pom.xml
View file

@ -31,8 +31,8 @@
</licenses>
<developers>
<developer>
<name>Arijit Tarafdar</name>
<email>arijitt@microsoft.com</email>
<name>Sabee Grewal</name>
<email>sagrewal@microsoft.com</email>
<organization>Microsoft Corporation</organization>
<organizationUrl>http://www.microsoft.com</organizationUrl>
</developer>
@ -100,7 +100,7 @@
<branch>refs/heads/maven-repo</branch>
<includes><include>**/*</include></includes>
<repositoryName>spark-eventhubs</repositoryName>
<repositoryOwner>sabeegrewal</repositoryOwner>
<repositoryOwner>Azure</repositoryOwner>
<merge>true</merge>
</configuration>
<executions>
@ -306,32 +306,6 @@
<outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
<testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
<plugins>
<plugin>
<groupId>org.scalastyle</groupId>
<artifactId>scalastyle-maven-plugin</artifactId>
<version>0.8.0</version>
<configuration>
<verbose>false</verbose>
<failOnViolation>true</failOnViolation>
<includeTestSourceDirectory>true</includeTestSourceDirectory>
<failOnWarning>false</failOnWarning>
<sourceDirectories>
<dir>${basedir}/core/src/main/scala</dir>
<dir>${basedir}/examples/src/main/scala</dir>
</sourceDirectories>
<testSourceDirectory>${basedir}/core/src/test/scala</testSourceDirectory>
<configLocation>scalastyle-config.xml</configLocation>
<outputFile>${project.basedir}/scalastyle-output.xml</outputFile>
<outputEncoding>UTF-8</outputEncoding>
</configuration>
<executions>
<execution>
<goals>
<goal>check</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>

View file

@ -1 +1,3 @@
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
scalaVersion := "2.11.8"
addSbtPlugin("org.lucidchart" %% "sbt-scalafmt" % "1.12")

View file

@ -1,4 +1,3 @@
set -e
mvn install -DskipTests
mvn scalastyle:check
mvn test

View file

@ -1,342 +0,0 @@
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<!--
If you wish to turn off checking for a section of code, you can put a comment in the source
before and after the section, with the following syntax:
// scalastyle:off
... // stuff that breaks the styles
// scalastyle:on
You can also disable only one rule, by specifying its rule id, as specified in:
http://www.scalastyle.org/rules-0.7.0.html
// scalastyle:off no.finalize
override def finalize(): Unit = ...
// scalastyle:on no.finalize
This file is divided into 3 sections:
(1) rules that we enforce.
(2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet
(or we need to make the scalastyle rule more configurable).
(3) rules that we don't want to enforce.
-->
<scalastyle>
<name>Scalastyle standard configuration</name>
<!-- ================================================================================ -->
<!-- rules we enforce -->
<!-- ================================================================================ -->
<check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
<parameters>
<parameter name="header"><![CDATA[/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/]]></parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
<parameters>
<parameter name="maxLineLength"><![CDATA[100]]></parameter>
<parameter name="tabSize"><![CDATA[2]]></parameter>
<parameter name="ignoreImports">true</parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
<parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
<parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
<parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
<parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
<parameters>
<parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
<parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>
<check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>
<check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
<parameters>
<parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
<parameters>
<parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
</parameters>
</check>
<!-- ??? usually shouldn't be checked into the code base. -->
<check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>
<!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuite directly -->
<check customId="funsuite" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="false">
<parameters><parameter name="regex">^FunSuite[A-Za-z]*$</parameter></parameters>
<customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
</check>
<!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' -->
<check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="false">
<parameters><parameter name="regex">^println$</parameter></parameters>
<customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with
// scalastyle:off println
println(...)
// scalastyle:on println]]></customMessage>
</check>
<check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">@VisibleForTesting</parameter></parameters>
<customMessage><![CDATA[
@VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615).
]]></customMessage>
</check>
<check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use
ShutdownHookManager.addShutdownHook instead.
If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with
// scalastyle:off runtimeaddshutdownhook
Runtime.getRuntime.addShutdownHook(...)
// scalastyle:on runtimeaddshutdownhook
]]></customMessage>
</check>
<check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use
java.util.concurrent.ConcurrentLinkedQueue instead.
If you must use mutable.SynchronizedBuffer, wrap the code block with
// scalastyle:off mutablesynchronizedbuffer
mutable.SynchronizedBuffer[...]
// scalastyle:on mutablesynchronizedbuffer
]]></customMessage>
</check>
<check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="false">
<parameters><parameter name="regex">Class\.forName</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead.
If you must use Class.forName, wrap the code block with
// scalastyle:off classforname
Class.forName(...)
// scalastyle:on classforname
]]></customMessage>
</check>
<check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">Await\.result</parameter></parameters>
<customMessage><![CDATA[
Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead.
If you must use Await.result, wrap the code block with
// scalastyle:off awaitresult
Await.result(...)
// scalastyle:on awaitresult
If your codes use ThreadLocal and may run in threads created by the user, use ThreadUtils.awaitResultInForkJoinSafely instead.
]]></customMessage>
</check>
<!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters -->
<check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">JavaConversions</parameter></parameters>
<customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import
scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage>
</check>
<check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
<customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage>
</check>
<check customId="extractopt" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">extractOpt</parameter></parameters>
<customMessage>Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
is slower. </customMessage>
</check>
<check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true">
<parameters>
<parameter name="groups">java,scala,3rdParty,spark</parameter>
<parameter name="group.java">javax?\..*</parameter>
<parameter name="group.scala">scala\..*</parameter>
<parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter>
<parameter name="group.spark">org\.apache\.spark\..*</parameter>
</parameters>
</check>
<check level="error" class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true">
<parameters>
<parameter name="tokens">COMMA</parameter>
</parameters>
</check>
<!-- SPARK-3854: Single Space between ')' and '{' -->
<check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">\)\{</parameter></parameters>
<customMessage><![CDATA[
Single Space between ')' and `{`.
]]></customMessage>
</check>
<check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters>
<customMessage>Use Javadoc style indentation for multiline comments</customMessage>
</check>
<check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
<parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters>
<customMessage>Omit braces in case clauses.</customMessage>
</check>
<!-- SPARK-16877: Avoid Java annotations -->
<check customId="OverrideJavaCase" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
<parameters><parameter name="regex">^Override$</parameter></parameters>
<customMessage>override modifier should be used instead of @java.lang.Override.</customMessage>
</check>
<check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check>
<!-- ================================================================================ -->
<!-- rules we'd like to enforce, but haven't cleaned up the codebase yet -->
<!-- ================================================================================ -->
<!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
<!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
<check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check>
<!-- This breaks symbolic method names so we don't turn it on. -->
<!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. -->
<check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
<parameters>
<parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
</parameters>
</check>
<!-- Should turn this on, but we have a few places that need to be fixed first -->
<check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>
<!-- ================================================================================ -->
<!-- rules we don't want -->
<!-- ================================================================================ -->
<check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false">
<parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters>
</check>
<!-- We want the opposite of this: NewLineAtEofChecker -->
<check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>
<!-- This one complains about all kinds of random things. Disable. -->
<check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>
<!-- We use return quite a bit for control flows and guards -->
<check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check>
<!-- We use null a lot in low level code and to interface with 3rd party code -->
<check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false">
<parameters><parameter name="maxFileLength">800></parameter></parameters>
</check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false">
<parameters><parameter name="maxTypes">30</parameter></parameters>
</check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false">
<parameters><parameter name="maximum">10</parameter></parameters>
</check>
<!-- Doesn't seem super big deal here ... -->
<check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false">
<parameters><parameter name="maxLength">50</parameter></parameters>
</check>
<!-- Not exactly feasible to enforce this right now. -->
<!-- It is also infrequent that somebody introduces a new class with a lot of methods. -->
<check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false">
<parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters>
</check>
<!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... -->
<check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
<parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters>
</check>
</scalastyle>
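
The header comment of this deleted config documents the scalastyle:off/on escape hatch. For completeness, a small Scala illustration of that suppression pattern follows; the object, values, and chosen rule ids (magic.number and the custom println id defined above) are hypothetical usage, not code from this repository.

object SuppressionSketch {
  // scalastyle:off magic.number
  // Magic numbers are allowed between the off/on markers for this rule only.
  val retryBackoffMillis: Seq[Int] = Seq(100, 250, 500, 1000)
  // scalastyle:on magic.number

  // scalastyle:off println
  def debugDump(values: Seq[Int]): Unit = values.foreach(println)
  // scalastyle:on println
}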