Codebase cleanup. Details in description. (#174)
This commit is contained in:
Parent
68bd06de72
Commit
60931e35e4
@@ -5,6 +5,6 @@
.idea/
target/
*.iml
scalastyle-output.xml
scalafmt-output.xml
dependency-reduced-pom.xml
metastore_db
@@ -0,0 +1,11 @@
maxColumn = 100
project.git = true
project.excludeFilters = []

# http://docs.scala-lang.org/style/scaladoc.html recommends the JavaDoc style.
# scala/scala is written that way too https://github.com/scala/scala/blob/v2.12.2/src/library/scala/Predef.scala
docstrings = JavaDoc

# This also seems more idiomatic to include whitespace in import x.{ yyy }
spaces.inImportCurlyBraces = true
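For illustration only (not part of the commit), here is roughly what the two commented settings produce when scalafmt is run with this configuration — a minimal sketch, assuming defaults for every other option:

    // Effect of spaces.inImportCurlyBraces = true: spaces inside selector braces.
    import scala.collection.{ immutable, mutable }

    object FormattingExample {
      /**
       * Effect of docstrings = JavaDoc: continuation lines are indented with " * ",
       * aligned under the first asterisk of the opening delimiter, as recommended by
       * the scala-lang style guide linked above.
       */
      def sum(xs: Seq[Int]): Int = xs.sum
    }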
15 .travis.yml

@@ -1,13 +1,18 @@
language: scala
scala:
- 2.11.8
- 2.11.8

jdk:
- oraclejdk8
- oraclejdk8

script:
- ./run_tests.sh
- ./run_tests.sh

branches:
except:
- maven-repo
only:
- master
- 2.1.x
- 2.0.x
- dev
except:
- maven-repo
@@ -7,6 +7,7 @@
|Branch|Status|
|------|-------------|
|master|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=master)](https://travis-ci.org/Azure/spark-eventhubs)|
|dev|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=dev)](https://travis-ci.org/Azure/spark-eventhubs)|
|2.1.x|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=2.1.x)](https://travis-ci.org/Azure/spark-eventhubs)|
|2.0.x|[![Build Status](https://travis-ci.org/Azure/spark-eventhubs.svg?branch=2.0.x)](https://travis-ci.org/Azure/spark-eventhubs)|
@@ -1,116 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.eventhubscommon

import java.util.concurrent.Executors

import scala.concurrent.ExecutionContext
import scala.language.implicitConversions

import com.google.common.util.concurrent.ThreadFactoryBuilder

import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.eventhubs.EventHubsUtils
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore


/**
 * Import the members of this object to enable the use of the unionedEventhubStream and
 * eventhubStream methods on the StreamingContext instead of the EventHubsUtils class.
 */
private[eventhubscommon] object Implicits {

  // will be used to execute requests to EventHub
  private[spark] implicit val exec = {
    val tp = new ThreadFactoryBuilder().setDaemon(true).setNameFormat("restclientthread" + "-%d").
      build()
    ExecutionContext.fromExecutor(Executors.newCachedThreadPool(tp))
  }

  /**
   * Converts the StreamingContext into an EventHub enabled streaming context
   *
   * @param streamingContext Streaming context to convert
   * @return Returns the Azure EventHub enabled StreamingContext
   */
  implicit def eventHubContext(streamingContext: StreamingContext): SparkEventHubContext =
    new SparkEventHubContext(streamingContext)

  /**
   * Azure EventHub enabled streaming context
   */
  class SparkEventHubContext(ssc: StreamingContext) {
    // scalastyle:off
    /**
     * Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
     * The unioned stream will receive message from all partitions of the EventHubs
     *
     * @param eventhubsParams a Map that contains parameters for EventHubs.
     *                        Required parameters are:
     *                        "eventhubs.policyname": EventHubs policy name
     *                        "eventhubs.policykey": EventHubs policy key
     *                        "eventhubs.namespace": EventHubs namespace
     *                        "eventhubs.name": EventHubs name
     *                        "eventhubs.partition.count": Number of partitions
     *                        "eventhubs.checkpoint.dir": checkpoint directory on HDFS
     *
     *                        Optional parameters are:
     *                        "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
     *                        "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
     *                        "eventhubs.filter.enqueuetime": Unix time, seconds since epoch, default to "0"
     *                        "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
     *                        "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
     * @param storageLevel Storage level, by default it is MEMORY_ONLY
     * @return ReceiverInputStream
     */
    // scalastyle:on
    def unionedEventHubStream(
        eventhubsParams: Map[String, String],
        storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): DStream[Array[Byte]] = {
      EventHubsUtils.createUnionStream(ssc, eventhubsParams, storageLevel)
    }

    // scalastyle:off
    /**
     * Create a single EventHubs stream that receives data from Microsoft Azure EventHubs
     * A single stream only receives message from one EventHubs partition
     *
     * @param eventhubsParams a Map that contains parameters for EventHubs. Same as above.
     * @param partitionId Partition ID
     * @param storageLevel Storage level
     * @param offsetStore Offset store implementation, defaults to DFSBasedOffsetStore
     * @param receiverClient the EventHubs client implementation, defaults to EventHubsClientWrapper
     * @return ReceiverInputStream
     */
    // scalastyle:on
    def eventHubStream(
        eventhubsParams: Map[String, String],
        partitionId: String,
        storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
        offsetStore: OffsetStore = null,
        receiverClient: EventHubsClientWrapper = new EventHubsClientWrapper):
      DStream[Array[Byte]] = {
      EventHubsUtils.createStream(ssc, eventhubsParams, partitionId, storageLevel, offsetStore,
        receiverClient)
    }
  }

}
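The object deleted above existed only to let callers write ssc.unionedEventHubStream(...) instead of going through EventHubsUtils. For reference, a usage sketch reconstructed from its scaladoc (the object was package-private, so this only ever compiled from inside org.apache.spark.eventhubscommon; all parameter values are placeholders, and the API is gone after this commit):

    package org.apache.spark.eventhubscommon

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{ Seconds, StreamingContext }
    import org.apache.spark.eventhubscommon.Implicits._ // removed by this commit

    object RemovedImplicitsUsage {
      def main(args: Array[String]): Unit = {
        val ssc = new StreamingContext(new SparkConf().setAppName("eh-example"), Seconds(10))
        // Required parameters listed in the scaladoc of unionedEventHubStream; placeholder values.
        val eventhubsParams = Map(
          "eventhubs.policyname" -> "<policy-name>",
          "eventhubs.policykey" -> "<policy-key>",
          "eventhubs.namespace" -> "<namespace>",
          "eventhubs.name" -> "<event-hub-name>",
          "eventhubs.partition.count" -> "4",
          "eventhubs.checkpoint.dir" -> "/eventhubs/checkpoint")
        // The implicit eventHubContext conversion wraps ssc in SparkEventHubContext,
        // which simply forwards to EventHubsUtils.createUnionStream.
        val stream = ssc.unionedEventHubStream(eventhubsParams)
        stream.count().print()
        ssc.start()
        ssc.awaitTermination()
      }
    }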
@@ -20,6 +20,5 @@ package org.apache.spark.eventhubscommon
/**
 * this class represents the in-memory offset record hold by [[EventHubsConnector]]s
 */
private[spark] case class OffsetRecord(
    timestamp: Long,
    offsets: Map[EventHubNameAndPartition, (Long, Long)])
private[spark] case class OffsetRecord(timestamp: Long,
                                       offsets: Map[EventHubNameAndPartition, (Long, Long)])
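Only the formatting of the declaration changes here; the fields are unchanged. As a hypothetical sketch (placeholder values; the class is private[spark], so it is only constructible from Spark's own packages), an OffsetRecord pairs a batch timestamp with per-partition (offset, sequence number) values:

    // Inside code living under org.apache.spark: partition 0 of "eh1" has been read up to
    // offset 400 / sequence number 25 as of this timestamp (illustrative numbers).
    val record = OffsetRecord(
      timestamp = 1500000000L,
      offsets = Map(EventHubNameAndPartition("eh1", 0) -> (400L, 25L)))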
|
@ -17,29 +17,36 @@
|
|||
|
||||
package org.apache.spark.eventhubscommon
|
||||
|
||||
import org.apache.spark.eventhubscommon.client.{EventHubClient, EventHubsClientWrapper, EventHubsOffsetTypes}
|
||||
import org.apache.spark.eventhubscommon.client.{
|
||||
Client,
|
||||
EventHubsClientWrapper,
|
||||
EventHubsOffsetTypes
|
||||
}
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.internal.Logging
|
||||
|
||||
private[spark] object RateControlUtils extends Logging {
|
||||
|
||||
private def maxRateLimitPerPartition(
|
||||
eventHubName: String,
|
||||
eventhubsParams: Map[String, _]): Int = {
|
||||
private def maxRateLimitPerPartition(eventHubName: String,
|
||||
eventhubsParams: Map[String, _]): Int = {
|
||||
val maxRate = eventhubsParams.get(eventHubName) match {
|
||||
case Some(eventHubsConfigEntries) =>
|
||||
// this part shall be called by direct dstream where the parameters are indexed by eventhubs
|
||||
// names
|
||||
eventHubsConfigEntries.asInstanceOf[Map[String, String]].
|
||||
getOrElse("eventhubs.maxRate", "10000").toInt
|
||||
eventHubsConfigEntries
|
||||
.asInstanceOf[Map[String, String]]
|
||||
.getOrElse("eventhubs.maxRate", "10000")
|
||||
.toInt
|
||||
case None =>
|
||||
// this is called by structured streaming where eventhubsParams only contains the parameters
|
||||
// for a single eventhubs instance
|
||||
eventhubsParams.asInstanceOf[Map[String, String]].
|
||||
getOrElse("eventhubs.maxRate", "10000").toInt
|
||||
eventhubsParams
|
||||
.asInstanceOf[Map[String, String]]
|
||||
.getOrElse("eventhubs.maxRate", "10000")
|
||||
.toInt
|
||||
}
|
||||
require(maxRate > 0,
|
||||
s"eventhubs.maxRate has to be larger than zero, violated by $eventHubName ($maxRate)")
|
||||
s"eventhubs.maxRate has to be larger than zero, violated by $eventHubName ($maxRate)")
|
||||
maxRate
|
||||
}
|
||||
|
||||
|
@ -55,28 +62,28 @@ private[spark] object RateControlUtils extends Logging {
|
|||
eventhubsParams: Map[String, _]): Map[EventHubNameAndPartition, Long] = {
|
||||
highestEndpoints.map {
|
||||
case (eventHubNameAndPar, (_, latestSeq)) =>
|
||||
val maximumAllowedMessageCnt = maxRateLimitPerPartition(
|
||||
eventHubNameAndPar.eventHubName, eventhubsParams)
|
||||
val endSeq = math.min(latestSeq,
|
||||
maximumAllowedMessageCnt + currentOffsetsAndSeqNums(eventHubNameAndPar)._2)
|
||||
val maximumAllowedMessageCnt =
|
||||
maxRateLimitPerPartition(eventHubNameAndPar.eventHubName, eventhubsParams)
|
||||
val endSeq =
|
||||
math.min(latestSeq,
|
||||
maximumAllowedMessageCnt + currentOffsetsAndSeqNums(eventHubNameAndPar)._2)
|
||||
(eventHubNameAndPar, endSeq)
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] def clamp(
|
||||
currentOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)],
|
||||
highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)],
|
||||
eventhubsParams: Map[String, _]): Map[EventHubNameAndPartition, Long] = {
|
||||
private[spark] def clamp(currentOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)],
|
||||
highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)],
|
||||
eventhubsParams: Map[String, _]): Map[EventHubNameAndPartition, Long] = {
|
||||
defaultRateControl(currentOffsetsAndSeqNums, highestEndpoints, eventhubsParams)
|
||||
}
|
||||
|
||||
private[spark] def fetchLatestOffset(
|
||||
eventHubClient: EventHubClient,
|
||||
eventHubClient: Client,
|
||||
retryIfFail: Boolean,
|
||||
fetchedHighestOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)]):
|
||||
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
val r = eventHubClient.endPointOfPartition(
|
||||
retryIfFail, fetchedHighestOffsetsAndSeqNums.keySet.toList)
|
||||
fetchedHighestOffsetsAndSeqNums: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
: Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
val r =
|
||||
eventHubClient.endPointOfPartition(retryIfFail, fetchedHighestOffsetsAndSeqNums.keySet.toList)
|
||||
if (r.isDefined) {
|
||||
// merge results
|
||||
val mergedOffsets = if (fetchedHighestOffsetsAndSeqNums != null) {
|
||||
|
@ -91,35 +98,44 @@ private[spark] object RateControlUtils extends Logging {
|
|||
}
|
||||
|
||||
private[spark] def validateFilteringParams(
|
||||
eventHubsClient: EventHubClient,
|
||||
eventHubsClient: Client,
|
||||
eventhubsParams: Map[String, _],
|
||||
ehNameAndPartitions: List[EventHubNameAndPartition]): Unit = {
|
||||
|
||||
// first check if the parameters are valid
|
||||
val latestEnqueueTimeOfPartitions = eventHubsClient.lastEnqueueTimeOfPartitions(
|
||||
retryIfFail = true, ehNameAndPartitions)
|
||||
require(latestEnqueueTimeOfPartitions.isDefined, "cannot get latest enqueue time from Event" +
|
||||
" Hubs Rest Endpoint")
|
||||
val latestEnqueueTimeOfPartitions =
|
||||
eventHubsClient.lastEnqueueTimeOfPartitions(retryIfFail = true, ehNameAndPartitions)
|
||||
require(latestEnqueueTimeOfPartitions.isDefined,
|
||||
"cannot get latest enqueue time from Event" +
|
||||
" Hubs Rest Endpoint")
|
||||
latestEnqueueTimeOfPartitions.get.foreach {
|
||||
case (ehNameAndPartition, latestEnqueueTime) =>
|
||||
val passInEnqueueTime = eventhubsParams.get(ehNameAndPartition.eventHubName) match {
|
||||
case Some(ehParams) =>
|
||||
ehParams.asInstanceOf[Map[String, String]].getOrElse(
|
||||
"eventhubs.filter.enqueuetime", Long.MinValue.toString).toLong
|
||||
ehParams
|
||||
.asInstanceOf[Map[String, String]]
|
||||
.getOrElse("eventhubs.filter.enqueuetime", Long.MinValue.toString)
|
||||
.toLong
|
||||
case None =>
|
||||
eventhubsParams.asInstanceOf[Map[String, String]].getOrElse(
|
||||
"eventhubs.filter.enqueuetime", Long.MinValue.toString).toLong
|
||||
eventhubsParams
|
||||
.asInstanceOf[Map[String, String]]
|
||||
.getOrElse("eventhubs.filter.enqueuetime", Long.MinValue.toString)
|
||||
.toLong
|
||||
}
|
||||
require(latestEnqueueTime >= passInEnqueueTime,
|
||||
require(
|
||||
latestEnqueueTime >= passInEnqueueTime,
|
||||
"you cannot pass in an enqueue time which is later than the highest enqueue time in" +
|
||||
s" event hubs, ($ehNameAndPartition, pass-in-enqueuetime $passInEnqueueTime," +
|
||||
s" latest-enqueuetime $latestEnqueueTime)")
|
||||
s" latest-enqueuetime $latestEnqueueTime)"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] def composeFromOffsetWithFilteringParams(
|
||||
eventhubsParams: Map[String, _],
|
||||
fetchedStartOffsetsInNextBatch: Map[EventHubNameAndPartition, (Long, Long)]):
|
||||
Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)] = {
|
||||
fetchedStartOffsetsInNextBatch: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
: Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)] = {
|
||||
|
||||
fetchedStartOffsetsInNextBatch.map {
|
||||
case (ehNameAndPartition, (offset, seq)) =>
|
||||
val (offsetType, offsetStr) = EventHubsClientWrapper.configureStartOffset(
|
||||
|
@ -129,7 +145,8 @@ private[spark] object RateControlUtils extends Logging {
|
|||
ehConfig.asInstanceOf[Map[String, String]]
|
||||
case None =>
|
||||
eventhubsParams.asInstanceOf[Map[String, String]]
|
||||
})
|
||||
}
|
||||
)
|
||||
(ehNameAndPartition, (offsetType, offsetStr.toLong))
|
||||
}
|
||||
}
|
||||
|
@ -137,8 +154,8 @@ private[spark] object RateControlUtils extends Logging {
|
|||
private[spark] def calculateStartOffset(
|
||||
ehNameAndPartition: EventHubNameAndPartition,
|
||||
filteringOffsetAndType: Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)],
|
||||
startOffsetInNextBatch: Map[EventHubNameAndPartition, (Long, Long)]):
|
||||
(EventHubsOffsetType, Long) = {
|
||||
startOffsetInNextBatch: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
: (EventHubsOffsetType, Long) = {
|
||||
filteringOffsetAndType.getOrElse(
|
||||
ehNameAndPartition,
|
||||
(EventHubsOffsetTypes.PreviousCheckpoint, startOffsetInNextBatch(ehNameAndPartition)._1)
|
||||
|
|
|
@ -18,27 +18,19 @@
|
|||
package org.apache.spark.eventhubscommon.client
|
||||
|
||||
import scala.collection.mutable
|
||||
|
||||
import com.microsoft.azure.eventhubs.{EventHubClient => AzureEventHubClient, EventHubPartitionRuntimeInformation}
|
||||
|
||||
import com.microsoft.azure.eventhubs.{ EventHubClient, EventHubPartitionRuntimeInformation }
|
||||
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
|
||||
import org.apache.spark.internal.Logging
|
||||
|
||||
private[client] class AMQPEventHubsClient(
|
||||
eventHubNamespace: String,
|
||||
eventHubsNames: List[String],
|
||||
ehParams: Map[String, Map[String, String]]) extends EventHubClient with Logging {
|
||||
private[client] class AMQPEventHubsClient(ehNames: List[String],
|
||||
ehParams: Map[String, Map[String, String]])
|
||||
extends Client
|
||||
with Logging {
|
||||
|
||||
private val ehNameToClient = new mutable.HashMap[String, AzureEventHubClient]
|
||||
|
||||
init()
|
||||
|
||||
private def init(): Unit = {
|
||||
for (ehName <- eventHubsNames) {
|
||||
ehNameToClient += ehName ->
|
||||
new EventHubsClientWrapper().createClient(ehParams(ehName))
|
||||
}
|
||||
}
|
||||
private val nameToClient = new mutable.HashMap[String, EventHubClient]
|
||||
for (ehName <- ehNames)
|
||||
nameToClient += ehName -> new EventHubsClientWrapper(ehParams(ehName))
|
||||
.createClient(ehParams(ehName))
|
||||
|
||||
private def getRunTimeInfoOfPartitions(
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]) = {
|
||||
|
@ -47,9 +39,10 @@ private[client] class AMQPEventHubsClient(
|
|||
for (ehNameAndPartition <- targetEventHubNameAndPartitions) {
|
||||
val ehName = ehNameAndPartition.eventHubName
|
||||
val partitionId = ehNameAndPartition.partitionId
|
||||
val client = ehNameToClient.get(ehName)
|
||||
val client = nameToClient.get(ehName)
|
||||
require(client.isDefined, "cannot find client for EventHubs instance " + ehName)
|
||||
val runTimeInfo = client.get.getPartitionRuntimeInformation(partitionId.toString).get()
|
||||
val runTimeInfo =
|
||||
client.get.getPartitionRuntimeInformation(partitionId.toString).get()
|
||||
results += ehNameAndPartition -> runTimeInfo
|
||||
}
|
||||
results.toMap.view
|
||||
|
@ -65,15 +58,16 @@ private[client] class AMQPEventHubsClient(
|
|||
*
|
||||
* @return a map from eventhubName-partition to (offset, seq)
|
||||
*/
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
override def endPointOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
try {
|
||||
val runtimeInformation = getRunTimeInfoOfPartitions(targetEventHubNameAndPartitions)
|
||||
Some(runtimeInformation.map{case (ehNameAndPartition, runTimeInfo) =>
|
||||
(ehNameAndPartition, (runTimeInfo.getLastEnqueuedOffset.toLong,
|
||||
runTimeInfo.getLastEnqueuedSequenceNumber))}.toMap)
|
||||
Some(runtimeInformation.map {
|
||||
case (ehNameAndPartition, runTimeInfo) =>
|
||||
(ehNameAndPartition,
|
||||
(runTimeInfo.getLastEnqueuedOffset.toLong, runTimeInfo.getLastEnqueuedSequenceNumber))
|
||||
}.toMap)
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
|
@ -88,12 +82,14 @@ private[client] class AMQPEventHubsClient(
|
|||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
try {
|
||||
val runtimeInformation = getRunTimeInfoOfPartitions(targetEventHubNameAndPartitions)
|
||||
Some(runtimeInformation.map{case (ehNameAndPartition, runTimeInfo) =>
|
||||
(ehNameAndPartition, runTimeInfo.getLastEnqueuedTimeUtc.getEpochSecond)}.toMap)
|
||||
Some(runtimeInformation.map {
|
||||
case (ehNameAndPartition, runTimeInfo) =>
|
||||
(ehNameAndPartition, runTimeInfo.getLastEnqueuedTimeUtc.getEpochSecond)
|
||||
}.toMap)
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
|
@ -106,14 +102,15 @@ private[client] class AMQPEventHubsClient(
|
|||
*
|
||||
* @return a map from eventhubName-partition to seq
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
override def startSeqOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
try {
|
||||
val runtimeInformation = getRunTimeInfoOfPartitions(targetEventHubNameAndPartitions)
|
||||
Some(runtimeInformation.map{case (ehNameAndPartition, runTimeInfo) =>
|
||||
(ehNameAndPartition, runTimeInfo.getBeginSequenceNumber)}.toMap)
|
||||
Some(runtimeInformation.map {
|
||||
case (ehNameAndPartition, runTimeInfo) =>
|
||||
(ehNameAndPartition, runTimeInfo.getBeginSequenceNumber)
|
||||
}.toMap)
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
|
@ -126,18 +123,15 @@ private[client] class AMQPEventHubsClient(
|
|||
*/
|
||||
override def close(): Unit = {
|
||||
logInfo("close: Closing AMQPEventHubClient.")
|
||||
for ((_, ehClient) <- ehNameToClient) {
|
||||
for ((_, ehClient) <- nameToClient) {
|
||||
ehClient.closeSync()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
private[spark] object AMQPEventHubsClient {
|
||||
|
||||
def getInstance(eventHubsNamespace: String, eventhubsParams: Map[String, Map[String, String]]):
|
||||
AMQPEventHubsClient = {
|
||||
new AMQPEventHubsClient(eventHubsNamespace, eventhubsParams.keys.toList, eventhubsParams)
|
||||
def getInstance(eventHubsNamespace: String,
|
||||
eventhubsParams: Map[String, Map[String, String]]): AMQPEventHubsClient = {
|
||||
new AMQPEventHubsClient(eventhubsParams.keys.toList, eventhubsParams)
|
||||
}
|
||||
}
|
||||
|
@@ -19,35 +19,31 @@ package org.apache.spark.eventhubscommon.client

import org.apache.spark.eventhubscommon.EventHubNameAndPartition

private[spark] trait EventHubClient extends Serializable {
private[spark] trait Client extends Serializable {

  /**
   * return the start seq number of each partition
   * @return a map from eventhubName-partition to seq
   */
  def startSeqOfPartition(
      retryIfFail: Boolean,
      targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
    Option[Map[EventHubNameAndPartition, Long]]

  def startSeqOfPartition(retryIfFail: Boolean,
                          targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List())
    : Option[Map[EventHubNameAndPartition, Long]]

  /**
   * return the end point of each partition
   * @return a map from eventhubName-partition to (offset, seq)
   */
  def endPointOfPartition(
      retryIfFail: Boolean,
      targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
    Option[Map[EventHubNameAndPartition, (Long, Long)]]
  def endPointOfPartition(retryIfFail: Boolean,
                          targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List())
    : Option[Map[EventHubNameAndPartition, (Long, Long)]]

  /**
   * return the last enqueueTime of each partition
   * @return a map from eventHubsNamePartition to EnqueueTime
   */
  def lastEnqueueTimeOfPartitions(
      retryIfFail: Boolean,
      targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
    Option[Map[EventHubNameAndPartition, Long]]
  def lastEnqueueTimeOfPartitions(retryIfFail: Boolean,
                                  targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
    : Option[Map[EventHubNameAndPartition, Long]]

  /**
   * close this client
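To make the renamed trait's surface concrete, here is a minimal hand-written stub against the new signatures shown above (a sketch only — the real implementations in this codebase are AMQPEventHubsClient and EventHubsClientWrapper; the trait is private[spark], hence the package declaration):

    package org.apache.spark.eventhubscommon.client

    import org.apache.spark.eventhubscommon.EventHubNameAndPartition

    // Stub implementation of the renamed Client trait: every query reports an empty
    // result and close() is a no-op.
    private[spark] class NoopClient extends Client {

      override def startSeqOfPartition(
          retryIfFail: Boolean,
          targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
        : Option[Map[EventHubNameAndPartition, Long]] = Some(Map.empty)

      override def endPointOfPartition(
          retryIfFail: Boolean,
          targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
        : Option[Map[EventHubNameAndPartition, (Long, Long)]] = Some(Map.empty)

      override def lastEnqueueTimeOfPartitions(
          retryIfFail: Boolean,
          targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
        : Option[Map[EventHubNameAndPartition, Long]] = Some(Map.empty)

      override def close(): Unit = ()
    }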
@ -19,11 +19,8 @@ package org.apache.spark.eventhubscommon.client
|
|||
import java.time.Instant
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import com.microsoft.azure.eventhubs.{EventHubClient => AzureEventHubClient, _}
|
||||
|
||||
import org.apache.spark.{SparkEnv, TaskContext}
|
||||
import com.microsoft.azure.eventhubs._
|
||||
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
|
||||
|
@ -32,49 +29,37 @@ import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
|
|||
* Wraps a raw EventHubReceiver to make it easier for unit tests
|
||||
*/
|
||||
@SerialVersionUID(1L)
|
||||
private[spark] class EventHubsClientWrapper extends Serializable with EventHubClient with Logging {
|
||||
private[spark] class EventHubsClientWrapper(
|
||||
ehParams: Map[String, String]
|
||||
) extends Serializable
|
||||
with Client
|
||||
with Logging {
|
||||
|
||||
var eventhubsClient: AzureEventHubClient = _
|
||||
private val MINIMUM_PREFETCH_COUNT: Int = 10
|
||||
private var MAXIMUM_PREFETCH_COUNT: Int = 999
|
||||
private var MAXIMUM_EVENT_RATE: Int = 0
|
||||
private val DEFAULT_RECEIVER_EPOCH = -1L
|
||||
|
||||
// TODO: the design of this class is not simple enough
|
||||
// ideally, we shall not require the user to explicitly call createReceiver first
|
||||
// and then call receive
|
||||
// we shall let the user pass parameters in the constructor directly
|
||||
private val ehNamespace = ehParams("eventhubs.namespace").toString
|
||||
private val ehName = ehParams("eventhubs.name").toString
|
||||
private val ehPolicyName = ehParams("eventhubs.policyname").toString
|
||||
private val ehPolicy = ehParams("eventhubs.policykey").toString
|
||||
|
||||
private def configureGeneralParameters(eventhubsParams: Predef.Map[String, String]) = {
|
||||
if (eventhubsParams.contains("eventhubs.uri") &&
|
||||
eventhubsParams.contains("eventhubs.namespace")) {
|
||||
throw new IllegalArgumentException(s"Eventhubs URI and namespace cannot both be specified" +
|
||||
s" at the same time.")
|
||||
}
|
||||
private val connectionString =
|
||||
new ConnectionStringBuilder(ehNamespace, ehName, ehPolicyName, ehPolicy).toString
|
||||
private val consumerGroup = ehParams
|
||||
.getOrElse("eventhubs.consumergroup", EventHubClient.DEFAULT_CONSUMER_GROUP_NAME)
|
||||
.toString
|
||||
private val receiverEpoch = ehParams
|
||||
.getOrElse("eventhubs.epoch", DEFAULT_RECEIVER_EPOCH.toString)
|
||||
.toString
|
||||
.toLong
|
||||
|
||||
val namespaceName = if (eventhubsParams.contains("eventhubs.namespace")) {
|
||||
eventhubsParams.get("eventhubs.namespace")
|
||||
} else {
|
||||
eventhubsParams.get("eventhubs.uri")
|
||||
}
|
||||
if (namespaceName.isEmpty) {
|
||||
throw new IllegalArgumentException(s"Either Eventhubs URI or namespace nust be" +
|
||||
s" specified.")
|
||||
}
|
||||
// TODO: validate inputs
|
||||
val evhName = eventhubsParams("eventhubs.name")
|
||||
val evhPolicyName = eventhubsParams("eventhubs.policyname")
|
||||
val evhPolicyKey = eventhubsParams("eventhubs.policykey")
|
||||
val connectionString = new ConnectionStringBuilder(namespaceName.get, evhName, evhPolicyName,
|
||||
evhPolicyKey)
|
||||
// Set the consumer group if specified.
|
||||
val consumerGroup = eventhubsParams.getOrElse("eventhubs.consumergroup",
|
||||
AzureEventHubClient.DEFAULT_CONSUMER_GROUP_NAME)
|
||||
// Set the epoch if specified
|
||||
val receiverEpoch = eventhubsParams.getOrElse("eventhubs.epoch",
|
||||
DEFAULT_RECEIVER_EPOCH.toString).toLong
|
||||
(connectionString, consumerGroup, receiverEpoch)
|
||||
}
|
||||
var eventhubsClient: EventHubClient = _
|
||||
private var eventhubsReceiver: PartitionReceiver = _
|
||||
|
||||
private def configureStartOffset(
|
||||
eventhubsParams: Predef.Map[String, String], offsetStore: OffsetStore):
|
||||
(EventHubsOffsetType, String) = {
|
||||
private def configureStartOffset(eventhubsParams: Predef.Map[String, String],
|
||||
offsetStore: OffsetStore): (EventHubsOffsetType, String) = {
|
||||
// Determine the offset to start receiving data
|
||||
val previousOffset = offsetStore.read()
|
||||
EventHubsClientWrapper.configureStartOffset(previousOffset, eventhubsParams)
|
||||
|
@ -84,7 +69,7 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
|
|||
if (userDefinedEventRate > 0 && userDefinedEventRate < MINIMUM_PREFETCH_COUNT) {
|
||||
MAXIMUM_PREFETCH_COUNT = MINIMUM_PREFETCH_COUNT
|
||||
} else if (userDefinedEventRate >= MINIMUM_PREFETCH_COUNT &&
|
||||
userDefinedEventRate < MAXIMUM_PREFETCH_COUNT) {
|
||||
userDefinedEventRate < MAXIMUM_PREFETCH_COUNT) {
|
||||
MAXIMUM_PREFETCH_COUNT = userDefinedEventRate + 1
|
||||
} else {
|
||||
MAXIMUM_EVENT_RATE = MAXIMUM_PREFETCH_COUNT - 1
|
||||
|
@ -97,95 +82,68 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
|
|||
*
|
||||
* the major purpose of this API is for creating AMQP management client
|
||||
*/
|
||||
def createClient(eventhubsParams: Map[String, String]): AzureEventHubClient = {
|
||||
val (connectionString, _, _) = configureGeneralParameters(
|
||||
eventhubsParams)
|
||||
eventhubsClient = AzureEventHubClient.createFromConnectionStringSync(connectionString.toString)
|
||||
eventhubsClient
|
||||
}
|
||||
def createClient(eventhubsParams: Map[String, String]): EventHubClient =
|
||||
EventHubClient.createFromConnectionStringSync(connectionString.toString)
|
||||
|
||||
def createReceiver(
|
||||
eventhubsParams: Predef.Map[String, String],
|
||||
partitionId: String,
|
||||
startOffset: String,
|
||||
offsetType: EventHubsOffsetType,
|
||||
maximumEventRate: Int): Unit = {
|
||||
val (connectionString, consumerGroup, receiverEpoch) = configureGeneralParameters(
|
||||
eventhubsParams)
|
||||
val currentOffset = startOffset
|
||||
def createReceiver(partitionId: String,
|
||||
startOffset: String,
|
||||
offsetType: EventHubsOffsetType,
|
||||
maximumEventRate: Int): Unit = {
|
||||
MAXIMUM_EVENT_RATE = configureMaxEventRate(maximumEventRate)
|
||||
createReceiverInternal(connectionString.toString,
|
||||
eventhubsParams("eventhubs.name"),
|
||||
consumerGroup, partitionId, offsetType,
|
||||
currentOffset, receiverEpoch)
|
||||
createReceiverInternal(partitionId, offsetType, startOffset)
|
||||
}
|
||||
|
||||
def createReceiver(
|
||||
eventhubsParams: Map[String, String],
|
||||
partitionId: String,
|
||||
offsetStore: OffsetStore,
|
||||
maximumEventRate: Int): Unit = {
|
||||
val (connectionString, consumerGroup, receiverEpoch) = configureGeneralParameters(
|
||||
eventhubsParams)
|
||||
val (offsetType, currentOffset) = configureStartOffset(eventhubsParams, offsetStore)
|
||||
def createReceiver(ehParams: Map[String, String],
|
||||
partitionId: String,
|
||||
offsetStore: OffsetStore,
|
||||
maximumEventRate: Int): Unit = {
|
||||
val (offsetType, currentOffset) =
|
||||
configureStartOffset(ehParams, offsetStore)
|
||||
logInfo(s"start a receiver for partition $partitionId with the start offset $currentOffset")
|
||||
MAXIMUM_EVENT_RATE = configureMaxEventRate(maximumEventRate)
|
||||
createReceiverInternal(connectionString.toString,
|
||||
eventhubsParams("eventhubs.name"),
|
||||
consumerGroup, partitionId, offsetType,
|
||||
currentOffset, receiverEpoch)
|
||||
createReceiverInternal(partitionId, offsetType, currentOffset)
|
||||
}
|
||||
|
||||
private[spark] def createReceiverInternal(
|
||||
connectionString: String,
|
||||
eventHubsName: String,
|
||||
consumerGroup: String,
|
||||
partitionId: String,
|
||||
offsetType: EventHubsOffsetType,
|
||||
currentOffset: String,
|
||||
receiverEpoch: Long): Unit = {
|
||||
// Create Eventhubs client
|
||||
eventhubsClient = AzureEventHubClient.createFromConnectionStringSync(connectionString)
|
||||
|
||||
val receiverOption = new ReceiverOptions()
|
||||
receiverOption.setReceiverRuntimeMetricEnabled(false)
|
||||
receiverOption.setIdentifier(
|
||||
s"${SparkEnv.get.executorId}-${TaskContext.get().taskAttemptId()}")
|
||||
private[spark] def createReceiverInternal(partitionId: String,
|
||||
offsetType: EventHubsOffsetType,
|
||||
currentOffset: String): Unit = {
|
||||
eventhubsClient = EventHubClient.createFromConnectionStringSync(connectionString)
|
||||
|
||||
eventhubsReceiver = offsetType match {
|
||||
case EventHubsOffsetTypes.None | EventHubsOffsetTypes.PreviousCheckpoint
|
||||
| EventHubsOffsetTypes.InputByteOffset =>
|
||||
case EventHubsOffsetTypes.None | EventHubsOffsetTypes.PreviousCheckpoint |
|
||||
EventHubsOffsetTypes.InputByteOffset =>
|
||||
if (receiverEpoch > DEFAULT_RECEIVER_EPOCH) {
|
||||
eventhubsClient.createEpochReceiverSync(consumerGroup, partitionId, currentOffset,
|
||||
receiverEpoch)
|
||||
eventhubsClient.createEpochReceiverSync(consumerGroup,
|
||||
partitionId,
|
||||
currentOffset,
|
||||
receiverEpoch)
|
||||
} else {
|
||||
eventhubsClient.createReceiverSync(consumerGroup, partitionId, currentOffset)
|
||||
}
|
||||
case EventHubsOffsetTypes.InputTimeOffset =>
|
||||
if (receiverEpoch > DEFAULT_RECEIVER_EPOCH) {
|
||||
eventhubsClient.createEpochReceiverSync(consumerGroup, partitionId,
|
||||
Instant.ofEpochSecond(currentOffset.toLong), receiverEpoch)
|
||||
eventhubsClient.createEpochReceiverSync(consumerGroup,
|
||||
partitionId,
|
||||
Instant.ofEpochSecond(currentOffset.toLong),
|
||||
receiverEpoch)
|
||||
} else {
|
||||
eventhubsClient.createReceiverSync(consumerGroup, partitionId,
|
||||
Instant.ofEpochSecond(currentOffset.toLong))
|
||||
eventhubsClient.createReceiverSync(consumerGroup,
|
||||
partitionId,
|
||||
Instant.ofEpochSecond(currentOffset.toLong))
|
||||
}
|
||||
}
|
||||
|
||||
eventhubsReceiver.setPrefetchCount(MAXIMUM_PREFETCH_COUNT)
|
||||
}
|
||||
|
||||
def receive(): Iterable[EventData] = {
|
||||
val events = eventhubsReceiver.receive(MAXIMUM_EVENT_RATE).get()
|
||||
if (events == null) Iterable.empty else events.asScala
|
||||
}
|
||||
|
||||
/**
|
||||
* starting from EventHubs client 0.13.1, returning a null from receiver means that there is
|
||||
* no message in server end
|
||||
*/
|
||||
def receive(expectedEventNum: Int): Iterable[EventData] = {
|
||||
val events = eventhubsReceiver.receive(
|
||||
math.min(expectedEventNum, eventhubsReceiver.getPrefetchCount)).get()
|
||||
val events = eventhubsReceiver
|
||||
.receive(math.min(expectedEventNum, eventhubsReceiver.getPrefetchCount))
|
||||
.get()
|
||||
if (events != null) events.asScala else null
|
||||
}
|
||||
|
||||
|
@ -198,18 +156,12 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
|
|||
eventhubsReceiver.closeSync()
|
||||
}
|
||||
|
||||
private var eventhubsReceiver: PartitionReceiver = _
|
||||
private val MINIMUM_PREFETCH_COUNT: Int = 10
|
||||
private var MAXIMUM_PREFETCH_COUNT: Int = 999
|
||||
private var MAXIMUM_EVENT_RATE: Int = 0
|
||||
private val DEFAULT_RECEIVER_EPOCH = -1L
|
||||
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
throw new UnsupportedOperationException("endPointOfPartition is not supported by this client" +
|
||||
" yet, please use AMQPEventHubsClient")
|
||||
override def endPointOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
throw new UnsupportedOperationException(
|
||||
"endPointOfPartition is not supported by this client" +
|
||||
" yet, please use AMQPEventHubsClient")
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -219,10 +171,11 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
|
|||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Predef.Map[EventHubNameAndPartition, Long]] = {
|
||||
throw new UnsupportedOperationException("lastEnqueueTimeOfPartitions is not supported by this" +
|
||||
" client yet, please use AMQPEventHubsClient")
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Predef.Map[EventHubNameAndPartition, Long]] = {
|
||||
throw new UnsupportedOperationException(
|
||||
"lastEnqueueTimeOfPartitions is not supported by this" +
|
||||
" client yet, please use AMQPEventHubsClient")
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -230,17 +183,16 @@ private[spark] class EventHubsClientWrapper extends Serializable with EventHubCl
|
|||
*
|
||||
* @return a map from eventhubName-partition to seq
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Predef.Map[EventHubNameAndPartition, Long]] = {
|
||||
throw new UnsupportedOperationException("startSeqOfPartition is not supported by this client" +
|
||||
" yet, please use AMQPEventHubsClient")
|
||||
override def startSeqOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Predef.Map[EventHubNameAndPartition, Long]] = {
|
||||
throw new UnsupportedOperationException(
|
||||
"startSeqOfPartition is not supported by this client" +
|
||||
" yet, please use AMQPEventHubsClient")
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] object EventHubsClientWrapper {
|
||||
|
||||
private[eventhubscommon] def configureStartOffset(
|
||||
previousOffset: String,
|
||||
eventhubsParams: Predef.Map[String, String]): (EventHubsOffsetType, String) = {
|
||||
|
@ -255,21 +207,17 @@ private[spark] object EventHubsClientWrapper {
|
|||
}
|
||||
}
|
||||
|
||||
def getEventHubsClient(eventhubsParams: Map[String, String]): AzureEventHubClient = {
|
||||
new EventHubsClientWrapper().createClient(eventhubsParams)
|
||||
}
|
||||
|
||||
def getEventHubReceiver(
|
||||
eventhubsParams: Predef.Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
offsetType: EventHubsOffsetType,
|
||||
maximumEventRate: Int): EventHubsClientWrapper = {
|
||||
|
||||
// TODO: reuse client
|
||||
val eventHubClientWrapperInstance = new EventHubsClientWrapper()
|
||||
eventHubClientWrapperInstance.createReceiver(eventhubsParams, partitionId.toString,
|
||||
startOffset.toString, offsetType, maximumEventRate)
|
||||
def getEventHubReceiver(ehParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
offsetType: EventHubsOffsetType,
|
||||
maximumEventRate: Int): EventHubsClientWrapper = {
|
||||
val ehName = ehParams.get("eventhubs.name").toString
|
||||
val eventHubClientWrapperInstance = new EventHubsClientWrapper(ehParams)
|
||||
eventHubClientWrapperInstance.createReceiver(partitionId.toString,
|
||||
startOffset.toString,
|
||||
offsetType,
|
||||
maximumEventRate)
|
||||
eventHubClientWrapperInstance
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,236 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.eventhubscommon.client
|
||||
|
||||
import java.net.SocketTimeoutException
|
||||
import java.time.{Duration, Instant}
|
||||
|
||||
import scala.collection.mutable.ListBuffer
|
||||
import scala.concurrent.{Await, Future}
|
||||
import scala.concurrent.duration._
|
||||
import scala.util.{Failure, Success}
|
||||
import scala.xml.XML
|
||||
|
||||
import com.microsoft.azure.eventhubs.SharedAccessSignatureTokenProvider
|
||||
import scalaj.http.{Http, HttpResponse}
|
||||
|
||||
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
|
||||
import org.apache.spark.internal.Logging
|
||||
|
||||
/**
|
||||
* a Restful API based client of EventHub
|
||||
*
|
||||
* @param eventHubNamespace the namespace of eventhub
|
||||
* @param numPartitionsEventHubs a map from eventHub name to the total number of partitions
|
||||
* @param consumerGroups a map from eventHub name to consumer group names
|
||||
* @param policyKeys a map from eventHub name to (policyName, policyKey) pair
|
||||
* @param threadNum the number of threads used to communicate with remote EventHub
|
||||
*/
|
||||
private[spark] class RestfulEventHubClient(
|
||||
eventHubNamespace: String,
|
||||
numPartitionsEventHubs: Map[String, Int],
|
||||
consumerGroups: Map[String, String],
|
||||
policyKeys: Map[String, (String, String)],
|
||||
threadNum: Int) extends EventHubClient with Logging {
|
||||
|
||||
private val RETRY_INTERVAL_SECONDS = Array(8, 16, 32, 64, 128)
|
||||
|
||||
// will be used to execute requests to EventHub
|
||||
import org.apache.spark.eventhubscommon.Implicits.exec
|
||||
|
||||
private def createSasToken(eventHubName: String, policyName: String, policyKey: String):
|
||||
String = {
|
||||
// the default value of 10 mins is hardcoded, and this method will be called for everytime when
|
||||
// a new batch is started, we may figure out whether there will be any negative impact for
|
||||
// creating a new sasToken everytime
|
||||
SharedAccessSignatureTokenProvider.generateSharedAccessSignature(
|
||||
s"$policyName", s"$policyKey",
|
||||
s"$eventHubNamespace.servicebus.windows.net/$eventHubName",
|
||||
Duration.ofMinutes(10))
|
||||
}
|
||||
|
||||
private def fromResponseBodyToEndpoint(responseBody: String): (Long, Long) = {
|
||||
val partitionDescription = XML.loadString(responseBody) \\ "entry" \
|
||||
"content" \ "PartitionDescription"
|
||||
((partitionDescription \ "LastEnqueuedOffset").text.toLong,
|
||||
(partitionDescription \ "EndSequenceNumber").text.toLong)
|
||||
}
|
||||
|
||||
private def fromParametersToURLString(eventHubName: String, partitionId: Int): String = {
|
||||
s"https://$eventHubNamespace.servicebus.windows.net/$eventHubName" +
|
||||
s"/consumergroups/${consumerGroups(eventHubName)}/partitions/$partitionId?api-version=2015-01"
|
||||
}
|
||||
|
||||
private def fromResponseBodyToStartSeq(responseBody: String): Long = {
|
||||
val partitionDescription = XML.loadString(responseBody) \\ "entry" \
|
||||
"content" \ "PartitionDescription"
|
||||
(partitionDescription \ "BeginSequenceNumber").text.toLong
|
||||
}
|
||||
|
||||
private def aggregateResults[T](undergoingRequests: List[Future[(EventHubNameAndPartition, T)]]):
|
||||
Option[Map[EventHubNameAndPartition, T]] = {
|
||||
Await.ready(Future.sequence(undergoingRequests), 60 seconds).value.get match {
|
||||
case Success(queryResponse) =>
|
||||
Some(queryResponse.toMap.map {case (eventHubQueryKey, queryResponseString) =>
|
||||
(eventHubQueryKey, queryResponseString.asInstanceOf[T])})
|
||||
case Failure(e) =>
|
||||
e.printStackTrace()
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
private def composeQuery[T](
|
||||
retryIfFail: Boolean,
|
||||
fromResponseBodyToResult: String => T,
|
||||
nameAndPartition: EventHubNameAndPartition):
|
||||
Future[(EventHubNameAndPartition, T)] = {
|
||||
Future {
|
||||
var retryTime = 0
|
||||
var successfullyFetched = false
|
||||
var response: HttpResponse[String] = null
|
||||
val ehNameAndPartition = nameAndPartition
|
||||
val eventHubName = nameAndPartition.eventHubName
|
||||
val partitionId = nameAndPartition.partitionId
|
||||
while (!successfullyFetched) {
|
||||
logDebug(s"start fetching latest offset of $ehNameAndPartition")
|
||||
val urlString = fromParametersToURLString(eventHubName, partitionId)
|
||||
try {
|
||||
response = Http(urlString).header("Authorization",
|
||||
createSasToken(eventHubName,
|
||||
policyName = policyKeys(eventHubName)._1,
|
||||
policyKey = policyKeys(eventHubName)._2)).
|
||||
header("Content-Type", "application/atom+xml;type=entry;charset=utf-8").
|
||||
timeout(connTimeoutMs = 3000, readTimeoutMs = 30000).asString
|
||||
if (response.code != 200) {
|
||||
if (!retryIfFail || retryTime > RETRY_INTERVAL_SECONDS.length - 1) {
|
||||
val errorInfoString = s"cannot get latest offset of" +
|
||||
s" $ehNameAndPartition, status code: ${response.code}, ${response.headers}" +
|
||||
s" returned error: ${response.body}"
|
||||
logError(errorInfoString)
|
||||
throw new Exception(errorInfoString)
|
||||
} else {
|
||||
val retryInterval = 1000 * RETRY_INTERVAL_SECONDS(retryTime)
|
||||
logError(s"cannot get connect with Event Hubs Rest Endpoint for partition" +
|
||||
s" $ehNameAndPartition, retry after $retryInterval seconds")
|
||||
Thread.sleep(retryInterval)
|
||||
retryTime += 1
|
||||
}
|
||||
} else {
|
||||
successfullyFetched = true
|
||||
}
|
||||
} catch {
|
||||
case e: SocketTimeoutException =>
|
||||
e.printStackTrace()
|
||||
logError("Event Hubs return ReadTimeout with 30s as threshold, retrying...")
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
throw e
|
||||
}
|
||||
}
|
||||
val results = fromResponseBodyToResult(response.body)
|
||||
logDebug(s"results of $ehNameAndPartition: $results")
|
||||
(ehNameAndPartition, results)
|
||||
}
|
||||
}
|
||||
|
||||
private def queryPartitionRuntimeInfo[T](
|
||||
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition],
|
||||
fromResponseBodyToResult: String => T, retryIfFail: Boolean):
|
||||
Option[Map[EventHubNameAndPartition, T]] = {
|
||||
val futures = new ListBuffer[Future[(EventHubNameAndPartition, T)]]
|
||||
if (targetEventHubsNameAndPartitions.isEmpty) {
|
||||
for ((eventHubName, numPartitions) <- numPartitionsEventHubs;
|
||||
partitionId <- 0 until numPartitions) {
|
||||
futures += composeQuery(retryIfFail, fromResponseBodyToResult,
|
||||
EventHubNameAndPartition(eventHubName, partitionId))
|
||||
}
|
||||
} else {
|
||||
for (targetNameAndPartition <- targetEventHubsNameAndPartitions) {
|
||||
futures += composeQuery(retryIfFail, fromResponseBodyToResult, targetNameAndPartition)
|
||||
}
|
||||
}
|
||||
aggregateResults(futures.toList)
|
||||
}
|
||||
|
||||
override def close(): Unit = {
|
||||
// empty
|
||||
}
|
||||
|
||||
/**
|
||||
* return highest offset/seq and latest enqueueTime of each partition
|
||||
*/
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubsNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
queryPartitionRuntimeInfo(targetEventHubsNameAndPartitions,
|
||||
fromResponseBodyToEndpoint, retryIfFail)
|
||||
}
|
||||
|
||||
private def fromResponseBodyToEnqueueTime(responseBody: String): Long = {
|
||||
val partitionDescription = XML.loadString(responseBody) \\ "entry" \
|
||||
"content" \ "PartitionDescription"
|
||||
Instant.parse((partitionDescription \ "LastEnqueuedTimeUtc").text).getEpochSecond
|
||||
}
|
||||
|
||||
/**
|
||||
* return the last enqueueTime of each partition
|
||||
* @return a map from eventHubsNamePartition to EnqueueTime
|
||||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
queryPartitionRuntimeInfo(targetEventHubNameAndPartitions,
|
||||
fromResponseBodyToEnqueueTime, retryIfFail)
|
||||
}
|
||||
|
||||
/**
|
||||
* return the start seq number of each partition
|
||||
*
|
||||
* @return a map from eventhubName-partition to seq
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
queryPartitionRuntimeInfo(targetEventHubNameAndPartitions,
|
||||
fromResponseBodyToStartSeq, retryIfFail)
|
||||
}
|
||||
}
|
||||
|
||||
private[spark] object RestfulEventHubClient {
|
||||
def getInstance(eventHubNameSpace: String, eventhubsParams: Map[String, Map[String, String]]):
|
||||
RestfulEventHubClient = {
|
||||
new RestfulEventHubClient(eventHubNameSpace,
|
||||
numPartitionsEventHubs = {
|
||||
eventhubsParams.map { case (eventhubName, params) => (eventhubName,
|
||||
params("eventhubs.partition.count").toInt)
|
||||
}
|
||||
},
|
||||
consumerGroups = {
|
||||
eventhubsParams.map { case (eventhubName, params) => (eventhubName,
|
||||
params("eventhubs.consumergroup"))
|
||||
}
|
||||
},
|
||||
policyKeys = eventhubsParams.map { case (eventhubName, params) => (eventhubName,
|
||||
(params("eventhubs.policyname"), params("eventhubs.policykey")))
|
||||
},
|
||||
threadNum = 15)
|
||||
}
|
||||
}
@@ -50,21 +50,19 @@ private[spark] object PathTools extends Serializable {
  def makeProgressFileName(timestamp: Long): String =
    s"progress-$timestamp"

  def makeTempFileName(
      streamId: Int,
      uid: String,
      eventHubNameAndPartition: EventHubNameAndPartition,
      timestamp: Long): String =
  def makeTempFileName(streamId: Int,
                       uid: String,
                       eventHubNameAndPartition: EventHubNameAndPartition,
                       timestamp: Long): String =
    s"$streamId-$uid-$eventHubNameAndPartition-$timestamp"

  def makeTempFilePath(
      basePath: String,
      streamId: Int,
      uid: String,
      eventHubNameAndPartition: EventHubNameAndPartition,
      timestamp: Long): Path =
    new Path(s"${combineDirectoryNames(
      basePath, makeTempFileName(streamId, uid, eventHubNameAndPartition, timestamp))}")
  def makeTempFilePath(basePath: String,
                       streamId: Int,
                       uid: String,
                       eventHubNameAndPartition: EventHubNameAndPartition,
                       timestamp: Long): Path =
    new Path(
      s"${combineDirectoryNames(basePath, makeTempFileName(streamId, uid, eventHubNameAndPartition, timestamp))}")

  def makeMetadataFileName(timestamp: Long): String = timestamp.toString
}
@@ -28,13 +28,12 @@ package org.apache.spark.eventhubscommon.progress
 * BatchID
 *
 */
private[spark] case class ProgressRecord(
    timestamp: Long,
    uid: String,
    eventHubName: String,
    partitionId: Int,
    offset: Long,
    seqId: Long) {
private[spark] case class ProgressRecord(timestamp: Long,
                                         uid: String,
                                         eventHubName: String,
                                         partitionId: Int,
                                         offset: Long,
                                         seqId: Long) {
  override def toString: String = {
    s"$timestamp $uid $eventHubName $partitionId $offset $seqId"
  }

@@ -44,10 +43,15 @@ private[spark] object ProgressRecord {

  def parse(line: String): Option[ProgressRecord] = {
    try {
      val Array(timestampStr, namespace, eventHubName, partitionIdStr, offsetStr,
        seqStr) = line.split(" ")
      Some(ProgressRecord(timestampStr.toLong, namespace, eventHubName,
        partitionIdStr.toInt, offsetStr.toLong, seqStr.toLong))
      val Array(timestampStr, namespace, eventHubName, partitionIdStr, offsetStr, seqStr) =
        line.split(" ")
      Some(
        ProgressRecord(timestampStr.toLong,
                       namespace,
                       eventHubName,
                       partitionIdStr.toInt,
                       offsetStr.toLong,
                       seqStr.toLong))
    } catch {
      case m: RuntimeException =>
        m.printStackTrace()
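The toString and parse methods above define the on-disk progress line format: six space-separated fields — timestamp, connector uid, event hub name, partition id, offset, sequence id. A worked example with placeholder values (illustrative only, since the class and its companion are private[spark]):

    // Round trip through the space-separated progress-line format.
    val line = ProgressRecord(1500000000L, "connector-0", "eh1", 2, 400L, 25L).toString
    // line == "1500000000 connector-0 eh1 2 400 25"
    val parsed = ProgressRecord.parse(line)
    // parsed == Some(ProgressRecord(1500000000, "connector-0", "eh1", 2, 400, 25))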
|
@ -17,8 +17,8 @@
|
|||
|
||||
package org.apache.spark.eventhubscommon.progress
|
||||
|
||||
import java.io.{BufferedReader, InputStreamReader, IOException}
|
||||
import java.util.concurrent.{ScheduledFuture, ScheduledThreadPoolExecutor, TimeUnit}
|
||||
import java.io.{ BufferedReader, InputStreamReader, IOException }
|
||||
import java.util.concurrent.{ ScheduledFuture, ScheduledThreadPoolExecutor, TimeUnit }
|
||||
|
||||
import scala.collection.mutable
|
||||
import scala.collection.mutable.ListBuffer
|
||||
|
@ -27,17 +27,24 @@ import com.microsoft.azure.eventhubs.PartitionReceiver
|
|||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs._
|
||||
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord}
|
||||
import org.apache.spark.eventhubscommon.{
|
||||
EventHubNameAndPartition,
|
||||
EventHubsConnector,
|
||||
OffsetRecord
|
||||
}
|
||||
import org.apache.spark.internal.Logging
|
||||
|
||||
private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
||||
progressDir: String, appName: String, hadoopConfiguration: Configuration) extends Logging {
|
||||
progressDir: String,
|
||||
appName: String,
|
||||
hadoopConfiguration: Configuration)
|
||||
extends Logging {
|
||||
|
||||
private[spark] lazy val progressDirectoryStr = PathTools.makeProgressDirectoryStr(progressDir,
|
||||
appName)
|
||||
private[spark] lazy val progressDirectoryStr =
|
||||
PathTools.makeProgressDirectoryStr(progressDir, appName)
|
||||
private[spark] lazy val tempDirectoryStr = PathTools.makeTempDirectoryStr(progressDir, appName)
|
||||
private[spark] lazy val metadataDirectoryStr = PathTools.makeMetadataDirectoryStr(progressDir,
|
||||
appName)
|
||||
private[spark] lazy val metadataDirectoryStr =
|
||||
PathTools.makeMetadataDirectoryStr(progressDir, appName)
|
||||
|
||||
private[spark] lazy val progressDirectoryPath = new Path(progressDirectoryStr)
|
||||
private[spark] lazy val tempDirectoryPath = new Path(tempDirectoryStr)
|
||||
|
@ -57,10 +64,9 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
private[spark] def fromPathToTimestamp(path: Path): Long =
|
||||
path.getName.split("-").last.toLong
|
||||
|
||||
|
||||
protected def allEventNameAndPartitionExist(
|
||||
candidateEhNameAndPartitions: Map[String, List[EventHubNameAndPartition]]): Boolean = {
|
||||
eventHubNameAndPartitions.forall{
|
||||
eventHubNameAndPartitions.forall {
|
||||
case (uid, ehNameAndPartitions) =>
|
||||
candidateEhNameAndPartitions.contains(uid) &&
|
||||
ehNameAndPartitions.forall(candidateEhNameAndPartitions(uid).contains)
|
||||
|
@ -68,21 +74,28 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
}
|
||||
|
||||
// no metadata (for backward compatibility)
|
||||
private def getLatestFileWithoutMetadata(fs: FileSystem, timestamp: Long = Long.MaxValue):
|
||||
Option[Path] = {
|
||||
private def getLatestFileWithoutMetadata(fs: FileSystem,
|
||||
timestamp: Long = Long.MaxValue): Option[Path] = {
|
||||
val allFiles = fs.listStatus(progressDirectoryPath)
|
||||
if (allFiles.length < 1) {
|
||||
None
|
||||
} else {
|
||||
Some(allFiles.filter(fsStatus => fromPathToTimestamp(fsStatus.getPath) <= timestamp).
|
||||
sortWith((f1, f2) => fromPathToTimestamp(f1.getPath) > fromPathToTimestamp(f2.getPath))
|
||||
(0).getPath)
|
||||
Some(
|
||||
allFiles
|
||||
.filter(fsStatus => fromPathToTimestamp(fsStatus.getPath) <= timestamp)
|
||||
.sortWith((f1, f2) => fromPathToTimestamp(f1.getPath) > fromPathToTimestamp(f2.getPath))(
|
||||
0)
|
||||
.getPath)
|
||||
}
|
||||
}
|
||||
|
||||
private def getLatestFileWithMetadata(metadataFiles: Array[FileStatus]): Option[Path] = {
|
||||
val latestMetadata = metadataFiles.sortWith((f1, f2) => f1.getPath.getName.toLong >
|
||||
f2.getPath.getName.toLong).head
|
||||
val latestMetadata = metadataFiles
|
||||
.sortWith(
|
||||
(f1, f2) =>
|
||||
f1.getPath.getName.toLong >
|
||||
f2.getPath.getName.toLong)
|
||||
.head
|
||||
logInfo(s"locate latest timestamp in metadata as ${latestMetadata.getPath.getName}")
|
||||
Some(new Path(progressDirectoryStr + "/progress-" + latestMetadata.getPath.getName))
|
||||
}
|
||||
|
@@ -92,12 +105,13 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
   *
   * NOTE: the additional integer in return value is to simplify the test (could be improved)
   */
  private[spark] def getLatestFile(fs: FileSystem, timestamp: Long = Long.MaxValue):
    (Int, Option[Path]) = {
  private[spark] def getLatestFile(fs: FileSystem,
                                   timestamp: Long = Long.MaxValue): (Int, Option[Path]) = {
    // first check metadata directory if exists
    if (fs.exists(metadataDirectoryPath)) {
      val metadataFiles = fs.listStatus(metadataDirectoryPath).filter(
        file => file.isFile && file.getPath.getName.toLong <= timestamp)
      val metadataFiles = fs
        .listStatus(metadataDirectoryPath)
        .filter(file => file.isFile && file.getPath.getName.toLong <= timestamp)
      if (metadataFiles.nonEmpty) {
        // metadata files exists
        (0, getLatestFileWithMetadata(metadataFiles))
@ -136,7 +150,7 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
}
|
||||
val progressRecord = progressRecordOpt.get
|
||||
val newList = allProgressFiles.getOrElseUpdate(progressRecord.uid,
|
||||
List[EventHubNameAndPartition]()) :+
|
||||
List[EventHubNameAndPartition]()) :+
|
||||
EventHubNameAndPartition(progressRecord.eventHubName, progressRecord.partitionId)
|
||||
allProgressFiles(progressRecord.uid) = newList
|
||||
if (timestamp == -1L) {
|
||||
|
@ -161,10 +175,8 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
(allEventNameAndPartitionExist(allProgressFiles.toMap), latestFileOpt)
|
||||
}
|
||||
|
||||
|
||||
protected def readProgressRecordLines(
|
||||
progressFilePath: Path,
|
||||
fs: FileSystem): List[ProgressRecord] = {
|
||||
protected def readProgressRecordLines(progressFilePath: Path,
|
||||
fs: FileSystem): List[ProgressRecord] = {
|
||||
val ret = new ListBuffer[ProgressRecord]
|
||||
var ins: FSDataInputStream = null
|
||||
var br: BufferedReader = null
|
||||
|
@ -175,8 +187,9 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
while (line != null) {
|
||||
val progressRecordOpt = ProgressRecord.parse(line)
|
||||
if (progressRecordOpt.isEmpty) {
|
||||
throw new IllegalStateException(s"detect corrupt progress tracking file at $line" +
|
||||
s" it might be a bug in the implementation of underlying file system")
|
||||
throw new IllegalStateException(
|
||||
s"detect corrupt progress tracking file at $line" +
|
||||
s" it might be a bug in the implementation of underlying file system")
|
||||
}
|
||||
val progressRecord = progressRecordOpt.get
|
||||
ret += progressRecord
|
||||
|
@ -238,24 +251,28 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
if (progressFileOption.isEmpty) {
|
||||
// if no progress file, then start from the beginning of the streams
|
||||
val connectedEventHubs = eventHubNameAndPartitions.find {
|
||||
case (connectorUID, _) => connectorUID == targetConnectorUID}
|
||||
require(connectedEventHubs.isDefined, s"cannot find $targetConnectorUID in" +
|
||||
s" $eventHubNameAndPartitions")
|
||||
case (connectorUID, _) => connectorUID == targetConnectorUID
|
||||
}
|
||||
require(connectedEventHubs.isDefined,
|
||||
s"cannot find $targetConnectorUID in" +
|
||||
s" $eventHubNameAndPartitions")
|
||||
// it's hacky to take timestamp -1 as the start of streams
|
||||
readTimestamp = -1
|
||||
recordToReturn = connectedEventHubs.get._2.map(
|
||||
(_, (PartitionReceiver.START_OF_STREAM.toLong, -1L))).toMap
|
||||
recordToReturn =
|
||||
connectedEventHubs.get._2.map((_, (PartitionReceiver.START_OF_STREAM.toLong, -1L))).toMap
|
||||
} else {
|
||||
val expectedTimestamp = fromPathToTimestamp(progressFileOption.get)
|
||||
val progressFilePath = progressFileOption.get
|
||||
val recordLines = readProgressRecordLines(progressFilePath, fs)
|
||||
require(recordLines.count(_.timestamp != expectedTimestamp) == 0, "detected inconsistent" +
|
||||
s" progress record, expected timestamp $expectedTimestamp")
|
||||
require(recordLines.count(_.timestamp != expectedTimestamp) == 0,
|
||||
"detected inconsistent" +
|
||||
s" progress record, expected timestamp $expectedTimestamp")
|
||||
readTimestamp = expectedTimestamp
|
||||
recordToReturn = recordLines.filter(
|
||||
progressRecord => progressRecord.uid == targetConnectorUID).map(
|
||||
progressRecord => EventHubNameAndPartition(progressRecord.eventHubName,
|
||||
progressRecord.partitionId) -> (progressRecord.offset, progressRecord.seqId)).toMap
|
||||
recordToReturn = recordLines
|
||||
.filter(progressRecord => progressRecord.uid == targetConnectorUID)
|
||||
.map(progressRecord =>
|
||||
EventHubNameAndPartition(progressRecord.eventHubName, progressRecord.partitionId) -> (progressRecord.offset, progressRecord.seqId))
|
||||
.toMap
|
||||
}
|
||||
} catch {
|
||||
case ias: IllegalArgumentException =>
|
||||
|
@ -273,16 +290,20 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
var oos: FSDataOutputStream = null
|
||||
try {
|
||||
// write progress file
|
||||
oos = fs.create(new Path(s"$progressDirectoryPath/${PathTools.makeProgressFileName(
|
||||
commitTime)}"), true)
|
||||
oos = fs.create(
|
||||
new Path(s"$progressDirectoryPath/${PathTools.makeProgressFileName(commitTime)}"),
|
||||
true)
|
||||
offsetToCommit.foreach {
|
||||
case (namespace, ehNameAndPartitionToOffsetAndSeq) =>
|
||||
ehNameAndPartitionToOffsetAndSeq.foreach {
|
||||
case (nameAndPartitionId, (offset, seq)) =>
|
||||
oos.writeBytes(
|
||||
ProgressRecord(commitTime, namespace,
|
||||
nameAndPartitionId.eventHubName, nameAndPartitionId.partitionId, offset,
|
||||
seq).toString + "\n"
|
||||
ProgressRecord(commitTime,
|
||||
namespace,
|
||||
nameAndPartitionId.eventHubName,
|
||||
nameAndPartitionId.partitionId,
|
||||
offset,
|
||||
seq).toString + "\n"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
@ -301,8 +322,9 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
private def createMetadata(fs: FileSystem, commitTime: Long): Boolean = {
|
||||
var oos: FSDataOutputStream = null
|
||||
try {
|
||||
oos = fs.create(new Path(s"$metadataDirectoryStr/" + s"${PathTools.makeMetadataFileName(
|
||||
commitTime)}"), true)
|
||||
oos = fs.create(
|
||||
new Path(s"$metadataDirectoryStr/" + s"${PathTools.makeMetadataFileName(commitTime)}"),
|
||||
true)
|
||||
true
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
|
@ -316,29 +338,29 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
}
|
||||
|
||||
// write offsetToCommit to a progress tracking file
|
||||
private def transaction(
|
||||
offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
|
||||
fs: FileSystem,
|
||||
commitTime: Long): Unit = {
|
||||
private def transaction(offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
|
||||
fs: FileSystem,
|
||||
commitTime: Long): Unit = {
|
||||
if (createProgressFile(offsetToCommit, fs, commitTime)) {
|
||||
if (!createMetadata(fs, commitTime)) {
|
||||
logError(s"cannot create progress file at $commitTime")
|
||||
throw new IOException(s"cannot create metadata file at $commitTime," +
|
||||
s" check the previous exception for the root cause")
|
||||
throw new IOException(
|
||||
s"cannot create metadata file at $commitTime," +
|
||||
s" check the previous exception for the root cause")
|
||||
}
|
||||
} else {
|
||||
logError(s"cannot create progress file at $commitTime")
|
||||
throw new IOException(s"cannot create progress file at $commitTime," +
|
||||
s" check the previous exception for the root cause")
|
||||
throw new IOException(
|
||||
s"cannot create progress file at $commitTime," +
|
||||
s" check the previous exception for the root cause")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* commit offsetToCommit to a new progress tracking file
|
||||
*/
|
||||
def commit(
|
||||
offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
|
||||
commitTime: Long): Unit = {
|
||||
def commit(offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
|
||||
commitTime: Long): Unit = {
|
||||
val fs = new Path(progressDir).getFileSystem(hadoopConfiguration)
|
||||
try {
|
||||
transaction(offsetToCommit, fs, commitTime)
|
||||
|
@ -352,25 +374,28 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
}
|
||||
}
|
||||
|
||||
private def allProgressRecords(
|
||||
timestamp: Long,
|
||||
ehConnectors: List[EventHubsConnector]): List[Path] = {
|
||||
private def allProgressRecords(timestamp: Long,
|
||||
ehConnectors: List[EventHubsConnector]): List[Path] = {
|
||||
val fs = tempDirectoryPath.getFileSystem(hadoopConfiguration)
|
||||
ehConnectors.flatMap { ehConnector =>
|
||||
ehConnector.connectedInstances.map(ehNameAndPartition =>
|
||||
PathTools.makeTempFilePath(
|
||||
tempDirectoryStr, ehConnector.streamId, ehConnector.uid, ehNameAndPartition, timestamp))
|
||||
}.filter(fs.exists)
|
||||
ehConnectors
|
||||
.flatMap { ehConnector =>
|
||||
ehConnector.connectedInstances.map(
|
||||
ehNameAndPartition =>
|
||||
PathTools.makeTempFilePath(tempDirectoryStr,
|
||||
ehConnector.streamId,
|
||||
ehConnector.uid,
|
||||
ehNameAndPartition,
|
||||
timestamp))
|
||||
}
|
||||
.filter(fs.exists)
|
||||
}
|
||||
|
||||
/**
|
||||
* read progress records from temp directories
|
||||
* @return Map(Namespace -> Map(EventHubNameAndPartition -> (Offset, Seq))
|
||||
*/
|
||||
def collectProgressRecordsForBatch(
|
||||
timestamp: Long,
|
||||
ehConnectors: List[EventHubsConnector]):
|
||||
Map[String, Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
def collectProgressRecordsForBatch(timestamp: Long, ehConnectors: List[EventHubsConnector])
|
||||
: Map[String, Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
val records = new ListBuffer[ProgressRecord]
|
||||
val ret = new mutable.HashMap[String, Map[EventHubNameAndPartition, (Long, Long)]]
|
||||
try {
|
||||
|
@ -384,10 +409,11 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
// check timestamp consistency
|
||||
records.foreach(progressRecord =>
|
||||
if (timestamp != progressRecord.timestamp) {
|
||||
throw new IllegalStateException(s"detect inconsistent progress tracking file at" +
|
||||
s" $progressRecord, expected timestamp: $timestamp, it might be a bug in the" +
|
||||
s" implementation of underlying file system")
|
||||
})
|
||||
throw new IllegalStateException(
|
||||
s"detect inconsistent progress tracking file at" +
|
||||
s" $progressRecord, expected timestamp: $timestamp, it might be a bug in the" +
|
||||
s" implementation of underlying file system")
|
||||
})
|
||||
} catch {
|
||||
case ioe: IOException =>
|
||||
logError(s"error: ${ioe.getMessage}")
|
||||
|
@ -410,11 +436,15 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
|
||||
def cleanProgressFile(timestampToClean: Long): Unit = {
|
||||
val fs = progressDirectoryPath.getFileSystem(hadoopConfiguration)
|
||||
val allUselessFiles = fs.listStatus(progressDirectoryPath, new PathFilter {
|
||||
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
|
||||
}).map(_.getPath)
|
||||
val sortedFileList = allUselessFiles.sortWith((p1, p2) => fromPathToTimestamp(p1) >
|
||||
fromPathToTimestamp(p2))
|
||||
val allUselessFiles = fs
|
||||
.listStatus(progressDirectoryPath, new PathFilter {
|
||||
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
|
||||
})
|
||||
.map(_.getPath)
|
||||
val sortedFileList = allUselessFiles.sortWith(
|
||||
(p1, p2) =>
|
||||
fromPathToTimestamp(p1) >
|
||||
fromPathToTimestamp(p2))
|
||||
if (sortedFileList.nonEmpty) {
|
||||
sortedFileList.tail.foreach { filePath =>
|
||||
logInfo(s"delete $filePath")
|
||||
|
@ -422,15 +452,22 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
}
|
||||
}
|
||||
// clean temp directory
|
||||
val allUselessTempFiles = fs.listStatus(tempDirectoryPath, new PathFilter {
|
||||
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
|
||||
}).map(_.getPath)
|
||||
val allUselessTempFiles = fs
|
||||
.listStatus(tempDirectoryPath, new PathFilter {
|
||||
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
|
||||
})
|
||||
.map(_.getPath)
|
||||
if (allUselessTempFiles.nonEmpty) {
|
||||
allUselessTempFiles.groupBy(fromPathToTimestamp).toList.sortWith((p1, p2) => p1._1 > p2._1).
|
||||
tail.flatMap(_._2).foreach {
|
||||
filePath => logInfo(s"delete $filePath")
|
||||
allUselessTempFiles
|
||||
.groupBy(fromPathToTimestamp)
|
||||
.toList
|
||||
.sortWith((p1, p2) => p1._1 > p2._1)
|
||||
.tail
|
||||
.flatMap(_._2)
|
||||
.foreach { filePath =>
|
||||
logInfo(s"delete $filePath")
|
||||
fs.delete(filePath, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -439,11 +476,12 @@ private[spark] abstract class ProgressTrackerBase[T <: EventHubsConnector](
|
|||
override def run(): Unit = {
|
||||
val fs = metadataDirectoryPath.getFileSystem(new Configuration())
|
||||
val allMetadataFiles = fs.listStatus(metadataDirectoryPath)
|
||||
val sortedMetadataFiles = allMetadataFiles.sortWith((f1, f2) => f1.getPath.getName.toLong <
|
||||
f2.getPath.getName.toLong)
|
||||
sortedMetadataFiles.take(math.max(sortedMetadataFiles.length - 1, 0)).map{
|
||||
file =>
|
||||
fs.delete(file.getPath, true)
|
||||
val sortedMetadataFiles = allMetadataFiles.sortWith(
|
||||
(f1, f2) =>
|
||||
f1.getPath.getName.toLong <
|
||||
f2.getPath.getName.toLong)
|
||||
sortedMetadataFiles.take(math.max(sortedMetadataFiles.length - 1, 0)).map { file =>
|
||||
fs.delete(file.getPath, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -20,19 +20,19 @@ package org.apache.spark.eventhubscommon.progress
import java.io.IOException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataOutputStream, Path}
import org.apache.hadoop.fs.{ FSDataOutputStream, Path }

import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.internal.Logging

private[spark] class ProgressWriter(
    streamId: Int,
    uid: String,
    eventHubNameAndPartition: EventHubNameAndPartition,
    timestamp: Long,
    hadoopConfiguration: Configuration,
    progressDir: String,
    subDirIdentifiers: String*) extends Logging {
private[spark] class ProgressWriter(streamId: Int,
                                    uid: String,
                                    eventHubNameAndPartition: EventHubNameAndPartition,
                                    timestamp: Long,
                                    hadoopConfiguration: Configuration,
                                    progressDir: String,
                                    subDirIdentifiers: String*)
    extends Logging {

  // TODO: Why can't we get this info from one of the ProgressTrackers?
  // TODO: Come up with better name for this guy

@@ -51,9 +51,12 @@ private[spark] class ProgressWriter(
      // it would be safe to overwrite checkpoint, since we will not start a new job when
      // checkpoint hasn't been committed
      cpFileStream = fs.create(tempProgressTrackingPointPath, true)
      val record = ProgressRecord(recordTime, uid,
        eventHubNameAndPartition.eventHubName, eventHubNameAndPartition.partitionId, cpOffset,
        cpSeq)
      val record = ProgressRecord(recordTime,
                                  uid,
                                  eventHubNameAndPartition.eventHubName,
                                  eventHubNameAndPartition.partitionId,
                                  cpOffset,
                                  cpSeq)
      cpFileStream.writeBytes(s"$record")
    } catch {
      case ioe: IOException =>

@@ -66,5 +69,3 @@ private[spark] class ProgressWriter(
    }
  }
}
@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.spark.eventhubscommon.rdd
|
||||
|
||||
// scalastyle:off
|
||||
import scala.collection.mutable.ListBuffer
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
|
@ -29,49 +28,55 @@ import org.apache.spark.eventhubscommon.EventHubNameAndPartition
|
|||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.eventhubscommon.progress.ProgressWriter
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.{Partition, SparkContext, TaskContext}
|
||||
// scalastyle:on
|
||||
import org.apache.spark.{ Partition, SparkContext, TaskContext }
|
||||
|
||||
private class EventHubRDDPartition(
|
||||
val sparkPartitionId: Int,
|
||||
val eventHubNameAndPartitionID: EventHubNameAndPartition,
|
||||
val fromOffset: Long,
|
||||
val fromSeq: Long,
|
||||
val untilSeq: Long,
|
||||
val offsetType: EventHubsOffsetType) extends Partition {
|
||||
private class EventHubRDDPartition(val sparkPartitionId: Int,
|
||||
val eventHubNameAndPartitionID: EventHubNameAndPartition,
|
||||
val fromOffset: Long,
|
||||
val fromSeq: Long,
|
||||
val untilSeq: Long,
|
||||
val offsetType: EventHubsOffsetType)
|
||||
extends Partition {
|
||||
|
||||
override def index: Int = sparkPartitionId
|
||||
}
|
||||
|
||||
private[spark] class EventHubsRDD(
|
||||
sc: SparkContext,
|
||||
eventHubsParamsMap: Map[String, Map[String, String]],
|
||||
val offsetRanges: List[OffsetRange],
|
||||
batchTime: Long,
|
||||
offsetParams: OffsetStoreParams,
|
||||
eventHubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
|
||||
EventHubsClientWrapper)
|
||||
extends RDD[EventData](sc, Nil) {
|
||||
private[spark] class EventHubsRDD(sc: SparkContext,
|
||||
eventHubsParamsMap: Map[String, Map[String, String]],
|
||||
val offsetRanges: List[OffsetRange],
|
||||
batchTime: Long,
|
||||
offsetParams: OffsetStoreParams,
|
||||
eventHubReceiverCreator: (Map[String, String],
|
||||
Int,
|
||||
Long,
|
||||
EventHubsOffsetType,
|
||||
Int) => EventHubsClientWrapper)
|
||||
extends RDD[EventData](sc, Nil) {
|
||||
|
||||
override def getPartitions: Array[Partition] = {
|
||||
offsetRanges.zipWithIndex.map { case (offsetRange, index) =>
|
||||
new EventHubRDDPartition(index, offsetRange.eventHubNameAndPartition, offsetRange.fromOffset,
|
||||
offsetRange.fromSeq, offsetRange.untilSeq, offsetRange.offsetType)
|
||||
offsetRanges.zipWithIndex.map {
|
||||
case (offsetRange, index) =>
|
||||
new EventHubRDDPartition(index,
|
||||
offsetRange.eventHubNameAndPartition,
|
||||
offsetRange.fromOffset,
|
||||
offsetRange.fromSeq,
|
||||
offsetRange.untilSeq,
|
||||
offsetRange.offsetType)
|
||||
}.toArray
|
||||
}
|
||||
|
||||
private def wrappingReceive(
|
||||
eventHubNameAndPartition: EventHubNameAndPartition,
|
||||
eventHubClient: EventHubsClientWrapper,
|
||||
expectedEventNumber: Int,
|
||||
expectedHighestSeqNum: Long): List[EventData] = {
|
||||
private def wrappingReceive(eventHubNameAndPartition: EventHubNameAndPartition,
|
||||
eventHubClient: EventHubsClientWrapper,
|
||||
expectedEventNumber: Int,
|
||||
expectedHighestSeqNum: Long): List[EventData] = {
|
||||
val receivedBuffer = new ListBuffer[EventData]
|
||||
val receivingTrace = new ListBuffer[Long]
|
||||
var cnt = 0
|
||||
while (receivedBuffer.size < expectedEventNumber) {
|
||||
if (cnt > expectedEventNumber * 2) {
|
||||
throw new Exception(s"$eventHubNameAndPartition cannot return data, the trace is" +
|
||||
s" ${receivingTrace.toList}")
|
||||
throw new Exception(
|
||||
s"$eventHubNameAndPartition cannot return data, the trace is" +
|
||||
s" ${receivingTrace.toList}")
|
||||
}
|
||||
val receivedEventsItr = eventHubClient.receive(expectedEventNumber - receivedBuffer.size)
|
||||
if (receivedEventsItr == null) {
|
||||
|
@ -83,7 +88,7 @@ private[spark] class EventHubsRDD(
|
|||
cnt += 1
|
||||
receivedBuffer ++= receivedEvents
|
||||
if (receivedBuffer.nonEmpty &&
|
||||
receivedBuffer.last.getSystemProperties.getSequenceNumber >= expectedHighestSeqNum) {
|
||||
receivedBuffer.last.getSystemProperties.getSequenceNumber >= expectedHighestSeqNum) {
|
||||
// this is for the case where user has passed in filtering params and the remaining
|
||||
// msg number is less than expectedEventNumber
|
||||
return receivedBuffer.toList
|
||||
|
@ -92,60 +97,69 @@ private[spark] class EventHubsRDD(
|
|||
receivedBuffer.toList
|
||||
}
|
||||
|
||||
private def processFullyConsumedPartition(
|
||||
ehRDDPartition: EventHubRDDPartition, progressWriter: ProgressWriter): Iterator[EventData] = {
|
||||
private def processFullyConsumedPartition(ehRDDPartition: EventHubRDDPartition,
|
||||
progressWriter: ProgressWriter): Iterator[EventData] = {
|
||||
logInfo(s"No new data in ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
|
||||
val fromOffset = ehRDDPartition.fromOffset
|
||||
progressWriter.write(batchTime, ehRDDPartition.fromOffset,
|
||||
ehRDDPartition.fromSeq)
|
||||
logInfo(s"write offset $fromOffset, sequence number" +
|
||||
s" ${ehRDDPartition.fromSeq} for EventHub" +
|
||||
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
|
||||
progressWriter.write(batchTime, ehRDDPartition.fromOffset, ehRDDPartition.fromSeq)
|
||||
logInfo(
|
||||
s"write offset $fromOffset, sequence number" +
|
||||
s" ${ehRDDPartition.fromSeq} for EventHub" +
|
||||
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
|
||||
Iterator()
|
||||
}
|
||||
|
||||
private def extractOffsetAndSeqToWrite(
|
||||
receivedEvents: List[EventData],
|
||||
eventHubReceiver: EventHubsClientWrapper,
|
||||
ehRDDPartition: EventHubRDDPartition): (Long, Long) = {
|
||||
private def extractOffsetAndSeqToWrite(receivedEvents: List[EventData],
|
||||
eventHubReceiver: EventHubsClientWrapper,
|
||||
ehRDDPartition: EventHubRDDPartition): (Long, Long) = {
|
||||
if (receivedEvents.nonEmpty) {
|
||||
val lastEvent = receivedEvents.last
|
||||
(lastEvent.getSystemProperties.getOffset.toLong,
|
||||
lastEvent.getSystemProperties.getSequenceNumber)
|
||||
lastEvent.getSystemProperties.getSequenceNumber)
|
||||
} else {
|
||||
val partitionInfo = eventHubReceiver.eventhubsClient.getPartitionRuntimeInformation(
|
||||
ehRDDPartition.eventHubNameAndPartitionID.partitionId.toString).get()
|
||||
val partitionInfo = eventHubReceiver.eventhubsClient
|
||||
.getPartitionRuntimeInformation(
|
||||
ehRDDPartition.eventHubNameAndPartitionID.partitionId.toString)
|
||||
.get()
|
||||
(partitionInfo.getLastEnqueuedOffset.toLong, partitionInfo.getLastEnqueuedSequenceNumber)
|
||||
}
|
||||
}
|
||||
|
||||
private def retrieveDataFromPartition(
|
||||
ehRDDPartition: EventHubRDDPartition, progressWriter: ProgressWriter): Iterator[EventData] = {
|
||||
private def retrieveDataFromPartition(ehRDDPartition: EventHubRDDPartition,
|
||||
progressWriter: ProgressWriter): Iterator[EventData] = {
|
||||
val fromOffset = ehRDDPartition.fromOffset
|
||||
val fromSeq = ehRDDPartition.fromSeq
|
||||
val untilSeq = ehRDDPartition.untilSeq
|
||||
val maxRate = (untilSeq - fromSeq).toInt
|
||||
val startTime = System.currentTimeMillis()
|
||||
logInfo(s"${ehRDDPartition.eventHubNameAndPartitionID}" +
|
||||
s" expected rate $maxRate, fromSeq $fromSeq (exclusive) untilSeq" +
|
||||
s" $untilSeq (inclusive) at $batchTime")
|
||||
logInfo(
|
||||
s"${ehRDDPartition.eventHubNameAndPartitionID}" +
|
||||
s" expected rate $maxRate, fromSeq $fromSeq (exclusive) untilSeq" +
|
||||
s" $untilSeq (inclusive) at $batchTime")
|
||||
var eventHubReceiver: EventHubsClientWrapper = null
|
||||
try {
|
||||
val eventHubParameters = eventHubsParamsMap(ehRDDPartition.eventHubNameAndPartitionID.
|
||||
eventHubName)
|
||||
eventHubReceiver = eventHubReceiverCreator(eventHubParameters,
|
||||
ehRDDPartition.eventHubNameAndPartitionID.partitionId, fromOffset,
|
||||
ehRDDPartition.offsetType, maxRate)
|
||||
val eventHubParameters = eventHubsParamsMap(
|
||||
ehRDDPartition.eventHubNameAndPartitionID.eventHubName)
|
||||
eventHubReceiver = eventHubReceiverCreator(
|
||||
eventHubParameters,
|
||||
ehRDDPartition.eventHubNameAndPartitionID.partitionId,
|
||||
fromOffset,
|
||||
ehRDDPartition.offsetType,
|
||||
maxRate)
|
||||
val receivedEvents = wrappingReceive(ehRDDPartition.eventHubNameAndPartitionID,
|
||||
eventHubReceiver, maxRate, ehRDDPartition.untilSeq)
|
||||
logInfo(s"received ${receivedEvents.length} messages before Event Hubs server indicates" +
|
||||
s" there is no more messages, time cost:" +
|
||||
s" ${(System.currentTimeMillis() - startTime) / 1000.0} seconds")
|
||||
val (offsetToWrite, seqToWrite) = extractOffsetAndSeqToWrite(receivedEvents, eventHubReceiver,
|
||||
ehRDDPartition)
|
||||
eventHubReceiver,
|
||||
maxRate,
|
||||
ehRDDPartition.untilSeq)
|
||||
logInfo(
|
||||
s"received ${receivedEvents.length} messages before Event Hubs server indicates" +
|
||||
s" there is no more messages, time cost:" +
|
||||
s" ${(System.currentTimeMillis() - startTime) / 1000.0} seconds")
|
||||
val (offsetToWrite, seqToWrite) =
|
||||
extractOffsetAndSeqToWrite(receivedEvents, eventHubReceiver, ehRDDPartition)
|
||||
progressWriter.write(batchTime, offsetToWrite, seqToWrite)
|
||||
logInfo(s"write offset $offsetToWrite, sequence number $seqToWrite for EventHub" +
|
||||
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
|
||||
logInfo(
|
||||
s"write offset $offsetToWrite, sequence number $seqToWrite for EventHub" +
|
||||
s" ${ehRDDPartition.eventHubNameAndPartitionID} at $batchTime")
|
||||
receivedEvents.iterator
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
|
@ -161,9 +175,15 @@ private[spark] class EventHubsRDD(
|
|||
@DeveloperApi
|
||||
override def compute(split: Partition, context: TaskContext): Iterator[EventData] = {
|
||||
val ehRDDPartition = split.asInstanceOf[EventHubRDDPartition]
|
||||
val progressWriter = new ProgressWriter(offsetParams.streamId, offsetParams.uid,
|
||||
ehRDDPartition.eventHubNameAndPartitionID, batchTime, new Configuration(),
|
||||
offsetParams.checkpointDir, offsetParams.subDirs: _*)
|
||||
val progressWriter = new ProgressWriter(
|
||||
offsetParams.streamId,
|
||||
offsetParams.uid,
|
||||
ehRDDPartition.eventHubNameAndPartitionID,
|
||||
batchTime,
|
||||
new Configuration(),
|
||||
offsetParams.checkpointDir,
|
||||
offsetParams.subDirs: _*
|
||||
)
|
||||
if (ehRDDPartition.fromSeq >= ehRDDPartition.untilSeq) {
|
||||
processFullyConsumedPartition(ehRDDPartition, progressWriter)
|
||||
} else {
|
||||
|
@ -171,4 +191,3 @@ private[spark] class EventHubsRDD(
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,12 +20,11 @@ package org.apache.spark.eventhubscommon.rdd
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType

private[spark] case class OffsetRange(
    eventHubNameAndPartition: EventHubNameAndPartition,
    fromOffset: Long,
    fromSeq: Long,
    untilSeq: Long,
    offsetType: EventHubsOffsetType) {
private[spark] case class OffsetRange(eventHubNameAndPartition: EventHubNameAndPartition,
                                      fromOffset: Long,
                                      fromSeq: Long,
                                      untilSeq: Long,
                                      offsetType: EventHubsOffsetType) {

  private[spark] def toTuple = (eventHubNameAndPartition, fromOffset, fromSeq, untilSeq, offsetType)
}
@@ -18,5 +18,7 @@
package org.apache.spark.eventhubscommon.rdd

// a helper object to avoid serialzing offsetstore instances
private[spark] case class OffsetStoreParams(
    checkpointDir: String, streamId: Int, uid: String, subDirs: String*)
private[spark] case class OffsetStoreParams(checkpointDir: String,
                                            streamId: Int,
                                            uid: String,
                                            subDirs: String*)
@@ -28,7 +28,9 @@ import org.apache.spark.sql.execution.streaming.Offset

// the descriptor of EventHubsBatchRecord to communicate with StreamExecution
private[streaming] case class EventHubsBatchRecord(
    batchId: Long, targetSeqNums: Map[EventHubNameAndPartition, Long]) extends Offset {
    batchId: Long,
    targetSeqNums: Map[EventHubNameAndPartition, Long])
    extends Offset {
  override def json: String = JsonUtils.partitionAndSeqNum(batchId, targetSeqNums)
}

@@ -37,8 +39,10 @@ private object JsonUtils {

  def partitionAndSeqNum(batchId: Long, seqNums: Map[EventHubNameAndPartition, Long]): String = {
    val convertedStringIndexedMap = new mutable.HashMap[String, Long]
    seqNums.foreach{case (eventHubNameAndPartition, offsetAndSeqNum) =>
      convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum}
    seqNums.foreach {
      case (eventHubNameAndPartition, offsetAndSeqNum) =>
        convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum
    }
    Serialization.write((batchId, convertedStringIndexedMap.toMap))
  }

@@ -46,19 +50,23 @@ private object JsonUtils {
    try {
      val deserializedTuple = Serialization.read[(Int, Map[String, Long])](jsonStr)
      val batchId = deserializedTuple._1
      EventHubsBatchRecord(batchId, deserializedTuple._2.map{case (ehNameAndPartitionStr, seqNum) =>
        (EventHubNameAndPartition.fromString(ehNameAndPartitionStr), seqNum)})
      EventHubsBatchRecord(batchId, deserializedTuple._2.map {
        case (ehNameAndPartitionStr, seqNum) =>
          (EventHubNameAndPartition.fromString(ehNameAndPartitionStr), seqNum)
      })
    } catch {
      case NonFatal(x) =>
        throw new IllegalArgumentException(s"failed to parse $jsonStr")
    }
  }

  def partitionOffsetAndSeqNums(
      batchId: Long, offsets: Map[EventHubNameAndPartition, (Long, Long)]): String = {
  def partitionOffsetAndSeqNums(batchId: Long,
                                offsets: Map[EventHubNameAndPartition, (Long, Long)]): String = {
    val convertedStringIndexedMap = new mutable.HashMap[String, (Long, Long)]
    offsets.foreach{case (eventHubNameAndPartition, offsetAndSeqNum) =>
      convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum}
    offsets.foreach {
      case (eventHubNameAndPartition, offsetAndSeqNum) =>
        convertedStringIndexedMap += eventHubNameAndPartition.toString -> offsetAndSeqNum
    }
    Serialization.write((batchId, convertedStringIndexedMap))
  }
}
@ -20,27 +20,42 @@ package org.apache.spark.sql.streaming.eventhubs
|
|||
import java.util.concurrent.Executors
|
||||
import java.util.concurrent.atomic.AtomicInteger
|
||||
|
||||
import scala.concurrent.{ExecutionContext, Future}
|
||||
import scala.util.{Failure, Success}
|
||||
import scala.concurrent.{ ExecutionContext, Future }
|
||||
import scala.util.{ Failure, Success }
|
||||
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord, RateControlUtils}
|
||||
import org.apache.spark.eventhubscommon.client.{AMQPEventHubsClient, EventHubClient, EventHubsClientWrapper, RestfulEventHubClient}
|
||||
import org.apache.spark.eventhubscommon.{
|
||||
EventHubNameAndPartition,
|
||||
EventHubsConnector,
|
||||
OffsetRecord,
|
||||
RateControlUtils
|
||||
}
|
||||
import org.apache.spark.eventhubscommon.client.{
|
||||
AMQPEventHubsClient,
|
||||
Client,
|
||||
EventHubsClientWrapper
|
||||
}
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.eventhubscommon.rdd.{EventHubsRDD, OffsetRange, OffsetStoreParams}
|
||||
import org.apache.spark.eventhubscommon.rdd.{ EventHubsRDD, OffsetRange, OffsetStoreParams }
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
|
||||
import org.apache.spark.sql.execution.streaming.{Offset, SerializedOffset, Source}
|
||||
import org.apache.spark.sql.{ DataFrame, Row, SQLContext }
|
||||
import org.apache.spark.sql.execution.streaming.{ Offset, SerializedOffset, Source }
|
||||
import org.apache.spark.sql.streaming.eventhubs.checkpoint.StructuredStreamingProgressTracker
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
private[spark] class EventHubsSource(
|
||||
sqlContext: SQLContext,
|
||||
eventHubsParams: Map[String, String],
|
||||
eventhubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
|
||||
EventHubsClientWrapper = EventHubsClientWrapper.getEventHubReceiver,
|
||||
eventhubClientCreator: (String, Map[String, Map[String, String]]) =>
|
||||
EventHubClient = AMQPEventHubsClient.getInstance)
|
||||
extends Source with EventHubsConnector with Logging {
|
||||
eventhubReceiverCreator: (Map[String, String],
|
||||
Int,
|
||||
Long,
|
||||
EventHubsOffsetType,
|
||||
Int) => EventHubsClientWrapper =
|
||||
EventHubsClientWrapper.getEventHubReceiver,
|
||||
eventhubClientCreator: (String, Map[String, Map[String, String]]) => Client =
|
||||
AMQPEventHubsClient.getInstance)
|
||||
extends Source
|
||||
with EventHubsConnector
|
||||
with Logging {
|
||||
|
||||
case class EventHubsOffset(batchId: Long, offsets: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
|
||||
|
@ -53,15 +68,15 @@ private[spark] class EventHubsSource(
|
|||
require(eventHubsNamespace != null, "eventhubs.namespace is not defined")
|
||||
require(eventHubsName != null, "eventhubs.name is not defined")
|
||||
|
||||
private var _eventHubsClient: EventHubClient = _
|
||||
private var _eventHubsClient: Client = _
|
||||
|
||||
private var _eventHubsReceiver: (Map[String, String], Int, Long, EventHubsOffsetType, Int)
|
||||
=> EventHubsClientWrapper = _
|
||||
private var _eventHubsReceiver
|
||||
: (Map[String, String], Int, Long, EventHubsOffsetType, Int) => EventHubsClientWrapper = _
|
||||
|
||||
private[eventhubs] def eventHubClient = {
|
||||
if (_eventHubsClient == null) {
|
||||
_eventHubsClient = eventhubClientCreator(eventHubsNamespace,
|
||||
Map(eventHubsName -> eventHubsParams))
|
||||
_eventHubsClient =
|
||||
eventhubClientCreator(eventHubsNamespace, Map(eventHubsName -> eventHubsParams))
|
||||
}
|
||||
_eventHubsClient
|
||||
}
|
||||
|
@ -79,8 +94,8 @@ private[spark] class EventHubsSource(
|
|||
yield EventHubNameAndPartition(eventHubsName, partitionId)).toList
|
||||
}
|
||||
|
||||
private implicit val cleanupExecutorService = ExecutionContext.fromExecutor(
|
||||
Executors.newFixedThreadPool(1))
|
||||
private implicit val cleanupExecutorService =
|
||||
ExecutionContext.fromExecutor(Executors.newFixedThreadPool(1))
|
||||
|
||||
// EventHubsSource is created for each instance of program, that means it is different with
|
||||
// DStream which will load the serialized Direct DStream instance from checkpoint
|
||||
|
@ -88,17 +103,22 @@ private[spark] class EventHubsSource(
|
|||
|
||||
// initialize ProgressTracker
|
||||
private val progressTracker = StructuredStreamingProgressTracker.initInstance(
|
||||
uid, eventHubsParams("eventhubs.progressTrackingDir"), sqlContext.sparkContext.appName,
|
||||
uid,
|
||||
eventHubsParams("eventhubs.progressTrackingDir"),
|
||||
sqlContext.sparkContext.appName,
|
||||
sqlContext.sparkContext.hadoopConfiguration)
|
||||
|
||||
private[spark] def setEventHubClient(eventHubClient: EventHubClient): EventHubsSource = {
|
||||
private[spark] def setEventHubClient(eventHubClient: Client): EventHubsSource = {
|
||||
_eventHubsClient = eventHubClient
|
||||
this
|
||||
}
|
||||
|
||||
private[spark] def setEventHubsReceiver(
|
||||
eventhubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
|
||||
EventHubsClientWrapper): EventHubsSource = {
|
||||
eventhubReceiverCreator: (Map[String, String],
|
||||
Int,
|
||||
Long,
|
||||
EventHubsOffsetType,
|
||||
Int) => EventHubsClientWrapper): EventHubsSource = {
|
||||
_eventHubsReceiver = eventhubReceiverCreator
|
||||
this
|
||||
}
|
||||
|
@ -116,18 +136,20 @@ private[spark] class EventHubsSource(
|
|||
EventHubsSourceProvider.sourceSchema(eventHubsParams)
|
||||
}
|
||||
|
||||
private[spark] def composeHighestOffset(retryIfFail: Boolean):
|
||||
Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
RateControlUtils.fetchLatestOffset(eventHubClient,
|
||||
private[spark] def composeHighestOffset(
|
||||
retryIfFail: Boolean): Option[Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
RateControlUtils.fetchLatestOffset(
|
||||
eventHubClient,
|
||||
retryIfFail = retryIfFail,
|
||||
if (fetchedHighestOffsetsAndSeqNums == null) {
|
||||
committedOffsetsAndSeqNums.offsets
|
||||
} else {
|
||||
fetchedHighestOffsetsAndSeqNums.offsets
|
||||
}) match {
|
||||
}
|
||||
) match {
|
||||
case Some(highestOffsets) =>
|
||||
fetchedHighestOffsetsAndSeqNums = EventHubsOffset(committedOffsetsAndSeqNums.batchId,
|
||||
highestOffsets)
|
||||
fetchedHighestOffsetsAndSeqNums =
|
||||
EventHubsOffset(committedOffsetsAndSeqNums.batchId, highestOffsets)
|
||||
Some(fetchedHighestOffsetsAndSeqNums.offsets)
|
||||
case _ =>
|
||||
logWarning(s"failed to fetch highest offset")
|
||||
|
@ -144,8 +166,9 @@ private[spark] class EventHubsSource(
|
|||
* idea about the highest offset, we shall fail the app when rest endpoint is not responsive, and
|
||||
* to prevent us from dying too much, we shall retry with 2-power interval in this case
|
||||
*/
|
||||
private def failAppIfRestEndpointFail = fetchedHighestOffsetsAndSeqNums == null ||
|
||||
committedOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
|
||||
private def failAppIfRestEndpointFail =
|
||||
fetchedHighestOffsetsAndSeqNums == null ||
|
||||
committedOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
|
||||
|
||||
private def cleanupFiles(batchIdToClean: Long): Unit = {
|
||||
Future {
|
||||
|
@ -154,8 +177,9 @@ private[spark] class EventHubsSource(
|
|||
case Success(r) =>
|
||||
logInfo(s"finished cleanup for batch $batchIdToClean")
|
||||
case Failure(exception) =>
|
||||
logWarning(s"error happened when clean up for batch $batchIdToClean," +
|
||||
s" $exception")
|
||||
logWarning(
|
||||
s"error happened when clean up for batch $batchIdToClean," +
|
||||
s" $exception")
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -168,8 +192,9 @@ private[spark] class EventHubsSource(
|
|||
*/
|
||||
override def getOffset: Option[Offset] = {
|
||||
val highestOffsetsOpt = composeHighestOffset(failAppIfRestEndpointFail)
|
||||
require(highestOffsetsOpt.isDefined, "cannot get highest offset from rest endpoint of" +
|
||||
" eventhubs")
|
||||
require(highestOffsetsOpt.isDefined,
|
||||
"cannot get highest offset from rest endpoint of" +
|
||||
" eventhubs")
|
||||
if (!firstBatch) {
|
||||
// committedOffsetsAndSeqNums.batchId is always no larger than the latest finished batch id
|
||||
val lastCommittedBatchId = committedOffsetsAndSeqNums.batchId
|
||||
|
@ -182,11 +207,17 @@ private[spark] class EventHubsSource(
|
|||
firstBatch = false
|
||||
}
|
||||
val targetOffsets = RateControlUtils.clamp(committedOffsetsAndSeqNums.offsets,
|
||||
highestOffsetsOpt.get, eventHubsParams)
|
||||
Some(EventHubsBatchRecord(committedOffsetsAndSeqNums.batchId + 1,
|
||||
targetOffsets.map{case (ehNameAndPartition, seqNum) =>
|
||||
(ehNameAndPartition, math.min(seqNum,
|
||||
fetchedHighestOffsetsAndSeqNums.offsets(ehNameAndPartition)._2))}))
|
||||
highestOffsetsOpt.get,
|
||||
eventHubsParams)
|
||||
Some(
|
||||
EventHubsBatchRecord(
|
||||
committedOffsetsAndSeqNums.batchId + 1,
|
||||
targetOffsets.map {
|
||||
case (ehNameAndPartition, seqNum) =>
|
||||
(ehNameAndPartition,
|
||||
math.min(seqNum, fetchedHighestOffsetsAndSeqNums.offsets(ehNameAndPartition)._2))
|
||||
}
|
||||
))
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -200,21 +231,29 @@ private[spark] class EventHubsSource(
|
|||
// a file, we need to read the latest progress file in the directory and see if we have commit
|
||||
// the offsests (check if the timestamp matches) and then collect the files if necessary
|
||||
progressTracker.commit(Map(uid -> committedOffsetsAndSeqNums.offsets), committedBatchId)
|
||||
logInfo(s"committed offsets of batch $committedBatchId, collectedCommits:" +
|
||||
s" $committedOffsetsAndSeqNums")
|
||||
logInfo(
|
||||
s"committed offsets of batch $committedBatchId, collectedCommits:" +
|
||||
s" $committedOffsetsAndSeqNums")
|
||||
}
|
||||
|
||||
private def fetchEndingOffsetOfLastBatch(committedBatchId: Long) = {
|
||||
val startOffsetOfUndergoingBatch = progressTracker.collectProgressRecordsForBatch(
|
||||
committedBatchId, List(this))
|
||||
val startOffsetOfUndergoingBatch =
|
||||
progressTracker.collectProgressRecordsForBatch(committedBatchId, List(this))
|
||||
if (startOffsetOfUndergoingBatch.isEmpty) {
|
||||
// first batch, take the initial value of the offset, -1
|
||||
EventHubsOffset(committedBatchId, committedOffsetsAndSeqNums.offsets)
|
||||
} else {
|
||||
EventHubsOffset(committedBatchId,
|
||||
startOffsetOfUndergoingBatch.filter { case (connectorUID, _) =>
|
||||
connectorUID == uid
|
||||
}.values.head.filter(_._1.eventHubName == eventHubsParams("eventhubs.name")))
|
||||
EventHubsOffset(
|
||||
committedBatchId,
|
||||
startOffsetOfUndergoingBatch
|
||||
.filter {
|
||||
case (connectorUID, _) =>
|
||||
connectorUID == uid
|
||||
}
|
||||
.values
|
||||
.head
|
||||
.filter(_._1.eventHubName == eventHubsParams("eventhubs.name"))
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -225,24 +264,28 @@ private[spark] class EventHubsSource(
|
|||
require(startSeqs.isDefined, s"cannot fetch start seqs for eventhubs $eventHubsName")
|
||||
committedOffsetsAndSeqNums = EventHubsOffset(-1, committedOffsetsAndSeqNums.offsets.map {
|
||||
case (ehNameAndPartition, (offset, _)) =>
|
||||
(ehNameAndPartition, (offset, startSeqs.get(ehNameAndPartition)))})
|
||||
RateControlUtils.validateFilteringParams(eventHubClient, eventHubsParams,
|
||||
ehNameAndPartitions)
|
||||
(ehNameAndPartition, (offset, startSeqs.get(ehNameAndPartition)))
|
||||
})
|
||||
RateControlUtils.validateFilteringParams(eventHubClient,
|
||||
eventHubsParams,
|
||||
ehNameAndPartitions)
|
||||
RateControlUtils.composeFromOffsetWithFilteringParams(eventHubsParams,
|
||||
committedOffsetsAndSeqNums.offsets)
|
||||
committedOffsetsAndSeqNums.offsets)
|
||||
} else {
|
||||
Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)]()
|
||||
}
|
||||
}
|
||||
endOffset.targetSeqNums.map {
|
||||
case (ehNameAndPartition, seqNum) =>
|
||||
val (offsetType, offset) = RateControlUtils.calculateStartOffset(ehNameAndPartition,
|
||||
filterOffsetAndType, committedOffsetsAndSeqNums.offsets)
|
||||
val (offsetType, offset) =
|
||||
RateControlUtils.calculateStartOffset(ehNameAndPartition,
|
||||
filterOffsetAndType,
|
||||
committedOffsetsAndSeqNums.offsets)
|
||||
OffsetRange(ehNameAndPartition,
|
||||
fromOffset = offset,
|
||||
fromSeq = committedOffsetsAndSeqNums.offsets(ehNameAndPartition)._2,
|
||||
untilSeq = seqNum,
|
||||
offsetType = offsetType)
|
||||
fromOffset = offset,
|
||||
fromSeq = committedOffsetsAndSeqNums.offsets(ehNameAndPartition)._2,
|
||||
untilSeq = seqNum,
|
||||
offsetType = offsetType)
|
||||
}.toList
|
||||
}
|
||||
|
||||
|
@ -254,7 +297,10 @@ private[spark] class EventHubsSource(
|
|||
offsetRanges,
|
||||
committedOffsetsAndSeqNums.batchId + 1,
|
||||
OffsetStoreParams(eventHubsParams("eventhubs.progressTrackingDir"),
|
||||
streamId, uid = uid, subDirs = sqlContext.sparkContext.appName, uid),
|
||||
streamId,
|
||||
uid = uid,
|
||||
subDirs = sqlContext.sparkContext.appName,
|
||||
uid),
|
||||
eventHubsReceiver
|
||||
)
|
||||
}
|
||||
|
@ -263,27 +309,31 @@ private[spark] class EventHubsSource(
|
|||
import scala.collection.JavaConverters._
|
||||
val (containsProperties, userDefinedKeys) =
|
||||
EventHubsSourceProvider.ifContainsPropertiesAndUserDefinedKeys(eventHubsParams)
|
||||
val rowRDD = eventHubsRDD.map(eventData =>
|
||||
Row.fromSeq(Seq(eventData.getBytes, eventData.getSystemProperties.getOffset.toLong,
|
||||
eventData.getSystemProperties.getSequenceNumber,
|
||||
eventData.getSystemProperties.getEnqueuedTime.getEpochSecond,
|
||||
eventData.getSystemProperties.getPublisher,
|
||||
eventData.getSystemProperties.getPartitionKey
|
||||
) ++ {
|
||||
if (containsProperties) {
|
||||
if (userDefinedKeys.nonEmpty) {
|
||||
userDefinedKeys.map(k => {
|
||||
eventData.getProperties.asScala.getOrElse(k, "").toString
|
||||
})
|
||||
val rowRDD = eventHubsRDD.map(
|
||||
eventData =>
|
||||
Row.fromSeq(Seq(
|
||||
eventData.getBytes,
|
||||
eventData.getSystemProperties.getOffset.toLong,
|
||||
eventData.getSystemProperties.getSequenceNumber,
|
||||
eventData.getSystemProperties.getEnqueuedTime.getEpochSecond,
|
||||
eventData.getSystemProperties.getPublisher,
|
||||
eventData.getSystemProperties.getPartitionKey
|
||||
) ++ {
|
||||
if (containsProperties) {
|
||||
if (userDefinedKeys.nonEmpty) {
|
||||
userDefinedKeys.map(k => {
|
||||
eventData.getProperties.asScala.getOrElse(k, "").toString
|
||||
})
|
||||
} else {
|
||||
Seq(eventData.getProperties.asScala.map {
|
||||
case (k, v) =>
|
||||
k -> (if (v == null) null else v.toString)
|
||||
})
|
||||
}
|
||||
} else {
|
||||
Seq(eventData.getProperties.asScala.map { case (k, v) =>
|
||||
k -> (if (v == null) null else v.toString) })
|
||||
Seq()
|
||||
}
|
||||
} else {
|
||||
Seq()
|
||||
}
|
||||
}
|
||||
))
|
||||
}))
|
||||
sqlContext.createDataFrame(rowRDD, schema)
|
||||
}
|
||||
|
||||
|
@ -328,8 +378,8 @@ private[spark] class EventHubsSource(
|
|||
logInfo(s"recovered from a failure, startOffset: $start, endOffset: $end")
|
||||
val highestOffsets = composeHighestOffset(failAppIfRestEndpointFail)
|
||||
require(highestOffsets.isDefined, "cannot get highest offsets when recovering from a failure")
|
||||
fetchedHighestOffsetsAndSeqNums = EventHubsOffset(committedOffsetsAndSeqNums.batchId,
|
||||
highestOffsets.get)
|
||||
fetchedHighestOffsetsAndSeqNums =
|
||||
EventHubsOffset(committedOffsetsAndSeqNums.batchId, highestOffsets.get)
|
||||
firstBatch = false
|
||||
}
|
||||
|
||||
|
|
|
@ -20,28 +20,28 @@ package org.apache.spark.sql.streaming.eventhubs
|
|||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.sql.SQLContext
|
||||
import org.apache.spark.sql.execution.streaming.Source
|
||||
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
|
||||
import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider }
|
||||
import org.apache.spark.sql.types._
|
||||
|
||||
private[sql] class EventHubsSourceProvider extends DataSourceRegister
|
||||
with StreamSourceProvider with Logging {
|
||||
private[sql] class EventHubsSourceProvider
|
||||
extends DataSourceRegister
|
||||
with StreamSourceProvider
|
||||
with Logging {
|
||||
|
||||
override def shortName(): String = "eventhubs"
|
||||
|
||||
override def sourceSchema(
|
||||
sqlContext: SQLContext,
|
||||
schema: Option[StructType],
|
||||
providerName: String,
|
||||
parameters: Map[String, String]): (String, StructType) = {
|
||||
override def sourceSchema(sqlContext: SQLContext,
|
||||
schema: Option[StructType],
|
||||
providerName: String,
|
||||
parameters: Map[String, String]): (String, StructType) = {
|
||||
(shortName(), EventHubsSourceProvider.sourceSchema(parameters))
|
||||
}
|
||||
|
||||
override def createSource(
|
||||
sqlContext: SQLContext,
|
||||
metadataPath: String,
|
||||
schema: Option[StructType],
|
||||
providerName: String,
|
||||
parameters: Map[String, String]): Source = {
|
||||
override def createSource(sqlContext: SQLContext,
|
||||
metadataPath: String,
|
||||
schema: Option[StructType],
|
||||
providerName: String,
|
||||
parameters: Map[String, String]): Source = {
|
||||
// TODO: use serviceLoader to pass in customized eventhubReceiverCreator and
|
||||
// eventhubClientCreator
|
||||
new EventHubsSource(sqlContext, parameters)
|
||||
|
@ -50,10 +50,10 @@ private[sql] class EventHubsSourceProvider extends DataSourceRegister
|
|||
|
||||
private[sql] object EventHubsSourceProvider extends Serializable {
|
||||
|
||||
private[eventhubs] def ifContainsPropertiesAndUserDefinedKeys(parameters: Map[String, String]):
|
||||
(Boolean, Seq[String]) = {
|
||||
val containsProperties = parameters.getOrElse("eventhubs.sql.containsProperties",
|
||||
"false").toBoolean
|
||||
private[eventhubs] def ifContainsPropertiesAndUserDefinedKeys(
|
||||
parameters: Map[String, String]): (Boolean, Seq[String]) = {
|
||||
val containsProperties =
|
||||
parameters.getOrElse("eventhubs.sql.containsProperties", "false").toBoolean
|
||||
val userDefinedKeys = {
|
||||
if (parameters.contains("eventhubs.sql.userDefinedKeys")) {
|
||||
parameters("eventhubs.sql.userDefinedKeys").split(",").toSeq
|
||||
|
@ -66,21 +66,25 @@ private[sql] object EventHubsSourceProvider extends Serializable {
|
|||
|
||||
def sourceSchema(parameters: Map[String, String]): StructType = {
|
||||
val (containsProperties, userDefinedKeys) = ifContainsPropertiesAndUserDefinedKeys(parameters)
|
||||
StructType(Seq(
|
||||
StructField("body", BinaryType),
|
||||
StructField("offset", LongType),
|
||||
StructField("seqNumber", LongType),
|
||||
StructField("enqueuedTime", LongType),
|
||||
StructField("publisher", StringType),
|
||||
StructField("partitionKey", StringType)
|
||||
) ++ {if (containsProperties) {
|
||||
if (userDefinedKeys.nonEmpty) {
|
||||
userDefinedKeys.map(key => StructField(key, StringType))
|
||||
} else {
|
||||
Seq(StructField("properties", MapType(StringType, StringType, valueContainsNull = true)))
|
||||
}
|
||||
} else {
|
||||
Seq()
|
||||
}})
|
||||
StructType(
|
||||
Seq(
|
||||
StructField("body", BinaryType),
|
||||
StructField("offset", LongType),
|
||||
StructField("seqNumber", LongType),
|
||||
StructField("enqueuedTime", LongType),
|
||||
StructField("publisher", StringType),
|
||||
StructField("partitionKey", StringType)
|
||||
) ++ {
|
||||
if (containsProperties) {
|
||||
if (userDefinedKeys.nonEmpty) {
|
||||
userDefinedKeys.map(key => StructField(key, StringType))
|
||||
} else {
|
||||
Seq(
|
||||
StructField("properties", MapType(StringType, StringType, valueContainsNull = true)))
|
||||
}
|
||||
} else {
|
||||
Seq()
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -21,22 +21,22 @@ import scala.collection.mutable

import org.apache.hadoop.conf.Configuration

import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector}
import org.apache.spark.eventhubscommon.progress.{PathTools, ProgressTrackerBase}
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, EventHubsConnector }
import org.apache.spark.eventhubscommon.progress.{ PathTools, ProgressTrackerBase }

private[spark] class StructuredStreamingProgressTracker private[spark](
private[spark] class StructuredStreamingProgressTracker private[spark] (
    uid: String,
    progressDir: String,
    appName: String,
    hadoopConfiguration: Configuration)
  extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
    extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {

  private[spark] override lazy val progressDirectoryStr = PathTools.makeProgressDirectoryStr(
    progressDir, appName, uid)
  private[spark] override lazy val tempDirectoryStr = PathTools.makeTempDirectoryStr(progressDir,
    appName, uid)
  private[spark] override lazy val metadataDirectoryStr = PathTools.makeMetadataDirectoryStr(
    progressDir, appName, uid)
  private[spark] override lazy val progressDirectoryStr =
    PathTools.makeProgressDirectoryStr(progressDir, appName, uid)
  private[spark] override lazy val tempDirectoryStr =
    PathTools.makeTempDirectoryStr(progressDir, appName, uid)
  private[spark] override lazy val metadataDirectoryStr =
    PathTools.makeMetadataDirectoryStr(progressDir, appName, uid)

  override def eventHubNameAndPartitions: Map[String, List[EventHubNameAndPartition]] = {
    val connector = StructuredStreamingProgressTracker.registeredConnectors(uid)

@@ -67,7 +67,8 @@ private[spark] class StructuredStreamingProgressTracker private[spark](
    if (latestFile.isDefined) {
      logWarning(s"latest progress file ${latestFile.get} corrupt, rebuild file...")
      val latestFileTimestamp = fromPathToTimestamp(latestFile.get)
      val progressRecords = collectProgressRecordsForBatch(latestFileTimestamp,
      val progressRecords = collectProgressRecordsForBatch(
        latestFileTimestamp,
        List(StructuredStreamingProgressTracker.registeredConnectors(uid)))
      commit(progressRecords, latestFileTimestamp)
    }

@@ -111,9 +112,10 @@ object StructuredStreamingProgressTracker {
    this.synchronized {
      // DirectDStream shall have singleton progress tracker
      if (_progressTrackers.get(uid).isEmpty) {
        _progressTrackers += uid -> new StructuredStreamingProgressTracker(uid, progressDirStr,
          appName,
          hadoopConfiguration)
        _progressTrackers += uid -> new StructuredStreamingProgressTracker(uid,
                                                                           progressDirStr,
                                                                           appName,
                                                                           hadoopConfiguration)
      }
      _progressTrackers(uid).init()
    }
@ -17,22 +17,26 @@
|
|||
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import java.io.{IOException, ObjectInputStream}
|
||||
import java.io.{ IOException, ObjectInputStream }
|
||||
|
||||
import scala.collection.mutable
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
|
||||
import org.apache.spark.eventhubscommon._
|
||||
import org.apache.spark.eventhubscommon.client.{AMQPEventHubsClient, EventHubClient, EventHubsClientWrapper}
|
||||
import org.apache.spark.eventhubscommon.client.{
|
||||
AMQPEventHubsClient,
|
||||
Client,
|
||||
EventHubsClientWrapper
|
||||
}
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.eventhubscommon.rdd.{EventHubsRDD, OffsetRange, OffsetStoreParams}
|
||||
import org.apache.spark.eventhubscommon.rdd.{ EventHubsRDD, OffsetRange, OffsetStoreParams }
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.streaming.{StreamingContext, Time}
|
||||
import org.apache.spark.streaming.dstream.{DStreamCheckpointData, InputDStream}
|
||||
import org.apache.spark.streaming.{ StreamingContext, Time }
|
||||
import org.apache.spark.streaming.dstream.{ DStreamCheckpointData, InputDStream }
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint._
|
||||
import org.apache.spark.streaming.scheduler.{RateController, StreamInputInfo}
|
||||
import org.apache.spark.streaming.scheduler.{ RateController, StreamInputInfo }
|
||||
import org.apache.spark.streaming.scheduler.rate.RateEstimator
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
|
@ -49,11 +53,17 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
private[eventhubs] val eventHubNameSpace: String,
|
||||
progressDir: String,
|
||||
eventhubsParams: Map[String, Map[String, String]],
|
||||
eventhubReceiverCreator: (Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
|
||||
EventHubsClientWrapper = EventHubsClientWrapper.getEventHubReceiver,
|
||||
eventhubClientCreator: (String, Map[String, Map[String, String]]) =>
|
||||
EventHubClient = AMQPEventHubsClient.getInstance)
|
||||
extends InputDStream[EventData](_ssc) with EventHubsConnector with Logging {
|
||||
eventhubReceiverCreator: (Map[String, String],
|
||||
Int,
|
||||
Long,
|
||||
EventHubsOffsetType,
|
||||
Int) => EventHubsClientWrapper =
|
||||
EventHubsClientWrapper.getEventHubReceiver,
|
||||
eventhubClientCreator: (String, Map[String, Map[String, String]]) => Client =
|
||||
AMQPEventHubsClient.getInstance)
|
||||
extends InputDStream[EventData](_ssc)
|
||||
with EventHubsConnector
|
||||
with Logging {
|
||||
|
||||
private[streaming] override def name: String = s"EventHub direct stream [$id]"
|
||||
|
||||
|
@ -67,8 +77,8 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
|
||||
private val eventhubNameAndPartitions = {
|
||||
for (eventHubName <- eventhubsParams.keySet;
|
||||
partitionId <- 0 until eventhubsParams(eventHubName)(
|
||||
"eventhubs.partition.count").toInt) yield EventHubNameAndPartition(eventHubName, partitionId)
|
||||
partitionId <- 0 until eventhubsParams(eventHubName)("eventhubs.partition.count").toInt)
|
||||
yield EventHubNameAndPartition(eventHubName, partitionId)
|
||||
}
|
||||
|
||||
// uniquely identify the entities in eventhubs side, it can be the namespace or the name of a
|
||||
|
@ -87,16 +97,15 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
} else {
|
||||
None
|
||||
}
|
||||
*/
|
||||
*/
|
||||
}
|
||||
|
||||
@transient private var _eventHubClient: EventHubClient = _
|
||||
@transient private var _eventHubClient: Client = _
|
||||
|
||||
private def progressTracker = DirectDStreamProgressTracker.getInstance.
|
||||
asInstanceOf[DirectDStreamProgressTracker]
|
||||
private def progressTracker =
|
||||
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker]
|
||||
|
||||
private[eventhubs] def setEventHubClient(eventHubClient: EventHubClient):
|
||||
EventHubDirectDStream = {
|
||||
private[eventhubs] def setEventHubClient(eventHubClient: Client): EventHubDirectDStream = {
|
||||
_eventHubClient = eventHubClient
|
||||
this
|
||||
}
|
||||
|
@ -108,22 +117,26 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
_eventHubClient
|
||||
}
|
||||
|
||||
private[eventhubs] var currentOffsetsAndSeqNums = OffsetRecord(-1L,
|
||||
{eventhubNameAndPartitions.map{ehNameAndSpace => (ehNameAndSpace, (-1L, -1L))}.toMap})
|
||||
private[eventhubs] var currentOffsetsAndSeqNums = OffsetRecord(-1L, {
|
||||
eventhubNameAndPartitions.map { ehNameAndSpace =>
|
||||
(ehNameAndSpace, (-1L, -1L))
|
||||
}.toMap
|
||||
})
|
||||
private[eventhubs] var fetchedHighestOffsetsAndSeqNums: OffsetRecord = _
|
||||
|
||||
override def start(): Unit = {
|
||||
val concurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
|
||||
require(concurrentJobs == 1,
|
||||
require(
|
||||
concurrentJobs == 1,
|
||||
"due to the limitation from eventhub, we do not allow to have multiple concurrent spark jobs")
|
||||
DirectDStreamProgressTracker.initInstance(progressDir,
|
||||
context.sparkContext.appName, context.sparkContext.hadoopConfiguration)
|
||||
context.sparkContext.appName,
|
||||
context.sparkContext.hadoopConfiguration)
|
||||
ProgressTrackingListener.initInstance(ssc, progressDir)
|
||||
}
|
||||
|
||||
override def stop(): Unit = {
|
||||
logInfo("stop: stopping EventHubDirectDStream")
|
||||
eventHubClient.close()
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -139,44 +152,50 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
*/
|
||||
private def fetchStartOffsetForEachPartition(validTime: Time, fallBack: Boolean): OffsetRecord = {
|
||||
val offsetRecord = progressTracker.read(
|
||||
eventHubNameSpace, validTime.milliseconds - ssc.graph.batchDuration.milliseconds, fallBack)
|
||||
eventHubNameSpace,
|
||||
validTime.milliseconds - ssc.graph.batchDuration.milliseconds,
|
||||
fallBack)
|
||||
require(offsetRecord.offsets.nonEmpty, "progress file cannot be empty")
|
||||
if (offsetRecord.timestamp != -1) {
|
||||
OffsetRecord(math.max(ssc.graph.startTime.milliseconds, offsetRecord.timestamp),
|
||||
offsetRecord.offsets)
|
||||
offsetRecord.offsets)
|
||||
} else {
|
||||
// query the starting sequence numbers
|
||||
val startSeqs = eventHubClient.startSeqOfPartition(retryIfFail = false,
|
||||
eventhubNameAndPartitions.toList)
|
||||
require(startSeqs.isDefined, "We cannot get starting seq number of partitions," +
|
||||
" EventHubs endpoint is not available")
|
||||
OffsetRecord(math.max(ssc.graph.startTime.milliseconds, offsetRecord.timestamp),
|
||||
val startSeqs =
|
||||
eventHubClient.startSeqOfPartition(retryIfFail = false, eventhubNameAndPartitions.toList)
|
||||
require(startSeqs.isDefined,
|
||||
"We cannot get starting seq number of partitions," +
|
||||
" EventHubs endpoint is not available")
|
||||
OffsetRecord(
|
||||
math.max(ssc.graph.startTime.milliseconds, offsetRecord.timestamp),
|
||||
offsetRecord.offsets.map {
|
||||
case (ehNameAndPartition, (offset, _)) =>
|
||||
(ehNameAndPartition, (offset, startSeqs.get(ehNameAndPartition)))
|
||||
})
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
private def reportInputInto(validTime: Time,
|
||||
offsetRanges: List[OffsetRange], inputSize: Int): Unit = {
|
||||
offsetRanges: List[OffsetRange],
|
||||
inputSize: Int): Unit = {
|
||||
require(inputSize >= 0, s"invalid inputSize ($inputSize) with offsetRanges: $offsetRanges")
|
||||
val description = offsetRanges.map { offsetRange =>
|
||||
s"eventhub: ${offsetRange.eventHubNameAndPartition}\t" +
|
||||
s"starting offsets: ${offsetRange.fromOffset}" +
|
||||
s"sequenceNumbers: ${offsetRange.fromSeq} to ${offsetRange.untilSeq}"
|
||||
}.mkString("\n")
|
||||
val description = offsetRanges
|
||||
.map { offsetRange =>
|
||||
s"eventhub: ${offsetRange.eventHubNameAndPartition}\t" +
|
||||
s"starting offsets: ${offsetRange.fromOffset}" +
|
||||
s"sequenceNumbers: ${offsetRange.fromSeq} to ${offsetRange.untilSeq}"
|
||||
}
|
||||
.mkString("\n")
|
||||
// Copy offsetRanges to an immutable.List to prevent it from being modified by the user
|
||||
val metadata = Map(
|
||||
"offsets" -> offsetRanges,
|
||||
StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
|
||||
val metadata =
|
||||
Map("offsets" -> offsetRanges, StreamInputInfo.METADATA_KEY_DESCRIPTION -> description)
|
||||
val inputInfo = StreamInputInfo(id, inputSize, metadata)
|
||||
ssc.scheduler.inputInfoTracker.reportInfo(validTime, inputInfo)
|
||||
}
|
||||
|
||||
private def validatePartitions(
|
||||
validTime: Time,
|
||||
calculatedPartitions: List[EventHubNameAndPartition]): Unit = {
|
||||
private def validatePartitions(validTime: Time,
|
||||
calculatedPartitions: List[EventHubNameAndPartition]): Unit = {
|
||||
if (currentOffsetsAndSeqNums != null) {
|
||||
val currentPartitions = currentOffsetsAndSeqNums.offsets.keys.toList
|
||||
val diff = currentPartitions.diff(calculatedPartitions)
|
||||
|
@ -187,24 +206,26 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
}
|
||||
}
|
||||
|
||||
private def clamp(highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)]):
|
||||
Map[EventHubNameAndPartition, Long] = {
|
||||
private def clamp(highestEndpoints: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
: Map[EventHubNameAndPartition, Long] = {
|
||||
if (rateController.isEmpty) {
|
||||
RateControlUtils.clamp(currentOffsetsAndSeqNums.offsets,
|
||||
fetchedHighestOffsetsAndSeqNums.offsets, eventhubsParams)
|
||||
fetchedHighestOffsetsAndSeqNums.offsets,
|
||||
eventhubsParams)
|
||||
} else {
|
||||
val estimateRateLimit = rateController.map(_.getLatestRate().toInt)
|
||||
estimateRateLimit.filter(_ > 0) match {
|
||||
case None =>
|
||||
highestEndpoints.map{case (ehNameAndPartition, _) =>
|
||||
(ehNameAndPartition, currentOffsetsAndSeqNums.offsets(ehNameAndPartition)._2)
|
||||
highestEndpoints.map {
|
||||
case (ehNameAndPartition, _) =>
|
||||
(ehNameAndPartition, currentOffsetsAndSeqNums.offsets(ehNameAndPartition)._2)
|
||||
}
|
||||
case Some(allowedRate) =>
|
||||
val lagPerPartition = highestEndpoints.map {
|
||||
case (eventHubNameAndPartition, (_, latestSeq)) =>
|
||||
eventHubNameAndPartition ->
|
||||
math.max(latestSeq - currentOffsetsAndSeqNums.offsets(eventHubNameAndPartition)._2,
|
||||
0)
|
||||
0)
|
||||
}
|
||||
val totalLag = lagPerPartition.values.sum
|
||||
lagPerPartition.map {
|
||||
|
@ -229,31 +250,36 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
val filteringOffsetAndType = {
|
||||
if (shouldCareEnqueueTimeOrOffset) {
|
||||
// first check if the parameters are valid
|
||||
RateControlUtils.validateFilteringParams(eventHubClient, eventhubsParams,
|
||||
eventhubNameAndPartitions.toList)
|
||||
RateControlUtils.validateFilteringParams(eventHubClient,
|
||||
eventhubsParams,
|
||||
eventhubNameAndPartitions.toList)
|
||||
RateControlUtils.composeFromOffsetWithFilteringParams(eventhubsParams,
|
||||
startOffsetInNextBatch.offsets)
|
||||
startOffsetInNextBatch.offsets)
|
||||
} else {
|
||||
Map[EventHubNameAndPartition, (EventHubsOffsetType, Long)]()
|
||||
}
|
||||
}
|
||||
highestOffsets.map {
|
||||
case (eventHubNameAndPartition, (_, endSeqNum)) =>
|
||||
val (offsetType, offset) = RateControlUtils.calculateStartOffset(eventHubNameAndPartition,
|
||||
filteringOffsetAndType, startOffsetInNextBatch.offsets)
|
||||
OffsetRange(eventHubNameAndPartition,
|
||||
val (offsetType, offset) =
|
||||
RateControlUtils.calculateStartOffset(eventHubNameAndPartition,
|
||||
filteringOffsetAndType,
|
||||
startOffsetInNextBatch.offsets)
|
||||
OffsetRange(
|
||||
eventHubNameAndPartition,
|
||||
fromOffset = offset,
|
||||
fromSeq = startOffsetInNextBatch.offsets(eventHubNameAndPartition)._2,
|
||||
untilSeq = math.min(clampedSeqIDs(eventHubNameAndPartition), endSeqNum),
|
||||
offsetType = offsetType)
|
||||
offsetType = offsetType
|
||||
)
|
||||
}.toList
|
||||
}
|
||||
|
||||
private def proceedWithNonEmptyRDD(
|
||||
validTime: Time,
|
||||
startOffsetInNextBatch: OffsetRecord,
|
||||
highestOffsetOfAllPartitions: Map[EventHubNameAndPartition, (Long, Long)]):
|
||||
Option[EventHubsRDD] = {
|
||||
highestOffsetOfAllPartitions: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
: Option[EventHubsRDD] = {
|
||||
// normal processing
|
||||
validatePartitions(validTime, startOffsetInNextBatch.offsets.keys.toList)
|
||||
currentOffsetsAndSeqNums = startOffsetInNextBatch
|
||||
|
@ -264,11 +290,15 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
eventhubsParams,
|
||||
offsetRanges,
|
||||
validTime.milliseconds,
|
||||
OffsetStoreParams(progressDir, streamId, uid = eventHubNameSpace,
|
||||
subDirs = ssc.sparkContext.appName),
|
||||
eventhubReceiverCreator)
|
||||
reportInputInto(validTime, offsetRanges,
|
||||
offsetRanges.map(ofr => ofr.untilSeq - ofr.fromSeq).sum.toInt)
|
||||
OffsetStoreParams(progressDir,
|
||||
streamId,
|
||||
uid = eventHubNameSpace,
|
||||
subDirs = ssc.sparkContext.appName),
|
||||
eventhubReceiverCreator
|
||||
)
|
||||
reportInputInto(validTime,
|
||||
offsetRanges,
|
||||
offsetRanges.map(ofr => ofr.untilSeq - ofr.fromSeq).sum.toInt)
|
||||
Some(eventHubRDD)
|
||||
}
|
||||
|
||||
|
@ -288,19 +318,18 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
* idea about the highest offset, we shall fail the app when the REST endpoint is not responsive, and
|
||||
* to avoid failing too aggressively, we shall retry with a power-of-two (exponentially growing) interval in this case
|
||||
*/
|
||||
private def failAppIfRestEndpointFail = fetchedHighestOffsetsAndSeqNums == null ||
|
||||
currentOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
|
||||
private def failAppIfRestEndpointFail =
|
||||
fetchedHighestOffsetsAndSeqNums == null ||
|
||||
currentOffsetsAndSeqNums.offsets.equals(fetchedHighestOffsetsAndSeqNums.offsets)
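// Illustrative sketch only (not part of this commit): the "power-of-two interval" retry
// mentioned in the comment above could be derived from a helper like the one below; the
// helper name and base interval are assumptions made purely for illustration.
private def restRetryIntervalMillis(attempt: Int, baseMillis: Long = 100L): Long =
  baseMillis * (1L << attempt.max(0).min(20)) // cap the exponent to avoid Long overflow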
|
||||
|
||||
private[spark] def composeHighestOffset(validTime: Time, retryIfFail: Boolean) = {
|
||||
RateControlUtils.fetchLatestOffset(
|
||||
eventHubClient,
|
||||
retryIfFail,
|
||||
if (fetchedHighestOffsetsAndSeqNums == null) {
|
||||
currentOffsetsAndSeqNums.offsets
|
||||
} else {
|
||||
fetchedHighestOffsetsAndSeqNums.offsets
|
||||
})
|
||||
match {
|
||||
RateControlUtils.fetchLatestOffset(eventHubClient,
|
||||
retryIfFail,
|
||||
if (fetchedHighestOffsetsAndSeqNums == null) {
|
||||
currentOffsetsAndSeqNums.offsets
|
||||
} else {
|
||||
fetchedHighestOffsetsAndSeqNums.offsets
|
||||
}) match {
|
||||
case Some(highestOffsets) =>
|
||||
fetchedHighestOffsetsAndSeqNums = OffsetRecord(validTime.milliseconds, highestOffsets)
|
||||
Some(fetchedHighestOffsetsAndSeqNums.offsets)
|
||||
|
@ -321,9 +350,10 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
require(progressTracker != null, "ProgressTracker hasn't been initialized")
|
||||
var startPointRecord = fetchStartOffsetForEachPartition(validTime, !initialized)
|
||||
while (startPointRecord.timestamp < validTime.milliseconds -
|
||||
ssc.graph.batchDuration.milliseconds) {
|
||||
logInfo(s"wait for ProgressTrackingListener to commit offsets at Batch" +
|
||||
s" ${validTime.milliseconds}")
|
||||
ssc.graph.batchDuration.milliseconds) {
|
||||
logInfo(
|
||||
s"wait for ProgressTrackingListener to commit offsets at Batch" +
|
||||
s" ${validTime.milliseconds}")
|
||||
graph.wait()
|
||||
logInfo(s"wake up at Batch ${validTime.milliseconds} at DStream $id")
|
||||
startPointRecord = fetchStartOffsetForEachPartition(validTime, !initialized)
|
||||
|
@ -334,11 +364,13 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
// 2) when the last batch was delayed, we should catch up by detecting the latest highest
|
||||
// offset
|
||||
val highestOffsetOption = composeHighestOffset(validTime, failAppIfRestEndpointFail)
|
||||
require(highestOffsetOption.isDefined, "We cannot get starting highest offset of partitions," +
|
||||
" EventHubs endpoint is not available")
|
||||
require(highestOffsetOption.isDefined,
|
||||
"We cannot get starting highest offset of partitions," +
|
||||
" EventHubs endpoint is not available")
|
||||
logInfo(s"highestOffsetOfAllPartitions at $validTime: ${highestOffsetOption.get}")
|
||||
logInfo(s"$validTime currentOffsetTimestamp: ${currentOffsetsAndSeqNums.timestamp}\t" +
|
||||
s" startPointRecordTimestamp: ${startPointRecord.timestamp}")
|
||||
logInfo(
|
||||
s"$validTime currentOffsetTimestamp: ${currentOffsetsAndSeqNums.timestamp}\t" +
|
||||
s" startPointRecordTimestamp: ${startPointRecord.timestamp}")
|
||||
val rdd = proceedWithNonEmptyRDD(validTime, startPointRecord, highestOffsetOption.get)
|
||||
initialized = true
|
||||
rdd
|
||||
|
@ -352,11 +384,14 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
}
|
||||
|
||||
private[eventhubs] class EventHubDirectDStreamCheckpointData(
|
||||
eventHubDirectDStream: EventHubDirectDStream) extends DStreamCheckpointData(this) {
|
||||
eventHubDirectDStream: EventHubDirectDStream)
|
||||
extends DStreamCheckpointData(this) {
|
||||
|
||||
def batchForTime: mutable.HashMap[Time, Array[(EventHubNameAndPartition, Long, Long, Long,
|
||||
EventHubsOffsetType)]] = {
|
||||
data.asInstanceOf[mutable.HashMap[Time,
|
||||
def batchForTime: mutable.HashMap[
|
||||
Time,
|
||||
Array[(EventHubNameAndPartition, Long, Long, Long, EventHubsOffsetType)]] = {
|
||||
data.asInstanceOf[mutable.HashMap[
|
||||
Time,
|
||||
Array[(EventHubNameAndPartition, Long, Long, Long, EventHubsOffsetType)]]]
|
||||
}
|
||||
|
||||
|
@ -372,32 +407,36 @@ private[eventhubs] class EventHubDirectDStream private[eventhubs] (
|
|||
}
|
||||
}
|
||||
|
||||
override def cleanup(time: Time): Unit = { }
|
||||
override def cleanup(time: Time): Unit = {}
|
||||
|
||||
override def restore(): Unit = {
|
||||
// we have to initialize here, otherwise there is a race condition when recovering from spark
|
||||
// checkpoint
|
||||
logInfo("initialized ProgressTracker")
|
||||
val appName = context.sparkContext.appName
|
||||
DirectDStreamProgressTracker.initInstance(progressDir, appName,
|
||||
context.sparkContext.hadoopConfiguration)
|
||||
batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach { case (t, b) =>
|
||||
logInfo(s"Restoring EventHubRDD for time $t ${b.mkString("[", ", ", "]")}")
|
||||
generatedRDDs += t -> new EventHubsRDD(
|
||||
context.sparkContext,
|
||||
eventhubsParams,
|
||||
b.map {case (ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType) =>
|
||||
OffsetRange(ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType)}.toList,
|
||||
t.milliseconds,
|
||||
OffsetStoreParams(progressDir, streamId, uid = eventHubNameSpace,
|
||||
subDirs = appName),
|
||||
eventhubReceiverCreator)
|
||||
DirectDStreamProgressTracker.initInstance(progressDir,
|
||||
appName,
|
||||
context.sparkContext.hadoopConfiguration)
|
||||
batchForTime.toSeq.sortBy(_._1)(Time.ordering).foreach {
|
||||
case (t, b) =>
|
||||
logInfo(s"Restoring EventHubRDD for time $t ${b.mkString("[", ", ", "]")}")
|
||||
generatedRDDs += t -> new EventHubsRDD(
|
||||
context.sparkContext,
|
||||
eventhubsParams,
|
||||
b.map {
|
||||
case (ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType) =>
|
||||
OffsetRange(ehNameAndPar, fromOffset, fromSeq, untilSeq, offsetType)
|
||||
}.toList,
|
||||
t.milliseconds,
|
||||
OffsetStoreParams(progressDir, streamId, uid = eventHubNameSpace, subDirs = appName),
|
||||
eventhubReceiverCreator
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private[eventhubs] class EventHubDirectDStreamRateController(id: Int, estimator: RateEstimator)
|
||||
extends RateController(id, estimator) {
|
||||
extends RateController(id, estimator) {
|
||||
override protected def publish(rate: Long): Unit = {
|
||||
// publish nothing as there is no receiver
|
||||
}
|
||||
|
|
|
@ -1,162 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import java.util.concurrent.ExecutorService
|
||||
|
||||
import com.microsoft.azure.eventhubs._
|
||||
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
|
||||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.storage.StorageLevel
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.{DfsBasedOffsetStore, OffsetStore}
|
||||
import org.apache.spark.streaming.receiver.Receiver
|
||||
import org.apache.spark.util.ThreadUtils
|
||||
|
||||
private[eventhubs] class EventHubsReceiver(
|
||||
eventhubsParams: Map[String, String],
|
||||
partitionId: String,
|
||||
storageLevel: StorageLevel,
|
||||
offsetStore: Option[OffsetStore],
|
||||
receiverClient: EventHubsClientWrapper,
|
||||
maximumEventRate: Int) extends Receiver[Array[Byte]](storageLevel) with Logging {
|
||||
|
||||
// If offset store is empty we construct one using provided parameters
|
||||
val myOffsetStore: OffsetStore = offsetStore.getOrElse(new DfsBasedOffsetStore(
|
||||
eventhubsParams("eventhubs.checkpoint.dir"),
|
||||
eventhubsParams("eventhubs.namespace"),
|
||||
eventhubsParams("eventhubs.name"),
|
||||
partitionId))
|
||||
|
||||
/**
|
||||
* A state flag shared between the main thread and the MessageHandler thread.
|
||||
* Note that we cannot use Receiver.isStopped() because there could be a race condition: when the
|
||||
* MessageHandler thread is started, the state of the receiver may not have been updated yet.
|
||||
*/
|
||||
@volatile private var stopMessageHandler = false
|
||||
|
||||
/**
|
||||
* The latest sequence number this receiver has seen in messages from EventHubs.
|
||||
* It is used to throw away messages with backwards sequence number, to avoid duplicates
|
||||
* when receiver is restarted due to transient errors.
|
||||
* Note that Sequence number is monotonically increasing
|
||||
*/
|
||||
// private var latestSequence: Long = Long.MinValue
|
||||
|
||||
/** The offset to be saved after current checkpoint interval */
|
||||
protected var offsetToSave: String = _
|
||||
|
||||
private var executorPool: ExecutorService = _
|
||||
|
||||
/** The last saved offset */
|
||||
protected var savedOffset: String = _
|
||||
|
||||
def onStop() {
|
||||
logInfo("Stopping EventHubsReceiver for partition " + partitionId)
|
||||
stopMessageHandler = true
|
||||
executorPool.shutdown()
|
||||
executorPool = null
|
||||
// Don't need to do anything else here. Message handling thread will check stopMessageHandler
|
||||
// and close EventHubs client receiver.
|
||||
}
|
||||
|
||||
def onStart() {
|
||||
logInfo("Starting EventHubsReceiver for partition " + partitionId)
|
||||
stopMessageHandler = false
|
||||
executorPool = ThreadUtils.newDaemonFixedThreadPool(1, "EventHubsMessageHandler")
|
||||
try {
|
||||
executorPool.submit(new EventHubsMessageHandler)
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
// just in case anything is thrown (TODO: should not have anything here)
|
||||
e.printStackTrace()
|
||||
} finally {
|
||||
executorPool.shutdown() // Just causes threads to terminate after work is done
|
||||
}
|
||||
}
|
||||
|
||||
def processReceivedMessagesInBatch(eventDataBatch: Iterable[EventData]): Unit = {
|
||||
store(eventDataBatch.map(x => x.getBytes).toIterator)
|
||||
val maximumSequenceNumber: Long = eventDataBatch.map(x =>
|
||||
x.getSystemProperties.getSequenceNumber).reduceLeft { (x, y) => if (x > y) x else y }
|
||||
|
||||
// It is guaranteed by Eventhubs that the event data with the highest sequence number has
|
||||
// the largest offset
|
||||
offsetToSave = eventDataBatch.find(x => x.getSystemProperties.getSequenceNumber ==
|
||||
maximumSequenceNumber).get.getSystemProperties.getOffset
|
||||
}
|
||||
|
||||
// Handles EventHubs messages
|
||||
private[eventhubs] class EventHubsMessageHandler() extends Runnable {
|
||||
|
||||
// The checkpoint interval defaults to 10 seconds if not provided
|
||||
val checkpointInterval = eventhubsParams.getOrElse("eventhubs.checkpoint.interval", "10")
|
||||
.toLong * 1000
|
||||
var nextCheckpointTime = System.currentTimeMillis() + checkpointInterval
|
||||
|
||||
def run() {
|
||||
logInfo("Begin EventHubsMessageHandler for partition " + partitionId)
|
||||
myOffsetStore.open()
|
||||
// Create an EventHubs client receiver
|
||||
receiverClient.createReceiver(eventhubsParams, partitionId, myOffsetStore, maximumEventRate)
|
||||
var lastMaximumSequence = 0L
|
||||
while (!stopMessageHandler) {
|
||||
try {
|
||||
val receivedEvents = receiverClient.receive()
|
||||
if (receivedEvents != null && receivedEvents.nonEmpty) {
|
||||
val eventCount = receivedEvents.count(x => x.getBytes.length > 0)
|
||||
val sequenceNumbers = receivedEvents.map(x =>
|
||||
x.getSystemProperties.getSequenceNumber)
|
||||
if (sequenceNumbers != null && sequenceNumbers.nonEmpty) {
|
||||
val maximumSequenceNumber = sequenceNumbers.max
|
||||
val minimumSequenceNumber = sequenceNumbers.min
|
||||
val missingSequenceCount =
|
||||
maximumSequenceNumber - minimumSequenceNumber - eventCount + 1
|
||||
val sequenceNumberDiscontinuity = minimumSequenceNumber - (lastMaximumSequence + 1)
|
||||
lastMaximumSequence = maximumSequenceNumber
|
||||
logDebug(s"Partition Id: $partitionId, Event Count: $eventCount," +
|
||||
s" Maximum Sequence Number: $maximumSequenceNumber, Minimum Sequence Number:" +
|
||||
s" $minimumSequenceNumber," +
|
||||
s" Missing Sequence Count: $missingSequenceCount," +
|
||||
s" Sequence Number Discontinuity = $sequenceNumberDiscontinuity")
|
||||
} else {
|
||||
logDebug(s"Partition Id: $partitionId, Event Count: $eventCount")
|
||||
}
|
||||
processReceivedMessagesInBatch(receivedEvents)
|
||||
}
|
||||
val currentTime = System.currentTimeMillis()
|
||||
if (currentTime >= nextCheckpointTime && offsetToSave != savedOffset) {
|
||||
logInfo(s"Partition Id: $partitionId, Current Time: $currentTime," +
|
||||
s" Next Checkpoint Time: $nextCheckpointTime, Saved Offset: $offsetToSave")
|
||||
myOffsetStore.write(offsetToSave)
|
||||
savedOffset = offsetToSave
|
||||
nextCheckpointTime = currentTime + checkpointInterval
|
||||
}
|
||||
} catch {
|
||||
case e: Throwable =>
|
||||
val errorMsg = s"Error Handling Messages, ${e.getMessage}"
|
||||
logError(errorMsg)
|
||||
logInfo(s"recreating the receiver for partition $partitionId")
|
||||
receiverClient.closeReceiver()
|
||||
receiverClient.createReceiver(eventhubsParams, partitionId, myOffsetStore,
|
||||
maximumEventRate)
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
|
@ -16,19 +16,12 @@
|
|||
*/
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import scala.collection.Map
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.eventhubscommon.client.{EventHubClient, EventHubsClientWrapper}
|
||||
import org.apache.spark.eventhubscommon.client.{ Client, EventHubsClientWrapper }
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.storage.StorageLevel
|
||||
import org.apache.spark.streaming.StreamingContext
|
||||
import org.apache.spark.streaming.dstream.{DStream, ReceiverInputDStream}
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
|
||||
import org.apache.spark.streaming.receiver.Receiver
|
||||
|
||||
|
||||
object EventHubsUtils {
|
||||
|
||||
|
@ -42,66 +35,6 @@ object EventHubsUtils {
|
|||
new SparkConf().registerKryoClasses(Array(classOf[EventData]))
|
||||
}
|
||||
|
||||
// scalastyle:off
|
||||
/**
|
||||
* Create a unioned EventHubs stream that receives data from Microsoft Azure Eventhubs
|
||||
* The unioned stream will receive messages from all partitions of the EventHubs instance
|
||||
*
|
||||
* @param streamingContext Streaming Context object
|
||||
* @param eventhubsParams a Map that contains parameters for EventHubs.
|
||||
* Required parameters are:
|
||||
* "eventhubs.policyname": EventHubs policy name
|
||||
* "eventhubs.policykey": EventHubs policy key
|
||||
* "eventhubs.namespace": EventHubs namespace
|
||||
* "eventhubs.name": EventHubs name
|
||||
* "eventhubs.partition.count": Number of partitions
|
||||
* "eventhubs.checkpoint.dir": checkpoint directory on HDFS
|
||||
*
|
||||
* Optional parameters are:
|
||||
* "eventhubs.consumergroup": EventHubs consumer group name, default to "\$default"
|
||||
* "eventhubs.filter.offset": Starting offset of EventHubs, default to "-1"
|
||||
* "eventhubs.filter.enqueuetime": Unix time, seconds since epoch, default to "0"
|
||||
* "eventhubs.default.credits": default AMQP credits, default to -1 (which is 1024)
|
||||
* "eventhubs.checkpoint.interval": checkpoint interval in second, default to 10
|
||||
* @param storageLevel Storage level, by default it is MEMORY_ONLY
|
||||
* @return ReceiverInputStream
|
||||
*/
|
||||
// scalastyle:on
|
||||
@deprecated("this method is deprecated, please use createDirectStreams", "2.0.5")
|
||||
def createUnionStream(streamingContext: StreamingContext, eventhubsParams: Map[String, String],
|
||||
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER): DStream[Array[Byte]] = {
|
||||
val partitionCount = eventhubsParams("eventhubs.partition.count").toInt
|
||||
val streams = (0 until partitionCount).map {
|
||||
i => createStream(streamingContext, eventhubsParams, i.toString, storageLevel)
|
||||
}
|
||||
streamingContext.union(streams)
|
||||
}
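// Usage sketch (illustrative only): wiring the deprecated union stream with the parameters
// documented above. `ssc` is assumed to be an existing StreamingContext and all values are
// placeholders, not part of this commit.
//
//   val ehParams = Map(
//     "eventhubs.policyname" -> "policyname",
//     "eventhubs.policykey" -> "policykey",
//     "eventhubs.namespace" -> "namespace",
//     "eventhubs.name" -> "name",
//     "eventhubs.partition.count" -> "4",
//     "eventhubs.checkpoint.dir" -> "hdfs://namenode:8020/eventhubs/checkpoint",
//     "eventhubs.checkpoint.interval" -> "10")
//   val unionedStream = EventHubsUtils.createUnionStream(ssc, ehParams)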
|
||||
|
||||
/**
|
||||
* Create a single EventHubs stream that receives data from Microsoft Azure EventHubs
|
||||
* A single stream only receives messages from one EventHubs partition
|
||||
*
|
||||
* @param streamingContext Streaming Context object
|
||||
* @param eventhubsParams a Map that contains parameters for EventHubs. Same as above.
|
||||
* @param partitionId Partition ID
|
||||
* @param storageLevel Storage level
|
||||
* @param offsetStore Offset store implementation, defaults to DFSBasedOffsetStore
|
||||
* @param receiverClient the EventHubs client implementation, defaults to EventHubsClientWrapper
|
||||
* @return ReceiverInputStream
|
||||
*/
|
||||
@deprecated("this method is deprecated, please use createDirectStreams", "2.0.5")
|
||||
def createStream(streamingContext: StreamingContext,
|
||||
eventhubsParams: Map[String, String],
|
||||
partitionId: String,
|
||||
storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER,
|
||||
offsetStore: OffsetStore = null,
|
||||
receiverClient: EventHubsClientWrapper = new EventHubsClientWrapper):
|
||||
ReceiverInputDStream[Array[Byte]] = {
|
||||
streamingContext.receiverStream(
|
||||
getReceiver(streamingContext, eventhubsParams.toMap, partitionId,
|
||||
storageLevel, Option(offsetStore), receiverClient))
|
||||
}
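// Usage sketch (illustrative only, same assumptions and `ehParams` map as the example above):
// a receiver stream bound to a single partition.
//
//   val partitionStream = EventHubsUtils.createStream(ssc, ehParams, partitionId = "0")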
|
||||
|
||||
/**
|
||||
* Create a direct stream based on EventHubs
|
||||
* @param ssc the streaming context this stream belongs to
|
||||
|
@ -129,35 +62,20 @@ object EventHubsUtils {
|
|||
eventHubNamespace: String,
|
||||
progressDir: String,
|
||||
eventParams: Predef.Map[String, Predef.Map[String, String]],
|
||||
eventHubsReceiverCreator: (Predef.Map[String, String], Int, Long, EventHubsOffsetType, Int) =>
|
||||
EventHubsClientWrapper = EventHubsClientWrapper.getEventHubReceiver,
|
||||
eventHubsClientCreator: (String, Predef.Map[String, Predef.Map[String, String]]) =>
|
||||
EventHubClient): EventHubDirectDStream = {
|
||||
val newStream = new EventHubDirectDStream(ssc, eventHubNamespace, progressDir, eventParams,
|
||||
eventHubsReceiverCreator, eventHubsClientCreator)
|
||||
eventHubsReceiverCreator: (Predef.Map[String, String],
|
||||
Int,
|
||||
Long,
|
||||
EventHubsOffsetType,
|
||||
Int) => EventHubsClientWrapper =
|
||||
EventHubsClientWrapper.getEventHubReceiver,
|
||||
eventHubsClientCreator: (String, Predef.Map[String, Predef.Map[String, String]]) => Client)
|
||||
: EventHubDirectDStream = {
|
||||
val newStream = new EventHubDirectDStream(ssc,
|
||||
eventHubNamespace,
|
||||
progressDir,
|
||||
eventParams,
|
||||
eventHubsReceiverCreator,
|
||||
eventHubsClientCreator)
|
||||
newStream
|
||||
}
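// Usage sketch (illustrative only): the recommended direct stream. The nested map is keyed by
// EventHubs name; `ssc`, the hub name and the progress directory are placeholder assumptions,
// and AMQPEventHubsClient.getInstance is the same client factory used as the default by
// EventHubDirectDStream.
//
//   val eventParams = Map(
//     "sample-hub" -> Map(
//       "eventhubs.policyname" -> "policyname",
//       "eventhubs.policykey" -> "policykey",
//       "eventhubs.namespace" -> "namespace",
//       "eventhubs.name" -> "sample-hub",
//       "eventhubs.partition.count" -> "4"))
//   val directStream = EventHubsUtils.createDirectStreams(
//     ssc, "namespace", "/eventhubs/progress", eventParams,
//     eventHubsClientCreator = AMQPEventHubsClient.getInstance)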
|
||||
|
||||
/**
|
||||
* A helper function to get EventHubsReceiver or ReliableEventHubsReceiver based on whether
|
||||
* Write Ahead Log is enabled or not ("spark.streaming.receiver.writeAheadLog.enable")
|
||||
*/
|
||||
private[eventhubs] def getReceiver(streamingContext: StreamingContext,
|
||||
eventhubsParams: scala.collection.immutable.Map[String, String],
|
||||
partitionId: String,
|
||||
storageLevel: StorageLevel,
|
||||
offsetStore: Option[OffsetStore],
|
||||
receiverClient: EventHubsClientWrapper): Receiver[Array[Byte]] = {
|
||||
val maximumEventRate = streamingContext.conf.getInt("spark.streaming.receiver.maxRate", 0)
|
||||
val walEnabled = streamingContext.conf.getBoolean(
|
||||
"spark.streaming.receiver.writeAheadLog.enable", defaultValue = false)
|
||||
|
||||
if (walEnabled) {
|
||||
new ReliableEventHubsReceiver(eventhubsParams, partitionId, storageLevel, offsetStore,
|
||||
receiverClient, maximumEventRate)
|
||||
} else {
|
||||
new EventHubsReceiver(eventhubsParams, partitionId, storageLevel, offsetStore, receiverClient,
|
||||
maximumEventRate)
|
||||
}
|
||||
}
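// Illustration (configuration keys only, as read by getReceiver above; the application name is
// a placeholder): enabling the write-ahead log switches the receiver to ReliableEventHubsReceiver.
//
//   val sparkConf = new SparkConf()
//     .setAppName("eventhubs-receiver-example")
//     .set("spark.streaming.receiver.writeAheadLog.enable", "true")
//     .set("spark.streaming.receiver.maxRate", "1000") // optional rate cap, also read above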
|
||||
}
|
||||
|
|
|
@ -1,157 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
import com.microsoft.azure.eventhubs._
|
||||
|
||||
import org.apache.spark.SparkEnv
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
|
||||
import org.apache.spark.storage.{StorageLevel, StreamBlockId}
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
|
||||
import org.apache.spark.streaming.receiver.{BlockGenerator, BlockGeneratorListener}
|
||||
|
||||
/**
|
||||
* ReliableEventHubsReceiver offers the ability to reliably store data into BlockManager without
|
||||
* loss.
|
||||
* It is turned off by default and will be enabled when
|
||||
* spark.streaming.receiver.writeAheadLog.enable is true.
|
||||
*
|
||||
* The difference compared to EventHubsReceiver is that the offset is updated in persistent
|
||||
* store only after data is reliably stored as write-ahead log, so the potential data loss
|
||||
* problem of EventHubsReceiver can be eliminated.
|
||||
*/
|
||||
private[eventhubs]
|
||||
class ReliableEventHubsReceiver(
|
||||
eventhubsParams: Map[String, String],
|
||||
partitionId: String,
|
||||
storageLevel: StorageLevel,
|
||||
offsetStore: Option[OffsetStore],
|
||||
receiverClient: EventHubsClientWrapper,
|
||||
maximumEventRate: Int)
|
||||
extends EventHubsReceiver(
|
||||
eventhubsParams, partitionId, storageLevel, offsetStore, receiverClient, maximumEventRate) {
|
||||
|
||||
override def onStop() {
|
||||
super.onStop()
|
||||
if (blockGenerator != null) {
|
||||
blockGenerator.stop()
|
||||
blockGenerator = null
|
||||
}
|
||||
if (blockOffsetMap != null) {
|
||||
blockOffsetMap.clear()
|
||||
blockOffsetMap = null
|
||||
}
|
||||
}
|
||||
|
||||
override def onStart() {
|
||||
blockOffsetMap = new ConcurrentHashMap[StreamBlockId, String]
|
||||
// Initialize the block generator for storing EventHubs message.
|
||||
blockGenerator = new BlockGenerator(new GeneratedBlockHandler, streamId, SparkEnv.get.conf)
|
||||
blockGenerator.start()
|
||||
super.onStart()
|
||||
}
|
||||
|
||||
override def processReceivedMessagesInBatch(eventDataBatch: Iterable[EventData]): Unit = {
|
||||
val maximumSequenceNumber = eventDataBatch.map(x => x.getSystemProperties.getSequenceNumber).
|
||||
reduceLeft { (x, y) => if (x > y) x else y }
|
||||
val offsetMetadata = eventDataBatch.find(x =>
|
||||
x.getSystemProperties.getSequenceNumber == maximumSequenceNumber).get.getSystemProperties.
|
||||
getOffset
|
||||
/**
|
||||
* It is guaranteed by Eventhubs that the event data with the highest sequence number has the
|
||||
* largest offset
|
||||
*/
|
||||
blockGenerator.addMultipleDataWithCallback(eventDataBatch.map(x => x.getBytes).toIterator,
|
||||
offsetMetadata)
|
||||
}
|
||||
|
||||
/**
|
||||
* Store the ready-to-be-stored block and commit the related offsets to OffsetStore. This method
|
||||
* will try a fixed number of times to push the block. If the push fails, the receiver is stopped.
|
||||
*/
|
||||
private def storeBlockAndCommitOffset(
|
||||
blockId: StreamBlockId,
|
||||
arrayBuffer: ArrayBuffer[_]): Unit = {
|
||||
var count = 0
|
||||
var pushed = false
|
||||
var exception: Exception = null
|
||||
while (!pushed && count < RETRY_COUNT) {
|
||||
try {
|
||||
store(arrayBuffer.asInstanceOf[ArrayBuffer[Array[Byte]]])
|
||||
pushed = true
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
count += 1
|
||||
exception = e
|
||||
Thread.sleep(SECONDS_BETWEEN_RETRY * 1000)
|
||||
}
|
||||
}
|
||||
if (pushed) {
|
||||
// commit the latest offset of the block to offsetToSave, when the checkpoint interval
|
||||
// passes the offset is saved to offset store
|
||||
offsetToSave = blockOffsetMap.get(blockId)
|
||||
blockOffsetMap.remove(blockId)
|
||||
} else {
|
||||
stop("Error while storing block into Spark", exception)
|
||||
}
|
||||
}
|
||||
|
||||
/** Class to handle blocks generated by the block generator. */
|
||||
private final class GeneratedBlockHandler extends BlockGeneratorListener {
|
||||
|
||||
def onAddData(data: Any, metadata: Any): Unit = {
|
||||
// Update the offset of the data that was added to the generator
|
||||
if (metadata != null) {
|
||||
val offset = metadata.asInstanceOf[String]
|
||||
latestOffsetCurBlock = offset
|
||||
}
|
||||
}
|
||||
|
||||
def onGenerateBlock(blockId: StreamBlockId): Unit = {
|
||||
|
||||
// Remember the offsets when a block has been generated
|
||||
blockOffsetMap.put(blockId, latestOffsetCurBlock)
|
||||
}
|
||||
|
||||
def onPushBlock(blockId: StreamBlockId, arrayBuffer: ArrayBuffer[_]): Unit = {
|
||||
|
||||
// Store block and commit the blocks offset
|
||||
storeBlockAndCommitOffset(blockId, arrayBuffer)
|
||||
}
|
||||
|
||||
def onError(message: String, throwable: Throwable): Unit = {
|
||||
reportError(message, throwable)
|
||||
}
|
||||
}
|
||||
|
||||
/** Use block generator to generate blocks to Spark block manager synchronously */
|
||||
private var blockGenerator: BlockGenerator = _
|
||||
|
||||
/** A string to store the latest offset in the current block for the current partition. */
|
||||
private var latestOffsetCurBlock: String = _
|
||||
|
||||
/** A concurrent HashMap to store the stream block id and related offset snapshot. */
|
||||
private var blockOffsetMap: ConcurrentHashMap[StreamBlockId, String] = _
|
||||
|
||||
private val RETRY_COUNT: Int = 10
|
||||
|
||||
private val SECONDS_BETWEEN_RETRY = 1
|
||||
}
|
|
@ -17,7 +17,7 @@
|
|||
package org.apache.spark.streaming.eventhubs.checkpoint
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.hadoop.fs.{ FileSystem, Path }
|
||||
|
||||
import org.apache.spark.SparkContext
|
||||
import org.apache.spark.internal.Logging
|
||||
|
@ -26,15 +26,13 @@ import org.apache.spark.internal.Logging
|
|||
* A DFS based OffsetStore implementation
|
||||
*/
|
||||
@SerialVersionUID(1L)
|
||||
class DfsBasedOffsetStore(
|
||||
directory: String,
|
||||
namespace: String,
|
||||
name: String,
|
||||
partition: String) extends OffsetStore with Logging {
|
||||
class DfsBasedOffsetStore(directory: String, namespace: String, name: String, partition: String)
|
||||
extends OffsetStore
|
||||
with Logging {
|
||||
|
||||
if (!SparkContext.getOrCreate().isLocal) {
|
||||
require(directory.startsWith("hdfs://") || directory.startsWith("adl://"),
|
||||
"we only support to store offset in HDFS/ADLS when running Spark in non-local mode ")
|
||||
"we only support to store offset in HDFS/ADLS when running Spark in non-local mode ")
|
||||
}
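// Illustration (hypothetical paths, not part of this commit): in non-local mode the offset
// directory passed to this class must be an HDFS or ADLS URI, for example
//
//   new DfsBasedOffsetStore("hdfs://namenode:8020/eventhubs/offsets", "namespace", "name", "0")
//
// while any local path is accepted when Spark runs in local mode.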
|
||||
|
||||
var path: Path = _
|
||||
|
@ -45,7 +43,6 @@ class DfsBasedOffsetStore(
|
|||
/**
|
||||
* Open two files, the actual checkpoint file and the backup checkpoint file
|
||||
*/
|
||||
|
||||
override def open(): Unit = {
|
||||
if (checkpointFile == null) {
|
||||
path = new Path(directory + "/" + namespace + "/" + name + "/" + partition)
|
||||
|
@ -183,4 +180,3 @@ class DfsBasedOffsetStore(
|
|||
// pass
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -22,7 +22,11 @@ import scala.collection.mutable.ListBuffer
|
|||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs._
|
||||
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord}
|
||||
import org.apache.spark.eventhubscommon.{
|
||||
EventHubNameAndPartition,
|
||||
EventHubsConnector,
|
||||
OffsetRecord
|
||||
}
|
||||
import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
|
||||
|
||||
/**
|
||||
|
@ -36,23 +40,22 @@ import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
|
|||
* @param appName the name of Spark application
|
||||
* @param hadoopConfiguration the hadoop configuration instance
|
||||
*/
|
||||
private[spark] class DirectDStreamProgressTracker private[spark](
|
||||
private[spark] class DirectDStreamProgressTracker private[spark] (
|
||||
progressDir: String,
|
||||
appName: String,
|
||||
hadoopConfiguration: Configuration)
|
||||
extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
|
||||
extends ProgressTrackerBase(progressDir, appName, hadoopConfiguration) {
|
||||
|
||||
// the lock synchronizing the read and committing operations, since they are executed in driver
|
||||
// and listener thread respectively.
|
||||
private val driverLock = new Object
|
||||
|
||||
override def eventHubNameAndPartitions: Map[String, List[EventHubNameAndPartition]] = {
|
||||
DirectDStreamProgressTracker.registeredConnectors.map {
|
||||
connector => (connector.uid, connector.connectedInstances)
|
||||
DirectDStreamProgressTracker.registeredConnectors.map { connector =>
|
||||
(connector.uid, connector.connectedInstances)
|
||||
}.toMap
|
||||
}
|
||||
|
||||
|
||||
private def initProgressFileDirectory(): Unit = {
|
||||
try {
|
||||
val fs = progressDirectoryPath.getFileSystem(hadoopConfiguration)
|
||||
|
@ -118,10 +121,10 @@ private[spark] class DirectDStreamProgressTracker private[spark](
|
|||
/**
|
||||
* read the progress record for the specified namespace, streamId and timestamp
|
||||
*/
|
||||
override def read(namespace: String, timestamp: Long, fallBack: Boolean):
|
||||
OffsetRecord = driverLock.synchronized {
|
||||
super.read(namespace, timestamp, fallBack)
|
||||
}
|
||||
override def read(namespace: String, timestamp: Long, fallBack: Boolean): OffsetRecord =
|
||||
driverLock.synchronized {
|
||||
super.read(namespace, timestamp, fallBack)
|
||||
}
|
||||
|
||||
def close(): Unit = {}
|
||||
|
||||
|
@ -143,26 +146,32 @@ private[spark] class DirectDStreamProgressTracker private[spark](
|
|||
fs.delete(filePath, true)
|
||||
}
|
||||
}
|
||||
*/
|
||||
*/
|
||||
// clean temp directory
|
||||
val allUselessTempFiles = fs.listStatus(tempDirectoryPath, new PathFilter {
|
||||
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
|
||||
}).map(_.getPath)
|
||||
val allUselessTempFiles = fs
|
||||
.listStatus(tempDirectoryPath, new PathFilter {
|
||||
override def accept(path: Path): Boolean = fromPathToTimestamp(path) <= timestampToClean
|
||||
})
|
||||
.map(_.getPath)
|
||||
if (allUselessTempFiles.nonEmpty) {
|
||||
allUselessTempFiles.groupBy(fromPathToTimestamp).toList.sortWith((p1, p2) => p1._1 > p2._1).
|
||||
tail.flatMap(_._2).foreach {
|
||||
filePath => logInfo(s"delete $filePath")
|
||||
fs.delete(filePath, true)
|
||||
}
|
||||
allUselessTempFiles
|
||||
.groupBy(fromPathToTimestamp)
|
||||
.toList
|
||||
.sortWith((p1, p2) => p1._1 > p2._1)
|
||||
.tail
|
||||
.flatMap(_._2)
|
||||
.foreach { filePath =>
|
||||
logInfo(s"delete $filePath")
|
||||
fs.delete(filePath, true)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* commit offsetToCommit to a new progress tracking file
|
||||
*/
|
||||
override def commit(
|
||||
offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
|
||||
commitTime: Long): Unit = driverLock.synchronized {
|
||||
override def commit(offsetToCommit: Map[String, Map[EventHubNameAndPartition, (Long, Long)]],
|
||||
commitTime: Long): Unit = driverLock.synchronized {
|
||||
super.commit(offsetToCommit, commitTime)
|
||||
}
|
||||
}
|
||||
|
@ -193,9 +202,8 @@ object DirectDStreamProgressTracker {
|
|||
this.synchronized {
|
||||
// DirectDStream shall have singleton progress tracker
|
||||
if (_progressTracker == null) {
|
||||
_progressTracker = new DirectDStreamProgressTracker(progressDirStr,
|
||||
appName,
|
||||
hadoopConfiguration)
|
||||
_progressTracker =
|
||||
new DirectDStreamProgressTracker(progressDirStr, appName, hadoopConfiguration)
|
||||
}
|
||||
_progressTracker.init()
|
||||
}
|
||||
|
|
|
@ -23,10 +23,6 @@ package org.apache.spark.streaming.eventhubs.checkpoint
|
|||
trait OffsetStore extends Serializable {
|
||||
def open(): Unit
|
||||
def write(offset: String): Unit
|
||||
def read() : String
|
||||
def read(): String
|
||||
def close(): Unit
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -20,39 +20,46 @@ package org.apache.spark.streaming.eventhubs.checkpoint
|
|||
import org.apache.spark.internal.Logging
|
||||
import org.apache.spark.streaming.StreamingContext
|
||||
import org.apache.spark.streaming.eventhubs.EventHubDirectDStream
|
||||
import org.apache.spark.streaming.scheduler.{StreamingListener, StreamingListenerBatchCompleted}
|
||||
import org.apache.spark.streaming.scheduler.{ StreamingListener, StreamingListenerBatchCompleted }
|
||||
|
||||
/**
|
||||
* The listener asynchronously commits the temp checkpoint to the path that is read by the DStream
|
||||
* driver. It monitors the input size to prevent empty batches from committing checkpoints.
|
||||
*/
|
||||
private[eventhubs] class ProgressTrackingListener private (
|
||||
ssc: StreamingContext, progressDirectory: String) extends StreamingListener with Logging {
|
||||
private[eventhubs] class ProgressTrackingListener private (ssc: StreamingContext,
|
||||
progressDirectory: String)
|
||||
extends StreamingListener
|
||||
with Logging {
|
||||
|
||||
override def onBatchCompleted(batchCompleted: StreamingListenerBatchCompleted): Unit = {
|
||||
logInfo(s"Batch ${batchCompleted.batchInfo.batchTime} completed")
|
||||
val batchTime = batchCompleted.batchInfo.batchTime.milliseconds
|
||||
try {
|
||||
if (batchCompleted.batchInfo.outputOperationInfos.forall(_._2.failureReason.isEmpty)) {
|
||||
val progressTracker = DirectDStreamProgressTracker.getInstance.
|
||||
asInstanceOf[DirectDStreamProgressTracker]
|
||||
val progressTracker =
|
||||
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker]
|
||||
// build current offsets
|
||||
val allEventDStreams = DirectDStreamProgressTracker.registeredConnectors
|
||||
// merge with the temp directory
|
||||
val startTime = System.currentTimeMillis()
|
||||
val progressInLastBatch = progressTracker.collectProgressRecordsForBatch(
|
||||
batchTime, allEventDStreams.toList)
|
||||
val progressInLastBatch =
|
||||
progressTracker.collectProgressRecordsForBatch(batchTime, allEventDStreams.toList)
|
||||
logInfo(s"progressInLastBatch $progressInLastBatch")
|
||||
if (progressInLastBatch.nonEmpty) {
|
||||
val contentToCommit = allEventDStreams.map {
|
||||
case dstream: EventHubDirectDStream =>
|
||||
(dstream.eventHubNameSpace, dstream.currentOffsetsAndSeqNums.offsets)
|
||||
}.toMap.map { case (namespace, currentOffsets) =>
|
||||
(namespace, currentOffsets ++ progressInLastBatch.getOrElse(namespace, Map()))
|
||||
}
|
||||
val contentToCommit = allEventDStreams
|
||||
.map {
|
||||
case dstream: EventHubDirectDStream =>
|
||||
(dstream.eventHubNameSpace, dstream.currentOffsetsAndSeqNums.offsets)
|
||||
}
|
||||
.toMap
|
||||
.map {
|
||||
case (namespace, currentOffsets) =>
|
||||
(namespace, currentOffsets ++ progressInLastBatch.getOrElse(namespace, Map()))
|
||||
}
|
||||
progressTracker.commit(contentToCommit, batchTime)
|
||||
logInfo(s"commit ending offset of Batch $batchTime $contentToCommit time cost:" +
|
||||
s" ${System.currentTimeMillis() - startTime}")
|
||||
logInfo(
|
||||
s"commit ending offset of Batch $batchTime $contentToCommit time cost:" +
|
||||
s" ${System.currentTimeMillis() - startTime}")
|
||||
} else {
|
||||
logInfo(s"read RDD data from Checkpoint at $batchTime, skip commits")
|
||||
}
|
||||
|
@ -73,9 +80,8 @@ private[eventhubs] object ProgressTrackingListener {
|
|||
|
||||
private var _progressTrackerListener: ProgressTrackingListener = _
|
||||
|
||||
private def getOrCreateProgressTrackerListener(
|
||||
ssc: StreamingContext,
|
||||
progressDirectory: String) = {
|
||||
private def getOrCreateProgressTrackerListener(ssc: StreamingContext,
|
||||
progressDirectory: String) = {
|
||||
if (_progressTrackerListener == null) {
|
||||
_progressTrackerListener = new ProgressTrackingListener(ssc, progressDirectory)
|
||||
ssc.scheduler.listenerBus.listeners.add(0, _progressTrackerListener)
|
||||
|
@ -88,10 +94,8 @@ private[eventhubs] object ProgressTrackingListener {
|
|||
_progressTrackerListener = null
|
||||
}
|
||||
|
||||
def initInstance(
|
||||
ssc: StreamingContext,
|
||||
progressDirectory: String): ProgressTrackingListener = this.synchronized {
|
||||
getOrCreateProgressTrackerListener(ssc, progressDirectory)
|
||||
}
|
||||
def initInstance(ssc: StreamingContext, progressDirectory: String): ProgressTrackingListener =
|
||||
this.synchronized {
|
||||
getOrCreateProgressTrackerListener(ssc, progressDirectory)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1,47 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.eventhubscommon
|
||||
|
||||
import org.scalatest.mock.MockitoSugar
|
||||
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
|
||||
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
|
||||
|
||||
class EventhubsImplicitsSuite
|
||||
extends TestSuiteBase with org.scalatest.Matchers with MockitoSugar {
|
||||
|
||||
val ehParams = Map(
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.name" -> "name",
|
||||
"eventhubs.partition.count" -> "4",
|
||||
"eventhubs.checkpoint.dir" -> "checkpointdir",
|
||||
"eventhubs.checkpoint.interval" -> "1000"
|
||||
)
|
||||
|
||||
test("StreamingContext can be implicitly converted to eventhub streaming context") {
|
||||
val ssc = new StreamingContext(master, framework, batchDuration)
|
||||
|
||||
import org.apache.spark.eventhubscommon.Implicits._
|
||||
|
||||
val stream = ssc.unionedEventHubStream(ehParams)
|
||||
val stream2 = ssc.eventHubStream(ehParams, "0")
|
||||
ssc.stop()
|
||||
}
|
||||
}
|
|
@ -16,10 +16,9 @@
|
|||
*/
|
||||
package org.apache.spark.eventhubscommon.client
|
||||
|
||||
import com.microsoft.azure.eventhubs._
|
||||
import org.mockito.{Matchers, Mockito}
|
||||
import org.mockito.{ Matchers, Mockito }
|
||||
import org.mockito.Mockito._
|
||||
import org.scalatest.{BeforeAndAfter, FunSuite}
|
||||
import org.scalatest.{ BeforeAndAfter, FunSuite }
|
||||
import org.scalatest.mock.MockitoSugar
|
||||
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
|
@ -42,83 +41,65 @@ class EventHubsClientWrapperSuite extends FunSuite with BeforeAndAfter with Mock
|
|||
)
|
||||
|
||||
before {
|
||||
ehClientWrapperMock = spy(new EventHubsClientWrapper)
|
||||
ehClientWrapperMock = spy(new EventHubsClientWrapper(ehParams))
|
||||
offsetStoreMock = mock[OffsetStore]
|
||||
}
|
||||
|
||||
test("EventHubsClientWrapper converts parameters correctly when offset was previously saved") {
|
||||
Mockito.when(offsetStoreMock.read()).thenReturn("2147483647")
|
||||
Mockito.doNothing().when(ehClientWrapperMock).createReceiverInternal(
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.PreviousCheckpoint),
|
||||
Matchers.anyString,
|
||||
Matchers.anyLong)
|
||||
Mockito
|
||||
.doNothing()
|
||||
.when(ehClientWrapperMock)
|
||||
.createReceiverInternal(
|
||||
Matchers.anyString,
|
||||
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.PreviousCheckpoint),
|
||||
Matchers.anyString)
|
||||
|
||||
ehClientWrapperMock.createReceiver(ehParams, "4", offsetStoreMock, 999)
|
||||
|
||||
verify(ehClientWrapperMock, times(1)).createReceiverInternal(
|
||||
Matchers.eq("Endpoint=amqps://namespace.servicebus.windows.net;EntityPath=name;" +
|
||||
"SharedAccessKeyName=policyname;" +
|
||||
"SharedAccessKey=policykey;OperationTimeout=PT1M;RetryPolicy=Default"),
|
||||
Matchers.anyString,
|
||||
Matchers.eq(EventHubClient.DEFAULT_CONSUMER_GROUP_NAME),
|
||||
Matchers.eq("4"),
|
||||
Matchers.eq(EventHubsOffsetTypes.PreviousCheckpoint),
|
||||
Matchers.eq("2147483647"),
|
||||
Matchers.eq(-1L))
|
||||
Matchers.eq("2147483647"))
|
||||
}
|
||||
|
||||
test("EventHubsClientWrapper converts parameters for consumergroup") {
|
||||
var ehParams2 = ehParams
|
||||
ehParams2 += "eventhubs.consumergroup" -> "$consumergroup"
|
||||
when(offsetStoreMock.read()).thenReturn("-1")
|
||||
doNothing().when(ehClientWrapperMock).createReceiverInternal(Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.None),
|
||||
Matchers.anyString,
|
||||
Matchers.anyLong)
|
||||
doNothing()
|
||||
.when(ehClientWrapperMock)
|
||||
.createReceiverInternal(
|
||||
Matchers.anyString,
|
||||
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.None),
|
||||
Matchers.anyString
|
||||
)
|
||||
ehClientWrapperMock.createReceiver(ehParams2, "4", offsetStoreMock, 999)
|
||||
verify(ehClientWrapperMock, times(1)).createReceiverInternal(
|
||||
Matchers.eq("Endpoint=amqps://namespace.servicebus.windows.net;EntityPath=name;" +
|
||||
"SharedAccessKeyName=policyname;" +
|
||||
"SharedAccessKey=policykey;OperationTimeout=PT1M;RetryPolicy=Default"),
|
||||
Matchers.anyString,
|
||||
Matchers.eq("$consumergroup"),
|
||||
Matchers.eq("4"),
|
||||
Matchers.eq(EventHubsOffsetTypes.None),
|
||||
Matchers.eq("-1"),
|
||||
Matchers.eq(-1L))
|
||||
Matchers.eq("-1")
|
||||
)
|
||||
}
|
||||
|
||||
test("EventHubsClientWrapper converts parameters for enqueuetime filter") {
|
||||
var ehParams2 = ehParams
|
||||
ehParams2 += "eventhubs.filter.enqueuetime" -> "1433887583"
|
||||
when(offsetStoreMock.read()).thenReturn("-1")
|
||||
doNothing().when(ehClientWrapperMock).createReceiverInternal(
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.anyString,
|
||||
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.InputTimeOffset),
|
||||
Matchers.anyString,
|
||||
Matchers.anyLong)
|
||||
doNothing()
|
||||
.when(ehClientWrapperMock)
|
||||
.createReceiverInternal(
|
||||
Matchers.anyString,
|
||||
Matchers.eq[EventHubsOffsetType](EventHubsOffsetTypes.InputTimeOffset),
|
||||
Matchers.anyString
|
||||
)
|
||||
|
||||
ehClientWrapperMock.createReceiver(ehParams2, "4", offsetStoreMock, 999)
|
||||
|
||||
verify(ehClientWrapperMock, times(1)).createReceiverInternal(
|
||||
Matchers.eq("Endpoint=amqps://namespace.servicebus.windows.net;EntityPath=name;" +
|
||||
"SharedAccessKeyName=policyname;" +
|
||||
"SharedAccessKey=policykey;OperationTimeout=PT1M;RetryPolicy=Default"),
|
||||
Matchers.anyString,
|
||||
Matchers.eq(EventHubClient.DEFAULT_CONSUMER_GROUP_NAME),
|
||||
Matchers.eq("4"),
|
||||
Matchers.eq(EventHubsOffsetTypes.InputTimeOffset),
|
||||
Matchers.eq("1433887583"),
|
||||
Matchers.eq(-1L))
|
||||
Matchers.eq("1433887583")
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -31,8 +31,8 @@ import org.apache.spark.internal.Logging
|
|||
private[spark] object EventHubsTestUtilities extends Logging {
|
||||
|
||||
def simulateEventHubs[T, U](
|
||||
eventHubsParameters: Map[String, String],
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
|
||||
eventHubsParameters: Map[String, String],
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
|
||||
|
||||
assert(eventHubsParameters != null)
|
||||
assert(eventHubsParameters.nonEmpty)
|
||||
|
@ -45,43 +45,48 @@ private[spark] object EventHubsTestUtilities extends Logging {
|
|||
yield EventHubNameAndPartition(eventHubsName, i)
|
||||
}
|
||||
val payloadPropertyStore = roundRobinAllocation(eventHubsPartitionList.map(x => x -> 0).toMap,
|
||||
eventPayloadsAndProperties)
|
||||
eventPayloadsAndProperties)
|
||||
simulatedEventHubs = new SimulatedEventHubs(eventHubsNamespace, payloadPropertyStore)
|
||||
simulatedEventHubs
|
||||
}
|
||||
|
||||
def getOrSimulateEventHubs[T, U](
|
||||
eventHubsParameters: Map[String, String],
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
|
||||
def getOrSimulateEventHubs[T, U](eventHubsParameters: Map[String, String],
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] =
|
||||
Seq.empty[(T, Seq[U])]): SimulatedEventHubs = {
|
||||
if (simulatedEventHubs == null) {
|
||||
simulatedEventHubs = simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
}
|
||||
simulatedEventHubs
|
||||
}
|
||||
|
||||
def getHighestOffsetPerPartition(eventHubs: SimulatedEventHubs):
|
||||
Map[EventHubNameAndPartition, (Long, Long, Long)] = {
|
||||
def getHighestOffsetPerPartition(
|
||||
eventHubs: SimulatedEventHubs): Map[EventHubNameAndPartition, (Long, Long, Long)] = {
|
||||
eventHubs.messageStore.map {
|
||||
case (ehNameAndPartition, messageQueue) => (ehNameAndPartition,
|
||||
(messageQueue.length.toLong - 1, messageQueue.length.toLong - 1,
|
||||
case (ehNameAndPartition, messageQueue) =>
|
||||
(ehNameAndPartition,
|
||||
(messageQueue.length.toLong - 1,
|
||||
messageQueue.length.toLong - 1,
|
||||
messageQueue.last.getSystemProperties.getEnqueuedTime.getEpochSecond))
|
||||
}
|
||||
}
|
||||
|
||||
def addEventsToEventHubs[T, U](
|
||||
eventHubs: SimulatedEventHubs,
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])]): SimulatedEventHubs = {
|
||||
eventHubs: SimulatedEventHubs,
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])]): SimulatedEventHubs = {
|
||||
// Round-robin allocation of payloads to partitions
|
||||
val payloadPropertyStore = roundRobinAllocation(eventHubs.eventHubsNamedPartitions
|
||||
.map(x => x -> eventHubs.messageStore(x).length).toMap, eventPayloadsAndProperties)
|
||||
val payloadPropertyStore = roundRobinAllocation(
|
||||
eventHubs.eventHubsNamedPartitions
|
||||
.map(x => x -> eventHubs.messageStore(x).length)
|
||||
.toMap,
|
||||
eventPayloadsAndProperties)
|
||||
eventHubs.send(payloadPropertyStore)
|
||||
eventHubs
|
||||
}
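
// --- Illustrative sketch (editorial addition, not part of this change): appending a second
// wave of events to an already-simulated hub, which is what the AddEventHubsData stream
// action does later in this file. Payload values are arbitrary examples.
object AddEventsSketch {
  def appendMore(eventHubs: SimulatedEventHubs): SimulatedEventHubs = {
    val morePayloads: Seq[(Int, Seq[(String, Any)])] =
      (7 to 12).map(i => (i, Seq.empty[(String, Any)]))
    EventHubsTestUtilities.addEventsToEventHubs(eventHubs, morePayloads)
  }
}
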
|
||||
|
||||
private def roundRobinAllocation[T, U](
|
||||
eventHubsPartitionOffsetMap: Map[EventHubNameAndPartition, Int],
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])]):
|
||||
Map[EventHubNameAndPartition, Array[EventData]] = {
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])])
|
||||
: Map[EventHubNameAndPartition, Array[EventData]] = {
|
||||
val eventHubsPartitionList = eventHubsPartitionOffsetMap.keys.toSeq
|
||||
if (eventPayloadsAndProperties.isEmpty) {
|
||||
eventHubsPartitionList.map(x => x -> Seq.empty[EventData].toArray).toMap
|
||||
|
@ -92,45 +97,43 @@ private[spark] object EventHubsTestUtilities extends Logging {
} else {
eventPayloadsAndProperties.zipWithIndex
.map(x => (eventHubsPartitionList(x._2 % eventHubsPartitionList.length), x._1))
.groupBy(_._1).map { case (k, v) => (k, v.map(_._2)) }
.groupBy(_._1)
.map { case (k, v) => (k, v.map(_._2)) }
}.toSeq
}
eventAllocation.map {
case (eventHubNameAndPartition, payloadPropertyBag) =>
(eventHubNameAndPartition,
generateEventData(payloadPropertyBag, eventHubNameAndPartition.partitionId,
eventHubsPartitionOffsetMap(eventHubNameAndPartition)))
generateEventData(payloadPropertyBag,
eventHubNameAndPartition.partitionId,
eventHubsPartitionOffsetMap(eventHubNameAndPartition)))
}.toMap
}
}
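
// --- Illustrative sketch (editorial addition, not part of this change): the round-robin
// assignment performed by roundRobinAllocation above, reduced to plain collections so the
// index-modulo and groupBy steps are easy to see in isolation. Names here are invented.
object RoundRobinSketch {
  def assign[A](partitions: Seq[String], payloads: Seq[A]): Map[String, Seq[A]] =
    payloads.zipWithIndex
      .map { case (payload, i) => (partitions(i % partitions.length), payload) }
      .groupBy(_._1)
      .map { case (partition, pairs) => (partition, pairs.map(_._2)) }

  def main(args: Array[String]): Unit = {
    // Six payloads over two partitions land three per partition, preserving arrival order.
    println(assign(Seq("eh1-0", "eh1-1"), 1 to 6))
  }
}
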
|
||||
|
||||
private[spark] def generateEventData[T, U](
|
||||
payloadPropertyBag: Seq[(T, Seq[U])],
|
||||
partitionId: Int,
|
||||
startOffset: Int): Array[EventData] = {
|
||||
private[spark] def generateEventData[T, U](payloadPropertyBag: Seq[(T, Seq[U])],
|
||||
partitionId: Int,
|
||||
startOffset: Int): Array[EventData] = {
|
||||
var offsetSetInQueue = startOffset
|
||||
val eventDataArray = new Array[EventData](payloadPropertyBag.length)
|
||||
val publisherName = "Microsoft Corporation"
|
||||
var enqueueTime = 0L
|
||||
var eventIndex = 0
|
||||
for((payload, properties) <- payloadPropertyBag) {
|
||||
for ((payload, properties) <- payloadPropertyBag) {
|
||||
val eventData = new EventData(payload.toString.getBytes)
|
||||
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
|
||||
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME,
|
||||
offsetSetInQueue.toString)
|
||||
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, offsetSetInQueue.toString)
|
||||
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME,
|
||||
Long.box(offsetSetInQueue))
|
||||
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME,
|
||||
partitionId.toString)
|
||||
systemPropertiesMap.put(AmqpConstants.PUBLISHER_ANNOTATION_NAME,
|
||||
publisherName.toString)
|
||||
Long.box(offsetSetInQueue))
|
||||
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, partitionId.toString)
|
||||
systemPropertiesMap.put(AmqpConstants.PUBLISHER_ANNOTATION_NAME, publisherName.toString)
|
||||
systemPropertiesMap.put(AmqpConstants.ENQUEUED_TIME_UTC_ANNOTATION_NAME,
|
||||
Date.from(Instant.ofEpochSecond(enqueueTime)))
|
||||
Date.from(Instant.ofEpochSecond(enqueueTime)))
|
||||
val systemProperties = new SystemProperties(systemPropertiesMap)
|
||||
Whitebox.setInternalState(eventData, "systemProperties", systemProperties.asInstanceOf[Any])
|
||||
for (property <- properties) {
|
||||
property match {
|
||||
case p@Tuple2(_, _) =>
|
||||
case p @ Tuple2(_, _) =>
|
||||
eventData.getProperties.put(p._1.toString, p._2.asInstanceOf[AnyRef])
|
||||
case _ =>
|
||||
eventData.getProperties.put("output", property.asInstanceOf[AnyRef])
@ -22,23 +22,26 @@ import scala.collection.mutable.ListBuffer
|
|||
import com.microsoft.azure.eventhubs.EventData
|
||||
|
||||
import org.apache.spark.eventhubscommon.EventHubNameAndPartition
|
||||
import org.apache.spark.eventhubscommon.client.{EventHubClient, EventHubsClientWrapper, EventHubsOffsetTypes}
|
||||
import org.apache.spark.eventhubscommon.client.{
|
||||
Client,
|
||||
EventHubsClientWrapper,
|
||||
EventHubsOffsetTypes
|
||||
}
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.streaming.StreamingContext
|
||||
|
||||
class SimulatedEventHubs(
|
||||
eventHubsNamespace: String,
|
||||
initialData: Map[EventHubNameAndPartition, Array[EventData]]) extends Serializable {
|
||||
class SimulatedEventHubs(eventHubsNamespace: String,
|
||||
initialData: Map[EventHubNameAndPartition, Array[EventData]])
|
||||
extends Serializable {
|
||||
|
||||
assert(initialData != null)
|
||||
|
||||
var messageStore: Map[EventHubNameAndPartition, Array[EventData]] = initialData
|
||||
val eventHubsNamedPartitions: Seq[EventHubNameAndPartition] = initialData.keys.toSeq
|
||||
|
||||
def searchWithTime(
|
||||
eventHubsNamedPartition: EventHubNameAndPartition,
|
||||
enqueueTime: Long,
|
||||
eventCount: Int): List[EventData] = {
|
||||
def searchWithTime(eventHubsNamedPartition: EventHubNameAndPartition,
|
||||
enqueueTime: Long,
|
||||
eventCount: Int): List[EventData] = {
|
||||
val resultData = new ListBuffer[EventData]
|
||||
for (msg <- messageStore(eventHubsNamedPartition)) {
|
||||
if (resultData.length >= eventCount) {
|
||||
|
@ -51,8 +54,9 @@ class SimulatedEventHubs(
|
|||
resultData.toList
|
||||
}
|
||||
|
||||
def search(eventHubsNamedPartition: EventHubNameAndPartition, eventOffset: Int, eventCount: Int):
|
||||
List[EventData] = {
|
||||
def search(eventHubsNamedPartition: EventHubNameAndPartition,
|
||||
eventOffset: Int,
|
||||
eventCount: Int): List[EventData] = {
|
||||
val resultData = new ListBuffer[EventData]
|
||||
for (i <- 0 until eventCount) {
|
||||
// as in Event Hubs, the given offset is exclusive
|
||||
|
@ -66,43 +70,43 @@ class SimulatedEventHubs(

def send(newData: Map[EventHubNameAndPartition, Array[EventData]]): Unit = {
val combinedData: Map[EventHubNameAndPartition, Array[EventData]] =
(messageStore.toSeq ++ newData.toSeq).groupBy(_._1)
.map{case (k, v) => (k, v.flatMap(_._2).toArray)}
(messageStore.toSeq ++ newData.toSeq)
.groupBy(_._1)
.map { case (k, v) => (k, v.flatMap(_._2).toArray) }
messageStore = combinedData
}
}
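
// --- Illustrative sketch (editorial addition, not part of this change): the merge performed
// by send above, i.e. concatenating per-partition arrays from two maps that may share keys.
// Int payloads stand in for EventData.
object MergeByKeySketch {
  def merge[K](a: Map[K, Array[Int]], b: Map[K, Array[Int]]): Map[K, Array[Int]] =
    (a.toSeq ++ b.toSeq)
      .groupBy(_._1)
      .map { case (k, v) => (k, v.flatMap(_._2).toArray) }

  def main(args: Array[String]): Unit = {
    val merged = merge(Map("p0" -> Array(1, 2)), Map("p0" -> Array(3), "p1" -> Array(4)))
    // Expected: p0 -> 1,2,3 and p1 -> 4
    merged.foreach { case (k, v) => println(s"$k -> ${v.mkString(",")}") }
  }
}
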
|
||||
|
||||
class TestEventHubsReceiver(
|
||||
eventHubParameters: Map[String, String],
|
||||
eventHubs: SimulatedEventHubs,
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
offsetType: EventHubsOffsetType)
|
||||
extends EventHubsClientWrapper {
|
||||
|
||||
val eventHubName = eventHubParameters("eventhubs.name")
|
||||
class TestEventHubsReceiver(ehParams: Map[String, String],
|
||||
eventHubs: SimulatedEventHubs,
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
offsetType: EventHubsOffsetType)
|
||||
extends EventHubsClientWrapper(ehParams) {
|
||||
|
||||
override def receive(expectedEventNum: Int): Iterable[EventData] = {
|
||||
val eventHubName = eventHubParameters("eventhubs.name")
|
||||
val eventHubName = ehParams("eventhubs.name")
|
||||
if (offsetType != EventHubsOffsetTypes.InputTimeOffset) {
|
||||
eventHubs.search(EventHubNameAndPartition(eventHubName, partitionId), startOffset.toInt,
|
||||
expectedEventNum)
|
||||
eventHubs.search(EventHubNameAndPartition(eventHubName, partitionId),
|
||||
startOffset.toInt,
|
||||
expectedEventNum)
|
||||
} else {
|
||||
eventHubs.searchWithTime(EventHubNameAndPartition(eventHubName, partitionId),
|
||||
eventHubParameters("eventhubs.filter.enqueuetime").toLong, expectedEventNum)
|
||||
ehParams("eventhubs.filter.enqueuetime").toLong,
|
||||
expectedEventNum)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
class SimulatedEventHubsRestClient(
|
||||
eventHubs: SimulatedEventHubs) extends EventHubClient {
|
||||
class SimulatedEventHubsRestClient(eventHubs: SimulatedEventHubs) extends Client {
|
||||
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
|
||||
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
Some(eventHubs.messageStore
|
||||
.map(x => x._1 -> (x._2.length.toLong - 1, x._2.length.toLong - 1)))
|
||||
override def endPointOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
|
||||
List())
|
||||
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
Some(
|
||||
eventHubs.messageStore
|
||||
.map(x => x._1 -> (x._2.length.toLong - 1, x._2.length.toLong - 1)))
|
||||
}
|
||||
|
||||
override def close(): Unit = {}
|
||||
|
@ -114,13 +118,16 @@ class SimulatedEventHubsRestClient(
|
|||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map{
|
||||
ehNameAndPartition =>
|
||||
(ehNameAndPartition,
|
||||
eventHubs.messageStore(ehNameAndPartition).last.getSystemProperties.getEnqueuedTime.
|
||||
toEpochMilli)
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
|
||||
(ehNameAndPartition,
|
||||
eventHubs
|
||||
.messageStore(ehNameAndPartition)
|
||||
.last
|
||||
.getSystemProperties
|
||||
.getEnqueuedTime
|
||||
.toEpochMilli)
|
||||
}.toMap)
|
||||
}
|
||||
|
||||
|
@ -129,26 +136,26 @@ class SimulatedEventHubsRestClient(
|
|||
*
|
||||
* @return a map from eventhubName-partition to its start sequence number
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map {
|
||||
ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)}.toMap)
|
||||
override def startSeqOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)
|
||||
}.toMap)
|
||||
}
|
||||
}
|
||||
|
||||
class TestRestEventHubClient(
|
||||
latestRecords: Map[EventHubNameAndPartition, (Long, Long, Long)])
|
||||
extends EventHubClient {
|
||||
class TestRestEventHubClient(latestRecords: Map[EventHubNameAndPartition, (Long, Long, Long)])
|
||||
extends Client {
|
||||
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
|
||||
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
Some(latestRecords.map{case (ehNameAndPartition, (offset, seq, _)) =>
|
||||
(ehNameAndPartition, (offset, seq))})
|
||||
override def endPointOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
|
||||
List())
|
||||
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
Some(latestRecords.map {
|
||||
case (ehNameAndPartition, (offset, seq, _)) =>
|
||||
(ehNameAndPartition, (offset, seq))
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -158,11 +165,10 @@ class TestRestEventHubClient(
|
|||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map{
|
||||
ehNameAndPartition =>
|
||||
(ehNameAndPartition, latestRecords(ehNameAndPartition)._3)
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
|
||||
(ehNameAndPartition, latestRecords(ehNameAndPartition)._3)
|
||||
}.toMap)
|
||||
}
|
||||
|
||||
|
@ -173,22 +179,21 @@ class TestRestEventHubClient(
|
|||
*
|
||||
* @return a map from eventhubName-partition to its start sequence number
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map {
|
||||
ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)}.toMap)
|
||||
override def startSeqOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)
|
||||
}.toMap)
|
||||
}
|
||||
}
|
||||
|
||||
class FragileEventHubClient private extends EventHubClient {
|
||||
class FragileEventHubClient private extends Client {
|
||||
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
|
||||
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
override def endPointOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
|
||||
List())
|
||||
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
import FragileEventHubClient._
|
||||
|
||||
callIndex += 1
|
||||
|
@ -208,8 +213,8 @@ class FragileEventHubClient private extends EventHubClient {
|
|||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map((_, Long.MaxValue)).toMap)
|
||||
}
|
||||
|
||||
|
@ -220,13 +225,12 @@ class FragileEventHubClient private extends EventHubClient {
|
|||
*
|
||||
* @return a map from eventhubName-partition to its start sequence number
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map {
|
||||
ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)}.toMap)
|
||||
override def startSeqOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)
|
||||
}.toMap)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -238,28 +242,27 @@ object FragileEventHubClient {
|
|||
var lastBatchWhenEndpointCrashed = 0
|
||||
var latestRecords: Map[EventHubNameAndPartition, (Long, Long)] = Map()
|
||||
|
||||
def getInstance(eventHubNameSpace: String, eventhubsParams: Map[String, Map[String, String]]):
|
||||
FragileEventHubClient = {
|
||||
def getInstance(eventHubNameSpace: String,
|
||||
eventhubsParams: Map[String, Map[String, String]]): FragileEventHubClient = {
|
||||
new FragileEventHubClient()
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class FluctuatedEventHubClient(
|
||||
ssc: StreamingContext,
|
||||
messagesBeforeEmpty: Long,
|
||||
numBatchesBeforeNewData: Int,
|
||||
latestRecords: Map[EventHubNameAndPartition, (Long, Long)]) extends EventHubClient {
|
||||
class FluctuatedEventHubClient(ssc: StreamingContext,
|
||||
messagesBeforeEmpty: Long,
|
||||
numBatchesBeforeNewData: Int,
|
||||
latestRecords: Map[EventHubNameAndPartition, (Long, Long)])
|
||||
extends Client {
|
||||
|
||||
private var callIndex = -1
|
||||
|
||||
override def endPointOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] = List()):
|
||||
Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
override def endPointOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition] =
|
||||
List())
|
||||
: Option[Predef.Map[EventHubNameAndPartition, (Long, Long)]] = {
|
||||
callIndex += 1
|
||||
if (callIndex < numBatchesBeforeNewData) {
|
||||
Some(latestRecords.map{
|
||||
Some(latestRecords.map {
|
||||
case (ehNameAndPartition, _) =>
|
||||
(ehNameAndPartition, (messagesBeforeEmpty - 1, messagesBeforeEmpty - 1))
|
||||
})
|
||||
|
@ -277,8 +280,8 @@ class FluctuatedEventHubClient(
|
|||
*/
|
||||
override def lastEnqueueTimeOfPartitions(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map((_, Long.MaxValue)).toMap)
|
||||
}
|
||||
|
||||
|
@ -287,13 +290,11 @@ class FluctuatedEventHubClient(
|
|||
*
|
||||
* @return a map from eventhubName-partition to its start sequence number
|
||||
*/
|
||||
override def startSeqOfPartition(
|
||||
retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition]):
|
||||
Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map {
|
||||
ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)}.toMap)
|
||||
override def startSeqOfPartition(retryIfFail: Boolean,
|
||||
targetEventHubNameAndPartitions: List[EventHubNameAndPartition])
|
||||
: Option[Map[EventHubNameAndPartition, Long]] = {
|
||||
Some(targetEventHubNameAndPartitions.map { ehNameAndPartition =>
|
||||
(ehNameAndPartition, -1L)
|
||||
}.toMap)
|
||||
}
|
||||
}
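
// --- Illustrative sketch (editorial addition, not part of this change): the call-counting
// behaviour that FluctuatedEventHubClient models above (report one value for the first n
// calls, another afterwards), reduced to a plain closure. All names here are invented.
object FluctuationSketch {
  def fluctuating[A](n: Int, early: A, late: A): () => A = {
    var callIndex = -1
    () => { callIndex += 1; if (callIndex < n) early else late }
  }

  def main(args: Array[String]): Unit = {
    val endpoint = fluctuating(2, (9L, 9L), (19L, 19L))
    // First two calls report the "no new data" endpoint, later calls the advanced one.
    println(Seq.fill(4)(endpoint()))
  }
}
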
@ -26,7 +26,7 @@ import org.apache.spark.sql.execution.streaming._
|
|||
trait StreamAction
|
||||
|
||||
case class EventHubsAddDataMemory[A](source: MemoryStream[A], data: Seq[A])
|
||||
extends EventHubsAddData {
|
||||
extends EventHubsAddData {
|
||||
override def toString: String = s"AddData to $source: ${data.mkString(",")}"
|
||||
|
||||
override def addData(query: Option[StreamExecution]): (Source, Offset) = {
|
||||
|
@ -45,6 +45,7 @@ object EventHubsAddData {
|
|||
|
||||
/** A trait that can be extended when testing a source. */
|
||||
trait EventHubsAddData extends StreamAction with Serializable {
|
||||
|
||||
/**
|
||||
* Called to add data to a source. It should find the source to add data to from
* the active query, and then return the source object the data was added to, as well as the
|
||||
|
@ -53,11 +54,11 @@ trait EventHubsAddData extends StreamAction with Serializable {
|
|||
def addData(query: Option[StreamExecution]): (Source, Offset)
|
||||
}
|
||||
|
||||
case class AddEventHubsData[T: ClassTag, U: ClassTag](
|
||||
eventHubsParameters: Map[String, String],
|
||||
highestBatchId: Long = 0,
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] = Seq.empty[(T, Seq[U])])
|
||||
extends EventHubsAddData {
|
||||
case class AddEventHubsData[T: ClassTag, U: ClassTag](eventHubsParameters: Map[String, String],
|
||||
highestBatchId: Long = 0,
|
||||
eventPayloadsAndProperties: Seq[(T, Seq[U])] =
|
||||
Seq.empty[(T, Seq[U])])
|
||||
extends EventHubsAddData {
|
||||
|
||||
override def addData(query: Option[StreamExecution]): (Source, Offset) = {
|
||||
val sources = query.get.logicalPlan.collect {
|
||||
|
@ -76,8 +77,9 @@ case class AddEventHubsData[T: ClassTag, U: ClassTag](
|
|||
val eventHubs = EventHubsTestUtilities.getOrSimulateEventHubs(eventHubsParameters)
|
||||
EventHubsTestUtilities.addEventsToEventHubs(eventHubs, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val targetOffsetPerPartition = highestOffsetPerPartition.map{
|
||||
case (ehNameAndPartition, (offset, _, _)) => (ehNameAndPartition, offset)}
|
||||
val targetOffsetPerPartition = highestOffsetPerPartition.map {
|
||||
case (ehNameAndPartition, (offset, _, _)) => (ehNameAndPartition, offset)
|
||||
}
|
||||
val eventHubsBatchRecord = EventHubsBatchRecord(highestBatchId, targetOffsetPerPartition)
|
||||
(eventHubsSource, eventHubsBatchRecord)
|
||||
}
|
||||
|
|
|
@ -23,10 +23,11 @@ import org.apache.spark.sql.test.SharedSQLContext

class EventHubsOffsetSuite extends OffsetSuite with SharedSQLContext {

val eventHubsBatchRecord = EventHubsBatchRecord(0L,
val eventHubsBatchRecord = EventHubsBatchRecord(
0L,
Map(EventHubNameAndPartition("eventhub", 0) -> 0L,
EventHubNameAndPartition("eventhub", 1) -> 100L,
EventHubNameAndPartition("eventhub", 2) -> 200L))
EventHubNameAndPartition("eventhub", 1) -> 100L,
EventHubNameAndPartition("eventhub", 2) -> 200L))

test("basic serialization and deserialization of Eventhubs batch record") {

@ -36,4 +37,3 @@ class EventHubsOffsetSuite extends OffsetSuite with SharedSQLContext {
assert(deserializedEventhubsBatchRecord.targetSeqNums === eventHubsBatchRecord.targetSeqNums)
}
}
@ -24,21 +24,20 @@ import org.apache.spark.eventhubscommon.EventHubNameAndPartition
|
|||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.eventhubscommon.utils._
|
||||
import org.apache.spark.sql.{Dataset, SparkSession}
|
||||
import org.apache.spark.sql.streaming.{OutputMode, ProcessingTime}
|
||||
import org.apache.spark.sql.types.{LongType, TimestampType}
|
||||
import org.apache.spark.sql.{ Dataset, SparkSession }
|
||||
import org.apache.spark.sql.streaming.{ OutputMode, ProcessingTime }
|
||||
import org.apache.spark.sql.types.{ LongType, TimestampType }
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
class EventHubsSourceSuite extends EventHubsStreamTest {
|
||||
|
||||
private def buildEventHubsParamters(
|
||||
namespace: String,
|
||||
name: String,
|
||||
partitionCount: Int,
|
||||
maxRate: Int,
|
||||
containsProperties: Boolean = false,
|
||||
userDefinedKeys: Option[String] = None,
|
||||
enqueueTime: Option[Long] = None): Map[String, String] = {
|
||||
private def buildEventHubsParamters(namespace: String,
|
||||
name: String,
|
||||
partitionCount: Int,
|
||||
maxRate: Int,
|
||||
containsProperties: Boolean = false,
|
||||
userDefinedKeys: Option[String] = None,
|
||||
enqueueTime: Option[Long] = None): Map[String, String] = {
|
||||
Map[String, String](
|
||||
"eventhubs.policyname" -> "policyName",
|
||||
"eventhubs.policykey" -> "policyKey",
|
||||
|
@ -70,18 +69,29 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
|
||||
test("Verify expected offsets are correct when rate is less than the available data") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 2)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(6).map{case (body, properties) =>
|
||||
(body.asInstanceOf[Int], properties)}
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(6).map {
|
||||
case (body, properties) =>
|
||||
(body.asInstanceOf[Int], properties)
|
||||
}
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
assert(offset.batchId == 0)
|
||||
offset.targetSeqNums.values.foreach(x => assert(x == 1))
|
||||
|
@ -90,35 +100,54 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
test("Verify expected offsets are correct when rate is more than the available data") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(6)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
offsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
EventHubsOffsetTypes.PreviousCheckpoint),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
offsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
EventHubsOffsetTypes.PreviousCheckpoint),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
assert(offset.batchId == 0)
|
||||
offset.targetSeqNums.values.foreach(x => assert(x == 2))
|
||||
}
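
// --- Illustrative sketch (editorial addition, not part of this change): the two factory
// arguments the tests above pass to EventHubsSource, written out as type aliases. The
// parameter types are inferred from the lambdas in this suite, the alias names are invented,
// and the required imports (Client, EventHubsClientWrapper) are assumed to be in scope.
object SourceFactorySketch {
  type ReceiverFactory =
    (Map[String, String], Int, Long, EventHubsOffsetType, Int) => EventHubsClientWrapper
  type RestClientFactory = (String, Map[String, Map[String, String]]) => Client

  // A receiver factory backed by the in-memory SimulatedEventHubs used throughout the suite.
  def testReceiverFactory(eventHubs: SimulatedEventHubs): ReceiverFactory =
    (params: Map[String, String],
     partitionId: Int,
     startOffset: Long,
     offsetType: EventHubsOffsetType,
     _: Int) =>
      new TestEventHubsReceiver(params, eventHubs, partitionId, startOffset, offsetType)
}
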
|
||||
|
||||
test("Verify expected offsets are correct when in subsequent fetch when rate is less than the" +
|
||||
" available data") {
|
||||
test(
|
||||
"Verify expected offsets are correct when in subsequent fetch when rate is less than the" +
|
||||
" available data") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(10)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
// First batch
|
||||
var offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
var dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
|
@ -138,16 +167,25 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
test("Verify expected dataframe size is correct when the rate is less than the available data") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 2)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(6)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
|
@ -158,16 +196,25 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
test("Verify expected dataframe size is correct when the rate is more than the available data") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(6)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
|
@ -175,20 +222,30 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
assert(dataFrame.select("body").count == 6)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe size is correct in subsequent fetch when the rate is" +
|
||||
" less than the available data") {
|
||||
test(
|
||||
"Verify expected dataframe size is correct in subsequent fetch when the rate is" +
|
||||
" less than the available data") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(10)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
// First batch
|
||||
var offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
var dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
|
@ -203,10 +260,11 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
assert(dataFrame.select("body").count == 4)
|
||||
}
|
||||
|
||||
test("Verify all user-defined keys show up in dataframe schema if not specify" +
|
||||
" userDefinedKeys") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10,
|
||||
containsProperties = true)
|
||||
test(
|
||||
"Verify all user-defined keys show up in dataframe schema if not specify" +
|
||||
" userDefinedKeys") {
|
||||
val eventHubsParameters =
|
||||
buildEventHubsParamters("ns1", "eh1", 2, 10, containsProperties = true)
|
||||
val eventPayloadsAndProperties = Seq(
|
||||
1 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
|
||||
3 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
|
||||
|
@ -215,29 +273,50 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
9 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
|
||||
11 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1)
|
||||
)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
eventHubsSource.commit(offset)
|
||||
val properties = dataFrame.select("properties").rdd.map(r => r.get(0)
|
||||
.asInstanceOf[Map[String, String]])
|
||||
assert(properties.collect().forall(propertyMap => propertyMap.keySet == Set("creationTime",
|
||||
"otherUserDefinedKey")))
|
||||
val properties = dataFrame
|
||||
.select("properties")
|
||||
.rdd
|
||||
.map(
|
||||
r =>
|
||||
r.get(0)
|
||||
.asInstanceOf[Map[String, String]])
|
||||
assert(
|
||||
properties
|
||||
.collect()
|
||||
.forall(propertyMap => propertyMap.keySet == Set("creationTime", "otherUserDefinedKey")))
|
||||
}
|
||||
|
||||
test("Verify user-defined keys show up in dataframe schema if specify userDefinedKey") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10,
|
||||
containsProperties = true, userDefinedKeys = Some("otherUserDefinedKey,"))
|
||||
val eventHubsParameters =
|
||||
buildEventHubsParamters("ns1",
|
||||
"eh1",
|
||||
2,
|
||||
10,
|
||||
containsProperties = true,
|
||||
userDefinedKeys = Some("otherUserDefinedKey,"))
|
||||
val eventPayloadsAndProperties = Seq(
|
||||
1 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
|
||||
3 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
|
||||
|
@ -246,16 +325,25 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
9 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1),
|
||||
11 -> Seq("creationTime" -> Calendar.getInstance().getTime, "otherUserDefinedKey" -> 1)
|
||||
)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
ehOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
ehOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
ehOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
ehOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
|
@ -265,90 +353,118 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
}
|
||||
|
||||
test("Verify null references in user-defined keys are handled correctly") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10,
|
||||
containsProperties = true)
|
||||
val eventHubsParameters =
|
||||
buildEventHubsParamters("ns1", "eh1", 2, 10, containsProperties = true)
|
||||
val eventPayloadsAndProperties = generateKeyedDataWithNullValue(6)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
eventHubsSource.commit(offset)
|
||||
val sparkSession = spark
|
||||
import sparkSession.implicits._
|
||||
val bodyDataFrame = dataFrame.select("body")
|
||||
val bodyDataFrame = dataFrame
|
||||
.select("body")
|
||||
.map(r => new String(r.getAs[Array[Byte]](0), "UTF-8"))
|
||||
val inputArray = eventPayloadsAndProperties.map(x => x._1).toArray
|
||||
val outputArray = bodyDataFrame.collect()
|
||||
assert(outputArray.sorted.corresponds(inputArray.sorted) {_ == _})
|
||||
assert(outputArray.sorted.corresponds(inputArray.sorted) { _ == _ })
|
||||
}
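
// --- Illustrative sketch (editorial addition, not part of this change): Seq.corresponds, as
// used in the assertions above, checks a pairwise predicate over two equally long sequences;
// sorting both sides first turns it into an order-insensitive equality check.
object CorrespondsSketch {
  def main(args: Array[String]): Unit = {
    val input = Array(3, 1, 2)
    val output = Array(2, 3, 1)
    println(output.sorted.corresponds(input.sorted) { _ == _ }) // prints true
  }
}
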
|
||||
|
||||
test("Verify dataframe body is correct for String type") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
|
||||
val eventPayloadsAndProperties = generateStringKeyedData(6)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
eventHubsSource.commit(offset)
|
||||
val sparkSession = spark
|
||||
import sparkSession.implicits._
|
||||
val bodyDataFrame = dataFrame.select("body")
|
||||
val bodyDataFrame = dataFrame
|
||||
.select("body")
|
||||
.map(r => new String(r.getAs[Array[Byte]](0), "UTF-8"))
|
||||
val inputArray = eventPayloadsAndProperties.map(x => x._1).toArray
|
||||
val outputArray = bodyDataFrame.collect()
|
||||
assert(outputArray.sorted.corresponds(inputArray.sorted) {_ == _})
|
||||
assert(outputArray.sorted.corresponds(inputArray.sorted) { _ == _ })
|
||||
}
|
||||
|
||||
test("Verify dataframe body is correct for Int type") {
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 10)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(6)
|
||||
val eventHubs = EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties)
|
||||
val eventHubs =
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
|
||||
val highestOffsetPerPartition = EventHubsTestUtilities.getHighestOffsetPerPartition(eventHubs)
|
||||
val eventHubsSource = new EventHubsSource(spark.sqlContext, eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams, eventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
val eventHubsSource = new EventHubsSource(
|
||||
spark.sqlContext,
|
||||
eventHubsParameters,
|
||||
(eventHubsParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParams,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(highestOffsetPerPartition))
|
||||
new TestRestEventHubClient(highestOffsetPerPartition)
|
||||
)
|
||||
val offset = eventHubsSource.getOffset.get.asInstanceOf[EventHubsBatchRecord]
|
||||
val dataFrame = eventHubsSource.getBatch(None, offset)
|
||||
assert(dataFrame.schema == eventHubsSource.schema)
|
||||
eventHubsSource.commit(offset)
|
||||
val sparkSession = spark
|
||||
import sparkSession.implicits._
|
||||
val bodyDataFrame = dataFrame.select("body")
|
||||
val bodyDataFrame = dataFrame
|
||||
.select("body")
|
||||
.map(r => new String(r.getAs[Array[Byte]](0), "UTF-8").toInt)
|
||||
val inputArray = eventPayloadsAndProperties.map(x => x._1).toArray
|
||||
val outputArray = bodyDataFrame.collect()
|
||||
assert(outputArray.sorted.corresponds(inputArray.sorted) {_ == _})
|
||||
assert(outputArray.sorted.corresponds(inputArray.sorted) { _ == _ })
|
||||
}
|
||||
|
||||
private def generateInputQuery(
|
||||
eventHubsParams: Map[String, String],
|
||||
sparkSession: SparkSession): Dataset[_] = {
|
||||
private def generateInputQuery(eventHubsParams: Map[String, String],
|
||||
sparkSession: SparkSession): Dataset[_] = {
|
||||
import sparkSession.implicits._
|
||||
val dataSource = spark
|
||||
.readStream
|
||||
val dataSource = spark.readStream
|
||||
.format("eventhubs")
|
||||
.options(eventHubsParams)
|
||||
.load()
|
||||
|
@ -391,8 +507,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
)
|
||||
}
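
// --- Illustrative sketch (editorial addition, not part of this change): what the
// generateInputQuery helper above builds, extended into a complete start-to-stop pipeline.
// Only the "eventhubs" source format and options(...) call come from the suite; the memory
// sink, query name, and output mode are assumptions for illustration.
object ReadStreamSketch {
  def run(spark: SparkSession, eventHubsParams: Map[String, String]): Unit = {
    val source = spark.readStream
      .format("eventhubs")
      .options(eventHubsParams)
      .load()
    val query = source.writeStream
      .format("memory")
      .queryName("eventhubs_sketch")
      .outputMode("append")
      .start()
    query.processAllAvailable()
    query.stop()
  }
}
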
|
||||
|
||||
test("Verify expected dataframe can be retrieved after data added to source in excess" +
|
||||
" of the rate") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved after data added to source in excess" +
|
||||
" of the rate") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(15)
|
||||
|
@ -411,8 +528,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved when more data is added to" +
|
||||
" source after stream has started") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved when more data is added to" +
|
||||
" source after stream has started") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
|
||||
val eventPayloadsAndProperties1 = generateIntKeyedData(6)
|
||||
|
@ -426,19 +544,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
|
||||
AddEventHubsData(eventHubsParameters),
|
||||
CheckAnswer(1, 3, 5, 2, 4, 6),
|
||||
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet.toLong,
|
||||
eventPayloadsAndProperties2),
|
||||
AddEventHubsData(eventHubsParameters,
|
||||
highestBatchId.incrementAndGet.toLong,
|
||||
eventPayloadsAndProperties2),
|
||||
AdvanceManualClock(10),
|
||||
CheckAnswer(1, 3, 5, 2, 4, 6, 3, 5, 7, 4, 6, 8),
|
||||
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet.toLong,
|
||||
eventPayloadsAndProperties3),
|
||||
AddEventHubsData(eventHubsParameters,
|
||||
highestBatchId.incrementAndGet.toLong,
|
||||
eventPayloadsAndProperties3),
|
||||
AdvanceManualClock(10),
|
||||
CheckAnswer(1, 3, 5, 2, 4, 6, 3, 5, 7, 4, 6, 8, 4, 6, 8, 5, 7, 9)
|
||||
)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved with data added to source after the stream" +
|
||||
" has started") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved with data added to source after the stream" +
|
||||
" has started") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
|
||||
val eventPayloadsAndProperties1 = generateIntKeyedData(6)
|
||||
|
@ -450,19 +571,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
testStream(sourceQuery)(
|
||||
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
|
||||
CheckAnswer(),
|
||||
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet().toLong,
|
||||
eventPayloadsAndProperties1),
|
||||
AddEventHubsData(eventHubsParameters,
|
||||
highestBatchId.incrementAndGet().toLong,
|
||||
eventPayloadsAndProperties1),
|
||||
AdvanceManualClock(10),
|
||||
CheckAnswer(1, 3, 5, 2, 4, 6),
|
||||
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet().toLong,
|
||||
eventPayloadsAndProperties2),
|
||||
AddEventHubsData(eventHubsParameters,
|
||||
highestBatchId.incrementAndGet().toLong,
|
||||
eventPayloadsAndProperties2),
|
||||
AdvanceManualClock(10),
|
||||
CheckAnswer(1, 3, 5, 2, 4, 6, 3, 5, 7, 4, 6, 8)
|
||||
)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved from different" +
|
||||
" sources with same event hubs on different streams on different queries at same rate") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved from different" +
|
||||
" sources with same event hubs on different streams on different queries at same rate") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(1000)
|
||||
|
@ -481,8 +605,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved from different " +
|
||||
"sources with same event hubs on different streams on different queries at different rates") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved from different " +
|
||||
"sources with same event hubs on different streams on different queries at different rates") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters1 = buildEventHubsParamters("ns1", "eh1", 2, 30)
|
||||
val eventHubsParameters2 = buildEventHubsParamters("ns1", "eh1", 2, 10)
|
||||
|
@ -502,8 +627,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved from same " +
|
||||
"source on different queries") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved from same " +
|
||||
"source on different queries") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters1 = buildEventHubsParamters("ns1", "eh1", 2, 30)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(1000)
|
||||
|
@ -521,49 +647,53 @@ class EventHubsSourceSuite extends EventHubsStreamTest {
|
|||
)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved when the stream is stopped before the last" +
|
||||
" batch's offset is committed") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved when the stream is stopped before the last" +
|
||||
" batch's offset is committed") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(1000)
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties.take(30 * 10 * 2))
|
||||
eventPayloadsAndProperties.take(30 * 10 * 2))
|
||||
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
|
||||
val manualClock = new StreamManualClock
|
||||
val firstBatch = Seq(
|
||||
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
|
||||
AddEventHubsData(eventHubsParameters, 9))
|
||||
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
|
||||
AddEventHubsData(eventHubsParameters, 9))
|
||||
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
|
||||
val secondBatch = Seq(
|
||||
CheckAnswer(1 to 600: _*),
|
||||
StopStream(recoverStreamId = true, commitPartialOffset = true, partialType = "delete"),
|
||||
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
|
||||
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
|
||||
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
|
||||
StartStream(trigger = ProcessingTime(10),
|
||||
triggerClock = manualClock,
|
||||
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
|
||||
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
|
||||
)
|
||||
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
|
||||
val thirdBatch = Seq(CheckAnswer(541 to 1000: _*))
|
||||
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
|
||||
}
|
||||
|
||||
test("Verify expected dataframe can be retrieved when the stream is stopped after the last" +
|
||||
" batch's offset is committed") {
|
||||
test(
|
||||
"Verify expected dataframe can be retrieved when the stream is stopped after the last" +
|
||||
" batch's offset is committed") {
|
||||
import testImplicits._
|
||||
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
|
||||
val eventPayloadsAndProperties = generateIntKeyedData(1000)
|
||||
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
|
||||
eventPayloadsAndProperties.take(30 * 10 * 2))
|
||||
eventPayloadsAndProperties.take(30 * 10 * 2))
|
||||
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
|
||||
val manualClock = new StreamManualClock
|
||||
val firstBatch = Seq(
|
||||
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
|
||||
AddEventHubsData(eventHubsParameters, 9))
|
||||
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
|
||||
AddEventHubsData(eventHubsParameters, 9))
|
||||
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
|
||||
val secondBatch = Seq(
|
||||
CheckAnswer(1 to 600: _*),
|
||||
StopStream(recoverStreamId = true, commitOffset = true),
|
||||
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
|
||||
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
|
||||
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
|
||||
StartStream(trigger = ProcessingTime(10),
|
||||
triggerClock = manualClock,
|
||||
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
|
||||
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
|
||||
)
|
||||
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
|
||||
val thirdBatch = Seq(CheckAnswer(601 to 1000: _*))
|
||||
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
|
||||
|
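A quick aside on where the expected row counts in the two tests above come from: a back-of-the-envelope sketch, assuming the "2, 30" arguments to buildEventHubsParamters mean two partitions capped at roughly 30 events per partition per trigger (the object and value names below are made up for illustration, not part of the patch).

object ExpectedCountSketch extends App {
  val partitions = 2
  val maxRatePerPartitionPerTrigger = 30 // assumed reading of the "30" parameter above
  val eventsPerTrigger = partitions * maxRatePerPartitionPerTrigger // 60
  val triggersBeforeStop = 10 // StartStream plus nine AdvanceManualClock(10) actions
  val processedBeforeStop = eventsPerTrigger * triggersBeforeStop // 600, hence CheckAnswer(1 to 600)
  // If the last batch's offset was committed before the stop, the restarted query resumes at 601
  // (CheckAnswer(601 to 1000)); if it was not, the final 60-event batch is replayed, so output
  // resumes at 600 - 60 + 1 = 541 (CheckAnswer(541 to 1000)).
  println(processedBeforeStop)                        // 600
  println(processedBeforeStop - eventsPerTrigger + 1) // 541
}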
@@ -574,20 +704,20 @@ class EventHubsSourceSuite extends EventHubsStreamTest
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true,
partialType = "partial"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StopStream(recoverStreamId = true, commitPartialOffset = true, partialType = "partial"),
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
// in structured streaming, even if metadata is not committed, we will be able to skip the
// processed data, since we will pinpoint the progress file with the recovered batch id

@@ -595,26 +725,27 @@ class EventHubsSourceSuite extends EventHubsStreamTest
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
}

test("Verify expected dataframe can be retrieved when upgrading from a directory without" +
" metadata") {
test(
"Verify expected dataframe can be retrieved when upgrading from a directory without" +
" metadata") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true,
partialType = "nometadata"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StopStream(recoverStreamId = true, commitPartialOffset = true, partialType = "nometadata"),
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
// in structured streaming, even if metadata is not committed, we will be able to skip the
// processed data, since we will pinpoint the progress file with the recovered batch id
@@ -627,20 +758,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 30)
val eventPayloadsAndProperties = generateIntKeyedData(1000)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties.take(30 * 10 * 2))
eventPayloadsAndProperties.take(30 * 10 * 2))
val sourceQuery = generateInputQuery(eventHubsParameters, spark)
val manualClock = new StreamManualClock
val firstBatch = Seq(
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val firstBatch = Seq(StartStream(trigger = ProcessingTime(10), triggerClock = manualClock),
AddEventHubsData(eventHubsParameters, 9))
val clockMove = Array.fill(9)(AdvanceManualClock(10)).toSeq
val secondBatch = Seq(
CheckAnswer(1 to 600: _*),
StopStream(recoverStreamId = true, commitPartialOffset = true,
partialType = "deletemetadata"),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400)))
StopStream(recoverStreamId = true,
commitPartialOffset = true,
partialType = "deletemetadata"),
StartStream(trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs = Map("eventhubs.test.newSink" -> "true")),
AddEventHubsData(eventHubsParameters, 17, eventPayloadsAndProperties.takeRight(400))
)
val clockMove2 = Array.fill(8)(AdvanceManualClock(10)).toSeq
// in structured streaming, even if metadata is not committed, we will be able to skip the
// processed data, since we will pinpoint the progress file with the recovered batch id

@@ -648,8 +781,9 @@ class EventHubsSourceSuite extends EventHubsStreamTest
testStream(sourceQuery)(firstBatch ++ clockMove ++ secondBatch ++ clockMove2 ++ thirdBatch: _*)
}

test("Verify expected dataframe is retrieved from starting offset" +
" on different streams on the same query") {
test(
"Verify expected dataframe is retrieved from starting offset" +
" on different streams on the same query") {
import testImplicits._
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3)
val eventPayloadsAndProperties1 = generateIntKeyedData(6)
@@ -664,15 +798,19 @@ class EventHubsSourceSuite extends EventHubsStreamTest
AdvanceManualClock(10),
CheckAnswer(1, 2, 3, 4, 5, 6),
StopStream(),
StartStream(trigger = ProcessingTime(10), triggerClock = manualClock,
additionalConfs = Map(
"eventhubs.test.checkpointLocation" ->
s"${Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath}",
"eventhubs.test.newSink" -> "true")),
StartStream(
trigger = ProcessingTime(10),
triggerClock = manualClock,
additionalConfs =
Map("eventhubs.test.checkpointLocation" ->
s"${Utils.createTempDir(namePrefix = "streaming.metadata").getCanonicalPath}",
"eventhubs.test.newSink" -> "true")
),
AddEventHubsData(eventHubsParameters),
CheckAnswer(1, 2, 3, 4, 5, 6),
AddEventHubsData(eventHubsParameters, highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties2),
AddEventHubsData(eventHubsParameters,
highestBatchId.incrementAndGet().toLong,
eventPayloadsAndProperties2),
AdvanceManualClock(10),
AdvanceManualClock(10),
CheckAnswer(1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12)
@@ -685,28 +823,31 @@ class EventHubsSourceSuite extends EventHubsStreamTest
}

test("Verify expected dataframe is retrieved with windowing operation") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 40,
containsProperties = true, userDefinedKeys = Some("creationTime"))
val eventHubsParameters = buildEventHubsParamters("ns1",
"eh1",
2,
40,
containsProperties = true,
userDefinedKeys = Some("creationTime"))
val eventPayloadsAndProperties = {
for (time <- Range(0, 10))
yield testDataForWindowingOperation(100, time)
}.reduce((a, b) => a ++ b)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val sourceQuery = spark.readStream.format("eventhubs").options(eventHubsParameters).load()
import sourceQuery.sparkSession.implicits._
import org.apache.spark.sql.functions._
val windowedStream = sourceQuery.groupBy(
window(
$"creationTime".cast(TimestampType),
"3 second",
"1 second")).count().sort("window").select("count")
val windowedStream = sourceQuery
.groupBy(window($"creationTime".cast(TimestampType), "3 second", "1 second"))
.count()
.sort("window")
.select("count")
val manualClock = new StreamManualClock
val firstBatch = Seq(StartStream(trigger = ProcessingTime(1000), triggerClock = manualClock))
val clockMove = Array.fill(13)(AdvanceManualClock(1000)).toSeq
val secondBatch = Seq(
AddEventHubsData(eventHubsParameters, 12),
CheckAnswer(true, 100, 200, 300, 300, 300, 300, 300, 300, 300, 300, 200, 100))
val secondBatch =
Seq(AddEventHubsData(eventHubsParameters, 12),
CheckAnswer(true, 100, 200, 300, 300, 300, 300, 300, 300, 300, 300, 200, 100))
testStream(windowedStream, outputMode = OutputMode.Complete())(
firstBatch ++ clockMove ++ secondBatch: _*)
}
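The windowing test above uses Spark SQL's standard window function with a 3-second window sliding every second, so each event lands in three overlapping windows; with 100 events per second that is why the interior windows in the expected answer count 300 while the edges taper to 200 and 100. The standalone sketch below (not part of the patch; the column name and local session settings are just for illustration) reproduces that overlap on a small in-memory DataFrame.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window

object WindowOverlapSketch extends App {
  val spark = SparkSession.builder().master("local[2]").appName("window-overlap-sketch").getOrCreate()
  import spark.implicits._

  // One event per second at t = 0..9 (epoch seconds), standing in for creationTime.
  val events = (0 to 9).map(_.toLong).toDF("creationTime")
  val counts = events
    .groupBy(window($"creationTime".cast("timestamp"), "3 seconds", "1 second"))
    .count()
    .sort("window")

  // Interior windows cover three of the ten timestamps each, so they count 3;
  // the windows at either edge cover fewer timestamps and taper off to 2 and 1.
  counts.show(truncate = false)
  spark.stop()
}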
@@ -728,25 +869,29 @@ class EventHubsSourceSuite extends EventHubsStreamTest
}

test("Verify expected dataframe is retrieved with watermarks") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 1, 1,
containsProperties = true, userDefinedKeys = Some("creationTime"))
val eventHubsParameters = buildEventHubsParamters("ns1",
"eh1",
1,
1,
containsProperties = true,
userDefinedKeys = Some("creationTime"))
val eventPayloadsAndProperties = testDataForWatermark(2)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters,
eventPayloadsAndProperties)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val sourceQuery = spark.readStream.format("eventhubs").options(eventHubsParameters).load()
import sourceQuery.sparkSession.implicits._
import org.apache.spark.sql.functions._
val windowedStream = sourceQuery.selectExpr(
"CAST(creationTime AS TIMESTAMP) as creationTimeT").
withWatermark("creationTimeT", "5 second").
groupBy(window($"creationTimeT", "3 second", "1 second")).
count().select("count")
val windowedStream = sourceQuery
.selectExpr("CAST(creationTime AS TIMESTAMP) as creationTimeT")
.withWatermark("creationTimeT", "5 second")
.groupBy(window($"creationTimeT", "3 second", "1 second"))
.count()
.select("count")
val manualClock = new StreamManualClock
val firstBatch = Seq(StartStream(trigger = ProcessingTime(1000), triggerClock = manualClock))
val clockMove = Array.fill(35)(AdvanceManualClock(1000)).toSeq
val secondBatch = Seq(
AddEventHubsData(eventHubsParameters, 35),
CheckAnswer(true, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6))
val secondBatch =
Seq(AddEventHubsData(eventHubsParameters, 35),
CheckAnswer(true, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 5, 6, 6))
testStream(windowedStream, outputMode = OutputMode.Append())(
firstBatch ++ clockMove ++ secondBatch: _*)
}
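The watermark variant differs in one important way: withWatermark("creationTimeT", "5 second") lets the engine finalize a window and drop its state once event time has advanced at least five seconds past the window's end, and in Append output mode only those finalized windows are emitted. The sketch below is a standalone illustration of that behavior using Spark's MemoryStream test source and a memory sink; the query name and data values are made up, and the exact rows emitted depend on how the watermark advances between micro-batches.

import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.streaming.OutputMode

object WatermarkSketch extends App {
  val spark = SparkSession.builder().master("local[2]").appName("watermark-sketch").getOrCreate()
  import spark.implicits._
  implicit val sqlContext: SQLContext = spark.sqlContext

  val input = MemoryStream[Long] // event times in epoch seconds
  val counts = input.toDF()
    .selectExpr("CAST(value AS TIMESTAMP) AS creationTimeT")
    .withWatermark("creationTimeT", "5 seconds")
    .groupBy(window($"creationTimeT", "3 seconds", "1 second"))
    .count()

  val query = counts.writeStream
    .format("memory")
    .queryName("wm_counts")
    .outputMode(OutputMode.Append()) // only windows already closed by the watermark are emitted
    .start()

  input.addData(0L to 9L: _*) // on-time events; the watermark trails the max event time by 5s
  query.processAllAvailable()
  input.addData(1L) // far behind the watermark, so the aggregation is allowed to ignore it
  query.processAllAvailable()

  spark.table("wm_counts").orderBy("window").show(truncate = false)
  query.stop()
  spark.stop()
}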
@@ -763,20 +908,22 @@ class EventHubsSourceSuite extends EventHubsStreamTest
AddEventHubsData(eventHubsParameters, 2),
UpdatePartialCheck(
EventHubsBatchRecord(0,
Map(EventHubNameAndPartition("eh1", 1) -> 2, EventHubNameAndPartition("eh1", 0) -> 2))),
Map(EventHubNameAndPartition("eh1", 1) -> 2,
EventHubNameAndPartition("eh1", 0) -> 2))),
CheckAnswer(true, false, 7, 8, 9, 10, 11, 12),
// in the second batch we have the right seq number of msgs
UpdatePartialCheck(
EventHubsBatchRecord(1,
Map(EventHubNameAndPartition("eh1", 1) -> 6, EventHubNameAndPartition("eh1", 0) -> 7))),
Map(EventHubNameAndPartition("eh1", 1) -> 6,
EventHubNameAndPartition("eh1", 0) -> 7))),
AdvanceManualClock(10),
CheckAnswer(true, false, 7, 8, 9, 10, 11, 12, 13, 14, 15)
)
}

test("Users cannot submit enqueueTime which is later than the latest in the queue") {
val eventHubsParameters = buildEventHubsParamters("ns1", "eh1", 2, 3,
enqueueTime = Some(Long.MaxValue))
val eventHubsParameters =
buildEventHubsParamters("ns1", "eh1", 2, 3, enqueueTime = Some(Long.MaxValue))
val eventPayloadsAndProperties = generateIntKeyedData(15)
EventHubsTestUtilities.simulateEventHubs(eventHubsParameters, eventPayloadsAndProperties)
val sourceQuery = generateInputQuery(eventHubsParameters, spark)

@@ -27,9 +27,9 @@ import scala.util.Random
import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.scalatest.{Assertions, BeforeAndAfter}
import org.scalatest.concurrent.{Eventually, Timeouts}
import org.apache.hadoop.fs.{ FileSystem, Path }
import org.scalatest.{ Assertions, BeforeAndAfter }
import org.scalatest.concurrent.{ Eventually, Timeouts }
import org.scalatest.concurrent.Eventually._
import org.scalatest.concurrent.PatienceConfiguration.Timeout
import org.scalatest.exceptions.TestFailedDueToTimeoutException

@@ -41,15 +41,15 @@ import org.apache.spark.eventhubscommon.EventHubsConnector
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
import org.apache.spark.eventhubscommon.utils._
import org.apache.spark.sql.{Dataset, Encoder, QueryTest, Row}
import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.{ Dataset, Encoder, QueryTest, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.util._
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.streaming._
import org.apache.spark.sql.streaming.eventhubs.checkpoint.StructuredStreamingProgressTracker
import org.apache.spark.sql.test.{SharedSQLContext, TestSparkSession}
import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils}
import org.apache.spark.sql.test.{ SharedSQLContext, TestSparkSession }
import org.apache.spark.util.{ Clock, ManualClock, SystemClock, Utils }

/**
* A framework for implementing tests for streaming queries and sources.
@@ -75,9 +75,12 @@ import org.apache.spark.util.{Clock, ManualClock, SystemClock, Utils}
* avoid hanging forever in the case of failures. However, individual suites can change this
* by overriding `streamingTimeout`.
*/

trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
with SharedSQLContext with Timeouts with Serializable {
trait EventHubsStreamTest
extends QueryTest
with BeforeAndAfter
with SharedSQLContext
with Timeouts
with Serializable {

protected val tempRoot = "/tmp"

@@ -88,14 +91,14 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter

override protected def createSparkSession: TestSparkSession = {
new TestSparkSession(
sparkConf.set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName).setAppName(
s"EventHubsStreamTest_${System.currentTimeMillis()}"))
sparkConf
.set("spark.hadoop.fs.file.impl", classOf[DebugFilesystem].getName)
.setAppName(s"EventHubsStreamTest_${System.currentTimeMillis()}"))
}

/** How long to wait for an active stream to catch up when checking a result. */
val streamingTimeout = 60 seconds

/** A trait for actions that can be performed while testing a streaming DataFrame. */
// trait StreamAction
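All of the suites above build their scenarios as plain Seqs of actions and splice them into testStream's varargs parameter with ": _*". The following self-contained sketch (hypothetical Action types, not this suite's StreamAction hierarchy) shows just that splicing pattern.

object VarargsSpliceSketch extends App {
  sealed trait Action
  case class Advance(ms: Long) extends Action
  case class Check(upTo: Int) extends Action

  // Stand-in for testStream(query)(actions: _*): a method taking Action varargs.
  def run(actions: Action*): Unit = actions.foreach(a => println(s"executing $a"))

  val firstBatch: Seq[Action] = Seq(Advance(10), Check(600))
  val clockMove: Seq[Action] = Array.fill(3)(Advance(10)).toSeq
  val secondBatch: Seq[Action] = Seq(Check(1000))

  // Concatenate the pre-built sequences and expand them at the varargs call site.
  run(firstBatch ++ clockMove ++ secondBatch: _*)
}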
@ -113,33 +116,32 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
*/
|
||||
object CheckAnswer {
|
||||
|
||||
def apply[A : Encoder](isSort: Boolean, data: A*): CheckAnswerRows = {
|
||||
def apply[A: Encoder](isSort: Boolean, data: A*): CheckAnswerRows = {
|
||||
val encoder = encoderFor[A]
|
||||
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
|
||||
CheckAnswerRows(
|
||||
data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
|
||||
lastOnly = false,
|
||||
isSorted = isSort)
|
||||
CheckAnswerRows(data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
|
||||
lastOnly = false,
|
||||
isSorted = isSort)
|
||||
}
|
||||
|
||||
def apply[A : Encoder](data: A*): CheckAnswerRows = {
|
||||
def apply[A: Encoder](data: A*): CheckAnswerRows = {
|
||||
val encoder = encoderFor[A]
|
||||
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
|
||||
CheckAnswerRows(
|
||||
data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
|
||||
lastOnly = false,
|
||||
isSorted = false)
|
||||
CheckAnswerRows(data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
|
||||
lastOnly = false,
|
||||
isSorted = false)
|
||||
}
|
||||
|
||||
def apply(rows: Row*): CheckAnswerRows =
|
||||
CheckAnswerRows(rows, lastOnly = false, isSorted = false)
|
||||
|
||||
def apply[A : Encoder](partial: Boolean, lastOnly: Boolean, rows: A*): CheckAnswerRows = {
|
||||
def apply[A: Encoder](partial: Boolean, lastOnly: Boolean, rows: A*): CheckAnswerRows = {
|
||||
val encoder = encoderFor[A]
|
||||
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
|
||||
CheckAnswerRows(
|
||||
rows.map(r => toExternalRow.fromRow(encoder.toRow(r))),
|
||||
isSorted = false, lastOnly = lastOnly, ifCheckPartialResult = partial)
|
||||
CheckAnswerRows(rows.map(r => toExternalRow.fromRow(encoder.toRow(r))),
|
||||
isSorted = false,
|
||||
lastOnly = lastOnly,
|
||||
ifCheckPartialResult = partial)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -148,29 +150,28 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
* This operation automatically blocks until all added data has been processed.
|
||||
*/
|
||||
object CheckLastBatch {
|
||||
def apply[A : Encoder](data: A*): CheckAnswerRows = {
|
||||
def apply[A: Encoder](data: A*): CheckAnswerRows = {
|
||||
apply(isSorted = false, data: _*)
|
||||
}
|
||||
|
||||
def apply[A: Encoder](isSorted: Boolean, data: A*): CheckAnswerRows = {
|
||||
val encoder = encoderFor[A]
|
||||
val toExternalRow = RowEncoder(encoder.schema).resolveAndBind()
|
||||
CheckAnswerRows(
|
||||
data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
|
||||
lastOnly = true,
|
||||
isSorted = isSorted)
|
||||
CheckAnswerRows(data.map(d => toExternalRow.fromRow(encoder.toRow(d))),
|
||||
lastOnly = true,
|
||||
isSorted = isSorted)
|
||||
}
|
||||
|
||||
def apply(rows: Row*): CheckAnswerRows = CheckAnswerRows(rows, lastOnly = true,
|
||||
isSorted = false)
|
||||
def apply(rows: Row*): CheckAnswerRows =
|
||||
CheckAnswerRows(rows, lastOnly = true, isSorted = false)
|
||||
}
|
||||
|
||||
case class CheckAnswerRows(
|
||||
expectedAnswer: Seq[Row],
|
||||
lastOnly: Boolean,
|
||||
isSorted: Boolean,
|
||||
ifCheckPartialResult: Boolean = false)
|
||||
extends StreamAction with StreamMustBeRunning {
|
||||
case class CheckAnswerRows(expectedAnswer: Seq[Row],
|
||||
lastOnly: Boolean,
|
||||
isSorted: Boolean,
|
||||
ifCheckPartialResult: Boolean = false)
|
||||
extends StreamAction
|
||||
with StreamMustBeRunning {
|
||||
override def toString: String = s"$operatorName: ${expectedAnswer.mkString(",")}"
|
||||
private def operatorName = if (lastOnly) "CheckLastBatch" else "CheckAnswer"
|
||||
}
|
||||
|
@ -182,19 +183,20 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
commitOffset: Boolean = false,
|
||||
commitPartialOffset: Boolean = false,
|
||||
partialType: String = "delete")
|
||||
extends StreamAction with StreamMustBeRunning
|
||||
extends StreamAction
|
||||
with StreamMustBeRunning
|
||||
|
||||
/** Starts the stream, resuming if data has already been processed. It must not be running. */
|
||||
case class StartStream(trigger: Trigger = ProcessingTime(0),
|
||||
triggerClock: Clock = new SystemClock,
|
||||
additionalConfs: Map[String, String] = Map.empty)
|
||||
extends StreamAction
|
||||
extends StreamAction
|
||||
|
||||
/** Advance the trigger clock's time manually. */
|
||||
case class AdvanceManualClock(timeToAdd: Long) extends StreamAction
|
||||
|
||||
/** Signals that a failure is expected and should not kill the test. */
|
||||
case class ExpectFailure[T <: Throwable : ClassTag]() extends StreamAction {
|
||||
case class ExpectFailure[T <: Throwable: ClassTag]() extends StreamAction {
|
||||
val causeClass: Class[T] = implicitly[ClassTag[T]].runtimeClass.asInstanceOf[Class[T]]
|
||||
override def toString: String = s"ExpectFailure[${causeClass.getName}]"
|
||||
}
|
||||
|
@ -207,13 +209,13 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
|
||||
object Assert {
|
||||
def apply(condition: => Boolean, message: String = ""): Assert = new Assert(condition, message)
|
||||
def apply(message: String)(body: => Unit): Assert = new Assert( { body; true }, message)
|
||||
def apply(body: => Unit): Assert = new Assert( { body; true }, "")
|
||||
def apply(message: String)(body: => Unit): Assert = new Assert({ body; true }, message)
|
||||
def apply(body: => Unit): Assert = new Assert({ body; true }, "")
|
||||
}
|
||||
|
||||
/** Assert that a condition on the active query is true */
|
||||
class AssertOnQuery(val condition: StreamExecution => Boolean, val message: String)
|
||||
extends StreamAction {
|
||||
extends StreamAction {
|
||||
override def toString: String = s"AssertOnQuery(<condition>, $message)"
|
||||
}
|
||||
|
||||
|
@ -228,7 +230,8 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
|
||||
class StreamManualClock(@volatile var currentTime: Long = 0L)
|
||||
extends ManualClock(currentTime) with Serializable {
|
||||
extends ManualClock(currentTime)
|
||||
with Serializable {
|
||||
|
||||
private var waitStartTime: Option[Long] = None
|
||||
|
||||
|
@ -280,10 +283,9 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
}
|
||||
|
||||
def isStreamWaitingAt(time: Long): Boolean = synchronized {waitStartTime contains time}
|
||||
def isStreamWaitingAt(time: Long): Boolean = synchronized { waitStartTime contains time }
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Executes the specified actions on the given streaming DataFrame and provides helpful
|
||||
* error messages in the case of failures or incorrect answers.
|
||||
|
@ -291,11 +293,11 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
* Note that if the stream is not explicitly started before an action that requires it to be
|
||||
* running then it will be automatically started before performing any other actions.
|
||||
*/
|
||||
def testStream(_stream: Dataset[_],
|
||||
outputMode: OutputMode = OutputMode.Append)(actions: StreamAction*): Unit = {
|
||||
def testStream(_stream: Dataset[_], outputMode: OutputMode = OutputMode.Append)(
|
||||
actions: StreamAction*): Unit = {
|
||||
|
||||
val stream = _stream.toDF()
|
||||
val sparkSession = stream.sparkSession // use the session in DF, not the default session
|
||||
val sparkSession = stream.sparkSession // use the session in DF, not the default session
|
||||
var pos = 0
|
||||
var currentStream: StreamExecution = null
|
||||
var lastStream: StreamExecution = null
|
||||
|
@ -312,14 +314,17 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
actions.takeWhile(!_.isInstanceOf[StreamMustBeRunning]).exists(_.isInstanceOf[StartStream])
|
||||
val startedTest = if (startedManually) actions else StartStream() +: actions
|
||||
|
||||
def testActions = actions.zipWithIndex.map {
|
||||
case (a, i) =>
|
||||
if ((pos == i && startedManually) || (pos == (i + 1) && !startedManually)) {
|
||||
"=> " + a.toString
|
||||
} else {
|
||||
" " + a.toString
|
||||
def testActions =
|
||||
actions.zipWithIndex
|
||||
.map {
|
||||
case (a, i) =>
|
||||
if ((pos == i && startedManually) || (pos == (i + 1) && !startedManually)) {
|
||||
"=> " + a.toString
|
||||
} else {
|
||||
" " + a.toString
|
||||
}
|
||||
}
|
||||
}.mkString("\n")
|
||||
.mkString("\n")
|
||||
|
||||
def currentOffsets =
|
||||
if (currentStream != null) currentStream.committedOffsets.toString else "not started"
|
||||
|
@ -385,8 +390,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
val c = Option(cause).map(exceptionToString(_))
|
||||
val m = if (message != null && message.nonEmpty) Some(message) else None
|
||||
fail(
|
||||
s"""
|
||||
fail(s"""
|
||||
|${(m ++ c).mkString(": ")}
|
||||
|$testState
|
||||
""".stripMargin)
|
||||
|
@ -399,20 +403,21 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
|
||||
if (sources.isEmpty) {
|
||||
throw new Exception("Could not find EventHubs source in the StreamExecution" +
|
||||
" logical plan to add data to")
|
||||
throw new Exception(
|
||||
"Could not find EventHubs source in the StreamExecution" +
|
||||
" logical plan to add data to")
|
||||
} else if (sources.size > 1) {
|
||||
throw new Exception("Could not select the EventHubs source in the StreamExecution " +
|
||||
"logical plan as there" +
|
||||
"are multiple EventHubs sources:\n\t" + sources.mkString("\n\t"))
|
||||
throw new Exception(
|
||||
"Could not select the EventHubs source in the StreamExecution " +
|
||||
"logical plan as there" +
|
||||
"are multiple EventHubs sources:\n\t" + sources.mkString("\n\t"))
|
||||
}
|
||||
sources.head
|
||||
}
|
||||
|
||||
def createBrokenProgressFile(
|
||||
progressTracker: ProgressTrackerBase[_ <: EventHubsConnector],
|
||||
timestamp: Long,
|
||||
brokenType: String): Unit = {
|
||||
def createBrokenProgressFile(progressTracker: ProgressTrackerBase[_ <: EventHubsConnector],
|
||||
timestamp: Long,
|
||||
brokenType: String): Unit = {
|
||||
val progressDir = progressTracker.progressDirectoryPath.toString
|
||||
val metadataDir = progressTracker.metadataDirectoryPath.toString
|
||||
val progressFilePath = new Path(s"$progressDir/progress-$timestamp")
|
||||
|
@ -423,7 +428,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
fs.delete(metadataFilePath, true)
|
||||
} else if (brokenType == "deletemetadata") {
|
||||
fs.delete(metadataFilePath, true)
|
||||
} else if (brokenType == "partial" ) {
|
||||
} else if (brokenType == "partial") {
|
||||
fs.delete(progressFilePath, true)
|
||||
fs.delete(metadataFilePath, true)
|
||||
val fsos = fs.create(progressFilePath)
|
||||
|
@ -444,9 +449,11 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
action match {
|
||||
case StartStream(trigger, triggerClock, additionalConfs) =>
|
||||
verify(currentStream == null, "stream already running")
|
||||
verify(triggerClock.isInstanceOf[SystemClock]
|
||||
|| triggerClock.isInstanceOf[StreamManualClock],
|
||||
"Use either SystemClock or StreamManualClock to start the stream")
|
||||
verify(
|
||||
triggerClock.isInstanceOf[SystemClock]
|
||||
|| triggerClock.isInstanceOf[StreamManualClock],
|
||||
"Use either SystemClock or StreamManualClock to start the stream"
|
||||
)
|
||||
if (triggerClock.isInstanceOf[StreamManualClock]) {
|
||||
manualClockExpectedTime = triggerClock.asInstanceOf[StreamManualClock].getTimeMillis()
|
||||
}
|
||||
|
@ -461,27 +468,30 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
})
|
||||
|
||||
lastStream = currentStream
|
||||
val createQueryMethod = sparkSession.streams.getClass.getDeclaredMethods.filter(m =>
|
||||
m.getName == "createQuery").head
|
||||
val createQueryMethod = sparkSession.streams.getClass.getDeclaredMethods
|
||||
.filter(m => m.getName == "createQuery")
|
||||
.head
|
||||
createQueryMethod.setAccessible(true)
|
||||
val checkpointLocation = additionalConfs.getOrElse[String](
|
||||
"eventhubs.test.checkpointLocation",
|
||||
metadataRoot)
|
||||
val checkpointLocation =
|
||||
additionalConfs.getOrElse[String]("eventhubs.test.checkpointLocation", metadataRoot)
|
||||
if (additionalConfs.contains("eventhubs.test.newSink") &&
|
||||
additionalConfs("eventhubs.test.newSink").toBoolean) {
|
||||
additionalConfs("eventhubs.test.newSink").toBoolean) {
|
||||
sink = new MemorySink(stream.schema, outputMode)
|
||||
}
|
||||
currentStream = createQueryMethod.invoke(
|
||||
sparkSession.streams,
|
||||
None,
|
||||
Some(checkpointLocation),
|
||||
stream,
|
||||
sink,
|
||||
outputMode,
|
||||
Boolean.box(false), // useTempCheckpointLocation
|
||||
Boolean.box(true), // recoverFromCheckpointLocation
|
||||
trigger,
|
||||
triggerClock).asInstanceOf[StreamExecution]
|
||||
currentStream = createQueryMethod
|
||||
.invoke(
|
||||
sparkSession.streams,
|
||||
None,
|
||||
Some(checkpointLocation),
|
||||
stream,
|
||||
sink,
|
||||
outputMode,
|
||||
Boolean.box(false), // useTempCheckpointLocation
|
||||
Boolean.box(true), // recoverFromCheckpointLocation
|
||||
trigger,
|
||||
triggerClock
|
||||
)
|
||||
.asInstanceOf[StreamExecution]
|
||||
|
||||
triggerClock match {
|
||||
case smc: StreamManualClock =>
|
||||
|
@ -489,30 +499,38 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
case _ =>
|
||||
}
|
||||
|
||||
val activeQueriesField = sparkSession.streams.getClass.getDeclaredFields.filter(f =>
|
||||
f.getName == "org$apache$spark$sql$streaming$StreamingQueryManager$$activeQueries").
|
||||
head
|
||||
val activeQueriesField = sparkSession.streams.getClass.getDeclaredFields
|
||||
.filter(f =>
|
||||
f.getName == "org$apache$spark$sql$streaming$StreamingQueryManager$$activeQueries")
|
||||
.head
|
||||
activeQueriesField.setAccessible(true)
|
||||
val activeQueries = activeQueriesField.get(sparkSession.streams).
|
||||
asInstanceOf[mutable.HashMap[UUID, StreamingQuery]]
|
||||
val activeQueries = activeQueriesField
|
||||
.get(sparkSession.streams)
|
||||
.asInstanceOf[mutable.HashMap[UUID, StreamingQuery]]
|
||||
activeQueries += currentStream.id -> currentStream
|
||||
|
||||
val eventHubsSource = searchCurrentSource()
|
||||
val eventHubs = EventHubsTestUtilities.getOrSimulateEventHubs(null)
|
||||
eventHubsSource.setEventHubClient(new SimulatedEventHubsRestClient(eventHubs))
|
||||
eventHubsSource.setEventHubsReceiver(
|
||||
(eventHubsParameters: Map[String, String], partitionId: Int,
|
||||
startOffset: Long, offsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParameters, eventHubs, partitionId, startOffset,
|
||||
offsetType)
|
||||
(eventHubsParameters: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
offsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubsParameters,
|
||||
eventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
offsetType)
|
||||
)
|
||||
currentStream.start()
|
||||
|
||||
case AdvanceManualClock(timeToAdd) =>
|
||||
verify(currentStream != null,
|
||||
"can not advance manual clock when a stream is not running")
|
||||
"can not advance manual clock when a stream is not running")
|
||||
verify(currentStream.triggerClock.isInstanceOf[StreamManualClock],
|
||||
s"can not advance clock of type ${currentStream.triggerClock.getClass}")
|
||||
s"can not advance clock of type ${currentStream.triggerClock.getClass}")
|
||||
val clock = currentStream.triggerClock.asInstanceOf[StreamManualClock]
|
||||
assert(manualClockExpectedTime >= 0)
|
||||
|
||||
|
@ -523,15 +541,19 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
|
||||
clock.advance(timeToAdd)
|
||||
manualClockExpectedTime += timeToAdd
|
||||
verify(clock.getTimeMillis() === manualClockExpectedTime,
|
||||
verify(
|
||||
clock.getTimeMillis() === manualClockExpectedTime,
|
||||
s"Unexpected clock time after updating: " +
|
||||
s"expecting $manualClockExpectedTime, current ${clock.getTimeMillis()}")
|
||||
s"expecting $manualClockExpectedTime, current ${clock.getTimeMillis()}"
|
||||
)
|
||||
|
||||
case StopStream(recoverStreamId: Boolean, commitOffset: Boolean,
|
||||
commitPartialOffset: Boolean, partialType: String) =>
|
||||
case StopStream(recoverStreamId: Boolean,
|
||||
commitOffset: Boolean,
|
||||
commitPartialOffset: Boolean,
|
||||
partialType: String) =>
|
||||
verify(currentStream != null, "can not stop a stream that is not running")
|
||||
require(!(commitOffset && commitPartialOffset),
|
||||
"cannot set both of commitOffset and commitPartialOffset as true")
|
||||
"cannot set both of commitOffset and commitPartialOffset as true")
|
||||
if (recoverStreamId) {
|
||||
EventHubsSource.streamIdGenerator.decrementAndGet()
|
||||
}
|
||||
|
@ -548,15 +570,14 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
source.collectFinishedBatchOffsetsAndCommit(
|
||||
source.committedOffsetsAndSeqNums.batchId + 1)
|
||||
createBrokenProgressFile(progressTracker,
|
||||
source.committedOffsetsAndSeqNums.batchId, partialType)
|
||||
source.committedOffsetsAndSeqNums.batchId,
|
||||
partialType)
|
||||
}
|
||||
verify(!currentStream.microBatchThread.isAlive,
|
||||
s"microbatch thread not stopped")
|
||||
verify(!currentStream.isActive,
|
||||
"query.isActive() is false even after stopping")
|
||||
verify(!currentStream.microBatchThread.isAlive, s"microbatch thread not stopped")
|
||||
verify(!currentStream.isActive, "query.isActive() is false even after stopping")
|
||||
verify(currentStream.exception.isEmpty,
|
||||
s"query.exception() is not empty after clean stop: " +
|
||||
currentStream.exception.map(_.toString()).getOrElse(""))
|
||||
s"query.exception() is not empty after clean stop: " +
|
||||
currentStream.exception.map(_.toString()).getOrElse(""))
|
||||
} catch {
|
||||
case _: InterruptedException =>
|
||||
case _: org.scalatest.exceptions.TestFailedDueToTimeoutException =>
|
||||
|
@ -578,12 +599,14 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
assert(!currentStream.microBatchThread.isAlive)
|
||||
}
|
||||
verify(currentStream.exception === Some(thrownException),
|
||||
s"incorrect exception returned by query.exception()")
|
||||
s"incorrect exception returned by query.exception()")
|
||||
|
||||
val exception = currentStream.exception.get
|
||||
verify(exception.cause.getClass === ef.causeClass,
|
||||
verify(
|
||||
exception.cause.getClass === ef.causeClass,
|
||||
"incorrect cause in exception returned by query.exception()\n" +
|
||||
s"\tExpected: ${ef.causeClass}\n\tReturned: ${exception.cause.getClass}")
|
||||
s"\tExpected: ${ef.causeClass}\n\tReturned: ${exception.cause.getClass}"
|
||||
)
|
||||
} catch {
|
||||
case _: InterruptedException =>
|
||||
case _: org.scalatest.exceptions.TestFailedDueToTimeoutException =>
|
||||
|
@ -597,7 +620,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
case a: AssertOnQuery =>
|
||||
verify(currentStream != null || lastStream != null,
|
||||
"cannot assert when not stream has been started")
|
||||
"cannot assert when not stream has been started")
|
||||
val streamToAssert = Option(currentStream).getOrElse(lastStream)
|
||||
verify(a.condition(streamToAssert), s"Assert on query failed: ${a.message}")
|
||||
case a: Assert =>
|
||||
|
@ -619,14 +642,17 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
// Try to find the index of the source to which data was added. Either get the index
|
||||
// from the current active query or the original input logical plan.
|
||||
val sourceIndex =
|
||||
queryToUse.flatMap { query =>
|
||||
findSourceIndex(query.logicalPlan, source)
|
||||
}.orElse {
|
||||
findSourceIndex(stream.logicalPlan, source)
|
||||
}.getOrElse {
|
||||
throw new IllegalArgumentException(
|
||||
"Could find index of the source to which data was added")
|
||||
}
|
||||
queryToUse
|
||||
.flatMap { query =>
|
||||
findSourceIndex(query.logicalPlan, source)
|
||||
}
|
||||
.orElse {
|
||||
findSourceIndex(stream.logicalPlan, source)
|
||||
}
|
||||
.getOrElse {
|
||||
throw new IllegalArgumentException(
|
||||
"Could find index of the source to which data was added")
|
||||
}
|
||||
// Store the expected offset of added data to wait for it later
|
||||
awaiting.put(sourceIndex, offset)
|
||||
} catch {
|
||||
|
@ -640,32 +666,33 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
case CheckAnswerRows(expectedAnswer, lastOnly, isSorted, partial) =>
|
||||
verify(currentStream != null, "stream not running")
|
||||
// Get the map of source index to the current source objects
|
||||
val indexToSource = currentStream
|
||||
.logicalPlan
|
||||
val indexToSource = currentStream.logicalPlan
|
||||
.collect { case StreamingExecutionRelation(s, _) => s }
|
||||
.zipWithIndex
|
||||
.map(_.swap)
|
||||
.toMap
|
||||
|
||||
// Block until all data added has been processed for all the source
|
||||
{if (!partial) awaiting else partialAwaiting}.foreach { case (sourceIndex, offset) =>
|
||||
try {
|
||||
failAfter(streamingTimeout) {
|
||||
currentStream.awaitOffset(indexToSource(sourceIndex), offset)
|
||||
{ if (!partial) awaiting else partialAwaiting }.foreach {
|
||||
case (sourceIndex, offset) =>
|
||||
try {
|
||||
failAfter(streamingTimeout) {
|
||||
currentStream.awaitOffset(indexToSource(sourceIndex), offset)
|
||||
}
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
throw e
|
||||
}
|
||||
} catch {
|
||||
case e: Exception =>
|
||||
e.printStackTrace()
|
||||
throw e
|
||||
}
|
||||
}
|
||||
val sparkAnswer = try if (lastOnly) sink.latestBatchData else sink.allData catch {
|
||||
val sparkAnswer = try if (lastOnly) sink.latestBatchData else sink.allData
|
||||
catch {
|
||||
case e: Exception =>
|
||||
failTest("Exception while getting data from sink", e)
|
||||
}
|
||||
|
||||
QueryTest.sameRows(expectedAnswer, sparkAnswer, isSorted).foreach {
|
||||
error => failTest(error)
|
||||
QueryTest.sameRows(expectedAnswer, sparkAnswer, isSorted).foreach { error =>
|
||||
failTest(error)
|
||||
}
|
||||
}
|
||||
pos += 1
|
||||
|
@ -691,7 +718,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
// Rollback prev configuration values
|
||||
resetConfValues.foreach {
|
||||
case (key, Some(value)) => sparkSession.conf.set(key, value)
|
||||
case (key, None) => sparkSession.conf.unset(key)
|
||||
case (key, None) => sparkSession.conf.unset(key)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -742,7 +769,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
|
||||
(1 to iterations).foreach { i =>
|
||||
val rand = Random.nextDouble()
|
||||
if(!running) {
|
||||
if (!running) {
|
||||
rand match {
|
||||
case r if r < 0.7 => // AddData
|
||||
addRandomData()
|
||||
|
@ -766,7 +793,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
}
|
||||
}
|
||||
if(!running) { actions += StartStream() }
|
||||
if (!running) { actions += StartStream() }
|
||||
addCheck()
|
||||
testStream(ds)(actions: _*)
|
||||
}
|
||||
|
@ -783,11 +810,12 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
|
||||
/** Expect awaitTermination to throw an exception */
|
||||
case class ExpectException[E <: Exception]()(implicit val t: ClassTag[E])
|
||||
extends ExpectedBehavior
|
||||
extends ExpectedBehavior
|
||||
|
||||
private val DEFAULT_TEST_TIMEOUT = 1.second
|
||||
|
||||
def test(expectedBehavior: ExpectedBehavior, awaitTermFunc: () => Unit,
|
||||
def test(expectedBehavior: ExpectedBehavior,
|
||||
awaitTermFunc: () => Unit,
|
||||
testTimeout: Span = DEFAULT_TEST_TIMEOUT): Unit = {
|
||||
expectedBehavior match {
|
||||
case ExpectNotBlocked =>
|
||||
|
@ -814,7 +842,7 @@ trait EventHubsStreamTest extends QueryTest with BeforeAndAfter
|
|||
}
|
||||
}
|
||||
assert(thrownException.cause.getClass === e.t.runtimeClass,
|
||||
"exception of incorrect type was throw")
|
||||
"exception of incorrect type was throw")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,498 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.spark.sql.streaming.eventhubs.checkpoint
|
||||
|
||||
import java.nio.file.Files
|
||||
import java.time.Instant
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
|
||||
import org.apache.spark.eventhubscommon._
|
||||
import org.apache.spark.eventhubscommon.progress._
|
||||
import org.apache.spark.sql.streaming.eventhubs.EventHubsSource
|
||||
import org.apache.spark.sql.test.SharedSQLContext
|
||||
|
||||
class StructuredStreamingProgressTrackerSuite extends SharedSQLContext {
|
||||
|
||||
test("progress directory is created properly when it does not exist") {
|
||||
progressTracker = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
|
||||
assert(fileSystem.exists(progressTracker.progressDirectoryPath))
|
||||
}
|
||||
|
||||
test("progress directory is created properly when it exists") {
|
||||
fileSystem.mkdirs(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName))
|
||||
|
||||
progressTracker = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
|
||||
|
||||
assert(fileSystem.exists(progressTracker.progressDirectoryPath))
|
||||
}
|
||||
|
||||
test("temp progress is not cleaned up when partial temp progress exists") {
|
||||
val tempPath = PathTools.makeTempDirectoryPath(progressRootPath.toString, appName)
|
||||
|
||||
fileSystem.mkdirs(tempPath)
|
||||
|
||||
val streamId = EventHubsSource.streamIdGenerator.get()
|
||||
var tempFilePath = PathTools.makeTempFilePath(tempPath.toString,
|
||||
streamId, eventhubsSource1.uid, eventhubsNamedPartitions("ns1").head, unixTimestamp)
|
||||
|
||||
fileSystem.create(tempFilePath)
|
||||
|
||||
tempFilePath = PathTools.makeTempFilePath(tempPath.toString, streamId,
|
||||
eventhubsSource1.uid, eventhubsNamedPartitions("ns1").tail.head, unixTimestamp)
|
||||
|
||||
fileSystem.create(tempFilePath)
|
||||
|
||||
val filesBefore = fileSystem.listStatus(tempPath)
|
||||
assert(filesBefore.size === 2)
|
||||
|
||||
progressTracker = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
|
||||
|
||||
val filesAfter = fileSystem.listStatus(tempPath)
|
||||
assert(filesAfter.size === 2)
|
||||
}
|
||||
|
||||
test("incomplete progress will not be discarded") {
|
||||
// Register two eventhubs connectors to structured streaming progress tracker
|
||||
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource1.uid -> eventhubsSource1
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource2.uid -> eventhubsSource2
|
||||
|
||||
// Progress record of all partitions of eventhubsSource1 are updated
|
||||
|
||||
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
|
||||
|
||||
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
|
||||
|
||||
progressWriter.write(Instant.now.getEpochSecond, 0L, 0L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
|
||||
|
||||
progressWriter.write(Instant.now.getEpochSecond, 10L, 10L)
|
||||
|
||||
// Progress records of all partitions of eventhubsSource2 are not updated
|
||||
|
||||
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
|
||||
progressWriter.write(Instant.now.getEpochSecond, 0L, 0L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
|
||||
progressWriter.write(Instant.now.getEpochSecond, 100L, 100L)
|
||||
|
||||
StructuredStreamingProgressTracker.initInstance(eventhubsSource1.uid,
|
||||
progressRootPath.toString, appName, new Configuration())
|
||||
StructuredStreamingProgressTracker.initInstance(eventhubsSource2.uid,
|
||||
progressRootPath.toString, appName, new Configuration())
|
||||
|
||||
var progressTempPath = PathTools.makeTempDirectoryStr(progressRootPath.toString,
|
||||
appName, eventhubsSource1.uid)
|
||||
|
||||
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp)))
|
||||
|
||||
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp)))
|
||||
|
||||
progressTempPath = PathTools.makeTempDirectoryStr(
|
||||
progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
|
||||
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp)))
|
||||
|
||||
assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp)))
|
||||
|
||||
assert(!fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp)))
|
||||
}
|
||||
|
||||
test("start from the beginning of the streams when the latest progress file does not exist") {
|
||||
|
||||
// Register the two eventhubs connectors to structured streaming progress tracker
|
||||
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource3.uid -> eventhubsSource3
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource4.uid -> eventhubsSource4
|
||||
|
||||
val progressTracker3 = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource3.uid, progressRootPath.toString, appName, new Configuration())
|
||||
val progressTracker4 = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource4.uid, progressRootPath.toString, appName, new Configuration())
|
||||
|
||||
val eh3Progress = progressTracker3.read(eventhubsSource3.uid, unixTimestamp - 1000L,
|
||||
fallBack = false)
|
||||
val eh4Progress = progressTracker4.read(eventhubsSource4.uid, unixTimestamp - 1000L,
|
||||
fallBack = false)
|
||||
|
||||
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances.head) === (-1L, -1L))
|
||||
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances(1)) === (-1L, -1L))
|
||||
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances(2)) === (-1L, -1L))
|
||||
assert(eh3Progress.offsets(eventhubsSource3.connectedInstances(3)) === (-1L, -1L))
|
||||
|
||||
assert(eh4Progress.offsets(eventhubsSource4.connectedInstances.head) === (-1L, -1L))
|
||||
assert(eh4Progress.offsets(eventhubsSource4.connectedInstances(1)) === (-1L, -1L))
|
||||
}
|
||||
|
||||
test("progress tracker can read back last progress correctly") {
|
||||
// Register two eventhubs connectors to structured streaming progress tracker
|
||||
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource1.uid -> eventhubsSource1
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource2.uid -> eventhubsSource2
|
||||
|
||||
// Progress record of all partitions of eventhubsSource1 are updated
|
||||
|
||||
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
|
||||
|
||||
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
|
||||
progressWriter.write(unixTimestamp, 0L, 0L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
|
||||
progressWriter.write(unixTimestamp, 10L, 10L)
|
||||
|
||||
// Progress records of all partitions of eventhubsSource2 are updated
|
||||
|
||||
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
progressWriter.write(unixTimestamp, 0L, 0L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
progressWriter.write(unixTimestamp, 100L, 100L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
progressWriter.write(unixTimestamp, 200L, 200L)
|
||||
|
||||
|
||||
val progressTracker1 = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
|
||||
progressTracker1.commit(
|
||||
progressTracker1.collectProgressRecordsForBatch(unixTimestamp, List(eventhubsSource1)),
|
||||
unixTimestamp)
|
||||
|
||||
val progressTracker2 = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource2.uid, progressRootPath.toString, appName, new Configuration())
|
||||
progressTracker2.commit(progressTracker2.collectProgressRecordsForBatch(unixTimestamp,
|
||||
List(eventhubsSource2)), unixTimestamp)
|
||||
|
||||
val eh1Progress = progressTracker1.read(eventhubsSource1.uid, unixTimestamp,
|
||||
fallBack = false)
|
||||
val eh2Progress = progressTracker2.read(eventhubsSource2.uid, unixTimestamp,
|
||||
fallBack = false)
|
||||
|
||||
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances.head) === (0L, 0L))
|
||||
assert(eh1Progress.offsets(eventhubsSource1.connectedInstances(1)) === (10L, 10L))
|
||||
assert(eh2Progress.offsets(eventhubsSource2.connectedInstances.head) === (0L, 0L))
|
||||
assert(eh2Progress.offsets(eventhubsSource2.connectedInstances(1)) === (100L, 100L))
|
||||
assert(eh2Progress.offsets(eventhubsSource2.connectedInstances(2)) === (200L, 200L))
|
||||
}
|
||||
|
||||
test("inconsistent timestamp in the progress tracks can be detected") {
|
||||
// Register two eventhubs connectors to structured streaming progress tracker
|
||||
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource1.uid -> eventhubsSource1
|
||||
StructuredStreamingProgressTracker.registeredConnectors +=
|
||||
eventhubsSource2.uid -> eventhubsSource2
|
||||
|
||||
// Progress record of all partitions of eventhubsSource1 are updated
|
||||
|
||||
val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()
|
||||
|
||||
var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
|
||||
progressWriter.write(unixTimestamp, 0L, 0L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
|
||||
eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
|
||||
progressWriter.write(unixTimestamp, 10L, 10L)
|
||||
|
||||
// Progress records of all partitions of eventhubsSource2 are not updated
|
||||
|
||||
val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
progressWriter.write(unixTimestamp, 0L, 0L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
progressWriter.write(unixTimestamp, 100L, 100L)
|
||||
|
||||
progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
|
||||
eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp,
|
||||
new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
|
||||
progressWriter.write(unixTimestamp + 1000L, 200L, 200L)
|
||||
|
||||
val progressTracker1 = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
|
||||
progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(unixTimestamp,
|
||||
List(eventhubsSource1)), unixTimestamp)
|
||||
|
||||
val progressTracker2 = StructuredStreamingProgressTracker
|
||||
.initInstance(eventhubsSource2.uid, progressRootPath.toString, appName, new Configuration())
|
||||
|
||||
intercept[IllegalStateException] {
|
||||
progressTracker2.commit(progressTracker2.collectProgressRecordsForBatch(unixTimestamp,
|
||||
List(eventhubsSource2)), unixTimestamp)
|
||||
}
|
||||
}
|
||||
|
||||
test("latest offsets can be committed correctly and temp directory is not cleaned") {
  // Register two eventhubs connectors to structured streaming progress tracker

  StructuredStreamingProgressTracker.registeredConnectors +=
    eventhubsSource1.uid -> eventhubsSource1
  StructuredStreamingProgressTracker.registeredConnectors +=
    eventhubsSource2.uid -> eventhubsSource2

  // Progress record of all partitions of eventhubsSource1 are updated

  val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()

  var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp, 0L, 0L)

  progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp, 10L, 10L)

  // Progress records of all partitions of eventhubsSource2 are not updated

  val eventHubsSourceStreamId2 = EventHubsSource.streamIdGenerator.get()

  progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
    eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
  progressWriter.write(unixTimestamp, 0L, 0L)

  progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
    eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
  progressWriter.write(unixTimestamp, 100L, 100L)

  progressWriter = new ProgressWriter(eventHubsSourceStreamId2,
    eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource2.uid)
  progressWriter.write(unixTimestamp, 200L, 200L)

  val progressTracker1 = StructuredStreamingProgressTracker
    .initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())
  progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
    unixTimestamp, List(eventhubsSource1)), unixTimestamp)

  val progressTracker2 = StructuredStreamingProgressTracker
    .initInstance(eventhubsSource2.uid, progressRootPath.toString, appName, new Configuration())
  progressTracker2.commit(progressTracker2.collectProgressRecordsForBatch(
    unixTimestamp, List(eventhubsSource2)), unixTimestamp)

  var progressTempPath = PathTools.makeTempDirectoryStr(
    progressRootPath.toString, appName, eventhubsSource1.uid)

  assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp)))

  assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp)))

  progressTempPath = PathTools.makeTempDirectoryStr(
    progressRootPath.toString, appName, eventhubsSource2.uid)

  assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
    eventhubsSource2.uid, eventhubsSource2.connectedInstances.head, unixTimestamp)))

  assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
    eventhubsSource2.uid, eventhubsSource2.connectedInstances(1), unixTimestamp)))

  assert(fileSystem.exists(PathTools.makeTempFilePath(progressTempPath, eventHubsSourceStreamId2,
    eventhubsSource2.uid, eventhubsSource2.connectedInstances(2), unixTimestamp)))
}

test("locate progress file correctly based on timestamp") {
  // Register one eventhubs connector to structured streaming progress tracker

  StructuredStreamingProgressTracker.registeredConnectors +=
    eventhubsSource1.uid -> eventhubsSource1

  // Progress record of all partitions of eventhubsSource1 are updated

  val eventHubsSourceStreamId1 = EventHubsSource.streamIdGenerator.get()

  // Update progress for unixTimestamp

  var progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp, 0L, 0L)

  progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp, 10L, 10L)

  // Update progress for unixTimestamp + 1000L

  progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp + 1000L,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp + 1000L, 20L, 20L)

  progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp + 1000L,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp + 1000L, 30L, 30L)

  // Update progress for unixTimestamp + 2000L

  progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances.head, unixTimestamp + 2000L,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp + 2000L, 40L, 40L)

  progressWriter = new ProgressWriter(eventHubsSourceStreamId1,
    eventhubsSource1.uid, eventhubsSource1.connectedInstances(1), unixTimestamp + 2000L,
    new Configuration(), progressRootPath.toString, appName, eventhubsSource1.uid)
  progressWriter.write(unixTimestamp + 2000L, 50L, 50L)

  val progressTracker1 = StructuredStreamingProgressTracker
    .initInstance(eventhubsSource1.uid, progressRootPath.toString, appName, new Configuration())

  progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
    unixTimestamp, List(eventhubsSource1)), unixTimestamp)
  progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
    unixTimestamp + 1000L, List(eventhubsSource1)), unixTimestamp + 1000L)
  progressTracker1.commit(progressTracker1.collectProgressRecordsForBatch(
    unixTimestamp + 2000L, List(eventhubsSource1)), unixTimestamp + 2000L)

  var eh1Progress = progressTracker1.read(eventhubsSource1.uid, unixTimestamp,
    fallBack = false)

  assert(eh1Progress.offsets(eventhubsSource1.connectedInstances.head) === (0L, 0L))
  assert(eh1Progress.offsets(eventhubsSource1.connectedInstances(1)) === (10L, 10L))

  eh1Progress = progressTracker1.read(eventhubsSource1.uid, unixTimestamp + 1000L,
    fallBack = false)

  assert(eh1Progress.offsets(eventhubsSource1.connectedInstances.head) === (20L, 20L))
  assert(eh1Progress.offsets(eventhubsSource1.connectedInstances(1)) === (30L, 30L))

  val progressFilePath = progressTracker1.pinPointProgressFile(fileSystem, unixTimestamp + 3000L)

  assert(progressFilePath === None)
}

override def beforeEach(): Unit = {
  super.beforeEach()
  init()
}

override def afterEach(): Unit = {
  reset()
}

protected def init(): Unit = {
  progressRootPath = new Path(Files.createTempDirectory("progress_root").toString)
  fileSystem = progressRootPath.getFileSystem(new Configuration())
  unixTimestamp = Instant.now.getEpochSecond
}

protected def reset(): Unit = {
  StructuredStreamingProgressTracker.reset()
  progressTracker = null
}

private val appName = "StrutcuredStreamingApp"

private val eventhubsNamedPartitions = Map("ns1" -> Seq(EventHubNameAndPartition("eh1", 0),
  EventHubNameAndPartition("eh1", 1)),
  "ns2" -> Seq(EventHubNameAndPartition("eh2", 0), EventHubNameAndPartition("eh2", 1),
    EventHubNameAndPartition("eh", 2)),
  "ns3" -> Seq(EventHubNameAndPartition("eh3", 0), EventHubNameAndPartition("eh3", 1),
    EventHubNameAndPartition("eh3", 2), EventHubNameAndPartition("eh3", 3),
    EventHubNameAndPartition("eh2", 0), EventHubNameAndPartition("eh2", 1)))

private val eventhubsSource1: EventHubsConnector = new EventHubsConnector {
  override def streamId = 0
  override def uid = "ns1_eh1"
  override def connectedInstances: List[EventHubNameAndPartition] =
    eventhubsNamedPartitions("ns1").toList
}

private val eventhubsSource2: EventHubsConnector = new EventHubsConnector {
  override def streamId = 0
  override def uid = "ns2_eh2"
  override def connectedInstances: List[EventHubNameAndPartition] =
    eventhubsNamedPartitions("ns2").toList
}

private val eventhubsSource3: EventHubsConnector = new EventHubsConnector {
  override def streamId = 0
  override def uid = "ns3_eh3"
  override def connectedInstances: List[EventHubNameAndPartition] =
    eventhubsNamedPartitions("ns3").filter(x => x.eventHubName.equals("eh3")).toList
}

private val eventhubsSource4: EventHubsConnector = new EventHubsConnector {
  override def streamId = 0
  override def uid = "ns3_eh2"
  override def connectedInstances: List[EventHubNameAndPartition] =
    eventhubsNamedPartitions("ns3").filter(x => x.eventHubName.equals("eh2")).toList
}

private var fileSystem: FileSystem = _
private var progressRootPath: Path = _
private var progressTracker: ProgressTrackerBase[_ <: EventHubsConnector] = _
private var unixTimestamp: Long = _
}

@ -17,16 +17,15 @@

package org.apache.spark.streaming.eventhubs

import scala.reflect.ClassTag

import org.apache.hadoop.fs.{Path, PathFilter}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.fs.{ Path, PathFilter }
import org.apache.spark.eventhubscommon.OffsetRecord
import org.apache.spark.streaming._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.eventhubs.checkpoint.DirectDStreamProgressTracker
import org.apache.spark.util.ManualClock
import org.apache.spark.{ SparkConf, SparkContext }

import scala.reflect.ClassTag

/**
 * A trait that can be mixed in to get methods for testing DStream operations under
|
@ -36,7 +35,8 @@ import org.apache.spark.util.ManualClock
|
|||
trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase { self: SharedUtils =>
|
||||
|
||||
protected def createContextForCheckpointOperation(
|
||||
batchDuration: Duration, checkpointDirectory: String): StreamingContext = {
|
||||
batchDuration: Duration,
|
||||
checkpointDirectory: String): StreamingContext = {
|
||||
val conf = new SparkConf().setMaster("local[*]").setAppName(appName)
|
||||
conf.set("spark.streaming.clock", classOf[ManualClock].getName)
|
||||
val ssc = new StreamingContext(SparkContext.getOrCreate(conf), batchDuration)
|
||||
|
@ -44,32 +44,41 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
ssc
|
||||
}
|
||||
|
||||
protected def getTestOutputStream[V: ClassTag](streams: Array[DStream[_]]):
|
||||
TestEventHubOutputStream[V] = {
|
||||
protected def getTestOutputStream[V: ClassTag](
|
||||
streams: Array[DStream[_]]): TestEventHubOutputStream[V] = {
|
||||
streams.collect {
|
||||
case ds: TestEventHubOutputStream[V @unchecked] => ds
|
||||
}.head
|
||||
}
|
||||
|
||||
private def validateTempFileCleanup(
|
||||
numNonExistBatch: Int,
|
||||
numBatches: Int,
|
||||
expectedFileNum: Int): Unit = {
|
||||
assert(fs.listStatus(new Path(progressRootPath.toString + s"/${appName}_temp"),
|
||||
new PathFilter {
|
||||
override def accept(path: Path): Boolean = {
|
||||
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker].
|
||||
fromPathToTimestamp(path) < 1000 * numNonExistBatch
|
||||
}
|
||||
}).length == 0)
|
||||
private def validateTempFileCleanup(numNonExistBatch: Int,
|
||||
numBatches: Int,
|
||||
expectedFileNum: Int): Unit = {
|
||||
assert(
|
||||
fs.listStatus(
|
||||
new Path(progressRootPath.toString + s"/${appName}_temp"),
|
||||
new PathFilter {
|
||||
override def accept(path: Path): Boolean = {
|
||||
DirectDStreamProgressTracker.getInstance
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.fromPathToTimestamp(path) < 1000 * numNonExistBatch
|
||||
}
|
||||
}
|
||||
)
|
||||
.length == 0)
|
||||
// we do not consider APIs like take() here
|
||||
assert(fs.listStatus(new Path(progressRootPath.toString + s"/${appName}_temp"),
|
||||
new PathFilter {
|
||||
override def accept(path: Path): Boolean = {
|
||||
DirectDStreamProgressTracker.getInstance.asInstanceOf[DirectDStreamProgressTracker].
|
||||
fromPathToTimestamp(path) == 1000 * numBatches
|
||||
}
|
||||
}).length == expectedFileNum)
|
||||
assert(
|
||||
fs.listStatus(
|
||||
new Path(progressRootPath.toString + s"/${appName}_temp"),
|
||||
new PathFilter {
|
||||
override def accept(path: Path): Boolean = {
|
||||
DirectDStreamProgressTracker.getInstance
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.fromPathToTimestamp(path) == 1000 * numBatches
|
||||
}
|
||||
}
|
||||
)
|
||||
.length == expectedFileNum)
|
||||
}
|
||||
|
||||
// NOTE: due to SPARK-19280 (https://issues.apache.org/jira/browse/SPARK-19280)
|
||||
|
@ -96,41 +105,41 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
}
|
||||
assert(fs.exists(new Path(progressRootPath.toString + s"/$appName/" +
|
||||
s"progress-${numBatches * 1000}")))
|
||||
*/
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
|
||||
protected def testCheckpointedOperation[U: ClassTag, V: ClassTag, W: ClassTag](
|
||||
input1: Seq[Seq[U]],
|
||||
input2: Seq[Seq[V]],
|
||||
eventhubsParams1: Map[String, Map[String, String]],
|
||||
eventhubsParams2: Map[String, Map[String, String]],
|
||||
expectedStartingOffsetsAndSeqs1: Map[String, OffsetRecord],
|
||||
expectedStartingOffsetsAndSeqs2: Map[String, OffsetRecord],
|
||||
operation: (EventHubDirectDStream, EventHubDirectDStream) => DStream[W],
|
||||
expectedOutputBeforeRestart: Seq[Seq[W]],
|
||||
expectedOutputAfterRestart: Seq[Seq[W]]) {
|
||||
input1: Seq[Seq[U]],
|
||||
input2: Seq[Seq[V]],
|
||||
eventhubsParams1: Map[String, Map[String, String]],
|
||||
eventhubsParams2: Map[String, Map[String, String]],
|
||||
expectedStartingOffsetsAndSeqs1: Map[String, OffsetRecord],
|
||||
expectedStartingOffsetsAndSeqs2: Map[String, OffsetRecord],
|
||||
operation: (EventHubDirectDStream, EventHubDirectDStream) => DStream[W],
|
||||
expectedOutputBeforeRestart: Seq[Seq[W]],
|
||||
expectedOutputAfterRestart: Seq[Seq[W]]) {
|
||||
|
||||
require(ssc.conf.get("spark.streaming.clock") === classOf[ManualClock].getName,
|
||||
"Cannot run test without manual clock in the conf")
|
||||
"Cannot run test without manual clock in the conf")
|
||||
|
||||
testBinaryOperation(
|
||||
input1,
|
||||
input2,
|
||||
eventhubsParams1,
|
||||
eventhubsParams2,
|
||||
expectedStartingOffsetsAndSeqs1,
|
||||
expectedStartingOffsetsAndSeqs2,
|
||||
operation,
|
||||
expectedOutputBeforeRestart)
|
||||
testBinaryOperation(input1,
|
||||
input2,
|
||||
eventhubsParams1,
|
||||
eventhubsParams2,
|
||||
expectedStartingOffsetsAndSeqs1,
|
||||
expectedStartingOffsetsAndSeqs2,
|
||||
operation,
|
||||
expectedOutputBeforeRestart)
|
||||
|
||||
validateProgressFileCleanup(expectedOutputBeforeRestart.length - 2,
|
||||
expectedOutputBeforeRestart.length)
|
||||
validateTempFileCleanup(expectedOutputBeforeRestart.length - 1,
|
||||
expectedOutputBeforeRestart.length)
|
||||
validateTempFileCleanup(
|
||||
expectedOutputBeforeRestart.length - 1,
|
||||
expectedOutputBeforeRestart.length,
|
||||
expectedStartingOffsetsAndSeqs1.values.flatMap(_.offsets).size +
|
||||
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size)
|
||||
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size
|
||||
)
|
||||
|
||||
val currentCheckpointDir = ssc.checkpointDir
|
||||
// simulate down
|
||||
|
@ -146,8 +155,9 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
)
|
||||
|
||||
runStreamsWithEventHubInput(ssc,
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart, useSet = true)
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart,
|
||||
useSet = true)
|
||||
|
||||
// test cleanup of progress files
|
||||
validateProgressFileCleanup(
|
||||
|
@ -157,7 +167,8 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 2,
|
||||
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 1,
|
||||
expectedStartingOffsetsAndSeqs1.values.flatMap(_.offsets).size +
|
||||
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size)
|
||||
expectedStartingOffsetsAndSeqs2.values.flatMap(_.offsets).size
|
||||
)
|
||||
}
|
||||
|
||||
protected def runStopAndRecover[U: ClassTag, V: ClassTag](
|
||||
|
@ -169,21 +180,19 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
expectedOutputBeforeRestart: Seq[Seq[V]],
|
||||
useSetFlag: Boolean = false): Unit = {
|
||||
|
||||
testUnaryOperation(
|
||||
input,
|
||||
eventhubsParams,
|
||||
expectedStartingOffsetsAndSeqs,
|
||||
operation,
|
||||
expectedOutputBeforeRestart,
|
||||
useSet = useSetFlag)
|
||||
testUnaryOperation(input,
|
||||
eventhubsParams,
|
||||
expectedStartingOffsetsAndSeqs,
|
||||
operation,
|
||||
expectedOutputBeforeRestart,
|
||||
useSet = useSetFlag)
|
||||
testProgressTracker(eventhubNamespace, expectedOffsetsAndSeqs, 4000L)
|
||||
|
||||
validateProgressFileCleanup(expectedOutputBeforeRestart.length - 2,
|
||||
expectedOutputBeforeRestart.length)
|
||||
validateTempFileCleanup(
|
||||
expectedOutputBeforeRestart.length - 1,
|
||||
expectedOutputBeforeRestart.length,
|
||||
expectedOffsetsAndSeqs.offsets.size)
|
||||
expectedOutputBeforeRestart.length)
|
||||
validateTempFileCleanup(expectedOutputBeforeRestart.length - 1,
|
||||
expectedOutputBeforeRestart.length,
|
||||
expectedOffsetsAndSeqs.offsets.size)
|
||||
|
||||
val currentCheckpointDir = ssc.checkpointDir
|
||||
// simulate down
|
||||
|
@ -204,10 +213,15 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
directoryToClean: Option[Path] = None) {
|
||||
|
||||
require(ssc.conf.get("spark.streaming.clock") === classOf[ManualClock].getName,
|
||||
"Cannot run test without manual clock in the conf")
|
||||
"Cannot run test without manual clock in the conf")
|
||||
|
||||
runStopAndRecover(input, eventhubsParams, expectedStartingOffsetsAndSeqs,
|
||||
expectedOffsetsAndSeqs, operation, expectedOutputBeforeRestart, useSetFlag = useSetFlag)
|
||||
runStopAndRecover(input,
|
||||
eventhubsParams,
|
||||
expectedStartingOffsetsAndSeqs,
|
||||
expectedOffsetsAndSeqs,
|
||||
operation,
|
||||
expectedOutputBeforeRestart,
|
||||
useSetFlag = useSetFlag)
|
||||
|
||||
if (directoryToClean.isDefined) {
|
||||
fs.delete(directoryToClean.get, true)
|
||||
|
@ -220,8 +234,10 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
"\n-------------------------------------------\n"
|
||||
)
|
||||
|
||||
runStreamsWithEventHubInput(ssc, expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart, useSet = useSetFlag)
|
||||
runStreamsWithEventHubInput(ssc,
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart,
|
||||
useSet = useSetFlag)
|
||||
|
||||
validateProgressFileCleanup(
|
||||
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 3,
|
||||
|
@ -229,6 +245,7 @@ trait CheckpointAndProgressTrackerTestSuiteBase extends EventHubTestSuiteBase {
|
|||
validateTempFileCleanup(
|
||||
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 2,
|
||||
expectedOutputBeforeRestart.length + expectedOutputAfterRestart.length - 1,
|
||||
expectedOffsetsAndSeqs.offsets.size)
|
||||
expectedOffsetsAndSeqs.offsets.size
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,10 +20,10 @@ package org.apache.spark.streaming.eventhubs

import org.mockito.Mockito
import org.scalatest.mock.MockitoSugar

import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
import org.apache.spark.eventhubscommon.client.EventHubClient
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
import org.apache.spark.eventhubscommon.client.Client
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Seconds, Time}
import org.apache.spark.streaming.{ Duration, Seconds, Time }

class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar with SharedUtils {

|
@ -41,12 +41,15 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
|
|||
)
|
||||
|
||||
test("skip the batch when EH endpoint is unavailable for starting seq number query") {
|
||||
val ehDStream = new EventHubDirectDStream(ssc, eventhubNamespace, progressRootPath.toString,
|
||||
Map("eh1" -> eventhubParameters))
|
||||
val eventHubClientMock = mock[EventHubClient]
|
||||
Mockito.when(eventHubClientMock.startSeqOfPartition(retryIfFail = false,
|
||||
ehDStream.connectedInstances)).
|
||||
thenReturn(None)
|
||||
val ehDStream = new EventHubDirectDStream(ssc,
|
||||
eventhubNamespace,
|
||||
progressRootPath.toString,
|
||||
Map("eh1" -> eventhubParameters))
|
||||
val eventHubClientMock = mock[Client]
|
||||
Mockito
|
||||
.when(
|
||||
eventHubClientMock.startSeqOfPartition(retryIfFail = false, ehDStream.connectedInstances))
|
||||
.thenReturn(None)
|
||||
ehDStream.setEventHubClient(eventHubClientMock)
|
||||
ssc.scheduler.start()
|
||||
intercept[IllegalArgumentException] {
|
||||
|
@ -55,17 +58,21 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
|
|||
}
|
||||
|
||||
test("skip the batch when EH endpoint is unavailable for highest offset query") {
|
||||
val ehDStream = new EventHubDirectDStream(ssc, eventhubNamespace, progressRootPath.toString,
|
||||
Map("eh1" -> eventhubParameters))
|
||||
val eventHubClientMock = mock[EventHubClient]
|
||||
val dummyStartSeqMap = (0 until 32).map(partitionId =>
|
||||
(EventHubNameAndPartition("eh1", partitionId), 1L)).toMap
|
||||
Mockito.when(eventHubClientMock.startSeqOfPartition(retryIfFail = false,
|
||||
ehDStream.connectedInstances)).
|
||||
thenReturn(Some(dummyStartSeqMap))
|
||||
Mockito.when(eventHubClientMock.endPointOfPartition(retryIfFail = true,
|
||||
ehDStream.connectedInstances)).
|
||||
thenReturn(None)
|
||||
val ehDStream = new EventHubDirectDStream(ssc,
|
||||
eventhubNamespace,
|
||||
progressRootPath.toString,
|
||||
Map("eh1" -> eventhubParameters))
|
||||
val eventHubClientMock = mock[Client]
|
||||
val dummyStartSeqMap =
|
||||
(0 until 32).map(partitionId => (EventHubNameAndPartition("eh1", partitionId), 1L)).toMap
|
||||
Mockito
|
||||
.when(
|
||||
eventHubClientMock.startSeqOfPartition(retryIfFail = false, ehDStream.connectedInstances))
|
||||
.thenReturn(Some(dummyStartSeqMap))
|
||||
Mockito
|
||||
.when(
|
||||
eventHubClientMock.endPointOfPartition(retryIfFail = true, ehDStream.connectedInstances))
|
||||
.thenReturn(None)
|
||||
ehDStream.setEventHubClient(eventHubClientMock)
|
||||
ssc.scheduler.start()
|
||||
intercept[IllegalArgumentException] {
|
||||
|
@ -81,112 +88,166 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
|
|||
|
||||
test("interaction among Listener/ProgressTracker/Spark Streaming (single stream)") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutput = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutput = Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
testUnaryOperation(
|
||||
input,
|
||||
eventhubsParams = Map[String, Map[String, String]](
|
||||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput)
|
||||
testProgressTracker(eventhubNamespace,
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))), 4000L)
|
||||
expectedOutput
|
||||
)
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L
|
||||
)
|
||||
}
|
||||
|
||||
test("interaction among Listener/ProgressTracker/Spark Streaming (single stream +" +
|
||||
" windowing function)") {
|
||||
test(
|
||||
"interaction among Listener/ProgressTracker/Spark Streaming (single stream +" +
|
||||
" windowing function)") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutput = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
|
||||
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
|
||||
val expectedOutput = Seq(Seq(2, 3, 5, 6, 8, 9),
|
||||
Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
|
||||
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
|
||||
testUnaryOperation(
|
||||
input,
|
||||
eventhubsParams = Map[String, Map[String, String]](
|
||||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.window(Seconds(2), Seconds(1)).map(
|
||||
eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput)
|
||||
testProgressTracker(eventhubNamespace,
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))), 4000L)
|
||||
inputDStream
|
||||
.window(Seconds(2), Seconds(1))
|
||||
.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput
|
||||
)
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L
|
||||
)
|
||||
}
|
||||
|
||||
test("interaction among Listener/ProgressTracker/Spark Streaming (multi-streams join)") {
|
||||
import scala.collection.JavaConverters._
|
||||
val input1 = Seq(
|
||||
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
|
||||
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
|
||||
val input2 = Seq(
|
||||
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
|
||||
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
|
||||
val expectedOutput = Seq(
|
||||
Seq("a" -> 2, "b" -> 4, "c" -> 6, "g" -> 8, "h" -> 10, "i" -> 12, "m" -> 14, "n" -> 16,
|
||||
"o" -> 18),
|
||||
Seq("d" -> 8, "e" -> 10, "f" -> 12, "j" -> 14, "k" -> 16, "l" -> 18, "p" -> 2, "q" -> 4,
|
||||
"r" -> 6))
|
||||
val input1 = Seq(Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
|
||||
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
|
||||
val input2 = Seq(Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6),
|
||||
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9),
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3))
|
||||
val expectedOutput = Seq(Seq("a" -> 2,
|
||||
"b" -> 4,
|
||||
"c" -> 6,
|
||||
"g" -> 8,
|
||||
"h" -> 10,
|
||||
"i" -> 12,
|
||||
"m" -> 14,
|
||||
"n" -> 16,
|
||||
"o" -> 18),
|
||||
Seq("d" -> 8,
|
||||
"e" -> 10,
|
||||
"f" -> 12,
|
||||
"j" -> 14,
|
||||
"k" -> 16,
|
||||
"l" -> 18,
|
||||
"p" -> 2,
|
||||
"q" -> 4,
|
||||
"r" -> 6))
|
||||
|
||||
testBinaryOperation(input1, input2,
|
||||
testBinaryOperation(
|
||||
input1,
|
||||
input2,
|
||||
eventhubsParams1 = Map[String, Map[String, String]](
|
||||
"eh11" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "3",
|
||||
"eventhubs.name" -> "eh11")
|
||||
"eh11" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "3",
|
||||
"eventhubs.name" -> "eh11",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
eventhubsParams2 = Map[String, Map[String, String]](
|
||||
"eh21" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "3",
|
||||
"eventhubs.name" -> "eh21")
|
||||
"eventhubs.name" -> "eh21",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs1 = Map("namespace1" ->
|
||||
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh11", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh11", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh11", 2) -> (2L, 2L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs2 = Map("namespace2" ->
|
||||
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh21", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh21", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh21", 2) -> (2L, 2L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs1 = Map(
|
||||
"namespace1" ->
|
||||
OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh11", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh11", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh11", 2) -> (2L, 2L)))),
|
||||
expectedOffsetsAndSeqs2 = Map(
|
||||
"namespace2" ->
|
||||
OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh21", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh21", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh21", 2) -> (2L, 2L)))),
|
||||
// join and sum up the value
|
||||
operation = (inputDStream1: EventHubDirectDStream, inputDStream2: EventHubDirectDStream) =>
|
||||
inputDStream1.flatMap(eventData => eventData.getProperties.asScala).
|
||||
join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala)).
|
||||
map{case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int])},
|
||||
expectedOutput)
|
||||
testProgressTracker("namespace1",
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh11", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh11", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh11", 2) -> (5L, 5L))), 3000L)
|
||||
testProgressTracker("namespace2",
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh21", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh21", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh21", 2) -> (5L, 5L))), 3000L)
|
||||
inputDStream1
|
||||
.flatMap(eventData => eventData.getProperties.asScala)
|
||||
.join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala))
|
||||
.map { case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int]) },
|
||||
expectedOutput
|
||||
)
|
||||
testProgressTracker(
|
||||
"namespace1",
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh11", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh11", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh11", 2) -> (5L, 5L))),
|
||||
3000L
|
||||
)
|
||||
testProgressTracker(
|
||||
"namespace2",
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh21", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh21", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh21", 2) -> (5L, 5L))),
|
||||
3000L
|
||||
)
|
||||
}
|
||||
|
||||
test("update offset correctly when RDD operation only involves some of the partitions") {
|
||||
|
@ -198,60 +259,81 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput,
|
||||
rddOperation = Some((rdd: RDD[Int], t: Time) => {
|
||||
Array(rdd.take(1).toSeq)
|
||||
}))
|
||||
})
|
||||
)
|
||||
|
||||
testProgressTracker(eventhubNamespace,
|
||||
OffsetRecord(3000L, Map(
|
||||
EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))),
|
||||
4000L)
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))),
|
||||
4000L
|
||||
)
|
||||
}
|
||||
|
||||
test("continue stream correctly when there is fluctuation") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutput = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(), Seq(), Seq(), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutput = Seq(Seq(2, 3, 5, 6, 8, 9),
|
||||
Seq(4, 5, 7, 8, 10, 2),
|
||||
Seq(),
|
||||
Seq(),
|
||||
Seq(),
|
||||
Seq(6, 7, 9, 10, 3, 4))
|
||||
testFluctuatedStream(
|
||||
input,
|
||||
eventhubsParams = Map[String, Map[String, String]](
|
||||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(5000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(5000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput,
|
||||
messagesBeforeEmpty = 4,
|
||||
numBatchesBeforeNewData = 5)
|
||||
testProgressTracker(eventhubNamespace,
|
||||
OffsetRecord(6000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
7000L)
|
||||
numBatchesBeforeNewData = 5
|
||||
)
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
OffsetRecord(6000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
7000L
|
||||
)
|
||||
}
|
||||
|
||||
test("filter messages for enqueueTime correctly") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutput = Seq(
|
||||
Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
|
||||
val expectedOutput = Seq(Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
|
||||
testUnaryOperation(
|
||||
input,
|
||||
eventhubsParams = Map[String, Map[String, String]](
|
||||
|
@ -259,27 +341,35 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
|
|||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.filter.enqueuetime" -> "3000"
|
||||
"eventhubs.filter.enqueuetime" -> "3000",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput)
|
||||
testProgressTracker(eventhubNamespace,
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))), 4000L)
|
||||
expectedOutput
|
||||
)
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L
|
||||
)
|
||||
}
|
||||
|
||||
test("pass-in enqueuetime is not allowed to be later than the highest enqueuetime") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutput = Seq(
|
||||
Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
|
||||
val expectedOutput = Seq(Seq(5, 6, 8, 9, 2, 3), Seq(7, 10, 4), Seq())
|
||||
intercept[IllegalArgumentException] {
|
||||
testUnaryOperation(
|
||||
input,
|
||||
|
@ -291,15 +381,17 @@ class EventHubDirectDStreamSuite extends EventHubTestSuiteBase with MockitoSugar
|
|||
"eventhubs.filter.enqueuetime" -> "10000"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").
|
||||
asInstanceOf[Int] + 1),
|
||||
expectedOutput)
|
||||
inputDStream.map(eventData =>
|
||||
eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,32 +17,33 @@
|
|||
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import java.io.{IOException, ObjectInputStream}
|
||||
import java.io.{ IOException, ObjectInputStream }
|
||||
import java.util.concurrent.ConcurrentLinkedQueue
|
||||
|
||||
import scala.reflect.ClassTag
|
||||
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
|
||||
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.eventhubscommon.utils._
|
||||
import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.streaming._
|
||||
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
|
||||
import org.apache.spark.streaming.dstream.{ DStream, ForEachDStream }
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.DirectDStreamProgressTracker
|
||||
import org.apache.spark.util.{ManualClock, Utils}
|
||||
|
||||
import org.apache.spark.util.{ ManualClock, Utils }
|
||||
|
||||
private[eventhubs] class TestEventHubOutputStream[T: ClassTag](
|
||||
parent: DStream[T],
|
||||
val output: ConcurrentLinkedQueue[Seq[Seq[T]]] = new ConcurrentLinkedQueue[Seq[Seq[T]]](),
|
||||
rddOperation: Option[(RDD[T], Time) => Array[Seq[T]]])
|
||||
extends ForEachDStream[T](parent, {
|
||||
(rdd: RDD[T], t: Time) =>
|
||||
val rddOpToApply = rddOperation.getOrElse(
|
||||
(rdd: RDD[T], t: Time) => rdd.glom().collect().map(_.toSeq))
|
||||
val resultsInABatch = rddOpToApply(rdd, t)
|
||||
output.add(resultsInABatch)
|
||||
}, false) {
|
||||
extends ForEachDStream[T](
|
||||
parent, { (rdd: RDD[T], t: Time) =>
|
||||
val rddOpToApply =
|
||||
rddOperation.getOrElse((rdd: RDD[T], t: Time) => rdd.glom().collect().map(_.toSeq))
|
||||
val resultsInABatch = rddOpToApply(rdd, t)
|
||||
output.add(resultsInABatch)
|
||||
},
|
||||
false
|
||||
) {
|
||||
|
||||
// This is to clear the output buffer every it is read from a checkpoint
|
||||
@throws(classOf[IOException])
|
||||
|
@ -75,8 +76,8 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
val inputStream1 = setupEventHubInputStream(namespace1, simulatedEventHubs1, eventhubsParams1)
|
||||
val inputStream2 = setupEventHubInputStream(namespace2, simulatedEventHubs2, eventhubsParams2)
|
||||
val operatedStream = operation(inputStream1, inputStream2)
|
||||
val outputStream = new TestEventHubOutputStream(operatedStream,
|
||||
new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
|
||||
val outputStream =
|
||||
new TestEventHubOutputStream(operatedStream, new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
|
||||
outputStream.register()
|
||||
ssc
|
||||
}
|
||||
|
@ -88,11 +89,12 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
rddOperation: Option[(RDD[V], Time) => Array[Seq[V]]]): StreamingContext = {
|
||||
|
||||
// Setup the stream computation
|
||||
val inputStream = setupEventHubInputStream(eventhubNamespace, simulatedEventHubs,
|
||||
eventhubsParams)
|
||||
val inputStream =
|
||||
setupEventHubInputStream(eventhubNamespace, simulatedEventHubs, eventhubsParams)
|
||||
val operatedStream = operation(inputStream)
|
||||
val outputStream = new TestEventHubOutputStream(operatedStream,
|
||||
new ConcurrentLinkedQueue[Seq[Seq[V]]], rddOperation)
|
||||
new ConcurrentLinkedQueue[Seq[Seq[V]]],
|
||||
rddOperation)
|
||||
outputStream.register()
|
||||
ssc
|
||||
}
|
||||
|
@ -107,12 +109,19 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
namespace,
|
||||
progressRootPath.toString,
|
||||
eventhubsParams,
|
||||
(eventHubParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubParams, simulatedEventHubs, partitionId,
|
||||
startOffset, eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) => FragileEventHubClient.getInstance("",
|
||||
Map()))
|
||||
(eventHubParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubParams,
|
||||
simulatedEventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
FragileEventHubClient.getInstance("", Map())
|
||||
)
|
||||
}
|
||||
|
||||
private def setupFragileEventHubStream[V: ClassTag](
|
||||
|
@ -120,25 +129,23 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
eventhubsParams: Map[String, Map[String, String]],
|
||||
operation: EventHubDirectDStream => DStream[V]): StreamingContext = {
|
||||
|
||||
val inputStream = setupFragileInputStream(eventhubNamespace, simulatedEventHubs,
|
||||
eventhubsParams)
|
||||
val inputStream =
|
||||
setupFragileInputStream(eventhubNamespace, simulatedEventHubs, eventhubsParams)
|
||||
val operatedStream = operation(inputStream)
|
||||
val outputStream = new TestEventHubOutputStream(operatedStream,
|
||||
new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
|
||||
val outputStream =
|
||||
new TestEventHubOutputStream(operatedStream, new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
|
||||
outputStream.register()
|
||||
ssc
|
||||
}
|
||||
|
||||
def testFragileStream[U: ClassTag, V: ClassTag](
|
||||
input: Seq[Seq[U]],
|
||||
eventhubsParams: Map[String, Map[String, String]],
|
||||
expectedOffsetsAndSeqs: Map[String, OffsetRecord],
|
||||
operation: EventHubDirectDStream => DStream[V],
|
||||
expectedOutput: Seq[Seq[V]]) {
|
||||
def testFragileStream[U: ClassTag, V: ClassTag](input: Seq[Seq[U]],
|
||||
eventhubsParams: Map[String, Map[String, String]],
|
||||
expectedOffsetsAndSeqs: Map[String, OffsetRecord],
|
||||
operation: EventHubDirectDStream => DStream[V],
|
||||
expectedOutput: Seq[Seq[V]]) {
|
||||
val numBatches_ = expectedOutput.size
|
||||
val simulatedEventHubs = createSimulatedEventHub(eventhubNamespace, input, eventhubsParams)
|
||||
withStreamingContext(
|
||||
setupFragileEventHubStream(simulatedEventHubs, eventhubsParams, operation)) {
|
||||
withStreamingContext(setupFragileEventHubStream(simulatedEventHubs, eventhubsParams, operation)) {
|
||||
ssc =>
|
||||
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = false)
|
||||
}
|
||||
|
@ -150,23 +157,32 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
simulatedEventHubs: SimulatedEventHubs,
|
||||
eventhubsParams: Map[String, Map[String, String]]): EventHubDirectDStream = {
|
||||
|
||||
val maxOffsetForEachEventHub = EventHubsTestUtilities.getHighestOffsetPerPartition(
|
||||
simulatedEventHubs)
|
||||
val maxOffsetForEachEventHub =
|
||||
EventHubsTestUtilities.getHighestOffsetPerPartition(simulatedEventHubs)
|
||||
|
||||
new EventHubDirectDStream(ssc, namespace,
|
||||
progressRootPath.toString, eventhubsParams,
|
||||
(eventHubParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubParams, simulatedEventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
new EventHubDirectDStream(
|
||||
ssc,
|
||||
namespace,
|
||||
progressRootPath.toString,
|
||||
eventhubsParams,
|
||||
(eventHubParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubParams,
|
||||
simulatedEventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new TestRestEventHubClient(maxOffsetForEachEventHub))
|
||||
new TestRestEventHubClient(maxOffsetForEachEventHub)
|
||||
)
|
||||
}
|
||||
|
||||
def runEventHubStreams[V: ClassTag](
|
||||
ssc: StreamingContext,
|
||||
numBatches: Int,
|
||||
numExpectedOutput: Int): Seq[Seq[V]] = {
|
||||
def runEventHubStreams[V: ClassTag](ssc: StreamingContext,
|
||||
numBatches: Int,
|
||||
numExpectedOutput: Int): Seq[Seq[V]] = {
|
||||
// Flatten each RDD into a single Seq
|
||||
runEventHubStreamsWithPartitions(ssc, numBatches, numExpectedOutput).map(_.flatten.toSeq)
|
||||
}
|
||||
|
@ -182,10 +198,9 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
* This function is copied from Spark code base and modified by changing the TestOutputStream
|
||||
* implementation
|
||||
*/
|
||||
def runEventHubStreamsWithPartitions[V: ClassTag](
|
||||
ssc: StreamingContext,
|
||||
numBatches: Int,
|
||||
numExpectedOutput: Int): Seq[Seq[Seq[V]]] = {
|
||||
def runEventHubStreamsWithPartitions[V: ClassTag](ssc: StreamingContext,
|
||||
numBatches: Int,
|
||||
numExpectedOutput: Int): Seq[Seq[Seq[V]]] = {
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
|
@ -194,9 +209,10 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
logInfo("numBatches = " + numBatches + ", numExpectedOutput = " + numExpectedOutput)
|
||||
|
||||
// Get the output buffer
|
||||
val outputStream = ssc.graph.getOutputStreams.
|
||||
filter(_.isInstanceOf[TestEventHubOutputStream[_]]).
|
||||
head.asInstanceOf[TestEventHubOutputStream[V]]
|
||||
val outputStream = ssc.graph.getOutputStreams
|
||||
.filter(_.isInstanceOf[TestEventHubOutputStream[_]])
|
||||
.head
|
||||
.asInstanceOf[TestEventHubOutputStream[V]]
|
||||
val output = outputStream.output
|
||||
|
||||
try {
|
||||
|
@ -220,7 +236,7 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
// Wait until expected number of output items have been generated
|
||||
val startTime = System.currentTimeMillis()
|
||||
while (output.size < numExpectedOutput &&
|
||||
System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
|
||||
System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
|
||||
logInfo("output.size = " + output.size + ", numExpectedOutput = " + numExpectedOutput)
|
||||
ssc.awaitTerminationOrTimeout(50)
|
||||
}
|
||||
|
@ -241,44 +257,46 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
namespace: String,
|
||||
input: Seq[Seq[U]],
|
||||
eventhubsParams: Map[String, Map[String, String]]): SimulatedEventHubs = {
|
||||
val ehAndRawInputMap = eventhubsParams.keys.flatMap {
|
||||
eventHubName =>
|
||||
val ehList = {
|
||||
for (i <- 0 until eventhubsParams(eventHubName)("eventhubs.partition.count").toInt)
|
||||
yield EventHubNameAndPartition(eventHubName, i)
|
||||
}.toArray
|
||||
ehList.zip(input)
|
||||
val ehAndRawInputMap = eventhubsParams.keys.flatMap { eventHubName =>
|
||||
val ehList = {
|
||||
for (i <- 0 until eventhubsParams(eventHubName)("eventhubs.partition.count").toInt)
|
||||
yield EventHubNameAndPartition(eventHubName, i)
|
||||
}.toArray
|
||||
ehList.zip(input)
|
||||
}.toMap
|
||||
new SimulatedEventHubs(namespace,
|
||||
new SimulatedEventHubs(
|
||||
namespace,
|
||||
ehAndRawInputMap.map {
|
||||
case (eventHubNameAndPartition, propertyQueue) =>
|
||||
(eventHubNameAndPartition,
|
||||
EventHubsTestUtilities.generateEventData(
|
||||
propertyQueue.map(property => ('e', Seq(property))),
|
||||
eventHubNameAndPartition.partitionId,
|
||||
0))
|
||||
})
|
||||
EventHubsTestUtilities.generateEventData(
|
||||
propertyQueue.map(property => ('e', Seq(property))),
|
||||
eventHubNameAndPartition.partitionId,
|
||||
0))
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
protected def verifyOffsetsAndSeqs(
|
||||
ssc: StreamingContext,
|
||||
namespace: String,
|
||||
expectedOffsetsAndSeqs: Map[String, OffsetRecord]): Unit = {
|
||||
val producedOffsetsAndSeqs = ssc.graph.getInputStreams().filter(
|
||||
_.isInstanceOf[EventHubDirectDStream]).map(_.asInstanceOf[EventHubDirectDStream]).
|
||||
filter(_.eventHubNameSpace == namespace).
|
||||
map(eventHubStream => (eventHubStream.eventHubNameSpace,
|
||||
eventHubStream.currentOffsetsAndSeqNums)).toMap
|
||||
protected def verifyOffsetsAndSeqs(ssc: StreamingContext,
|
||||
namespace: String,
|
||||
expectedOffsetsAndSeqs: Map[String, OffsetRecord]): Unit = {
|
||||
val producedOffsetsAndSeqs = ssc.graph
|
||||
.getInputStreams()
|
||||
.filter(_.isInstanceOf[EventHubDirectDStream])
|
||||
.map(_.asInstanceOf[EventHubDirectDStream])
|
||||
.filter(_.eventHubNameSpace == namespace)
|
||||
.map(eventHubStream =>
|
||||
(eventHubStream.eventHubNameSpace, eventHubStream.currentOffsetsAndSeqNums))
|
||||
.toMap
|
||||
assert(expectedOffsetsAndSeqs === producedOffsetsAndSeqs)
|
||||
}
|
||||
|
||||
def testProgressTracker(
|
||||
namespace: String,
|
||||
expectedOffsetsAndSeqs: OffsetRecord,
|
||||
timestamp: Long): Unit = {
|
||||
val producedOffsetsAndSeqs = DirectDStreamProgressTracker.getInstance.
|
||||
asInstanceOf[DirectDStreamProgressTracker].read(namespace,
|
||||
timestamp - batchDuration.milliseconds, fallBack = true)
|
||||
def testProgressTracker(namespace: String,
|
||||
expectedOffsetsAndSeqs: OffsetRecord,
|
||||
timestamp: Long): Unit = {
|
||||
val producedOffsetsAndSeqs = DirectDStreamProgressTracker.getInstance
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.read(namespace, timestamp - batchDuration.milliseconds, fallBack = true)
|
||||
assert(producedOffsetsAndSeqs === expectedOffsetsAndSeqs)
|
||||
}
|
||||
|
||||
|
@ -298,19 +316,24 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
val simulatedEventHubs1 = createSimulatedEventHub("namespace1", input1, eventhubsParams1)
|
||||
val simulatedEventHubs2 = createSimulatedEventHub("namespace2", input2, eventhubsParams2)
|
||||
|
||||
withStreamingContext(setupMultiEventHubStreams(simulatedEventHubs1, simulatedEventHubs2,
|
||||
eventhubsParams1, eventhubsParams2, "namespace1", "namespace2", operation)) { ssc =>
|
||||
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = true)
|
||||
withStreamingContext(
|
||||
setupMultiEventHubStreams(simulatedEventHubs1,
|
||||
simulatedEventHubs2,
|
||||
eventhubsParams1,
|
||||
eventhubsParams2,
|
||||
"namespace1",
|
||||
"namespace2",
|
||||
operation)) { ssc =>
|
||||
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = true)
|
||||
}
|
||||
verifyOffsetsAndSeqs(ssc, "namespace1", expectedOffsetsAndSeqs1)
|
||||
verifyOffsetsAndSeqs(ssc, "namespace2", expectedOffsetsAndSeqs2)
|
||||
}
|
||||
|
||||
protected def runStreamsWithEventHubInput[V: ClassTag](
|
||||
ssc: StreamingContext,
|
||||
numBatches: Int,
|
||||
expectedOutput: Seq[Seq[V]],
|
||||
useSet: Boolean): Unit = {
|
||||
protected def runStreamsWithEventHubInput[V: ClassTag](ssc: StreamingContext,
|
||||
numBatches: Int,
|
||||
expectedOutput: Seq[Seq[V]],
|
||||
useSet: Boolean): Unit = {
|
||||
val output = runEventHubStreams[V](ssc, numBatches, expectedOutput.size)
|
||||
verifyOutput[V](output, expectedOutput, useSet)
|
||||
}
|
||||
|
@@ -323,20 +346,30 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
eventhubsParams: Map[String, Map[String, String]]): EventHubDirectDStream = {
|
||||
|
||||
val maxOffsetForEachEventHub = simulatedEventHubs.messageStore.map {
|
||||
case (ehNameAndPartition, messageQueue) => (ehNameAndPartition,
|
||||
(messageQueue.length.toLong - 1, messageQueue.length.toLong - 1))
|
||||
case (ehNameAndPartition, messageQueue) =>
|
||||
(ehNameAndPartition, (messageQueue.length.toLong - 1, messageQueue.length.toLong - 1))
|
||||
}
|
||||
new EventHubDirectDStream(ssc,
|
||||
new EventHubDirectDStream(
|
||||
ssc,
|
||||
namespace,
|
||||
progressRootPath.toString,
|
||||
eventhubsParams,
|
||||
(eventHubParams: Map[String, String], partitionId: Int, startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType, _: Int) =>
|
||||
new TestEventHubsReceiver(eventHubParams, simulatedEventHubs, partitionId, startOffset,
|
||||
eventHubsOffsetType),
|
||||
(eventHubParams: Map[String, String],
|
||||
partitionId: Int,
|
||||
startOffset: Long,
|
||||
eventHubsOffsetType: EventHubsOffsetType,
|
||||
_: Int) =>
|
||||
new TestEventHubsReceiver(eventHubParams,
|
||||
simulatedEventHubs,
|
||||
partitionId,
|
||||
startOffset,
|
||||
eventHubsOffsetType),
|
||||
(_: String, _: Map[String, Map[String, String]]) =>
|
||||
new FluctuatedEventHubClient(ssc, messagesBeforeEmpty, numBatchesBeforeNewData,
|
||||
maxOffsetForEachEventHub))
|
||||
new FluctuatedEventHubClient(ssc,
|
||||
messagesBeforeEmpty,
|
||||
numBatchesBeforeNewData,
|
||||
maxOffsetForEachEventHub)
|
||||
)
|
||||
}
|
||||
|
||||
private def setupFluctuatedEventHubStream[V: ClassTag](
|
||||
|
@@ -346,11 +379,14 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
messagesBeforeEmpty: Long,
|
||||
numBatchesBeforeNewData: Int): StreamingContext = {
|
||||
|
||||
val inputStream = setupFluctuatedInputStream(eventhubNamespace, simulatedEventHubs,
|
||||
messagesBeforeEmpty, numBatchesBeforeNewData, eventhubsParams)
|
||||
val inputStream = setupFluctuatedInputStream(eventhubNamespace,
|
||||
simulatedEventHubs,
|
||||
messagesBeforeEmpty,
|
||||
numBatchesBeforeNewData,
|
||||
eventhubsParams)
|
||||
val operatedStream = operation(inputStream)
|
||||
val outputStream = new TestEventHubOutputStream(operatedStream,
|
||||
new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
|
||||
val outputStream =
|
||||
new TestEventHubOutputStream(operatedStream, new ConcurrentLinkedQueue[Seq[Seq[V]]], None)
|
||||
outputStream.register()
|
||||
ssc
|
||||
}
|
||||
|
@@ -368,10 +404,12 @@ private[eventhubs] trait EventHubTestSuiteBase extends TestSuiteBase {
|
|||
val simulatedEventHubs = createSimulatedEventHub(eventhubNamespace, input, eventhubsParams)
|
||||
|
||||
withStreamingContext(
|
||||
setupFluctuatedEventHubStream(simulatedEventHubs, eventhubsParams, operation,
|
||||
messagesBeforeEmpty, numBatchesBeforeNewData)) {
|
||||
ssc =>
|
||||
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = false)
|
||||
setupFluctuatedEventHubStream(simulatedEventHubs,
|
||||
eventhubsParams,
|
||||
operation,
|
||||
messagesBeforeEmpty,
|
||||
numBatchesBeforeNewData)) { ssc =>
|
||||
runStreamsWithEventHubInput(ssc, numBatches_, expectedOutput, useSet = false)
|
||||
}
|
||||
verifyOffsetsAndSeqs(ssc, eventhubNamespace, expectedOffsetsAndSeqs)
|
||||
}
|
||||
|
|
|
@@ -1,171 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
|
||||
import com.microsoft.azure.eventhubs._
|
||||
import com.microsoft.azure.eventhubs.EventData.SystemProperties
|
||||
import com.microsoft.azure.eventhubs.amqp.AmqpConstants
|
||||
import org.mockito.Mockito._
|
||||
import org.mockito.internal.util.reflection.Whitebox
|
||||
import org.scalatest.mock.MockitoSugar
|
||||
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsClientWrapper
|
||||
import org.apache.spark.storage.StorageLevel
|
||||
import org.apache.spark.streaming.{StreamingContext, TestSuiteBase}
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
|
||||
import org.apache.spark.streaming.receiver.ReceiverSupervisor
|
||||
|
||||
|
||||
/**
|
||||
* Suite of EventHubs streaming receiver tests
|
||||
 * This suite consists of low-level unit tests that call EventHubsReceiver directly with mocks
|
||||
*/
|
||||
class EventHubsReceiverSuite extends TestSuiteBase with MockitoSugar {
|
||||
var eventhubsClientWrapperMock: EventHubsClientWrapper = _
|
||||
var offsetStoreMock: OffsetStore = _
|
||||
var executorMock: ReceiverSupervisor = _
|
||||
|
||||
val eventhubParameters = Map(
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.name" -> "name",
|
||||
"eventhubs.partition.count" -> "4",
|
||||
"eventhubs.checkpoint.dir" -> "checkpointdir",
|
||||
"eventhubs.checkpoint.interval" -> "1000"
|
||||
)
|
||||
|
||||
override def beforeFunction(): Unit = {
|
||||
eventhubsClientWrapperMock = mock[EventHubsClientWrapper]
|
||||
offsetStoreMock = mock[OffsetStore]
|
||||
executorMock = mock[ReceiverSupervisor]
|
||||
}
|
||||
|
||||
override def afterFunction(): Unit = {
|
||||
super.afterFunction()
|
||||
// Since this suite was originally written using EasyMock, add this to preserve the old
|
||||
// mocking semantics (see SPARK-5735 for more details)
|
||||
// verifyNoMoreInteractions(ehClientWrapperMock, offsetStoreMock)
|
||||
}
|
||||
|
||||
test("EventHubsUtils API works") {
|
||||
val streamingContext = new StreamingContext(master, framework, batchDuration)
|
||||
EventHubsUtils.createStream(streamingContext, eventhubParameters, "0", StorageLevel.MEMORY_ONLY)
|
||||
EventHubsUtils.createUnionStream(streamingContext, eventhubParameters,
|
||||
StorageLevel.MEMORY_ONLY_2)
|
||||
streamingContext.stop()
|
||||
}
|
||||
|
||||
test("EventHubsReceiver can receive message with proper checkpointing") {
|
||||
val eventhubPartitionId = "0"
|
||||
val eventCheckpointIntervalInSeconds: Int = 1
|
||||
val eventOffset: String = "2147483647"
|
||||
val eventSequenceNumber: Long = 1
|
||||
val maximumEventRate: Int = 999
|
||||
|
||||
var updatedEventhubsParams = eventhubParameters
|
||||
updatedEventhubsParams += "eventhubs.checkpoint.interval" ->
|
||||
eventCheckpointIntervalInSeconds.toString
|
||||
|
||||
var eventData = new EventData(Array.fill(8)((scala.util.Random.nextInt(256) - 128).toByte))
|
||||
|
||||
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
|
||||
|
||||
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, eventOffset)
|
||||
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME,
|
||||
Long.box(eventSequenceNumber))
|
||||
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, eventhubPartitionId)
|
||||
|
||||
val systemProperties = new SystemProperties(systemPropertiesMap)
|
||||
|
||||
Whitebox.setInternalState(eventData, "systemProperties", systemProperties)
|
||||
|
||||
val eventDataCollection = new ArrayBuffer[EventData]
|
||||
eventDataCollection += eventData
|
||||
|
||||
when(offsetStoreMock.read()).thenReturn("-1")
|
||||
when(eventhubsClientWrapperMock.receive()).thenReturn(eventDataCollection)
|
||||
|
||||
val receiver = new EventHubsReceiver(updatedEventhubsParams, eventhubPartitionId,
|
||||
StorageLevel.MEMORY_ONLY, Option(offsetStoreMock), eventhubsClientWrapperMock,
|
||||
maximumEventRate)
|
||||
|
||||
receiver.attachSupervisor(executorMock)
|
||||
|
||||
receiver.onStart()
|
||||
Thread sleep eventCheckpointIntervalInSeconds * 1000
|
||||
receiver.onStop()
|
||||
|
||||
Thread sleep eventCheckpointIntervalInSeconds * 1000
|
||||
|
||||
verify(offsetStoreMock, times(1)).open()
|
||||
verify(offsetStoreMock, times(1)).write(eventOffset)
|
||||
|
||||
verify(eventhubsClientWrapperMock, times(1)).createReceiver(updatedEventhubsParams,
|
||||
eventhubPartitionId, offsetStoreMock, maximumEventRate)
|
||||
verify(eventhubsClientWrapperMock, atLeastOnce).receive()
|
||||
}
|
||||
|
||||
ignore("EventHubsReceiver can restart when exception is thrown") {
|
||||
val eventhubPartitionId = "0"
|
||||
val eventOffset = "2147483647"
|
||||
val eventSequenceNumber = 1L
|
||||
val maximumEventRate = 999
|
||||
|
||||
val eventData = new EventData(Array.fill(8)((scala.util.Random.nextInt(256) - 128).toByte))
|
||||
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
|
||||
|
||||
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, eventOffset)
|
||||
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME,
|
||||
Long.box(eventSequenceNumber))
|
||||
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, eventhubPartitionId)
|
||||
|
||||
val systemProperties = new SystemProperties(systemPropertiesMap)
|
||||
|
||||
Whitebox.setInternalState(eventData, "systemProperties", systemProperties)
|
||||
val eventDataCollection: ArrayBuffer[EventData] = new ArrayBuffer[EventData]()
|
||||
eventDataCollection += eventData
|
||||
|
||||
val eventhubException = new RuntimeException("error")
|
||||
|
||||
when(offsetStoreMock.read()).thenReturn("-1")
|
||||
when(eventhubsClientWrapperMock.receive()).thenReturn(eventDataCollection).
|
||||
thenThrow(eventhubException)
|
||||
|
||||
val receiver = new EventHubsReceiver(eventhubParameters, eventhubPartitionId,
|
||||
StorageLevel.MEMORY_ONLY, Option(offsetStoreMock), eventhubsClientWrapperMock,
|
||||
maximumEventRate)
|
||||
|
||||
receiver.attachSupervisor(executorMock)
|
||||
|
||||
receiver.onStart()
|
||||
Thread sleep 1000
|
||||
receiver.onStop()
|
||||
|
||||
verify(executorMock, times(1)).restartReceiver(s"Error handling message," +
|
||||
s" restarting receiver for partition $eventhubPartitionId", Some(eventhubException))
|
||||
|
||||
verify(offsetStoreMock, times(1)).open()
|
||||
verify(offsetStoreMock, times(1)).close()
|
||||
verify(eventhubsClientWrapperMock, times(1)).createReceiver(eventhubParameters, "0",
|
||||
offsetStoreMock, maximumEventRate)
|
||||
verify(eventhubsClientWrapperMock, times(2)).receive()
|
||||
verify(eventhubsClientWrapperMock, times(1)).close()
|
||||
}
|
||||
}
|
|
@@ -20,18 +20,20 @@ package org.apache.spark.streaming.eventhubs
|
|||
import java.nio.file.Files
|
||||
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
|
||||
import org.apache.hadoop.fs.{ FileSystem, Path }
|
||||
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
|
||||
import org.apache.spark.eventhubscommon.utils.FragileEventHubClient
|
||||
import org.apache.spark.streaming._
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.{DirectDStreamProgressTracker, ProgressTrackingListener}
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.{
|
||||
DirectDStreamProgressTracker,
|
||||
ProgressTrackingListener
|
||||
}
|
||||
import org.apache.spark.util.ManualClock
|
||||
|
||||
class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTestSuiteBase
|
||||
with SharedUtils {
|
||||
class ProgressTrackingAndCheckpointSuite
|
||||
extends CheckpointAndProgressTrackerTestSuiteBase
|
||||
with SharedUtils {
|
||||
|
||||
override def init(): Unit = {
|
||||
progressRootPath = new Path(Files.createTempDirectory("progress_root").toString)
|
||||
|
@@ -39,59 +41,67 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
ssc = createContextForCheckpointOperation(batchDuration, checkpointDirectory)
|
||||
progressListener = ProgressTrackingListener.initInstance(ssc, progressRootPath.toString)
|
||||
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString,
|
||||
appName, new Configuration())
|
||||
appName,
|
||||
new Configuration())
|
||||
}
|
||||
|
||||
override def batchDuration: Duration = Seconds(1)
|
||||
|
||||
test("currentOffset, ProgressTracker and EventHubClient are setup correctly when" +
|
||||
" EventHubDirectDStream is recovered") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6),
|
||||
Seq(4, 5, 6, 7, 8, 9),
|
||||
Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
test(
|
||||
"currentOffset, ProgressTracker and EventHubClient are setup correctly when" +
|
||||
" EventHubDirectDStream is recovered") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6), Seq(4, 5, 6, 7, 8, 9), Seq(7, 8, 9, 1, 2, 3))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
runStopAndRecover(
|
||||
input,
|
||||
eventhubsParams = Map[String, Map[String, String]](
|
||||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
expectedStartingOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart)
|
||||
val eventHubDirectDStream = ssc.graph.getInputStreams().filter(
|
||||
_.isInstanceOf[EventHubDirectDStream]).head.asInstanceOf[EventHubDirectDStream]
|
||||
assert(eventHubDirectDStream.currentOffsetsAndSeqNums ===
|
||||
OffsetRecord(2000L, Map(
|
||||
EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
|
||||
expectedOutputBeforeRestart
|
||||
)
|
||||
val eventHubDirectDStream = ssc.graph
|
||||
.getInputStreams()
|
||||
.filter(_.isInstanceOf[EventHubDirectDStream])
|
||||
.head
|
||||
.asInstanceOf[EventHubDirectDStream]
|
||||
assert(
|
||||
eventHubDirectDStream.currentOffsetsAndSeqNums ===
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
|
||||
assert(DirectDStreamProgressTracker.getInstance != null)
|
||||
assert(eventHubDirectDStream.eventHubClient != null)
|
||||
}
|
||||
|
||||
test("test integration of spark checkpoint and progress tracking (single stream)") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart =
|
||||
Seq(Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
|
||||
|
||||
testCheckpointedOperation(
|
||||
input,
|
||||
|
@@ -99,41 +109,61 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart,
|
||||
expectedOutputAfterRestart)
|
||||
expectedOutputAfterRestart
|
||||
)
|
||||
}
|
||||
|
||||
test("test integration of spark checkpoint and progress tracking (reduceByKeyAndWindow)") {
|
||||
val input = Seq(
|
||||
Seq("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
|
||||
Seq("4", "5", "6", "7", "8", "9", "10", "1", "2", "3"),
|
||||
Seq("7", "8", "9", "1", "2", "3", "4", "5", "6", "7"))
|
||||
val input = Seq(Seq("1", "2", "3", "4", "5", "6", "7", "8", "9", "10"),
|
||||
Seq("4", "5", "6", "7", "8", "9", "10", "1", "2", "3"),
|
||||
Seq("7", "8", "9", "1", "2", "3", "4", "5", "6", "7"))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1),
|
||||
Seq("1" -> 2, "2" -> 1, "4" -> 2, "5" -> 1, "7" -> 2, "8" -> 1, "3" -> 1, "6" -> 1,
|
||||
"9" -> 1),
|
||||
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2,
|
||||
"9" -> 2))
|
||||
Seq("1" -> 2, "2" -> 1, "4" -> 2, "5" -> 1, "7" -> 2, "8" -> 1, "3" -> 1, "6" -> 1, "9" -> 1),
|
||||
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2, "9" -> 2)
|
||||
)
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2,
|
||||
"9" -> 2),
|
||||
Seq("5" -> 2, "6" -> 1, "9" -> 1, "2" -> 1, "3" -> 1, "7" -> 1, "8" -> 2,
|
||||
"10" -> 1, "1" -> 1, "4" -> 1),
|
||||
Seq("7" -> 2, "8" -> 1, "10" -> 2, "1" -> 1, "4" -> 1, "5" -> 1, "9" -> 1,
|
||||
"2" -> 1, "3" -> 1, "6" -> 1))
|
||||
Seq("1" -> 1, "2" -> 1, "4" -> 1, "5" -> 1, "7" -> 1, "8" -> 1, "3" -> 2, "6" -> 2, "9" -> 2),
|
||||
Seq("5" -> 2,
|
||||
"6" -> 1,
|
||||
"9" -> 1,
|
||||
"2" -> 1,
|
||||
"3" -> 1,
|
||||
"7" -> 1,
|
||||
"8" -> 2,
|
||||
"10" -> 1,
|
||||
"1" -> 1,
|
||||
"4" -> 1),
|
||||
Seq("7" -> 2,
|
||||
"8" -> 1,
|
||||
"10" -> 2,
|
||||
"1" -> 1,
|
||||
"4" -> 1,
|
||||
"5" -> 1,
|
||||
"9" -> 1,
|
||||
"2" -> 1,
|
||||
"3" -> 1,
|
||||
"6" -> 1)
|
||||
)
|
||||
|
||||
testCheckpointedOperation(
|
||||
input,
|
||||
|
@@ -141,39 +171,46 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.flatMap(eventData => eventData.getProperties.asScala.
|
||||
map{case (_, value) => (value, 1)}).
|
||||
reduceByKeyAndWindow(_ + _, _ - _, Seconds(2), Seconds(1)),
|
||||
inputDStream
|
||||
.flatMap(eventData =>
|
||||
eventData.getProperties.asScala.map { case (_, value) => (value, 1) })
|
||||
.reduceByKeyAndWindow(_ + _, _ - _, Seconds(2), Seconds(1)),
|
||||
expectedOutputBeforeRestart,
|
||||
expectedOutputAfterRestart,
|
||||
useSetFlag = true)
|
||||
useSetFlag = true
|
||||
)
|
||||
}
|
||||
|
||||
test("test integration of spark checkpoint and progress tracking (single stream +" +
|
||||
" windowing function)") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
|
||||
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4),
|
||||
Seq(6, 7, 9, 10, 3, 4, 8, 9, 11, 2, 5, 6),
|
||||
Seq(8, 9, 11, 2, 5, 6, 10, 11, 3, 4, 7, 8), Seq(10, 11, 3, 4, 7, 8))
|
||||
test(
|
||||
"test integration of spark checkpoint and progress tracking (single stream +" +
|
||||
" windowing function)") {
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(Seq(2, 3, 5, 6, 8, 9),
|
||||
Seq(2, 3, 5, 6, 8, 9, 4, 5, 7, 8, 10, 2),
|
||||
Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(Seq(4, 5, 7, 8, 10, 2, 6, 7, 9, 10, 3, 4),
|
||||
Seq(6, 7, 9, 10, 3, 4, 8, 9, 11, 2, 5, 6),
|
||||
Seq(8, 9, 11, 2, 5, 6, 10, 11, 3, 4, 7, 8),
|
||||
Seq(10, 11, 3, 4, 7, 8))
|
||||
|
||||
testCheckpointedOperation(
|
||||
input,
|
||||
|
@@ -181,43 +218,81 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.window(Seconds(2), Seconds(1)).map(
|
||||
eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
inputDStream
|
||||
.window(Seconds(2), Seconds(1))
|
||||
.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart,
|
||||
expectedOutputAfterRestart)
|
||||
expectedOutputAfterRestart
|
||||
)
|
||||
}
|
||||
|
||||
test("test integration of spark checkpoint and progress tracking (multi-streams join)") {
|
||||
val input1 = Seq(
|
||||
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6, "g" -> 4, "h" -> 5, "i" -> 6),
|
||||
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9, "m" -> 7, "n" -> 8, "o" -> 9),
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3))
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3)
|
||||
)
|
||||
val input2 = Seq(
|
||||
Seq("a" -> 1, "b" -> 2, "c" -> 3, "d" -> 4, "e" -> 5, "f" -> 6, "g" -> 4, "h" -> 5, "i" -> 6),
|
||||
Seq("g" -> 4, "h" -> 5, "i" -> 6, "j" -> 7, "k" -> 8, "l" -> 9, "m" -> 7, "n" -> 8, "o" -> 9),
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq("a" -> 2, "b" -> 4, "c" -> 6, "g" -> 8, "h" -> 10, "i" -> 12, "m" -> 14, "n" -> 16,
|
||||
"o" -> 18),
|
||||
Seq("d" -> 8, "e" -> 10, "f" -> 12, "j" -> 14, "k" -> 16, "l" -> 18, "p" -> 2, "q" -> 4,
|
||||
"r" -> 6))
|
||||
Seq("m" -> 7, "n" -> 8, "o" -> 9, "p" -> 1, "q" -> 2, "r" -> 3, "a" -> 1, "b" -> 2, "c" -> 3)
|
||||
)
|
||||
val expectedOutputBeforeRestart = Seq(Seq("a" -> 2,
|
||||
"b" -> 4,
|
||||
"c" -> 6,
|
||||
"g" -> 8,
|
||||
"h" -> 10,
|
||||
"i" -> 12,
|
||||
"m" -> 14,
|
||||
"n" -> 16,
|
||||
"o" -> 18),
|
||||
Seq("d" -> 8,
|
||||
"e" -> 10,
|
||||
"f" -> 12,
|
||||
"j" -> 14,
|
||||
"k" -> 16,
|
||||
"l" -> 18,
|
||||
"p" -> 2,
|
||||
"q" -> 4,
|
||||
"r" -> 6))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq("d" -> 8, "e" -> 10, "f" -> 12, "j" -> 14, "k" -> 16, "l" -> 18, "p" -> 2, "q" -> 4,
|
||||
"r" -> 6),
|
||||
Seq("g" -> 8, "h" -> 10, "i" -> 12, "m" -> 14, "n" -> 16, "o" -> 18,
|
||||
"a" -> 2, "b" -> 4, "c" -> 6), Seq())
|
||||
Seq("d" -> 8,
|
||||
"e" -> 10,
|
||||
"f" -> 12,
|
||||
"j" -> 14,
|
||||
"k" -> 16,
|
||||
"l" -> 18,
|
||||
"p" -> 2,
|
||||
"q" -> 4,
|
||||
"r" -> 6),
|
||||
Seq("g" -> 8,
|
||||
"h" -> 10,
|
||||
"i" -> 12,
|
||||
"m" -> 14,
|
||||
"n" -> 16,
|
||||
"o" -> 18,
|
||||
"a" -> 2,
|
||||
"b" -> 4,
|
||||
"c" -> 6),
|
||||
Seq()
|
||||
)
|
||||
|
||||
testCheckpointedOperation(
|
||||
input1,
|
||||
|
@@ -226,41 +301,52 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "3",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
eventhubsParams2 = Map[String, Map[String, String]](
|
||||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "3",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs1 = Map("namespace1" ->
|
||||
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (2L, 2L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs2 = Map("namespace2" ->
|
||||
OffsetRecord(1000L, Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (2L, 2L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs1 = Map(
|
||||
"namespace1" ->
|
||||
OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (2L, 2L)))),
|
||||
expectedStartingOffsetsAndSeqs2 = Map(
|
||||
"namespace2" ->
|
||||
OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (2L, 2L)))),
|
||||
operation = (inputDStream1: EventHubDirectDStream, inputDStream2: EventHubDirectDStream) =>
|
||||
inputDStream1.flatMap(eventData => eventData.getProperties.asScala).
|
||||
join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala)).
|
||||
map{case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int])},
|
||||
inputDStream1
|
||||
.flatMap(eventData => eventData.getProperties.asScala)
|
||||
.join(inputDStream2.flatMap(eventData => eventData.getProperties.asScala))
|
||||
.map { case (key, (v1, v2)) => (key, v1.asInstanceOf[Int] + v2.asInstanceOf[Int]) },
|
||||
expectedOutputBeforeRestart,
|
||||
expectedOutputAfterRestart)
|
||||
expectedOutputAfterRestart
|
||||
)
|
||||
}
|
||||
|
||||
test("recover from a progress directory where has no metadata record") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart =
|
||||
Seq(Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq())
|
||||
|
||||
testCheckpointedOperation(
|
||||
input,
|
||||
|
@@ -268,34 +354,37 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart,
|
||||
expectedOutputAfterRestart,
|
||||
directoryToClean = Some(progressTracker.metadataDirectoryPath))
|
||||
directoryToClean = Some(progressTracker.metadataDirectoryPath)
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
test("recover from progress after updating code (no checkpoint provided)") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
|
||||
|
||||
testUnaryOperation(
|
||||
input,
|
||||
|
@@ -303,24 +392,31 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart)
|
||||
expectedOutputBeforeRestart
|
||||
)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L
|
||||
)
|
||||
|
||||
ssc.stop()
|
||||
reset()
|
||||
|
@@ -334,36 +430,43 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(6000, Map(EventHubNameAndPartition("eh1", 0) -> (7L, 7L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (7L, 7L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (7L, 7L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(6000,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (7L, 7L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (7L, 7L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (7L, 7L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputAfterRestart)
|
||||
expectedOutputAfterRestart
|
||||
)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(7000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
8000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(7000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
8000L
|
||||
)
|
||||
}
|
||||
|
||||
test("recover correctly when checkpoint writing is delayed") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6),
|
||||
Seq(10, 11, 3, 4, 7, 8))
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(Seq(4, 5, 7, 8, 10, 2),
|
||||
Seq(6, 7, 9, 10, 3, 4),
|
||||
Seq(8, 9, 11, 2, 5, 6),
|
||||
Seq(10, 11, 3, 4, 7, 8))
|
||||
|
||||
testUnaryOperation(
|
||||
input,
|
||||
|
@@ -371,24 +474,31 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart)
|
||||
expectedOutputBeforeRestart
|
||||
)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L
|
||||
)
|
||||
|
||||
val currentCheckpointDirectory = ssc.checkpointDir
|
||||
|
||||
|
@@ -400,38 +510,43 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
ssc.stop()
|
||||
reset()
|
||||
|
||||
ssc = StreamingContext.getOrCreate(currentCheckpointDirectory,
|
||||
ssc = StreamingContext.getOrCreate(
|
||||
currentCheckpointDirectory,
|
||||
() => createContextForCheckpointOperation(batchDuration, checkpointDirectory))
|
||||
|
||||
ssc.graph.getInputStreams().filter(_.isInstanceOf[EventHubDirectDStream]).map(
|
||||
_.asInstanceOf[EventHubDirectDStream]).head.currentOffsetsAndSeqNums =
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (1L, 1L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (1L, 1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (1L, 1L)))
|
||||
|
||||
ssc.graph
|
||||
.getInputStreams()
|
||||
.filter(_.isInstanceOf[EventHubDirectDStream])
|
||||
.map(_.asInstanceOf[EventHubDirectDStream])
|
||||
.head
|
||||
.currentOffsetsAndSeqNums = OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (1L, 1L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (1L, 1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (1L, 1L)))
|
||||
|
||||
runStreamsWithEventHubInput(ssc,
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart, useSet = true)
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart,
|
||||
useSet = true)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(5000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
6000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(5000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
6000L
|
||||
)
|
||||
}
|
||||
|
||||
test("continue processing when the application crash before the last commit finished") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val expectedOutputAfterRestart =
|
||||
Seq(Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8))
|
||||
|
||||
testUnaryOperation(
|
||||
input,
|
||||
|
@@ -439,24 +554,31 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart)
|
||||
expectedOutputBeforeRestart
|
||||
)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(3000L, Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(3000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (5L, 5L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (5L, 5L))),
|
||||
4000L
|
||||
)
|
||||
|
||||
val currentCheckpointDirectory = ssc.checkpointDir
|
||||
|
||||
|
@@ -468,47 +590,58 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
fs.delete(new Path(progressRootPath.toString + s"/$appName/progress-3000"), true)
|
||||
fs.delete(new Path(progressRootPath.toString + s"/${appName}_metadata/3000"), true)
|
||||
|
||||
ssc = StreamingContext.getOrCreate(currentCheckpointDirectory,
|
||||
ssc = StreamingContext.getOrCreate(
|
||||
currentCheckpointDirectory,
|
||||
() => createContextForCheckpointOperation(batchDuration, checkpointDirectory))
|
||||
|
||||
assert(ssc.graph.getInputStreams().filter(_.isInstanceOf[EventHubDirectDStream]).map(
|
||||
_.asInstanceOf[EventHubDirectDStream]).head.currentOffsetsAndSeqNums ===
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
|
||||
assert(
|
||||
ssc.graph
|
||||
.getInputStreams()
|
||||
.filter(_.isInstanceOf[EventHubDirectDStream])
|
||||
.map(_.asInstanceOf[EventHubDirectDStream])
|
||||
.head
|
||||
.currentOffsetsAndSeqNums ===
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))))
|
||||
|
||||
runStreamsWithEventHubInput(ssc,
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart, useSet = true)
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart,
|
||||
useSet = true)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(5000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
6000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(5000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
6000L
|
||||
)
|
||||
}
|
||||
|
||||
test("progress files are clean up correctly with a fragile rest endpoint") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart =
|
||||
Seq(Seq(2, 3, 5, 6, 8, 9), Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4))
|
||||
// the order of the output should look as if there is no issue, because we reuse the fetched
|
||||
// highest offset
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6), Seq(10, 11, 3, 4, 7, 8), Seq(), Seq(), Seq())
|
||||
val expectedOutputAfterRestart = Seq(Seq(6, 7, 9, 10, 3, 4),
|
||||
Seq(8, 9, 11, 2, 5, 6),
|
||||
Seq(10, 11, 3, 4, 7, 8),
|
||||
Seq(),
|
||||
Seq(),
|
||||
Seq())
|
||||
|
||||
// ugly stuff to make things serializable
|
||||
FragileEventHubClient.numBatchesBeforeCrashedEndpoint = 3
|
||||
FragileEventHubClient.lastBatchWhenEndpointCrashed = 6
|
||||
FragileEventHubClient.latestRecords = Map(
|
||||
EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))
|
||||
FragileEventHubClient.latestRecords = Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))
|
||||
|
||||
testFragileStream(
|
||||
input,
|
||||
|
@@ -516,16 +649,22 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eh1" -> Map(
|
||||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1")
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(2000L, Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))
|
||||
)),
|
||||
expectedOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(2000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L)))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutput = expectedOutputBeforeRestart)
|
||||
expectedOutput = expectedOutputBeforeRestart
|
||||
)
|
||||
|
||||
val currentCheckpointDirectory = ssc.checkpointDir
|
||||
ssc.stop()
|
||||
|
@@ -534,28 +673,30 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
ssc = new StreamingContext(currentCheckpointDirectory)
|
||||
|
||||
runStreamsWithEventHubInput(ssc,
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart, useSet = true)
|
||||
expectedOutputAfterRestart.length - 1,
|
||||
expectedOutputAfterRestart,
|
||||
useSet = true)
|
||||
|
||||
testProgressTracker(
|
||||
eventhubNamespace,
|
||||
expectedOffsetsAndSeqs =
|
||||
OffsetRecord(8000L, Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
9000L)
|
||||
expectedOffsetsAndSeqs = OffsetRecord(8000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (9L, 9L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (9L, 9L))),
|
||||
9000L
|
||||
)
|
||||
}
|
||||
|
||||
test("offset type is saved and recovered correctly from checkpoint") {
|
||||
val input = Seq(
|
||||
Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(
|
||||
Seq(4, 5, 7, 8, 10, 2))
|
||||
val expectedOutputAfterRestart = Seq(
|
||||
Seq(4, 5, 7, 8, 10, 2), Seq(6, 7, 9, 10, 3, 4), Seq(8, 9, 11, 2, 5, 6),
|
||||
Seq(10, 11, 3, 4, 7, 8), Seq())
|
||||
val input = Seq(Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
|
||||
Seq(4, 5, 6, 7, 8, 9, 10, 1, 2, 3),
|
||||
Seq(7, 8, 9, 1, 2, 3, 4, 5, 6, 7))
|
||||
val expectedOutputBeforeRestart = Seq(Seq(4, 5, 7, 8, 10, 2))
|
||||
val expectedOutputAfterRestart = Seq(Seq(4, 5, 7, 8, 10, 2),
|
||||
Seq(6, 7, 9, 10, 3, 4),
|
||||
Seq(8, 9, 11, 2, 5, 6),
|
||||
Seq(10, 11, 3, 4, 7, 8),
|
||||
Seq())
|
||||
|
||||
testCheckpointedOperation(
|
||||
input,
|
||||
|
@@ -564,20 +705,26 @@ class ProgressTrackingAndCheckpointSuite extends CheckpointAndProgressTrackerTes
|
|||
"eventhubs.partition.count" -> "3",
|
||||
"eventhubs.maxRate" -> "2",
|
||||
"eventhubs.name" -> "eh1",
|
||||
"eventhubs.filter.enqueuetime" -> "2000")
|
||||
"eventhubs.filter.enqueuetime" -> "2000",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey"
|
||||
)
|
||||
),
|
||||
expectedStartingOffsetsAndSeqs = Map(eventhubNamespace ->
|
||||
OffsetRecord(0L, Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L))
|
||||
)),
|
||||
expectedStartingOffsetsAndSeqs = Map(
|
||||
eventhubNamespace ->
|
||||
OffsetRecord(0L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (-1L, -1L)))),
|
||||
expectedOffsetsAndSeqs = OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))),
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (3L, 3L),
|
||||
EventHubNameAndPartition("eh1", 2) -> (3L, 3L))),
|
||||
operation = (inputDStream: EventHubDirectDStream) =>
|
||||
inputDStream.map(eventData => eventData.getProperties.get("output").asInstanceOf[Int] + 1),
|
||||
expectedOutputBeforeRestart,
|
||||
expectedOutputAfterRestart)
|
||||
expectedOutputAfterRestart
|
||||
)
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -1,223 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.spark.streaming.eventhubs
|
||||
|
||||
import java.io.File
|
||||
|
||||
import scala.collection.mutable.ArrayBuffer
|
||||
import scala.concurrent.duration._
|
||||
|
||||
import com.microsoft.azure.eventhubs.EventData
|
||||
import com.microsoft.azure.eventhubs.EventData.SystemProperties
|
||||
import com.microsoft.azure.eventhubs.amqp.AmqpConstants
|
||||
import org.mockito.internal.util.reflection.Whitebox
|
||||
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, FunSuite}
|
||||
import org.scalatest.concurrent.Eventually
|
||||
import org.scalatest.mock.MockitoSugar
|
||||
|
||||
import org.apache.spark.SparkConf
|
||||
import org.apache.spark.eventhubscommon.client.{EventHubsClientWrapper, EventHubsOffsetTypes}
|
||||
import org.apache.spark.eventhubscommon.client.EventHubsOffsetTypes.EventHubsOffsetType
|
||||
import org.apache.spark.storage.StorageLevel
|
||||
import org.apache.spark.streaming.{Milliseconds, StreamingContext}
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.OffsetStore
|
||||
import org.apache.spark.util.Utils
|
||||
|
||||
|
||||
/**
|
||||
* Test suite for ReliableEventHubsReceiver
|
||||
* This suite of tests use Spark local mode with EventHubs dummy receiver for e2e testing
|
||||
*/
|
||||
class ReliableEventHubsReceiverSuite extends FunSuite with BeforeAndAfter with BeforeAndAfterAll
|
||||
with MockitoSugar with Eventually {
|
||||
private var streamingContext: StreamingContext = _
|
||||
private var ehClientWrapperMock: EventHubsClientWrapper = _
|
||||
private var offsetStoreMock: OffsetStore = _
|
||||
private var tempDirectory: File = _
|
||||
|
||||
private val eventhubParameters = Map[String, String] (
|
||||
"eventhubs.policyname" -> "policyname",
|
||||
"eventhubs.policykey" -> "policykey",
|
||||
"eventhubs.namespace" -> "namespace",
|
||||
"eventhubs.name" -> "name",
|
||||
"eventhubs.partition.count" -> "4",
|
||||
"eventhubs.checkpoint.dir" -> "checkpointdir",
|
||||
"eventhubs.checkpoint.interval" -> "0"
|
||||
)
|
||||
|
||||
private val sparkConf = new SparkConf()
|
||||
.setMaster("local[3]") // At least 2, 1 for receiver and 1 for data transform
|
||||
.setAppName("ReliableEventHubsReceiverSuite")
|
||||
.set("spark.streaming.receiver.writeAheadLog.enable", "true")
|
||||
.set("spark.driver.allowMultipleContexts", "true")
|
||||
|
||||
override def beforeAll() : Unit = {
|
||||
|
||||
}
|
||||
|
||||
override def afterAll() : Unit = {
|
||||
}
|
||||
|
||||
before {
|
||||
tempDirectory = Utils.createTempDir()
|
||||
// tempDirectory.deleteOnExit()
|
||||
streamingContext = new StreamingContext(sparkConf, Milliseconds(500))
|
||||
streamingContext.checkpoint(tempDirectory.getAbsolutePath)
|
||||
|
||||
offsetStoreMock = new MyMockedOffsetStore
|
||||
}
|
||||
|
||||
after {
|
||||
if (streamingContext != null) {
|
||||
streamingContext.stop()
|
||||
streamingContext = null
|
||||
}
|
||||
if(tempDirectory != null) {
|
||||
// Utils.deleteRecursively(tempDirectory)
|
||||
tempDirectory.delete()
|
||||
tempDirectory = null
|
||||
}
|
||||
}
|
||||
|
||||
// Test ignored due to an issue with mocking library unavailable to the executors.
|
||||
|
||||
test("Reliable EventHubs input stream") {
|
||||
// after 100 messages then start to receive null
|
||||
ehClientWrapperMock = new MyMockedEventHubsClientWrapper(100, -1)
|
||||
val stream = EventHubsUtils.createStream(streamingContext, eventhubParameters, "0",
|
||||
StorageLevel.MEMORY_ONLY, offsetStoreMock, ehClientWrapperMock)
|
||||
var count = 0
|
||||
stream.map { v => v }.foreachRDD { r =>
|
||||
val ret = r.collect()
|
||||
ret.foreach { v =>
|
||||
count += 1
|
||||
}
|
||||
}
|
||||
streamingContext.start()
|
||||
eventually(timeout(4000.milliseconds), interval(200.milliseconds)) {
|
||||
// Make sure we have received 100 messages
|
||||
assert(count === 100)
|
||||
}
|
||||
}
|
||||
|
||||
test("Reliable EventHubs input stream recover from exception") {
|
||||
// After 60 messages then exception, after 100 messages then receive null
|
||||
ehClientWrapperMock = new MyMockedEventHubsClientWrapper(100, 60)
|
||||
val stream = EventHubsUtils.createStream(streamingContext, eventhubParameters, "0",
|
||||
StorageLevel.MEMORY_ONLY, offsetStoreMock, ehClientWrapperMock)
|
||||
var count = 0
|
||||
stream.map { v => v }.foreachRDD { r =>
|
||||
val ret = r.collect()
|
||||
ret.foreach { v =>
|
||||
count += 1
|
||||
}
|
||||
}
|
||||
streamingContext.start()
|
||||
|
||||
eventually(timeout(10000.milliseconds), interval(200.milliseconds)) {
|
||||
|
||||
// Make sure we have received 100 messages
|
||||
assert(count === 100)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The Mock class for EventHubsClientWrapper.
|
||||
* Note this class only support offset filter.
|
||||
*
|
||||
* @param emitCount the number of message emitted before it returns null
|
||||
* @param exceptionCount the number of message emitted before it throws exception
|
||||
* it only throws exception once
|
||||
*/
|
||||
class MyMockedEventHubsClientWrapper(
|
||||
emitCount: Int,
|
||||
exceptionCount: Int) extends EventHubsClientWrapper with MockitoSugar {
|
||||
var offset: Int = -1
|
||||
var count = 0
|
||||
var partition = "0"
|
||||
var myExceptionCount: Int = exceptionCount
|
||||
|
||||
override def createReceiverInternal(
|
||||
connectionString: String,
|
||||
eventhubsName: String,
|
||||
consumerGroup: String,
|
||||
partitionId: String,
|
||||
offsetType: EventHubsOffsetType,
|
||||
currentOffset: String,
|
||||
receiverEpoch: Long): Unit = {
|
||||
if (offsetType != EventHubsOffsetTypes.None) {
|
||||
offset = currentOffset.toInt
|
||||
partition = partitionId
|
||||
}
|
||||
}
|
||||
|
||||
override def closeReceiver(): Unit = {
|
||||
// no ops
|
||||
}
|
||||
|
||||
override def receive(): Iterable[EventData] = {
|
||||
|
||||
if (count == myExceptionCount) {
|
||||
// make sure we only throw exception once
|
||||
myExceptionCount = -1
|
||||
throw new RuntimeException("count = " + count)
|
||||
}
|
||||
offset += 1
|
||||
count += 1
|
||||
// do not send more than emitCount number of messages
|
||||
if(count <= emitCount) {
|
||||
|
||||
val eventData = new EventData(Array.fill(8)(
|
||||
(scala.util.Random.nextInt(256) - 128).toByte))
|
||||
val systemPropertiesMap = new java.util.HashMap[String, AnyRef]()
|
||||
systemPropertiesMap.put(AmqpConstants.OFFSET_ANNOTATION_NAME, offset.toString)
|
||||
systemPropertiesMap.put(AmqpConstants.SEQUENCE_NUMBER_ANNOTATION_NAME, Long.box(count))
|
||||
systemPropertiesMap.put(AmqpConstants.PARTITION_KEY_ANNOTATION_NAME, partition)
|
||||
val systemProperties = new SystemProperties(systemPropertiesMap)
|
||||
Whitebox.setInternalState(eventData, "systemProperties", systemProperties)
|
||||
val eventDataCollection: ArrayBuffer[EventData] = new ArrayBuffer[EventData]()
|
||||
eventDataCollection += eventData
|
||||
eventDataCollection
|
||||
} else {
|
||||
Thread sleep 1000
|
||||
null
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The Mock class for OffsetStore
|
||||
*/
|
||||
class MyMockedOffsetStore extends OffsetStore {
|
||||
var myOffset: String = "-1"
|
||||
override def open(): Unit = {
|
||||
}
|
||||
|
||||
override def write(offset: String): Unit = {
|
||||
println("writing offset to MyMockedOffsetStore:" + offset)
|
||||
myOffset = offset
|
||||
}
|
||||
|
||||
override def read(): String = {
|
||||
println("reading offset from MyMockedOffsetStore:" + myOffset)
|
||||
myOffset
|
||||
}
|
||||
|
||||
override def close(): Unit = {
|
||||
}
|
||||
}
|
|
@ -20,14 +20,17 @@ package org.apache.spark.streaming.eventhubs
|
|||
import java.nio.file.Files
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.scalatest.{BeforeAndAfterEach, FunSuite}
|
||||
import org.apache.hadoop.fs.{ FileSystem, Path }
|
||||
import org.scalatest.{ BeforeAndAfterEach, FunSuite }
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.{ SparkConf, SparkContext }
|
||||
import org.apache.spark.eventhubscommon.EventHubsConnector
|
||||
import org.apache.spark.eventhubscommon.progress.ProgressTrackerBase
|
||||
import org.apache.spark.streaming.{Duration, Seconds, StreamingContext}
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.{DirectDStreamProgressTracker, ProgressTrackingListener}
|
||||
import org.apache.spark.streaming.{ Duration, Seconds, StreamingContext }
|
||||
import org.apache.spark.streaming.eventhubs.checkpoint.{
|
||||
DirectDStreamProgressTracker,
|
||||
ProgressTrackingListener
|
||||
}
|
||||
|
||||
private[spark] trait SharedUtils extends FunSuite with BeforeAndAfterEach {
|
||||
|
||||
|
@ -59,13 +62,17 @@ private[spark] trait SharedUtils extends FunSuite with BeforeAndAfterEach {
|
|||
protected def init(): Unit = {
|
||||
progressRootPath = new Path(Files.createTempDirectory("progress_root").toString)
|
||||
fs = progressRootPath.getFileSystem(new Configuration())
|
||||
val sparkContext = new SparkContext(new SparkConf().setAppName(appName).
|
||||
setMaster("local[*]").set("spark.streaming.clock", streamingClock))
|
||||
val sparkContext = new SparkContext(
|
||||
new SparkConf()
|
||||
.setAppName(appName)
|
||||
.setMaster("local[*]")
|
||||
.set("spark.streaming.clock", streamingClock))
|
||||
sparkContext.setLogLevel("INFO")
|
||||
ssc = new StreamingContext(sparkContext, batchDuration)
|
||||
progressListener = ProgressTrackingListener.initInstance(ssc, progressRootPath.toString)
|
||||
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString, appName,
|
||||
new Configuration())
|
||||
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString,
|
||||
appName,
|
||||
new Configuration())
|
||||
}
|
||||
|
||||
protected def reset(): Unit = {
|
||||
|
|
|
@ -17,21 +17,24 @@
|
|||
|
||||
package org.apache.spark.streaming.eventhubs.checkpoint
|
||||
|
||||
import java.nio.file.{Files, Paths, StandardOpenOption}
|
||||
import java.nio.file.{ Files, Paths, StandardOpenOption }
|
||||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, EventHubsConnector, OffsetRecord}
|
||||
import org.apache.spark.eventhubscommon.progress.{PathTools, ProgressRecord, ProgressWriter}
|
||||
import org.apache.hadoop.fs.{ FileSystem, Path }
|
||||
import org.apache.spark.eventhubscommon.progress.{ PathTools, ProgressRecord, ProgressWriter }
|
||||
import org.apache.spark.eventhubscommon.{
|
||||
EventHubNameAndPartition,
|
||||
EventHubsConnector,
|
||||
OffsetRecord
|
||||
}
|
||||
import org.apache.spark.streaming.eventhubs.SharedUtils
|
||||
|
||||
class ProgressTrackerSuite extends SharedUtils {
|
||||
|
||||
class DummyEventHubsConnector(
|
||||
sId: Int,
|
||||
uniqueId: String,
|
||||
connedInstances: List[EventHubNameAndPartition]) extends EventHubsConnector {
|
||||
class DummyEventHubsConnector(sId: Int,
|
||||
uniqueId: String,
|
||||
connedInstances: List[EventHubNameAndPartition])
|
||||
extends EventHubsConnector {
|
||||
override def streamId: Int = sId
|
||||
|
||||
override def uid: String = uniqueId
|
||||
|
@ -44,16 +47,15 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
DirectDStreamProgressTracker.reset()
|
||||
}
|
||||
|
||||
private def writeProgressFile(
|
||||
progressPath: String,
|
||||
streamId: Int,
|
||||
fs: FileSystem,
|
||||
timestamp: Long,
|
||||
namespace: String,
|
||||
ehName: String,
|
||||
partitionRange: Range,
|
||||
offset: Int,
|
||||
seq: Int): Unit = {
|
||||
private def writeProgressFile(progressPath: String,
|
||||
streamId: Int,
|
||||
fs: FileSystem,
|
||||
timestamp: Long,
|
||||
namespace: String,
|
||||
ehName: String,
|
||||
partitionRange: Range,
|
||||
offset: Int,
|
||||
seq: Int): Unit = {
|
||||
for (partitionId <- partitionRange) {
|
||||
val filePath = Paths.get(progressPath + s"/${PathTools.makeProgressFileName(timestamp)}")
|
||||
val stdOpenOption = if (Files.exists(filePath)) {
|
||||
|
@ -62,9 +64,9 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
StandardOpenOption.CREATE
|
||||
}
|
||||
|
||||
Files.write(filePath,
|
||||
s"${ProgressRecord(timestamp, namespace, ehName, partitionId, offset, seq)
|
||||
.toString}\n".getBytes,
|
||||
Files.write(
|
||||
filePath,
|
||||
s"${ProgressRecord(timestamp, namespace, ehName, partitionId, offset, seq).toString}\n".getBytes,
|
||||
stdOpenOption)
|
||||
}
|
||||
}
|
||||
|
@ -72,16 +74,18 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
private def createMetadataFile(fs: FileSystem, metadataPath: String, timestamp: Long): Unit =
|
||||
fs.create(new Path(s"$metadataPath/${PathTools.makeMetadataFileName(timestamp)}"))
|
||||
|
||||
test("progress temp directory is created properly when progress and progress temp" +
|
||||
" directory do not exist") {
|
||||
test(
|
||||
"progress temp directory is created properly when progress and progress temp" +
|
||||
" directory do not exist") {
|
||||
progressTracker = DirectDStreamProgressTracker
|
||||
.initInstance(progressRootPath.toString, appName, new Configuration())
|
||||
assert(fs.exists(progressTracker.progressDirectoryPath))
|
||||
assert(fs.exists(progressTracker.tempDirectoryPath))
|
||||
}
|
||||
|
||||
test("progress temp directory is created properly when progress exists while progress" +
|
||||
" temp does not") {
|
||||
test(
|
||||
"progress temp directory is created properly when progress exists while progress" +
|
||||
" temp does not") {
|
||||
fs.mkdirs(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName))
|
||||
progressTracker = DirectDStreamProgressTracker
|
||||
.initInstance(progressRootPath.toString, appName, new Configuration())
|
||||
|
@ -103,10 +107,14 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
}
|
||||
|
||||
test("incomplete progress would be discarded") {
|
||||
createDirectStreams(ssc, "namespace1", progressRootPath.toString,
|
||||
createDirectStreams(
|
||||
ssc,
|
||||
"namespace1",
|
||||
progressRootPath.toString,
|
||||
Map("eh1" -> Map("eventhubs.partition.count" -> "1"),
|
||||
"eh2" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh3" -> Map("eventhubs.partition.count" -> "3")))
|
||||
"eh3" -> Map("eventhubs.partition.count" -> "3"))
|
||||
)
|
||||
|
||||
val progressPath = PathTools.makeProgressDirectoryStr(progressRootPath.toString, appName)
|
||||
fs.mkdirs(new Path(progressPath))
|
||||
|
@ -137,15 +145,19 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
assert(fs.exists(new Path(progressPath + "/progress-1000")))
|
||||
}
|
||||
|
||||
private def verifyProgressFile(
|
||||
namespace: String, ehName: String, partitionRange: Range,
|
||||
timestamp: Long, expectedOffsetAndSeq: Seq[(Long, Long)]): Unit = {
|
||||
val ehMap = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
|
||||
private def verifyProgressFile(namespace: String,
|
||||
ehName: String,
|
||||
partitionRange: Range,
|
||||
timestamp: Long,
|
||||
expectedOffsetAndSeq: Seq[(Long, Long)]): Unit = {
|
||||
val ehMap = progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.read(namespace, timestamp - 1000L, fallBack = false)
|
||||
var expectedOffsetAndSeqIdx = 0
|
||||
for (partitionId <- partitionRange) {
|
||||
assert(ehMap.offsets(EventHubNameAndPartition(ehName, partitionId)) ===
|
||||
expectedOffsetAndSeq(expectedOffsetAndSeqIdx))
|
||||
assert(
|
||||
ehMap.offsets(EventHubNameAndPartition(ehName, partitionId)) ===
|
||||
expectedOffsetAndSeq(expectedOffsetAndSeqIdx))
|
||||
expectedOffsetAndSeqIdx += 1
|
||||
}
|
||||
}
|
||||
|
@ -153,15 +165,23 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
test("start from the beginning of the streams when the latest progress file does not exist") {
|
||||
// generate 6 EventHubAndPartitions
|
||||
val dStream =
|
||||
createDirectStreams(ssc, "namespace1", progressRootPath.toString,
|
||||
createDirectStreams(
|
||||
ssc,
|
||||
"namespace1",
|
||||
progressRootPath.toString,
|
||||
Map("eh1" -> Map("eventhubs.partition.count" -> "1"),
|
||||
"eh2" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh3" -> Map("eventhubs.partition.count" -> "3")))
|
||||
"eh2" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh3" -> Map("eventhubs.partition.count" -> "3"))
|
||||
)
|
||||
val dStream1 =
|
||||
createDirectStreams(ssc, "namespace2", progressRootPath.toString,
|
||||
createDirectStreams(
|
||||
ssc,
|
||||
"namespace2",
|
||||
progressRootPath.toString,
|
||||
Map("eh11" -> Map("eventhubs.partition.count" -> "1"),
|
||||
"eh12" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh13" -> Map("eventhubs.partition.count" -> "3")))
|
||||
"eh12" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh13" -> Map("eventhubs.partition.count" -> "3"))
|
||||
)
|
||||
dStream.start()
|
||||
dStream1.start()
|
||||
|
||||
|
@ -214,16 +234,16 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh11", 0 to 0, 1, 2)
|
||||
|
||||
// write wrong record
|
||||
Files.write(
|
||||
Paths.get(progressPath + s"/progress-1000"),
|
||||
(ProgressRecord(2000L, "namespace2", "eh12", 0, 2, 3).toString + "\n").getBytes,
|
||||
StandardOpenOption.APPEND)
|
||||
Files.write(Paths.get(progressPath + s"/progress-1000"),
|
||||
(ProgressRecord(2000L, "namespace2", "eh12", 0, 2, 3).toString + "\n").getBytes,
|
||||
StandardOpenOption.APPEND)
|
||||
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh12", 1 to 1, 2, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh13", 0 to 2, 3, 4)
|
||||
|
||||
intercept[IllegalArgumentException] {
|
||||
progressTracker.asInstanceOf[DirectDStreamProgressTracker]
|
||||
progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.read("namespace2", 1000L, fallBack = false)
|
||||
}
|
||||
}
|
||||
|
@ -238,19 +258,40 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
val connector1 = new DummyEventHubsConnector(0, "namespace1", connectedInstances)
|
||||
val connector2 = new DummyEventHubsConnector(0, "namespace2", connectedInstances)
|
||||
|
||||
var progressWriter = new ProgressWriter(0, "namespace1", eh1Partition0,
|
||||
1000L, new Configuration(), progressRootPath.toString, appName)
|
||||
var progressWriter = new ProgressWriter(0,
|
||||
"namespace1",
|
||||
eh1Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 0, 1)
|
||||
progressWriter = new ProgressWriter(0, "namespace1", eh2Partition0, 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace1",
|
||||
eh2Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 0, 1)
|
||||
progressWriter = new ProgressWriter(0, "namespace2", eh1Partition0, 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace2",
|
||||
eh1Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 10, 20)
|
||||
progressWriter = new ProgressWriter(0, "namespace2", eh2Partition0, 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace2",
|
||||
eh2Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 20, 30)
|
||||
val s = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
|
||||
val s = progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.collectProgressRecordsForBatch(1000L, List(connector1, connector2))
|
||||
|
||||
assert(s.contains("namespace1"))
|
||||
|
@ -271,118 +312,147 @@ class ProgressTrackerSuite extends SharedUtils {
|
|||
val connector1 = new DummyEventHubsConnector(0, "namespace1", connectedInstances)
|
||||
val connector2 = new DummyEventHubsConnector(0, "namespace2", connectedInstances)
|
||||
|
||||
var progressWriter = new ProgressWriter(0, "namespace1", eh1Partition0,
|
||||
1000L, new Configuration(), progressRootPath.toString, appName)
|
||||
var progressWriter = new ProgressWriter(0,
|
||||
"namespace1",
|
||||
eh1Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 0, 1)
|
||||
progressWriter = new ProgressWriter(0, "namespace1", eh2Partition0, 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace1",
|
||||
eh2Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 0, 1)
|
||||
progressWriter = new ProgressWriter(0, "namespace2", eh1Partition0, 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace2",
|
||||
eh1Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(2000L, 10, 20)
|
||||
progressWriter = new ProgressWriter(0, "namespace2", eh2Partition0, 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace2",
|
||||
eh2Partition0,
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 20, 30)
|
||||
|
||||
intercept[IllegalStateException] {
|
||||
progressTracker.asInstanceOf[DirectDStreamProgressTracker].
|
||||
collectProgressRecordsForBatch(1000L, List(connector1, connector2))
|
||||
progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.collectProgressRecordsForBatch(1000L, List(connector1, connector2))
|
||||
}
|
||||
}
|
||||
|
||||
test("latest offsets can be committed correctly and temp directory is not cleaned") {
|
||||
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString, appName,
|
||||
new Configuration())
|
||||
progressTracker = DirectDStreamProgressTracker.initInstance(progressRootPath.toString,
|
||||
appName,
|
||||
new Configuration())
|
||||
|
||||
var progressWriter = new ProgressWriter(0, "namespace1", EventHubNameAndPartition("eh1", 0),
|
||||
1000L, new Configuration(), progressRootPath.toString, appName)
|
||||
var progressWriter = new ProgressWriter(0,
|
||||
"namespace1",
|
||||
EventHubNameAndPartition("eh1", 0),
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 0, 0)
|
||||
progressWriter = new ProgressWriter(0, "namespace1", EventHubNameAndPartition("eh2", 0), 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace1",
|
||||
EventHubNameAndPartition("eh2", 0),
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 1, 1)
|
||||
progressWriter = new ProgressWriter(0, "namespace2", EventHubNameAndPartition("eh1", 0), 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace2",
|
||||
EventHubNameAndPartition("eh1", 0),
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 2, 2)
|
||||
progressWriter = new ProgressWriter(0, "namespace2", EventHubNameAndPartition("eh2", 0), 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
progressWriter = new ProgressWriter(0,
|
||||
"namespace2",
|
||||
EventHubNameAndPartition("eh2", 0),
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 3, 3)
|
||||
|
||||
val offsetToCommit = Map(
|
||||
"namespace1" -> Map(
|
||||
EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
|
||||
EventHubNameAndPartition("eh2", 1) -> (1L, 1L)),
|
||||
"namespace2" -> Map(
|
||||
EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh2", 4) -> (3L, 3L)))
|
||||
"namespace1" -> Map(EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
|
||||
EventHubNameAndPartition("eh2", 1) -> (1L, 1L)),
|
||||
"namespace2" -> Map(EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh2", 4) -> (3L, 3L))
|
||||
)
|
||||
progressTracker.asInstanceOf[DirectDStreamProgressTracker].commit(offsetToCommit, 1000L)
|
||||
val namespace1Offsets = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
|
||||
val namespace1Offsets = progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.read("namespace1", 1000L, fallBack = false)
|
||||
assert(namespace1Offsets === OffsetRecord(1000L, Map(
|
||||
EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
|
||||
EventHubNameAndPartition("eh2", 1) -> (1L, 1L))))
|
||||
val namespace2Offsets = progressTracker.asInstanceOf[DirectDStreamProgressTracker]
|
||||
assert(
|
||||
namespace1Offsets === OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (0L, 0L),
|
||||
EventHubNameAndPartition("eh2", 1) -> (1L, 1L))))
|
||||
val namespace2Offsets = progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.read("namespace2", 1000L, fallBack = false)
|
||||
assert(namespace2Offsets === OffsetRecord(1000L, Map(
|
||||
EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh2", 4) -> (3L, 3L))))
|
||||
assert(
|
||||
namespace2Offsets === OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 3) -> (2L, 2L),
|
||||
EventHubNameAndPartition("eh2", 4) -> (3L, 3L))))
|
||||
|
||||
// test temp directory cleanup
|
||||
assert(fs.exists(PathTools.makeTempDirectoryPath(
|
||||
progressRootPath.toString, appName)))
|
||||
assert(fs.listStatus(PathTools.makeTempDirectoryPath(
|
||||
progressRootPath.toString, appName)).length === 4)
|
||||
assert(fs.exists(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName)))
|
||||
assert(
|
||||
fs.listStatus(PathTools.makeTempDirectoryPath(progressRootPath.toString, appName))
|
||||
.length === 4)
|
||||
}
|
||||
|
||||
test("locate ProgressFile correctly") {
|
||||
progressTracker = DirectDStreamProgressTracker
|
||||
.initInstance(progressRootPath.toString, appName, new Configuration())
|
||||
assert(progressTracker.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.pinPointProgressFile(fs, 1000L) === None)
|
||||
assert(
|
||||
progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.pinPointProgressFile(fs, 1000L) === None)
|
||||
|
||||
val progressPath = PathTools.makeProgressDirectoryStr(progressRootPath.toString, appName)
|
||||
fs.mkdirs(new Path(progressPath))
|
||||
|
||||
// 1000
|
||||
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh1",
|
||||
0 to 0, 0, 1)
|
||||
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh2",
|
||||
0 to 1, 0, 2)
|
||||
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh3",
|
||||
0 to 2, 0, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh11",
|
||||
0 to 0, 1, 2)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh12",
|
||||
0 to 1, 2, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh13",
|
||||
0 to 2, 3, 4)
|
||||
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh1", 0 to 0, 0, 1)
|
||||
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh2", 0 to 1, 0, 2)
|
||||
writeProgressFile(progressPath, 0, fs, 1000L, "namespace1", "eh3", 0 to 2, 0, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh11", 0 to 0, 1, 2)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh12", 0 to 1, 2, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 1000L, "namespace2", "eh13", 0 to 2, 3, 4)
|
||||
|
||||
// 2000
|
||||
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh1",
|
||||
0 to 0, 1, 2)
|
||||
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh2",
|
||||
0 to 1, 1, 3)
|
||||
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh3",
|
||||
0 to 2, 1, 4)
|
||||
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh11",
|
||||
0 to 0, 2, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh12",
|
||||
0 to 1, 3, 4)
|
||||
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh13",
|
||||
0 to 2, 4, 5)
|
||||
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh1", 0 to 0, 1, 2)
|
||||
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh2", 0 to 1, 1, 3)
|
||||
writeProgressFile(progressPath, 0, fs, 2000L, "namespace1", "eh3", 0 to 2, 1, 4)
|
||||
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh11", 0 to 0, 2, 3)
|
||||
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh12", 0 to 1, 3, 4)
|
||||
writeProgressFile(progressPath, 1, fs, 2000L, "namespace2", "eh13", 0 to 2, 4, 5)
|
||||
|
||||
// 3000
|
||||
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh1",
|
||||
0 to 0, 2, 3)
|
||||
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh2",
|
||||
0 to 1, 2, 4)
|
||||
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh3",
|
||||
0 to 2, 2, 5)
|
||||
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh11",
|
||||
0 to 0, 3, 4)
|
||||
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh12",
|
||||
0 to 1, 4, 5)
|
||||
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh13",
|
||||
0 to 2, 5, 6)
|
||||
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh1", 0 to 0, 2, 3)
|
||||
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh2", 0 to 1, 2, 4)
|
||||
writeProgressFile(progressPath, 0, fs, 3000L, "namespace1", "eh3", 0 to 2, 2, 5)
|
||||
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh11", 0 to 0, 3, 4)
|
||||
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh12", 0 to 1, 4, 5)
|
||||
writeProgressFile(progressPath, 1, fs, 3000L, "namespace2", "eh13", 0 to 2, 5, 6)
|
||||
|
||||
// if latest timestamp is earlier than the specified timestamp,
|
||||
// then we shall return the latest offsets
|
||||
|
|
|
@ -19,65 +19,85 @@ package org.apache.spark.streaming.eventhubs.checkpoint
|
|||
|
||||
import org.apache.hadoop.conf.Configuration
|
||||
import org.apache.hadoop.fs.Path
|
||||
|
||||
import org.apache.spark.{SparkConf, SparkContext}
|
||||
import org.apache.spark.eventhubscommon.{EventHubNameAndPartition, OffsetRecord}
|
||||
import org.apache.spark.eventhubscommon.progress.ProgressWriter
|
||||
import org.apache.spark.streaming.{Seconds, StreamingContext}
|
||||
import org.apache.spark.eventhubscommon.{ EventHubNameAndPartition, OffsetRecord }
|
||||
import org.apache.spark.streaming.eventhubs.SharedUtils
|
||||
import org.apache.spark.streaming.scheduler.OutputOperationInfo
|
||||
import org.apache.spark.streaming.{ Seconds, StreamingContext }
|
||||
import org.apache.spark.{ SparkConf, SparkContext }
|
||||
|
||||
// scalastyle:off
|
||||
import org.apache.spark.streaming.Time
|
||||
import org.apache.spark.streaming.scheduler.{BatchInfo, StreamInputInfo, StreamingListenerBatchCompleted}
|
||||
// scalastyle:on
|
||||
import org.apache.spark.streaming.scheduler.{
|
||||
BatchInfo,
|
||||
StreamInputInfo,
|
||||
StreamingListenerBatchCompleted
|
||||
}
|
||||
|
||||
class ProgressTrackingListenerSuite extends SharedUtils {
|
||||
|
||||
test("commit offsets with a successful micro batch correctly") {
|
||||
val batchCompletedEvent = StreamingListenerBatchCompleted(BatchInfo(
|
||||
Time(1000L),
|
||||
Map(0 -> StreamInputInfo(0, 10000)),
|
||||
0L,
|
||||
None,
|
||||
None,
|
||||
Map(1 -> OutputOperationInfo(Time(1000L), 1, "output", "", None, None, None))
|
||||
))
|
||||
val dstream = createDirectStreams(ssc, eventhubNamespace, progressRootPath.toString,
|
||||
Map("eh1" -> Map("eventhubs.partition.count" -> "2")))
|
||||
val batchCompletedEvent = StreamingListenerBatchCompleted(
|
||||
BatchInfo(
|
||||
Time(1000L),
|
||||
Map(0 -> StreamInputInfo(0, 10000)),
|
||||
0L,
|
||||
None,
|
||||
None,
|
||||
Map(1 -> OutputOperationInfo(Time(1000L), 1, "output", "", None, None, None))
|
||||
))
|
||||
val dstream = createDirectStreams(ssc,
|
||||
eventhubNamespace,
|
||||
progressRootPath.toString,
|
||||
Map("eh1" -> Map("eventhubs.partition.count" -> "2")))
|
||||
dstream.start()
|
||||
val progressWriter = new ProgressWriter(streamId, eventhubNamespace,
|
||||
EventHubNameAndPartition("eh1", 1), 1000L,
|
||||
new Configuration(), progressRootPath.toString, appName)
|
||||
val progressWriter = new ProgressWriter(streamId,
|
||||
eventhubNamespace,
|
||||
EventHubNameAndPartition("eh1", 1),
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressRootPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 1L, 2L)
|
||||
assert(fs.exists(progressWriter.tempProgressTrackingPointPath))
|
||||
progressListener.onBatchCompleted(batchCompletedEvent)
|
||||
assert(fs.exists(progressWriter.tempProgressTrackingPointPath))
|
||||
assert(fs.exists(new Path(progressTracker.progressDirectoryPath + "/progress-1000")))
|
||||
val record = progressTracker.asInstanceOf[DirectDStreamProgressTracker].read(eventhubNamespace,
|
||||
1000L, fallBack = false)
|
||||
assert(record === OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (1L, 2L))))
|
||||
val record = progressTracker
|
||||
.asInstanceOf[DirectDStreamProgressTracker]
|
||||
.read(eventhubNamespace, 1000L, fallBack = false)
|
||||
assert(
|
||||
record === OffsetRecord(1000L,
|
||||
Map(EventHubNameAndPartition("eh1", 0) -> (-1L, -1L),
|
||||
EventHubNameAndPartition("eh1", 1) -> (1L, 2L))))
|
||||
}
|
||||
|
||||
test("do not commit offsets when there is a failure in microbatch") {
|
||||
val batchCompletedEvent = StreamingListenerBatchCompleted(BatchInfo(
|
||||
Time(1000L),
|
||||
Map(0 -> StreamInputInfo(0, 10000)),
|
||||
0L,
|
||||
None,
|
||||
None,
|
||||
Map(
|
||||
1 -> OutputOperationInfo(Time(1000L), 1, "outputWithFailure", "", None, None,
|
||||
Some("instrumented failure")),
|
||||
2 -> OutputOperationInfo(Time(1000L), 2, "correct output", "", None, None, None)))
|
||||
)
|
||||
val batchCompletedEvent = StreamingListenerBatchCompleted(
|
||||
BatchInfo(
|
||||
Time(1000L),
|
||||
Map(0 -> StreamInputInfo(0, 10000)),
|
||||
0L,
|
||||
None,
|
||||
None,
|
||||
Map(
|
||||
1 -> OutputOperationInfo(Time(1000L),
|
||||
1,
|
||||
"outputWithFailure",
|
||||
"",
|
||||
None,
|
||||
None,
|
||||
Some("instrumented failure")),
|
||||
2 -> OutputOperationInfo(Time(1000L), 2, "correct output", "", None, None, None)
|
||||
)
|
||||
))
|
||||
// build temp directories
|
||||
val progressWriter = new ProgressWriter(streamId, eventhubNamespace,
|
||||
EventHubNameAndPartition("eh1", 1), 1000L,
|
||||
new Configuration(), progressTracker.tempDirectoryPath.toString,
|
||||
appName)
|
||||
val progressWriter = new ProgressWriter(streamId,
|
||||
eventhubNamespace,
|
||||
EventHubNameAndPartition("eh1", 1),
|
||||
1000L,
|
||||
new Configuration(),
|
||||
progressTracker.tempDirectoryPath.toString,
|
||||
appName)
|
||||
progressWriter.write(1000L, 0L, 0L)
|
||||
assert(fs.exists(progressWriter.tempProgressTrackingPointPath))
|
||||
progressListener.onBatchCompleted(batchCompletedEvent)
|
||||
|
@ -90,20 +110,30 @@ class ProgressTrackingListenerSuite extends SharedUtils {
|
|||
ProgressTrackingListener.reset(ssc)
|
||||
ssc.stop()
|
||||
// create new streaming context
|
||||
ssc = new StreamingContext(new SparkContext(new SparkConf().setAppName(appName).
|
||||
setMaster("local[*]")), Seconds(5))
|
||||
createDirectStreams(ssc, "namespace1", progressRootPath.toString,
|
||||
ssc = new StreamingContext(
|
||||
new SparkContext(new SparkConf().setAppName(appName).setMaster("local[*]")),
|
||||
Seconds(5))
|
||||
createDirectStreams(
|
||||
ssc,
|
||||
"namespace1",
|
||||
progressRootPath.toString,
|
||||
Map("eh1" -> Map("eventhubs.partition.count" -> "1"),
|
||||
"eh2" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh3" -> Map("eventhubs.partition.count" -> "3"))).start()
|
||||
createDirectStreams(ssc, "namespace2", progressRootPath.toString,
|
||||
"eh2" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh3" -> Map("eventhubs.partition.count" -> "3"))
|
||||
).start()
|
||||
createDirectStreams(
|
||||
ssc,
|
||||
"namespace2",
|
||||
progressRootPath.toString,
|
||||
Map("eh11" -> Map("eventhubs.partition.count" -> "1"),
|
||||
"eh12" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh13" -> Map("eventhubs.partition.count" -> "3"))).start()
|
||||
"eh12" -> Map("eventhubs.partition.count" -> "2"),
|
||||
"eh13" -> Map("eventhubs.partition.count" -> "3"))
|
||||
).start()
|
||||
import scala.collection.JavaConverters._
|
||||
assert(ssc.scheduler.listenerBus.listeners.asScala.count(
|
||||
_.isInstanceOf[ProgressTrackingListener]) === 1)
|
||||
assert(
|
||||
ssc.scheduler.listenerBus.listeners.asScala
|
||||
.count(_.isInstanceOf[ProgressTrackingListener]) === 1)
|
||||
assert(DirectDStreamProgressTracker.registeredConnectors.length === 2)
|
||||
ssc.stop()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -1,38 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.arguments

object EventhubsArgumentKeys extends Enumeration {
  val EventhubsNamespace: String = "eventhubsNamespace"
  val EventhubsName: String = "eventhubsName"
  val PolicyName: String = "policyName"
  val PolicyKey: String = "policyKey"
  val ConsumerGroup: String = "consumerGroup"
  val PartitionCount: String = "partitionCount"
  val BatchIntervalInSeconds: String = "batchInterval"
  val CheckpointDirectory: String = "checkpointDirectory"
  val EventCountFolder: String = "eventCountFolder"
  val EventStoreFolder: String = "eventStoreFolder"
  val EventHiveTable: String = "eventHiveTable"
  val SQLServerFQDN: String = "sqlServerFQDN"
  val SQLDatabaseName: String = "sqlDatabaseName"
  val DatabaseUsername: String = "databaseUsername"
  val DatabasePassword: String = "databasePassword"
  val EventSQLTable: String = "eventSQLTable"
  val TimeoutInMinutes: String = "jobTimeout"
}
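A minimal sketch (assumed, not part of this commit) of how these key constants are consumed once the command line has been parsed into an ArgumentMap; it mirrors the lookups done in EventhubsEventCount further below, and inputOptions is a placeholder name:

  // Hypothetical lookup: parsed values are stored under Symbol(key) by EventhubsArgumentParser
  val eventhubsNamespace = inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String]
  val partitionCount = inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount)).asInstanceOf[Int]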
|
|
@ -1,173 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.microsoft.spark.streaming.examples.receiverdstream.arguments
|
||||
|
||||
object EventhubsArgumentParser {
|
||||
|
||||
type ArgumentMap = Map[Symbol, Any]
|
||||
|
||||
def usageExample(): Unit = {
|
||||
|
||||
val eventhubsNamespace: String = "sparkstreamingeventhub-ns"
|
||||
val eventhubsName: String = "sparkstreamingeventhub"
|
||||
val policyName: String = "[EventhubsPolicyName]"
|
||||
val policyKey: String = "[EventhubsPolicyKey]"
|
||||
val consumerGroup: String = "$default"
|
||||
val partitionCount: Int = 32
|
||||
val batchInterval: Int = 10
|
||||
val checkpointDirectory: String = "/EventCheckpoint10"
|
||||
val eventCountFolder: String = "/EventCount/EventCount10"
|
||||
val eventStoreFolder: String = "/EventStore/EventStore10"
|
||||
val eventHiveTable: String = "EventHiveTable10"
|
||||
val sqlServerFQDN: String = "servername.database.windows.net"
|
||||
val sqlDatabaseName: String = "databasename"
|
||||
val databaseUsername: String = "[DatabaseUsername]"
|
||||
val databasePassword: String = "[DatabasePassword]"
|
||||
val eventSQLTable: String = "EventSQLTable10"
|
||||
val timeoutInMinutes: Long = -1
|
||||
|
||||
println()
|
||||
// scalastyle:off
|
||||
println(s"Usage [EventhubsEventCount]: spark-submit --master yarn-cluster ..." +
|
||||
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
|
||||
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace \'$eventhubsNamespace\'" +
|
||||
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
|
||||
s" --consumer-group \'$consumerGroup\' --partition-count $partitionCount" +
|
||||
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
|
||||
s" --event-count-folder \'$eventCountFolder\' --job-timeout-in-minutes $timeoutInMinutes")
|
||||
println()
|
||||
println(s"Usage [EventhubsToAzureBlobAsJSON]: spark-submit --master yarn-cluster ..." +
|
||||
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
|
||||
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace \'$eventhubsNamespace\'" +
|
||||
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
|
||||
s" --consumer-group \'$consumerGroup\' --partition-count $partitionCount" +
|
||||
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
|
||||
s" --event-count-folder \'$eventCountFolder\' --event-store-folder \'$eventStoreFolder\'" +
|
||||
s" --job-timeout-in-minutes $timeoutInMinutes")
|
||||
println()
|
||||
println(s"Usage [EventhubsToHiveTable]: spark-submit --master yarn-cluster ..." +
|
||||
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
|
||||
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace \'$eventhubsNamespace\'" +
|
||||
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
|
||||
s" --consumer-group \'$consumerGroup --partition-count $partitionCount" +
|
||||
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
|
||||
s" --event-count-folder \'$eventCountFolder\' --event-hive-table \'$eventHiveTable\'" +
|
||||
s" --job-timeout-in-minutes $timeoutInMinutes")
|
||||
println()
|
||||
println(s"Usage [EventhubsToSQLTable]: spark-submit --master yarn-cluster ..." +
|
||||
s" --class com.microsoft.spark.streaming.examples.EventHubsEventCount" +
|
||||
s" /home/hdiuser/spark/SparkStreamingDataPersistence.jar --eventhubs-namespace $eventhubsNamespace" +
|
||||
s" --eventhubs-name \'$eventhubsName\' --policy-name \'$policyName\' --policy-key \'$policyKey\'" +
|
||||
s" --consumer-group \'$consumerGroup\' --partition-count $partitionCount" +
|
||||
s" --batch-interval-in-seconds $batchInterval --checkpoint-directory \'$checkpointDirectory\'" +
|
||||
s" --event-count-folder \'$eventCountFolder\' --sql-server-fqdn \'$sqlServerFQDN\'" +
|
||||
s" --sql-database-name \'$sqlDatabaseName\' --database-username \'$databaseUsername\'" +
|
||||
s" --database-password \'$databasePassword\' --event-sql-table \'$eventSQLTable\'" +
|
||||
s" --job-timeout-in-minutes $timeoutInMinutes")
|
||||
println()
|
||||
}
|
||||
|
||||
def parseArguments(argumentMap : ArgumentMap, argumentList: List[String]) : ArgumentMap = {
|
||||
|
||||
argumentList match {
|
||||
case Nil => argumentMap
|
||||
case "--eventhubs-namespace" :: value:: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventhubsNamespace) -> value.toString), tail)
|
||||
case "--eventhubs-name" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventhubsName) -> value.toString), tail)
|
||||
case "--policy-name" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.PolicyName) -> value.toString), tail)
|
||||
case "--policy-key" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.PolicyKey) -> value.toString), tail)
|
||||
case "--consumer-group" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.ConsumerGroup) -> value.toString), tail)
|
||||
case "--partition-count" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.PartitionCount) -> value.toInt), tail)
|
||||
case "--batch-interval-in-seconds" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds) -> value.toInt), tail)
|
||||
case "--checkpoint-directory" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.CheckpointDirectory) -> value.toString), tail)
|
||||
case "--event-count-folder" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventCountFolder) -> value.toString), tail)
|
||||
case "--event-store-folder" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventStoreFolder) -> value.toString), tail)
|
||||
case "--event-hive-table" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventHiveTable) -> value.toString), tail)
|
||||
case "--sql-server-fqdn" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.SQLServerFQDN) -> value.toString), tail)
|
||||
case "--sql-database-name" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.SQLDatabaseName) -> value.toString), tail)
|
||||
case "--database-username" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.DatabaseUsername) -> value.toString), tail)
|
||||
case "--database-password" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.DatabasePassword) -> value.toString), tail)
|
||||
case "--event-sql-table" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.EventSQLTable) -> value.toString), tail)
|
||||
case "--job-timeout-in-minutes" :: value :: tail =>
|
||||
parseArguments(argumentMap ++ Map(Symbol(EventhubsArgumentKeys.TimeoutInMinutes) -> value.toLong), tail)
|
||||
case option :: tail =>
|
||||
println()
|
||||
println("Unknown option: " + option)
|
||||
println()
|
||||
usageExample()
|
||||
sys.exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// scalastyle:on
|
||||
|
||||
def verifyEventhubsEventCountArguments(argumentMap : ArgumentMap): Unit = {
|
||||
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventhubsNamespace)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventhubsName)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.PolicyName)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.PolicyKey)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.ConsumerGroup)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.PartitionCount)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.CheckpointDirectory)))
|
||||
|
||||
assert(argumentMap(Symbol(EventhubsArgumentKeys.PartitionCount)).asInstanceOf[Int] > 0)
|
||||
assert(argumentMap(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int] > 0)
|
||||
}
|
||||
|
||||
def verifyEventhubsToAzureBlobAsJSONArguments(argumentMap : ArgumentMap): Unit = {
|
||||
|
||||
verifyEventhubsEventCountArguments(argumentMap)
|
||||
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventStoreFolder)))
|
||||
}
|
||||
|
||||
def verifyEventhubsToHiveTableArguments(argumentMap : ArgumentMap): Unit = {
|
||||
|
||||
verifyEventhubsEventCountArguments(argumentMap)
|
||||
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventHiveTable)))
|
||||
}
|
||||
|
||||
def verifyEventhubsToSQLTableArguments(argumentMap : ArgumentMap): Unit = {
|
||||
|
||||
verifyEventhubsEventCountArguments(argumentMap)
|
||||
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.SQLServerFQDN)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.SQLDatabaseName)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.DatabaseUsername)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.DatabasePassword)))
|
||||
assert(argumentMap.contains(Symbol(EventhubsArgumentKeys.EventSQLTable)))
|
||||
}
|
||||
}
|
|
@@ -1,57 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.common

import java.sql.{Connection, DriverManager}

import org.apache.spark.sql.DataFrame

object DataFrameExtensions {

  implicit def extendedDataFrame(dataFrame: DataFrame): ExtendedDataFrame =
    new ExtendedDataFrame(dataFrame: DataFrame)

  class ExtendedDataFrame(dataFrame: DataFrame) {

    def insertToAzureSql(sqlDatabaseConnectionString: String, sqlTableName: String): Unit = {

      val tableHeader: String = dataFrame.columns.mkString(",")

      dataFrame.foreachPartition { partition =>
        val sqlExecutorConnection: Connection = DriverManager.getConnection(
          sqlDatabaseConnectionString)

        // Batch size of 1000 is used since Azure SQL database cannot insert more than 1000 rows
        // at the same time.

        partition.grouped(1000).foreach {
          group =>
            val insertString: scala.collection.mutable.StringBuilder = new StringBuilder()
            group.foreach {
              record => insertString.append("('" + record.mkString(",") + "'),")
            }
            sqlExecutorConnection.createStatement()
              .executeUpdate(f"INSERT INTO [dbo].[$sqlTableName] ($tableHeader) VALUES "
                + insertString.stripSuffix(","))
        }

        sqlExecutorConnection.close()
      }
    }
  }
}
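A minimal usage sketch (assumed, not taken from this commit) of the implicit extension above; eventsDataFrame and sqlDatabaseConnectionString are placeholders, and the table name is the one used in the parser's usage examples:

  // Hypothetical wiring: the implicit conversion gives any DataFrame an insertToAzureSql method
  import com.microsoft.spark.streaming.examples.receiverdstream.common.DataFrameExtensions._
  eventsDataFrame.insertToAzureSql(sqlDatabaseConnectionString, "EventSQLTable10")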

@@ -1,20 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.common

case class EventContent(EventDetails: String)
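A brief sketch (assumed, not part of this commit) of how EventContent is typically used in these example workloads: each event payload, once decoded to a String, is wrapped so it can become a one-column DataFrame. The payloadStrings RDD and sparkSession value are placeholders:

  // Hypothetical wiring: payloadStrings: RDD[String] holds the decoded event bodies
  val eventsDataFrame = sparkSession.createDataFrame(payloadStrings.map(EventContent(_)))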

@@ -1,29 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.common

object StreamStatistics {

  val streamLengthKey: String = "StreamLength"

  val streamLength = (values: Seq[Long], state: Option[Long]) => {
    val currentCount = values.foldLeft(0L)(_ + _)
    val previousCount = state.getOrElse(0L)
    Some(currentCount + previousCount)
  }
}
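A short sketch (assumed) of the call site this update function is shaped for: it matches the (Seq[Long], Option[Long]) => Option[Long] signature that updateStateByKey expects, with entries keyed by streamLengthKey. The eventCounts stream is a placeholder:

  // Hypothetical usage: maintain a running event count per key across batches
  val runningLength = eventCounts                      // eventCounts: DStream[(String, Long)], assumed
    .updateStateByKey[Long](StreamStatistics.streamLength)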

@@ -1,38 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.common

object StreamUtilities {

  def getSqlJdbcConnectionString(sqlServerFQDN: String, sqlDatabaseName: String,
      databaseUsername: String, databasePassword: String): String = {

    val serverName = sqlServerFQDN.split('.')(0)
    val certificateHostname = sqlServerFQDN.replace(serverName, "*")
    val serverPort = "1433"

    val sqlDatabaseConnectionString = f"jdbc:sqlserver://$sqlServerFQDN:$serverPort;" +
      f"database=$sqlDatabaseName;" +
      f"user=$databaseUsername@$serverName;password=$databasePassword;" +
      f"encrypt=true;hostNameInCertificate=$certificateHostname;loginTimeout=30;"

    Class.forName("com.microsoft.sqlserver.jdbc.SQLServerDriver")

    sqlDatabaseConnectionString
  }
}
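A short usage sketch (assumed); the server, database, and credential values are the same placeholders used in the parser's printed usage examples:

  // Hypothetical call: derive the Azure SQL JDBC connection string once on the driver
  val sqlDatabaseConnectionString = StreamUtilities.getSqlJdbcConnectionString(
    "servername.database.windows.net", "databasename", "[DatabaseUsername]", "[DatabasePassword]")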
|
|
@ -1,131 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.workloads

import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.StreamStatistics

import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils

object EventhubsEventCount {

  def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {

    val eventHubsParameters = Map[String, String](
      "eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).
        asInstanceOf[String],
      "eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).
        asInstanceOf[String],
      "eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).
        asInstanceOf[String],
      "eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).
        asInstanceOf[String],
      "eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).
        asInstanceOf[String],
      "eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.
        BatchIntervalInSeconds)).asInstanceOf[Int].toString,
      "eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
        asInstanceOf[String]
    )

    /**
     * In Spark 2.0.x, SparkConf must be initialized through EventhubsUtil so that required
     * data structures internal to Azure Eventhubs Client get registered with the Kryo Serializer.
     */
    val sparkConfiguration = EventHubsUtils.initializeSparkStreamingConfigurations

    sparkConfiguration.setAppName(this.getClass.getSimpleName)
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")

    val sparkSession = SparkSession.builder().config(sparkConfiguration).getOrCreate()

    val streamingContext = new StreamingContext(sparkSession.sparkContext,
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
    streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
      asInstanceOf[String])

    val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)

    val eventHubsWindowedStream = eventHubsStream.window(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    // Count number of events received the past batch

    val batchEventCount = eventHubsWindowedStream.count()

    batchEventCount.print()

    // Count number of events received so far

    val totalEventCountDStream = eventHubsWindowedStream.map(m =>
      (StreamStatistics.streamLengthKey, 1L))
    val totalEventCount = totalEventCountDStream.updateStateByKey[Long](
      StreamStatistics.streamLength)

    totalEventCount.checkpoint(Seconds(inputOptions(Symbol(EventhubsArgumentKeys.
      BatchIntervalInSeconds)).asInstanceOf[Int]))

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {

      totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
        .asInstanceOf[String])
    }

    totalEventCount.print()

    streamingContext
  }

  def main(inputArguments: Array[String]): Unit = {

    val inputOptions: ArgumentMap = EventhubsArgumentParser.parseArguments(Map(),
      inputArguments.toList)

    EventhubsArgumentParser.verifyEventhubsEventCountArguments(inputOptions)

    // Create or recreate streaming context

    val streamingContext = StreamingContext.getOrCreate(inputOptions(Symbol(EventhubsArgumentKeys.
      CheckpointDirectory)).asInstanceOf[String], () => createStreamingContext(inputOptions))

    streamingContext.start()

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
      streamingContext.awaitTerminationOrTimeout(inputOptions(Symbol(EventhubsArgumentKeys.
        TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
    } else {
      streamingContext.awaitTermination()
    }
  }
}

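Note: this workload (and the ones that follow) relies on Spark Streaming's checkpoint-recovery pattern: StreamingContext.getOrCreate receives the checkpoint directory plus a factory function, so a restarted driver resumes from checkpointed state instead of rebuilding the pipeline from scratch. A minimal sketch of just that pattern, with a hypothetical checkpoint path and an empty pipeline:

    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}

    object CheckpointRecoverySketch {
      // Placeholder path; the workloads above take it from EventhubsArgumentKeys.CheckpointDirectory.
      val checkpointDirectory = "/tmp/streaming-checkpoint"

      def createContext(): StreamingContext = {
        val conf = new SparkConf().setAppName("CheckpointRecoverySketch")
        val ssc = new StreamingContext(conf, Seconds(10))
        ssc.checkpoint(checkpointDirectory) // enables metadata and state checkpointing
        // DStream definitions would go here, as in createStreamingContext above.
        ssc
      }

      def main(args: Array[String]): Unit = {
        // Rebuilds the context from the checkpoint if one exists, otherwise calls createContext().
        val ssc = StreamingContext.getOrCreate(checkpointDirectory, () => createContext())
        ssc.start()
        ssc.awaitTermination()
      }
    }
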
@ -1,130 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.workloads

import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.{EventContent, StreamStatistics}

import org.apache.spark._
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils

object EventhubsToAzureBlobAsJSON {

  def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {

    // scalastyle:off
    val eventHubsParameters = Map[String, String](
      "eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String],
      "eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).asInstanceOf[String],
      "eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).asInstanceOf[String],
      "eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).asInstanceOf[String],
      "eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).asInstanceOf[String],
      "eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String]
    )
    // scalastyle:on

    /**
     * In Spark 2.0.x, SparkConf must be initialized through EventhubsUtil so that required
     * data structures internal to Azure Eventhubs Client get registered with the Kryo Serializer.
     */
    val sparkConfiguration: SparkConf = EventHubsUtils.initializeSparkStreamingConfigurations

    sparkConfiguration.setAppName(this.getClass.getSimpleName)
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")

    val sparkSession: SparkSession = SparkSession.builder.config(sparkConfiguration).getOrCreate

    val streamingContext = new StreamingContext(sparkSession.sparkContext,
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
    streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
      asInstanceOf[String])

    val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)

    val eventHubsWindowedStream = eventHubsStream.window(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    eventHubsWindowedStream.map(x => EventContent(new String(x)))
      .foreachRDD(rdd => {
        val sparkSession = SparkSession.builder.getOrCreate
        import sparkSession.implicits._
        rdd.toDS.toJSON.write.mode(SaveMode.Overwrite)
          .save(inputOptions(Symbol(EventhubsArgumentKeys.EventStoreFolder)).asInstanceOf[String])
      })

    // Count number of events received the past batch

    val batchEventCount = eventHubsWindowedStream.count()

    batchEventCount.print()

    // Count number of events received so far

    val totalEventCountDStream =
      eventHubsWindowedStream.map(m => (StreamStatistics.streamLengthKey, 1L))
    val totalEventCount =
      totalEventCountDStream.updateStateByKey[Long](StreamStatistics.streamLength)
    totalEventCount.checkpoint(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {

      totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
        .asInstanceOf[String])
    }

    totalEventCount.print()

    streamingContext
  }

  def main(inputArguments: Array[String]): Unit = {

    val inputOptions = EventhubsArgumentParser.parseArguments(Map(), inputArguments.toList)

    EventhubsArgumentParser.verifyEventhubsToAzureBlobAsJSONArguments(inputOptions)

    val streamingContext = StreamingContext.getOrCreate(
      inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String],
      () => createStreamingContext(inputOptions))

    streamingContext.start()

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
      streamingContext.awaitTerminationOrTimeout(
        inputOptions(Symbol(EventhubsArgumentKeys.TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
    } else {
      streamingContext.awaitTermination()
    }
  }
}

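Note: in the foreachRDD block above, the closure runs on the driver once per batch, and SparkSession.builder.getOrCreate simply returns the session that is already active rather than constructing a new one, which also keeps the pattern usable after checkpoint recovery. A stripped-down sketch of the same per-batch write, assuming stream is a DStream[Array[Byte]], outputPath is a placeholder destination, and EventContent is the case class from the examples' common package:

    import org.apache.spark.sql.{SaveMode, SparkSession}

    stream.map(bytes => EventContent(new String(bytes)))
      .foreachRDD { rdd =>
        val spark = SparkSession.builder.getOrCreate() // reuses the existing session
        import spark.implicits._
        rdd.toDS().toJSON.write.mode(SaveMode.Overwrite).save(outputPath)
      }
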
@ -1,165 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.workloads

import java.sql.{Connection, DriverManager, Statement}

import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.{EventContent, StreamStatistics, StreamUtilities}

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils

object EventhubsToAzureSQLTable {

  def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {

    // scalastyle:off
    val eventHubsParameters = Map[String, String](
      "eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String],
      "eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).asInstanceOf[String],
      "eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).asInstanceOf[String],
      "eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).asInstanceOf[String],
      "eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).asInstanceOf[String],
      "eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String]
    )
    // scalastyle:on

    val sqlDatabaseConnectionString: String = StreamUtilities.getSqlJdbcConnectionString(
      inputOptions(Symbol(EventhubsArgumentKeys.SQLServerFQDN)).asInstanceOf[String],
      inputOptions(Symbol(EventhubsArgumentKeys.SQLDatabaseName)).asInstanceOf[String],
      inputOptions(Symbol(EventhubsArgumentKeys.DatabaseUsername)).asInstanceOf[String],
      inputOptions(Symbol(EventhubsArgumentKeys.DatabasePassword)).asInstanceOf[String])

    val sqlTableName: String = inputOptions(Symbol(EventhubsArgumentKeys.EventSQLTable)).
      asInstanceOf[String]

    /**
     * In Spark 2.0.x, SparkConf must be initialized through EventhubsUtil so that required
     * data structures internal to Azure Eventhubs Client get registered with the Kryo Serializer.
     */
    val sparkConfiguration: SparkConf = EventHubsUtils.initializeSparkStreamingConfigurations

    sparkConfiguration.setAppName(this.getClass.getSimpleName)
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")

    val sparkSession = SparkSession.builder().config(sparkConfiguration).getOrCreate()

    val streamingContext = new StreamingContext(sparkSession.sparkContext,
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
    streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
      asInstanceOf[String])

    val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)

    val eventHubsWindowedStream = eventHubsStream.window(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    import com.microsoft.spark.streaming.examples.receiverdstream.common.DataFrameExtensions._

    eventHubsWindowedStream.map(m => EventContent(new String(m)))
      .foreachRDD { rdd => {
        val sparkSession = SparkSession.builder.getOrCreate
        import sparkSession.implicits._
        rdd.toDF.insertToAzureSql(sqlDatabaseConnectionString, sqlTableName)
      }
    }

    // Count number of events received the past batch

    val batchEventCount = eventHubsWindowedStream.count()

    batchEventCount.print()

    // Count number of events received so far

    val totalEventCountDStream = eventHubsWindowedStream.map(
      m => (StreamStatistics.streamLengthKey, 1L))
    val totalEventCount = totalEventCountDStream.updateStateByKey[Long](
      StreamStatistics.streamLength)
    totalEventCount.checkpoint(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {

      totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
        .asInstanceOf[String])
    }

    totalEventCount.print()

    streamingContext
  }

  def main(inputArguments: Array[String]): Unit = {

    val inputOptions = EventhubsArgumentParser.parseArguments(Map(), inputArguments.toList)

    EventhubsArgumentParser.verifyEventhubsToSQLTableArguments(inputOptions)

    val sqlDatabaseConnectionString: String = StreamUtilities.getSqlJdbcConnectionString(
      inputOptions(Symbol(EventhubsArgumentKeys.SQLServerFQDN)).asInstanceOf[String],
      inputOptions(Symbol(EventhubsArgumentKeys.SQLDatabaseName)).asInstanceOf[String],
      inputOptions(Symbol(EventhubsArgumentKeys.DatabaseUsername)).asInstanceOf[String],
      inputOptions(Symbol(EventhubsArgumentKeys.DatabasePassword)).asInstanceOf[String])

    val sqlTableName: String = inputOptions(Symbol(EventhubsArgumentKeys.EventSQLTable)).
      asInstanceOf[String]

    val sqlDriverConnection = DriverManager.getConnection(sqlDatabaseConnectionString)

    sqlDriverConnection.setAutoCommit(false)
    val sqlDriverStatement: Statement = sqlDriverConnection.createStatement()
    sqlDriverStatement.addBatch(f"IF NOT EXISTS(SELECT * FROM sys.objects WHERE object_id" +
      f" = OBJECT_ID(N'[dbo].[$sqlTableName]') AND type in (N'U'))" +
      f"\nCREATE TABLE $sqlTableName(EventDetails NVARCHAR(128) NOT NULL)")
    sqlDriverStatement.addBatch(f"IF IndexProperty(Object_Id('$sqlTableName'), 'IX_EventDetails'," +
      f" 'IndexId') IS NULL" +
      f"\nCREATE CLUSTERED INDEX IX_EventDetails ON $sqlTableName(EventDetails)")
    sqlDriverStatement.executeBatch()
    sqlDriverConnection.commit()

    sqlDriverConnection.close()

    val streamingContext = StreamingContext.getOrCreate(
      inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String],
      () => createStreamingContext(inputOptions))

    streamingContext.start()

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
      streamingContext.awaitTerminationOrTimeout(
        inputOptions(Symbol(EventhubsArgumentKeys.TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
    } else {
      streamingContext.awaitTermination()
    }
  }
}

@ -1,147 +0,0 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.microsoft.spark.streaming.examples.receiverdstream.workloads

import com.microsoft.spark.streaming.examples.receiverdstream.arguments.{EventhubsArgumentKeys, EventhubsArgumentParser}
import com.microsoft.spark.streaming.examples.receiverdstream.arguments.EventhubsArgumentParser.ArgumentMap
import com.microsoft.spark.streaming.examples.receiverdstream.common.{EventContent, StreamStatistics}

import org.apache.spark._
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.eventhubs.EventHubsUtils

object EventhubsToHiveTable {

  def createStreamingContext(inputOptions: ArgumentMap): StreamingContext = {

    // scalastyle:off
    val eventHubsParameters = Map[String, String](
      "eventhubs.namespace" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsNamespace)).asInstanceOf[String],
      "eventhubs.name" -> inputOptions(Symbol(EventhubsArgumentKeys.EventhubsName)).asInstanceOf[String],
      "eventhubs.policyname" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyName)).asInstanceOf[String],
      "eventhubs.policykey" -> inputOptions(Symbol(EventhubsArgumentKeys.PolicyKey)).asInstanceOf[String],
      "eventhubs.consumergroup" -> inputOptions(Symbol(EventhubsArgumentKeys.ConsumerGroup)).asInstanceOf[String],
      "eventhubs.partition.count" -> inputOptions(Symbol(EventhubsArgumentKeys.PartitionCount))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.interval" -> inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds))
        .asInstanceOf[Int].toString,
      "eventhubs.checkpoint.dir" -> inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String]
    )
    // scalastyle:on

    /**
     * In Spark 2.0.x, SparkConf must be initialized through EventhubsUtil so that required
     * data structures internal to Azure Eventhubs Client get registered with the Kryo Serializer.
     */
    val sparkConfiguration: SparkConf = EventHubsUtils.initializeSparkStreamingConfigurations

    sparkConfiguration.setAppName(this.getClass.getSimpleName)
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.allowBatching", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.batchingTimeout", "60000")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.enable", "true")
    sparkConfiguration.set("spark.streaming.driver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.receiver.writeAheadLog.closeFileAfterWrite", "true")
    sparkConfiguration.set("spark.streaming.stopGracefullyOnShutdown", "true")

    val sparkSession = SparkSession.builder.config(sparkConfiguration).enableHiveSupport.getOrCreate

    val streamingContext = new StreamingContext(sparkSession.sparkContext,
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))
    streamingContext.checkpoint(inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).
      asInstanceOf[String])

    val eventHubsStream = EventHubsUtils.createUnionStream(streamingContext, eventHubsParameters)

    val eventHubsWindowedStream = eventHubsStream.window(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    val hiveTableName = inputOptions(Symbol(EventhubsArgumentKeys.EventHiveTable)).
      asInstanceOf[String]

    // Table needs to be explicitly created to match the Parquet format in which the data is stored
    // by default by Spark. If not explicitly created the Hive table cannot be used from Hive and
    // can only be used from inside Spark.

    val hiveTableDDL =
      f"CREATE TABLE IF NOT EXISTS $hiveTableName (EventContent string) STORED AS PARQUET"

    sparkSession.sql(hiveTableDDL)

    /**
     * .saveAsTable does not work so insertInto is used.
     * Refer to SPARK-16803 (https://issues.apache.org/jira/browse/SPARK-16803)
     */
    eventHubsWindowedStream.map(x => EventContent(new String(x)))
      .foreachRDD(rdd => {
        val sparkSession = SparkSession.builder.enableHiveSupport.getOrCreate
        import sparkSession.implicits._
        rdd.toDS.write.mode(org.apache.spark.sql.SaveMode.Append).insertInto(hiveTableName)
      })

    // Count number of events received the past batch

    val batchEventCount = eventHubsWindowedStream.count()

    batchEventCount.print()

    // Count number of events received so far

    val totalEventCountDStream = eventHubsWindowedStream.map(
      m => (StreamStatistics.streamLengthKey, 1L))
    val totalEventCount = totalEventCountDStream.updateStateByKey[Long](
      StreamStatistics.streamLength)
    totalEventCount.checkpoint(
      Seconds(inputOptions(Symbol(EventhubsArgumentKeys.BatchIntervalInSeconds)).asInstanceOf[Int]))

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.EventCountFolder))) {

      totalEventCount.saveAsTextFiles(inputOptions(Symbol(EventhubsArgumentKeys.EventCountFolder))
        .asInstanceOf[String])
    }

    totalEventCount.print()

    streamingContext
  }

  def main(inputArguments: Array[String]): Unit = {

    val inputOptions = EventhubsArgumentParser.parseArguments(Map(), inputArguments.toList)

    EventhubsArgumentParser.verifyEventhubsToHiveTableArguments(inputOptions)

    // Create or recreate streaming context

    val streamingContext = StreamingContext.getOrCreate(
      inputOptions(Symbol(EventhubsArgumentKeys.CheckpointDirectory)).asInstanceOf[String],
      () => createStreamingContext(inputOptions))

    streamingContext.start()

    if (inputOptions.contains(Symbol(EventhubsArgumentKeys.TimeoutInMinutes))) {
      streamingContext.awaitTerminationOrTimeout(
        inputOptions(Symbol(EventhubsArgumentKeys.TimeoutInMinutes)).asInstanceOf[Long] * 60 * 1000)
    } else {
      streamingContext.awaitTermination()
    }
  }
}

32  pom.xml

@ -31,8 +31,8 @@
  </licenses>
  <developers>
    <developer>
      <name>Arijit Tarafdar</name>
      <email>arijitt@microsoft.com</email>
      <name>Sabee Grewal</name>
      <email>sagrewal@microsoft.com</email>
      <organization>Microsoft Corporation</organization>
      <organizationUrl>http://www.microsoft.com</organizationUrl>
    </developer>

@ -100,7 +100,7 @@
          <branch>refs/heads/maven-repo</branch>
          <includes><include>**/*</include></includes>
          <repositoryName>spark-eventhubs</repositoryName>
          <repositoryOwner>sabeegrewal</repositoryOwner>
          <repositoryOwner>Azure</repositoryOwner>
          <merge>true</merge>
        </configuration>
        <executions>

@ -306,32 +306,6 @@
    <outputDirectory>target/scala-${scala.binary.version}/classes</outputDirectory>
    <testOutputDirectory>target/scala-${scala.binary.version}/test-classes</testOutputDirectory>
    <plugins>
      <plugin>
        <groupId>org.scalastyle</groupId>
        <artifactId>scalastyle-maven-plugin</artifactId>
        <version>0.8.0</version>
        <configuration>
          <verbose>false</verbose>
          <failOnViolation>true</failOnViolation>
          <includeTestSourceDirectory>true</includeTestSourceDirectory>
          <failOnWarning>false</failOnWarning>
          <sourceDirectories>
            <dir>${basedir}/core/src/main/scala</dir>
            <dir>${basedir}/examples/src/main/scala</dir>
          </sourceDirectories>
          <testSourceDirectory>${basedir}/core/src/test/scala</testSourceDirectory>
          <configLocation>scalastyle-config.xml</configLocation>
          <outputFile>${project.basedir}/scalastyle-output.xml</outputFile>
          <outputEncoding>UTF-8</outputEncoding>
        </configuration>
        <executions>
          <execution>
            <goals>
              <goal>check</goal>
            </goals>
          </execution>
        </executions>
      </plugin>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>

@ -1 +1,3 @@
addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.8.0")
scalaVersion := "2.11.8"

addSbtPlugin("org.lucidchart" %% "sbt-scalafmt" % "1.12")

@ -1,4 +1,3 @@
set -e
mvn install -DskipTests
mvn scalastyle:check
mvn test

@ -1,342 +0,0 @@
<!--
 ~ Licensed to the Apache Software Foundation (ASF) under one or more
 ~ contributor license agreements. See the NOTICE file distributed with
 ~ this work for additional information regarding copyright ownership.
 ~ The ASF licenses this file to You under the Apache License, Version 2.0
 ~ (the "License"); you may not use this file except in compliance with
 ~ the License. You may obtain a copy of the License at
 ~
 ~ http://www.apache.org/licenses/LICENSE-2.0
 ~
 ~ Unless required by applicable law or agreed to in writing, software
 ~ distributed under the License is distributed on an "AS IS" BASIS,
 ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ~ See the License for the specific language governing permissions and
 ~ limitations under the License.
-->
<!--

If you wish to turn off checking for a section of code, you can put a comment in the source
before and after the section, with the following syntax:

  // scalastyle:off
  ... // stuff that breaks the styles
  // scalastyle:on

You can also disable only one rule, by specifying its rule id, as specified in:
  http://www.scalastyle.org/rules-0.7.0.html

  // scalastyle:off no.finalize
  override def finalize(): Unit = ...
  // scalastyle:on no.finalize

This file is divided into 3 sections:
 (1) rules that we enforce.
 (2) rules that we would like to enforce, but haven't cleaned up the codebase to turn on yet
     (or we need to make the scalastyle rule more configurable).
 (3) rules that we don't want to enforce.
-->

<scalastyle>
  <name>Scalastyle standard configuration</name>

  <!-- ================================================================================ -->
  <!-- rules we enforce -->
  <!-- ================================================================================ -->

  <check level="error" class="org.scalastyle.file.FileTabChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.file.HeaderMatchesChecker" enabled="true">
    <parameters>
      <parameter name="header"><![CDATA[/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */]]></parameter>
    </parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.SpacesAfterPlusChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.SpacesBeforePlusChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.file.WhitespaceEndOfLineChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.file.FileLineLengthChecker" enabled="true">
    <parameters>
      <parameter name="maxLineLength"><![CDATA[100]]></parameter>
      <parameter name="tabSize"><![CDATA[2]]></parameter>
      <parameter name="ignoreImports">true</parameter>
    </parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.ClassNamesChecker" enabled="true">
    <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.ObjectNamesChecker" enabled="true">
    <parameters><parameter name="regex"><![CDATA[[A-Z][A-Za-z]*]]></parameter></parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.PackageObjectNamesChecker" enabled="true">
    <parameters><parameter name="regex"><![CDATA[^[a-z][A-Za-z]*$]]></parameter></parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.ParameterNumberChecker" enabled="true">
    <parameters><parameter name="maxParameters"><![CDATA[10]]></parameter></parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.NoFinalizeChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.CovariantEqualsChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.StructuralTypeChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.UppercaseLChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.IfBraceChecker" enabled="true">
    <parameters>
      <parameter name="singleLineAllowed"><![CDATA[true]]></parameter>
      <parameter name="doubleLineAllowed"><![CDATA[true]]></parameter>
    </parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.PublicMethodsHaveTypeChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.file.NewLineAtEofChecker" enabled="true"></check>

  <check customId="nonascii" level="error" class="org.scalastyle.scalariform.NonASCIICharacterChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.SpaceAfterCommentStartChecker" enabled="true"></check>

  <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceBeforeTokenChecker" enabled="true">
    <parameters>
      <parameter name="tokens">ARROW, EQUALS, ELSE, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
    </parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.EnsureSingleSpaceAfterTokenChecker" enabled="true">
    <parameters>
      <parameter name="tokens">ARROW, EQUALS, COMMA, COLON, IF, ELSE, DO, WHILE, FOR, MATCH, TRY, CATCH, FINALLY, LARROW, RARROW</parameter>
    </parameters>
  </check>

  <!-- ??? usually shouldn't be checked into the code base. -->
  <check level="error" class="org.scalastyle.scalariform.NotImplementedErrorUsage" enabled="true"></check>

  <!-- As of SPARK-7558, all tests in Spark should extend o.a.s.SparkFunSuite instead of FunSuite directly -->
  <check customId="funsuite" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="false">
    <parameters><parameter name="regex">^FunSuite[A-Za-z]*$</parameter></parameters>
    <customMessage>Tests must extend org.apache.spark.SparkFunSuite instead.</customMessage>
  </check>

  <!-- As of SPARK-7977 all printlns need to be wrapped in '// scalastyle:off/on println' -->
  <check customId="println" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="false">
    <parameters><parameter name="regex">^println$</parameter></parameters>
    <customMessage><![CDATA[Are you sure you want to println? If yes, wrap the code block with
      // scalastyle:off println
      println(...)
      // scalastyle:on println]]></customMessage>
  </check>

  <check customId="visiblefortesting" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">@VisibleForTesting</parameter></parameters>
    <customMessage><![CDATA[
      @VisibleForTesting causes classpath issues. Please note this in the java doc instead (SPARK-11615).
    ]]></customMessage>
  </check>

  <check customId="runtimeaddshutdownhook" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">Runtime\.getRuntime\.addShutdownHook</parameter></parameters>
    <customMessage><![CDATA[
      Are you sure that you want to use Runtime.getRuntime.addShutdownHook? In most cases, you should use
      ShutdownHookManager.addShutdownHook instead.
      If you must use Runtime.getRuntime.addShutdownHook, wrap the code block with
      // scalastyle:off runtimeaddshutdownhook
      Runtime.getRuntime.addShutdownHook(...)
      // scalastyle:on runtimeaddshutdownhook
    ]]></customMessage>
  </check>

  <check customId="mutablesynchronizedbuffer" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">mutable\.SynchronizedBuffer</parameter></parameters>
    <customMessage><![CDATA[
      Are you sure that you want to use mutable.SynchronizedBuffer? In most cases, you should use
      java.util.concurrent.ConcurrentLinkedQueue instead.
      If you must use mutable.SynchronizedBuffer, wrap the code block with
      // scalastyle:off mutablesynchronizedbuffer
      mutable.SynchronizedBuffer[...]
      // scalastyle:on mutablesynchronizedbuffer
    ]]></customMessage>
  </check>

  <check customId="classforname" level="error" class="org.scalastyle.file.RegexChecker" enabled="false">
    <parameters><parameter name="regex">Class\.forName</parameter></parameters>
    <customMessage><![CDATA[
      Are you sure that you want to use Class.forName? In most cases, you should use Utils.classForName instead.
      If you must use Class.forName, wrap the code block with
      // scalastyle:off classforname
      Class.forName(...)
      // scalastyle:on classforname
    ]]></customMessage>
  </check>

  <check customId="awaitresult" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">Await\.result</parameter></parameters>
    <customMessage><![CDATA[
      Are you sure that you want to use Await.result? In most cases, you should use ThreadUtils.awaitResult instead.
      If you must use Await.result, wrap the code block with
      // scalastyle:off awaitresult
      Await.result(...)
      // scalastyle:on awaitresult
      If your codes use ThreadLocal and may run in threads created by the user, use ThreadUtils.awaitResultInForkJoinSafely instead.
    ]]></customMessage>
  </check>

  <!-- As of SPARK-9613 JavaConversions should be replaced with JavaConverters -->
  <check customId="javaconversions" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
    <parameters><parameter name="regex">JavaConversions</parameter></parameters>
    <customMessage>Instead of importing implicits in scala.collection.JavaConversions._, import
      scala.collection.JavaConverters._ and use .asScala / .asJava methods</customMessage>
  </check>

  <check customId="commonslang2" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
    <parameters><parameter name="regex">org\.apache\.commons\.lang\.</parameter></parameters>
    <customMessage>Use Commons Lang 3 classes (package org.apache.commons.lang3.*) instead
      of Commons Lang 2 (package org.apache.commons.lang.*)</customMessage>
  </check>

  <check customId="extractopt" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
    <parameters><parameter name="regex">extractOpt</parameter></parameters>
    <customMessage>Use Utils.jsonOption(x).map(.extract[T]) instead of .extractOpt[T], as the latter
      is slower. </customMessage>
  </check>

  <check level="error" class="org.scalastyle.scalariform.ImportOrderChecker" enabled="true">
    <parameters>
      <parameter name="groups">java,scala,3rdParty,spark</parameter>
      <parameter name="group.java">javax?\..*</parameter>
      <parameter name="group.scala">scala\..*</parameter>
      <parameter name="group.3rdParty">(?!org\.apache\.spark\.).*</parameter>
      <parameter name="group.spark">org\.apache\.spark\..*</parameter>
    </parameters>
  </check>

  <check level="error" class="org.scalastyle.scalariform.DisallowSpaceBeforeTokenChecker" enabled="true">
    <parameters>
      <parameter name="tokens">COMMA</parameter>
    </parameters>
  </check>

  <!-- SPARK-3854: Single Space between ')' and '{' -->
  <check customId="SingleSpaceBetweenRParenAndLCurlyBrace" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">\)\{</parameter></parameters>
    <customMessage><![CDATA[
      Single Space between ')' and `{`.
    ]]></customMessage>
  </check>

  <check customId="NoScalaDoc" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">(?m)^(\s*)/[*][*].*$(\r|)\n^\1 [*]</parameter></parameters>
    <customMessage>Use Javadoc style indentation for multiline comments</customMessage>
  </check>

  <check customId="OmitBracesInCase" level="error" class="org.scalastyle.file.RegexChecker" enabled="true">
    <parameters><parameter name="regex">case[^\n>]*=>\s*\{</parameter></parameters>
    <customMessage>Omit braces in case clauses.</customMessage>
  </check>

  <!-- SPARK-16877: Avoid Java annotations -->
  <check customId="OverrideJavaCase" level="error" class="org.scalastyle.scalariform.TokenChecker" enabled="true">
    <parameters><parameter name="regex">^Override$</parameter></parameters>
    <customMessage>override modifier should be used instead of @java.lang.Override.</customMessage>
  </check>

  <check level="error" class="org.scalastyle.scalariform.DeprecatedJavaChecker" enabled="true"></check>

  <!-- ================================================================================ -->
  <!-- rules we'd like to enforce, but haven't cleaned up the codebase yet -->
  <!-- ================================================================================ -->

  <!-- We cannot turn the following two on, because it'd fail a lot of string interpolation use cases. -->
  <!-- Ideally the following two rules should be configurable to rule out string interpolation. -->
  <check level="error" class="org.scalastyle.scalariform.NoWhitespaceBeforeLeftBracketChecker" enabled="false"></check>
  <check level="error" class="org.scalastyle.scalariform.NoWhitespaceAfterLeftBracketChecker" enabled="false"></check>

  <!-- This breaks symbolic method names so we don't turn it on. -->
  <!-- Maybe we should update it to allow basic symbolic names, and then we are good to go. -->
  <check level="error" class="org.scalastyle.scalariform.MethodNamesChecker" enabled="false">
    <parameters>
      <parameter name="regex"><![CDATA[^[a-z][A-Za-z0-9]*$]]></parameter>
    </parameters>
  </check>

  <!-- Should turn this on, but we have a few places that need to be fixed first -->
  <check level="error" class="org.scalastyle.scalariform.EqualsHashCodeChecker" enabled="true"></check>

  <!-- ================================================================================ -->
  <!-- rules we don't want -->
  <!-- ================================================================================ -->

  <check level="error" class="org.scalastyle.scalariform.IllegalImportsChecker" enabled="false">
    <parameters><parameter name="illegalImports"><![CDATA[sun._,java.awt._]]></parameter></parameters>
  </check>

  <!-- We want the opposite of this: NewLineAtEofChecker -->
  <check level="error" class="org.scalastyle.file.NoNewLineAtEofChecker" enabled="false"></check>

  <!-- This one complains about all kinds of random things. Disable. -->
  <check level="error" class="org.scalastyle.scalariform.SimplifyBooleanExpressionChecker" enabled="false"></check>

  <!-- We use return quite a bit for control flows and guards -->
  <check level="error" class="org.scalastyle.scalariform.ReturnChecker" enabled="false"></check>

  <!-- We use null a lot in low level code and to interface with 3rd party code -->
  <check level="error" class="org.scalastyle.scalariform.NullChecker" enabled="false"></check>

  <!-- Doesn't seem super big deal here ... -->
  <check level="error" class="org.scalastyle.scalariform.NoCloneChecker" enabled="false"></check>

  <!-- Doesn't seem super big deal here ... -->
  <check level="error" class="org.scalastyle.file.FileLengthChecker" enabled="false">
    <parameters><parameter name="maxFileLength">800></parameter></parameters>
  </check>

  <!-- Doesn't seem super big deal here ... -->
  <check level="error" class="org.scalastyle.scalariform.NumberOfTypesChecker" enabled="false">
    <parameters><parameter name="maxTypes">30</parameter></parameters>
  </check>

  <!-- Doesn't seem super big deal here ... -->
  <check level="error" class="org.scalastyle.scalariform.CyclomaticComplexityChecker" enabled="false">
    <parameters><parameter name="maximum">10</parameter></parameters>
  </check>

  <!-- Doesn't seem super big deal here ... -->
  <check level="error" class="org.scalastyle.scalariform.MethodLengthChecker" enabled="false">
    <parameters><parameter name="maxLength">50</parameter></parameters>
  </check>

  <!-- Not exactly feasible to enforce this right now. -->
  <!-- It is also infrequent that somebody introduces a new class with a lot of methods. -->
  <check level="error" class="org.scalastyle.scalariform.NumberOfMethodsInTypeChecker" enabled="false">
    <parameters><parameter name="maxMethods"><![CDATA[30]]></parameter></parameters>
  </check>

  <!-- Doesn't seem super big deal here, and we have a lot of magic numbers ... -->
  <check level="error" class="org.scalastyle.scalariform.MagicNumberChecker" enabled="false">
    <parameters><parameter name="ignore">-1,0,1,2,3</parameter></parameters>
  </check>

</scalastyle>