Refactored the code to be compatible with the structure required by Spark packages
This commit is contained in:
Parent: bb2019f3cf
Commit: 690314c8a1

@@ -1,2 +1,5 @@
 *.class
 *.log
+*.jar
+__pycache__
+target

44  build.sbt
@@ -1,43 +1,29 @@
-val sparkVer = sys.props.getOrElse("spark.version", "2.1.1")
-val sparkBranch = sparkVer.substring(0, 3)
-val defaultScalaVer = sparkBranch match {
-  case "2.0" => "2.11.8"
-  case "2.1" => "2.11.8"
-  case "2.2" => "2.11.8"
-  case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
-}
-val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer)
-
-val sparkVersion = "2.1.1"
-scalaVersion := scalaVer
-
-name := "spark-image"
+// Your sbt build file. Guides on how to write one can be found at
+// http://www.scala-sbt.org/0.13/docs/index.html
+
+scalaVersion := "2.11.8"
+
+sparkVersion := "2.2.0"
+
+spName := "microsoft/spark-images"
 
+// Don't forget to set the version
 version := "0.1"
 
+// All Spark Packages need a license
 licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
 
 // Add Spark components this package depends on, e.g, "mllib", ....
-val sparkComponents = Seq("sql")
+sparkComponents ++= Seq("sql")
 
 libraryDependencies ++= Seq(
   // "%%" for scala things, "%" for plain java things
-  "org.apache.logging.log4j" % "log4j-api" % "2.8.1" % "provided",
-  "org.apache.logging.log4j" % "log4j-core" % "2.8.1" % "provided",
-  "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1" % "provided",
-  "org.apache.spark" %% "spark-core" % sparkVer % "provided",
-  "org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
   "org.scalatest" %% "scalatest" % "3.0.0" % "provided"
 )
 
-parallelExecution := false
-
-// This fixes a class loader problem with scala.Tuple2 class, scala-2.11, Spark 2.x
-fork in Test := true
-
-// This and the next line fix a problem with forked run: https://github.com/scalatest/scalatest/issues/770
-javaOptions in Test ++= Seq("-Xmx2048m", "-XX:ReservedCodeCacheSize=384m", "-XX:MaxPermSize=384m")
-
-concurrentRestrictions in Global := Seq(
-  Tags.limitAll(1))
-
-autoAPIMappings := true
+// uncomment and change the value below to change the directory where your zip artifact will be created
+// spDistDirectory := target.value
+
+// add any Spark Package dependencies using spDependencies.
+// e.g. spDependencies += "databricks/spark-avro:0.1"
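With the sbt-spark-package plugin (added in project/plugins.sbt later in this commit), the sparkVersion and sparkComponents settings stand in for the explicit "provided" Spark dependencies removed above. A minimal sketch of driving the build with the runner added in the next hunk (hypothetical session; package is a stock sbt task, not something this diff defines):

    # Fetches the sbt launcher if needed, then compiles and jars the sources.
    ./build/sbt package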

@@ -0,0 +1,106 @@
#!/usr/bin/env bash

# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so
# that we can run Hive to generate the golden answer. This is not required for normal development
# or testing.
for i in $HIVE_HOME/lib/*
do HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$i
done
export HADOOP_CLASSPATH

realpath () {
(
  TARGET_FILE=$1

  cd $(dirname $TARGET_FILE)
  TARGET_FILE=$(basename $TARGET_FILE)

  COUNT=0
  while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ]
  do
    TARGET_FILE=$(readlink $TARGET_FILE)
    cd $(dirname $TARGET_FILE)
    TARGET_FILE=$(basename $TARGET_FILE)
    COUNT=$(($COUNT + 1))
  done

  echo $(pwd -P)/$TARGET_FILE
)
}

. $(dirname $(realpath $0))/sbt-launch-lib.bash

declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
declare -r sbt_opts_file=".sbtopts"
declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"

usage() {
  cat <<EOM
Usage: $script_name [options]

  -h | -help         print this message
  -v | -verbose      this runner is chattier
  -d | -debug        set sbt log level to debug
  -no-colors         disable ANSI color codes
  -sbt-create        start sbt even if current directory contains no sbt project
  -sbt-dir   <path>  path to global settings/plugins directory (default: ~/.sbt)
  -sbt-boot  <path>  path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
  -ivy       <path>  path to local Ivy repository (default: ~/.ivy2)
  -mem    <integer>  set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem))
  -no-share          use all local caches; no sharing
  -no-global         uses global caches, but does not use global ~/.sbt directory.
  -jvm-debug <port>  Turn on JVM debugging, open at the given port.
  -batch             Disable interactive mode

  # sbt version (default: from project/build.properties if present, else latest release)
  -sbt-version  <version>   use the specified version of sbt
  -sbt-jar      <path>      use the specified jar as the sbt launcher
  -sbt-rc                   use an RC version of sbt
  -sbt-snapshot             use a snapshot version of sbt

  # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
  -java-home <path>         alternate JAVA_HOME

  # jvm options and output control
  JAVA_OPTS          environment variable, if unset uses "$java_opts"
  SBT_OPTS           environment variable, if unset uses "$default_sbt_opts"
  .sbtopts           if this file exists in the current directory, it is
                     prepended to the runner args
  /etc/sbt/sbtopts   if this file exists, it is prepended to the runner args
  -Dkey=val          pass -Dkey=val directly to the java runtime
  -J-X               pass option -X directly to the java runtime
                     (-J is stripped)
  -S-X               add -X to sbt's scalacOptions (-J is stripped)
  -PmavenProfiles    Enable a maven profile for the build.

In the case of duplicated or conflicting options, the order above
shows precedence: JAVA_OPTS lowest, command line options highest.
EOM
}

process_my_args () {
  while [[ $# -gt 0 ]]; do
    case "$1" in
     -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;;
      -no-share) addJava "$noshare_opts" && shift ;;
     -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;;
      -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;;
       -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;;
     -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;;
         -batch) exec </dev/null && shift ;;

    -sbt-create) sbt_create=true && shift ;;

              *) addResidual "$1" && shift ;;
    esac
  done

  # Now, ensure sbt version is used.
  [[ "${sbt_version}XXX" != "XXX" ]] && addJava "-Dsbt.version=$sbt_version"
}

loadConfigFile() {
  cat "$1" | sed '/^\#/d'
}

# if sbtopts files exist, prepend their contents to $@ so it can be processed by this runner
[[ -f "$etc_sbt_opts_file" ]] && set -- $(loadConfigFile "$etc_sbt_opts_file") "$@"
[[ -f "$sbt_opts_file" ]] && set -- $(loadConfigFile "$sbt_opts_file") "$@"

run "$@"
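A few sample invocations using only flags documented in the usage text above (hypothetical sessions, not taken from this diff):

    ./build/sbt -v -mem 4096 clean package   # chattier output; 4096 MB heap via get_mem_opts
    ./build/sbt -jvm-debug 5005 test         # attaches the JDWP agent on port 5005 (addDebugger)
    ./build/sbt -batch -no-colors compile    # non-interactive, no ANSI codes (handy for CI)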

@@ -0,0 +1,195 @@
#!/usr/bin/env bash
#

# A library to simplify using the SBT launcher from other packages.
# Note: This should be used by tools like giter8/conscript etc.

# TODO - Should we merge the main SBT script with this library?

if test -z "$HOME"; then
  declare -r script_dir="$(dirname $script_path)"
else
  declare -r script_dir="$HOME/.sbt"
fi

declare -a residual_args
declare -a java_args
declare -a scalac_args
declare -a sbt_commands
declare -a maven_profiles

if test -x "$JAVA_HOME/bin/java"; then
  echo -e "Using $JAVA_HOME as default JAVA_HOME."
  echo "Note, this will be overridden by -java-home if it is set."
  declare java_cmd="$JAVA_HOME/bin/java"
else
  declare java_cmd=java
fi

echoerr () {
  echo 1>&2 "$@"
}
vlog () {
  [[ $verbose || $debug ]] && echoerr "$@"
}
dlog () {
  [[ $debug ]] && echoerr "$@"
}

acquire_sbt_jar () {
  SBT_VERSION=`awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties`
  URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
  JAR=build/sbt-launch-${SBT_VERSION}.jar

  sbt_jar=$JAR

  if [[ ! -f "$sbt_jar" ]]; then
    # Download sbt launch jar if it hasn't been downloaded yet
    if [ ! -f ${JAR} ]; then
      # Download
      printf "Attempting to fetch sbt\n"
      JAR_DL=${JAR}.part
      if hash curl 2>/dev/null; then
        curl --fail --location --silent ${URL1} > "${JAR_DL}" &&\
          mv "${JAR_DL}" "${JAR}"
      elif hash wget 2>/dev/null; then
        wget --quiet ${URL1} -O "${JAR_DL}" &&\
          mv "${JAR_DL}" "${JAR}"
      else
        printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
        exit -1
      fi
    fi
    if [ ! -f ${JAR} ]; then
      # We failed to download
      printf "Our attempt to download sbt locally to ${JAR} failed. Please install sbt manually from http://www.scala-sbt.org/\n"
      exit -1
    fi
    printf "Launching sbt from ${JAR}\n"
  fi
}

execRunner () {
  # print the arguments one to a line, quoting any containing spaces
  [[ $verbose || $debug ]] && echo "# Executing command line:" && {
    for arg; do
      if printf "%s\n" "$arg" | grep -q ' '; then
        printf "\"%s\"\n" "$arg"
      else
        printf "%s\n" "$arg"
      fi
    done
    echo ""
  }

  exec "$@"
}

addJava () {
  dlog "[addJava] arg = '$1'"
  java_args=( "${java_args[@]}" "$1" )
}

enableProfile () {
  dlog "[enableProfile] arg = '$1'"
  maven_profiles=( "${maven_profiles[@]}" "$1" )
  export SBT_MAVEN_PROFILES="${maven_profiles[@]}"
}

addSbt () {
  dlog "[addSbt] arg = '$1'"
  sbt_commands=( "${sbt_commands[@]}" "$1" )
}
addResidual () {
  dlog "[residual] arg = '$1'"
  residual_args=( "${residual_args[@]}" "$1" )
}
addDebugger () {
  addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1"
}

# a ham-fisted attempt to move some memory settings in concert
# so they need not be dicked around with individually.
get_mem_opts () {
  local mem=${1:-2048}
  local perm=$(( $mem / 4 ))
  (( $perm > 256 )) || perm=256
  (( $perm < 4096 )) || perm=4096
  local codecache=$(( $perm / 2 ))

  echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m"
}

require_arg () {
  local type="$1"
  local opt="$2"
  local arg="$3"
  if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then
    die "$opt requires <$type> argument"
  fi
}

is_function_defined() {
  declare -f "$1" > /dev/null
}

process_args () {
  while [[ $# -gt 0 ]]; do
    case "$1" in
       -h|-help) usage; exit 1 ;;
    -v|-verbose) verbose=1 && shift ;;
      -d|-debug) debug=1 && shift ;;

           -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;;
           -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;;
     -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;;
         -batch) exec </dev/null && shift ;;

       -sbt-jar) require_arg path "$1" "$2" && sbt_jar="$2" && shift 2 ;;
   -sbt-version) require_arg version "$1" "$2" && sbt_version="$2" && shift 2 ;;
     -java-home) require_arg path "$1" "$2" && java_cmd="$2/bin/java" && export JAVA_HOME=$2 && shift 2 ;;

            -D*) addJava "$1" && shift ;;
            -J*) addJava "${1:2}" && shift ;;
            -P*) enableProfile "$1" && shift ;;
              *) addResidual "$1" && shift ;;
    esac
  done

  is_function_defined process_my_args && {
    myargs=("${residual_args[@]}")
    residual_args=()
    process_my_args "${myargs[@]}"
  }
}

run() {
  # no jar? download it.
  [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || {
    # still no jar? uh-oh.
    echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
    exit 1
  }

  # process the combined args, then reset "$@" to the residuals
  process_args "$@"
  set -- "${residual_args[@]}"
  argumentCount=$#

  # run sbt
  execRunner "$java_cmd" \
    ${SBT_OPTS:-$default_sbt_opts} \
    $(get_mem_opts $sbt_mem) \
    ${java_opts} \
    ${java_args[@]} \
    -jar "$sbt_jar" \
    "${sbt_commands[@]}" \
    "${residual_args[@]}"
}

runAlternateBoot() {
  local bootpropsfile="$1"
  shift
  addJava "-Dsbt.boot.properties=$bootpropsfile"
  run $@
}
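To make the arithmetic in get_mem_opts concrete, a worked example computed from the function above: with mem=2048, perm = 2048/4 = 512 (already inside the [256, 4096] clamp) and codecache = 512/2 = 256, so:

    $ get_mem_opts 2048
    -Xms2048m -Xmx2048m -XX:MaxPermSize=512m -XX:ReservedCodeCacheSize=256m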

@@ -0,0 +1,2 @@
// This file should only contain the version of sbt to use.
sbt.version=0.13.6
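This pin is what acquire_sbt_jar in sbt-launch-lib.bash reads before downloading the launcher, roughly:

    # Prints 0.13.6, which selects build/sbt-launch-0.13.6.jar and the matching Bintray URL.
    awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties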

@@ -0,0 +1,4 @@
// You may use this file to add plugin dependencies for sbt.
resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"

addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.2.2")
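The plugin added here is what supplies the spName, sparkVersion, sparkComponents, and spDependencies keys used in build.sbt. A hedged sketch of the packaging tasks it brings (task names are from the sbt-spark-package documentation, not from this diff):

    ./build/sbt spPackage   # assumed plugin task: build the Spark Packages jar
    ./build/sbt spDist      # assumed plugin task: zip artifact (cf. the spDistDirectory comment in build.sbt)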

@@ -0,0 +1,4 @@
# An example MANIFEST file can be found at:
# https://github.com/pypa/sampleproject/blob/master/MANIFEST.in
# For more details about the MANIFEST file, you may read the docs at
# https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template

@@ -0,0 +1 @@
# This file should list any python package dependencies.

@@ -0,0 +1,2 @@
# This file contains the default option values to be used during setup. An
# example can be found at https://github.com/pypa/sampleproject/blob/master/setup.cfg

@@ -0,0 +1,2 @@
# Your python setup file. An example can be found at:
# https://github.com/pypa/sampleproject/blob/master/setup.py

@@ -0,0 +1,2 @@
# This file should list any spark package dependencies as:
# :package_name==:version e.g. databricks/spark-csv==0.1

@@ -0,0 +1 @@
# You may include your unit tests in this file.