Refactored the code to be compatible with the structure required by Spark packages
This commit is contained in:
Родитель
bb2019f3cf
Коммит
690314c8a1
|
@ -1,2 +1,5 @@
|
|||
*.class
|
||||
*.log
|
||||
*.jar
|
||||
__pycache__
|
||||
target
|
||||
|
|
44
build.sbt
44
build.sbt
|
@ -1,43 +1,29 @@
|
|||
val sparkVer = sys.props.getOrElse("spark.version", "2.1.1")
|
||||
val sparkBranch = sparkVer.substring(0, 3)
|
||||
val defaultScalaVer = sparkBranch match {
|
||||
case "2.0" => "2.11.8"
|
||||
case "2.1" => "2.11.8"
|
||||
case "2.2" => "2.11.8"
|
||||
case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
|
||||
}
|
||||
val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer)
|
||||
// Your sbt build file. Guides on how to write one can be found at
|
||||
// http://www.scala-sbt.org/0.13/docs/index.html
|
||||
|
||||
val sparkVersion = "2.1.1"
|
||||
scalaVersion := scalaVer
|
||||
scalaVersion := "2.11.8"
|
||||
|
||||
name := "spark-image"
|
||||
sparkVersion := "2.2.0"
|
||||
|
||||
spName := "microsoft/spark-images"
|
||||
|
||||
// Don't forget to set the version
|
||||
version := "0.1"
|
||||
|
||||
// All Spark Packages need a license
|
||||
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
|
||||
|
||||
|
||||
// Add Spark components this package depends on, e.g, "mllib", ....
|
||||
val sparkComponents = Seq("sql")
|
||||
sparkComponents ++= Seq("sql")
|
||||
|
||||
libraryDependencies ++= Seq(
|
||||
// "%%" for scala things, "%" for plain java things
|
||||
"org.apache.logging.log4j" % "log4j-api" % "2.8.1" % "provided",
|
||||
"org.apache.logging.log4j" % "log4j-core" % "2.8.1" % "provided",
|
||||
"org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1" % "provided",
|
||||
"org.apache.spark" %% "spark-core" % sparkVer % "provided",
|
||||
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
|
||||
"org.scalatest" %% "scalatest" % "3.0.0" % "provided"
|
||||
)
|
||||
|
||||
parallelExecution := false
|
||||
// uncomment and change the value below to change the directory where your zip artifact will be created
|
||||
// spDistDirectory := target.value
|
||||
|
||||
// This fixes a class loader problem with scala.Tuple2 class, scala-2.11, Spark 2.x
|
||||
fork in Test := true
|
||||
|
||||
// This and the next line fix a problem with forked run: https://github.com/scalatest/scalatest/issues/770
|
||||
javaOptions in Test ++= Seq("-Xmx2048m", "-XX:ReservedCodeCacheSize=384m", "-XX:MaxPermSize=384m")
|
||||
|
||||
concurrentRestrictions in Global := Seq(
|
||||
Tags.limitAll(1))
|
||||
|
||||
autoAPIMappings := true
|
||||
// add any Spark Package dependencies using spDependencies.
|
||||
// e.g. spDependencies += "databricks/spark-avro:0.1"
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
#!/usr/bin/env bash
|
||||
|
||||
# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so
|
||||
# that we can run Hive to generate the golden answer. This is not required for normal development
|
||||
# or testing.
|
||||
# Append every entry of $HIVE_HOME/lib to HADOOP_CLASSPATH.
# The loop is skipped when HIVE_HOME is unset or has no lib directory; without
# that guard the pattern degenerates to /lib/* and drags unrelated system
# libraries onto the classpath.
if [[ -n "$HIVE_HOME" && -d "$HIVE_HOME/lib" ]]; then
  for hive_jar in "$HIVE_HOME"/lib/*; do
    # An empty directory leaves the glob unexpanded; ignore that placeholder.
    [[ -e "$hive_jar" ]] || continue
    HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$hive_jar"
  done
fi
export HADOOP_CLASSPATH
|
||||
|
||||
# Print the physical, symlink-free absolute path of the file given as $1.
#
# Symbolic links are followed one hop at a time (at most 100 hops, which guards
# against link cycles). Everything runs inside a subshell so the directory
# changes never affect the caller. All expansions are quoted so that paths
# containing whitespace or glob characters resolve correctly.
realpath () {
  (
    target_file="$1"

    # Work relative to the directory that holds the file; give up if it is unreachable.
    cd "$(dirname "$target_file")" || exit 1
    target_file="$(basename "$target_file")"

    hops=0
    while [ -L "$target_file" ] && [ "$hops" -lt 100 ]
    do
      # Link targets may be relative, so re-anchor on the link's own directory each hop.
      target_file="$(readlink "$target_file")"
      cd "$(dirname "$target_file")" || exit 1
      target_file="$(basename "$target_file")"
      hops=$((hops + 1))
    done

    printf '%s\n' "$(pwd -P)/$target_file"
  )
}
|
||||
|
||||
# Load the launcher library that sits next to this script. Every expansion is
# quoted so an install path containing spaces does not split into several words.
. "$(dirname "$(realpath "$0")")/sbt-launch-lib.bash"
|
||||
|
||||
|
||||
# JVM system properties that keep sbt's global base, boot directory and Ivy home
# inside the project directory (applied by the -no-share option).
readonly noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
# Options file looked up relative to the current directory.
readonly sbt_opts_file=".sbtopts"
# Machine-wide options file.
readonly etc_sbt_opts_file="/etc/sbt/sbtopts"
|
||||
|
||||
# Print the command-line help text to stdout.
#
# Reads (all optional): script_name, sbt_mem, java_opts, default_sbt_opts.
# Calls get_mem_opts and `java -version` to show the effective defaults.
usage() {
  # script_name is not assigned anywhere in this launcher, so fall back to the
  # name the script was invoked as instead of printing an empty program name.
  local program="${script_name:-$(basename "$0")}"
  # Same fallback get_mem_opts applies when it is called without an argument.
  local mem="${sbt_mem:-2048}"
  cat <<EOM
Usage: $program [options]
  -h | -help         print this message
  -v | -verbose      this runner is chattier
  -d | -debug        set sbt log level to debug
  -no-colors         disable ANSI color codes
  -sbt-create        start sbt even if current directory contains no sbt project
  -sbt-dir   <path>  path to global settings/plugins directory (default: ~/.sbt)
  -sbt-boot  <path>  path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
  -ivy       <path>  path to local Ivy repository (default: ~/.ivy2)
  -mem    <integer>  set memory options (default: $mem, which is $(get_mem_opts "$mem"))
  -no-share          use all local caches; no sharing
  -no-global         uses global caches, but does not use global ~/.sbt directory.
  -jvm-debug <port>  Turn on JVM debugging, open at the given port.
  -batch             Disable interactive mode
  # sbt version (default: from project/build.properties if present, else latest release)
  -sbt-version  <version>   use the specified version of sbt
  -sbt-jar      <path>      use the specified jar as the sbt launcher
  -sbt-rc                   use an RC version of sbt
  -sbt-snapshot             use a snapshot version of sbt
  # java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
  -java-home <path>         alternate JAVA_HOME
  # jvm options and output control
  JAVA_OPTS          environment variable, if unset uses "$java_opts"
  SBT_OPTS           environment variable, if unset uses "$default_sbt_opts"
  .sbtopts           if this file exists in the current directory, it is
                     prepended to the runner args
  /etc/sbt/sbtopts   if this file exists, it is prepended to the runner args
  -Dkey=val          pass -Dkey=val directly to the java runtime
  -J-X               pass option -X directly to the java runtime
                     (-J is stripped)
  -S-X               add -X to sbt's scalacOptions (-S is stripped)
  -PmavenProfiles    Enable a maven profile for the build.
In the case of duplicated or conflicting options, the order above
shows precedence: JAVA_OPTS lowest, command line options highest.
EOM
}
|
||||
|
||||
# Second-stage option parser, invoked by process_args with whatever it did not
# recognise. Launcher-specific switches become JVM properties via addJava;
# anything else is handed back through addResidual. Afterwards the requested
# sbt version (if any) is forwarded as -Dsbt.version.
process_my_args () {
  while (( $# > 0 )); do
    case "$1" in
      -no-colors)
        addJava "-Dsbt.log.noformat=true" && shift
        ;;
      -no-share)
        addJava "$noshare_opts" && shift
        ;;
      -no-global)
        addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift
        ;;
      -sbt-boot)
        require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2
        ;;
      -sbt-dir)
        require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2
        ;;
      -debug-inc)
        addJava "-Dxsbt.inc.debug=true" && shift
        ;;
      -batch)
        # Detach stdin so sbt cannot drop into interactive mode.
        exec </dev/null && shift
        ;;
      -sbt-create)
        sbt_create=true && shift
        ;;
      *)
        addResidual "$1" && shift
        ;;
    esac
  done

  # Now, ensure sbt version is used.
  [[ -n "$sbt_version" ]] && addJava "-Dsbt.version=$sbt_version"
}
|
||||
|
||||
# Print the contents of the options file named by $1, dropping every line that
# begins with '#'.
loadConfigFile() {
  local opts_path="$1"
  cat "$opts_path" | sed -e '/^\#/d'
}
|
||||
|
||||
# if sbtopts files exist, prepend their contents to $@ so it can be processed by this runner
|
||||
# The command substitutions are intentionally unquoted: each whitespace-separated
# token in an options file becomes its own positional argument.
if [[ -f "$etc_sbt_opts_file" ]]; then
  set -- $(loadConfigFile "$etc_sbt_opts_file") "$@"
fi
if [[ -f "$sbt_opts_file" ]]; then
  set -- $(loadConfigFile "$sbt_opts_file") "$@"
fi

# Hand the combined argument list to the launcher library.
run "$@"
|
|
@ -0,0 +1,195 @@
|
|||
#!/usr/bin/env bash
|
||||
#
|
||||
|
||||
# A library to simplify using the SBT launcher from other packages.
|
||||
# Note: This should be used by tools like giter8/conscript etc.
|
||||
|
||||
# TODO - Should we merge the main SBT script with this library?
|
||||
|
||||
# script_dir: $HOME/.sbt when HOME is set, otherwise the directory part of
# $script_path. NOTE(review): script_path is not assigned in the visible
# source - confirm where it is expected to come from.
if [[ -n "$HOME" ]]; then
  declare -r script_dir="$HOME/.sbt"
else
  declare -r script_dir="$(dirname $script_path)"
fi
|
||||
|
||||
declare -a residual_args
|
||||
declare -a java_args
|
||||
declare -a scalac_args
|
||||
declare -a sbt_commands
|
||||
declare -a maven_profiles
|
||||
|
||||
# Default to whatever `java` is on PATH; prefer the JAVA_HOME binary when that
# points at an executable. The -java-home option may replace this later.
declare java_cmd=java
if [[ -x "$JAVA_HOME/bin/java" ]]; then
  echo -e "Using $JAVA_HOME as default JAVA_HOME."
  echo "Note, this will be overridden by -java-home if it is set."
  java_cmd="$JAVA_HOME/bin/java"
fi
|
||||
|
||||
# Like echo, but writes to standard error.
echoerr () {
  echo "$@" >&2
}
|
||||
# Emit a message on stderr when either $verbose or $debug is non-empty.
# Returns non-zero when nothing was printed.
vlog () {
  [[ -n $verbose || -n $debug ]] && echoerr "$@"
}
|
||||
# Emit a message on stderr only when $debug is non-empty.
# Returns non-zero when nothing was printed.
dlog () {
  [[ -n $debug ]] && echoerr "$@"
}
|
||||
|
||||
# Make sure the sbt launcher jar for this project exists locally.
#
# Reads the sbt version from ./project/build.properties, sets the global
# sbt_jar to build/sbt-launch-<version>.jar and, if that file is missing,
# downloads it with curl or wget. The download goes to a ".part" file first so
# an interrupted transfer never leaves a truncated jar under the final name.
#
# Globals written: SBT_VERSION, URL1, JAR, JAR_DL, sbt_jar.
# Exits the shell with status 1 when the version cannot be determined, no
# download tool is available, or the download fails.
acquire_sbt_jar () {
  SBT_VERSION=$(awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties)
  if [[ -z "$SBT_VERSION" ]]; then
    printf "Could not determine sbt.version from ./project/build.properties\n"
    exit 1
  fi
  # NOTE(review): Bintray hosting has reportedly been shut down - confirm this
  # URL still resolves, or point it at a mirror that carries this sbt version.
  URL1="https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar"
  JAR="build/sbt-launch-${SBT_VERSION}.jar"

  sbt_jar="$JAR"

  if [[ ! -f "$sbt_jar" ]]; then
    # Download sbt launch jar if it hasn't been downloaded yet
    printf "Attempting to fetch sbt\n"
    JAR_DL="${JAR}.part"
    # The redirect below cannot create missing parent directories.
    mkdir -p "$(dirname "$JAR")"
    if hash curl 2>/dev/null; then
      curl --fail --location --silent "${URL1}" > "${JAR_DL}" &&\
        mv "${JAR_DL}" "${JAR}"
    elif hash wget 2>/dev/null; then
      wget --quiet "${URL1}" -O "${JAR_DL}" &&\
        mv "${JAR_DL}" "${JAR}"
    else
      printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
      exit 1
    fi

    if [[ ! -f "$JAR" ]]; then
      # We failed to download; remove the partial file so it is not mistaken for progress.
      rm -f "${JAR_DL}"
      printf "Our attempt to download sbt locally to %s failed. Please install sbt manually from http://www.scala-sbt.org/\n" "${JAR}"
      exit 1
    fi
    printf "Launching sbt from %s\n" "${JAR}"
  fi
}
|
||||
|
||||
# Replace the current shell with the command given as arguments.
# When $verbose or $debug is set, the command line is first echoed to stdout,
# one argument per line, with double quotes around any argument that contains
# a space.
execRunner () {
  if [[ $verbose || $debug ]]; then
    echo "# Executing command line:"
    local word
    for word in "$@"; do
      if [[ "$word" == *' '* ]]; then
        printf "\"%s\"\n" "$word"
      else
        printf "%s\n" "$word"
      fi
    done
    echo ""
  fi

  exec "$@"
}
|
||||
|
||||
# Append $1 as one element to the java_args array (JVM options for the launch).
addJava () {
  dlog "[addJava] arg = '$1'"
  java_args+=( "$1" )
}
|
||||
|
||||
# Record $1 in the maven_profiles array and re-export the whole list,
# space-separated, as SBT_MAVEN_PROFILES.
enableProfile () {
  dlog "[enableProfile] arg = '$1'"
  maven_profiles+=( "$1" )
  export SBT_MAVEN_PROFILES="${maven_profiles[@]}"
}
|
||||
|
||||
# Append $1 as one element to the sbt_commands array.
addSbt () {
  dlog "[addSbt] arg = '$1'"
  sbt_commands+=( "$1" )
}
|
||||
# Append $1 as one element to the residual_args array (arguments this launcher
# did not consume itself).
addResidual () {
  dlog "[residual] arg = '$1'"
  residual_args+=( "$1" )
}
|
||||
# Register the JDWP options that open a debug socket on port $1.
# Both flags are stored as a single java_args element.
addDebugger () {
  local jdwp_port="$1"
  addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=${jdwp_port}"
}
|
||||
|
||||
# a ham-fisted attempt to move some memory settings in concert
|
||||
# so they need not be adjusted individually.
|
||||
# Print the JVM memory flags derived from a heap size in megabytes.
#   $1  heap size in MB (default 2048)
# The perm-gen size is a quarter of the heap, clamped to the range 256..4096,
# and the code cache is half of the perm-gen size.
get_mem_opts () {
  local heap_mb=${1:-2048}
  local perm_mb=$(( $heap_mb / 4 ))

  if (( $perm_mb <= 256 )); then
    perm_mb=256
  fi
  if (( $perm_mb >= 4096 )); then
    perm_mb=4096
  fi

  local cache_mb=$(( $perm_mb / 2 ))

  echo "-Xms${heap_mb}m -Xmx${heap_mb}m -XX:MaxPermSize=${perm_mb}m -XX:ReservedCodeCacheSize=${cache_mb}m"
}
|
||||
|
||||
# Validate that an option received a value.
#   $1  human-readable kind of value expected (e.g. "path"), used in the message
#   $2  the option being parsed
#   $3  the candidate value
# A value that is empty or starts with '-' is rejected. Rejection goes through
# `die` when the including script defines it; otherwise the message is written
# to stderr and the shell exits with status 1. Without that fallback a missing
# `die` only produced "command not found", the caller never shifted, and the
# option-parsing loop spun forever on the same argument.
require_arg () {
  local type="$1"
  local opt="$2"
  local arg="$3"
  if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then
    local message="$opt requires <$type> argument"
    if declare -F die > /dev/null; then
      die "$message"
    else
      echo "$message" 1>&2
      exit 1
    fi
  fi
}
|
||||
|
||||
# Succeed (status 0) when $1 names a currently defined shell function.
# Produces no output.
is_function_defined() {
  declare -F "$1" > /dev/null
}
|
||||
|
||||
# First-stage option parser.
# Recognised switches update launcher state (verbose, debug, sbt_mem, sbt_jar,
# sbt_version, java_cmd, JAVA_HOME) or are collected through addJava /
# enableProfile / addDebugger; everything else is gathered with addResidual.
# If the including script defines process_my_args, the residual list is then
# passed through it for a second round of parsing.
process_args () {
  while (( $# > 0 )); do
    case "$1" in
      -h|-help)
        usage
        exit 1
        ;;
      -v|-verbose)
        verbose=1 && shift
        ;;
      -d|-debug)
        debug=1 && shift
        ;;

      -ivy)
        require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2
        ;;
      -mem)
        require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2
        ;;
      -jvm-debug)
        require_arg port "$1" "$2" && addDebugger $2 && shift 2
        ;;
      -batch)
        # Detach stdin so sbt cannot drop into interactive mode.
        exec </dev/null && shift
        ;;

      -sbt-jar)
        require_arg path "$1" "$2" && sbt_jar="$2" && shift 2
        ;;
      -sbt-version)
        require_arg version "$1" "$2" && sbt_version="$2" && shift 2
        ;;
      -java-home)
        require_arg path "$1" "$2" && java_cmd="$2/bin/java" && export JAVA_HOME=$2 && shift 2
        ;;

      -D*)
        addJava "$1" && shift
        ;;
      -J*)
        # Strip the leading "-J" and pass the remainder to the JVM.
        addJava "${1:2}" && shift
        ;;
      -P*)
        enableProfile "$1" && shift
        ;;
      *)
        addResidual "$1" && shift
        ;;
    esac
  done

  # Let the including script consume its own switches from what is left over.
  is_function_defined process_my_args && {
    myargs=("${residual_args[@]}")
    residual_args=()
    process_my_args "${myargs[@]}"
  }
}
|
||||
|
||||
# Entry point: make sure a launcher jar is available, parse the arguments and
# exec the JVM through execRunner.
#
# The SBT_OPTS / get_mem_opts / java_opts / java_args expansions are left
# unquoted on purpose: several callers (addDebugger, -no-share) store more than
# one flag in a single string and rely on word splitting here.
run() {
  # no jar? download it.
  if [[ ! -f "$sbt_jar" ]] && ! acquire_sbt_jar "$sbt_version"; then
    # still no jar? uh-oh.
    echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
    exit 1
  fi

  # process the combined args, then reset "$@" to the residuals
  process_args "$@"
  set -- "${residual_args[@]}"
  argumentCount=$#

  # Assemble the full JVM command line, then run sbt.
  local -a launch_cmd
  launch_cmd=(
    "$java_cmd"
    ${SBT_OPTS:-$default_sbt_opts}
    $(get_mem_opts $sbt_mem)
    ${java_opts}
    ${java_args[@]}
    -jar "$sbt_jar"
    "${sbt_commands[@]}"
    "${residual_args[@]}"
  )
  execRunner "${launch_cmd[@]}"
}
|
||||
|
||||
# Launch sbt with an alternate boot properties file.
#   $1    path to the boot properties file (passed as -Dsbt.boot.properties)
#   $2..  arguments forwarded unchanged to run
# "$@" is quoted so that arguments containing whitespace reach run intact
# instead of being split into several words.
runAlternateBoot() {
  local bootpropsfile="$1"
  shift
  addJava "-Dsbt.boot.properties=$bootpropsfile"
  run "$@"
}
|
|
@ -0,0 +1,2 @@
|
|||
# This file should only contain the version of sbt to use.
|
||||
sbt.version=0.13.6
|
|
@ -0,0 +1,4 @@
|
|||
// You may use this file to add plugin dependencies for sbt.
|
||||
resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"
|
||||
|
||||
addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.2.2")
|
|
@ -0,0 +1,4 @@
|
|||
# An example MANIFEST file can be found at:
|
||||
# https://github.com/pypa/sampleproject/blob/master/MANIFEST.in
|
||||
# For more details about the MANIFEST file, you may read the docs at
|
||||
# https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template
|
|
@ -0,0 +1 @@
|
|||
# This file should list any python package dependencies.
|
|
@ -0,0 +1,2 @@
|
|||
# This file contains the default option values to be used during setup. An
|
||||
# example can be found at https://github.com/pypa/sampleproject/blob/master/setup.cfg
|
|
@ -0,0 +1,2 @@
|
|||
# Your python setup file. An example can be found at:
|
||||
# https://github.com/pypa/sampleproject/blob/master/setup.py
|
|
@ -0,0 +1,2 @@
|
|||
# This file should list any spark package dependencies as:
|
||||
# :package_name==:version e.g. databricks/spark-csv==0.1
|
|
@ -0,0 +1 @@
|
|||
# You may include your unit tests in this file.
|
Загрузка…
Ссылка в новой задаче