Refactored the code to be compatible with the structure required by Spark packages

This commit is contained in:
Sudarshan 2017-10-02 17:11:58 -04:00
Родитель bb2019f3cf
Коммит 690314c8a1
14 изменённых файлов: 337 добавлений и 29 удалений

3
.gitignore поставляемый
Просмотреть файл

@ -1,2 +1,5 @@
*.class
*.log
*.jar
__pycache__
target

Просмотреть файл

@ -1,43 +1,29 @@
// NOTE(review): this region looks like the removed and added lines of a diff
// interleaved without +/- markers (e.g. two `scalaVersion :=` settings, and both
// `val sparkVersion` and `sparkVersion :=`). Confirm against the real build.sbt
// before relying on any single line here.

// Spark version is read from the `spark.version` system property, defaulting to 2.1.1.
val sparkVer = sys.props.getOrElse("spark.version", "2.1.1")
// First three characters of the version (e.g. "2.1") select the default Scala version.
val sparkBranch = sparkVer.substring(0, 3)
// Any branch not listed below is rejected with an IllegalArgumentException.
val defaultScalaVer = sparkBranch match {
case "2.0" => "2.11.8"
case "2.1" => "2.11.8"
case "2.2" => "2.11.8"
case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.")
}
// The `scala.version` system property overrides the per-branch default.
val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer)
// Your sbt build file. Guides on how to write one can be found at
// http://www.scala-sbt.org/0.13/docs/index.html
val sparkVersion = "2.1.1"
scalaVersion := scalaVer
scalaVersion := "2.11.8"
name := "spark-image"
sparkVersion := "2.2.0"
spName := "microsoft/spark-images"
// Don't forget to set the version
version := "0.1"
// All Spark Packages need a license
licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0"))
// Add Spark components this package depends on, e.g, "mllib", ....
val sparkComponents = Seq("sql")
sparkComponents ++= Seq("sql")
// Every dependency below is declared in the "provided" configuration.
// NOTE(review): scalatest is also "provided" here; presumably it should be "test" — confirm.
libraryDependencies ++= Seq(
// "%%" for scala things, "%" for plain java things
"org.apache.logging.log4j" % "log4j-api" % "2.8.1" % "provided",
"org.apache.logging.log4j" % "log4j-core" % "2.8.1" % "provided",
"org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1" % "provided",
"org.apache.spark" %% "spark-core" % sparkVer % "provided",
"org.apache.spark" %% "spark-mllib" % sparkVer % "provided",
"org.scalatest" %% "scalatest" % "3.0.0" % "provided"
)
parallelExecution := false
// uncomment and change the value below to change the directory where your zip artifact will be created
// spDistDirectory := target.value
// This fixes a class loader problem with scala.Tuple2 class, scala-2.11, Spark 2.x
fork in Test := true
// This and the next line fix a problem with forked run: https://github.com/scalatest/scalatest/issues/770
javaOptions in Test ++= Seq("-Xmx2048m", "-XX:ReservedCodeCacheSize=384m", "-XX:MaxPermSize=384m")
// Limit all tagged tasks to one at a time across the whole build.
concurrentRestrictions in Global := Seq(
Tags.limitAll(1))
autoAPIMappings := true
// add any Spark Package dependencies using spDependencies.
// e.g. spDependencies += "databricks/spark-avro:0.1"

106
build/sbt Executable file
Просмотреть файл

@ -0,0 +1,106 @@
#!/usr/bin/env bash
# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so
# that we can run Hive to generate the golden answer. This is not required for normal development
# or testing.
# Append every entry of $HIVE_HOME/lib to HADOOP_CLASSPATH.
# Guarded so that an unset/empty HIVE_HOME does not expand to the system's
# /lib/* and pollute the classpath, and so that an unmatched glob (empty or
# missing lib directory) is not appended as a literal pattern.
if [[ -n "$HIVE_HOME" ]]; then
  for i in "$HIVE_HOME"/lib/*; do
    [[ -e "$i" ]] || continue
    HADOOP_CLASSPATH="$HADOOP_CLASSPATH:$i"
  done
fi
export HADOOP_CLASSPATH
# realpath FILE
#
# Print the physical absolute path of FILE, following a chain of symbolic
# links for at most 100 hops. Runs in a subshell so the `cd` calls and the
# working variables do not leak into the caller.
#
# All expansions are quoted so that paths containing whitespace work.
realpath () {
(
  TARGET_FILE="$1"

  cd "$(dirname "$TARGET_FILE")"
  TARGET_FILE="$(basename "$TARGET_FILE")"

  COUNT=0
  while [[ -L "$TARGET_FILE" && $COUNT -lt 100 ]]
  do
      # A relative link target is resolved against the directory we are in now.
      TARGET_FILE="$(readlink "$TARGET_FILE")"
      cd "$(dirname "$TARGET_FILE")"
      TARGET_FILE="$(basename "$TARGET_FILE")"
      COUNT=$(($COUNT + 1))
  done

  printf '%s\n' "$(pwd -P)/$TARGET_FILE"
)
}
# Source the helper library that lives next to this script. The path is quoted
# so the launcher is still found when the checkout directory contains spaces.
. "$(dirname "$(realpath "$0")")"/sbt-launch-lib.bash

# JVM properties passed for -no-share: keep sbt's global base, boot directory
# and Ivy home under ./project.
declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy"
# Option files whose contents are prepended to the runner arguments.
declare -r sbt_opts_file=".sbtopts"
declare -r etc_sbt_opts_file="/etc/sbt/sbtopts"
# usage
#
# Print the command-line help to stdout. The here-document is unquoted, so the
# embedded expansions run when help is printed: `get_mem_opts` and
# `java -version` are both executed at that point.
#
# The script name falls back to the basename of $0 when `script_name` is not
# set, so the first line never reads "Usage:  [options]".
usage() {
 cat <<EOM
Usage: ${script_name:-$(basename "$0")} [options]
-h | -help print this message
-v | -verbose this runner is chattier
-d | -debug set sbt log level to debug
-no-colors disable ANSI color codes
-sbt-create start sbt even if current directory contains no sbt project
-sbt-dir <path> path to global settings/plugins directory (default: ~/.sbt)
-sbt-boot <path> path to shared boot directory (default: ~/.sbt/boot in 0.11 series)
-ivy <path> path to local Ivy repository (default: ~/.ivy2)
-mem <integer> set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem))
-no-share use all local caches; no sharing
-no-global uses global caches, but does not use global ~/.sbt directory.
-jvm-debug <port> Turn on JVM debugging, open at the given port.
-batch Disable interactive mode
# sbt version (default: from project/build.properties if present, else latest release)
-sbt-version <version> use the specified version of sbt
-sbt-jar <path> use the specified jar as the sbt launcher
-sbt-rc use an RC version of sbt
-sbt-snapshot use a snapshot version of sbt
# java version (default: java from PATH, currently $(java -version 2>&1 | grep version))
-java-home <path> alternate JAVA_HOME
# jvm options and output control
JAVA_OPTS environment variable, if unset uses "$java_opts"
SBT_OPTS environment variable, if unset uses "$default_sbt_opts"
.sbtopts if this file exists in the current directory, it is
prepended to the runner args
/etc/sbt/sbtopts if this file exists, it is prepended to the runner args
-Dkey=val pass -Dkey=val directly to the java runtime
-J-X pass option -X directly to the java runtime
(-J is stripped)
-S-X add -X to sbt's scalacOptions (-S is stripped)
-PmavenProfiles Enable a maven profile for the build.
In the case of duplicated or conflicting options, the order above
shows precedence: JAVA_OPTS lowest, command line options highest.
EOM
}
# process_my_args ARGS...
#
# Handle the options specific to this runner. Each recognised option is turned
# into a JVM property through addJava (or sets sbt_create); anything else is
# handed to addResidual. Once all arguments are consumed, a non-empty
# $sbt_version is forwarded as -Dsbt.version.
process_my_args () {
  while (( $# > 0 )); do
    case "$1" in
      -no-colors)
        addJava "-Dsbt.log.noformat=true" && shift
        ;;
      -no-share)
        addJava "$noshare_opts" && shift
        ;;
      -no-global)
        addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift
        ;;
      -sbt-boot)
        require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2
        ;;
      -sbt-dir)
        require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2
        ;;
      -debug-inc)
        addJava "-Dxsbt.inc.debug=true" && shift
        ;;
      -batch)
        exec </dev/null && shift
        ;;
      -sbt-create)
        sbt_create=true && shift
        ;;
      *)
        addResidual "$1" && shift
        ;;
    esac
  done

  # Now, ensure sbt version is used.
  [[ -n "$sbt_version" ]] && addJava "-Dsbt.version=$sbt_version"
}
# loadConfigFile FILE
#
# Print FILE with every line that starts with '#' removed.
# sed reads the file directly (no intermediate `cat`), so an unreadable or
# missing file yields a non-zero status instead of being hidden behind the
# pipeline's successful sed.
loadConfigFile() {
  sed '/^\#/d' "$1"
}
# When an sbtopts file is present, its (comment-stripped) contents are placed
# in front of the command-line arguments. The command substitution is left
# unquoted on purpose: each whitespace-separated word becomes its own argument.
if [[ -f "$etc_sbt_opts_file" ]]; then
  set -- $(loadConfigFile "$etc_sbt_opts_file") "$@"
fi

if [[ -f "$sbt_opts_file" ]]; then
  set -- $(loadConfigFile "$sbt_opts_file") "$@"
fi

run "$@"

195
build/sbt-launch-lib.bash Executable file
Просмотреть файл

@ -0,0 +1,195 @@
#!/usr/bin/env bash
#
# A library to simplify using the SBT launcher from other packages.
# Note: This should be used by tools like giter8/conscript etc.
# TODO - Should we merge the main SBT script with this library?
# script_dir: directory of $script_path when HOME is empty, otherwise ~/.sbt.
if [[ -z "$HOME" ]]; then
  declare -r script_dir="$(dirname $script_path)"
else
  declare -r script_dir="$HOME/.sbt"
fi

# Accumulators filled in while the command line is parsed.
declare -a residual_args
declare -a java_args
declare -a scalac_args
declare -a sbt_commands
declare -a maven_profiles

# Use the java binary under JAVA_HOME when it is executable; otherwise rely on PATH.
if [[ -x "$JAVA_HOME/bin/java" ]]; then
  echo -e "Using $JAVA_HOME as default JAVA_HOME."
  echo "Note, this will be overridden by -java-home if it is set."
  declare java_cmd="$JAVA_HOME/bin/java"
else
  declare java_cmd=java
fi
# echoerr ARGS...
# Write the arguments to standard error, exactly as `echo` would format them.
echoerr () {
  echo "$@" >&2
}
# vlog ARGS...
# Emit the message on stderr when either $verbose or $debug is non-empty;
# otherwise return 1 without printing.
vlog () {
  [[ -n $verbose || -n $debug ]] || return 1
  echoerr "$@"
}
# dlog ARGS...
# Emit the message on stderr only when $debug is non-empty; otherwise return 1
# without printing.
dlog () {
  [[ -n $debug ]] || return 1
  echoerr "$@"
}
# acquire_sbt_jar
#
# Make sure the sbt launcher jar for the version named in
# ./project/build.properties exists under build/, downloading it with curl or
# wget when it is missing, and point $sbt_jar at it.
#
# Sets the globals SBT_VERSION, URL1, JAR and sbt_jar.
# Returns 1 when no sbt version can be read from build.properties.
# Exits 1 when neither curl nor wget is available, or when the download did
# not produce the jar.
acquire_sbt_jar () {
  # With $(...) the awk program is passed verbatim, so the dot is escaped with a
  # single backslash (inside backquotes the same regex needs `\\.`).
  SBT_VERSION=$(awk -F "=" '/sbt\.version/ {print $2}' ./project/build.properties)
  if [[ -z "$SBT_VERSION" ]]; then
    printf "Could not determine sbt.version from ./project/build.properties\n" 1>&2
    return 1
  fi

  # NOTE(review): confirm this download host is still reachable.
  URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar
  JAR=build/sbt-launch-${SBT_VERSION}.jar

  sbt_jar=$JAR

  if [[ ! -f "$sbt_jar" ]]; then
    # Download sbt launch jar if it hasn't been downloaded yet
    printf "Attempting to fetch sbt\n"
    JAR_DL="${JAR}.part"
    if hash curl 2>/dev/null; then
      curl --fail --location --silent "${URL1}" > "${JAR_DL}" &&\
        mv "${JAR_DL}" "${JAR}"
    elif hash wget 2>/dev/null; then
      wget --quiet "${URL1}" -O "${JAR_DL}" &&\
        mv "${JAR_DL}" "${JAR}"
    else
      printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n"
      exit 1
    fi

    # A failed transfer leaves the temporary file behind; do not keep it around.
    rm -f "${JAR_DL}"

    if [[ ! -f "${JAR}" ]]; then
      # We failed to download
      printf "Our attempt to download sbt locally to %s failed. Please install sbt manually from http://www.scala-sbt.org/\n" "${JAR}"
      exit 1
    fi
    printf "Launching sbt from %s\n" "${JAR}"
  fi
}
# execRunner CMD [ARGS...]
#
# Replace the current process with CMD. When $verbose or $debug is non-empty,
# the command line is first printed to stdout, one argument per line, with any
# argument that contains a space wrapped in double quotes.
execRunner () {
  if [[ -n $verbose || -n $debug ]]; then
    echo "# Executing command line:" && {
      local word
      for word in "$@"; do
        case "$word" in
          *' '*) printf "\"%s\"\n" "$word" ;;
          *)     printf "%s\n" "$word" ;;
        esac
      done
      echo ""
    }
  fi

  exec "$@"
}
# addJava ARG
# Append ARG as one element to the java_args array (logged through dlog).
addJava () {
  dlog "[addJava] arg = '$1'"
  java_args+=("$1")
}
# enableProfile ARG
# Append ARG to maven_profiles and re-export SBT_MAVEN_PROFILES as the
# space-joined list of everything collected so far.
enableProfile () {
  dlog "[enableProfile] arg = '$1'"
  maven_profiles+=("$1")
  SBT_MAVEN_PROFILES="${maven_profiles[@]}"
  export SBT_MAVEN_PROFILES
}
# addSbt ARG
# Append ARG as one element to the sbt_commands array (logged through dlog).
addSbt () {
  dlog "[addSbt] arg = '$1'"
  sbt_commands+=("$1")
}
# addResidual ARG
# Append ARG as one element to the residual_args array (logged through dlog).
addResidual () {
  dlog "[residual] arg = '$1'"
  residual_args+=("$1")
}
# addDebugger PORT
# Register the JDWP options for PORT through addJava. Both flags are passed as
# ONE string, exactly as before.
addDebugger () {
  local jdwp_opts="-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1"
  addJava "$jdwp_opts"
}
# get_mem_opts [MEM_MB]
#
# Derive a set of related JVM memory flags from one number so they do not have
# to be tuned individually. MEM_MB (default 2048) becomes both -Xms and -Xmx;
# MaxPermSize is a quarter of it, clamped to the range 256..4096; the reserved
# code cache is half of MaxPermSize. The flags are printed on a single line.
get_mem_opts () {
  local heap_mb=${1:-2048}
  local perm_mb=$(( $heap_mb / 4 ))

  if (( perm_mb <= 256 )); then
    perm_mb=256
  elif (( perm_mb >= 4096 )); then
    perm_mb=4096
  fi

  local cache_mb=$(( perm_mb / 2 ))

  printf '%s\n' "-Xms${heap_mb}m -Xmx${heap_mb}m -XX:MaxPermSize=${perm_mb}m -XX:ReservedCodeCacheSize=${cache_mb}m"
}
# require_arg TYPE OPT ARG
#
# Validate that option OPT was followed by a value. ARG is rejected when it is
# empty or begins with '-' (i.e. looks like the next option). On rejection an
# error naming OPT and the expected TYPE is written to stderr and the shell
# exits with status 1.
#
# The error is reported inline rather than through a separate `die` helper, so
# a missing value always terminates the script instead of returning to the
# caller's option loop with nothing shifted.
require_arg () {
  local type="$1"
  local opt="$2"
  local arg="$3"
  if [[ -z "$arg" || "${arg:0:1}" == "-" ]]; then
    echo "$opt requires <$type> argument" 1>&2
    exit 1
  fi
}
# is_function_defined NAME
# Succeeds (status 0) when NAME is a defined shell function; prints nothing.
is_function_defined() {
  declare -F "$1" > /dev/null
}
# process_args ARGS...
#
# Parse the options shared by all launchers. Recognised options update the
# corresponding globals (verbose, debug, sbt_mem, sbt_jar, sbt_version,
# java_cmd, JAVA_HOME) or are forwarded to addJava / enableProfile /
# addDebugger; everything else is collected with addResidual.
#
# If the including script defines process_my_args, the residual arguments are
# then handed to it for a second pass, after residual_args has been reset.
process_args () {
  while (( $# > 0 )); do
    case "$1" in
      -h|-help)
        usage; exit 1
        ;;
      -v|-verbose)
        verbose=1 && shift
        ;;
      -d|-debug)
        debug=1 && shift
        ;;
      -ivy)
        require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2
        ;;
      -mem)
        require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2
        ;;
      -jvm-debug)
        require_arg port "$1" "$2" && addDebugger $2 && shift 2
        ;;
      -batch)
        exec </dev/null && shift
        ;;
      -sbt-jar)
        require_arg path "$1" "$2" && sbt_jar="$2" && shift 2
        ;;
      -sbt-version)
        require_arg version "$1" "$2" && sbt_version="$2" && shift 2
        ;;
      -java-home)
        require_arg path "$1" "$2" && java_cmd="$2/bin/java" && export JAVA_HOME=$2 && shift 2
        ;;
      -D*)
        addJava "$1" && shift
        ;;
      -J*)
        # Strip the leading "-J" and pass the rest to the JVM.
        addJava "${1:2}" && shift
        ;;
      -P*)
        enableProfile "$1" && shift
        ;;
      *)
        addResidual "$1" && shift
        ;;
    esac
  done

  is_function_defined process_my_args && {
    myargs=("${residual_args[@]}")
    residual_args=()
    process_my_args "${myargs[@]}"
  }
}
# run ARGS...
#
# Entry point: parse ARGS, make sure a launcher jar is available, then exec
# the JVM on it.
#
# Options are parsed BEFORE the jar is acquired, so that `-sbt-jar <path>`
# pointing at an existing jar skips the download and `-h` prints help without
# fetching anything.
#
# Exits 1 when no launcher jar could be obtained.
run() {
  # process the combined args, then reset "$@" to the residuals
  process_args "$@"
  set -- "${residual_args[@]}"
  argumentCount=$#

  # no jar? download it.
  [[ -f "$sbt_jar" ]] || acquire_sbt_jar "$sbt_version" || {
    # still no jar? uh-oh.
    echo "Download failed. Obtain the sbt-launch.jar manually and place it at $sbt_jar"
    exit 1
  }

  # run sbt
  # NOTE: SBT_OPTS, the get_mem_opts output, java_opts and java_args are left
  # unquoted on purpose: several callers add multiple space-separated flags as
  # one string (e.g. addDebugger, -no-share) and rely on word splitting here.
  execRunner "$java_cmd" \
    ${SBT_OPTS:-$default_sbt_opts} \
    $(get_mem_opts $sbt_mem) \
    ${java_opts} \
    ${java_args[@]} \
    -jar "$sbt_jar" \
    "${sbt_commands[@]}" \
    "${residual_args[@]}"
}
# runAlternateBoot BOOT_PROPS_FILE [ARGS...]
#
# Run sbt with -Dsbt.boot.properties pointing at BOOT_PROPS_FILE. The remaining
# arguments are forwarded to `run` quoted, so an argument containing whitespace
# stays a single argument.
runAlternateBoot() {
  local bootpropsfile="$1"
  shift
  addJava "-Dsbt.boot.properties=$bootpropsfile"
  run "$@"
}

2
project/build.properties Normal file
Просмотреть файл

@ -0,0 +1,2 @@
# This file should only contain the version of sbt to use.
sbt.version=0.13.6

4
project/plugins.sbt Normal file
Просмотреть файл

@ -0,0 +1,4 @@
// You may use this file to add plugin dependencies for sbt.
resolvers += "bintray-spark-packages" at "https://dl.bintray.com/spark-packages/maven/"
addSbtPlugin("org.spark-packages" %% "sbt-spark-package" % "0.2.2")

4
python/MANIFEST.in Normal file
Просмотреть файл

@ -0,0 +1,4 @@
# An example MANIFEST file can be found at:
# https://github.com/pypa/sampleproject/blob/master/MANIFEST.in
# For more details about the MANIFEST file, you may read the docs at
# https://docs.python.org/2/distutils/sourcedist.html#the-manifest-in-template

1
python/requirements.txt Normal file
Просмотреть файл

@ -0,0 +1 @@
# This file should list any python package dependencies.

2
python/setup.cfg Normal file
Просмотреть файл

@ -0,0 +1,2 @@
# This file contains the default option values to be used during setup. An
# example can be found at https://github.com/pypa/sampleproject/blob/master/setup.cfg

2
python/setup.py Normal file
Просмотреть файл

@ -0,0 +1,2 @@
# Your python setup file. An example can be found at:
# https://github.com/pypa/sampleproject/blob/master/setup.py

Просмотреть файл

@ -0,0 +1,2 @@
# This file should list any spark package dependencies as:
# :package_name==:version e.g. databricks/spark-csv==0.1

Просмотреть файл

1
python/tests.py Normal file
Просмотреть файл

@ -0,0 +1 @@
# You may include your unit tests in this file.