From 690314c8a186f35ebc80f8822358ef8daf2fe1e7 Mon Sep 17 00:00:00 2001 From: Sudarshan Date: Mon, 2 Oct 2017 17:11:58 -0400 Subject: [PATCH] Refactored the code to be compatible with the structure required by Spark packages --- .gitignore | 3 + build.sbt | 44 ++-- build/sbt | 106 ++++++++++ build/sbt-launch-lib.bash | 195 ++++++++++++++++++ project/build.properties | 2 + project/plugins.sbt | 4 + python/MANIFEST.in | 4 + python/requirements.txt | 1 + python/setup.cfg | 2 + python/setup.py | 2 + python/spark-package-deps.txt | 2 + .../sparkimages}/ImageSchema.py | 0 .../sparkimages}/__init__.py | 0 python/tests.py | 1 + 14 files changed, 337 insertions(+), 29 deletions(-) create mode 100755 build/sbt create mode 100755 build/sbt-launch-lib.bash create mode 100644 project/build.properties create mode 100644 project/plugins.sbt create mode 100644 python/MANIFEST.in create mode 100644 python/requirements.txt create mode 100644 python/setup.cfg create mode 100644 python/setup.py create mode 100644 python/spark-package-deps.txt rename {src/main/python/sparkimage => python/sparkimages}/ImageSchema.py (100%) rename {src/main/python/sparkimage => python/sparkimages}/__init__.py (100%) create mode 100644 python/tests.py diff --git a/.gitignore b/.gitignore index 9c07d4a..89fe80f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,5 @@ *.class *.log +*.jar +__pycache__ +target diff --git a/build.sbt b/build.sbt index 27b1f7f..960668b 100644 --- a/build.sbt +++ b/build.sbt @@ -1,43 +1,29 @@ -val sparkVer = sys.props.getOrElse("spark.version", "2.1.1") -val sparkBranch = sparkVer.substring(0, 3) -val defaultScalaVer = sparkBranch match { - case "2.0" => "2.11.8" - case "2.1" => "2.11.8" - case "2.2" => "2.11.8" - case _ => throw new IllegalArgumentException(s"Unsupported Spark version: $sparkVer.") -} -val scalaVer = sys.props.getOrElse("scala.version", defaultScalaVer) +// Your sbt build file. 
Guides on how to write one can be found at +// http://www.scala-sbt.org/0.13/docs/index.html -val sparkVersion = "2.1.1" -scalaVersion := scalaVer +scalaVersion := "2.11.8" -name := "spark-image" +sparkVersion := "2.2.0" + +spName := "microsoft/spark-images" + +// Don't forget to set the version version := "0.1" +// All Spark Packages need a license licenses := Seq("Apache-2.0" -> url("http://opensource.org/licenses/Apache-2.0")) + // Add Spark components this package depends on, e.g, "mllib", .... -val sparkComponents = Seq("sql") +sparkComponents ++= Seq("sql") libraryDependencies ++= Seq( // "%%" for scala things, "%" for plain java things - "org.apache.logging.log4j" % "log4j-api" % "2.8.1" % "provided", - "org.apache.logging.log4j" % "log4j-core" % "2.8.1" % "provided", - "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1" % "provided", - "org.apache.spark" %% "spark-core" % sparkVer % "provided", - "org.apache.spark" %% "spark-mllib" % sparkVer % "provided", "org.scalatest" %% "scalatest" % "3.0.0" % "provided" ) -parallelExecution := false +// uncomment and change the value below to change the directory where your zip artifact will be created +// spDistDirectory := target.value -// This fixes a class loader problem with scala.Tuple2 class, scala-2.11, Spark 2.x -fork in Test := true - -// This and the next line fix a problem with forked run: https://github.com/scalatest/scalatest/issues/770 -javaOptions in Test ++= Seq("-Xmx2048m", "-XX:ReservedCodeCacheSize=384m", "-XX:MaxPermSize=384m") - -concurrentRestrictions in Global := Seq( - Tags.limitAll(1)) - -autoAPIMappings := true +// add any Spark Package dependencies using spDependencies. +// e.g. 
spDependencies += "databricks/spark-avro:0.1" diff --git a/build/sbt b/build/sbt new file mode 100755 index 0000000..728e945 --- /dev/null +++ b/build/sbt @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +# When creating new tests for Spark SQL Hive, the HADOOP_CLASSPATH must contain the hive jars so +# that we can run Hive to generate the golden answer. This is not required for normal development +# or testing. +for i in $HIVE_HOME/lib/* +do HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$i +done +export HADOOP_CLASSPATH + +realpath () { +( + TARGET_FILE=$1 + + cd $(dirname $TARGET_FILE) + TARGET_FILE=$(basename $TARGET_FILE) + + COUNT=0 + while [ -L "$TARGET_FILE" -a $COUNT -lt 100 ] + do + TARGET_FILE=$(readlink $TARGET_FILE) + cd $(dirname $TARGET_FILE) + TARGET_FILE=$(basename $TARGET_FILE) + COUNT=$(($COUNT + 1)) + done + + echo $(pwd -P)/$TARGET_FILE +) +} + +. $(dirname $(realpath $0))/sbt-launch-lib.bash + + +declare -r noshare_opts="-Dsbt.global.base=project/.sbtboot -Dsbt.boot.directory=project/.boot -Dsbt.ivy.home=project/.ivy" +declare -r sbt_opts_file=".sbtopts" +declare -r etc_sbt_opts_file="/etc/sbt/sbtopts" + +usage() { + cat < path to global settings/plugins directory (default: ~/.sbt) + -sbt-boot path to shared boot directory (default: ~/.sbt/boot in 0.11 series) + -ivy path to local Ivy repository (default: ~/.ivy2) + -mem set memory options (default: $sbt_mem, which is $(get_mem_opts $sbt_mem)) + -no-share use all local caches; no sharing + -no-global uses global caches, but does not use global ~/.sbt directory. + -jvm-debug Turn on JVM debugging, open at the given port. 
+ -batch Disable interactive mode + # sbt version (default: from project/build.properties if present, else latest release) + -sbt-version use the specified version of sbt + -sbt-jar use the specified jar as the sbt launcher + -sbt-rc use an RC version of sbt + -sbt-snapshot use a snapshot version of sbt + # java version (default: java from PATH, currently $(java -version 2>&1 | grep version)) + -java-home alternate JAVA_HOME + # jvm options and output control + JAVA_OPTS environment variable, if unset uses "$java_opts" + SBT_OPTS environment variable, if unset uses "$default_sbt_opts" + .sbtopts if this file exists in the current directory, it is + prepended to the runner args + /etc/sbt/sbtopts if this file exists, it is prepended to the runner args + -Dkey=val pass -Dkey=val directly to the java runtime + -J-X pass option -X directly to the java runtime + (-J is stripped) + -S-X add -X to sbt's scalacOptions (-J is stripped) + -PmavenProfiles Enable a maven profile for the build. +In the case of duplicated or conflicting options, the order above +shows precedence: JAVA_OPTS lowest, command line options highest. 
+EOM +} + +process_my_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -no-colors) addJava "-Dsbt.log.noformat=true" && shift ;; + -no-share) addJava "$noshare_opts" && shift ;; + -no-global) addJava "-Dsbt.global.base=$(pwd)/project/.sbtboot" && shift ;; + -sbt-boot) require_arg path "$1" "$2" && addJava "-Dsbt.boot.directory=$2" && shift 2 ;; + -sbt-dir) require_arg path "$1" "$2" && addJava "-Dsbt.global.base=$2" && shift 2 ;; + -debug-inc) addJava "-Dxsbt.inc.debug=true" && shift ;; + -batch) exec &2 "$@" +} +vlog () { + [[ $verbose || $debug ]] && echoerr "$@" +} +dlog () { + [[ $debug ]] && echoerr "$@" +} + +acquire_sbt_jar () { + SBT_VERSION=`awk -F "=" '/sbt\\.version/ {print $2}' ./project/build.properties` + URL1=https://dl.bintray.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/${SBT_VERSION}/sbt-launch.jar + JAR=build/sbt-launch-${SBT_VERSION}.jar + + sbt_jar=$JAR + + if [[ ! -f "$sbt_jar" ]]; then + # Download sbt launch jar if it hasn't been downloaded yet + if [ ! -f ${JAR} ]; then + # Download + printf "Attempting to fetch sbt\n" + JAR_DL=${JAR}.part + if hash curl 2>/dev/null; then + curl --fail --location --silent ${URL1} > "${JAR_DL}" &&\ + mv "${JAR_DL}" "${JAR}" + elif hash wget 2>/dev/null; then + wget --quiet ${URL1} -O "${JAR_DL}" &&\ + mv "${JAR_DL}" "${JAR}" + else + printf "You do not have curl or wget installed, please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 + fi + fi + if [ ! -f ${JAR} ]; then + # We failed to download + printf "Our attempt to download sbt locally to ${JAR} failed. 
Please install sbt manually from http://www.scala-sbt.org/\n" + exit -1 + fi + printf "Launching sbt from ${JAR}\n" + fi +} + +execRunner () { + # print the arguments one to a line, quoting any containing spaces + [[ $verbose || $debug ]] && echo "# Executing command line:" && { + for arg; do + if printf "%s\n" "$arg" | grep -q ' '; then + printf "\"%s\"\n" "$arg" + else + printf "%s\n" "$arg" + fi + done + echo "" + } + + exec "$@" +} + +addJava () { + dlog "[addJava] arg = '$1'" + java_args=( "${java_args[@]}" "$1" ) +} + +enableProfile () { + dlog "[enableProfile] arg = '$1'" + maven_profiles=( "${maven_profiles[@]}" "$1" ) + export SBT_MAVEN_PROFILES="${maven_profiles[@]}" +} + +addSbt () { + dlog "[addSbt] arg = '$1'" + sbt_commands=( "${sbt_commands[@]}" "$1" ) +} +addResidual () { + dlog "[residual] arg = '$1'" + residual_args=( "${residual_args[@]}" "$1" ) +} +addDebugger () { + addJava "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=n,address=$1" +} + +# a ham-fisted attempt to move some memory settings in concert +# so they need not be adjusted individually.
+get_mem_opts () { + local mem=${1:-2048} + local perm=$(( $mem / 4 )) + (( $perm > 256 )) || perm=256 + (( $perm < 4096 )) || perm=4096 + local codecache=$(( $perm / 2 )) + + echo "-Xms${mem}m -Xmx${mem}m -XX:MaxPermSize=${perm}m -XX:ReservedCodeCacheSize=${codecache}m" +} + +require_arg () { + local type="$1" + local opt="$2" + local arg="$3" + if [[ -z "$arg" ]] || [[ "${arg:0:1}" == "-" ]]; then + die "$opt requires <$type> argument" + fi +} + +is_function_defined() { + declare -f "$1" > /dev/null +} + +process_args () { + while [[ $# -gt 0 ]]; do + case "$1" in + -h|-help) usage; exit 1 ;; + -v|-verbose) verbose=1 && shift ;; + -d|-debug) debug=1 && shift ;; + + -ivy) require_arg path "$1" "$2" && addJava "-Dsbt.ivy.home=$2" && shift 2 ;; + -mem) require_arg integer "$1" "$2" && sbt_mem="$2" && shift 2 ;; + -jvm-debug) require_arg port "$1" "$2" && addDebugger $2 && shift 2 ;; + -batch) exec