spark/run

#!/bin/bash

# Scala version the project is built against; it selects the
# target/scala-<version> build output directories put on the classpath.
SCALA_VERSION=2.9.2

# Figure out where the Scala framework is installed: the absolute path of the
# directory that contains this script. Everything is quoted so that an install
# path containing spaces or glob characters is handled correctly.
FWDIR="$(cd "$(dirname "$0")" && pwd)"

# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ]; then
  . "$FWDIR/conf/spark-env.sh"
fi

# The first argument names the class to launch; without it there is nothing
# to run, so print the usage line on stderr and give up.
[ -n "$1" ] || {
  echo "Usage: run <spark-class> [<args>]" >&2
  exit 1
}

# The standalone cluster daemons (Master and Worker) don't need a lot of
# resources, so replace SPARK_MEM and SPARK_JAVA_OPTS with the daemon-specific
# settings when one of them is being launched.
case "$1" in
  spark.deploy.master.Master | spark.deploy.worker.Worker)
    SPARK_MEM=${SPARK_DAEMON_MEMORY:-512m}
    SPARK_JAVA_OPTS=$SPARK_DAEMON_JAVA_OPTS   # Empty by default
    ;;
esac


# Append the role-specific JVM options for the class being launched (master,
# worker, executor or REPL). The role variable may be unset or empty, in which
# case only a separating space is appended.
case "$1" in
  spark.deploy.master.Master)
    SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_MASTER_OPTS"
    ;;
  spark.deploy.worker.Worker)
    SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_WORKER_OPTS"
    ;;
  spark.executor.StandaloneExecutorBackend | spark.executor.MesosExecutorBackend)
    # Both executor backends read the same option variable
    SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_EXECUTOR_OPTS"
    ;;
  spark.repl.Main)
    SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $SPARK_REPL_OPTS"
    ;;
esac

# Choose the launcher binary (RUNNER).
#   SPARK_LAUNCH_WITH_SCALA=1 : use `scala` from PATH, else $SCALA_HOME/bin/scala
#   otherwise                 : use `java` from PATH, else $JAVA_HOME/bin/java,
#                               and make sure SCALA_LIBRARY_PATH is set
#                               (defaulting to $SCALA_HOME/lib).
# Exits with status 1 when a required *_HOME variable is not set.
#
# The PATH lookups test the exit status of `command -v` rather than feeding its
# unquoted output to `[ ... ]`, which misparses as soon as the reported
# location contains whitespace.
if [ "$SPARK_LAUNCH_WITH_SCALA" == "1" ]; then
  if command -v scala >/dev/null 2>&1; then
    RUNNER="scala"
  else
    if [ -z "$SCALA_HOME" ]; then
      echo "SCALA_HOME is not set" >&2
      exit 1
    fi
    RUNNER="${SCALA_HOME}/bin/scala"
  fi
else
  if command -v java >/dev/null 2>&1; then
    RUNNER="java"
  else
    if [ -z "$JAVA_HOME" ]; then
      echo "JAVA_HOME is not set" >&2
      exit 1
    fi
    RUNNER="${JAVA_HOME}/bin/java"
  fi
  # Launching with plain java needs the Scala jars on the classpath later on
  if [ -z "$SCALA_LIBRARY_PATH" ]; then
    if [ -z "$SCALA_HOME" ]; then
      echo "SCALA_HOME is not set" >&2
      exit 1
    fi
    SCALA_LIBRARY_PATH="$SCALA_HOME/lib"
  fi
fi

# Figure out how much memory to use per executor (512m unless SPARK_MEM is
# already set) and export it so that our process sees it and can report it
# to Mesos
SPARK_MEM="${SPARK_MEM:-512m}"
export SPARK_MEM

# Set JAVA_OPTS to be able to load native libraries and to set heap size
JAVA_OPTS="$SPARK_JAVA_OPTS"
JAVA_OPTS+=" -Djava.library.path=$SPARK_LIBRARY_PATH"
JAVA_OPTS+=" -Xms$SPARK_MEM -Xmx$SPARK_MEM"
# Load extra JAVA_OPTS from conf/java-opts, if it exists. The path is quoted
# so that an install directory containing spaces does not break the test or
# the read.
if [ -e "$FWDIR/conf/java-opts" ]; then
  JAVA_OPTS+=" $(cat "$FWDIR/conf/java-opts")"
fi
export JAVA_OPTS

# Per-module directories, all located directly under the install directory
CORE_DIR="${FWDIR}/core"
REPL_DIR="${FWDIR}/repl"
EXAMPLES_DIR="${FWDIR}/examples"
BAGEL_DIR="${FWDIR}/bagel"
STREAMING_DIR="${FWDIR}/streaming"
PYSPARK_DIR="${FWDIR}/python"

# Refuse to go on when Spark has not been compiled yet: the REPL module's
# target directory is used as the marker for a completed build.
[ -e "$REPL_DIR/target" ] || {
  echo "Failed to find Spark classes in $REPL_DIR/target" >&2
  echo "You need to compile Spark before running this program" >&2
  exit 1
}

# Build up classpath: user-supplied SPARK_CLASSPATH first, then the conf
# directory, the compiled classes of each module, and the dependency jars.
CLASSPATH="$SPARK_CLASSPATH"
CLASSPATH+=":$FWDIR/conf"
CLASSPATH+=":$CORE_DIR/target/scala-$SCALA_VERSION/classes"
# Test classes are only added when SPARK_TESTING is set
if [ -n "$SPARK_TESTING" ] ; then
  CLASSPATH+=":$CORE_DIR/target/scala-$SCALA_VERSION/test-classes"
  CLASSPATH+=":$STREAMING_DIR/target/scala-$SCALA_VERSION/test-classes"
fi
CLASSPATH+=":$CORE_DIR/src/main/resources"
CLASSPATH+=":$REPL_DIR/target/scala-$SCALA_VERSION/classes"
CLASSPATH+=":$EXAMPLES_DIR/target/scala-$SCALA_VERSION/classes"
CLASSPATH+=":$STREAMING_DIR/target/scala-$SCALA_VERSION/classes"
CLASSPATH+=":$STREAMING_DIR/lib/org/apache/kafka/kafka/0.7.2-spark/*" # <-- our in-project Kafka Jar
if [ -e "$FWDIR/lib_managed" ]; then
  CLASSPATH+=":$FWDIR/lib_managed/jars/*"
  CLASSPATH+=":$FWDIR/lib_managed/bundles/*"
fi
CLASSPATH+=":$REPL_DIR/lib/*"
# Look for the shaded REPL jar under the install directory rather than the
# current working directory, so the script also works when it is launched
# from somewhere else. find output is read NUL-delimited so that paths
# containing whitespace stay intact.
if [ -e "$FWDIR/repl-bin/target" ]; then
  while IFS= read -r -d '' jar; do
    CLASSPATH+=":$jar"
  done < <(find "$FWDIR/repl-bin/target" -name 'spark-repl-*-shaded-hadoop*.jar' -print0)
fi
CLASSPATH+=":$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"
# Jars bundled with the Python module
while IFS= read -r -d '' jar; do
  CLASSPATH+=":$jar"
done < <(find "$PYSPARK_DIR/lib" -name '*jar' -print0)
export CLASSPATH # Needed for spark-shell

# Print the first argument that names an existing path and return 0; print
# nothing and return 1 when none of them exists. An unmatched glob arrives
# here as the literal pattern, which does not exist and is therefore skipped.
_first_existing() {
  local candidate
  for candidate in "$@"; do
    if [ -e "$candidate" ]; then
      printf '%s\n' "$candidate"
      return 0
    fi
  done
  return 1
}

# Figure out the JAR file that our examples were packaged into. This includes a bit of a hack
# to avoid the -sources and -doc packages that are built by publish-local.
# The glob matches are checked one at a time, so several matching jars (for
# example left over from different builds) no longer make the `[ -e ... ]`
# test fail with "too many arguments"; the first match is used.
# Use the JAR from the SBT build
if examples_jar="$(_first_existing "$EXAMPLES_DIR/target/scala-$SCALA_VERSION/spark-examples"*[0-9T].jar)"; then
  export SPARK_EXAMPLES_JAR="$examples_jar"
fi
# Use the JAR from the Maven build; when both exist this one wins
if examples_jar="$(_first_existing "$EXAMPLES_DIR/target/spark-examples-"*hadoop[12].jar)"; then
  export SPARK_EXAMPLES_JAR="$examples_jar"
fi
unset examples_jar

# Work out the extra launcher arguments, depending on whether the class is run
# with java or with the scala launcher.
# java is preferred in most cases: scala creates a shell script as the parent
# of its Java process, which makes it hard to kill the child with stuff like
# Process.destroy(). The Spark shell, however, needs that wrapper to properly
# reset the terminal on exit, so it may set SPARK_LAUNCH_WITH_SCALA=1.
if [ "$SPARK_LAUNCH_WITH_SCALA" != "1" ]; then
  # Launching with plain java: put the Scala jars on the classpath ourselves
  for scala_jar in scala-library scala-compiler jline; do
    CLASSPATH+=":$SCALA_LIBRARY_PATH/$scala_jar.jar"
  done
  # The JVM doesn't read JAVA_OPTS by default so we need to pass it in
  EXTRA_ARGS="$JAVA_OPTS"
else
  EXTRA_ARGS=""     # Java options will be passed to scala as JAVA_OPTS
fi

# Replace this shell with the launcher process, passing the classpath, the
# extra options and all of the caller's arguments (class name first).
# $EXTRA_ARGS is left unquoted so that the shell splits the space-separated
# option string into individual arguments.
exec "$RUNNER" -cp "$CLASSPATH" $EXTRA_ARGS "$@"
Initial commit 2010-03-30 03:17:55 +04:00			`#!/bin/bash`

Update Scala version dependency to 2.9.2 2012-09-25 01:12:48 +04:00			`SCALA_VERSION=2.9.2`
Update run to work with SBT managed dependencies and the newly introduced repl module. 2011-05-27 13:20:34 +04:00
Initial commit 2010-03-30 03:17:55 +04:00			`# Figure out where the Scala framework is installed`
Set absolute path for SPARK_HOME 2010-10-16 23:18:02 +04:00			FWDIR="$(cd `dirname $0`; pwd)"
Initial commit 2010-03-30 03:17:55 +04:00
Added code so that Spark jobs can be launched from outside the Spark directory by setting SPARK_HOME and locating the executor relative to that. Entries on SPARK_CLASSPATH and SPARK_LIBRARY_PATH are also passed along to worker nodes. 2010-10-16 06:42:26 +04:00			`# Export this as SPARK_HOME`
			`export SPARK_HOME="$FWDIR"`

Made it possible to set various Spark options and environment variables in general through a conf/spark-env.sh script. 2010-07-20 05:00:30 +04:00			`# Load environment variables from conf/spark-env.sh, if it exists`
			`if [ -e $FWDIR/conf/spark-env.sh ] ; then`
			`. $FWDIR/conf/spark-env.sh`
			`fi`

Use a separate memory setting for standalone cluster daemons Conflicts: docs/_config.yml 2013-02-07 02:34:46 +04:00			`if [ -z "$1" ]; then`
			`echo "Usage: run <spark-class> [<args>]" >&2`
			`exit 1`
			`fi`

			`# If this is a standalone cluster daemon, reset SPARK_JAVA_OPTS and SPARK_MEM to reasonable`
			`# values for that; it doesn't need a lot`
			`if [ "$1" = "spark.deploy.master.Master" -o "$1" = "spark.deploy.worker.Worker" ]; then`
			`SPARK_MEM=${SPARK_DAEMON_MEMORY:-512m}`
			`SPARK_JAVA_OPTS=$SPARK_DAEMON_JAVA_OPTS # Empty by default`
			`fi`

support customized java options for master, worker, executor, repl shell 2013-02-16 10:42:06 +04:00
			`# Add java opts for master, worker, executor. The opts maybe null`
			`case "$1" in`
Change tabs to spaces 2013-02-25 23:53:55 +04:00			`'spark.deploy.master.Master')`
			`SPARK_JAVA_OPTS+=" $SPARK_MASTER_OPTS"`
			`;;`
			`'spark.deploy.worker.Worker')`
			`SPARK_JAVA_OPTS+=" $SPARK_WORKER_OPTS"`
			`;;`
			`'spark.executor.StandaloneExecutorBackend')`
			`SPARK_JAVA_OPTS+=" $SPARK_EXECUTOR_OPTS"`
			`;;`
			`'spark.executor.MesosExecutorBackend')`
			`SPARK_JAVA_OPTS+=" $SPARK_EXECUTOR_OPTS"`
			`;;`
			`'spark.repl.Main')`
			`SPARK_JAVA_OPTS+=" $SPARK_REPL_OPTS"`
			`;;`
support customized java options for master, worker, executor, repl shell 2013-02-16 10:42:06 +04:00			`esac`

Tweaked run file to live more happily with typesafe's debian package 2012-10-23 00:10:47 +04:00			`if [ "$SPARK_LAUNCH_WITH_SCALA" == "1" ]; then`
			if [ `command -v scala` ]; then
			`RUNNER="scala"`
			`else`
			`if [ -z "$SCALA_HOME" ]; then`
			`echo "SCALA_HOME is not set" >&2`
			`exit 1`
			`fi`
			`RUNNER="${SCALA_HOME}/bin/scala"`
			`fi`
			`else`
			if [ `command -v java` ]; then
			`RUNNER="java"`
			`else`
			`if [ -z "$JAVA_HOME" ]; then`
			`echo "JAVA_HOME is not set" >&2`
			`exit 1`
			`fi`
			`RUNNER="${JAVA_HOME}/bin/java"`
			`fi`
			`if [ -z "$SCALA_LIBRARY_PATH" ]; then`
			`if [ -z "$SCALA_HOME" ]; then`
			`echo "SCALA_HOME is not set" >&2`
			`exit 1`
			`fi`
			`SCALA_LIBRARY_PATH="$SCALA_HOME/lib"`
			`fi`
Update to work with latest Mesos API changes 2010-08-13 11:39:36 +04:00			`fi`

Further fixes to how Mesos is found and used 2012-03-18 00:39:14 +04:00			`# Figure out how much memory to use per executor and set it as an environment`
			`# variable so that our process sees it and can report it to Mesos`
More work to allow Spark to run on the standalone deploy cluster. 2012-07-09 01:00:04 +04:00			`if [ -z "$SPARK_MEM" ] ; then`
Updated to newest Mesos API, which includes better memory accounting by specifying per-executor memory. 2011-08-02 00:54:48 +04:00			`SPARK_MEM="512m"`
Made it possible to set various Spark options and environment variables in general through a conf/spark-env.sh script. 2010-07-20 05:00:30 +04:00			`fi`
Further fixes to how Mesos is found and used 2012-03-18 00:39:14 +04:00			`export SPARK_MEM`
Made it possible to set various Spark options and environment variables in general through a conf/spark-env.sh script. 2010-07-20 05:00:30 +04:00
			`# Set JAVA_OPTS to be able to load native libraries and to set heap size`
			`JAVA_OPTS="$SPARK_JAVA_OPTS"`
Further fixes to how Mesos is found and used 2012-03-18 00:39:14 +04:00			`JAVA_OPTS+=" -Djava.library.path=$SPARK_LIBRARY_PATH"`
Made it possible to set various Spark options and environment variables in general through a conf/spark-env.sh script. 2010-07-20 05:00:30 +04:00			`JAVA_OPTS+=" -Xms$SPARK_MEM -Xmx$SPARK_MEM"`
			`# Load extra JAVA_OPTS from conf/java-opts, if it exists`
Initial commit 2010-03-30 03:17:55 +04:00			`if [ -e $FWDIR/conf/java-opts ] ; then`
			JAVA_OPTS+=" `cat $FWDIR/conf/java-opts`"
			`fi`
			`export JAVA_OPTS`

Further fixes to how Mesos is found and used 2012-03-18 00:39:14 +04:00			`CORE_DIR="$FWDIR/core"`
			`REPL_DIR="$FWDIR/repl"`
			`EXAMPLES_DIR="$FWDIR/examples"`
			`BAGEL_DIR="$FWDIR/bagel"`
Added the Spark Streaing code, ported to Akka 2 2012-07-29 07:03:26 +04:00			`STREAMING_DIR="$FWDIR/streaming"`
Rename top-level 'pyspark' directory to 'python' 2013-01-02 02:48:45 +04:00			`PYSPARK_DIR="$FWDIR/python"`
Made examples and core subprojects 2011-02-02 02:11:08 +03:00
Warn users if they run pyspark or spark-shell without compiling Spark 2013-01-17 23:14:47 +04:00			`# Exit if the user hasn't compiled Spark`
			`if [ ! -e "$REPL_DIR/target" ]; then`
			`echo "Failed to find Spark classes in $REPL_DIR/target" >&2`
			`echo "You need to compile Spark before running this program" >&2`
			`exit 1`
			`fi`

Made examples and core subprojects 2011-02-02 02:11:08 +03:00			`# Build up classpath`
Further fixes to how Mesos is found and used 2012-03-18 00:39:14 +04:00			`CLASSPATH="$SPARK_CLASSPATH"`
			`CLASSPATH+=":$FWDIR/conf"`
			`CLASSPATH+=":$CORE_DIR/target/scala-$SCALA_VERSION/classes"`
Made run script add test-classes onto the classpath only if SPARK_TESTING is set; fixes #216 2012-10-07 08:19:16 +04:00			`if [ -n "$SPARK_TESTING" ] ; then`
			`CLASSPATH+=":$CORE_DIR/target/scala-$SCALA_VERSION/test-classes"`
Fixed class paths and dependencies based on Matei's comments. 2013-02-25 04:24:52 +04:00			`CLASSPATH+=":$STREAMING_DIR/target/scala-$SCALA_VERSION/test-classes"`
Made run script add test-classes onto the classpath only if SPARK_TESTING is set; fixes #216 2012-10-07 08:19:16 +04:00			`fi`
More work on deploy code (adding Worker class) 2012-07-01 03:43:27 +04:00			`CLASSPATH+=":$CORE_DIR/src/main/resources"`
Further fixes to how Mesos is found and used 2012-03-18 00:39:14 +04:00			`CLASSPATH+=":$REPL_DIR/target/scala-$SCALA_VERSION/classes"`
			`CLASSPATH+=":$EXAMPLES_DIR/target/scala-$SCALA_VERSION/classes"`
Added the Spark Streaing code, ported to Akka 2 2012-07-29 07:03:26 +04:00			`CLASSPATH+=":$STREAMING_DIR/target/scala-$SCALA_VERSION/classes"`
Fixed class paths and dependencies based on Matei's comments. 2013-02-25 04:24:52 +04:00			`CLASSPATH+=":$STREAMING_DIR/lib/org/apache/kafka/kafka/0.7.2-spark/*" # <-- our in-project Kafka Jar`
Make "run" script work with Maven builds 2012-12-11 03:12:59 +04:00			`if [ -e "$FWDIR/lib_managed" ]; then`
Retrieve jars to a flat directory so * can be used for the classpath. 2013-01-09 00:44:33 +04:00			`CLASSPATH+=":$FWDIR/lib_managed/jars/*"`
			`CLASSPATH+=":$FWDIR/lib_managed/bundles/*"`
Make "run" script work with Maven builds 2012-12-11 03:12:59 +04:00			`fi`
Retrieve jars to a flat directory so * can be used for the classpath. 2013-01-09 00:44:33 +04:00			`CLASSPATH+=":$REPL_DIR/lib/*"`
Update run script to deal with change to build of REPL shaded JAR 2013-01-21 09:05:17 +04:00			`if [ -e repl-bin/target ]; then`
			for jar in `find "repl-bin/target" -name 'spark-repl-*-shaded-hadoop*.jar'`; do
			`CLASSPATH+=":$jar"`
			`done`
			`fi`
More work to allow Spark to run on the standalone deploy cluster. 2012-07-09 01:00:04 +04:00			`CLASSPATH+=":$BAGEL_DIR/target/scala-$SCALA_VERSION/classes"`
Simplify PySpark installation. - Bundle Py4J binaries, since it's hard to install - Uses Spark's `run` script to launch the Py4J gateway, inheriting the settings in spark-env.sh With these changes, (hopefully) nothing more than running `sbt/sbt package` will be necessary to run PySpark. 2012-12-28 10:47:37 +04:00			for jar in `find $PYSPARK_DIR/lib -name '*jar'`; do
			`CLASSPATH+=":$jar"`
			`done`
Changed printlns to log statements and fixed a bug in run that was causing it to fail on a Mesos cluster 2010-09-29 10:22:07 +04:00			`export CLASSPATH # Needed for spark-shell`
Initial commit 2010-03-30 03:17:55 +04:00
Small hack to work around multiple JARs being built by sbt package 2013-02-27 00:24:18 +04:00			`# Figure out the JAR file that our examples were packaged into. This includes a bit of a hack`
			`# to avoid the -sources and -doc packages that are built by publish-local.`
			`if [ -e "$EXAMPLES_DIR/target/scala-$SCALA_VERSION/spark-examples"*[0-9T].jar ]; then`
Pass a code JAR to SparkContext in our examples. Fixes SPARK-594. 2013-02-26 07:34:32 +04:00			`# Use the JAR from the SBT build`
Small hack to work around multiple JARs being built by sbt package 2013-02-27 00:24:18 +04:00			export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR/target/scala-$SCALA_VERSION/spark-examples"*[0-9T].jar`
Pass a code JAR to SparkContext in our examples. Fixes SPARK-594. 2013-02-26 07:34:32 +04:00			`fi`
Small hack to work around multiple JARs being built by sbt package 2013-02-27 00:24:18 +04:00			`if [ -e "$EXAMPLES_DIR/target/spark-examples-"*hadoop[12].jar ]; then`
Pass a code JAR to SparkContext in our examples. Fixes SPARK-594. 2013-02-26 07:34:32 +04:00			`# Use the JAR from the Maven build`
Small hack to work around multiple JARs being built by sbt package 2013-02-27 00:24:18 +04:00			export SPARK_EXAMPLES_JAR=`ls "$EXAMPLES_DIR/target/spark-examples-"*hadoop[12].jar`
Pass a code JAR to SparkContext in our examples. Fixes SPARK-594. 2013-02-26 07:34:32 +04:00			`fi`

More work to allow Spark to run on the standalone deploy cluster. 2012-07-09 01:00:04 +04:00			`# Figure out whether to run our class with java or with the scala launcher.`
			`# In most cases, we'd prefer to execute our process with java because scala`
			`# creates a shell script as the parent of its Java process, which makes it`
			`# hard to kill the child with stuff like Process.destroy(). However, for`
			`# the Spark shell, the wrapper is necessary to properly reset the terminal`
			`# when we exit, so we allow it to set a variable to launch with scala.`
			`if [ "$SPARK_LAUNCH_WITH_SCALA" == "1" ]; then`
Fixed SPARK_MEM not being passed when runner is java 2012-07-29 06:53:31 +04:00			`EXTRA_ARGS="" # Java options will be passed to scala as JAVA_OPTS`
Initial commit 2010-03-30 03:17:55 +04:00			`else`
Tweaked run file to live more happily with typesafe's debian package 2012-10-23 00:10:47 +04:00			`CLASSPATH+=":$SCALA_LIBRARY_PATH/scala-library.jar"`
			`CLASSPATH+=":$SCALA_LIBRARY_PATH/scala-compiler.jar"`
			`CLASSPATH+=":$SCALA_LIBRARY_PATH/jline.jar"`
Fixed SPARK_MEM not being passed when runner is java 2012-07-29 06:53:31 +04:00			`# The JVM doesn't read JAVA_OPTS by default so we need to pass it in`
			`EXTRA_ARGS="$JAVA_OPTS"`
Initial commit 2010-03-30 03:17:55 +04:00			`fi`

Fixed SPARK_MEM not being passed when runner is java 2012-07-29 06:53:31 +04:00			`exec "$RUNNER" -cp "$CLASSPATH" $EXTRA_ARGS "$@"`