diff options
-rw-r--r-- | .SRCINFO | 42 | ||||
-rw-r--r-- | PKGBUILD | 84 | ||||
-rw-r--r-- | apache-spark-master.service | 12 | ||||
-rw-r--r-- | apache-spark-slave@.service | 13 | ||||
-rw-r--r-- | apache-spark-standalone.service | 14 | ||||
-rw-r--r-- | apache-spark.install | 6 | ||||
-rwxr-xr-x | run-master.sh | 81 | ||||
-rwxr-xr-x | run-slave.sh | 91 | ||||
-rwxr-xr-x | spark-daemon-run.sh | 139 | ||||
-rw-r--r-- | spark-env.sh | 2 |
10 files changed, 417 insertions, 67 deletions
@@ -1,27 +1,37 @@ -# Generated by mksrcinfo v8 -# Tue Mar 22 10:57:14 UTC 2016 -pkgbase = apache-spark +pkgbase = apache-spark-git pkgdesc = fast and general engine for large-scale data processing - pkgver = 1.6.1 + pkgver = 2.0.0.SNAPSHOT.20160708.16957 pkgrel = 1 url = http://spark.apache.org install = apache-spark.install arch = any license = APACHE - depends = maven>=3.3.3 + makedepends = git + makedepends = maven depends = java-environment>=6 - depends = scala - depends = python2>=2.7 - depends = hadoop>=2.6 - optdepends = python: PYSPARK_PYTHON=python3 pyspark - optdepends = ipython: PYSPARK_DRIVER_PYTHON=ipython pyspark; IPYTHON=1 pyspark + depends = hadoop + optdepends = python2: python2 support for pyspark + optdepends = ipython2: ipython2 support for pyspark + optdepends = python: python3 support for pyspark + optdepends = ipython: ipython3 support for pyspark + optdepends = r: support for sparkR + optdepends = rsync: support rsync hadoop binaries from master + conflicts = apache-spark backup = etc/apache-spark/spark-env.sh - source = http://d3kbcqa49mib13.cloudfront.net/spark-1.6.1.tgz - source = apache-spark-standalone.service + source = git://git.apache.org/spark.git + source = apache-spark-master.service + source = apache-spark-slave@.service source = spark-env.sh - md5sums = 12e1368138840b62f08ed22a8637955d - md5sums = bb7d8b85366e6f9cc0b2777eaea161a8 - md5sums = 0913001583e607849270090555dbd309 + source = spark-daemon-run.sh + source = run-master.sh + source = run-slave.sh + md5sums = SKIP + md5sums = 9ffe1f9c4bb2ea4e5a75ab6469fe76d4 + md5sums = 8d34bd4cc946f46625597ca606da8ab6 + md5sums = f8cc449543df418b8adfcc36a3afb384 + md5sums = 8ff953f0436209b6190add59703a34f0 + md5sums = 028472b82e9def7d5d409f008d064fe2 + md5sums = 99115eedc453c9b8ca04cca2e32e4537 -pkgname = apache-spark +pkgname = apache-spark-git @@ -1,67 +1,83 @@ # Maintainer: François Garillot ("huitseeker") <francois [at] garillot.net> # Contributor: Christian Krause ("wookietreiber") <kizkizzbangbang@gmail.com> -pkgname=apache-spark -pkgver=1.6.1 +pkgname=apache-spark-git +pkgver=2.0.0.SNAPSHOT.20160708.16957 pkgrel=1 pkgdesc="fast and general engine for large-scale data processing" arch=('any') url="http://spark.apache.org" license=('APACHE') -depends=('maven>=3.3.3' 'java-environment>=6' 'scala' 'python2>=2.7' 'hadoop>=2.6') -optdepends=('python: PYSPARK_PYTHON=python3 pyspark' - 'ipython: PYSPARK_DRIVER_PYTHON=ipython pyspark; IPYTHON=1 pyspark') +depends=('java-environment>=6' 'hadoop') +makedepends=('git' 'maven') +conflicts=('apache-spark') +optdepends=('python2: python2 support for pyspark' + 'ipython2: ipython2 support for pyspark' + 'python: python3 support for pyspark' + 'ipython: ipython3 support for pyspark' + 'r: support for sparkR' + 'rsync: support rsync hadoop binaries from master') install=apache-spark.install -source=("http://d3kbcqa49mib13.cloudfront.net/spark-$pkgver.tgz" - 'apache-spark-standalone.service' - 'spark-env.sh') -md5sums=('12e1368138840b62f08ed22a8637955d' - 'bb7d8b85366e6f9cc0b2777eaea161a8' - '0913001583e607849270090555dbd309') +#'git://github.com/apache/spark.git' +source=('git://git.apache.org/spark.git' + 'apache-spark-master.service' + 'apache-spark-slave@.service' + 'spark-env.sh' + 'spark-daemon-run.sh' + 'run-master.sh' + 'run-slave.sh') +md5sums=( SKIP + '9ffe1f9c4bb2ea4e5a75ab6469fe76d4' + '8d34bd4cc946f46625597ca606da8ab6' + 'f8cc449543df418b8adfcc36a3afb384' + '8ff953f0436209b6190add59703a34f0' + '028472b82e9def7d5d409f008d064fe2' + '99115eedc453c9b8ca04cca2e32e4537') backup=('etc/apache-spark/spark-env.sh') -PKGEXT=${PKGEXT:-'.pkg.tar.xz'} - -prepare() { - mkdir -p "$srcdir/spark-$pkgver" - cd "$srcdir/spark-$pkgver" - - sed -i 's|pid=$SPARK_PID_DIR/spark-$SPARK_IDENT_STRING-$command-$instance.pid|pid=/var/lib/apache-spark/spark-daemon.pid|' sbin/spark-daemon.sh +pkgver() { + cd "$srcdir/spark" + _author_ver=$(grep -oP '(?<=\<version\>).*(?=\</version\>)' pom.xml|head -n 2|tail -n 1|sed 's/-/./') + _last_commit_date=$(git log -1 --pretty='%cd' --date=short | tr -d '-') + _commit_count="$(git rev-list --count HEAD)" + echo $_author_ver.$_last_commit_date.$_commit_count } build() { - cd "$srcdir/spark-$pkgver" - - export MAVEN_OPTS="-Xmx2g -XX:MaxPermSize=512M -XX:ReservedCodeCacheSize=512m" - - dev/change-scala-version.sh 2.11 - - JAVA_HOME=/usr/lib/jvm/default-runtime ./make-distribution.sh -Pscala-2.11 -DskipTests -Dmaven.repo.local=/tmp -DautoVersionSubmodules=true -U -Djline.version=2.13 -Djline.groupid=jline -Pyarn -Phadoop-2.6 + cd "$srcdir/spark" + build/mvn -DskipTests clean package } package() { - cd "$srcdir/spark-$pkgver" + cd "$srcdir/spark" - install -d "$pkgdir/usr/bin" "$pkgdir/usr/share" + install -d "$pkgdir/usr/bin" "$pkgdir/opt" "$pkgdir/var/log/apache-spark" - cp -r "$srcdir/spark-$pkgver/dist" "$pkgdir/usr/share/apache-spark/" + cp -r "$srcdir/spark" "$pkgdir/opt/apache-spark/" cd "$pkgdir/usr/bin" for binary in beeline pyspark sparkR spark-class spark-shell spark-sql spark-submit load-spark-env.sh; do - binpath="/usr/share/apache-spark/bin/$binary" + binpath="/opt/apache-spark/bin/$binary" ln -s "$binpath" $binary - sed -i 's|^export SPARK_HOME=.*$|export SPARK_HOME=/usr/share/apache-spark|' "$pkgdir/$binpath" + sed -i 's|^export SPARK_HOME=.*$|export SPARK_HOME=/opt/apache-spark|' "$pkgdir/$binpath" done mkdir -p $pkgdir/etc/profile.d echo '#!/bin/sh' > $pkgdir/etc/profile.d/apache-spark.sh - echo 'SPARK_HOME=/usr/share/apache-spark' >> $pkgdir/etc/profile.d/apache-spark.sh + echo 'SPARK_HOME=/opt/apache-spark' >> $pkgdir/etc/profile.d/apache-spark.sh echo 'export SPARK_HOME' >> $pkgdir/etc/profile.d/apache-spark.sh chmod 755 $pkgdir/etc/profile.d/apache-spark.sh - install -Dm644 "$srcdir/apache-spark-standalone.service" "$pkgdir/usr/lib/systemd/system/apache-spark-standalone.service" + install -Dm644 "$srcdir/apache-spark-master.service" "$pkgdir/usr/lib/systemd/system/apache-spark-master.service" + install -Dm644 "$srcdir/apache-spark-slave@.service" "$pkgdir/usr/lib/systemd/system/apache-spark-slave@.service" install -Dm644 "$srcdir/spark-env.sh" "$pkgdir/etc/apache-spark/spark-env.sh" + for script in run-master.sh run-slave.sh spark-daemon-run.sh; do + install -Dm755 "$srcdir/$script" "$pkgdir/opt/apache-spark/sbin/$script" + done + install -Dm644 "$srcdir/spark/conf"/* "$pkgdir/etc/apache-spark" - cd "$pkgdir/usr/share/apache-spark/conf" - ln -sf "/etc/apache-spark/spark-env.sh" . + cd "$pkgdir/opt/apache-spark" + mv conf conf-templates + ln -sf "/etc/apache-spark" conf + ln -sf "/var/lib/apache-spark/work" . } diff --git a/apache-spark-master.service b/apache-spark-master.service new file mode 100644 index 000000000000..b8bc98bce44c --- /dev/null +++ b/apache-spark-master.service @@ -0,0 +1,12 @@ +[Unit] +Description=Apache Spark Standalone Master +After=network.target + +[Service] +User=apache-spark +Group=apache-spark +Environment=SPARK_LOG_DIR=/var/log/apache-spark +ExecStart=/opt/apache-spark/sbin/run-master.sh + +[Install] +WantedBy=multi-user.target diff --git a/apache-spark-slave@.service b/apache-spark-slave@.service new file mode 100644 index 000000000000..453b3465ce36 --- /dev/null +++ b/apache-spark-slave@.service @@ -0,0 +1,13 @@ +[Unit] +Description=Apache Spark Standalone Master +After=network.target + +[Service] +User=apache-spark +Group=apache-spark +Environment=SPARK_LOG_DIR=/var/log/apache-spark +ExecStart=/opt/apache-spark/sbin/run-slave.sh %i + +[Install] +WantedBy=multi-user.target +DefaultInstance=127.0.0.1:7077 diff --git a/apache-spark-standalone.service b/apache-spark-standalone.service deleted file mode 100644 index 3f81c9691450..000000000000 --- a/apache-spark-standalone.service +++ /dev/null @@ -1,14 +0,0 @@ -[Unit] -Description=Apache Spark Standalone Local Cluster -After=network.target - -[Service] -User=apache-spark -Group=apache-spark -Environment=SPARK_LOG_DIR=/var/lib/apache-spark/logs -PIDFile=/var/lib/apache-spark/spark-daemon.pid -ExecStart=/usr/share/apache-spark/sbin/start-all.sh -ExecStop=/usr/share/apache-spark/sbin/stop-all.sh - -[Install] -WantedBy=multi-user.target diff --git a/apache-spark.install b/apache-spark.install index 9ffbbf29abe8..7aa03480833c 100644 --- a/apache-spark.install +++ b/apache-spark.install @@ -2,10 +2,10 @@ post_install() { groupadd -r -f apache-spark useradd -r -g apache-spark -s /usr/bin/nologin -d /var/lib/apache-spark apache-spark || true - [[ ! -d /var/lib/apache-spark ]] && - install -d /var/lib/apache-spark + [[ ! -d /var/lib/apache-spark/work ]] && + install -d /var/lib/apache-spark/work - chown -R apache-spark:apache-spark /var/lib/apache-spark + chown -R apache-spark:apache-spark /var/{lib,log}/apache-spark } post_remove() { diff --git a/run-master.sh b/run-master.sh new file mode 100755 index 000000000000..a60ca791adc1 --- /dev/null +++ b/run-master.sh @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Starts the master on the machine this script is executed on. + +if [ -z "${SPARK_HOME}" ]; then + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi + +# NOTE: This exact class name is matched downstream by SparkSubmit. +# Any changes need to be reflected there. +CLASS="org.apache.spark.deploy.master.Master" + +if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then + echo "Usage: ./sbin/run-master.sh [options]" + pattern="Usage:" + pattern+="\|Using Spark's default log4j profile:" + pattern+="\|Registered signal handlers for" + + "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 + exit 1 +fi + +ORIGINAL_ARGS="$@" + +START_TACHYON=false + +while (( "$#" )); do +case $1 in + --with-tachyon) + if [ ! -e "${SPARK_HOME}"/tachyon/bin/tachyon ]; then + echo "Error: --with-tachyon specified, but tachyon not found." + exit -1 + fi + START_TACHYON=true + ;; + esac +shift +done + +. "${SPARK_HOME}/sbin/spark-config.sh" + +. "${SPARK_HOME}/bin/load-spark-env.sh" + +if [ "$SPARK_MASTER_PORT" = "" ]; then + SPARK_MASTER_PORT=7077 +fi + +if [ "$SPARK_MASTER_IP" = "" ]; then + SPARK_MASTER_IP=`hostname` +fi + +if [ "$SPARK_MASTER_WEBUI_PORT" = "" ]; then + SPARK_MASTER_WEBUI_PORT=8080 +fi + +"${SPARK_HOME}/sbin"/spark-daemon-run.sh start $CLASS 1 \ + --ip $SPARK_MASTER_IP --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT \ + $ORIGINAL_ARGS + +if [ "$START_TACHYON" == "true" ]; then + "${SPARK_HOME}"/tachyon/bin/tachyon bootstrap-conf $SPARK_MASTER_IP + "${SPARK_HOME}"/tachyon/bin/tachyon format -s + "${SPARK_HOME}"/tachyon/bin/tachyon-start.sh master +fi diff --git a/run-slave.sh b/run-slave.sh new file mode 100755 index 000000000000..1f92aa3bee3e --- /dev/null +++ b/run-slave.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Starts a slave on the machine this script is executed on. +# +# Environment Variables +# +# SPARK_WORKER_INSTANCES The number of worker instances to run on this +# slave. Default is 1. +# SPARK_WORKER_PORT The base port number for the first worker. If set, +# subsequent workers will increment this number. If +# unset, Spark will find a valid port number, but +# with no guarantee of a predictable pattern. +# SPARK_WORKER_WEBUI_PORT The base port for the web interface of the first +# worker. Subsequent workers will increment this +# number. Default is 8081. + +if [ -z "${SPARK_HOME}" ]; then + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi + +# NOTE: This exact class name is matched downstream by SparkSubmit. +# Any changes need to be reflected there. +CLASS="org.apache.spark.deploy.worker.Worker" + +if [[ $# -lt 1 ]] || [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then + echo "Usage: ./sbin/run-slave.sh [options] <master>" + pattern="Usage:" + pattern+="\|Using Spark's default log4j profile:" + pattern+="\|Registered signal handlers for" + + "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 + exit 1 +fi + +. "${SPARK_HOME}/sbin/spark-config.sh" + +. "${SPARK_HOME}/bin/load-spark-env.sh" + +# First argument should be the master; we need to store it aside because we may +# need to insert arguments between it and the other arguments +MASTER=$1 +shift + +# Determine desired worker port +if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then + SPARK_WORKER_WEBUI_PORT=8081 +fi + +# Start up the appropriate number of workers on this machine. +# quick local function to start a worker +function start_instance { + WORKER_NUM=$1 + shift + + if [ "$SPARK_WORKER_PORT" = "" ]; then + PORT_FLAG= + PORT_NUM= + else + PORT_FLAG="--port" + PORT_NUM=$(( $SPARK_WORKER_PORT + $WORKER_NUM - 1 )) + fi + WEBUI_PORT=$(( $SPARK_WORKER_WEBUI_PORT + $WORKER_NUM - 1 )) + + "${SPARK_HOME}/sbin"/spark-daemon-run.sh start $CLASS $WORKER_NUM \ + --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@" +} + +if [ "$SPARK_WORKER_INSTANCES" = "" ]; then + start_instance 1 "$@" +else + for ((i=0; i<$SPARK_WORKER_INSTANCES; i++)); do + start_instance $(( 1 + $i )) "$@" + done +fi diff --git a/spark-daemon-run.sh b/spark-daemon-run.sh new file mode 100755 index 000000000000..34e3a80fa37a --- /dev/null +++ b/spark-daemon-run.sh @@ -0,0 +1,139 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Runs a Spark daemon foreground. +# +# Environment Variables +# +# SPARK_CONF_DIR Alternate conf dir. Default is ${SPARK_HOME}/conf. +# SPARK_LOG_DIR Where log files are stored. ${SPARK_HOME}/logs by default. +# SPARK_MASTER host:path where spark code should be rsync'd from +# SPARK_IDENT_STRING A string representing this instance of spark. $USER by default +# SPARK_NICENESS The scheduling priority for daemons. Defaults to 0. +## + +usage="Usage: spark-daemon-run.sh [--config <conf-dir>] (class|submit) <spark-command> <spark-instance-number> <args...>" + +# if no args specified, show usage +if [ $# -le 1 ]; then + echo $usage + exit 1 +fi + +if [ -z "${SPARK_HOME}" ]; then + export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" +fi + +. "${SPARK_HOME}/sbin/spark-config.sh" + +# get arguments + +# Check if --config is passed as an argument. It is an optional parameter. +# Exit if the argument is not a directory. + +if [ "$1" == "--config" ] +then + shift + conf_dir="$1" + if [ ! -d "$conf_dir" ] + then + echo "ERROR : $conf_dir is not a directory" + echo $usage + exit 1 + else + export SPARK_CONF_DIR="$conf_dir" + fi + shift +fi + +mode=$1 +shift +command=$1 +shift +instance=$1 +shift + +spark_rotate_log () +{ + log=$1; + num=5; + if [ -n "$2" ]; then + num=$2 + fi + if [ -f "$log" ]; then # rotate logs + while [ $num -gt 1 ]; do + prev=`expr $num - 1` + [ -f "$log.$prev" ] && mv "$log.$prev" "$log.$num" + num=$prev + done + mv "$log" "$log.$num"; + fi +} + +. "${SPARK_HOME}/bin/load-spark-env.sh" + +if [ "$SPARK_IDENT_STRING" = "" ]; then + export SPARK_IDENT_STRING="$USER" +fi + + + +# get log directory +if [ "$SPARK_LOG_DIR" = "" ]; then + export SPARK_LOG_DIR="${SPARK_HOME}/logs" +fi +mkdir -p "$SPARK_LOG_DIR" +touch "$SPARK_LOG_DIR"/.spark_test > /dev/null 2>&1 +TEST_LOG_DIR=$? +if [ "${TEST_LOG_DIR}" = "0" ]; then + rm -f "$SPARK_LOG_DIR"/.spark_test +else + chown "$SPARK_IDENT_STRING" "$SPARK_LOG_DIR" +fi + +# some variables +log="$SPARK_LOG_DIR/spark-$SPARK_IDENT_STRING-$command-$instance-$HOSTNAME.out" + +# Set default scheduling priority +if [ "$SPARK_NICENESS" = "" ]; then + export SPARK_NICENESS=0 +fi + +if [ "$SPARK_MASTER" != "" ]; then + echo rsync from "$SPARK_MASTER" + rsync -a -e ssh --delete --exclude=.svn --exclude='logs/*' --exclude='contrib/hod/logs/*' "$SPARK_MASTER/" "${SPARK_HOME}" +fi + +spark_rotate_log "$log" +echo "running $command, logging to $log" + +case "$mode" in + (start) + exec nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-class $command "$@" >> "$log" 2>&1 < /dev/null + ;; + + (submit) + exec nice -n "$SPARK_NICENESS" "${SPARK_HOME}"/bin/spark-submit --class $command "$@" >> "$log" 2>&1 < /dev/null + ;; + + (*) + echo "unknown mode: $mode" + exit 1 + ;; +esac diff --git a/spark-env.sh b/spark-env.sh index fe83e9da2e9f..b11344f9722a 100644 --- a/spark-env.sh +++ b/spark-env.sh @@ -1,4 +1,6 @@ #!/usr/bin/env bash +export JAVA_HOME=/usr/lib/jvm/default-runtime +export SPARK_DIST_CLASSPATH=$(hadoop classpath) SPARK_MASTER_IP=127.0.0.1 SPARK_LOCAL_IP=127.0.0.1 |