Allow to specify hadoop minor version (2.4 and 2.6 at the moment) #56
base: branch-2.0
Changes from 29 commits
@@ -11,10 +11,15 @@ SCALA_VERSION="2.10.3"
if [[ "0.7.3 0.8.0 0.8.1" =~ $SPARK_VERSION ]]; then | ||
SCALA_VERSION="2.9.3" | ||
wget http://s3.amazonaws.com/spark-related-packages/scala-$SCALA_VERSION.tgz | ||
elif [[ "2.0.0" =~ $SPARK_VERSION ]]; then | ||
SCALA_VERSION="2.11.8" | ||
wget http://downloads.lightbend.com/scala/2.11.8/scala-$SCALA_VERSION.tgz | ||
Review comment: I've also uploaded this to the S3 bucket. Let's switch to that to avoid depending on the Lightbend source?
else
  wget http://s3.amazonaws.com/spark-related-packages/scala-$SCALA_VERSION.tgz
fi

echo "Unpacking Scala"
wget http://s3.amazonaws.com/spark-related-packages/scala-$SCALA_VERSION.tgz
tar xvzf scala-*.tgz > /tmp/spark-ec2_scala.log
rm scala-*.tgz
mv `ls -d scala-* | grep -v ec2` scala
@@ -24,119 +24,80 @@ then
# Pre-packaged spark version:
else
  case "$SPARK_VERSION" in
    0.7.3)
  case "$SPARK_VERSION" in
    # 0.7.3 - 1.0.2
    0\.[7-9]\.[0-3]|1\.0\.[0-2])
Review comment: I'm not sure this will work, as the 0.8.0 and 0.9.0 releases have "incubating" in their package names. My take would be to keep the existing long form for these early versions.
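For reference, a minimal sketch of the naming quirk this comment points at; the helper name is hypothetical, but the filenames are exactly the ones in the long-form arms being removed below:

def spark_package_basename(spark_version, hadoop_flavor):
    # hadoop_flavor is the suffix used in the spark-related-packages S3
    # bucket, e.g. "hadoop1", "cdh4", "hadoop2.4".
    if spark_version == "0.7.3":
        # 0.7.3 used "prebuilt" rather than "bin".
        return "spark-%s-prebuilt-%s.tgz" % (spark_version, hadoop_flavor)
    if spark_version in ("0.8.0", "0.8.1", "0.9.0"):
        # The Apache incubation releases carry an extra "-incubating",
        # which a plain spark-$SPARK_VERSION-bin-... template misses.
        return "spark-%s-incubating-bin-%s.tgz" % (spark_version, hadoop_flavor)
    return "spark-%s-bin-%s.tgz" % (spark_version, hadoop_flavor)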
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.7.3-prebuilt-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.7.3-prebuilt-cdh4.tgz | ||
fi | ||
;; | ||
0.8.0) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.0-incubating-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.0-incubating-bin-cdh4.tgz | ||
fi | ||
;; | ||
0.8.1) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.1-incubating-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.1-incubating-bin-cdh4.tgz | ||
fi | ||
;; | ||
0.9.0) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.0-incubating-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.0-incubating-bin-cdh4.tgz | ||
fi | ||
;; | ||
0.9.1) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.1-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.1-bin-cdh4.tgz | ||
fi | ||
;; | ||
0.9.2) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.2-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.2-bin-cdh4.tgz | ||
fi | ||
;; | ||
1.0.0) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.0-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.0-bin-cdh4.tgz | ||
fi | ||
;; | ||
1.0.1) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.1-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.1-bin-cdh4.tgz | ||
fi | ||
;; | ||
1.0.2) | ||
if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.2-bin-hadoop1.tgz | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.2-bin-cdh4.tgz | ||
fi | ||
;; | ||
    1.1.0)
      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-prebuilt-hadoop1.tgz
      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-prebuilt-cdh4.tgz
      else
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz
        echo "ERROR: Unsupported Hadoop major version"
        return 1
      fi
      ;;
    1.1.1)
      ;;
    # 1.1.0 - 1.3.0
    1\.[1-2]\.[0-9]*|1\.3\.0)
      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.1-bin-hadoop1.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.1-bin-cdh4.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
      elif [[ "$HADOOP_MAJOR_VERSION" == "yarn" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
      else
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.1-bin-hadoop2.4.tgz
        echo "ERROR: Unsupported Hadoop major version"
        return 1
      fi
      ;;
    1.2.0)
      ;;
    # 1.3.1 - 1.6.2
    1\.[3-6]\.[0-2])
      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.0-bin-hadoop1.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.0-bin-cdh4.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
      elif [[ "$HADOOP_MAJOR_VERSION" == "yarn" ]]; then
        if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then
          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
        elif [[ "$HADOOP_MINOR_VERSION" == "2.6" ]]; then
          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.6.tgz
        else
          echo "ERROR: Unknown Hadoop minor version"
          return 1
        fi
      else
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.0-bin-hadoop2.4.tgz
        echo "ERROR: Unsupported Hadoop major version"
        return 1
      fi
      ;;
    1.2.1)
      ;;
    # 2.0.0 - 2.0.1
    2\.0\.[0-1])
      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-hadoop1.tgz
        wget http://s3.amazonaws.com/spark-related-packages/spark-2.0.0-bin-hadoop1.tgz
Review comment: The version numbers here should not be hard-coded to 2.0.0? Also, it might be good to keep the future-proof solution we had of `spark-$SPARK_VERSION-bin-hadoop$HADOOP_MINOR_VERSION.tgz`? Thinking about this some more, I think we should do the checking of available Spark/Hadoop version combinations in the Python file (it's easier to read, review, and maintain than bash). Then the bash script just does the downloading and setup, to handle corner cases like …
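A rough sketch of that suggestion (the function name and error handling are assumptions, not part of this PR): spark_ec2.py owns the table of valid combinations and builds the URL, so the bash script never hard-codes a release.

VALID_HADOOP_MINOR_VERSIONS = set(["2.4", "2.6", "2.7"])

def spark_package_url(spark_version, hadoop_major_version, hadoop_minor_version):
    # Build the download URL for a pre-built Spark package from the two
    # Hadoop options, mirroring the branches in the bash case statement.
    base = "http://s3.amazonaws.com/spark-related-packages"
    if hadoop_major_version == "1":
        return "%s/spark-%s-bin-hadoop1.tgz" % (base, spark_version)
    if hadoop_major_version == "2":
        return "%s/spark-%s-bin-cdh4.tgz" % (base, spark_version)
    if hadoop_major_version == "yarn":
        if hadoop_minor_version not in VALID_HADOOP_MINOR_VERSIONS:
            raise ValueError("Unknown Hadoop minor version: %s" % hadoop_minor_version)
        return "%s/spark-%s-bin-hadoop%s.tgz" % (base, spark_version, hadoop_minor_version)
    raise ValueError("Unsupported Hadoop major version: %s" % hadoop_major_version)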
elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-cdh4.tgz | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-2.0.0-bin-cdh4.tgz | ||
elif [[ "$HADOOP_MAJOR_VERSION" == "yarn" ]]; then | ||
if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-2.0.0-bin-hadoop2.4.tgz | ||
elif [[ "$HADOOP_MINOR_VERSION" == "2.6" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-2.0.0-bin-hadoop2.6.tgz | ||
elif [[ "$HADOOP_MINOR_VERSION" == "2.7" ]]; then | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-2.0.0-bin-hadoop2.7.tgz | ||
else | ||
echo "ERROR: Unknown Hadoop version" | ||
return 1 | ||
fi | ||
else | ||
wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-hadoop2.4.tgz | ||
echo "ERROR: Unsupported Hadoop major version" | ||
return 1 | ||
fi | ||
;; | ||
;; | ||
    *)
      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
      else
        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
      fi
      if [ $? != 0 ]; then
        echo "ERROR: Unknown Spark version"
        return -1
        return 1
      fi
  esac
      ;;
  esac
echo "Unpacking Spark"
tar xvzf spark-*.tgz > /tmp/spark-ec2_spark.log
@@ -82,6 +82,12 @@
"2.0.1" | ||
]) | ||
|
||
VALID_HADOOP_MINOR_VERSIONS = set([ | ||
"2.4", | ||
"2.6", | ||
"2.7" | ||
]) | ||
|
||
SPARK_TACHYON_MAP = { | ||
"1.0.0": "0.4.1", | ||
"1.0.1": "0.4.1", | ||
|
@@ -241,7 +247,11 @@ def parse_args():
    parser.add_option(
        "--hadoop-major-version", default="yarn",
        help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " +
             "(Hadoop 2.4.0) (default: %default)")
             "(Hadoop 2.x) (default: %default)")
    parser.add_option(
        "--hadoop-minor-version", default="2.4",
        help="Minor version of Hadoop. Valid options are 2.4 (Hadoop 2.4.0), 2.6 (Hadoop 2.6.0) and 2.7 (Hadoop 2.7.0). " +
             "This only has an effect if yarn is specified as the Hadoop major version. (default: %default)")
    parser.add_option(
        "-D", metavar="[ADDRESS:]PORT", dest="proxy_port",
        help="Use SSH dynamic port forwarding to create a SOCKS proxy at " +
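For context, an invocation using the new flag would presumably look like this (key pair and cluster name are made up):

./spark-ec2 -k my-key -i my-key.pem --spark-version=2.0.0 --hadoop-major-version=yarn --hadoop-minor-version=2.6 launch my-cluster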
@@ -371,19 +381,35 @@ def get_or_make_group(conn, name, vpc_id):
print("Creating security group " + name) | ||
return conn.create_security_group(name, "Spark EC2 group", vpc_id) | ||
|
||
def validate_spark_hadoop_version(spark_version, hadoop_version): | ||
|
||
def validate_spark_hadoop_version(spark_version, hadoop_version, hadoop_minor_version): | ||
if "." in spark_version: | ||
parts = spark_version.split(".") | ||
if parts[0].isdigit(): | ||
if parts[0].isdigit() and parts[0].isdigit(): | ||
Review comment: Redundant if check? I guess this should be `parts[1].isdigit()`?
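If that reading is correct, the intended guard checks both components before converting them; a small self-contained version (hypothetical helper name):

def parse_spark_major_minor(spark_version):
    # Returns (major, minor) as an int tuple, or None for a malformed
    # version string, e.g. parse_spark_major_minor("2.0.1") == (2, 0).
    parts = spark_version.split(".")
    if len(parts) >= 2 and parts[0].isdigit() and parts[1].isdigit():
        return (int(parts[0]), int(parts[1]))
    return None

Comparing int tuples would also sidestep the major + minor/10 float arithmetic used below, which would misorder a hypothetical minor version of 10 or more (1 + 10/10 = 2.0).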
            spark_major_version = float(parts[0])
            if spark_major_version > 1.0 and hadoop_version != "yarn":
                print("Spark version: {v}, does not support Hadoop version: {hv}".
            spark_minor_version = float(parts[1])
            spark_major_minor_version = spark_major_version + (spark_minor_version / 10)
            if spark_major_minor_version > 1.0 and hadoop_version != "yarn":
                print("Spark version: {v}, does not support Hadoop major version: {hv}".
                      format(v=spark_version, hv=hadoop_version), file=stderr)
                sys.exit(1)
            if hadoop_version == "yarn" and hadoop_minor_version not in VALID_HADOOP_MINOR_VERSIONS:
                print("Spark version: {v}, does not support Hadoop minor version: {hm}, supported minor versions: {sv}".
                      format(v=spark_version, hm=hadoop_minor_version, sv=",".join(VALID_HADOOP_MINOR_VERSIONS)), file=stderr)
                sys.exit(1)
            if hadoop_minor_version == "2.7" and spark_major_minor_version < 2.0:
                print("Spark version: {v}, does not support Hadoop minor version: {hm}".
Review comment: Might be useful to list the supported minor versions. Also, can we make this a list at the top of the file? Might be easier to add more Hadoop versions later on.
Reply: OK, added.
                      format(v=spark_version, hm=hadoop_minor_version, sv=",".join(VALID_HADOOP_MINOR_VERSIONS)), file=stderr)
Review comment: The variable `sv` is passed in here but never used in the format string.
                sys.exit(1)
            if hadoop_minor_version == "2.6" and spark_major_minor_version < 1.3:
                print("Spark version: {v}, does not support Hadoop minor version: {hm}".
                      format(v=spark_version, hm=hadoop_minor_version, sv=",".join(VALID_HADOOP_MINOR_VERSIONS)), file=stderr)
                sys.exit(1)
    else:
        print("Invalid Spark version: {v}".format(v=spark_version), file=stderr)
        sys.exit(1)
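Putting the two review comments above together, a sketch of what that top-of-file table could look like (the names are assumptions; the minimum versions come from the checks in this hunk and from the download script):

MIN_SPARK_FOR_HADOOP_MINOR = {
    "2.4": (1, 1),  # hadoop2.4 packages exist from Spark 1.1 on
    "2.6": (1, 3),  # per the check above: 2.6 needs Spark >= 1.3
    "2.7": (2, 0),  # per the check above: 2.7 needs Spark >= 2.0
}

def check_hadoop_minor_version(spark_major_minor, hadoop_minor_version):
    # spark_major_minor is an (int, int) tuple such as (1, 6).
    minimum = MIN_SPARK_FOR_HADOOP_MINOR.get(hadoop_minor_version)
    if minimum is None or spark_major_minor < minimum:
        supported = ", ".join(sorted(MIN_SPARK_FOR_HADOOP_MINOR))
        raise ValueError("Hadoop minor version %s is not supported for this "
                         "Spark release; valid versions: %s"
                         % (hadoop_minor_version, supported))
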
def get_validate_spark_version(version, repo):
    if "." in version:
        # Remove leading v to handle inputs like v1.5.0
@@ -1086,7 +1112,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
if "." in opts.spark_version: | ||
# Pre-built Spark deploy | ||
spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) | ||
validate_spark_hadoop_version(spark_v, opts.hadoop_major_version) | ||
validate_spark_hadoop_version(spark_v, opts.hadoop_major_version, opts.hadoop_minor_version) | ||
tachyon_v = get_tachyon_version(spark_v) | ||
else: | ||
# Spark-only custom deploy | ||
|
@@ -1113,6 +1139,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
"spark_version": spark_v, | ||
"tachyon_version": tachyon_v, | ||
"hadoop_major_version": opts.hadoop_major_version, | ||
"hadoop_minor_version": opts.hadoop_minor_version, | ||
"spark_worker_instances": worker_instances_str, | ||
"spark_master_opts": opts.master_opts | ||
} | ||
|
@@ -1297,7 +1324,7 @@ def real_main():
    # Input parameter validation
    spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
    validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
    validate_spark_hadoop_version(spark_v, opts.hadoop_major_version, opts.hadoop_minor_version)

    if opts.wait is not None:
        # NOTE: DeprecationWarnings are silent in 2.7+ by default.
Review comment: I'm not sure we need a Scala installation on the cluster anymore, as Spark should just work with a JRE. But it seems fine to keep this if people find it useful.

Reply: I've never tried Spark without Scala - doesn't even spark-shell need Scala?

Reply: Yes - recent Spark distributions include the Scala libraries that provide the shell and other support. But since this is a useful thing regardless, let's keep it.