From 0020b7a78130b65fcd97d5865ed015e2d93a45b1 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 14 Jun 2016 16:09:52 -0700 Subject: [PATCH 01/30] Add support for 2.0.0-preview --- spark_ec2.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spark_ec2.py b/spark_ec2.py index f6aa00aa..8dbda352 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -76,6 +76,7 @@ "1.5.1", "1.5.2", "1.6.0", + "2.0.0-preview", ]) SPARK_TACHYON_MAP = { @@ -94,6 +95,7 @@ "1.5.1": "0.7.1", "1.5.2": "0.7.1", "1.6.0": "0.8.2", + "2.0.0-preview": "", } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION @@ -365,7 +367,8 @@ def get_or_make_group(conn, name, vpc_id): def get_validate_spark_version(version, repo): if "." in version: - version = version.replace("v", "") + # Remove leading v to handle inputs like v1.5.0 + version = version.lstrip("v") if version not in VALID_SPARK_VERSIONS: print("Don't know about Spark version: {v}".format(v=version), file=stderr) sys.exit(1) @@ -1057,8 +1060,10 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): # Spark-only custom deploy spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) tachyon_v = "" - print("Deploying Spark via git hash; Tachyon won't be set up") - modules = filter(lambda x: x != "tachyon", modules) + + if tachyon_v == "": + print("No valid Tachyon version found; Tachyon won't be set up") + modules = filter(lambda x: x != "tachyon", modules) master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] From 38b00953cb2d9376ad4ec9743c1104c3b5c8487b Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Tue, 14 Jun 2016 17:36:55 -0700 Subject: [PATCH 02/30] Check if hadoop version is YARN for Spark 2.0 --- spark_ec2.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index 8dbda352..6dd48f1e 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -364,6 +364,19 @@ def get_or_make_group(conn, name, vpc_id): print("Creating security group " + name) return conn.create_security_group(name, "Spark EC2 group", vpc_id) +def validate_spark_hadoop_version(spark_version, hadoop_version): + if "." in spark_version: + parts = spark_version.split(".") + if parts[0].isdigit(): + spark_major_version = float(parts[0]) + print("Got major spark version " + str(spark_major_version)) + if spark_major_version > 1.0 and hadoop_version != "yarn": + print("Spark version: {v}, does not support Hadoop version: {hv}". + format(v=spark_version, hv=hadoop_version), file=stderr) + sys.exit(1) + else: + print("Invalid Spark version: {v}".format(v=spark_version), file=stderr) + sys.exit(1) def get_validate_spark_version(version, repo): if "." in version: @@ -1055,6 +1068,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): if "." 
in opts.spark_version: # Pre-built Spark deploy spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + validate_spark_hadoop_version(spark_v, opts.hadoop_major_version) tachyon_v = get_tachyon_version(spark_v) else: # Spark-only custom deploy @@ -1264,7 +1278,8 @@ def real_main(): (opts, action, cluster_name) = parse_args() # Input parameter validation - get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) + validate_spark_hadoop_version(spark_v, opts.hadoop_major_version) if opts.wait is not None: # NOTE: DeprecationWarnings are silent in 2.7+ by default. From 11a29753d7d94cadbe5c7c8dd66cabf6d382b650 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Wed, 15 Jun 2016 09:31:27 -0700 Subject: [PATCH 03/30] Address code review comments --- spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index 6dd48f1e..f752a9bb 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -1077,7 +1077,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): if tachyon_v == "": print("No valid Tachyon version found; Tachyon won't be set up") - modules = filter(lambda x: x != "tachyon", modules) + modules.remove("tachyon") master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] From 79736f2456877185e60f498839e316a3d37f0c7c Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Wed, 15 Jun 2016 09:39:47 -0700 Subject: [PATCH 04/30] Remove debug print statement --- spark_ec2.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index f752a9bb..16764646 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -369,7 +369,6 @@ def validate_spark_hadoop_version(spark_version, hadoop_version): parts = spark_version.split(".") if parts[0].isdigit(): spark_major_version = float(parts[0]) - print("Got major spark version " + str(spark_major_version)) if spark_major_version > 1.0 and hadoop_version != "yarn": print("Spark version: {v}, does not support Hadoop version: {hv}". 
format(v=spark_version, hv=hadoop_version), file=stderr) From 8aff6d122944aeb8f702960c4110db9675b4f633 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Mon, 29 Aug 2016 10:41:57 -0700 Subject: [PATCH 05/30] Now that it's been released, enable launching with spark 2.0.0 --- spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index 16764646..aa018767 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -76,7 +76,7 @@ "1.5.1", "1.5.2", "1.6.0", - "2.0.0-preview", + "2.0.0", ]) SPARK_TACHYON_MAP = { From 59045a17d74600eb78b0dfd4daf3a837e322ac24 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Mon, 29 Aug 2016 11:44:12 -0700 Subject: [PATCH 06/30] Updated default spark version and hadoop version to 2.0.0 and yarn --- spark_ec2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spark_ec2.py b/spark_ec2.py index aa018767..13c02337 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -51,7 +51,7 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "1.6.0" +SPARK_EC2_VERSION = "2.0.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -76,6 +76,7 @@ "1.5.1", "1.5.2", "1.6.0", + "2.0.0-preview", "2.0.0", ]) @@ -103,7 +104,7 @@ # Default location to get the spark-ec2 scripts (and ami-list) from DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-1.5" +DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" def setup_external_libs(libs): @@ -236,7 +237,7 @@ def parse_args(): "the directory is not created and its contents are copied directly into /. " + "(default: %default).") parser.add_option( - "--hadoop-major-version", default="1", + "--hadoop-major-version", default="yarn", help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " + "(Hadoop 2.4.0) (default: %default)") parser.add_option( From 783a0753e8d818ace3990d35965f78937c96b42d Mon Sep 17 00:00:00 2001 From: Aaron Johnson Date: Tue, 6 Sep 2016 22:41:27 -0600 Subject: [PATCH 07/30] Apply --additional-tags to EBS volumes - Add --tag-volumes option - Apply --additional-tags to any EBS volumes that are attached to any master or slave instances. 
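As an illustration (the key pair, identity file, and cluster name below are hypothetical placeholders, not part of this patch), the two options are expected to be used together roughly like this:

    ./spark_ec2.py -k my-keypair -i ~/my-keypair.pem \
        --additional-tags "Task:MySparkProject,Env:production" \
        --tag-volumes \
        launch my-cluster

Note that --tag-volumes has no effect without --additional-tags; the patch prints a warning in that case.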
--- spark_ec2.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/spark_ec2.py b/spark_ec2.py index 13c02337..74dd4ffa 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -310,6 +310,10 @@ def parse_args(): "--additional-tags", type="string", default="", help="Additional tags to set on the machines; tags are comma-separated, while name and " + "value are colon separated; ex: \"Task:MySparkProject,Env:production\"") + parser.add_option( + "--tag-volumes", action="store_true", default=False, + help="Apply the tags given in --additional-tags to any EBS volumes " + + "attached to master and slave instances.") parser.add_option( "--copy-aws-credentials", action="store_true", default=False, help="Add AWS credentials to hadoop configuration to allow Spark to access S3") @@ -751,16 +755,28 @@ def launch_cluster(conn, opts, cluster_name): map(str.strip, tag.split(':', 1)) for tag in opts.additional_tags.split(',') ) + print('Applying tags to master nodes') for master in master_nodes: master.add_tags( dict(additional_tags, Name='{cn}-master-{iid}'.format(cn=cluster_name, iid=master.id)) ) + print('Applying tags to slave nodes') for slave in slave_nodes: slave.add_tags( dict(additional_tags, Name='{cn}-slave-{iid}'.format(cn=cluster_name, iid=slave.id)) ) + if opts.tag_volumes: + if len(additional_tags) > 0: + print('Applying tags to volumes') + all_instance_ids = [x.id for x in master_nodes + slave_nodes] + volumes = conn.get_all_volumes(filters={'attachment.instance-id': all_instance_ids}) + for v in volumes: + v.add_tags(additional_tags) + else: + print('--tag-volumes has no effect without --additional-tags') + # Return all the instances return (master_nodes, slave_nodes) From bd25efac05a612b2ad2d41aadc177798aee74339 Mon Sep 17 00:00:00 2001 From: Eemil Lagerspetz Date: Sun, 9 Oct 2016 08:50:27 +0300 Subject: [PATCH 08/30] Add Spark 2.0.1 to valid spark versions. 
--- spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spark_ec2.py b/spark_ec2.py index 74dd4ffa..5cebb761 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -78,6 +78,7 @@ "1.6.0", "2.0.0-preview", "2.0.0", + "2.0.1" ]) SPARK_TACHYON_MAP = { From 78280cb9222de3c78ee6807b233798bb81479d5e Mon Sep 17 00:00:00 2001 From: Eemil Lagerspetz Date: Mon, 10 Oct 2016 09:24:25 +0300 Subject: [PATCH 09/30] Added also Spark 1.6.2 --- spark_ec2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spark_ec2.py b/spark_ec2.py index 5cebb761..075c863d 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -76,6 +76,7 @@ "1.5.1", "1.5.2", "1.6.0", + "1.6.2", "2.0.0-preview", "2.0.0", "2.0.1" From bafa07c282e61fbb2f41cf20758fe58b4657e7c0 Mon Sep 17 00:00:00 2001 From: Eemil Lagerspetz Date: Tue, 15 Nov 2016 10:02:15 +0200 Subject: [PATCH 10/30] Add missing 1.6.1 and new 1.6.3 and 2.0.2 --- spark_ec2.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index 075c863d..14a50f7c 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -76,10 +76,13 @@ "1.5.1", "1.5.2", "1.6.0", + "1.6.1", "1.6.2", + "1.6.3, "2.0.0-preview", "2.0.0", - "2.0.1" + "2.0.1", + "2.0.2" ]) SPARK_TACHYON_MAP = { From 5188c78a0e9e5d34164ccbe1a361aeaac653bcc0 Mon Sep 17 00:00:00 2001 From: Shivaram Venkataraman Date: Wed, 16 Nov 2016 14:18:08 -0800 Subject: [PATCH 11/30] Fix missing close quote --- spark_ec2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index 14a50f7c..8d23a2fc 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -78,7 +78,7 @@ "1.6.0", "1.6.1", "1.6.2", - "1.6.3, + "1.6.3", "2.0.0-preview", "2.0.0", "2.0.1", From fcbe85f4bbc58889cbf7d0dbb8bebf8e916241f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Werner?= Date: Tue, 10 Jan 2017 17:33:29 +0100 Subject: [PATCH 12/30] Get rid of useless mount flag See function __atime_needs_update in fs/inode.c of Linux --- create_image.sh | 4 ++-- setup-slave.sh | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/create_image.sh b/create_image.sh index 05a8cfd3..af6557be 100755 --- a/create_image.sh +++ b/create_image.sh @@ -38,12 +38,12 @@ sudo sed -i 's/.*ephemeral.*//g' /etc/cloud/cloud.cfg sudo sed -i 's/.*swap.*//g' /etc/cloud/cloud.cfg echo "mounts:" >> /etc/cloud/cloud.cfg -echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime,nodiratime\", "\ +echo " - [ ephemeral0, /mnt, auto, \"defaults,noatime\", "\ "\"0\", \"0\" ]" >> /etc/cloud.cloud.cfg for x in {1..23}; do echo " - [ ephemeral$x, /mnt$((x + 1)), auto, "\ - "\"defaults,noatime,nodiratime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg + "\"defaults,noatime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg done # Install Maven (for Hadoop) diff --git a/setup-slave.sh b/setup-slave.sh index 76372d9a..cf46f069 100755 --- a/setup-slave.sh +++ b/setup-slave.sh @@ -30,24 +30,24 @@ echo "Setting up slave on `hostname`... of type $instance_type" if [[ $instance_type == r3* || $instance_type == i2* || $instance_type == hi1* ]]; then # Format & mount using ext4, which has the best performance among ext3, ext4, and xfs based # on our shuffle heavy benchmark - EXT4_MOUNT_OPTS="defaults,noatime,nodiratime" + EXT4_MOUNT_OPTS="defaults,noatime" rm -rf /mnt* mkdir /mnt # To turn TRIM support on, uncomment the following line. 
- #echo '/dev/sdb /mnt ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab + #echo '/dev/sdb /mnt ext4 defaults,noatime,discard 0 0' >> /etc/fstab mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdb mount -o $EXT4_MOUNT_OPTS /dev/sdb /mnt if [[ $instance_type == "r3.8xlarge" || $instance_type == "hi1.4xlarge" ]]; then mkdir /mnt2 # To turn TRIM support on, uncomment the following line. - #echo '/dev/sdc /mnt2 ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab + #echo '/dev/sdc /mnt2 ext4 defaults,noatime,discard 0 0' >> /etc/fstab if [[ $instance_type == "r3.8xlarge" ]]; then mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdc mount -o $EXT4_MOUNT_OPTS /dev/sdc /mnt2 fi # To turn TRIM support on, uncomment the following line. - #echo '/dev/sdf /mnt2 ext4 defaults,noatime,nodiratime,discard 0 0' >> /etc/fstab + #echo '/dev/sdf /mnt2 ext4 defaults,noatime,discard 0 0' >> /etc/fstab if [[ $instance_type == "hi1.4xlarge" ]]; then mkfs.ext4 -E lazy_itable_init=0,lazy_journal_init=0 /dev/sdf mount -o $EXT4_MOUNT_OPTS /dev/sdf /mnt2 @@ -57,7 +57,7 @@ fi # Mount options to use for ext3 and xfs disks (the ephemeral disks # are ext3, but we use xfs for EBS volumes to format them faster) -XFS_MOUNT_OPTS="defaults,noatime,nodiratime,allocsize=8m" +XFS_MOUNT_OPTS="defaults,noatime,allocsize=8m" function setup_ebs_volume { device=$1 From 045507acd92366d603e4cec3513741aa5efaa9ad Mon Sep 17 00:00:00 2001 From: "S.Gu" Date: Sun, 19 Feb 2017 21:44:56 -0500 Subject: [PATCH 13/30] add missing spark_version 2.1.0 --- spark_ec2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index 8d23a2fc..ead72353 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -82,7 +82,8 @@ "2.0.0-preview", "2.0.0", "2.0.1", - "2.0.2" + "2.0.2", + "2.1.0" ]) SPARK_TACHYON_MAP = { From 697e802a600b45769e182420a4837325200a4f4e Mon Sep 17 00:00:00 2001 From: Eemil Lagerspetz Date: Wed, 21 Jun 2017 07:50:23 +0000 Subject: [PATCH 14/30] Added Spark 2.1.1. --- spark_ec2.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spark_ec2.py b/spark_ec2.py index ead72353..7c8656cf 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -83,7 +83,8 @@ "2.0.0", "2.0.1", "2.0.2", - "2.1.0" + "2.1.0", + "2.1.1" ]) SPARK_TACHYON_MAP = { From d9c9326058fef475200b248cfe8214c04c2a4727 Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Tue, 15 Aug 2017 21:40:11 -0700 Subject: [PATCH 15/30] Updates for spark 2.2.0 - use scala 2.11.x --- scala/init.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scala/init.sh b/scala/init.sh index 73a299f5..8450df6c 100755 --- a/scala/init.sh +++ b/scala/init.sh @@ -7,7 +7,7 @@ if [ -d "scala" ]; then return 0 fi -SCALA_VERSION="2.10.3" +SCALA_VERSION="2.11.8" if [[ "0.7.3 0.8.0 0.8.1" =~ $SPARK_VERSION ]]; then SCALA_VERSION="2.9.3" From f21fb92c03541a2e14baa047685569496298cf2e Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Tue, 15 Aug 2017 21:41:44 -0700 Subject: [PATCH 16/30] Updates for spark 2.2.0 - update versions and use this spark-ec2 repo/branch --- spark_ec2.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/spark_ec2.py b/spark_ec2.py index 7c8656cf..175aae1c 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -51,7 +51,11 @@ raw_input = input xrange = range -SPARK_EC2_VERSION = "2.0.0" +# spark-2.2.0-bin-hadoop2.7.tgz +# For the Scala API, Spark 2.2.0 uses Scala 2.11. You will need to use a compatible Scala version (2.11.x). 
+# Versions of Hadoop before 2.6 are deprecated as of Spark 2.1.0, and may be removed in Spark 2.2.0. + +SPARK_EC2_VERSION = "2.2.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) VALID_SPARK_VERSIONS = set([ @@ -84,9 +88,12 @@ "2.0.1", "2.0.2", "2.1.0", - "2.1.1" + "2.1.1", + "2.2.0" ]) +# https://spark.apache.org/releases/spark-release-2-0-0.html#removals-behavior-changes-and-deprecations +# Removed : Block-oriented integration with Tachyon (subsumed by file system integration) SPARK_TACHYON_MAP = { "1.0.0": "0.4.1", "1.0.1": "0.4.1", @@ -103,15 +110,15 @@ "1.5.1": "0.7.1", "1.5.2": "0.7.1", "1.6.0": "0.8.2", - "2.0.0-preview": "", + # nothing for spark >= 2.x } DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" # Default location to get the spark-ec2 scripts (and ami-list) from -DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/amplab/spark-ec2" -DEFAULT_SPARK_EC2_BRANCH = "branch-2.0" +DEFAULT_SPARK_EC2_GITHUB_REPO = "https://github.com/sul-dlss/spark-ec2" +DEFAULT_SPARK_EC2_BRANCH = "spark-2.2.0" def setup_external_libs(libs): From 435d06b86f4fe8e66883fc3fb0b33f85f80bc8a2 Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Tue, 15 Aug 2017 22:28:02 -0700 Subject: [PATCH 17/30] Updates for spark 2.2.0 - update spark-hadhoop download dependency to 2.7 --- spark/init.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spark/init.sh b/spark/init.sh index 71fbc7bf..f6437dbc 100755 --- a/spark/init.sh +++ b/spark/init.sh @@ -124,6 +124,9 @@ else wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-hadoop2.4.tgz fi ;; + 2.2.0) + wget http://s3.amazonaws.com/spark-related-packages/spark-2.2.0-bin-hadoop2.7.tgz + ;; *) if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz From c84a77bda5082e9838d065c9452f0433a5b0aafb Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 09:20:38 -0700 Subject: [PATCH 18/30] Update to java-1.8.0 and use {{java_home}} consistently in templates --- create_image.sh | 8 +++++--- templates/root/ephemeral-hdfs/conf/yarn-env.sh | 12 ------------ templates/root/mapreduce/conf/hadoop-env.sh | 2 +- templates/root/tachyon/conf/tachyon-env.sh | 2 +- 4 files changed, 7 insertions(+), 17 deletions(-) diff --git a/create_image.sh b/create_image.sh index af6557be..a8e0669f 100755 --- a/create_image.sh +++ b/create_image.sh @@ -11,12 +11,14 @@ if [ "$(id -u)" != "0" ]; then fi # Dev tools -sudo yum install -y java-1.7.0-openjdk-devel gcc gcc-c++ ant git +sudo yum install -y gcc gcc-c++ ant git +sudo yum install -y java-1.8.0-openjdk-devel + # Perf tools sudo yum install -y dstat iotop strace sysstat htop perf sudo debuginfo-install -q -y glibc sudo debuginfo-install -q -y kernel -sudo yum --enablerepo='*-debug*' install -q -y java-1.7.0-openjdk-debuginfo.x86_64 +sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo # PySpark and MLlib deps sudo yum install -y python-matplotlib python-tornado scipy libgfortran @@ -54,7 +56,7 @@ mv apache-maven-3.2.3 /opt/ # Edit bash profile echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile -echo "export JAVA_HOME=/usr/lib/jvm/java-1.7.0" >> ~/.bash_profile +echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile echo "export M2_HOME=/opt/apache-maven-3.2.3" >> ~/.bash_profile echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile diff --git a/templates/root/ephemeral-hdfs/conf/yarn-env.sh b/templates/root/ephemeral-hdfs/conf/yarn-env.sh index 77e62194..a1f1a3e3 100644 --- a/templates/root/ephemeral-hdfs/conf/yarn-env.sh +++ b/templates/root/ephemeral-hdfs/conf/yarn-env.sh @@ -21,19 +21,7 @@ export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn} export YARN_CONF_DIR="/root/ephemeral-hdfs/conf" # some Java parameters -# export JAVA_HOME=/home/y/libexec/jdk1.6.0/ -#if [ "$JAVA_HOME" != "" ]; then -# #echo "run java in $JAVA_HOME" -# JAVA_HOME=$JAVA_HOME -#fi -# -#if [ "$JAVA_HOME" = "" ]; then -# echo "Error: JAVA_HOME is not set." -# exit 1 -#fi - export JAVA_HOME={{java_home}} - JAVA=$JAVA_HOME/bin/java JAVA_HEAP_MAX=-Xmx1000m diff --git a/templates/root/mapreduce/conf/hadoop-env.sh b/templates/root/mapreduce/conf/hadoop-env.sh index 02f7d972..0c120d0b 100755 --- a/templates/root/mapreduce/conf/hadoop-env.sh +++ b/templates/root/mapreduce/conf/hadoop-env.sh @@ -6,7 +6,7 @@ # remote nodes. # The java implementation to use. Required. -export JAVA_HOME=/usr/lib/jvm/java-1.7.0 +export JAVA_HOME={{java_home}} # Extra Java CLASSPATH elements. Optional. # export HADOOP_CLASSPATH= diff --git a/templates/root/tachyon/conf/tachyon-env.sh b/templates/root/tachyon/conf/tachyon-env.sh index 5d70cf82..ae0c6e7b 100644 --- a/templates/root/tachyon/conf/tachyon-env.sh +++ b/templates/root/tachyon/conf/tachyon-env.sh @@ -20,7 +20,7 @@ if [[ `uname -a` == Darwin* ]]; then else # Assuming Linux if [ -z "$JAVA_HOME" ]; then - export JAVA_HOME=/usr/lib/jvm/java-1.7.0 + export JAVA_HOME={{java_home}} fi export TACHYON_RAM_FOLDER=/mnt/ramdisk fi From 535d0709e494104abbc2b570029acc0d9e2ad869 Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 11:17:38 -0700 Subject: [PATCH 19/30] Update hadoop to 2.7.4 --- create_image.sh | 9 +++++---- ephemeral-hdfs/init.sh | 5 +++-- persistent-hdfs/init.sh | 5 +++-- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/create_image.sh b/create_image.sh index a8e0669f..f220cea1 100755 --- a/create_image.sh +++ b/create_image.sh @@ -63,14 +63,15 @@ echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile source ~/.bash_profile # Build Hadoop to install native libs +hadoop_version="2.7.4" sudo mkdir /root/hadoop-native cd /tmp sudo yum install -y protobuf-compiler cmake openssl-devel -wget "http://archive.apache.org/dist/hadoop/common/hadoop-2.4.1/hadoop-2.4.1-src.tar.gz" -tar xvzf hadoop-2.4.1-src.tar.gz -cd hadoop-2.4.1-src +wget "http://archive.apache.org/dist/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}-src.tar.gz" +tar xvzf "hadoop-${hadoop_version}-src.tar.gz" +cd "hadoop-${hadoop_version}-src" mvn package -Pdist,native -DskipTests -Dtar -sudo mv hadoop-dist/target/hadoop-2.4.1/lib/native/* /root/hadoop-native +sudo mv "hadoop-dist/target/hadoop-${hadoop_version}/lib/native/*" /root/hadoop-native # Install Snappy lib (for Hadoop) yum install -y snappy diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh index 0e18bca8..014b94f2 100755 --- a/ephemeral-hdfs/init.sh +++ b/ephemeral-hdfs/init.sh @@ -30,11 +30,12 @@ case "$HADOOP_MAJOR_VERSION" in cp /root/hadoop-native/* /root/ephemeral-hdfs/lib/native/ ;; yarn) - wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz + hadoop_version="2.7.4" + wget http://archive.apache.org/dist/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}.tar.gz echo "Unpacking Hadoop" tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log rm hadoop-*.tar.gz - mv hadoop-2.4.0/ ephemeral-hdfs/ + mv hadoop-${hadoop_version}/ ephemeral-hdfs/ # Have single conf dir rm -rf /root/ephemeral-hdfs/etc/hadoop/ diff --git a/persistent-hdfs/init.sh b/persistent-hdfs/init.sh index 735cebcc..2563a00c 100755 --- a/persistent-hdfs/init.sh +++ b/persistent-hdfs/init.sh @@ -29,11 +29,12 @@ case "$HADOOP_MAJOR_VERSION" in cp /root/hadoop-native/* /root/persistent-hdfs/lib/native/ ;; yarn) - wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz + hadoop_version="2.7.4" + wget http://archive.apache.org/dist/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}.tar.gz echo "Unpacking Hadoop" tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log rm hadoop-*.tar.gz - mv hadoop-2.4.0/ persistent-hdfs/ + mv hadoop-${hadoop_version}/ persistent-hdfs/ # Have single conf dir rm -rf /root/persistent-hdfs/etc/hadoop/ From 9f613cce7b9e7e3f3458a1f5e73ea4d5a22bdaa9 Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 14:31:49 -0700 Subject: [PATCH 20/30] fu Update to java-1.8.0 --- create_image.sh | 15 ++++++++++++++- spark_ec2.py | 8 ++++++-- templates/root/ephemeral-hdfs/conf/yarn-env.sh | 5 +++-- templates/root/mapreduce/conf/hadoop-env.sh | 4 +++- templates/root/tachyon/conf/tachyon-env.sh | 5 +++-- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/create_image.sh b/create_image.sh index f220cea1..937d6cfa 100755 --- a/create_image.sh +++ b/create_image.sh @@ -12,12 +12,21 @@ fi # Dev tools sudo yum install -y gcc gcc-c++ ant git + +# Install java-7 for Hadoop 2.7.x +# Install java-8 for Spark 2.2.x +# Installing java-8 second should set it as the default +sudo yum install -y java-1.7.0-openjdk-devel sudo yum install -y java-1.8.0-openjdk-devel +ls -lh /usr/lib/jvm +which javac +javac -version # Perf tools sudo yum install -y dstat iotop strace sysstat htop perf sudo debuginfo-install -q -y glibc sudo debuginfo-install -q -y kernel +sudo yum --enablerepo='*-debug*' install -q -y java-1.7.0-openjdk-debuginfo sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo # PySpark and MLlib deps @@ -62,8 +71,9 @@ echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile source ~/.bash_profile -# Build Hadoop to install native libs +# Build Hadoop to install native libs - Hadoop 2.7 requires java-7 hadoop_version="2.7.4" +export JAVA_HOME="/usr/lib/jvm/java-1.7.0" # temporarily use java-7 sudo mkdir /root/hadoop-native cd /tmp sudo yum install -y protobuf-compiler cmake openssl-devel @@ -73,6 +83,9 @@ cd "hadoop-${hadoop_version}-src" mvn package -Pdist,native -DskipTests -Dtar sudo mv "hadoop-dist/target/hadoop-${hadoop_version}/lib/native/*" /root/hadoop-native +# Reset JAVA_HOME etc. after building hadoop +source ~/.bash_profile + # Install Snappy lib (for Hadoop) yum install -y snappy ln -sf /usr/lib64/libsnappy.so.1 /root/hadoop-native/. diff --git a/spark_ec2.py b/spark_ec2.py index 175aae1c..aea8885e 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -52,8 +52,12 @@ xrange = range # spark-2.2.0-bin-hadoop2.7.tgz -# For the Scala API, Spark 2.2.0 uses Scala 2.11. You will need to use a compatible Scala version (2.11.x). -# Versions of Hadoop before 2.6 are deprecated as of Spark 2.1.0, and may be removed in Spark 2.2.0. + +# Spark 2.2.0 overview notes: +# - Spark runs on Java 8+, Python 2.7+/3.4+ and R 3.1+. +# - For the Scala API, Spark 2.2.0 uses Scala 2.11. You will need to use a compatible Scala version (2.11.x). +# - Note that support for Java 7, Python 2.6 and old Hadoop versions before 2.6.5 were removed as of Spark 2.2.0. +# - Note that support for Scala 2.10 is deprecated as of Spark 2.1.0, and may be removed in Spark 2.3.0. 
SPARK_EC2_VERSION = "2.2.0" SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__)) diff --git a/templates/root/ephemeral-hdfs/conf/yarn-env.sh b/templates/root/ephemeral-hdfs/conf/yarn-env.sh index a1f1a3e3..d6de55df 100644 --- a/templates/root/ephemeral-hdfs/conf/yarn-env.sh +++ b/templates/root/ephemeral-hdfs/conf/yarn-env.sh @@ -20,8 +20,9 @@ export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn} #export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}" export YARN_CONF_DIR="/root/ephemeral-hdfs/conf" -# some Java parameters -export JAVA_HOME={{java_home}} +# Java parameters - Hadoop requires java-7 +export JAVA_HOME="/usr/lib/jvm/java-1.7.0" + JAVA=$JAVA_HOME/bin/java JAVA_HEAP_MAX=-Xmx1000m diff --git a/templates/root/mapreduce/conf/hadoop-env.sh b/templates/root/mapreduce/conf/hadoop-env.sh index 0c120d0b..ac3b937b 100755 --- a/templates/root/mapreduce/conf/hadoop-env.sh +++ b/templates/root/mapreduce/conf/hadoop-env.sh @@ -6,7 +6,9 @@ # remote nodes. # The java implementation to use. Required. -export JAVA_HOME={{java_home}} +# Hadoop 2.x requires java-7 (it's not compatible with java-8), see +# https://wiki.apache.org/hadoop/HadoopJavaVersions +export JAVA_HOME=/usr/lib/jvm/java-1.7.0 # Extra Java CLASSPATH elements. Optional. # export HADOOP_CLASSPATH= diff --git a/templates/root/tachyon/conf/tachyon-env.sh b/templates/root/tachyon/conf/tachyon-env.sh index ae0c6e7b..7e4f1ba6 100644 --- a/templates/root/tachyon/conf/tachyon-env.sh +++ b/templates/root/tachyon/conf/tachyon-env.sh @@ -20,12 +20,13 @@ if [[ `uname -a` == Darwin* ]]; then else # Assuming Linux if [ -z "$JAVA_HOME" ]; then - export JAVA_HOME={{java_home}} + # TODO: Does tachyon work with java-8? + export JAVA_HOME=/usr/lib/jvm/java-1.7.0 + export JAVA="$JAVA_HOME/bin/java" fi export TACHYON_RAM_FOLDER=/mnt/ramdisk fi -export JAVA="$JAVA_HOME/bin/java" export TACHYON_MASTER_ADDRESS={{active_master}} export TACHYON_UNDERFS_ADDRESS=hdfs://{{active_master}}:9000 #export TACHYON_UNDERFS_ADDRESS=hdfs://localhost:9000 From 0f8a427a80d5e00dea12b39029c05b83222a82a2 Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 15:38:45 -0700 Subject: [PATCH 21/30] HADOOP/MAPREDUCE/TACHYON - removed entirely --- create_image.sh | 28 +---- .../root/spark-ec2/ec2-variables.sh | 8 +- ephemeral-hdfs/init.sh | 51 -------- ephemeral-hdfs/setup-slave.sh | 26 ---- ephemeral-hdfs/setup.sh | 49 -------- mapreduce/init.sh | 23 ---- mapreduce/setup.sh | 11 -- persistent-hdfs/init.sh | 50 -------- persistent-hdfs/setup-slave.sh | 8 -- persistent-hdfs/setup.sh | 22 ---- tachyon/init.sh | 70 ----------- tachyon/setup.sh | 9 -- .../conf/capacity-scheduler.xml | 111 ----------------- .../root/ephemeral-hdfs/conf/core-site.xml | 68 ----------- .../root/ephemeral-hdfs/conf/hadoop-env.sh | 69 ----------- .../conf/hadoop-metrics2.properties | 46 ------- .../root/ephemeral-hdfs/conf/hdfs-site.xml | 61 ---------- .../root/ephemeral-hdfs/conf/mapred-site.xml | 34 ------ templates/root/ephemeral-hdfs/conf/masters | 1 - templates/root/ephemeral-hdfs/conf/slaves | 1 - .../root/ephemeral-hdfs/conf/yarn-env.sh | 115 ------------------ .../root/ephemeral-hdfs/conf/yarn-site.xml | 60 --------- templates/root/mapreduce/conf/core-site.xml | 27 ---- templates/root/mapreduce/conf/hadoop-env.sh | 72 ----------- templates/root/mapreduce/conf/hdfs-site.xml | 11 -- templates/root/mapreduce/conf/mapred-site.xml | 29 ----- templates/root/mapreduce/conf/masters | 1 - templates/root/mapreduce/conf/slaves | 1 - templates/root/mapreduce/hadoop.version | 1 - .../root/persistent-hdfs/conf/core-site.xml | 68 ----------- .../root/persistent-hdfs/conf/hadoop-env.sh | 69 ----------- .../root/persistent-hdfs/conf/hdfs-site.xml | 101 --------------- .../root/persistent-hdfs/conf/mapred-site.xml | 29 ----- templates/root/persistent-hdfs/conf/masters | 1 - templates/root/persistent-hdfs/conf/slaves | 1 - templates/root/spark/conf/core-site.xml | 35 ------ templates/root/spark/conf/spark-defaults.conf | 6 - templates/root/spark/conf/spark-env.sh | 8 +- templates/root/tachyon/conf/slaves | 1 - templates/root/tachyon/conf/tachyon-env.sh | 57 --------- templates/root/tachyon/conf/workers | 1 - 41 files changed, 7 insertions(+), 1433 deletions(-) delete mode 100755 ephemeral-hdfs/init.sh delete mode 100755 ephemeral-hdfs/setup-slave.sh delete mode 100755 ephemeral-hdfs/setup.sh delete mode 100755 mapreduce/init.sh delete mode 100755 mapreduce/setup.sh delete mode 100755 persistent-hdfs/init.sh delete mode 100755 persistent-hdfs/setup-slave.sh delete mode 100755 persistent-hdfs/setup.sh delete mode 100755 tachyon/init.sh delete mode 100755 tachyon/setup.sh delete mode 100644 templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml delete mode 100644 templates/root/ephemeral-hdfs/conf/core-site.xml delete mode 100755 templates/root/ephemeral-hdfs/conf/hadoop-env.sh delete mode 100644 templates/root/ephemeral-hdfs/conf/hadoop-metrics2.properties delete mode 100644 templates/root/ephemeral-hdfs/conf/hdfs-site.xml delete mode 100644 templates/root/ephemeral-hdfs/conf/mapred-site.xml delete mode 100644 templates/root/ephemeral-hdfs/conf/masters delete mode 100644 templates/root/ephemeral-hdfs/conf/slaves delete mode 100644 templates/root/ephemeral-hdfs/conf/yarn-env.sh delete mode 100644 templates/root/ephemeral-hdfs/conf/yarn-site.xml delete mode 100644 templates/root/mapreduce/conf/core-site.xml delete mode 100755 templates/root/mapreduce/conf/hadoop-env.sh delete mode 100644 templates/root/mapreduce/conf/hdfs-site.xml delete mode 100644 templates/root/mapreduce/conf/mapred-site.xml delete mode 100644 templates/root/mapreduce/conf/masters 
delete mode 100644 templates/root/mapreduce/conf/slaves delete mode 100644 templates/root/mapreduce/hadoop.version delete mode 100644 templates/root/persistent-hdfs/conf/core-site.xml delete mode 100755 templates/root/persistent-hdfs/conf/hadoop-env.sh delete mode 100644 templates/root/persistent-hdfs/conf/hdfs-site.xml delete mode 100644 templates/root/persistent-hdfs/conf/mapred-site.xml delete mode 100644 templates/root/persistent-hdfs/conf/masters delete mode 100644 templates/root/persistent-hdfs/conf/slaves delete mode 100644 templates/root/tachyon/conf/slaves delete mode 100644 templates/root/tachyon/conf/tachyon-env.sh delete mode 100644 templates/root/tachyon/conf/workers diff --git a/create_image.sh b/create_image.sh index 937d6cfa..ff4d3e7e 100755 --- a/create_image.sh +++ b/create_image.sh @@ -13,20 +13,13 @@ fi # Dev tools sudo yum install -y gcc gcc-c++ ant git -# Install java-7 for Hadoop 2.7.x # Install java-8 for Spark 2.2.x -# Installing java-8 second should set it as the default -sudo yum install -y java-1.7.0-openjdk-devel sudo yum install -y java-1.8.0-openjdk-devel -ls -lh /usr/lib/jvm -which javac -javac -version # Perf tools sudo yum install -y dstat iotop strace sysstat htop perf sudo debuginfo-install -q -y glibc sudo debuginfo-install -q -y kernel -sudo yum --enablerepo='*-debug*' install -q -y java-1.7.0-openjdk-debuginfo sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo # PySpark and MLlib deps @@ -57,7 +50,7 @@ for x in {1..23}; do "\"defaults,noatime\", \"0\", \"0\" ]" >> /etc/cloud/cloud.cfg done -# Install Maven (for Hadoop) +# Install Maven cd /tmp wget "http://archive.apache.org/dist/maven/maven-3/3.2.3/binaries/apache-maven-3.2.3-bin.tar.gz" tar xvzf apache-maven-3.2.3-bin.tar.gz @@ -71,25 +64,6 @@ echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile source ~/.bash_profile -# Build Hadoop to install native libs - Hadoop 2.7 requires java-7 -hadoop_version="2.7.4" -export JAVA_HOME="/usr/lib/jvm/java-1.7.0" # temporarily use java-7 -sudo mkdir /root/hadoop-native -cd /tmp -sudo yum install -y protobuf-compiler cmake openssl-devel -wget "http://archive.apache.org/dist/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}-src.tar.gz" -tar xvzf "hadoop-${hadoop_version}-src.tar.gz" -cd "hadoop-${hadoop_version}-src" -mvn package -Pdist,native -DskipTests -Dtar -sudo mv "hadoop-dist/target/hadoop-${hadoop_version}/lib/native/*" /root/hadoop-native - -# Reset JAVA_HOME etc. after building hadoop -source ~/.bash_profile - -# Install Snappy lib (for Hadoop) -yum install -y snappy -ln -sf /usr/lib64/libsnappy.so.1 /root/hadoop-native/. - # Create /usr/bin/realpath which is used by R to find Java installations # NOTE: /usr/bin/realpath is missing in CentOS AMIs. See # http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5 diff --git a/deploy.generic/root/spark-ec2/ec2-variables.sh b/deploy.generic/root/spark-ec2/ec2-variables.sh index 4f3e8da8..7b28e547 100644 --- a/deploy.generic/root/spark-ec2/ec2-variables.sh +++ b/deploy.generic/root/spark-ec2/ec2-variables.sh @@ -20,13 +20,13 @@ # These variables are automatically filled in by the spark-ec2 script. 
export MASTERS="{{master_list}}" export SLAVES="{{slave_list}}" -export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" -export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" +#export HDFS_DATA_DIRS="{{hdfs_data_dirs}}" +#export MAPRED_LOCAL_DIRS="{{mapred_local_dirs}}" export SPARK_LOCAL_DIRS="{{spark_local_dirs}}" export MODULES="{{modules}}" export SPARK_VERSION="{{spark_version}}" -export TACHYON_VERSION="{{tachyon_version}}" -export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" +#export TACHYON_VERSION="{{tachyon_version}}" +#export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}" export SWAP_MB="{{swap}}" export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}" export SPARK_MASTER_OPTS="{{spark_master_opts}}" diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh deleted file mode 100755 index 014b94f2..00000000 --- a/ephemeral-hdfs/init.sh +++ /dev/null @@ -1,51 +0,0 @@ -#!/bin/bash - -pushd /root > /dev/null - -if [ -d "ephemeral-hdfs" ]; then - echo "Ephemeral HDFS seems to be installed. Exiting." - return 0 -fi - -case "$HADOOP_MAJOR_VERSION" in - 1) - wget http://s3.amazonaws.com/spark-related-packages/hadoop-1.0.4.tar.gz - echo "Unpacking Hadoop" - tar xvzf hadoop-1.0.4.tar.gz > /tmp/spark-ec2_hadoop.log - rm hadoop-*.tar.gz - mv hadoop-1.0.4/ ephemeral-hdfs/ - sed -i 's/-jvm server/-server/g' /root/ephemeral-hdfs/bin/hadoop - cp /root/hadoop-native/* /root/ephemeral-hdfs/lib/native/ - ;; - 2) - wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.0.0-cdh4.2.0.tar.gz - echo "Unpacking Hadoop" - tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log - rm hadoop-*.tar.gz - mv hadoop-2.0.0-cdh4.2.0/ ephemeral-hdfs/ - - # Have single conf dir - rm -rf /root/ephemeral-hdfs/etc/hadoop/ - ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop - cp /root/hadoop-native/* /root/ephemeral-hdfs/lib/native/ - ;; - yarn) - hadoop_version="2.7.4" - wget http://archive.apache.org/dist/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}.tar.gz - echo "Unpacking Hadoop" - tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log - rm hadoop-*.tar.gz - mv hadoop-${hadoop_version}/ ephemeral-hdfs/ - - # Have single conf dir - rm -rf /root/ephemeral-hdfs/etc/hadoop/ - ln -s /root/ephemeral-hdfs/conf /root/ephemeral-hdfs/etc/hadoop - ;; - - *) - echo "ERROR: Unknown Hadoop version" - return 1 -esac -/root/spark-ec2/copy-dir /root/ephemeral-hdfs - -popd > /dev/null diff --git a/ephemeral-hdfs/setup-slave.sh b/ephemeral-hdfs/setup-slave.sh deleted file mode 100755 index a85c4df7..00000000 --- a/ephemeral-hdfs/setup-slave.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# Setup ephemeral-hdfs -mkdir -p /mnt/ephemeral-hdfs/logs -mkdir -p /mnt/hadoop-logs - -# Setup yarn logs, local dirs -mkdir -p /mnt/yarn-local -mkdir -p /mnt/yarn-logs - -# Create Hadoop and HDFS directories in a given parent directory -# (for example /mnt, /mnt2, and so on) -function create_hadoop_dirs { - location=$1 - if [[ -e $location ]]; then - mkdir -p $location/ephemeral-hdfs $location/hadoop/tmp - chmod -R 755 $location/ephemeral-hdfs - mkdir -p $location/hadoop/mrlocal $location/hadoop/mrlocal2 - fi -} - -# Set up Hadoop and Mesos directories in /mnt -create_hadoop_dirs /mnt -create_hadoop_dirs /mnt2 -create_hadoop_dirs /mnt3 -create_hadoop_dirs /mnt4 diff --git a/ephemeral-hdfs/setup.sh b/ephemeral-hdfs/setup.sh deleted file mode 100755 index 1c171056..00000000 --- a/ephemeral-hdfs/setup.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash - -EPHEMERAL_HDFS=/root/ephemeral-hdfs - -# Set hdfs url to make it easier 
-HDFS_URL="hdfs://$PUBLIC_DNS:9000" -echo "export HDFS_URL=$HDFS_URL" >> ~/.bash_profile - -pushd /root/spark-ec2/ephemeral-hdfs > /dev/null -source ./setup-slave.sh - -for node in $SLAVES $OTHER_MASTERS; do - echo $node - ssh -t -t $SSH_OPTS root@$node "/root/spark-ec2/ephemeral-hdfs/setup-slave.sh" & sleep 0.3 -done -wait - -/root/spark-ec2/copy-dir $EPHEMERAL_HDFS/conf - -NAMENODE_DIR=/mnt/ephemeral-hdfs/dfs/name - -if [ -f "$NAMENODE_DIR/current/VERSION" ] && [ -f "$NAMENODE_DIR/current/fsimage" ]; then - echo "Hadoop namenode appears to be formatted: skipping" -else - echo "Formatting ephemeral HDFS namenode..." - $EPHEMERAL_HDFS/bin/hadoop namenode -format -fi - -echo "Starting ephemeral HDFS..." - -# This is different depending on version. -case "$HADOOP_MAJOR_VERSION" in - 1) - $EPHEMERAL_HDFS/bin/start-dfs.sh - ;; - 2) - $EPHEMERAL_HDFS/sbin/start-dfs.sh - ;; - yarn) - $EPHEMERAL_HDFS/sbin/start-dfs.sh - echo "Starting YARN" - $EPHEMERAL_HDFS/sbin/start-yarn.sh - ;; - *) - echo "ERROR: Unknown Hadoop version" - return -1 -esac - -popd > /dev/null diff --git a/mapreduce/init.sh b/mapreduce/init.sh deleted file mode 100755 index 2e952799..00000000 --- a/mapreduce/init.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -pushd /root > /dev/null -case "$HADOOP_MAJOR_VERSION" in - 1) - echo "Nothing to initialize for MapReduce in Hadoop 1" - ;; - 2) - wget http://s3.amazonaws.com/spark-related-packages/mr1-2.0.0-mr1-cdh4.2.0.tar.gz - tar -xvzf mr1-*.tar.gz > /tmp/spark-ec2_mapreduce.log - rm mr1-*.tar.gz - mv hadoop-2.0.0-mr1-cdh4.2.0/ mapreduce/ - ;; - yarn) - echo "Nothing to initialize for MapReduce in Hadoop 2 YARN" - ;; - - *) - echo "ERROR: Unknown Hadoop version" - return -1 -esac -/root/spark-ec2/copy-dir /root/mapreduce -popd > /dev/null diff --git a/mapreduce/setup.sh b/mapreduce/setup.sh deleted file mode 100755 index 4f71e0c7..00000000 --- a/mapreduce/setup.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -MAPREDUCE=/root/mapreduce - -mkdir -p /mnt/mapreduce/logs -for node in $SLAVES $OTHER_MASTERS; do - ssh -t $SSH_OPTS root@$node "mkdir -p /mnt/mapreduce/logs && chown hadoop:hadoop /mnt/mapreduce/logs && chown hadoop:hadoop /mnt/mapreduce" & sleep 0.3 -done -wait - -chown hadoop:hadoop /mnt/mapreduce -R -/root/spark-ec2/copy-dir $MAPREDUCE/conf diff --git a/persistent-hdfs/init.sh b/persistent-hdfs/init.sh deleted file mode 100755 index 2563a00c..00000000 --- a/persistent-hdfs/init.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash - -pushd /root > /dev/null - -if [ -d "persistent-hdfs" ]; then - echo "Persistent HDFS seems to be installed. Exiting." 
- return 0 -fi - -case "$HADOOP_MAJOR_VERSION" in - 1) - wget http://s3.amazonaws.com/spark-related-packages/hadoop-1.0.4.tar.gz - echo "Unpacking Hadoop" - tar xvzf hadoop-1.0.4.tar.gz > /tmp/spark-ec2_hadoop.log - rm hadoop-*.tar.gz - mv hadoop-1.0.4/ persistent-hdfs/ - cp /root/hadoop-native/* /root/persistent-hdfs/lib/native/ - ;; - 2) - wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.0.0-cdh4.2.0.tar.gz - echo "Unpacking Hadoop" - tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log - rm hadoop-*.tar.gz - mv hadoop-2.0.0-cdh4.2.0/ persistent-hdfs/ - - # Have single conf dir - rm -rf /root/persistent-hdfs/etc/hadoop/ - ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop - cp /root/hadoop-native/* /root/persistent-hdfs/lib/native/ - ;; - yarn) - hadoop_version="2.7.4" - wget http://archive.apache.org/dist/hadoop/common/hadoop-${hadoop_version}/hadoop-${hadoop_version}.tar.gz - echo "Unpacking Hadoop" - tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log - rm hadoop-*.tar.gz - mv hadoop-${hadoop_version}/ persistent-hdfs/ - - # Have single conf dir - rm -rf /root/persistent-hdfs/etc/hadoop/ - ln -s /root/persistent-hdfs/conf /root/persistent-hdfs/etc/hadoop - ;; - - *) - echo "ERROR: Unknown Hadoop version" - return 1 -esac -/root/spark-ec2/copy-dir /root/persistent-hdfs - -popd > /dev/null diff --git a/persistent-hdfs/setup-slave.sh b/persistent-hdfs/setup-slave.sh deleted file mode 100755 index 8885fde5..00000000 --- a/persistent-hdfs/setup-slave.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -# Setup persistent-hdfs -mkdir -p /mnt/persistent-hdfs/logs - -if [[ -e /vol/persistent-hdfs ]] ; then - chmod -R 755 /vol/persistent-hdfs -fi diff --git a/persistent-hdfs/setup.sh b/persistent-hdfs/setup.sh deleted file mode 100755 index d1713e12..00000000 --- a/persistent-hdfs/setup.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -PERSISTENT_HDFS=/root/persistent-hdfs - -pushd /root/spark-ec2/persistent-hdfs > /dev/null -source ./setup-slave.sh - -for node in $SLAVES $OTHER_MASTERS; do - ssh -t $SSH_OPTS root@$node "/root/spark-ec2/persistent-hdfs/setup-slave.sh" & sleep 0.3 -done -wait - -/root/spark-ec2/copy-dir $PERSISTENT_HDFS/conf - -if [[ ! -e /vol/persistent-hdfs/dfs/name ]] ; then - echo "Formatting persistent HDFS namenode..." - $PERSISTENT_HDFS/bin/hadoop namenode -format -fi - -echo "Persistent HDFS installed, won't start by default..." - -popd > /dev/null diff --git a/tachyon/init.sh b/tachyon/init.sh deleted file mode 100755 index d5f1e481..00000000 --- a/tachyon/init.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash - -pushd /root > /dev/null - -if [ -d "tachyon" ]; then - echo "Tachyon seems to be installed. Exiting." - return 0 -fi - -# Github tag: -if [[ "$TACHYON_VERSION" == *\|* ]] -then - # Not yet supported - echo "Tachyon git hashes are not yet supported. Please specify a Tachyon release version." 
-# Pre-package tachyon version -else - case "$TACHYON_VERSION" in - 0.3.0) - wget https://s3.amazonaws.com/Tachyon/tachyon-0.3.0-bin.tar.gz - ;; - 0.4.0) - wget https://s3.amazonaws.com/Tachyon/tachyon-0.4.0-bin.tar.gz - ;; - 0.4.1) - wget https://s3.amazonaws.com/Tachyon/tachyon-0.4.1-bin.tar.gz - ;; - 0.5.0) - if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then - wget https://s3.amazonaws.com/Tachyon/tachyon-0.5.0-bin.tar.gz - else - wget https://s3.amazonaws.com/Tachyon/tachyon-0.5.0-cdh4-bin.tar.gz - fi - ;; - 0.6.0) - if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then - wget https://s3.amazonaws.com/Tachyon/tachyon-0.6.0-bin.tar.gz - else - wget https://s3.amazonaws.com/Tachyon/tachyon-0.6.0-cdh4-bin.tar.gz - fi - ;; - 0.6.4) - if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then - wget https://s3.amazonaws.com/Tachyon/tachyon-0.6.4-bin.tar.gz - elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then - wget https://s3.amazonaws.com/Tachyon/tachyon-0.6.4-cdh4-bin.tar.gz - else - wget https://s3.amazonaws.com/Tachyon/tachyon-0.6.4-hadoop2.4-bin.tar.gz - fi - ;; - *) - if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then - wget https://s3.amazonaws.com/Tachyon/tachyon-$TACHYON_VERSION-bin.tar.gz - elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then - wget https://s3.amazonaws.com/Tachyon/tachyon-$TACHYON_VERSION-cdh4-bin.tar.gz - else - wget https://s3.amazonaws.com/Tachyon/tachyon-$TACHYON_VERSION-hadoop2.4-bin.tar.gz - fi - if [ $? != 0 ]; then - echo "ERROR: Unknown Tachyon version" - return -1 - fi - esac - - echo "Unpacking Tachyon" - tar xvzf tachyon-*.tar.gz > /tmp/spark-ec2_tachyon.log - rm tachyon-*.tar.gz - mv `ls -d tachyon-*` tachyon -fi - -popd > /dev/null diff --git a/tachyon/setup.sh b/tachyon/setup.sh deleted file mode 100755 index 8d946abc..00000000 --- a/tachyon/setup.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -/root/spark-ec2/copy-dir /root/tachyon - -/root/tachyon/bin/tachyon format - -sleep 1 - -/root/tachyon/bin/tachyon-start.sh all Mount diff --git a/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml b/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml deleted file mode 100644 index 2bed464b..00000000 --- a/templates/root/ephemeral-hdfs/conf/capacity-scheduler.xml +++ /dev/null @@ -1,111 +0,0 @@ - - - - - yarn.scheduler.capacity.maximum-applications - 10000 - - Maximum number of applications that can be pending and running. - - - - - yarn.scheduler.capacity.maximum-am-resource-percent - 0.1 - - Maximum percent of resources in the cluster which can be used to run - application masters i.e. controls number of concurrent running - applications. - - - - - yarn.scheduler.capacity.resource-calculator - org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator - - The ResourceCalculator implementation to be used to compare - Resources in the scheduler. - The default i.e. DefaultResourceCalculator only uses Memory while - DominantResourceCalculator uses dominant-resource to compare - multi-dimensional resources such as Memory, CPU etc. - - - - - yarn.scheduler.capacity.root.queues - default - - The queues at the this level (root is the root queue). - - - - - yarn.scheduler.capacity.root.default.capacity - 100 - Default queue target capacity. - - - - yarn.scheduler.capacity.root.default.user-limit-factor - 1 - - Default queue user limit a percentage from 0.0 to 1.0. - - - - - yarn.scheduler.capacity.root.default.maximum-capacity - 100 - - The maximum capacity of the default queue. - - - - - yarn.scheduler.capacity.root.default.state - RUNNING - - The state of the default queue. 
State can be one of RUNNING or STOPPED. - - - - - yarn.scheduler.capacity.root.default.acl_submit_applications - * - - The ACL of who can submit jobs to the default queue. - - - - - yarn.scheduler.capacity.root.default.acl_administer_queue - * - - The ACL of who can administer jobs on the default queue. - - - - - yarn.scheduler.capacity.node-locality-delay - 40 - - Number of missed scheduling opportunities after which the CapacityScheduler - attempts to schedule rack-local containers. - Typically this should be set to number of nodes in the cluster, By default is setting - approximately number of nodes in one rack which is 40. - - - - diff --git a/templates/root/ephemeral-hdfs/conf/core-site.xml b/templates/root/ephemeral-hdfs/conf/core-site.xml deleted file mode 100644 index 65f030dd..00000000 --- a/templates/root/ephemeral-hdfs/conf/core-site.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - hadoop.tmp.dir - /mnt/ephemeral-hdfs - - - - fs.default.name - hdfs://{{active_master}}:9000 - - - - fs.defaultFS - hdfs://{{active_master}}:9000 - - - - io.file.buffer.size - 65536 - - - - dfs.client.read.shortcircuit - false - - - - dfs.client.read.shortcircuit.skip.checksum - false - - - - dfs.domain.socket.path - /var/run/hadoop-hdfs/dn._PORT - - - - dfs.client.file-block-storage-locations.timeout - 3000 - - - - fs.tachyon.impl - tachyon.hadoop.TFS - - - - fs.s3n.awsAccessKeyId - {{aws_access_key_id}} - - - - fs.s3n.awsSecretAccessKey - {{aws_secret_access_key}} - - - - hadoop.security.group.mapping - org.apache.hadoop.security.ShellBasedUnixGroupsMapping - - - diff --git a/templates/root/ephemeral-hdfs/conf/hadoop-env.sh b/templates/root/ephemeral-hdfs/conf/hadoop-env.sh deleted file mode 100755 index f4e5d7e9..00000000 --- a/templates/root/ephemeral-hdfs/conf/hadoop-env.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Set Hadoop-specific environment variables here. - -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. - -# The java implementation to use. Required. -export JAVA_HOME={{java_home}} - -# Extra Java CLASSPATH elements. Optional. -# export HADOOP_CLASSPATH= - -export HADOOP_HOME="/root/ephemeral-hdfs" -export HADOOP_MAPREDUCE_HOME="/root/mapreduce" - -# The maximum amount of heap to use, in MB. Default is 1000. -export HADOOP_HEAPSIZE=1000 - -# Extra Java runtime options. Empty by default. -# export HADOOP_OPTS=-server -export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" - -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" -export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" -export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" -export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" -# export HADOOP_TASKTRACKER_OPTS= -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -# export HADOOP_CLIENT_OPTS - -# Extra ssh options. Empty by default. -# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" -export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5" - -# Where log files are stored. $HADOOP_HOME/logs by default. 
-# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs -export HADOOP_LOG_DIR=/mnt/ephemeral-hdfs/logs - -# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. -# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves - -# host:path where hadoop code should be rsync'd from. Unset by default. -# export HADOOP_MASTER=master:/home/$USER/src/hadoop - -# Seconds to sleep between slave commands. Unset by default. This -# can be useful in large clusters, where, e.g., slave rsyncs can -# otherwise arrive faster than the master can service them. -# export HADOOP_SLAVE_SLEEP=0.1 - -# The directory where pid files are stored. /tmp by default. -export HADOOP_PID_DIR=/var/hadoop/ephemeral-hdfs/pids - -# A string representing this instance of hadoop. $USER by default. -# export HADOOP_IDENT_STRING=$USER - -# The scheduling priority for daemon processes. See 'man nice'. -# export HADOOP_NICENESS=10 - -# Set hadoop user for CDH (which doesn't allow running as root) -export HADOOP_NAMENODE_USER=hadoop -export HADOOP_DATANODE_USER=hadoop -export HADOOP_SECONDARYNAMENODE_USER=hadoop -export HADOOP_JOBTRACKER_USER=hadoop -export HADOOP_TASKTRACKER_USER=hadoop - -ulimit -n 16000 diff --git a/templates/root/ephemeral-hdfs/conf/hadoop-metrics2.properties b/templates/root/ephemeral-hdfs/conf/hadoop-metrics2.properties deleted file mode 100644 index ec024bf8..00000000 --- a/templates/root/ephemeral-hdfs/conf/hadoop-metrics2.properties +++ /dev/null @@ -1,46 +0,0 @@ -# syntax: [prefix].[source|sink|jmx].[instance].[options] -# See package.html for org.apache.hadoop.metrics2 for details - -*.sink.file.class=org.apache.hadoop.metrics2.sink.FileSink - -#namenode.sink.file.filename=namenode-metrics.out - -#datanode.sink.file.filename=datanode-metrics.out - -#jobtracker.sink.file.filename=jobtracker-metrics.out - -#tasktracker.sink.file.filename=tasktracker-metrics.out - -#maptask.sink.file.filename=maptask-metrics.out - -#reducetask.sink.file.filename=reducetask-metrics.out - - -# -# Below are for sending metrics to Ganglia -# -# for Ganglia 3.0 support -# *.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink30 -# -# for Ganglia 3.1 support -*.sink.ganglia.class=org.apache.hadoop.metrics2.sink.ganglia.GangliaSink31 - -*.sink.ganglia.period=10 - -# default for supportsparse is false -# *.sink.ganglia.supportsparse=true - -#*.sink.ganglia.slope=jvm.metrics.gcCount=zero,jvm.metrics.memHeapUsedM=both -#*.sink.ganglia.dmax=jvm.metrics.threadsBlocked=70,jvm.metrics.memHeapUsedM=40 - -namenode.sink.ganglia.servers={{active_master}}:8649 - -datanode.sink.ganglia.servers={{active_master}}:8649 - -#jobtracker.sink.ganglia.servers=yourgangliahost_1:8649,yourgangliahost_2:8649 - -#tasktracker.sink.ganglia.servers=yourgangliahost_1:8649,yourgangliahost_2:8649 - -#maptask.sink.ganglia.servers=yourgangliahost_1:8649,yourgangliahost_2:8649 - -#reducetask.sink.ganglia.servers=yourgangliahost_1:8649,yourgangliahost_2:8649 diff --git a/templates/root/ephemeral-hdfs/conf/hdfs-site.xml b/templates/root/ephemeral-hdfs/conf/hdfs-site.xml deleted file mode 100644 index 1938d4ff..00000000 --- a/templates/root/ephemeral-hdfs/conf/hdfs-site.xml +++ /dev/null @@ -1,61 +0,0 @@ - - - - - - - dfs.replication - 3 - - - - dfs.block.size - 134217728 - - - - dfs.blocksize - 134217728 - - - - dfs.data.dir - {{hdfs_data_dirs}} - - - - dfs.namenode.handler.count - 25 - - - - dfs.datanode.handler.count - 8 - - - - dfs.permissions - false - - - - dfs.client.read.shortcircuit - false - - - - dfs.client.read.shortcircuit.skip.checksum - 
false - - - - dfs.domain.socket.path - /var/run/hadoop-hdfs/dn._PORT - - - - dfs.client.file-block-storage-locations.timeout - 3000 - - - diff --git a/templates/root/ephemeral-hdfs/conf/mapred-site.xml b/templates/root/ephemeral-hdfs/conf/mapred-site.xml deleted file mode 100644 index 3efb4638..00000000 --- a/templates/root/ephemeral-hdfs/conf/mapred-site.xml +++ /dev/null @@ -1,34 +0,0 @@ - - - - - - - - - mapreduce.framework.name - yarn - - - - mapred.job.tracker - {{active_master}}:9001 - - - - mapred.tasktracker.map.tasks.maximum - 4 - The maximum number of map tasks that will be run - simultaneously by a task tracker. - - - - - mapred.tasktracker.reduce.tasks.maximum - 2 - The maximum number of reduce tasks that will be run - simultaneously by a task tracker. - - - - diff --git a/templates/root/ephemeral-hdfs/conf/masters b/templates/root/ephemeral-hdfs/conf/masters deleted file mode 100644 index d26a1943..00000000 --- a/templates/root/ephemeral-hdfs/conf/masters +++ /dev/null @@ -1 +0,0 @@ -{{active_master}} diff --git a/templates/root/ephemeral-hdfs/conf/slaves b/templates/root/ephemeral-hdfs/conf/slaves deleted file mode 100644 index 05f969e0..00000000 --- a/templates/root/ephemeral-hdfs/conf/slaves +++ /dev/null @@ -1 +0,0 @@ -{{slave_list}} diff --git a/templates/root/ephemeral-hdfs/conf/yarn-env.sh b/templates/root/ephemeral-hdfs/conf/yarn-env.sh deleted file mode 100644 index d6de55df..00000000 --- a/templates/root/ephemeral-hdfs/conf/yarn-env.sh +++ /dev/null @@ -1,115 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# User for YARN daemons -export HADOOP_YARN_USER=${HADOOP_YARN_USER:-yarn} - -# resolve links - $0 may be a softlink -#export YARN_CONF_DIR="${YARN_CONF_DIR:-$HADOOP_YARN_HOME/conf}" -export YARN_CONF_DIR="/root/ephemeral-hdfs/conf" - -# Java parameters - Hadoop requires java-7 -export JAVA_HOME="/usr/lib/jvm/java-1.7.0" - -JAVA=$JAVA_HOME/bin/java -JAVA_HEAP_MAX=-Xmx1000m - -# For setting YARN specific HEAP sizes please use this -# Parameter and set appropriately -export YARN_HEAPSIZE=1000 - -# check envvars which might override default args -if [ "$YARN_HEAPSIZE" != "" ]; then - JAVA_HEAP_MAX="-Xmx""$YARN_HEAPSIZE""m" -fi - -# Resource Manager specific parameters - -# Specify the max Heapsize for the ResourceManager using a numerical value -# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set -# the value to 1000. -# This value will be overridden by an Xmx setting specified in either YARN_OPTS -# and/or YARN_RESOURCEMANAGER_OPTS. -# If not specified, the default value will be picked from either YARN_HEAPMAX -# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. 
-#export YARN_RESOURCEMANAGER_HEAPSIZE=1000 - -# Specify the max Heapsize for the timeline server using a numerical value -# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set -# the value to 1000. -# This value will be overridden by an Xmx setting specified in either YARN_OPTS -# and/or YARN_TIMELINESERVER_OPTS. -# If not specified, the default value will be picked from either YARN_HEAPMAX -# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. -#export YARN_TIMELINESERVER_HEAPSIZE=1000 - -# Specify the JVM options to be used when starting the ResourceManager. -# These options will be appended to the options specified as YARN_OPTS -# and therefore may override any similar flags set in YARN_OPTS -#export YARN_RESOURCEMANAGER_OPTS= - -# Node Manager specific parameters - -# Specify the max Heapsize for the NodeManager using a numerical value -# in the scale of MB. For example, to specify an jvm option of -Xmx1000m, set -# the value to 1000. -# This value will be overridden by an Xmx setting specified in either YARN_OPTS -# and/or YARN_NODEMANAGER_OPTS. -# If not specified, the default value will be picked from either YARN_HEAPMAX -# or JAVA_HEAP_MAX with YARN_HEAPMAX as the preferred option of the two. -#export YARN_NODEMANAGER_HEAPSIZE=1000 - -# Specify the JVM options to be used when starting the NodeManager. -# These options will be appended to the options specified as YARN_OPTS -# and therefore may override any similar flags set in YARN_OPTS -#export YARN_NODEMANAGER_OPTS= - -# so that filenames w/ spaces are handled correctly in loops below -IFS= - - -# default log directory & file -#if [ "$YARN_LOG_DIR" = "" ]; then -# YARN_LOG_DIR="$HADOOP_YARN_HOME/logs" -#fi -export YARN_LOG_DIR=/mnt/ephemeral-hdfs/logs - -if [ "$YARN_LOGFILE" = "" ]; then - YARN_LOGFILE='yarn.log' -fi - -# default policy file for service-level authorization -if [ "$YARN_POLICYFILE" = "" ]; then - YARN_POLICYFILE="hadoop-policy.xml" -fi - -# restore ordinary behaviour -unset IFS - - -YARN_OPTS="$YARN_OPTS -Dhadoop.log.dir=$YARN_LOG_DIR" -YARN_OPTS="$YARN_OPTS -Dyarn.log.dir=$YARN_LOG_DIR" -YARN_OPTS="$YARN_OPTS -Dhadoop.log.file=$YARN_LOGFILE" -YARN_OPTS="$YARN_OPTS -Dyarn.log.file=$YARN_LOGFILE" -YARN_OPTS="$YARN_OPTS -Dyarn.home.dir=$YARN_COMMON_HOME" -YARN_OPTS="$YARN_OPTS -Dyarn.id.str=$YARN_IDENT_STRING" -YARN_OPTS="$YARN_OPTS -Dhadoop.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" -YARN_OPTS="$YARN_OPTS -Dyarn.root.logger=${YARN_ROOT_LOGGER:-INFO,console}" -if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then - YARN_OPTS="$YARN_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" -fi -YARN_OPTS="$YARN_OPTS -Dyarn.policy.file=$YARN_POLICYFILE" - - diff --git a/templates/root/ephemeral-hdfs/conf/yarn-site.xml b/templates/root/ephemeral-hdfs/conf/yarn-site.xml deleted file mode 100644 index 2d01ebbd..00000000 --- a/templates/root/ephemeral-hdfs/conf/yarn-site.xml +++ /dev/null @@ -1,60 +0,0 @@ - - - - - - - - yarn.resourcemanager.hostname - {{active_master}} - - - - - yarn.nodemanager.local-dirs - /mnt/yarn-local - - - - yarn.nodemanager.log-dirs - /mnt/yarn-logs - - - - yarn.log-aggregation-enable - true - - - - yarn.scheduler.maximum-allocation-mb - {{system_ram_mb}} - - - - yarn.nodemanager.resource.memory-mb - {{system_ram_mb}} - - - - yarn.nodemanager.aux-services - mapreduce_shuffle - - - - yarn.nodemanager.aux-services.mapreduce.shuffle.class - org.apache.hadoop.mapred.ShuffleHandler - - - diff --git a/templates/root/mapreduce/conf/core-site.xml 
b/templates/root/mapreduce/conf/core-site.xml deleted file mode 100644 index 79c6de53..00000000 --- a/templates/root/mapreduce/conf/core-site.xml +++ /dev/null @@ -1,27 +0,0 @@ - - - - - - - - - hadoop.tmp.dir - /mnt/mapreduce - - - - fs.default.name - hdfs://{{active_master}}:9000 - - - - io.file.buffer.size - 65536 - - - - dfs.blocksize - 134217728 - - diff --git a/templates/root/mapreduce/conf/hadoop-env.sh b/templates/root/mapreduce/conf/hadoop-env.sh deleted file mode 100755 index ac3b937b..00000000 --- a/templates/root/mapreduce/conf/hadoop-env.sh +++ /dev/null @@ -1,72 +0,0 @@ -# Set Hadoop-specific environment variables here. - -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. - -# The java implementation to use. Required. -# Hadoop 2.x requires java-7 (it's not compatible with java-8), see -# https://wiki.apache.org/hadoop/HadoopJavaVersions -export JAVA_HOME=/usr/lib/jvm/java-1.7.0 - -# Extra Java CLASSPATH elements. Optional. -# export HADOOP_CLASSPATH= - -# NOTE: Set this to /root/mapreduce so that start-mapred.sh works correctly -export HADOOP_HOME="/root/mapreduce" -export HADOOP_MAPREDUCE_HOME="/root/mapreduce" - -# The maximum amount of heap to use, in MB. Default is 1000. -export HADOOP_HEAPSIZE=1000 - -# Extra Java runtime options. Empty by default. -# export HADOOP_OPTS=-server -export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" - -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" -export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" -export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" -export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" -# export HADOOP_TASKTRACKER_OPTS= -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -# export HADOOP_CLIENT_OPTS - -# Extra ssh options. Empty by default. -# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" -export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5" - -# Where log files are stored. $HADOOP_HOME/logs by default. -# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs -export HADOOP_LOG_DIR=/mnt/mapreduce/logs - -# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. -# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves - -# host:path where hadoop code should be rsync'd from. Unset by default. -# export HADOOP_MASTER=master:/home/$USER/src/hadoop - -# Seconds to sleep between slave commands. Unset by default. This -# can be useful in large clusters, where, e.g., slave rsyncs can -# otherwise arrive faster than the master can service them. -# export HADOOP_SLAVE_SLEEP=0.1 - -# The directory where pid files are stored. /tmp by default. -export HADOOP_PID_DIR=/var/hadoop/mapreduce/pids - -# A string representing this instance of hadoop. $USER by default. -# export HADOOP_IDENT_STRING=$USER - -# The scheduling priority for daemon processes. See 'man nice'. 
-# export HADOOP_NICENESS=10 - -# Set hadoop user for CDH (which doesn't allow running as root) -export HADOOP_NAMENODE_USER=hadoop -export HADOOP_DATANODE_USER=hadoop -export HADOOP_SECONDARYNAMENODE_USER=hadoop -export HADOOP_JOBTRACKER_USER=hadoop -export HADOOP_TASKTRACKER_USER=hadoop - -ulimit -n 16000 diff --git a/templates/root/mapreduce/conf/hdfs-site.xml b/templates/root/mapreduce/conf/hdfs-site.xml deleted file mode 100644 index 5845ef62..00000000 --- a/templates/root/mapreduce/conf/hdfs-site.xml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - dfs.blocksize - 134217728 - - diff --git a/templates/root/mapreduce/conf/mapred-site.xml b/templates/root/mapreduce/conf/mapred-site.xml deleted file mode 100644 index b1637dc8..00000000 --- a/templates/root/mapreduce/conf/mapred-site.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - - - mapred.job.tracker - {{active_master}}:9001 - - - - mapred.tasktracker.map.tasks.maximum - 4 - The maximum number of map tasks that will be run - simultaneously by a task tracker. - - - - - mapred.tasktracker.reduce.tasks.maximum - 2 - The maximum number of reduce tasks that will be run - simultaneously by a task tracker. - - - - diff --git a/templates/root/mapreduce/conf/masters b/templates/root/mapreduce/conf/masters deleted file mode 100644 index d26a1943..00000000 --- a/templates/root/mapreduce/conf/masters +++ /dev/null @@ -1 +0,0 @@ -{{active_master}} diff --git a/templates/root/mapreduce/conf/slaves b/templates/root/mapreduce/conf/slaves deleted file mode 100644 index 05f969e0..00000000 --- a/templates/root/mapreduce/conf/slaves +++ /dev/null @@ -1 +0,0 @@ -{{slave_list}} diff --git a/templates/root/mapreduce/hadoop.version b/templates/root/mapreduce/hadoop.version deleted file mode 100644 index 14d6eb9b..00000000 --- a/templates/root/mapreduce/hadoop.version +++ /dev/null @@ -1 +0,0 @@ -{{hadoop_major_version}} diff --git a/templates/root/persistent-hdfs/conf/core-site.xml b/templates/root/persistent-hdfs/conf/core-site.xml deleted file mode 100644 index 2c254986..00000000 --- a/templates/root/persistent-hdfs/conf/core-site.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - - - - - - hadoop.tmp.dir - /vol/persistent-hdfs - - - - fs.default.name - hdfs://{{active_master}}:9010 - - - - fs.defaultFS - hdfs://{{active_master}}:9010 - - - - io.file.buffer.size - 65536 - - - - dfs.client.read.shortcircuit - false - - - - dfs.client.read.shortcircuit.skip.checksum - false - - - - dfs.domain.socket.path - /var/run/hadoop-hdfs/dn._PORT - - - - dfs.client.file-block-storage-locations.timeout - 3000 - - - - fs.tachyon.impl - tachyon.hadoop.TFS - - - - fs.s3n.awsAccessKeyId - {{aws_access_key_id}} - - - - fs.s3n.awsSecretAccessKey - {{aws_secret_access_key}} - - - - hadoop.security.group.mapping - org.apache.hadoop.security.ShellBasedUnixGroupsMapping - - - diff --git a/templates/root/persistent-hdfs/conf/hadoop-env.sh b/templates/root/persistent-hdfs/conf/hadoop-env.sh deleted file mode 100755 index 1dc0f71b..00000000 --- a/templates/root/persistent-hdfs/conf/hadoop-env.sh +++ /dev/null @@ -1,69 +0,0 @@ -# Set Hadoop-specific environment variables here. - -# The only required environment variable is JAVA_HOME. All others are -# optional. When running a distributed configuration it is best to -# set JAVA_HOME in this file, so that it is correctly defined on -# remote nodes. - -# The java implementation to use. Required. 
-export JAVA_HOME={{java_home}} - -export HADOOP_HOME="/root/persistent-hdfs" -export HADOOP_MAPREDUCE_HOME="/root/mapreduce" - -# Extra Java CLASSPATH elements. Optional. -# export HADOOP_CLASSPATH= - -# The maximum amount of heap to use, in MB. Default is 1000. -export HADOOP_HEAPSIZE=1000 - -# Extra Java runtime options. Empty by default. -# export HADOOP_OPTS=-server -export HADOOP_OPTS="-Djava.net.preferIPv4Stack=true" - -# Command specific options appended to HADOOP_OPTS when specified -export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS" -export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS" -export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS" -export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS" -export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS" -# export HADOOP_TASKTRACKER_OPTS= -# The following applies to multiple commands (fs, dfs, fsck, distcp etc) -# export HADOOP_CLIENT_OPTS - -# Extra ssh options. Empty by default. -# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR" -export HADOOP_SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=5" - -# Where log files are stored. $HADOOP_HOME/logs by default. -# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs -export HADOOP_LOG_DIR=/mnt/persistent-hdfs/logs - -# File naming remote slave hosts. $HADOOP_HOME/conf/slaves by default. -# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves - -# host:path where hadoop code should be rsync'd from. Unset by default. -# export HADOOP_MASTER=master:/home/$USER/src/hadoop - -# Seconds to sleep between slave commands. Unset by default. This -# can be useful in large clusters, where, e.g., slave rsyncs can -# otherwise arrive faster than the master can service them. -# export HADOOP_SLAVE_SLEEP=0.1 - -# The directory where pid files are stored. /tmp by default. -export HADOOP_PID_DIR=/var/hadoop/persistent-hdfs/pids - -# A string representing this instance of hadoop. $USER by default. -# export HADOOP_IDENT_STRING=$USER - -# The scheduling priority for daemon processes. See 'man nice'. -# export HADOOP_NICENESS=10 - -# Set hadoop user for CDH (which doesn't allow running as root) -export HADOOP_NAMENODE_USER=hadoop -export HADOOP_DATANODE_USER=hadoop -export HADOOP_SECONDARYNAMENODE_USER=hadoop -export HADOOP_JOBTRACKER_USER=hadoop -export HADOOP_TASKTRACKER_USER=hadoop - -ulimit -n 16000 diff --git a/templates/root/persistent-hdfs/conf/hdfs-site.xml b/templates/root/persistent-hdfs/conf/hdfs-site.xml deleted file mode 100644 index a9993f7b..00000000 --- a/templates/root/persistent-hdfs/conf/hdfs-site.xml +++ /dev/null @@ -1,101 +0,0 @@ - - - - - - - dfs.replication - 2 - - - - dfs.block.size - 134217728 - - - - dfs.blocksize - 134217728 - - - - dfs.secondary.http.address - 0.0.0.0:60090 - - The secondary namenode http server address and port. - If the port is 0 then the server will start on a free port. - - - - - dfs.datanode.address - 0.0.0.0:60010 - - The address where the datanode server will listen to. - If the port is 0 then the server will start on a free port. - - - - - dfs.datanode.http.address - 0.0.0.0:60075 - - The datanode http server address and port. - If the port is 0 then the server will start on a free port. - - - - - dfs.datanode.ipc.address - 0.0.0.0:60020 - - The datanode ipc server address and port. - If the port is 0 then the server will start on a free port. 
- - - - - dfs.http.address - 0.0.0.0:60070 - - The address and the base port where the dfs namenode web ui will listen on. - If the port is 0 then the server will start on a free port. - - - - - dfs.namenode.handler.count - 25 - - - - dfs.datanode.handler.count - 8 - - - - dfs.permissions - false - - - - dfs.client.read.shortcircuit - false - - - - dfs.client.read.shortcircuit.skip.checksum - false - - - - dfs.domain.socket.path - /var/run/hadoop-hdfs/dn._PORT - - - - dfs.client.file-block-storage-locations.timeout - 3000 - - - diff --git a/templates/root/persistent-hdfs/conf/mapred-site.xml b/templates/root/persistent-hdfs/conf/mapred-site.xml deleted file mode 100644 index b1637dc8..00000000 --- a/templates/root/persistent-hdfs/conf/mapred-site.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - - - - - - mapred.job.tracker - {{active_master}}:9001 - - - - mapred.tasktracker.map.tasks.maximum - 4 - The maximum number of map tasks that will be run - simultaneously by a task tracker. - - - - - mapred.tasktracker.reduce.tasks.maximum - 2 - The maximum number of reduce tasks that will be run - simultaneously by a task tracker. - - - - diff --git a/templates/root/persistent-hdfs/conf/masters b/templates/root/persistent-hdfs/conf/masters deleted file mode 100644 index d26a1943..00000000 --- a/templates/root/persistent-hdfs/conf/masters +++ /dev/null @@ -1 +0,0 @@ -{{active_master}} diff --git a/templates/root/persistent-hdfs/conf/slaves b/templates/root/persistent-hdfs/conf/slaves deleted file mode 100644 index 05f969e0..00000000 --- a/templates/root/persistent-hdfs/conf/slaves +++ /dev/null @@ -1 +0,0 @@ -{{slave_list}} diff --git a/templates/root/spark/conf/core-site.xml b/templates/root/spark/conf/core-site.xml index c54a2e58..b5c041a6 100644 --- a/templates/root/spark/conf/core-site.xml +++ b/templates/root/spark/conf/core-site.xml @@ -5,44 +5,9 @@ - - hadoop.tmp.dir - /mnt/ephemeral-hdfs - - - - fs.default.name - hdfs://{{active_master}}:9000 - - io.file.buffer.size 65536 - - dfs.client.read.shortcircuit - false - - - - dfs.client.read.shortcircuit.skip.checksum - false - - - - dfs.domain.socket.path - /var/run/hadoop-hdfs/dn._PORT - - - - dfs.client.file-block-storage-locations.timeout - 3000 - - - - fs.tachyon.impl - tachyon.hadoop.TFS - - diff --git a/templates/root/spark/conf/spark-defaults.conf b/templates/root/spark/conf/spark-defaults.conf index c63994a5..6aa52d5a 100644 --- a/templates/root/spark/conf/spark-defaults.conf +++ b/templates/root/spark/conf/spark-defaults.conf @@ -1,8 +1,2 @@ spark.executor.memory {{spark_worker_mem}} -spark.executor.extraLibraryPath /root/ephemeral-hdfs/lib/native/ -spark.executor.extraClassPath /root/ephemeral-hdfs/conf -# for spark version < 1.4.0 -spark.tachyonStore.url tachyon://{{active_master}}:19998 -# for spark version >= 1.4.0 -spark.externalBlockStore.url tachyon://{{active_master}}:19998 diff --git a/templates/root/spark/conf/spark-env.sh b/templates/root/spark/conf/spark-env.sh index aa4490b8..7e8dd8c2 100755 --- a/templates/root/spark/conf/spark-env.sh +++ b/templates/root/spark/conf/spark-env.sh @@ -9,21 +9,17 @@ if [ -n "{{spark_worker_instances}}" ]; then fi export SPARK_WORKER_CORES={{spark_worker_cores}} -export HADOOP_HOME="/root/ephemeral-hdfs" export SPARK_MASTER_IP={{active_master}} export MASTER=`cat /root/spark-ec2/cluster-url` -export SPARK_SUBMIT_LIBRARY_PATH="$SPARK_SUBMIT_LIBRARY_PATH:/root/ephemeral-hdfs/lib/native/" -export SPARK_SUBMIT_CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH:/root/ephemeral-hdfs/conf" +export 
SPARK_SUBMIT_LIBRARY_PATH="$SPARK_SUBMIT_LIBRARY_PATH" +export SPARK_SUBMIT_CLASSPATH="$SPARK_CLASSPATH:$SPARK_SUBMIT_CLASSPATH" # Bind Spark's web UIs to this machine's public EC2 hostname otherwise fallback to private IP: export SPARK_PUBLIC_DNS=` wget -q -O - http://169.254.169.254/latest/meta-data/public-hostname ||\ wget -q -O - http://169.254.169.254/latest/meta-data/local-ipv4` -# Used for YARN model -export YARN_CONF_DIR="/root/ephemeral-hdfs/conf" - # Set a high ulimit for large shuffles, only root can do this if [ $(id -u) == "0" ] then diff --git a/templates/root/tachyon/conf/slaves b/templates/root/tachyon/conf/slaves deleted file mode 100644 index fb0d8d5f..00000000 --- a/templates/root/tachyon/conf/slaves +++ /dev/null @@ -1 +0,0 @@ -{{slave_list}} \ No newline at end of file diff --git a/templates/root/tachyon/conf/tachyon-env.sh b/templates/root/tachyon/conf/tachyon-env.sh deleted file mode 100644 index 7e4f1ba6..00000000 --- a/templates/root/tachyon/conf/tachyon-env.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash - -# This file contains environment variables required to run Tachyon. Copy it as tachyon-env.sh and -# edit that to configure Tachyon for your site. At a minimum, -# the following variables should be set: -# -# - JAVA_HOME, to point to your JAVA installation -# - TACHYON_MASTER_ADDRESS, to bind the master to a different IP address or hostname -# - TACHYON_UNDERFS_ADDRESS, to set the under filesystem address. -# - TACHYON_WORKER_MEMORY_SIZE, to set how much memory to use (e.g. 1000mb, 2gb) per worker -# - TACHYON_RAM_FOLDER, to set where worker stores in memory data -# -# The following gives an example: - -if [[ `uname -a` == Darwin* ]]; then - # Assuming Mac OS X - export JAVA_HOME=$(/usr/libexec/java_home) - export TACHYON_RAM_FOLDER=/Volumes/ramdisk - export TACHYON_JAVA_OPTS="-Djava.security.krb5.realm= -Djava.security.krb5.kdc=" -else - # Assuming Linux - if [ -z "$JAVA_HOME" ]; then - # TODO: Does tachyon work with java-8? - export JAVA_HOME=/usr/lib/jvm/java-1.7.0 - export JAVA="$JAVA_HOME/bin/java" - fi - export TACHYON_RAM_FOLDER=/mnt/ramdisk -fi - -export TACHYON_MASTER_ADDRESS={{active_master}} -export TACHYON_UNDERFS_ADDRESS=hdfs://{{active_master}}:9000 -#export TACHYON_UNDERFS_ADDRESS=hdfs://localhost:9000 -export TACHYON_WORKER_MEMORY_SIZE={{default_tachyon_mem}} -export TACHYON_UNDERFS_HDFS_IMPL=org.apache.hadoop.hdfs.DistributedFileSystem - -CONF_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -export TACHYON_JAVA_OPTS+=" - -Dlog4j.configuration=file:$CONF_DIR/log4j.properties - -Dtachyon.debug=false - -Dtachyon.underfs.address=$TACHYON_UNDERFS_ADDRESS - -Dtachyon.underfs.hdfs.impl=$TACHYON_UNDERFS_HDFS_IMPL - -Dtachyon.data.folder=$TACHYON_UNDERFS_ADDRESS/tachyon/data - -Dtachyon.workers.folder=$TACHYON_UNDERFS_ADDRESS/tachyon/workers - -Dtachyon.worker.memory.size=$TACHYON_WORKER_MEMORY_SIZE - -Dtachyon.worker.data.folder=$TACHYON_RAM_FOLDER/tachyonworker/ - -Dtachyon.master.worker.timeout.ms=60000 - -Dtachyon.master.hostname=$TACHYON_MASTER_ADDRESS - -Dtachyon.master.journal.folder=$TACHYON_HOME/journal/ - -Dtachyon.master.pinlist=/pinfiles;/pindata -" - -# Master specific parameters. Default to TACHYON_JAVA_OPTS. -export TACHYON_MASTER_JAVA_OPTS="$TACHYON_JAVA_OPTS" - -# Worker specific parameters that will be shared to all workers. Default to TACHYON_JAVA_OPTS. 
-export TACHYON_WORKER_JAVA_OPTS="$TACHYON_JAVA_OPTS" diff --git a/templates/root/tachyon/conf/workers b/templates/root/tachyon/conf/workers deleted file mode 100644 index 05f969e0..00000000 --- a/templates/root/tachyon/conf/workers +++ /dev/null @@ -1 +0,0 @@ -{{slave_list}} From d81a90e466a2ac7dc5c9c353d06db9b3f3985bad Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Wed, 16 Aug 2017 16:20:27 -0700 Subject: [PATCH 22/30] Set the default java to java-1.8.0 --- create_image.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/create_image.sh b/create_image.sh index ff4d3e7e..9bc8841c 100755 --- a/create_image.sh +++ b/create_image.sh @@ -14,13 +14,16 @@ fi sudo yum install -y gcc gcc-c++ ant git # Install java-8 for Spark 2.2.x -sudo yum install -y java-1.8.0-openjdk-devel +sudo yum install -y java-1.8.0 java-1.8.0-devel +sudo /usr/sbin/alternatives --set java /usr/lib/jvm/jre-1.8.0-openjdk.x86_64/bin/java +sudo /usr/sbin/alternatives --set javac /usr/lib/jvm/jre-1.8.0-openjdk.x86_64/bin/javac +#sudo yum remove java-1.7 # Perf tools sudo yum install -y dstat iotop strace sysstat htop perf sudo debuginfo-install -q -y glibc sudo debuginfo-install -q -y kernel -sudo yum --enablerepo='*-debug*' install -q -y java-1.8.0-openjdk-debuginfo +sudo yum --enablerepo='*-debug*' install -y java-1.8.0-openjdk-debuginfo # PySpark and MLlib deps sudo yum install -y python-matplotlib python-tornado scipy libgfortran From 69148160cacc540e933108d03176222752daec51 Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Wed, 16 Aug 2017 16:27:05 -0700 Subject: [PATCH 23/30] spark-standalone/setup.sh - remove old spark versions code --- spark-standalone/setup.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/spark-standalone/setup.sh b/spark-standalone/setup.sh index e9c04c1f..8a2c5a14 100755 --- a/spark-standalone/setup.sh +++ b/spark-standalone/setup.sh @@ -2,10 +2,6 @@ BIN_FOLDER="/root/spark/sbin" -if [[ "0.7.3 0.8.0 0.8.1" =~ $SPARK_VERSION ]]; then - BIN_FOLDER="/root/spark/bin" -fi - # Copy the slaves to spark conf cp /root/spark-ec2/slaves /root/spark/conf/ /root/spark-ec2/copy-dir /root/spark/conf From 2a26325d91e991830f03aae99176dc0645a90a7e Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Wed, 16 Aug 2017 17:38:30 -0700 Subject: [PATCH 24/30] Modify setup scripts to ensure the java changes occur etc. --- setup-slave.sh | 4 +++- setup-tools.sh | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.sh | 2 +- 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100755 setup-tools.sh diff --git a/setup-slave.sh b/setup-slave.sh index cf46f069..156cddac 100755 --- a/setup-slave.sh +++ b/setup-slave.sh @@ -1,5 +1,7 @@ #!/bin/bash +sudo setup-tools.sh + # Disable Transparent Huge Pages (THP) # THP can result in system thrashing (high sys usage) due to frequent defrags of memory. # Most systems recommends turning THP off. 
@@ -131,4 +133,4 @@ popd > /dev/null # this is to set the ulimit for root and other users echo '* soft nofile 1000000' >> /etc/security/limits.conf -echo '* hard nofile 1000000' >> /etc/security/limits.conf \ No newline at end of file +echo '* hard nofile 1000000' >> /etc/security/limits.conf diff --git a/setup-tools.sh b/setup-tools.sh new file mode 100755 index 00000000..0b5b493a --- /dev/null +++ b/setup-tools.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -e + +if [ "$(id -u)" != "0" ]; then + echo "This script must be run as root" 1>&2 + exit 1 +fi + + +# Connectivity tools +sudo yum install -y pssh rsync + +# Dev tools +sudo yum install -y gcc gcc-c++ ant git + +# Install java-8 for Spark 2.2.x +sudo yum install -y java-1.8.0 java-1.8.0-devel +sudo yum --enablerepo='*-debug*' install -y java-1.8.0-openjdk-debuginfo +sudo yum remove -y java-1.7.0 +sudo /usr/sbin/alternatives --auto java +sudo /usr/sbin/alternatives --auto javac + +# Perf tools +sudo yum install -y dstat iotop strace sysstat htop perf +sudo debuginfo-install -y glibc +sudo debuginfo-install -y kernel + +# PySpark and MLlib deps +sudo yum install -y python-matplotlib python-tornado scipy libgfortran +# SparkR deps +sudo yum install -y R +# Ganglia +sudo yum install -y ganglia ganglia-web ganglia-gmond ganglia-gmetad + +# Install Maven +cd /tmp +wget "http://archive.apache.org/dist/maven/maven-3/3.2.3/binaries/apache-maven-3.2.3-bin.tar.gz" +tar xvzf apache-maven-3.2.3-bin.tar.gz +mv apache-maven-3.2.3 /opt/ + +# Edit bash profile +echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile +echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile +echo "export M2_HOME=/opt/apache-maven-3.2.3" >> ~/.bash_profile +echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile + +source ~/.bash_profile + +# Create /usr/bin/realpath which is used by R to find Java installations +# NOTE: /usr/bin/realpath is missing in CentOS AMIs. See +# http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5 +echo '#!/bin/bash' > /usr/bin/realpath +echo 'readlink -e "$@"' >> /usr/bin/realpath +chmod a+x /usr/bin/realpath + diff --git a/setup.sh b/setup.sh index 5a3beea2..19a61bcf 100755 --- a/setup.sh +++ b/setup.sh @@ -1,6 +1,6 @@ #!/bin/bash -sudo yum install -y -q pssh +sudo setup-tools.sh # usage: echo_time_diff name start_time end_time echo_time_diff () { From e5b2c2ed02541dd946cac8f2f52c828869270f99 Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Wed, 16 Aug 2017 17:59:29 -0700 Subject: [PATCH 25/30] Update scala to 2.11.11, download from lightbend --- scala/init.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/scala/init.sh b/scala/init.sh index 8450df6c..6a66ea3f 100755 --- a/scala/init.sh +++ b/scala/init.sh @@ -7,14 +7,11 @@ if [ -d "scala" ]; then return 0 fi -SCALA_VERSION="2.11.8" - -if [[ "0.7.3 0.8.0 0.8.1" =~ $SPARK_VERSION ]]; then - SCALA_VERSION="2.9.3" -fi +SCALA_VERSION="2.11.11" echo "Unpacking Scala" -wget http://s3.amazonaws.com/spark-related-packages/scala-$SCALA_VERSION.tgz +wget https://downloads.lightbend.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz + tar xvzf scala-*.tgz > /tmp/spark-ec2_scala.log rm scala-*.tgz mv `ls -d scala-* | grep -v ec2` scala From eaf78b9261fe11d2226a6bf209c0ba9139b4e5fe Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 18:37:48 -0700 Subject: [PATCH 26/30] spark_ec2.py - remove tachyon --- spark_ec2.py | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/spark_ec2.py b/spark_ec2.py index aea8885e..73c84118 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -96,27 +96,6 @@ "2.2.0" ]) -# https://spark.apache.org/releases/spark-release-2-0-0.html#removals-behavior-changes-and-deprecations -# Removed : Block-oriented integration with Tachyon (subsumed by file system integration) -SPARK_TACHYON_MAP = { - "1.0.0": "0.4.1", - "1.0.1": "0.4.1", - "1.0.2": "0.4.1", - "1.1.0": "0.5.0", - "1.1.1": "0.5.0", - "1.2.0": "0.5.0", - "1.2.1": "0.5.0", - "1.3.0": "0.5.0", - "1.3.1": "0.5.0", - "1.4.0": "0.6.4", - "1.4.1": "0.6.4", - "1.5.0": "0.7.1", - "1.5.1": "0.7.1", - "1.5.2": "0.7.1", - "1.6.0": "0.8.2", - # nothing for spark >= 2.x -} - DEFAULT_SPARK_VERSION = SPARK_EC2_VERSION DEFAULT_SPARK_GITHUB_REPO = "https://github.com/apache/spark" @@ -483,8 +462,6 @@ def get_validate_spark_version(version, repo): } -def get_tachyon_version(spark_version): - return SPARK_TACHYON_MAP.get(spark_version, "") # Attempt to resolve an appropriate AMI given the architecture and region of the request. @@ -857,7 +834,7 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'tachyon', 'rstudio'] + 'mapreduce', 'spark-standalone', 'rstudio'] if opts.hadoop_major_version == "1": modules = list(filter(lambda x: x != "mapreduce", modules)) @@ -1103,15 +1080,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): # Pre-built Spark deploy spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) validate_spark_hadoop_version(spark_v, opts.hadoop_major_version) - tachyon_v = get_tachyon_version(spark_v) else: # Spark-only custom deploy spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) - tachyon_v = "" - - if tachyon_v == "": - print("No valid Tachyon version found; Tachyon won't be set up") - modules.remove("tachyon") master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] @@ -1127,7 +1098,6 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): "swap": str(opts.swap), "modules": '\n'.join(modules), "spark_version": spark_v, - "tachyon_version": tachyon_v, "hadoop_major_version": opts.hadoop_major_version, "spark_worker_instances": worker_instances_str, "spark_master_opts": opts.master_opts From df433adbabc0778b0b4ba2f48e6dead66a3f74ef Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 19:04:24 -0700 Subject: [PATCH 27/30] spark_ec2.py - remove some things not specific to spark --- rstudio/init.sh | 29 ---------------------------- rstudio/startSpark.R | 14 -------------- setup-slave.sh | 2 +- setup.sh | 2 +- spark_ec2.py | 46 +++----------------------------------------- 5 files changed, 5 insertions(+), 88 deletions(-) delete mode 100644 rstudio/init.sh delete mode 100644 rstudio/startSpark.R diff --git a/rstudio/init.sh b/rstudio/init.sh deleted file mode 100644 index fd18d6be..00000000 --- a/rstudio/init.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# download rstudio -wget http://download2.rstudio.org/rstudio-server-rhel-0.99.446-x86_64.rpm -sudo yum install --nogpgcheck -y rstudio-server-rhel-0.99.446-x86_64.rpm - -# restart rstudio -rstudio-server restart - -# add user for rstudio, user needs to supply password later on -adduser rstudio - -# create a Rscript that connects to Spark, to help starting user -cp /root/spark-ec2/rstudio/startSpark.R /home/rstudio - -# make sure that the temp dirs exist and can be written to by any user -# otherwise this will create a conflict for the rstudio user -function create_temp_dirs { - location=$1 - if [[ ! -e $location ]]; then - mkdir -p $location - fi - chmod a+w $location -} - -create_temp_dirs /mnt/spark -create_temp_dirs /mnt2/spark -create_temp_dirs /mnt3/spark -create_temp_dirs /mnt4/spark diff --git a/rstudio/startSpark.R b/rstudio/startSpark.R deleted file mode 100644 index a6143cad..00000000 --- a/rstudio/startSpark.R +++ /dev/null @@ -1,14 +0,0 @@ -print('Now connecting to Spark for you.') - -spark_link <- system('cat /root/spark-ec2/cluster-url', intern=TRUE) - -.libPaths(c(.libPaths(), '/root/spark/R/lib')) -Sys.setenv(SPARK_HOME = '/root/spark') -Sys.setenv(PATH = paste(Sys.getenv(c('PATH')), '/root/spark/bin', sep=':')) -library(SparkR) - -sc <- sparkR.init(spark_link) -sqlContext <- sparkRSQL.init(sc) - -print('Spark Context available as \"sc\". \\n') -print('Spark SQL Context available as \"sqlContext\". \\n') \ No newline at end of file diff --git a/setup-slave.sh b/setup-slave.sh index 156cddac..8affb42a 100755 --- a/setup-slave.sh +++ b/setup-slave.sh @@ -1,6 +1,6 @@ #!/bin/bash -sudo setup-tools.sh +~/spark-ec2/setup-tools.sh # Disable Transparent Huge Pages (THP) # THP can result in system thrashing (high sys usage) due to frequent defrags of memory. diff --git a/setup.sh b/setup.sh index 19a61bcf..93fd28c2 100755 --- a/setup.sh +++ b/setup.sh @@ -1,6 +1,6 @@ #!/bin/bash -sudo setup-tools.sh +~/spark-ec2/setup-tools.sh # usage: echo_time_diff name start_time end_time echo_time_diff () { diff --git a/spark_ec2.py b/spark_ec2.py index 73c84118..674e0ba1 100644 --- a/spark_ec2.py +++ b/spark_ec2.py @@ -233,10 +233,6 @@ def parse_args(): "in / before copying its contents. If you append the trailing slash, " + "the directory is not created and its contents are copied directly into /. " + "(default: %default).") - parser.add_option( - "--hadoop-major-version", default="yarn", - help="Major version of Hadoop. 
Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " + - "(Hadoop 2.4.0) (default: %default)") parser.add_option( "-D", metavar="[ADDRESS:]PORT", dest="proxy_port", help="Use SSH dynamic port forwarding to create a SOCKS proxy at " + @@ -286,10 +282,6 @@ def parse_args(): parser.add_option( "--use-existing-master", action="store_true", default=False, help="Launch fresh slaves, but use an existing stopped master if possible") - parser.add_option( - "--worker-instances", type="int", default=1, - help="Number of instances per worker: variable SPARK_WORKER_INSTANCES. Not used if YARN " + - "is used as Hadoop major version (default: %default)") parser.add_option( "--master-opts", type="string", default="", help="Extra options to give to master through SPARK_MASTER_OPTS variable " + @@ -366,19 +358,6 @@ def get_or_make_group(conn, name, vpc_id): print("Creating security group " + name) return conn.create_security_group(name, "Spark EC2 group", vpc_id) -def validate_spark_hadoop_version(spark_version, hadoop_version): - if "." in spark_version: - parts = spark_version.split(".") - if parts[0].isdigit(): - spark_major_version = float(parts[0]) - if spark_major_version > 1.0 and hadoop_version != "yarn": - print("Spark version: {v}, does not support Hadoop version: {hv}". - format(v=spark_version, hv=hadoop_version), file=stderr) - sys.exit(1) - else: - print("Invalid Spark version: {v}".format(v=spark_version), file=stderr) - sys.exit(1) - def get_validate_spark_version(version, repo): if "." in version: # Remove leading v to handle inputs like v1.5.0 @@ -833,19 +812,11 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): print(slave_address) ssh_write(slave_address, opts, ['tar', 'x'], dot_ssh_tar) - modules = ['spark', 'ephemeral-hdfs', 'persistent-hdfs', - 'mapreduce', 'spark-standalone', 'rstudio'] - - if opts.hadoop_major_version == "1": - modules = list(filter(lambda x: x != "mapreduce", modules)) + modules = ['spark', 'spark-standalone'] if opts.ganglia: modules.append('ganglia') - # Clear SPARK_WORKER_INSTANCES if running on YARN - if opts.hadoop_major_version == "yarn": - opts.worker_instances = "" - # NOTE: We should clone the repository before running deploy_files to # prevent ec2-variables.sh from being overwritten print("Cloning spark-ec2 scripts from {r}/tree/{b} on master...".format( @@ -883,8 +854,8 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key): def setup_spark_cluster(master, opts): - ssh(master, opts, "chmod u+x spark-ec2/setup.sh") - ssh(master, opts, "spark-ec2/setup.sh") + ssh(master, opts, "chmod u+x ${HOME}/spark-ec2/setup.sh") + ssh(master, opts, "${HOME}/spark-ec2/setup.sh") print("Spark standalone cluster started at http://%s:8080" % master) if opts.ganglia: @@ -1065,13 +1036,9 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): active_master = get_dns_name(master_nodes[0], opts.private_ips) num_disks = get_num_disks(opts.instance_type) - hdfs_data_dirs = "/mnt/ephemeral-hdfs/data" - mapred_local_dirs = "/mnt/hadoop/mrlocal" spark_local_dirs = "/mnt/spark" if num_disks > 1: for i in range(2, num_disks + 1): - hdfs_data_dirs += ",/mnt%d/ephemeral-hdfs/data" % i - mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i spark_local_dirs += ",/mnt%d/spark" % i cluster_url = "%s:7077" % active_master @@ -1079,27 +1046,21 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules): if "." 
in opts.spark_version: # Pre-built Spark deploy spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - validate_spark_hadoop_version(spark_v, opts.hadoop_major_version) else: # Spark-only custom deploy spark_v = "%s|%s" % (opts.spark_git_repo, opts.spark_version) master_addresses = [get_dns_name(i, opts.private_ips) for i in master_nodes] slave_addresses = [get_dns_name(i, opts.private_ips) for i in slave_nodes] - worker_instances_str = "%d" % opts.worker_instances if opts.worker_instances else "" template_vars = { "master_list": '\n'.join(master_addresses), "active_master": active_master, "slave_list": '\n'.join(slave_addresses), "cluster_url": cluster_url, - "hdfs_data_dirs": hdfs_data_dirs, - "mapred_local_dirs": mapred_local_dirs, "spark_local_dirs": spark_local_dirs, "swap": str(opts.swap), "modules": '\n'.join(modules), "spark_version": spark_v, - "hadoop_major_version": opts.hadoop_major_version, - "spark_worker_instances": worker_instances_str, "spark_master_opts": opts.master_opts } @@ -1283,7 +1244,6 @@ def real_main(): # Input parameter validation spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo) - validate_spark_hadoop_version(spark_v, opts.hadoop_major_version) if opts.wait is not None: # NOTE: DeprecationWarnings are silent in 2.7+ by default. From b401059ae92620b29ee37cf4f579ff71b08e1afc Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Wed, 16 Aug 2017 22:36:17 -0700 Subject: [PATCH 28/30] deploy_templates.py - remove hadoop, mapreduce, and tachyon --- deploy_templates.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/deploy_templates.py b/deploy_templates.py index 895e55a4..35be8277 100755 --- a/deploy_templates.py +++ b/deploy_templates.py @@ -47,9 +47,6 @@ else: slave_ram_mb = max(512, slave_ram_mb - 1300) # Leave 1.3 GB RAM -# Make tachyon_mb as slave_ram_mb for now. -tachyon_mb = slave_ram_mb - worker_instances_str = "" worker_cores = slave_cpus @@ -63,18 +60,13 @@ "master_list": os.getenv("MASTERS"), "active_master": os.getenv("MASTERS").split("\n")[0], "slave_list": os.getenv("SLAVES"), - "hdfs_data_dirs": os.getenv("HDFS_DATA_DIRS"), - "mapred_local_dirs": os.getenv("MAPRED_LOCAL_DIRS"), "spark_local_dirs": os.getenv("SPARK_LOCAL_DIRS"), "spark_worker_mem": "%dm" % slave_ram_mb, "spark_worker_instances": worker_instances_str, "spark_worker_cores": "%d" % worker_cores, "spark_master_opts": os.getenv("SPARK_MASTER_OPTS", ""), "spark_version": os.getenv("SPARK_VERSION"), - "tachyon_version": os.getenv("TACHYON_VERSION"), - "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION"), "java_home": os.getenv("JAVA_HOME"), - "default_tachyon_mem": "%dMB" % tachyon_mb, "system_ram_mb": "%d" % system_ram_mb, "aws_access_key_id": os.getenv("AWS_ACCESS_KEY_ID"), "aws_secret_access_key": os.getenv("AWS_SECRET_ACCESS_KEY"), From ea22f152c4334317d728be2b3681aeb838a4625d Mon Sep 17 00:00:00 2001 From: "Darren L. 
Weber, Ph.D" Date: Wed, 16 Aug 2017 22:02:27 -0700 Subject: [PATCH 29/30] setup-tools.sh - quietly and efficiently --- setup-tools.sh | 63 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/setup-tools.sh b/setup-tools.sh index 0b5b493a..0c69d1c6 100755 --- a/setup-tools.sh +++ b/setup-tools.sh @@ -9,48 +9,67 @@ fi # Connectivity tools -sudo yum install -y pssh rsync +echo "Install connectivity tools (ssh, rsync)" +sudo yum install -y -q pssh rsync # Dev tools -sudo yum install -y gcc gcc-c++ ant git +echo "Install dev tools (gcc, ant, git)" +sudo yum install -y -q gcc gcc-c++ ant git # Install java-8 for Spark 2.2.x -sudo yum install -y java-1.8.0 java-1.8.0-devel -sudo yum --enablerepo='*-debug*' install -y java-1.8.0-openjdk-debuginfo -sudo yum remove -y java-1.7.0 +echo "Install java-8" +sudo yum install -y -q java-1.8.0 java-1.8.0-devel +sudo yum --enablerepo='*-debug*' install -y -q java-1.8.0-openjdk-debuginfo +echo "Remove java-7 and set the default to java-8" +sudo yum remove -y -q java-1.7.0 sudo /usr/sbin/alternatives --auto java sudo /usr/sbin/alternatives --auto javac # Perf tools -sudo yum install -y dstat iotop strace sysstat htop perf -sudo debuginfo-install -y glibc -sudo debuginfo-install -y kernel +echo "Install performance tools" +sudo yum install -y -q dstat iotop strace sysstat htop perf +sudo debuginfo-install -y -q glibc +sudo debuginfo-install -y -q kernel # PySpark and MLlib deps -sudo yum install -y python-matplotlib python-tornado scipy libgfortran +echo "Install python tools" +sudo yum install -y -q python-matplotlib python-tornado scipy libgfortran # SparkR deps -sudo yum install -y R +echo "Install R tools" +sudo yum install -y -q R + # Ganglia -sudo yum install -y ganglia ganglia-web ganglia-gmond ganglia-gmetad +echo "Install Ganglia monitoring tools" +sudo yum install -y -q ganglia ganglia-web ganglia-gmond ganglia-gmetad # Install Maven -cd /tmp -wget "http://archive.apache.org/dist/maven/maven-3/3.2.3/binaries/apache-maven-3.2.3-bin.tar.gz" -tar xvzf apache-maven-3.2.3-bin.tar.gz -mv apache-maven-3.2.3 /opt/ +echo "Install Maven" +if [ ! -d /opt/apache-maven-3.2.3 ]; then + cd /tmp + wget "http://archive.apache.org/dist/maven/maven-3/3.2.3/binaries/apache-maven-3.2.3-bin.tar.gz" + tar zxf apache-maven-3.2.3-bin.tar.gz + mv apache-maven-3.2.3 /opt/ +fi # Edit bash profile -echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile -echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile -echo "export M2_HOME=/opt/apache-maven-3.2.3" >> ~/.bash_profile -echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile +echo "Update .bash_profile" +if grep -q 'java-1.8.0' ~/.bash_profile; then + echo ".bash_profile setup" +else + echo "export PS1=\"\\u@\\h \\W]\\$ \"" >> ~/.bash_profile + echo "export JAVA_HOME=/usr/lib/jvm/java-1.8.0" >> ~/.bash_profile + echo "export M2_HOME=/opt/apache-maven-3.2.3" >> ~/.bash_profile + echo "export PATH=\$PATH:\$M2_HOME/bin" >> ~/.bash_profile +fi source ~/.bash_profile # Create /usr/bin/realpath which is used by R to find Java installations # NOTE: /usr/bin/realpath is missing in CentOS AMIs. See # http://superuser.com/questions/771104/usr-bin-realpath-not-found-in-centos-6-5 -echo '#!/bin/bash' > /usr/bin/realpath -echo 'readlink -e "$@"' >> /usr/bin/realpath -chmod a+x /usr/bin/realpath +if [ ! 
-f /usr/bin/realpath ]; then + echo '#!/bin/bash' > /usr/bin/realpath + echo 'readlink -e "$@"' >> /usr/bin/realpath + chmod a+x /usr/bin/realpath +fi From 25bfa18d9d2431adea41123c641edeaa0a3b551c Mon Sep 17 00:00:00 2001 From: "Darren L. Weber, Ph.D" Date: Wed, 16 Aug 2017 22:03:11 -0700 Subject: [PATCH 30/30] setup.sh - use a full path to setup-slave script --- setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.sh b/setup.sh index 93fd28c2..d059a34e 100755 --- a/setup.sh +++ b/setup.sh @@ -71,7 +71,7 @@ pssh --inline \ --user root \ --extra-args "-t -t $SSH_OPTS" \ --timeout 0 \ - "spark-ec2/setup-slave.sh" + "${HOME}/spark-ec2/setup-slave.sh" setup_slave_end_time="$(date +'%s')" echo_time_diff "setup-slave" "$setup_slave_start_time" "$setup_slave_end_time"
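
Editor's note: taken together, patches 22-30 move the AMI and cluster setup to java-1.8.0, Scala 2.11.11, and a consolidated setup-tools.sh, while dropping the Tachyon/Hadoop/MapReduce modules from spark_ec2.py. A minimal post-setup sanity check is sketched below; it assumes setup-tools.sh has already run on the node and that ~/.bash_profile has been sourced (the mvn and Maven path checks in particular depend on that assumption, and the expected version strings are illustrative, not guaranteed by the patches).

```bash
#!/bin/bash
# Hypothetical sanity check after setup-tools.sh (patches 24/29) has run.
# All expectations below are assumptions drawn from the patch series, not
# checks that the scripts themselves perform.

# Java should resolve to the 1.8.0 alternative set by the script.
java -version 2>&1 | head -n 1            # expect a "1.8.0" build string
readlink -f "$(command -v java)"          # expect a path under /usr/lib/jvm/java-1.8.0*

# Maven 3.2.3 was unpacked to /opt and added to PATH via ~/.bash_profile.
ls -d /opt/apache-maven-3.2.3 2>/dev/null && mvn -v | head -n 1

# The realpath shim is created for R/SparkR on CentOS AMIs that lack it.
realpath /etc/hosts                       # expect "/etc/hosts"
```

If any of these checks fail, re-running setup-tools.sh as root (it exits early for non-root users) is the intended recovery path in this patch series.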