Merge lp:~ewollesen/charms/trusty/apache-spark/spark-config into lp:~bigdata-dev/charms/trusty/apache-spark/trunk

Proposed by Eric Wollesen
Status: Needs review
Proposed branch: lp:~ewollesen/charms/trusty/apache-spark/spark-config
Merge into: lp:~bigdata-dev/charms/trusty/apache-spark/trunk
Diff against target: 235 lines (+125/-17)
4 files modified
config.yaml (+17/-1)
hooks/callbacks.py (+20/-16)
hooks/eawutils.py (+45/-0)
tests/100-spark-config (+43/-0)
To merge this branch: bzr merge lp:~ewollesen/charms/trusty/apache-spark/spark-config
Reviewer: Juju Big Data Development
Status: Pending
Review via email: mp+260782@code.launchpad.net

Description of the change

Adds config options for +spark_local_dir+ and +spark_driver_cores+.


Unmerged revisions

15. By Eric Wollesen

Take two at a spark config amulet test.

The amulet configure method isn't working as I would expect. It doesn't
appear to be triggering the juju hooks that effect the requested
changes. As a result, the values aren't being written to disk, and
the test fails. Modifying values via juju set, however, works fine.

14. By Eric Wollesen

Re-arrange the install callback, so config-changed works

13. By Eric Wollesen

Merged ~bigdata-dev's trunk

12. By Eric Wollesen

Adds configuration for driver cores and local dir.

See config.yaml for details.
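
The revision 15 note above reports that amulet's Deployment.configure() does not appear to trigger the config-changed hook, while changing the options with juju set does. Below is a minimal sketch of one possible workaround for the test, assuming the test runner has the juju 1.x client on its PATH and that shelling out to it is acceptable; unit is the spark/0 sentry unit (cls.unit in the test), and _wait_for_config is a hypothetical helper, not part of this branch:

    import subprocess
    import time

    def _wait_for_config(unit, pattern, path, attempts=30, delay=10):
        # Hypothetical helper: poll the unit until the expected entry
        # appears in the rendered config file, or give up.
        for _ in range(attempts):
            output, retcode = unit.run("grep -Pq '%s' %s" % (pattern, path))
            if retcode == 0:
                return True
            time.sleep(delay)
        return False

    # Apply the options with the juju 1.x CLI instead of amulet's
    # configure(), mirroring what reportedly works when done by hand.
    subprocess.check_call(['juju', 'set', 'spark',
                           'spark_driver_cores=2', 'spark_local_dir=/var'])
    assert _wait_for_config(unit, r'spark.driver.cores\t2',
                            '/etc/spark/conf/spark-defaults.conf')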

Preview Diff

=== modified file 'config.yaml'
--- config.yaml 2015-05-30 04:34:18 +0000
+++ config.yaml 2015-06-03 05:31:01 +0000
@@ -4,4 +4,20 @@
     default: ''
     description: |
       URL from which to fetch resources (e.g., Hadoop binaries) instead of Launchpad.
-
+  spark_driver_cores:
+    type: int
+    default: 1
+    description: |
+      Number of cores to use for the driver process, only in cluster
+      mode.
+  spark_local_dir:
+    type: string
+    default: /tmp
+    description: |
+      Directory to use for "scratch" space in Spark, including map
+      output files and RDDs that get stored on disk. This should be on a
+      fast, local disk in your system. It can also be a comma-separated
+      list of multiple directories on different disks. NOTE: In Spark
+      1.0 and later this will be overridden by SPARK_LOCAL_DIRS
+      (Standalone, Mesos) or LOCAL_DIRS (YARN) environment variables set
+      by the cluster manager.
 
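
Inside the charm, the new options arrive through charmhelpers' hookenv.config() and are rewritten from config-key form (spark_driver_cores) to Spark property form (spark.driver.cores) before being merged into spark-defaults.conf; the hooks/eawutils.py helpers added later in this diff do exactly that. A minimal standalone sketch of the mapping, using only the standard library (translate_options is illustrative, not part of the branch):

    def translate_options(config):
        # Keep only the spark_* options and swap the separator,
        # e.g. spark_driver_cores -> spark.driver.cores.
        return dict((k.replace('_', '.'), v)
                    for k, v in config.items()
                    if k.startswith('spark_'))

    charm_config = {'resources_mirror': '',
                    'spark_driver_cores': 2,
                    'spark_local_dir': '/var'}
    assert translate_options(charm_config) == {'spark.driver.cores': 2,
                                               'spark.local.dir': '/var'}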
=== modified file 'hooks/callbacks.py'
--- hooks/callbacks.py 2015-06-02 13:45:28 +0000
+++ hooks/callbacks.py 2015-06-03 05:31:01 +0000
@@ -1,4 +1,3 @@
-
 from subprocess import check_output, Popen
 
 import jujuresources
@@ -6,7 +5,7 @@
 from charmhelpers.core import unitdata
 from charmhelpers.contrib.bigdata import utils
 from path import Path
-
+import eawutils
 
 class Spark(object):
 
@@ -18,8 +17,11 @@
         return unitdata.kv().get('spark.installed')
 
     def install(self, force=False):
-        if not force and self.is_installed():
-            return
+        if force or not self.is_installed():
+            self.install_spark()
+        self.configure_spark()
+
+    def install_spark(self):
         mirror_url = hookenv.config()['resources_mirror']
         jujuresources.fetch('spark-%s' % self.cpu_arch, mirror_url=mirror_url)
         jujuresources.install('spark-%s' % self.cpu_arch,
@@ -29,9 +31,8 @@
         self.dist_config.add_dirs()
         self.dist_config.add_packages()
         self.setup_spark_config()
-        self.configure_spark()
         unitdata.kv().set('spark.installed', True)
-
+
     def install_demo(self):
         '''
         Install demo.sh script to /home/ubuntu
@@ -42,11 +43,11 @@
         Path(demo_source).copy(demo_target)
         Path(demo_target).chmod(0o755)
         Path(demo_target).chown('ubuntu', 'hadoop')
-
+
     def setup_spark_config(self):
         '''
         copy Spark's default configuration files to spark_conf property defined
-        in dist.yaml
+        in dist.yaml
         '''
         conf_dir = self.dist_config.path('spark') / 'conf'
         self.dist_config.path('spark_conf').rmtree_p()
@@ -64,10 +65,10 @@
         utils.re_edit_in_place(spark_log4j, {
             r'log4j.rootCategory=INFO, console': 'log4j.rootCategory=ERROR, console',
         })
-
+
     def configure_spark(self):
         '''
-        Configure spark environment for all users
+        Configure spark environment for all users
         '''
         from subprocess import call
         spark_bin = self.dist_config.path('spark') / 'bin'
@@ -78,12 +79,15 @@
         env['SPARK_CONF_DIR'] = self.dist_config.path('spark_conf')
         self.configure_spark_hdfs()
         self.spark_optimize()
+        spark_default = self.dist_config.path('spark_conf') / 'spark-defaults.conf'
+        spark_config = eawutils.getSparkConfig(hookenv.config())
+        eawutils.updateSparkConfig(spark_default, spark_config)
         cmd = "chown -R ubuntu:hadoop {}".format (spark_home)
         call(cmd.split())
         cmd = "chown -R ubuntu:hadoop {}".format (self.dist_config.path('spark_conf'))
         call(cmd.split())
-
-    def configure_spark_hdfs(self):
+
+    def configure_spark_hdfs(self):
         e = utils.read_etc_env()
         utils.run_as('hdfs', 'hdfs', 'dfs', '-mkdir', '-p', '/user/ubuntu/directory', env=e)
         utils.run_as('hdfs', 'hdfs', 'dfs', '-chown', '-R', 'ubuntu:hadoop', '/user/ubuntu/directory', env=e)
@@ -107,19 +111,19 @@
             r'.*spark.eventLog.enabled *.*':'spark.eventLog.enabled true',
             r'.*spark.eventLog.dir *.*':'spark.eventLog.dir hdfs:///user/ubuntu/directory',
         })
-
-
+
+
     def start(self):
         e = utils.read_etc_env()
         spark_home = self.dist_config.path('spark')
         if utils.jps("HistoryServer"):
             self.stop()
         utils.run_as('ubuntu', '{}/sbin/start-history-server.sh'.format(spark_home), 'hdfs:///user/ubuntu/directory', env=e)
-
+
     def stop(self):
         e = utils.read_etc_env()
         spark_home = self.dist_config.path('spark')
-        utils.run_as('ubuntu', '{}/sbin/stop-history-server.sh'.format(spark_home), env=e)
+        utils.run_as('ubuntu', '{}/sbin/stop-history-server.sh'.format(spark_home), env=e)
 
     def cleanup(self):
         self.dist_config.remove_dirs()
 
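
The change to install() above splits installation from configuration: the heavy install_spark() step still runs at most once (unless forced), while configure_spark() now runs on every call and upserts the spark_* options into spark-defaults.conf. That is what lets the config-changed hook reuse the same entry point, per the "Re-arrange the install callback" revision. A self-contained sketch of that control flow, using a stub in place of the real Spark class:

    class FakeSpark(object):
        # Stub that mirrors the rearranged install()/install_spark() split.
        def __init__(self):
            self.installed = False
            self.install_runs = 0
            self.configure_runs = 0

        def is_installed(self):
            return self.installed

        def install_spark(self):
            self.install_runs += 1
            self.installed = True

        def configure_spark(self):
            self.configure_runs += 1

        def install(self, force=False):
            # Same shape as the diff: install once, configure every time.
            if force or not self.is_installed():
                self.install_spark()
            self.configure_spark()

    spark = FakeSpark()
    spark.install()    # install hook
    spark.install()    # config-changed re-entry
    assert spark.install_runs == 1 and spark.configure_runs == 2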
=== added file 'hooks/eawutils.py'
--- hooks/eawutils.py 1970-01-01 00:00:00 +0000
+++ hooks/eawutils.py 2015-06-03 05:31:01 +0000
@@ -0,0 +1,45 @@
+# These functions should live in charmhelpers.contrib.bigdata.utils or
+# somewhere similar.
+import re
+from charmhelpers.contrib.bigdata import utils
+
+def updateSparkConfig(path, config):
+    """Updates spark config settings in +path+.
+
+    Assumes +path+ is in spark config file syntax."""
+    inserts, updates = calcSparkConfigUpserts(path, config)
+
+    utils.re_edit_in_place(path, updates)
+    with open(path, 'a') as configFile:
+        for item in inserts.items():
+            configFile.write("%s\t%s\n" % item)
+
+def calcSparkConfigUpserts(path, config):
+    """Calculate upserts to transform +path+ to +config+, idempotently.
+
+    Returns (inserts, updates)."""
+    inserts = config.copy()
+    updates = {}
+
+    with open(path, 'r') as configFile:
+        for line in configFile.readlines():
+            if line.startswith("#") or re.match(r'\A\s*\Z', line):
+                continue
+            key = line.split(None, 1)[0]
+            if key in config:
+                updates[r"^%s\s.*" % key] = "%s\t%s" % (key, config[key])
+                inserts.pop(key)
+
+    return inserts, updates
+
+def getKeysStartingWith(d, prefix):
+    "Return a dict of the keys prefixed with +prefix+."
+    return dict([(k,v) for k,v in d.items() if k.startswith(prefix)])
+
+def underscoreToDot(d):
+    "Return the dictionary with underscores in keys replaced with dots."
+    return dict([(k.replace("_", "."),v) for k,v in d.items()])
+
+def getSparkConfig(config):
+    "Return a dict of the 'spark_'-prefixed config options, keyed by their Spark property names."
+    return underscoreToDot(getKeysStartingWith(config, "spark_"))
 
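
The upsert helpers above are split so that keys already present in spark-defaults.conf become in-place edits (via utils.re_edit_in_place) and only genuinely new keys get appended. A rough usage sketch of calcSparkConfigUpserts against a throwaway file; it assumes the charm's charmhelpers copy is importable so that eawutils loads, and the sample file contents are made up for illustration:

    import tempfile

    import eawutils  # needs charmhelpers.contrib.bigdata on sys.path

    sample = ("# spark-defaults.conf\n"
              "spark.eventLog.enabled\ttrue\n"
              "spark.local.dir\t/tmp\n")
    with tempfile.NamedTemporaryFile('w', suffix='.conf', delete=False) as f:
        f.write(sample)
        path = f.name

    desired = {'spark.driver.cores': 2, 'spark.local.dir': '/var'}
    inserts, updates = eawutils.calcSparkConfigUpserts(path, desired)

    # spark.local.dir already exists, so it becomes a regex update;
    # spark.driver.cores is new, so it stays in the insert set.
    assert inserts == {'spark.driver.cores': 2}
    assert updates == {r'^spark.local.dir\s.*': 'spark.local.dir\t/var'}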
=== added file 'tests/100-spark-config'
--- tests/100-spark-config 1970-01-01 00:00:00 +0000
+++ tests/100-spark-config 2015-06-03 05:31:01 +0000
@@ -0,0 +1,43 @@
+#!/usr/bin/python3
+
+import unittest
+import amulet
+
+
+class TestSparkConfig(unittest.TestCase):
+    """
+    Configuration settings test for Apache Spark.
+    """
+
+    @classmethod
+    def setUpClass(cls):
+        cls.d = amulet.Deployment(series='trusty')
+        #### Deploy a hadoop cluster
+        cls.d.add('yarn-master', charm='cs:~bigdata-dev/trusty/apache-hadoop-yarn-master')
+        cls.d.add('hdfs-master', charm='cs:~bigdata-dev/trusty/apache-hadoop-hdfs-master')
+        cls.d.add('compute-slave', charm='cs:~bigdata-dev/trusty/apache-hadoop-compute-slave', units=2)
+        cls.d.add('hadoop-plugin', charm='cs:~bigdata-dev/trusty/apache-hadoop-plugin')
+        cls.d.relate('yarn-master:namenode', 'hdfs-master:namenode')
+        cls.d.relate('yarn-master:resourcemanager', 'hadoop-plugin:resourcemanager')
+        cls.d.relate('hadoop-plugin:namenode', 'hdfs-master:namenode')
+
+        cls.d.relate('compute-slave:nodemanager', 'yarn-master:nodemanager')
+        cls.d.relate('compute-slave:datanode', 'hdfs-master:datanode')
+
+        ### Add Spark Service
+        cls.d.add('spark', 'apache-spark')
+        cls.d.configure('spark', {'spark_driver_cores': 2,
+                                  'spark_local_dir': '/var'})
+        cls.d.relate('hadoop-plugin:hadoop-plugin', 'spark:hadoop-plugin')
+
+        cls.d.setup(timeout=9000)
+        cls.d.sentry.wait()
+        cls.unit = cls.d.sentry.unit['spark/0']
+
+    def test_config_setting(self):
+        output, retcode = self.unit.run("grep -Pq 'spark.driver.cores\t2' /etc/spark/conf/spark-defaults.conf")
+        self.assertEqual(retcode, 0, 'failed to configure spark service\n')
+
+
+if __name__ == '__main__':
+    unittest.main()
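
The test above checks only spark.driver.cores; a companion assertion for the other option set in setUpClass could be added to TestSparkConfig along these lines (a suggested addition, not part of this branch):

    def test_local_dir_setting(self):
        # spark_local_dir was configured to /var, which should surface as a
        # tab-separated spark.local.dir entry in spark-defaults.conf.
        output, retcode = self.unit.run(
            "grep -Pq 'spark.local.dir\t/var' /etc/spark/conf/spark-defaults.conf")
        self.assertEqual(retcode, 0, 'failed to configure spark.local.dir\n')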
