Merge lp:~julian-edwards/maas/log-celery-exceptions-bug-1184589 into lp:~maas-committers/maas/trunk

Proposed by Julian Edwards
Status: Merged
Approved by: Julian Edwards
Approved revision: no longer in the source branch.
Merged at revision: 2041
Proposed branch: lp:~julian-edwards/maas/log-celery-exceptions-bug-1184589
Merge into: lp:~maas-committers/maas/trunk
Diff against target: 215 lines (+45/-3)
1 file modified
src/provisioningserver/tasks.py (+45/-3)
To merge this branch: bzr merge lp:~julian-edwards/maas/log-celery-exceptions-bug-1184589
Reviewer Review Type Date Requested Status
Gavin Panella (community) Approve
Review via email: mp+208723@code.launchpad.net

Commit message

Make sure failing celery jobs log the exception text

Description of the change

I make no apologies for this hideous hack, but when you're dealing with stupidity, you sometimes have to fight with more stupid.

The comment I added at the top of the code will suffice to explain:

# The tasks catch bare exceptions in an attempt to circumvent Celery's
# bizarre exception handling which prints a stack trace but not the
# error message contained in the exception itself! The message is
# printed and then the exception re-raised so that it marks the task as
# failed - in doing so it logs the stack trace, which is why the code
# does not do a simple logger.exception(exc).

To post a comment you must log in.
Revision history for this message
Gavin Panella (allenap) wrote :

Fight Fire With Fire!

review: Approve
Revision history for this message
Raphaël Badin (rvb) wrote :

Arg, I had a much simpler idea to fix this :/

Revision history for this message
Julian Edwards (julian-edwards) wrote :

On Sunday 02 Mar 2014 01:56:34 you wrote:
> Arg, I had a much simpler idea to fix this :/

And there endeth the lesson of writing your thoughts down on the bug :)

Revision history for this message
Raphaël Badin (rvb) wrote :

Here is the idea I had on how to fix the bug:

Celery messes with the error a bit but it still propagates the full original traceback. What I mean is that if you change the value of error_msg from "Task %(name)s[%(id)s] %(description)s: %(exc)s" to "Task %(name)s[%(id)s] %(description)s: %(traceback)s" in /usr/lib/python2.7/dist-packages/celery/worker/job.py:Request you'll see the full error message when a task fails.

We could do this by either monkey patching celery or registering a new log handler.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'src/provisioningserver/tasks.py'
--- src/provisioningserver/tasks.py 2014-02-27 02:33:54 +0000
+++ src/provisioningserver/tasks.py 2014-02-28 04:15:21 +0000
@@ -26,6 +26,7 @@
26 'write_full_dns_config',26 'write_full_dns_config',
27 ]27 ]
2828
29from functools import wraps
29from logging import getLogger30from logging import getLogger
30import os31import os
31from subprocess import CalledProcessError32from subprocess import CalledProcessError
@@ -77,7 +78,27 @@
77logger = getLogger(__name__)78logger = getLogger(__name__)
7879
7980
81# The tasks catch bare exceptions in an attempt to circumvent Celery's
82# bizarre exception handling which prints a stack trace but not the
83# error message contained in the exception itself! The message is
84# printed and then the exception re-raised so that it marks the task as
85# failed - in doing so it logs the stack trace, which is why the code
86# does not do a simple logger.exception(exc).
87def log_exception_text(func):
88 """Wrap a function and log any exception text raised."""
89 @wraps(func)
90 def wrapper(*args, **kwargs):
91 try:
92 func(*args, **kwargs)
93 except Exception as e:
94 logger.error("%s: %s", func.__name__, unicode(e))
95 raise
96 return wrapper
97
98
99
80@task100@task
101@log_exception_text
81def refresh_secrets(**kwargs):102def refresh_secrets(**kwargs):
82 """Update the worker's knowledge of various secrets it needs.103 """Update the worker's knowledge of various secrets it needs.
83104
@@ -152,12 +173,14 @@
152173
153174
154@task175@task
176@log_exception_text
155def power_on(power_type, **kwargs):177def power_on(power_type, **kwargs):
156 """Turn a node on."""178 """Turn a node on."""
157 issue_power_action(power_type, 'on', **kwargs)179 issue_power_action(power_type, 'on', **kwargs)
158180
159181
160@task182@task
183@log_exception_text
161def power_off(power_type, **kwargs):184def power_off(power_type, **kwargs):
162 """Turn a node off."""185 """Turn a node off."""
163 issue_power_action(power_type, 'off', **kwargs)186 issue_power_action(power_type, 'off', **kwargs)
@@ -175,6 +198,7 @@
175198
176199
177@task(max_retries=RNDC_COMMAND_MAX_RETRY, queue=celery_config.WORKER_QUEUE_DNS)200@task(max_retries=RNDC_COMMAND_MAX_RETRY, queue=celery_config.WORKER_QUEUE_DNS)
201@log_exception_text
178def rndc_command(arguments, retry=False, callback=None):202def rndc_command(arguments, retry=False, callback=None):
179 """Use rndc to execute a command.203 """Use rndc to execute a command.
180 :param arguments: Argument list passed down to the rndc command.204 :param arguments: Argument list passed down to the rndc command.
@@ -186,17 +210,19 @@
186 """210 """
187 try:211 try:
188 execute_rndc_command(arguments)212 execute_rndc_command(arguments)
189 except CalledProcessError, exc:213 except CalledProcessError as exc:
190 if retry:214 if retry:
191 return rndc_command.retry(215 return rndc_command.retry(
192 exc=exc, countdown=RNDC_COMMAND_RETRY_DELAY)216 exc=exc, countdown=RNDC_COMMAND_RETRY_DELAY)
193 else:217 else:
218 logger.error("rndc_command failed: %s", unicode(exc))
194 raise219 raise
195 if callback is not None:220 if callback is not None:
196 callback.delay()221 callback.delay()
197222
198223
199@task(queue=celery_config.WORKER_QUEUE_DNS)224@task(queue=celery_config.WORKER_QUEUE_DNS)
225@log_exception_text
200def write_full_dns_config(zones=None, callback=None, **kwargs):226def write_full_dns_config(zones=None, callback=None, **kwargs):
201 """Write out the DNS configuration files: the main configuration227 """Write out the DNS configuration files: the main configuration
202 file and the zone files.228 file and the zone files.
@@ -219,6 +245,7 @@
219245
220246
221@task(queue=celery_config.WORKER_QUEUE_DNS)247@task(queue=celery_config.WORKER_QUEUE_DNS)
248@log_exception_text
222def write_dns_config(zones=(), callback=None, **kwargs):249def write_dns_config(zones=(), callback=None, **kwargs):
223 """Write out the DNS configuration file.250 """Write out the DNS configuration file.
224251
@@ -236,6 +263,7 @@
236263
237264
238@task(queue=celery_config.WORKER_QUEUE_DNS)265@task(queue=celery_config.WORKER_QUEUE_DNS)
266@log_exception_text
239def write_dns_zone_config(zones, callback=None, **kwargs):267def write_dns_zone_config(zones, callback=None, **kwargs):
240 """Write out DNS zones.268 """Write out DNS zones.
241269
@@ -252,6 +280,7 @@
252280
253281
254@task(queue=celery_config.WORKER_QUEUE_DNS)282@task(queue=celery_config.WORKER_QUEUE_DNS)
283@log_exception_text
255def setup_rndc_configuration(callback=None):284def setup_rndc_configuration(callback=None):
256 """Write out the two rndc configuration files (rndc.conf and285 """Write out the two rndc configuration files (rndc.conf and
257 named.conf.rndc).286 named.conf.rndc).
@@ -270,6 +299,7 @@
270299
271300
272@task301@task
302@log_exception_text
273def upload_dhcp_leases():303def upload_dhcp_leases():
274 """Upload DHCP leases.304 """Upload DHCP leases.
275305
@@ -280,6 +310,7 @@
280310
281311
282@task312@task
313@log_exception_text
283def add_new_dhcp_host_map(mappings, server_address, shared_key):314def add_new_dhcp_host_map(mappings, server_address, shared_key):
284 """Add address mappings to the DHCP server.315 """Add address mappings to the DHCP server.
285316
@@ -295,15 +326,17 @@
295 try:326 try:
296 for ip_address, mac_address in mappings.items():327 for ip_address, mac_address in mappings.items():
297 omshell.create(ip_address, mac_address)328 omshell.create(ip_address, mac_address)
298 except CalledProcessError:329 except CalledProcessError as e:
299 # TODO signal to webapp that the job failed.330 # TODO signal to webapp that the job failed.
300331
301 # Re-raise, so the job is marked as failed. Only currently332 # Re-raise, so the job is marked as failed. Only currently
302 # useful for tests.333 # useful for tests.
334 logger.error("add_new_dhcp_host_map failed: %s", unicode(e))
303 raise335 raise
304336
305337
306@task338@task
339@log_exception_text
307def remove_dhcp_host_map(ip_address, server_address, omapi_key):340def remove_dhcp_host_map(ip_address, server_address, omapi_key):
308 """Remove an IP to MAC mapping in the DHCP server.341 """Remove an IP to MAC mapping in the DHCP server.
309342
@@ -317,15 +350,17 @@
317 omshell = Omshell(server_address, omapi_key)350 omshell = Omshell(server_address, omapi_key)
318 try:351 try:
319 omshell.remove(ip_address)352 omshell.remove(ip_address)
320 except CalledProcessError:353 except CalledProcessError as e:
321 # TODO signal to webapp that the job failed.354 # TODO signal to webapp that the job failed.
322355
323 # Re-raise, so the job is marked as failed. Only currently356 # Re-raise, so the job is marked as failed. Only currently
324 # useful for tests.357 # useful for tests.
358 logger.error("remove_dhcp_host_map failed: %s", unicode(e))
325 raise359 raise
326360
327361
328@task362@task
363@log_exception_text
329def write_dhcp_config(callback=None, **kwargs):364def write_dhcp_config(callback=None, **kwargs):
330 """Write out the DHCP configuration file and restart the DHCP server.365 """Write out the DHCP configuration file and restart the DHCP server.
331366
@@ -342,18 +377,21 @@
342377
343378
344@task379@task
380@log_exception_text
345def restart_dhcp_server():381def restart_dhcp_server():
346 """Restart the DHCP server."""382 """Restart the DHCP server."""
347 call_and_check(['sudo', '-n', 'service', 'maas-dhcp-server', 'restart'])383 call_and_check(['sudo', '-n', 'service', 'maas-dhcp-server', 'restart'])
348384
349385
350@task386@task
387@log_exception_text
351def stop_dhcp_server():388def stop_dhcp_server():
352 """Stop a DHCP server."""389 """Stop a DHCP server."""
353 call_and_check(['sudo', '-n', 'service', 'maas-dhcp-server', 'stop'])390 call_and_check(['sudo', '-n', 'service', 'maas-dhcp-server', 'stop'])
354391
355392
356@task393@task
394@log_exception_text
357def periodic_probe_dhcp():395def periodic_probe_dhcp():
358 """Probe for foreign DHCP servers."""396 """Probe for foreign DHCP servers."""
359 detect.periodic_probe_task()397 detect.periodic_probe_task()
@@ -365,6 +403,7 @@
365403
366404
367@task405@task
406@log_exception_text
368def report_boot_images():407def report_boot_images():
369 """For master worker only: report available netboot images."""408 """For master worker only: report available netboot images."""
370 boot_images.report_to_server()409 boot_images.report_to_server()
@@ -383,6 +422,7 @@
383422
384423
385@task(max_retries=UPDATE_NODE_TAGS_MAX_RETRY)424@task(max_retries=UPDATE_NODE_TAGS_MAX_RETRY)
425@log_exception_text
386def update_node_tags(tag_name, tag_definition, tag_nsmap, retry=True):426def update_node_tags(tag_name, tag_definition, tag_nsmap, retry=True):
387 """Update the nodes for a new/changed tag definition.427 """Update the nodes for a new/changed tag definition.
388428
@@ -405,6 +445,7 @@
405# =====================================================================445# =====================================================================
406446
407@task447@task
448@log_exception_text
408def import_boot_images(http_proxy=None, main_archive=None, ports_archive=None,449def import_boot_images(http_proxy=None, main_archive=None, ports_archive=None,
409 cloud_images_archive=None):450 cloud_images_archive=None):
410 env = dict(os.environ)451 env = dict(os.environ)
@@ -425,6 +466,7 @@
425# =====================================================================466# =====================================================================
426467
427@task468@task
469@log_exception_text
428def add_seamicro15k(mac, username, password):470def add_seamicro15k(mac, username, password):
429 """ See `maasserver.api.NodeGroupsHandler.add_seamicro15k`. """471 """ See `maasserver.api.NodeGroupsHandler.add_seamicro15k`. """
430 ip = find_ip_via_arp(mac)472 ip = find_ip_via_arp(mac)