Merge lp:~julian-edwards/maas/log-celery-exceptions-bug-1184589 into lp:~maas-committers/maas/trunk

Proposed by Julian Edwards
Status: Merged
Approved by: Julian Edwards
Approved revision: no longer in the source branch.
Merged at revision: 2041
Proposed branch: lp:~julian-edwards/maas/log-celery-exceptions-bug-1184589
Merge into: lp:~maas-committers/maas/trunk
Diff against target: 215 lines (+45/-3)
1 file modified
src/provisioningserver/tasks.py (+45/-3)
To merge this branch: bzr merge lp:~julian-edwards/maas/log-celery-exceptions-bug-1184589
Reviewer Review Type Date Requested Status
Gavin Panella (community) Approve
Review via email: mp+208723@code.launchpad.net

Commit message

Make sure failing celery jobs log the exception text

Description of the change

I make no apologies for this hideous hack, but when you're dealing with stupidity, you sometimes have to fight with more stupid.

The comment I added at the top of the code will suffice to explain:

# The tasks catch bare exceptions in an attempt to circumvent Celery's
# bizarre exception handling which prints a stack trace but not the
# error message contained in the exception itself! The message is
# printed and then the exception re-raised so that it marks the task as
# failed - in doing so it logs the stack trace, which is why the code
# does not do a simple logger.exception(exc).

To post a comment you must log in.
Revision history for this message
Gavin Panella (allenap) wrote :

Fight Fire With Fire!

review: Approve
Revision history for this message
Raphaël Badin (rvb) wrote :

Arg, I had a much simpler idea to fix this :/

Revision history for this message
Julian Edwards (julian-edwards) wrote :

On Sunday 02 Mar 2014 01:56:34 you wrote:
> Arg, I had a much simpler idea to fix this :/

And there endeth the lesson of writing your thoughts down on the bug :)

Revision history for this message
Raphaël Badin (rvb) wrote :

Here is the idea I had on how to fix the bug:

Celery messes with the error a bit but it still propagates the full original traceback. What I mean is that if you change the value of error_msg from "Task %(name)s[%(id)s] %(description)s: %(exc)s" to "Task %(name)s[%(id)s] %(description)s: %(traceback)s" in /usr/lib/python2.7/dist-packages/celery/worker/job.py:Request you'll see the full error message when a task fails.

We could do this by either monkey patching celery or registering a new log handler.

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'src/provisioningserver/tasks.py'
2--- src/provisioningserver/tasks.py 2014-02-27 02:33:54 +0000
3+++ src/provisioningserver/tasks.py 2014-02-28 04:15:21 +0000
4@@ -26,6 +26,7 @@
5 'write_full_dns_config',
6 ]
7
8+from functools import wraps
9 from logging import getLogger
10 import os
11 from subprocess import CalledProcessError
12@@ -77,7 +78,27 @@
13 logger = getLogger(__name__)
14
15
16+# The tasks catch bare exceptions in an attempt to circumvent Celery's
17+# bizarre exception handling which prints a stack trace but not the
18+# error message contained in the exception itself! The message is
19+# printed and then the exception re-raised so that it marks the task as
20+# failed - in doing so it logs the stack trace, which is why the code
21+# does not do a simple logger.exception(exc).
22+def log_exception_text(func):
23+ """Wrap a function and log any exception text raised."""
24+ @wraps(func)
25+ def wrapper(*args, **kwargs):
26+ try:
27+ func(*args, **kwargs)
28+ except Exception as e:
29+ logger.error("%s: %s", func.__name__, unicode(e))
30+ raise
31+ return wrapper
32+
33+
34+
35 @task
36+@log_exception_text
37 def refresh_secrets(**kwargs):
38 """Update the worker's knowledge of various secrets it needs.
39
40@@ -152,12 +173,14 @@
41
42
43 @task
44+@log_exception_text
45 def power_on(power_type, **kwargs):
46 """Turn a node on."""
47 issue_power_action(power_type, 'on', **kwargs)
48
49
50 @task
51+@log_exception_text
52 def power_off(power_type, **kwargs):
53 """Turn a node off."""
54 issue_power_action(power_type, 'off', **kwargs)
55@@ -175,6 +198,7 @@
56
57
58 @task(max_retries=RNDC_COMMAND_MAX_RETRY, queue=celery_config.WORKER_QUEUE_DNS)
59+@log_exception_text
60 def rndc_command(arguments, retry=False, callback=None):
61 """Use rndc to execute a command.
62 :param arguments: Argument list passed down to the rndc command.
63@@ -186,17 +210,19 @@
64 """
65 try:
66 execute_rndc_command(arguments)
67- except CalledProcessError, exc:
68+ except CalledProcessError as exc:
69 if retry:
70 return rndc_command.retry(
71 exc=exc, countdown=RNDC_COMMAND_RETRY_DELAY)
72 else:
73+ logger.error("rndc_command failed: %s", unicode(exc))
74 raise
75 if callback is not None:
76 callback.delay()
77
78
79 @task(queue=celery_config.WORKER_QUEUE_DNS)
80+@log_exception_text
81 def write_full_dns_config(zones=None, callback=None, **kwargs):
82 """Write out the DNS configuration files: the main configuration
83 file and the zone files.
84@@ -219,6 +245,7 @@
85
86
87 @task(queue=celery_config.WORKER_QUEUE_DNS)
88+@log_exception_text
89 def write_dns_config(zones=(), callback=None, **kwargs):
90 """Write out the DNS configuration file.
91
92@@ -236,6 +263,7 @@
93
94
95 @task(queue=celery_config.WORKER_QUEUE_DNS)
96+@log_exception_text
97 def write_dns_zone_config(zones, callback=None, **kwargs):
98 """Write out DNS zones.
99
100@@ -252,6 +280,7 @@
101
102
103 @task(queue=celery_config.WORKER_QUEUE_DNS)
104+@log_exception_text
105 def setup_rndc_configuration(callback=None):
106 """Write out the two rndc configuration files (rndc.conf and
107 named.conf.rndc).
108@@ -270,6 +299,7 @@
109
110
111 @task
112+@log_exception_text
113 def upload_dhcp_leases():
114 """Upload DHCP leases.
115
116@@ -280,6 +310,7 @@
117
118
119 @task
120+@log_exception_text
121 def add_new_dhcp_host_map(mappings, server_address, shared_key):
122 """Add address mappings to the DHCP server.
123
124@@ -295,15 +326,17 @@
125 try:
126 for ip_address, mac_address in mappings.items():
127 omshell.create(ip_address, mac_address)
128- except CalledProcessError:
129+ except CalledProcessError as e:
130 # TODO signal to webapp that the job failed.
131
132 # Re-raise, so the job is marked as failed. Only currently
133 # useful for tests.
134+ logger.error("add_new_dhcp_host_map failed: %s", unicode(e))
135 raise
136
137
138 @task
139+@log_exception_text
140 def remove_dhcp_host_map(ip_address, server_address, omapi_key):
141 """Remove an IP to MAC mapping in the DHCP server.
142
143@@ -317,15 +350,17 @@
144 omshell = Omshell(server_address, omapi_key)
145 try:
146 omshell.remove(ip_address)
147- except CalledProcessError:
148+ except CalledProcessError as e:
149 # TODO signal to webapp that the job failed.
150
151 # Re-raise, so the job is marked as failed. Only currently
152 # useful for tests.
153+ logger.error("remove_dhcp_host_map failed: %s", unicode(e))
154 raise
155
156
157 @task
158+@log_exception_text
159 def write_dhcp_config(callback=None, **kwargs):
160 """Write out the DHCP configuration file and restart the DHCP server.
161
162@@ -342,18 +377,21 @@
163
164
165 @task
166+@log_exception_text
167 def restart_dhcp_server():
168 """Restart the DHCP server."""
169 call_and_check(['sudo', '-n', 'service', 'maas-dhcp-server', 'restart'])
170
171
172 @task
173+@log_exception_text
174 def stop_dhcp_server():
175 """Stop a DHCP server."""
176 call_and_check(['sudo', '-n', 'service', 'maas-dhcp-server', 'stop'])
177
178
179 @task
180+@log_exception_text
181 def periodic_probe_dhcp():
182 """Probe for foreign DHCP servers."""
183 detect.periodic_probe_task()
184@@ -365,6 +403,7 @@
185
186
187 @task
188+@log_exception_text
189 def report_boot_images():
190 """For master worker only: report available netboot images."""
191 boot_images.report_to_server()
192@@ -383,6 +422,7 @@
193
194
195 @task(max_retries=UPDATE_NODE_TAGS_MAX_RETRY)
196+@log_exception_text
197 def update_node_tags(tag_name, tag_definition, tag_nsmap, retry=True):
198 """Update the nodes for a new/changed tag definition.
199
200@@ -405,6 +445,7 @@
201 # =====================================================================
202
203 @task
204+@log_exception_text
205 def import_boot_images(http_proxy=None, main_archive=None, ports_archive=None,
206 cloud_images_archive=None):
207 env = dict(os.environ)
208@@ -425,6 +466,7 @@
209 # =====================================================================
210
211 @task
212+@log_exception_text
213 def add_seamicro15k(mac, username, password):
214 """ See `maasserver.api.NodeGroupsHandler.add_seamicro15k`. """
215 ip = find_ip_via_arp(mac)