Merge lp:~cjwatson/launchpad/job-oops-timeline into lp:launchpad
- job-oops-timeline
- Merge into devel
Proposed by
Colin Watson
Status: | Merged |
---|---|
Merged at revision: | 18600 |
Proposed branch: | lp:~cjwatson/launchpad/job-oops-timeline |
Merge into: | lp:launchpad |
Diff against target: |
332 lines (+154/-9) 7 files modified
lib/lp/code/model/branchjob.py (+15/-1) lib/lp/code/model/tests/test_branchjob.py (+28/-1) lib/lp/services/job/interfaces/job.py (+8/-1) lib/lp/services/job/runner.py (+16/-1) lib/lp/services/job/tests/test_runner.py (+37/-1) lib/lp/services/webapp/adapter.py (+32/-4) lib/lp/services/webapp/doc/test_adapter.txt (+18/-0) |
To merge this branch: | bzr merge lp:~cjwatson/launchpad/job-oops-timeline |
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
William Grant | code | Approve | |
Review via email: mp+341554@code.launchpad.net |
Commit message
Use a per-job OOPS timeline.
Description of the change
Job OOPSes currently tend to be rather useless since they don't have a proper timeline set up, so they end up lacking SQL timelines and with other junk in them from previous things the runner has done. This should improve that situation.
To post a comment you must log in.
Revision history for this message
William Grant (wgrant) : | # |
review:
Approve
(code)
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | === modified file 'lib/lp/code/model/branchjob.py' |
2 | --- lib/lp/code/model/branchjob.py 2015-09-01 17:10:46 +0000 |
3 | +++ lib/lp/code/model/branchjob.py 2018-03-30 20:43:01 +0000 |
4 | @@ -1,4 +1,4 @@ |
5 | -# Copyright 2009-2014 Canonical Ltd. This software is licensed under the |
6 | +# Copyright 2009-2018 Canonical Ltd. This software is licensed under the |
7 | # GNU Affero General Public License version 3 (see the file LICENSE). |
8 | |
9 | __all__ = [ |
10 | @@ -331,6 +331,20 @@ |
11 | super(BranchScanJob, self).__init__(branch_job) |
12 | self._cached_branch_name = self.metadata['branch_name'] |
13 | |
14 | + @staticmethod |
15 | + def timeline_detail_filter(category, detail): |
16 | + """See `IRunnableJob`.""" |
17 | + # Branch scan jobs involve very large bulk INSERT statements, which |
18 | + # we don't want to log in full. Truncate these in the same kind of |
19 | + # way that python-oops-tools does for its list of most expensive |
20 | + # statements. |
21 | + maximum = 1000 |
22 | + splitter = ' ... ' |
23 | + if category.startswith('SQL-') and len(detail) > maximum: |
24 | + half = (maximum - len(splitter)) // 2 |
25 | + detail = (detail[:half] + splitter + detail[-half:]) |
26 | + return detail |
27 | + |
28 | def run(self): |
29 | """See `IBranchScanJob`.""" |
30 | from lp.services.scripts import log |
31 | |
32 | === modified file 'lib/lp/code/model/tests/test_branchjob.py' |
33 | --- lib/lp/code/model/tests/test_branchjob.py 2017-10-04 01:49:22 +0000 |
34 | +++ lib/lp/code/model/tests/test_branchjob.py 2018-03-30 20:43:01 +0000 |
35 | @@ -1,4 +1,4 @@ |
36 | -# Copyright 2009-2017 Canonical Ltd. This software is licensed under the |
37 | +# Copyright 2009-2018 Canonical Ltd. This software is licensed under the |
38 | # GNU Affero General Public License version 3 (see the file LICENSE). |
39 | |
40 | """Tests for BranchJobs.""" |
41 | @@ -16,6 +16,7 @@ |
42 | from bzrlib.bzrdir import BzrDir |
43 | from bzrlib.revision import NULL_REVISION |
44 | from bzrlib.transport import get_transport |
45 | +from fixtures import MockPatch |
46 | import pytz |
47 | from sqlobject import SQLObjectNotFound |
48 | from storm.locals import Store |
49 | @@ -206,6 +207,32 @@ |
50 | self.assertEqual(db_branch.revision_count, 1) |
51 | self.assertIn(db_branch, private_bug.linked_branches) |
52 | |
53 | + def test_sql_statements_truncated(self): |
54 | + """SQL statements in the job timeline are truncated.""" |
55 | + self.useBzrBranches(direct_database=True) |
56 | + db_branch, bzr_tree = self.create_branch_and_tree() |
57 | + job = BranchScanJob.create(db_branch) |
58 | + |
59 | + def mock_run(*args): |
60 | + IMasterStore(BranchJob).execute("SELECT '" + 'x' * 1000 + "'") |
61 | + raise Exception('boom') |
62 | + |
63 | + self.useFixture(MockPatch( |
64 | + 'lp.code.model.branchjob.BranchScanJob.run', mock_run)) |
65 | + runner = JobRunner([job]) |
66 | + with dbuser('branchscanner'): |
67 | + runner.runJobHandleError(job) |
68 | + self.assertEqual(1, len(self.oopses)) |
69 | + actions = [action[2:4] for action in self.oopses[0]['timeline']] |
70 | + # Long action details are truncated. |
71 | + self.assertIn( |
72 | + ('SQL-main-master', |
73 | + "SELECT '" + 'x' * 489 + ' ... ' + 'x' * 496 + "'"), |
74 | + actions) |
75 | + # Short action details are left untouched. |
76 | + self.assertIn( |
77 | + ('SQL-nostore', 'Transaction completed, status: Active'), actions) |
78 | + |
79 | |
80 | class TestBranchUpgradeJob(TestCaseWithFactory): |
81 | """Tests for `BranchUpgradeJob`.""" |
82 | |
83 | === modified file 'lib/lp/services/job/interfaces/job.py' |
84 | --- lib/lp/services/job/interfaces/job.py 2013-12-16 05:04:43 +0000 |
85 | +++ lib/lp/services/job/interfaces/job.py 2018-03-30 20:43:01 +0000 |
86 | @@ -1,4 +1,4 @@ |
87 | -# Copyright 2009-2013 Canonical Ltd. This software is licensed under the |
88 | +# Copyright 2009-2018 Canonical Ltd. This software is licensed under the |
89 | # GNU Affero General Public License version 3 (see the file LICENSE). |
90 | |
91 | """Interfaces including and related to IJob.""" |
92 | @@ -193,6 +193,13 @@ |
93 | retry_error_types = Attribute( |
94 | 'A tuple of exception classes which should cause a retry.') |
95 | |
96 | + # If implemented, this must not be an instance method of a job, as that |
97 | + # will create an uncollectable reference cycle via the timeline. |
98 | + timeline_detail_filter = Attribute( |
99 | + 'An optional (category, detail) -> detail callable to filter ' |
100 | + 'timeline action details. This may be used when some details are ' |
101 | + 'expected to be very large.') |
102 | + |
103 | def notifyUserError(e): |
104 | """Notify interested parties that this job encountered a user error. |
105 | |
106 | |
107 | === modified file 'lib/lp/services/job/runner.py' |
108 | --- lib/lp/services/job/runner.py 2017-09-27 02:12:20 +0000 |
109 | +++ lib/lp/services/job/runner.py 2018-03-30 20:43:01 +0000 |
110 | @@ -1,4 +1,4 @@ |
111 | -# Copyright 2009-2017 Canonical Ltd. This software is licensed under the |
112 | +# Copyright 2009-2018 Canonical Ltd. This software is licensed under the |
113 | # GNU Affero General Public License version 3 (see the file LICENSE). |
114 | |
115 | """Facilities for running Jobs.""" |
116 | @@ -81,6 +81,10 @@ |
117 | ) |
118 | from lp.services.twistedsupport import run_reactor |
119 | from lp.services.webapp import errorlog |
120 | +from lp.services.webapp.adapter import ( |
121 | + clear_request_started, |
122 | + set_request_started, |
123 | + ) |
124 | |
125 | |
126 | class BaseRunnableJobSource: |
127 | @@ -117,6 +121,8 @@ |
128 | retry_delay = timedelta(minutes=10) |
129 | soft_time_limit = timedelta(minutes=5) |
130 | |
131 | + timeline_detail_filter = None |
132 | + |
133 | # We redefine __eq__ and __ne__ here to prevent the security proxy |
134 | # from mucking up our comparisons in tests and elsewhere. |
135 | def __eq__(self, job): |
136 | @@ -311,6 +317,15 @@ |
137 | finally: |
138 | set_default_timeout_function(original_timeout_function) |
139 | |
140 | + def runJobHandleError(self, job, fallback=None): |
141 | + set_request_started( |
142 | + enable_timeout=False, detail_filter=job.timeline_detail_filter) |
143 | + try: |
144 | + return super(BaseJobRunner, self).runJobHandleError( |
145 | + job, fallback=fallback) |
146 | + finally: |
147 | + clear_request_started() |
148 | + |
149 | def userErrorTypes(self, job): |
150 | return removeSecurityProxy(job).user_error_types |
151 | |
152 | |
153 | === modified file 'lib/lp/services/job/tests/test_runner.py' |
154 | --- lib/lp/services/job/tests/test_runner.py 2017-09-14 03:57:01 +0000 |
155 | +++ lib/lp/services/job/tests/test_runner.py 2018-03-30 20:43:01 +0000 |
156 | @@ -1,4 +1,4 @@ |
157 | -# Copyright 2009-2017 Canonical Ltd. This software is licensed under the |
158 | +# Copyright 2009-2018 Canonical Ltd. This software is licensed under the |
159 | # GNU Affero General Public License version 3 (see the file LICENSE). |
160 | |
161 | """Tests for job-running facilities.""" |
162 | @@ -17,6 +17,7 @@ |
163 | LeaseHeld, |
164 | SuspendJobException, |
165 | ) |
166 | +from lazr.restful.utils import get_current_browser_request |
167 | from pytz import UTC |
168 | from testtools.matchers import ( |
169 | GreaterThan, |
170 | @@ -29,6 +30,7 @@ |
171 | from zope.interface import implementer |
172 | |
173 | from lp.services.config import config |
174 | +from lp.services.database.sqlbase import flush_database_updates |
175 | from lp.services.features.testing import FeatureFixture |
176 | from lp.services.job.interfaces.job import ( |
177 | IRunnableJob, |
178 | @@ -42,6 +44,7 @@ |
179 | TwistedJobRunner, |
180 | ) |
181 | from lp.services.log.logger import BufferLogger |
182 | +from lp.services.timeline.requesttimeline import get_request_timeline |
183 | from lp.services.timeout import ( |
184 | get_default_timeout_function, |
185 | set_default_timeout_function, |
186 | @@ -100,6 +103,15 @@ |
187 | raise RaisingJobException(self.message) |
188 | |
189 | |
190 | +class RaisingJobTimelineMessage(NullJob): |
191 | + """A job that records a timeline action and then raises when it runs.""" |
192 | + |
193 | + def run(self): |
194 | + timeline = get_request_timeline(get_current_browser_request()) |
195 | + timeline.start('job', self.message).finish() |
196 | + raise RaisingJobException(self.message) |
197 | + |
198 | + |
199 | class RaisingJobUserError(NullJob): |
200 | """A job that raises a user error when it runs.""" |
201 | |
202 | @@ -328,6 +340,30 @@ |
203 | runner.runJobHandleError(job) |
204 | self.assertEqual(1, len(self.oopses)) |
205 | |
206 | + def test_runJobHandleErrors_oops_timeline(self): |
207 | + """The oops timeline only covers the job itself.""" |
208 | + timeline = get_request_timeline(get_current_browser_request()) |
209 | + timeline.start('test', 'sentinel').finish() |
210 | + job = RaisingJobTimelineMessage('boom') |
211 | + flush_database_updates() |
212 | + runner = JobRunner([job]) |
213 | + runner.runJobHandleError(job) |
214 | + self.assertEqual(1, len(self.oopses)) |
215 | + actions = [action[2:4] for action in self.oopses[0]['timeline']] |
216 | + self.assertIn(('job', 'boom'), actions) |
217 | + self.assertNotIn(('test', 'sentinel'), actions) |
218 | + |
219 | + def test_runJobHandleErrors_oops_timeline_detail_filter(self): |
220 | + """A job can choose to filter oops timeline details.""" |
221 | + job = RaisingJobTimelineMessage('boom') |
222 | + job.timeline_detail_filter = lambda _, detail: '<redacted>' |
223 | + flush_database_updates() |
224 | + runner = JobRunner([job]) |
225 | + runner.runJobHandleError(job) |
226 | + self.assertEqual(1, len(self.oopses)) |
227 | + actions = [action[2:4] for action in self.oopses[0]['timeline']] |
228 | + self.assertIn(('job', '<redacted>'), actions) |
229 | + |
230 | def test_runJobHandleErrors_user_error_no_oops(self): |
231 | """If the job raises a user error, there is no oops.""" |
232 | job = RaisingJobUserError('boom') |
233 | |
234 | === modified file 'lib/lp/services/webapp/adapter.py' |
235 | --- lib/lp/services/webapp/adapter.py 2016-12-22 16:32:38 +0000 |
236 | +++ lib/lp/services/webapp/adapter.py 2018-03-30 20:43:01 +0000 |
237 | @@ -1,4 +1,4 @@ |
238 | -# Copyright 2009-2016 Canonical Ltd. This software is licensed under the |
239 | +# Copyright 2009-2018 Canonical Ltd. This software is licensed under the |
240 | # GNU Affero General Public License version 3 (see the file LICENSE). |
241 | |
242 | __metaclass__ = type |
243 | @@ -156,8 +156,28 @@ |
244 | action.finish() |
245 | |
246 | |
247 | +class FilteredTimeline(Timeline): |
248 | + """A timeline that filters its actions. |
249 | + |
250 | + This is useful for requests that are expected to log actions with very |
251 | + large details (for example, large bulk SQL INSERT statements), where we |
252 | + don't want the overhead of storing those in memory. |
253 | + """ |
254 | + |
255 | + def __init__(self, actions=None, detail_filter=None, **kwargs): |
256 | + super(FilteredTimeline, self).__init__(actions=actions, **kwargs) |
257 | + self.detail_filter = detail_filter |
258 | + |
259 | + def start(self, category, detail, allow_nested=False): |
260 | + """See `Timeline`.""" |
261 | + if self.detail_filter is not None: |
262 | + detail = self.detail_filter(category, detail) |
263 | + return super(FilteredTimeline, self).start(category, detail) |
264 | + |
265 | + |
266 | def set_request_started( |
267 | - starttime=None, request_statements=None, txn=None, enable_timeout=True): |
268 | + starttime=None, request_statements=None, txn=None, enable_timeout=True, |
269 | + detail_filter=None): |
270 | """Set the start time for the request being served by the current |
271 | thread. |
272 | |
273 | @@ -171,6 +191,9 @@ |
274 | txn.abort() calls are logged too. |
275 | :param enable_timeout: If True, a timeout error is raised if the request |
276 | runs for a longer time than the configured timeout. |
277 | + :param detail_filter: An optional (category, detail) -> detail callable |
278 | + that filters action details. This may be used when some details are |
279 | + expected to be very large. |
280 | """ |
281 | if getattr(_local, 'request_start_time', None) is not None: |
282 | warnings.warn('set_request_started() called before previous request ' |
283 | @@ -180,13 +203,18 @@ |
284 | starttime = time() |
285 | _local.request_start_time = starttime |
286 | request = get_current_browser_request() |
287 | + if detail_filter is not None: |
288 | + timeline_factory = partial( |
289 | + FilteredTimeline, detail_filter=detail_filter) |
290 | + else: |
291 | + timeline_factory = Timeline |
292 | if request_statements is not None: |
293 | # Specify a specific sequence object for the timeline. |
294 | - set_request_timeline(request, Timeline(request_statements)) |
295 | + set_request_timeline(request, timeline_factory(request_statements)) |
296 | else: |
297 | # Ensure a timeline is created, so that time offset for actions is |
298 | # reasonable. |
299 | - set_request_timeline(request, Timeline()) |
300 | + set_request_timeline(request, timeline_factory()) |
301 | _local.current_statement_timeout = None |
302 | _local.enable_timeout = enable_timeout |
303 | _local.commit_logger = CommitLogger(transaction) |
304 | |
305 | === modified file 'lib/lp/services/webapp/doc/test_adapter.txt' |
306 | --- lib/lp/services/webapp/doc/test_adapter.txt 2016-09-19 12:08:08 +0000 |
307 | +++ lib/lp/services/webapp/doc/test_adapter.txt 2018-03-30 20:43:01 +0000 |
308 | @@ -138,6 +138,24 @@ |
309 | >>> print no_request_warning[0].message |
310 | clear_request_started() called outside of a request |
311 | |
312 | +Some requests are expected to log actions with very large details, such as |
313 | +large bulk SQL INSERT statements. The request can be started with a |
314 | +callable that filters action details to avoid using an inordinate amount of |
315 | +memory on logging. |
316 | + |
317 | + >>> def detail_filter(category, detail): |
318 | + ... if category != 'SQL-nostore': |
319 | + ... detail = '<redacted>' |
320 | + ... return detail |
321 | + >>> set_request_started(detail_filter=detail_filter) |
322 | + >>> store.execute('SELECT 1', noresult=True) |
323 | + >>> transaction.abort() |
324 | + >>> for _, _, _, statement, _ in get_request_statements(): |
325 | + ... print statement |
326 | + <redacted> |
327 | + Transaction completed, status: Active |
328 | + >>> clear_request_started() |
329 | + |
330 | |
331 | Statement Timeout |
332 | ================= |
Last time I looked at this I decided I didn't really want to risk enabling SQL logging for jobs (see the request_starttime conditon in connection_ raw_execute) , but it shouldn't be too terrible hopefully.