Merge lp:~cjwatson/launchpad/job-oops-timeline into lp:launchpad

Proposed by Colin Watson
Status: Merged
Merged at revision: 18600
Proposed branch: lp:~cjwatson/launchpad/job-oops-timeline
Merge into: lp:launchpad
Diff against target: 332 lines (+154/-9)
7 files modified
lib/lp/code/model/branchjob.py (+15/-1)
lib/lp/code/model/tests/test_branchjob.py (+28/-1)
lib/lp/services/job/interfaces/job.py (+8/-1)
lib/lp/services/job/runner.py (+16/-1)
lib/lp/services/job/tests/test_runner.py (+37/-1)
lib/lp/services/webapp/adapter.py (+32/-4)
lib/lp/services/webapp/doc/test_adapter.txt (+18/-0)
To merge this branch: bzr merge lp:~cjwatson/launchpad/job-oops-timeline
Reviewer Review Type Date Requested Status
William Grant code Approve
Review via email: mp+341554@code.launchpad.net

Commit message

Use a per-job OOPS timeline.

Description of the change

Job OOPSes currently tend to be rather useless since they don't have a proper timeline set up, so they end up lacking SQL timelines and with other junk in them from previous things the runner has done. This should improve that situation.

To post a comment you must log in.
Revision history for this message
William Grant (wgrant) wrote :

Last time I looked at this I decided I didn't really want to risk enabling SQL logging for jobs (see the request_starttime conditon in connection_raw_execute), but it shouldn't be too terrible hopefully.

review: Approve (code)
Revision history for this message
William Grant (wgrant) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/code/model/branchjob.py'
2--- lib/lp/code/model/branchjob.py 2015-09-01 17:10:46 +0000
3+++ lib/lp/code/model/branchjob.py 2018-03-30 20:43:01 +0000
4@@ -1,4 +1,4 @@
5-# Copyright 2009-2014 Canonical Ltd. This software is licensed under the
6+# Copyright 2009-2018 Canonical Ltd. This software is licensed under the
7 # GNU Affero General Public License version 3 (see the file LICENSE).
8
9 __all__ = [
10@@ -331,6 +331,20 @@
11 super(BranchScanJob, self).__init__(branch_job)
12 self._cached_branch_name = self.metadata['branch_name']
13
14+ @staticmethod
15+ def timeline_detail_filter(category, detail):
16+ """See `IRunnableJob`."""
17+ # Branch scan jobs involve very large bulk INSERT statements, which
18+ # we don't want to log in full. Truncate these in the same kind of
19+ # way that python-oops-tools does for its list of most expensive
20+ # statements.
21+ maximum = 1000
22+ splitter = ' ... '
23+ if category.startswith('SQL-') and len(detail) > maximum:
24+ half = (maximum - len(splitter)) // 2
25+ detail = (detail[:half] + splitter + detail[-half:])
26+ return detail
27+
28 def run(self):
29 """See `IBranchScanJob`."""
30 from lp.services.scripts import log
31
32=== modified file 'lib/lp/code/model/tests/test_branchjob.py'
33--- lib/lp/code/model/tests/test_branchjob.py 2017-10-04 01:49:22 +0000
34+++ lib/lp/code/model/tests/test_branchjob.py 2018-03-30 20:43:01 +0000
35@@ -1,4 +1,4 @@
36-# Copyright 2009-2017 Canonical Ltd. This software is licensed under the
37+# Copyright 2009-2018 Canonical Ltd. This software is licensed under the
38 # GNU Affero General Public License version 3 (see the file LICENSE).
39
40 """Tests for BranchJobs."""
41@@ -16,6 +16,7 @@
42 from bzrlib.bzrdir import BzrDir
43 from bzrlib.revision import NULL_REVISION
44 from bzrlib.transport import get_transport
45+from fixtures import MockPatch
46 import pytz
47 from sqlobject import SQLObjectNotFound
48 from storm.locals import Store
49@@ -206,6 +207,32 @@
50 self.assertEqual(db_branch.revision_count, 1)
51 self.assertIn(db_branch, private_bug.linked_branches)
52
53+ def test_sql_statements_truncated(self):
54+ """SQL statements in the job timeline are truncated."""
55+ self.useBzrBranches(direct_database=True)
56+ db_branch, bzr_tree = self.create_branch_and_tree()
57+ job = BranchScanJob.create(db_branch)
58+
59+ def mock_run(*args):
60+ IMasterStore(BranchJob).execute("SELECT '" + 'x' * 1000 + "'")
61+ raise Exception('boom')
62+
63+ self.useFixture(MockPatch(
64+ 'lp.code.model.branchjob.BranchScanJob.run', mock_run))
65+ runner = JobRunner([job])
66+ with dbuser('branchscanner'):
67+ runner.runJobHandleError(job)
68+ self.assertEqual(1, len(self.oopses))
69+ actions = [action[2:4] for action in self.oopses[0]['timeline']]
70+ # Long action details are truncated.
71+ self.assertIn(
72+ ('SQL-main-master',
73+ "SELECT '" + 'x' * 489 + ' ... ' + 'x' * 496 + "'"),
74+ actions)
75+ # Short action details are left untouched.
76+ self.assertIn(
77+ ('SQL-nostore', 'Transaction completed, status: Active'), actions)
78+
79
80 class TestBranchUpgradeJob(TestCaseWithFactory):
81 """Tests for `BranchUpgradeJob`."""
82
83=== modified file 'lib/lp/services/job/interfaces/job.py'
84--- lib/lp/services/job/interfaces/job.py 2013-12-16 05:04:43 +0000
85+++ lib/lp/services/job/interfaces/job.py 2018-03-30 20:43:01 +0000
86@@ -1,4 +1,4 @@
87-# Copyright 2009-2013 Canonical Ltd. This software is licensed under the
88+# Copyright 2009-2018 Canonical Ltd. This software is licensed under the
89 # GNU Affero General Public License version 3 (see the file LICENSE).
90
91 """Interfaces including and related to IJob."""
92@@ -193,6 +193,13 @@
93 retry_error_types = Attribute(
94 'A tuple of exception classes which should cause a retry.')
95
96+ # If implemented, this must not be an instance method of a job, as that
97+ # will create an uncollectable reference cycle via the timeline.
98+ timeline_detail_filter = Attribute(
99+ 'An optional (category, detail) -> detail callable to filter '
100+ 'timeline action details. This may be used when some details are '
101+ 'expected to be very large.')
102+
103 def notifyUserError(e):
104 """Notify interested parties that this job encountered a user error.
105
106
107=== modified file 'lib/lp/services/job/runner.py'
108--- lib/lp/services/job/runner.py 2017-09-27 02:12:20 +0000
109+++ lib/lp/services/job/runner.py 2018-03-30 20:43:01 +0000
110@@ -1,4 +1,4 @@
111-# Copyright 2009-2017 Canonical Ltd. This software is licensed under the
112+# Copyright 2009-2018 Canonical Ltd. This software is licensed under the
113 # GNU Affero General Public License version 3 (see the file LICENSE).
114
115 """Facilities for running Jobs."""
116@@ -81,6 +81,10 @@
117 )
118 from lp.services.twistedsupport import run_reactor
119 from lp.services.webapp import errorlog
120+from lp.services.webapp.adapter import (
121+ clear_request_started,
122+ set_request_started,
123+ )
124
125
126 class BaseRunnableJobSource:
127@@ -117,6 +121,8 @@
128 retry_delay = timedelta(minutes=10)
129 soft_time_limit = timedelta(minutes=5)
130
131+ timeline_detail_filter = None
132+
133 # We redefine __eq__ and __ne__ here to prevent the security proxy
134 # from mucking up our comparisons in tests and elsewhere.
135 def __eq__(self, job):
136@@ -311,6 +317,15 @@
137 finally:
138 set_default_timeout_function(original_timeout_function)
139
140+ def runJobHandleError(self, job, fallback=None):
141+ set_request_started(
142+ enable_timeout=False, detail_filter=job.timeline_detail_filter)
143+ try:
144+ return super(BaseJobRunner, self).runJobHandleError(
145+ job, fallback=fallback)
146+ finally:
147+ clear_request_started()
148+
149 def userErrorTypes(self, job):
150 return removeSecurityProxy(job).user_error_types
151
152
153=== modified file 'lib/lp/services/job/tests/test_runner.py'
154--- lib/lp/services/job/tests/test_runner.py 2017-09-14 03:57:01 +0000
155+++ lib/lp/services/job/tests/test_runner.py 2018-03-30 20:43:01 +0000
156@@ -1,4 +1,4 @@
157-# Copyright 2009-2017 Canonical Ltd. This software is licensed under the
158+# Copyright 2009-2018 Canonical Ltd. This software is licensed under the
159 # GNU Affero General Public License version 3 (see the file LICENSE).
160
161 """Tests for job-running facilities."""
162@@ -17,6 +17,7 @@
163 LeaseHeld,
164 SuspendJobException,
165 )
166+from lazr.restful.utils import get_current_browser_request
167 from pytz import UTC
168 from testtools.matchers import (
169 GreaterThan,
170@@ -29,6 +30,7 @@
171 from zope.interface import implementer
172
173 from lp.services.config import config
174+from lp.services.database.sqlbase import flush_database_updates
175 from lp.services.features.testing import FeatureFixture
176 from lp.services.job.interfaces.job import (
177 IRunnableJob,
178@@ -42,6 +44,7 @@
179 TwistedJobRunner,
180 )
181 from lp.services.log.logger import BufferLogger
182+from lp.services.timeline.requesttimeline import get_request_timeline
183 from lp.services.timeout import (
184 get_default_timeout_function,
185 set_default_timeout_function,
186@@ -100,6 +103,15 @@
187 raise RaisingJobException(self.message)
188
189
190+class RaisingJobTimelineMessage(NullJob):
191+ """A job that records a timeline action and then raises when it runs."""
192+
193+ def run(self):
194+ timeline = get_request_timeline(get_current_browser_request())
195+ timeline.start('job', self.message).finish()
196+ raise RaisingJobException(self.message)
197+
198+
199 class RaisingJobUserError(NullJob):
200 """A job that raises a user error when it runs."""
201
202@@ -328,6 +340,30 @@
203 runner.runJobHandleError(job)
204 self.assertEqual(1, len(self.oopses))
205
206+ def test_runJobHandleErrors_oops_timeline(self):
207+ """The oops timeline only covers the job itself."""
208+ timeline = get_request_timeline(get_current_browser_request())
209+ timeline.start('test', 'sentinel').finish()
210+ job = RaisingJobTimelineMessage('boom')
211+ flush_database_updates()
212+ runner = JobRunner([job])
213+ runner.runJobHandleError(job)
214+ self.assertEqual(1, len(self.oopses))
215+ actions = [action[2:4] for action in self.oopses[0]['timeline']]
216+ self.assertIn(('job', 'boom'), actions)
217+ self.assertNotIn(('test', 'sentinel'), actions)
218+
219+ def test_runJobHandleErrors_oops_timeline_detail_filter(self):
220+ """A job can choose to filter oops timeline details."""
221+ job = RaisingJobTimelineMessage('boom')
222+ job.timeline_detail_filter = lambda _, detail: '<redacted>'
223+ flush_database_updates()
224+ runner = JobRunner([job])
225+ runner.runJobHandleError(job)
226+ self.assertEqual(1, len(self.oopses))
227+ actions = [action[2:4] for action in self.oopses[0]['timeline']]
228+ self.assertIn(('job', '<redacted>'), actions)
229+
230 def test_runJobHandleErrors_user_error_no_oops(self):
231 """If the job raises a user error, there is no oops."""
232 job = RaisingJobUserError('boom')
233
234=== modified file 'lib/lp/services/webapp/adapter.py'
235--- lib/lp/services/webapp/adapter.py 2016-12-22 16:32:38 +0000
236+++ lib/lp/services/webapp/adapter.py 2018-03-30 20:43:01 +0000
237@@ -1,4 +1,4 @@
238-# Copyright 2009-2016 Canonical Ltd. This software is licensed under the
239+# Copyright 2009-2018 Canonical Ltd. This software is licensed under the
240 # GNU Affero General Public License version 3 (see the file LICENSE).
241
242 __metaclass__ = type
243@@ -156,8 +156,28 @@
244 action.finish()
245
246
247+class FilteredTimeline(Timeline):
248+ """A timeline that filters its actions.
249+
250+ This is useful for requests that are expected to log actions with very
251+ large details (for example, large bulk SQL INSERT statements), where we
252+ don't want the overhead of storing those in memory.
253+ """
254+
255+ def __init__(self, actions=None, detail_filter=None, **kwargs):
256+ super(FilteredTimeline, self).__init__(actions=actions, **kwargs)
257+ self.detail_filter = detail_filter
258+
259+ def start(self, category, detail, allow_nested=False):
260+ """See `Timeline`."""
261+ if self.detail_filter is not None:
262+ detail = self.detail_filter(category, detail)
263+ return super(FilteredTimeline, self).start(category, detail)
264+
265+
266 def set_request_started(
267- starttime=None, request_statements=None, txn=None, enable_timeout=True):
268+ starttime=None, request_statements=None, txn=None, enable_timeout=True,
269+ detail_filter=None):
270 """Set the start time for the request being served by the current
271 thread.
272
273@@ -171,6 +191,9 @@
274 txn.abort() calls are logged too.
275 :param enable_timeout: If True, a timeout error is raised if the request
276 runs for a longer time than the configured timeout.
277+ :param detail_filter: An optional (category, detail) -> detail callable
278+ that filters action details. This may be used when some details are
279+ expected to be very large.
280 """
281 if getattr(_local, 'request_start_time', None) is not None:
282 warnings.warn('set_request_started() called before previous request '
283@@ -180,13 +203,18 @@
284 starttime = time()
285 _local.request_start_time = starttime
286 request = get_current_browser_request()
287+ if detail_filter is not None:
288+ timeline_factory = partial(
289+ FilteredTimeline, detail_filter=detail_filter)
290+ else:
291+ timeline_factory = Timeline
292 if request_statements is not None:
293 # Specify a specific sequence object for the timeline.
294- set_request_timeline(request, Timeline(request_statements))
295+ set_request_timeline(request, timeline_factory(request_statements))
296 else:
297 # Ensure a timeline is created, so that time offset for actions is
298 # reasonable.
299- set_request_timeline(request, Timeline())
300+ set_request_timeline(request, timeline_factory())
301 _local.current_statement_timeout = None
302 _local.enable_timeout = enable_timeout
303 _local.commit_logger = CommitLogger(transaction)
304
305=== modified file 'lib/lp/services/webapp/doc/test_adapter.txt'
306--- lib/lp/services/webapp/doc/test_adapter.txt 2016-09-19 12:08:08 +0000
307+++ lib/lp/services/webapp/doc/test_adapter.txt 2018-03-30 20:43:01 +0000
308@@ -138,6 +138,24 @@
309 >>> print no_request_warning[0].message
310 clear_request_started() called outside of a request
311
312+Some requests are expected to log actions with very large details, such as
313+large bulk SQL INSERT statements. The request can be started with a
314+callable that filters action details to avoid using an inordinate amount of
315+memory on logging.
316+
317+ >>> def detail_filter(category, detail):
318+ ... if category != 'SQL-nostore':
319+ ... detail = '<redacted>'
320+ ... return detail
321+ >>> set_request_started(detail_filter=detail_filter)
322+ >>> store.execute('SELECT 1', noresult=True)
323+ >>> transaction.abort()
324+ >>> for _, _, _, statement, _ in get_request_statements():
325+ ... print statement
326+ <redacted>
327+ Transaction completed, status: Active
328+ >>> clear_request_started()
329+
330
331 Statement Timeout
332 =================