Merge lp:~lifeless/launchpad/foundations into lp:launchpad

Proposed by Robert Collins on 2010-08-11
Status: Merged
Approved by: Stuart Bishop on 2010-08-19
Approved revision: no longer in the source branch.
Merged at revision: 11396
Proposed branch: lp:~lifeless/launchpad/foundations
Merge into: lp:launchpad
Diff against target: 440 lines (+158/-73)
2 files modified
lib/lp/scripts/utilities/pageperformancereport.py (+154/-68)
utilities/page-performance-report-daily.sh (+4/-5)
To merge this branch: bzr merge lp:~lifeless/launchpad/foundations
Reviewer        Review Type    Date Requested    Status
Stuart Bishop                  2010-08-11        Approve on 2010-08-12
Review via email: mp+32299@code.launchpad.net

Commit Message

Page performance report improvements - top N urls and 99th percentile

Description of the Change

Generate a page performance report page that will (hopefully) be a useful hit-list for developers to focus on.
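
For context: the 99th-percentile column mentioned in the commit message is approximated in the diff below as mean + 3 standard deviations rather than computed by counting (the patch's own comment flags this as an approximation that only holds for roughly normal distributions). A minimal sketch of both approaches, assuming numpy as the report script does (numpy.percentile needs a newer numpy than this branch likely targeted):

    import numpy

    request_times = numpy.asarray([0.2, 0.4, 0.5, 1.1, 9.7], numpy.float32)

    # Approximation used in the branch: for a normal distribution,
    # mean + 3 standard deviations covers roughly 99.9% of samples.
    approx_99th = numpy.mean(request_times) + 3 * numpy.std(request_times)

    # Exact value, found by counting, as the in-code comment suggests.
    exact_99th = numpy.percentile(request_times, 99)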

Stuart Bishop (stub) wrote:

I've fixed some bugs in this - it doesn't look like it had been tested.

I've merged it with changes that I had neglected to push in the days before the Epic but that have been running live. That work is in lp:~stub/launchpad/page-performance-report. It seems to be working just fine locally with a single log downloaded from devpad. I've pushed this to the tree on devpad so we can see the result from the scheduled run.

Robert Collins (lifeless) wrote:

Thanks Stuart; the width of 10 for the timeout-candidates graphs is a
little odd. I think they should use the same upper bound as the other
graphs, or an even higher one: the maximum time in the dataset.

-Rob

Robert Collins (lifeless) wrote:

Oh, and I was a muppet: I set the timeout *default* to 10, when I should
have set it to 12, the staging timeout, so that the timeout - 2 threshold
would cut off appropriately at 10. Mea culpa.
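
For concreteness, the arithmetic Robert describes, restated as a hypothetical snippet (the names are illustrative, not from the branch):

    timeout = 12             # new default: staging's configured timeout
    threshold = timeout - 2  # == 10, the timeout-candidates report cutoff
    # With the old default of 10, the cutoff would have been 8.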

Robert Collins (lifeless) wrote:

Stuart, I've updated this with a patch that raises the default timeout to
12 (staging's current timeout) and separates the graph capping (set at 50%
above the timeout: we regularly go that high or higher because of the way
the timeouts work) from the standard-deviation calculations. Our numeric
stats were being distorted by the capping applied to make the histograms
look nice.

Thanks,
Rob
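
In outline, the separation the updated patch makes: the numeric statistics are computed from the raw request times, and only the histogram input is capped at 1.5x the timeout. A sketch of that logic with simplified names (the real code lives in Times.stats() in the diff below):

    import numpy

    timeout = 12
    histogram_width = int(1.5 * timeout)   # graph capping: 50% above timeout
    request_times = [0.3, 1.2, 5.0, 30.0]  # illustrative sample

    # Numeric stats use the raw, uncapped times, so the mean and standard
    # deviation are no longer distorted by histogram cosmetics.
    array = numpy.asarray(request_times, numpy.float32)
    mean, std = numpy.mean(array), numpy.std(array)

    # Only the histogram input is capped, to keep the graphs readable.
    capped = numpy.fromiter(
        (min(t, histogram_width) for t in request_times),
        numpy.float32, len(request_times))
    counts, edges = numpy.histogram(
        capped, range=(0, histogram_width), bins=histogram_width,
        normed=True)  # newer numpy spells this density=True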

Stuart Bishop (stub) wrote:

Looks fine and is running happily on devpad.

review: Approve

Preview Diff

=== modified file 'lib/lp/scripts/utilities/pageperformancereport.py'
--- lib/lp/scripts/utilities/pageperformancereport.py 2010-07-08 11:44:02 +0000
+++ lib/lp/scripts/utilities/pageperformancereport.py 2010-08-12 05:59:44 +0000
@@ -6,15 +6,13 @@
 __metaclass__ = type
 __all__ = ['main']

-import bz2
 from cgi import escape as html_quote
 from ConfigParser import RawConfigParser
 from datetime import datetime
-import gzip
 import re
 import sre_constants
-from tempfile import TemporaryFile
 import os.path
+import subprocess
 from textwrap import dedent
 import time

@@ -84,6 +82,7 @@
     median = 0 # Median time per hit.
     std = 0 # Standard deviation per hit.
     var = 0 # Variance per hit.
+    ninetyninth_percentile_time = 0
     histogram = None # # Request times histogram.

     total_sqltime = 0 # Total time spent waiting for SQL to process.
@@ -98,27 +97,31 @@
     std_sqlstatements = 0
     var_sqlstatements = 0

+empty_stats = Stats() # Singleton.
+

 class Times:
     """Collection of request times."""
     def __init__(self, timeout):
-        self.spool = TemporaryFile()
+        self.total_hits = 0
+        self.total_time = 0
         self.request_times = []
         self.sql_statements = []
         self.sql_times = []
         self.ticks = []
-        self.timeout = timeout
+        self.histogram_width = int(1.5*timeout)

     def add(self, request):
-        """Add the application time from the request to the collection.
-
-        The application time is capped to our timeout.
-        """
-        print >> self.spool, "%s,%s,%s,%s" % (
-            min(request.app_seconds, self.timeout),
-            request.sql_statements or '',
-            request.sql_seconds or '',
-            request.ticks or '')
+        """Add the application time from the request to the collection."""
+        self.total_hits += 1
+        self.total_time += request.app_seconds
+        self.request_times.append(request.app_seconds)
+        if request.sql_statements is not None:
+            self.sql_statements.append(request.sql_statements)
+        if request.sql_seconds is not None:
+            self.sql_times.append(request.sql_seconds)
+        if request.ticks is not None:
+            self.ticks.append(request.ticks)

     _stats = None

@@ -133,58 +136,56 @@
         1 and 2 seconds etc. histogram is None if there are no requests in
         this Category.
         """
+        if not self.total_hits:
+            return empty_stats
+
         if self._stats is not None:
             return self._stats

-        def iter_spool(index, cast):
-            """Generator returning one column from our spool file.
-
-            Skips None values.
-            """
-            self.spool.flush()
-            self.spool.seek(0)
-            for line in self.spool:
-                value = line.split(',')[index]
-                if value != '':
-                    yield cast(value)
-
         stats = Stats()

+        stats.total_hits = self.total_hits
+
         # Time stats
-        array = numpy.fromiter(iter_spool(0, numpy.float32), numpy.float32)
+        array = numpy.asarray(self.request_times, numpy.float32)
         stats.total_time = numpy.sum(array)
-        stats.total_hits = len(array)
         stats.mean = numpy.mean(array)
         stats.median = numpy.median(array)
         stats.std = numpy.std(array)
         stats.var = numpy.var(array)
+        # This is an approximation which may not be true: we don't know if we
+        # have a std distribution or not. We could just find the 99th
+        # percentile by counting. Shock. Horror; however this appears pretty
+        # good based on eyeballing things so far - once we're down in the 2-3
+        # second range for everything we may want to revisit.
+        stats.ninetyninth_percentile_time = stats.mean + stats.std*3
+        capped_times = (min(a_time, self.histogram_width) for a_time in
+            self.request_times)
+        array = numpy.fromiter(capped_times, numpy.float32,
+            len(self.request_times))
         histogram = numpy.histogram(
             array, normed=True,
-            range=(0, self.timeout), bins=self.timeout)
+            range=(0, self.histogram_width), bins=self.histogram_width)
         stats.histogram = zip(histogram[1], histogram[0])

-        # SQL query count.
-        array = numpy.fromiter(iter_spool(1, numpy.int), numpy.int)
-        stats.total_sqlstatements = numpy.sum(array)
-        stats.mean_sqlstatements = numpy.mean(array)
-        stats.median_sqlstatements = numpy.median(array)
-        stats.std_sqlstatements = numpy.std(array)
-        stats.var_sqlstatements = numpy.var(array)
-
         # SQL time stats.
-        array = numpy.fromiter(iter_spool(2, numpy.float32), numpy.float32)
+        array = numpy.asarray(self.sql_times, numpy.float32)
         stats.total_sqltime = numpy.sum(array)
         stats.mean_sqltime = numpy.mean(array)
         stats.median_sqltime = numpy.median(array)
         stats.std_sqltime = numpy.std(array)
         stats.var_sqltime = numpy.var(array)

+        # SQL query count.
+        array = numpy.asarray(self.sql_statements, numpy.int)
+        stats.total_sqlstatements = int(numpy.sum(array))
+        stats.mean_sqlstatements = numpy.mean(array)
+        stats.median_sqlstatements = numpy.median(array)
+        stats.std_sqlstatements = numpy.std(array)
+        stats.var_sqlstatements = numpy.var(array)
+
         # Cache for next invocation.
         self._stats = stats
-
-        # Clean up the spool file
-        self.spool = None
-
         return stats

     def __str__(self):
@@ -194,6 +195,9 @@
         return "%2.2f %2.2f %2.2f %s" % (
             total, mean, median, std, hstr)

+    def __cmp__(self, b):
+        return cmp(self.total_time, b.total_time)
+

 def main():
     parser = LPOptionParser("%prog [args] tracelog [...]")
@@ -204,10 +208,6 @@
             config.root, "utilities", "page-performance-report.ini"),
         metavar="FILE", help="Load configuration from FILE")
     parser.add_option(
-        "--timeout", dest="timeout", type="int",
-        default=20, metavar="SECS",
-        help="Requests taking more than SECS seconds are timeouts")
-    parser.add_option(
         "--from", dest="from_ts", type="datetime",
         default=None, metavar="TIMESTAMP",
         help="Ignore log entries before TIMESTAMP")
@@ -224,9 +224,17 @@
         action="store_false", default=True,
         help="Do not produce pageids report")
     parser.add_option(
+        "--top-urls", dest="top_urls", type=int, metavar="N",
+        default=50, help="Generate report for top N urls by hitcount.")
+    parser.add_option(
         "--directory", dest="directory",
         default=os.getcwd(), metavar="DIR",
         help="Output reports in DIR directory")
+    parser.add_option(
+        "--timeout", dest="timeout",
+        # Default to 12: the staging timeout.
+        default=12, type="int",
+        help="The configured timeout value : determines high risk page ids.")

     options, args = parser.parse_args()

@@ -268,25 +276,51 @@
         parser.error("No data in [categories] section of configuration.")

     pageid_times = {}
-
-    parse(args, categories, pageid_times, options)
+    url_times = {}
+
+    parse(args, categories, pageid_times, url_times, options)
+
+    # Truncate the URL times to the top N.
+    if options.top_urls:
+        sorted_urls = sorted(
+            ((times, url) for url, times in url_times.items()
+                if times.total_hits > 0), reverse=True)
+        url_times = [(url, times)
+            for times, url in sorted_urls[:options.top_urls]]
+
+    def _report_filename(filename):
+        return os.path.join(options.directory, filename)

     # Category only report.
     if options.categories:
-        report_filename = os.path.join(options.directory,'categories.html')
+        report_filename = _report_filename('categories.html')
         log.info("Generating %s", report_filename)
-        html_report(open(report_filename, 'w'), categories, None)
+        html_report(open(report_filename, 'w'), categories, None, None)

     # Pageid only report.
     if options.pageids:
-        report_filename = os.path.join(options.directory,'pageids.html')
-        log.info("Generating %s", report_filename)
-        html_report(open(report_filename, 'w'), None, pageid_times)
+        report_filename = _report_filename('pageids.html')
+        log.info("Generating %s", report_filename)
+        html_report(open(report_filename, 'w'), None, pageid_times, None)
+
+    # Top URL only report.
+    if options.top_urls:
+        report_filename = _report_filename('top%d.html' % options.top_urls)
+        log.info("Generating %s", report_filename)
+        html_report(open(report_filename, 'w'), None, None, url_times)

     # Combined report.
     if options.categories and options.pageids:
-        report_filename = os.path.join(options.directory,'combined.html')
-        html_report(open(report_filename, 'w'), categories, pageid_times)
+        report_filename = _report_filename('combined.html')
+        html_report(
+            open(report_filename, 'w'), categories, pageid_times, url_times)
+
+    # Report of likely timeout candidates
+    report_filename = _report_filename('timeout-candidates.html')
+    log.info("Generating %s", report_filename)
+    html_report(
+        open(report_filename, 'w'), None, pageid_times, None,
+        options.timeout - 2)

     return 0

@@ -298,9 +332,17 @@
     """
     ext = os.path.splitext(filename)[1]
     if ext == '.bz2':
-        return bz2.BZ2File(filename, mode)
+        p = subprocess.Popen(
+            ['bunzip2', '-c', filename],
+            stdout=subprocess.PIPE, stdin=subprocess.PIPE)
+        p.stdin.close()
+        return p.stdout
     elif ext == '.gz':
-        return gzip.open(filename, mode)
+        p = subprocess.Popen(
+            ['gunzip', '-c', filename],
+            stdout=subprocess.PIPE, stdin=subprocess.PIPE)
+        p.stdin.close()
+        return p.stdout
     else:
         return open(filename, mode)

@@ -321,7 +363,7 @@
         *(int(elem) for elem in match.groups() if elem is not None))


-def parse(tracefiles, categories, pageid_times, options):
+def parse(tracefiles, categories, pageid_times, url_times, options):
     requests = {}
     total_requests = 0
     for tracefile in tracefiles:
@@ -402,12 +444,12 @@
                        log.debug("Parsed %d requests", total_requests)

                    # Add the request to any matching categories.
-                    if categories is not None:
+                    if options.categories:
                        for category in categories:
                            category.add(request)

                    # Add the request to the times for that pageid.
-                    if pageid_times is not None and request.pageid is not None:
+                    if options.pageids:
                        pageid = request.pageid
                        try:
                            times = pageid_times[pageid]
@@ -416,6 +458,21 @@
                            pageid_times[pageid] = times
                        times.add(request)

+                    # Add the request to the times for that URL.
+                    if options.top_urls:
+                        url = request.url
+                        # Hack to remove opstats from top N report. This
+                        # should go into a config file if we end up with
+                        # more pages that need to be ignored because
+                        # they are just noise.
+                        if not (url is None or url.endswith('+opstats')):
+                            try:
+                                times = url_times[url]
+                            except KeyError:
+                                times = Times(options.timeout)
+                                url_times[url] = times
+                            times.add(request)
+
                 else:
                     raise MalformedLine('Unknown record type %s', record_type)
             except MalformedLine, x:
@@ -442,7 +499,19 @@
         "Unknown extension prefix %s" % prefix)


-def html_report(outf, categories, pageid_times):
+def html_report(
+        outf, categories, pageid_times, url_times,
+        ninetyninth_percentile_threshold=None):
+    """Write an html report to outf.
+
+    :param outf: A file object to write the report to.
+    :param categories: Categories to report.
+    :param pageid_times: The time statistics for pageids.
+    :param url_times: The time statistics for the top XXX urls.
+    :param ninetyninth_percentile_threshold: Lower threshold for inclusion of
+        pages in the pageid section; pages where 99 percent of the requests are
+        served under this threshold will not be included.
+    """

     print >> outf, dedent('''\
         <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
@@ -502,6 +571,8 @@

         <th class="clickable">Total Time (secs)</th>

+        <th class="clickable">99% Under Time (secs)</th>
+
         <th class="clickable">Mean Time (secs)</th>
         <th class="clickable">Time Standard Deviation</th>
         <th class="clickable">Time Variance</th>
@@ -537,6 +608,7 @@
         <th class="category-title">%s</th>
         <td class="numeric total_hits">%d</td>
         <td class="numeric total_time">%.2f</td>
+        <td class="numeric 99pc_under">%.2f</td>
         <td class="numeric mean_time">%.2f</td>
         <td class="numeric std_time">%.2f</td>
         <td class="numeric var_time">%.2f</td>
@@ -559,6 +631,7 @@
         """ % (
             html_title,
             stats.total_hits, stats.total_time,
+            stats.ninetyninth_percentile_time,
             stats.mean, stats.std, stats.var, stats.median,
             len(histograms) - 1,
             stats.total_sqltime, stats.mean_sqltime,
@@ -568,13 +641,14 @@
             stats.median_sqlstatements))

     # Table of contents
-    if categories and pageid_times:
-        print >> outf, dedent('''\
-            <ol>
-            <li><a href="#catrep">Category Report</a></li>
-            <li><a href="#pageidrep">Pageid Report</a></li>
-            </ol>
-            ''')
+    print >> outf, '<ol>'
+    if categories:
+        print >> outf, '<li><a href="#catrep">Category Report</a></li>'
+    if pageid_times:
+        print >> outf, '<li><a href="#pageidrep">Pageid Report</a></li>'
+    if url_times:
+        print >> outf, '<li><a href="#topurlrep">Top URL Report</a></li>'
+    print >> outf, '</ol>'

     if categories:
         print >> outf, '<h2 id="catrep">Category Report</h2>'
@@ -589,9 +663,21 @@
         print >> outf, '<h2 id="pageidrep">Pageid Report</h2>'
         print >> outf, table_header
         for pageid, times in sorted(pageid_times.items()):
+            pageid = pageid or 'None'
+            if (ninetyninth_percentile_threshold is not None and
+                (times.stats().ninetyninth_percentile_time <
+                    ninetyninth_percentile_threshold)):
+                continue
             handle_times(html_quote(pageid), times)
         print >> outf, table_footer

+    if url_times:
+        print >> outf, '<h2 id="topurlrep">Top URL Report</h2>'
+        print >> outf, table_header
+        for url, times in url_times:
+            handle_times(html_quote(url), times)
+        print >> outf, table_footer
+
     # Output the javascript to render our histograms nicely, replacing
     # the placeholder <div> tags output earlier.
     print >> outf, dedent("""\

=== modified file 'utilities/page-performance-report-daily.sh'
--- utilities/page-performance-report-daily.sh 2010-07-08 09:11:54 +0000
+++ utilities/page-performance-report-daily.sh 2010-08-12 05:59:44 +0000
@@ -26,11 +26,14 @@
     echo Generating report from $from until $until into $dir `date`

     ./page-performance-report.py -v --from=$from --until=$until \
-        --directory=${dir} $logs
+        --top-urls=200 --directory=${dir} $logs

     ln -sf ${dir}/categories.html ${root}/latest-${type}-categories.html
     ln -sf ${dir}/pageids.html ${root}/latest-${type}-pageids.html
     ln -sf ${dir}/combined.html ${root}/latest-${type}-combined.html
+    ln -sf ${dir}/top200.html ${root}/latest-${type}-top200.html
+    ln -sf ${dir}/timeout-candidates.html \
+        ${root}/latest-${type}-timeout-candidates.html

     return 0
 }
@@ -57,8 +60,4 @@
     report 32 monthly `date -d 'last month' $fmt` $now
 fi

-# One off reports to populate history.
-## report 40 monthly `date -d '1 june 2010' $fmt` `date -d '1 july 2010' $fmt`
-## report 23 weekly `date -d '19 june 2010' $fmt` `date -d '26 june 2010' $fmt`
-## report 16 weekly `date -d '26 june 2010' $fmt` `date -d '3 july 2010' $fmt`
