Merge lp:~michael.nelson/launchpad/588288-log-parser-dont-read-entire-file into lp:launchpad

Proposed by Michael Nelson
Status: Merged
Approved by: Graham Binns
Approved revision: no longer in the source branch.
Merged at revision: 10942
Proposed branch: lp:~michael.nelson/launchpad/588288-log-parser-dont-read-entire-file
Merge into: lp:launchpad
Diff against target: 167 lines (+67/-13)
4 files modified
configs/testrunner/launchpad-lazr.conf (+1/-0)
lib/canonical/config/schema-lazr.conf (+4/-0)
lib/lp/services/apachelogparser/base.py (+30/-11)
lib/lp/services/apachelogparser/tests/test_apachelogparser.py (+32/-2)
To merge this branch: bzr merge lp:~michael.nelson/launchpad/588288-log-parser-dont-read-entire-file
Reviewer Review Type Date Requested Status
Graham Binns (community) code Approve
Review via email: mp+26498@code.launchpad.net

Commit message

log parser please don't read entire file into memory.

Description of the change

This branch fixes bug 588288, which was identified after running the ppa log parser for the first time (bug 139855).

It does two things:
1) Refactors the parse_file() function so that it does not read the entire remaining file contents to be parsed into memory (lines = fd.readlines()).

2) Allows an optional config logparser_max_parsed_lines to determine the maximum number of lines that should be parsed in one run.

No lint. To test:
bin/test -vvm test_apachelogparser

To post a comment you must log in.
Graham Binns (gmb) :
review: Approve (code)
Graham Binns (gmb) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'configs/testrunner/launchpad-lazr.conf'
2--- configs/testrunner/launchpad-lazr.conf 2010-05-06 20:08:41 +0000
3+++ configs/testrunner/launchpad-lazr.conf 2010-06-04 07:40:50 +0000
4@@ -131,6 +131,7 @@
5 [launchpad]
6 max_attachment_size: 1024
7 geoip_database: /usr/share/GeoIP/GeoLiteCity.dat
8+logparser_max_parsed_lines: 100000
9
10 [launchpad_session]
11 cookie: launchpad_tests
12
13=== modified file 'lib/canonical/config/schema-lazr.conf'
14--- lib/canonical/config/schema-lazr.conf 2010-05-28 23:06:25 +0000
15+++ lib/canonical/config/schema-lazr.conf 2010-06-04 07:40:50 +0000
16@@ -1131,6 +1131,10 @@
17 # ba-ws.geonames.net.
18 geonames_identity:
19
20+# The maximum number of lines that should be parsed by the launchpad
21+# log parser.
22+logparser_max_parsed_lines: 100000
23+
24
25 [launchpad_session]
26 # The hostname where the session database is located.
27
28=== modified file 'lib/lp/services/apachelogparser/base.py'
29--- lib/lp/services/apachelogparser/base.py 2009-12-22 16:10:13 +0000
30+++ lib/lp/services/apachelogparser/base.py 2010-06-04 07:40:50 +0000
31@@ -11,10 +11,11 @@
32
33 from contrib import apachelog
34
35-from lp.services.apachelogparser.model.parsedapachelog import ParsedApacheLog
36+from canonical.config import config
37 from canonical.launchpad.interfaces.geoip import IGeoIP
38 from canonical.launchpad.webapp.interfaces import (
39 IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)
40+from lp.services.apachelogparser.model.parsedapachelog import ParsedApacheLog
41
42
43 parser = apachelog.parser(apachelog.formats['extended'])
44@@ -84,20 +85,36 @@
45 """
46 # Seek file to given position, read all lines.
47 fd.seek(start_position)
48- lines = fd.readlines()
49- # Always skip the last line as it may be truncated since we're rsyncing
50- # live logs.
51- last_line = lines.pop(-1)
52+ line = fd.readline()
53+
54 parsed_bytes = start_position
55- if len(lines) == 0:
56- # This probably means we're dealing with a logfile that has been
57- # rotated already, so it should be safe to parse its last line.
58- lines = [last_line]
59
60 geoip = getUtility(IGeoIP)
61 downloads = {}
62- for line in lines:
63- try:
64+ parsed_lines = 0
65+
66+ # Check for an optional max_parsed_lines config option.
67+ max_parsed_lines = getattr(
68+ config.launchpad, 'logparser_max_parsed_lines', None)
69+
70+ while line:
71+ if max_parsed_lines is not None and parsed_lines >= max_parsed_lines:
72+ break
73+
74+ # Always skip the last line as it may be truncated since we're
75+ # rsyncing live logs, unless there is only one line for us to
76+ # parse, in which case This probably means we're dealing with a
77+ # logfile that has been rotated already, so it should be safe to
78+ # parse its last line.
79+ next_line = ''
80+ try:
81+ next_line = fd.next()
82+ except StopIteration:
83+ if parsed_lines > 0:
84+ break
85+
86+ try:
87+ parsed_lines += 1
88 parsed_bytes += len(line)
89 host, date, status, request = get_host_date_status_and_request(
90 line)
91@@ -143,6 +160,8 @@
92 parsed_bytes -= len(line)
93 logger.error('Error (%s) while parsing "%s"' % (e, line))
94 break
95+
96+ line = next_line
97 return downloads, parsed_bytes
98
99
100
101=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
102--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-01-07 06:47:46 +0000
103+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-06-04 07:40:50 +0000
104@@ -6,10 +6,12 @@
105 import os
106 from StringIO import StringIO
107 import tempfile
108+import textwrap
109 import unittest
110
111 from zope.component import getUtility
112
113+from canonical.config import config
114 from canonical.launchpad.scripts.logger import BufferLogger
115 from canonical.launchpad.webapp.interfaces import (
116 IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)
117@@ -215,12 +217,40 @@
118 self.assertEqual(self.logger.buffer.getvalue(), '')
119
120 date = datetime(2008, 6, 13)
121- self.assertEqual(downloads,
122+ self.assertEqual(downloads,
123 {'/15018215/ul_logo_64x64.png':
124 {datetime(2008, 6, 13): {'US': 1}}})
125
126 self.assertEqual(parsed_bytes, fd.tell())
127
128+ def test_max_parsed_lines(self):
129+ # The max_parsed_lines config option limits the number of parsed
130+ # lines.
131+ config.push(
132+ 'log_parser config',
133+ textwrap.dedent('''\
134+ [launchpad]
135+ logparser_max_parsed_lines: 2
136+ '''))
137+ fd = open(os.path.join(
138+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
139+ downloads, parsed_bytes = parse_file(
140+ fd, start_position=0, logger=self.logger,
141+ get_download_key=get_path_download_key)
142+ config.pop("log_parser config")
143+
144+ self.assertEqual(self.logger.buffer.getvalue(), '')
145+ date = datetime(2008, 6, 13)
146+ self.assertContentEqual(
147+ downloads.items(),
148+ [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
149+ ('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
150+
151+ # We should have parsed only the first two lines of data.
152+ fd.seek(0)
153+ lines = fd.readlines()
154+ self.assertEqual(parsed_bytes, len(lines[0]) + len(lines[1]))
155+
156
157 class TestParsedFilesDetection(TestCase):
158 """Test the detection of already parsed logs."""
159@@ -263,7 +293,7 @@
160
161 def test_different_files_with_same_name(self):
162 # Thanks to log rotation, two runs of our script may see files with
163- # the same name but completely different content. If we see a file
164+ # the same name but completely different content. If we see a file
165 # with a name matching that of an already parsed file but with content
166 # differing from the last file with that name parsed, we know we need
167 # to parse the file from the start.