Merge lp:~michael.nelson/launchpad/588288-log-parser-dont-read-entire-file into lp:launchpad

Proposed by Michael Nelson
Status: Merged
Approved by: Graham Binns
Approved revision: no longer in the source branch.
Merged at revision: 10942
Proposed branch: lp:~michael.nelson/launchpad/588288-log-parser-dont-read-entire-file
Merge into: lp:launchpad
Diff against target: 167 lines (+67/-13)
4 files modified
configs/testrunner/launchpad-lazr.conf (+1/-0)
lib/canonical/config/schema-lazr.conf (+4/-0)
lib/lp/services/apachelogparser/base.py (+30/-11)
lib/lp/services/apachelogparser/tests/test_apachelogparser.py (+32/-2)
To merge this branch: bzr merge lp:~michael.nelson/launchpad/588288-log-parser-dont-read-entire-file
Reviewer Review Type Date Requested Status
Graham Binns (community) code Approve
Review via email: mp+26498@code.launchpad.net

Commit message

log parser please don't read entire file into memory.

Description of the change

This branch fixes bug 588288, which was identified after running the ppa log parser for the first time (bug 139855).

It does two things:
1) Refactors the parse_file() function so that it does not read the entire remaining file contents to be parsed into memory (lines = fd.readlines()).

2) Allows an optional config logparser_max_parsed_lines to determine the maximum number of lines that should be parsed in one run.

No lint. To test:
bin/test -vvm test_apachelogparser

To post a comment you must log in.
Graham Binns (gmb) :
review: Approve (code)
Graham Binns (gmb) :
review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
=== modified file 'configs/testrunner/launchpad-lazr.conf'
--- configs/testrunner/launchpad-lazr.conf 2010-05-06 20:08:41 +0000
+++ configs/testrunner/launchpad-lazr.conf 2010-06-04 07:40:50 +0000
@@ -131,6 +131,7 @@
131[launchpad]131[launchpad]
132max_attachment_size: 1024132max_attachment_size: 1024
133geoip_database: /usr/share/GeoIP/GeoLiteCity.dat133geoip_database: /usr/share/GeoIP/GeoLiteCity.dat
134logparser_max_parsed_lines: 100000
134135
135[launchpad_session]136[launchpad_session]
136cookie: launchpad_tests137cookie: launchpad_tests
137138
=== modified file 'lib/canonical/config/schema-lazr.conf'
--- lib/canonical/config/schema-lazr.conf 2010-05-28 23:06:25 +0000
+++ lib/canonical/config/schema-lazr.conf 2010-06-04 07:40:50 +0000
@@ -1131,6 +1131,10 @@
1131# ba-ws.geonames.net.1131# ba-ws.geonames.net.
1132geonames_identity:1132geonames_identity:
11331133
1134# The maximum number of lines that should be parsed by the launchpad
1135# log parser.
1136logparser_max_parsed_lines: 100000
1137
11341138
1135[launchpad_session]1139[launchpad_session]
1136# The hostname where the session database is located.1140# The hostname where the session database is located.
11371141
=== modified file 'lib/lp/services/apachelogparser/base.py'
--- lib/lp/services/apachelogparser/base.py 2009-12-22 16:10:13 +0000
+++ lib/lp/services/apachelogparser/base.py 2010-06-04 07:40:50 +0000
@@ -11,10 +11,11 @@
1111
12from contrib import apachelog12from contrib import apachelog
1313
14from lp.services.apachelogparser.model.parsedapachelog import ParsedApacheLog14from canonical.config import config
15from canonical.launchpad.interfaces.geoip import IGeoIP15from canonical.launchpad.interfaces.geoip import IGeoIP
16from canonical.launchpad.webapp.interfaces import (16from canonical.launchpad.webapp.interfaces import (
17 IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)17 IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)
18from lp.services.apachelogparser.model.parsedapachelog import ParsedApacheLog
1819
1920
20parser = apachelog.parser(apachelog.formats['extended'])21parser = apachelog.parser(apachelog.formats['extended'])
@@ -84,20 +85,36 @@
84 """85 """
85 # Seek file to given position, read all lines.86 # Seek file to given position, read all lines.
86 fd.seek(start_position)87 fd.seek(start_position)
87 lines = fd.readlines()88 line = fd.readline()
88 # Always skip the last line as it may be truncated since we're rsyncing89
89 # live logs.
90 last_line = lines.pop(-1)
91 parsed_bytes = start_position90 parsed_bytes = start_position
92 if len(lines) == 0:
93 # This probably means we're dealing with a logfile that has been
94 # rotated already, so it should be safe to parse its last line.
95 lines = [last_line]
9691
97 geoip = getUtility(IGeoIP)92 geoip = getUtility(IGeoIP)
98 downloads = {}93 downloads = {}
99 for line in lines:94 parsed_lines = 0
100 try:95
96 # Check for an optional max_parsed_lines config option.
97 max_parsed_lines = getattr(
98 config.launchpad, 'logparser_max_parsed_lines', None)
99
100 while line:
101 if max_parsed_lines is not None and parsed_lines >= max_parsed_lines:
102 break
103
104 # Always skip the last line as it may be truncated since we're
105 # rsyncing live logs, unless there is only one line for us to
106 # parse, in which case This probably means we're dealing with a
107 # logfile that has been rotated already, so it should be safe to
108 # parse its last line.
109 next_line = ''
110 try:
111 next_line = fd.next()
112 except StopIteration:
113 if parsed_lines > 0:
114 break
115
116 try:
117 parsed_lines += 1
101 parsed_bytes += len(line)118 parsed_bytes += len(line)
102 host, date, status, request = get_host_date_status_and_request(119 host, date, status, request = get_host_date_status_and_request(
103 line)120 line)
@@ -143,6 +160,8 @@
143 parsed_bytes -= len(line)160 parsed_bytes -= len(line)
144 logger.error('Error (%s) while parsing "%s"' % (e, line))161 logger.error('Error (%s) while parsing "%s"' % (e, line))
145 break162 break
163
164 line = next_line
146 return downloads, parsed_bytes165 return downloads, parsed_bytes
147166
148167
149168
=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-01-07 06:47:46 +0000
+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-06-04 07:40:50 +0000
@@ -6,10 +6,12 @@
6import os6import os
7from StringIO import StringIO7from StringIO import StringIO
8import tempfile8import tempfile
9import textwrap
9import unittest10import unittest
1011
11from zope.component import getUtility12from zope.component import getUtility
1213
14from canonical.config import config
13from canonical.launchpad.scripts.logger import BufferLogger15from canonical.launchpad.scripts.logger import BufferLogger
14from canonical.launchpad.webapp.interfaces import (16from canonical.launchpad.webapp.interfaces import (
15 IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)17 IStoreSelector, MAIN_STORE, DEFAULT_FLAVOR)
@@ -215,12 +217,40 @@
215 self.assertEqual(self.logger.buffer.getvalue(), '')217 self.assertEqual(self.logger.buffer.getvalue(), '')
216218
217 date = datetime(2008, 6, 13)219 date = datetime(2008, 6, 13)
218 self.assertEqual(downloads, 220 self.assertEqual(downloads,
219 {'/15018215/ul_logo_64x64.png':221 {'/15018215/ul_logo_64x64.png':
220 {datetime(2008, 6, 13): {'US': 1}}})222 {datetime(2008, 6, 13): {'US': 1}}})
221223
222 self.assertEqual(parsed_bytes, fd.tell())224 self.assertEqual(parsed_bytes, fd.tell())
223225
226 def test_max_parsed_lines(self):
227 # The max_parsed_lines config option limits the number of parsed
228 # lines.
229 config.push(
230 'log_parser config',
231 textwrap.dedent('''\
232 [launchpad]
233 logparser_max_parsed_lines: 2
234 '''))
235 fd = open(os.path.join(
236 here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
237 downloads, parsed_bytes = parse_file(
238 fd, start_position=0, logger=self.logger,
239 get_download_key=get_path_download_key)
240 config.pop("log_parser config")
241
242 self.assertEqual(self.logger.buffer.getvalue(), '')
243 date = datetime(2008, 6, 13)
244 self.assertContentEqual(
245 downloads.items(),
246 [('/12060796/me-tv-icon-64x64.png', {date: {'AU': 1}}),
247 ('/9096290/me-tv-icon-14x14.png', {date: {'AU': 1}})])
248
249 # We should have parsed only the first two lines of data.
250 fd.seek(0)
251 lines = fd.readlines()
252 self.assertEqual(parsed_bytes, len(lines[0]) + len(lines[1]))
253
224254
225class TestParsedFilesDetection(TestCase):255class TestParsedFilesDetection(TestCase):
226 """Test the detection of already parsed logs."""256 """Test the detection of already parsed logs."""
@@ -263,7 +293,7 @@
263293
264 def test_different_files_with_same_name(self):294 def test_different_files_with_same_name(self):
265 # Thanks to log rotation, two runs of our script may see files with295 # Thanks to log rotation, two runs of our script may see files with
266 # the same name but completely different content. If we see a file 296 # the same name but completely different content. If we see a file
267 # with a name matching that of an already parsed file but with content297 # with a name matching that of an already parsed file but with content
268 # differing from the last file with that name parsed, we know we need298 # differing from the last file with that name parsed, we know we need
269 # to parse the file from the start.299 # to parse the file from the start.