Merge lp:~benji/launchpad/bug-622765 into lp:launchpad

Proposed by Benji York
Status: Merged
Approved by: Brad Crittenden
Approved revision: no longer in the source branch.
Merged at revision: 11468
Proposed branch: lp:~benji/launchpad/bug-622765
Merge into: lp:launchpad
Diff against target: 125 lines (+26/-16)
3 files modified
lib/lp/services/apachelogparser/base.py (+2/-4)
lib/lp/services/apachelogparser/script.py (+1/-1)
lib/lp/services/apachelogparser/tests/test_apachelogparser.py (+23/-11)
To merge this branch: bzr merge lp:~benji/launchpad/bug-622765
Reviewer Review Type Date Requested Status
Brad Crittenden (community) code Approve
Review via email: mp+33951@code.launchpad.net

Description of the change

When parsing Apache log files, the cron scripts want to pick up where
they left off. The lp.services.apachelogparser.base.get_files_to_parse
function used to return a mapping of open files to the position at which
parsing should resume. The problem was that all the files (which could
number in the thousands) were opened at once and returned en masse.

Since there was only one caller of get_files_to_parse and it only
iterated over the .items() of the result, it was easy to refactor the
code to instead return/consume a generator. The result is that only one
file is open at any given time (the client consumes each file and closes
it before proceeding to the next).

Gary and I had a pre-implementation call.

The get_files_to_parse function was already well tested and only trivial
transformations were required to make the tests compatible with the new
API.

The tests can be run with the command:

    bin/test -c test_apachelogparser

I verified that the scripts still run by faking up enough of a
production environment that they would at least run to completion
without generating errors.

No lint was reported by "make lint".

To post a comment you must log in.
Revision history for this message
Brad Crittenden (bac) wrote :

Nice branch Benji. Neat to be able to reduce resource usage so easily.

review: Approve (code)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/services/apachelogparser/base.py'
2--- lib/lp/services/apachelogparser/base.py 2010-08-20 20:31:18 +0000
3+++ lib/lp/services/apachelogparser/base.py 2010-08-27 19:04:45 +0000
4@@ -24,7 +24,7 @@
5
6
7 def get_files_to_parse(root, file_names):
8- """Return a dict mapping files to the position where reading should start.
9+ """Return an iterator of file and position where reading should start.
10
11 The lines read from that position onwards will be the ones that have not
12 been parsed yet.
13@@ -32,7 +32,6 @@
14 :param root: The directory where the files are stored.
15 :param file_names: The names of the files.
16 """
17- files_to_parse = {}
18 store = getUtility(IStoreSelector).get(MAIN_STORE, DEFAULT_FLAVOR)
19 for file_name in file_names:
20 file_path = os.path.join(root, file_name)
21@@ -52,8 +51,7 @@
22 # parse what's new.
23 position = parsed_file.bytes_read
24
25- files_to_parse[fd] = position
26- return files_to_parse
27+ yield fd, position
28
29
30 def get_fd_and_file_size(file_path):
31
32=== modified file 'lib/lp/services/apachelogparser/script.py'
33--- lib/lp/services/apachelogparser/script.py 2010-08-20 20:31:18 +0000
34+++ lib/lp/services/apachelogparser/script.py 2010-08-27 19:04:45 +0000
35@@ -65,7 +65,7 @@
36
37 self.setUpUtilities()
38 country_set = getUtility(ICountrySet)
39- for fd, position in files_to_parse.items():
40+ for fd, position in files_to_parse:
41 downloads, parsed_bytes = parse_file(
42 fd, position, self.logger, self.getDownloadKey)
43 # Use a while loop here because we want to pop items from the dict
44
45=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
46--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-08-20 20:31:18 +0000
47+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2010-08-27 19:04:45 +0000
48@@ -7,6 +7,7 @@
49 from StringIO import StringIO
50 import tempfile
51 import textwrap
52+from operator import itemgetter
53 import unittest
54
55 from zope.component import getUtility
56@@ -301,7 +302,8 @@
57 # start.
58 file_name = 'launchpadlibrarian.net.access-log'
59 files_to_parse = get_files_to_parse(self.root, [file_name])
60- self.failUnlessEqual(files_to_parse.values(), [0])
61+ fd, position = list(files_to_parse)[0]
62+ self.assertEqual(position, 0)
63
64 def test_completely_parsed_file(self):
65 # A file that has been completely parsed will be skipped.
66@@ -311,7 +313,8 @@
67 fd.seek(0)
68 ParsedApacheLog(first_line, len(fd.read()))
69
70- self.failUnlessEqual(get_files_to_parse(self.root, [file_name]), {})
71+ files_to_parse = get_files_to_parse(self.root, [file_name])
72+ self.failUnlessEqual(list(files_to_parse), [])
73
74 def test_parsed_file_with_new_content(self):
75 # A file that has been parsed already but in which new content was
76@@ -321,8 +324,12 @@
77 first_line = open(os.path.join(self.root, file_name)).readline()
78 ParsedApacheLog(first_line, len(first_line))
79
80- files_to_parse = get_files_to_parse(self.root, [file_name])
81- self.failUnlessEqual(files_to_parse.values(), [len(first_line)])
82+ files_to_parse = list(get_files_to_parse(self.root, [file_name]))
83+ self.assertEqual(len(files_to_parse), 1)
84+ fd, position = files_to_parse[0]
85+ # Since we parsed the first line above, we'll be told to start where
86+ # the first line ends.
87+ self.assertEqual(position, len(first_line))
88
89 def test_different_files_with_same_name(self):
90 # Thanks to log rotation, two runs of our script may see files with
91@@ -341,22 +348,27 @@
92 fd.write(content2)
93 fd.close()
94 files_to_parse = get_files_to_parse(self.root, [file_name])
95- self.failUnlessEqual(files_to_parse.values(), [0])
96+ positions = map(itemgetter(1), files_to_parse)
97+ self.failUnlessEqual(positions, [0])
98
99- def test_gzipped_file(self):
100+ def test_fresh_gzipped_file(self):
101 # get_files_to_parse() handles gzipped files just like uncompressed
102- # ones.
103- # The first time we see one, we'll parse from the beginning.
104+ # ones. The first time we see one, we'll parse from the beginning.
105 file_name = 'launchpadlibrarian.net.access-log.1.gz'
106 first_line = gzip.open(os.path.join(self.root, file_name)).readline()
107 files_to_parse = get_files_to_parse(self.root, [file_name])
108- self.failUnlessEqual(files_to_parse.values(), [0])
109+ positions = map(itemgetter(1), files_to_parse)
110+ self.assertEqual(positions, [0])
111
112- # And in subsequent runs of the script we will resume from where we
113+ def test_resumed_gzipped_file(self):
114+ # In subsequent runs of the script we will resume from where we
115 # stopped last time. (Here we pretend we parsed only the first line)
116+ file_name = 'launchpadlibrarian.net.access-log.1.gz'
117+ first_line = gzip.open(os.path.join(self.root, file_name)).readline()
118 ParsedApacheLog(first_line, len(first_line))
119 files_to_parse = get_files_to_parse(self.root, [file_name])
120- self.failUnlessEqual(files_to_parse.values(), [len(first_line)])
121+ positions = map(itemgetter(1), files_to_parse)
122+ self.failUnlessEqual(positions, [len(first_line)])
123
124
125 class Test_create_or_update_parsedlog_entry(TestCase):