Merge ~cjwatson/launchpad:py3-apachelogparser into launchpad:master
- Git
- lp:~cjwatson/launchpad
- py3-apachelogparser
- Merge into master
Proposed by
Colin Watson
Status: | Merged |
---|---|
Approved by: | Colin Watson |
Approved revision: | 2fb99455c5367db0cf7b84af76743ba163931697 |
Merge reported by: | Otto Co-Pilot |
Merged at revision: | not available |
Proposed branch: | ~cjwatson/launchpad:py3-apachelogparser |
Merge into: | launchpad:master |
Diff against target: 335 lines (+77/-54), 3 files modified:
- lib/lp/services/apachelogparser/base.py (+10/-9)
- lib/lp/services/apachelogparser/tests/test_apachelogparser.py (+59/-37)
- lib/lp/services/librarianserver/tests/test_apachelogparser.py (+8/-8)
Related bugs: |
Reviewer | Review Type | Date Requested | Status |
---|---|---|---|
Ioana Lasc (community) | Approve | ||
Review via email: mp+393759@code.launchpad.net |
Commit message
Fix apachelogparser for Python 3
Description of the change
To post a comment you must log in.
Revision history for this message
Ioana Lasc (ilasc) : | # |
review:
Approve
Preview Diff
[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1 | diff --git a/lib/lp/services/apachelogparser/base.py b/lib/lp/services/apachelogparser/base.py |
2 | index e5fbaa7..6b3a15a 100644 |
3 | --- a/lib/lp/services/apachelogparser/base.py |
4 | +++ b/lib/lp/services/apachelogparser/base.py |
5 | @@ -4,6 +4,7 @@ |
6 | from datetime import datetime |
7 | import gzip |
8 | import os |
9 | +import struct |
10 | |
11 | from contrib import apachelog |
12 | from lazr.uri import ( |
13 | @@ -35,7 +36,7 @@ def get_files_to_parse(file_paths): |
14 | file_paths = sorted(file_paths, key=lambda path: os.stat(path).st_mtime) |
15 | for file_path in file_paths: |
16 | fd, file_size = get_fd_and_file_size(file_path) |
17 | - first_line = six.ensure_text(fd.readline()) |
18 | + first_line = six.ensure_text(fd.readline(), errors='replace') |
19 | parsed_file = store.find(ParsedApacheLog, first_line=first_line).one() |
20 | position = 0 |
21 | if parsed_file is not None: |
22 | @@ -56,8 +57,8 @@ def get_files_to_parse(file_paths): |
23 | def get_fd_and_file_size(file_path): |
24 | """Return a file descriptor and the file size for the given file path. |
25 | |
26 | - The file descriptor will have the default mode ('r') and will be seeked to |
27 | - the beginning. |
28 | + The file descriptor will be read-only, opened in binary mode, and with |
29 | + its position set to the beginning of the file. |
30 | |
31 | The file size returned is that of the uncompressed file, in case the given |
32 | file_path points to a gzipped file. |
33 | @@ -68,11 +69,10 @@ def get_fd_and_file_size(file_path): |
34 | # module in Python 2.6. |
35 | fd = gzip.open(file_path) |
36 | fd.fileobj.seek(-4, os.SEEK_END) |
37 | - isize = gzip.read32(fd.fileobj) # may exceed 2GB |
38 | - file_size = isize & 0xffffffffL |
39 | + file_size = struct.unpack('<I', fd.fileobj.read(4))[0] |
40 | fd.fileobj.seek(0) |
41 | else: |
42 | - fd = open(file_path) |
43 | + fd = open(file_path, 'rb') |
44 | file_size = os.path.getsize(file_path) |
45 | return fd, file_size |
46 | |
47 | @@ -106,6 +106,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0): |
48 | break |
49 | |
50 | line = next_line |
51 | + line_text = six.ensure_text(line, errors='replace') |
52 | |
53 | # Always skip the last line as it may be truncated since we're |
54 | # rsyncing live logs, unless there is only one line for us to |
55 | @@ -122,7 +123,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0): |
56 | parsed_lines += 1 |
57 | parsed_bytes += len(line) |
58 | host, date, status, request = get_host_date_status_and_request( |
59 | - line) |
60 | + line_text) |
61 | |
62 | if status != '200': |
63 | continue |
64 | @@ -158,7 +159,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0): |
65 | # successfully, log this as an error and break the loop so that |
66 | # we return. |
67 | parsed_bytes -= len(line) |
68 | - logger.error('Error (%s) while parsing "%s"' % (e, line)) |
69 | + logger.error('Error (%s) while parsing "%s"' % (e, line_text)) |
70 | break |
71 | |
72 | if parsed_lines > 0: |
73 | @@ -170,7 +171,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0): |
74 | |
75 | def create_or_update_parsedlog_entry(first_line, parsed_bytes): |
76 | """Create or update the ParsedApacheLog with the given first_line.""" |
77 | - first_line = six.ensure_text(first_line) |
78 | + first_line = six.ensure_text(first_line, errors='replace') |
79 | parsed_file = IStore(ParsedApacheLog).find( |
80 | ParsedApacheLog, first_line=first_line).one() |
81 | if parsed_file is None: |
82 | diff --git a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py |
83 | index ee0ef78..513ce5d 100644 |
84 | --- a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py |
85 | +++ b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py |
86 | @@ -3,9 +3,8 @@ |
87 | |
88 | from datetime import datetime |
89 | import gzip |
90 | -from operator import itemgetter |
91 | +import io |
92 | import os |
93 | -from StringIO import StringIO |
94 | import tempfile |
95 | import time |
96 | |
97 | @@ -199,8 +198,10 @@ class TestLogFileParsing(TestCase): |
98 | # also been downloaded once (last line of the sample log), but |
99 | # parse_file() always skips the last line as it may be truncated, so |
100 | # it doesn't show up in the dict returned. |
101 | - fd = open(os.path.join( |
102 | - here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) |
103 | + fd = open( |
104 | + os.path.join( |
105 | + here, 'apache-log-files', 'launchpadlibrarian.net.access-log'), |
106 | + 'rb') |
107 | downloads, parsed_bytes, parsed_lines = parse_file( |
108 | fd, start_position=0, logger=self.logger, |
109 | get_download_key=get_path_download_key) |
110 | @@ -223,8 +224,10 @@ class TestLogFileParsing(TestCase): |
111 | # When there's only the last line of a given file for us to parse, we |
112 | # assume the file has been rotated and it's safe to parse its last |
113 | # line without worrying about whether or not it's been truncated. |
114 | - fd = open(os.path.join( |
115 | - here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) |
116 | + fd = open( |
117 | + os.path.join( |
118 | + here, 'apache-log-files', 'launchpadlibrarian.net.access-log'), |
119 | + 'rb') |
120 | downloads, parsed_bytes, parsed_lines = parse_file( |
121 | fd, start_position=self._getLastLineStart(fd), logger=self.logger, |
122 | get_download_key=get_path_download_key) |
123 | @@ -242,7 +245,7 @@ class TestLogFileParsing(TestCase): |
124 | # When there's an unexpected error, we log it and return as if we had |
125 | # parsed up to the line before the one where the failure occurred. |
126 | # Here we force an unexpected error on the first line. |
127 | - fd = StringIO('Not a log') |
128 | + fd = io.BytesIO(b'Not a log') |
129 | downloads, parsed_bytes, parsed_lines = parse_file( |
130 | fd, start_position=0, logger=self.logger, |
131 | get_download_key=get_path_download_key) |
132 | @@ -252,8 +255,9 @@ class TestLogFileParsing(TestCase): |
133 | |
134 | def _assertResponseWithGivenStatusIsIgnored(self, status): |
135 | """Assert that responses with the given status are ignored.""" |
136 | - fd = StringIO( |
137 | - self.sample_line % dict(status=status, method='GET')) |
138 | + fd = io.BytesIO( |
139 | + (self.sample_line % dict(status=status, method='GET')).encode( |
140 | + 'UTF-8')) |
141 | downloads, parsed_bytes, parsed_lines = parse_file( |
142 | fd, start_position=0, logger=self.logger, |
143 | get_download_key=get_path_download_key) |
144 | @@ -277,8 +281,9 @@ class TestLogFileParsing(TestCase): |
145 | |
146 | def _assertRequestWithGivenMethodIsIgnored(self, method): |
147 | """Assert that requests with the given method are ignored.""" |
148 | - fd = StringIO( |
149 | - self.sample_line % dict(status='200', method=method)) |
150 | + fd = io.BytesIO( |
151 | + (self.sample_line % dict(status='200', method=method)).encode( |
152 | + 'UTF-8')) |
153 | downloads, parsed_bytes, parsed_lines = parse_file( |
154 | fd, start_position=0, logger=self.logger, |
155 | get_download_key=get_path_download_key) |
156 | @@ -295,8 +300,9 @@ class TestLogFileParsing(TestCase): |
157 | self._assertRequestWithGivenMethodIsIgnored('POST') |
158 | |
159 | def test_normal_request_is_not_ignored(self): |
160 | - fd = StringIO( |
161 | - self.sample_line % dict(status=200, method='GET')) |
162 | + fd = io.BytesIO( |
163 | + (self.sample_line % dict(status=200, method='GET')).encode( |
164 | + 'UTF-8')) |
165 | downloads, parsed_bytes, parsed_lines = parse_file( |
166 | fd, start_position=0, logger=self.logger, |
167 | get_download_key=get_path_download_key) |
168 | @@ -317,8 +323,10 @@ class TestLogFileParsing(TestCase): |
169 | 'log_parser config', |
170 | '[launchpad]\nlogparser_max_parsed_lines: 2') |
171 | self.addCleanup(config.pop, 'log_parser config') |
172 | - fd = open(os.path.join( |
173 | - here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) |
174 | + fd = open( |
175 | + os.path.join( |
176 | + here, 'apache-log-files', 'launchpadlibrarian.net.access-log'), |
177 | + 'rb') |
178 | self.addCleanup(fd.close) |
179 | |
180 | downloads, parsed_bytes, parsed_lines = parse_file( |
181 | @@ -359,8 +367,10 @@ class TestLogFileParsing(TestCase): |
182 | 'log_parser config', |
183 | '[launchpad]\nlogparser_max_parsed_lines: 2') |
184 | self.addCleanup(config.pop, 'log_parser config') |
185 | - fd = open(os.path.join( |
186 | - here, 'apache-log-files', 'launchpadlibrarian.net.access-log')) |
187 | + fd = open( |
188 | + os.path.join( |
189 | + here, 'apache-log-files', 'launchpadlibrarian.net.access-log'), |
190 | + 'rb') |
191 | self.addCleanup(fd.close) |
192 | |
193 | # We want to start parsing on line 2 so we will have a value in |
194 | @@ -413,24 +423,27 @@ class TestParsedFilesDetection(TestCase): |
195 | for fd, _ in get_files_to_parse(file_paths): |
196 | fd.seek(0) |
197 | contents.append(fd.read()) |
198 | - self.assertEqual(['2\n', '1\n', '0\n'], contents) |
199 | + fd.close() |
200 | + self.assertEqual([b'2\n', b'1\n', b'0\n'], contents) |
201 | |
202 | def test_not_parsed_file(self): |
203 | # A file that has never been parsed will have to be parsed from the |
204 | # start. |
205 | - files_to_parse = get_files_to_parse([self.file_path]) |
206 | - fd, position = list(files_to_parse)[0] |
207 | - self.assertEqual(position, 0) |
208 | + files_to_parse = list(get_files_to_parse([self.file_path])) |
209 | + self.assertEqual(1, len(files_to_parse)) |
210 | + fd, position = files_to_parse[0] |
211 | + fd.close() |
212 | + self.assertEqual(0, position) |
213 | |
214 | def test_completely_parsed_file(self): |
215 | # A file that has been completely parsed will be skipped. |
216 | - fd = open(self.file_path) |
217 | - first_line = fd.readline() |
218 | - fd.seek(0) |
219 | - ParsedApacheLog(first_line, len(fd.read())) |
220 | + with open(self.file_path) as fd: |
221 | + first_line = fd.readline() |
222 | + fd.seek(0) |
223 | + ParsedApacheLog(first_line, len(fd.read())) |
224 | |
225 | files_to_parse = get_files_to_parse([self.file_path]) |
226 | - self.assertEqual(list(files_to_parse), []) |
227 | + self.assertEqual([], list(files_to_parse)) |
228 | |
229 | def test_parsed_file_with_new_content(self): |
230 | # A file that has been parsed already but in which new content was |
231 | @@ -440,11 +453,12 @@ class TestParsedFilesDetection(TestCase): |
232 | ParsedApacheLog(first_line, len(first_line)) |
233 | |
234 | files_to_parse = list(get_files_to_parse([self.file_path])) |
235 | - self.assertEqual(len(files_to_parse), 1) |
236 | + self.assertEqual(1, len(files_to_parse)) |
237 | fd, position = files_to_parse[0] |
238 | + fd.close() |
239 | # Since we parsed the first line above, we'll be told to start where |
240 | # the first line ends. |
241 | - self.assertEqual(position, len(first_line)) |
242 | + self.assertEqual(len(first_line), position) |
243 | |
244 | def test_different_files_with_same_name(self): |
245 | # Thanks to log rotation, two runs of our script may see files with |
246 | @@ -459,12 +473,14 @@ class TestParsedFilesDetection(TestCase): |
247 | # parse it from the start. |
248 | fd, new_path = tempfile.mkstemp() |
249 | content2 = 'Different First Line\nSecond Line' |
250 | - fd = open(new_path, 'w') |
251 | - fd.write(content2) |
252 | - fd.close() |
253 | + with open(new_path, 'w') as fd: |
254 | + fd.write(content2) |
255 | files_to_parse = get_files_to_parse([new_path]) |
256 | - positions = map(itemgetter(1), files_to_parse) |
257 | - self.assertEqual(positions, [0]) |
258 | + positions = [] |
259 | + for fd, position in files_to_parse: |
260 | + fd.close() |
261 | + positions.append(position) |
262 | + self.assertEqual([0], positions) |
263 | |
264 | def test_fresh_gzipped_file(self): |
265 | # get_files_to_parse() handles gzipped files just like uncompressed |
266 | @@ -472,8 +488,11 @@ class TestParsedFilesDetection(TestCase): |
267 | gz_name = 'launchpadlibrarian.net.access-log.1.gz' |
268 | gz_path = os.path.join(self.root, gz_name) |
269 | files_to_parse = get_files_to_parse([gz_path]) |
270 | - positions = map(itemgetter(1), files_to_parse) |
271 | - self.assertEqual(positions, [0]) |
272 | + positions = [] |
273 | + for fd, position in files_to_parse: |
274 | + fd.close() |
275 | + positions.append(position) |
276 | + self.assertEqual([0], positions) |
277 | |
278 | def test_resumed_gzipped_file(self): |
279 | # In subsequent runs of the script we will resume from where we |
280 | @@ -483,8 +502,11 @@ class TestParsedFilesDetection(TestCase): |
281 | first_line = gzip.open(gz_path).readline() |
282 | ParsedApacheLog(first_line, len(first_line)) |
283 | files_to_parse = get_files_to_parse([gz_path]) |
284 | - positions = map(itemgetter(1), files_to_parse) |
285 | - self.assertEqual(positions, [len(first_line)]) |
286 | + positions = [] |
287 | + for fd, position in files_to_parse: |
288 | + fd.close() |
289 | + positions.append(position) |
290 | + self.assertEqual([len(first_line)], positions) |
291 | |
292 | |
293 | class Test_create_or_update_parsedlog_entry(TestCase): |
294 | diff --git a/lib/lp/services/librarianserver/tests/test_apachelogparser.py b/lib/lp/services/librarianserver/tests/test_apachelogparser.py |
295 | index ffd30c7..1be4c14 100644 |
296 | --- a/lib/lp/services/librarianserver/tests/test_apachelogparser.py |
297 | +++ b/lib/lp/services/librarianserver/tests/test_apachelogparser.py |
298 | @@ -2,8 +2,8 @@ |
299 | # GNU Affero General Public License version 3 (see the file LICENSE). |
300 | |
301 | from datetime import datetime |
302 | +import io |
303 | import os |
304 | -from StringIO import StringIO |
305 | import subprocess |
306 | |
307 | from zope.component import getUtility |
308 | @@ -95,10 +95,10 @@ class TestLibrarianLogFileParsing(TestCase): |
309 | self.logger = BufferLogger() |
310 | |
311 | def test_request_to_lfa_is_parsed(self): |
312 | - fd = StringIO( |
313 | - '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET ' |
314 | - '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 ' |
315 | - '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"') |
316 | + fd = io.BytesIO( |
317 | + b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET ' |
318 | + b'/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 ' |
319 | + b'"https://launchpad.net/~ubuntulite/+archive" "Mozilla"') |
320 | downloads, parsed_bytes, ignored = parse_file( |
321 | fd, start_position=0, logger=self.logger, |
322 | get_download_key=get_library_file_id) |
323 | @@ -114,9 +114,9 @@ class TestLibrarianLogFileParsing(TestCase): |
324 | def test_request_to_non_lfa_is_ignored(self): |
325 | # A request to a path which doesn't map to a LibraryFileAlias (e.g. |
326 | # '/') is ignored. |
327 | - fd = StringIO( |
328 | - '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" ' |
329 | - '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"') |
330 | + fd = io.BytesIO( |
331 | + b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" ' |
332 | + b'200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"') |
333 | downloads, parsed_bytes, ignored = parse_file( |
334 | fd, start_position=0, logger=self.logger, |
335 | get_download_key=get_library_file_id) |