Merge ~cjwatson/launchpad:py3-apachelogparser into launchpad:master

Proposed by Colin Watson
Status: Merged
Approved by: Colin Watson
Approved revision: 2fb99455c5367db0cf7b84af76743ba163931697
Merge reported by: Otto Co-Pilot
Merged at revision: not available
Proposed branch: ~cjwatson/launchpad:py3-apachelogparser
Merge into: launchpad:master
Diff against target: 335 lines (+77/-54)
3 files modified
lib/lp/services/apachelogparser/base.py (+10/-9)
lib/lp/services/apachelogparser/tests/test_apachelogparser.py (+59/-37)
lib/lp/services/librarianserver/tests/test_apachelogparser.py (+8/-8)
Reviewer: Ioana Lasc (community)
Review status: Approve
Review via email: mp+393759@code.launchpad.net

Commit message

Fix apachelogparser for Python 3
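
The bulk of the change deals with Python 3's bytes/text split: log files are now opened in binary mode so that the byte offsets used to resume parsing stay exact, and each line is decoded with errors='replace' before being handed to the parser, so a single undecodable byte cannot abort a whole run. A minimal sketch of that pattern, using a hypothetical read_log_lines() helper rather than the real parse_file():

import six

def read_log_lines(log_path):
    # Open in binary mode so len(line) is a byte count suitable for
    # recording resume positions (sketch only; mirrors base.py).
    with open(log_path, 'rb') as fd:
        for line in fd:
            # Decode with errors='replace' so one undecodable byte does
            # not raise UnicodeDecodeError for the whole file.
            yield len(line), six.ensure_text(line, errors='replace')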

Ioana Lasc (ilasc):
review: Approve

Preview Diff

1diff --git a/lib/lp/services/apachelogparser/base.py b/lib/lp/services/apachelogparser/base.py
2index e5fbaa7..6b3a15a 100644
3--- a/lib/lp/services/apachelogparser/base.py
4+++ b/lib/lp/services/apachelogparser/base.py
5@@ -4,6 +4,7 @@
6 from datetime import datetime
7 import gzip
8 import os
9+import struct
10
11 from contrib import apachelog
12 from lazr.uri import (
13@@ -35,7 +36,7 @@ def get_files_to_parse(file_paths):
14 file_paths = sorted(file_paths, key=lambda path: os.stat(path).st_mtime)
15 for file_path in file_paths:
16 fd, file_size = get_fd_and_file_size(file_path)
17- first_line = six.ensure_text(fd.readline())
18+ first_line = six.ensure_text(fd.readline(), errors='replace')
19 parsed_file = store.find(ParsedApacheLog, first_line=first_line).one()
20 position = 0
21 if parsed_file is not None:
22@@ -56,8 +57,8 @@ def get_files_to_parse(file_paths):
23 def get_fd_and_file_size(file_path):
24 """Return a file descriptor and the file size for the given file path.
25
26- The file descriptor will have the default mode ('r') and will be seeked to
27- the beginning.
28+ The file descriptor will be read-only, opened in binary mode, and with
29+ its position set to the beginning of the file.
30
31 The file size returned is that of the uncompressed file, in case the given
32 file_path points to a gzipped file.
33@@ -68,11 +69,10 @@ def get_fd_and_file_size(file_path):
34 # module in Python 2.6.
35 fd = gzip.open(file_path)
36 fd.fileobj.seek(-4, os.SEEK_END)
37- isize = gzip.read32(fd.fileobj) # may exceed 2GB
38- file_size = isize & 0xffffffffL
39+ file_size = struct.unpack('<I', fd.fileobj.read(4))[0]
40 fd.fileobj.seek(0)
41 else:
42- fd = open(file_path)
43+ fd = open(file_path, 'rb')
44 file_size = os.path.getsize(file_path)
45 return fd, file_size
46
47@@ -106,6 +106,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
48 break
49
50 line = next_line
51+ line_text = six.ensure_text(line, errors='replace')
52
53 # Always skip the last line as it may be truncated since we're
54 # rsyncing live logs, unless there is only one line for us to
55@@ -122,7 +123,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
56 parsed_lines += 1
57 parsed_bytes += len(line)
58 host, date, status, request = get_host_date_status_and_request(
59- line)
60+ line_text)
61
62 if status != '200':
63 continue
64@@ -158,7 +159,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
65 # successfully, log this as an error and break the loop so that
66 # we return.
67 parsed_bytes -= len(line)
68- logger.error('Error (%s) while parsing "%s"' % (e, line))
69+ logger.error('Error (%s) while parsing "%s"' % (e, line_text))
70 break
71
72 if parsed_lines > 0:
73@@ -170,7 +171,7 @@ def parse_file(fd, start_position, logger, get_download_key, parsed_lines=0):
74
75 def create_or_update_parsedlog_entry(first_line, parsed_bytes):
76 """Create or update the ParsedApacheLog with the given first_line."""
77- first_line = six.ensure_text(first_line)
78+ first_line = six.ensure_text(first_line, errors='replace')
79 parsed_file = IStore(ParsedApacheLog).find(
80 ParsedApacheLog, first_line=first_line).one()
81 if parsed_file is None:
82diff --git a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
83index ee0ef78..513ce5d 100644
84--- a/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
85+++ b/lib/lp/services/apachelogparser/tests/test_apachelogparser.py
86@@ -3,9 +3,8 @@
87
88 from datetime import datetime
89 import gzip
90-from operator import itemgetter
91+import io
92 import os
93-from StringIO import StringIO
94 import tempfile
95 import time
96
97@@ -199,8 +198,10 @@ class TestLogFileParsing(TestCase):
98 # also been downloaded once (last line of the sample log), but
99 # parse_file() always skips the last line as it may be truncated, so
100 # it doesn't show up in the dict returned.
101- fd = open(os.path.join(
102- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
103+ fd = open(
104+ os.path.join(
105+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
106+ 'rb')
107 downloads, parsed_bytes, parsed_lines = parse_file(
108 fd, start_position=0, logger=self.logger,
109 get_download_key=get_path_download_key)
110@@ -223,8 +224,10 @@ class TestLogFileParsing(TestCase):
111 # When there's only the last line of a given file for us to parse, we
112 # assume the file has been rotated and it's safe to parse its last
113 # line without worrying about whether or not it's been truncated.
114- fd = open(os.path.join(
115- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
116+ fd = open(
117+ os.path.join(
118+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
119+ 'rb')
120 downloads, parsed_bytes, parsed_lines = parse_file(
121 fd, start_position=self._getLastLineStart(fd), logger=self.logger,
122 get_download_key=get_path_download_key)
123@@ -242,7 +245,7 @@ class TestLogFileParsing(TestCase):
124 # When there's an unexpected error, we log it and return as if we had
125 # parsed up to the line before the one where the failure occurred.
126 # Here we force an unexpected error on the first line.
127- fd = StringIO('Not a log')
128+ fd = io.BytesIO(b'Not a log')
129 downloads, parsed_bytes, parsed_lines = parse_file(
130 fd, start_position=0, logger=self.logger,
131 get_download_key=get_path_download_key)
132@@ -252,8 +255,9 @@ class TestLogFileParsing(TestCase):
133
134 def _assertResponseWithGivenStatusIsIgnored(self, status):
135 """Assert that responses with the given status are ignored."""
136- fd = StringIO(
137- self.sample_line % dict(status=status, method='GET'))
138+ fd = io.BytesIO(
139+ (self.sample_line % dict(status=status, method='GET')).encode(
140+ 'UTF-8'))
141 downloads, parsed_bytes, parsed_lines = parse_file(
142 fd, start_position=0, logger=self.logger,
143 get_download_key=get_path_download_key)
144@@ -277,8 +281,9 @@ class TestLogFileParsing(TestCase):
145
146 def _assertRequestWithGivenMethodIsIgnored(self, method):
147 """Assert that requests with the given method are ignored."""
148- fd = StringIO(
149- self.sample_line % dict(status='200', method=method))
150+ fd = io.BytesIO(
151+ (self.sample_line % dict(status='200', method=method)).encode(
152+ 'UTF-8'))
153 downloads, parsed_bytes, parsed_lines = parse_file(
154 fd, start_position=0, logger=self.logger,
155 get_download_key=get_path_download_key)
156@@ -295,8 +300,9 @@ class TestLogFileParsing(TestCase):
157 self._assertRequestWithGivenMethodIsIgnored('POST')
158
159 def test_normal_request_is_not_ignored(self):
160- fd = StringIO(
161- self.sample_line % dict(status=200, method='GET'))
162+ fd = io.BytesIO(
163+ (self.sample_line % dict(status=200, method='GET')).encode(
164+ 'UTF-8'))
165 downloads, parsed_bytes, parsed_lines = parse_file(
166 fd, start_position=0, logger=self.logger,
167 get_download_key=get_path_download_key)
168@@ -317,8 +323,10 @@ class TestLogFileParsing(TestCase):
169 'log_parser config',
170 '[launchpad]\nlogparser_max_parsed_lines: 2')
171 self.addCleanup(config.pop, 'log_parser config')
172- fd = open(os.path.join(
173- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
174+ fd = open(
175+ os.path.join(
176+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
177+ 'rb')
178 self.addCleanup(fd.close)
179
180 downloads, parsed_bytes, parsed_lines = parse_file(
181@@ -359,8 +367,10 @@ class TestLogFileParsing(TestCase):
182 'log_parser config',
183 '[launchpad]\nlogparser_max_parsed_lines: 2')
184 self.addCleanup(config.pop, 'log_parser config')
185- fd = open(os.path.join(
186- here, 'apache-log-files', 'launchpadlibrarian.net.access-log'))
187+ fd = open(
188+ os.path.join(
189+ here, 'apache-log-files', 'launchpadlibrarian.net.access-log'),
190+ 'rb')
191 self.addCleanup(fd.close)
192
193 # We want to start parsing on line 2 so we will have a value in
194@@ -413,24 +423,27 @@ class TestParsedFilesDetection(TestCase):
195 for fd, _ in get_files_to_parse(file_paths):
196 fd.seek(0)
197 contents.append(fd.read())
198- self.assertEqual(['2\n', '1\n', '0\n'], contents)
199+ fd.close()
200+ self.assertEqual([b'2\n', b'1\n', b'0\n'], contents)
201
202 def test_not_parsed_file(self):
203 # A file that has never been parsed will have to be parsed from the
204 # start.
205- files_to_parse = get_files_to_parse([self.file_path])
206- fd, position = list(files_to_parse)[0]
207- self.assertEqual(position, 0)
208+ files_to_parse = list(get_files_to_parse([self.file_path]))
209+ self.assertEqual(1, len(files_to_parse))
210+ fd, position = files_to_parse[0]
211+ fd.close()
212+ self.assertEqual(0, position)
213
214 def test_completely_parsed_file(self):
215 # A file that has been completely parsed will be skipped.
216- fd = open(self.file_path)
217- first_line = fd.readline()
218- fd.seek(0)
219- ParsedApacheLog(first_line, len(fd.read()))
220+ with open(self.file_path) as fd:
221+ first_line = fd.readline()
222+ fd.seek(0)
223+ ParsedApacheLog(first_line, len(fd.read()))
224
225 files_to_parse = get_files_to_parse([self.file_path])
226- self.assertEqual(list(files_to_parse), [])
227+ self.assertEqual([], list(files_to_parse))
228
229 def test_parsed_file_with_new_content(self):
230 # A file that has been parsed already but in which new content was
231@@ -440,11 +453,12 @@ class TestParsedFilesDetection(TestCase):
232 ParsedApacheLog(first_line, len(first_line))
233
234 files_to_parse = list(get_files_to_parse([self.file_path]))
235- self.assertEqual(len(files_to_parse), 1)
236+ self.assertEqual(1, len(files_to_parse))
237 fd, position = files_to_parse[0]
238+ fd.close()
239 # Since we parsed the first line above, we'll be told to start where
240 # the first line ends.
241- self.assertEqual(position, len(first_line))
242+ self.assertEqual(len(first_line), position)
243
244 def test_different_files_with_same_name(self):
245 # Thanks to log rotation, two runs of our script may see files with
246@@ -459,12 +473,14 @@ class TestParsedFilesDetection(TestCase):
247 # parse it from the start.
248 fd, new_path = tempfile.mkstemp()
249 content2 = 'Different First Line\nSecond Line'
250- fd = open(new_path, 'w')
251- fd.write(content2)
252- fd.close()
253+ with open(new_path, 'w') as fd:
254+ fd.write(content2)
255 files_to_parse = get_files_to_parse([new_path])
256- positions = map(itemgetter(1), files_to_parse)
257- self.assertEqual(positions, [0])
258+ positions = []
259+ for fd, position in files_to_parse:
260+ fd.close()
261+ positions.append(position)
262+ self.assertEqual([0], positions)
263
264 def test_fresh_gzipped_file(self):
265 # get_files_to_parse() handles gzipped files just like uncompressed
266@@ -472,8 +488,11 @@ class TestParsedFilesDetection(TestCase):
267 gz_name = 'launchpadlibrarian.net.access-log.1.gz'
268 gz_path = os.path.join(self.root, gz_name)
269 files_to_parse = get_files_to_parse([gz_path])
270- positions = map(itemgetter(1), files_to_parse)
271- self.assertEqual(positions, [0])
272+ positions = []
273+ for fd, position in files_to_parse:
274+ fd.close()
275+ positions.append(position)
276+ self.assertEqual([0], positions)
277
278 def test_resumed_gzipped_file(self):
279 # In subsequent runs of the script we will resume from where we
280@@ -483,8 +502,11 @@ class TestParsedFilesDetection(TestCase):
281 first_line = gzip.open(gz_path).readline()
282 ParsedApacheLog(first_line, len(first_line))
283 files_to_parse = get_files_to_parse([gz_path])
284- positions = map(itemgetter(1), files_to_parse)
285- self.assertEqual(positions, [len(first_line)])
286+ positions = []
287+ for fd, position in files_to_parse:
288+ fd.close()
289+ positions.append(position)
290+ self.assertEqual([len(first_line)], positions)
291
292
293 class Test_create_or_update_parsedlog_entry(TestCase):
294diff --git a/lib/lp/services/librarianserver/tests/test_apachelogparser.py b/lib/lp/services/librarianserver/tests/test_apachelogparser.py
295index ffd30c7..1be4c14 100644
296--- a/lib/lp/services/librarianserver/tests/test_apachelogparser.py
297+++ b/lib/lp/services/librarianserver/tests/test_apachelogparser.py
298@@ -2,8 +2,8 @@
299 # GNU Affero General Public License version 3 (see the file LICENSE).
300
301 from datetime import datetime
302+import io
303 import os
304-from StringIO import StringIO
305 import subprocess
306
307 from zope.component import getUtility
308@@ -95,10 +95,10 @@ class TestLibrarianLogFileParsing(TestCase):
309 self.logger = BufferLogger()
310
311 def test_request_to_lfa_is_parsed(self):
312- fd = StringIO(
313- '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
314- '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
315- '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
316+ fd = io.BytesIO(
317+ b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
318+ b'/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
319+ b'"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
320 downloads, parsed_bytes, ignored = parse_file(
321 fd, start_position=0, logger=self.logger,
322 get_download_key=get_library_file_id)
323@@ -114,9 +114,9 @@ class TestLibrarianLogFileParsing(TestCase):
324 def test_request_to_non_lfa_is_ignored(self):
325 # A request to a path which doesn't map to a LibraryFileAlias (e.g.
326 # '/') is ignored.
327- fd = StringIO(
328- '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
329- '200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
330+ fd = io.BytesIO(
331+ b'69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET / HTTP/1.1" '
332+ b'200 2261 "https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
333 downloads, parsed_bytes, ignored = parse_file(
334 fd, start_position=0, logger=self.logger,
335 get_download_key=get_library_file_id)
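
One change worth a note beyond the diff itself: gzip.read32() no longer exists on Python 3, so get_fd_and_file_size() now reads the gzip ISIZE trailer directly. The last four bytes of a gzip member store the size of the uncompressed data modulo 2**32 in little-endian order, and unpacking them with struct.unpack('<I', ...) yields the same value the old read-and-mask code produced. A self-contained sketch of the same technique (the function name below is illustrative):

import os
import struct

def gzip_uncompressed_size(path):
    # ISIZE: the final four bytes of a gzip file hold the uncompressed
    # size modulo 2**32, stored little-endian.
    with open(path, 'rb') as raw:
        raw.seek(-4, os.SEEK_END)
        return struct.unpack('<I', raw.read(4))[0]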

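The test updates follow the same rule: parse_file() now reads bytes, so the in-memory fixtures switch from StringIO to io.BytesIO and the sample lines are encoded to UTF-8 first. For illustration (sample line taken from the librarian test above):

import io

sample_line = (
    '69.233.136.42 - - [13/Jun/2008:14:55:22 +0100] "GET '
    '/15018215/ul_logo_64x64.png HTTP/1.1" 200 2261 '
    '"https://launchpad.net/~ubuntulite/+archive" "Mozilla"')
# parse_file() is handed a binary stream, exactly as a real log file
# opened with open(path, 'rb') would be.
fd = io.BytesIO(sample_line.encode('UTF-8'))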