Merge lp:~salgado/launchpad/bug-481375 into lp:launchpad

Proposed by Guilherme Salgado
Status: Merged
Approved by: Brad Crittenden
Approved revision: not available
Merged at revision: not available
Proposed branch: lp:~salgado/launchpad/bug-481375
Merge into: lp:launchpad
Diff against target: 89 lines (+49/-6)
2 files modified
lib/lp/services/apachelogparser/base.py (+24/-5)
lib/lp/services/apachelogparser/tests/test_apachelogparser.py (+25/-1)
To merge this branch: bzr merge lp:~salgado/launchpad/bug-481375
Reviewer Review Type Date Requested Status
Francis J. Lacoste (community) release-critical Approve
Brad Crittenden (community) code Approve
Review via email: mp+14961@code.launchpad.net

Commit message

Fix bug 481375 by getting the real (i.e. uncompressed) size of gzipped log files

To post a comment you must log in.
Revision history for this message
Guilherme Salgado (salgado) wrote :

Fix bug 481375 by getting the real (i.e. uncompressed) size of gzipped files

Revision history for this message
Brad Crittenden (bac) :
review: Approve (code)
Revision history for this message
Francis J. Lacoste (flacoste) :
review: Approve (release-critical)

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== modified file 'lib/lp/services/apachelogparser/base.py'
2--- lib/lp/services/apachelogparser/base.py 2009-08-27 03:27:46 +0000
3+++ lib/lp/services/apachelogparser/base.py 2009-11-17 18:45:29 +0000
4@@ -33,11 +33,7 @@
5 store = getUtility(IStoreSelector).get(MAIN_STORE, DEFAULT_FLAVOR)
6 for file_name in file_names:
7 file_path = os.path.join(root, file_name)
8- file_size = os.path.getsize(file_path)
9- if file_name.endswith('.gz'):
10- fd = gzip.open(file_path)
11- else:
12- fd = open(file_path)
13+ fd, file_size = get_fd_and_file_size(file_path)
14 first_line = unicode(fd.readline())
15 parsed_file = store.find(ParsedApacheLog, first_line=first_line).one()
16 position = 0
17@@ -57,6 +53,29 @@
18 return files_to_parse
19
20
21+def get_fd_and_file_size(file_path):
22+ """Return a file descriptor and the file size for the given file path.
23+
24+ The file descriptor will have the default mode ('r') and will be seeked to
25+ the beginning.
26+
27+ The file size returned is that of the uncompressed file, in case the given
28+ file_path points to a gzipped file.
29+ """
30+ if file_path.endswith('.gz'):
31+ fd = gzip.open(file_path)
32+ # There doesn't seem to be a better way of figuring out the
33+ # uncompressed size of a file, so we'll read the whole file here.
34+ file_size = len(fd.read())
35+ # Seek back to the beginning of the file as if we had just opened
36+ # it.
37+ fd.seek(0)
38+ else:
39+ fd = open(file_path)
40+ file_size = os.path.getsize(file_path)
41+ return fd, file_size
42+
43+
44 def parse_file(fd, start_position, logger, get_download_key):
45 """Parse the given file starting on the given position.
46
47
48=== modified file 'lib/lp/services/apachelogparser/tests/test_apachelogparser.py'
49--- lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2009-08-31 15:04:47 +0000
50+++ lib/lp/services/apachelogparser/tests/test_apachelogparser.py 2009-11-17 18:45:29 +0000
51@@ -17,7 +17,7 @@
52 from canonical.launchpad.scripts.librarian_apache_log_parser import DBUSER
53 from lp.services.apachelogparser.base import (
54 create_or_update_parsedlog_entry, get_day, get_files_to_parse,
55- get_host_date_status_and_request, parse_file)
56+ get_fd_and_file_size, get_host_date_status_and_request, parse_file)
57 from lp.services.apachelogparser.model.parsedapachelog import ParsedApacheLog
58 from lp.testing import TestCase
59
60@@ -59,6 +59,30 @@
61 self.assertEqual(get_day(date), datetime(2008, 6, 13))
62
63
64+class Test_get_fd_and_file_size(TestCase):
65+
66+ def _ensureFileSizeIsCorrect(self, file_path):
67+ """Ensure the file size returned is correct.
68+
69+ Also ensure that the file descriptors returned were seek()ed to the
70+ very beginning.
71+ """
72+ fd, file_size = get_fd_and_file_size(file_path)
73+ self.assertEqual(fd.tell(), 0)
74+ self.assertEqual(len(fd.read()), file_size)
75+
76+ def test_regular_file(self):
77+ file_path = os.path.join(
78+ here, 'apache-log-files', 'librarian-oneline.log')
79+ self._ensureFileSizeIsCorrect(file_path)
80+
81+ def test_gzip_file(self):
82+ file_path = os.path.join(
83+ here, 'apache-log-files',
84+ 'launchpadlibrarian.net.access-log.1.gz')
85+ self._ensureFileSizeIsCorrect(file_path)
86+
87+
88 def get_path_download_key(path):
89 return path
90