Merge lp:~jml/pkgme-devportal/apt-file-db into lp:pkgme-devportal

Proposed by Jonathan Lange
Status: Merged
Approved by: James Westby
Approved revision: 132
Merged at revision: 122
Proposed branch: lp:~jml/pkgme-devportal/apt-file-db
Merge into: lp:pkgme-devportal
Diff against target: 547 lines (+267/-206)
5 files modified
devportalbinary/aptfile.py (+212/-0)
devportalbinary/database.py (+1/-157)
devportalbinary/tests/test_aptfile.py (+52/-0)
devportalbinary/tests/test_database.py (+1/-49)
setup.py (+1/-0)
To merge this branch: bzr merge lp:~jml/pkgme-devportal/apt-file-db
Reviewer Review Type Date Requested Status
James Westby Approve
Review via email: mp+124696@code.launchpad.net

Commit message

Script to dump out the apt-file database as CSV

Description of the change

This script dumps out the apt-file database.

Also PEP8's some things and makes the code a little bit more understandable
to me.

To post a comment you must log in.
Revision history for this message
James Westby (james-w) wrote :

Hi,

I would suggest making the script a console entry_point. Then buildout will
fill in the paths so you can just run the script. We don't need this script
in production, so we aren't forced to avoid buildout.

Thanks,

James

lp:~jml/pkgme-devportal/apt-file-db updated
128. By Jonathan Lange

Make it an entry point. Thanks James.

129. By Jonathan Lange

Whitespace

130. By Jonathan Lange

Move AptFilePackageDatabase out of database.py module.

131. By Jonathan Lange

Move apt file tests to a separate module.

Revision history for this message
Jonathan Lange (jml) wrote :

Good idea. Done.

While at it, I moved the aptfilepackagedatabase code into a separate module.

jml

lp:~jml/pkgme-devportal/apt-file-db updated
132. By Jonathan Lange

Remove the file in bin/ again.

Revision history for this message
James Westby (james-w) :
review: Approve

Preview Diff

[H/L] Next/Prev Comment, [J/K] Next/Prev File, [N/P] Next/Prev Hunk
1=== added file 'devportalbinary/aptfile.py'
2--- devportalbinary/aptfile.py 1970-01-01 00:00:00 +0000
3+++ devportalbinary/aptfile.py 2012-09-17 15:39:29 +0000
4@@ -0,0 +1,212 @@
5+# Copyright 2012 Canonical Ltd. This software is licensed under the
6+# GNU Affero General Public License version 3 (see the file LICENSE).
7+
8+__all__ = [
9+ 'AptFilePackageDatabase',
10+ ]
11+
12+import gzip
13+import urllib
14+import argparse
15+import os
16+import re
17+
18+
19+def make_arg_parser():
20+ p = argparse.ArgumentParser()
21+ p.add_argument('--cache-dir', type=str, default='cache')
22+ p.add_argument('output_file', type=argparse.FileType('w'))
23+ return p
24+
25+
26+so_filename_re = re.compile(r'\.so(\.[0-9]+)*$')
27+def export_database(db, stream):
28+ for library, package, arch in db.iter_database():
29+ if so_filename_re.search(library):
30+ stream.write(','.join([package, library, package, arch]))
31+ stream.write('\n')
32+ stream.flush()
33+
34+
35+def dump_apt_file_db():
36+ parser = make_arg_parser()
37+ args = parser.parse_args()
38+ if not os.path.isdir(args.cache_dir):
39+ os.makedirs(args.cache_dir)
40+ db = AptFilePackageDatabase(args.cache_dir)
41+ export_database(db, args.output_file)
42+ return 0
43+
44+
45+def iter_contents_file(contents):
46+ """ Yield (full-library-path, set-of-pkgnames) from a Contents file.
47+
48+ It expects a line starting with "FILE" that tells it when the header ends
49+ and the actual content starts.
50+ """
51+ found_start_marker = False
52+ for line in contents:
53+ if not found_start_marker:
54+ if line.startswith("FILE"):
55+ found_start_marker = True
56+ continue
57+ (path, sep, pkgs) = [s.strip() for s in line.rpartition(" ")]
58+ # pkgs is formatted a bit funny, e.g. universe/pkgname
59+ pkgs = set([os.path.basename(pkg) for pkg in pkgs.split(",")])
60+ yield (path, pkgs)
61+
62+
63+class AptFilePackageDatabase(object):
64+ """Really dumb database that just uses apt-file for local testing """
65+
66+ # we could also read /etc/ld.so.conf.d/*.conf but this may be different on
67+ # different distroseries especially if
68+ # server-distroseries != target-distroseries
69+ # (I wish there was ldconfig --print-search-dirs)
70+ LD_SEARCH_PATH = [
71+ # standards
72+ "lib",
73+ "usr/lib",
74+ "usr/local/lib",
75+ # old biarch
76+ "lib32",
77+ "usr/lib32",
78+ # new multiarch
79+ "lib/i686-linux-gnu",
80+ "lib/i386-linux-gnu",
81+ "lib/x86_64-linux-gnu",
82+ "usr/lib/i386-linux-gnu",
83+ "usr/lib/i686-linux-gnu",
84+ "usr/lib/x86_64-linux-gnu",
85+ # ?
86+ "usr/lib/x86_64-linux-gnu/fakechroot",
87+ "usr/lib/x86_64-linux-gnu/mesa",
88+ "usr/lib/x86_64-linux-gnu/mesa-egl",
89+ "usr/lib/i386-linux-gnu/mesa",
90+ ]
91+
92+ DISTROSERIES = "oneiric"
93+
94+ CONTENTS_FILE_URL_LOCATION = (
95+ "http://archive.ubuntu.com/ubuntu/dists/%(distroseries)s/"
96+ "Contents-%(arch)s.gz")
97+
98+ CONTENTS_FILE = "Contents-%(distroseries)s-%(arch)s"
99+
100+ def __init__(self, cachedir):
101+ self.cachedir = os.path.expanduser(cachedir)
102+ self._distroseries_arch_cache = {}
103+
104+ def _get_lib_to_pkgs_mapping(self, distroseries, arch):
105+ """Returns a dict of { library-name : set([pkg1,pkg2])
106+
107+ This function will return a dict to lookup library-name to package
108+ dependencies for the given distroseries and architecture
109+ """
110+ if not (distroseries, arch) in self._distroseries_arch_cache:
111+ self._distroseries_arch_cache[(distroseries, arch)] = \
112+ self._get_mapping_from_contents_file(distroseries, arch)
113+ return self._distroseries_arch_cache[(distroseries, arch)]
114+
115+ def _get_contents_file_cache_path(self, distroseries, arch):
116+ """Return the path in the cache for the given distroseries, arch """
117+ return os.path.join(
118+ self.cachedir, self.CONTENTS_FILE % {
119+ 'distroseries': distroseries, 'arch': arch})
120+
121+ def _get_contents_file_server_url(self, distroseries, arch):
122+ """Return the remote server URL for the given distroseries, arch """
123+ return self.CONTENTS_FILE_URL_LOCATION % {
124+ 'distroseries': distroseries, 'arch': arch}
125+
126+ def _get_mapping_from_contents_file(self, distroseries, arch):
127+ """Return lib,pkgs mapping from contents file for distroseries, arch
128+
129+ This expects the contents file to be in the cachedir already.
130+ """
131+ lib_to_pkgs = {}
132+ path = self._get_contents_file_cache_path(distroseries, arch)
133+ with open(path) as f:
134+ for path, pkgs in self._iter_contents_file(f):
135+ basename = os.path.basename(path)
136+ if not basename in lib_to_pkgs:
137+ lib_to_pkgs[basename] = set()
138+ lib_to_pkgs[basename] |= pkgs
139+ return lib_to_pkgs
140+
141+ def _download_contents_file_compressed(self, distroseries, arch):
142+ """Downloads the content file for distroseries, arch into target """
143+ # XXX: we may eventually want to merge the Contents files from
144+ # the -updates repository too in addition to the main archive
145+ url = self._get_contents_file_server_url(distroseries, arch)
146+ target = self._get_contents_file_cache_path(distroseries, arch)
147+ compressed_target = target + os.path.splitext(url)[1]
148+ # download
149+ urllib.urlretrieve(url, compressed_target)
150+ return compressed_target
151+
152+ def _iter_contents_file(self, in_file):
153+ for path, pkgs in iter_contents_file(in_file):
154+ if os.path.dirname(path) in self.LD_SEARCH_PATH:
155+ yield path, pkgs
156+
157+ def _prune_contents_gz_file(self, infile, outfile):
158+ """Read a compressed Contents.gz and write out a pruned version.
159+
160+ This will use iter_contents_file to go over infile and write
161+ the relevant lines that are in the LD_SEARCH_PATH to outfile.
162+ """
163+ with open(outfile, "w") as outf, gzip.open(infile) as inf:
164+ # first write the header
165+ outf.write("FILE LOCATION\n")
166+ # then iter over all relevant lines and write them out
167+ for path, pkgs in self._iter_contents_file(inf):
168+ outf.write("%s %s\n" % (path, ",".join(pkgs)))
169+
170+ def _download_and_prepare_contents_file_if_needed(self, distroseries, arch):
171+ """Ensure there is a usable Contents file in the cachedir
172+
173+ This will download, uncompress and prune a Contents file for
174+ distroseries, arch so that get_dependencies works.
175+ """
176+ # mvo: We can (and should eventually) do etag/if-modified-since
177+ # matching here. But its not really important as long as
178+ # we package for stable distroseries as the Contents file
179+ # will not change
180+ path = self._get_contents_file_cache_path(distroseries, arch)
181+ if not os.path.exists(path):
182+ compressed_contents = self._download_contents_file_compressed(
183+ distroseries, arch)
184+ # and prune from ~300mb to 1mb uncompressed as we are only
185+ # interested in the library path parts
186+ self._prune_contents_gz_file(compressed_contents, path)
187+ os.remove(compressed_contents)
188+
189+ def iter_database(self, architectures=('i386', 'amd64'),
190+ distroseries=None):
191+ """Export the database.
192+
193+ Yields (library, package, arch) tuples for everything that we can
194+ find.
195+ """
196+ # XXX: Untested
197+ if distroseries is None:
198+ distroseries = self.DISTROSERIES
199+ for arch in architectures:
200+ self._download_and_prepare_contents_file_if_needed(
201+ distroseries, arch)
202+ mapping = self._get_lib_to_pkgs_mapping(distroseries, arch)
203+ for library in mapping:
204+ for package in mapping[library]:
205+ yield library, package, arch
206+
207+ def get_dependencies(self, lib, arch="i386"):
208+ # do lazy downloading for now, we could also make this part
209+ # of bin/fetch-symbols I guess(?)
210+ self._download_and_prepare_contents_file_if_needed(
211+ self.DISTROSERIES, arch)
212+ lib_to_pkgs = self._get_lib_to_pkgs_mapping(self.DISTROSERIES, arch)
213+ return lib_to_pkgs.get(lib)
214+
215+ def close(self):
216+ pass
217
218=== modified file 'devportalbinary/database.py'
219--- devportalbinary/database.py 2012-09-13 15:04:20 +0000
220+++ devportalbinary/database.py 2012-09-17 15:39:29 +0000
221@@ -3,12 +3,10 @@
222
223 from contextlib import closing, contextmanager
224 import errno
225-import gzip
226 from itertools import chain
227 import os
228 import shutil
229 import tempfile
230-import urllib
231
232 from bzrlib import urlutils
233 from fixtures import (
234@@ -24,6 +22,7 @@
235 from storm.locals import create_database, Store
236 from storm.uri import URI as StormURI
237
238+from .aptfile import AptFilePackageDatabase
239 from .configuration import (
240 CONF_FILE_ENV_VAR,
241 get_config_file_path,
242@@ -357,24 +356,6 @@
243 return mapping
244
245
246-def iter_contents_file(contents):
247- """ Yield (full-library-path, set-of-pkgnames) from a Contents file.
248-
249- It expects a line starting with "FILE" that tells it when the header ends
250- and the actual content starts.
251- """
252- found_start_marker = False
253- for line in contents:
254- if not found_start_marker:
255- if line.startswith("FILE"):
256- found_start_marker = True
257- continue
258- (path, sep, pkgs) = [s.strip() for s in line.rpartition(" ")]
259- # pkgs is formated a bit funny, e.g. universe/pkgname
260- pkgs = set([os.path.basename(pkg) for pkg in pkgs.split(",")])
261- yield (path, pkgs)
262-
263-
264 class URI(StormURI):
265 """A stand-in for Storm's URI class.
266
267@@ -397,143 +378,6 @@
268 self.options = dict()
269
270
271-class AptFilePackageDatabase(object):
272- """ Really dumb database that just uses apt-file for local testing """
273-
274- # we could also read /etc/ld.so.conf.d/*.conf but this maybe different on
275- # different distroseries especially if
276- # server-distroseries != target-distroseries
277- # (I wish there was ldconfig --print-search-dirs)
278- LD_SEARCH_PATH = [
279- # standards
280- "lib",
281- "usr/lib",
282- "usr/local/lib",
283- # old biarch
284- "lib32",
285- "usr/lib32",
286- # new multiarch
287- "lib/i686-linux-gnu",
288- "lib/i386-linux-gnu",
289- "lib/x86_64-linux-gnu",
290- "usr/lib/i386-linux-gnu",
291- "usr/lib/i686-linux-gnu",
292- "usr/lib/x86_64-linux-gnu",
293- # ?
294- "usr/lib/x86_64-linux-gnu/fakechroot",
295- "usr/lib/x86_64-linux-gnu/mesa",
296- "usr/lib/x86_64-linux-gnu/mesa-egl",
297- "usr/lib/i386-linux-gnu/mesa",
298- ]
299-
300- DISTROSERIES = "oneiric"
301-
302- CONTENTS_FILE_URL_LOCATION = (
303- "http://archive.ubuntu.com/ubuntu/dists/%(distroseries)s/"
304- "Contents-%(arch)s.gz")
305-
306- CONTENTS_FILE = "Contents-%(distroseries)s-%(arch)s"
307-
308- def __init__(self, cachedir=None):
309- self.cachedir = os.path.expanduser(cachedir)
310- self._distroseries_arch_cache = {}
311-
312- def _get_lib_to_pkgs_mapping(self, distroseries, arch):
313- """ Returns a dict of { library-name : set([pkg1,pkg2])
314-
315- This function will return a dict to lookup library-name to package
316- dependencies for the given distroseries and architecture
317- """
318- if not (distroseries, arch) in self._distroseries_arch_cache:
319- self._distroseries_arch_cache[(distroseries, arch)] = \
320- self._get_mapping_from_contents_file(distroseries, arch)
321- return self._distroseries_arch_cache[(distroseries, arch)]
322-
323- def _get_contents_file_cache_path(self, distroseries, arch):
324- """ Return the path in the cache for the given distroseries, arch """
325- return os.path.join(
326- self.cachedir, self.CONTENTS_FILE % {
327- 'distroseries' : distroseries, 'arch' : arch })
328-
329- def _get_contents_file_server_url(self, distroseries, arch):
330- """ Return the remote server URL for the given distroseries, arch """
331- return self.CONTENTS_FILE_URL_LOCATION % {
332- 'distroseries' : distroseries, 'arch' : arch }
333-
334- def _get_mapping_from_contents_file(self, distroseries, arch):
335- """ Return lib,pkgs mapping from contents file for distroseries, arch
336-
337- This expects the contents file to be in the cachedir already.
338- """
339- lib_to_pkgs = {}
340- path = self._get_contents_file_cache_path(distroseries, arch)
341- with open(path) as f:
342- for (path, pkgs) in filter(
343- lambda (p, pkgs): os.path.dirname(p) in self.LD_SEARCH_PATH,
344- iter_contents_file(f)):
345- basename = os.path.basename(path)
346- if not basename in lib_to_pkgs:
347- lib_to_pkgs[basename] = set()
348- lib_to_pkgs[basename] |= pkgs
349- return lib_to_pkgs
350-
351- def _download_contents_file_compressed(self, distroseries, arch):
352- """ Downloads the content file for distroseries, arch into target """
353- # XXX: we may eventually want to merge the Contents files from
354- # the -updates repository too in addition to the main archive
355- url = self._get_contents_file_server_url(distroseries, arch)
356- target = self._get_contents_file_cache_path(distroseries, arch)
357- compressed_target = target + os.path.splitext(url)[1]
358- # download
359- urllib.urlretrieve(url, compressed_target)
360- return compressed_target
361-
362- def _prune_contents_gz_file(self, infile, outfile):
363- """ Read a compressed Contents.gz and write out a pruned version.
364-
365- This will use iter_contents_file to go over infile and write
366- the relevant lines that are in the LD_SEARCH_PATH to outfile.
367- """
368- with open(outfile, "w") as outf, gzip.open(infile) as inf:
369- # first write the header
370- outf.write("FILE LOCATION\n")
371- # then iter over all relevant lines and write them out
372- for (path, pkgs) in filter(
373- lambda (p,pkgs): os.path.dirname(p) in self.LD_SEARCH_PATH,
374- iter_contents_file(inf)):
375- outf.write("%s %s\n" % (path, ",".join(pkgs)))
376-
377- def _download_and_prepare_contents_file_if_needed(self, distroseries, arch):
378- """ Ensure there is a usable Contents file in the cachedir
379-
380- This will download, uncompress and prune a Conents file for
381- distroseries, arch so that get_dependencies works.
382- """
383- # mvo: We can (and should eventually) do etag/if-modified-since
384- # matching here. But its not really important as long as
385- # we package for stable distroseries as the Contents file
386- # will not change
387- path = self._get_contents_file_cache_path(distroseries, arch)
388- if not os.path.exists(path):
389- compressed_contents = self._download_contents_file_compressed(
390- distroseries, arch)
391- # and prune from ~300mb to 1mb uncompressed as we are only
392- # interested in the library path parts
393- self._prune_contents_gz_file(compressed_contents, path)
394- os.remove(compressed_contents)
395-
396- def get_dependencies(self, lib, arch="i386"):
397- # do lazy downloading for now, we could also make this part
398- # of bin/fetch-symbols I guess(?)
399- self._download_and_prepare_contents_file_if_needed(
400- self.DISTROSERIES, arch)
401- lib_to_pkgs = self._get_lib_to_pkgs_mapping(self.DISTROSERIES, arch)
402- return lib_to_pkgs.get(lib)
403-
404- def close(self):
405- pass
406-
407-
408 class PackageDatabase(object):
409
410 # the various db backends, aptfile is a bit special
411
412=== added file 'devportalbinary/tests/test_aptfile.py'
413--- devportalbinary/tests/test_aptfile.py 1970-01-01 00:00:00 +0000
414+++ devportalbinary/tests/test_aptfile.py 2012-09-17 15:39:29 +0000
415@@ -0,0 +1,52 @@
416+import gzip
417+import os
418+
419+from mock import patch
420+from fixtures import TempDir
421+from testtools import TestCase
422+
423+from ..aptfile import AptFilePackageDatabase
424+
425+
426+class AptFilePackageDatabaseTestCase(TestCase):
427+
428+ # point to our local contents file version that is a tad smaller
429+ CONTENTS_CACHE = os.path.join(
430+ os.path.dirname(__file__), "data", "apt-file-backend")
431+
432+ def setUp(self):
433+ super(AptFilePackageDatabaseTestCase, self).setUp()
434+ self.db = AptFilePackageDatabase(self.CONTENTS_CACHE)
435+
436+ def test_read_fixture_contents_worked(self):
437+ """ test that our fixture Contents file works as expected """
438+ # our test DB has 4 entries in the default search path
439+ self.assertEqual(
440+ len(self.db._get_lib_to_pkgs_mapping("oneiric", "i386")), 4)
441+
442+ def test_get_dependencies(self):
443+ """ Test that data from the fixture dependencies file works """
444+ self.assertEqual(
445+ self.db.get_dependencies("libz.so.1"), set(["zlib1g"]))
446+
447+ @patch("urllib.urlretrieve")
448+ def test_lazy_downloading(self, mock_urlretrieve):
449+ """ test that lazy downloading works """
450+ def _put_fixture_contents_file_in_place(url, target):
451+ with gzip.open(target, "w") as f:
452+ f.write("""
453+Some header text that is ignored
454+FILE LOCATION
455+usr/lib/libfoo.so.2 pkgfoo,pkgbar
456+""")
457+ tempdir = self.useFixture(TempDir())
458+ db = AptFilePackageDatabase(tempdir.path)
459+ mock_urlretrieve.side_effect = _put_fixture_contents_file_in_place
460+ self.assertEqual(
461+ db.get_dependencies("libfoo.so.2", arch="i386"),
462+ set(["pkgfoo", "pkgbar"]))
463+ self.assertEqual(len(db._get_lib_to_pkgs_mapping("oneiric", "i386")), 1)
464+
465+ def test_close(self):
466+ # Test that there is a close method we can call
467+ self.db.close()
468
469=== modified file 'devportalbinary/tests/test_database.py'
470--- devportalbinary/tests/test_database.py 2012-09-07 19:15:08 +0000
471+++ devportalbinary/tests/test_database.py 2012-09-17 15:39:29 +0000
472@@ -1,11 +1,7 @@
473 from collections import namedtuple
474-import gzip
475 import os
476
477-from fixtures import (
478- TempDir,
479- )
480-from mock import patch
481+from fixtures import TempDir
482 from storm.databases.postgres import psycopg2
483 from storm.exceptions import ClosedError
484 from testresources import ResourcedTestCase
485@@ -192,50 +188,6 @@
486 self.assertEqual(expected_username, uri.username)
487
488
489-class AptFilePackageDatabaseTestCase(TestCase):
490-
491- # point to our local contents file version that is a tad smaller
492- CONTENTS_CACHE = os.path.join(
493- os.path.dirname(__file__), "data", "apt-file-backend")
494-
495- def setUp(self):
496- super(AptFilePackageDatabaseTestCase, self).setUp()
497- self.db = AptFilePackageDatabase(self.CONTENTS_CACHE)
498-
499- def test_read_fixture_contents_worked(self):
500- """ test that our fixture Contents file works as expected """
501- # our test DB has 4 entries in the default search path
502- self.assertEqual(
503- len(self.db._get_lib_to_pkgs_mapping("oneiric", "i386")), 4)
504-
505- def test_get_dependencies(self):
506- """ Test that data from the fixture dependencies file works """
507- self.assertEqual(
508- self.db.get_dependencies("libz.so.1"), set(["zlib1g"]))
509-
510- @patch("urllib.urlretrieve")
511- def test_lazy_downloading(self, mock_urlretrieve):
512- """ test that lazy downloading works """
513- def _put_fixture_contents_file_in_place(url, target):
514- with gzip.open(target, "w") as f:
515- f.write("""
516-Some header text that is ignored
517-FILE LOCATION
518-usr/lib/libfoo.so.2 pkgfoo,pkgbar
519-""")
520- tempdir = self.useFixture(TempDir())
521- db = AptFilePackageDatabase(tempdir.path)
522- mock_urlretrieve.side_effect = _put_fixture_contents_file_in_place
523- self.assertEqual(
524- db.get_dependencies("libfoo.so.2", arch="i386"),
525- set(["pkgfoo", "pkgbar"]))
526- self.assertEqual(len(db._get_lib_to_pkgs_mapping("oneiric", "i386")), 1)
527-
528- def test_close(self):
529- # Test that there is a close method we can call
530- self.db.close()
531-
532-
533 class FakeBPPH(object):
534
535 def __init__(self):
536
537=== modified file 'setup.py'
538--- setup.py 2012-09-12 18:31:59 +0000
539+++ setup.py 2012-09-17 15:39:29 +0000
540@@ -44,6 +44,7 @@
541 ],
542 entry_points = {
543 'console_scripts': [
544+ 'dump-apt-file-db=devportalbinary.aptfile:dump_apt_file_db',
545 'fetch-symbol-files=devportalbinary.database:main',
546 'guess-executable=devportalbinary.binary:print_executable',
547 'guess-deps=devportalbinary.binary:print_dependencies',

Subscribers

People subscribed via source and target branches